Diffstat (limited to 'hypervideo_dl/extractor')
416 files changed, 25719 insertions, 7755 deletions
diff --git a/hypervideo_dl/extractor/__init__.py b/hypervideo_dl/extractor/__init__.py
index 198c4ae..b354842 100644
--- a/hypervideo_dl/extractor/__init__.py
+++ b/hypervideo_dl/extractor/__init__.py
@@ -1,14 +1,15 @@
-from __future__ import unicode_literals
+import os
 
 from ..utils import load_plugins
 
-try:
-    from .lazy_extractors import *
-    from .lazy_extractors import _ALL_CLASSES
-    _LAZY_LOADER = True
-    _PLUGIN_CLASSES = {}
-except ImportError:
-    _LAZY_LOADER = False
+_LAZY_LOADER = False
+if not os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'):
+    try:
+        from .lazy_extractors import *
+        from .lazy_extractors import _ALL_CLASSES
+        _LAZY_LOADER = True
+    except ImportError:
+        pass
 
 if not _LAZY_LOADER:
     from .extractors import *
@@ -19,8 +20,8 @@ if not _LAZY_LOADER:
     ]
     _ALL_CLASSES.append(GenericIE)
 
-    _PLUGIN_CLASSES = load_plugins('extractor', 'IE', globals())
-    _ALL_CLASSES = list(_PLUGIN_CLASSES.values()) + _ALL_CLASSES
+_PLUGIN_CLASSES = load_plugins('extractor', 'IE', globals())
+_ALL_CLASSES = list(_PLUGIN_CLASSES.values()) + _ALL_CLASSES
 
 
 def gen_extractor_classes():
diff --git a/hypervideo_dl/extractor/abc.py b/hypervideo_dl/extractor/abc.py
index 3e20216..6fe195e 100644
--- a/hypervideo_dl/extractor/abc.py
+++ b/hypervideo_dl/extractor/abc.py
@@ -8,6 +8,7 @@ import time
 from .common import InfoExtractor
 from ..compat import compat_str
 from ..utils import (
+    dict_get,
     ExtractorError,
     js_to_json,
     int_or_none,
@@ -212,7 +213,7 @@ class ABCIViewIE(InfoExtractor):
                 'hdnea': token,
             })
 
-        for sd in ('720', 'sd', 'sd-low'):
+        for sd in ('1080', '720', 'sd', 'sd-low'):
             sd_url = try_get(
                 stream, lambda x: x['streams']['hls'][sd], compat_str)
             if not sd_url:
@@ -233,8 +234,6 @@ class ABCIViewIE(InfoExtractor):
         }]
 
         is_live = video_params.get('livestream') == '1'
-        if is_live:
-            title = self._live_title(title)
 
         return {
             'id': video_id,
@@ -255,3 +254,65 @@ class ABCIViewIE(InfoExtractor):
             'subtitles': subtitles,
             'is_live': is_live,
         }
+
+
+class ABCIViewShowSeriesIE(InfoExtractor):
+    IE_NAME = 'abc.net.au:iview:showseries'
+    _VALID_URL = r'https?://iview\.abc\.net\.au/show/(?P<id>[^/]+)(?:/series/\d+)?$'
+    _GEO_COUNTRIES = ['AU']
+
+    _TESTS = [{
+        'url': 'https://iview.abc.net.au/show/upper-middle-bogan',
+        'info_dict': {
+            'id': '124870-1',
+            'title': 'Series 1',
+            'description': 'md5:93119346c24a7c322d446d8eece430ff',
+            'series': 'Upper Middle Bogan',
+            'season': 'Series 1',
+            'thumbnail': r're:^https?://cdn\.iview\.abc\.net\.au/thumbs/.*\.jpg$'
+        },
+        'playlist_count': 8,
+    }, {
+        'url': 'https://iview.abc.net.au/show/upper-middle-bogan',
+        'info_dict': {
+            'id': 'CO1108V001S00',
+            'ext': 'mp4',
+            'title': 'Series 1 Ep 1 I\'m A Swan',
+            'description': 'md5:7b676758c1de11a30b79b4d301e8da93',
+            'series': 'Upper Middle Bogan',
+            'uploader_id': 'abc1',
+            'upload_date': '20210630',
+            'timestamp': 1625036400,
+        },
+        'params': {
+            'noplaylist': True,
+            'skip_download': 'm3u8',
+        },
+    }]
+
+    def _real_extract(self, url):
+        show_id = self._match_id(url)
+        webpage = self._download_webpage(url, show_id)
+        webpage_data = self._search_regex(
+            r'window\.__INITIAL_STATE__\s*=\s*[\'"](.+?)[\'"]\s*;',
+            webpage, 'initial state')
+        video_data = self._parse_json(
+            unescapeHTML(webpage_data).encode('utf-8').decode('unicode_escape'), show_id)
+        video_data = video_data['route']['pageData']['_embedded']
+
+        highlight = try_get(video_data, lambda x: x['highlightVideo']['shareUrl'])
+        if not self._yes_playlist(show_id, bool(highlight), video_label='highlight video'):
+            return self.url_result(highlight, ie=ABCIViewIE.ie_key())
+
+        series = video_data['selectedSeries']
+        return {
+            '_type': 'playlist',
+            'entries': [self.url_result(episode['shareUrl'])
+                        for episode in series['_embedded']['videoEpisodes']],
+            'id': series.get('id'),
+            'title': dict_get(series, ('title', 'displaySubtitle')),
+            'description': series.get('description'),
+            'series': dict_get(series, ('showTitle', 'displayTitle')),
+            'season': dict_get(series, ('title', 'displaySubtitle')),
+            'thumbnail': series.get('thumbnail'),
+        }
diff --git a/hypervideo_dl/extractor/abematv.py b/hypervideo_dl/extractor/abematv.py
new file mode 100644
index 0000000..27b7d86
--- /dev/null
+++ b/hypervideo_dl/extractor/abematv.py
@@ -0,0 +1,476 @@
+import io
+import json
+import time
+import hashlib
+import hmac
+import re
+import struct
+from base64 import urlsafe_b64encode
+from binascii import unhexlify
+
+from .common import InfoExtractor
+from ..aes import aes_ecb_decrypt
+from ..compat import (
+    compat_urllib_response,
+    compat_urllib_parse_urlparse,
+    compat_urllib_request,
+)
+from ..utils import (
+    ExtractorError,
+    decode_base,
+    int_or_none,
+    random_uuidv4,
+    request_to_url,
+    time_seconds,
+    update_url_query,
+    traverse_obj,
+    intlist_to_bytes,
+    bytes_to_intlist,
+    urljoin,
+)
+
+
+# NOTE: network handler related code is a temporary thing until network stack overhaul PRs are merged (#2861/#2862)
+
+def add_opener(ydl, handler):
+    ''' Add a handler for opening URLs, like _download_webpage '''
+    # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L426
+    # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L605
+    assert isinstance(ydl._opener, compat_urllib_request.OpenerDirector)
+    ydl._opener.add_handler(handler)
+
+
+def remove_opener(ydl, handler):
+    '''
+    Remove handler(s) for opening URLs
+    @param handler Either the handler object itself or the handler type.
+    Specifying a handler type will remove all handlers for which isinstance returns True.
+ ''' + # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L426 + # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L605 + opener = ydl._opener + assert isinstance(ydl._opener, compat_urllib_request.OpenerDirector) + if isinstance(handler, (type, tuple)): + find_cp = lambda x: isinstance(x, handler) + else: + find_cp = lambda x: x is handler + + removed = [] + for meth in dir(handler): + if meth in ["redirect_request", "do_open", "proxy_open"]: + # oops, coincidental match + continue + + i = meth.find("_") + protocol = meth[:i] + condition = meth[i + 1:] + + if condition.startswith("error"): + j = condition.find("_") + i + 1 + kind = meth[j + 1:] + try: + kind = int(kind) + except ValueError: + pass + lookup = opener.handle_error.get(protocol, {}) + opener.handle_error[protocol] = lookup + elif condition == "open": + kind = protocol + lookup = opener.handle_open + elif condition == "response": + kind = protocol + lookup = opener.process_response + elif condition == "request": + kind = protocol + lookup = opener.process_request + else: + continue + + handlers = lookup.setdefault(kind, []) + if handlers: + handlers[:] = [x for x in handlers if not find_cp(x)] + + removed.append(x for x in handlers if find_cp(x)) + + if removed: + for x in opener.handlers: + if find_cp(x): + x.add_parent(None) + opener.handlers[:] = [x for x in opener.handlers if not find_cp(x)] + + +class AbemaLicenseHandler(compat_urllib_request.BaseHandler): + handler_order = 499 + STRTABLE = '123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz' + HKEY = b'3AF0298C219469522A313570E8583005A642E73EDD58E3EA2FB7339D3DF1597E' + + def __init__(self, ie: 'AbemaTVIE'): + # the protcol that this should really handle is 'abematv-license://' + # abematv_license_open is just a placeholder for development purposes + # ref. 
+        setattr(self, 'abematv-license_open', getattr(self, 'abematv_license_open'))
+        self.ie = ie
+
+    def _get_videokey_from_ticket(self, ticket):
+        to_show = self.ie._downloader.params.get('verbose', False)
+        media_token = self.ie._get_media_token(to_show=to_show)
+
+        license_response = self.ie._download_json(
+            'https://license.abema.io/abematv-hls', None, note='Requesting playback license' if to_show else False,
+            query={'t': media_token},
+            data=json.dumps({
+                'kv': 'a',
+                'lt': ticket
+            }).encode('utf-8'),
+            headers={
+                'Content-Type': 'application/json',
+            })
+
+        res = decode_base(license_response['k'], self.STRTABLE)
+        encvideokey = bytes_to_intlist(struct.pack('>QQ', res >> 64, res & 0xffffffffffffffff))
+
+        h = hmac.new(
+            unhexlify(self.HKEY),
+            (license_response['cid'] + self.ie._DEVICE_ID).encode('utf-8'),
+            digestmod=hashlib.sha256)
+        enckey = bytes_to_intlist(h.digest())
+
+        return intlist_to_bytes(aes_ecb_decrypt(encvideokey, enckey))
+
+    def abematv_license_open(self, url):
+        url = request_to_url(url)
+        ticket = compat_urllib_parse_urlparse(url).netloc
+        response_data = self._get_videokey_from_ticket(ticket)
+        return compat_urllib_response.addinfourl(io.BytesIO(response_data), headers={
+            'Content-Length': len(response_data),
+        }, url=url, code=200)
+
+
+class AbemaTVBaseIE(InfoExtractor):
+    def _extract_breadcrumb_list(self, webpage, video_id):
+        for jld in re.finditer(
+                r'(?is)</span></li></ul><script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
+                webpage):
+            jsonld = self._parse_json(jld.group('json_ld'), video_id, fatal=False)
+            if jsonld:
+                if jsonld.get('@type') != 'BreadcrumbList':
+                    continue
+                trav = traverse_obj(jsonld, ('itemListElement', ..., 'name'))
+                if trav:
+                    return trav
+        return []
+
+
+class AbemaTVIE(AbemaTVBaseIE):
+    _VALID_URL = r'https?://abema\.tv/(?P<type>now-on-air|video/episode|channels/.+?/slots)/(?P<id>[^?/]+)'
+    _NETRC_MACHINE = 'abematv'
+    _TESTS = [{
+        'url': 'https://abema.tv/video/episode/194-25_s2_p1',
+        'info_dict': {
+            'id': '194-25_s2_p1',
+            'title': '第1話 「チーズケーキ」 「モーニング再び」',
+            'series': '異世界食堂2',
+            'series_number': 2,
+            'episode': '第1話 「チーズケーキ」 「モーニング再び」',
+            'episode_number': 1,
+        },
+        'skip': 'expired',
+    }, {
+        'url': 'https://abema.tv/channels/anime-live2/slots/E8tvAnMJ7a9a5d',
+        'info_dict': {
+            'id': 'E8tvAnMJ7a9a5d',
+            'title': 'ゆるキャン△ SEASON2 全話一挙【無料ビデオ72時間】',
+            'series': 'ゆるキャン△ SEASON2',
+            'episode': 'ゆるキャン△ SEASON2 全話一挙【無料ビデオ72時間】',
+            'series_number': 2,
+            'episode_number': 1,
+            'description': 'md5:9c5a3172ae763278f9303922f0ea5b17',
+        },
+        'skip': 'expired',
+    }, {
+        'url': 'https://abema.tv/video/episode/87-877_s1282_p31047',
+        'info_dict': {
+            'id': 'E8tvAnMJ7a9a5d',
+            'title': '第5話『光射す』',
+            'description': 'md5:56d4fc1b4f7769ded5f923c55bb4695d',
+            'thumbnail': r're:https://hayabusa\.io/.+',
+            'series': '相棒',
+            'episode': '第5話『光射す』',
+        },
+        'skip': 'expired',
+    }, {
+        'url': 'https://abema.tv/now-on-air/abema-anime',
+        'info_dict': {
+            'id': 'abema-anime',
+            # this varies
+            # 'title': '女子高生の無駄づかい 全話一挙【無料ビデオ72時間】',
+            'description': 'md5:55f2e61f46a17e9230802d7bcc913d5f',
+            'is_live': True,
+        },
+        'skip': 'Not supported until hypervideo implements native live downloader OR AbemaTV can start a local HTTP server',
+    }]
+    _USERTOKEN = None
+    _DEVICE_ID = None
+    _TIMETABLE = None
+    _MEDIATOKEN = None
+
+    _SECRETKEY = b'v+Gjs=25Aw5erR!J8ZuvRrCx*rGswhB&qdHd_SYerEWdU&a?3DzN9BRbp5KwY4hEmcj5#fykMjJ=AuWz5GSMY-d@H7DMEh3M@9n2G552Us$$k9cD=3TxwWe86!x#Zyhe'
+
+    def _generate_aks(self, deviceid):
+        deviceid = deviceid.encode('utf-8')
+        # add 1 hour and then drop minute and secs
+        ts_1hour = int((time_seconds(hours=9) // 3600 + 1) * 3600)
+        time_struct = time.gmtime(ts_1hour)
+        ts_1hour_str = str(ts_1hour).encode('utf-8')
+
+        tmp = None
+
+        def mix_once(nonce):
+            nonlocal tmp
+            h = hmac.new(self._SECRETKEY, digestmod=hashlib.sha256)
+            h.update(nonce)
+            tmp = h.digest()
+
+        def mix_tmp(count):
+            nonlocal tmp
+            for i in range(count):
+                mix_once(tmp)
+
+        def mix_twist(nonce):
+            nonlocal tmp
+            mix_once(urlsafe_b64encode(tmp).rstrip(b'=') + nonce)
+
+        mix_once(self._SECRETKEY)
+        mix_tmp(time_struct.tm_mon)
+        mix_twist(deviceid)
+        mix_tmp(time_struct.tm_mday % 5)
+        mix_twist(ts_1hour_str)
+        mix_tmp(time_struct.tm_hour % 5)
+
+        return urlsafe_b64encode(tmp).rstrip(b'=').decode('utf-8')
+
+    def _get_device_token(self):
+        if self._USERTOKEN:
+            return self._USERTOKEN
+
+        self._DEVICE_ID = random_uuidv4()
+        aks = self._generate_aks(self._DEVICE_ID)
+        user_data = self._download_json(
+            'https://api.abema.io/v1/users', None, note='Authorizing',
+            data=json.dumps({
+                'deviceId': self._DEVICE_ID,
+                'applicationKeySecret': aks,
+            }).encode('utf-8'),
+            headers={
+                'Content-Type': 'application/json',
+            })
+        self._USERTOKEN = user_data['token']
+
+        # don't allow adding it 2 times or more, though it's guarded
+        remove_opener(self._downloader, AbemaLicenseHandler)
+        add_opener(self._downloader, AbemaLicenseHandler(self))
+
+        return self._USERTOKEN
+
+    def _get_media_token(self, invalidate=False, to_show=True):
+        if not invalidate and self._MEDIATOKEN:
+            return self._MEDIATOKEN
+
+        self._MEDIATOKEN = self._download_json(
+            'https://api.abema.io/v1/media/token', None, note='Fetching media token' if to_show else False,
+            query={
+                'osName': 'android',
+                'osVersion': '6.0.1',
+                'osLang': 'ja_JP',
+                'osTimezone': 'Asia/Tokyo',
+                'appId': 'tv.abema',
+                'appVersion': '3.27.1'
+            }, headers={
+                'Authorization': 'bearer ' + self._get_device_token()
+            })['token']
+
+        return self._MEDIATOKEN
+
+    def _perform_login(self, username, password):
+        if '@' in username:  # don't strictly check if it's email address or not
+            ep, method = 'user/email', 'email'
+        else:
+            ep, method = 'oneTimePassword', 'userId'
+
+        login_response = self._download_json(
+            f'https://api.abema.io/v1/auth/{ep}', None, note='Logging in',
+            data=json.dumps({
+                method: username,
+                'password': password
+            }).encode('utf-8'), headers={
+                'Authorization': 'bearer ' + self._get_device_token(),
+                'Origin': 'https://abema.tv',
+                'Referer': 'https://abema.tv/',
+                'Content-Type': 'application/json',
+            })
+
+        self._USERTOKEN = login_response['token']
+        self._get_media_token(True)
+
+    def _real_extract(self, url):
+        # starting download using infojson from this extractor is undefined behavior,
+        # and will never be fixed in the future; you must trigger downloads by directly specifying the URL.
+        # (unless there's a way to hook before downloading by extractor)
+        video_id, video_type = self._match_valid_url(url).group('id', 'type')
+        headers = {
+            'Authorization': 'Bearer ' + self._get_device_token(),
+        }
+        video_type = video_type.split('/')[-1]
+
+        webpage = self._download_webpage(url, video_id)
+        canonical_url = self._search_regex(
+            r'<link\s+rel="canonical"\s*href="(.+?)"', webpage, 'canonical URL',
+            default=url)
+        info = self._search_json_ld(webpage, video_id, default={})
+
+        title = self._search_regex(
+            r'<span\s*class=".+?EpisodeTitleBlock__title">(.+?)</span>', webpage, 'title', default=None)
+        if not title:
+            jsonld = None
+            for jld in re.finditer(
+                    r'(?is)<span\s*class="com-m-Thumbnail__image">(?:</span>)?<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
+                    webpage):
+                jsonld = self._parse_json(jld.group('json_ld'), video_id, fatal=False)
+                if jsonld:
+                    break
+            if jsonld:
+                title = jsonld.get('caption')
+        if not title and video_type == 'now-on-air':
+            if not self._TIMETABLE:
+                # cache the timetable because it goes to 5MiB in size (!!)
+                self._TIMETABLE = self._download_json(
+                    'https://api.abema.io/v1/timetable/dataSet?debug=false', video_id,
+                    headers=headers)
+            now = time_seconds(hours=9)
+            for slot in self._TIMETABLE.get('slots', []):
+                if slot.get('channelId') != video_id:
+                    continue
+                if slot['startAt'] <= now and now < slot['endAt']:
+                    title = slot['title']
+                    break
+
+        # read breadcrumb on top of page
+        breadcrumb = self._extract_breadcrumb_list(webpage, video_id)
+        if breadcrumb:
+            # breadcrumb list translates to: (example is 1st test for this IE)
+            # Home > Anime (genre) > Isekai Shokudo 2 (series name) > Episode 1 "Cheese cakes" "Morning again" (episode title)
+            # hence this works
+            info['series'] = breadcrumb[-2]
+            info['episode'] = breadcrumb[-1]
+            if not title:
+                title = info['episode']
+
+        description = self._html_search_regex(
+            (r'<p\s+class="com-video-EpisodeDetailsBlock__content"><span\s+class=".+?">(.+?)</span></p><div',
+             r'<span\s+class=".+?SlotSummary.+?">(.+?)</span></div><div',),
+            webpage, 'description', default=None, group=1)
+        if not description:
+            og_desc = self._html_search_meta(
+                ('description', 'og:description', 'twitter:description'), webpage)
+            if og_desc:
+                description = re.sub(r'''(?sx)
+                    ^(.+?)(?:
+                        アニメの動画を無料で見るならABEMA!|  # anime
+                        等、.+  # applies for most of categories
+                    )?
+                ''', r'\1', og_desc)
+
+        # canonical URL may contain series and episode number
+        mobj = re.search(r's(\d+)_p(\d+)$', canonical_url)
+        if mobj:
+            seri = int_or_none(mobj.group(1), default=float('inf'))
+            epis = int_or_none(mobj.group(2), default=float('inf'))
+            info['series_number'] = seri if seri < 100 else None
+            # some anime like Detective Conan (though not available in AbemaTV)
+            # has more than 1000 episodes (1026 as of 2021/11/15)
+            info['episode_number'] = epis if epis < 2000 else None
+
+        is_live, m3u8_url = False, None
+        if video_type == 'now-on-air':
+            is_live = True
+            channel_url = 'https://api.abema.io/v1/channels'
+            if video_id == 'news-global':
+                channel_url = update_url_query(channel_url, {'division': '1'})
+            onair_channels = self._download_json(channel_url, video_id)
+            for ch in onair_channels['channels']:
+                if video_id == ch['id']:
+                    m3u8_url = ch['playback']['hls']
+                    break
+            else:
+                raise ExtractorError(f'Cannot find on-air {video_id} channel.', expected=True)
+        elif video_type == 'episode':
+            api_response = self._download_json(
+                f'https://api.abema.io/v1/video/programs/{video_id}', video_id,
+                note='Checking playability',
+                headers=headers)
+            ondemand_types = traverse_obj(api_response, ('terms', ..., 'onDemandType'), default=[])
+            if 3 not in ondemand_types:
+                # cannot acquire decryption key for these streams
+                self.report_warning('This is a premium-only stream')
+
+            m3u8_url = f'https://vod-abematv.akamaized.net/program/{video_id}/playlist.m3u8'
+        elif video_type == 'slots':
+            api_response = self._download_json(
+                f'https://api.abema.io/v1/media/slots/{video_id}', video_id,
+                note='Checking playability',
+                headers=headers)
+            if not traverse_obj(api_response, ('slot', 'flags', 'timeshiftFree'), default=False):
+                self.report_warning('This is a premium-only stream')
+
+            m3u8_url = f'https://vod-abematv.akamaized.net/slot/{video_id}/playlist.m3u8'
+        else:
+            raise ExtractorError('Unreachable')
+
+        if is_live:
+            self.report_warning("This is a livestream; hypervideo doesn't support downloading natively, but FFmpeg cannot handle m3u8 manifests from AbemaTV")
+            self.report_warning('Please consider using Streamlink to download these streams (https://github.com/streamlink/streamlink)')
+        formats = self._extract_m3u8_formats(
+            m3u8_url, video_id, ext='mp4', live=is_live)
+
+        info.update({
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'formats': formats,
+            'is_live': is_live,
+        })
+        return info
+
+
+class AbemaTVTitleIE(AbemaTVBaseIE):
+    _VALID_URL = r'https?://abema\.tv/video/title/(?P<id>[^?/]+)'
+
+    _TESTS = [{
+        'url': 'https://abema.tv/video/title/90-1597',
+        'info_dict': {
+            'id': '90-1597',
+            'title': 'シャッフルアイランド',
+        },
+        'playlist_mincount': 2,
+    }, {
+        'url': 'https://abema.tv/video/title/193-132',
+        'info_dict': {
+            'id': '193-132',
+            'title': '真心が届く~僕とスターのオフィス・ラブ!?~',
+        },
+        'playlist_mincount': 16,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        playlist_title, breadcrumb = None, self._extract_breadcrumb_list(webpage, video_id)
+        if breadcrumb:
+            playlist_title = breadcrumb[-1]
+
+        playlist = [
+            self.url_result(urljoin('https://abema.tv/', mobj.group(1)))
+            for mobj in re.finditer(r'<li\s*class=".+?EpisodeList.+?"><a\s*href="(/[^"]+?)"', webpage)]
+
+        return self.playlist_result(playlist, playlist_title=playlist_title, playlist_id=video_id)
diff --git a/hypervideo_dl/extractor/adn.py b/hypervideo_dl/extractor/adn.py
index a55ebbc..fca6e60 100644
--- a/hypervideo_dl/extractor/adn.py
+++ b/hypervideo_dl/extractor/adn.py
@@ -8,13 +8,13 @@ import os
 import random
 
 from .common import InfoExtractor
-from ..aes import aes_cbc_decrypt
+from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7
 from ..compat import (
     compat_HTTPError,
     compat_b64decode,
-    compat_ord,
 )
 from ..utils import (
+    ass_subtitles_timecode,
     bytes_to_intlist,
     bytes_to_long,
     ExtractorError,
@@ -68,10 +68,6 @@ class ADNIE(InfoExtractor):
         'end': 4,
     }
 
-    @staticmethod
-    def _ass_subtitles_timecode(seconds):
-        return '%01d:%02d:%02d.%02d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 100)
-
     def _get_subtitles(self, sub_url, video_id):
         if not sub_url:
             return None
@@ -87,14 +83,11 @@ class ADNIE(InfoExtractor):
             return None
 
         # http://animedigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js
-        dec_subtitles = intlist_to_bytes(aes_cbc_decrypt(
-            bytes_to_intlist(compat_b64decode(enc_subtitles[24:])),
-            bytes_to_intlist(binascii.unhexlify(self._K + 'ab9f52f5baae7c72')),
-            bytes_to_intlist(compat_b64decode(enc_subtitles[:24]))
-        ))
-        subtitles_json = self._parse_json(
-            dec_subtitles[:-compat_ord(dec_subtitles[-1])].decode(),
-            None, fatal=False)
+        dec_subtitles = unpad_pkcs7(aes_cbc_decrypt_bytes(
+            compat_b64decode(enc_subtitles[24:]),
+            binascii.unhexlify(self._K + 'ab9f52f5baae7c72'),
+            compat_b64decode(enc_subtitles[:24])))
+        subtitles_json = self._parse_json(dec_subtitles.decode(), None, fatal=False)
 
         if not subtitles_json:
             return None
@@ -117,8 +110,8 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text'''
                 continue
             alignment = self._POS_ALIGN_MAP.get(position_align, 2) + self._LINE_ALIGN_MAP.get(line_align, 0)
             ssa += os.linesep + 'Dialogue: Marked=0,%s,%s,Default,,0,0,0,,%s%s' % (
-                self._ass_subtitles_timecode(start),
-                self._ass_subtitles_timecode(end),
+                ass_subtitles_timecode(start),
+                ass_subtitles_timecode(end),
                 '{\\a%d}' % alignment if alignment != 2 else '',
                 text.replace('\n', '\\N').replace('<i>', '{\\i1}').replace('</i>', '{\\i0}'))
 
@@ -133,10 +126,7 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text'''
             }])
         return subtitles
 
-    def _real_initialize(self):
-        username, password = self._get_login_info()
-        if not username:
-            return
+    def _perform_login(self, username, password):
         try:
             access_token = (self._download_json(
                 self._API_BASE_URL + 'authentication/login', None,
diff --git a/hypervideo_dl/extractor/adobeconnect.py b/hypervideo_dl/extractor/adobeconnect.py
index 728549e..e2e6f93 100644
--- a/hypervideo_dl/extractor/adobeconnect.py
+++ b/hypervideo_dl/extractor/adobeconnect.py
@@ -14,7 +14,7 @@ class AdobeConnectIE(InfoExtractor):
     def _real_extract(self, url):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
-        title = self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title')
+        title = self._html_extract_title(webpage)
         qs = compat_parse_qs(self._search_regex(r"swfUrl\s*=\s*'([^']+)'", webpage, 'swf url').split('?')[1])
         is_live = qs.get('isLive', ['false'])[0] == 'true'
         formats = []
@@ -31,7 +31,7 @@ class AdobeConnectIE(InfoExtractor):
 
         return {
             'id': video_id,
-            'title': self._live_title(title) if is_live else title,
+            'title': title,
             'formats': formats,
             'is_live': is_live,
         }
diff --git a/hypervideo_dl/extractor/adobepass.py b/hypervideo_dl/extractor/adobepass.py
index 9378c33..5d98301 100644
--- a/hypervideo_dl/extractor/adobepass.py
+++ b/hypervideo_dl/extractor/adobepass.py
@@ -39,8 +39,8 @@ MSO_INFO = {
     },
     'RCN': {
         'name': 'RCN',
-        'username_field': 'UserName',
-        'password_field': 'UserPassword',
+        'username_field': 'username',
+        'password_field': 'password',
     },
     'Rogers': {
         'name': 'Rogers',
@@ -1345,6 +1345,11 @@ MSO_INFO = {
         'username_field': 'username',
         'password_field': 'password',
     },
+    'Suddenlink': {
+        'name': 'Suddenlink',
+        'username_field': 'username',
+        'password_field': 'password',
+    },
 }
 
 
@@ -1636,6 +1641,58 @@ class AdobePassIE(InfoExtractor):
                     query=hidden_data)
 
                 post_form(mvpd_confirm_page_res, 'Confirming Login')
+            elif mso_id == 'Suddenlink':
+                # Suddenlink is similar to SlingTV in using a tab history count and a meta refresh,
+                # but they also do a dynamic redirect using javascript that has to be followed as well
+                first_bookend_page, urlh = post_form(
+                    provider_redirect_page_res, 'Pressing Continue...')
+
+                hidden_data = self._hidden_inputs(first_bookend_page)
+                hidden_data['history_val'] = 1
+
+                provider_login_redirect_page_res = self._download_webpage_handle(
+                    urlh.geturl(), video_id, 'Sending First Bookend',
+                    query=hidden_data)
+
+                provider_login_redirect_page, urlh = provider_login_redirect_page_res
+
+                # Some website partners seem to not have the extra ajaxurl redirect step, so we check if we already
+                # have the login prompt or not
+                if 'id="password" type="password" name="password"' in provider_login_redirect_page:
+                    provider_login_page_res = provider_login_redirect_page_res
+                else:
+                    provider_tryauth_url = self._html_search_regex(
+                        r'url:\s*[\'"]([^\'"]+)', provider_login_redirect_page, 'ajaxurl')
+                    provider_tryauth_page = self._download_webpage(
+                        provider_tryauth_url, video_id, 'Submitting TryAuth',
+                        query=hidden_data)
+
+                    provider_login_page_res = self._download_webpage_handle(
+                        f'https://authorize.suddenlink.net/saml/module.php/authSynacor/login.php?AuthState={provider_tryauth_page}',
+                        video_id, 'Getting Login Page',
+                        query=hidden_data)
+
+                provider_association_redirect, urlh = post_form(
+                    provider_login_page_res, 'Logging in', {
+                        mso_info['username_field']: username,
+                        mso_info['password_field']: password
+                    })
+
+                provider_refresh_redirect_url = extract_redirect_url(
+                    provider_association_redirect, url=urlh.geturl())
+
+                last_bookend_page, urlh = self._download_webpage_handle(
+                    provider_refresh_redirect_url, video_id,
+                    'Downloading Auth Association Redirect Page')
+
+                hidden_data = self._hidden_inputs(last_bookend_page)
+                hidden_data['history_val'] = 3
+
+                mvpd_confirm_page_res = self._download_webpage_handle(
+                    urlh.geturl(), video_id, 'Sending Final Bookend',
+                    query=hidden_data)
+
+                post_form(mvpd_confirm_page_res, 'Confirming Login')
             else:
                 # Some providers (e.g. DIRECTV NOW) have another meta refresh
                 # based redirect that should be followed.
diff --git a/hypervideo_dl/extractor/adobetv.py b/hypervideo_dl/extractor/adobetv.py
index 12b8192..3cfa1ff 100644
--- a/hypervideo_dl/extractor/adobetv.py
+++ b/hypervideo_dl/extractor/adobetv.py
@@ -9,6 +9,7 @@ from ..utils import (
     float_or_none,
     int_or_none,
     ISO639Utils,
+    join_nonempty,
     OnDemandPagedList,
     parse_duration,
     str_or_none,
@@ -263,7 +264,7 @@ class AdobeTVVideoIE(AdobeTVBaseIE):
                 continue
             formats.append({
                 'filesize': int_or_none(source.get('kilobytes') or None, invscale=1000),
-                'format_id': '-'.join(filter(None, [source.get('format'), source.get('label')])),
+                'format_id': join_nonempty(source.get('format'), source.get('label')),
                 'height': int_or_none(source.get('height') or None),
                 'tbr': int_or_none(source.get('bitrate') or None),
                 'width': int_or_none(source.get('width') or None),
diff --git a/hypervideo_dl/extractor/afreecatv.py b/hypervideo_dl/extractor/afreecatv.py
index 063872b..77f0e3c 100644
--- a/hypervideo_dl/extractor/afreecatv.py
+++ b/hypervideo_dl/extractor/afreecatv.py
@@ -10,7 +10,11 @@ from ..utils import (
     determine_ext,
     ExtractorError,
     int_or_none,
+    qualities,
+    traverse_obj,
     unified_strdate,
+    unified_timestamp,
+    update_url_query,
     url_or_none,
     urlencode_postdata,
     xpath_text,
@@ -28,7 +32,7 @@ class AfreecaTVIE(InfoExtractor):
                             /app/(?:index|read_ucc_bbs)\.cgi|
                             /player/[Pp]layer\.(?:swf|html)
                         )\?.*?\bnTitleNo=|
-                        vod\.afreecatv\.com/PLAYER/STATION/
+                        vod\.afreecatv\.com/(PLAYER/STATION|player)/
                     )
                     (?P<id>\d+)
                 '''
@@ -166,6 +170,9 @@ class AfreecaTVIE(InfoExtractor):
     }, {
         'url': 'http://vod.afreecatv.com/PLAYER/STATION/15055030',
         'only_matching': True,
+    }, {
+        'url': 'http://vod.afreecatv.com/player/15055030',
+        'only_matching': True,
     }]
 
     @staticmethod
@@ -177,14 +184,7 @@ class AfreecaTVIE(InfoExtractor):
                 video_key['part'] = int(m.group('part'))
         return video_key
 
-    def _real_initialize(self):
-        self._login()
-
-    def _login(self):
-        username, password = self._get_login_info()
-        if username is None:
-            return
-
+    def _perform_login(self, username, password):
         login_form = {
             'szWork': 'login',
             'szType': 'json',
@@ -380,3 +380,105 @@ class AfreecaTVIE(InfoExtractor):
             })
 
         return info
+
+
+class AfreecaTVLiveIE(AfreecaTVIE):
+
+    IE_NAME = 'afreecatv:live'
+    _VALID_URL = r'https?://play\.afreeca(?:tv)?\.com/(?P<id>[^/]+)(?:/(?P<bno>\d+))?'
+    _TESTS = [{
+        'url': 'https://play.afreecatv.com/pyh3646/237852185',
+        'info_dict': {
+            'id': '237852185',
+            'ext': 'mp4',
+            'title': '【 우루과이 오늘은 무슨일이? 】',
+            'uploader': '박진우[JINU]',
+            'uploader_id': 'pyh3646',
+            'timestamp': 1640661495,
+            'is_live': True,
+        },
+        'skip': 'Livestream has ended',
+    }, {
+        'url': 'http://play.afreeca.com/pyh3646/237852185',
+        'only_matching': True,
+    }, {
+        'url': 'http://play.afreeca.com/pyh3646',
+        'only_matching': True,
+    }]
+
+    _LIVE_API_URL = 'https://live.afreecatv.com/afreeca/player_live_api.php'
+
+    _QUALITIES = ('sd', 'hd', 'hd2k', 'original')
+
+    def _real_extract(self, url):
+        broadcaster_id, broadcast_no = self._match_valid_url(url).group('id', 'bno')
+        password = self.get_param('videopassword')
+
+        info = self._download_json(self._LIVE_API_URL, broadcaster_id, fatal=False,
+                                   data=urlencode_postdata({'bid': broadcaster_id})) or {}
+        channel_info = info.get('CHANNEL') or {}
+        broadcaster_id = channel_info.get('BJID') or broadcaster_id
+        broadcast_no = channel_info.get('BNO') or broadcast_no
+        password_protected = channel_info.get('BPWD')
+        if not broadcast_no:
+            raise ExtractorError(f'Unable to extract broadcast number ({broadcaster_id} may not be live)', expected=True)
+        if password_protected == 'Y' and password is None:
+            raise ExtractorError(
+                'This livestream is protected by a password, use the --video-password option',
+                expected=True)
+
+        formats = []
+        quality_key = qualities(self._QUALITIES)
+        for quality_str in self._QUALITIES:
+            params = {
+                'bno': broadcast_no,
+                'stream_type': 'common',
+                'type': 'aid',
+                'quality': quality_str,
+            }
+            if password is not None:
+                params['pwd'] = password
+            aid_response = self._download_json(
+                self._LIVE_API_URL, broadcast_no, fatal=False,
+                data=urlencode_postdata(params),
+                note=f'Downloading access token for {quality_str} stream',
+                errnote=f'Unable to download access token for {quality_str} stream')
+            aid = traverse_obj(aid_response, ('CHANNEL', 'AID'))
+            if not aid:
+                continue
+
+            stream_base_url = channel_info.get('RMD') or 'https://livestream-manager.afreecatv.com'
+            stream_info = self._download_json(
+                f'{stream_base_url}/broad_stream_assign.html', broadcast_no, fatal=False,
+                query={
+                    'return_type': channel_info.get('CDN', 'gcp_cdn'),
+                    'broad_key': f'{broadcast_no}-common-{quality_str}-hls',
+                },
+                note=f'Downloading metadata for {quality_str} stream',
+                errnote=f'Unable to download metadata for {quality_str} stream') or {}
+
+            if stream_info.get('view_url'):
+                formats.append({
+                    'format_id': quality_str,
+                    'url': update_url_query(stream_info['view_url'], {'aid': aid}),
+                    'ext': 'mp4',
+                    'protocol': 'm3u8',
+                    'quality': quality_key(quality_str),
+                })
+
+        self._sort_formats(formats)
+
+        station_info = self._download_json(
+            'https://st.afreecatv.com/api/get_station_status.php', broadcast_no,
+            query={'szBjId': broadcaster_id}, fatal=False,
+            note='Downloading channel metadata', errnote='Unable to download channel metadata') or {}
+
+        return {
+            'id': broadcast_no,
+            'title': channel_info.get('TITLE') or station_info.get('station_title'),
+            'uploader': channel_info.get('BJNICK') or station_info.get('station_name'),
+            'uploader_id': broadcaster_id,
+            'timestamp': unified_timestamp(station_info.get('broad_start')),
+            'formats': formats,
+            'is_live': True,
+        }
diff --git a/hypervideo_dl/extractor/aliexpress.py b/hypervideo_dl/extractor/aliexpress.py
index 6f241e6..9722fe9 100644
--- a/hypervideo_dl/extractor/aliexpress.py
+++ b/hypervideo_dl/extractor/aliexpress.py
@@ -18,7 +18,7 @@ class AliExpressLiveIE(InfoExtractor):
             'id': '2800002704436634',
             'ext': 'mp4',
             'title': 'CASIMA7.22',
-            'thumbnail': r're:http://.*\.jpg',
+            'thumbnail': r're:https?://.*\.jpg',
             'uploader': 'CASIMA Official Store',
             'timestamp': 1500717600,
             'upload_date': '20170722',
diff --git a/hypervideo_dl/extractor/aljazeera.py b/hypervideo_dl/extractor/aljazeera.py
index e829b45..7bcdb7a 100644
--- a/hypervideo_dl/extractor/aljazeera.py
+++ b/hypervideo_dl/extractor/aljazeera.py
@@ -1,55 +1,86 @@
+# coding: utf-8
 from __future__ import unicode_literals
 
 import json
 
 from .common import InfoExtractor
+from ..utils import (
+    try_get,
+)
 
 
 class AlJazeeraIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/(?P<type>program/[^/]+|(?:feature|video)s)/\d{4}/\d{1,2}/\d{1,2}/(?P<id>[^/?&#]+)'
+    _VALID_URL = r'https?://(?P<base>\w+\.aljazeera\.\w+)/(?P<type>programs?/[^/]+|(?:feature|video|new)s)?/\d{4}/\d{1,2}/\d{1,2}/(?P<id>[^/?&#]+)'
 
     _TESTS = [{
-        'url': 'https://www.aljazeera.com/program/episode/2014/9/19/deliverance',
+        'url': 'https://balkans.aljazeera.net/videos/2021/11/6/pojedini-domovi-u-sarajevu-jos-pod-vodom-mjestanima-se-dostavlja-hrana',
         'info_dict': {
-            'id': '3792260579001',
+            'id': '6280641530001',
             'ext': 'mp4',
-            'title': 'The Slum - Episode 1: Deliverance',
-            'description': 'As a birth attendant advocating for family planning, Remy is on the frontline of Tondo\'s battle with overcrowding.',
-            'uploader_id': '665003303001',
-            'timestamp': 1411116829,
-            'upload_date': '20140919',
-        },
-        'add_ie': ['BrightcoveNew'],
-        'skip': 'Not accessible from Travis CI server',
-    }, {
-        'url': 'https://www.aljazeera.com/videos/2017/5/11/sierra-leone-709-carat-diamond-to-be-auctioned-off',
-        'only_matching': True,
+            'title': 'Pojedini domovi u Sarajevu još pod vodom, mještanima se dostavlja hrana',
+            'timestamp': 1636219149,
+            'description': 'U sarajevskim naseljima Rajlovac i Reljevo stambeni objekti, ali i industrijska postrojenja i dalje su pod vodom.',
+            'upload_date': '20211106',
+        }
    }, {
-        'url': 'https://www.aljazeera.com/features/2017/8/21/transforming-pakistans-buses-into-art',
-        'only_matching': True,
+        'url': 'https://balkans.aljazeera.net/videos/2021/11/6/djokovic-usao-u-finale-mastersa-u-parizu',
+        'info_dict': {
+            'id': '6280654936001',
+            'ext': 'mp4',
+            'title': 'Đoković ušao u finale Mastersa u Parizu',
+            'timestamp': 1636221686,
+            'description': 'Novak Đoković je u polufinalu Mastersa u Parizu nakon preokreta pobijedio Poljaka Huberta Hurkacza.',
+            'upload_date': '20211106',
+        },
     }]
-    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s'
+    BRIGHTCOVE_URL_RE = r'https?://players.brightcove.net/(?P<account>\d+)/(?P<player_id>[a-zA-Z0-9]+)_(?P<embed>[^/]+)/index.html\?videoId=(?P<id>\d+)'
 
     def _real_extract(self, url):
-        post_type, name = self._match_valid_url(url).groups()
+        base, post_type, id = self._match_valid_url(url).groups()
+        wp = {
+            'balkans.aljazeera.net': 'ajb',
+            'chinese.aljazeera.net': 'chinese',
+            'mubasher.aljazeera.net': 'ajm',
+        }.get(base) or 'aje'
         post_type = {
             'features': 'post',
             'program': 'episode',
+            'programs': 'episode',
             'videos': 'video',
+            'news': 'news',
         }[post_type.split('/')[0]]
         video = self._download_json(
-            'https://www.aljazeera.com/graphql', name, query={
+            f'https://{base}/graphql', id, query={
+                'wp-site': wp,
                 'operationName': 'ArchipelagoSingleArticleQuery',
                 'variables': json.dumps({
-                    'name': name,
+                    'name': id,
                     'postType': post_type,
                 }),
             }, headers={
-                'wp-site': 'aje',
-            })['data']['article']['video']
-        video_id = video['id']
-        account_id = video.get('accountId') or '665003303001'
-        player_id = video.get('playerId') or 'BkeSH5BDb'
-        return self.url_result(
-            self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id),
-            'BrightcoveNew', video_id)
+                'wp-site': wp,
+            })
+        video = try_get(video, lambda x: x['data']['article']['video']) or {}
+        video_id = video.get('id')
+        account = video.get('accountId') or '911432371001'
+        player_id = video.get('playerId') or 'csvTfAlKW'
+        embed = 'default'
+
+        if video_id is None:
+            webpage = self._download_webpage(url, id)
+
+            account, player_id, embed, video_id = self._search_regex(self.BRIGHTCOVE_URL_RE, webpage, 'video id',
+                                                                     group=(1, 2, 3, 4), default=(None, None, None, None))
+
+        if video_id is None:
+            return {
+                '_type': 'url_transparent',
+                'url': url,
+                'ie_key': 'Generic'
+            }
+
+        return {
+            '_type': 'url_transparent',
+            'url': f'https://players.brightcove.net/{account}/{player_id}_{embed}/index.html?videoId={video_id}',
+            'ie_key': 'BrightcoveNew'
+        }
diff --git a/hypervideo_dl/extractor/allocine.py b/hypervideo_dl/extractor/allocine.py
index cd533ac..403a277 100644
--- a/hypervideo_dl/extractor/allocine.py
+++ b/hypervideo_dl/extractor/allocine.py
@@ -7,6 +7,7 @@ from ..utils import (
     int_or_none,
     qualities,
     remove_end,
+    strip_or_none,
     try_get,
     unified_timestamp,
     url_basename,
@@ -102,10 +103,7 @@ class AllocineIE(InfoExtractor):
             video_id = display_id
             media_data = self._download_json(
                 'http://www.allocine.fr/ws/AcVisiondataV5.ashx?media=%s' % video_id, display_id)
-            title = remove_end(
-                self._html_search_regex(
-                    r'(?s)<title>(.+?)</title>', webpage, 'title').strip(),
-                ' - AlloCiné')
+            title = remove_end(strip_or_none(self._html_extract_title(webpage)), ' - AlloCiné')
             for key, value in media_data['video'].items():
                 if not key.endswith('Path'):
                     continue
diff --git a/hypervideo_dl/extractor/alsace20tv.py b/hypervideo_dl/extractor/alsace20tv.py
new file mode 100644
index 0000000..4aae6fe
--- /dev/null
+++ b/hypervideo_dl/extractor/alsace20tv.py
@@ -0,0 +1,87 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    dict_get,
+    get_element_by_class,
+    int_or_none,
+    unified_strdate,
+    url_or_none,
+)
+
+
+class Alsace20TVBaseIE(InfoExtractor):
+    def _extract_video(self, video_id, url=None):
+        info = self._download_json(
+            'https://www.alsace20.tv/visionneuse/visio_v9_js.php?key=%s&habillage=0&mode=html' % (video_id, ),
+            video_id) or {}
+        title = info.get('titre')
+
+        formats = []
+        for res, fmt_url in (info.get('files') or {}).items():
+            formats.extend(
+                self._extract_smil_formats(fmt_url, video_id, fatal=False)
+                if '/smil:_' in fmt_url
+                else self._extract_mpd_formats(fmt_url, video_id, mpd_id=res, fatal=False))
+        self._sort_formats(formats)
+
+        webpage = (url and self._download_webpage(url, video_id, fatal=False)) or ''
+        thumbnail = url_or_none(dict_get(info, ('image', 'preview', )) or self._og_search_thumbnail(webpage))
+        upload_date = self._search_regex(r'/(\d{6})_', thumbnail, 'upload_date', default=None)
+        upload_date = unified_strdate('20%s-%s-%s' % (upload_date[:2], upload_date[2:4], upload_date[4:])) if upload_date else None
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'description': clean_html(get_element_by_class('wysiwyg', webpage)),
+            'upload_date': upload_date,
+            'thumbnail': thumbnail,
+            'duration': int_or_none(self._og_search_property('video:duration', webpage) if webpage else None),
+            'view_count': int_or_none(info.get('nb_vues')),
+        }
+
+
+class Alsace20TVIE(Alsace20TVBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?alsace20\.tv/(?:[\w-]+/)+[\w-]+-(?P<id>[\w]+)'
+    _TESTS = [{
+        'url': 'https://www.alsace20.tv/VOD/Actu/JT/Votre-JT-jeudi-3-fevrier-lyNHCXpYJh.html',
+        'info_dict': {
+            'id': 'lyNHCXpYJh',
+            'ext': 'mp4',
+            'description': 'md5:fc0bc4a0692d3d2dba4524053de4c7b7',
+            'title': 'Votre JT du jeudi 3 février',
+            'upload_date': '20220203',
+            'thumbnail': r're:https?://.+\.jpg',
+            'duration': 1073,
+            'view_count': int,
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        return self._extract_video(video_id, url)
+
+
+class Alsace20TVEmbedIE(Alsace20TVBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?alsace20\.tv/emb/(?P<id>[\w]+)'
+    _TESTS = [{
+        'url': 'https://www.alsace20.tv/emb/lyNHCXpYJh',
+        # 'md5': 'd91851bf9af73c0ad9b2cdf76c127fbb',
+        'info_dict': {
+            'id': 'lyNHCXpYJh',
+            'ext': 'mp4',
+            'title': 'Votre JT du jeudi 3 février',
+            'upload_date': '20220203',
+            'thumbnail': r're:https?://.+\.jpg',
+            'view_count': int,
+        },
+        'params': {
+            'format': 'bestvideo',
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        return self._extract_video(video_id)
diff --git a/hypervideo_dl/extractor/alura.py b/hypervideo_dl/extractor/alura.py
index f5325de..d2e2df2 100644
--- a/hypervideo_dl/extractor/alura.py
+++ b/hypervideo_dl/extractor/alura.py
@@ -74,14 +74,7 @@ class AluraIE(InfoExtractor):
                 "formats": formats
             }
 
-    def _real_initialize(self):
-        self._login()
-
-    def _login(self):
-        username, password = self._get_login_info()
-        if username is None:
-            return
-        pass
+    def _perform_login(self, username, password):
 
         login_page = self._download_webpage(
             self._LOGIN_URL, None, 'Downloading login popup')
 
diff --git a/hypervideo_dl/extractor/amazon.py b/hypervideo_dl/extractor/amazon.py
new file mode 100644
index 0000000..07b1b18
--- /dev/null
+++ b/hypervideo_dl/extractor/amazon.py
@@ -0,0 +1,53 @@
+# coding: utf-8
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class AmazonStoreIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?amazon\.(?:[a-z]{2,3})(?:\.[a-z]{2})?/(?:[^/]+/)?(?:dp|gp/product)/(?P<id>[^/&#$?]+)'
+
+    _TESTS = [{
+        'url': 'https://www.amazon.co.uk/dp/B098XNCHLD/',
+        'info_dict': {
+            'id': 'B098XNCHLD',
+            'title': 'md5:5f3194dbf75a8dcfc83079bd63a2abed',
+        },
+        'playlist_mincount': 1,
+        'playlist': [{
+            'info_dict': {
+                'id': 'A1F83G8C2ARO7P',
+                'ext': 'mp4',
+                'title': 'mcdodo usb c cable 100W 5a',
+                'thumbnail': r're:^https?://.*\.jpg$',
+            },
+        }]
+    }, {
+        'url': 'https://www.amazon.in/Sony-WH-1000XM4-Cancelling-Headphones-Bluetooth/dp/B0863TXGM3',
+        'info_dict': {
+            'id': 'B0863TXGM3',
+            'title': 'md5:b0bde4881d3cfd40d63af19f7898b8ff',
+        },
+        'playlist_mincount': 4,
+    }, {
+        'url': 'https://www.amazon.com/dp/B0845NXCXF/',
+        'info_dict': {
+            'id': 'B0845NXCXF',
+            'title': 'md5:2145cd4e3c7782f1ee73649a3cff1171',
+        },
+        'playlist_mincount': 1,
+    }]
+
+    def _real_extract(self, url):
+        id = self._match_id(url)
+        webpage = self._download_webpage(url, id)
+        data_json = self._parse_json(self._html_search_regex(r'var\s?obj\s?=\s?jQuery\.parseJSON\(\'(.*)\'\)', webpage, 'data'), id)
+        entries = [{
+            'id': video['marketPlaceID'],
+            'url': video['url'],
+            'title': video.get('title'),
+            'thumbnail': video.get('thumbUrl') or video.get('thumb'),
+            'duration': video.get('durationSeconds'),
+            'height': int_or_none(video.get('videoHeight')),
+            'width': int_or_none(video.get('videoWidth')),
+        } for video in (data_json.get('videos') or []) if video.get('isVideo') and video.get('url')]
+        return self.playlist_result(entries, playlist_id=id, playlist_title=data_json['title'])
diff --git a/hypervideo_dl/extractor/animelab.py b/hypervideo_dl/extractor/animelab.py
index 4fb7ee4..1c2cc47 100644
--- a/hypervideo_dl/extractor/animelab.py
+++ b/hypervideo_dl/extractor/animelab.py
@@ -15,25 +15,21 @@ from ..compat import compat_HTTPError
 
 
 class AnimeLabBaseIE(InfoExtractor):
-    _LOGIN_REQUIRED = True
     _LOGIN_URL = 'https://www.animelab.com/login'
     _NETRC_MACHINE = 'animelab'
+    _LOGGED_IN = False
 
-    def _login(self):
-        def is_logged_in(login_webpage):
-            return 'Sign In' not in login_webpage
+    def _is_logged_in(self, login_page=None):
+        if not self._LOGGED_IN:
+            if not login_page:
+                login_page = self._download_webpage(self._LOGIN_URL, None, 'Downloading login page')
+            AnimeLabBaseIE._LOGGED_IN = 'Sign In' not in login_page
+        return self._LOGGED_IN
 
-        login_page = self._download_webpage(
-            self._LOGIN_URL, None, 'Downloading login page')
-
-        # Check if already logged in
-        if is_logged_in(login_page):
+    def _perform_login(self, username, password):
+        if self._is_logged_in():
             return
 
-        (username, password) = self._get_login_info()
-        if username is None and self._LOGIN_REQUIRED:
-            self.raise_login_required('Login is required to access any AnimeLab content')
-
         login_form = {
             'email': username,
             'password': password,
@@ -47,17 +43,14 @@ class AnimeLabBaseIE(InfoExtractor):
         except ExtractorError as e:
             if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
                 raise ExtractorError('Unable to log in (wrong credentials?)', expected=True)
-            else:
-                raise
+            raise
 
-        # if login was successful
-        if is_logged_in(response):
-            return
-
-        raise ExtractorError('Unable to login (cannot verify if logged in)')
+        if not self._is_logged_in(response):
+            raise ExtractorError('Unable to login (cannot verify if logged in)')
 
     def _real_initialize(self):
-        self._login()
+        if not self._is_logged_in():
+            self.raise_login_required('Login is required to access any AnimeLab content')
 
 
 class AnimeLabIE(AnimeLabBaseIE):
diff --git a/hypervideo_dl/extractor/animeondemand.py b/hypervideo_dl/extractor/animeondemand.py
index 54e097d..2e674d5 100644
--- a/hypervideo_dl/extractor/animeondemand.py
+++ b/hypervideo_dl/extractor/animeondemand.py
@@ -8,6 +8,7 @@ from ..utils import (
     determine_ext,
     extract_attributes,
     ExtractorError,
+    join_nonempty,
     url_or_none,
     urlencode_postdata,
     urljoin,
@@ -52,11 +53,7 @@ class AnimeOnDemandIE(InfoExtractor):
         'only_matching': True,
     }]
 
-    def _login(self):
-        username, password = self._get_login_info()
-        if username is None:
-            return
-
+    def _perform_login(self, username, password):
         login_page = self._download_webpage(
             self._LOGIN_URL, None, 'Downloading login page')
 
@@ -92,9 +89,6 @@ class AnimeOnDemandIE(InfoExtractor):
                 raise ExtractorError('Unable to login: %s' % error, expected=True)
             raise ExtractorError('Unable to log in')
 
-    def _real_initialize(self):
-        self._login()
-
     def _real_extract(self, url):
         anime_id = self._match_id(url)
 
@@ -140,15 +134,8 @@ class AnimeOnDemandIE(InfoExtractor):
                     kind = self._search_regex(
                         r'videomaterialurl/\d+/([^/]+)/',
                         playlist_url, 'media kind', default=None)
-                    format_id_list = []
-                    if lang:
-                        format_id_list.append(lang)
-                    if kind:
-                        format_id_list.append(kind)
-                    if not format_id_list and num is not None:
-                        format_id_list.append(compat_str(num))
-                    format_id = '-'.join(format_id_list)
-                    format_note = ', '.join(filter(None, (kind, lang_note)))
+                    format_id = join_nonempty(lang, kind) if lang or kind else str(num)
+                    format_note = join_nonempty(kind, lang_note, delim=', ')
                     item_id_list = []
                     if format_id:
                         item_id_list.append(format_id)
@@ -195,12 +182,10 @@ class AnimeOnDemandIE(InfoExtractor):
                         if not file_:
                             continue
                         ext = determine_ext(file_)
-                        format_id_list = [lang, kind]
-                        if ext == 'm3u8':
-                            format_id_list.append('hls')
-                        elif source.get('type') == 'video/dash' or ext == 'mpd':
-                            format_id_list.append('dash')
-                        format_id = '-'.join(filter(None, format_id_list))
+                        format_id = join_nonempty(
+                            lang, kind,
+                            'hls' if ext == 'm3u8' else None,
+                            'dash' if source.get('type') == 'video/dash' or ext == 'mpd' else None)
                         if ext == 'm3u8':
                             file_formats = self._extract_m3u8_formats(
                                 file_, video_id, 'mp4',
diff --git a/hypervideo_dl/extractor/ant1newsgr.py b/hypervideo_dl/extractor/ant1newsgr.py
new file mode 100644
index 0000000..1075b46
--- /dev/null
+++ b/hypervideo_dl/extractor/ant1newsgr.py
@@ -0,0 +1,143 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import urllib.parse
+
+from .common import InfoExtractor
+from ..utils import (
+    HEADRequest,
+    ExtractorError,
+    determine_ext,
+    scale_thumbnails_to_max_format_width,
+    unescapeHTML,
+)
+
+
+class Ant1NewsGrBaseIE(InfoExtractor):
+    def _download_and_extract_api_data(self, video_id, netloc, cid=None):
+        url = f'{self.http_scheme()}//{netloc}{self._API_PATH}'
+        info = self._download_json(url, video_id, query={'cid': cid or video_id})
+        try:
+            source = info['url']
+        except KeyError:
+            raise ExtractorError('no source found for %s' % video_id)
+        formats, subs = (self._extract_m3u8_formats_and_subtitles(source, video_id, 'mp4')
+                         if determine_ext(source) == 'm3u8' else ([{'url': source}], {}))
+        self._sort_formats(formats)
+        thumbnails = scale_thumbnails_to_max_format_width(
+            formats, [{'url': info['thumb']}], r'(?<=/imgHandler/)\d+')
+        return {
+            'id': video_id,
+            'title': info.get('title'),
+            'thumbnails': thumbnails,
+            'formats': formats,
+            'subtitles': subs,
+        }
+
+
+class Ant1NewsGrWatchIE(Ant1NewsGrBaseIE):
+    IE_NAME = 'ant1newsgr:watch'
+    IE_DESC = 'ant1news.gr videos'
+    _VALID_URL = r'https?://(?P<netloc>(?:www\.)?ant1news\.gr)/watch/(?P<id>\d+)/'
+    _API_PATH = '/templates/data/player'
+
+    _TESTS = [{
+        'url': 'https://www.ant1news.gr/watch/1506168/ant1-news-09112021-stis-18-45',
+        'md5': '95925e6b32106754235f2417e0d2dfab',
+        'info_dict': {
+            'id': '1506168',
+            'ext': 'mp4',
+            'title': 'md5:0ad00fa66ecf8aa233d26ab0dba7514a',
+            'description': 'md5:18665af715a6dcfeac1d6153a44f16b0',
+            'thumbnail': 'https://ant1media.azureedge.net/imgHandler/640/26d46bf6-8158-4f02-b197-7096c714b2de.jpg',
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id, netloc = self._match_valid_url(url).group('id', 'netloc')
+        webpage = self._download_webpage(url, video_id)
+        info = self._download_and_extract_api_data(video_id, netloc)
+        info['description'] = self._og_search_description(webpage)
+        return info
+
+
+class Ant1NewsGrArticleIE(Ant1NewsGrBaseIE):
+    IE_NAME = 'ant1newsgr:article'
+    IE_DESC = 'ant1news.gr articles'
+    _VALID_URL = r'https?://(?:www\.)?ant1news\.gr/[^/]+/article/(?P<id>\d+)/'
+
+    _TESTS = [{
+        'url': 'https://www.ant1news.gr/afieromata/article/549468/o-tzeims-mpont-sta-meteora-oi-apeiles-kai-o-xesikomos-ton-kalogeron',
+        'md5': '294f18331bb516539d72d85a82887dcc',
+        'info_dict': {
+            'id': '_xvg/m_cmbatw=',
+            'ext': 'mp4',
+            'title': 'md5:a93e8ecf2e4073bfdffcb38f59945411',
+            'timestamp': 1603092840,
+            'upload_date': '20201019',
+            'thumbnail': 'https://ant1media.azureedge.net/imgHandler/640/756206d2-d640-40e2-b201-3555abdfc0db.jpg',
+        },
+    }, {
+        'url': 'https://ant1news.gr/Society/article/620286/symmoria-anilikon-dikigoros-thymaton-ithelan-na-toys-apoteleiosoyn',
+        'info_dict': {
+            'id': '620286',
+            'title': 'md5:91fe569e952e4d146485740ae927662b',
+        },
+        'playlist_mincount': 2,
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        info = self._search_json_ld(webpage, video_id, expected_type='NewsArticle')
+        embed_urls = list(Ant1NewsGrEmbedIE._extract_urls(webpage))
+        if not embed_urls:
+            raise ExtractorError('no videos found for %s' % video_id, expected=True)
+        return self.playlist_from_matches(
+            embed_urls, video_id, info.get('title'), ie=Ant1NewsGrEmbedIE.ie_key(),
+            video_kwargs={'url_transparent': True, 'timestamp': info.get('timestamp')})
+
+
+class Ant1NewsGrEmbedIE(Ant1NewsGrBaseIE):
+    IE_NAME = 'ant1newsgr:embed'
+    IE_DESC = 'ant1news.gr embedded videos'
+    _BASE_PLAYER_URL_RE = r'(?:https?:)?//(?:[a-zA-Z0-9\-]+\.)?(?:antenna|ant1news)\.gr/templates/pages/player'
+    _VALID_URL = rf'{_BASE_PLAYER_URL_RE}\?([^#]+&)?cid=(?P<id>[^#&]+)'
+    _API_PATH = '/news/templates/data/jsonPlayer'
+
+    _TESTS = [{
+        'url': 'https://www.antenna.gr/templates/pages/player?cid=3f_li_c_az_jw_y_u=&w=670&h=377',
+        'md5': 'dfc58c3a11a5a9aad2ba316ed447def3',
+        'info_dict': {
+            'id': '3f_li_c_az_jw_y_u=',
+            'ext': 'mp4',
+            'title': 'md5:a30c93332455f53e1e84ae0724f0adf7',
+            'thumbnail': 'https://ant1media.azureedge.net/imgHandler/640/bbe31201-3f09-4a4e-87f5-8ad2159fffe2.jpg',
+        },
+    }]
+
+    @classmethod
+    def _extract_urls(cls, webpage):
+        _EMBED_URL_RE = rf'{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+'
+        _EMBED_RE = rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>{_EMBED_URL_RE})(?P=_q1)'
+        for mobj in re.finditer(_EMBED_RE, webpage):
+            url = unescapeHTML(mobj.group('url'))
+            if not cls.suitable(url):
+                continue
+            yield url
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        canonical_url = self._request_webpage(
+            HEADRequest(url), video_id,
+            note='Resolve canonical player URL',
+            errnote='Could not resolve canonical player URL').geturl()
+        _, netloc, _, _, query, _ = urllib.parse.urlparse(canonical_url)
+        cid = urllib.parse.parse_qs(query)['cid'][0]
+
+        return self._download_and_extract_api_data(video_id, netloc, cid=cid)
diff --git a/hypervideo_dl/extractor/anvato.py b/hypervideo_dl/extractor/anvato.py
index b82f0b5..686d453 100644
--- a/hypervideo_dl/extractor/anvato.py
+++ b/hypervideo_dl/extractor/anvato.py
@@ -16,6 +16,7 @@ from ..utils import (
     determine_ext,
     intlist_to_bytes,
     int_or_none,
+    join_nonempty,
     strip_jsonp,
     unescapeHTML,
     unsmuggle_url,
@@ -303,13 +304,13 @@ class AnvatoIE(InfoExtractor):
                 tbr = int_or_none(published_url.get('kbps'))
                 a_format = {
                     'url': video_url,
-                    'format_id': ('-'.join(filter(None, ['http', published_url.get('cdn_name')]))).lower(),
-                    'tbr': tbr if tbr != 0 else None,
+                    'format_id': join_nonempty('http', published_url.get('cdn_name')).lower(),
+                    'tbr': tbr or None,
                 }
 
                 if media_format == 'm3u8' and tbr is not None:
                     a_format.update({
-                        'format_id': '-'.join(filter(None, ['hls', compat_str(tbr)])),
+                        'format_id': join_nonempty('hls', tbr),
                         'ext': 'mp4',
                     })
                 elif media_format == 'm3u8-variant' or ext == 'm3u8':
diff --git a/hypervideo_dl/extractor/aparat.py b/hypervideo_dl/extractor/aparat.py
index da06a3c..1057233 100644
--- a/hypervideo_dl/extractor/aparat.py
+++ b/hypervideo_dl/extractor/aparat.py
@@ -33,19 +33,22 @@ class AparatIE(InfoExtractor):
         'only_matching': True,
     }]
 
+    def _parse_options(self, webpage, video_id, fatal=True):
+        return self._parse_json(self._search_regex(
+            r'options\s*=\s*({.+?})\s*;', webpage, 'options', default='{}'), video_id)
+
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        # Provides more metadata
+        # If available, provides more metadata
         webpage = self._download_webpage(url, video_id, fatal=False)
+        options = self._parse_options(webpage, video_id, fatal=False)
 
-        if not webpage:
+        if not options:
             webpage = self._download_webpage(
                 'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id,
-                video_id)
-
-        options = self._parse_json(self._search_regex(
-            r'options\s*=\s*({.+?})\s*;', webpage, 'options'), video_id)
+                video_id, 'Downloading embed webpage')
+            options = self._parse_options(webpage, video_id)
 
         formats = []
         for sources in (options.get('multiSRC') or []):
diff --git a/hypervideo_dl/extractor/applepodcasts.py b/hypervideo_dl/extractor/applepodcasts.py
index 6a74de7..9139ff7 100644
--- a/hypervideo_dl/extractor/applepodcasts.py
+++ b/hypervideo_dl/extractor/applepodcasts.py
@@ -3,7 +3,9 @@ from __future__ import unicode_literals
 
 from .common import InfoExtractor
 from ..utils import (
+    clean_html,
     clean_podcast_url,
+    get_element_by_class,
     int_or_none,
     parse_iso8601,
     try_get,
@@ -14,16 +16,17 @@ class ApplePodcastsIE(InfoExtractor):
     _VALID_URL = r'https?://podcasts\.apple\.com/(?:[^/]+/)?podcast(?:/[^/]+){1,2}.*?\bi=(?P<id>\d+)'
     _TESTS = [{
         'url': 'https://podcasts.apple.com/us/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777',
-        'md5': 'df02e6acb11c10e844946a39e7222b08',
+        'md5': '41dc31cd650143e530d9423b6b5a344f',
         'info_dict': {
             'id': '1000482637777',
             'ext': 'mp3',
             'title': '207 - Whitney Webb Returns',
-            'description': 'md5:13a73bade02d2e43737751e3987e1399',
+            'description': 'md5:75ef4316031df7b41ced4e7b987f79c6',
             'upload_date': '20200705',
-            'timestamp': 1593921600,
-            'duration': 6425,
+            'timestamp': 1593932400,
+            'duration': 6454,
             'series': 'The Tim Dillon Show',
+            'thumbnail': 're:.+[.](png|jpe?g|webp)',
         }
     }, {
         'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777',
@@ -39,24 +42,47 @@ class ApplePodcastsIE(InfoExtractor):
     def _real_extract(self, url):
         episode_id = self._match_id(url)
         webpage = self._download_webpage(url, episode_id)
-        ember_data = self._parse_json(self._search_regex(
-            r'id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<',
-            webpage, 'ember data'), episode_id)
-        ember_data = ember_data.get(episode_id) or ember_data
-        episode = ember_data['data']['attributes']
+        episode_data = {}
+        ember_data = {}
+        # new page type 2021-11
+        amp_data = self._parse_json(self._search_regex(
+            r'(?s)id="shoebox-media-api-cache-amp-podcasts"[^>]*>\s*({.+?})\s*<',
+            webpage, 'AMP data', default='{}'), episode_id, fatal=False) or {}
+        amp_data = try_get(amp_data,
+                           lambda a: self._parse_json(
+                               next(a[x] for x in iter(a) if episode_id in x),
+                               episode_id),
+                           dict) or {}
+        amp_data = amp_data.get('d') or []
+        episode_data = try_get(
+            amp_data,
+            lambda a: next(x for x in a
+                           if x['type'] == 'podcast-episodes' and x['id'] == episode_id),
+            dict)
+        if not episode_data:
+            # try pre 2021-11 page type: TODO: consider deleting if no longer used
+            ember_data = self._parse_json(self._search_regex(
+                r'(?s)id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<',
+                webpage, 'ember data'), episode_id) or {}
+            ember_data = ember_data.get(episode_id) or ember_data
+            episode_data = try_get(ember_data, lambda x: x['data'], dict)
+        episode = episode_data['attributes']
         description = episode.get('description') or {}
 
         series = None
-        for inc in (ember_data.get('included') or []):
+        for inc in (amp_data or ember_data.get('included') or []):
             if inc.get('type') == 'media/podcast':
                 series = try_get(inc, lambda x: x['attributes']['name'])
+        series = series or clean_html(get_element_by_class('podcast-header__identity', webpage))
 
         return {
             'id': episode_id,
-            'title': episode['name'],
+            'title': episode.get('name'),
             'url': clean_podcast_url(episode['assetUrl']),
             'description': description.get('standard') or description.get('short'),
             'timestamp': parse_iso8601(episode.get('releaseDateTime')),
             'duration': int_or_none(episode.get('durationInMilliseconds'), 1000),
             'series': series,
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'vcodec': 'none',
         }
diff --git a/hypervideo_dl/extractor/archiveorg.py b/hypervideo_dl/extractor/archiveorg.py
index d90fcb1..2ab3c1b 100644
--- a/hypervideo_dl/extractor/archiveorg.py
+++ b/hypervideo_dl/extractor/archiveorg.py
@@ -3,33 +3,37 @@ from __future__ import unicode_literals
 
 import re
 import json
-
 from .common import InfoExtractor
-from .youtube import YoutubeIE
+from .youtube import YoutubeIE, YoutubeBaseInfoExtractor
 from ..compat import (
     compat_urllib_parse_unquote,
     compat_urllib_parse_unquote_plus,
     compat_HTTPError
 )
 from ..utils import (
+    bug_reports_message,
     clean_html,
-    determine_ext,
     dict_get,
     extract_attributes,
     ExtractorError,
+    get_element_by_id,
     HEADRequest,
     int_or_none,
+    join_nonempty,
     KNOWN_EXTENSIONS,
     merge_dicts,
     mimetype2ext,
+    orderedSet,
     parse_duration,
     parse_qs,
-    RegexNotFoundError,
     str_to_int,
     str_or_none,
+    traverse_obj,
     try_get,
     unified_strdate,
     unified_timestamp,
+    urlhandle_detect_ext,
+    url_or_none
 )
 
 
@@ -61,7 +65,7 @@ class ArchiveOrgIE(InfoExtractor):
             'description': 'md5:43a603fd6c5b4b90d12a96b921212b9c',
             'uploader': 'yorkmba99@hotmail.com',
             'timestamp': 1387699629,
-            'upload_date': "20131222",
+            'upload_date': '20131222',
         },
     }, {
        'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
@@ -147,8 +151,7 @@ class ArchiveOrgIE(InfoExtractor):
 
         # Archive.org metadata API doesn't clearly demarcate playlist entries
        # or subtitle tracks, so we get them from the embeddable player.
-        embed_page = self._download_webpage(
-            'https://archive.org/embed/' + identifier, identifier)
+        embed_page = self._download_webpage(f'https://archive.org/embed/{identifier}', identifier)
         playlist = self._playlist_data(embed_page)

         entries = {}
@@ -163,17 +166,17 @@
                 'thumbnails': [],
                 'artist': p.get('artist'),
                 'track': p.get('title'),
-                'subtitles': {}}
+                'subtitles': {},
+            }

             for track in p.get('tracks', []):
                 if track['kind'] != 'subtitles':
                     continue
-                entries[p['orig']][track['label']] = {
-                    'url': 'https://archive.org/' + track['file'].lstrip('/')}
+                entries[p['orig']][track['label']] = {
+                    'url': 'https://archive.org/' + track['file'].lstrip('/')
+                }

-        metadata = self._download_json(
-            'http://archive.org/metadata/' + identifier, identifier)
+        metadata = self._download_json('http://archive.org/metadata/' + identifier, identifier)
         m = metadata['metadata']
         identifier = m['identifier']
@@ -186,7 +189,7 @@
             'license': m.get('licenseurl'),
             'release_date': unified_strdate(m.get('date')),
             'timestamp': unified_timestamp(dict_get(m, ['publicdate', 'addeddate'])),
-            'webpage_url': 'https://archive.org/details/' + identifier,
+            'webpage_url': f'https://archive.org/details/{identifier}',
             'location': m.get('venue'),
             'release_year': int_or_none(m.get('year'))}
@@ -204,7 +207,7 @@
                 'discnumber': int_or_none(f.get('disc')),
                 'release_year': int_or_none(f.get('year'))})
             entry = entries[f['name']]
-        elif f.get('original') in entries:
+        elif traverse_obj(f, 'original', expected_type=str) in entries:
             entry = entries[f['original']]
         else:
             continue
@@ -227,13 +230,12 @@
                 'filesize': int_or_none(f.get('size')),
                 'protocol': 'https'})

-        # Sort available formats by filesize
         for entry in entries.values():
-            entry['formats'] = list(sorted(entry['formats'], key=lambda x: x.get('filesize', -1)))
+            self._sort_formats(entry['formats'])

         if len(entries) == 1:
             # If there's only one item, use it as the main info dict
-            only_video = entries[list(entries.keys())[0]]
+            only_video = next(iter(entries.values()))
             if entry_id:
                 info = merge_dicts(only_video, info)
             else:
@@ -258,19 +260,19 @@

 class YoutubeWebArchiveIE(InfoExtractor):
     IE_NAME = 'web.archive:youtube'
-    IE_DESC = 'web.archive.org saved youtube videos'
-    _VALID_URL = r"""(?x)^
-        (?:https?://)?web\.archive\.org/
-            (?:web/)?
-            (?:[0-9A-Za-z_*]+/)?  # /web and the version index is optional
-
-        (?:https?(?::|%3[Aa])//)?
-        (?:
-            (?:\w+\.)?youtube\.com/watch(?:\?|%3[fF])(?:[^\#]+(?:&|%26))?v(?:=|%3[dD])  # Youtube URL
-            |(wayback-fakeurl\.archive\.org/yt/)  # Or the internal fake url
-        )
-        (?P<id>[0-9A-Za-z_-]{11})(?:%26|\#|&|$)
-    """
+    IE_DESC = 'web.archive.org saved youtube videos, "ytarchive:" prefix'
+    _VALID_URL = r'''(?x)(?:(?P<prefix>ytarchive:)|
+            (?:https?://)?web\.archive\.org/
+                (?:web/)?(?:(?P<date>[0-9]{14})?[0-9A-Za-z_*]*/)?  # /web and the version index is optional
+            (?:https?(?::|%3[Aa])//)?(?:
+                (?:\w+\.)?youtube\.com(?::(?:80|443))?/watch(?:\.php)?(?:\?|%3[fF])(?:[^\#]+(?:&|%26))?v(?:=|%3[dD])  # Youtube URL
+                |(?:wayback-fakeurl\.archive\.org/yt/)  # Or the internal fake url
+            )
+        )(?P<id>[0-9A-Za-z_-]{11})
+        (?(prefix)
+            (?::(?P<date2>[0-9]{14}))?$|
+            (?:%26|[#&]|$)
+        )'''

     _TESTS = [
         {
@@ -278,141 +280,395 @@
             'url': 'https://web.archive.org/web/20150415002341/https://www.youtube.com/watch?v=aYAGB11YrSs',
             'info_dict': {
                 'id': 'aYAGB11YrSs',
                 'ext': 'webm',
-                'title': 'Team Fortress 2 - Sandviches!'
+ 'title': 'Team Fortress 2 - Sandviches!', + 'description': 'md5:4984c0f9a07f349fc5d8e82ab7af4eaf', + 'upload_date': '20110926', + 'uploader': 'Zeurel', + 'channel_id': 'UCukCyHaD-bK3in_pKpfH9Eg', + 'duration': 32, + 'uploader_id': 'Zeurel', + 'uploader_url': 'http://www.youtube.com/user/Zeurel' } - }, - { + }, { # Internal link 'url': 'https://web.archive.org/web/2oe/http://wayback-fakeurl.archive.org/yt/97t7Xj_iBv0', 'info_dict': { 'id': '97t7Xj_iBv0', 'ext': 'mp4', - 'title': 'How Flexible Machines Could Save The World' + 'title': 'Why Machines That Bend Are Better', + 'description': 'md5:00404df2c632d16a674ff8df1ecfbb6c', + 'upload_date': '20190312', + 'uploader': 'Veritasium', + 'channel_id': 'UCHnyfMqiRRG1u-2MsSQLbXA', + 'duration': 771, + 'uploader_id': '1veritasium', + 'uploader_url': 'http://www.youtube.com/user/1veritasium' } - }, - { - # Video from 2012, webm format itag 45. + }, { + # Video from 2012, webm format itag 45. Newest capture is deleted video, with an invalid description. + # Should use the date in the link. Title ends with '- Youtube'. Capture has description in eow-description 'url': 'https://web.archive.org/web/20120712231619/http://www.youtube.com/watch?v=AkhihxRKcrs&gl=US&hl=en', 'info_dict': { 'id': 'AkhihxRKcrs', 'ext': 'webm', - 'title': 'Limited Run: Mondo\'s Modern Classic 1 of 3 (SDCC 2012)' + 'title': 'Limited Run: Mondo\'s Modern Classic 1 of 3 (SDCC 2012)', + 'upload_date': '20120712', + 'duration': 398, + 'description': 'md5:ff4de6a7980cb65d951c2f6966a4f2f3', + 'uploader_id': 'machinima', + 'uploader_url': 'http://www.youtube.com/user/machinima' } - }, - { - # Old flash-only video. Webpage title starts with "YouTube - ". + }, { + # FLV video. Video file URL does not provide itag information 'url': 'https://web.archive.org/web/20081211103536/http://www.youtube.com/watch?v=jNQXAC9IVRw', 'info_dict': { 'id': 'jNQXAC9IVRw', - 'ext': 'unknown_video', - 'title': 'Me at the zoo' + 'ext': 'flv', + 'title': 'Me at the zoo', + 'upload_date': '20050423', + 'channel_id': 'UC4QobU6STFB0P71PMvOGN5A', + 'duration': 19, + 'description': 'md5:10436b12e07ac43ff8df65287a56efb4', + 'uploader_id': 'jawed', + 'uploader_url': 'http://www.youtube.com/user/jawed' } - }, - { - # Flash video with .flv extension (itag 34). Title has prefix "YouTube -" - # Title has some weird unicode characters too. + }, { 'url': 'https://web.archive.org/web/20110712231407/http://www.youtube.com/watch?v=lTx3G6h2xyA', 'info_dict': { 'id': 'lTx3G6h2xyA', 'ext': 'flv', - 'title': 'Madeon - Pop Culture (live mashup)' + 'title': 'Madeon - Pop Culture (live mashup)', + 'upload_date': '20110711', + 'uploader': 'Madeon', + 'channel_id': 'UCqMDNf3Pn5L7pcNkuSEeO3w', + 'duration': 204, + 'description': 'md5:f7535343b6eda34a314eff8b85444680', + 'uploader_id': 'itsmadeon', + 'uploader_url': 'http://www.youtube.com/user/itsmadeon' } - }, - { # Some versions of Youtube have have "YouTube" as page title in html (and later rewritten by js). - 'url': 'https://web.archive.org/web/http://www.youtube.com/watch?v=kH-G_aIBlFw', + }, { + # First capture is of dead video, second is the oldest from CDX response. + 'url': 'https://web.archive.org/https://www.youtube.com/watch?v=1JYutPM8O6E', 'info_dict': { - 'id': 'kH-G_aIBlFw', + 'id': '1JYutPM8O6E', 'ext': 'mp4', - 'title': 'kH-G_aIBlFw' - }, - 'expected_warnings': [ - 'unable to extract title', - ] - }, - { - # First capture is a 302 redirect intermediary page. 
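The rewritten _VALID_URL above hinges on a conditional regex group, (?(prefix)yes|no): a trailing :YYYYMMDDhhmmss date is only accepted when the ytarchive: prefix matched, while web.archive.org URLs instead end at %26, # or &. A reduced sketch of just that mechanic, using a hypothetical simplification rather than the full pattern:

import re

pat = r'(?:(?P<prefix>ytarchive:))?(?P<id>[0-9A-Za-z_-]{11})(?(prefix)(?::(?P<date2>[0-9]{14}))?$)'
m = re.match(pat, 'ytarchive:BaW_jenozKc:20050214000000')
print(m.group('id'), m.group('date2'))  # BaW_jenozKc 20050214000000
print(re.match(pat, 'BaW_jenozKc').group('id'))  # BaW_jenozKc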
- 'url': 'https://web.archive.org/web/20050214000000/http://www.youtube.com/watch?v=0altSZ96U4M', + 'title': 'Fake Teen Doctor Strikes AGAIN! - Weekly Weird News', + 'upload_date': '20160218', + 'channel_id': 'UCdIaNUarhzLSXGoItz7BHVA', + 'duration': 1236, + 'description': 'md5:21032bae736421e89c2edf36d1936947', + 'uploader_id': 'MachinimaETC', + 'uploader_url': 'http://www.youtube.com/user/MachinimaETC' + } + }, { + # First capture of dead video, capture date in link links to dead capture. + 'url': 'https://web.archive.org/web/20180803221945/https://www.youtube.com/watch?v=6FPhZJGvf4E', 'info_dict': { - 'id': '0altSZ96U4M', + 'id': '6FPhZJGvf4E', 'ext': 'mp4', - 'title': '0altSZ96U4M' + 'title': 'WTF: Video Games Still Launch BROKEN?! - T.U.G.S.', + 'upload_date': '20160219', + 'channel_id': 'UCdIaNUarhzLSXGoItz7BHVA', + 'duration': 798, + 'description': 'md5:a1dbf12d9a3bd7cb4c5e33b27d77ffe7', + 'uploader_id': 'MachinimaETC', + 'uploader_url': 'http://www.youtube.com/user/MachinimaETC' }, 'expected_warnings': [ - 'unable to extract title', + r'unable to download capture webpage \(it may not be archived\)' ] - }, - { + }, { # Very old YouTube page, has - YouTube in title. + 'url': 'http://web.archive.org/web/20070302011044/http://youtube.com/watch?v=-06-KB9XTzg', + 'info_dict': { + 'id': '-06-KB9XTzg', + 'ext': 'flv', + 'title': 'New Coin Hack!! 100% Safe!!' + } + }, { + 'url': 'web.archive.org/https://www.youtube.com/watch?v=dWW7qP423y8', + 'info_dict': { + 'id': 'dWW7qP423y8', + 'ext': 'mp4', + 'title': 'It\'s Bootleg AirPods Time.', + 'upload_date': '20211021', + 'channel_id': 'UC7Jwj9fkrf1adN4fMmTkpug', + 'channel_url': 'http://www.youtube.com/channel/UC7Jwj9fkrf1adN4fMmTkpug', + 'duration': 810, + 'description': 'md5:7b567f898d8237b256f36c1a07d6d7bc', + 'uploader': 'DankPods', + 'uploader_id': 'UC7Jwj9fkrf1adN4fMmTkpug', + 'uploader_url': 'http://www.youtube.com/channel/UC7Jwj9fkrf1adN4fMmTkpug' + } + }, { + # player response contains '};' See: https://github.com/ytdl-org/youtube-dl/issues/27093 + 'url': 'https://web.archive.org/web/20200827003909if_/http://www.youtube.com/watch?v=6Dh-RL__uN4', + 'info_dict': { + 'id': '6Dh-RL__uN4', + 'ext': 'mp4', + 'title': 'bitch lasagna', + 'upload_date': '20181005', + 'channel_id': 'UC-lHJZR3Gqxm24_Vd_AJ5Yw', + 'channel_url': 'http://www.youtube.com/channel/UC-lHJZR3Gqxm24_Vd_AJ5Yw', + 'duration': 135, + 'description': 'md5:2dbe4051feeff2dab5f41f82bb6d11d0', + 'uploader': 'PewDiePie', + 'uploader_id': 'PewDiePie', + 'uploader_url': 'http://www.youtube.com/user/PewDiePie' + } + }, { + 'url': 'https://web.archive.org/web/http://www.youtube.com/watch?v=kH-G_aIBlFw', + 'only_matching': True + }, { + 'url': 'https://web.archive.org/web/20050214000000_if/http://www.youtube.com/watch?v=0altSZ96U4M', + 'only_matching': True + }, { # Video not archived, only capture is unavailable video page 'url': 'https://web.archive.org/web/20210530071008/https://www.youtube.com/watch?v=lHJTf93HL1s&spfreload=10', - 'only_matching': True, - }, - { # Encoded url + 'only_matching': True + }, { # Encoded url 'url': 'https://web.archive.org/web/20120712231619/http%3A//www.youtube.com/watch%3Fgl%3DUS%26v%3DAkhihxRKcrs%26hl%3Den', - 'only_matching': True, - }, - { + 'only_matching': True + }, { 'url': 'https://web.archive.org/web/20120712231619/http%3A//www.youtube.com/watch%3Fv%3DAkhihxRKcrs%26gl%3DUS%26hl%3Den', - 'only_matching': True, - } + 'only_matching': True + }, { + 'url': 
'https://web.archive.org/web/20060527081937/http://www.youtube.com:80/watch.php?v=ELTFsLT73fA&search=soccer', + 'only_matching': True + }, { + 'url': 'https://web.archive.org/http://www.youtube.com:80/watch?v=-05VVye-ffg', + 'only_matching': True + }, { + 'url': 'ytarchive:BaW_jenozKc:20050214000000', + 'only_matching': True + }, { + 'url': 'ytarchive:BaW_jenozKc', + 'only_matching': True + }, ] + _YT_INITIAL_DATA_RE = r'(?:(?:(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_DATA_RE + _YT_INITIAL_PLAYER_RESPONSE_RE = r'(?:(?:(?:window\s*\[\s*["\']ytInitialPlayerResponse["\']\s*\]|ytInitialPlayerResponse)\s*=[(\s]*({.+?})[)\s]*;)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_PLAYER_RESPONSE_RE + _YT_INITIAL_BOUNDARY_RE = r'(?:(?:var\s+meta|</script|\n)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_BOUNDARY_RE + + _YT_DEFAULT_THUMB_SERVERS = ['i.ytimg.com'] # thumbnails most likely archived on these servers + _YT_ALL_THUMB_SERVERS = orderedSet( + _YT_DEFAULT_THUMB_SERVERS + ['img.youtube.com', *[f'{c}{n or ""}.ytimg.com' for c in ('i', 's') for n in (*range(0, 5), 9)]]) + + _WAYBACK_BASE_URL = 'https://web.archive.org/web/%sif_/' + _OLDEST_CAPTURE_DATE = 20050214000000 + _NEWEST_CAPTURE_DATE = 20500101000000 + + def _call_cdx_api(self, item_id, url, filters: list = None, collapse: list = None, query: dict = None, note=None, fatal=False): + # CDX docs: https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md + query = { + 'url': url, + 'output': 'json', + 'fl': 'original,mimetype,length,timestamp', + 'limit': 500, + 'filter': ['statuscode:200'] + (filters or []), + 'collapse': collapse or [], + **(query or {}) + } + res = self._download_json( + 'https://web.archive.org/cdx/search/cdx', item_id, + note or 'Downloading CDX API JSON', query=query, fatal=fatal) + if isinstance(res, list) and len(res) >= 2: + # format response to make it easier to use + return list(dict(zip(res[0], v)) for v in res[1:]) + elif not isinstance(res, list) or len(res) != 0: + self.report_warning('Error while parsing CDX API response' + bug_reports_message()) + + def _extract_yt_initial_variable(self, webpage, regex, video_id, name): + return self._parse_json(self._search_regex( + (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE), + regex), webpage, name, default='{}'), video_id, fatal=False) + + def _extract_webpage_title(self, webpage): + page_title = self._html_extract_title(webpage, default='') + # YouTube video pages appear to always have either 'YouTube -' as prefix or '- YouTube' as suffix. 
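For context on _call_cdx_api above: the CDX server answers with a JSON array whose first row is a header naming the fields requested via 'fl', which is why the helper zips that row against every following row to build dicts. A standalone sketch with made-up values:

res = [
    ['original', 'mimetype', 'length', 'timestamp'],
    ['http://www.youtube.com/watch?v=aYAGB11YrSs', 'text/html', '8512', '20150415002341'],
]
rows = [dict(zip(res[0], v)) for v in res[1:]]
print(rows[0]['timestamp'])  # '20150415002341'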
+ return self._html_search_regex( + r'(?:YouTube\s*-\s*(.*)$)|(?:(.*)\s*-\s*YouTube$)', + page_title, 'title', default='') + + def _extract_metadata(self, video_id, webpage): + search_meta = ((lambda x: self._html_search_meta(x, webpage, default=None)) if webpage else (lambda x: None)) + player_response = self._extract_yt_initial_variable( + webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE, video_id, 'initial player response') or {} + initial_data = self._extract_yt_initial_variable( + webpage, self._YT_INITIAL_DATA_RE, video_id, 'initial player response') or {} + + initial_data_video = traverse_obj( + initial_data, ('contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', ..., 'videoPrimaryInfoRenderer'), + expected_type=dict, get_all=False, default={}) + + video_details = traverse_obj( + player_response, 'videoDetails', expected_type=dict, get_all=False, default={}) + + microformats = traverse_obj( + player_response, ('microformat', 'playerMicroformatRenderer'), expected_type=dict, get_all=False, default={}) + + video_title = ( + video_details.get('title') + or YoutubeBaseInfoExtractor._get_text(microformats, 'title') + or YoutubeBaseInfoExtractor._get_text(initial_data_video, 'title') + or self._extract_webpage_title(webpage) + or search_meta(['og:title', 'twitter:title', 'title'])) + + channel_id = str_or_none( + video_details.get('channelId') + or microformats.get('externalChannelId') + or search_meta('channelId') + or self._search_regex( + r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1', # @b45a9e6 + webpage, 'channel id', default=None, group='id')) + channel_url = f'http://www.youtube.com/channel/{channel_id}' if channel_id else None + + duration = int_or_none( + video_details.get('lengthSeconds') + or microformats.get('lengthSeconds') + or parse_duration(search_meta('duration'))) + description = ( + video_details.get('shortDescription') + or YoutubeBaseInfoExtractor._get_text(microformats, 'description') + or clean_html(get_element_by_id('eow-description', webpage)) # @9e6dd23 + or search_meta(['description', 'og:description', 'twitter:description'])) + + uploader = video_details.get('author') + + # Uploader ID and URL + uploader_mobj = re.search( + r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">', # @fd05024 + webpage) + if uploader_mobj is not None: + uploader_id, uploader_url = uploader_mobj.group('uploader_id'), uploader_mobj.group('uploader_url') + else: + # @a6211d2 + uploader_url = url_or_none(microformats.get('ownerProfileUrl')) + uploader_id = self._search_regex( + r'(?:user|channel)/([^/]+)', uploader_url or '', 'uploader id', default=None) + + upload_date = unified_strdate( + dict_get(microformats, ('uploadDate', 'publishDate')) + or search_meta(['uploadDate', 'datePublished']) + or self._search_regex( + [r'(?s)id="eow-date.*?>(.*?)</span>', + r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'], # @7998520 + webpage, 'upload date', default=None)) + + return { + 'title': video_title, + 'description': description, + 'upload_date': upload_date, + 'uploader': uploader, + 'channel_id': channel_id, + 'channel_url': channel_url, + 'duration': duration, + 'uploader_url': uploader_url, + 'uploader_id': uploader_id, + } + + def _extract_thumbnails(self, video_id): + try_all = 'thumbnails' in self._configuration_arg('check_all') + thumbnail_base_urls = ['http://{server}/vi{webp}/{video_id}'.format( + webp='_webp' 
if ext == 'webp' else '', video_id=video_id, server=server) + for server in (self._YT_ALL_THUMB_SERVERS if try_all else self._YT_DEFAULT_THUMB_SERVERS) for ext in (('jpg', 'webp') if try_all else ('jpg',))] + + thumbnails = [] + for url in thumbnail_base_urls: + response = self._call_cdx_api( + video_id, url, filters=['mimetype:image/(?:webp|jpeg)'], + collapse=['urlkey'], query={'matchType': 'prefix'}) + if not response: + continue + thumbnails.extend( + { + 'url': (self._WAYBACK_BASE_URL % (int_or_none(thumbnail_dict.get('timestamp')) or self._OLDEST_CAPTURE_DATE)) + thumbnail_dict.get('original'), + 'filesize': int_or_none(thumbnail_dict.get('length')), + 'preference': int_or_none(thumbnail_dict.get('length')) + } for thumbnail_dict in response) + if not try_all: + break + + self._remove_duplicate_formats(thumbnails) + return thumbnails + + def _get_capture_dates(self, video_id, url_date): + capture_dates = [] + # Note: CDX API will not find watch pages with extra params in the url. + response = self._call_cdx_api( + video_id, f'https://www.youtube.com/watch?v={video_id}', + filters=['mimetype:text/html'], collapse=['timestamp:6', 'digest'], query={'matchType': 'prefix'}) or [] + all_captures = sorted([int_or_none(r['timestamp']) for r in response if int_or_none(r['timestamp']) is not None]) + + # Prefer the new polymer UI captures as we support extracting more metadata from them + # WBM captures seem to all switch to this layout ~July 2020 + modern_captures = [x for x in all_captures if x >= 20200701000000] + if modern_captures: + capture_dates.append(modern_captures[0]) + capture_dates.append(url_date) + if all_captures: + capture_dates.append(all_captures[0]) + + if 'captures' in self._configuration_arg('check_all'): + capture_dates.extend(modern_captures + all_captures) + + # Fallbacks if any of the above fail + capture_dates.extend([self._OLDEST_CAPTURE_DATE, self._NEWEST_CAPTURE_DATE]) + return orderedSet(filter(None, capture_dates)) def _real_extract(self, url): - video_id = self._match_id(url) - title = video_id # if we are not able get a title - - def _extract_title(webpage): - page_title = self._html_search_regex( - r'<title>([^<]*)</title>', webpage, 'title', fatal=False) or '' - # YouTube video pages appear to always have either 'YouTube -' as suffix or '- YouTube' as prefix. - try: - page_title = self._html_search_regex( - r'(?:YouTube\s*-\s*(.*)$)|(?:(.*)\s*-\s*YouTube$)', - page_title, 'title', default='') - except RegexNotFoundError: - page_title = None - - if not page_title: - self.report_warning('unable to extract title', video_id=video_id) - return - return page_title - - # If the video is no longer available, the oldest capture may be one before it was removed. - # Setting the capture date in url to early date seems to redirect to earliest capture. 
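Tying the helpers above together: _get_capture_dates prefers the first post-2020-07 polymer-UI capture, then the date embedded in the input URL, then the oldest known capture, before the hard-coded _OLDEST/_NEWEST fallbacks. Each candidate is substituted into _WAYBACK_BASE_URL, whose if_ suffix is, to my understanding, the Wayback iframe modifier that serves the capture without the archive toolbar. A sketch of the URL construction, mirroring the expression used in _real_extract below:

WAYBACK_BASE_URL = 'https://web.archive.org/web/%sif_/'
video_id = 'aYAGB11YrSs'  # illustrative value
for capture in (20200701000000, 20050214000000):
    print((WAYBACK_BASE_URL + 'http://www.youtube.com/watch?v=%s') % (capture, video_id))
# https://web.archive.org/web/20200701000000if_/http://www.youtube.com/watch?v=aYAGB11YrSs
# https://web.archive.org/web/20050214000000if_/http://www.youtube.com/watch?v=aYAGB11YrSs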
- webpage = self._download_webpage( - 'https://web.archive.org/web/20050214000000/http://www.youtube.com/watch?v=%s' % video_id, - video_id=video_id, fatal=False, errnote='unable to download video webpage (probably not archived).') - if webpage: - title = _extract_title(webpage) or title - - # Use link translator mentioned in https://github.com/ytdl-org/youtube-dl/issues/13655 - internal_fake_url = 'https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s' % video_id + video_id, url_date, url_date_2 = self._match_valid_url(url).group('id', 'date', 'date2') + url_date = url_date or url_date_2 + + urlh = None try: - video_file_webpage = self._request_webpage( - HEADRequest(internal_fake_url), video_id, - note='Fetching video file url', expected_status=True) + urlh = self._request_webpage( + HEADRequest('https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s' % video_id), + video_id, note='Fetching archived video file url', expected_status=True) except ExtractorError as e: # HTTP Error 404 is expected if the video is not saved. if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: - raise ExtractorError( - 'HTTP Error %s. Most likely the video is not archived or issue with web.archive.org.' % e.cause.code, + self.raise_no_formats( + 'The requested video is not archived, indexed, or there is an issue with web.archive.org', expected=True) - raise - video_file_url = compat_urllib_parse_unquote(video_file_webpage.url) - video_file_url_qs = parse_qs(video_file_url) - - # Attempt to recover any ext & format info from playback url - format = {'url': video_file_url} - itag = try_get(video_file_url_qs, lambda x: x['itag'][0]) - if itag and itag in YoutubeIE._formats: # Naughty access but it works - format.update(YoutubeIE._formats[itag]) - format.update({'format_id': itag}) - else: - mime = try_get(video_file_url_qs, lambda x: x['mime'][0]) - ext = mimetype2ext(mime) or determine_ext(video_file_url) - format.update({'ext': ext}) - return { - 'id': video_id, - 'title': title, - 'formats': [format], - 'duration': str_to_int(try_get(video_file_url_qs, lambda x: x['dur'][0])) - } + else: + raise + + capture_dates = self._get_capture_dates(video_id, int_or_none(url_date)) + self.write_debug('Captures to try: ' + join_nonempty(*capture_dates, delim=', ')) + info = {'id': video_id} + for capture in capture_dates: + webpage = self._download_webpage( + (self._WAYBACK_BASE_URL + 'http://www.youtube.com/watch?v=%s') % (capture, video_id), + video_id=video_id, fatal=False, errnote='unable to download capture webpage (it may not be archived)', + note='Downloading capture webpage') + current_info = self._extract_metadata(video_id, webpage or '') + # Try avoid getting deleted video metadata + if current_info.get('title'): + info = merge_dicts(info, current_info) + if 'captures' not in self._configuration_arg('check_all'): + break + + info['thumbnails'] = self._extract_thumbnails(video_id) + + if urlh: + url = compat_urllib_parse_unquote(urlh.geturl()) + video_file_url_qs = parse_qs(url) + # Attempt to recover any ext & format info from playback url & response headers + format = {'url': url, 'filesize': int_or_none(urlh.headers.get('x-archive-orig-content-length'))} + itag = try_get(video_file_url_qs, lambda x: x['itag'][0]) + if itag and itag in YoutubeIE._formats: + format.update(YoutubeIE._formats[itag]) + format.update({'format_id': itag}) + else: + mime = try_get(video_file_url_qs, lambda x: x['mime'][0]) + ext = (mimetype2ext(mime) + or urlhandle_detect_ext(urlh) 
+ or mimetype2ext(urlh.headers.get('x-archive-guessed-content-type'))) + format.update({'ext': ext}) + info['formats'] = [format] + if not info.get('duration'): + info['duration'] = str_to_int(try_get(video_file_url_qs, lambda x: x['dur'][0])) + + if not info.get('title'): + info['title'] = video_id + return info diff --git a/hypervideo_dl/extractor/arcpublishing.py b/hypervideo_dl/extractor/arcpublishing.py index 5a9b818..8880e5c 100644 --- a/hypervideo_dl/extractor/arcpublishing.py +++ b/hypervideo_dl/extractor/arcpublishing.py @@ -124,8 +124,7 @@ class ArcPublishingIE(InfoExtractor): formats.extend(smil_formats) elif stream_type in ('ts', 'hls'): m3u8_formats = self._extract_m3u8_formats( - s_url, uuid, 'mp4', 'm3u8' if is_live else 'm3u8_native', - m3u8_id='hls', fatal=False) + s_url, uuid, 'mp4', live=is_live, m3u8_id='hls', fatal=False) if all([f.get('acodec') == 'none' for f in m3u8_formats]): continue for f in m3u8_formats: @@ -158,7 +157,7 @@ class ArcPublishingIE(InfoExtractor): return { 'id': uuid, - 'title': self._live_title(title) if is_live else title, + 'title': title, 'thumbnail': try_get(video, lambda x: x['promo_image']['url']), 'description': try_get(video, lambda x: x['subheadlines']['basic']), 'formats': formats, diff --git a/hypervideo_dl/extractor/ard.py b/hypervideo_dl/extractor/ard.py index 048d30f..7ea339b 100644 --- a/hypervideo_dl/extractor/ard.py +++ b/hypervideo_dl/extractor/ard.py @@ -280,7 +280,7 @@ class ARDMediathekIE(ARDMediathekBaseIE): info.update({ 'id': video_id, - 'title': self._live_title(title) if info.get('is_live') else title, + 'title': title, 'description': description, 'thumbnail': thumbnail, }) @@ -376,9 +376,24 @@ class ARDIE(InfoExtractor): formats.append(f) self._sort_formats(formats) + _SUB_FORMATS = ( + ('./dataTimedText', 'ttml'), + ('./dataTimedTextNoOffset', 'ttml'), + ('./dataTimedTextVtt', 'vtt'), + ) + + subtitles = {} + for subsel, subext in _SUB_FORMATS: + for node in video_node.findall(subsel): + subtitles.setdefault('de', []).append({ + 'url': node.attrib['url'], + 'ext': subext, + }) + return { 'id': xpath_text(video_node, './videoId', default=display_id), 'formats': formats, + 'subtitles': subtitles, 'display_id': display_id, 'title': video_node.find('./title').text, 'duration': parse_duration(video_node.find('./duration').text), @@ -388,7 +403,14 @@ class ARDIE(InfoExtractor): class ARDBetaMediathekIE(ARDMediathekBaseIE): - _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?P<client>[^/]+)/(?P<mode>player|live|video|sendung|sammlung)/(?P<display_id>(?:[^/]+/)*)(?P<video_id>[a-zA-Z0-9]+)' + _VALID_URL = r'''(?x)https:// + (?:(?:beta|www)\.)?ardmediathek\.de/ + (?:(?P<client>[^/]+)/)? + (?:player|live|video|(?P<playlist>sendung|sammlung))/ + (?:(?P<display_id>(?(playlist)[^?#]+?|[^?#]+))/)? 
+ (?P<id>(?(playlist)|Y3JpZDovL)[a-zA-Z0-9]+) + (?(playlist)/(?P<season>\d+)?/?(?:[?#]|$))''' + _TESTS = [{ 'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/', 'md5': 'a1dc75a39c61601b980648f7c9f9f71d', @@ -403,6 +425,25 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): 'upload_date': '20200805', 'ext': 'mp4', }, + 'skip': 'Error', + }, { + 'url': 'https://www.ardmediathek.de/video/tagesschau-oder-tagesschau-20-00-uhr/das-erste/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhZ2Vzc2NoYXUvZmM4ZDUxMjgtOTE0ZC00Y2MzLTgzNzAtNDZkNGNiZWJkOTll', + 'md5': 'f1837e563323b8a642a8ddeff0131f51', + 'info_dict': { + 'id': '10049223', + 'ext': 'mp4', + 'title': 'tagesschau, 20:00 Uhr', + 'timestamp': 1636398000, + 'description': 'md5:39578c7b96c9fe50afdf5674ad985e6b', + 'upload_date': '20211108', + }, + }, { + 'url': 'https://www.ardmediathek.de/sendung/beforeigners/beforeigners/staffel-1/Y3JpZDovL2Rhc2Vyc3RlLmRlL2JlZm9yZWlnbmVycw/1', + 'playlist_count': 6, + 'info_dict': { + 'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL2JlZm9yZWlnbmVycw', + 'title': 'beforeigners/beforeigners/staffel-1', + }, }, { 'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE', 'only_matching': True, @@ -426,6 +467,12 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): # playlist of type 'sammlung' 'url': 'https://www.ardmediathek.de/ard/sammlung/team-muenster/5JpTzLSbWUAK8184IOvEir/', 'only_matching': True, + }, { + 'url': 'https://www.ardmediathek.de/video/coronavirus-update-ndr-info/astrazeneca-kurz-lockdown-und-pims-syndrom-81/ndr/Y3JpZDovL25kci5kZS84NzE0M2FjNi0wMWEwLTQ5ODEtOTE5NS1mOGZhNzdhOTFmOTI/', + 'only_matching': True, + }, { + 'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3dkci5kZS9CZWl0cmFnLWQ2NDJjYWEzLTMwZWYtNGI4NS1iMTI2LTU1N2UxYTcxOGIzOQ/tatort-duo-koeln-leipzig-ihr-kinderlein-kommet', + 'only_matching': True, }] def _ARD_load_playlist_snipped(self, playlist_id, display_id, client, mode, pageNumber): @@ -522,23 +569,16 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): break pageNumber = pageNumber + 1 - return self.playlist_result(entries, playlist_title=display_id) + return self.playlist_result(entries, playlist_id, playlist_title=display_id) def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('video_id') - display_id = mobj.group('display_id') - if display_id: - display_id = display_id.rstrip('/') - if not display_id: - display_id = video_id - - if mobj.group('mode') in ('sendung', 'sammlung'): - # this is a playlist-URL - return self._ARD_extract_playlist( - url, video_id, display_id, - mobj.group('client'), - mobj.group('mode')) + video_id, display_id, playlist_type, client, season_number = self._match_valid_url(url).group( + 'id', 'display_id', 'playlist', 'client', 'season') + display_id, client = display_id or video_id, client or 'ard' + + if playlist_type: + # TODO: Extract only specified season + return self._ARD_extract_playlist(url, video_id, display_id, client, playlist_type) player_page = self._download_json( 'https://api.ardmediathek.de/public-gateway', @@ -574,7 +614,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): } } } -}''' % (mobj.group('client'), video_id), +}''' % (client, video_id), }).encode(), headers={ 'Content-Type': 'application/json' })['data']['playerPage'] diff --git a/hypervideo_dl/extractor/arnes.py b/hypervideo_dl/extractor/arnes.py index c0032fc..050c252 100644 --- 
a/hypervideo_dl/extractor/arnes.py +++ b/hypervideo_dl/extractor/arnes.py @@ -7,6 +7,7 @@ from ..compat import ( compat_urllib_parse_urlparse, ) from ..utils import ( + format_field, float_or_none, int_or_none, parse_iso8601, @@ -92,7 +93,7 @@ class ArnesIE(InfoExtractor): 'timestamp': parse_iso8601(video.get('creationTime')), 'channel': channel.get('name'), 'channel_id': channel_id, - 'channel_url': self._BASE_URL + '/?channel=' + channel_id if channel_id else None, + 'channel_url': format_field(channel_id, template=f'{self._BASE_URL}/?channel=%s'), 'duration': float_or_none(video.get('duration'), 1000), 'view_count': int_or_none(video.get('views')), 'tags': video.get('hashtags'), diff --git a/hypervideo_dl/extractor/arte.py b/hypervideo_dl/extractor/arte.py index 296b169..c2f2c1b 100644 --- a/hypervideo_dl/extractor/arte.py +++ b/hypervideo_dl/extractor/arte.py @@ -12,6 +12,7 @@ from ..utils import ( int_or_none, parse_qs, qualities, + strip_or_none, try_get, unified_strdate, url_or_none, @@ -137,6 +138,7 @@ class ArteTVIE(ArteTVBaseIE): break else: lang_pref = -1 + format_note = '%s, %s' % (f.get('versionCode'), f.get('versionLibelle')) media_type = f.get('mediaType') if media_type == 'hls': @@ -144,14 +146,17 @@ class ArteTVIE(ArteTVBaseIE): format_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False) for m3u8_format in m3u8_formats: - m3u8_format['language_preference'] = lang_pref + m3u8_format.update({ + 'language_preference': lang_pref, + 'format_note': format_note, + }) formats.extend(m3u8_formats) continue format = { 'format_id': format_id, 'language_preference': lang_pref, - 'format_note': '%s, %s' % (f.get('versionCode'), f.get('versionLibelle')), + 'format_note': format_note, 'width': int_or_none(f.get('width')), 'height': int_or_none(f.get('height')), 'tbr': int_or_none(f.get('bitrate')), @@ -253,3 +258,44 @@ class ArteTVPlaylistIE(ArteTVBaseIE): title = collection.get('title') description = collection.get('shortDescription') or collection.get('teaserText') return self.playlist_result(entries, playlist_id, title, description) + + +class ArteTVCategoryIE(ArteTVBaseIE): + _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES + _TESTS = [{ + 'url': 'https://www.arte.tv/en/videos/politics-and-society/', + 'info_dict': { + 'id': 'politics-and-society', + 'title': 'Politics and society', + 'description': 'Investigative documentary series, geopolitical analysis, and international commentary', + }, + 'playlist_mincount': 13, + }, + ] + + @classmethod + def suitable(cls, url): + return ( + not any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE, )) + and super(ArteTVCategoryIE, cls).suitable(url)) + + def _real_extract(self, url): + lang, playlist_id = self._match_valid_url(url).groups() + webpage = self._download_webpage(url, playlist_id) + + items = [] + for video in re.finditer( + r'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang, + webpage): + video = video.group('url') + if video == url: + continue + if any(ie.suitable(video) for ie in (ArteTVIE, ArteTVPlaylistIE, )): + items.append(video) + + title = (self._og_search_title(webpage, default=None) + or self._html_search_regex(r'<title\b[^>]*>([^<]+)</title>', default=None)) + title = strip_or_none(title.rsplit('|', 1)[0]) or self._generic_title(url) + + return self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title, + 
description=self._og_search_description(webpage, default=None)) diff --git a/hypervideo_dl/extractor/asiancrush.py b/hypervideo_dl/extractor/asiancrush.py index 75a6329..7f1940f 100644 --- a/hypervideo_dl/extractor/asiancrush.py +++ b/hypervideo_dl/extractor/asiancrush.py @@ -181,8 +181,7 @@ class AsianCrushPlaylistIE(AsianCrushBaseIE): 'title', default=None) or self._og_search_title( webpage, default=None) or self._html_search_meta( 'twitter:title', webpage, 'title', - default=None) or self._search_regex( - r'<title>([^<]+)</title>', webpage, 'title', fatal=False) + default=None) or self._html_extract_title(webpage) if title: title = re.sub(r'\s*\|\s*.+?$', '', title) diff --git a/hypervideo_dl/extractor/atresplayer.py b/hypervideo_dl/extractor/atresplayer.py index 8143eb4..465af4e 100644 --- a/hypervideo_dl/extractor/atresplayer.py +++ b/hypervideo_dl/extractor/atresplayer.py @@ -24,9 +24,6 @@ class AtresPlayerIE(InfoExtractor): 'description': 'md5:7634cdcb4d50d5381bedf93efb537fbc', 'duration': 3413, }, - 'params': { - 'format': 'bestvideo', - }, 'skip': 'This video is only available for registered users' }, { @@ -40,9 +37,6 @@ class AtresPlayerIE(InfoExtractor): ] _API_BASE = 'https://api.atresplayer.com/' - def _real_initialize(self): - self._login() - def _handle_error(self, e, code): if isinstance(e.cause, compat_HTTPError) and e.cause.code == code: error = self._parse_json(e.cause.read(), None) @@ -51,11 +45,7 @@ class AtresPlayerIE(InfoExtractor): raise ExtractorError(error['error_description'], expected=True) raise - def _login(self): - username, password = self._get_login_info() - if username is None: - return - + def _perform_login(self, username, password): self._request_webpage( self._API_BASE + 'login', None, 'Downloading login page') diff --git a/hypervideo_dl/extractor/atvat.py b/hypervideo_dl/extractor/atvat.py index 7c30cfc..481a097 100644 --- a/hypervideo_dl/extractor/atvat.py +++ b/hypervideo_dl/extractor/atvat.py @@ -8,6 +8,7 @@ from ..utils import ( float_or_none, jwt_encode_hs256, try_get, + ExtractorError, ) @@ -94,6 +95,11 @@ class ATVAtIE(InfoExtractor): }) video_id, videos_data = list(videos['data'].items())[0] + error_msg = try_get(videos_data, lambda x: x['error']['title']) + if error_msg == 'Geo check failed': + self.raise_geo_restricted(error_msg) + elif error_msg: + raise ExtractorError(error_msg) entries = [ self._extract_video_info(url, contentResource[video['id']], video) for video in videos_data] diff --git a/hypervideo_dl/extractor/audiomack.py b/hypervideo_dl/extractor/audiomack.py index cc77713..19775cf 100644 --- a/hypervideo_dl/extractor/audiomack.py +++ b/hypervideo_dl/extractor/audiomack.py @@ -14,7 +14,7 @@ from ..utils import ( class AudiomackIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?audiomack\.com/song/(?P<id>[\w/-]+)' + _VALID_URL = r'https?://(?:www\.)?audiomack\.com/(?:song/|(?=.+/song/))(?P<id>[\w/-]+)' IE_NAME = 'audiomack' _TESTS = [ # hosted on audiomack @@ -29,6 +29,7 @@ class AudiomackIE(InfoExtractor): } }, # audiomack wrapper around soundcloud song + # Needs new test URL. { 'add_ie': ['Soundcloud'], 'url': 'http://www.audiomack.com/song/hip-hop-daily/black-mamba-freestyle', @@ -39,15 +40,16 @@ class AudiomackIE(InfoExtractor): 'title': 'Black Mamba Freestyle [Prod. 
By Danny Wolf]', 'uploader': 'ILOVEMAKONNEN', 'upload_date': '20160414', - } + }, + 'skip': 'Song has been removed from the site', }, ] def _real_extract(self, url): - # URLs end with [uploader name]/[uploader title] + # URLs end with [uploader name]/song/[uploader title] # this title is whatever the user types in, and is rarely # the proper song title. Real metadata is in the api response - album_url_tag = self._match_id(url) + album_url_tag = self._match_id(url).replace('/song/', '/') # Request the extended version of the api for extra fields like artist and title api_response = self._download_json( @@ -73,13 +75,13 @@ class AudiomackIE(InfoExtractor): class AudiomackAlbumIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?audiomack\.com/album/(?P<id>[\w/-]+)' + _VALID_URL = r'https?://(?:www\.)?audiomack\.com/(?:album/|(?=.+/album/))(?P<id>[\w/-]+)' IE_NAME = 'audiomack:album' _TESTS = [ # Standard album playlist { 'url': 'http://www.audiomack.com/album/flytunezcom/tha-tour-part-2-mixtape', - 'playlist_count': 15, + 'playlist_count': 11, 'info_dict': { 'id': '812251', @@ -95,24 +97,27 @@ class AudiomackAlbumIE(InfoExtractor): }, 'playlist': [{ 'info_dict': { - 'title': 'PPP (Pistol P Project) - 9. Heaven or Hell (CHIMACA) ft Zuse (prod by DJ FU)', - 'id': '837577', + 'title': 'PPP (Pistol P Project) - 8. Real (prod by SYK SENSE )', + 'id': '837576', + 'ext': 'mp3', + 'uploader': 'Lil Herb a.k.a. G Herbo', + } + }, { + 'info_dict': { + 'title': 'PPP (Pistol P Project) - 10. 4 Minutes Of Hell Part 4 (prod by DY OF 808 MAFIA)', + 'id': '837580', 'ext': 'mp3', 'uploader': 'Lil Herb a.k.a. G Herbo', } }], - 'params': { - 'playliststart': 9, - 'playlistend': 9, - } } ] def _real_extract(self, url): - # URLs end with [uploader name]/[uploader title] + # URLs end with [uploader name]/album/[uploader title] # this title is whatever the user types in, and is rarely # the proper song title. 
Real metadata is in the api response - album_url_tag = self._match_id(url) + album_url_tag = self._match_id(url).replace('/album/', '/') result = {'_type': 'playlist', 'entries': []} # There is no one endpoint for album metadata - instead it is included/repeated in each song's metadata # Therefore we don't know how many songs the album has and must infi-loop until failure @@ -134,7 +139,7 @@ class AudiomackAlbumIE(InfoExtractor): # Pull out the album metadata and add to result (if it exists) for resultkey, apikey in [('id', 'album_id'), ('title', 'album_title')]: if apikey in api_response and resultkey not in result: - result[resultkey] = api_response[apikey] + result[resultkey] = compat_str(api_response[apikey]) song_id = url_basename(api_response['url']).rpartition('.')[0] result['entries'].append({ 'id': compat_str(api_response.get('id', song_id)), diff --git a/hypervideo_dl/extractor/awaan.py b/hypervideo_dl/extractor/awaan.py index 22cc10d..f5e559c 100644 --- a/hypervideo_dl/extractor/awaan.py +++ b/hypervideo_dl/extractor/awaan.py @@ -9,6 +9,7 @@ from ..compat import ( compat_str, ) from ..utils import ( + format_field, int_or_none, parse_iso8601, smuggle_url, @@ -41,9 +42,9 @@ class AWAANBaseIE(InfoExtractor): return { 'id': video_id, - 'title': self._live_title(title) if is_live else title, + 'title': title, 'description': video_data.get('description_en') or video_data.get('description_ar'), - 'thumbnail': 'http://admin.mangomolo.com/analytics/%s' % img if img else None, + 'thumbnail': format_field(img, template='http://admin.mangomolo.com/analytics/%s'), 'duration': int_or_none(video_data.get('duration')), 'timestamp': parse_iso8601(video_data.get('create_time'), ' '), 'is_live': is_live, diff --git a/hypervideo_dl/extractor/azmedien.py b/hypervideo_dl/extractor/azmedien.py index fee640e..0168340 100644 --- a/hypervideo_dl/extractor/azmedien.py +++ b/hypervideo_dl/extractor/azmedien.py @@ -11,11 +11,12 @@ class AZMedienIE(InfoExtractor): IE_DESC = 'AZ Medien videos' _VALID_URL = r'''(?x) https?:// - (?:www\.)? + (?:www\.|tv\.)? 
(?P<host> telezueri\.ch| telebaern\.tv| - telem1\.ch + telem1\.ch| + tvo-online\.ch )/ [^/]+/ (?P<id> @@ -30,7 +31,7 @@ class AZMedienIE(InfoExtractor): ''' _TESTS = [{ - 'url': 'https://www.telezueri.ch/sonntalk/bundesrats-vakanzen-eu-rahmenabkommen-133214569', + 'url': 'https://tv.telezueri.ch/sonntalk/bundesrats-vakanzen-eu-rahmenabkommen-133214569', 'info_dict': { 'id': '1_anruz3wy', 'ext': 'mp4', @@ -38,6 +39,9 @@ class AZMedienIE(InfoExtractor): 'uploader_id': 'TVOnline', 'upload_date': '20180930', 'timestamp': 1538328802, + 'view_count': int, + 'thumbnail': 'http://cfvod.kaltura.com/p/1719221/sp/171922100/thumbnail/entry_id/1_anruz3wy/version/100031', + 'duration': 1930 }, 'params': { 'skip_download': True, diff --git a/hypervideo_dl/extractor/banbye.py b/hypervideo_dl/extractor/banbye.py new file mode 100644 index 0000000..3d4d36e --- /dev/null +++ b/hypervideo_dl/extractor/banbye.py @@ -0,0 +1,153 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import math + +from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse_urlparse, + compat_parse_qs, +) +from ..utils import ( + format_field, + InAdvancePagedList, + traverse_obj, + unified_timestamp, +) + + +class BanByeBaseIE(InfoExtractor): + _API_BASE = 'https://api.banbye.com' + _CDN_BASE = 'https://cdn.banbye.com' + _VIDEO_BASE = 'https://banbye.com/watch' + + @staticmethod + def _extract_playlist_id(url, param='playlist'): + return compat_parse_qs( + compat_urllib_parse_urlparse(url).query).get(param, [None])[0] + + def _extract_playlist(self, playlist_id): + data = self._download_json(f'{self._API_BASE}/playlists/{playlist_id}', playlist_id) + return self.playlist_result([ + self.url_result(f'{self._VIDEO_BASE}/{video_id}', BanByeIE) + for video_id in data['videoIds']], playlist_id, data.get('name')) + + +class BanByeIE(BanByeBaseIE): + _VALID_URL = r'https?://(?:www\.)?banbye.com/(?:en/)?watch/(?P<id>\w+)' + _TESTS = [{ + 'url': 'https://banbye.com/watch/v_ytfmvkVYLE8T', + 'md5': '2f4ea15c5ca259a73d909b2cfd558eb5', + 'info_dict': { + 'id': 'v_ytfmvkVYLE8T', + 'ext': 'mp4', + 'title': 'md5:5ec098f88a0d796f987648de6322ba0f', + 'description': 'md5:4d94836e73396bc18ef1fa0f43e5a63a', + 'uploader': 'wRealu24', + 'channel_id': 'ch_wrealu24', + 'channel_url': 'https://banbye.com/channel/ch_wrealu24', + 'timestamp': 1647604800, + 'upload_date': '20220318', + 'duration': 1931, + 'thumbnail': r're:https?://.*\.webp', + 'tags': 'count:5', + 'like_count': int, + 'dislike_count': int, + 'view_count': int, + 'comment_count': int, + }, + }, { + 'url': 'https://banbye.com/watch/v_2JjQtqjKUE_F?playlistId=p_Ld82N6gBw_OJ', + 'info_dict': { + 'title': 'Krzysztof Karoń', + 'id': 'p_Ld82N6gBw_OJ', + }, + 'playlist_count': 9, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + playlist_id = self._extract_playlist_id(url, 'playlistId') + + if self._yes_playlist(playlist_id, video_id): + return self._extract_playlist(playlist_id) + + data = self._download_json(f'{self._API_BASE}/videos/{video_id}', video_id) + thumbnails = [{ + 'id': f'{quality}p', + 'url': f'{self._CDN_BASE}/video/{video_id}/{quality}.webp', + } for quality in [48, 96, 144, 240, 512, 1080]] + formats = [{ + 'format_id': f'http-{quality}p', + 'quality': quality, + 'url': f'{self._CDN_BASE}/video/{video_id}/{quality}.mp4', + } for quality in data['quality']] + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': data.get('title'), + 'description': data.get('desc'), + 'uploader': traverse_obj(data, ('channel', 
'name')), + 'channel_id': data.get('channelId'), + 'channel_url': format_field(data, 'channelId', 'https://banbye.com/channel/%s'), + 'timestamp': unified_timestamp(data.get('publishedAt')), + 'duration': data.get('duration'), + 'tags': data.get('tags'), + 'formats': formats, + 'thumbnails': thumbnails, + 'like_count': data.get('likes'), + 'dislike_count': data.get('dislikes'), + 'view_count': data.get('views'), + 'comment_count': data.get('commentCount'), + } + + +class BanByeChannelIE(BanByeBaseIE): + _VALID_URL = r'https?://(?:www\.)?banbye.com/(?:en/)?channel/(?P<id>\w+)' + _TESTS = [{ + 'url': 'https://banbye.com/channel/ch_wrealu24', + 'info_dict': { + 'title': 'wRealu24', + 'id': 'ch_wrealu24', + 'description': 'md5:da54e48416b74dfdde20a04867c0c2f6', + }, + 'playlist_mincount': 791, + }, { + 'url': 'https://banbye.com/channel/ch_wrealu24?playlist=p_Ld82N6gBw_OJ', + 'info_dict': { + 'title': 'Krzysztof Karoń', + 'id': 'p_Ld82N6gBw_OJ', + }, + 'playlist_count': 9, + }] + _PAGE_SIZE = 100 + + def _real_extract(self, url): + channel_id = self._match_id(url) + playlist_id = self._extract_playlist_id(url) + + if playlist_id: + return self._extract_playlist(playlist_id) + + def page_func(page_num): + data = self._download_json(f'{self._API_BASE}/videos', channel_id, query={ + 'channelId': channel_id, + 'sort': 'new', + 'limit': self._PAGE_SIZE, + 'offset': page_num * self._PAGE_SIZE, + }, note=f'Downloading page {page_num+1}') + return [ + self.url_result(f"{self._VIDEO_BASE}/{video['_id']}", BanByeIE) + for video in data['items'] + ] + + channel_data = self._download_json(f'{self._API_BASE}/channels/{channel_id}', channel_id) + entries = InAdvancePagedList( + page_func, + math.ceil(channel_data['videoCount'] / self._PAGE_SIZE), + self._PAGE_SIZE) + + return self.playlist_result( + entries, channel_id, channel_data.get('name'), channel_data.get('description')) diff --git a/hypervideo_dl/extractor/bandaichannel.py b/hypervideo_dl/extractor/bandaichannel.py index d672859..f1bcdef 100644 --- a/hypervideo_dl/extractor/bandaichannel.py +++ b/hypervideo_dl/extractor/bandaichannel.py @@ -21,7 +21,6 @@ class BandaiChannelIE(BrightcoveNewIE): 'duration': 1387.733, }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, }] diff --git a/hypervideo_dl/extractor/bandcamp.py b/hypervideo_dl/extractor/bandcamp.py index b664145..745055e 100644 --- a/hypervideo_dl/extractor/bandcamp.py +++ b/hypervideo_dl/extractor/bandcamp.py @@ -183,6 +183,7 @@ class BandcampIE(InfoExtractor): 'format_note': f.get('description'), 'filesize': parse_filesize(f.get('size_mb')), 'vcodec': 'none', + 'acodec': format_id.split('-')[0], }) self._sort_formats(formats) @@ -212,7 +213,7 @@ class BandcampIE(InfoExtractor): class BandcampAlbumIE(BandcampIE): IE_NAME = 'Bandcamp:album' - _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?!/music)(?:/album/(?P<id>[^/?#&]+))?' 
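The tightened pattern on the next line drops the old (?!/music) lookahead and makes the album path mandatory, so bare artist pages no longer match this extractor; they are handled by the new Bandcamp:user extractor further down. A quick check against URLs taken from the tests:

import re

pat = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com/album/(?P<id>[^/?#&]+)'
print(bool(re.match(pat, 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1')))  # True
print(bool(re.match(pat, 'http://dotscale.bandcamp.com')))  # False: now Bandcamp:user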
+ _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com/album/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1', @@ -258,14 +259,6 @@ class BandcampAlbumIE(BandcampIE): }, 'playlist_mincount': 9, }, { - 'url': 'http://dotscale.bandcamp.com', - 'info_dict': { - 'title': 'Loom', - 'id': 'dotscale', - 'uploader_id': 'dotscale', - }, - 'playlist_mincount': 7, - }, { # with escaped quote in title 'url': 'https://jstrecords.bandcamp.com/album/entropy-ep', 'info_dict': { @@ -391,41 +384,63 @@ class BandcampWeeklyIE(BandcampIE): } -class BandcampMusicIE(InfoExtractor): - _VALID_URL = r'https?://(?P<id>[^/]+)\.bandcamp\.com/music' +class BandcampUserIE(InfoExtractor): + IE_NAME = 'Bandcamp:user' + _VALID_URL = r'https?://(?!www\.)(?P<id>[^.]+)\.bandcamp\.com(?:/music)?/?(?:[#?]|$)' + _TESTS = [{ + # Type 1 Bandcamp user page. + 'url': 'https://adrianvonziegler.bandcamp.com', + 'info_dict': { + 'id': 'adrianvonziegler', + 'title': 'Discography of adrianvonziegler', + }, + 'playlist_mincount': 23, + }, { + # Bandcamp user page with only one album + 'url': 'http://dotscale.bandcamp.com', + 'info_dict': { + 'id': 'dotscale', + 'title': 'Discography of dotscale' + }, + 'playlist_count': 1, + }, { + # Type 2 Bandcamp user page. + 'url': 'https://nightcallofficial.bandcamp.com', + 'info_dict': { + 'id': 'nightcallofficial', + 'title': 'Discography of nightcallofficial', + }, + 'playlist_count': 4, + }, { 'url': 'https://steviasphere.bandcamp.com/music', 'playlist_mincount': 47, 'info_dict': { 'id': 'steviasphere', + 'title': 'Discography of steviasphere', }, }, { 'url': 'https://coldworldofficial.bandcamp.com/music', 'playlist_mincount': 10, 'info_dict': { 'id': 'coldworldofficial', + 'title': 'Discography of coldworldofficial', }, }, { 'url': 'https://nuclearwarnowproductions.bandcamp.com/music', 'playlist_mincount': 399, 'info_dict': { 'id': 'nuclearwarnowproductions', + 'title': 'Discography of nuclearwarnowproductions', }, - } - ] - - _TYPE_IE_DICT = { - 'album': BandcampAlbumIE.ie_key(), - 'track': BandcampIE.ie_key() - } + }] def _real_extract(self, url): - id = self._match_id(url) - webpage = self._download_webpage(url, id) - items = re.findall(r'href\=\"\/(?P<path>(?P<type>album|track)+/[^\"]+)', webpage) - entries = [ - self.url_result( - f'https://{id}.bandcamp.com/{item[0]}', - ie=self._TYPE_IE_DICT[item[1]]) - for item in items] - return self.playlist_result(entries, id) + uploader = self._match_id(url) + webpage = self._download_webpage(url, uploader) + + discography_data = (re.findall(r'<li data-item-id=["\'][^>]+>\s*<a href=["\']([^"\']+)', webpage) + or re.findall(r'<div[^>]+trackTitle["\'][^"\']+["\']([^"\']+)', webpage)) + + return self.playlist_from_matches( + discography_data, uploader, f'Discography of {uploader}', getter=lambda x: urljoin(url, x)) diff --git a/hypervideo_dl/extractor/bbc.py b/hypervideo_dl/extractor/bbc.py index 4e2dcd7..29ad7de 100644 --- a/hypervideo_dl/extractor/bbc.py +++ b/hypervideo_dl/extractor/bbc.py @@ -11,6 +11,7 @@ from ..compat import ( compat_etree_Element, compat_HTTPError, compat_str, + compat_urllib_error, compat_urlparse, ) from ..utils import ( @@ -38,7 +39,7 @@ from ..utils import ( class BBCCoUkIE(InfoExtractor): IE_NAME = 'bbc.co.uk' IE_DESC = 'BBC iPlayer' - _ID_REGEX = r'(?:[pbm][\da-z]{7}|w[\da-z]{7,14})' + _ID_REGEX = r'(?:[pbml][\da-z]{7}|w[\da-z]{7,14})' _VALID_URL = r'''(?x) https?:// (?:www\.)?bbc\.co\.uk/ @@ -263,11 +264,7 @@ class BBCCoUkIE(InfoExtractor): 'only_matching': True, 
}] - def _login(self): - username, password = self._get_login_info() - if username is None: - return - + def _perform_login(self, username, password): login_page = self._download_webpage( self._LOGIN_URL, None, 'Downloading signin page') @@ -293,9 +290,6 @@ class BBCCoUkIE(InfoExtractor): 'Unable to login: %s' % error, expected=True) raise ExtractorError('Unable to log in') - def _real_initialize(self): - self._login() - class MediaSelectionError(Exception): def __init__(self, id): self.id = id @@ -394,9 +388,17 @@ class BBCCoUkIE(InfoExtractor): formats.extend(self._extract_mpd_formats( href, programme_id, mpd_id=format_id, fatal=False)) elif transfer_format == 'hls': - formats.extend(self._extract_m3u8_formats( - href, programme_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id=format_id, fatal=False)) + # TODO: let expected_status be passed into _extract_xxx_formats() instead + try: + fmts = self._extract_m3u8_formats( + href, programme_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id=format_id, fatal=False) + except ExtractorError as e: + if not (isinstance(e.exc_info[1], compat_urllib_error.HTTPError) + and e.exc_info[1].code in (403, 404)): + raise + fmts = [] + formats.extend(fmts) elif transfer_format == 'hds': formats.extend(self._extract_f4m_formats( href, programme_id, f4m_id=format_id, fatal=False)) @@ -451,9 +453,10 @@ class BBCCoUkIE(InfoExtractor): playlist = self._download_json( 'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id, playlist_id, 'Downloading playlist JSON') + formats = [] + subtitles = {} - version = playlist.get('defaultAvailableVersion') - if version: + for version in playlist.get('allAvailableVersions', []): smp_config = version['smpConfig'] title = smp_config['title'] description = smp_config['summary'] @@ -463,8 +466,17 @@ class BBCCoUkIE(InfoExtractor): continue programme_id = item.get('vpid') duration = int_or_none(item.get('duration')) - formats, subtitles = self._download_media_selector(programme_id) - return programme_id, title, description, duration, formats, subtitles + version_formats, version_subtitles = self._download_media_selector(programme_id) + types = version['types'] + for f in version_formats: + f['format_note'] = ', '.join(types) + if any('AudioDescribed' in x for x in types): + f['language_preference'] = -10 + formats += version_formats + for tag, subformats in (version_subtitles or {}).items(): + subtitles.setdefault(tag, []).extend(subformats) + + return programme_id, title, description, duration, formats, subtitles except ExtractorError as ee: if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404): raise @@ -775,20 +787,32 @@ class BBCIE(BBCCoUkIE): 'upload_date': '20150725', }, }, { + # video with window.__INITIAL_DATA__ and value as JSON string + 'url': 'https://www.bbc.com/news/av/world-europe-59468682', + 'info_dict': { + 'id': 'p0b71qth', + 'ext': 'mp4', + 'title': 'Why France is making this woman a national hero', + 'description': 'md5:7affdfab80e9c3a1f976230a1ff4d5e4', + 'thumbnail': r're:https?://.+/.+\.jpg', + 'timestamp': 1638230731, + 'upload_date': '20211130', + }, + }, { # single video article embedded with data-media-vpid 'url': 'http://www.bbc.co.uk/sport/rowing/35908187', 'only_matching': True, }, { + # bbcthreeConfig 'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1', 'info_dict': { 'id': 'p06556y7', 'ext': 'mp4', - 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?', - 'description': 'md5:4b7dfd063d5a789a1512e99662be3ddd', + 
'title': 'Things Not To Say to people that live on council estates', + 'description': "From being labelled a 'chav', to the presumption that they're 'scroungers', people who live on council estates encounter all kinds of prejudices and false assumptions about themselves, their families, and their lifestyles. Here, eight people discuss the common statements, misconceptions, and clichés that they're tired of hearing.", + 'duration': 360, + 'thumbnail': r're:https?://.+/.+\.jpg', }, - 'params': { - 'skip_download': True, - } }, { # window.__PRELOADED_STATE__ 'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl', @@ -882,9 +906,8 @@ class BBCIE(BBCCoUkIE): playlist_title = json_ld_info.get('title') if not playlist_title: - playlist_title = self._og_search_title( - webpage, default=None) or self._html_search_regex( - r'<title>(.+?)</title>', webpage, 'playlist title', default=None) + playlist_title = (self._og_search_title(webpage, default=None) + or self._html_extract_title(webpage, 'playlist title', default=None)) if playlist_title: playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip() @@ -1161,9 +1184,16 @@ class BBCIE(BBCCoUkIE): return self.playlist_result( entries, playlist_id, playlist_title, playlist_description) - initial_data = self._parse_json(self._search_regex( - r'window\.__INITIAL_DATA__\s*=\s*({.+?});', webpage, - 'preload state', default='{}'), playlist_id, fatal=False) + initial_data = self._search_regex( + r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage, + 'quoted preload state', default=None) + if initial_data is None: + initial_data = self._search_regex( + r'window\.__INITIAL_DATA__\s*=\s*({.+?})\s*;', webpage, + 'preload state', default={}) + else: + initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False) + initial_data = self._parse_json(initial_data, playlist_id, fatal=False) if initial_data: def parse_media(media): if not media: @@ -1204,7 +1234,10 @@ class BBCIE(BBCCoUkIE): if name == 'media-experience': parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict)) elif name == 'article': - for block in (try_get(resp, lambda x: x['data']['blocks'], list) or []): + for block in (try_get(resp, + (lambda x: x['data']['blocks'], + lambda x: x['data']['content']['model']['blocks'],), + list) or []): if block.get('type') != 'media': continue parse_media(block.get('model')) diff --git a/hypervideo_dl/extractor/beeg.py b/hypervideo_dl/extractor/beeg.py index 8fbabe7..717fff3 100644 --- a/hypervideo_dl/extractor/beeg.py +++ b/hypervideo_dl/extractor/beeg.py @@ -1,32 +1,45 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import ( - compat_str, -) + from ..utils import ( int_or_none, - parse_qs, + traverse_obj, + try_get, unified_timestamp, ) class BeegIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?beeg\.(?:com|porn(?:/video)?)/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?beeg\.(?:com(?:/video)?)/-?(?P<id>\d+)' _TESTS = [{ - # api/v6 v1 - 'url': 'http://beeg.com/5416503', - 'md5': 'a1a1b1a8bc70a89e49ccfd113aed0820', + 'url': 'https://beeg.com/-0983946056129650', + 'md5': '51d235147c4627cfce884f844293ff88', 'info_dict': { - 'id': '5416503', + 'id': '0983946056129650', 'ext': 'mp4', - 'title': 'Sultry Striptease', - 'description': 'md5:d22219c09da287c14bed3d6c37ce4bc2', - 'timestamp': 1391813355, - 'upload_date': '20140207', - 'duration': 383, + 'title': 'sucked cock and fucked in a private plane', + 'duration': 927, 'tags': list, 'age_limit': 18, + 
'upload_date': '20220131', + 'timestamp': 1643656455, + 'display_id': 2540839, + } + }, { + 'url': 'https://beeg.com/-0599050563103750?t=4-861', + 'md5': 'bd8b5ea75134f7f07fad63008db2060e', + 'info_dict': { + 'id': '0599050563103750', + 'ext': 'mp4', + 'title': 'Bad Relatives', + 'duration': 2060, + 'tags': list, + 'age_limit': 18, + 'description': 'md5:b4fc879a58ae6c604f8f259155b7e3b9', + 'timestamp': 1643623200, + 'display_id': 2569965, + 'upload_date': '20220131', } }, { # api/v6 v2 @@ -36,12 +49,6 @@ class BeegIE(InfoExtractor): # api/v6 v2 w/o t 'url': 'https://beeg.com/1277207756', 'only_matching': True, - }, { - 'url': 'https://beeg.porn/video/5416503', - 'only_matching': True, - }, { - 'url': 'https://beeg.porn/5416503', - 'only_matching': True, }] def _real_extract(self, url): @@ -49,68 +56,38 @@ class BeegIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - beeg_version = self._search_regex( - r'beeg_version\s*=\s*([\da-zA-Z_-]+)', webpage, 'beeg version', - default='1546225636701') + video = self._download_json( + 'https://store.externulls.com/facts/file/%s' % video_id, + video_id, 'Downloading JSON for %s' % video_id) - if len(video_id) >= 10: - query = { - 'v': 2, - } - qs = parse_qs(url) - t = qs.get('t', [''])[0].split('-') - if len(t) > 1: - query.update({ - 's': t[0], - 'e': t[1], - }) - else: - query = {'v': 1} + fc_facts = video.get('fc_facts') + first_fact = {} + for fact in fc_facts: + if not first_fact or try_get(fact, lambda x: x['id'] < first_fact['id']): + first_fact = fact - for api_path in ('', 'api.'): - video = self._download_json( - 'https://%sbeeg.com/api/v6/%s/video/%s' - % (api_path, beeg_version, video_id), video_id, - fatal=api_path == 'api.', query=query) - if video: - break + resources = traverse_obj(video, ('file', 'hls_resources')) or first_fact.get('hls_resources') formats = [] - for format_id, video_url in video.items(): - if not video_url: - continue - height = self._search_regex( - r'^(\d+)[pP]$', format_id, 'height', default=None) - if not height: + for format_id, video_uri in resources.items(): + if not video_uri: continue - formats.append({ - 'url': self._proto_relative_url( - video_url.replace('{DATA_MARKERS}', 'data=pc_XX__%s_0' % beeg_version), 'https:'), - 'format_id': format_id, - 'height': int(height), - }) - self._sort_formats(formats) - - title = video['title'] - video_id = compat_str(video.get('id') or video_id) - display_id = video.get('code') - description = video.get('desc') - series = video.get('ps_name') + height = int_or_none(self._search_regex(r'fl_cdn_(\d+)', format_id, 'height', default=None)) + current_formats = self._extract_m3u8_formats(f'https://video.beeg.com/{video_uri}', video_id, ext='mp4', m3u8_id=str(height)) + for f in current_formats: + f['height'] = height + formats.extend(current_formats) - timestamp = unified_timestamp(video.get('date')) - duration = int_or_none(video.get('duration')) - - tags = [tag.strip() for tag in video['tags'].split(',')] if video.get('tags') else None + self._sort_formats(formats) return { 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'series': series, - 'timestamp': timestamp, - 'duration': duration, - 'tags': tags, + 'display_id': first_fact.get('id'), + 'title': traverse_obj(video, ('file', 'stuff', 'sf_name')), + 'description': traverse_obj(video, ('file', 'stuff', 'sf_story')), + 'timestamp': unified_timestamp(first_fact.get('fc_created')), + 'duration': int_or_none(traverse_obj(video, ('file', 'fl_duration'))), + 
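# `...` in the traverse_obj path below is Python's Ellipsis: it fans out over
+ # every entry of the 'tags' list, collecting each tag's 'tg_name' into a flat list
+ 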
'tags': traverse_obj(video, ('tags', ..., 'tg_name')), 'formats': formats, 'age_limit': self._rta_search(webpage), } diff --git a/hypervideo_dl/extractor/bigo.py b/hypervideo_dl/extractor/bigo.py new file mode 100644 index 0000000..ddf76ac --- /dev/null +++ b/hypervideo_dl/extractor/bigo.py @@ -0,0 +1,59 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ExtractorError, urlencode_postdata + + +class BigoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bigo\.tv/(?:[a-z]{2,}/)?(?P<id>[^/]+)' + + _TESTS = [{ + 'url': 'https://www.bigo.tv/ja/221338632', + 'info_dict': { + 'id': '6576287577575737440', + 'title': '土よ〜💁♂️ 休憩室/REST room', + 'thumbnail': r're:https?://.+', + 'uploader': '✨Shin💫', + 'uploader_id': '221338632', + 'is_live': True, + }, + 'skip': 'livestream', + }, { + 'url': 'https://www.bigo.tv/th/Tarlerm1304', + 'only_matching': True, + }, { + 'url': 'https://bigo.tv/115976881', + 'only_matching': True, + }] + + def _real_extract(self, url): + user_id = self._match_id(url) + + info_raw = self._download_json( + 'https://bigo.tv/studio/getInternalStudioInfo', + user_id, data=urlencode_postdata({'siteId': user_id})) + + if not isinstance(info_raw, dict): + raise ExtractorError('Received invalid JSON data') + if info_raw.get('code'): + raise ExtractorError( + 'Bigo says: %s (code %s)' % (info_raw.get('msg'), info_raw.get('code')), expected=True) + info = info_raw.get('data') or {} + + if not info.get('alive'): + raise ExtractorError('This user is offline.', expected=True) + + return { + 'id': info.get('roomId') or user_id, + 'title': info.get('roomTopic') or info.get('nick_name') or user_id, + 'formats': [{ + 'url': info.get('hls_src'), + 'ext': 'mp4', + 'protocol': 'm3u8', + }], + 'thumbnail': info.get('snapshot'), + 'uploader': info.get('nick_name'), + 'uploader_id': user_id, + 'is_live': True, + } diff --git a/hypervideo_dl/extractor/bilibili.py b/hypervideo_dl/extractor/bilibili.py index 8d66b43..909f7f8 100644 --- a/hypervideo_dl/extractor/bilibili.py +++ b/hypervideo_dl/extractor/bilibili.py @@ -1,5 +1,6 @@ # coding: utf-8 +import base64 import hashlib import itertools import functools @@ -14,19 +15,21 @@ from ..compat import ( ) from ..utils import ( ExtractorError, + filter_dict, int_or_none, float_or_none, + mimetype2ext, parse_iso8601, traverse_obj, - try_get, + parse_count, smuggle_url, srt_subtitles_timecode, str_or_none, - str_to_int, strip_jsonp, unified_timestamp, unsmuggle_url, urlencode_postdata, + url_or_none, OnDemandPagedList ) @@ -50,16 +53,14 @@ class BiliBiliIE(InfoExtractor): 'url': 'http://www.bilibili.com/video/av1074402/', 'md5': '5f7d29e1a2872f3df0cf76b1f87d3788', 'info_dict': { - 'id': '1074402', - 'ext': 'flv', + 'id': '1074402_part1', + 'ext': 'mp4', 'title': '【金坷垃】金泡沫', + 'uploader_id': '156160', + 'uploader': '菊子桑', + 'upload_date': '20140420', 'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923', - 'duration': 308.067, 'timestamp': 1398012678, - 'upload_date': '20140420', - 'thumbnail': r're:^https?://.+\.jpg', - 'uploader': '菊子桑', - 'uploader_id': '156160', }, }, { # Tested in BiliBiliBangumiIE @@ -73,49 +74,27 @@ class BiliBiliIE(InfoExtractor): 'url': 'http://bangumi.bilibili.com/anime/5802/play#100643', 'md5': '3f721ad1e75030cc06faf73587cfec57', 'info_dict': { - 'id': '100643', + 'id': '100643_part1', 'ext': 'mp4', 'title': 'CHAOS;CHILD', 'description': '如果你是神明,并且能够让妄想成为现实。那你会进行怎么样的妄想?是淫靡的世界?独裁社会?毁灭性的制裁?还是……2015年,涩谷。从6年前发生的大灾害“涩谷地震”之后复兴了的这个街区里新设立的私立高中...', }, 'skip': 
'Geo-restricted to China', }, { - # Title with double quotes 'url': 'http://www.bilibili.com/video/av8903802/', 'info_dict': { - 'id': '8903802', + 'id': '8903802_part1', + 'ext': 'mp4', 'title': '阿滴英文|英文歌分享#6 "Closer', + 'upload_date': '20170301', 'description': '滴妹今天唱Closer給你聽! 有史以来,被推最多次也是最久的歌曲,其实歌词跟我原本想像差蛮多的,不过还是好听! 微博@阿滴英文', + 'timestamp': 1488382634, + 'uploader_id': '65880958', + 'uploader': '阿滴英文', + }, + 'params': { + 'skip_download': True, }, - 'playlist': [{ - 'info_dict': { - 'id': '8903802_part1', - 'ext': 'flv', - 'title': '阿滴英文|英文歌分享#6 "Closer', - 'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a', - 'uploader': '阿滴英文', - 'uploader_id': '65880958', - 'timestamp': 1488382634, - 'upload_date': '20170301', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'info_dict': { - 'id': '8903802_part2', - 'ext': 'flv', - 'title': '阿滴英文|英文歌分享#6 "Closer', - 'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a', - 'uploader': '阿滴英文', - 'uploader_id': '65880958', - 'timestamp': 1488382634, - 'upload_date': '20170301', - }, - 'params': { - 'skip_download': True, - }, - }] }, { # new BV video id format 'url': 'https://www.bilibili.com/video/BV1JE411F741', @@ -150,6 +129,7 @@ class BiliBiliIE(InfoExtractor): av_id, bv_id = self._get_video_id_set(video_id, mobj.group('id_bv') is not None) video_id = av_id + info = {} anime_id = mobj.group('anime_id') page_id = mobj.group('page') webpage = self._download_webpage(url, video_id) @@ -201,66 +181,95 @@ class BiliBiliIE(InfoExtractor): } headers.update(self.geo_verification_headers()) + video_info = self._parse_json( + self._search_regex(r'window.__playinfo__\s*=\s*({.+?})</script>', webpage, 'video info', default=None) or '{}', + video_id, fatal=False) + video_info = video_info.get('data') or {} + + durl = traverse_obj(video_info, ('dash', 'video')) + audios = traverse_obj(video_info, ('dash', 'audio')) or [] entries = [] RENDITIONS = ('qn=80&quality=80&type=', 'quality=2&type=mp4') for num, rendition in enumerate(RENDITIONS, start=1): payload = 'appkey=%s&cid=%s&otype=json&%s' % (self._APP_KEY, cid, rendition) sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest() - - video_info = self._download_json( - 'http://interface.bilibili.com/v2/playurl?%s&sign=%s' % (payload, sign), - video_id, note='Downloading video info page', - headers=headers, fatal=num == len(RENDITIONS)) - if not video_info: - continue + video_info = self._download_json( + 'http://interface.bilibili.com/v2/playurl?%s&sign=%s' % (payload, sign), + video_id, note='Downloading video info page', + headers=headers, fatal=num == len(RENDITIONS)) + if not video_info: + continue - if 'durl' not in video_info: + if not durl and 'durl' not in video_info: if num < len(RENDITIONS): continue self._report_error(video_info) - for idx, durl in enumerate(video_info['durl']): - formats = [{ - 'url': durl['url'], - 'filesize': int_or_none(durl['size']), - }] - for backup_url in durl.get('backup_url', []): + formats = [] + for idx, durl in enumerate(durl or video_info['durl']): + formats.append({ + 'url': durl.get('baseUrl') or durl.get('base_url') or durl.get('url'), + 'ext': mimetype2ext(durl.get('mimeType') or durl.get('mime_type')), + 'fps': int_or_none(durl.get('frameRate') or durl.get('frame_rate')), + 'width': int_or_none(durl.get('width')), + 'height': int_or_none(durl.get('height')), + 'vcodec': durl.get('codecs'), + 'acodec': 'none' if audios else None, + 'tbr': float_or_none(durl.get('bandwidth'), scale=1000), + 'filesize': int_or_none(durl.get('size')), + 
}) + for backup_url in traverse_obj(durl, 'backup_url', expected_type=list) or []: formats.append({ 'url': backup_url, - # backup URLs have lower priorities 'quality': -2 if 'hd.mp4' in backup_url else -3, }) - for a_format in formats: - a_format.setdefault('http_headers', {}).update({ - 'Referer': url, + for audio in audios: + formats.append({ + 'url': audio.get('baseUrl') or audio.get('base_url') or audio.get('url'), + 'ext': mimetype2ext(audio.get('mimeType') or audio.get('mime_type')), + 'fps': int_or_none(audio.get('frameRate') or audio.get('frame_rate')), + 'width': int_or_none(audio.get('width')), + 'height': int_or_none(audio.get('height')), + 'acodec': audio.get('codecs'), + 'vcodec': 'none', + 'tbr': float_or_none(audio.get('bandwidth'), scale=1000), + 'filesize': int_or_none(audio.get('size')) + }) + for backup_url in traverse_obj(audio, 'backup_url', expected_type=list) or []: + formats.append({ + 'url': backup_url, + # backup URLs have lower priorities + 'quality': -3, }) - self._sort_formats(formats) - - entries.append({ - 'id': '%s_part%s' % (video_id, idx), - 'duration': float_or_none(durl.get('length'), 1000), - 'formats': formats, - }) + info.update({ + 'id': video_id, + 'duration': float_or_none(durl.get('length'), 1000), + 'formats': formats, + 'http_headers': { + 'Referer': url, + }, + }) break - title = self._html_search_regex( - (r'<h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1', - r'(?s)<h1[^>]*>(?P<title>.+?)</h1>'), webpage, 'title', - group='title') + self._sort_formats(formats) + + title = self._html_search_regex(( + r'<h1[^>]+title=(["\'])(?P<content>[^"\']+)', + r'(?s)<h1[^>]*>(?P<content>.+?)</h1>', + self._meta_regex('title') + ), webpage, 'title', group='content', fatal=False) # Get part title for anthologies if page_id is not None: - # TODO: The json is already downloaded by _extract_anthology_entries. Don't redownload for each video - part_title = try_get( - self._download_json( - f'https://api.bilibili.com/x/player/pagelist?bvid={bv_id}&jsonp=jsonp', - video_id, note='Extracting videos in anthology'), - lambda x: x['data'][int(page_id) - 1]['part']) - title = part_title or title + # TODO: The json is already downloaded by _extract_anthology_entries. Don't redownload for each video. 
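+ # (illustrative sketch only, not part of this diff) the repeated request could
+ # be avoided by memoising the pagelist per bv_id, e.g. with a hypothetical
+ # instance-level cache attribute:
+ #   part_info = self._pagelist_cache.setdefault(bv_id, self._download_json(
+ #       f'https://api.bilibili.com/x/player/pagelist?bvid={bv_id}&jsonp=jsonp',
+ #       video_id, note='Extracting videos in anthology'))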
+ part_info = traverse_obj(self._download_json( + f'https://api.bilibili.com/x/player/pagelist?bvid={bv_id}&jsonp=jsonp', + video_id, note='Extracting videos in anthology'), 'data', expected_type=list) + title = title if len(part_info) == 1 else traverse_obj(part_info, (int(page_id) - 1, 'part')) or title description = self._html_search_meta('description', webpage) timestamp = unified_timestamp(self._html_search_regex( @@ -270,15 +279,15 @@ class BiliBiliIE(InfoExtractor): thumbnail = self._html_search_meta(['og:image', 'thumbnailUrl'], webpage) # TODO 'view_count' requires deobfuscating Javascript - info = { - 'id': str(video_id) if page_id is None else '%s_part%s' % (video_id, page_id), + info.update({ + 'id': f'{video_id}_part{page_id or 1}', 'cid': cid, 'title': title, 'description': description, 'timestamp': timestamp, 'thumbnail': thumbnail, 'duration': float_or_none(video_info.get('timelength'), scale=1000), - } + }) uploader_mobj = re.search( r'<a[^>]+href="(?:https?:)?//space\.bilibili\.com/(?P<id>\d+)"[^>]*>\s*(?P<name>[^<]+?)\s*<', @@ -299,7 +308,7 @@ class BiliBiliIE(InfoExtractor): video_id, fatal=False, note='Downloading tags'), ('data', ..., 'tag_name')), } - entries[0]['subtitles'] = { + info['subtitles'] = { 'danmaku': [{ 'ext': 'xml', 'url': f'https://comment.bilibili.com/{cid}.xml', @@ -334,19 +343,18 @@ class BiliBiliIE(InfoExtractor): entry['id'] = '%s_part%d' % (video_id, (idx + 1)) return { - '_type': 'multi_video', 'id': str(video_id), 'bv_id': bv_id, 'title': title, 'description': description, - 'entries': entries, **info, **top_level_info } def _extract_anthology_entries(self, bv_id, video_id, webpage): title = self._html_search_regex( (r'<h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1', - r'(?s)<h1[^>]*>(?P<title>.+?)</h1>'), webpage, 'title', + r'(?s)<h1[^>]*>(?P<title>.+?)</h1>', + r'<title>(?P<title>.+?)</title>'), webpage, 'title', group='title') json_data = self._download_json( f'https://api.bilibili.com/x/player/pagelist?bvid={bv_id}&jsonp=jsonp', @@ -376,8 +384,10 @@ class BiliBiliIE(InfoExtractor): replies = traverse_obj( self._download_json( f'https://api.bilibili.com/x/v2/reply?pn={idx}&oid={video_id}&type=1&jsonp=jsonp&sort=2&_=1567227301685', - video_id, note=f'Extracting comments from page {idx}'), - ('data', 'replies')) or [] + video_id, note=f'Extracting comments from page {idx}', fatal=False), + ('data', 'replies')) + if not replies: + return for children in map(self._get_all_children, replies): yield from children @@ -477,9 +487,9 @@ class BilibiliChannelIE(InfoExtractor): data = self._download_json( self._API_URL % (list_id, page_num), list_id, note=f'Downloading page {page_num}')['data'] - max_count = max_count or try_get(data, lambda x: x['page']['count']) + max_count = max_count or traverse_obj(data, ('page', 'count')) - entries = try_get(data, lambda x: x['list']['vlist']) + entries = traverse_obj(data, ('list', 'vlist')) if not entries: return for entry in entries: @@ -517,7 +527,7 @@ class BilibiliCategoryIE(InfoExtractor): api_url, query, query={'Search_key': query, 'pn': page_num}, note='Extracting results from page %s of %s' % (page_num, num_pages)) - video_list = try_get(parsed_json, lambda x: x['data']['archives'], list) + video_list = traverse_obj(parsed_json, ('data', 'archives'), expected_type=list) if not video_list: raise ExtractorError('Failed to retrieve video list for page %d' % page_num) @@ -547,7 +557,7 @@ class BilibiliCategoryIE(InfoExtractor): api_url = 
'https://api.bilibili.com/x/web-interface/newlist?rid=%d&type=1&ps=20&jsonp=jsonp' % rid_value page_json = self._download_json(api_url, query, query={'Search_key': query, 'pn': '1'}) - page_data = try_get(page_json, lambda x: x['data']['page'], dict) + page_data = traverse_obj(page_json, ('data', 'page'), expected_type=dict) count, size = int_or_none(page_data.get('count')), int_or_none(page_data.get('size')) if count is None or not size: raise ExtractorError('Failed to calculate either page count or size') @@ -566,7 +576,7 @@ class BilibiliCategoryIE(InfoExtractor): class BiliBiliSearchIE(SearchInfoExtractor): - IE_DESC = 'Bilibili video search, "bilisearch" keyword' + IE_DESC = 'Bilibili video search' _MAX_RESULTS = 100000 _SEARCH_KEY = 'bilisearch' @@ -719,40 +729,68 @@ class BiliBiliPlayerIE(InfoExtractor): class BiliIntlBaseIE(InfoExtractor): - _API_URL = 'https://api.bili{}/intl/gateway{}' - - def _call_api(self, type, endpoint, id): - return self._download_json(self._API_URL.format(type, endpoint), id)['data'] + _API_URL = 'https://api.bilibili.tv/intl/gateway' + _NETRC_MACHINE = 'biliintl' + + def _call_api(self, endpoint, *args, **kwargs): + json = self._download_json(self._API_URL + endpoint, *args, **kwargs) + if json.get('code'): + if json['code'] in (10004004, 10004005, 10023006): + self.raise_login_required() + elif json['code'] == 10004001: + self.raise_geo_restricted() + else: + if json.get('message') and str(json['code']) != json['message']: + errmsg = f'{kwargs.get("errnote", "Unable to download JSON metadata")}: {self.IE_NAME} said: {json["message"]}' + else: + errmsg = kwargs.get('errnote', 'Unable to download JSON metadata') + if kwargs.get('fatal'): + raise ExtractorError(errmsg) + else: + self.report_warning(errmsg) + return json.get('data') def json2srt(self, json): data = '\n\n'.join( f'{i + 1}\n{srt_subtitles_timecode(line["from"])} --> {srt_subtitles_timecode(line["to"])}\n{line["content"]}' - for i, line in enumerate(json['body'])) + for i, line in enumerate(json['body']) if line.get('content')) return data - def _get_subtitles(self, type, ep_id): - sub_json = self._call_api(type, f'/m/subtitle?ep_id={ep_id}&platform=web', ep_id) + def _get_subtitles(self, *, ep_id=None, aid=None): + sub_json = self._call_api( + '/web/v2/subtitle', ep_id or aid, note='Downloading subtitles list', + errnote='Unable to download subtitles list', query=filter_dict({ + 'platform': 'web', + 'episode_id': ep_id, + 'aid': aid, + })) subtitles = {} - for sub in sub_json.get('subtitles', []): + for sub in sub_json.get('subtitles') or []: sub_url = sub.get('url') if not sub_url: continue - sub_data = self._download_json(sub_url, ep_id, fatal=False) + sub_data = self._download_json( + sub_url, ep_id or aid, errnote='Unable to download subtitles', fatal=False, + note='Downloading subtitles%s' % f' for {sub["lang"]}' if sub.get('lang') else '') if not sub_data: continue - subtitles.setdefault(sub.get('key', 'en'), []).append({ + subtitles.setdefault(sub.get('lang_key', 'en'), []).append({ 'ext': 'srt', 'data': self.json2srt(sub_data) }) return subtitles - def _get_formats(self, type, ep_id): - video_json = self._call_api(type, f'/web/playurl?ep_id={ep_id}&platform=web', ep_id) - if not video_json: - self.raise_login_required(method='cookies') + def _get_formats(self, *, ep_id=None, aid=None): + video_json = self._call_api( + '/web/playurl', ep_id or aid, note='Downloading video formats', + errnote='Unable to download video formats', query=filter_dict({ + 'platform': 'web', + 'ep_id': ep_id, + 
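# filter_dict drops entries whose value is None, so only whichever of
+ # ep_id / aid is actually set reaches the query string
+ 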
'aid': aid, + })) video_json = video_json['playurl'] formats = [] - for vid in video_json.get('video', []): + for vid in video_json.get('video') or []: video_res = vid.get('video_resource') or {} video_info = vid.get('stream_info') or {} if not video_res.get('url'): @@ -768,7 +806,7 @@ class BiliIntlBaseIE(InfoExtractor): 'vcodec': video_res.get('codecs'), 'filesize': video_res.get('size'), }) - for aud in video_json.get('audio_resource', []): + for aud in video_json.get('audio_resource') or []: if not aud.get('url'): continue formats.append({ @@ -783,85 +821,148 @@ class BiliIntlBaseIE(InfoExtractor): self._sort_formats(formats) return formats - def _extract_ep_info(self, type, episode_data, ep_id): + def _extract_video_info(self, video_data, *, ep_id=None, aid=None): return { - 'id': ep_id, - 'title': episode_data.get('long_title') or episode_data['title'], - 'thumbnail': episode_data.get('cover'), - 'episode_number': str_to_int(episode_data.get('title')), - 'formats': self._get_formats(type, ep_id), - 'subtitles': self._get_subtitles(type, ep_id), + 'id': ep_id or aid, + 'title': video_data.get('title_display') or video_data.get('title'), + 'thumbnail': video_data.get('cover'), + 'episode_number': int_or_none(self._search_regex( + r'^E(\d+)(?:$| - )', video_data.get('title_display') or '', 'episode number', default=None)), + 'formats': self._get_formats(ep_id=ep_id, aid=aid), + 'subtitles': self._get_subtitles(ep_id=ep_id, aid=aid), 'extractor_key': BiliIntlIE.ie_key(), } + def _perform_login(self, username, password): + try: + from Cryptodome.PublicKey import RSA + from Cryptodome.Cipher import PKCS1_v1_5 + except ImportError: + try: + from Crypto.PublicKey import RSA + from Crypto.Cipher import PKCS1_v1_5 + except ImportError: + raise ExtractorError('pycryptodomex not found. 
Please install', expected=True) + + key_data = self._download_json( + 'https://passport.bilibili.tv/x/intl/passport-login/web/key?lang=en-US', None, + note='Downloading login key', errnote='Unable to download login key')['data'] + + public_key = RSA.importKey(key_data['key']) + password_hash = PKCS1_v1_5.new(public_key).encrypt((key_data['hash'] + password).encode('utf-8')) + login_post = self._download_json( + 'https://passport.bilibili.tv/x/intl/passport-login/web/login/password?lang=en-US', None, data=urlencode_postdata({ + 'username': username, + 'password': base64.b64encode(password_hash).decode('ascii'), + 'keep_me': 'true', + 's_locale': 'en_US', + 'isTrusted': 'true' + }), note='Logging in', errnote='Unable to log in') + if login_post.get('code'): + if login_post.get('message'): + raise ExtractorError(f'Unable to log in: {self.IE_NAME} said: {login_post["message"]}', expected=True) + else: + raise ExtractorError('Unable to log in') + class BiliIntlIE(BiliIntlBaseIE): - _VALID_URL = r'https?://(?:www\.)?bili(?P<type>bili\.tv|intl.com)/(?:[a-z]{2}/)?play/(?P<season_id>\d+)/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-z]{2}/)?(play/(?P<season_id>\d+)/(?P<ep_id>\d+)|video/(?P<aid>\d+))' _TESTS = [{ + # Bstation page 'url': 'https://www.bilibili.tv/en/play/34613/341736', 'info_dict': { 'id': '341736', 'ext': 'mp4', - 'title': 'The First Night', - 'thumbnail': 'https://i0.hdslb.com/bfs/intl/management/91e30e5521235d9b163339a26a0b030ebda54310.png', + 'title': 'E2 - The First Night', + 'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$', 'episode_number': 2, - }, - 'params': { - 'format': 'bv', - }, + } }, { - 'url': 'https://www.biliintl.com/en/play/34613/341736', + # Non-Bstation page + 'url': 'https://www.bilibili.tv/en/play/1033760/11005006', 'info_dict': { - 'id': '341736', + 'id': '11005006', 'ext': 'mp4', - 'title': 'The First Night', - 'thumbnail': 'https://i0.hdslb.com/bfs/intl/management/91e30e5521235d9b163339a26a0b030ebda54310.png', - 'episode_number': 2, - }, - 'params': { - 'format': 'bv', + 'title': 'E3 - Who?', + 'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$', + 'episode_number': 3, + } + }, { + # Subtitle with empty content + 'url': 'https://www.bilibili.tv/en/play/1005144/10131790', + 'info_dict': { + 'id': '10131790', + 'ext': 'mp4', + 'title': 'E140 - Two Heartbeats: Kabuto\'s Trap', + 'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$', + 'episode_number': 140, }, + 'skip': 'According to the copyright owner\'s request, you may only watch the video after you log in.' 
+ }, { + 'url': 'https://www.biliintl.com/en/play/34613/341736', + 'only_matching': True, + }, { + # User-generated content (as opposed to a series licensed from a studio) + 'url': 'https://bilibili.tv/en/video/2019955076', + 'only_matching': True, + }, { + # No language in URL + 'url': 'https://www.bilibili.tv/video/2019955076', + 'only_matching': True, }] def _real_extract(self, url): - type, season_id, id = self._match_valid_url(url).groups() - data_json = self._call_api(type, f'/web/view/ogv_collection?season_id={season_id}', id) - episode_data = next( - episode for episode in data_json.get('episodes', []) - if str(episode.get('ep_id')) == id) - return self._extract_ep_info(type, episode_data, id) + season_id, ep_id, aid = self._match_valid_url(url).group('season_id', 'ep_id', 'aid') + video_id = ep_id or aid + webpage = self._download_webpage(url, video_id) + # Bstation layout + initial_data = self._parse_json(self._search_regex( + r'window\.__INITIAL_(?:DATA|STATE)__\s*=\s*({.+?});', webpage, + 'preload state', default='{}'), video_id, fatal=False) or {} + video_data = ( + traverse_obj(initial_data, ('OgvVideo', 'epDetail'), expected_type=dict) + or traverse_obj(initial_data, ('UgcVideo', 'videoData'), expected_type=dict) or {}) + + if season_id and not video_data: + # Non-Bstation layout, read through episode list + season_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={season_id}&platform=web', video_id) + video_data = traverse_obj(season_json, + ('sections', ..., 'episodes', lambda _, v: str(v['episode_id']) == ep_id), + expected_type=dict, get_all=False) + return self._extract_video_info(video_data, ep_id=ep_id, aid=aid) class BiliIntlSeriesIE(BiliIntlBaseIE): - _VALID_URL = r'https?://(?:www\.)?bili(?P<type>bili\.tv|intl.com)/(?:[a-z]{2}/)?play/(?P<id>\d+)$' + _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-z]{2}/)?play/(?P<id>\d+)$' _TESTS = [{ 'url': 'https://www.bilibili.tv/en/play/34613', 'playlist_mincount': 15, 'info_dict': { 'id': '34613', + 'title': 'Fly Me to the Moon', + 'description': 'md5:a861ee1c4dc0acfad85f557cc42ac627', + 'categories': ['Romance', 'Comedy', 'Slice of life'], + 'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$', + 'view_count': int, }, 'params': { 'skip_download': True, - 'format': 'bv', }, }, { 'url': 'https://www.biliintl.com/en/play/34613', - 'playlist_mincount': 15, - 'info_dict': { - 'id': '34613', - }, - 'params': { - 'skip_download': True, - 'format': 'bv', - }, + 'only_matching': True, }] - def _entries(self, id, type): - data_json = self._call_api(type, f'/web/view/ogv_collection?season_id={id}', id) - for episode in data_json.get('episodes', []): - episode_id = str(episode.get('ep_id')) - yield self._extract_ep_info(type, episode, episode_id) + def _entries(self, series_id): + series_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={series_id}&platform=web', series_id) + for episode in traverse_obj(series_json, ('sections', ..., 'episodes', ...), expected_type=dict, default=[]): + episode_id = str(episode.get('episode_id')) + yield self._extract_video_info(episode, ep_id=episode_id) def _real_extract(self, url): - type, id = self._match_valid_url(url).groups() - return self.playlist_result(self._entries(id, type), playlist_id=id) + series_id = self._match_id(url) + series_info = self._call_api(f'/web/v2/ogv/play/season_info?season_id={series_id}&platform=web', series_id).get('season') or {} + return self.playlist_result( + self._entries(series_id), series_id, series_info.get('title'), 
series_info.get('description'), + categories=traverse_obj(series_info, ('styles', ..., 'title'), expected_type=str_or_none), + thumbnail=url_or_none(series_info.get('horizontal_cover')), view_count=parse_count(series_info.get('view'))) diff --git a/hypervideo_dl/extractor/biqle.py b/hypervideo_dl/extractor/biqle.py index 17ebbb2..2b57bad 100644 --- a/hypervideo_dl/extractor/biqle.py +++ b/hypervideo_dl/extractor/biqle.py @@ -3,27 +3,28 @@ from __future__ import unicode_literals from .common import InfoExtractor from .vk import VKIE -from ..compat import ( - compat_b64decode, - compat_urllib_parse_unquote, +from ..compat import compat_b64decode +from ..utils import ( + int_or_none, + js_to_json, + traverse_obj, + unified_timestamp, ) -from ..utils import int_or_none class BIQLEIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?biqle\.(?:com|org|ru)/watch/(?P<id>-?\d+_\d+)' _TESTS = [{ - # Youtube embed - 'url': 'https://biqle.ru/watch/-115995369_456239081', - 'md5': '97af5a06ee4c29bbf9c001bdb1cf5c06', + 'url': 'https://biqle.ru/watch/-2000421746_85421746', + 'md5': 'ae6ef4f04d19ac84e4658046d02c151c', 'info_dict': { - 'id': '8v4f-avW-VI', + 'id': '-2000421746_85421746', 'ext': 'mp4', - 'title': "PASSE-PARTOUT - L'ete c'est fait pour jouer", - 'description': 'Passe-Partout', - 'uploader_id': 'mrsimpsonstef3', - 'uploader': 'Phanolito', - 'upload_date': '20120822', + 'title': 'Forsaken By Hope Studio Clip', + 'description': 'Forsaken By Hope Studio Clip — Смотреть онлайн', + 'upload_date': '19700101', + 'thumbnail': r're:https://[^/]+/impf/7vN3ACwSTgChP96OdOfzFjUCzFR6ZglDQgWsIw/KPaACiVJJxM\.jpg\?size=800x450&quality=96&keep_aspect_ratio=1&background=000000&sign=b48ea459c4d33dbcba5e26d63574b1cb&type=video_thumb', + 'timestamp': 0, }, }, { 'url': 'http://biqle.org/watch/-44781847_168547604', @@ -32,53 +33,62 @@ class BIQLEIE(InfoExtractor): 'id': '-44781847_168547604', 'ext': 'mp4', 'title': 'Ребенок в шоке от автоматической мойки', + 'description': 'Ребенок в шоке от автоматической мойки — Смотреть онлайн', 'timestamp': 1396633454, - 'uploader': 'Dmitry Kotov', 'upload_date': '20140404', - 'uploader_id': '47850140', + 'thumbnail': r're:https://[^/]+/c535507/u190034692/video/l_b84df002\.jpg', }, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - embed_url = self._proto_relative_url(self._search_regex( - r'<iframe.+?src="((?:https?:)?//(?:daxab\.com|dxb\.to|[^/]+/player)/[^"]+)".*?></iframe>', - webpage, 'embed url')) + + title = self._html_search_meta('name', webpage, 'Title', fatal=False) + timestamp = unified_timestamp(self._html_search_meta('uploadDate', webpage, 'Upload Date', default=None)) + description = self._html_search_meta('description', webpage, 'Description', default=None) + + global_embed_url = self._search_regex( + r'<script[^<]+?window.globEmbedUrl\s*=\s*\'((?:https?:)?//(?:daxab\.com|dxb\.to|[^/]+/player)/[^\']+)\'', + webpage, 'global Embed url') + hash = self._search_regex( + r'<script id="data-embed-video[^<]+?hash: "([^"]+)"[^<]*</script>', webpage, 'Hash') + + embed_url = global_embed_url + hash + if VKIE.suitable(embed_url): return self.url_result(embed_url, VKIE.ie_key(), video_id) embed_page = self._download_webpage( - embed_url, video_id, headers={'Referer': url}) - video_ext = self._get_cookies(embed_url).get('video_ext') - if video_ext: - video_ext = compat_urllib_parse_unquote(video_ext.value) - if not video_ext: - video_ext = compat_b64decode(self._search_regex( - 
r'video_ext\s*:\s*[\'"]([A-Za-z0-9+/=]+)', - embed_page, 'video_ext')).decode() - video_id, sig, _, access_token = video_ext.split(':') + embed_url, video_id, 'Downloading embed webpage', headers={'Referer': url}) + + glob_params = self._parse_json(self._search_regex( + r'<script id="globParams">[^<]*window.globParams = ([^;]+);[^<]+</script>', + embed_page, 'Global Parameters'), video_id, transform_source=js_to_json) + host_name = compat_b64decode(glob_params['server'][::-1]).decode() + item = self._download_json( - 'https://api.vk.com/method/video.get', video_id, - headers={'User-Agent': 'okhttp/3.4.1'}, query={ - 'access_token': access_token, - 'sig': sig, - 'v': 5.44, + f'https://{host_name}/method/video.get/{video_id}', video_id, + headers={'Referer': url}, query={ + 'token': glob_params['video']['access_token'], 'videos': video_id, + 'ckey': glob_params['c_key'], + 'credentials': glob_params['video']['credentials'], })['response']['items'][0] - title = item['title'] formats = [] for f_id, f_url in item.get('files', {}).items(): if f_id == 'external': return self.url_result(f_url) ext, height = f_id.split('_') - formats.append({ - 'format_id': height + 'p', - 'url': f_url, - 'height': int_or_none(height), - 'ext': ext, - }) + height_extra_key = traverse_obj(glob_params, ('video', 'partial', 'quality', height)) + if height_extra_key: + formats.append({ + 'format_id': f'{height}p', + 'url': f'https://{host_name}/{f_url[8:]}&videos={video_id}&extra_key={height_extra_key}', + 'height': int_or_none(height), + 'ext': ext, + }) self._sort_formats(formats) thumbnails = [] @@ -96,10 +106,9 @@ class BIQLEIE(InfoExtractor): 'title': title, 'formats': formats, 'comment_count': int_or_none(item.get('comments')), - 'description': item.get('description'), + 'description': description, 'duration': int_or_none(item.get('duration')), 'thumbnails': thumbnails, - 'timestamp': int_or_none(item.get('date')), - 'uploader': item.get('owner_id'), + 'timestamp': timestamp, 'view_count': int_or_none(item.get('views')), } diff --git a/hypervideo_dl/extractor/bitwave.py b/hypervideo_dl/extractor/bitwave.py index eb16c46..e6e093f 100644 --- a/hypervideo_dl/extractor/bitwave.py +++ b/hypervideo_dl/extractor/bitwave.py @@ -51,7 +51,7 @@ class BitwaveStreamIE(InfoExtractor): return { 'id': username, - 'title': self._live_title(channel['data']['title']), + 'title': channel['data']['title'], 'uploader': username, 'uploader_id': username, 'formats': formats, diff --git a/hypervideo_dl/extractor/blogger.py b/hypervideo_dl/extractor/blogger.py new file mode 100644 index 0000000..dba131c --- /dev/null +++ b/hypervideo_dl/extractor/blogger.py @@ -0,0 +1,54 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from ..utils import ( + mimetype2ext, + parse_duration, + parse_qs, + str_or_none, + traverse_obj, +) +from .common import InfoExtractor + + +class BloggerIE(InfoExtractor): + IE_NAME = 'blogger.com' + _VALID_URL = r'https?://(?:www\.)?blogger\.com/video\.g\?token=(?P<id>.+)' + _VALID_EMBED = r'''<iframe[^>]+src=["']((?:https?:)?//(?:www\.)?blogger\.com/video\.g\?token=[^"']+)["']''' + _TESTS = [{ + 'url': 'https://www.blogger.com/video.g?token=AD6v5dzEe9hfcARr5Hlq1WTkYy6t-fXH3BBahVhGvVHe5szdEUBEloSEDSTA8-b111089KbfWuBvTN7fnbxMtymsHhXAXwVvyzHH4Qch2cfLQdGxKQrrEuFpC1amSl_9GuLWODjPgw', + 'md5': 'f1bc19b6ea1b0fd1d81e84ca9ec467ac', + 'info_dict': { + 'id': 'BLOGGER-video-3c740e3a49197e16-796', + 'title': 'BLOGGER-video-3c740e3a49197e16-796', + 'ext': 'mp4', + 'thumbnail': r're:^https?://.*', + 
'duration': 76.068, + } + }] + + @staticmethod + def _extract_urls(webpage): + return re.findall(BloggerIE._VALID_EMBED, webpage) + + def _real_extract(self, url): + token_id = self._match_id(url) + webpage = self._download_webpage(url, token_id) + data_json = self._search_regex(r'var\s+VIDEO_CONFIG\s*=\s*(\{.*)', webpage, 'JSON data') + data = self._parse_json(data_json.encode('utf-8').decode('unicode_escape'), token_id) + streams = data['streams'] + formats = [{ + 'ext': mimetype2ext(traverse_obj(parse_qs(stream['play_url']), ('mime', 0))), + 'url': stream['play_url'], + 'format_id': str_or_none(stream.get('format_id')), + } for stream in streams] + + return { + 'id': data.get('iframe_id', token_id), + 'title': data.get('iframe_id', token_id), + 'formats': formats, + 'thumbnail': data.get('thumbnail'), + 'duration': parse_duration(traverse_obj(parse_qs(streams[0]['play_url']), ('dur', 0))), + } diff --git a/hypervideo_dl/extractor/bongacams.py b/hypervideo_dl/extractor/bongacams.py index 9e75511..4e346e7 100644 --- a/hypervideo_dl/extractor/bongacams.py +++ b/hypervideo_dl/extractor/bongacams.py @@ -49,7 +49,7 @@ class BongaCamsIE(InfoExtractor): return { 'id': channel_id, - 'title': self._live_title(uploader or uploader_id), + 'title': uploader or uploader_id, 'uploader': uploader, 'uploader_id': uploader_id, 'like_count': like_count, diff --git a/hypervideo_dl/extractor/br.py b/hypervideo_dl/extractor/br.py index 7169ece..0155827 100644 --- a/hypervideo_dl/extractor/br.py +++ b/hypervideo_dl/extractor/br.py @@ -175,7 +175,7 @@ class BRIE(InfoExtractor): class BRMediathekIE(InfoExtractor): IE_DESC = 'Bayerischer Rundfunk Mediathek' - _VALID_URL = r'https?://(?:www\.)?br\.de/mediathek/video/[^/?&#]*?-(?P<id>av:[0-9a-f]{24})' + _VALID_URL = r'https?://(?:www\.)?br\.de/mediathek//?video/(?:[^/?&#]+?-)?(?P<id>av:[0-9a-f]{24})' _TESTS = [{ 'url': 'https://www.br.de/mediathek/video/gesundheit-die-sendung-vom-28112017-av:5a1e6a6e8fce6d001871cc8e', @@ -188,6 +188,9 @@ class BRMediathekIE(InfoExtractor): 'timestamp': 1511942766, 'upload_date': '20171129', } + }, { + 'url': 'https://www.br.de/mediathek//video/av:61b0db581aed360007558c12', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/hypervideo_dl/extractor/breitbart.py b/hypervideo_dl/extractor/breitbart.py new file mode 100644 index 0000000..e029aa6 --- /dev/null +++ b/hypervideo_dl/extractor/breitbart.py @@ -0,0 +1,38 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class BreitBartIE(InfoExtractor): + _VALID_URL = r'https?:\/\/(?:www\.)breitbart.com/videos/v/(?P<id>[^/]+)' + _TESTS = [{ + 'url': 'https://www.breitbart.com/videos/v/5cOz1yup/?pl=Ij6NDOji', + 'md5': '0aa6d1d6e183ac5ca09207fe49f17ade', + 'info_dict': { + 'id': '5cOz1yup', + 'ext': 'mp4', + 'title': 'Watch \u2013 Clyburn: Statues in Congress Have to Go Because they Are Honoring Slavery', + 'description': 'md5:bac35eb0256d1cb17f517f54c79404d5', + 'thumbnail': 'https://cdn.jwplayer.com/thumbs/5cOz1yup-1920.jpg', + 'age_limit': 0, + } + }, { + 'url': 'https://www.breitbart.com/videos/v/eaiZjVOn/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + formats = self._extract_m3u8_formats(f'https://cdn.jwplayer.com/manifests/{video_id}.m3u8', video_id, ext='mp4') + self._sort_formats(formats) + return { + 'id': video_id, + 'title': (self._og_search_title(webpage, default=None) + or self._html_extract_title(webpage, 'video 
title')),
+ 'description': self._og_search_description(webpage),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'age_limit': self._rta_search(webpage),
+ 'formats': formats
+ }
diff --git a/hypervideo_dl/extractor/brightcove.py b/hypervideo_dl/extractor/brightcove.py
index cd1c3f0..dcd332b 100644
--- a/hypervideo_dl/extractor/brightcove.py
+++ b/hypervideo_dl/extractor/brightcove.py
@@ -16,6 +16,7 @@ from ..compat import (
 )
 from ..utils import (
 clean_html,
+ dict_get,
 extract_attributes,
 ExtractorError,
 find_xpath_attr,
@@ -471,32 +472,22 @@ class BrightcoveNewIE(AdobePassIE):
 def _parse_brightcove_metadata(self, json_data, video_id, headers={}):
 title = json_data['name'].strip()
- num_drm_sources = 0
 formats, subtitles = [], {}
 sources = json_data.get('sources') or []
 for source in sources:
 container = source.get('container')
 ext = mimetype2ext(source.get('type'))
 src = source.get('src')
- skip_unplayable = not self.get_param('allow_unplayable_formats')
- # https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object
- if skip_unplayable and (container == 'WVM' or source.get('key_systems')):
- num_drm_sources += 1
- continue
- elif ext == 'ism' and skip_unplayable:
- continue
- elif ext == 'm3u8' or container == 'M2TS':
+ if ext == 'm3u8' or container == 'M2TS':
 if not src:
 continue
- f, subs = self._extract_m3u8_formats_and_subtitles(
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(
 src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)
- formats.extend(f)
 subtitles = self._merge_subtitles(subtitles, subs)
 elif ext == 'mpd':
 if not src:
 continue
- f, subs = self._extract_mpd_formats_and_subtitles(src, video_id, 'dash', fatal=False)
- formats.extend(f)
+ fmts, subs = self._extract_mpd_formats_and_subtitles(src, video_id, 'dash', fatal=False)
 subtitles = self._merge_subtitles(subtitles, subs)
 else:
 streaming_src = source.get('streaming_src')
@@ -543,7 +534,13 @@ class BrightcoveNewIE(AdobePassIE):
 'play_path': stream_name,
 'format_id': build_format_id('rtmp'),
 })
- formats.append(f)
+ fmts = [f]
+
+ # https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object
+ if container == 'WVM' or source.get('key_systems') or ext == 'ism':
+ for f in fmts:
+ f['has_drm'] = True
+ formats.extend(fmts)
 if not formats:
 errors = json_data.get('errors')
@@ -551,9 +548,6 @@ class BrightcoveNewIE(AdobePassIE):
 error = errors[0]
 self.raise_no_formats(
 error.get('message') or error.get('error_subcode') or error['error_code'], expected=True)
- elif (not self.get_param('allow_unplayable_formats')
- and sources and num_drm_sources == len(sources)):
- self.report_drm(video_id)
 self._sort_formats(formats)
@@ -577,11 +571,19 @@ class BrightcoveNewIE(AdobePassIE):
 if duration is not None and duration <= 0:
 is_live = True
+ common_res = [(160, 90), (320, 180), (480, 270), (640, 360), (768, 432), (1024, 576), (1280, 720), (1366, 768), (1920, 1080)]
+ thumb_base_url = dict_get(json_data, ('poster', 'thumbnail'))
+ thumbnails = [{
+ 'url': re.sub(r'\d+x\d+', f'{w}x{h}', thumb_base_url),
+ 'width': w,
+ 'height': h,
+ } for w, h in common_res] if thumb_base_url else None
+
 return {
 'id': video_id,
- 'title': self._live_title(title) if is_live else title,
+ 'title': title,
 'description': clean_html(json_data.get('description')),
- 'thumbnail': json_data.get('thumbnail') or json_data.get('poster'),
+ 'thumbnails': thumbnails,
 'duration': duration,
 'timestamp': parse_iso8601(json_data.get('published_at')),
 'uploader_id': 
json_data.get('account_id'),
diff --git a/hypervideo_dl/extractor/cableav.py b/hypervideo_dl/extractor/cableav.py
new file mode 100644
index 0000000..77efdf4
--- /dev/null
+++ b/hypervideo_dl/extractor/cableav.py
@@ -0,0 +1,34 @@
+# coding: utf-8
+from .common import InfoExtractor
+
+
+class CableAVIE(InfoExtractor):
+ _VALID_URL = r'https://cableav\.tv/(?P<id>[a-zA-Z0-9]+)'
+ _TESTS = [{
+ 'url': 'https://cableav.tv/lS4iR9lWjN8/',
+ 'md5': '7e3fe5e49d61c4233b7f5b0f69b15e18',
+ 'info_dict': {
+ 'id': 'lS4iR9lWjN8',
+ 'ext': 'mp4',
+ 'title': '國產麻豆AV 叮叮映畫 DDF001 情欲小說家 - CableAV',
+ 'description': '國產AV 480p, 720p 国产麻豆AV 叮叮映画 DDF001 情欲小说家',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ }
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ video_url = self._og_search_video_url(webpage, secure=False)
+
+ formats = self._extract_m3u8_formats(video_url, video_id, 'mp4')
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': self._og_search_title(webpage),
+ 'description': self._og_search_description(webpage),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'formats': formats,
+ }
diff --git a/hypervideo_dl/extractor/callin.py b/hypervideo_dl/extractor/callin.py
new file mode 100644
index 0000000..1f3b7cf
--- /dev/null
+++ b/hypervideo_dl/extractor/callin.py
@@ -0,0 +1,114 @@
+# coding: utf-8
+from .common import InfoExtractor
+from ..utils import (
+ traverse_obj,
+ float_or_none,
+ int_or_none
+)
+
+
+class CallinIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?callin\.com/(episode)/(?P<id>[-a-zA-Z]+)'
+ _TESTS = [{
+ 'url': 'https://www.callin.com/episode/the-title-ix-regime-and-the-long-march-through-EBfXYSrsjc',
+ 'info_dict': {
+ 'id': '218b979630a35ead12c6fd096f2996c56c37e4d0dc1f6dc0feada32dcf7b31cd',
+ 'title': 'The Title IX Regime and the Long March Through and Beyond the Institutions',
+ 'ext': 'ts',
+ 'display_id': 'the-title-ix-regime-and-the-long-march-through-EBfXYSrsjc',
+ 'thumbnail': 're:https://.+\\.png',
+ 'description': 'First episode',
+ 'uploader': 'Wesley Yang',
+ 'timestamp': 1639404128.65,
+ 'upload_date': '20211213',
+ 'uploader_id': 'wesyang',
+ 'uploader_url': 'http://wesleyyang.substack.com',
+ 'channel': 'Conversations in Year Zero',
+ 'channel_id': '436d1f82ddeb30cd2306ea9156044d8d2cfdc3f1f1552d245117a42173e78553',
+ 'channel_url': 'https://callin.com/show/conversations-in-year-zero-oJNllRFSfx',
+ 'duration': 9951.936,
+ 'view_count': int,
+ 'categories': ['News & Politics', 'History', 'Technology'],
+ 'cast': ['Wesley Yang', 'KC Johnson', 'Gabi Abramovich'],
+ 'series': 'Conversations in Year Zero',
+ 'series_id': '436d1f82ddeb30cd2306ea9156044d8d2cfdc3f1f1552d245117a42173e78553',
+ 'episode': 'The Title IX Regime and the Long March Through and Beyond the Institutions',
+ 'episode_number': 1,
+ 'episode_id': '218b979630a35ead12c6fd096f2996c56c37e4d0dc1f6dc0feada32dcf7b31cd'
+ }
+ }]
+
+ def try_get_user_name(self, d):
+ names = [d.get(n) for n in ('first', 'last')]
+ if None in names:
+ return next((n for n in names if n), None)
+ return ' '.join(names)
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+
+ next_data = self._search_nextjs_data(webpage, display_id)
+ episode = next_data['props']['pageProps']['episode']
+
+ id = episode['id']
+ title = (episode.get('title')
+ or self._og_search_title(webpage, fatal=False)
+ or self._html_extract_title(webpage))
+ url = episode['m3u8']
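+ # the Next.js page data above exposes a raw HLS manifest URL per episode;
+ # all formats below come straight from that playlist (hence the 'ts' container)
+ 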
formats = self._extract_m3u8_formats(url, display_id, ext='ts') + self._sort_formats(formats) + + show = traverse_obj(episode, ('show', 'title')) + show_id = traverse_obj(episode, ('show', 'id')) + + show_json = None + app_slug = (self._html_search_regex( + '<script\\s+src=["\']/_next/static/([-_a-zA-Z0-9]+)/_', + webpage, 'app slug', fatal=False) or next_data.get('buildId')) + show_slug = traverse_obj(episode, ('show', 'linkObj', 'resourceUrl')) + if app_slug and show_slug and '/' in show_slug: + show_slug = show_slug.rsplit('/', 1)[1] + show_json_url = f'https://www.callin.com/_next/data/{app_slug}/show/{show_slug}.json' + show_json = self._download_json(show_json_url, display_id, fatal=False) + + host = (traverse_obj(show_json, ('pageProps', 'show', 'hosts', 0)) + or traverse_obj(episode, ('speakers', 0))) + + host_nick = traverse_obj(host, ('linkObj', 'resourceUrl')) + host_nick = host_nick.rsplit('/', 1)[1] if (host_nick and '/' in host_nick) else None + + cast = list(filter(None, [ + self.try_get_user_name(u) for u in + traverse_obj(episode, (('speakers', 'callerTags'), ...)) or [] + ])) + + episode_list = traverse_obj(show_json, ('pageProps', 'show', 'episodes')) or [] + episode_number = next( + (len(episode_list) - i for (i, e) in enumerate(episode_list) if e.get('id') == id), + None) + + return { + 'id': id, + 'display_id': display_id, + 'title': title, + 'formats': formats, + 'thumbnail': traverse_obj(episode, ('show', 'photo')), + 'description': episode.get('description'), + 'uploader': self.try_get_user_name(host) if host else None, + 'timestamp': episode.get('publishedAt'), + 'uploader_id': host_nick, + 'uploader_url': traverse_obj(show_json, ('pageProps', 'show', 'url')), + 'channel': show, + 'channel_id': show_id, + 'channel_url': traverse_obj(episode, ('show', 'linkObj', 'resourceUrl')), + 'duration': float_or_none(episode.get('runtime')), + 'view_count': int_or_none(episode.get('plays')), + 'categories': traverse_obj(episode, ('show', 'categorizations', ..., 'name')), + 'cast': cast if cast else None, + 'series': show, + 'series_id': show_id, + 'episode': title, + 'episode_number': episode_number, + 'episode_id': id + } diff --git a/hypervideo_dl/extractor/caltrans.py b/hypervideo_dl/extractor/caltrans.py new file mode 100644 index 0000000..9ac740f --- /dev/null +++ b/hypervideo_dl/extractor/caltrans.py @@ -0,0 +1,41 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class CaltransIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^/]+\.)?ca\.gov/vm/loc/[^/]+/(?P<id>[a-z0-9_]+)\.htm' + _TEST = { + 'url': 'https://cwwp2.dot.ca.gov/vm/loc/d3/hwy50at24th.htm', + 'info_dict': { + 'id': 'hwy50at24th', + 'ext': 'ts', + 'title': 'US-50 : Sacramento : Hwy 50 at 24th', + 'live_status': 'is_live', + 'thumbnail': 'https://cwwp2.dot.ca.gov/data/d3/cctv/image/hwy50at24th/hwy50at24th.jpg', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + global_vars = self._search_regex( + r'<script[^<]+?([^<]+\.m3u8[^<]+)</script>', + webpage, 'Global Vars') + route_place = self._search_regex(r'routePlace\s*=\s*"([^"]+)"', global_vars, 'Route Place', fatal=False) + location_name = self._search_regex(r'locationName\s*=\s*"([^"]+)"', global_vars, 'Location Name', fatal=False) + poster_url = self._search_regex(r'posterURL\s*=\s*"([^"]+)"', global_vars, 'Poster Url', fatal=False) + video_stream = self._search_regex(r'videoStreamURL\s*=\s*"([^"]+)"', global_vars, 'Video Stream 
URL', fatal=False) + + formats = self._extract_m3u8_formats(video_stream, video_id, 'ts', live=True) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': f'{route_place} : {location_name}', + 'is_live': True, + 'formats': formats, + 'thumbnail': poster_url, + } diff --git a/hypervideo_dl/extractor/cam4.py b/hypervideo_dl/extractor/cam4.py index 30daf2b..2a3931f 100644 --- a/hypervideo_dl/extractor/cam4.py +++ b/hypervideo_dl/extractor/cam4.py @@ -13,6 +13,8 @@ class CAM4IE(InfoExtractor): 'ext': 'mp4', 'title': 're:^foxynesss [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'age_limit': 18, + 'live_status': 'is_live', + 'thumbnail': 'https://snapshots.xcdnpro.com/thumbnails/foxynesss', } } @@ -25,8 +27,9 @@ class CAM4IE(InfoExtractor): return { 'id': channel_id, - 'title': self._live_title(channel_id), + 'title': channel_id, 'is_live': True, 'age_limit': 18, 'formats': formats, + 'thumbnail': f'https://snapshots.xcdnpro.com/thumbnails/{channel_id}', } diff --git a/hypervideo_dl/extractor/cammodels.py b/hypervideo_dl/extractor/cammodels.py index eb2a8b4..3dc1937 100644 --- a/hypervideo_dl/extractor/cammodels.py +++ b/hypervideo_dl/extractor/cammodels.py @@ -91,7 +91,7 @@ class CamModelsIE(InfoExtractor): return { 'id': user_id, - 'title': self._live_title(user_id), + 'title': user_id, 'is_live': True, 'formats': formats, 'age_limit': 18 diff --git a/hypervideo_dl/extractor/canalalpha.py b/hypervideo_dl/extractor/canalalpha.py new file mode 100644 index 0000000..0365cb2 --- /dev/null +++ b/hypervideo_dl/extractor/canalalpha.py @@ -0,0 +1,98 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_html, + dict_get, + try_get, + unified_strdate, +) + + +class CanalAlphaIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?canalalpha\.ch/play/[^/]+/[^/]+/(?P<id>\d+)/?.*' + + _TESTS = [{ + 'url': 'https://www.canalalpha.ch/play/le-journal/episode/24520/jeudi-28-octobre-2021', + 'info_dict': { + 'id': '24520', + 'ext': 'mp4', + 'title': 'Jeudi 28 octobre 2021', + 'description': 'md5:d30c6c3e53f8ad40d405379601973b30', + 'thumbnail': 'https://static.canalalpha.ch/poster/journal/journal_20211028.jpg', + 'upload_date': '20211028', + 'duration': 1125, + }, + 'params': {'skip_download': True} + }, { + 'url': 'https://www.canalalpha.ch/play/le-journal/topic/24512/la-poste-fait-de-neuchatel-un-pole-cryptographique', + 'info_dict': { + 'id': '24512', + 'ext': 'mp4', + 'title': 'La Poste fait de Neuchâtel un pôle cryptographique', + 'description': 'md5:4ba63ae78a0974d1a53d6703b6e1dedf', + 'thumbnail': 'https://static.canalalpha.ch/poster/news/news_39712.jpg', + 'upload_date': '20211028', + 'duration': 138, + }, + 'params': {'skip_download': True} + }, { + 'url': 'https://www.canalalpha.ch/play/eureka/episode/24484/ces-innovations-qui-veulent-rendre-lagriculture-plus-durable', + 'info_dict': { + 'id': '24484', + 'ext': 'mp4', + 'title': 'Ces innovations qui veulent rendre l’agriculture plus durable', + 'description': 'md5:3de3f151180684621e85be7c10e4e613', + 'thumbnail': 'https://static.canalalpha.ch/poster/magazine/magazine_10236.jpg', + 'upload_date': '20211026', + 'duration': 360, + }, + 'params': {'skip_download': True} + }, { + 'url': 'https://www.canalalpha.ch/play/avec-le-temps/episode/23516/redonner-de-leclat-grace-au-polissage', + 'info_dict': { + 'id': '23516', + 'ext': 'mp4', + 'title': 'Redonner de l\'éclat grâce au polissage', + 'description': 'md5:0d8fbcda1a5a4d6f6daa3165402177e1', + 'thumbnail': 
'https://static.canalalpha.ch/poster/magazine/magazine_9990.png', + 'upload_date': '20210726', + 'duration': 360, + }, + 'params': {'skip_download': True} + }] + + def _real_extract(self, url): + id = self._match_id(url) + webpage = self._download_webpage(url, id) + data_json = self._parse_json(self._search_regex( + r'window\.__SERVER_STATE__\s?=\s?({(?:(?!};)[^"]|"([^"]|\\")*")+})\s?;', + webpage, 'data_json'), id)['1']['data']['data'] + manifests = try_get(data_json, lambda x: x['video']['manifests'], expected_type=dict) or {} + subtitles = {} + formats = [{ + 'url': video['$url'], + 'ext': 'mp4', + 'width': try_get(video, lambda x: x['res']['width'], expected_type=int), + 'height': try_get(video, lambda x: x['res']['height'], expected_type=int), + } for video in try_get(data_json, lambda x: x['video']['mp4'], expected_type=list) or [] if video.get('$url')] + if manifests.get('hls'): + m3u8_frmts, m3u8_subs = self._parse_m3u8_formats_and_subtitles(manifests['hls'], video_id=id) + formats.extend(m3u8_frmts) + subtitles = self._merge_subtitles(subtitles, m3u8_subs) + if manifests.get('dash'): + dash_frmts, dash_subs = self._parse_mpd_formats_and_subtitles(manifests['dash']) + formats.extend(dash_frmts) + subtitles = self._merge_subtitles(subtitles, dash_subs) + self._sort_formats(formats) + return { + 'id': id, + 'title': data_json.get('title').strip(), + 'description': clean_html(dict_get(data_json, ('longDesc', 'shortDesc'))), + 'thumbnail': data_json.get('poster'), + 'upload_date': unified_strdate(dict_get(data_json, ('webPublishAt', 'featuredAt', 'diffusionDate'))), + 'duration': try_get(data_json, lambda x: x['video']['duration'], expected_type=int), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/hypervideo_dl/extractor/canvas.py b/hypervideo_dl/extractor/canvas.py index 49e7e4e..8b99037 100644 --- a/hypervideo_dl/extractor/canvas.py +++ b/hypervideo_dl/extractor/canvas.py @@ -1,4 +1,5 @@ from __future__ import unicode_literals +import json from .common import InfoExtractor @@ -41,9 +42,9 @@ class CanvasIE(InfoExtractor): _GEO_BYPASS = False _HLS_ENTRY_PROTOCOLS_MAP = { 'HLS': 'm3u8_native', - 'HLS_AES': 'm3u8', + 'HLS_AES': 'm3u8_native', } - _REST_API_BASE = 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v1' + _REST_API_BASE = 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v2' def _real_extract(self, url): mobj = self._match_valid_url(url) @@ -59,18 +60,23 @@ class CanvasIE(InfoExtractor): # New API endpoint if not data: + vrtnutoken = self._download_json('https://token.vrt.be/refreshtoken', + video_id, note='refreshtoken: Retrieve vrtnutoken', + errnote='refreshtoken failed')['vrtnutoken'] headers = self.geo_verification_headers() - headers.update({'Content-Type': 'application/json'}) - token = self._download_json( + headers.update({'Content-Type': 'application/json; charset=utf-8'}) + vrtPlayerToken = self._download_json( '%s/tokens' % self._REST_API_BASE, video_id, - 'Downloading token', data=b'', headers=headers)['vrtPlayerToken'] + 'Downloading token', headers=headers, data=json.dumps({ + 'identityToken': vrtnutoken + }).encode('utf-8'))['vrtPlayerToken'] data = self._download_json( '%s/videos/%s' % (self._REST_API_BASE, video_id), video_id, 'Downloading video JSON', query={ - 'vrtPlayerToken': token, - 'client': '%s@PROD' % site_id, + 'vrtPlayerToken': vrtPlayerToken, + 'client': 'null', }, expected_status=400) - if not data.get('title'): + if 'title' not in data: code = data.get('code') if 
code == 'AUTHENTICATION_REQUIRED': self.raise_login_required() @@ -78,7 +84,8 @@ class CanvasIE(InfoExtractor): self.raise_geo_restricted(countries=['BE']) raise ExtractorError(data.get('message') or code, expected=True) - title = data['title'] + # Note: The title may be an empty string + title = data['title'] or f'{site_id} {video_id}' description = data.get('description') formats = [] @@ -238,10 +245,6 @@ class VrtNUIE(GigyaBaseIE): 'upload_date': '20200727', }, 'skip': 'This video is only available for registered users', - 'params': { - 'username': '<snip>', - 'password': '<snip>', - }, 'expected_warnings': ['is not a supported codec'], }, { # Only available via new API endpoint @@ -257,34 +260,20 @@ class VrtNUIE(GigyaBaseIE): 'episode_number': 5, }, 'skip': 'This video is only available for registered users', - 'params': { - 'username': '<snip>', - 'password': '<snip>', - }, 'expected_warnings': ['Unable to download asset JSON', 'is not a supported codec', 'Unknown MIME type'], }] _NETRC_MACHINE = 'vrtnu' - _APIKEY = '3_qhEcPa5JGFROVwu5SWKqJ4mVOIkwlFNMSKwzPDAh8QZOtHqu6L4nD5Q7lk0eXOOG' + _APIKEY = '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy' _CONTEXT_ID = 'R3595707040' - def _real_initialize(self): - self._login() - - def _login(self): - username, password = self._get_login_info() - if username is None: - return - - auth_info = self._download_json( - 'https://accounts.vrt.be/accounts.login', None, - note='Login data', errnote='Could not get Login data', - headers={}, data=urlencode_postdata({ - 'loginID': username, - 'password': password, - 'sessionExpiration': '-2', - 'APIKey': self._APIKEY, - 'targetEnv': 'jssdk', - })) + def _perform_login(self, username, password): + auth_info = self._gigya_login({ + 'APIKey': self._APIKEY, + 'targetEnv': 'jssdk', + 'loginID': username, + 'password': password, + 'authMode': 'cookie', + }) if auth_info.get('errorDetails'): raise ExtractorError('Unable to login: VrtNU said: ' + auth_info.get('errorDetails'), expected=True) @@ -301,14 +290,15 @@ class VrtNUIE(GigyaBaseIE): 'UID': auth_info['UID'], 'UIDSignature': auth_info['UIDSignature'], 'signatureTimestamp': auth_info['signatureTimestamp'], - 'client_id': 'vrtnu-site', '_csrf': self._get_cookies('https://login.vrt.be').get('OIDCXSRF').value, } self._request_webpage( 'https://login.vrt.be/perform_login', - None, note='Requesting a token', errnote='Could not get a token', - headers={}, data=urlencode_postdata(post_data)) + None, note='Performing login', errnote='perform login failed', + headers={}, query={ + 'client_id': 'vrtnu-site' + }, data=urlencode_postdata(post_data)) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: diff --git a/hypervideo_dl/extractor/carambatv.py b/hypervideo_dl/extractor/carambatv.py index b57b86a..7e5cc90 100644 --- a/hypervideo_dl/extractor/carambatv.py +++ b/hypervideo_dl/extractor/carambatv.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_str from ..utils import ( + format_field, float_or_none, int_or_none, try_get, @@ -43,7 +44,7 @@ class CarambaTVIE(InfoExtractor): formats = [{ 'url': base_url + f['fn'], 'height': int_or_none(f.get('height')), - 'format_id': '%sp' % f['height'] if f.get('height') else None, + 'format_id': format_field(f, 'height', '%sp'), } for f in video['qualities'] if f.get('fn')] self._sort_formats(formats) diff --git a/hypervideo_dl/extractor/cbc.py b/hypervideo_dl/extractor/cbc.py index 2429521..4892419 
100644 --- a/hypervideo_dl/extractor/cbc.py +++ b/hypervideo_dl/extractor/cbc.py @@ -2,17 +2,22 @@ from __future__ import unicode_literals import re +import json +import base64 +import time from .common import InfoExtractor from ..compat import ( compat_str, ) from ..utils import ( + int_or_none, + join_nonempty, js_to_json, - smuggle_url, - try_get, orderedSet, + smuggle_url, strip_or_none, + try_get, ExtractorError, ) @@ -122,9 +127,9 @@ class CBCIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - title = self._og_search_title(webpage, default=None) or self._html_search_meta( - 'twitter:title', webpage, 'title', default=None) or self._html_search_regex( - r'<title>([^<]+)</title>', webpage, 'title', fatal=False) + title = (self._og_search_title(webpage, default=None) + or self._html_search_meta('twitter:title', webpage, 'title', default=None) + or self._html_extract_title(webpage)) entries = [ self._extract_player_init(player_init, display_id) for player_init in re.findall(r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)] @@ -244,37 +249,129 @@ class CBCGemIE(InfoExtractor): 'params': {'format': 'bv'}, 'skip': 'Geo-restricted to Canada', }] - _API_BASE = 'https://services.radio-canada.ca/ott/cbc-api/v2/assets/' + + _GEO_COUNTRIES = ['CA'] + _TOKEN_API_KEY = '3f4beddd-2061-49b0-ae80-6f1f2ed65b37' + _NETRC_MACHINE = 'cbcgem' + _claims_token = None + + def _new_claims_token(self, email, password): + data = json.dumps({ + 'email': email, + 'password': password, + }).encode() + headers = {'content-type': 'application/json'} + query = {'apikey': self._TOKEN_API_KEY} + resp = self._download_json('https://api.loginradius.com/identity/v2/auth/login', + None, data=data, headers=headers, query=query) + access_token = resp['access_token'] + + query = { + 'access_token': access_token, + 'apikey': self._TOKEN_API_KEY, + 'jwtapp': 'jwt', + } + resp = self._download_json('https://cloud-api.loginradius.com/sso/jwt/api/token', + None, headers=headers, query=query) + sig = resp['signature'] + + data = json.dumps({'jwt': sig}).encode() + headers = {'content-type': 'application/json', 'ott-device-type': 'web'} + resp = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/token', + None, data=data, headers=headers) + cbc_access_token = resp['accessToken'] + + headers = {'content-type': 'application/json', 'ott-device-type': 'web', 'ott-access-token': cbc_access_token} + resp = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/profile', + None, headers=headers) + return resp['claimsToken'] + + def _get_claims_token_expiry(self): + # Token is a JWT + # JWT is decoded here and 'exp' field is extracted + # It is a Unix timestamp for when the token expires + b64_data = self._claims_token.split('.')[1] + data = base64.urlsafe_b64decode(b64_data + "==") + return json.loads(data)['exp'] + + def claims_token_expired(self): + exp = self._get_claims_token_expiry() + if exp - time.time() < 10: + # It will expire in less than 10 seconds, or has already expired + return True + return False + + def claims_token_valid(self): + return self._claims_token is not None and not self.claims_token_expired() + + def _get_claims_token(self, email, password): + if not self.claims_token_valid(): + self._claims_token = self._new_claims_token(email, password) + self._downloader.cache.store(self._NETRC_MACHINE, 'claims_token', self._claims_token) + return self._claims_token + + def _real_initialize(self): + if 
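A note on the claims-token cache above: _get_claims_token_expiry decides validity locally by splitting the JWT, base64url-decoding its payload segment, and reading the 'exp' Unix timestamp; the appended '==' restores enough padding for any payload length, since CPython's lenient decoder ignores surplus '='. A standalone sketch of the same check (token value hypothetical):

import base64
import json
import time

def jwt_expiry(token):
    # The payload is the second dot-separated segment of a JWT;
    # '==' restores padding for any input length.
    payload = token.split('.')[1]
    return json.loads(base64.urlsafe_b64decode(payload + '=='))['exp']

def token_expired(token, leeway=10):
    # Mirrors claims_token_expired(): anything within `leeway` seconds
    # of expiry counts as already expired.
    return jwt_expiry(token) - time.time() < leeway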
self.claims_token_valid(): + return + self._claims_token = self._downloader.cache.load(self._NETRC_MACHINE, 'claims_token') + + def _find_secret_formats(self, formats, video_id): + """ Find a valid video url and convert it to the secret variant """ + base_format = next((f for f in formats if f.get('vcodec') != 'none'), None) + if not base_format: + return + + base_url = re.sub(r'(Manifest\(.*?),filter=[\w-]+(.*?\))', r'\1\2', base_format['url']) + url = re.sub(r'(Manifest\(.*?),format=[\w-]+(.*?\))', r'\1\2', base_url) + + secret_xml = self._download_xml(url, video_id, note='Downloading secret XML', fatal=False) + if not secret_xml: + return + + for child in secret_xml: + if child.attrib.get('Type') != 'video': + continue + for video_quality in child: + bitrate = int_or_none(video_quality.attrib.get('Bitrate')) + if not bitrate or 'Index' not in video_quality.attrib: + continue + height = int_or_none(video_quality.attrib.get('MaxHeight')) + + yield { + **base_format, + 'format_id': join_nonempty('sec', height), + # Note: \g<1> is necessary instead of \1 since bitrate is a number + 'url': re.sub(r'(QualityLevels\()\d+(\))', fr'\g<1>{bitrate}\2', base_url), + 'width': int_or_none(video_quality.attrib.get('MaxWidth')), + 'tbr': bitrate / 1000.0, + 'height': height, + } def _real_extract(self, url): video_id = self._match_id(url) - video_info = self._download_json(self._API_BASE + video_id, video_id) - - last_error = None - attempt = -1 - retries = self.get_param('extractor_retries', 15) - while attempt < retries: - attempt += 1 - if last_error: - self.report_warning('%s. Retrying ...' % last_error) - m3u8_info = self._download_json( - video_info['playSession']['url'], video_id, - note='Downloading JSON metadata%s' % f' (attempt {attempt})') - m3u8_url = m3u8_info.get('url') - if m3u8_url: - break - elif m3u8_info.get('errorCode') == 1: - self.raise_geo_restricted(countries=['CA']) - else: - last_error = f'{self.IE_NAME} said: {m3u8_info.get("errorCode")} - {m3u8_info.get("message")}' - # 35 means media unavailable, but retries work - if m3u8_info.get('errorCode') != 35 or attempt >= retries: - raise ExtractorError(last_error) + video_info = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/assets/' + video_id, video_id) + + email, password = self._get_login_info() + if email and password: + claims_token = self._get_claims_token(email, password) + headers = {'x-claims-token': claims_token} + else: + headers = {} + m3u8_info = self._download_json(video_info['playSession']['url'], video_id, headers=headers) + m3u8_url = m3u8_info.get('url') + + if m3u8_info.get('errorCode') == 1: + self.raise_geo_restricted(countries=['CA']) + elif m3u8_info.get('errorCode') == 35: + self.raise_login_required(method='password') + elif m3u8_info.get('errorCode') != 0: + raise ExtractorError(f'{self.IE_NAME} said: {m3u8_info.get("errorCode")} - {m3u8_info.get("message")}') formats = self._extract_m3u8_formats(m3u8_url, video_id, m3u8_id='hls') self._remove_duplicate_formats(formats) + formats.extend(self._find_secret_formats(formats, video_id)) - for i, format in enumerate(formats): + for format in formats: if format.get('vcodec') == 'none': if format.get('ext') is None: format['ext'] = 'm4a' @@ -328,7 +425,8 @@ class CBCGemPlaylistIE(InfoExtractor): show = match.group('show') show_info = self._download_json(self._API_BASE + show, season_id) season = int(match.group('season')) - season_info = try_get(show_info, lambda x: x['seasons'][season - 1]) + + season_info = next((s for s in 
show_info['seasons'] if s.get('season') == season), None) if season_info is None: raise ExtractorError(f'Couldn\'t find season {season} of {show}') @@ -377,7 +475,7 @@ class CBCGemPlaylistIE(InfoExtractor): class CBCGemLiveIE(InfoExtractor): IE_NAME = 'gem.cbc.ca:live' - _VALID_URL = r'https?://gem\.cbc\.ca/live/(?P<id>[0-9]{12})' + _VALID_URL = r'https?://gem\.cbc\.ca/live/(?P<id>\d+)' _TEST = { 'url': 'https://gem.cbc.ca/live/920604739687', 'info_dict': { @@ -396,21 +494,21 @@ class CBCGemLiveIE(InfoExtractor): # It's unclear where the chars at the end come from, but they appear to be # constant. Might need updating in the future. - _API = 'https://tpfeed.cbc.ca/f/ExhSPC/t_t3UKJR6MAT' + # There are two URLs, some livestreams are in one, and some + # in the other. The JSON schema is the same for both. + _API_URLS = ['https://tpfeed.cbc.ca/f/ExhSPC/t_t3UKJR6MAT', 'https://tpfeed.cbc.ca/f/ExhSPC/FNiv9xQx_BnT'] def _real_extract(self, url): video_id = self._match_id(url) - live_info = self._download_json(self._API, video_id)['entries'] - video_info = None - for stream in live_info: - if stream.get('guid') == video_id: - video_info = stream - - if video_info is None: - raise ExtractorError( - 'Couldn\'t find video metadata, maybe this livestream is now offline', - expected=True) + for api_url in self._API_URLS: + video_info = next(( + stream for stream in self._download_json(api_url, video_id)['entries'] + if stream.get('guid') == video_id), None) + if video_info: + break + else: + raise ExtractorError('Couldn\'t find video metadata, maybe this livestream is now offline', expected=True) return { '_type': 'url_transparent', diff --git a/hypervideo_dl/extractor/cbs.py b/hypervideo_dl/extractor/cbs.py index ae9ce58..2af36ea 100644 --- a/hypervideo_dl/extractor/cbs.py +++ b/hypervideo_dl/extractor/cbs.py @@ -77,21 +77,21 @@ class CBSIE(CBSBaseIE): (?: cbs:| https?://(?:www\.)?(?: - cbs\.com/(?:shows/[^/]+/video|movies/[^/]+)/| + cbs\.com/(?:shows|movies)/(?:video|[^/]+/video|[^/]+)/| colbertlateshow\.com/(?:video|podcasts)/) )(?P<id>[\w-]+)''' # All tests are blocked outside US _TESTS = [{ - 'url': 'https://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', + 'url': 'https://www.cbs.com/shows/video/xrUyNLtl9wd8D_RWWAg9NU2F_V6QpB3R/', 'info_dict': { - 'id': '_u7W953k6la293J7EPTd9oHkSPs6Xn6_', + 'id': 'xrUyNLtl9wd8D_RWWAg9NU2F_V6QpB3R', 'ext': 'mp4', - 'title': 'Connect Chat feat. Garth Brooks', - 'description': 'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. 
Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!', - 'duration': 1495, - 'timestamp': 1385585425, - 'upload_date': '20131127', + 'title': 'Tough As Nails - Dreams Never Die', + 'description': 'md5:a3535a62531cdd52b0364248a2c1ae33', + 'duration': 2588, + 'timestamp': 1639015200, + 'upload_date': '20211209', 'uploader': 'CBSI-NEW', }, 'params': { @@ -99,14 +99,14 @@ class CBSIE(CBSBaseIE): 'skip_download': True, }, }, { - 'url': 'https://www.cbs.com/shows/the-late-show-with-stephen-colbert/video/60icOhMb9NcjbcWnF_gub9XXHdeBcNk2/the-late-show-6-23-21-christine-baranski-joy-oladokun-', + 'url': 'https://www.cbs.com/shows/video/sZH1MGgomIosZgxGJ1l263MFq16oMtW1/', 'info_dict': { - 'id': '60icOhMb9NcjbcWnF_gub9XXHdeBcNk2', - 'title': 'The Late Show - 6/23/21 (Christine Baranski, Joy Oladokun)', - 'timestamp': 1624507140, - 'description': 'md5:e01af24e95c74d55e8775aef86117b95', + 'id': 'sZH1MGgomIosZgxGJ1l263MFq16oMtW1', + 'title': 'The Late Show - 3/16/22 (Michael Buble, Rose Matafeo)', + 'timestamp': 1647488100, + 'description': 'md5:d0e6ec23c544b7fa8e39a8e6844d2439', 'uploader': 'CBSI-NEW', - 'upload_date': '20210624', + 'upload_date': '20220317', }, 'params': { 'ignore_no_formats_error': True, diff --git a/hypervideo_dl/extractor/ccma.py b/hypervideo_dl/extractor/ccma.py index ea98f86..9dbaabf 100644 --- a/hypervideo_dl/extractor/ccma.py +++ b/hypervideo_dl/extractor/ccma.py @@ -1,17 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals -import calendar -import datetime - from .common import InfoExtractor from ..utils import ( clean_html, - extract_timezone, int_or_none, parse_duration, parse_resolution, try_get, + unified_timestamp, url_or_none, ) @@ -95,14 +92,8 @@ class CCMAIE(InfoExtractor): duration = int_or_none(durada.get('milisegons'), 1000) or parse_duration(durada.get('text')) tematica = try_get(informacio, lambda x: x['tematica']['text']) - timestamp = None data_utc = try_get(informacio, lambda x: x['data_emissio']['utc']) - try: - timezone, data_utc = extract_timezone(data_utc) - timestamp = calendar.timegm((datetime.datetime.strptime( - data_utc, '%Y-%d-%mT%H:%M:%S') - timezone).timetuple()) - except TypeError: - pass + timestamp = unified_timestamp(data_utc) subtitles = {} subtitols = media.get('subtitols') or [] diff --git a/hypervideo_dl/extractor/cctv.py b/hypervideo_dl/extractor/cctv.py index 9b86121..0ed5f32 100644 --- a/hypervideo_dl/extractor/cctv.py +++ b/hypervideo_dl/extractor/cctv.py @@ -162,7 +162,8 @@ class CCTVIE(InfoExtractor): 'url': video_url, 'format_id': 'http', 'quality': quality, - 'source_preference': -10 + # Sample clip + 'preference': -10 }) hls_url = try_get(data, lambda x: x['hls_url'], compat_str) diff --git a/hypervideo_dl/extractor/ceskatelevize.py b/hypervideo_dl/extractor/ceskatelevize.py index 5e04d38..ddf66b2 100644 --- a/hypervideo_dl/extractor/ceskatelevize.py +++ b/hypervideo_dl/extractor/ceskatelevize.py @@ -12,30 +12,15 @@ from ..utils import ( ExtractorError, float_or_none, sanitized_Request, - unescapeHTML, - update_url_query, + traverse_obj, urlencode_postdata, USER_AGENTS, ) class CeskaTelevizeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/ivysilani/(?:[^/?#&]+/)*(?P<id>[^/#?]+)' + _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/(?:ivysilani|porady)/(?:[^/?#&]+/)*(?P<id>[^/#?]+)' _TESTS = [{ - 'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220', - 'info_dict': { - 'id': '61924494877246241', - 'ext': 
'mp4', - 'title': 'Hyde Park Civilizace: Život v Grónsku', - 'description': 'md5:3fec8f6bb497be5cdb0c9e8781076626', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 3350, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { 'url': 'http://www.ceskatelevize.cz/ivysilani/10441294653-hyde-park-civilizace/215411058090502/bonus/20641-bonus-01-en', 'info_dict': { 'id': '61924494877028507', @@ -66,12 +51,60 @@ class CeskaTelevizeIE(InfoExtractor): }, { 'url': 'http://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php?hash=d6a3e1370d2e4fa76296b90bad4dfc19673b641e&IDEC=217 562 22150/0004&channelID=1&width=100%25', 'only_matching': True, + }, { + # video with 18+ caution trailer + 'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/', + 'info_dict': { + 'id': '215562210900007-bogotart', + 'title': 'Queer: Bogotart', + 'description': 'Hlavní město Kolumbie v doprovodu queer umělců. Vroucí svět plný vášně, sebevědomí, ale i násilí a bolesti. Připravil Peter Serge Butko', + }, + 'playlist': [{ + 'info_dict': { + 'id': '61924494877311053', + 'ext': 'mp4', + 'title': 'Queer: Bogotart (Varování 18+)', + 'duration': 11.9, + }, + }, { + 'info_dict': { + 'id': '61924494877068022', + 'ext': 'mp4', + 'title': 'Queer: Bogotart (Queer)', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 1558.3, + }, + }], + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # iframe embed + 'url': 'http://www.ceskatelevize.cz/porady/10614999031-neviditelni/21251212048/', + 'only_matching': True, }] def _real_extract(self, url): playlist_id = self._match_id(url) - + parsed_url = compat_urllib_parse_urlparse(url) webpage = self._download_webpage(url, playlist_id) + site_name = self._og_search_property('site_name', webpage, fatal=False, default=None) + playlist_title = self._og_search_title(webpage, default=None) + if site_name and playlist_title: + playlist_title = playlist_title.replace(f' — {site_name}', '', 1) + playlist_description = self._og_search_description(webpage, default=None) + if playlist_description: + playlist_description = playlist_description.replace('\xa0', ' ') + + if parsed_url.path.startswith('/porady/'): + next_data = self._search_nextjs_data(webpage, playlist_id) + idec = traverse_obj(next_data, ('props', 'pageProps', 'data', ('show', 'mediaMeta'), 'idec'), get_all=False) + if not idec: + raise ExtractorError('Failed to find IDEC id') + iframe_hash = self._download_webpage('https://www.ceskatelevize.cz/v-api/iframe-hash/', playlist_id) + webpage = self._download_webpage('https://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php', playlist_id, + query={'hash': iframe_hash, 'origin': 'iVysilani', 'autoStart': 'true', 'IDEC': idec}) NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.' 
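In the /porady/ branch above, traverse_obj is given a path that branches at ('show', 'mediaMeta'): both keys are tried at that depth, and get_all=False returns the first value found rather than a list of every match. A toy illustration of those semantics (payload invented):

from hypervideo_dl.utils import traverse_obj

next_data = {'props': {'pageProps': {'data': {
    'mediaMeta': {'idec': '217 562 22150/0004'},  # invented payload
}}}}

# 'show' is absent here, so the ('show', 'mediaMeta') branch falls
# through to 'mediaMeta'; get_all=False unwraps the single result.
idec = traverse_obj(
    next_data, ('props', 'pageProps', 'data', ('show', 'mediaMeta'), 'idec'),
    get_all=False)
assert idec == '217 562 22150/0004'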
if '%s</p>' % NOT_AVAILABLE_STRING in webpage: @@ -100,7 +133,7 @@ class CeskaTelevizeIE(InfoExtractor): data = { 'playlist[0][type]': type_, 'playlist[0][id]': episode_id, - 'requestUrl': compat_urllib_parse_urlparse(url).path, + 'requestUrl': parsed_url.path, 'requestSource': 'iVysilani', } @@ -108,7 +141,7 @@ class CeskaTelevizeIE(InfoExtractor): for user_agent in (None, USER_AGENTS['Safari']): req = sanitized_Request( - 'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist', + 'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist/', data=urlencode_postdata(data)) req.add_header('Content-type', 'application/x-www-form-urlencoded') @@ -130,9 +163,6 @@ class CeskaTelevizeIE(InfoExtractor): req = sanitized_Request(compat_urllib_parse_unquote(playlist_url)) req.add_header('Referer', url) - playlist_title = self._og_search_title(webpage, default=None) - playlist_description = self._og_search_description(webpage, default=None) - playlist = self._download_json(req, playlist_id, fatal=False) if not playlist: continue @@ -147,6 +177,7 @@ class CeskaTelevizeIE(InfoExtractor): is_live = item.get('type') == 'LIVE' formats = [] for format_id, stream_url in item.get('streamUrls', {}).items(): + stream_url = stream_url.replace('https://', 'http://') if 'playerType=flash' in stream_url: stream_formats = self._extract_m3u8_formats( stream_url, playlist_id, 'mp4', 'm3u8_native', @@ -182,8 +213,6 @@ class CeskaTelevizeIE(InfoExtractor): if playlist_len == 1: final_title = playlist_title or title - if is_live: - final_title = self._live_title(final_title) else: final_title = '%s (%s)' % (playlist_title, title) @@ -237,54 +266,3 @@ class CeskaTelevizeIE(InfoExtractor): yield line return '\r\n'.join(_fix_subtitle(subtitles)) - - -class CeskaTelevizePoradyIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/porady/(?:[^/?#&]+/)*(?P<id>[^/#?]+)' - _TESTS = [{ - # video with 18+ caution trailer - 'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/', - 'info_dict': { - 'id': '215562210900007-bogotart', - 'title': 'Queer: Bogotart', - 'description': 'Alternativní průvodce současným queer světem', - }, - 'playlist': [{ - 'info_dict': { - 'id': '61924494876844842', - 'ext': 'mp4', - 'title': 'Queer: Bogotart (Varování 18+)', - 'duration': 10.2, - }, - }, { - 'info_dict': { - 'id': '61924494877068022', - 'ext': 'mp4', - 'title': 'Queer: Bogotart (Queer)', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 1558.3, - }, - }], - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - # iframe embed - 'url': 'http://www.ceskatelevize.cz/porady/10614999031-neviditelni/21251212048/', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - data_url = update_url_query(unescapeHTML(self._search_regex( - (r'<span[^>]*\bdata-url=(["\'])(?P<url>(?:(?!\1).)+)\1', - r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?ceskatelevize\.cz/ivysilani/embed/iFramePlayer\.php.*?)\1'), - webpage, 'iframe player url', group='url')), query={ - 'autoStart': 'true', - }) - - return self.url_result(data_url, ie=CeskaTelevizeIE.ie_key()) diff --git a/hypervideo_dl/extractor/chaturbate.py b/hypervideo_dl/extractor/chaturbate.py index a459dcb..8da51f9 100644 --- a/hypervideo_dl/extractor/chaturbate.py +++ b/hypervideo_dl/extractor/chaturbate.py @@ -101,7 +101,7 @@ class ChaturbateIE(InfoExtractor): return { 'id': video_id, - 'title': 
self._live_title(video_id), + 'title': video_id, 'thumbnail': 'https://roomimg.stream.highwebmedia.com/ri/%s.jpg' % video_id, 'age_limit': self._rta_search(webpage), 'is_live': True, diff --git a/hypervideo_dl/extractor/chingari.py b/hypervideo_dl/extractor/chingari.py index 6bdc4f6..e6841fb 100644 --- a/hypervideo_dl/extractor/chingari.py +++ b/hypervideo_dl/extractor/chingari.py @@ -67,7 +67,7 @@ class ChingariBaseIE(InfoExtractor): class ChingariIE(ChingariBaseIE): - _VALID_URL = r'(?:https?://)(?:www\.)?chingari\.io/share/post\?id=(?P<id>[^&/#?]+)' + _VALID_URL = r'https?://(?:www\.)?chingari\.io/share/post\?id=(?P<id>[^&/#?]+)' _TESTS = [{ 'url': 'https://chingari.io/share/post?id=612f8f4ce1dc57090e8a7beb', 'info_dict': { @@ -102,7 +102,7 @@ class ChingariIE(ChingariBaseIE): class ChingariUserIE(ChingariBaseIE): - _VALID_URL = r'(?:https?://)(?:www\.)?chingari\.io/(?!share/post)(?P<id>[^/?]+)' + _VALID_URL = r'https?://(?:www\.)?chingari\.io/(?!share/post)(?P<id>[^/?]+)' _TESTS = [{ 'url': 'https://chingari.io/dada1023', 'playlist_mincount': 3, diff --git a/hypervideo_dl/extractor/closertotruth.py b/hypervideo_dl/extractor/closertotruth.py index 26243d5..517e121 100644 --- a/hypervideo_dl/extractor/closertotruth.py +++ b/hypervideo_dl/extractor/closertotruth.py @@ -54,8 +54,7 @@ class CloserToTruthIE(InfoExtractor): r'<script[^>]+src=["\'].*?\b(?:partner_id|p)/(\d+)', webpage, 'kaltura partner_id') - title = self._search_regex( - r'<title>(.+?)\s*\|\s*.+?</title>', webpage, 'video title') + title = self._html_extract_title(webpage, 'video title') select = self._search_regex( r'(?s)<select[^>]+id="select-version"[^>]*>(.+?)</select>', diff --git a/hypervideo_dl/extractor/common.py b/hypervideo_dl/extractor/common.py index df74c75..0035191 100644 --- a/hypervideo_dl/extractor/common.py +++ b/hypervideo_dl/extractor/common.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals import base64 -import datetime +import collections import hashlib import itertools import json @@ -45,15 +45,18 @@ from ..utils import ( determine_ext, determine_protocol, dict_get, + encode_data_uri, error_to_compat_str, extract_attributes, ExtractorError, + filter_dict, fix_xml_ampersands, float_or_none, format_field, GeoRestrictedError, GeoUtils, int_or_none, + join_nonempty, js_to_json, JSON_LD_RE, mimetype2ext, @@ -73,7 +76,9 @@ from ..utils import ( str_to_int, strip_or_none, traverse_obj, + try_get, unescapeHTML, + UnsupportedError, unified_strdate, unified_timestamp, update_Request, @@ -134,6 +139,8 @@ class InfoExtractor(object): for HDS - URL of the F4M manifest, for DASH - URL of the MPD manifest, for MSS - URL of the ISM manifest. + * manifest_stream_number (For internal use only) + The index of the stream in the manifest file * ext Will be calculated from URL if missing * format A human-readable description of the format ("mp4 container with h264/opus"). @@ -161,9 +168,8 @@ class InfoExtractor(object): * filesize_approx An estimate for the number of bytes * player_url SWF Player URL (used for rtmpdump). * protocol The protocol that will be used for the actual - download, lower-case. - "http", "https", "rtsp", "rtmp", "rtmp_ffmpeg", "rtmpe", - "m3u8", "m3u8_native" or "http_dash_segments". + download, lower-case. One of "http", "https" or + one of the protocols defined in downloader.PROTOCOL_MAP * fragment_base_url Base URL for fragments. 
Each fragment's path value (if present) will be relative to @@ -179,6 +185,8 @@ class InfoExtractor(object): fragment_base_url * "duration" (optional, int or float) * "filesize" (optional, int) + * is_from_start Is a live format that can be downloaded + from the start. Boolean * preference Order number of this format. If this field is present and not None, the formats get sorted by this field, regardless of all other values. @@ -209,7 +217,7 @@ class InfoExtractor(object): (HTTP or RTMP) download. Boolean. * has_drm The format has DRM and cannot be downloaded. Boolean * downloader_options A dictionary of downloader options as - described in FileDownloader + described in FileDownloader (For internal use only) RTMP formats can also have the additional fields: page_url, app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn, rtmp_protocol, rtmp_real_time @@ -221,6 +229,7 @@ class InfoExtractor(object): The following fields are optional: + direct: True if a direct video file was given (must only be set by GenericIE) alt_title: A secondary title of the video. display_id An alternative identifier for the video, not necessarily unique, but available before title. Typically, id is @@ -235,16 +244,22 @@ class InfoExtractor(object): * "resolution" (optional, string "{width}x{height}", deprecated) * "filesize" (optional, int) + * "http_headers" (dict) - HTTP headers for the request thumbnail: Full URL to a video thumbnail image. description: Full video description. uploader: Full name of the video uploader. license: License name the video is licensed under. creator: The creator of the video. - release_timestamp: UNIX timestamp of the moment the video was released. - release_date: The date (YYYYMMDD) when the video was released. timestamp: UNIX timestamp of the moment the video was uploaded - upload_date: Video upload date (YYYYMMDD). - If not explicitly set, calculated from timestamp. + upload_date: Video upload date in UTC (YYYYMMDD). + If not explicitly set, calculated from timestamp + release_timestamp: UNIX timestamp of the moment the video was released. + If it is not clear whether to use timestamp or this, use the former + release_date: The date (YYYYMMDD) when the video was released in UTC. + If not explicitly set, calculated from release_timestamp + modified_timestamp: UNIX timestamp of the moment the video was last modified. + modified_date: The date (YYYYMMDD) when the video was last modified in UTC. + If not explicitly set, calculated from modified_timestamp uploader_id: Nickname or id of the video uploader. uploader_url: Full URL to a personal webpage of the video uploader. channel: Full name of the channel the video is uploaded on. @@ -252,6 +267,7 @@ class InfoExtractor(object): fields. This depends on a particular extractor. channel_id: Id of the channel. channel_url: Full URL to a channel webpage. + channel_follower_count: Number of followers of the channel. location: Physical location where the video was filmed. subtitles: The available subtitles as a dictionary in the format {tag: subformats}. "tag" is usually a language code, and @@ -262,6 +278,8 @@ class InfoExtractor(object): * "url": A URL pointing to the subtitles file It can optionally also have: * "name": Name or description of the subtitles + * "http_headers": A dictionary of additional HTTP headers + to add to the request. 
"ext" will be calculated from URL if missing automatic_captions: Like 'subtitles'; contains automatically generated captions instead of normal subtitles @@ -340,6 +358,7 @@ class InfoExtractor(object): series, programme or podcast: series: Title of the series or programme the video episode belongs to. + series_id: Id of the series or programme the video episode belongs to, as a unicode string. season: Title of the season the video episode belongs to. season_number: Number of the season the video episode belongs to, as an integer. season_id: Id of the season the video episode belongs to, as a unicode string. @@ -366,6 +385,7 @@ class InfoExtractor(object): disc_number: Number of the disc or other physical medium the track belongs to, as an integer. release_year: Year (YYYY) when the album was released. + composer: Composer of the piece Unless mentioned otherwise, the fields should be Unicode strings. @@ -379,6 +399,11 @@ class InfoExtractor(object): Additionally, playlists can have "id", "title", and any other relevent attributes with the same semantics as videos (see above). + It can also have the following optional fields: + + playlist_count: The total number of videos in a playlist. If not given, + YoutubeDL tries to calculate it from "entries" + _type "multi_video" indicates that there are multiple videos that form a single show, for examples multiple acts of an opera or TV episode. @@ -404,13 +429,21 @@ class InfoExtractor(object): title, description etc. - Subclasses of this one should re-define the _real_initialize() and - _real_extract() methods and define a _VALID_URL regexp. + Subclasses of this should define a _VALID_URL regexp and, re-define the + _real_extract() and (optionally) _real_initialize() methods. Probably, they should also be added to the list of extractors. Subclasses may also override suitable() if necessary, but ensure the function signature is preserved and that this function imports everything it needs - (except other extractors), so that lazy_extractors works correctly + (except other extractors), so that lazy_extractors works correctly. + + To support username + password (or netrc) login, the extractor must define a + _NETRC_MACHINE and re-define _perform_login(username, password) and + (optionally) _initialize_pre_login() methods. The _perform_login method will + be called between _initialize_pre_login and _real_initialize if credentials + are passed by the user. In cases where it is necessary to have the login + process as part of the extraction rather than initialization, _perform_login + can be left undefined. _GEO_BYPASS attribute may be set to False in order to disable geo restriction bypass mechanisms for a particular extractor. @@ -438,17 +471,21 @@ class InfoExtractor(object): _GEO_COUNTRIES = None _GEO_IP_BLOCKS = None _WORKING = True + _NETRC_MACHINE = None + IE_DESC = None _LOGIN_HINTS = { - 'any': 'Use --cookies, --username and --password or --netrc to provide account credentials', + 'any': 'Use --cookies, --cookies-from-browser, --username and --password, or --netrc to provide account credentials', 'cookies': ( 'Use --cookies-from-browser or --cookies for the authentication. ' 'See https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl for how to manually pass cookies'), - 'password': 'Use --username and --password or --netrc to provide account credentials', + 'password': 'Use --username and --password, or --netrc to provide account credentials', } def __init__(self, downloader=None): - """Constructor. 
Receives an optional downloader.""" + """Constructor. Receives an optional downloader (a YoutubeDL instance). + If a downloader is not passed during initialization, + it must be set using "set_downloader()" before "extract()" is called""" self._ready = False self._x_forwarded_for_ip = None self._printed_messages = set() @@ -460,6 +497,8 @@ class InfoExtractor(object): # we have cached the regexp for *this* class, whereas getattr would also # match the superclass if '_VALID_URL_RE' not in cls.__dict__: + if '_VALID_URL' not in cls.__dict__: + cls._VALID_URL = cls._make_valid_url() cls._VALID_URL_RE = re.compile(cls._VALID_URL) return cls._VALID_URL_RE.match(url) @@ -486,6 +525,10 @@ class InfoExtractor(object): """Getter method for _WORKING.""" return cls._WORKING + @classmethod + def supports_login(cls): + return bool(cls._NETRC_MACHINE) + def initialize(self): """Initializes an instance (authentication, etc).""" self._printed_messages = set() @@ -494,6 +537,13 @@ class InfoExtractor(object): 'ip_blocks': self._GEO_IP_BLOCKS, }) if not self._ready: + self._initialize_pre_login() + if self.supports_login(): + username, password = self._get_login_info() + if username: + self._perform_login(username, password) + elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE): + self.report_warning(f'Login with password is not supported for this website. {self._LOGIN_HINTS["cookies"]}') self._real_initialize() self._ready = True @@ -602,10 +652,19 @@ class InfoExtractor(object): if self.__maybe_fake_ip_and_retry(e.countries): continue raise + except UnsupportedError: + raise except ExtractorError as e: - video_id = e.video_id or self.get_temp_id(url) - raise ExtractorError( - e.msg, video_id=video_id, ie=self.IE_NAME, tb=e.traceback, expected=e.expected, cause=e.cause) + kwargs = { + 'video_id': e.video_id or self.get_temp_id(url), + 'ie': self.IE_NAME, + 'tb': e.traceback or sys.exc_info()[2], + 'expected': e.expected, + 'cause': e.cause + } + if hasattr(e, 'countries'): + kwargs['countries'] = e.countries + raise type(e)(e.orig_msg, **kwargs) except compat_http_client.IncompleteRead as e: raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url)) except (KeyError, StopIteration) as e: @@ -627,16 +686,24 @@ class InfoExtractor(object): return False def set_downloader(self, downloader): - """Sets the downloader for this IE.""" + """Sets a YoutubeDL instance as the downloader for this IE.""" self._downloader = downloader + def _initialize_pre_login(self): + """ Intialization before login. Redefine in subclasses.""" + pass + + def _perform_login(self, username, password): + """ Login with username and password. Redefine in subclasses.""" + pass + def _real_initialize(self): """Real initialization process. Redefine in subclasses.""" pass def _real_extract(self, url): """Real extraction process. Redefine in subclasses.""" - pass + raise NotImplementedError('This method must be implemented by subclasses') @classmethod def ie_key(cls): @@ -664,7 +731,7 @@ class InfoExtractor(object): See _download_webpage docstring for arguments specification. """ if not self._downloader._first_webpage_request: - sleep_interval = float_or_none(self.get_param('sleep_interval_requests')) or 0 + sleep_interval = self.get_param('sleep_interval_requests') or 0 if sleep_interval > 0: self.to_screen('Sleeping %s seconds ...' 
% sleep_interval) time.sleep(sleep_interval) @@ -715,7 +782,7 @@ class InfoExtractor(object): errmsg = '%s: %s' % (errnote, error_to_compat_str(err)) if fatal: - raise ExtractorError(errmsg, sys.exc_info()[2], cause=err) + raise ExtractorError(errmsg, cause=err) else: self.report_warning(errmsg) return False @@ -970,7 +1037,7 @@ class InfoExtractor(object): if transform_source: json_string = transform_source(json_string) try: - return json.loads(json_string) + return json.loads(json_string, strict=False) except ValueError as ve: errmsg = '%s: Failed to parse JSON ' % video_id if fatal: @@ -1063,23 +1130,30 @@ class InfoExtractor(object): def raise_login_required( self, msg='This video is only available for registered users', - metadata_available=False, method='any'): - if metadata_available and self.get_param('ignore_no_formats_error'): + metadata_available=False, method=NO_DEFAULT): + if metadata_available and ( + self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')): self.report_warning(msg) + return + if method is NO_DEFAULT: + method = 'any' if self.supports_login() else 'cookies' if method is not None: + assert method in self._LOGIN_HINTS, 'Invalid login method' msg = '%s. %s' % (msg, self._LOGIN_HINTS[method]) raise ExtractorError(msg, expected=True) def raise_geo_restricted( self, msg='This video is not available from your location due to geo restriction', countries=None, metadata_available=False): - if metadata_available and self.get_param('ignore_no_formats_error'): + if metadata_available and ( + self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')): self.report_warning(msg) else: raise GeoRestrictedError(msg, countries=countries) def raise_no_formats(self, msg, expected=False, video_id=None): - if expected and self.get_param('ignore_no_formats_error'): + if expected and ( + self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')): self.report_warning(msg, video_id) elif isinstance(msg, ExtractorError): raise msg @@ -1088,39 +1162,39 @@ class InfoExtractor(object): # Methods for following #608 @staticmethod - def url_result(url, ie=None, video_id=None, video_title=None, **kwargs): + def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs): """Returns a URL that points to a page that should be processed""" - # TODO: ie should be the class used for getting the info - video_info = {'_type': 'url', - 'url': url, - 'ie_key': ie} - video_info.update(kwargs) + if ie is not None: + kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key() if video_id is not None: - video_info['id'] = video_id + kwargs['id'] = video_id if video_title is not None: - video_info['title'] = video_title - return video_info + kwargs['title'] = video_title + return { + **kwargs, + '_type': 'url_transparent' if url_transparent else 'url', + 'url': url, + } - def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None): - urls = orderedSet( - self.url_result(self._proto_relative_url(getter(m) if getter else m), ie) - for m in matches) - return self.playlist_result( - urls, playlist_id=playlist_id, playlist_title=playlist_title) + def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None, video_kwargs=None, **kwargs): + urls = (self.url_result(self._proto_relative_url(m), ie, **(video_kwargs or {})) + for m in orderedSet(map(getter, matches) if getter else matches)) + return self.playlist_result(urls, playlist_id, 
playlist_title, **kwargs) @staticmethod - def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs): + def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs): """Returns a playlist""" - video_info = {'_type': 'playlist', - 'entries': entries} - video_info.update(kwargs) if playlist_id: - video_info['id'] = playlist_id + kwargs['id'] = playlist_id if playlist_title: - video_info['title'] = playlist_title + kwargs['title'] = playlist_title if playlist_description is not None: - video_info['description'] = playlist_description - return video_info + kwargs['description'] = playlist_description + return { + **kwargs, + '_type': 'multi_video' if multi_video else 'playlist', + 'entries': entries, + } def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): """ @@ -1137,7 +1211,7 @@ class InfoExtractor(object): if mobj: break - _name = self._downloader._color_text(name, 'blue') + _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS) if mobj: if group is None: @@ -1225,8 +1299,8 @@ class InfoExtractor(object): @staticmethod def _og_regexes(prop): content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))' - property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)' - % {'prop': re.escape(prop)}) + property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)' + % {'prop': re.escape(prop), 'sep': '(?::|[:-])'}) template = r'<meta[^>]+?%s[^>]+?%s' return [ template % (property_re, content_re), @@ -1257,8 +1331,8 @@ class InfoExtractor(object): def _og_search_description(self, html, **kargs): return self._og_search_property('description', html, fatal=False, **kargs) - def _og_search_title(self, html, **kargs): - return self._og_search_property('title', html, **kargs) + def _og_search_title(self, html, *, fatal=False, **kargs): + return self._og_search_property('title', html, fatal=fatal, **kargs) def _og_search_video_url(self, html, name='video url', secure=True, **kargs): regexes = self._og_regexes('video') + self._og_regexes('video:url') @@ -1269,6 +1343,9 @@ class InfoExtractor(object): def _og_search_url(self, html, **kargs): return self._og_search_property('url', html, **kargs) + def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs): + return self._html_search_regex(r'(?s)<title>([^<]+)</title>', html, name, fatal=fatal, **kwargs) + def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs): name = variadic(name) if display_name is None: @@ -1409,6 +1486,23 @@ class InfoExtractor(object): continue info[count_key] = interaction_count + def extract_chapter_information(e): + chapters = [{ + 'title': part.get('name'), + 'start_time': part.get('startOffset'), + 'end_time': part.get('endOffset'), + } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip'] + for idx, (last_c, current_c, next_c) in enumerate(zip( + [{'end_time': 0}] + chapters, chapters, chapters[1:])): + current_c['end_time'] = current_c['end_time'] or next_c['start_time'] + current_c['start_time'] = current_c['start_time'] or last_c['end_time'] + if None in current_c.values(): + self.report_warning(f'Chapter {idx} contains broken data. 
Not extracting chapters') + return + if chapters: + chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration'] + info['chapters'] = chapters + def extract_video_object(e): assert e['@type'] == 'VideoObject' author = e.get('author') @@ -1416,7 +1510,8 @@ class InfoExtractor(object): 'url': url_or_none(e.get('contentUrl')), 'title': unescapeHTML(e.get('name')), 'description': unescapeHTML(e.get('description')), - 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')), + 'thumbnails': [{'url': url_or_none(url)} + for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))], 'duration': parse_duration(e.get('duration')), 'timestamp': unified_timestamp(e.get('uploadDate')), # author can be an instance of 'Organization' or 'Person' types. @@ -1431,12 +1526,21 @@ class InfoExtractor(object): 'view_count': int_or_none(e.get('interactionCount')), }) extract_interaction_statistic(e) + extract_chapter_information(e) - for e in json_ld: - if '@context' in e: + def traverse_json_ld(json_ld, at_top_level=True): + for e in json_ld: + if at_top_level and '@context' not in e: + continue + if at_top_level and set(e.keys()) == {'@context', '@graph'}: + traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False) + break item_type = e.get('@type') if expected_type is not None and expected_type != item_type: continue + rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none) + if rating is not None: + info['average_rating'] = rating if item_type in ('TVEpisode', 'Episode'): episode_name = unescapeHTML(e.get('name')) info.update({ @@ -1466,8 +1570,10 @@ class InfoExtractor(object): info.update({ 'timestamp': parse_iso8601(e.get('datePublished')), 'title': unescapeHTML(e.get('headline')), - 'description': unescapeHTML(e.get('articleBody')), + 'description': unescapeHTML(e.get('articleBody') or e.get('description')), }) + if traverse_obj(e, ('video', 0, '@type')) == 'VideoObject': + extract_video_object(e['video'][0]) elif item_type == 'VideoObject': extract_video_object(e) if expected_type is None: @@ -1481,7 +1587,34 @@ class InfoExtractor(object): continue else: break - return dict((k, v) for k, v in info.items() if v is not None) + traverse_json_ld(json_ld) + + return filter_dict(info) + + def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw): + return self._parse_json( + self._search_regex( + r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>', + webpage, 'next.js data', fatal=fatal, **kw), + video_id, transform_source=transform_source, fatal=fatal) + + def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__'): + ''' Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function. 
''' + # not all website do this, but it can be changed + # https://stackoverflow.com/questions/67463109/how-to-change-or-hide-nuxt-and-nuxt-keyword-in-page-source + rectx = re.escape(context_name) + js, arg_keys, arg_vals = self._search_regex( + (r'<script>window\.%s=\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.+?)\)\);?</script>' % rectx, + r'%s\(.*?\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)' % rectx), + webpage, context_name, group=['js', 'arg_keys', 'arg_vals']) + + args = dict(zip(arg_keys.split(','), arg_vals.split(','))) + + for key, val in args.items(): + if val in ('undefined', 'void 0'): + args[key] = 'null' + + return self._parse_json(js_to_json(js, args), video_id)['data'][0] @staticmethod def _hidden_inputs(html): @@ -1510,20 +1643,20 @@ class InfoExtractor(object): default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality', 'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr', - 'proto', 'ext', 'hasaud', 'source', 'format_id') # These must not be aliases + 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr', 'height', 'width', 'proto', 'vext', 'abr', 'aext', - 'fps', 'fs_approx', 'source', 'format_id') + 'fps', 'fs_approx', 'source', 'id') settings = { 'vcodec': {'type': 'ordered', 'regex': True, 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']}, 'acodec': {'type': 'ordered', 'regex': True, - 'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']}, + 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']}, 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range', 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]}, 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol', - 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', '.*dash', 'ws|websocket', '', 'mms|rtsp', 'none', 'f4']}, + 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']}, 'vext': {'type': 'ordered', 'field': 'video_ext', 'order': ('mp4', 'webm', 'flv', '', 'none'), 'order_free': ('webm', 'mp4', 'flv', '', 'none')}, @@ -1537,8 +1670,8 @@ class InfoExtractor(object): 'ie_pref': {'priority': True, 'type': 'extractor'}, 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)}, 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)}, - 'lang': {'convert': 'ignore', 'field': 'language_preference'}, - 'quality': {'convert': 'float_none', 'default': -1}, + 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1}, + 'quality': {'convert': 'float', 'default': -1}, 'filesize': {'convert': 'bytes'}, 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'}, 'id': {'convert': 'string', 'field': 'format_id'}, @@ -1549,7 +1682,7 @@ class InfoExtractor(object): 'vbr': {'convert': 'float_none'}, 'abr': {'convert': 'float_none'}, 'asr': {'convert': 'float_none'}, - 'source': {'convert': 'ignore', 'field': 'source_preference'}, + 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1}, 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')}, 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True}, @@ -1558,39 +1691,51 @@ class InfoExtractor(object): 
'res': {'type': 'multiple', 'field': ('height', 'width'), 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))}, - # Most of these exist only for compatibility reasons - 'dimension': {'type': 'alias', 'field': 'res'}, - 'resolution': {'type': 'alias', 'field': 'res'}, - 'extension': {'type': 'alias', 'field': 'ext'}, - 'bitrate': {'type': 'alias', 'field': 'br'}, - 'total_bitrate': {'type': 'alias', 'field': 'tbr'}, - 'video_bitrate': {'type': 'alias', 'field': 'vbr'}, - 'audio_bitrate': {'type': 'alias', 'field': 'abr'}, - 'framerate': {'type': 'alias', 'field': 'fps'}, - 'language_preference': {'type': 'alias', 'field': 'lang'}, # not named as 'language' because such a field exists - 'protocol': {'type': 'alias', 'field': 'proto'}, + # For compatibility with youtube-dl + 'format_id': {'type': 'alias', 'field': 'id'}, + 'preference': {'type': 'alias', 'field': 'ie_pref'}, + 'language_preference': {'type': 'alias', 'field': 'lang'}, 'source_preference': {'type': 'alias', 'field': 'source'}, + 'protocol': {'type': 'alias', 'field': 'proto'}, 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'}, - 'filesize_estimate': {'type': 'alias', 'field': 'size'}, - 'samplerate': {'type': 'alias', 'field': 'asr'}, - 'video_ext': {'type': 'alias', 'field': 'vext'}, - 'audio_ext': {'type': 'alias', 'field': 'aext'}, - 'video_codec': {'type': 'alias', 'field': 'vcodec'}, - 'audio_codec': {'type': 'alias', 'field': 'acodec'}, - 'video': {'type': 'alias', 'field': 'hasvid'}, - 'has_video': {'type': 'alias', 'field': 'hasvid'}, - 'audio': {'type': 'alias', 'field': 'hasaud'}, - 'has_audio': {'type': 'alias', 'field': 'hasaud'}, - 'extractor': {'type': 'alias', 'field': 'ie_pref'}, - 'preference': {'type': 'alias', 'field': 'ie_pref'}, - 'extractor_preference': {'type': 'alias', 'field': 'ie_pref'}, - 'format_id': {'type': 'alias', 'field': 'id'}, + + # Deprecated + 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True}, + 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True}, + 'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True}, + 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True}, + 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True}, + 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True}, + 'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True}, + 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True}, + 'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True}, + 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True}, + 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True}, + 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True}, + 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True}, + 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True}, + 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True}, + 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True}, + 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True}, + 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True}, + 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True}, + 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True}, } - _order = [] + def __init__(self, ie, field_preference): + self._order = [] + self.ydl = ie._downloader + self.evaluate_params(self.ydl.params, field_preference) + if 
ie.get_param('verbose'): + self.print_verbose_info(self.ydl.write_debug) def _get_field_setting(self, field, key): if field not in self.settings: + if key in ('forced', 'priority'): + return False + self.ydl.deprecation_warning( + f'Using arbitrary fields ({field}) for format sorting is deprecated ' + 'and may be removed in a future version') self.settings[field] = {} propObj = self.settings[field] if key not in propObj: @@ -1673,7 +1818,11 @@ class InfoExtractor(object): if field is None: continue if self._get_field_setting(field, 'type') == 'alias': - field = self._get_field_setting(field, 'field') + alias, field = field, self._get_field_setting(field, 'field') + if self._get_field_setting(alias, 'deprecated'): + self.ydl.deprecation_warning( + f'Format sorting alias {alias} is deprecated ' + f'and may be removed in a future version. Please use {field} instead') reverse = match.group('reverse') is not None closest = match.group('separator') == '~' limit_text = match.group('limit') @@ -1777,10 +1926,7 @@ class InfoExtractor(object): def _sort_formats(self, formats, field_preference=[]): if not formats: return - format_sort = self.FormatSort() # params and to_screen are taken from the downloader - format_sort.evaluate_params(self._downloader.params, field_preference) - if self.get_param('verbose', False): - format_sort.print_verbose_info(self._downloader.write_debug) + format_sort = self.FormatSort(self, field_preference) formats.sort(key=lambda f: format_sort.calculate_preference(f)) def _check_formats(self, formats, video_id): @@ -1899,7 +2045,7 @@ class InfoExtractor(object): tbr = int_or_none(media_el.attrib.get('bitrate')) width = int_or_none(media_el.attrib.get('width')) height = int_or_none(media_el.attrib.get('height')) - format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])) + format_id = join_nonempty(f4m_id, tbr or i) # If <bootstrapInfo> is present, the specified f4m is a # stream-level manifest, and only set-level manifests may refer to # external resources. 
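A recurring refactor in this diff replaces hand-rolled '-'.join(filter(None, [...])) expressions with join_nonempty, which drops falsy parts and stringifies the rest, so integer bitrates no longer need an explicit compat_str cast. Its behaviour, as relied on by the f4m hunk above (note the `tbr or i` fallback to the loop index):

from hypervideo_dl.utils import join_nonempty

assert join_nonempty('f4m', 1400) == 'f4m-1400'   # ints are stringified
assert join_nonempty(None, 'meta') == 'meta'      # falsy parts are dropped
assert join_nonempty('hls', None, 'audio', 3) == 'hls-audio-3'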
See section 11.4 and section 4 of F4M spec @@ -1961,7 +2107,7 @@ class InfoExtractor(object): def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None): return { - 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])), + 'format_id': join_nonempty(m3u8_id, 'meta'), 'url': m3u8_url, 'ext': ext, 'protocol': 'm3u8', @@ -2008,16 +2154,16 @@ class InfoExtractor(object): headers=headers, query=query, video_id=video_id) def _parse_m3u8_formats_and_subtitles( - self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8_native', + self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native', preference=None, quality=None, m3u8_id=None, live=False, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, video_id=None): formats, subtitles = [], {} - if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access - return formats, subtitles - - has_drm = re.search(r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', m3u8_doc) + has_drm = re.search('|'.join([ + r'#EXT-X-FAXS-CM:', # Adobe Flash Access + r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay + ]), m3u8_doc) def format_url(url): return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url) @@ -2056,9 +2202,9 @@ class InfoExtractor(object): if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is formats = [{ - 'format_id': '-'.join(map(str, filter(None, [m3u8_id, idx]))), + 'format_id': join_nonempty(m3u8_id, idx), 'format_index': idx, - 'url': m3u8_url, + 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'), 'ext': ext, 'protocol': entry_protocol, 'preference': preference, @@ -2105,7 +2251,7 @@ class InfoExtractor(object): if media_url: manifest_url = format_url(media_url) formats.extend({ - 'format_id': '-'.join(map(str, filter(None, (m3u8_id, group_id, name, idx)))), + 'format_id': join_nonempty(m3u8_id, group_id, name, idx), 'format_note': name, 'format_index': idx, 'url': manifest_url, @@ -2162,9 +2308,9 @@ class InfoExtractor(object): # format_id intact. 
if not live: stream_name = build_stream_name() - format_id[1] = stream_name if stream_name else '%d' % (tbr if tbr else len(formats)) + format_id[1] = stream_name or '%d' % (tbr or len(formats)) f = { - 'format_id': '-'.join(map(str, filter(None, format_id))), + 'format_id': join_nonempty(*format_id), 'format_index': idx, 'url': manifest_url, 'manifest_url': m3u8_url, @@ -2264,7 +2410,7 @@ class InfoExtractor(object): if smil is False: assert not fatal - return [] + return [], {} namespace = self._parse_smil_namespace(smil) @@ -2628,7 +2774,7 @@ class InfoExtractor(object): mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration')) formats, subtitles = [], {} - stream_numbers = {'audio': 0, 'video': 0} + stream_numbers = collections.defaultdict(int) for period in mpd_doc.findall(_add_ns('Period')): period_duration = parse_duration(period.get('duration')) or mpd_duration period_ms_info = extract_multisegment_info(period, { @@ -2644,11 +2790,15 @@ class InfoExtractor(object): mime_type = representation_attrib['mimeType'] content_type = representation_attrib.get('contentType', mime_type.split('/')[0]) - codecs = representation_attrib.get('codecs', '') + codecs = parse_codecs(representation_attrib.get('codecs', '')) if content_type not in ('video', 'audio', 'text'): if mime_type == 'image/jpeg': content_type = mime_type - elif codecs.split('.')[0] == 'stpp': + elif codecs['vcodec'] != 'none': + content_type = 'video' + elif codecs['acodec'] != 'none': + content_type = 'audio' + elif codecs.get('tcodec', 'none') != 'none': content_type = 'text' elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'): content_type = 'text' @@ -2694,10 +2844,8 @@ class InfoExtractor(object): 'format_note': 'DASH %s' % content_type, 'filesize': filesize, 'container': mimetype2ext(mime_type) + '_dash', - 'manifest_stream_number': stream_numbers[content_type] + **codecs } - f.update(parse_codecs(codecs)) - stream_numbers[content_type] += 1 elif content_type == 'text': f = { 'ext': mimetype2ext(mime_type), @@ -2770,7 +2918,8 @@ class InfoExtractor(object): segment_duration = None if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info: segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale']) - representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) + representation_ms_info['total_number'] = int(math.ceil( + float_or_none(period_duration, segment_duration, default=0))) representation_ms_info['fragments'] = [{ media_location_key: media_template % { 'Number': segment_number, @@ -2861,10 +3010,16 @@ class InfoExtractor(object): f['url'] = initialization_url f['fragments'].append({location_key(initialization_url): initialization_url}) f['fragments'].extend(representation_ms_info['fragments']) + if not period_duration: + period_duration = try_get( + representation_ms_info, + lambda r: sum(frag['duration'] for frag in r['fragments']), float) else: # Assuming direct URL to unfragmented media. 
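The segment-count fix above leans on float_or_none(value, scale) computing value / scale and returning a default on bad input, so a period with no usable duration now yields zero fragments instead of a TypeError. In plain numbers, with a simplified model of the utility:

import math

def float_or_none(v, scale=1, default=None):
    # Simplified model: divide, or fall back when v is missing/garbage.
    try:
        return float(v) / scale
    except (TypeError, ValueError):
        return default

assert int(math.ceil(float_or_none(600, 4, default=0))) == 150  # 600 s / 4 s segments
assert int(math.ceil(float_or_none(None, 4, default=0))) == 0   # no crash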
f['url'] = base_url - if content_type in ('video', 'audio') or mime_type == 'image/jpeg': + if content_type in ('video', 'audio', 'image/jpeg'): + f['manifest_stream_number'] = stream_numbers[f['url']] + stream_numbers[f['url']] += 1 formats.append(f) elif content_type == 'text': subtitles.setdefault(lang or 'und', []).append(f) @@ -2953,13 +3108,6 @@ class InfoExtractor(object): }) fragment_ctx['time'] += fragment_ctx['duration'] - format_id = [] - if ism_id: - format_id.append(ism_id) - if stream_name: - format_id.append(stream_name) - format_id.append(compat_str(tbr)) - if stream_type == 'text': subtitles.setdefault(stream_language, []).append({ 'ext': 'ismt', @@ -2978,7 +3126,7 @@ class InfoExtractor(object): }) elif stream_type in ('video', 'audio'): formats.append({ - 'format_id': '-'.join(format_id), + 'format_id': join_nonempty(ism_id, stream_name, tbr), 'url': ism_url, 'manifest_url': ism_url, 'ext': 'ismv' if stream_type == 'video' else 'isma', @@ -3008,7 +3156,7 @@ class InfoExtractor(object): }) return formats, subtitles - def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None): + def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None): def absolute_url(item_url): return urljoin(base_url, item_url) @@ -3402,15 +3550,11 @@ class InfoExtractor(object): return formats def _live_title(self, name): - """ Generate the title for a live video """ - now = datetime.datetime.now() - now_str = now.strftime('%Y-%m-%d %H:%M') - return name + ' ' + now_str + self._downloader.deprecation_warning('hypervideo_dl.InfoExtractor._live_title is deprecated and does not work as expected') + return name def _int(self, v, name, fatal=False, **kwargs): res = int_or_none(v, **kwargs) - if 'get_attr' in kwargs: - print(getattr(v, kwargs['get_attr'])) if res is None: msg = 'Failed to extract %s: Could not parse value %r' % (name, v) if fatal: @@ -3515,14 +3659,18 @@ class InfoExtractor(object): def extractor(): comments = [] + interrupted = True try: while True: comments.append(next(generator)) - except KeyboardInterrupt: - interrupted = True - self.to_screen('Interrupted by user') except StopIteration: interrupted = False + except KeyboardInterrupt: + self.to_screen('Interrupted by user') + except Exception as e: + if self.get_param('ignoreerrors') is not True: + raise + self._downloader.report_error(e) comment_count = len(comments) self.to_screen(f'Extracted {comment_count} comments') return { @@ -3536,11 +3684,11 @@ class InfoExtractor(object): @staticmethod def _merge_subtitle_items(subtitle_list1, subtitle_list2): - """ Merge subtitle items for one language. Items with duplicated URLs + """ Merge subtitle items for one language. Items with duplicated URLs/data will be dropped. 
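E.g., comparing on the (url, data) pair keeps an entry that shares its URL
with list1 but carries different inline data (URLs here are illustrative):

    list1 = [{'url': 'https://a.example/en.vtt'}]
    list2 = [{'url': 'https://a.example/en.vtt'}, {'data': 'WEBVTT ...'}]
    # merged result: the duplicate-URL item is dropped, the data-only item is kept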
""" - list1_urls = set([item['url'] for item in subtitle_list1]) + list1_data = set((item.get('url'), item.get('data')) for item in subtitle_list1) ret = list(subtitle_list1) - ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls]) + ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data) return ret @classmethod @@ -3565,9 +3713,8 @@ class InfoExtractor(object): def mark_watched(self, *args, **kwargs): if not self.get_param('mark_watched', False): return - if (self._get_login_info()[0] is not None - or self.get_param('cookiefile') - or self.get_param('cookiesfrombrowser')): + if (self.supports_login() and self._get_login_info()[0] is not None + or self.get_param('cookiefile') or self.get_param('cookiesfrombrowser')): self._mark_watched(*args, **kwargs) def _mark_watched(self, *args, **kwargs): @@ -3600,7 +3747,7 @@ class InfoExtractor(object): else 'public' if all_known else None) - def _configuration_arg(self, key, default=NO_DEFAULT, casesense=False): + def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False): ''' @returns A list of values for the extractor argument given by "key" or "default" if no such key is present @@ -3608,34 +3755,43 @@ class InfoExtractor(object): @param casesense When false, the values are converted to lower case ''' val = traverse_obj( - self._downloader.params, ('extractor_args', self.ie_key().lower(), key)) + self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key)) if val is None: return [] if default is NO_DEFAULT else default return list(val) if casesense else [x.lower() for x in val] + def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'): + if not playlist_id or not video_id: + return not video_id + + no_playlist = (smuggled_data or {}).get('force_noplaylist') + if no_playlist is not None: + return not no_playlist + + video_id = '' if video_id is True else f' {video_id}' + playlist_id = '' if playlist_id is True else f' {playlist_id}' + if self.get_param('noplaylist'): + self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist') + return False + self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}') + return True + class SearchInfoExtractor(InfoExtractor): """ Base class for paged search queries extractors. They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query} - Instances should define _SEARCH_KEY and _MAX_RESULTS. 
+ Instances should define _SEARCH_KEY and optionally _MAX_RESULTS """ + _MAX_RESULTS = float('inf') + @classmethod def _make_valid_url(cls): return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY - @classmethod - def suitable(cls, url): - return re.match(cls._make_valid_url(), url) is not None - def _real_extract(self, query): - mobj = re.match(self._make_valid_url(), query) - if mobj is None: - raise ExtractorError('Invalid search query "%s"' % query) - - prefix = mobj.group('prefix') - query = mobj.group('query') + prefix, query = self._match_valid_url(query).group('prefix', 'query') if prefix == '': return self._get_n_results(query, 1) elif prefix == 'all': diff --git a/hypervideo_dl/extractor/corus.py b/hypervideo_dl/extractor/corus.py index 352951e..1194613 100644 --- a/hypervideo_dl/extractor/corus.py +++ b/hypervideo_dl/extractor/corus.py @@ -55,7 +55,6 @@ class CorusIE(ThePlatformFeedIE): 'timestamp': 1486392197, }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, 'expected_warnings': ['Failed to parse JSON'], diff --git a/hypervideo_dl/extractor/coub.py b/hypervideo_dl/extractor/coub.py index eba6b73..e90aa19 100644 --- a/hypervideo_dl/extractor/coub.py +++ b/hypervideo_dl/extractor/coub.py @@ -57,7 +57,7 @@ class CoubIE(InfoExtractor): file_versions = coub['file_versions'] - QUALITIES = ('low', 'med', 'high') + QUALITIES = ('low', 'med', 'high', 'higher') MOBILE = 'mobile' IPHONE = 'iphone' @@ -86,6 +86,7 @@ class CoubIE(InfoExtractor): 'format_id': '%s-%s-%s' % (HTML5, kind, quality), 'filesize': int_or_none(item.get('size')), 'vcodec': 'none' if kind == 'audio' else None, + 'acodec': 'none' if kind == 'video' else None, 'quality': quality_key(quality), 'source_preference': preference_key(HTML5), }) diff --git a/hypervideo_dl/extractor/cozytv.py b/hypervideo_dl/extractor/cozytv.py new file mode 100644 index 0000000..d49f1ca --- /dev/null +++ b/hypervideo_dl/extractor/cozytv.py @@ -0,0 +1,40 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import unified_strdate + + +class CozyTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?cozy\.tv/(?P<uploader>[^/]+)/replays/(?P<id>[^/$#&?]+)' + + _TESTS = [{ + 'url': 'https://cozy.tv/beardson/replays/2021-11-19_1', + 'info_dict': { + 'id': 'beardson-2021-11-19_1', + 'ext': 'mp4', + 'title': 'pokemon pt2', + 'uploader': 'beardson', + 'upload_date': '20211119', + 'was_live': True, + 'duration': 7981, + }, + 'params': {'skip_download': True} + }] + + def _real_extract(self, url): + uploader, date = self._match_valid_url(url).groups() + id = f'{uploader}-{date}' + data_json = self._download_json(f'https://api.cozy.tv/cache/{uploader}/replay/{date}', id) + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + f'https://cozycdn.foxtrotstream.xyz/replays/{uploader}/{date}/index.m3u8', id, ext='mp4') + return { + 'id': id, + 'title': data_json.get('title'), + 'uploader': data_json.get('user') or uploader, + 'upload_date': unified_strdate(data_json.get('date')), + 'was_live': True, + 'duration': data_json.get('duration'), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/hypervideo_dl/extractor/cpac.py b/hypervideo_dl/extractor/cpac.py new file mode 100644 index 0000000..2274115 --- /dev/null +++ b/hypervideo_dl/extractor/cpac.py @@ -0,0 +1,148 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + 
str_or_none, + try_get, + unified_timestamp, + update_url_query, + urljoin, +) + +# compat_range +try: + if callable(xrange): + range = xrange +except (NameError, TypeError): + pass + + +class CPACIE(InfoExtractor): + IE_NAME = 'cpac' + _VALID_URL = r'https?://(?:www\.)?cpac\.ca/(?P<fr>l-)?episode\?id=(?P<id>[\da-f]{8}(?:-[\da-f]{4}){3}-[\da-f]{12})' + _TEST = { + # 'url': 'http://www.cpac.ca/en/programs/primetime-politics/episodes/65490909', + 'url': 'https://www.cpac.ca/episode?id=fc7edcae-4660-47e1-ba61-5b7f29a9db0f', + 'md5': 'e46ad699caafd7aa6024279f2614e8fa', + 'info_dict': { + 'id': 'fc7edcae-4660-47e1-ba61-5b7f29a9db0f', + 'ext': 'mp4', + 'upload_date': '20220215', + 'title': 'News Conference to Celebrate National Kindness Week – February 15, 2022', + 'description': 'md5:466a206abd21f3a6f776cdef290c23fb', + 'timestamp': 1644901200, + }, + 'params': { + 'format': 'bestvideo', + 'hls_prefer_native': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + url_lang = 'fr' if '/l-episode?' in url else 'en' + + content = self._download_json( + 'https://www.cpac.ca/api/1/services/contentModel.json?url=/site/website/episode/index.xml&crafterSite=cpacca&id=' + video_id, + video_id) + video_url = try_get(content, lambda x: x['page']['details']['videoUrl'], compat_str) + formats = [] + if video_url: + content = content['page'] + title = str_or_none(content['details']['title_%s_t' % (url_lang, )]) + formats = self._extract_m3u8_formats(video_url, video_id, m3u8_id='hls', ext='mp4') + for fmt in formats: + # prefer language to match URL + fmt_lang = fmt.get('language') + if fmt_lang == url_lang: + fmt['language_preference'] = 10 + elif not fmt_lang: + fmt['language_preference'] = -1 + else: + fmt['language_preference'] = -10 + + self._sort_formats(formats) + + category = str_or_none(content['details']['category_%s_t' % (url_lang, )]) + + def is_live(v_type): + return (v_type == 'live') if v_type is not None else None + + return { + 'id': video_id, + 'formats': formats, + 'title': title, + 'description': str_or_none(content['details'].get('description_%s_t' % (url_lang, ))), + 'timestamp': unified_timestamp(content['details'].get('liveDateTime')), + 'category': [category] if category else None, + 'thumbnail': urljoin(url, str_or_none(content['details'].get('image_%s_s' % (url_lang, )))), + 'is_live': is_live(content['details'].get('type')), + } + + +class CPACPlaylistIE(InfoExtractor): + IE_NAME = 'cpac:playlist' + _VALID_URL = r'(?i)https?://(?:www\.)?cpac\.ca/(?:program|search|(?P<fr>emission|rechercher))\?(?:[^&]+&)*?(?P<id>(?:id=\d+|programId=\d+|key=[^&]+))' + + _TESTS = [{ + 'url': 'https://www.cpac.ca/program?id=6', + 'info_dict': { + 'id': 'id=6', + 'title': 'Headline Politics', + 'description': 'Watch CPAC’s signature long-form coverage of the day’s pressing political events as they unfold.', + }, + 'playlist_count': 10, + }, { + 'url': 'https://www.cpac.ca/search?key=hudson&type=all&order=desc', + 'info_dict': { + 'id': 'key=hudson', + 'title': 'hudson', + }, + 'playlist_count': 22, + }, { + 'url': 'https://www.cpac.ca/search?programId=50', + 'info_dict': { + 'id': 'programId=50', + 'title': '50', + }, + 'playlist_count': 9, + }, { + 'url': 'https://www.cpac.ca/emission?id=6', + 'only_matching': True, + }, { + 'url': 'https://www.cpac.ca/rechercher?key=hudson&type=all&order=desc', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + url_lang = 'fr' if any(x in url for x in ('/emission?', '/rechercher?')) else 
'en' + pl_type, list_type = ('program', 'itemList') if any(x in url for x in ('/program?', '/emission?')) else ('search', 'searchResult') + api_url = ( + 'https://www.cpac.ca/api/1/services/contentModel.json?url=/site/website/%s/index.xml&crafterSite=cpacca&%s' + % (pl_type, video_id, )) + content = self._download_json(api_url, video_id) + entries = [] + total_pages = int_or_none(try_get(content, lambda x: x['page'][list_type]['totalPages']), default=1) + for page in range(1, total_pages + 1): + if page > 1: + api_url = update_url_query(api_url, {'page': '%d' % (page, ), }) + content = self._download_json( + api_url, video_id, + note='Downloading continuation - %d' % (page, ), + fatal=False) + + for item in try_get(content, lambda x: x['page'][list_type]['item'], list) or []: + episode_url = urljoin(url, try_get(item, lambda x: x['url_%s_s' % (url_lang, )])) + if episode_url: + entries.append(episode_url) + + return self.playlist_result( + (self.url_result(entry) for entry in entries), + playlist_id=video_id, + playlist_title=try_get(content, lambda x: x['page']['program']['title_%s_t' % (url_lang, )]) or video_id.split('=')[-1], + playlist_description=try_get(content, lambda x: x['page']['program']['description_%s_t' % (url_lang, )]), + ) diff --git a/hypervideo_dl/extractor/crackle.py b/hypervideo_dl/extractor/crackle.py index 2c9d28d..db4962c 100644 --- a/hypervideo_dl/extractor/crackle.py +++ b/hypervideo_dl/extractor/crackle.py @@ -23,32 +23,35 @@ from ..utils import ( class CrackleIE(InfoExtractor): _VALID_URL = r'(?:crackle:|https?://(?:(?:www|m)\.)?(?:sony)?crackle\.com/(?:playlist/\d+/|(?:[^/]+/)+))(?P<id>\d+)' _TESTS = [{ - # geo restricted to CA - 'url': 'https://www.crackle.com/andromeda/2502343', + # Crackle is available in the United States and territories + 'url': 'https://www.crackle.com/thanksgiving/2510064', 'info_dict': { - 'id': '2502343', + 'id': '2510064', 'ext': 'mp4', - 'title': 'Under The Night', - 'description': 'md5:d2b8ca816579ae8a7bf28bfff8cefc8a', - 'duration': 2583, + 'title': 'Touch Football', + 'description': 'md5:cfbb513cf5de41e8b56d7ab756cff4df', + 'duration': 1398, 'view_count': int, 'average_rating': 0, - 'age_limit': 14, - 'genre': 'Action, Sci-Fi', - 'creator': 'Allan Kroeker', - 'artist': 'Keith Hamilton Cobb, Kevin Sorbo, Lisa Ryder, Lexa Doig, Robert Hewitt Wolfe', - 'release_year': 2000, - 'series': 'Andromeda', - 'episode': 'Under The Night', + 'age_limit': 17, + 'genre': 'Comedy', + 'creator': 'Daniel Powell', + 'artist': 'Chris Elliott, Amy Sedaris', + 'release_year': 2016, + 'series': 'Thanksgiving', + 'episode': 'Touch Football', 'season_number': 1, 'episode_number': 1, }, 'params': { # m3u8 download 'skip_download': True, - } + }, + 'expected_warnings': [ + 'Trying with a list of known countries' + ], }, { - 'url': 'https://www.sonycrackle.com/andromeda/2502343', + 'url': 'https://www.sonycrackle.com/thanksgiving/2510064', 'only_matching': True, }] @@ -129,7 +132,6 @@ class CrackleIE(InfoExtractor): break ignore_no_formats = self.get_param('ignore_no_formats_error') - allow_unplayable_formats = self.get_param('allow_unplayable_formats') if not media or (not media.get('MediaURLs') and not ignore_no_formats): raise ExtractorError( @@ -143,9 +145,9 @@ class CrackleIE(InfoExtractor): for e in media.get('MediaURLs') or []: if e.get('UseDRM'): has_drm = True - if not allow_unplayable_formats: - continue - format_url = url_or_none(e.get('Path')) + format_url = url_or_none(e.get('DRMPath')) + else: + format_url = url_or_none(e.get('Path')) if not 
format_url: continue ext = determine_ext(format_url) diff --git a/hypervideo_dl/extractor/craftsy.py b/hypervideo_dl/extractor/craftsy.py new file mode 100644 index 0000000..ed2f442 --- /dev/null +++ b/hypervideo_dl/extractor/craftsy.py @@ -0,0 +1,71 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .brightcove import BrightcoveNewIE +from .common import InfoExtractor + +from ..utils import ( + dict_get, + get_element_by_id, + js_to_json, + traverse_obj, +) + + +class CraftsyIE(InfoExtractor): + _VALID_URL = r'https?://www.craftsy.com/class/(?P<id>[a-z0-9_-]+)/' + _TESTS = [{ + 'url': 'https://www.craftsy.com/class/the-midnight-quilt-show-season-5/', + 'info_dict': { + 'id': 'the-midnight-quilt-show-season-5', + 'title': 'The Midnight Quilt Show Season 5', + 'description': 'md5:113eda818e985d1a566625fb2f833b7a', + }, + 'playlist_count': 10, + }, { + 'url': 'https://www.craftsy.com/class/sew-your-own-designer-handbag/', + 'info_dict': { + 'id': 'sew-your-own-designer-handbag', + 'title': 'Sew Your Own Designer Handbag', + 'description': 'md5:8270d0ef5427d3c895a27351aeaac276', + }, + 'playlist_mincount': 1, + }, { + 'url': 'https://www.craftsy.com/class/all-access-estes-park-wool-market/', + 'info_dict': { + 'id': 'all-access-estes-park-wool-market', + 'title': 'All Access: Estes Park Wool Market', + 'description': 'md5:aded1bd8d38ae2fae4dae936c0ae01e7', + }, + 'playlist_count': 6, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + video_data = self._parse_json(self._search_regex( + r'class_video_player_vars\s*=\s*({.*})\s*;', + get_element_by_id('vidstore-classes_class-video-player-js-extra', webpage), + 'video data'), video_id, transform_source=js_to_json) + + account_id = traverse_obj(video_data, ('video_player', 'bc_account_id')) + + entries = [] + class_preview = traverse_obj(video_data, ('video_player', 'class_preview')) + if class_preview: + v_id = class_preview.get('video_id') + entries.append(self.url_result( + f'http://players.brightcove.net/{account_id}/default_default/index.html?videoId={v_id}', + BrightcoveNewIE, v_id, class_preview.get('title'))) + + if dict_get(video_data, ('is_free', 'user_has_access')): + entries += [ + self.url_result( + f'http://players.brightcove.net/{account_id}/default_default/index.html?videoId={lesson["video_id"]}', + BrightcoveNewIE, lesson['video_id'], lesson.get('title')) + for lesson in video_data['lessons']] + + return self.playlist_result( + entries, video_id, video_data.get('class_title'), + self._html_search_meta(('og:description', 'description'), webpage, default=None)) diff --git a/hypervideo_dl/extractor/crowdbunker.py b/hypervideo_dl/extractor/crowdbunker.py new file mode 100644 index 0000000..72906af --- /dev/null +++ b/hypervideo_dl/extractor/crowdbunker.py @@ -0,0 +1,113 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + try_get, + unified_strdate, +) + + +class CrowdBunkerIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?crowdbunker\.com/v/(?P<id>[^/?#$&]+)' + + _TESTS = [{ + 'url': 'https://crowdbunker.com/v/0z4Kms8pi8I', + 'info_dict': { + 'id': '0z4Kms8pi8I', + 'ext': 'mp4', + 'title': '117) Pass vax et solutions', + 'description': 'md5:86bcb422c29475dbd2b5dcfa6ec3749c', + 'view_count': int, + 'duration': 5386, + 'uploader': 'Jérémie Mercier', + 'uploader_id': 'UCeN_qQV829NYf0pvPJhW5dQ', + 'like_count': int, + 'upload_date': 
'20211218', + 'thumbnail': 'https://scw.divulg.org/cb-medias4/images/0z4Kms8pi8I/maxres.jpg' + }, + 'params': {'skip_download': True} + }] + + def _real_extract(self, url): + id = self._match_id(url) + data_json = self._download_json(f'https://api.divulg.org/post/{id}/details', + id, headers={'accept': 'application/json, text/plain, */*'}) + video_json = data_json['video'] + formats, subtitles = [], {} + for sub in video_json.get('captions') or []: + sub_url = try_get(sub, lambda x: x['file']['url']) + if not sub_url: + continue + subtitles.setdefault(sub.get('languageCode', 'fr'), []).append({ + 'url': sub_url, + }) + + mpd_url = try_get(video_json, lambda x: x['dashManifest']['url']) + if mpd_url: + fmts, subs = self._extract_mpd_formats_and_subtitles(mpd_url, id) + formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) + m3u8_url = try_get(video_json, lambda x: x['hlsManifest']['url']) + if m3u8_url: + fmts, subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, id) + formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) + + thumbnails = [{ + 'url': image['url'], + 'height': int_or_none(image.get('height')), + 'width': int_or_none(image.get('width')), + } for image in video_json.get('thumbnails') or [] if image.get('url')] + + self._sort_formats(formats) + return { + 'id': id, + 'title': video_json.get('title'), + 'description': video_json.get('description'), + 'view_count': video_json.get('viewCount'), + 'duration': video_json.get('duration'), + 'uploader': try_get(data_json, lambda x: x['channel']['name']), + 'uploader_id': try_get(data_json, lambda x: x['channel']['id']), + 'like_count': data_json.get('likesCount'), + 'upload_date': unified_strdate(video_json.get('publishedAt') or video_json.get('createdAt')), + 'thumbnails': thumbnails, + 'formats': formats, + 'subtitles': subtitles, + } + + +class CrowdBunkerChannelIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?crowdbunker\.com/@(?P<id>[^/?#$&]+)' + + _TESTS = [{ + 'url': 'https://crowdbunker.com/@Milan_UHRIN', + 'playlist_mincount': 14, + 'info_dict': { + 'id': 'Milan_UHRIN', + }, + }] + + def _entries(self, id): + last = None + + for page in itertools.count(): + channel_json = self._download_json( + f'https://api.divulg.org/organization/{id}/posts', id, headers={'accept': 'application/json, text/plain, */*'}, + query={'after': last} if last else {}, note=f'Downloading Page {page}') + for item in channel_json.get('items') or []: + v_id = item.get('uid') + if not v_id: + continue + yield self.url_result( + 'https://crowdbunker.com/v/%s' % v_id, ie=CrowdBunkerIE.ie_key(), video_id=v_id) + last = channel_json.get('last') + if not last: + break + + def _real_extract(self, url): + id = self._match_id(url) + return self.playlist_result(self._entries(id), playlist_id=id) diff --git a/hypervideo_dl/extractor/crunchyroll.py b/hypervideo_dl/extractor/crunchyroll.py index 511ac1b..7edb645 100644 --- a/hypervideo_dl/extractor/crunchyroll.py +++ b/hypervideo_dl/extractor/crunchyroll.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import base64 import re import json import zlib @@ -8,7 +9,7 @@ import zlib from hashlib import sha1 from math import pow, sqrt, floor from .common import InfoExtractor -from .vrv import VRVIE +from .vrv import VRVBaseIE from ..compat import ( compat_b64decode, compat_etree_Element, @@ -23,14 +24,17 @@ from ..utils import ( bytes_to_intlist, extract_attributes, float_or_none, + format_field, intlist_to_bytes, int_or_none, + join_nonempty,
lowercase_escape, merge_dicts, + qualities, remove_end, sanitized_Request, + traverse_obj, try_get, - urlencode_postdata, xpath_text, ) from ..aes import ( @@ -39,8 +43,8 @@ from ..aes import ( class CrunchyrollBaseIE(InfoExtractor): - _LOGIN_URL = 'https://www.crunchyroll.com/login' - _LOGIN_FORM = 'login_form' + _LOGIN_URL = 'https://www.crunchyroll.com/welcome/login' + _API_BASE = 'https://api.crunchyroll.com' _NETRC_MACHINE = 'crunchyroll' def _call_rpc_api(self, method, video_id, note=None, data=None): @@ -53,57 +57,50 @@ class CrunchyrollBaseIE(InfoExtractor): 'Content-Type': 'application/x-www-form-urlencoded', }) - def _login(self): - username, password = self._get_login_info() - if username is None: + def _perform_login(self, username, password): + if self._get_cookies(self._LOGIN_URL).get('etp_rt'): return - login_page = self._download_webpage( - self._LOGIN_URL, None, 'Downloading login page') - - def is_logged(webpage): - return 'href="/logout"' in webpage - - # Already logged in - if is_logged(login_page): - return - - login_form_str = self._search_regex( - r'(?P<form><form[^>]+?id=(["\'])%s\2[^>]*>)' % self._LOGIN_FORM, - login_page, 'login form', group='form') - - post_url = extract_attributes(login_form_str).get('action') - if not post_url: - post_url = self._LOGIN_URL - elif not post_url.startswith('http'): - post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) - - login_form = self._form_hidden_inputs(self._LOGIN_FORM, login_page) - - login_form.update({ - 'login_form[name]': username, - 'login_form[password]': password, - }) - - response = self._download_webpage( - post_url, None, 'Logging in', 'Wrong login info', - data=urlencode_postdata(login_form), - headers={'Content-Type': 'application/x-www-form-urlencoded'}) - - # Successful login - if is_logged(response): - return - - error = self._html_search_regex( - '(?s)<ul[^>]+class=["\']messages["\'][^>]*>(.+?)</ul>', - response, 'error message', default=None) - if error: - raise ExtractorError('Unable to login: %s' % error, expected=True) - - raise ExtractorError('Unable to log in') - - def _real_initialize(self): - self._login() + upsell_response = self._download_json( + f'{self._API_BASE}/get_upsell_data.0.json', None, 'Getting session id', + query={ + 'sess_id': 1, + 'device_id': 'whatvalueshouldbeforweb', + 'device_type': 'com.crunchyroll.static', + 'access_token': 'giKq5eY27ny3cqz', + 'referer': self._LOGIN_URL + }) + if upsell_response['code'] != 'ok': + raise ExtractorError('Could not get session id') + session_id = upsell_response['data']['session_id'] + + login_response = self._download_json( + f'{self._API_BASE}/login.1.json', None, 'Logging in', + data=compat_urllib_parse_urlencode({ + 'account': username, + 'password': password, + 'session_id': session_id + }).encode('ascii')) + if login_response['code'] != 'ok': + raise ExtractorError('Login failed. 
Server message: %s' % login_response['message'], expected=True) + if not self._get_cookies(self._LOGIN_URL).get('etp_rt'): + raise ExtractorError('Login succeeded but did not set etp_rt cookie') + + # Beta-specific, but needed for redirects + def _get_beta_embedded_json(self, webpage, display_id): + initial_state = self._parse_json(self._search_regex( + r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'initial state'), display_id) + app_config = self._parse_json(self._search_regex( + r'__APP_CONFIG__\s*=\s*({.+?})\s*;', webpage, 'app config'), display_id) + return initial_state, app_config + + def _redirect_to_beta(self, webpage, iekey, video_id): + if not self._get_cookies(self._LOGIN_URL).get('etp_rt'): + raise ExtractorError('Received a beta page from non-beta url when not logged in.') + initial_state, app_config = self._get_beta_embedded_json(webpage, video_id) + url = app_config['baseSiteUrl'] + initial_state['router']['locations']['current']['pathname'] + self.to_screen(f'{video_id}: Redirected to beta site - {url}') + return self.url_result(f'{url}', iekey, video_id) @staticmethod def _add_skip_wall(url): @@ -119,7 +116,7 @@ class CrunchyrollBaseIE(InfoExtractor): parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True))) -class CrunchyrollIE(CrunchyrollBaseIE, VRVIE): +class CrunchyrollIE(CrunchyrollBaseIE, VRVBaseIE): IE_NAME = 'crunchyroll' _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|(?:[^/]*/){1,2}[^/?&]*?)(?P<id>[0-9]+))(?:[/?&]|$)' _TESTS = [{ @@ -425,6 +422,8 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text webpage = self._download_webpage( self._add_skip_wall(webpage_url), video_id, headers=self.geo_verification_headers()) + if re.search(r'<div id="preload-data">', webpage): + return self._redirect_to_beta(webpage, CrunchyrollBetaIE.ie_key(), video_id) note_m = self._html_search_regex( r'<div class="showmedia-trailer-notice">(.+?)</div>', webpage, 'trailer-notice', default='') @@ -478,19 +477,24 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text [r'<a[^>]+href="/publisher/[^"]+"[^>]*>([^<]+)</a>', r'<div>\s*Publisher:\s*<span>\s*(.+?)\s*</span>\s*</div>'], webpage, 'video_uploader', default=False) + requested_languages = self._configuration_arg('language') + requested_hardsubs = [('' if val == 'none' else val) for val in self._configuration_arg('hardsub')] + language_preference = qualities((requested_languages or [language or ''])[::-1]) + hardsub_preference = qualities((requested_hardsubs or ['', language or ''])[::-1]) + formats = [] for stream in media.get('streams', []): - audio_lang = stream.get('audio_lang') - hardsub_lang = stream.get('hardsub_lang') + audio_lang = stream.get('audio_lang') or '' + hardsub_lang = stream.get('hardsub_lang') or '' + if (requested_languages and audio_lang.lower() not in requested_languages + or requested_hardsubs and hardsub_lang.lower() not in requested_hardsubs): + continue vrv_formats = self._extract_vrv_formats( stream.get('url'), video_id, stream.get('format'), audio_lang, hardsub_lang) for f in vrv_formats: - f['language_preference'] = 1 if audio_lang == language else 0 - f['quality'] = ( - 1 if not hardsub_lang - else 0 if hardsub_lang == language - else -1) + f['language_preference'] = language_preference(audio_lang) + f['quality'] = hardsub_preference(hardsub_lang) formats.extend(vrv_formats) if not formats: available_fmts = [] @@ -684,6 +688,8 @@ class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE): # 
https:// gives a 403, but http:// does not self._add_skip_wall(url).replace('https://', 'http://'), show_id, headers=self.geo_verification_headers()) + if re.search(r'<div id="preload-data">', webpage): + return self._redirect_to_beta(webpage, CrunchyrollBetaShowIE.ie_key(), show_id) title = self._html_search_meta('name', webpage, default=None) episode_re = r'<li id="showview_videos_media_(\d+)"[^>]+>.*?<a href="([^"]+)"' @@ -706,9 +712,56 @@ class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE): } -class CrunchyrollBetaIE(CrunchyrollBaseIE): +class CrunchyrollBetaBaseIE(CrunchyrollBaseIE): + params = None + + def _get_params(self, lang): + if not CrunchyrollBetaBaseIE.params: + initial_state, app_config = self._get_beta_embedded_json(self._download_webpage( + f'https://beta.crunchyroll.com/{lang}', None, note='Retrieving main page'), None) + api_domain = app_config['cxApiParams']['apiDomain'] + basic_token = str(base64.b64encode(('%s:' % app_config['cxApiParams']['accountAuthClientId']).encode('ascii')), 'ascii') + auth_response = self._download_json( + f'{api_domain}/auth/v1/token', None, note='Authenticating with cookie', + headers={ + 'Authorization': 'Basic ' + basic_token + }, data='grant_type=etp_rt_cookie'.encode('ascii')) + policy_response = self._download_json( + f'{api_domain}/index/v2', None, note='Retrieving signed policy', + headers={ + 'Authorization': auth_response['token_type'] + ' ' + auth_response['access_token'] + }) + bucket = policy_response['cms']['bucket'] + params = { + 'Policy': policy_response['cms']['policy'], + 'Signature': policy_response['cms']['signature'], + 'Key-Pair-Id': policy_response['cms']['key_pair_id'] + } + locale = traverse_obj(initial_state, ('localization', 'locale')) + if locale: + params['locale'] = locale + CrunchyrollBetaBaseIE.params = (api_domain, bucket, params) + return CrunchyrollBetaBaseIE.params + + def _redirect_from_beta(self, url, lang, internal_id, display_id, is_episode, iekey): + initial_state, app_config = self._get_beta_embedded_json(self._download_webpage(url, display_id), display_id) + content_data = initial_state['content']['byId'][internal_id] + if is_episode: + video_id = content_data['external_id'].split('.')[1] + series_id = content_data['episode_metadata']['series_slug_title'] + else: + series_id = content_data['slug_title'] + series_id = re.sub(r'-{2,}', '-', series_id) + url = f'https://www.crunchyroll.com/{lang}{series_id}' + if is_episode: + url = url + f'/{display_id}-{video_id}' + self.to_screen(f'{display_id}: Not logged in. 
Redirecting to non-beta site - {url}') + return self.url_result(url, iekey, display_id) + + +class CrunchyrollBetaIE(CrunchyrollBetaBaseIE): IE_NAME = 'crunchyroll:beta' - _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)watch/(?P<internal_id>\w+)/(?P<id>[\w\-]+)/?(?:\?|$)' + _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)watch/(?P<id>\w+)/(?P<display_id>[\w\-]*)/?(?:\?|$)' _TESTS = [{ 'url': 'https://beta.crunchyroll.com/watch/GY2P1Q98Y/to-the-future', 'info_dict': { @@ -719,26 +772,129 @@ class CrunchyrollBetaIE(CrunchyrollBaseIE): 'uploader': 'Toei Animation', 'title': 'World Trigger Episode 73 – To the Future', 'upload_date': '20160402', + 'episode_number': 73, + 'series': 'World Trigger', + 'average_rating': 4.9, + 'episode': 'To the Future', + 'season': 'World Trigger', + 'thumbnail': 'https://img1.ak.crunchyroll.com/i/spire3-tmb/c870dedca1a83137c2d3d144984155ed1459527119_main.jpg', + 'season_number': 1, + }, + 'params': {'skip_download': 'm3u8'}, + 'expected_warnings': ['Unable to download XML'] + }, { + 'url': 'https://beta.crunchyroll.com/watch/GYK53DMPR/wicked-lord-shingan-reborn', + 'info_dict': { + 'id': '648781', + 'ext': 'mp4', + 'episode_number': 1, + 'timestamp': 1389173400, + 'series': 'Love, Chunibyo & Other Delusions - Heart Throb -', + 'description': 'md5:5579d1a0355cc618558ba23d27067a62', + 'uploader': 'TBS', + 'episode': 'Wicked Lord Shingan... Reborn', + 'average_rating': 4.9, + 'season': 'Love, Chunibyo & Other Delusions - Heart Throb -', + 'thumbnail': 'https://img1.ak.crunchyroll.com/i/spire3-tmb/2ba0384e225a5370d5f0ee9496d91ea51389046521_main.jpg', + 'title': 'Love, Chunibyo & Other Delusions - Heart Throb - Episode 1 – Wicked Lord Shingan... Reborn', + 'season_number': 2, + 'upload_date': '20140108', }, 'params': {'skip_download': 'm3u8'}, 'expected_warnings': ['Unable to download XML'] + }, { + 'url': 'https://beta.crunchyroll.com/watch/GY2P1Q98Y/', + 'only_matching': True, }] def _real_extract(self, url): - lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'internal_id', 'id') - webpage = self._download_webpage(url, display_id) - episode_data = self._parse_json( - self._search_regex(r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'episode data'), - display_id)['content']['byId'][internal_id] - video_id = episode_data['external_id'].split('.')[1] - series_id = episode_data['episode_metadata']['series_slug_title'] - return self.url_result(f'https://www.crunchyroll.com/{lang}{series_id}/{display_id}-{video_id}', - CrunchyrollIE.ie_key(), video_id) - - -class CrunchyrollBetaShowIE(CrunchyrollBaseIE): + lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id') + + if not self._get_cookies(url).get('etp_rt'): + return self._redirect_from_beta(url, lang, internal_id, display_id, True, CrunchyrollIE.ie_key()) + + api_domain, bucket, params = self._get_params(lang) + + episode_response = self._download_json( + f'{api_domain}/cms/v2{bucket}/episodes/{internal_id}', display_id, + note='Retrieving episode metadata', + query=params) + if episode_response.get('is_premium_only') and not episode_response.get('playback'): + raise ExtractorError('This video is for premium members only.', expected=True) + stream_response = self._download_json( + episode_response['playback'], display_id, + note='Retrieving stream info') + + thumbnails = [] + for thumbnails_data in traverse_obj(episode_response, ('images', 'thumbnail')): + for thumbnail_data in thumbnails_data: + 
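+                # 'images'/'thumbnail' in the episode payload is a list of
+                # lists (one list of size variants per artwork), hence the
+                # nested loop collecting url/width/height for every variant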
thumbnails.append({ + 'url': thumbnail_data.get('source'), + 'width': thumbnail_data.get('width'), + 'height': thumbnail_data.get('height'), + }) + subtitles = {} + for lang, subtitle_data in stream_response.get('subtitles').items(): + subtitles[lang] = [{ + 'url': subtitle_data.get('url'), + 'ext': subtitle_data.get('format') + }] + + requested_hardsubs = [('' if val == 'none' else val) for val in (self._configuration_arg('hardsub') or ['none'])] + hardsub_preference = qualities(requested_hardsubs[::-1]) + requested_formats = self._configuration_arg('format') or ['adaptive_hls'] + + formats = [] + for stream_type, streams in stream_response.get('streams', {}).items(): + if stream_type not in requested_formats: + continue + for stream in streams.values(): + hardsub_lang = stream.get('hardsub_locale') or '' + if hardsub_lang.lower() not in requested_hardsubs: + continue + format_id = join_nonempty( + stream_type, + format_field(stream, 'hardsub_locale', 'hardsub-%s')) + if not stream.get('url'): + continue + if stream_type.split('_')[-1] == 'hls': + adaptive_formats = self._extract_m3u8_formats( + stream['url'], display_id, 'mp4', m3u8_id=format_id, + note='Downloading %s information' % format_id, + fatal=False) + elif stream_type.split('_')[-1] == 'dash': + adaptive_formats = self._extract_mpd_formats( + stream['url'], display_id, mpd_id=format_id, + note='Downloading %s information' % format_id, + fatal=False) + for f in adaptive_formats: + if f.get('acodec') != 'none': + f['language'] = stream_response.get('audio_locale') + f['quality'] = hardsub_preference(hardsub_lang.lower()) + formats.extend(adaptive_formats) + self._sort_formats(formats) + + return { + 'id': internal_id, + 'title': '%s Episode %s – %s' % (episode_response.get('season_title'), episode_response.get('episode'), episode_response.get('title')), + 'description': episode_response.get('description').replace(r'\r\n', '\n'), + 'duration': float_or_none(episode_response.get('duration_ms'), 1000), + 'thumbnails': thumbnails, + 'series': episode_response.get('series_title'), + 'series_id': episode_response.get('series_id'), + 'season': episode_response.get('season_title'), + 'season_id': episode_response.get('season_id'), + 'season_number': episode_response.get('season_number'), + 'episode': episode_response.get('title'), + 'episode_number': episode_response.get('sequence_number'), + 'subtitles': subtitles, + 'formats': formats + } + + +class CrunchyrollBetaShowIE(CrunchyrollBetaBaseIE): IE_NAME = 'crunchyroll:playlist:beta' - _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)series/\w+/(?P<id>[\w\-]+)/?(?:\?|$)' + _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)series/(?P<id>\w+)/(?P<display_id>[\w\-]*)/?(?:\?|$)' _TESTS = [{ 'url': 'https://beta.crunchyroll.com/series/GY19NQ2QR/Girl-Friend-BETA', 'info_dict': { @@ -747,11 +903,56 @@ class CrunchyrollBetaShowIE(CrunchyrollBaseIE): }, 'playlist_mincount': 10, }, { + 'url': 'https://beta.crunchyroll.com/series/GYJQV73V6/love-chunibyo--other-delusions---heart-throb--', + 'info_dict': { + 'id': 'love-chunibyo-other-delusions-heart-throb-', + 'title': 'Love, Chunibyo & Other Delusions - Heart Throb -', + }, + 'playlist_mincount': 10, + }, { 'url': 'https://beta.crunchyroll.com/it/series/GY19NQ2QR/Girl-Friend-BETA', 'only_matching': True, }] def _real_extract(self, url): - lang, series_id = self._match_valid_url(url).group('lang', 'id') - return self.url_result(f'https://www.crunchyroll.com/{lang}{series_id.lower()}', - 
CrunchyrollShowPlaylistIE.ie_key(), series_id) + lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id') + + if not self._get_cookies(url).get('etp_rt'): + return self._redirect_from_beta(url, lang, internal_id, display_id, False, CrunchyrollShowPlaylistIE.ie_key()) + + api_domain, bucket, params = self._get_params(lang) + + series_response = self._download_json( + f'{api_domain}/cms/v2{bucket}/series/{internal_id}', display_id, + note='Retrieving series metadata', query=params) + + seasons_response = self._download_json( + f'{api_domain}/cms/v2{bucket}/seasons?series_id={internal_id}', display_id, + note='Retrieving season list', query=params) + + def entries(): + for season in seasons_response['items']: + episodes_response = self._download_json( + f'{api_domain}/cms/v2{bucket}/episodes?season_id={season["id"]}', display_id, + note=f'Retrieving episode list for {season.get("slug_title")}', query=params) + for episode in episodes_response['items']: + episode_id = episode['id'] + episode_display_id = episode['slug_title'] + yield { + '_type': 'url', + 'url': f'https://beta.crunchyroll.com/{lang}watch/{episode_id}/{episode_display_id}', + 'ie_key': CrunchyrollBetaIE.ie_key(), + 'id': episode_id, + 'title': '%s Episode %s – %s' % (episode.get('season_title'), episode.get('episode'), episode.get('title')), + 'description': try_get(episode, lambda x: x['description'].replace(r'\r\n', '\n')), + 'duration': float_or_none(episode.get('duration_ms'), 1000), + 'series': episode.get('series_title'), + 'series_id': episode.get('series_id'), + 'season': episode.get('season_title'), + 'season_id': episode.get('season_id'), + 'season_number': episode.get('season_number'), + 'episode': episode.get('title'), + 'episode_number': episode.get('sequence_number') + } + + return self.playlist_result(entries(), internal_id, series_response.get('title')) diff --git a/hypervideo_dl/extractor/cspan.py b/hypervideo_dl/extractor/cspan.py index 2e01aff..f51159b 100644 --- a/hypervideo_dl/extractor/cspan.py +++ b/hypervideo_dl/extractor/cspan.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_HTMLParseError from ..utils import ( determine_ext, ExtractorError, @@ -11,14 +12,16 @@ from ..utils import ( get_element_by_attribute, get_element_by_class, int_or_none, + join_nonempty, js_to_json, merge_dicts, parse_iso8601, + parse_qs, smuggle_url, str_to_int, unescapeHTML, ) -from .senateisvp import SenateISVPIE +from .senategov import SenateISVPIE from .ustream import UstreamIE @@ -126,8 +129,12 @@ class CSpanIE(InfoExtractor): ext = 'vtt' subtitle['ext'] = ext ld_info = self._search_json_ld(webpage, video_id, default={}) - title = get_element_by_class('video-page-title', webpage) or \ - self._og_search_title(webpage) + try: + title = get_element_by_class('video-page-title', webpage) + except compat_HTMLParseError: + title = None + if title is None: + title = self._og_search_title(webpage) description = get_element_by_attribute('itemprop', 'description', webpage) or \ self._html_search_meta(['og:description', 'description'], webpage) return merge_dicts(info, ld_info, { @@ -242,3 +249,42 @@ class CSpanIE(InfoExtractor): 'title': title, 'id': 'c' + video_id if video_type == 'clip' else video_id, } + + +class CSpanCongressIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?c-span\.org/congress/' + _TESTS = [{ + 'url': 'https://www.c-span.org/congress/?chamber=house&date=2017-12-13&t=1513208380', + 'info_dict': 
{ + 'id': 'house_2017-12-13', + 'title': 'Congressional Chronicle - Members of Congress, Hearings and More', + 'description': 'md5:54c264b7a8f219937987610243305a84', + 'thumbnail': r're:https://ximage.c-spanvideo.org/.+', + 'ext': 'mp4' + } + }] + + def _real_extract(self, url): + query = parse_qs(url) + video_date = query.get('date', [None])[0] + video_id = join_nonempty(query.get('chamber', ['senate'])[0], video_date, delim='_') + webpage = self._download_webpage(url, video_id) + if not video_date: + jwp_date = re.search(r'jwsetup.clipprogdate = \'(?P<date>\d{4}-\d{2}-\d{2})\';', webpage) + if jwp_date: + video_id = f'{video_id}_{jwp_date.group("date")}' + jwplayer_data = self._parse_json( + self._search_regex(r'jwsetup\s*=\s*({(?:.|\n)[^;]+});', webpage, 'player config'), + video_id, transform_source=js_to_json) + + title = (self._og_search_title(webpage, default=None) + or self._html_extract_title(webpage, 'video title')) + description = (self._og_search_description(webpage, default=None) + or self._html_search_meta('description', webpage, 'description', default=None)) + + return { + **self._parse_jwplayer_data(jwplayer_data, video_id, False), + 'title': re.sub(r'\s+', ' ', title.split('|')[0]).strip(), + 'description': description, + 'http_headers': {'Referer': 'https://www.c-span.org/'}, + } diff --git a/hypervideo_dl/extractor/ctvnews.py b/hypervideo_dl/extractor/ctvnews.py index 03f8cef..952f4c7 100644 --- a/hypervideo_dl/extractor/ctvnews.py +++ b/hypervideo_dl/extractor/ctvnews.py @@ -65,4 +65,9 @@ class CTVNewsIE(InfoExtractor): }) entries = [ninecninemedia_url_result(clip_id) for clip_id in orderedSet( re.findall(r'clip\.id\s*=\s*(\d+);', webpage))] + if not entries: + webpage = self._download_webpage(url, page_id) + if 'getAuthStates("' in webpage: + entries = [ninecninemedia_url_result(clip_id) for clip_id in + self._search_regex(r'getAuthStates\("([\d+,]+)"', webpage, 'clip ids').split(',')] return self.playlist_result(entries, page_id) diff --git a/hypervideo_dl/extractor/curiositystream.py b/hypervideo_dl/extractor/curiositystream.py index 034a5c9..b8abcf7 100644 --- a/hypervideo_dl/extractor/curiositystream.py +++ b/hypervideo_dl/extractor/curiositystream.py @@ -15,7 +15,6 @@ from ..utils import ( class CuriosityStreamBaseIE(InfoExtractor): _NETRC_MACHINE = 'curiositystream' _auth_token = None - _API_BASE_URL = 'https://api.curiositystream.com/v1/' def _handle_errors(self, result): error = result.get('error', {}).get('message') @@ -34,43 +33,46 @@ class CuriosityStreamBaseIE(InfoExtractor): self._handle_errors(result) return result['data'] - def _real_initialize(self): - email, password = self._get_login_info() - if email is None: - return + def _perform_login(self, username, password): result = self._download_json( - self._API_BASE_URL + 'login', None, data=urlencode_postdata({ - 'email': email, + 'https://api.curiositystream.com/v1/login', None, + note='Logging in', data=urlencode_postdata({ + 'email': username, 'password': password, })) self._handle_errors(result) - self._auth_token = result['message']['auth_token'] + CuriosityStreamBaseIE._auth_token = result['message']['auth_token'] class CuriosityStreamIE(CuriosityStreamBaseIE): IE_NAME = 'curiositystream' _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/video/(?P<id>\d+)' - _TEST = { + _TESTS = [{ 'url': 'https://app.curiositystream.com/video/2', 'info_dict': { 'id': '2', 'ext': 'mp4', 'title': 'How Did You Develop The Internet?', 'description': 'Vint Cerf, Google\'s Chief Internet Evangelist, describes how 
he and Bob Kahn created the internet.', + 'channel': 'Curiosity Stream', + 'categories': ['Technology', 'Interview'], + 'average_rating': 96.79, + 'series_id': '2', }, 'params': { - 'format': 'bestvideo', # m3u8 download 'skip_download': True, }, - } + }] + + _API_BASE_URL = 'https://api.curiositystream.com/v1/media/' def _real_extract(self, url): video_id = self._match_id(url) formats = [] for encoding_format in ('m3u8', 'mpd'): - media = self._call_api('media/' + video_id, video_id, query={ + media = self._call_api(video_id, video_id, query={ 'encodingsNew': 'true', 'encodingsFormat': encoding_format, }) @@ -140,12 +142,33 @@ class CuriosityStreamIE(CuriosityStreamBaseIE): 'duration': int_or_none(media.get('duration')), 'tags': media.get('tags'), 'subtitles': subtitles, + 'channel': media.get('producer'), + 'categories': [media.get('primary_category'), media.get('type')], + 'average_rating': media.get('rating_percentage'), + 'series_id': str(media.get('collection_id') or '') or None, } -class CuriosityStreamCollectionIE(CuriosityStreamBaseIE): - IE_NAME = 'curiositystream:collection' - _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/(?:collections?|series)/(?P<id>\d+)' +class CuriosityStreamCollectionBaseIE(CuriosityStreamBaseIE): + + def _real_extract(self, url): + collection_id = self._match_id(url) + collection = self._call_api(collection_id, collection_id) + entries = [] + for media in collection.get('media', []): + media_id = compat_str(media.get('id')) + media_type, ie = ('series', CuriosityStreamSeriesIE) if media.get('is_collection') else ('video', CuriosityStreamIE) + entries.append(self.url_result( + 'https://curiositystream.com/%s/%s' % (media_type, media_id), + ie=ie.ie_key(), video_id=media_id)) + return self.playlist_result( + entries, collection_id, + collection.get('title'), collection.get('description')) + + +class CuriosityStreamCollectionsIE(CuriosityStreamCollectionBaseIE): + IE_NAME = 'curiositystream:collections' + _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/collections/(?P<id>\d+)' _API_BASE_URL = 'https://api.curiositystream.com/v2/collections/' _TESTS = [{ 'url': 'https://curiositystream.com/collections/86', @@ -156,7 +179,17 @@ class CuriosityStreamCollectionIE(CuriosityStreamBaseIE): }, 'playlist_mincount': 7, }, { - 'url': 'https://app.curiositystream.com/collection/2', + 'url': 'https://curiositystream.com/collections/36', + 'only_matching': True, + }] + + +class CuriosityStreamSeriesIE(CuriosityStreamCollectionBaseIE): + IE_NAME = 'curiositystream:series' + _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/(?:series|collection)/(?P<id>\d+)' + _API_BASE_URL = 'https://api.curiositystream.com/v2/series/' + _TESTS = [{ + 'url': 'https://curiositystream.com/series/2', 'info_dict': { 'id': '2', 'title': 'Curious Minds: The Internet', @@ -164,23 +197,6 @@ class CuriosityStreamCollectionIE(CuriosityStreamBaseIE): }, 'playlist_mincount': 16, }, { - 'url': 'https://curiositystream.com/series/2', - 'only_matching': True, - }, { - 'url': 'https://curiositystream.com/collections/36', + 'url': 'https://curiositystream.com/collection/2', 'only_matching': True, }] - - def _real_extract(self, url): - collection_id = self._match_id(url) - collection = self._call_api(collection_id, collection_id) - entries = [] - for media in collection.get('media', []): - media_id = compat_str(media.get('id')) - media_type, ie = ('series', CuriosityStreamCollectionIE) if media.get('is_collection') else ('video', CuriosityStreamIE) - entries.append(self.url_result( - 
'https://curiositystream.com/%s/%s' % (media_type, media_id), - ie=ie.ie_key(), video_id=media_id)) - return self.playlist_result( - entries, collection_id, - collection.get('title'), collection.get('description')) diff --git a/hypervideo_dl/extractor/cybrary.py b/hypervideo_dl/extractor/cybrary.py new file mode 100644 index 0000000..c278f0f --- /dev/null +++ b/hypervideo_dl/extractor/cybrary.py @@ -0,0 +1,146 @@ +# coding: utf-8 +from .common import InfoExtractor + +from ..utils import ( + ExtractorError, + smuggle_url, + str_or_none, + traverse_obj, + urlencode_postdata +) + + +class CybraryBaseIE(InfoExtractor): + _API_KEY = 'AIzaSyCX9ru6j70PX2My1Eq6Q1zoMAhuTdXlzSw' + _ENDPOINTS = { + 'course': 'https://app.cybrary.it/courses/api/catalog/browse/course/{}', + 'course_enrollment': 'https://app.cybrary.it/courses/api/catalog/{}/enrollment', + 'enrollment': 'https://app.cybrary.it/courses/api/enrollment/{}', + 'launch': 'https://app.cybrary.it/courses/api/catalog/{}/launch', + 'vimeo_oembed': 'https://vimeo.com/api/oembed.json?url=https://vimeo.com/{}', + } + _NETRC_MACHINE = 'cybrary' + _TOKEN = None + + def _perform_login(self, username, password): + CybraryBaseIE._TOKEN = self._download_json( + f'https://identitytoolkit.googleapis.com/v1/accounts:signInWithPassword?key={self._API_KEY}', + None, data=urlencode_postdata({'email': username, 'password': password, 'returnSecureToken': True}), + note='Logging in')['idToken'] + + def _real_initialize(self): + if not self._TOKEN: + self.raise_login_required(method='password') + + def _call_api(self, endpoint, item_id): + return self._download_json( + self._ENDPOINTS[endpoint].format(item_id), item_id, + note=f'Downloading {endpoint} JSON metadata', + headers={'Authorization': f'Bearer {self._TOKEN}'}) + + def _get_vimeo_id(self, activity_id): + launch_api = self._call_api('launch', activity_id) + + if launch_api.get('url'): + return self._search_regex(r'https?://player\.vimeo\.com/video/(?P<vimeo_id>[0-9]+)', launch_api['url'], 'vimeo_id') + return traverse_obj(launch_api, ('vendor_data', 'content', ..., 'videoId'), get_all=False) + + +class CybraryIE(CybraryBaseIE): + _VALID_URL = r'https?://app.cybrary.it/immersive/(?P<enrollment>[0-9]+)/activity/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'https://app.cybrary.it/immersive/12487950/activity/63102', + 'md5': '9ae12d37e555cb2ed554223a71a701d0', + 'info_dict': { + 'id': '646609770', + 'ext': 'mp4', + 'title': 'Getting Started', + 'thumbnail': 'https://i.vimeocdn.com/video/1301817996-76a268f0c56cff18a5cecbbdc44131eb9dda0c80eb0b3a036_1280', + 'series_id': '63111', + 'uploader_url': 'https://vimeo.com/user30867300', + 'duration': 88, + 'uploader_id': 'user30867300', + 'series': 'Cybrary Orientation', + 'uploader': 'Cybrary', + 'chapter': 'Cybrary Orientation Series', + 'chapter_id': '63110' + }, + 'expected_warnings': ['No authenticators for vimeo'] + }, { + 'url': 'https://app.cybrary.it/immersive/12747143/activity/52686', + 'md5': '62f26547dccc59c44363e2a13d4ad08d', + 'info_dict': { + 'id': '445638073', + 'ext': 'mp4', + 'title': 'Azure Virtual Network IP Addressing', + 'thumbnail': 'https://i.vimeocdn.com/video/936667051-1647ace66c627d4a2382185e0dae8deb830309bfddd53f8b2367b2f91e92ed0e-d_1280', + 'series_id': '52733', + 'uploader_url': 'https://vimeo.com/user30867300', + 'duration': 426, + 'uploader_id': 'user30867300', + 'series': 'AZ-500: Microsoft Azure Security Technologies', + 'uploader': 'Cybrary', + 'chapter': 'Implement Network Security', + 'chapter_id': '52693' + }, + 'expected_warnings': ['No 
authenticators for vimeo'] + }] + + def _real_extract(self, url): + activity_id, enrollment_id = self._match_valid_url(url).group('id', 'enrollment') + course = self._call_api('enrollment', enrollment_id)['content'] + activity = traverse_obj(course, ('learning_modules', ..., 'activities', lambda _, v: int(activity_id) == v['id']), get_all=False) + + if activity.get('type') not in ['Video Activity', 'Lesson Activity']: + raise ExtractorError('The activity is not a video', expected=True) + + module = next((m for m in course.get('learning_modules') or [] + if int(activity_id) in (traverse_obj(m, ('activities', ..., 'id')) or [])), None) + + vimeo_id = self._get_vimeo_id(activity_id) + + return { + '_type': 'url_transparent', + 'series': traverse_obj(course, ('content_description', 'title')), + 'series_id': str_or_none(traverse_obj(course, ('content_description', 'id'))), + 'id': vimeo_id, + 'chapter': module.get('title'), + 'chapter_id': str_or_none(module.get('id')), + 'title': activity.get('title'), + 'url': smuggle_url(f'https://player.vimeo.com/video/{vimeo_id}', {'http_headers': {'Referer': 'https://api.cybrary.it'}}) + } + + +class CybraryCourseIE(CybraryBaseIE): + _VALID_URL = r'https://app.cybrary.it/browse/course/(?P<id>[\w-]+)/?(?:$|[#?])' + _TESTS = [{ + 'url': 'https://app.cybrary.it/browse/course/az-500-microsoft-azure-security-technologies', + 'info_dict': { + 'id': 898, + 'title': 'AZ-500: Microsoft Azure Security Technologies', + 'description': 'md5:69549d379c0fc1dec92926d4e8b6fbd4' + }, + 'playlist_count': 59 + }, { + 'url': 'https://app.cybrary.it/browse/course/cybrary-orientation', + 'info_dict': { + 'id': 1245, + 'title': 'Cybrary Orientation', + 'description': 'md5:9e69ff66b32fe78744e0ad4babe2e88e' + }, + 'playlist_count': 4 + }] + + def _real_extract(self, url): + course_id = self._match_id(url) + course = self._call_api('course', course_id) + enrollment_info = self._call_api('course_enrollment', course['id']) + + entries = [self.url_result( + f'https://app.cybrary.it/immersive/{enrollment_info["id"]}/activity/{activity["id"]}') + for activity in traverse_obj(course, ('content_item', 'learning_modules', ..., 'activities', ...))] + + return self.playlist_result( + entries, + traverse_obj(course, ('content_item', 'id'), expected_type=str_or_none), + course.get('title'), course.get('short_description')) diff --git a/hypervideo_dl/extractor/daftsex.py b/hypervideo_dl/extractor/daftsex.py new file mode 100644 index 0000000..6037fd9 --- /dev/null +++ b/hypervideo_dl/extractor/daftsex.py @@ -0,0 +1,146 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_b64decode +from ..utils import ( + int_or_none, + js_to_json, + parse_count, + parse_duration, + traverse_obj, + try_get, + unified_timestamp, +) + + +class DaftsexIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?daftsex\.com/watch/(?P<id>-?\d+_\d+)' + _TESTS = [{ + 'url': 'https://daftsex.com/watch/-35370899_456246186', + 'md5': 'd95135e6cea2d905bea20dbe82cda64a', + 'info_dict': { + 'id': '-35370899_456246186', + 'ext': 'mp4', + 'title': 'just relaxing', + 'description': 'just relaxing - Watch video Watch video in high quality', + 'upload_date': '20201113', + 'timestamp': 1605261911, + 'thumbnail': r're:https://[^/]+/impf/-43BuMDIawmBGr3GLcZ93CYwWf2PBv_tVWoS1A/dnu41DnARU4\.jpg\?size=800x450&quality=96&keep_aspect_ratio=1&background=000000&sign=6af2c26ff4a45e55334189301c867384&type=video_thumb', + }, + }, { + 'url':
'https://daftsex.com/watch/-156601359_456242791', + 'info_dict': { + 'id': '-156601359_456242791', + 'ext': 'mp4', + 'title': 'Skye Blue - Dinner And A Show', + 'description': 'Skye Blue - Dinner And A Show - Watch video Watch video in high quality', + 'upload_date': '20200916', + 'timestamp': 1600250735, + 'thumbnail': 'https://psv153-1.crazycloud.ru/videos/-156601359/456242791/thumb.jpg?extra=i3D32KaBbBFf9TqDRMAVmQ', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + title = self._html_search_meta('name', webpage, 'title') + timestamp = unified_timestamp(self._html_search_meta('uploadDate', webpage, 'Upload Date', default=None)) + description = self._html_search_meta('description', webpage, 'Description', default=None) + + duration = parse_duration(self._search_regex( + r'Duration: ((?:[0-9]{2}:){0,2}[0-9]{2})', + webpage, 'duration', fatal=False)) + views = parse_count(self._search_regex( + r'Views: ([0-9 ]+)', + webpage, 'views', fatal=False)) + + player_hash = self._search_regex( + r'DaxabPlayer\.Init\({[\s\S]*hash:\s*"([0-9a-zA-Z_\-]+)"[\s\S]*}', + webpage, 'player hash') + player_color = self._search_regex( + r'DaxabPlayer\.Init\({[\s\S]*color:\s*"([0-9a-z]+)"[\s\S]*}', + webpage, 'player color', fatal=False) or '' + + embed_page = self._download_webpage( + 'https://daxab.com/player/%s?color=%s' % (player_hash, player_color), + video_id, headers={'Referer': url}) + video_params = self._parse_json( + self._search_regex( + r'window\.globParams\s*=\s*({[\S\s]+})\s*;\s*<\/script>', + embed_page, 'video parameters'), + video_id, transform_source=js_to_json) + + server_domain = 'https://%s' % compat_b64decode(video_params['server'][::-1]).decode('utf-8') + + cdn_files = traverse_obj(video_params, ('video', 'cdn_files')) or {} + if cdn_files: + formats = [] + for format_id, format_data in cdn_files.items(): + ext, height = format_id.split('_') + formats.append({ + 'format_id': format_id, + 'url': f'{server_domain}/videos/{video_id.replace("_", "/")}/{height}.mp4?extra={format_data.split(".")[-1]}', + 'height': int_or_none(height), + 'ext': ext, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'description': description, + 'duration': duration, + 'thumbnail': try_get(video_params, lambda vi: 'https:' + compat_b64decode(vi['video']['thumb']).decode('utf-8')), + 'timestamp': timestamp, + 'view_count': views, + 'age_limit': 18, + } + + item = self._download_json( + f'{server_domain}/method/video.get/{video_id}', video_id, + headers={'Referer': url}, query={ + 'token': video_params['video']['access_token'], + 'videos': video_id, + 'ckey': video_params['c_key'], + 'credentials': video_params['video']['credentials'], + })['response']['items'][0] + + formats = [] + for f_id, f_url in item.get('files', {}).items(): + if f_id == 'external': + return self.url_result(f_url) + ext, height = f_id.split('_') + height_extra_key = traverse_obj(video_params, ('video', 'partial', 'quality', height)) + if height_extra_key: + formats.append({ + 'format_id': f'{height}p', + 'url': f'{server_domain}/{f_url[8:]}&videos={video_id}&extra_key={height_extra_key}', + 'height': int_or_none(height), + 'ext': ext, + }) + self._sort_formats(formats) + + thumbnails = [] + for k, v in item.items(): + if k.startswith('photo_') and v: + width = k.replace('photo_', '') + thumbnails.append({ + 'id': width, + 'url': v, + 'width': int_or_none(width), + }) + + return { + 'id': video_id, + 
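[The daftsex extractor above recovers the CDN hostname from a player-config value that is stored reversed and base64-encoded. A minimal illustration; the sample input below is made up:]

import base64

def decode_server(server_field):
    # reverse the string, then base64-decode it to recover the hostname
    return base64.b64decode(server_field[::-1]).decode('utf-8')

assert decode_server('=02bj5SZsBXbhhXZ') == 'example.com'  # hypothetical value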
'title': title, + 'formats': formats, + 'comment_count': int_or_none(item.get('comments')), + 'description': description, + 'duration': duration, + 'thumbnails': thumbnails, + 'timestamp': timestamp, + 'view_count': views, + 'age_limit': 18, + } diff --git a/hypervideo_dl/extractor/dailymotion.py b/hypervideo_dl/extractor/dailymotion.py index e04e10b..9cb5618 100644 --- a/hypervideo_dl/extractor/dailymotion.py +++ b/hypervideo_dl/extractor/dailymotion.py @@ -94,10 +94,10 @@ class DailymotionIE(DailymotionBaseInfoExtractor): _VALID_URL = r'''(?ix) https?:// (?: - (?:(?:www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:embed|swf|\#)/)?video|swf)| + (?:(?:www|touch|geo)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:(?:embed|swf|\#)/)|player\.html\?)?video|swf)| (?:www\.)?lequipe\.fr/video ) - /(?P<id>[^/?_]+)(?:.+?\bplaylist=(?P<playlist_id>x[0-9a-z]+))? + [/=](?P<id>[^/?_&]+)(?:.+?\bplaylist=(?P<playlist_id>x[0-9a-z]+))? ''' IE_NAME = 'dailymotion' _TESTS = [{ @@ -116,6 +116,25 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'age_limit': 0, }, }, { + 'url': 'https://geo.dailymotion.com/player.html?video=x89eyek&mute=true', + 'md5': 'e2f9717c6604773f963f069ca53a07f8', + 'info_dict': { + 'id': 'x89eyek', + 'ext': 'mp4', + 'title': "En quête d'esprit du 27/03/2022", + 'description': 'md5:66542b9f4df2eb23f314fc097488e553', + 'duration': 2756, + 'timestamp': 1648383669, + 'upload_date': '20220327', + 'uploader': 'CNEWS', + 'uploader_id': 'x24vth', + 'age_limit': 0, + 'view_count': int, + 'like_count': int, + 'tags': ['en_quete_d_esprit'], + 'thumbnail': 'https://s2.dmcdn.net/v/Tncwi1YGKdvFbDuDY/x1080', + } + }, { 'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames', 'md5': '2137c41a8e78554bb09225b8eb322406', 'info_dict': { @@ -207,12 +226,10 @@ class DailymotionIE(DailymotionBaseInfoExtractor): video_id, playlist_id = self._match_valid_url(url).groups() if playlist_id: - if not self.get_param('noplaylist'): - self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % playlist_id) + if self._yes_playlist(playlist_id, video_id): return self.url_result( 'http://www.dailymotion.com/playlist/' + playlist_id, 'DailymotionPlaylist', playlist_id) - self.to_screen('Downloading just video %s because of --no-playlist' % video_id) password = self.get_param('videopassword') media = self._call_api( @@ -261,9 +278,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor): continue if media_type == 'application/x-mpegURL': formats.extend(self._extract_m3u8_formats( - media_url, video_id, 'mp4', - 'm3u8' if is_live else 'm3u8_native', - m3u8_id='hls', fatal=False)) + media_url, video_id, 'mp4', live=is_live, m3u8_id='hls', fatal=False)) else: f = { 'url': media_url, @@ -305,7 +320,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor): return { 'id': video_id, - 'title': self._live_title(title) if is_live else title, + 'title': title, 'description': clean_html(media.get('description')), 'thumbnails': thumbnails, 'duration': int_or_none(metadata.get('duration')) or None, diff --git a/hypervideo_dl/extractor/daum.py b/hypervideo_dl/extractor/daum.py index 8aa2af9..4362e92 100644 --- a/hypervideo_dl/extractor/daum.py +++ b/hypervideo_dl/extractor/daum.py @@ -157,11 +157,8 @@ class DaumListIE(InfoExtractor): query_dict = parse_qs(url) if 'clipid' in query_dict: clip_id = query_dict['clipid'][0] - if self.get_param('noplaylist'): - self.to_screen('Downloading just video %s because of --no-playlist' % clip_id) + if not 
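[The widened dailymotion _VALID_URL in this hunk now also accepts the geo player embed form (player.html?video=...). A quick self-check of both URL shapes against an equivalent non-verbose version of the pattern (the lequipe.fr alternative is omitted for brevity):]

import re

DAILYMOTION_RE = re.compile(
    r'https?://(?:(?:www|touch|geo)\.)?dailymotion\.[a-z]{2,3}'
    r'/(?:(?:(?:(?:embed|swf|#)/)|player\.html\?)?video|swf)[/=](?P<id>[^/?_&]+)')

for sample in ('https://www.dailymotion.com/video/x89eyek',
               'https://geo.dailymotion.com/player.html?video=x89eyek&mute=true'):
    assert DAILYMOTION_RE.match(sample).group('id') == 'x89eyek'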
self._yes_playlist(list_id, clip_id): return self.url_result(DaumClipIE._URL_TEMPLATE % clip_id, 'DaumClip') - else: - self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % list_id) class DaumPlaylistIE(DaumListIE): diff --git a/hypervideo_dl/extractor/daystar.py b/hypervideo_dl/extractor/daystar.py new file mode 100644 index 0000000..4f59d90 --- /dev/null +++ b/hypervideo_dl/extractor/daystar.py @@ -0,0 +1,48 @@ +from .common import InfoExtractor +from ..utils import js_to_json, urljoin + + +class DaystarClipIE(InfoExtractor): + IE_NAME = 'daystar:clip' + _VALID_URL = r'https?://player\.daystar\.tv/(?P<id>\w+)' + _TESTS = [{ + 'url': 'https://player.daystar.tv/0MTO2ITM', + 'info_dict': { + 'id': '0MTO2ITM', + 'ext': 'mp4', + 'title': 'The Dark World of COVID Pt. 1 | Aaron Siri', + 'description': 'a420d320dda734e5f29458df3606c5f4', + 'thumbnail': r're:^https?://.+\.jpg', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + src_iframe = self._search_regex(r'\<iframe[^>]+src="([^"]+)"', webpage, 'src iframe') + webpage_iframe = self._download_webpage( + src_iframe.replace('player.php', 'config2.php'), video_id, headers={'Referer': src_iframe}) + + sources = self._parse_json(self._search_regex( + r'sources\:\s*(\[.*?\])', webpage_iframe, 'm3u8 source'), video_id, transform_source=js_to_json) + + formats, subtitles = [], {} + for source in sources: + file = source.get('file') + if file and source.get('type') == 'm3u8': + fmts, subs = self._extract_m3u8_formats_and_subtitles( + urljoin('https://www.lightcast.com/embed/', file), + video_id, 'mp4', fatal=False, headers={'Referer': src_iframe}) + formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': self._html_search_meta(['og:title', 'twitter:title'], webpage), + 'description': self._html_search_meta(['og:description', 'twitter:description'], webpage), + 'thumbnail': self._search_regex(r'image:\s*"([^"]+)', webpage_iframe, 'thumbnail'), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/hypervideo_dl/extractor/digitalconcerthall.py b/hypervideo_dl/extractor/digitalconcerthall.py new file mode 100644 index 0000000..8398ae3 --- /dev/null +++ b/hypervideo_dl/extractor/digitalconcerthall.py @@ -0,0 +1,141 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + +from ..utils import ( + ExtractorError, + parse_resolution, + traverse_obj, + try_get, + urlencode_postdata, +) + + +class DigitalConcertHallIE(InfoExtractor): + IE_DESC = 'DigitalConcertHall extractor' + _VALID_URL = r'https?://(?:www\.)?digitalconcerthall\.com/(?P<language>[a-z]+)/concert/(?P<id>[0-9]+)' + _OAUTH_URL = 'https://api.digitalconcerthall.com/v2/oauth2/token' + _ACCESS_TOKEN = None + _NETRC_MACHINE = 'digitalconcerthall' + _TESTS = [{ + 'note': 'Playlist with only one video', + 'url': 'https://www.digitalconcerthall.com/en/concert/53201', + 'info_dict': { + 'id': '53201-1', + 'ext': 'mp4', + 'composer': 'Kurt Weill', + 'title': '[Magic Night]', + 'thumbnail': r're:^https?://images.digitalconcerthall.com/cms/thumbnails.*\.jpg$', + 'upload_date': '20210624', + 'timestamp': 1624548600, + 'duration': 2798, + 'album_artist': 'Members of the Berliner Philharmoniker / Simon Rössler', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'note': 'Concert with several works and an interview', + 'url': 
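[The dailymotion and daum hunks both replace hand-rolled --no-playlist branching with self._yes_playlist(). A behavioural sketch of what that helper condenses, reconstructed from the removed lines rather than the actual implementation:]

def yes_playlist(noplaylist, playlist_id, video_id, to_screen=print):
    # returns True when the whole playlist should be downloaded
    if playlist_id and video_id:
        if noplaylist:
            to_screen(f'Downloading just video {video_id} because of --no-playlist')
            return False
        to_screen(f'Downloading playlist {playlist_id} - add --no-playlist to download just the video')
    return bool(playlist_id)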
'https://www.digitalconcerthall.com/en/concert/53785', + 'info_dict': { + 'id': '53785', + 'album_artist': 'Berliner Philharmoniker / Kirill Petrenko', + 'title': 'Kirill Petrenko conducts Mendelssohn and Shostakovich', + }, + 'params': {'skip_download': 'm3u8'}, + 'playlist_count': 3, + }] + + def _perform_login(self, username, password): + token_response = self._download_json( + self._OAUTH_URL, + None, 'Obtaining token', errnote='Unable to obtain token', data=urlencode_postdata({ + 'affiliate': 'none', + 'grant_type': 'device', + 'device_vendor': 'unknown', + 'app_id': 'dch.webapp', + 'app_version': '1.0.0', + 'client_secret': '2ySLN+2Fwb', + }), headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + }) + self._ACCESS_TOKEN = token_response['access_token'] + try: + self._download_json( + self._OAUTH_URL, + None, note='Logging in', errnote='Unable to login', data=urlencode_postdata({ + 'grant_type': 'password', + 'username': username, + 'password': password, + }), headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': 'https://www.digitalconcerthall.com', + 'Authorization': f'Bearer {self._ACCESS_TOKEN}' + }) + except ExtractorError: + self.raise_login_required(msg='Login info incorrect') + + def _real_initialize(self): + if not self._ACCESS_TOKEN: + self.raise_login_required(method='password') + + def _entries(self, items, language, **kwargs): + for item in items: + video_id = item['id'] + stream_info = self._download_json( + self._proto_relative_url(item['_links']['streams']['href']), video_id, headers={ + 'Accept': 'application/json', + 'Authorization': f'Bearer {self._ACCESS_TOKEN}', + 'Accept-Language': language + }) + + m3u8_url = traverse_obj( + stream_info, ('channel', lambda x: x.startswith('vod_mixed'), 'stream', 0, 'url'), get_all=False) + formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', 'm3u8_native', fatal=False) + self._sort_formats(formats) + + yield { + 'id': video_id, + 'title': item.get('title'), + 'composer': item.get('name_composer'), + 'url': m3u8_url, + 'formats': formats, + 'duration': item.get('duration_total'), + 'timestamp': traverse_obj(item, ('date', 'published')), + 'description': item.get('short_description') or stream_info.get('short_description'), + **kwargs, + 'chapters': [{ + 'start_time': chapter.get('time'), + 'end_time': try_get(chapter, lambda x: x['time'] + x['duration']), + 'title': chapter.get('text'), + } for chapter in item['cuepoints']] if item.get('cuepoints') else None, + } + + def _real_extract(self, url): + language, video_id = self._match_valid_url(url).group('language', 'id') + if not language: + language = 'en' + + thumbnail_url = self._html_search_regex( + r'(https?://images\.digitalconcerthall\.com/cms/thumbnails/.*\.jpg)', + self._download_webpage(url, video_id), 'thumbnail') + thumbnails = [{ + 'url': thumbnail_url, + **parse_resolution(thumbnail_url) + }] + + vid_info = self._download_json( + f'https://api.digitalconcerthall.com/v2/concert/{video_id}', video_id, headers={ + 'Accept': 'application/json', + 'Accept-Language': language + }) + album_artist = ' / '.join(traverse_obj(vid_info, ('_links', 'artist', ..., 'name')) or '') + + return { + '_type': 'playlist', + 'id': video_id, + 'title': vid_info.get('title'), + 'entries': self._entries(traverse_obj(vid_info, ('_embedded', ..., ...)), language, + thumbnails=thumbnails, album_artist=album_artist), + 'thumbnails': thumbnails, + 'album_artist': album_artist, + } diff --git a/hypervideo_dl/extractor/disney.py 
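[DigitalConcertHallIE above authenticates in two steps: an anonymous 'device' grant first, then a password grant sent with that token as Bearer authorization. A standalone sketch of step one, with the endpoint and form fields as in the extractor:]

import json
import urllib.parse
import urllib.request

DCH_OAUTH_URL = 'https://api.digitalconcerthall.com/v2/oauth2/token'

def dch_anonymous_token():
    data = urllib.parse.urlencode({
        'affiliate': 'none',
        'grant_type': 'device',
        'device_vendor': 'unknown',
        'app_id': 'dch.webapp',
        'app_version': '1.0.0',
        'client_secret': '2ySLN+2Fwb',
    }).encode()
    req = urllib.request.Request(
        DCH_OAUTH_URL, data=data,
        headers={'Content-Type': 'application/x-www-form-urlencoded'})
    with urllib.request.urlopen(req) as r:
        return json.load(r)['access_token']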
b/hypervideo_dl/extractor/disney.py index f018cbe..0ad7b1f 100644 --- a/hypervideo_dl/extractor/disney.py +++ b/hypervideo_dl/extractor/disney.py @@ -7,8 +7,8 @@ from .common import InfoExtractor from ..utils import ( int_or_none, unified_strdate, - compat_str, determine_ext, + join_nonempty, update_url_query, ) @@ -119,18 +119,13 @@ class DisneyIE(InfoExtractor): continue formats.append(f) continue - format_id = [] - if flavor_format: - format_id.append(flavor_format) - if tbr: - format_id.append(compat_str(tbr)) ext = determine_ext(flavor_url) if flavor_format == 'applehttp' or ext == 'm3u8': ext = 'mp4' width = int_or_none(flavor.get('width')) height = int_or_none(flavor.get('height')) formats.append({ - 'format_id': '-'.join(format_id), + 'format_id': join_nonempty(flavor_format, tbr), 'url': flavor_url, 'width': width, 'height': height, diff --git a/hypervideo_dl/extractor/dispeak.py b/hypervideo_dl/extractor/dispeak.py index be7ad12..3d651f3 100644 --- a/hypervideo_dl/extractor/dispeak.py +++ b/hypervideo_dl/extractor/dispeak.py @@ -74,13 +74,11 @@ class DigitallySpeakingIE(InfoExtractor): tbr = int_or_none(bitrate) vbr = int_or_none(self._search_regex( r'-(\d+)\.mp4', video_path, 'vbr', default=None)) - abr = tbr - vbr if tbr and vbr else None video_formats.append({ 'format_id': bitrate, 'url': url, 'tbr': tbr, 'vbr': vbr, - 'abr': abr, }) return video_formats @@ -121,6 +119,7 @@ class DigitallySpeakingIE(InfoExtractor): video_formats = self._parse_mp4(metadata) if video_formats is None: video_formats = self._parse_flv(metadata) + self._sort_formats(video_formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/dlive.py b/hypervideo_dl/extractor/dlive.py index 90462c0..7410eb6 100644 --- a/hypervideo_dl/extractor/dlive.py +++ b/hypervideo_dl/extractor/dlive.py @@ -84,7 +84,7 @@ class DLiveStreamIE(InfoExtractor): self._sort_formats(formats) return { 'id': display_name, - 'title': self._live_title(title), + 'title': title, 'uploader': display_name, 'uploader_id': username, 'formats': formats, diff --git a/hypervideo_dl/extractor/doodstream.py b/hypervideo_dl/extractor/doodstream.py index 2c9ea68..f692127 100644 --- a/hypervideo_dl/extractor/doodstream.py +++ b/hypervideo_dl/extractor/doodstream.py @@ -21,6 +21,16 @@ class DoodStreamIE(InfoExtractor): 'thumbnail': 'https://img.doodcdn.com/snaps/flyus84qgl2fsk4g.jpg', } }, { + 'url': 'http://dood.watch/d/5s1wmbdacezb', + 'md5': '4568b83b31e13242b3f1ff96c55f0595', + 'info_dict': { + 'id': '5s1wmbdacezb', + 'ext': 'mp4', + 'title': 'Kat Wonders - Monthly May 2020', + 'description': 'Kat Wonders - Monthly May 2020 | DoodStream.com', + 'thumbnail': 'https://img.doodcdn.com/snaps/flyus84qgl2fsk4g.jpg', + } + }, { 'url': 'https://dood.to/d/jzrxn12t2s7n', 'md5': '3207e199426eca7c2aa23c2872e6728a', 'info_dict': { @@ -34,31 +44,26 @@ class DoodStreamIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + url = f'https://dood.to/e/{video_id}' webpage = self._download_webpage(url, video_id) - if '/d/' in url: - url = "https://dood.to" + self._html_search_regex( - r'<iframe src="(/e/[a-z0-9]+)"', webpage, 'embed') - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - title = self._html_search_meta(['og:title', 'twitter:title'], - webpage, default=None) - thumb = self._html_search_meta(['og:image', 'twitter:image'], - webpage, default=None) + title = self._html_search_meta(['og:title', 'twitter:title'], webpage, default=None) + thumb = self._html_search_meta(['og:image', 
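[The disney.py hunk above swaps manual format_id assembly for the join_nonempty() utility. A simplified equivalent showing the observable behaviour (the real helper also accepts a from_dict argument):]

def join_nonempty(*values, delim='-'):
    # drop falsy values, stringify the rest, join with the delimiter
    return delim.join(str(v) for v in values if v)

assert join_nonempty('applehttp', 1500) == 'applehttp-1500'
assert join_nonempty('applehttp', None) == 'applehttp'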
'twitter:image'], webpage, default=None) token = self._html_search_regex(r'[?&]token=([a-z0-9]+)[&\']', webpage, 'token') description = self._html_search_meta( - ['og:description', 'description', 'twitter:description'], - webpage, default=None) - auth_url = 'https://dood.to' + self._html_search_regex( - r'(/pass_md5.*?)\'', webpage, 'pass_md5') + ['og:description', 'description', 'twitter:description'], webpage, default=None) + headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/66.0', 'referer': url } - webpage = self._download_webpage(auth_url, video_id, headers=headers) - final_url = webpage + ''.join([random.choice(string.ascii_letters + string.digits) for _ in range(10)]) + "?token=" + token + "&expiry=" + str(int(time.time() * 1000)) + pass_md5 = self._html_search_regex(r'(/pass_md5.*?)\'', webpage, 'pass_md5') + final_url = ''.join(( + self._download_webpage(f'https://dood.to{pass_md5}', video_id, headers=headers), + *(random.choice(string.ascii_letters + string.digits) for _ in range(10)), + f'?token={token}&expiry={int(time.time() * 1000)}', + )) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/douyutv.py b/hypervideo_dl/extractor/douyutv.py index 9757f44..26a8d64 100644 --- a/hypervideo_dl/extractor/douyutv.py +++ b/hypervideo_dl/extractor/douyutv.py @@ -105,7 +105,7 @@ class DouyuTVIE(InfoExtractor): 'aid': 'pcclient' })['data']['live_url'] - title = self._live_title(unescapeHTML(room['room_name'])) + title = unescapeHTML(room['room_name']) description = room.get('show_details') thumbnail = room.get('room_src') uploader = room.get('nickname') diff --git a/hypervideo_dl/extractor/dplay.py b/hypervideo_dl/extractor/dplay.py index e0e446b..a25f27c 100644 --- a/hypervideo_dl/extractor/dplay.py +++ b/hypervideo_dl/extractor/dplay.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import json +import uuid from .common import InfoExtractor from ..compat import compat_HTTPError @@ -11,12 +12,172 @@ from ..utils import ( float_or_none, int_or_none, strip_or_none, + try_get, unified_timestamp, ) -class DPlayIE(InfoExtractor): +class DPlayBaseIE(InfoExtractor): _PATH_REGEX = r'/(?P<id>[^/]+/[^/?#]+)' + _auth_token_cache = {} + + def _get_auth(self, disco_base, display_id, realm, needs_device_id=True): + key = (disco_base, realm) + st = self._get_cookies(disco_base).get('st') + token = (st and st.value) or self._auth_token_cache.get(key) + + if not token: + query = {'realm': realm} + if needs_device_id: + query['deviceId'] = uuid.uuid4().hex + token = self._download_json( + disco_base + 'token', display_id, 'Downloading token', + query=query)['data']['attributes']['token'] + + # Save cache only if cookies are not being set + if not self._get_cookies(disco_base).get('st'): + self._auth_token_cache[key] = token + + return f'Bearer {token}' + + def _process_errors(self, e, geo_countries): + info = self._parse_json(e.cause.read().decode('utf-8'), None) + error = info['errors'][0] + error_code = error.get('code') + if error_code == 'access.denied.geoblocked': + self.raise_geo_restricted(countries=geo_countries) + elif error_code in ('access.denied.missingpackage', 'invalid.token'): + raise ExtractorError( + 'This video is only available for registered users. 
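[In the doodstream rewrite above, the download URL is assembled from the body returned by the /pass_md5/... endpoint plus ten random alphanumeric characters and token/expiry query parameters. The assembly step in isolation:]

import random
import string
import time

def dood_final_url(url_prefix, token):
    # url_prefix is the webpage body returned by the /pass_md5/... request
    tail = ''.join(random.choices(string.ascii_letters + string.digits, k=10))
    return f'{url_prefix}{tail}?token={token}&expiry={int(time.time() * 1000)}'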
You may want to use --cookies.', expected=True) + raise ExtractorError(info['errors'][0]['detail'], expected=True) + + def _update_disco_api_headers(self, headers, disco_base, display_id, realm): + headers['Authorization'] = self._get_auth(disco_base, display_id, realm, False) + + def _download_video_playback_info(self, disco_base, video_id, headers): + streaming = self._download_json( + disco_base + 'playback/videoPlaybackInfo/' + video_id, + video_id, headers=headers)['data']['attributes']['streaming'] + streaming_list = [] + for format_id, format_dict in streaming.items(): + streaming_list.append({ + 'type': format_id, + 'url': format_dict.get('url'), + }) + return streaming_list + + def _get_disco_api_info(self, url, display_id, disco_host, realm, country, domain=''): + geo_countries = [country.upper()] + self._initialize_geo_bypass({ + 'countries': geo_countries, + }) + disco_base = 'https://%s/' % disco_host + headers = { + 'Referer': url, + } + self._update_disco_api_headers(headers, disco_base, display_id, realm) + try: + video = self._download_json( + disco_base + 'content/videos/' + display_id, display_id, + headers=headers, query={ + 'fields[channel]': 'name', + 'fields[image]': 'height,src,width', + 'fields[show]': 'name', + 'fields[tag]': 'name', + 'fields[video]': 'description,episodeNumber,name,publishStart,seasonNumber,videoDuration', + 'include': 'images,primaryChannel,show,tags' + }) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + self._process_errors(e, geo_countries) + raise + video_id = video['data']['id'] + info = video['data']['attributes'] + title = info['name'].strip() + formats = [] + subtitles = {} + try: + streaming = self._download_video_playback_info( + disco_base, video_id, headers) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + self._process_errors(e, geo_countries) + raise + for format_dict in streaming: + if not isinstance(format_dict, dict): + continue + format_url = format_dict.get('url') + if not format_url: + continue + format_id = format_dict.get('type') + ext = determine_ext(format_url) + if format_id == 'dash' or ext == 'mpd': + dash_fmts, dash_subs = self._extract_mpd_formats_and_subtitles( + format_url, display_id, mpd_id='dash', fatal=False) + formats.extend(dash_fmts) + subtitles = self._merge_subtitles(subtitles, dash_subs) + elif format_id == 'hls' or ext == 'm3u8': + m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles( + format_url, display_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False) + formats.extend(m3u8_fmts) + subtitles = self._merge_subtitles(subtitles, m3u8_subs) + else: + formats.append({ + 'url': format_url, + 'format_id': format_id, + }) + self._sort_formats(formats) + + creator = series = None + tags = [] + thumbnails = [] + included = video.get('included') or [] + if isinstance(included, list): + for e in included: + attributes = e.get('attributes') + if not attributes: + continue + e_type = e.get('type') + if e_type == 'channel': + creator = attributes.get('name') + elif e_type == 'image': + src = attributes.get('src') + if src: + thumbnails.append({ + 'url': src, + 'width': int_or_none(attributes.get('width')), + 'height': int_or_none(attributes.get('height')), + }) + if e_type == 'show': + series = attributes.get('name') + elif e_type == 'tag': + name = attributes.get('name') + if name: + tags.append(name) + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 
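[The new DPlayBaseIE._get_auth() above layers three token sources: the 'st' cookie, a per-(host, realm) class cache, and finally a fresh anonymous token requested with a random deviceId. A framework-free sketch of that precedence; fetch_token stands in for the API call, and the extractor re-checks the cookie jar after the request where this sketch reuses the pre-read value:]

import uuid

_auth_token_cache = {}

def get_auth(cookie_token, disco_base, realm, fetch_token, needs_device_id=True):
    key = (disco_base, realm)
    token = cookie_token or _auth_token_cache.get(key)
    if not token:
        query = {'realm': realm}
        if needs_device_id:
            query['deviceId'] = uuid.uuid4().hex
        token = fetch_token(query)
        if not cookie_token:
            # cache only when the site did not set its own session cookie
            _auth_token_cache[key] = token
    return f'Bearer {token}'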
'description': strip_or_none(info.get('description')), + 'duration': float_or_none(info.get('videoDuration'), 1000), + 'timestamp': unified_timestamp(info.get('publishStart')), + 'series': series, + 'season_number': int_or_none(info.get('seasonNumber')), + 'episode_number': int_or_none(info.get('episodeNumber')), + 'creator': creator, + 'tags': tags, + 'thumbnails': thumbnails, + 'formats': formats, + 'subtitles': subtitles, + 'http_headers': { + 'referer': domain, + }, + } + + +class DPlayIE(DPlayBaseIE): _VALID_URL = r'''(?x)https?:// (?P<domain> (?:www\.)?(?P<host>d @@ -26,7 +187,7 @@ class DPlayIE(InfoExtractor): ) )| (?P<subdomain_country>es|it)\.dplay\.com - )/[^/]+''' + _PATH_REGEX + )/[^/]+''' + DPlayBaseIE._PATH_REGEX _TESTS = [{ # non geo restricted, via secure api, unsigned download hls URL @@ -46,7 +207,6 @@ class DPlayIE(InfoExtractor): 'episode_number': 1, }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, }, { @@ -67,7 +227,6 @@ class DPlayIE(InfoExtractor): 'episode_number': 1, }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, }, { @@ -87,7 +246,6 @@ class DPlayIE(InfoExtractor): 'episode_number': 7, }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, 'skip': 'Available for Premium users', @@ -153,138 +311,6 @@ class DPlayIE(InfoExtractor): 'only_matching': True, }] - def _process_errors(self, e, geo_countries): - info = self._parse_json(e.cause.read().decode('utf-8'), None) - error = info['errors'][0] - error_code = error.get('code') - if error_code == 'access.denied.geoblocked': - self.raise_geo_restricted(countries=geo_countries) - elif error_code in ('access.denied.missingpackage', 'invalid.token'): - raise ExtractorError( - 'This video is only available for registered users. You may want to use --cookies.', expected=True) - raise ExtractorError(info['errors'][0]['detail'], expected=True) - - def _update_disco_api_headers(self, headers, disco_base, display_id, realm): - headers['Authorization'] = 'Bearer ' + self._download_json( - disco_base + 'token', display_id, 'Downloading token', - query={ - 'realm': realm, - })['data']['attributes']['token'] - - def _download_video_playback_info(self, disco_base, video_id, headers): - streaming = self._download_json( - disco_base + 'playback/videoPlaybackInfo/' + video_id, - video_id, headers=headers)['data']['attributes']['streaming'] - streaming_list = [] - for format_id, format_dict in streaming.items(): - streaming_list.append({ - 'type': format_id, - 'url': format_dict.get('url'), - }) - return streaming_list - - def _get_disco_api_info(self, url, display_id, disco_host, realm, country): - geo_countries = [country.upper()] - self._initialize_geo_bypass({ - 'countries': geo_countries, - }) - disco_base = 'https://%s/' % disco_host - headers = { - 'Referer': url, - } - self._update_disco_api_headers(headers, disco_base, display_id, realm) - try: - video = self._download_json( - disco_base + 'content/videos/' + display_id, display_id, - headers=headers, query={ - 'fields[channel]': 'name', - 'fields[image]': 'height,src,width', - 'fields[show]': 'name', - 'fields[tag]': 'name', - 'fields[video]': 'description,episodeNumber,name,publishStart,seasonNumber,videoDuration', - 'include': 'images,primaryChannel,show,tags' - }) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: - self._process_errors(e, geo_countries) - raise - video_id = video['data']['id'] - info = video['data']['attributes'] - title = info['name'].strip() - formats = [] - try: 
- streaming = self._download_video_playback_info( - disco_base, video_id, headers) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - self._process_errors(e, geo_countries) - raise - for format_dict in streaming: - if not isinstance(format_dict, dict): - continue - format_url = format_dict.get('url') - if not format_url: - continue - format_id = format_dict.get('type') - ext = determine_ext(format_url) - if format_id == 'dash' or ext == 'mpd': - formats.extend(self._extract_mpd_formats( - format_url, display_id, mpd_id='dash', fatal=False)) - elif format_id == 'hls' or ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, display_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls', - fatal=False)) - else: - formats.append({ - 'url': format_url, - 'format_id': format_id, - }) - self._sort_formats(formats) - - creator = series = None - tags = [] - thumbnails = [] - included = video.get('included') or [] - if isinstance(included, list): - for e in included: - attributes = e.get('attributes') - if not attributes: - continue - e_type = e.get('type') - if e_type == 'channel': - creator = attributes.get('name') - elif e_type == 'image': - src = attributes.get('src') - if src: - thumbnails.append({ - 'url': src, - 'width': int_or_none(attributes.get('width')), - 'height': int_or_none(attributes.get('height')), - }) - if e_type == 'show': - series = attributes.get('name') - elif e_type == 'tag': - name = attributes.get('name') - if name: - tags.append(name) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': strip_or_none(info.get('description')), - 'duration': float_or_none(info.get('videoDuration'), 1000), - 'timestamp': unified_timestamp(info.get('publishStart')), - 'series': series, - 'season_number': int_or_none(info.get('seasonNumber')), - 'episode_number': int_or_none(info.get('episodeNumber')), - 'creator': creator, - 'tags': tags, - 'thumbnails': thumbnails, - 'formats': formats, - } - def _real_extract(self, url): mobj = self._match_valid_url(url) display_id = mobj.group('id') @@ -292,11 +318,11 @@ class DPlayIE(InfoExtractor): country = mobj.group('country') or mobj.group('subdomain_country') or mobj.group('plus_country') host = 'disco-api.' 
+ domain if domain[0] == 'd' else 'eu2-prod.disco-api.com' return self._get_disco_api_info( - url, display_id, host, 'dplay' + country, country) + url, display_id, host, 'dplay' + country, country, domain) -class HGTVDeIE(DPlayIE): - _VALID_URL = r'https?://de\.hgtv\.com/sendungen' + DPlayIE._PATH_REGEX +class HGTVDeIE(DPlayBaseIE): + _VALID_URL = r'https?://de\.hgtv\.com/sendungen' + DPlayBaseIE._PATH_REGEX _TESTS = [{ 'url': 'https://de.hgtv.com/sendungen/tiny-house-klein-aber-oho/wer-braucht-schon-eine-toilette/', 'info_dict': { @@ -313,9 +339,6 @@ class HGTVDeIE(DPlayIE): 'season_number': 3, 'episode_number': 3, }, - 'params': { - 'format': 'bestvideo', - }, }] def _real_extract(self, url): @@ -324,30 +347,7 @@ class HGTVDeIE(DPlayIE): url, display_id, 'eu1-prod.disco-api.com', 'hgtv', 'de') -class DiscoveryPlusIE(DPlayIE): - _VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/video' + DPlayIE._PATH_REGEX - _TESTS = [{ - 'url': 'https://www.discoveryplus.com/video/property-brothers-forever-home/food-and-family', - 'info_dict': { - 'id': '1140794', - 'display_id': 'property-brothers-forever-home/food-and-family', - 'ext': 'mp4', - 'title': 'Food and Family', - 'description': 'The brothers help a Richmond family expand their single-level home.', - 'duration': 2583.113, - 'timestamp': 1609304400, - 'upload_date': '20201230', - 'creator': 'HGTV', - 'series': 'Property Brothers: Forever Home', - 'season_number': 1, - 'episode_number': 1, - }, - 'skip': 'Available for Premium users', - }] - - _PRODUCT = 'dplus_us' - _API_URL = 'us1-prod-direct.discoveryplus.com' - +class DiscoveryPlusBaseIE(DPlayBaseIE): def _update_disco_api_headers(self, headers, disco_base, display_id, realm): headers['x-disco-client'] = f'WEB:UNKNOWN:{self._PRODUCT}:25.2.6' @@ -366,13 +366,227 @@ class DiscoveryPlusIE(DPlayIE): }).encode('utf-8'))['data']['attributes']['streaming'] def _real_extract(self, url): - display_id = self._match_id(url) - return self._get_disco_api_info( - url, display_id, self._API_URL, 'go', 'us') + return self._get_disco_api_info(url, self._match_id(url), **self._DISCO_API_PARAMS) + + +class GoDiscoveryIE(DiscoveryPlusBaseIE): + _VALID_URL = r'https?://(?:go\.)?discovery\.com/video' + DPlayBaseIE._PATH_REGEX + _TESTS = [{ + 'url': 'https://go.discovery.com/video/dirty-jobs-discovery-atve-us/rodbuster-galvanizer', + 'info_dict': { + 'id': '4164906', + 'display_id': 'dirty-jobs-discovery-atve-us/rodbuster-galvanizer', + 'ext': 'mp4', + 'title': 'Rodbuster / Galvanizer', + 'description': 'Mike installs rebar with a team of rodbusters, then he galvanizes steel.', + 'season_number': 9, + 'episode_number': 1, + }, + 'skip': 'Available for Premium users', + }, { + 'url': 'https://discovery.com/video/dirty-jobs-discovery-atve-us/rodbuster-galvanizer', + 'only_matching': True, + }] + + _PRODUCT = 'dsc' + _DISCO_API_PARAMS = { + 'disco_host': 'us1-prod-direct.go.discovery.com', + 'realm': 'go', + 'country': 'us', + } + + +class TravelChannelIE(DiscoveryPlusBaseIE): + _VALID_URL = r'https?://(?:watch\.)?travelchannel\.com/video' + DPlayBaseIE._PATH_REGEX + _TESTS = [{ + 'url': 'https://watch.travelchannel.com/video/ghost-adventures-travel-channel/ghost-train-of-ely', + 'info_dict': { + 'id': '2220256', + 'display_id': 'ghost-adventures-travel-channel/ghost-train-of-ely', + 'ext': 'mp4', + 'title': 'Ghost Train of Ely', + 'description': 'The crew investigates the dark history of the Nevada Northern Railway.', + 'season_number': 24, + 'episode_number': 1, + }, + 'skip': 'Available for Premium users', + 
}, { + 'url': 'https://watch.travelchannel.com/video/ghost-adventures-travel-channel/ghost-train-of-ely', + 'only_matching': True, + }] + + _PRODUCT = 'trav' + _DISCO_API_PARAMS = { + 'disco_host': 'us1-prod-direct.watch.travelchannel.com', + 'realm': 'go', + 'country': 'us', + } + + +class CookingChannelIE(DiscoveryPlusBaseIE): + _VALID_URL = r'https?://(?:watch\.)?cookingchanneltv\.com/video' + DPlayBaseIE._PATH_REGEX + _TESTS = [{ + 'url': 'https://watch.cookingchanneltv.com/video/carnival-eats-cooking-channel/the-postman-always-brings-rice-2348634', + 'info_dict': { + 'id': '2348634', + 'display_id': 'carnival-eats-cooking-channel/the-postman-always-brings-rice-2348634', + 'ext': 'mp4', + 'title': 'The Postman Always Brings Rice', + 'description': 'Noah visits the Maui Fair and the Aurora Winter Festival in Vancouver.', + 'season_number': 9, + 'episode_number': 1, + }, + 'skip': 'Available for Premium users', + }, { + 'url': 'https://watch.cookingchanneltv.com/video/carnival-eats-cooking-channel/the-postman-always-brings-rice-2348634', + 'only_matching': True, + }] + + _PRODUCT = 'cook' + _DISCO_API_PARAMS = { + 'disco_host': 'us1-prod-direct.watch.cookingchanneltv.com', + 'realm': 'go', + 'country': 'us', + } + + +class HGTVUsaIE(DiscoveryPlusBaseIE): + _VALID_URL = r'https?://(?:watch\.)?hgtv\.com/video' + DPlayBaseIE._PATH_REGEX + _TESTS = [{ + 'url': 'https://watch.hgtv.com/video/home-inspector-joe-hgtv-atve-us/this-mold-house', + 'info_dict': { + 'id': '4289736', + 'display_id': 'home-inspector-joe-hgtv-atve-us/this-mold-house', + 'ext': 'mp4', + 'title': 'This Mold House', + 'description': 'Joe and Noel help take a familys dream home from hazardous to fabulous.', + 'season_number': 1, + 'episode_number': 1, + }, + 'skip': 'Available for Premium users', + }, { + 'url': 'https://watch.hgtv.com/video/home-inspector-joe-hgtv-atve-us/this-mold-house', + 'only_matching': True, + }] + + _PRODUCT = 'hgtv' + _DISCO_API_PARAMS = { + 'disco_host': 'us1-prod-direct.watch.hgtv.com', + 'realm': 'go', + 'country': 'us', + } + + +class FoodNetworkIE(DiscoveryPlusBaseIE): + _VALID_URL = r'https?://(?:watch\.)?foodnetwork\.com/video' + DPlayBaseIE._PATH_REGEX + _TESTS = [{ + 'url': 'https://watch.foodnetwork.com/video/kids-baking-championship-food-network/float-like-a-butterfly', + 'info_dict': { + 'id': '4116449', + 'display_id': 'kids-baking-championship-food-network/float-like-a-butterfly', + 'ext': 'mp4', + 'title': 'Float Like a Butterfly', + 'description': 'The 12 kid bakers create colorful carved butterfly cakes.', + 'season_number': 10, + 'episode_number': 1, + }, + 'skip': 'Available for Premium users', + }, { + 'url': 'https://watch.foodnetwork.com/video/kids-baking-championship-food-network/float-like-a-butterfly', + 'only_matching': True, + }] + + _PRODUCT = 'food' + _DISCO_API_PARAMS = { + 'disco_host': 'us1-prod-direct.watch.foodnetwork.com', + 'realm': 'go', + 'country': 'us', + } -class ScienceChannelIE(DiscoveryPlusIE): - _VALID_URL = r'https?://(?:www\.)?sciencechannel\.com/video' + DPlayIE._PATH_REGEX +class DestinationAmericaIE(DiscoveryPlusBaseIE): + _VALID_URL = r'https?://(?:www\.)?destinationamerica\.com/video' + DPlayBaseIE._PATH_REGEX + _TESTS = [{ + 'url': 'https://www.destinationamerica.com/video/alaska-monsters-destination-america-atve-us/central-alaskas-bigfoot', + 'info_dict': { + 'id': '4210904', + 'display_id': 'alaska-monsters-destination-america-atve-us/central-alaskas-bigfoot', + 'ext': 'mp4', + 'title': 'Central Alaskas Bigfoot', + 'description': 'A team heads 
to central Alaska to investigate an aggressive Bigfoot.', + 'season_number': 1, + 'episode_number': 1, + }, + 'skip': 'Available for Premium users', + }, { + 'url': 'https://www.destinationamerica.com/video/alaska-monsters-destination-america-atve-us/central-alaskas-bigfoot', + 'only_matching': True, + }] + + _PRODUCT = 'dam' + _DISCO_API_PARAMS = { + 'disco_host': 'us1-prod-direct.destinationamerica.com', + 'realm': 'go', + 'country': 'us', + } + + +class InvestigationDiscoveryIE(DiscoveryPlusBaseIE): + _VALID_URL = r'https?://(?:www\.)?investigationdiscovery\.com/video' + DPlayBaseIE._PATH_REGEX + _TESTS = [{ + 'url': 'https://www.investigationdiscovery.com/video/unmasked-investigation-discovery/the-killer-clown', + 'info_dict': { + 'id': '2139409', + 'display_id': 'unmasked-investigation-discovery/the-killer-clown', + 'ext': 'mp4', + 'title': 'The Killer Clown', + 'description': 'A wealthy Florida woman is fatally shot in the face by a clown at her door.', + 'season_number': 1, + 'episode_number': 1, + }, + 'skip': 'Available for Premium users', + }, { + 'url': 'https://www.investigationdiscovery.com/video/unmasked-investigation-discovery/the-killer-clown', + 'only_matching': True, + }] + + _PRODUCT = 'ids' + _DISCO_API_PARAMS = { + 'disco_host': 'us1-prod-direct.investigationdiscovery.com', + 'realm': 'go', + 'country': 'us', + } + + +class AmHistoryChannelIE(DiscoveryPlusBaseIE): + _VALID_URL = r'https?://(?:www\.)?ahctv\.com/video' + DPlayBaseIE._PATH_REGEX + _TESTS = [{ + 'url': 'https://www.ahctv.com/video/modern-sniper-ahc/army', + 'info_dict': { + 'id': '2309730', + 'display_id': 'modern-sniper-ahc/army', + 'ext': 'mp4', + 'title': 'Army', + 'description': 'Snipers today face challenges their predecessors couldve only dreamed of.', + 'season_number': 1, + 'episode_number': 1, + }, + 'skip': 'Available for Premium users', + }, { + 'url': 'https://www.ahctv.com/video/modern-sniper-ahc/army', + 'only_matching': True, + }] + + _PRODUCT = 'ahc' + _DISCO_API_PARAMS = { + 'disco_host': 'us1-prod-direct.ahctv.com', + 'realm': 'go', + 'country': 'us', + } + + +class ScienceChannelIE(DiscoveryPlusBaseIE): + _VALID_URL = r'https?://(?:www\.)?sciencechannel\.com/video' + DPlayBaseIE._PATH_REGEX _TESTS = [{ 'url': 'https://www.sciencechannel.com/video/strangest-things-science-atve-us/nazi-mystery-machine', 'info_dict': { @@ -385,14 +599,21 @@ class ScienceChannelIE(DiscoveryPlusIE): 'episode_number': 1, }, 'skip': 'Available for Premium users', + }, { + 'url': 'https://www.sciencechannel.com/video/strangest-things-science-atve-us/nazi-mystery-machine', + 'only_matching': True, }] _PRODUCT = 'sci' - _API_URL = 'us1-prod-direct.sciencechannel.com' + _DISCO_API_PARAMS = { + 'disco_host': 'us1-prod-direct.sciencechannel.com', + 'realm': 'go', + 'country': 'us', + } -class DIYNetworkIE(DiscoveryPlusIE): - _VALID_URL = r'https?://(?:watch\.)?diynetwork\.com/video' + DPlayIE._PATH_REGEX +class DIYNetworkIE(DiscoveryPlusBaseIE): + _VALID_URL = r'https?://(?:watch\.)?diynetwork\.com/video' + DPlayBaseIE._PATH_REGEX _TESTS = [{ 'url': 'https://watch.diynetwork.com/video/pool-kings-diy-network/bringing-beach-life-to-texas', 'info_dict': { @@ -405,14 +626,48 @@ class DIYNetworkIE(DiscoveryPlusIE): 'episode_number': 2, }, 'skip': 'Available for Premium users', + }, { + 'url': 'https://watch.diynetwork.com/video/pool-kings-diy-network/bringing-beach-life-to-texas', + 'only_matching': True, }] _PRODUCT = 'diy' - _API_URL = 'us1-prod-direct.watch.diynetwork.com' + _DISCO_API_PARAMS = { + 'disco_host': 
'us1-prod-direct.watch.diynetwork.com', + 'realm': 'go', + 'country': 'us', + } + + +class DiscoveryLifeIE(DiscoveryPlusBaseIE): + _VALID_URL = r'https?://(?:www\.)?discoverylife\.com/video' + DPlayBaseIE._PATH_REGEX + _TESTS = [{ + 'url': 'https://www.discoverylife.com/video/surviving-death-discovery-life-atve-us/bodily-trauma', + 'info_dict': { + 'id': '2218238', + 'display_id': 'surviving-death-discovery-life-atve-us/bodily-trauma', + 'ext': 'mp4', + 'title': 'Bodily Trauma', + 'description': 'Meet three people who tested the limits of the human body.', + 'season_number': 1, + 'episode_number': 2, + }, + 'skip': 'Available for Premium users', + }, { + 'url': 'https://www.discoverylife.com/video/surviving-death-discovery-life-atve-us/bodily-trauma', + 'only_matching': True, + }] + + _PRODUCT = 'dlf' + _DISCO_API_PARAMS = { + 'disco_host': 'us1-prod-direct.discoverylife.com', + 'realm': 'go', + 'country': 'us', + } -class AnimalPlanetIE(DiscoveryPlusIE): - _VALID_URL = r'https?://(?:www\.)?animalplanet\.com/video' + DPlayIE._PATH_REGEX +class AnimalPlanetIE(DiscoveryPlusBaseIE): + _VALID_URL = r'https?://(?:www\.)?animalplanet\.com/video' + DPlayBaseIE._PATH_REGEX _TESTS = [{ 'url': 'https://www.animalplanet.com/video/north-woods-law-animal-planet/squirrel-showdown', 'info_dict': { @@ -425,7 +680,251 @@ class AnimalPlanetIE(DiscoveryPlusIE): 'episode_number': 11, }, 'skip': 'Available for Premium users', + }, { + 'url': 'https://www.animalplanet.com/video/north-woods-law-animal-planet/squirrel-showdown', + 'only_matching': True, }] _PRODUCT = 'apl' - _API_URL = 'us1-prod-direct.animalplanet.com' + _DISCO_API_PARAMS = { + 'disco_host': 'us1-prod-direct.animalplanet.com', + 'realm': 'go', + 'country': 'us', + } + + +class TLCIE(DiscoveryPlusBaseIE): + _VALID_URL = r'https?://(?:go\.)?tlc\.com/video' + DPlayBaseIE._PATH_REGEX + _TESTS = [{ + 'url': 'https://go.tlc.com/video/my-600-lb-life-tlc/melissas-story-part-1', + 'info_dict': { + 'id': '2206540', + 'display_id': 'my-600-lb-life-tlc/melissas-story-part-1', + 'ext': 'mp4', + 'title': 'Melissas Story (Part 1)', + 'description': 'At 650 lbs, Melissa is ready to begin her seven-year weight loss journey.', + 'season_number': 1, + 'episode_number': 1, + }, + 'skip': 'Available for Premium users', + }, { + 'url': 'https://go.tlc.com/video/my-600-lb-life-tlc/melissas-story-part-1', + 'only_matching': True, + }] + + _PRODUCT = 'tlc' + _DISCO_API_PARAMS = { + 'disco_host': 'us1-prod-direct.tlc.com', + 'realm': 'go', + 'country': 'us', + } + + +class DiscoveryPlusIE(DiscoveryPlusBaseIE): + _VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/(?!it/)(?:\w{2}/)?video' + DPlayBaseIE._PATH_REGEX + _TESTS = [{ + 'url': 'https://www.discoveryplus.com/video/property-brothers-forever-home/food-and-family', + 'info_dict': { + 'id': '1140794', + 'display_id': 'property-brothers-forever-home/food-and-family', + 'ext': 'mp4', + 'title': 'Food and Family', + 'description': 'The brothers help a Richmond family expand their single-level home.', + 'duration': 2583.113, + 'timestamp': 1609304400, + 'upload_date': '20201230', + 'creator': 'HGTV', + 'series': 'Property Brothers: Forever Home', + 'season_number': 1, + 'episode_number': 1, + }, + 'skip': 'Available for Premium users', + }, { + 'url': 'https://discoveryplus.com/ca/video/bering-sea-gold-discovery-ca/goldslingers', + 'only_matching': True, + }] + + _PRODUCT = 'dplus_us' + _DISCO_API_PARAMS = { + 'disco_host': 'us1-prod-direct.discoveryplus.com', + 'realm': 'go', + 'country': 'us', + } + + +class 
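[All the Discovery-network extractors in this hunk are pure configuration on top of DiscoveryPlusBaseIE: each site reduces to a product string and API parameters. Under that pattern, one more sibling would look like the following; the site name, host and product id here are hypothetical and shown only to illustrate the shape:]

class ExampleDiscoverySiteIE(DiscoveryPlusBaseIE):
    # hypothetical sibling, assuming the module context of this hunk
    _VALID_URL = r'https?://(?:www\.)?example-site\.com/video' + DPlayBaseIE._PATH_REGEX
    _PRODUCT = 'example'
    _DISCO_API_PARAMS = {
        'disco_host': 'us1-prod-direct.example-site.com',
        'realm': 'go',
        'country': 'us',
    }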
DiscoveryPlusIndiaIE(DiscoveryPlusBaseIE): + _VALID_URL = r'https?://(?:www\.)?discoveryplus\.in/videos?' + DPlayBaseIE._PATH_REGEX + _TESTS = [{ + 'url': 'https://www.discoveryplus.in/videos/how-do-they-do-it/fugu-and-more?seasonId=8&type=EPISODE', + 'info_dict': { + 'id': '27104', + 'ext': 'mp4', + 'display_id': 'how-do-they-do-it/fugu-and-more', + 'title': 'Fugu and More', + 'description': 'The Japanese catch, prepare and eat the deadliest fish on the planet.', + 'duration': 1319.32, + 'timestamp': 1582309800, + 'upload_date': '20200221', + 'series': 'How Do They Do It?', + 'season_number': 8, + 'episode_number': 2, + 'creator': 'Discovery Channel', + 'thumbnail': r're:https://.+\.jpeg', + 'episode': 'Episode 2', + 'season': 'Season 8', + 'tags': [], + }, + 'params': { + 'skip_download': True, + } + }] + + _PRODUCT = 'dplus-india' + _DISCO_API_PARAMS = { + 'disco_host': 'ap2-prod-direct.discoveryplus.in', + 'realm': 'dplusindia', + 'country': 'in', + 'domain': 'https://www.discoveryplus.in/', + } + + def _update_disco_api_headers(self, headers, disco_base, display_id, realm): + headers.update({ + 'x-disco-params': 'realm=%s' % realm, + 'x-disco-client': f'WEB:UNKNOWN:{self._PRODUCT}:17.0.0', + 'Authorization': self._get_auth(disco_base, display_id, realm), + }) + + +class DiscoveryNetworksDeIE(DPlayBaseIE): + _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:tlc|dmax)\.de|dplay\.co\.uk)/(?:programme|show|sendungen)/(?P<programme>[^/]+)/(?:video/)?(?P<alternate_id>[^/]+)' + + _TESTS = [{ + 'url': 'https://www.tlc.de/programme/breaking-amish/video/die-welt-da-drauen/DCB331270001100', + 'info_dict': { + 'id': '78867', + 'ext': 'mp4', + 'title': 'Die Welt da draußen', + 'description': 'md5:61033c12b73286e409d99a41742ef608', + 'timestamp': 1554069600, + 'upload_date': '20190331', + 'creator': 'TLC', + 'season': 'Season 1', + 'series': 'Breaking Amish', + 'episode_number': 1, + 'tags': ['new york', 'großstadt', 'amische', 'landleben', 'modern', 'infos', 'tradition', 'herausforderung'], + 'display_id': 'breaking-amish/die-welt-da-drauen', + 'episode': 'Episode 1', + 'duration': 2625.024, + 'season_number': 1, + 'thumbnail': r're:https://.+\.jpg', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.dmax.de/programme/dmax-highlights/video/tuning-star-sidney-hoffmann-exklusiv-bei-dmax/191023082312316', + 'only_matching': True, + }, { + 'url': 'https://www.dplay.co.uk/show/ghost-adventures/video/hotel-leger-103620/EHD_280313B', + 'only_matching': True, + }, { + 'url': 'https://tlc.de/sendungen/breaking-amish/die-welt-da-drauen/', + 'only_matching': True, + }] + + def _real_extract(self, url): + domain, programme, alternate_id = self._match_valid_url(url).groups() + country = 'GB' if domain == 'dplay.co.uk' else 'DE' + realm = 'questuk' if country == 'GB' else domain.replace('.', '') + return self._get_disco_api_info( + url, '%s/%s' % (programme, alternate_id), + 'sonic-eu1-prod.disco-api.com', realm, country) + + +class DiscoveryPlusShowBaseIE(DPlayBaseIE): + + def _entries(self, show_name): + headers = { + 'x-disco-client': self._X_CLIENT, + 'x-disco-params': f'realm={self._REALM}', + 'referer': self._DOMAIN, + 'Authentication': self._get_auth(self._BASE_API, None, self._REALM), + } + show_json = self._download_json( + f'{self._BASE_API}cms/routes/{self._SHOW_STR}/{show_name}?include=default', + video_id=show_name, headers=headers)['included'][self._INDEX]['attributes']['component'] + show_id = show_json['mandatoryParams'].split('=')[-1] + season_url = self._BASE_API + 
'content/videos?sort=episodeNumber&filter[seasonNumber]={}&filter[show.id]={}&page[size]=100&page[number]={}' + for season in show_json['filters'][0]['options']: + season_id = season['id'] + total_pages, page_num = 1, 0 + while page_num < total_pages: + season_json = self._download_json( + season_url.format(season_id, show_id, str(page_num + 1)), show_name, headers=headers, + note='Downloading season %s JSON metadata%s' % (season_id, ' page %d' % page_num if page_num else '')) + if page_num == 0: + total_pages = try_get(season_json, lambda x: x['meta']['totalPages'], int) or 1 + episodes_json = season_json['data'] + for episode in episodes_json: + video_path = episode['attributes']['path'] + yield self.url_result( + '%svideos/%s' % (self._DOMAIN, video_path), + ie=self._VIDEO_IE.ie_key(), video_id=episode.get('id') or video_path) + page_num += 1 + + def _real_extract(self, url): + show_name = self._match_valid_url(url).group('show_name') + return self.playlist_result(self._entries(show_name), playlist_id=show_name) + + +class DiscoveryPlusItalyIE(DiscoveryPlusBaseIE): + _VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/it/video' + DPlayBaseIE._PATH_REGEX + _TESTS = [{ + 'url': 'https://www.discoveryplus.com/it/video/i-signori-della-neve/stagione-2-episodio-1-i-preparativi', + 'only_matching': True, + }] + + _PRODUCT = 'dplus_us' + _DISCO_API_PARAMS = { + 'disco_host': 'eu1-prod-direct.discoveryplus.com', + 'realm': 'dplay', + 'country': 'it', + } + + +class DiscoveryPlusItalyShowIE(DiscoveryPlusShowBaseIE): + _VALID_URL = r'https?://(?:www\.)?discoveryplus\.it/programmi/(?P<show_name>[^/]+)/?(?:[?#]|$)' + _TESTS = [{ + 'url': 'https://www.discoveryplus.it/programmi/deal-with-it-stai-al-gioco', + 'playlist_mincount': 168, + 'info_dict': { + 'id': 'deal-with-it-stai-al-gioco', + }, + }] + + _BASE_API = 'https://disco-api.discoveryplus.it/' + _DOMAIN = 'https://www.discoveryplus.it/' + _X_CLIENT = 'WEB:UNKNOWN:dplay-client:2.6.0' + _REALM = 'dplayit' + _SHOW_STR = 'programmi' + _INDEX = 1 + _VIDEO_IE = DPlayIE + + +class DiscoveryPlusIndiaShowIE(DiscoveryPlusShowBaseIE): + _VALID_URL = r'https?://(?:www\.)?discoveryplus\.in/show/(?P<show_name>[^/]+)/?(?:[?#]|$)' + _TESTS = [{ + 'url': 'https://www.discoveryplus.in/show/how-do-they-do-it', + 'playlist_mincount': 140, + 'info_dict': { + 'id': 'how-do-they-do-it', + }, + }] + + _BASE_API = 'https://ap2-prod-direct.discoveryplus.in/' + _DOMAIN = 'https://www.discoveryplus.in/' + _X_CLIENT = 'WEB:UNKNOWN:dplus-india:prod' + _REALM = 'dplusindia' + _SHOW_STR = 'show' + _INDEX = 4 + _VIDEO_IE = DiscoveryPlusIndiaIE diff --git a/hypervideo_dl/extractor/drooble.py b/hypervideo_dl/extractor/drooble.py new file mode 100644 index 0000000..0584250 --- /dev/null +++ b/hypervideo_dl/extractor/drooble.py @@ -0,0 +1,116 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + try_get, +) + + +class DroobleIE(InfoExtractor): + _VALID_URL = r'''(?x)https?://drooble\.com/(?: + (?:(?P<user>[^/]+)/)?(?P<kind>song|videos|music/albums)/(?P<id>\d+)| + (?P<user_2>[^/]+)/(?P<kind_2>videos|music)) + ''' + _TESTS = [{ + 'url': 'https://drooble.com/song/2858030', + 'md5': '5ffda90f61c7c318dc0c3df4179eb064', + 'info_dict': { + 'id': '2858030', + 'ext': 'mp3', + 'title': 'Skankocillin', + 'upload_date': '20200801', + 'timestamp': 1596241390, + 'uploader_id': '95894', + 'uploader': 'Bluebeat Shelter', + } + }, { + 'url': 
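[DiscoveryPlusShowBaseIE._entries() above pages through each season with 1-based page numbers in the query, reading meta.totalPages from the first response. The loop shape, isolated from the extractor:]

def iter_season_episodes(fetch_page):
    # fetch_page(n) downloads page n (1-based) of one season's episode list
    total_pages, page_num = 1, 0
    while page_num < total_pages:
        page = fetch_page(page_num + 1)
        if page_num == 0:
            total_pages = (page.get('meta') or {}).get('totalPages') or 1
        yield from page['data']
        page_num += 1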
'https://drooble.com/karl340758/videos/2859183', + 'info_dict': { + 'id': 'J6QCQY_I5Tk', + 'ext': 'mp4', + 'title': 'Skankocillin', + 'uploader_id': 'UCrSRoI5vVyeYihtWEYua7rg', + 'description': 'md5:ffc0bd8ba383db5341a86a6cd7d9bcca', + 'upload_date': '20200731', + 'uploader': 'Bluebeat Shelter', + } + }, { + 'url': 'https://drooble.com/karl340758/music/albums/2858031', + 'info_dict': { + 'id': '2858031', + }, + 'playlist_mincount': 8, + }, { + 'url': 'https://drooble.com/karl340758/music', + 'info_dict': { + 'id': 'karl340758', + }, + 'playlist_mincount': 8, + }, { + 'url': 'https://drooble.com/karl340758/videos', + 'info_dict': { + 'id': 'karl340758', + }, + 'playlist_mincount': 8, + }] + + def _call_api(self, method, video_id, data=None): + response = self._download_json( + f'https://drooble.com/api/dt/{method}', video_id, data=json.dumps(data).encode()) + if not response[0]: + raise ExtractorError('Unable to download JSON metadata') + return response[1] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + user = mobj.group('user') or mobj.group('user_2') + kind = mobj.group('kind') or mobj.group('kind_2') + display_id = mobj.group('id') or user + + if mobj.group('kind_2') == 'videos': + data = {'from_user': display_id, 'album': -1, 'limit': 18, 'offset': 0, 'order': 'new2old', 'type': 'video'} + elif kind in ('music/albums', 'music'): + data = {'user': user, 'public_only': True, 'individual_limit': {'singles': 1, 'albums': 1, 'playlists': 1}} + else: + data = {'url_slug': display_id, 'children': 10, 'order': 'old2new'} + + method = 'getMusicOverview' if kind in ('music/albums', 'music') else 'getElements' + json_data = self._call_api(method, display_id, data=data) + if kind in ('music/albums', 'music'): + json_data = json_data['singles']['list'] + + entites = [] + for media in json_data: + url = media.get('external_media_url') or media.get('link') + if url.startswith('https://www.youtube.com'): + entites.append({ + '_type': 'url', + 'url': url, + 'ie_key': 'Youtube' + }) + continue + is_audio = (media.get('type') or '').lower() == 'audio' + entites.append({ + 'url': url, + 'id': media['id'], + 'title': media['title'], + 'duration': int_or_none(media.get('duration')), + 'timestamp': int_or_none(media.get('timestamp')), + 'album': try_get(media, lambda x: x['album']['title']), + 'uploader': try_get(media, lambda x: x['creator']['display_name']), + 'uploader_id': try_get(media, lambda x: x['creator']['id']), + 'thumbnail': media.get('image_comment'), + 'like_count': int_or_none(media.get('likes')), + 'vcodec': 'none' if is_audio else None, + 'ext': 'mp3' if is_audio else None, + }) + + if len(entites) > 1: + return self.playlist_result(entites, display_id) + + return entites[0] diff --git a/hypervideo_dl/extractor/dropbox.py b/hypervideo_dl/extractor/dropbox.py index 6a7d050..2559657 100644 --- a/hypervideo_dl/extractor/dropbox.py +++ b/hypervideo_dl/extractor/dropbox.py @@ -6,7 +6,12 @@ import re from .common import InfoExtractor from ..compat import compat_urllib_parse_unquote -from ..utils import url_basename +from ..utils import ( + ExtractorError, + traverse_obj, + try_get, + url_basename, +) class DropboxIE(InfoExtractor): @@ -28,13 +33,44 @@ class DropboxIE(InfoExtractor): def _real_extract(self, url): mobj = self._match_valid_url(url) video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) fn = compat_urllib_parse_unquote(url_basename(url)) title = os.path.splitext(fn)[0] - video_url = re.sub(r'[?&]dl=0', '', url) - video_url += ('?' if '?' 
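[Drooble's private API, as used by _call_api above, answers with a two-element [ok, payload] array rather than an HTTP error status. A standalone sketch of the same call:]

import json
import urllib.request

def drooble_api(method, data):
    req = urllib.request.Request(
        f'https://drooble.com/api/dt/{method}', data=json.dumps(data).encode())
    with urllib.request.urlopen(req) as r:
        ok, payload = json.load(r)
    if not ok:
        raise RuntimeError('Unable to download JSON metadata')
    return payload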
not in video_url else '&') + 'dl=1' + + password = self.get_param('videopassword') + if (self._og_search_title(webpage) == 'Dropbox - Password Required' + or 'Enter the password for this link' in webpage): + + if password: + content_id = self._search_regex(r'content_id=(.*?)["\']', webpage, 'content_id') + payload = f'is_xhr=true&t={self._get_cookies("https://www.dropbox.com").get("t").value}&content_id={content_id}&password={password}&url={url}' + response = self._download_json( + 'https://www.dropbox.com/sm/auth', video_id, 'POSTing video password', data=payload.encode('UTF-8'), + headers={'content-type': 'application/x-www-form-urlencoded; charset=UTF-8'}) + + if response.get('status') != 'authed': + raise ExtractorError('Authentication failed!', expected=True) + webpage = self._download_webpage(url, video_id) + elif self._get_cookies('https://dropbox.com').get('sm_auth'): + webpage = self._download_webpage(url, video_id) + else: + raise ExtractorError('Password protected video, use --video-password <password>', expected=True) + + json_string = self._html_search_regex(r'InitReact\.mountComponent\(.*?,\s*(\{.+\})\s*?\)', webpage, 'Info JSON') + info_json = self._parse_json(json_string, video_id).get('props') + transcode_url = traverse_obj(info_json, ((None, 'preview'), 'file', 'preview', 'content', 'transcode_url'), get_all=False) + formats, subtitles = self._extract_m3u8_formats_and_subtitles(transcode_url, video_id) + + # downloads enabled we can get the original file + if 'anonymous' in (try_get(info_json, lambda x: x['sharePermission']['canDownloadRoles']) or []): + video_url = re.sub(r'[?&]dl=0', '', url) + video_url += ('?' if '?' not in video_url else '&') + 'dl=1' + formats.append({'url': video_url, 'format_id': 'original', 'format_note': 'Original', 'quality': 1}) + self._sort_formats(formats) return { 'id': video_id, 'title': title, - 'url': video_url, + 'formats': formats, + 'subtitles': subtitles } diff --git a/hypervideo_dl/extractor/dropout.py b/hypervideo_dl/extractor/dropout.py new file mode 100644 index 0000000..2fa6195 --- /dev/null +++ b/hypervideo_dl/extractor/dropout.py @@ -0,0 +1,212 @@ +# coding: utf-8 +from .common import InfoExtractor +from .vimeo import VHXEmbedIE +from ..utils import ( + clean_html, + ExtractorError, + get_element_by_class, + get_element_by_id, + get_elements_by_class, + int_or_none, + join_nonempty, + unified_strdate, + urlencode_postdata, +) + + +class DropoutIE(InfoExtractor): + _LOGIN_URL = 'https://www.dropout.tv/login' + _NETRC_MACHINE = 'dropout' + + _VALID_URL = r'https?://(?:www\.)?dropout\.tv/(?:[^/]+/)*videos/(?P<id>[^/]+)/?$' + _TESTS = [ + { + 'url': 'https://www.dropout.tv/game-changer/season:2/videos/yes-or-no', + 'note': 'Episode in a series', + 'md5': '5e000fdfd8d8fa46ff40456f1c2af04a', + 'info_dict': { + 'id': '738153', + 'display_id': 'yes-or-no', + 'ext': 'mp4', + 'title': 'Yes or No', + 'description': 'Ally, Brennan, and Zac are asked a simple question, but is there a correct answer?', + 'release_date': '20200508', + 'thumbnail': 'https://vhx.imgix.net/chuncensoredstaging/assets/351e3f24-c4a3-459a-8b79-dc80f1e5b7fd.jpg', + 'series': 'Game Changer', + 'season_number': 2, + 'season': 'Season 2', + 'episode_number': 6, + 'episode': 'Yes or No', + 'duration': 1180, + 'uploader_id': 'user80538407', + 'uploader_url': 'https://vimeo.com/user80538407', + 'uploader': 'OTT Videos' + }, + 'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'] + }, + { + 'url': 
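[The dropbox.py change above unlocks password-protected links by POSTing the page's content_id, the 't' cookie and the link password to /sm/auth, then checking for status == 'authed'. The same payload, built with urlencode rather than the extractor's raw f-string:]

import urllib.parse

def dropbox_auth_payload(t_cookie, content_id, password, url):
    return urllib.parse.urlencode({
        'is_xhr': 'true',
        't': t_cookie,
        'content_id': content_id,
        'password': password,
        'url': url,
    }).encode('utf-8')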
'https://www.dropout.tv/dimension-20-fantasy-high/season:1/videos/episode-1', + 'note': 'Episode in a series (missing release_date)', + 'md5': '712caf7c191f1c47c8f1879520c2fa5c', + 'info_dict': { + 'id': '320562', + 'display_id': 'episode-1', + 'ext': 'mp4', + 'title': 'The Beginning Begins', + 'description': 'The cast introduces their PCs, including a neurotic elf, a goblin PI, and a corn-worshipping cleric.', + 'thumbnail': 'https://vhx.imgix.net/chuncensoredstaging/assets/4421ed0d-f630-4c88-9004-5251b2b8adfa.jpg', + 'series': 'Dimension 20: Fantasy High', + 'season_number': 1, + 'season': 'Season 1', + 'episode_number': 1, + 'episode': 'The Beginning Begins', + 'duration': 6838, + 'uploader_id': 'user80538407', + 'uploader_url': 'https://vimeo.com/user80538407', + 'uploader': 'OTT Videos' + }, + 'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'] + }, + { + 'url': 'https://www.dropout.tv/videos/misfits-magic-holiday-special', + 'note': 'Episode not in a series', + 'md5': 'c30fa18999c5880d156339f13c953a26', + 'info_dict': { + 'id': '1915774', + 'display_id': 'misfits-magic-holiday-special', + 'ext': 'mp4', + 'title': 'Misfits & Magic Holiday Special', + 'description': 'The magical misfits spend Christmas break at Gowpenny, with an unwelcome visitor.', + 'release_date': '20211215', + 'thumbnail': 'https://vhx.imgix.net/chuncensoredstaging/assets/d91ea8a6-b250-42ed-907e-b30fb1c65176-8e24b8e5.jpg', + 'duration': 11698, + 'uploader_id': 'user80538407', + 'uploader_url': 'https://vimeo.com/user80538407', + 'uploader': 'OTT Videos' + }, + 'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'] + } + ] + + def _get_authenticity_token(self, display_id): + signin_page = self._download_webpage( + self._LOGIN_URL, display_id, note='Getting authenticity token') + return self._html_search_regex( + r'name=["\']authenticity_token["\'] value=["\'](.+?)["\']', + signin_page, 'authenticity_token') + + def _login(self, display_id): + username, password = self._get_login_info() + if not (username and password): + self.raise_login_required(method='password') + + response = self._download_webpage( + self._LOGIN_URL, display_id, note='Logging in', data=urlencode_postdata({ + 'email': username, + 'password': password, + 'authenticity_token': self._get_authenticity_token(display_id), + 'utf8': True + })) + + user_has_subscription = self._search_regex( + r'user_has_subscription:\s*["\'](.+?)["\']', response, 'subscription status', default='none') + if user_has_subscription.lower() == 'true': + return response + elif user_has_subscription.lower() == 'false': + raise ExtractorError('Account is not subscribed') + else: + raise ExtractorError('Incorrect username/password') + + def _real_extract(self, url): + display_id = self._match_id(url) + try: + self._login(display_id) + webpage = self._download_webpage(url, display_id, note='Downloading video webpage') + finally: + self._download_webpage('https://www.dropout.tv/logout', display_id, note='Logging out', fatal=False) + + embed_url = self._search_regex(r'embed_url:\s*["\'](.+?)["\']', webpage, 'embed url') + thumbnail = self._og_search_thumbnail(webpage) + watch_info = get_element_by_id('watch-info', webpage) or '' + + title = clean_html(get_element_by_class('video-title', watch_info)) + season_episode = get_element_by_class( + 'site-font-secondary-color', get_element_by_class('text', watch_info)) + episode_number = int_or_none(self._search_regex( + r'Episode (\d+)', season_episode or '', 'episode', default=None)) + + 
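+        # Hand actual playback off to the VHX embed (url_transparent); the fields below enrich its metadata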
return { + '_type': 'url_transparent', + 'ie_key': VHXEmbedIE.ie_key(), + 'url': embed_url, + 'id': self._search_regex(r'embed\.vhx\.tv/videos/(.+?)\?', embed_url, 'id'), + 'display_id': display_id, + 'title': title, + 'description': self._html_search_meta('description', webpage, fatal=False), + 'thumbnail': thumbnail.split('?')[0] if thumbnail else None, # Ignore crop/downscale + 'series': clean_html(get_element_by_class('series-title', watch_info)), + 'episode_number': episode_number, + 'episode': title if episode_number else None, + 'season_number': int_or_none(self._search_regex( + r'Season (\d+),', season_episode or '', 'season', default=None)), + 'release_date': unified_strdate(self._search_regex( + r'data-meta-field-name=["\']release_dates["\'] data-meta-field-value=["\'](.+?)["\']', + watch_info, 'release date', default=None)), + } + + +class DropoutSeasonIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?dropout\.tv/(?P<id>[^\/$&?#]+)(?:/?$|/season:[0-9]+/?$)' + _TESTS = [ + { + 'url': 'https://www.dropout.tv/dimension-20-fantasy-high/season:1', + 'note': 'Multi-season series with the season in the url', + 'playlist_count': 17, + 'info_dict': { + 'id': 'dimension-20-fantasy-high-season-1', + 'title': 'Dimension 20 Fantasy High - Season 1' + } + }, + { + 'url': 'https://www.dropout.tv/dimension-20-fantasy-high', + 'note': 'Multi-season series with the season not in the url', + 'playlist_count': 17, + 'info_dict': { + 'id': 'dimension-20-fantasy-high-season-1', + 'title': 'Dimension 20 Fantasy High - Season 1' + } + }, + { + 'url': 'https://www.dropout.tv/dimension-20-shriek-week', + 'note': 'Single-season series', + 'playlist_count': 4, + 'info_dict': { + 'id': 'dimension-20-shriek-week-season-1', + 'title': 'Dimension 20 Shriek Week - Season 1' + } + } + ] + + def _real_extract(self, url): + season_id = self._match_id(url) + season_title = season_id.replace('-', ' ').title() + webpage = self._download_webpage(url, season_id) + + entries = [ + self.url_result( + url=self._search_regex(r'<a href=["\'](.+?)["\'] class=["\']browse-item-link["\']', + item, 'item_url'), + ie=DropoutIE.ie_key() + ) for item in get_elements_by_class('js-collection-item', webpage) + ] + + seasons = (get_element_by_class('select-dropdown-wrapper', webpage) or '').strip().replace('\n', '') + current_season = self._search_regex(r'<option[^>]+selected>([^<]+)</option>', + seasons, 'current_season', default='').strip() + + return { + '_type': 'playlist', + 'id': join_nonempty(season_id, current_season.lower().replace(' ', '-')), + 'title': join_nonempty(season_title, current_season, delim=' - '), + 'entries': entries + } diff --git a/hypervideo_dl/extractor/drtv.py b/hypervideo_dl/extractor/drtv.py index 7bb15f8..37e4d5b 100644 --- a/hypervideo_dl/extractor/drtv.py +++ b/hypervideo_dl/extractor/drtv.py @@ -7,13 +7,11 @@ import re from .common import InfoExtractor -from ..aes import aes_cbc_decrypt +from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7 from ..compat import compat_urllib_parse_unquote from ..utils import ( - bytes_to_intlist, ExtractorError, int_or_none, - intlist_to_bytes, float_or_none, mimetype2ext, str_or_none, @@ -191,13 +189,11 @@ class DRTVIE(InfoExtractor): def decrypt_uri(e): n = int(e[2:10], 16) a = e[10 + n:] - data = bytes_to_intlist(hex_to_bytes(e[10:10 + n])) - key = bytes_to_intlist(hashlib.sha256( - ('%s:sRBzYNXBzkKgnjj8pGtkACch' % a).encode('utf-8')).digest()) - iv = bytes_to_intlist(hex_to_bytes(a)) - decrypted = aes_cbc_decrypt(data, key, iv) - return intlist_to_bytes( - 
decrypted[:-decrypted[-1]]).decode('utf-8').split('?')[0] + data = hex_to_bytes(e[10:10 + n]) + key = hashlib.sha256(('%s:sRBzYNXBzkKgnjj8pGtkACch' % a).encode('utf-8')).digest() + iv = hex_to_bytes(a) + decrypted = unpad_pkcs7(aes_cbc_decrypt_bytes(data, key, iv)) + return decrypted.decode('utf-8').split('?')[0] for asset in assets: kind = asset.get('Kind') @@ -321,7 +317,7 @@ class DRTVLiveIE(InfoExtractor): channel_data = self._download_json( 'https://www.dr.dk/mu-online/api/1.0/channel/' + channel_id, channel_id) - title = self._live_title(channel_data['Title']) + title = channel_data['Title'] formats = [] for streaming_server in channel_data.get('StreamingServers', []): diff --git a/hypervideo_dl/extractor/dvtv.py b/hypervideo_dl/extractor/dvtv.py index de7f6d6..08663cf 100644 --- a/hypervideo_dl/extractor/dvtv.py +++ b/hypervideo_dl/extractor/dvtv.py @@ -8,6 +8,7 @@ from ..utils import ( determine_ext, ExtractorError, int_or_none, + join_nonempty, js_to_json, mimetype2ext, try_get, @@ -139,13 +140,9 @@ class DVTVIE(InfoExtractor): label = video.get('label') height = self._search_regex( r'^(\d+)[pP]', label or '', 'height', default=None) - format_id = ['http'] - for f in (ext, label): - if f: - format_id.append(f) formats.append({ 'url': video_url, - 'format_id': '-'.join(format_id), + 'format_id': join_nonempty('http', ext, label), 'height': int_or_none(height), }) self._sort_formats(formats) diff --git a/hypervideo_dl/extractor/egghead.py b/hypervideo_dl/extractor/egghead.py index f6b50e7..b6b8676 100644 --- a/hypervideo_dl/extractor/egghead.py +++ b/hypervideo_dl/extractor/egghead.py @@ -86,7 +86,6 @@ class EggheadLessonIE(EggheadBaseIE): }, 'params': { 'skip_download': True, - 'format': 'bestvideo', }, }, { 'url': 'https://egghead.io/api/v1/lessons/react-add-redux-to-a-react-application', diff --git a/hypervideo_dl/extractor/ellentube.py b/hypervideo_dl/extractor/ellentube.py index 5444732..d451bc0 100644 --- a/hypervideo_dl/extractor/ellentube.py +++ b/hypervideo_dl/extractor/ellentube.py @@ -26,7 +26,7 @@ class EllenTubeBaseIE(InfoExtractor): duration = None for entry in data.get('media'): if entry.get('id') == 'm3u8': - formats = self._extract_m3u8_formats( + formats, subtitles = self._extract_m3u8_formats_and_subtitles( entry['url'], video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') duration = int_or_none(entry.get('duration')) @@ -48,6 +48,7 @@ class EllenTubeBaseIE(InfoExtractor): 'view_count': get_insight('view'), 'like_count': get_insight('like'), 'formats': formats, + 'subtitles': subtitles, } diff --git a/hypervideo_dl/extractor/elonet.py b/hypervideo_dl/extractor/elonet.py index eefba4e..9c6aea2 100644 --- a/hypervideo_dl/extractor/elonet.py +++ b/hypervideo_dl/extractor/elonet.py @@ -1,30 +1,22 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..utils import ( - base_url, - ExtractorError, - try_get, -) -from ..compat import compat_str +from ..utils import determine_ext class ElonetIE(InfoExtractor): _VALID_URL = r'https?://elonet\.finna\.fi/Record/kavi\.elonet_elokuva_(?P<id>[0-9]+)' _TESTS = [{ - # m3u8 with subtitles 'url': 'https://elonet.finna.fi/Record/kavi.elonet_elokuva_107867', - 'md5': '8efc954b96c543711707f87de757caea', 'info_dict': { 'id': '107867', 'ext': 'mp4', 'title': 'Valkoinen peura', - 'description': 'Valkoinen peura (1952) on Erik Blombergin ohjaama ja yhdessä Mirjami Kuosmasen kanssa käsikirjoittama tarunomainen kertomus valkoisen peuran hahmossa lii...', - 'thumbnail': 
'https://elonet.finna.fi/Cover/Show?id=kavi.elonet_elokuva_107867&index=0&size=large', + 'thumbnail': r're:^https?://elonet\.finna\.fi/Cover/Show\?id=kavi\.elonet_elokuva_107867.+', + 'description': 'md5:bded4201c9677fab10854884fe8f7312', }, + 'params': {'skip_download': 'dash'}, }, { # DASH with subtitles 'url': 'https://elonet.finna.fi/Record/kavi.elonet_elokuva_116539', @@ -32,58 +24,45 @@ class ElonetIE(InfoExtractor): 'id': '116539', 'ext': 'mp4', 'title': 'Minulla on tiikeri', - 'description': 'Pienellä pojalla, joka asuu kerrostalossa, on kotieläimenä tiikeri. Se on kuitenkin salaisuus. Kerrostalon räpätäti on Kotilaisen täti, joka on aina vali...', - 'thumbnail': 'https://elonet.finna.fi/Cover/Show?id=kavi.elonet_elokuva_116539&index=0&size=large&source=Solr', - } + 'thumbnail': r're:^https?://elonet\.finna\.fi/Cover/Show\?id=kavi\.elonet_elokuva_116539.+', + 'description': 'md5:5ab72b3fe76d3414e46cc8f277104419', + }, + 'params': {'skip_download': 'dash'}, + }, { + # Page with multiple videos, download the main one + 'url': 'https://elonet.finna.fi/Record/kavi.elonet_elokuva_117396', + 'info_dict': { + 'id': '117396', + 'ext': 'mp4', + 'title': 'Sampo', + 'thumbnail': r're:^https?://elonet\.finna\.fi/Cover/Show\?id=kavi\.elonet_elokuva_117396.+', + 'description': 'md5:ec69572a5b054d0ecafe8086b1fa96f7', + }, + 'params': {'skip_download': 'dash'}, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._html_search_regex( - r'<meta .*property="og:title" .*content="(.+?)"', webpage, 'title') - description = self._html_search_regex( - r'<meta .*property="og:description" .*content="(.+?)"', webpage, 'description') - thumbnail = self._html_search_regex( - r'<meta .*property="og:image" .*content="(.+?)"', webpage, 'thumbnail') + src = self._parse_json(self._html_search_regex( + r'id=\'video-data\'[^>]+data-video-sources="([^"]+)"', webpage, 'json'), video_id)[0]['src'] + ext = determine_ext(src) - json_s = self._html_search_regex( - r'data-video-sources="(.+?)"', webpage, 'json') - src = try_get( - self._parse_json(json_s, video_id), - lambda x: x[0]["src"], compat_str) - formats = [] - subtitles = {} - if re.search(r'\.m3u8\??', src): - res = self._download_webpage_handle( - # elonet servers have certificate problems - src.replace('https:', 'http:'), video_id, - note='Downloading m3u8 information', - errnote='Failed to download m3u8 information') - if res: - doc, urlh = res - url = urlh.geturl() - formats, subtitles = self._parse_m3u8_formats_and_subtitles(doc, url) - for f in formats: - f['ext'] = 'mp4' - elif re.search(r'\.mpd\??', src): - res = self._download_xml_handle( - src, video_id, - note='Downloading MPD manifest', - errnote='Failed to download MPD manifest') - if res: - doc, urlh = res - url = base_url(urlh.geturl()) - formats, subtitles = self._parse_mpd_formats_and_subtitles(doc, mpd_base_url=url) + if ext == 'm3u8': + formats, subtitles = self._extract_m3u8_formats_and_subtitles(src, video_id, fatal=False) + elif ext == 'mpd': + formats, subtitles = self._extract_mpd_formats_and_subtitles(src, video_id, fatal=False) else: - raise ExtractorError("Unknown streaming format") + formats, subtitles = [], {} + self.raise_no_formats(f'Unknown streaming format {ext}') + self._sort_formats(formats) return { 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + 'thumbnail': 
self._og_search_thumbnail(webpage), 'formats': formats, 'subtitles': subtitles, } diff --git a/hypervideo_dl/extractor/engadget.py b/hypervideo_dl/extractor/engadget.py index 65635c1..733bf32 100644 --- a/hypervideo_dl/extractor/engadget.py +++ b/hypervideo_dl/extractor/engadget.py @@ -7,16 +7,6 @@ class EngadgetIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?engadget\.com/video/(?P<id>[^/?#]+)' _TESTS = [{ - # video with 5min ID - 'url': 'http://www.engadget.com/video/518153925/', - 'md5': 'c6820d4828a5064447a4d9fc73f312c9', - 'info_dict': { - 'id': '518153925', - 'ext': 'mp4', - 'title': 'Samsung Galaxy Tab Pro 8.4 Review', - }, - 'add_ie': ['FiveMin'], - }, { # video with vidible ID 'url': 'https://www.engadget.com/video/57a28462134aa15a39f0421a/', 'only_matching': True, diff --git a/hypervideo_dl/extractor/epicon.py b/hypervideo_dl/extractor/epicon.py index b4e544d..cd19325 100644 --- a/hypervideo_dl/extractor/epicon.py +++ b/hypervideo_dl/extractor/epicon.py @@ -8,7 +8,7 @@ from ..utils import ExtractorError class EpiconIE(InfoExtractor): - _VALID_URL = r'(?:https?://)(?:www\.)?epicon\.in/(?:documentaries|movies|tv-shows/[^/?#]+/[^/?#]+)/(?P<id>[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?epicon\.in/(?:documentaries|movies|tv-shows/[^/?#]+/[^/?#]+)/(?P<id>[^/?#]+)' _TESTS = [{ 'url': 'https://www.epicon.in/documentaries/air-battle-of-srinagar', 'info_dict': { @@ -84,7 +84,7 @@ class EpiconIE(InfoExtractor): class EpiconSeriesIE(InfoExtractor): - _VALID_URL = r'(?!.*season)(?:https?://)(?:www\.)?epicon\.in/tv-shows/(?P<id>[^/?#]+)' + _VALID_URL = r'(?!.*season)https?://(?:www\.)?epicon\.in/tv-shows/(?P<id>[^/?#]+)' _TESTS = [{ 'url': 'https://www.epicon.in/tv-shows/1-of-something', 'playlist_mincount': 5, diff --git a/hypervideo_dl/extractor/eroprofile.py b/hypervideo_dl/extractor/eroprofile.py index a8396f1..5d5e7f2 100644 --- a/hypervideo_dl/extractor/eroprofile.py +++ b/hypervideo_dl/extractor/eroprofile.py @@ -39,11 +39,7 @@ class EroProfileIE(InfoExtractor): 'skip': 'Requires login', }] - def _login(self): - (username, password) = self._get_login_info() - if username is None: - return - + def _perform_login(self, username, password): query = compat_urllib_parse_urlencode({ 'username': username, 'password': password, @@ -62,9 +58,6 @@ class EroProfileIE(InfoExtractor): r'<script[^>]+?src="([^"]+)"', login_page, 'login redirect url') self._download_webpage(redirect_url, None, False) - def _real_initialize(self): - self._login() - def _real_extract(self, url): display_id = self._match_id(url) diff --git a/hypervideo_dl/extractor/ertgr.py b/hypervideo_dl/extractor/ertgr.py new file mode 100644 index 0000000..19ce23f --- /dev/null +++ b/hypervideo_dl/extractor/ertgr.py @@ -0,0 +1,316 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + clean_html, + determine_ext, + ExtractorError, + dict_get, + int_or_none, + merge_dicts, + parse_qs, + parse_age_limit, + parse_iso8601, + str_or_none, + try_get, + unescapeHTML, + url_or_none, + variadic, +) + + +class ERTFlixBaseIE(InfoExtractor): + def _call_api( + self, video_id, method='Player/AcquireContent', api_version=1, + param_headers=None, data=None, headers=None, **params): + platform_codename = {'platformCodename': 'www'} + headers_as_param = {'X-Api-Date-Format': 'iso', 'X-Api-Camel-Case': False} + headers_as_param.update(param_headers or {}) + headers = headers or {} + if data: + headers['Content-Type'] = 
headers_as_param['Content-Type'] = 'application/json;charset=utf-8' + data = json.dumps(merge_dicts(platform_codename, data)).encode('utf-8') + query = merge_dicts( + {} if data else platform_codename, + {'$headers': json.dumps(headers_as_param)}, + params) + response = self._download_json( + 'https://api.app.ertflix.gr/v%s/%s' % (str(api_version), method), + video_id, fatal=False, query=query, data=data, headers=headers) + if try_get(response, lambda x: x['Result']['Success']) is True: + return response + + def _call_api_get_tiles(self, video_id, *tile_ids): + requested_tile_ids = [video_id] + list(tile_ids) + requested_tiles = [{'Id': tile_id} for tile_id in requested_tile_ids] + tiles_response = self._call_api( + video_id, method='Tile/GetTiles', api_version=2, + data={'RequestedTiles': requested_tiles}) + tiles = try_get(tiles_response, lambda x: x['Tiles'], list) or [] + if tile_ids: + if sorted([tile['Id'] for tile in tiles]) != sorted(requested_tile_ids): + raise ExtractorError('Requested tiles not found', video_id=video_id) + return tiles + try: + return next(tile for tile in tiles if tile['Id'] == video_id) + except StopIteration: + raise ExtractorError('No matching tile found', video_id=video_id) + + +class ERTFlixCodenameIE(ERTFlixBaseIE): + IE_NAME = 'ertflix:codename' + IE_DESC = 'ERTFLIX videos by codename' + _VALID_URL = r'ertflix:(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'ertflix:monogramma-praxitelis-tzanoylinos', + 'md5': '5b9c2cd171f09126167e4082fc1dd0ef', + 'info_dict': { + 'id': 'monogramma-praxitelis-tzanoylinos', + 'ext': 'mp4', + 'title': 'md5:ef0b439902963d56c43ac83c3f41dd0e', + }, + }, + ] + + def _extract_formats_and_subs(self, video_id, allow_none=True): + media_info = self._call_api(video_id, codename=video_id) + formats, subs = [], {} + for media_file in try_get(media_info, lambda x: x['MediaFiles'], list) or []: + for media in try_get(media_file, lambda x: x['Formats'], list) or []: + fmt_url = url_or_none(try_get(media, lambda x: x['Url'])) + if not fmt_url: + continue + ext = determine_ext(fmt_url) + if ext == 'm3u8': + formats_, subs_ = self._extract_m3u8_formats_and_subtitles( + fmt_url, video_id, m3u8_id='hls', ext='mp4', fatal=False) + elif ext == 'mpd': + formats_, subs_ = self._extract_mpd_formats_and_subtitles( + fmt_url, video_id, mpd_id='dash', fatal=False) + else: + formats.append({ + 'url': fmt_url, + 'format_id': str_or_none(media.get('Id')), + }) + continue + formats.extend(formats_) + self._merge_subtitles(subs_, target=subs) + + if formats or not allow_none: + self._sort_formats(formats) + return formats, subs + + def _real_extract(self, url): + video_id = self._match_id(url) + + formats, subs = self._extract_formats_and_subs(video_id) + + if formats: + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subs, + 'title': self._generic_title(url), + } + + +class ERTFlixIE(ERTFlixBaseIE): + IE_NAME = 'ertflix' + IE_DESC = 'ERTFLIX videos' + _VALID_URL = r'https?://www\.ertflix\.gr/(?:series|vod)/(?P<id>[a-z]{3}\.\d+)' + _TESTS = [{ + 'url': 'https://www.ertflix.gr/vod/vod.173258-aoratoi-ergates', + 'md5': '6479d5e60fd7e520b07ba5411dcdd6e7', + 'info_dict': { + 'id': 'aoratoi-ergates', + 'ext': 'mp4', + 'title': 'md5:c1433d598fbba0211b0069021517f8b4', + 'description': 'md5:01a64d113c31957eb7eb07719ab18ff4', + 'thumbnail': r're:https?://.+\.jpg', + 'episode_id': 'vod.173258', + 'timestamp': 1639648800, + 'upload_date': '20211216', + 'duration': 3166, + 'age_limit': 8, + }, + }, { + 'url': 
'https://www.ertflix.gr/series/ser.3448-monogramma', + 'info_dict': { + 'id': 'ser.3448', + 'age_limit': 8, + 'description': 'Η εκπομπή σαράντα ετών που σημάδεψε τον πολιτισμό μας.', + 'title': 'Μονόγραμμα', + }, + 'playlist_mincount': 64, + }, { + 'url': 'https://www.ertflix.gr/series/ser.3448-monogramma?season=1', + 'info_dict': { + 'id': 'ser.3448', + 'age_limit': 8, + 'description': 'Η εκπομπή σαράντα ετών που σημάδεψε τον πολιτισμό μας.', + 'title': 'Μονόγραμμα', + }, + 'playlist_count': 22, + }, { + 'url': 'https://www.ertflix.gr/series/ser.3448-monogramma?season=1&season=2021%20-%202022', + 'info_dict': { + 'id': 'ser.3448', + 'age_limit': 8, + 'description': 'Η εκπομπή σαράντα ετών που σημάδεψε τον πολιτισμό μας.', + 'title': 'Μονόγραμμα', + }, + 'playlist_mincount': 36, + }, { + 'url': 'https://www.ertflix.gr/series/ser.164991-to-diktuo-1?season=1-9', + 'info_dict': { + 'id': 'ser.164991', + 'age_limit': 8, + 'description': 'Η πρώτη ελληνική εκπομπή με θεματολογία αποκλειστικά γύρω από το ίντερνετ.', + 'title': 'Το δίκτυο', + }, + 'playlist_mincount': 9, + }] + + def _extract_episode(self, episode): + codename = try_get(episode, lambda x: x['Codename'], compat_str) + title = episode.get('Title') + description = clean_html(dict_get(episode, ('ShortDescription', 'TinyDescription', ))) + if not codename or not title or not episode.get('HasPlayableStream', True): + return + thumbnail = next(( + url_or_none(thumb.get('Url')) + for thumb in variadic(dict_get(episode, ('Images', 'Image')) or {}) + if thumb.get('IsMain')), + None) + return { + '_type': 'url_transparent', + 'thumbnail': thumbnail, + 'id': codename, + 'episode_id': episode.get('Id'), + 'title': title, + 'alt_title': episode.get('Subtitle'), + 'description': description, + 'timestamp': parse_iso8601(episode.get('PublishDate')), + 'duration': episode.get('DurationSeconds'), + 'age_limit': self._parse_age_rating(episode), + 'url': 'ertflix:%s' % (codename, ), + } + + @staticmethod + def _parse_age_rating(info_dict): + return parse_age_limit( + info_dict.get('AgeRating') + or (info_dict.get('IsAdultContent') and 18) + or (info_dict.get('IsKidsContent') and 0)) + + def _extract_series(self, video_id, season_titles=None, season_numbers=None): + media_info = self._call_api(video_id, method='Tile/GetSeriesDetails', id=video_id) + + series = try_get(media_info, lambda x: x['Series'], dict) or {} + series_info = { + 'age_limit': self._parse_age_rating(series), + 'title': series.get('Title'), + 'description': dict_get(series, ('ShortDescription', 'TinyDescription', )), + } + if season_numbers: + season_titles = season_titles or [] + for season in try_get(series, lambda x: x['Seasons'], list) or []: + if season.get('SeasonNumber') in season_numbers and season.get('Title'): + season_titles.append(season['Title']) + + def gen_episode(m_info, season_titles): + for episode_group in try_get(m_info, lambda x: x['EpisodeGroups'], list) or []: + if season_titles and episode_group.get('Title') not in season_titles: + continue + episodes = try_get(episode_group, lambda x: x['Episodes'], list) + if not episodes: + continue + season_info = { + 'season': episode_group.get('Title'), + 'season_number': int_or_none(episode_group.get('SeasonNumber')), + } + try: + episodes = [(int(ep['EpisodeNumber']), ep) for ep in episodes] + episodes.sort() + except (KeyError, ValueError): + episodes = enumerate(episodes, 1) + for n, episode in episodes: + info = self._extract_episode(episode) + if info is None: + continue + info['episode_number'] = n + 
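+                    # Carry the episode group's season title/number onto each episode entry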
info.update(season_info) + yield info + + return self.playlist_result( + gen_episode(media_info, season_titles), playlist_id=video_id, **series_info) + + def _real_extract(self, url): + video_id = self._match_id(url) + if video_id.startswith('ser.'): + param_season = parse_qs(url).get('season', [None]) + param_season = [ + (have_number, int_or_none(v) if have_number else str_or_none(v)) + for have_number, v in + [(int_or_none(ps) is not None, ps) for ps in param_season] + if v is not None + ] + season_kwargs = { + k: [v for is_num, v in param_season if is_num is c] or None + for k, c in + [('season_titles', False), ('season_numbers', True)] + } + return self._extract_series(video_id, **season_kwargs) + + return self._extract_episode(self._call_api_get_tiles(video_id)) + + +class ERTWebtvEmbedIE(InfoExtractor): + IE_NAME = 'ertwebtv:embed' + IE_DESC = 'ert.gr webtv embedded videos' + _BASE_PLAYER_URL_RE = re.escape('//www.ert.gr/webtv/live-uni/vod/dt-uni-vod.php') + _VALID_URL = rf'https?:{_BASE_PLAYER_URL_RE}\?([^#]+&)?f=(?P<id>[^#&]+)' + + _TESTS = [{ + 'url': 'https://www.ert.gr/webtv/live-uni/vod/dt-uni-vod.php?f=trailers/E2251_TO_DIKTYO_E09_16-01_1900.mp4&bgimg=/photos/2022/1/to_diktio_ep09_i_istoria_tou_diadiktiou_stin_Ellada_1021x576.jpg', + 'md5': 'f9e9900c25c26f4ecfbddbb4b6305854', + 'info_dict': { + 'id': 'trailers/E2251_TO_DIKTYO_E09_16-01_1900.mp4', + 'title': 'md5:914f06a73cd8b62fbcd6fb90c636e497', + 'ext': 'mp4', + 'thumbnail': 'https://program.ert.gr/photos/2022/1/to_diktio_ep09_i_istoria_tou_diadiktiou_stin_Ellada_1021x576.jpg' + }, + }] + + @classmethod + def _extract_urls(cls, webpage): + EMBED_URL_RE = rf'(?:https?:)?{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+' + EMBED_RE = rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>{EMBED_URL_RE})(?P=_q1)' + + for mobj in re.finditer(EMBED_RE, webpage): + url = unescapeHTML(mobj.group('url')) + if not cls.suitable(url): + continue + yield url + + def _real_extract(self, url): + video_id = self._match_id(url) + formats, subs = self._extract_m3u8_formats_and_subtitles( + f'https://mediastream.ert.gr/vodedge/_definst_/mp4:dvrorigin/{video_id}/playlist.m3u8', + video_id, 'mp4') + self._sort_formats(formats) + thumbnail_id = parse_qs(url).get('bgimg', [None])[0] + if thumbnail_id and not thumbnail_id.startswith('http'): + thumbnail_id = f'https://program.ert.gr{thumbnail_id}' + return { + 'id': video_id, + 'title': f'VOD - {video_id}', + 'thumbnail': thumbnail_id, + 'formats': formats, + 'subtitles': subs, + } diff --git a/hypervideo_dl/extractor/espn.py b/hypervideo_dl/extractor/espn.py index d4a66c2..dc50f3b 100644 --- a/hypervideo_dl/extractor/espn.py +++ b/hypervideo_dl/extractor/espn.py @@ -7,7 +7,9 @@ from .once import OnceIE from ..compat import compat_str from ..utils import ( determine_ext, + dict_get, int_or_none, + unified_strdate, unified_timestamp, ) @@ -236,3 +238,44 @@ class FiveThirtyEightIE(InfoExtractor): webpage, 'embed url') return self.url_result(embed_url, 'AbcNewsVideo') + + +class ESPNCricInfoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?espncricinfo\.com/video/[^#$&?/]+-(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.espncricinfo.com/video/finch-chasing-comes-with-risks-despite-world-cup-trend-1289135', + 'info_dict': { + 'id': '1289135', + 'ext': 'mp4', + 'title': 'Finch: Chasing comes with \'risks\' despite World Cup trend', + 'description': 'md5:ea32373303e25efbb146efdfc8a37829', + 'upload_date': '20211113', + 'duration': 96, + }, + 'params': {'skip_download': True} + }] + + def _real_extract(self, url): 
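+        # The video-details API lists 'playbacks': HLS entries carry the video, AUDIO entries are audio-only streams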
+        video_id = self._match_id(url)
+        data_json = self._download_json(f'https://hs-consumer-api.espncricinfo.com/v1/pages/video/video-details?videoId={video_id}', video_id)['video']
+        formats, subtitles = [], {}
+        for item in data_json.get('playbacks') or []:
+            if item.get('type') == 'HLS' and item.get('url'):
+                m3u8_frmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles(item['url'], video_id)
+                formats.extend(m3u8_frmts)
+                subtitles = self._merge_subtitles(subtitles, m3u8_subs)
+            elif item.get('type') == 'AUDIO' and item.get('url'):
+                formats.append({
+                    'url': item['url'],
+                    'vcodec': 'none',
+                })
+        self._sort_formats(formats)
+        return {
+            'id': video_id,
+            'title': data_json.get('title'),
+            'description': data_json.get('summary'),
+            'upload_date': unified_strdate(dict_get(data_json, ('publishedAt', 'recordedAt'))),
+            'duration': data_json.get('duration'),
+            'formats': formats,
+            'subtitles': subtitles,
+        }
diff --git a/hypervideo_dl/extractor/europeantour.py b/hypervideo_dl/extractor/europeantour.py
new file mode 100644
index 0000000..e28f067
--- /dev/null
+++ b/hypervideo_dl/extractor/europeantour.py
@@ -0,0 +1,37 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class EuropeanTourIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?europeantour\.com/dpworld-tour/news/video/(?P<id>[^/&?#$]+)'
+
+    _TESTS = [{
+        'url': 'https://www.europeantour.com/dpworld-tour/news/video/the-best-shots-of-the-2021-seasons/',
+        'info_dict': {
+            'id': '6287788195001',
+            'ext': 'mp4',
+            'title': 'The best shots of the 2021 seasons',
+            'duration': 2416.512,
+            'timestamp': 1640010141,
+            'uploader_id': '5136026580001',
+            'tags': ['prod-imported'],
+            'thumbnail': 'md5:fdac52bc826548860edf8145ee74e71a',
+            'upload_date': '20211220'
+        },
+        'params': {'skip_download': True}
+    }]
+
+    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s'
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        vid, aid = re.search(r'(?s)brightcove-player\s?video-id="([^"]+)".*"ACCOUNT_ID":"([^"]+)"', webpage).groups()
+        if not aid:
+            aid = '5136026580001'
+        return self.url_result(
+            self.BRIGHTCOVE_URL_TEMPLATE % (aid, vid), 'BrightcoveNew')
diff --git a/hypervideo_dl/extractor/euscreen.py b/hypervideo_dl/extractor/euscreen.py
index 3980c23..2759e74 100644
--- a/hypervideo_dl/extractor/euscreen.py
+++ b/hypervideo_dl/extractor/euscreen.py
@@ -10,7 +10,7 @@ from ..utils import (
 
 
 class EUScreenIE(InfoExtractor):
-    _VALID_URL = r'(?:https?://)(?:www\.)?euscreen\.eu/item.html\?id=(?P<id>[^&?$/]+)'
+    _VALID_URL = r'https?://(?:www\.)?euscreen\.eu/item.html\?id=(?P<id>[^&?$/]+)'
 
     _TESTS = [{
         'url': 'https://euscreen.eu/item.html?id=EUS_0EBCBF356BFC4E12A014023BA41BD98C',
diff --git a/hypervideo_dl/extractor/extractors.py b/hypervideo_dl/extractor/extractors.py
index f4f817f..457f4c2 100644
--- a/hypervideo_dl/extractor/extractors.py
+++ b/hypervideo_dl/extractor/extractors.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 from .abc import (
     ABCIE,
     ABCIViewIE,
+    ABCIViewShowSeriesIE,
 )
 from .abcnews import (
     AbcNewsIE,
@@ -13,6 +14,10 @@ from .abcotvs import (
     ABCOTVSIE,
     ABCOTVSClipsIE,
 )
+from .abematv import (
+    AbemaTVIE,
+    AbemaTVTitleIE,
+)
 from .academicearth import AcademicEarthCourseIE
 from .acast import (
     ACastIE,
@@ -36,7 +41,10 @@ from .aenetworks import (
     HistoryPlayerIE,
     BiographyIE,
 )
-from .afreecatv import AfreecaTVIE
+from .afreecatv import (
+    AfreecaTVIE,
+    AfreecaTVLiveIE,
+)
 from
.airmozilla import AirMozillaIE from .aljazeera import AlJazeeraIE from .alphaporno import AlphaPornoIE @@ -50,6 +58,7 @@ from .animelab import ( AnimeLabIE, AnimeLabShowsIE, ) +from .amazon import AmazonStoreIE from .americastestkitchen import ( AmericasTestKitchenIE, AmericasTestKitchenSeasonIE, @@ -59,6 +68,10 @@ from .anvato import AnvatoIE from .aol import AolIE from .allocine import AllocineIE from .aliexpress import AliExpressLiveIE +from .alsace20tv import ( + Alsace20TVIE, + Alsace20TVEmbedIE, +) from .apa import APAIE from .aparat import AparatIE from .appleconnect import AppleConnectIE @@ -82,6 +95,7 @@ from .arte import ( ArteTVIE, ArteTVEmbedIE, ArteTVPlaylistIE, + ArteTVCategoryIE, ) from .arnes import ArnesIE from .asiancrush import ( @@ -108,12 +122,16 @@ from .awaan import ( ) from .azmedien import AZMedienIE from .baidu import BaiduVideoIE +from .banbye import ( + BanByeIE, + BanByeChannelIE, +) from .bandaichannel import BandaiChannelIE from .bandcamp import ( BandcampIE, BandcampAlbumIE, BandcampWeeklyIE, - BandcampMusicIE, + BandcampUserIE, ) from .bannedvideo import BannedVideoIE from .bbc import ( @@ -137,6 +155,7 @@ from .bfmtv import ( ) from .bibeltv import BibelTVIE from .bigflix import BigflixIE +from .bigo import BigoIE from .bild import BildIE from .bilibili import ( BiliBiliIE, @@ -165,6 +184,7 @@ from .bleacherreport import ( BleacherReportIE, BleacherReportCMSIE, ) +from .blogger import BloggerIE from .bloomberg import BloombergIE from .bokecc import BokeCCIE from .bongacams import BongaCamsIE @@ -177,6 +197,7 @@ from .br import ( ) from .bravotv import BravoTVIE from .breakcom import BreakIE +from .breitbart import BreitBartIE from .brightcove import ( BrightcoveLegacyIE, BrightcoveNewIE, @@ -185,6 +206,9 @@ from .businessinsider import BusinessInsiderIE from .buzzfeed import BuzzFeedIE from .byutv import BYUtvIE from .c56 import C56IE +from .cableav import CableAVIE +from .callin import CallinIE +from .caltrans import CaltransIE from .cam4 import CAM4IE from .camdemy import ( CamdemyIE, @@ -192,6 +216,7 @@ from .camdemy import ( ) from .cammodels import CamModelsIE from .camwithher import CamWithHerIE +from .canalalpha import CanalAlphaIE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE from .canvas import ( @@ -235,10 +260,7 @@ from .ccc import ( from .ccma import CCMAIE from .cctv import CCTVIE from .cda import CDAIE -from .ceskatelevize import ( - CeskaTelevizeIE, - CeskaTelevizePoradyIE, -) +from .ceskatelevize import CeskaTelevizeIE from .cgtn import CGTNIE from .channel9 import Channel9IE from .charlierose import CharlieRoseIE @@ -293,25 +315,41 @@ from .commonprotocols import ( from .condenast import CondeNastIE from .contv import CONtvIE from .corus import CorusIE +from .cpac import ( + CPACIE, + CPACPlaylistIE, +) +from .cozytv import CozyTVIE from .cracked import CrackedIE from .crackle import CrackleIE +from .craftsy import CraftsyIE from .crooksandliars import CrooksAndLiarsIE +from .crowdbunker import ( + CrowdBunkerIE, + CrowdBunkerChannelIE, +) from .crunchyroll import ( CrunchyrollIE, CrunchyrollShowPlaylistIE, CrunchyrollBetaIE, CrunchyrollBetaShowIE, ) -from .cspan import CSpanIE +from .cspan import CSpanIE, CSpanCongressIE from .ctsnews import CtsNewsIE from .ctv import CTVIE from .ctvnews import CTVNewsIE from .cultureunplugged import CultureUnpluggedIE from .curiositystream import ( CuriosityStreamIE, - CuriosityStreamCollectionIE, + CuriosityStreamCollectionsIE, + CuriosityStreamSeriesIE, ) from .cwtv import CWTVIE 
+from .cybrary import ( + CybraryIE, + CybraryCourseIE +) +from .daftsex import DaftsexIE from .dailymail import DailyMailIE from .dailymotion import ( DailymotionIE, @@ -328,6 +366,7 @@ from .daum import ( DaumPlaylistIE, DaumUserIE, ) +from .daystar import DaystarClipIE from .dbtv import DBTVIE from .dctp import DctpTvIE from .deezer import ( @@ -338,10 +377,6 @@ from .democracynow import DemocracynowIE from .dfb import DFBIE from .dhm import DHMIE from .digg import DiggIE -from .discoveryplusindia import ( - DiscoveryPlusIndiaIE, - DiscoveryPlusIndiaShowIE, -) from .dotsub import DotsubIE from .douyutv import ( DouyuShowIE, @@ -351,9 +386,24 @@ from .dplay import ( DPlayIE, DiscoveryPlusIE, HGTVDeIE, + GoDiscoveryIE, + TravelChannelIE, + CookingChannelIE, + HGTVUsaIE, + FoodNetworkIE, + InvestigationDiscoveryIE, + DestinationAmericaIE, + AmHistoryChannelIE, ScienceChannelIE, DIYNetworkIE, - AnimalPlanetIE + DiscoveryLifeIE, + AnimalPlanetIE, + TLCIE, + DiscoveryPlusIndiaIE, + DiscoveryNetworksDeIE, + DiscoveryPlusItalyIE, + DiscoveryPlusItalyShowIE, + DiscoveryPlusIndiaShowIE, ) from .dreisat import DreiSatIE from .drbonanza import DRBonanzaIE @@ -370,17 +420,16 @@ from .duboku import ( ) from .dumpert import DumpertIE from .defense import DefenseGouvFrIE +from .digitalconcerthall import DigitalConcertHallIE from .discovery import DiscoveryIE -from .discoverygo import ( - DiscoveryGoIE, - DiscoveryGoPlaylistIE, -) -from .discoverynetworks import DiscoveryNetworksDeIE -from .discoveryvr import DiscoveryVRIE from .disney import DisneyIE from .dispeak import DigitallySpeakingIE from .doodstream import DoodStreamIE from .dropbox import DropboxIE +from .dropout import ( + DropoutSeasonIE, + DropoutIE +) from .dw import ( DWIE, DWArticleIE, @@ -414,14 +463,21 @@ from .eroprofile import ( EroProfileIE, EroProfileAlbumIE, ) +from .ertgr import ( + ERTFlixCodenameIE, + ERTFlixIE, + ERTWebtvEmbedIE, +) from .escapist import EscapistIE from .espn import ( ESPNIE, ESPNArticleIE, FiveThirtyEightIE, + ESPNCricInfoIE, ) from .esri import EsriVideoIE from .europa import EuropaIE +from .europeantour import EuropeanTourIE from .euscreen import EUScreenIE from .expotv import ExpoTVIE from .expressen import ExpressenIE @@ -430,6 +486,7 @@ from .eyedotv import EyedoTVIE from .facebook import ( FacebookIE, FacebookPluginsVideoIE, + FacebookRedirectURLIE, ) from .fancode import ( FancodeVodIE, @@ -440,6 +497,7 @@ from .faz import FazIE from .fc2 import ( FC2IE, FC2EmbedIE, + FC2LiveIE, ) from .fczenit import FczenitIE from .filmmodu import FilmmoduIE @@ -449,7 +507,6 @@ from .filmon import ( ) from .filmweb import FilmwebIE from .firsttv import FirstTVIE -from .fivemin import FiveMinIE from .fivetv import FiveTVIE from .flickr import FlickrIE from .folketinget import FolketingetIE @@ -472,6 +529,7 @@ from .foxnews import ( FoxNewsArticleIE, ) from .foxsports import FoxSportsIE +from .fptplay import FptplayIE from .franceculture import FranceCultureIE from .franceinter import FranceInterIE from .francetv import ( @@ -481,7 +539,6 @@ from .francetv import ( ) from .freesound import FreesoundIE from .freespeech import FreespeechIE -from .freshlive import FreshLiveIE from .frontendmasters import ( FrontendMastersIE, FrontendMastersLessonIE, @@ -495,9 +552,20 @@ from .funimation import ( ) from .funk import FunkIE from .fusion import FusionIE -from .gab import GabTVIE +from .gab import ( + GabTVIE, + GabIE, +) from .gaia import GaiaIE from .gameinformer import GameInformerIE +from .gamejolt import ( + GameJoltIE, 
+ GameJoltUserIE, + GameJoltGameIE, + GameJoltGameSoundtrackIE, + GameJoltCommunityIE, + GameJoltSearchIE, +) from .gamespot import GameSpotIE from .gamestar import GameStarIE from .gaskrank import GaskrankIE @@ -505,7 +573,10 @@ from .gazeta import GazetaIE from .gdcvault import GDCVaultIE from .gedidigital import GediDigitalIE from .generic import GenericIE -from .gettr import GettrIE +from .gettr import ( + GettrIE, + GettrStreamingIE, +) from .gfycat import GfycatIE from .giantbomb import GiantBombIE from .giga import GigaIE @@ -516,6 +587,7 @@ from .globo import ( ) from .go import GoIE from .godtube import GodTubeIE +from .gofile import GofileIE from .golem import GolemIE from .googledrive import GoogleDriveIE from .googlepodcasts import ( @@ -541,7 +613,6 @@ from .hidive import HiDiveIE from .historicfilms import HistoricFilmsIE from .hitbox import HitboxIE, HitboxLiveIE from .hitrecord import HitRecordIE -from .hornbunny import HornBunnyIE from .hotnewhiphop import HotNewHipHopIE from .hotstar import ( HotStarIE, @@ -555,7 +626,12 @@ from .hrti import ( HRTiIE, HRTiPlaylistIE, ) +from .hse import ( + HSEShowIE, + HSEProductIE, +) from .huajiao import HuajiaoIE +from .huya import HuyaLiveIE from .huffpost import HuffPostIE from .hungama import ( HungamaIE, @@ -591,14 +667,28 @@ from .indavideo import IndavideoEmbedIE from .infoq import InfoQIE from .instagram import ( InstagramIE, + InstagramIOSIE, InstagramUserIE, InstagramTagIE, + InstagramStoryIE, ) from .internazionale import InternazionaleIE from .internetvideoarchive import InternetVideoArchiveIE -from .iprima import IPrimaIE -from .iqiyi import IqiyiIE -from .ir90tv import Ir90TvIE +from .iprima import ( + IPrimaIE, + IPrimaCNNIE +) +from .iqiyi import ( + IqiyiIE, + IqIE, + IqAlbumIE +) + +from .itprotv import ( + ITProTVIE, + ITProTVCourseIE +) + from .itv import ( ITVIE, ITVBTCCIE, @@ -620,10 +710,10 @@ from .joj import JojIE from .jwplatform import JWPlatformIE from .kakao import KakaoIE from .kaltura import KalturaIE -from .kankan import KankanIE from .karaoketv import KaraoketvIE from .karrierevideos import KarriereVideosIE from .keezmovies import KeezMoviesIE +from .kelbyone import KelbyOneIE from .ketnet import KetnetIE from .khanacademy import ( KhanAcademyIE, @@ -656,6 +746,11 @@ from .laola1tv import ( EHFTVIE, ITTFIE, ) +from .lastfm import ( + LastFMIE, + LastFMPlaylistIE, + LastFMUserIE, +) from .lbry import ( LBRYIE, LBRYChannelIE, @@ -691,11 +786,11 @@ from .limelight import ( LimelightChannelListIE, ) from .line import ( - LineTVIE, LineLiveIE, LineLiveChannelIE, ) from .linkedin import ( + LinkedInIE, LinkedInLearningIE, LinkedInLearningCourseIE, ) @@ -707,7 +802,10 @@ from .livestream import ( LivestreamOriginalIE, LivestreamShortenerIE, ) -from .lnkgo import LnkGoIE +from .lnkgo import ( + LnkGoIE, + LnkIE, +) from .localnews8 import LocalNews8IE from .lovehomeporn import LoveHomePornIE from .lrt import LRTIE @@ -722,6 +820,7 @@ from .mailru import ( MailRuMusicIE, MailRuMusicSearchIE, ) +from .mainstreaming import MainStreamingIE from .malltv import MallTVIE from .mangomolo import ( MangomoloVideoIE, @@ -744,7 +843,10 @@ from .mdr import MDRIE from .medaltv import MedalTVIE from .mediaite import MediaiteIE from .mediaklikk import MediaKlikkIE -from .mediaset import MediasetIE +from .mediaset import ( + MediasetIE, + MediasetShowIE, +) from .mediasite import ( MediasiteIE, MediasiteCatalogIE, @@ -760,6 +862,7 @@ from .metacritic import MetacriticIE from .mgoon import MgoonIE from .mgtv import MGTVIE from 
.miaopai import MiaoPaiIE +from .microsoftstream import MicrosoftStreamIE from .microsoftvirtualacademy import ( MicrosoftVirtualAcademyIE, MicrosoftVirtualAcademyCourseIE, @@ -767,6 +870,7 @@ from .microsoftvirtualacademy import ( from .mildom import ( MildomIE, MildomVodIE, + MildomClipIE, MildomUserVodIE, ) from .minds import ( @@ -783,6 +887,10 @@ from .mirrativ import ( ) from .mit import TechTVMITIE, OCWMITIE from .mitele import MiTeleIE +from .mixch import ( + MixchIE, + MixchArchiveIE, +) from .mixcloud import ( MixcloudIE, MixcloudUserIE, @@ -792,6 +900,7 @@ from .mlb import ( MLBIE, MLBVideoIE, ) +from .mlssoccer import MLSSoccerIE from .mnet import MnetIE from .moevideo import MoeVideoIE from .mofosex import ( @@ -819,7 +928,14 @@ from .mtv import ( MTVItaliaProgrammaIE, ) from .muenchentv import MuenchenTVIE +from .murrtube import MurrtubeIE, MurrtubeUserIE from .musescore import MuseScoreIE +from .musicdex import ( + MusicdexSongIE, + MusicdexAlbumIE, + MusicdexArtistIE, + MusicdexPlaylistIE, +) from .mwave import MwaveIE, MwaveMeetGreetIE from .mxplayer import ( MxplayerIE, @@ -834,7 +950,14 @@ from .myvi import ( ) from .myvideoge import MyVideoGeIE from .myvidster import MyVidsterIE -from .n1 import N1InfoIIE, N1InfoAssetIE +from .n1 import ( + N1InfoAssetIE, + N1InfoIIE, +) +from .nate import ( + NateIE, + NateProgramIE, +) from .nationalgeographic import ( NationalGeographicVideoIE, NationalGeographicTVIE, @@ -868,7 +991,10 @@ from .ndr import ( NJoyEmbedIE, ) from .ndtv import NDTVIE -from .nebula import NebulaIE +from .nebula import ( + NebulaIE, + NebulaCollectionIE, +) from .nerdcubed import NerdCubedFeedIE from .netzkino import NetzkinoIE from .neteasemusic import ( @@ -886,6 +1012,7 @@ from .newgrounds import ( NewgroundsUserIE, ) from .newstube import NewstubeIE +from .newsy import NewsyIE from .nextmedia import ( NextMediaIE, NextMediaActionNewsIE, @@ -896,6 +1023,7 @@ from .nexx import ( NexxIE, NexxEmbedIE, ) +from .nfb import NFBIE from .nfhsnetwork import NFHSNetworkIE from .nfl import ( NFLIE, @@ -904,6 +1032,9 @@ from .nfl import ( from .nhk import ( NhkVodIE, NhkVodProgramIE, + NhkForSchoolBangumiIE, + NhkForSchoolSubjectIE, + NhkForSchoolProgramListIE, ) from .nhl import NHLIE from .nick import ( @@ -913,16 +1044,21 @@ from .nick import ( NickNightIE, NickRuIE, ) - from .niconico import ( NiconicoIE, NiconicoPlaylistIE, NiconicoUserIE, + NiconicoSeriesIE, + NiconicoHistoryIE, NicovideoSearchDateIE, NicovideoSearchIE, NicovideoSearchURLIE, + NicovideoTagURLIE, +) +from .ninecninemedia import ( + NineCNineMediaIE, + CPTwentyFourIE, ) -from .ninecninemedia import NineCNineMediaIE from .ninegag import NineGagIE from .ninenow import NineNowIE from .nintendo import NintendoIE @@ -930,6 +1066,7 @@ from .nitter import NitterIE from .njpwworld import NJPWWorldIE from .nobelprize import NobelPrizeIE from .nonktube import NonkTubeIE +from .noodlemagazine import NoodleMagazineIE from .noovo import NoovoIE from .normalboots import NormalbootsIE from .nosvideo import NosVideoIE @@ -986,6 +1123,7 @@ from .oktoberfesttv import OktoberfestTVIE from .olympics import OlympicsReplayIE from .on24 import On24IE from .ondemandkorea import OnDemandKoreaIE +from .onefootball import OneFootballIE from .onet import ( OnetIE, OnetChannelIE, @@ -997,9 +1135,14 @@ from .ooyala import ( OoyalaIE, OoyalaExternalIE, ) +from .opencast import ( + OpencastIE, + OpencastPlaylistIE, +) from .openrec import ( OpenRecIE, OpenRecCaptureIE, + OpenRecMovieIE, ) from .ora import OraTVIE from .orf 
import ( @@ -1030,6 +1173,11 @@ from .palcomp3 import ( PalcoMP3VideoIE, ) from .pandoratv import PandoraTVIE +from .panopto import ( + PanoptoIE, + PanoptoListIE, + PanoptoPlaylistIE +) from .paramountplus import ( ParamountPlusIE, ParamountPlusSeriesIE, @@ -1042,10 +1190,12 @@ from .patreon import ( ) from .pbs import PBSIE from .pearvideo import PearVideoIE +from .peekvids import PeekVidsIE, PlayVidsIE from .peertube import ( PeerTubeIE, PeerTubePlaylistIE, ) +from .peertv import PeerTVIE from .peloton import ( PelotonIE, PelotonLiveIE @@ -1059,6 +1209,7 @@ from .periscope import ( from .philharmoniedeparis import PhilharmonieDeParisIE from .phoenix import PhoenixIE from .photobucket import PhotobucketIE +from .piapro import PiaproIE from .picarto import ( PicartoIE, PicartoVodIE, @@ -1069,7 +1220,12 @@ from .pinterest import ( PinterestIE, PinterestCollectionIE, ) +from .pixivsketch import ( + PixivSketchIE, + PixivSketchUserIE, +) from .pladform import PladformIE +from .planetmarathi import PlanetMarathiIE from .platzi import ( PlatziIE, PlatziCourseIE, @@ -1090,10 +1246,20 @@ from .podomatic import PodomaticIE from .pokemon import ( PokemonIE, PokemonWatchIE, + PokemonSoundLibraryIE, ) +from .pokergo import ( + PokerGoIE, + PokerGoCollectionIE, +) +from .polsatgo import PolsatGoIE from .polskieradio import ( PolskieRadioIE, PolskieRadioCategoryIE, + PolskieRadioPlayerIE, + PolskieRadioPodcastIE, + PolskieRadioPodcastListIE, + PolskieRadioRadioKierowcowIE, ) from .popcorntimes import PopcorntimesIE from .popcorntv import PopcornTVIE @@ -1111,6 +1277,7 @@ from .pornhub import ( from .pornotube import PornotubeIE from .pornovoisines import PornoVoisinesIE from .pornoxo import PornoXOIE +from .pornez import PornezIE from .puhutv import ( PuhuTVIE, PuhuTVSerieIE, @@ -1118,6 +1285,13 @@ from .puhutv import ( from .presstv import PressTVIE from .projectveritas import ProjectVeritasIE from .prosiebensat1 import ProSiebenSat1IE +from .prx import ( + PRXStoryIE, + PRXSeriesIE, + PRXAccountIE, + PRXStoriesSearchIE, + PRXSeriesSearchIE +) from .puls4 import Puls4IE from .pyvideo import PyvideoIE from .qqmusic import ( @@ -1140,6 +1314,11 @@ from .radiode import RadioDeIE from .radiojavan import RadioJavanIE from .radiobremen import RadioBremenIE from .radiofrance import RadioFranceIE +from .radiozet import RadioZetPodcastIE +from .radiokapital import ( + RadioKapitalIE, + RadioKapitalShowIE, +) from .radlive import ( RadLiveIE, RadLiveChannelIE, @@ -1149,6 +1328,9 @@ from .rai import ( RaiPlayIE, RaiPlayLiveIE, RaiPlayPlaylistIE, + RaiPlaySoundIE, + RaiPlaySoundLiveIE, + RaiPlaySoundPlaylistIE, RaiIE, ) from .raywenderlich import ( @@ -1173,9 +1355,11 @@ from .redbulltv import ( RedBullTVRrnContentIE, RedBullIE, ) -from .reddit import ( - RedditIE, - RedditRIE, +from .reddit import RedditIE +from .redgifs import ( + RedGifsIE, + RedGifsSearchIE, + RedGifsUserIE, ) from .redtube import RedTubeIE from .regiotv import RegioTVIE @@ -1188,11 +1372,14 @@ from .reuters import ReutersIE from .reverbnation import ReverbNationIE from .rice import RICEIE from .rmcdecouverte import RMCDecouverteIE -from .ro220 import Ro220IE from .rockstargames import RockstarGamesIE -from .roosterteeth import RoosterTeethIE +from .rokfin import ( + RokfinIE, + RokfinStackIE, + RokfinChannelIE, +) +from .roosterteeth import RoosterTeethIE, RoosterTeethSeriesIE from .rottentomatoes import RottenTomatoesIE -from .roxwel import RoxwelIE from .rozhlas import RozhlasIE from .rtbf import RTBFIE from .rte import RteIE, RteRadioIE 
@@ -1202,12 +1389,26 @@ from .rtl2 import ( RTL2YouIE, RTL2YouSeriesIE, ) +from .rtnews import ( + RTNewsIE, + RTDocumentryIE, + RTDocumentryPlaylistIE, + RuptlyIE, +) from .rtp import RTPIE +from .rtrfm import RTRFMIE from .rts import RTSIE -from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE, RTVELiveIE, RTVETelevisionIE +from .rtve import ( + RTVEALaCartaIE, + RTVEAudioIE, + RTVELiveIE, + RTVEInfantilIE, + RTVETelevisionIE, +) from .rtvnh import RTVNHIE from .rtvs import RTVSIE from .ruhd import RUHDIE +from .rule34video import Rule34VideoIE from .rumble import ( RumbleEmbedIE, RumbleChannelIE, @@ -1219,10 +1420,27 @@ from .rutube import ( RutubeMovieIE, RutubePersonIE, RutubePlaylistIE, + RutubeTagsIE, +) +from .glomex import ( + GlomexIE, + GlomexEmbedIE, +) +from .megatvcom import ( + MegaTVComIE, + MegaTVComEmbedIE, +) +from .ant1newsgr import ( + Ant1NewsGrWatchIE, + Ant1NewsGrArticleIE, + Ant1NewsGrEmbedIE, ) from .rutv import RUTVIE from .ruutu import RuutuIE -from .ruv import RuvIE +from .ruv import ( + RuvIE, + RuvSpilaIE +) from .safari import ( SafariIE, SafariApiIE, @@ -1244,7 +1462,7 @@ from .scte import ( SCTECourseIE, ) from .seeker import SeekerIE -from .senateisvp import SenateISVPIE +from .senategov import SenateISVPIE, SenateGovIE from .sendtonews import SendtoNewsIE from .servus import ServusIE from .sevenplus import SevenPlusIE @@ -1270,6 +1488,7 @@ from .simplecast import ( ) from .sina import SinaIE from .sixplay import SixPlayIE +from .skeb import SkebIE from .skyit import ( SkyItPlayerIE, SkyItVideoIE, @@ -1288,6 +1507,7 @@ from .skynewsarabia import ( from .skynewsau import SkyNewsAUIE from .sky import ( SkyNewsIE, + SkyNewsStoryIE, SkySportsIE, SkySportsNewsIE, ) @@ -1304,6 +1524,7 @@ from .soundcloud import ( SoundcloudEmbedIE, SoundcloudIE, SoundcloudSetIE, + SoundcloudRelatedIE, SoundcloudUserIE, SoundcloudTrackStationIE, SoundcloudPlaylistIE, @@ -1370,8 +1591,10 @@ from .streamable import StreamableIE from .streamanity import StreamanityIE from .streamcloud import StreamcloudIE from .streamcz import StreamCZIE +from .streamff import StreamFFIE from .streetvoice import StreetVoiceIE from .stretchinternet import StretchInternetIE +from .stripchat import StripchatIE from .stv import STVPlayerIE from .sunporno import SunPornoIE from .sverigesradio import ( @@ -1387,10 +1610,7 @@ from .svt import ( from .swrmediathek import SWRMediathekIE from .syfy import SyfyIE from .sztvhu import SztvHuIE -from .tagesschau import ( - TagesschauPlayerIE, - TagesschauIE, -) +from .tagesschau import TagesschauIE from .tass import TassIE from .tbs import TBSIE from .tdslifeway import TDSLifewayIE @@ -1406,12 +1626,18 @@ from .teachingchannel import TeachingChannelIE from .teamcoco import TeamcocoIE from .teamtreehouse import TeamTreeHouseIE from .techtalks import TechTalksIE -from .ted import TEDIE +from .ted import ( + TedEmbedIE, + TedPlaylistIE, + TedSeriesIE, + TedTalkIE, +) from .tele5 import Tele5IE from .tele13 import Tele13IE from .telebruxelles import TeleBruxellesIE from .telecinco import TelecincoIE from .telegraaf import TelegraafIE +from .telegram import TelegramEmbedIE from .telemb import TeleMBIE from .telemundo import TelemundoIE from .telequebec import ( @@ -1433,7 +1659,6 @@ from .theplatform import ( ThePlatformIE, ThePlatformFeedIE, ) -from .thescene import TheSceneIE from .thestar import TheStarIE from .thesun import TheSunIE from .theta import ( @@ -1444,10 +1669,18 @@ from .theweatherchannel import TheWeatherChannelIE from .thisamericanlife import 
ThisAmericanLifeIE from .thisav import ThisAVIE from .thisoldhouse import ThisOldHouseIE +from .threespeak import ( + ThreeSpeakIE, + ThreeSpeakUserIE, +) from .threeqsdn import ThreeQSDNIE from .tiktok import ( TikTokIE, TikTokUserIE, + TikTokSoundIE, + TikTokEffectIE, + TikTokTagIE, + TikTokVMIE, DouyinIE, ) from .tinypic import TinyPicIE @@ -1462,6 +1695,9 @@ from .toggle import ( ToggleIE, MeWatchIE, ) +from .toggo import ( + ToggoIE, +) from .tokentube import ( TokentubeIE, TokentubeChannelIE @@ -1478,6 +1714,7 @@ from .trovo import ( TrovoChannelVodIE, TrovoChannelClipIE, ) +from .trueid import TrueIDIE from .trunews import TruNewsIE from .trutv import TruTVIE from .tube8 import Tube8IE @@ -1541,9 +1778,14 @@ from .tvnow import ( TVNowAnnualIE, TVNowShowIE, ) +from .tvopengr import ( + TVOpenGrWatchIE, + TVOpenGrEmbedIE, +) from .tvp import ( TVPEmbedIE, TVPIE, + TVPStreamIE, TVPWebsiteIE, ) from .tvplay import ( @@ -1593,6 +1835,7 @@ from .dlive import ( DLiveVODIE, DLiveStreamIE, ) +from .drooble import DroobleIE from .umg import UMGDeIE from .unistra import UnistraIE from .unity import UnityIE @@ -1635,6 +1878,10 @@ from .vice import ( from .vidbit import VidbitIE from .viddler import ViddlerIE from .videa import VideaIE +from .videocampus_sachsen import ( + VideocampusSachsenIE, + VideocampusSachsenEmbedIE, +) from .videodetective import VideoDetectiveIE from .videofyme import VideofyMeIE from .videomore import ( @@ -1667,6 +1914,10 @@ from .vimeo import ( VimeoWatchLaterIE, VHXEmbedIE, ) +from .vimm import ( + VimmIE, + VimmRecordingIE, +) from .vimple import VimpleIE from .vine import ( VineIE, @@ -1717,7 +1968,6 @@ from .vrv import ( from .vshare import VShareIE from .vtm import VTMIE from .medialaan import MedialaanIE -from .vube import VubeIE from .vuclip import VuClipIE from .vupload import VuploadIE from .vvvvid import ( @@ -1732,6 +1982,11 @@ from .washingtonpost import ( WashingtonPostIE, WashingtonPostArticleIE, ) +from .wasdtv import ( + WASDTVStreamIE, + WASDTVRecordIE, + WASDTVClipIE, +) from .wat import WatIE from .watchbox import WatchBoxIE from .watchindianporn import WatchIndianPornIE @@ -1754,6 +2009,7 @@ from .weibo import ( WeiboMobileIE ) from .weiqitv import WeiqiTVIE +from .willow import WillowIE from .wimtv import WimTVIE from .whowatch import WhoWatchIE from .wistia import ( @@ -1761,6 +2017,10 @@ from .wistia import ( WistiaPlaylistIE, ) from .worldstarhiphop import WorldStarHipHopIE +from .wppilot import ( + WPPilotIE, + WPPilotChannelsIE, +) from .wsj import ( WSJIE, WSJArticleIE, @@ -1784,6 +2044,7 @@ from .ximalaya import ( XimalayaIE, XimalayaAlbumIE ) +from .xinpianchang import XinpianchangIE from .xminus import XMinusIE from .xnxx import XNXXIE from .xstream import XstreamIE @@ -1808,6 +2069,7 @@ from .yandexmusic import ( ) from .yandexvideo import ( YandexVideoIE, + YandexVideoPreviewIE, ZenYandexIE, ZenYandexChannelIE, ) @@ -1834,11 +2096,13 @@ from .youtube import ( YoutubeFavouritesIE, YoutubeHistoryIE, YoutubeTabIE, + YoutubeLivestreamEmbedIE, YoutubePlaylistIE, YoutubeRecommendedIE, YoutubeSearchDateIE, YoutubeSearchIE, YoutubeSearchURLIE, + YoutubeMusicSearchURLIE, YoutubeSubscriptionsIE, YoutubeTruncatedIDIE, YoutubeTruncatedURLIE, diff --git a/hypervideo_dl/extractor/facebook.py b/hypervideo_dl/extractor/facebook.py index f32700f..022ea85 100644 --- a/hypervideo_dl/extractor/facebook.py +++ b/hypervideo_dl/extractor/facebook.py @@ -13,21 +13,26 @@ from ..compat import ( ) from ..utils import ( clean_html, + determine_ext, 
error_to_compat_str, ExtractorError, float_or_none, get_element_by_id, + get_first, int_or_none, js_to_json, - limit_length, merge_dicts, network_exceptions, parse_count, + parse_qs, qualities, sanitized_Request, + traverse_obj, try_get, + url_or_none, urlencode_postdata, urljoin, + variadic, ) @@ -161,7 +166,7 @@ class FacebookIE(InfoExtractor): 'info_dict': { 'id': '1417995061575415', 'ext': 'mp4', - 'title': 'Yaroslav Korpan - Довгоочікуване відео', + 'title': 'Ukrainian Scientists Worldwide | Довгоочікуване відео', 'description': 'Довгоочікуване відео', 'timestamp': 1486648771, 'upload_date': '20170209', @@ -192,8 +197,8 @@ class FacebookIE(InfoExtractor): 'info_dict': { 'id': '202882990186699', 'ext': 'mp4', - 'title': 'Elisabeth Ahtn - Hello? Yes your uber ride is here\n* Jukin...', - 'description': 'Hello? Yes your uber ride is here\n* Jukin Media Verified *\nFind this video and others like it by visiting...', + 'title': 'birb (O v O") | Hello? Yes your uber ride is here', + 'description': 'Hello? Yes your uber ride is here * Jukin Media Verified * Find this video and others like it by visiting...', 'timestamp': 1486035513, 'upload_date': '20170202', 'uploader': 'Elisabeth Ahtn', @@ -324,11 +329,7 @@ class FacebookIE(InfoExtractor): urls.append(mobj.group('url')) return urls - def _login(self): - useremail, password = self._get_login_info() - if useremail is None: - return - + def _perform_login(self, username, password): login_page_req = sanitized_Request(self._LOGIN_URL) self._set_cookie('facebook.com', 'locale', 'en_US') login_page = self._download_webpage(login_page_req, None, @@ -340,7 +341,7 @@ class FacebookIE(InfoExtractor): lgnrnd = self._search_regex(r'name="lgnrnd" value="([^"]*?)"', login_page, 'lgnrnd') login_form = { - 'email': useremail, + 'email': username, 'pass': password, 'lsd': lsd, 'lgnrnd': lgnrnd, @@ -387,36 +388,36 @@ class FacebookIE(InfoExtractor): self.report_warning('unable to log in: %s' % error_to_compat_str(err)) return - def _real_initialize(self): - self._login() - def _extract_from_url(self, url, video_id): webpage = self._download_webpage( url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id) def extract_metadata(webpage): - video_title = self._html_search_regex( - r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage, - 'title', default=None) - if not video_title: - video_title = self._html_search_regex( - r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(.*?)</span>', - webpage, 'alternative title', default=None) - if not video_title: - video_title = self._html_search_meta( - ['og:title', 'twitter:title', 'description'], - webpage, 'title', default=None) - if video_title: - video_title = limit_length(video_title, 80) - else: - video_title = 'Facebook video #%s' % video_id - description = self._html_search_meta( + post_data = [self._parse_json(j, video_id, fatal=False) for j in re.findall( + r'handleWithCustomApplyEach\(\s*ScheduledApplyEach\s*,\s*(\{.+?\})\s*\);', webpage)] + post = traverse_obj(post_data, ( + ..., 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or [] + media = traverse_obj( + post, + (..., 'attachments', ..., 'media', lambda _, m: str(m['id']) == video_id and m['__typename'] == 'Video'), + expected_type=dict) + title = get_first(media, ('title', 'text')) + description = get_first(media, ('creation_story', 'comet_sections', 'message', 'story', 'message', 'text')) + uploader_data = get_first(media, 'owner') or get_first(post, ('node', 
'actors', ...)) or {} + + page_title = title or self._html_search_regex(( + r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>(?P<content>[^<]*)</h2>', + r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(?P<content>.*?)</span>', + self._meta_regex('og:title'), self._meta_regex('twitter:title'), r'<title>(?P<content>.+?)</title>' + ), webpage, 'title', default=None, group='content') + description = description or self._html_search_meta( ['description', 'og:description', 'twitter:description'], webpage, 'description', default=None) - uploader = clean_html(get_element_by_id( - 'fbPhotoPageAuthorName', webpage)) or self._search_regex( - r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader', - default=None) or self._og_search_title(webpage, fatal=False) + uploader = uploader_data.get('name') or ( + clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage)) + or self._search_regex( + (r'ownerName\s*:\s*"([^"]+)"', *self._og_regexes('title')), webpage, 'uploader', fatal=False)) + timestamp = int_or_none(self._search_regex( r'<abbr[^>]+data-utime=["\'](\d+)', webpage, 'timestamp', default=None)) @@ -431,17 +432,17 @@ class FacebookIE(InfoExtractor): r'\bviewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count', default=None)) info_dict = { - 'title': video_title, 'description': description, 'uploader': uploader, + 'uploader_id': uploader_data.get('id'), 'timestamp': timestamp, 'thumbnail': thumbnail, 'view_count': view_count, } + info_json_ld = self._search_json_ld(webpage, video_id, default={}) - if info_json_ld.get('title'): - info_json_ld['title'] = limit_length( - re.sub(r'\s*\|\s*Facebook$', '', info_json_ld['title']), 80) + info_json_ld['title'] = (re.sub(r'\s*\|\s*Facebook$', '', title or info_json_ld.get('title') or page_title or '') + or (description or '').replace('\n', ' ') or f'Facebook video #{video_id}') return merge_dicts(info_json_ld, info_dict) video_data = None @@ -508,15 +509,19 @@ class FacebookIE(InfoExtractor): def parse_graphql_video(video): formats = [] q = qualities(['sd', 'hd']) - for (suffix, format_id) in [('', 'sd'), ('_quality_hd', 'hd')]: - playable_url = video.get('playable_url' + suffix) + for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'), + ('playable_url_dash', '')): + playable_url = video.get(key) if not playable_url: continue - formats.append({ - 'format_id': format_id, - 'quality': q(format_id), - 'url': playable_url, - }) + if determine_ext(playable_url) == 'mpd': + formats.extend(self._extract_mpd_formats(playable_url, video_id)) + else: + formats.append({ + 'format_id': format_id, + 'quality': q(format_id), + 'url': playable_url, + }) extract_dash_manifest(video, formats) process_formats(formats) v_id = video.get('videoId') or video.get('id') or video_id @@ -544,22 +549,15 @@ class FacebookIE(InfoExtractor): if media.get('__typename') == 'Video': return parse_graphql_video(media) - nodes = data.get('nodes') or [] - node = data.get('node') or {} - if not nodes and node: - nodes.append(node) - for node in nodes: - story = try_get(node, lambda x: x['comet_sections']['content']['story'], dict) or {} - attachments = try_get(story, [ - lambda x: x['attached_story']['attachments'], - lambda x: x['attachments'] - ], list) or [] - for attachment in attachments: - attachment = try_get(attachment, lambda x: x['style_type_renderer']['attachment'], dict) - ns = try_get(attachment, lambda x: x['all_subattachments']['nodes'], list) or [] - for n in ns: - parse_attachment(n) - parse_attachment(attachment) + 
nodes = variadic(traverse_obj(data, 'nodes', 'node') or []) + attachments = traverse_obj(nodes, ( + ..., 'comet_sections', 'content', 'story', (None, 'attached_story'), 'attachments', + ..., ('styles', 'style_type_renderer'), 'attachment'), expected_type=dict) or [] + for attachment in attachments: + ns = try_get(attachment, lambda x: x['all_subattachments']['nodes'], list) or [] + for n in ns: + parse_attachment(n) + parse_attachment(attachment) edges = try_get(data, lambda x: x['mediaset']['currMedia']['edges'], list) or [] for edge in edges: @@ -728,6 +726,7 @@ class FacebookPluginsVideoIE(InfoExtractor): 'info_dict': { 'id': '10154383743583686', 'ext': 'mp4', + # TODO: Fix title, uploader 'title': 'What to do during the haze?', 'uploader': 'Gov.sg', 'upload_date': '20160826', @@ -746,3 +745,42 @@ class FacebookPluginsVideoIE(InfoExtractor): return self.url_result( compat_urllib_parse_unquote(self._match_id(url)), FacebookIE.ie_key()) + + +class FacebookRedirectURLIE(InfoExtractor): + IE_DESC = False # Do not list + _VALID_URL = r'https?://(?:[\w-]+\.)?facebook\.com/flx/warn[/?]' + _TESTS = [{ + 'url': 'https://www.facebook.com/flx/warn/?h=TAQHsoToz&u=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DpO8h3EaFRdo&s=1', + 'info_dict': { + 'id': 'pO8h3EaFRdo', + 'ext': 'mp4', + 'title': 'Tripeo Boiler Room x Dekmantel Festival DJ Set', + 'description': 'md5:2d713ccbb45b686a1888397b2c77ca6b', + 'channel_id': 'UCGBpxWJr9FNOcFYA5GkKrMg', + 'playable_in_embed': True, + 'categories': ['Music'], + 'channel': 'Boiler Room', + 'uploader_id': 'brtvofficial', + 'uploader': 'Boiler Room', + 'tags': 'count:11', + 'duration': 3332, + 'live_status': 'not_live', + 'thumbnail': 'https://i.ytimg.com/vi/pO8h3EaFRdo/maxresdefault.jpg', + 'channel_url': 'https://www.youtube.com/channel/UCGBpxWJr9FNOcFYA5GkKrMg', + 'availability': 'public', + 'uploader_url': 'http://www.youtube.com/user/brtvofficial', + 'upload_date': '20150917', + 'age_limit': 0, + 'view_count': int, + 'like_count': int, + }, + 'add_ie': ['Youtube'], + 'params': {'skip_download': 'Youtube'}, + }] + + def _real_extract(self, url): + redirect_url = url_or_none(parse_qs(url).get('u', [None])[-1]) + if not redirect_url: + raise ExtractorError('Invalid facebook redirect URL', expected=True) + return self.url_result(redirect_url) diff --git a/hypervideo_dl/extractor/fancode.py b/hypervideo_dl/extractor/fancode.py index 912feb7..7ea16c6 100644 --- a/hypervideo_dl/extractor/fancode.py +++ b/hypervideo_dl/extractor/fancode.py @@ -21,7 +21,6 @@ class FancodeVodIE(InfoExtractor): 'url': 'https://fancode.com/video/15043/match-preview-pbks-vs-mi', 'params': { 'skip_download': True, - 'format': 'bestvideo' }, 'info_dict': { 'id': '6249806281001', @@ -42,7 +41,7 @@ class FancodeVodIE(InfoExtractor): _ACCESS_TOKEN = None _NETRC_MACHINE = 'fancode' - _LOGIN_HINT = 'Use "--user refresh --password <refresh_token>" to login using a refresh token' + _LOGIN_HINT = 'Use "--username refresh --password <refresh_token>" to login using a refresh token' headers = { 'content-type': 'application/json', @@ -50,30 +49,26 @@ class FancodeVodIE(InfoExtractor): 'referer': 'https://fancode.com', } - def _login(self): + def _perform_login(self, username, password): # Access tokens are shortlived, so get them using the refresh token. 
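+ # Note: the literal username 'refresh' selects refresh-token login; _perform_login is presumably only invoked by the base extractor once credentials are supplied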
- username, password = self._get_login_info() - if username == 'refresh' and password is not None: - self.report_login() - data = '''{ - "query":"mutation RefreshToken($refreshToken: String\\u0021) { refreshToken(refreshToken: $refreshToken) { accessToken }}", - "variables":{ - "refreshToken":"%s" - }, - "operationName":"RefreshToken" - }''' % password - - token_json = self.download_gql('refresh token', data, "Getting the Access token") - self._ACCESS_TOKEN = try_get(token_json, lambda x: x['data']['refreshToken']['accessToken']) - if self._ACCESS_TOKEN is None: - self.report_warning('Failed to get Access token') - else: - self.headers.update({'Authorization': 'Bearer %s' % self._ACCESS_TOKEN}) - elif username is not None: + if username != 'refresh': self.report_warning(f'Login using username and password is not currently supported. {self._LOGIN_HINT}') - def _real_initialize(self): - self._login() + self.report_login() + data = '''{ + "query":"mutation RefreshToken($refreshToken: String\\u0021) { refreshToken(refreshToken: $refreshToken) { accessToken }}", + "variables":{ + "refreshToken":"%s" + }, + "operationName":"RefreshToken" + }''' % password + + token_json = self.download_gql('refresh token', data, "Getting the Access token") + self._ACCESS_TOKEN = try_get(token_json, lambda x: x['data']['refreshToken']['accessToken']) + if self._ACCESS_TOKEN is None: + self.report_warning('Failed to get Access token') + else: + self.headers.update({'Authorization': 'Bearer %s' % self._ACCESS_TOKEN}) def _check_login_required(self, is_available, is_premium): msg = None diff --git a/hypervideo_dl/extractor/fc2.py b/hypervideo_dl/extractor/fc2.py index 4d85e62..54a83aa 100644 --- a/hypervideo_dl/extractor/fc2.py +++ b/hypervideo_dl/extractor/fc2.py @@ -1,18 +1,23 @@ # coding: utf-8 from __future__ import unicode_literals -import hashlib +import re from .common import InfoExtractor from ..compat import ( compat_parse_qs, - compat_urllib_request, - compat_urlparse, ) from ..utils import ( ExtractorError, + WebSocketsWrapper, + has_websockets, + js_to_json, sanitized_Request, + std_headers, + traverse_obj, + update_url_query, urlencode_postdata, + urljoin, ) @@ -82,41 +87,33 @@ class FC2IE(InfoExtractor): self._downloader.cookiejar.clear_session_cookies() # must clear self._login() - title = 'FC2 video %s' % video_id - thumbnail = None + title, thumbnail, description = None, None, None if webpage is not None: - title = self._og_search_title(webpage) + title = self._html_search_regex( + (r'<h2\s+class="videoCnt_title">([^<]+?)</h2>', + r'\s+href="[^"]+"\s*title="([^"]+?)"\s*rel="nofollow">\s*<img', + # there's two matches in the webpage + r'\s+href="[^"]+"\s*title="([^"]+?)"\s*rel="nofollow">\s*\1'), + webpage, + 'title', fatal=False) thumbnail = self._og_search_thumbnail(webpage) - refer = url.replace('/content/', '/a/content/') if '/a/content/' not in url else url - - mimi = hashlib.md5((video_id + '_gGddgPfeaf_gzyr').encode('utf-8')).hexdigest() - - info_url = ( - 'http://video.fc2.com/ginfo.php?mimi={1:s}&href={2:s}&v={0:s}&fversion=WIN%2011%2C6%2C602%2C180&from=2&otag=0&upid={0:s}&tk=null&'. - format(video_id, mimi, compat_urllib_request.quote(refer, safe=b'').replace('.', '%2E'))) - - info_webpage = self._download_webpage( - info_url, video_id, note='Downloading info page') - info = compat_urlparse.parse_qs(info_webpage) - - if 'err_code' in info: - # most of the time we can still download wideo even if err_code is 403 or 602 - self.report_warning( - 'Error code was: %s... 
but still trying' % info['err_code'][0]) + description = self._og_search_description(webpage, default=None) - if 'filepath' not in info: - raise ExtractorError('Cannot download file. Are you logged in?') - - video_url = info['filepath'][0] + '?mid=' + info['mid'][0] - title_info = info.get('title') - if title_info: - title = title_info[0] + vidplaylist = self._download_json( + 'https://video.fc2.com/api/v3/videoplaylist/%s?sh=1&fs=0' % video_id, video_id, + note='Downloading info page') + vid_url = traverse_obj(vidplaylist, ('playlist', 'nq')) + if not vid_url: + raise ExtractorError('Unable to extract video URL') + vid_url = urljoin('https://video.fc2.com/', vid_url) return { 'id': video_id, 'title': title, - 'url': video_url, - 'ext': 'flv', + 'url': vid_url, + 'ext': 'mp4', + 'protocol': 'm3u8_native', + 'description': description, 'thumbnail': thumbnail, } @@ -157,3 +154,145 @@ class FC2EmbedIE(InfoExtractor): 'title': title, 'thumbnail': thumbnail, } + + +class FC2LiveIE(InfoExtractor): + _VALID_URL = r'https?://live\.fc2\.com/(?P<id>\d+)' + IE_NAME = 'fc2:live' + + _TESTS = [{ + 'url': 'https://live.fc2.com/57892267/', + 'info_dict': { + 'id': '57892267', + 'title': 'どこまで・・・', + 'uploader': 'あつあげ', + 'uploader_id': '57892267', + 'thumbnail': r're:https?://.+fc2.+', + }, + 'skip': 'livestream', + }] + + def _real_extract(self, url): + if not has_websockets: + raise ExtractorError('websockets library is not available. Please install it.', expected=True) + video_id = self._match_id(url) + webpage = self._download_webpage('https://live.fc2.com/%s/' % video_id, video_id) + + self._set_cookie('live.fc2.com', 'js-player_size', '1') + + member_api = self._download_json( + 'https://live.fc2.com/api/memberApi.php', video_id, data=urlencode_postdata({ + 'channel': '1', + 'profile': '1', + 'user': '1', + 'streamid': video_id + }), note='Requesting member info') + + control_server = self._download_json( + 'https://live.fc2.com/api/getControlServer.php', video_id, note='Downloading ControlServer data', + data=urlencode_postdata({ + 'channel_id': video_id, + 'mode': 'play', + 'orz': '', + 'channel_version': member_api['data']['channel_data']['version'], + 'client_version': '2.1.0\n [1]', + 'client_type': 'pc', + 'client_app': 'browser_hls', + 'ipv6': '', + }), headers={'X-Requested-With': 'XMLHttpRequest'}) + self._set_cookie('live.fc2.com', 'l_ortkn', control_server['orz_raw']) + + ws_url = update_url_query(control_server['url'], {'control_token': control_server['control_token']}) + playlist_data = None + + self.to_screen('%s: Fetching HLS playlist info via WebSocket' % video_id) + ws = WebSocketsWrapper(ws_url, { + 'Cookie': str(self._get_cookies('https://live.fc2.com/'))[12:], + 'Origin': 'https://live.fc2.com', + 'Accept': '*/*', + 'User-Agent': std_headers['User-Agent'], + }) + + self.write_debug('[debug] Sending HLS server request') + + while True: + recv = ws.recv() + if not recv: + continue + data = self._parse_json(recv, video_id, fatal=False) + if not data or not isinstance(data, dict): + continue + + if data.get('name') == 'connect_complete': + break + ws.send(r'{"name":"get_hls_information","arguments":{},"id":1}') + + while True: + recv = ws.recv() + if not recv: + continue + data = self._parse_json(recv, video_id, fatal=False) + if not data or not isinstance(data, dict): + continue + if data.get('name') == '_response_' and data.get('id') == 1: + self.write_debug('[debug] Goodbye.') + playlist_data = data + break + elif self._downloader.params.get('verbose', False): + if len(recv) > 
100: + recv = recv[:100] + '...' + self.to_screen('[debug] Server said: %s' % recv) + + if not playlist_data: + raise ExtractorError('Unable to fetch HLS playlist info via WebSocket') + + formats = [] + for name, playlists in playlist_data['arguments'].items(): + if not isinstance(playlists, list): + continue + for pl in playlists: + if pl.get('status') == 0 and 'master_playlist' in pl.get('url'): + formats.extend(self._extract_m3u8_formats( + pl['url'], video_id, ext='mp4', m3u8_id=name, live=True, + headers={ + 'Origin': 'https://live.fc2.com', + 'Referer': url, + })) + + self._sort_formats(formats) + for fmt in formats: + fmt.update({ + 'protocol': 'fc2_live', + 'ws': ws, + }) + + title = self._html_search_meta(('og:title', 'twitter:title'), webpage, 'live title', fatal=False) + if not title: + title = self._html_extract_title(webpage, 'html title', fatal=False) + if title: + # remove service name in <title> + title = re.sub(r'\s+-\s+.+$', '', title) + uploader = None + if title: + match = self._search_regex(r'^(.+?)\s*\[(.+?)\]$', title, 'title and uploader', default=None, group=(1, 2)) + if match and all(match): + title, uploader = match + + live_info_view = self._search_regex(r'(?s)liveInfoView\s*:\s*({.+?}),\s*premiumStateView', webpage, 'user info', fatal=False) or None + if live_info_view: + # remove jQuery code from object literal + live_info_view = re.sub(r'\$\(.+?\)[^,]+,', '"",', live_info_view) + live_info_view = self._parse_json(js_to_json(live_info_view), video_id) + + return { + 'id': video_id, + 'title': title or traverse_obj(live_info_view, 'title'), + 'description': self._html_search_meta( + ('og:description', 'twitter:description'), + webpage, 'live description', fatal=False) or traverse_obj(live_info_view, 'info'), + 'formats': formats, + 'uploader': uploader or traverse_obj(live_info_view, 'name'), + 'uploader_id': video_id, + 'thumbnail': traverse_obj(live_info_view, 'thumb'), + 'is_live': True, + } diff --git a/hypervideo_dl/extractor/filmon.py b/hypervideo_dl/extractor/filmon.py index f775fe0..7b43ecc 100644 --- a/hypervideo_dl/extractor/filmon.py +++ b/hypervideo_dl/extractor/filmon.py @@ -170,7 +170,7 @@ class FilmOnChannelIE(InfoExtractor): return { 'id': channel_id, 'display_id': channel_data.get('alias'), - 'title': self._live_title(title) if is_live else title, + 'title': title, 'description': channel_data.get('description'), 'thumbnails': thumbnails, 'formats': formats, diff --git a/hypervideo_dl/extractor/fivetv.py b/hypervideo_dl/extractor/fivetv.py index be81fcc..d6bebd1 100644 --- a/hypervideo_dl/extractor/fivetv.py +++ b/hypervideo_dl/extractor/fivetv.py @@ -75,8 +75,7 @@ class FiveTVIE(InfoExtractor): r'<a[^>]+?href="([^"]+)"[^>]+?class="videoplayer"'], webpage, 'video url') - title = self._og_search_title(webpage, default=None) or self._search_regex( - r'<title>([^<]+)</title>', webpage, 'title') + title = self._og_search_title(webpage, default=None) or self._html_extract_title(webpage) duration = int_or_none(self._og_search_property( 'video:duration', webpage, 'duration', default=None)) diff --git a/hypervideo_dl/extractor/flickr.py b/hypervideo_dl/extractor/flickr.py index 6c82fae..2ed6c2b 100644 --- a/hypervideo_dl/extractor/flickr.py +++ b/hypervideo_dl/extractor/flickr.py @@ -7,6 +7,7 @@ from ..compat import ( ) from ..utils import ( ExtractorError, + format_field, int_or_none, qualities, ) @@ -95,7 +96,7 @@ class FlickrIE(InfoExtractor): owner = video_info.get('owner', {}) uploader_id = owner.get('nsid') uploader_path = owner.get('path_alias') 
or uploader_id - uploader_url = 'https://www.flickr.com/photos/%s/' % uploader_path if uploader_path else None + uploader_url = format_field(uploader_path, template='https://www.flickr.com/photos/%s/') return { 'id': video_id, diff --git a/hypervideo_dl/extractor/fox.py b/hypervideo_dl/extractor/fox.py index 04f4bdb..4c52b9a 100644 --- a/hypervideo_dl/extractor/fox.py +++ b/hypervideo_dl/extractor/fox.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals import json import uuid -from .adobepass import AdobePassIE +from .common import InfoExtractor from ..compat import ( compat_HTTPError, compat_str, @@ -20,7 +20,7 @@ from ..utils import ( ) -class FOXIE(AdobePassIE): +class FOXIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?fox\.com/watch/(?P<id>[\da-fA-F]+)' _TESTS = [{ # clip @@ -37,6 +37,7 @@ class FOXIE(AdobePassIE): 'creator': 'FOX', 'series': 'Gotham', 'age_limit': 14, + 'episode': 'Aftermath: Bruce Wayne Develops Into The Dark Knight' }, 'params': { 'skip_download': True, @@ -46,14 +47,15 @@ class FOXIE(AdobePassIE): 'url': 'https://www.fox.com/watch/087036ca7f33c8eb79b08152b4dd75c1/', 'only_matching': True, }, { - # episode, geo-restricted, tv provided required - 'url': 'https://www.fox.com/watch/30056b295fb57f7452aeeb4920bc3024/', + # sports event, geo-restricted + 'url': 'https://www.fox.com/watch/b057484dade738d1f373b3e46216fa2c/', 'only_matching': True, }] _GEO_BYPASS = False _HOME_PAGE_URL = 'https://www.fox.com/' - _API_KEY = 'abdcbed02c124d393b39e818a4312055' + _API_KEY = '6E9S4bmcoNnZwVLOHywOv8PJEdu76cM9' _access_token = None + _device_id = compat_str(uuid.uuid4()) def _call_api(self, path, video_id, data=None): headers = { @@ -63,7 +65,7 @@ class FOXIE(AdobePassIE): headers['Authorization'] = 'Bearer ' + self._access_token try: return self._download_json( - 'https://api2.fox.com/v2.0/' + path, + 'https://api3.fox.com/v2.0/' + path, video_id, data=data, headers=headers) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: @@ -87,16 +89,37 @@ class FOXIE(AdobePassIE): if not self._access_token: self._access_token = self._call_api( 'login', None, json.dumps({ - 'deviceId': compat_str(uuid.uuid4()), + 'deviceId': self._device_id, }).encode())['accessToken'] def _real_extract(self, url): video_id = self._match_id(url) - video = self._call_api('vodplayer/' + video_id, video_id) + self._access_token = self._call_api( + 'previewpassmvpd?device_id=%s&mvpd_id=TempPass_fbcfox_60min' % self._device_id, + video_id)['accessToken'] + + video = self._call_api('watch', video_id, data=json.dumps({ + 'capabilities': ['drm/widevine', 'fsdk/yo'], + 'deviceWidth': 1280, + 'deviceHeight': 720, + 'maxRes': '720p', + 'os': 'macos', + 'osv': '', + 'provider': { + 'freewheel': {'did': self._device_id}, + 'vdms': {'rays': ''}, + 'dmp': {'kuid': '', 'seg': ''} + }, + 'playlist': '', + 'privacy': {'us': '1---'}, + 'siteSection': '', + 'streamType': 'vod', + 'streamId': video_id}).encode('utf-8')) title = video['name'] release_url = video['url'] + try: m3u8_url = self._download_json(release_url, video_id)['playURL'] except ExtractorError as e: diff --git a/hypervideo_dl/extractor/foxgay.py b/hypervideo_dl/extractor/foxgay.py index 512a106..1c53e06 100644 --- a/hypervideo_dl/extractor/foxgay.py +++ b/hypervideo_dl/extractor/foxgay.py @@ -29,8 +29,7 @@ class FoxgayIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = remove_end(self._html_search_regex( - r'<title>([^<]+)</title>', webpage, 'title'), ' 
- Foxgay.com') + title = remove_end(self._html_extract_title(webpage), ' - Foxgay.com') description = get_element_by_id('inf_tit', webpage) # The default user-agent with foxgay cookies leads to pages without videos diff --git a/hypervideo_dl/extractor/fptplay.py b/hypervideo_dl/extractor/fptplay.py new file mode 100644 index 0000000..a34e90b --- /dev/null +++ b/hypervideo_dl/extractor/fptplay.py @@ -0,0 +1,102 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import hashlib +import time +import urllib.parse + +from .common import InfoExtractor +from ..utils import ( + join_nonempty, +) + + +class FptplayIE(InfoExtractor): + _VALID_URL = r'https?://fptplay\.vn/(?P<type>xem-video)/[^/]+\-(?P<id>\w+)(?:/tap-(?P<episode>[^/]+)?/?(?:[?#]|$)|)' + _GEO_COUNTRIES = ['VN'] + IE_NAME = 'fptplay' + IE_DESC = 'fptplay.vn' + _TESTS = [{ + 'url': 'https://fptplay.vn/xem-video/nhan-duyen-dai-nhan-xin-dung-buoc-621a123016f369ebbde55945', + 'md5': 'ca0ee9bc63446c0c3e9a90186f7d6b33', + 'info_dict': { + 'id': '621a123016f369ebbde55945', + 'ext': 'mp4', + 'title': 'Nhân Duyên Đại Nhân Xin Dừng Bước - Ms. Cupid In Love', + 'description': 'md5:23cf7d1ce0ade8e21e76ae482e6a8c6c', + }, + }, { + 'url': 'https://fptplay.vn/xem-video/ma-toi-la-dai-gia-61f3aa8a6b3b1d2e73c60eb5/tap-3', + 'md5': 'b35be968c909b3e4e1e20ca45dd261b1', + 'info_dict': { + 'id': '61f3aa8a6b3b1d2e73c60eb5', + 'ext': 'mp4', + 'title': 'Má Tôi Là Đại Gia - 3', + 'description': 'md5:ff8ba62fb6e98ef8875c42edff641d1c', + }, + }, { + 'url': 'https://fptplay.vn/xem-video/nha-co-chuyen-hi-alls-well-ends-well-1997-6218995f6af792ee370459f0', + 'only_matching': True, + }] + + def _real_extract(self, url): + type_url, video_id, episode = self._match_valid_url(url).group('type', 'id', 'episode') + webpage = self._download_webpage(url, video_id=video_id, fatal=False) + info = self._download_json(self.get_api_with_st_token(video_id, episode or 0), video_id) + formats, subtitles = self._extract_m3u8_formats_and_subtitles(info['data']['url'], video_id, 'mp4') + self._sort_formats(formats) + return { + 'id': video_id, + 'title': join_nonempty( + self._html_search_meta(('og:title', 'twitter:title'), webpage), episode, delim=' - '), + 'description': self._html_search_meta(['og:description', 'twitter:description'], webpage), + 'formats': formats, + 'subtitles': subtitles, + } + + def get_api_with_st_token(self, video_id, episode): + path = f'/api/v6.2_w/stream/vod/{video_id}/{episode}/auto_vip' + timestamp = int(time.time()) + 10800 + + t = hashlib.md5(f'WEBv6Dkdsad90dasdjlALDDDS{timestamp}{path}'.encode()).hexdigest().upper() + r = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/' + n = [int(f'0x{t[2 * o: 2 * o + 2]}', 16) for o in range(len(t) // 2)] + + def convert(e): + t = '' + n = 0 + i = [0, 0, 0] + a = [0, 0, 0, 0] + s = len(e) + c = 0 + for z in range(s, 0, -1): + if n <= 3: + i[n] = e[c] + n += 1 + c += 1 + if 3 == n: + a[0] = (252 & i[0]) >> 2 + a[1] = ((3 & i[0]) << 4) + ((240 & i[1]) >> 4) + a[2] = ((15 & i[1]) << 2) + ((192 & i[2]) >> 6) + a[3] = (63 & i[2]) + for v in range(4): + t += r[a[v]] + n = 0 + if n: + for o in range(n, 3): + i[o] = 0 + + for o in range(n + 1): + a[0] = (252 & i[0]) >> 2 + a[1] = ((3 & i[0]) << 4) + ((240 & i[1]) >> 4) + a[2] = ((15 & i[1]) << 2) + ((192 & i[2]) >> 6) + a[3] = (63 & i[2]) + t += r[a[o]] + n += 1 + while n < 3: + t += '' + n += 1 + return t + + st_token = convert(n).replace('+', '-').replace('/', '_').replace('=', '') + return 
f'https://api.fptplay.net{path}?{urllib.parse.urlencode({"st": st_token, "e": timestamp})}' diff --git a/hypervideo_dl/extractor/franceculture.py b/hypervideo_dl/extractor/franceculture.py index 14f4cb4..9dc28d8 100644 --- a/hypervideo_dl/extractor/franceculture.py +++ b/hypervideo_dl/extractor/franceculture.py @@ -1,18 +1,45 @@ # coding: utf-8 from __future__ import unicode_literals +import re from .common import InfoExtractor from ..utils import ( determine_ext, extract_attributes, int_or_none, + traverse_obj, + unified_strdate, ) class FranceCultureIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?franceculture\.fr/emissions/(?:[^/]+/)*(?P<id>[^/?#&]+)' _TESTS = [{ - 'url': 'http://www.franceculture.fr/emissions/carnet-nomade/rendez-vous-au-pays-des-geeks', + # playlist + 'url': 'https://www.franceculture.fr/emissions/serie/hasta-dente', + 'playlist_count': 12, + 'info_dict': { + 'id': 'hasta-dente', + 'title': 'Hasta Dente', + 'description': 'md5:57479af50648d14e9bb649e6b1f8f911', + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20201024', + }, + 'playlist': [{ + 'info_dict': { + 'id': '3c1c2e55-41a0-11e5-9fe0-005056a87c89', + 'ext': 'mp3', + 'title': 'Jeudi, vous avez dit bizarre ?', + 'description': 'md5:47cf1e00cc21c86b0210279996a812c6', + 'duration': 604, + 'upload_date': '20201024', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1603576680 + }, + }, + ], + }, { + 'url': 'https://www.franceculture.fr/emissions/carnet-nomade/rendez-vous-au-pays-des-geeks', 'info_dict': { 'id': 'rendez-vous-au-pays-des-geeks', 'display_id': 'rendez-vous-au-pays-des-geeks', @@ -20,9 +47,9 @@ class FranceCultureIE(InfoExtractor): 'title': 'Rendez-vous au pays des geeks', 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20140301', - 'timestamp': 1393700400, 'vcodec': 'none', - } + 'duration': 3569, + }, }, { # no thumbnail 'url': 'https://www.franceculture.fr/emissions/la-recherche-montre-en-main/la-recherche-montre-en-main-du-mercredi-10-octobre-2018', @@ -31,9 +58,54 @@ class FranceCultureIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) + info = { + 'id': display_id, + 'title': self._html_search_regex( + r'(?s)<h1[^>]*itemprop="[^"]*name[^"]*"[^>]*>(.+?)</h1>', + webpage, 'title', default=self._og_search_title(webpage)), + 'description': self._html_search_regex( + r'(?s)<div[^>]+class="excerpt"[^>]*>(.*?)</div>', webpage, 'description', default=None), + 'thumbnail': self._og_search_thumbnail(webpage), + 'uploader': self._html_search_regex( + r'(?s)<span class="author">(.*?)</span>', webpage, 'uploader', default=None), + 'upload_date': unified_strdate(self._html_search_regex( + r'(?s)class="teaser-text-date".*?(\d{2}/\d{2}/\d{4})', webpage, 'date', default=None)), + } + + playlist_data = self._search_regex( + r'''(?sx) + <section[^>]+data-xiti-place="[^"]*?liste_episodes[^"?]*?"[^>]*> + (.*?) 
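+ # under (?x), layout whitespace is ignored; the group above lazily captures the episode-list section body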
+ </section> + ''', + webpage, 'playlist data', fatal=False, default=None) + + if playlist_data: + entries = [] + for item, item_description in re.findall( + r'(?s)(<button[^<]*class="[^"]*replay-button[^>]*>).*?<p[^>]*class="[^"]*teaser-text-chapo[^>]*>(.*?)</p>', + playlist_data): + + item_attributes = extract_attributes(item) + entries.append({ + 'id': item_attributes.get('data-emission-uuid'), + 'url': item_attributes.get('data-url'), + 'title': item_attributes.get('data-diffusion-title'), + 'duration': int_or_none(traverse_obj(item_attributes, 'data-duration-seconds', 'data-duration-seconds')), + 'description': item_description, + 'timestamp': int_or_none(item_attributes.get('data-start-time')), + 'thumbnail': info['thumbnail'], + 'uploader': info['uploader'], + }) + + return { + '_type': 'playlist', + 'entries': entries, + **info + } + video_data = extract_attributes(self._search_regex( r'''(?sx) (?: @@ -43,31 +115,14 @@ class FranceCultureIE(InfoExtractor): (<button[^>]+data-(?:url|asset-source)="[^"]+"[^>]+>) ''', webpage, 'video data')) - - video_url = video_data.get('data-url') or video_data['data-asset-source'] - title = video_data.get('data-asset-title') or video_data.get('data-diffusion-title') or self._og_search_title(webpage) - - description = self._html_search_regex( - r'(?s)<div[^>]+class="intro"[^>]*>.*?<h2>(.+?)</h2>', - webpage, 'description', default=None) - thumbnail = self._search_regex( - r'(?s)<figure[^>]+itemtype="https://schema.org/ImageObject"[^>]*>.*?<img[^>]+(?:data-dejavu-)?src="([^"]+)"', - webpage, 'thumbnail', default=None) - uploader = self._html_search_regex( - r'(?s)<span class="author">(.*?)</span>', - webpage, 'uploader', default=None) + video_url = traverse_obj(video_data, 'data-url', 'data-asset-source') ext = determine_ext(video_url.lower()) return { - 'id': display_id, 'display_id': display_id, 'url': video_url, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, 'ext': ext, 'vcodec': 'none' if ext == 'mp3' else None, - 'uploader': uploader, - 'timestamp': int_or_none(video_data.get('data-start-time')) or int_or_none(video_data.get('data-asset-created-date')), 'duration': int_or_none(video_data.get('data-duration')), + **info } diff --git a/hypervideo_dl/extractor/francetv.py b/hypervideo_dl/extractor/francetv.py index 3bbab69..347a766 100644 --- a/hypervideo_dl/extractor/francetv.py +++ b/hypervideo_dl/extractor/francetv.py @@ -185,9 +185,9 @@ class FranceTVIE(InfoExtractor): 'vcodec': 'none', 'ext': 'mhtml', 'protocol': 'mhtml', - 'url': 'about:dummy', + 'url': 'about:invalid', 'fragments': [{ - 'path': sheet, + 'url': sheet, # XXX: not entirely accurate; each spritesheet seems to be # a 10×10 grid of thumbnails corresponding to approximately # 2 seconds of the video; the last spritesheet may be shorter @@ -203,7 +203,7 @@ class FranceTVIE(InfoExtractor): return { 'id': video_id, - 'title': self._live_title(title) if is_live else title, + 'title': title, 'thumbnail': image, 'duration': duration, 'timestamp': timestamp, diff --git a/hypervideo_dl/extractor/frontendmasters.py b/hypervideo_dl/extractor/frontendmasters.py index 40b8cb0..fc67a84 100644 --- a/hypervideo_dl/extractor/frontendmasters.py +++ b/hypervideo_dl/extractor/frontendmasters.py @@ -28,14 +28,7 @@ class FrontendMastersBaseIE(InfoExtractor): 'high': {'width': 1920, 'height': 1080} } - def _real_initialize(self): - self._login() - - def _login(self): - (username, password) = self._get_login_info() - if username is None: - return - + def _perform_login(self, 
username, password): login_page = self._download_webpage( self._LOGIN_URL, None, 'Downloading login page') @@ -252,9 +245,9 @@ class FrontendMastersCourseIE(FrontendMastersPageBaseIE): entries = [] for lesson in lessons: lesson_name = lesson.get('slug') - if not lesson_name: - continue lesson_id = lesson.get('hash') or lesson.get('statsId') + if not lesson_id or not lesson_name: + continue entries.append(self._extract_lesson(chapters, lesson_id, lesson)) title = course.get('title') diff --git a/hypervideo_dl/extractor/fujitv.py b/hypervideo_dl/extractor/fujitv.py index a02a943..4fdfe12 100644 --- a/hypervideo_dl/extractor/fujitv.py +++ b/hypervideo_dl/extractor/fujitv.py @@ -1,35 +1,73 @@ # coding: utf-8 from __future__ import unicode_literals - +from ..utils import HEADRequest from .common import InfoExtractor class FujiTVFODPlus7IE(InfoExtractor): - _VALID_URL = r'https?://i\.fod\.fujitv\.co\.jp/plus7/web/[0-9a-z]{4}/(?P<id>[0-9a-z]+)' - _BASE_URL = 'http://i.fod.fujitv.co.jp/' + _VALID_URL = r'https?://fod\.fujitv\.co\.jp/title/(?P<sid>[0-9a-z]{4})/(?P<id>[0-9a-z]+)' + _BASE_URL = 'https://i.fod.fujitv.co.jp/' _BITRATE_MAP = { 300: (320, 180), 800: (640, 360), 1200: (1280, 720), 2000: (1280, 720), + 4000: (1920, 1080), } + _TESTS = [{ + 'url': 'https://fod.fujitv.co.jp/title/5d40/5d40110076', + 'info_dict': { + 'id': '5d40110076', + 'ext': 'mp4', + 'title': '#1318 『まる子、まぼろしの洋館を見る』の巻', + 'series': 'ちびまる子ちゃん', + 'series_id': '5d40', + 'description': 'md5:b3f51dbfdda162ac4f789e0ff4d65750', + 'thumbnail': 'https://i.fod.fujitv.co.jp/img/program/5d40/episode/5d40110076_a.jpg', + }, + }, { + 'url': 'https://fod.fujitv.co.jp/title/5d40/5d40810083', + 'info_dict': { + 'id': '5d40810083', + 'ext': 'mp4', + 'title': '#1324 『まる子とオニの子』の巻/『結成!2月をムダにしない会』の巻', + 'description': 'md5:3972d900b896adc8ab1849e310507efa', + 'series': 'ちびまる子ちゃん', + 'series_id': '5d40', + 'thumbnail': 'https://i.fod.fujitv.co.jp/img/program/5d40/episode/5d40810083_a.jpg'}, + 'skip': 'Video available only in one week' + }] + def _real_extract(self, url): - video_id = self._match_id(url) - formats = self._extract_m3u8_formats( - self._BASE_URL + 'abr/pc_html5/%s.m3u8' % video_id, video_id, 'mp4') - for f in formats: - wh = self._BITRATE_MAP.get(f.get('tbr')) - if wh: - f.update({ - 'width': wh[0], - 'height': wh[1], - }) - self._sort_formats(formats) + series_id, video_id = self._match_valid_url(url).groups() + self._request_webpage(HEADRequest(url), video_id) + json_info = {} + token = self._get_cookies(url).get('CT') + if token: + json_info = self._download_json('https://fod-sp.fujitv.co.jp/apps/api/episode/detail/?ep_id=%s&is_premium=false' % video_id, video_id, headers={'x-authorization': f'Bearer {token.value}'}, fatal=False) + else: + self.report_warning(f'The token cookie is needed to extract video metadata. 
{self._LOGIN_HINTS["cookies"]}') + formats, subtitles = [], {} + src_json = self._download_json(f'{self._BASE_URL}abrjson_v2/tv_android/{video_id}', video_id) + for src in src_json['video_selector']: + if not src.get('url'): + continue + fmt, subs = self._extract_m3u8_formats_and_subtitles(src['url'], video_id, 'mp4') + for f in fmt: + f.update(dict(zip(('height', 'width'), + self._BITRATE_MAP.get(f.get('tbr'), ())))) + formats.extend(fmt) + subtitles = self._merge_subtitles(subtitles, subs) + self._sort_formats(formats, ['tbr']) return { 'id': video_id, - 'title': video_id, + 'title': json_info.get('ep_title'), + 'series': json_info.get('lu_title'), + 'series_id': series_id, + 'description': json_info.get('ep_description'), 'formats': formats, - 'thumbnail': self._BASE_URL + 'pc/image/wbtn/wbtn_%s.jpg' % video_id, + 'subtitles': subtitles, + 'thumbnail': f'{self._BASE_URL}img/program/{series_id}/episode/{video_id}_a.jpg', } diff --git a/hypervideo_dl/extractor/funimation.py b/hypervideo_dl/extractor/funimation.py index 382cbe1..6aa9bc9 100644 --- a/hypervideo_dl/extractor/funimation.py +++ b/hypervideo_dl/extractor/funimation.py @@ -10,6 +10,7 @@ from ..compat import compat_HTTPError from ..utils import ( determine_ext, int_or_none, + join_nonempty, js_to_json, orderedSet, qualities, @@ -35,9 +36,8 @@ class FunimationBaseIE(InfoExtractor): note='Checking geo-location', errnote='Unable to fetch geo-location information'), 'region') or 'US' - def _login(self): - username, password = self._get_login_info() - if username is None: + def _perform_login(self, username, password): + if self._TOKEN: return try: data = self._download_json( @@ -46,7 +46,7 @@ class FunimationBaseIE(InfoExtractor): 'username': username, 'password': password, })) - return data['token'] + FunimationBaseIE._TOKEN = data['token'] except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: error = self._parse_json(e.cause.read().decode(), None)['error'] @@ -89,8 +89,6 @@ class FunimationPageIE(FunimationBaseIE): def _real_initialize(self): if not self._REGION: FunimationBaseIE._REGION = self._get_region() - if not self._TOKEN: - FunimationBaseIE._TOKEN = self._login() def _real_extract(self, url): locale, show, episode = self._match_valid_url(url).group('lang', 'show', 'episode') @@ -153,10 +151,6 @@ class FunimationIE(FunimationBaseIE): }, }] - def _real_initialize(self): - if not self._TOKEN: - FunimationBaseIE._TOKEN = self._login() - @staticmethod def _get_experiences(episode): for lang, lang_data in episode.get('languages', {}).items(): @@ -275,7 +269,7 @@ class FunimationIE(FunimationBaseIE): def _get_subtitles(self, subtitles, experience_id, episode, display_id, format_name): if isinstance(episode, str): webpage = self._download_webpage( - f'https://www.funimation.com/player/{experience_id}', display_id, + f'https://www.funimation.com/player/{experience_id}/', display_id, fatal=False, note=f'Downloading player webpage for {format_name}') episode, _, _ = self._get_episode(webpage, episode_id=episode, fatal=False) @@ -288,10 +282,11 @@ class FunimationIE(FunimationBaseIE): sub_type = sub_type if sub_type != 'FULL' else None current_sub = { 'url': text_track['src'], - 'name': ' '.join(filter(None, (version, text_track.get('label'), sub_type))) + 'name': join_nonempty(version, text_track.get('label'), sub_type, delim=' ') } - lang = '_'.join(filter(None, ( - text_track.get('language', 'und'), version if version != 'Simulcast' else None, sub_type))) + lang = 
join_nonempty(text_track.get('language', 'und'), + version if version != 'Simulcast' else None, + sub_type, delim='_') if current_sub not in subtitles.get(lang, []): subtitles.setdefault(lang, []).append(current_sub) return subtitles @@ -338,7 +333,7 @@ class FunimationShowIE(FunimationBaseIE): 'https://prod-api-funimationnow.dadcdigital.com/api/funimation/episodes/?limit=99999&title_id=%s' % show_info.get('id'), display_id) - vod_items = traverse_obj(items_info, ('items', ..., re.compile('(?i)mostRecent[AS]vod').match, 'item')) + vod_items = traverse_obj(items_info, ('items', ..., lambda k, _: re.match(r'(?i)mostRecent[AS]vod', k), 'item')) return { '_type': 'playlist', diff --git a/hypervideo_dl/extractor/funk.py b/hypervideo_dl/extractor/funk.py index e5e3260..2c5cfe8 100644 --- a/hypervideo_dl/extractor/funk.py +++ b/hypervideo_dl/extractor/funk.py @@ -11,7 +11,7 @@ from ..utils import ( class FunkIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?funk\.net/(?:channel|playlist)/[^/]+/(?P<display_id>[0-9a-z-]+)-(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.|origin\.)?funk\.net/(?:channel|playlist)/[^/]+/(?P<display_id>[0-9a-z-]+)-(?P<id>\d+)' _TESTS = [{ 'url': 'https://www.funk.net/channel/ba-793/die-lustigsten-instrumente-aus-dem-internet-teil-2-1155821', 'md5': '8dd9d9ab59b4aa4173b3197f2ea48e81', diff --git a/hypervideo_dl/extractor/gab.py b/hypervideo_dl/extractor/gab.py index 25b5cb0..9ba0b1c 100644 --- a/hypervideo_dl/extractor/gab.py +++ b/hypervideo_dl/extractor/gab.py @@ -6,12 +6,16 @@ import re from .common import InfoExtractor from ..utils import ( clean_html, + int_or_none, + parse_codecs, + parse_duration, str_to_int, + unified_timestamp ) class GabTVIE(InfoExtractor): - _VALID_URL = r'(?:https?://)tv.gab.com/channel/[^/]+/view/(?P<id>[a-z0-9-]+)' + _VALID_URL = r'https?://tv\.gab\.com/channel/[^/]+/view/(?P<id>[a-z0-9-]+)' _TESTS = [{ 'url': 'https://tv.gab.com/channel/wurzelroot/view/why-was-america-in-afghanistan-61217eacea5665de450d0488', 'info_dict': { @@ -32,8 +36,10 @@ class GabTVIE(InfoExtractor): channel_name = self._search_regex(r'data-channel-name=\"(?P<channel_id>[^\"]+)', webpage, 'channel_name') title = self._search_regex(r'data-episode-title=\"(?P<channel_id>[^\"]+)', webpage, 'title') view_key = self._search_regex(r'data-view-key=\"(?P<channel_id>[^\"]+)', webpage, 'view_key') - description = clean_html(self._html_search_regex(self._meta_regex('description'), webpage, 'description', group='content')) or None - available_resolutions = re.findall(r'<a\ data-episode-id=\"%s\"\ data-resolution=\"(?P<resolution>[^\"]+)' % id, webpage) + description = clean_html( + self._html_search_regex(self._meta_regex('description'), webpage, 'description', group='content')) or None + available_resolutions = re.findall(r'<a\ data-episode-id=\"%s\"\ data-resolution=\"(?P<resolution>[^\"]+)' % id, + webpage) formats = [] for resolution in available_resolutions: @@ -62,3 +68,80 @@ class GabTVIE(InfoExtractor): 'uploader_id': channel_id, 'thumbnail': f'https://tv.gab.com/image/{id}', } + + +class GabIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?gab\.com/[^/]+/posts/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://gab.com/SomeBitchIKnow/posts/107163961867310434', + 'md5': '8ca34fb00f1e1033b5c5988d79ec531d', + 'info_dict': { + 'id': '107163961867310434-0', + 'ext': 'mp4', + 'title': 'L on Gab', + 'uploader_id': '946600', + 'uploader': 'SomeBitchIKnow', + 'description': 'md5:204055fafd5e1a519f5d6db953567ca3', + 'timestamp': 1635192289, + 'upload_date': '20211025', + } + }, 
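+ # each media attachment of a post becomes a numbered entry, hence the '-0' suffix in the ids above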
{ + 'url': 'https://gab.com/TheLonelyProud/posts/107045884469287653', + 'md5': 'f9cefcfdff6418e392611a828d47839d', + 'info_dict': { + 'id': '107045884469287653-0', + 'ext': 'mp4', + 'title': 'Jody Sadowski on Gab', + 'uploader_id': '1390705', + 'timestamp': 1633390571, + 'upload_date': '20211004', + 'uploader': 'TheLonelyProud', + } + }] + + def _real_extract(self, url): + post_id = self._match_id(url) + json_data = self._download_json(f'https://gab.com/api/v1/statuses/{post_id}', post_id) + + entries = [] + for idx, media in enumerate(json_data['media_attachments']): + if media.get('type') not in ('video', 'gifv'): + continue + metadata = media['meta'] + format_metadata = { + 'acodec': parse_codecs(metadata.get('audio_encode')).get('acodec'), + 'asr': int_or_none((metadata.get('audio_bitrate') or '').split(' ')[0]), + 'fps': metadata.get('fps'), + } + + formats = [{ + 'url': url, + 'width': f.get('width'), + 'height': f.get('height'), + 'tbr': int_or_none(f.get('bitrate'), scale=1000), + **format_metadata, + } for url, f in ((media.get('url'), metadata.get('original') or {}), + (media.get('source_mp4'), metadata.get('playable') or {})) if url] + + self._sort_formats(formats) + + author = json_data.get('account') or {} + entries.append({ + 'id': f'{post_id}-{idx}', + 'title': f'{json_data["account"]["display_name"]} on Gab', + 'timestamp': unified_timestamp(json_data.get('created_at')), + 'formats': formats, + 'description': clean_html(json_data.get('content')), + 'duration': metadata.get('duration') or parse_duration(metadata.get('length')), + 'like_count': json_data.get('favourites_count'), + 'comment_count': json_data.get('replies_count'), + 'repost_count': json_data.get('reblogs_count'), + 'uploader': author.get('username'), + 'uploader_id': author.get('id'), + 'uploader_url': author.get('url'), + }) + + if len(entries) > 1: + return self.playlist_result(entries, post_id) + + return entries[0] diff --git a/hypervideo_dl/extractor/gaia.py b/hypervideo_dl/extractor/gaia.py index 7821fb7..5b0195c 100644 --- a/hypervideo_dl/extractor/gaia.py +++ b/hypervideo_dl/extractor/gaia.py @@ -56,24 +56,22 @@ class GaiaIE(InfoExtractor): def _real_initialize(self): auth = self._get_cookies('https://www.gaia.com/').get('auth') if auth: - auth = self._parse_json( - compat_urllib_parse_unquote(auth.value), - None, fatal=False) - if not auth: - username, password = self._get_login_info() - if username is None: - return - auth = self._download_json( - 'https://auth.gaia.com/v1/login', - None, data=urlencode_postdata({ - 'username': username, - 'password': password - })) - if auth.get('success') is False: - raise ExtractorError(', '.join(auth['messages']), expected=True) - if auth: + auth = self._parse_json(compat_urllib_parse_unquote(auth.value), None, fatal=False) self._jwt = auth.get('jwt') + def _perform_login(self, username, password): + if self._jwt: + return + auth = self._download_json( + 'https://auth.gaia.com/v1/login', + None, data=urlencode_postdata({ + 'username': username, + 'password': password + })) + if auth.get('success') is False: + raise ExtractorError(', '.join(auth['messages']), expected=True) + self._jwt = auth.get('jwt') + def _real_extract(self, url): display_id, vtype = self._match_valid_url(url).groups() node_id = self._download_json( diff --git a/hypervideo_dl/extractor/gamejolt.py b/hypervideo_dl/extractor/gamejolt.py new file mode 100644 index 0000000..a13e528 --- /dev/null +++ b/hypervideo_dl/extractor/gamejolt.py @@ -0,0 +1,541 @@ +# coding: utf-8 +import itertools +import 
json +import math + +from .common import InfoExtractor +from ..compat import compat_urllib_parse_unquote +from ..utils import ( + determine_ext, + format_field, + int_or_none, + str_or_none, + traverse_obj, + try_get +) + + +class GameJoltBaseIE(InfoExtractor): + _API_BASE = 'https://gamejolt.com/site-api/' + + def _call_api(self, endpoint, *args, **kwargs): + kwargs.setdefault('headers', {}).update({'Accept': 'image/webp,*/*'}) + return self._download_json(self._API_BASE + endpoint, *args, **kwargs)['payload'] + + def _parse_content_as_text(self, content): + outer_contents, joined_contents = content.get('content') or [], [] + for outer_content in outer_contents: + if outer_content.get('type') != 'paragraph': + joined_contents.append(self._parse_content_as_text(outer_content)) + continue + inner_contents, inner_content_text = outer_content.get('content') or [], '' + for inner_content in inner_contents: + if inner_content.get('text'): + inner_content_text += inner_content['text'] + elif inner_content.get('type') == 'hardBreak': + inner_content_text += '\n' + joined_contents.append(inner_content_text) + + return '\n'.join(joined_contents) + + def _get_comments(self, post_num_id, post_hash_id): + sort_by, scroll_id = self._configuration_arg('comment_sort', ['hot'], ie_key=GameJoltIE.ie_key())[0], -1 + is_scrolled = sort_by in ('new', 'you') + for page in itertools.count(1): + comments_data = self._call_api( + 'comments/Fireside_Post/%s/%s?%s=%d' % ( + post_num_id, sort_by, + 'scroll_id' if is_scrolled else 'page', scroll_id if is_scrolled else page), + post_hash_id, note='Downloading comments list page %d' % page) + if not comments_data.get('comments'): + break + for comment in traverse_obj(comments_data, (('comments', 'childComments'), ...), expected_type=dict, default=[]): + yield { + 'id': comment['id'], + 'text': self._parse_content_as_text( + self._parse_json(comment['comment_content'], post_hash_id)), + 'timestamp': int_or_none(comment.get('posted_on'), scale=1000), + 'like_count': comment.get('votes'), + 'author': traverse_obj(comment, ('user', ('display_name', 'name')), expected_type=str_or_none, get_all=False), + 'author_id': traverse_obj(comment, ('user', 'username'), expected_type=str_or_none), + 'author_thumbnail': traverse_obj(comment, ('user', 'image_avatar'), expected_type=str_or_none), + 'parent': comment.get('parent_id') or None, + } + scroll_id = int_or_none(comments_data['comments'][-1].get('posted_on')) + + def _parse_post(self, post_data): + post_id = post_data['hash'] + lead_content = self._parse_json(post_data.get('lead_content') or '{}', post_id, fatal=False) or {} + description, full_description = post_data.get('leadStr') or self._parse_content_as_text( + self._parse_json(post_data.get('lead_content'), post_id)), None + if post_data.get('has_article'): + article_content = self._parse_json( + post_data.get('article_content') + or self._call_api(f'web/posts/article/{post_data.get("id", post_id)}', post_id, + note='Downloading article metadata', errnote='Unable to download article metadata', fatal=False).get('article'), + post_id, fatal=False) + full_description = self._parse_content_as_text(article_content) + + user_data = post_data.get('user') or {} + info_dict = { + 'extractor_key': GameJoltIE.ie_key(), + 'extractor': 'GameJolt', + 'webpage_url': str_or_none(post_data.get('url')) or f'https://gamejolt.com/p/{post_id}', + 'id': post_id, + 'title': description, + 'description': full_description or description, + 'display_id': post_data.get('slug'), + 'uploader': 
user_data.get('display_name') or user_data.get('name'), + 'uploader_id': user_data.get('username'), + 'uploader_url': format_field(user_data, 'url', 'https://gamejolt.com%s'), + 'categories': [try_get(category, lambda x: '%s - %s' % (x['community']['name'], x['channel'].get('display_title') or x['channel']['title'])) + for category in post_data.get('communities' or [])], + 'tags': traverse_obj( + lead_content, ('content', ..., 'content', ..., 'marks', ..., 'attrs', 'tag'), expected_type=str_or_none), + 'like_count': int_or_none(post_data.get('like_count')), + 'comment_count': int_or_none(post_data.get('comment_count'), default=0), + 'timestamp': int_or_none(post_data.get('added_on'), scale=1000), + 'release_timestamp': int_or_none(post_data.get('published_on'), scale=1000), + '__post_extractor': self.extract_comments(post_data.get('id'), post_id) + } + + # TODO: Handle multiple videos/embeds? + video_data = traverse_obj(post_data, ('videos', ...), expected_type=dict, get_all=False) or {} + formats, subtitles, thumbnails = [], {}, [] + for media in video_data.get('media') or []: + media_url, mimetype, ext, media_id = media['img_url'], media.get('filetype', ''), determine_ext(media['img_url']), media.get('type') + if mimetype == 'application/vnd.apple.mpegurl' or ext == 'm3u8': + hls_formats, hls_subs = self._extract_m3u8_formats_and_subtitles(media_url, post_id, 'mp4', m3u8_id=media_id) + formats.extend(hls_formats) + subtitles.update(hls_subs) + elif mimetype == 'application/dash+xml' or ext == 'mpd': + dash_formats, dash_subs = self._extract_mpd_formats_and_subtitles(media_url, post_id, mpd_id=media_id) + formats.extend(dash_formats) + subtitles.update(dash_subs) + elif 'image' in mimetype: + thumbnails.append({ + 'id': media_id, + 'url': media_url, + 'width': media.get('width'), + 'height': media.get('height'), + 'filesize': media.get('filesize'), + }) + else: + formats.append({ + 'format_id': media_id, + 'url': media_url, + 'width': media.get('width'), + 'height': media.get('height'), + 'filesize': media.get('filesize'), + 'acodec': 'none' if 'video-card' in media_url else None, + }) + + if formats: + return { + **info_dict, + 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': thumbnails, + 'view_count': int_or_none(video_data.get('view_count')), + } + + gif_entries = [] + for media in post_data.get('media', []): + if determine_ext(media['img_url']) != 'gif' or 'gif' not in media.get('filetype', ''): + continue + gif_entries.append({ + 'id': media['hash'], + 'title': media['filename'].split('.')[0], + 'formats': [{ + 'format_id': url_key, + 'url': media[url_key], + 'width': media.get('width') if url_key == 'img_url' else None, + 'height': media.get('height') if url_key == 'img_url' else None, + 'filesize': media.get('filesize') if url_key == 'img_url' else None, + 'acodec': 'none', + } for url_key in ('img_url', 'mediaserver_url', 'mediaserver_url_mp4', 'mediaserver_url_webm') if media.get(url_key)] + }) + if gif_entries: + return { + '_type': 'playlist', + **info_dict, + 'entries': gif_entries, + } + + embed_url = traverse_obj(post_data, ('embeds', ..., 'url'), expected_type=str_or_none, get_all=False) + if embed_url: + return self.url_result(embed_url) + return info_dict + + +class GameJoltIE(GameJoltBaseIE): + _VALID_URL = r'https?://(?:www\.)?gamejolt\.com/p/(?:[\w-]*-)?(?P<id>\w{8})' + _TESTS = [{ + # No audio + 'url': 'https://gamejolt.com/p/introducing-ramses-jackson-some-fnf-himbo-i-ve-been-animating-fo-c6achnzu', + 'md5': 'cd5f733258f6678b0ce500dd88166d86', + 
'info_dict': { + 'id': 'c6achnzu', + 'ext': 'mp4', + 'display_id': 'introducing-ramses-jackson-some-fnf-himbo-i-ve-been-animating-fo-c6achnzu', + 'title': 'Introducing Ramses Jackson, some FNF himbo I’ve been animating for the past few days, hehe.\n#fnfmod #fridaynightfunkin', + 'description': 'Introducing Ramses Jackson, some FNF himbo I’ve been animating for the past few days, hehe.\n#fnfmod #fridaynightfunkin', + 'uploader': 'Jakeneutron', + 'uploader_id': 'Jakeneutron', + 'uploader_url': 'https://gamejolt.com/@Jakeneutron', + 'categories': ['Friday Night Funkin\' - Videos'], + 'tags': ['fnfmod', 'fridaynightfunkin'], + 'timestamp': 1633499590, + 'upload_date': '20211006', + 'release_timestamp': 1633499655, + 'release_date': '20211006', + 'thumbnail': 're:^https?://.+wgch9mhq.png$', + 'like_count': int, + 'comment_count': int, + 'view_count': int, + } + }, { + # YouTube embed + 'url': 'https://gamejolt.com/p/hey-hey-if-there-s-anyone-who-s-looking-to-get-into-learning-a-n6g4jzpq', + 'md5': '79a931ff500a5c783ef6c3bda3272e32', + 'info_dict': { + 'id': 'XsNA_mzC0q4', + 'title': 'Adobe Animate CC 2021 Tutorial || Part 1 - The Basics', + 'description': 'md5:9d1ab9e2625b3fe1f42b2a44c67fdd13', + 'uploader': 'Jakeneutron', + 'uploader_id': 'Jakeneutron', + 'uploader_url': 'http://www.youtube.com/user/Jakeneutron', + 'ext': 'mp4', + 'duration': 1749, + 'tags': ['Adobe Animate CC', 'Tutorial', 'Animation', 'The Basics', 'For Beginners'], + 'like_count': int, + 'playable_in_embed': True, + 'categories': ['Education'], + 'availability': 'public', + 'thumbnail': 'https://i.ytimg.com/vi_webp/XsNA_mzC0q4/maxresdefault.webp', + 'age_limit': 0, + 'live_status': 'not_live', + 'channel_url': 'https://www.youtube.com/channel/UC6_L7fnczNalFZyBthUE9oA', + 'channel': 'Jakeneutron', + 'channel_id': 'UC6_L7fnczNalFZyBthUE9oA', + 'upload_date': '20211015', + 'view_count': int, + 'chapters': 'count:18', + } + }, { + # Article + 'url': 'https://gamejolt.com/p/i-fuckin-broke-chaos-d56h3eue', + 'md5': '786c1ccf98fde02c03a2768acb4258d0', + 'info_dict': { + 'id': 'd56h3eue', + 'ext': 'mp4', + 'display_id': 'i-fuckin-broke-chaos-d56h3eue', + 'title': 'I fuckin broke Chaos.', + 'description': 'I moved my tab durning the cutscene so now it\'s stuck like this.', + 'uploader': 'Jeff____________', + 'uploader_id': 'The_Nyesh_Man', + 'uploader_url': 'https://gamejolt.com/@The_Nyesh_Man', + 'categories': ['Friday Night Funkin\' - Videos'], + 'timestamp': 1639800264, + 'upload_date': '20211218', + 'release_timestamp': 1639800330, + 'release_date': '20211218', + 'thumbnail': 're:^https?://.+euksy8bd.png$', + 'like_count': int, + 'comment_count': int, + 'view_count': int, + } + }, { + # Single GIF + 'url': 'https://gamejolt.com/p/hello-everyone-i-m-developing-a-pixel-art-style-mod-for-fnf-and-i-vs4gdrd8', + 'info_dict': { + 'id': 'vs4gdrd8', + 'display_id': 'hello-everyone-i-m-developing-a-pixel-art-style-mod-for-fnf-and-i-vs4gdrd8', + 'title': 'md5:cc3d8b031d9bc7ec2ec5a9ffc707e1f9', + 'description': 'md5:cc3d8b031d9bc7ec2ec5a9ffc707e1f9', + 'uploader': 'Quesoguy', + 'uploader_id': 'CheeseguyDev', + 'uploader_url': 'https://gamejolt.com/@CheeseguyDev', + 'categories': ['Game Dev - General', 'Arts n\' Crafts - Creations', 'Pixel Art - showcase', + 'Friday Night Funkin\' - Mods', 'Newgrounds - Friday Night Funkin (13+)'], + 'timestamp': 1639517122, + 'release_timestamp': 1639519966, + 'like_count': int, + 'comment_count': int, + }, + 'playlist': [{ + 'info_dict': { + 'id': 'dszyjnwi', + 'ext': 'webm', + 'title': 
'gif-presentacion-mejorado-dszyjnwi', + 'n_entries': 1, + } + }] + }, { + # Multiple GIFs + 'url': 'https://gamejolt.com/p/gif-yhsqkumq', + 'playlist_count': 35, + 'info_dict': { + 'id': 'yhsqkumq', + 'display_id': 'gif-yhsqkumq', + 'title': 'GIF', + 'description': 'GIF', + 'uploader': 'DaniilTvman', + 'uploader_id': 'DaniilTvman', + 'uploader_url': 'https://gamejolt.com/@DaniilTvman', + 'categories': ['Five Nights At The AGK Studio Comunity - NEWS game'], + 'timestamp': 1638721559, + 'release_timestamp': 1638722276, + 'like_count': int, + 'comment_count': int, + }, + }] + + def _real_extract(self, url): + post_id = self._match_id(url) + post_data = self._call_api( + f'web/posts/view/{post_id}', post_id)['post'] + return self._parse_post(post_data) + + +class GameJoltPostListBaseIE(GameJoltBaseIE): + def _entries(self, endpoint, list_id, note='Downloading post list', errnote='Unable to download post list', initial_items=[]): + page_num, scroll_id = 1, None + items = initial_items or self._call_api(endpoint, list_id, note=note, errnote=errnote)['items'] + while items: + for item in items: + yield self._parse_post(item['action_resource_model']) + scroll_id = items[-1]['scroll_id'] + page_num += 1 + items = self._call_api( + endpoint, list_id, note=f'{note} page {page_num}', errnote=errnote, data=json.dumps({ + 'scrollDirection': 'from', + 'scrollId': scroll_id, + }).encode('utf-8')).get('items') + + +class GameJoltUserIE(GameJoltPostListBaseIE): + _VALID_URL = r'https?://(?:www\.)?gamejolt\.com/@(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://gamejolt.com/@BlazikenSuperStar', + 'playlist_mincount': 1, + 'info_dict': { + 'id': '6116784', + 'title': 'S. Blaze', + 'description': 'md5:5ba7fbbb549e8ea2545aafbfe22eb03a', + }, + 'params': { + 'ignore_no_formats_error': True, + }, + 'expected_warnings': ['skipping format', 'No video formats found', 'Requested format is not available'], + }] + + def _real_extract(self, url): + user_id = self._match_id(url) + user_data = self._call_api( + f'web/profile/@{user_id}', user_id, note='Downloading user info', errnote='Unable to download user info')['user'] + bio = self._parse_content_as_text( + self._parse_json(user_data.get('bio_content', '{}'), user_id, fatal=False) or {}) + return self.playlist_result( + self._entries(f'web/posts/fetch/user/@{user_id}?tab=active', user_id, 'Downloading user posts', 'Unable to download user posts'), + str_or_none(user_data.get('id')), user_data.get('display_name') or user_data.get('name'), bio) + + +class GameJoltGameIE(GameJoltPostListBaseIE): + _VALID_URL = r'https?://(?:www\.)?gamejolt\.com/games/[\w-]+/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://gamejolt.com/games/Friday4Fun/655124', + 'playlist_mincount': 2, + 'info_dict': { + 'id': '655124', + 'title': 'Friday Night Funkin\': Friday 4 Fun', + 'description': 'md5:576a7dd87912a2dcf33c50d2bd3966d3' + }, + 'params': { + 'ignore_no_formats_error': True, + }, + 'expected_warnings': ['skipping format', 'No video formats found', 'Requested format is not available'], + }] + + def _real_extract(self, url): + game_id = self._match_id(url) + game_data = self._call_api( + f'web/discover/games/{game_id}', game_id, note='Downloading game info', errnote='Unable to download game info')['game'] + description = self._parse_content_as_text( + self._parse_json(game_data.get('description_content', '{}'), game_id, fatal=False) or {}) + return self.playlist_result( + self._entries(f'web/posts/fetch/game/{game_id}', game_id, 'Downloading game posts', 'Unable to download game posts'), + 
game_id, game_data.get('title'), description) + + +class GameJoltGameSoundtrackIE(GameJoltBaseIE): + _VALID_URL = r'https?://(?:www\.)?gamejolt\.com/get/soundtrack(?:\?|\#!?)(?:.*?[&;])??game=(?P<id>(?:\d+)+)' + _TESTS = [{ + 'url': 'https://gamejolt.com/get/soundtrack?foo=bar&game=657899', + 'info_dict': { + 'id': '657899', + 'title': 'Friday Night Funkin\': Vs Oswald', + 'n_entries': None, + }, + 'playlist': [{ + 'info_dict': { + 'id': '184434', + 'ext': 'mp3', + 'title': 'Gettin\' Lucky (Menu Music)', + 'url': r're:^https://.+vs-oswald-menu-music\.mp3$', + 'release_timestamp': 1635190816, + 'release_date': '20211025', + 'n_entries': 3, + } + }, { + 'info_dict': { + 'id': '184435', + 'ext': 'mp3', + 'title': 'Rabbit\'s Luck (Extended Version)', + 'url': r're:^https://.+rabbit-s-luck--full-version-\.mp3$', + 'release_timestamp': 1635190841, + 'release_date': '20211025', + 'n_entries': 3, + } + }, { + 'info_dict': { + 'id': '185228', + 'ext': 'mp3', + 'title': 'Last Straw', + 'url': r're:^https://.+last-straw\.mp3$', + 'release_timestamp': 1635881104, + 'release_date': '20211102', + 'n_entries': 3, + } + }] + }] + + def _real_extract(self, url): + game_id = self._match_id(url) + game_overview = self._call_api( + f'web/discover/games/overview/{game_id}', game_id, note='Downloading soundtrack info', errnote='Unable to download soundtrack info') + return self.playlist_result([{ + 'id': str_or_none(song.get('id')), + 'title': str_or_none(song.get('title')), + 'url': str_or_none(song.get('url')), + 'release_timestamp': int_or_none(song.get('posted_on'), scale=1000), + } for song in game_overview.get('songs') or []], game_id, traverse_obj( + game_overview, ('microdata', 'name'), (('twitter', 'fb'), 'title'), expected_type=str_or_none, get_all=False)) + + +class GameJoltCommunityIE(GameJoltPostListBaseIE): + _VALID_URL = r'https?://(?:www\.)?gamejolt\.com/c/(?P<id>(?P<community>[\w-]+)(?:/(?P<channel>[\w-]+))?)(?:(?:\?|\#!?)(?:.*?[&;])??sort=(?P<sort>\w+))?' 
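+ # Matched groups: 'community' plus optional 'channel' and 'sort'; _real_extract below falls back to channel 'featured' and sort 'new' when the URL omits them.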
+ _TESTS = [{ + 'url': 'https://gamejolt.com/c/fnf/videos', + 'playlist_mincount': 50, + 'info_dict': { + 'id': 'fnf/videos', + 'title': 'Friday Night Funkin\' - Videos', + 'description': 'md5:6d8c06f27460f7d35c1554757ffe53c8' + }, + 'params': { + 'playlistend': 50, + 'ignore_no_formats_error': True, + }, + 'expected_warnings': ['skipping format', 'No video formats found', 'Requested format is not available'], + }, { + 'url': 'https://gamejolt.com/c/youtubers', + 'playlist_mincount': 50, + 'info_dict': { + 'id': 'youtubers/featured', + 'title': 'Youtubers - featured', + 'description': 'md5:53e5582c93dcc467ab597bfca4db17d4' + }, + 'params': { + 'playlistend': 50, + 'ignore_no_formats_error': True, + }, + 'expected_warnings': ['skipping format', 'No video formats found', 'Requested format is not available'], + }] + + def _real_extract(self, url): + display_id, community_id, channel_id, sort_by = self._match_valid_url(url).group('id', 'community', 'channel', 'sort') + channel_id, sort_by = channel_id or 'featured', sort_by or 'new' + + community_data = self._call_api( + f'web/communities/view/{community_id}', display_id, + note='Downloading community info', errnote='Unable to download community info')['community'] + channel_data = traverse_obj(self._call_api( + f'web/communities/view-channel/{community_id}/{channel_id}', display_id, + note='Downloading channel info', errnote='Unable to download channel info', fatal=False), 'channel') or {} + + title = f'{community_data.get("name") or community_id} - {channel_data.get("display_title") or channel_id}' + description = self._parse_content_as_text( + self._parse_json(community_data.get('description_content') or '{}', display_id, fatal=False) or {}) + return self.playlist_result( + self._entries( + f'web/posts/fetch/community/{community_id}?channels[]={sort_by}&channels[]={channel_id}', + display_id, 'Downloading community posts', 'Unable to download community posts'), + f'{community_id}/{channel_id}', title, description) + + +class GameJoltSearchIE(GameJoltPostListBaseIE): + _VALID_URL = r'https?://(?:www\.)?gamejolt\.com/search(?:/(?P<filter>communities|users|games))?(?:\?|\#!?)(?:.*?[&;])??q=(?P<id>(?:[^&#]+)+)' + _URL_FORMATS = { + 'users': 'https://gamejolt.com/@{username}', + 'communities': 'https://gamejolt.com/c/{path}', + 'games': 'https://gamejolt.com/games/{slug}/{id}', + } + _TESTS = [{ + 'url': 'https://gamejolt.com/search?foo=bar&q=%23fnf', + 'playlist_mincount': 50, + 'info_dict': { + 'id': '#fnf', + 'title': '#fnf', + }, + 'params': { + 'playlistend': 50, + 'ignore_no_formats_error': True, + }, + 'expected_warnings': ['skipping format', 'No video formats found', 'Requested format is not available'], + }, { + 'url': 'https://gamejolt.com/search/communities?q=cookie%20run', + 'playlist_mincount': 10, + 'info_dict': { + 'id': 'cookie run', + 'title': 'cookie run', + }, + }, { + 'url': 'https://gamejolt.com/search/users?q=mlp', + 'playlist_mincount': 278, + 'info_dict': { + 'id': 'mlp', + 'title': 'mlp', + }, + }, { + 'url': 'https://gamejolt.com/search/games?q=roblox', + 'playlist_mincount': 688, + 'info_dict': { + 'id': 'roblox', + 'title': 'roblox', + }, + }] + + def _search_entries(self, query, filter_mode, display_query): + initial_search_data = self._call_api( + f'web/search/{filter_mode}?q={query}', display_query, + note=f'Downloading {filter_mode} list', errnote=f'Unable to download {filter_mode} list') + entries_num = traverse_obj(initial_search_data, 'count', f'{filter_mode}Count') + if not entries_num: + return + for page in 
range(1, math.ceil(entries_num / initial_search_data['perPage']) + 1): + search_results = self._call_api( + f'web/search/{filter_mode}?q={query}&page={page}', display_query, + note=f'Downloading {filter_mode} list page {page}', errnote=f'Unable to download {filter_mode} list') + for result in search_results[filter_mode]: + yield self.url_result(self._URL_FORMATS[filter_mode].format(**result)) + + def _real_extract(self, url): + filter_mode, query = self._match_valid_url(url).group('filter', 'id') + display_query = compat_urllib_parse_unquote(query) + return self.playlist_result( + self._search_entries(query, filter_mode, display_query) if filter_mode else self._entries( + f'web/posts/fetch/search/{query}', display_query, initial_items=self._call_api( + f'web/search?q={query}', display_query, + note='Downloading initial post list', errnote='Unable to download initial post list')['posts']), + display_query, display_query) diff --git a/hypervideo_dl/extractor/generic.py b/hypervideo_dl/extractor/generic.py index 8387646..03e6eb2 100644 --- a/hypervideo_dl/extractor/generic.py +++ b/hypervideo_dl/extractor/generic.py @@ -17,6 +17,7 @@ from ..compat import ( ) from ..utils import ( determine_ext, + dict_get, ExtractorError, float_or_none, HEADRequest, @@ -28,8 +29,10 @@ from ..utils import ( mimetype2ext, orderedSet, parse_duration, + parse_resolution, sanitized_Request, smuggle_url, + str_or_none, unescapeHTML, unified_timestamp, unsmuggle_url, @@ -56,7 +59,7 @@ from .sportbox import SportBoxIE from .myvi import MyviIE from .condenast import CondeNastIE from .udn import UDNEmbedIE -from .senateisvp import SenateISVPIE +from .senategov import SenateISVPIE from .svt import SVTIE from .pornhub import PornHubIE from .xhamster import XHamsterEmbedIE @@ -100,6 +103,9 @@ from .ustream import UstreamIE from .arte import ArteTVEmbedIE from .videopress import VideoPressIE from .rutube import RutubeIE +from .glomex import GlomexEmbedIE +from .megatvcom import MegaTVComEmbedIE +from .ant1newsgr import Ant1NewsGrEmbedIE from .limelight import LimelightBaseIE from .anvato import AnvatoIE from .washingtonpost import WashingtonPostIE @@ -112,6 +118,7 @@ from .channel9 import Channel9IE from .vshare import VShareIE from .mediasite import MediasiteIE from .springboardplatform import SpringboardPlatformIE +from .ted import TedEmbedIE from .yapfiles import YapFilesIE from .vice import ViceIE from .xfileshare import XFileShareIE @@ -135,12 +142,21 @@ from .arcpublishing import ArcPublishingIE from .medialaan import MedialaanIE from .simplecast import SimplecastIE from .wimtv import WimTVIE +from .tvopengr import TVOpenGrEmbedIE +from .ertgr import ERTWebtvEmbedIE +from .tvp import TVPEmbedIE +from .blogger import BloggerIE +from .mainstreaming import MainStreamingIE +from .gfycat import GfycatIE +from .panopto import PanoptoBaseIE +from .ruutu import RuutuIE class GenericIE(InfoExtractor): IE_DESC = 'Generic downloader that works on some sites' _VALID_URL = r'.*' IE_NAME = 'generic' + _NETRC_MACHINE = False # Suppress username warning _TESTS = [ # Direct link to a video { @@ -203,7 +219,7 @@ class GenericIE(InfoExtractor): { 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml', 'info_dict': { - 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml', + 'id': 'https://phihag.de/2014/youtube-dl/rss2.xml', 'title': 'Zero Punctuation', 'description': 're:.*groundbreaking video review series.*' }, @@ -248,6 +264,9 @@ class GenericIE(InfoExtractor): 'episode_number': 1, 'season_number': 1, 'age_limit': 0, + 'season': 'Season 1', 
+ 'direct': True, + 'episode': 'Episode 1', }, }], 'params': { @@ -264,6 +283,16 @@ class GenericIE(InfoExtractor): }, 'playlist_mincount': 100, }, + # RSS feed with guid + { + 'url': 'https://www.omnycontent.com/d/playlist/a7b4f8fe-59d9-4afc-a79a-a90101378abf/bf2c1d80-3656-4449-9d00-a903004e8f84/efbff746-e7c1-463a-9d80-a903004e8f8f/podcast.rss', + 'info_dict': { + 'id': 'https://www.omnycontent.com/d/playlist/a7b4f8fe-59d9-4afc-a79a-a90101378abf/bf2c1d80-3656-4449-9d00-a903004e8f84/efbff746-e7c1-463a-9d80-a903004e8f8f/podcast.rss', + 'description': 'md5:be809a44b63b0c56fb485caf68685520', + 'title': 'The Little Red Podcast', + }, + 'playlist_mincount': 76, + }, # SMIL from http://videolectures.net/promogram_igor_mekjavic_eng { 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/video/1/smil.xml', @@ -359,9 +388,6 @@ class GenericIE(InfoExtractor): 'formats': 'mincount:9', 'upload_date': '20130904', }, - 'params': { - 'format': 'bestvideo', - }, }, # m3u8 served with Content-Type: audio/x-mpegURL; charset=utf-8 { @@ -1188,6 +1214,21 @@ class GenericIE(InfoExtractor): }, 'skip': 'Only has video a few mornings per month, see http://www.suffolk.edu/sjc/', }, + # jwplayer with only the json URL + { + 'url': 'https://www.hollywoodreporter.com/news/general-news/dunkirk-team-reveals-what-christopher-nolan-said-oscar-win-meet-your-oscar-winner-1092454', + 'info_dict': { + 'id': 'TljWkvWH', + 'ext': 'mp4', + 'upload_date': '20180306', + 'title': 'md5:91eb1862f6526415214f62c00b453936', + 'description': 'md5:73048ae50ae953da10549d1d2fe9b3aa', + 'timestamp': 1520367225, + }, + 'params': { + 'skip_download': True, + }, + }, # Complex jwplayer { 'url': 'http://www.indiedb.com/games/king-machine/videos', @@ -1434,24 +1475,6 @@ class GenericIE(InfoExtractor): 'duration': 45.115, }, }, - # 5min embed - { - 'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/', - 'md5': '4c6f127a30736b59b3e2c19234ee2bf7', - 'info_dict': { - 'id': '518726732', - 'ext': 'mp4', - 'title': 'Facebook Creates "On This Day" | Crunch Report', - 'description': 'Amazon updates Fire TV line, Tesla\'s Model X spotted in the wild', - 'timestamp': 1427237531, - 'uploader': 'Crunch Report', - 'upload_date': '20150324', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, # Crooks and Liars embed { 'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists', @@ -1856,6 +1879,62 @@ class GenericIE(InfoExtractor): 'add_ie': [RutubeIE.ie_key()], }, { + # glomex:embed + 'url': 'https://www.skai.gr/news/world/iatrikos-syllogos-tourkias-to-turkovac-aplo-dialyma-erntogan-eiste-apateones-kai-pseytes', + 'info_dict': { + 'id': 'v-ch2nkhcirwc9-sf', + 'ext': 'mp4', + 'title': 'md5:786e1e24e06c55993cee965ef853a0c1', + 'description': 'md5:8b517a61d577efe7e36fde72fd535995', + 'timestamp': 1641885019, + 'upload_date': '20220111', + 'duration': 460000, + 'thumbnail': 'https://i3thumbs.glomex.com/dC1idjJwdndiMjRzeGwvMjAyMi8wMS8xMS8wNy8xMF8zNV82MWRkMmQ2YmU5ZTgyLmpwZw==/profile:player-960x540', + }, + }, + { + # megatvcom:embed + 'url': 'https://www.in.gr/2021/12/18/greece/apokalypsi-mega-poios-parelave-tin-ereyna-tsiodra-ek-merous-tis-kyvernisis-o-prothypourgos-telika-gnorize/', + 'info_dict': { + 'id': 'apokalypsi-mega-poios-parelave-tin-ereyna-tsiodra-ek-merous-tis-kyvernisis-o-prothypourgos-telika-gnorize', + 'title': 'md5:5e569cf996ec111057c2764ec272848f', + }, + 'playlist': [{ + 'md5': '1afa26064ff00ccb91617957dbc73dc1', + 'info_dict': { + 'ext': 'mp4', + 'id': 
'564916', + 'display_id': 'md5:6cdf22d3a2e7bacb274b7295089a1770', + 'title': 'md5:33b9dd39584685b62873043670eb52a6', + 'description': 'md5:c1db7310f390518ac36dd69d947ef1a1', + 'timestamp': 1639753145, + 'upload_date': '20211217', + 'thumbnail': 'https://www.megatv.com/wp-content/uploads/2021/12/prezerakos-1024x597.jpg', + }, + }, { + 'md5': '4a1c220695f1ef865a8b7966a53e2474', + 'info_dict': { + 'ext': 'mp4', + 'id': '564905', + 'display_id': 'md5:ead15695e485e649aed2b81ebd699b88', + 'title': 'md5:2b71fd54249a3ca34609fe39ae31c47b', + 'description': 'md5:c42e12f638d0a97d6de4508e2c4df982', + 'timestamp': 1639753047, + 'upload_date': '20211217', + 'thumbnail': 'https://www.megatv.com/wp-content/uploads/2021/12/tsiodras-mitsotakis-1024x545.jpg', + }, + }] + }, + { + 'url': 'https://www.ertnews.gr/video/manolis-goyalles-o-anthropos-piso-apo-ti-diadiktyaki-vasilopita/', + 'info_dict': { + 'id': '2022/tv/news-themata-ianouarios/20220114-apotis6-gouales-pita.mp4', + 'ext': 'mp4', + 'title': 'md5:df64f5b61c06d0e9556c0cdd5cf14464', + 'thumbnail': 'https://www.ert.gr/themata/photos/2021/20220114-apotis6-gouales-pita.jpg', + }, + }, + { # ThePlatform embedded with whitespaces in URLs 'url': 'http://www.golfchannel.com/topics/shows/golftalkcentral.htm', 'only_matching': True, @@ -2160,6 +2239,33 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, + { + # tvopengr:embed + 'url': 'https://www.ethnos.gr/World/article/190604/hparosiaxekinoynoisynomiliessthgeneyhmethskiatoypolemoypanoapothnoykrania', + 'md5': 'eb0c3995d0a6f18f6538c8e057865d7d', + 'info_dict': { + 'id': '101119', + 'ext': 'mp4', + 'display_id': 'oikarpoitondiapragmateyseonhparosias', + 'title': 'md5:b979f4d640c568617d6547035528a149', + 'description': 'md5:e54fc1977c7159b01cc11cd7d9d85550', + 'timestamp': 1641772800, + 'upload_date': '20220110', + 'thumbnail': 'https://opentv-static.siliconweb.com/imgHandler/1920/70bc39fa-895b-4918-a364-c39d2135fc6d.jpg', + + } + }, + { + # blogger embed + 'url': 'https://blog.tomeuvizoso.net/2019/01/a-panfrost-milestone.html', + 'md5': 'f1bc19b6ea1b0fd1d81e84ca9ec467ac', + 'info_dict': { + 'id': 'BLOGGER-video-3c740e3a49197e16-796', + 'ext': 'mp4', + 'title': 'Blogger', + 'thumbnail': r're:^https?://.*', + }, + }, # { # # TODO: find another test # # http://schema.org/VideoObject @@ -2319,12 +2425,120 @@ class GenericIE(InfoExtractor): 'thumbnail': 'https://bogmedia.org/contents/videos_screenshots/21000/21217/preview_480p.mp4.jpg', } }, + { + # KVS Player (for sites that serve kt_player.js via non-https urls) + 'url': 'http://www.camhub.world/embed/389508', + 'md5': 'fbe89af4cfb59c8fd9f34a202bb03e32', + 'info_dict': { + 'id': '389508', + 'display_id': 'syren-de-mer-onlyfans-05-07-2020have-a-happy-safe-holiday5f014e68a220979bdb8cd-source', + 'ext': 'mp4', + 'title': 'Syren De Mer onlyfans_05-07-2020Have_a_happy_safe_holiday5f014e68a220979bdb8cd_source / Embed плеер', + 'thumbnail': 'http://www.camhub.world/contents/videos_screenshots/389000/389508/preview.mp4.jpg', + } + }, + { + # Reddit-hosted video that will redirect and be processed by RedditIE + # Redirects to https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/ + 'url': 'https://v.redd.it/zv89llsvexdz', + 'md5': '87f5f02f6c1582654146f830f21f8662', + 'info_dict': { + 'id': 'zv89llsvexdz', + 'ext': 'mp4', + 'timestamp': 1501941939.0, + 'title': 'That small heart attack.', + 'upload_date': '20170805', + 'uploader': 'Antw87' + } + }, + { + # 1080p Reddit-hosted video that will redirect and be processed by RedditIE + 'url': 
'https://v.redd.it/33hgok7dfbz71/', + 'md5': '7a1d587940242c9bb3bd6eb320b39258', + 'info_dict': { + 'id': '33hgok7dfbz71', + 'ext': 'mp4', + 'title': "The game Didn't want me to Knife that Guy I guess", + 'uploader': 'paraf1ve', + 'timestamp': 1636788683.0, + 'upload_date': '20211113' + } + }, + { + # MainStreaming player + 'url': 'https://www.lactv.it/2021/10/03/lac-news24-la-settimana-03-10-2021/', + 'info_dict': { + 'id': 'EUlZfGWkGpOd', + 'title': 'La Settimana ', + 'description': '03 Ottobre ore 02:00', + 'ext': 'mp4', + 'live_status': 'not_live', + 'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster', + 'duration': 1512 + } + }, + { + # Multiple gfycat iframe embeds + 'url': 'https://www.gezip.net/bbs/board.php?bo_table=entertaine&wr_id=613422', + 'info_dict': { + 'title': '재이, 윤, 세은 황금 드레스를 입고 빛난다', + 'id': 'board' + }, + 'playlist_count': 8, + }, + { + # Multiple gfycat gifs (direct links) + 'url': 'https://www.gezip.net/bbs/board.php?bo_table=entertaine&wr_id=612199', + 'info_dict': { + 'title': '옳게 된 크롭 니트 스테이씨 아이사', + 'id': 'board' + }, + 'playlist_count': 6 + }, + { + # Multiple gfycat embeds, with uppercase "IFR" in urls + 'url': 'https://kkzz.kr/?vid=2295', + 'info_dict': { + 'title': '지방시 앰버서더 에스파 카리나 움짤', + 'id': '?vid=2295' + }, + 'playlist_count': 9 + }, + { + # Panopto embeds + 'url': 'https://www.monash.edu/learning-teaching/teachhq/learning-technologies/panopto/how-to/insert-a-quiz-into-a-panopto-video', + 'info_dict': { + 'title': 'Insert a quiz into a Panopto video', + 'id': 'insert-a-quiz-into-a-panopto-video' + }, + 'playlist_count': 1 + }, + { + # Ruutu embed + 'url': 'https://www.nelonen.fi/ohjelmat/madventures-suomi/2160731-riku-ja-tunna-lahtevat-peurajahtiin-tv-sta-tutun-biologin-kanssa---metsastysreissu-huipentuu-kasvissyojan-painajaiseen', + 'md5': 'a2513a98d3496099e6eced40f7e6a14b', + 'info_dict': { + 'id': '4044426', + 'ext': 'mp4', + 'title': 'Riku ja Tunna lähtevät peurajahtiin tv:stä tutun biologin kanssa – metsästysreissu huipentuu kasvissyöjän painajaiseen!', + 'thumbnail': r're:^https?://.+\.jpg$', + 'duration': 108, + 'series': 'Madventures Suomi', + 'description': 'md5:aa55b44bd06a1e337a6f1d0b46507381', + 'categories': ['Matkailu', 'Elämäntyyli'], + 'age_limit': 0, + 'upload_date': '20220308', + }, + }, ] def report_following_redirect(self, new_url): """Report information extraction.""" self._downloader.to_screen('[redirect] Following redirect to %s' % new_url) + def report_detected(self, name): + self._downloader.write_debug(f'Identified a {name}') + def _extract_rss(self, url, video_id, doc): playlist_title = doc.find('./channel/title').text playlist_desc_el = doc.find('./channel/description') @@ -2349,6 +2563,9 @@ class GenericIE(InfoExtractor): if not next_url: continue + if it.find('guid').text is not None: + next_url = smuggle_url(next_url, {'force_videoid': it.find('guid').text}) + def itunes(key): return xpath_text( it, xpath_with_ns('./itunes:%s' % key, NS_MAP), @@ -2540,10 +2757,13 @@ class GenericIE(InfoExtractor): content_type = head_response.headers.get('Content-Type', '').lower() m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type) if m: + self.report_detected('direct video link') format_id = compat_str(m.group('format_id')) subtitles = {} if format_id.endswith('mpegurl'): formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4') + elif format_id.endswith('mpd') or format_id.endswith('dash+xml'): + formats, subtitles 
= self._extract_mpd_formats_and_subtitles(url, video_id) elif format_id == 'f4m': formats = self._extract_f4m_formats(url, video_id) else: @@ -2580,6 +2800,7 @@ class GenericIE(InfoExtractor): # Is it an M3U playlist? if first_bytes.startswith(b'#EXTM3U'): + self.report_detected('M3U playlist') info_dict['formats'], info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4') self._sort_formats(info_dict['formats']) return info_dict @@ -2610,16 +2831,20 @@ class GenericIE(InfoExtractor): except compat_xml_parse_error: doc = compat_etree_fromstring(webpage.encode('utf-8')) if doc.tag == 'rss': + self.report_detected('RSS feed') return self._extract_rss(url, video_id, doc) elif doc.tag == 'SmoothStreamingMedia': info_dict['formats'], info_dict['subtitles'] = self._parse_ism_formats_and_subtitles(doc, url) + self.report_detected('ISM manifest') self._sort_formats(info_dict['formats']) return info_dict elif re.match(r'^(?:{[^}]+})?smil$', doc.tag): smil = self._parse_smil(doc, url, video_id) + self.report_detected('SMIL file') self._sort_formats(smil['formats']) return smil elif doc.tag == '{http://xspf.org/ns/0/}playlist': + self.report_detected('XSPF playlist') return self.playlist_result( self._parse_xspf( doc, video_id, xspf_url=url, @@ -2630,10 +2855,12 @@ class GenericIE(InfoExtractor): doc, mpd_base_url=full_response.geturl().rpartition('/')[0], mpd_url=url) + self.report_detected('DASH manifest') self._sort_formats(info_dict['formats']) return info_dict elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag): info_dict['formats'] = self._parse_f4m_formats(doc, url, video_id) + self.report_detected('F4M manifest') self._sort_formats(info_dict['formats']) return info_dict except compat_xml_parse_error: @@ -2642,6 +2869,7 @@ class GenericIE(InfoExtractor): # Is it a Camtasia project? 
camtasia_res = self._extract_camtasia(url, video_id, webpage) if camtasia_res is not None: + self.report_detected('Camtasia video') return camtasia_res # Sometimes embedded video player is hidden behind percent encoding @@ -2663,10 +2891,8 @@ class GenericIE(InfoExtractor): # Site Name | Video Title # Video Title - Tagline | Site Name # and so on and so forth; it's just not practical - video_title = self._og_search_title( - webpage, default=None) or self._html_search_regex( - r'(?s)<title>(.*?)</title>', webpage, 'video title', - default='video') + video_title = (self._og_search_title(webpage, default=None) + or self._html_extract_title(webpage, 'video title', default='video')) # Try to detect age limit automatically age_limit = self._rta_search(webpage) @@ -2692,6 +2918,8 @@ class GenericIE(InfoExtractor): 'age_limit': age_limit, }) + self._downloader.write_debug('Looking for video embeds') + # Look for Brightcove Legacy Studio embeds bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage) if bc_urls: @@ -3002,10 +3230,9 @@ class GenericIE(InfoExtractor): return self.url_result(mobj.group('url'), 'Tvigle') # Look for embedded TED player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'TED') + ted_urls = TedEmbedIE._extract_urls(webpage) + if ted_urls: + return self.playlist_from_matches(ted_urls, video_id, video_title, ie=TedEmbedIE.ie_key()) # Look for embedded Ustream videos ustream_url = UstreamIE._extract_url(webpage) @@ -3138,12 +3365,6 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group('url')) - # Look for 5min embeds - mobj = re.search( - r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage) - if mobj is not None: - return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin') - # Look for Crooks and Liars embeds mobj = re.search( r'<(?:iframe[^>]+src|param[^>]+value)=(["\'])(?P<url>(?:https?:)?//embed\.crooksandliars\.com/(?:embed|v)/.+?)\1', webpage) @@ -3189,6 +3410,11 @@ class GenericIE(InfoExtractor): if onionstudios_url: return self.url_result(onionstudios_url) + # Look for Blogger embeds + blogger_urls = BloggerIE._extract_urls(webpage) + if blogger_urls: + return self.playlist_from_matches(blogger_urls, video_id, video_title, ie=BloggerIE.ie_key()) + # Look for ViewLift embeds viewlift_url = ViewLiftEmbedIE._extract_url(webpage) if viewlift_url: @@ -3336,6 +3562,24 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( rutube_urls, video_id, video_title, ie=RutubeIE.ie_key()) + # Look for Glomex embeds + glomex_urls = list(GlomexEmbedIE._extract_urls(webpage, url)) + if glomex_urls: + return self.playlist_from_matches( + glomex_urls, video_id, video_title, ie=GlomexEmbedIE.ie_key()) + + # Look for megatv.com embeds + megatvcom_urls = list(MegaTVComEmbedIE._extract_urls(webpage)) + if megatvcom_urls: + return self.playlist_from_matches( + megatvcom_urls, video_id, video_title, ie=MegaTVComEmbedIE.ie_key()) + + # Look for ant1news.gr embeds + ant1newsgr_urls = list(Ant1NewsGrEmbedIE._extract_urls(webpage)) + if ant1newsgr_urls: + return self.playlist_from_matches( + ant1newsgr_urls, video_id, video_title, ie=Ant1NewsGrEmbedIE.ie_key()) + # Look for WashingtonPost embeds wapo_urls = WashingtonPostIE._extract_urls(webpage) if wapo_urls: @@ -3482,9 +3726,45 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( rumble_urls, video_id, 
video_title, ie=RumbleEmbedIE.ie_key()) + # Look for (tvopen|ethnos).gr embeds + tvopengr_urls = list(TVOpenGrEmbedIE._extract_urls(webpage)) + if tvopengr_urls: + return self.playlist_from_matches(tvopengr_urls, video_id, video_title, ie=TVOpenGrEmbedIE.ie_key()) + + # Look for ert.gr webtv embeds + ertwebtv_urls = list(ERTWebtvEmbedIE._extract_urls(webpage)) + if len(ertwebtv_urls) == 1: + return self.url_result(self._proto_relative_url(ertwebtv_urls[0]), video_title=video_title, url_transparent=True) + elif ertwebtv_urls: + return self.playlist_from_matches(ertwebtv_urls, video_id, video_title, ie=ERTWebtvEmbedIE.ie_key()) + + tvp_urls = TVPEmbedIE._extract_urls(webpage) + if tvp_urls: + return self.playlist_from_matches(tvp_urls, video_id, video_title, ie=TVPEmbedIE.ie_key()) + + # Look for MainStreaming embeds + mainstreaming_urls = MainStreamingIE._extract_urls(webpage) + if mainstreaming_urls: + return self.playlist_from_matches(mainstreaming_urls, video_id, video_title, ie=MainStreamingIE.ie_key()) + + # Look for Gfycat Embeds + gfycat_urls = GfycatIE._extract_urls(webpage) + if gfycat_urls: + return self.playlist_from_matches(gfycat_urls, video_id, video_title, ie=GfycatIE.ie_key()) + + panopto_urls = PanoptoBaseIE._extract_urls(webpage) + if panopto_urls: + return self.playlist_from_matches(panopto_urls, video_id, video_title) + + # Look for Ruutu embeds + ruutu_url = RuutuIE._extract_url(webpage) + if ruutu_url: + return self.url_result(ruutu_url, RuutuIE) + # Look for HTML5 media entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') if entries: + self.report_detected('HTML5 media') if len(entries) == 1: entries[0].update({ 'id': video_id, @@ -3503,9 +3783,18 @@ class GenericIE(InfoExtractor): jwplayer_data = self._find_jwplayer_data( webpage, video_id, transform_source=js_to_json) if jwplayer_data: + if isinstance(jwplayer_data.get('playlist'), str): + self.report_detected('JW Player playlist') + return { + **info_dict, + '_type': 'url', + 'ie_key': JWPlatformIE.ie_key(), + 'url': jwplayer_data['playlist'], + } try: info = self._parse_jwplayer_data( jwplayer_data, video_id, require_title=False, base_url=url) + self.report_detected('JW Player data') return merge_dicts(info, info_dict) except ExtractorError: # See https://github.com/ytdl-org/youtube-dl/pull/16735 @@ -3513,11 +3802,12 @@ class GenericIE(InfoExtractor): # Video.js embed mobj = re.search( - r'(?s)\bvideojs\s*\(.+?\.src\s*\(\s*((?:\[.+?\]|{.+?}))\s*\)\s*;', + r'(?s)\bvideojs\s*\(.+?([a-zA-Z0-9_$]+)\.src\s*\(\s*((?:\[.+?\]|{.+?}))\s*\)\s*;', webpage) if mobj is not None: + varname = mobj.group(1) sources = self._parse_json( - mobj.group(1), video_id, transform_source=js_to_json, + mobj.group(2), video_id, transform_source=js_to_json, fatal=False) or [] if not isinstance(sources, list): sources = [sources] @@ -3554,16 +3844,40 @@ class GenericIE(InfoExtractor): 'Referer': full_response.geturl(), }, }) + # https://docs.videojs.com/player#addRemoteTextTrack + # https://html.spec.whatwg.org/multipage/media.html#htmltrackelement + for sub_match in re.finditer(rf'(?s){re.escape(varname)}' r'\.addRemoteTextTrack\(({.+?})\s*,\s*(?:true|false)\)', webpage): + sub = self._parse_json( + sub_match.group(1), video_id, transform_source=js_to_json, fatal=False) or {} + src = str_or_none(sub.get('src')) + if not src: + continue + subtitles.setdefault(dict_get(sub, ('language', 'srclang')) or 'und', []).append({ + 'url': compat_urlparse.urljoin(url, src), + 'name': sub.get('label'), + 'http_headers': { + 
'Referer': full_response.geturl(), + }, + }) if formats or subtitles: + self.report_detected('video.js embed') self._sort_formats(formats) info_dict['formats'] = formats info_dict['subtitles'] = subtitles return info_dict # Looking for http://schema.org/VideoObject - json_ld = self._search_json_ld( - webpage, video_id, default={}, expected_type='VideoObject') - if json_ld.get('url'): + json_ld = self._search_json_ld(webpage, video_id, default={}) + if json_ld.get('url') not in (url, None): + self.report_detected('JSON LD') + if determine_ext(json_ld['url']) == 'm3u8': + json_ld['formats'], json_ld['subtitles'] = self._extract_m3u8_formats_and_subtitles( + json_ld['url'], video_id, 'mp4') + json_ld.pop('url') + self._sort_formats(json_ld['formats']) + else: + json_ld['_type'] = 'url_transparent' + json_ld['url'] = smuggle_url(json_ld['url'], {'force_videoid': video_id, 'to_generic': True}) return merge_dicts(json_ld, info_dict) def check_video(vurl): @@ -3572,15 +3886,17 @@ if RtmpIE.suitable(vurl): return True vpath = compat_urlparse.urlparse(vurl).path - vext = determine_ext(vpath) - return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js', 'xml') + vext = determine_ext(vpath, None) + return vext not in (None, 'swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js', 'xml') def filter_video(urls): return list(filter(check_video, urls)) # Start with something easy: JW Player in SWFObject found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)) - if not found: + if found: + self.report_detected('JW Player in SWFObject') + else: # Look for gorilla-vid style embedding found = filter_video(re.findall(r'''(?sx) (?: @@ -3590,10 +3906,13 @@ ) .*? ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage)) + if found: + self.report_detected('JW Player embed') if not found: # Look for generic KVS player - found = re.search(r'<script [^>]*?src="https://.+?/kt_player\.js\?v=(?P<ver>(?P<maj_ver>\d+)(\.\d+)+)".*?>', webpage) + found = re.search(r'<script [^>]*?src="https?://.+?/kt_player\.js\?v=(?P<ver>(?P<maj_ver>\d+)(\.\d+)+)".*?>', webpage) + if found: + self.report_detected('KVS Player') if found.group('maj_ver') not in ['4', '5']: self.report_warning('Untested major version (%s) in player engine--Download may fail.' 
% found.group('ver')) flashvars = re.search(r'(?ms)<script.*?>.*?var\s+flashvars\s*=\s*(\{.*?\});.*?</script>', webpage) @@ -3613,20 +3932,21 @@ class GenericIE(InfoExtractor): protocol, _, _ = url.partition('/') thumbnail = protocol + thumbnail + url_keys = list(filter(re.compile(r'video_url|video_alt_url\d*').fullmatch, flashvars.keys())) formats = [] - for key in ('video_url', 'video_alt_url', 'video_alt_url2'): - if key in flashvars and '/get_file/' in flashvars[key]: - next_format = { - 'url': self._kvs_getrealurl(flashvars[key], flashvars['license_code']), - 'format_id': flashvars.get(key + '_text', key), - 'ext': 'mp4', - } - height = re.search(r'%s_(\d+)p\.mp4(?:/[?].*)?$' % flashvars['video_id'], flashvars[key]) - if height: - next_format['height'] = int(height.group(1)) - else: - next_format['quality'] = 1 - formats.append(next_format) + for key in url_keys: + if '/get_file/' not in flashvars[key]: + continue + format_id = flashvars.get(f'{key}_text', key) + formats.append({ + 'url': self._kvs_getrealurl(flashvars[key], flashvars['license_code']), + 'format_id': format_id, + 'ext': 'mp4', + **(parse_resolution(format_id) or parse_resolution(flashvars[key])) + }) + if not formats[-1].get('height'): + formats[-1]['quality'] = 1 + self._sort_formats(formats) return { @@ -3639,10 +3959,14 @@ class GenericIE(InfoExtractor): if not found: # Broaden the search a little bit found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)) + if found: + self.report_detected('video file') if not found: # Broaden the findall a little bit: JWPlayer JS loader found = filter_video(re.findall( r'[^A-Za-z0-9]?(?:file|video_url)["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage)) + if found: + self.report_detected('JW Player JS loader') if not found: # Flow player found = filter_video(re.findall(r'''(?xs) @@ -3651,10 +3975,14 @@ class GenericIE(InfoExtractor): \s*\{[^}]+? 
["']?clip["']?\s*:\s*\{\s* ["']?url["']?\s*:\s*["']([^"']+)["'] ''', webpage)) + if found: + self.report_detected('Flow Player') if not found: # Cinerama player found = re.findall( r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage) + if found: + self.report_detected('Cinerama player') if not found: # Try to find twitter cards info # twitter:player:stream should be checked before twitter:player since @@ -3662,6 +3990,8 @@ class GenericIE(InfoExtractor): # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser) found = filter_video(re.findall( r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)) + if found: + self.report_detected('Twitter card') if not found: # We look for Open Graph info: # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am) @@ -3669,6 +3999,8 @@ class GenericIE(InfoExtractor): # We only look in og:video if the MIME type is a video, don't try if it's a Flash player: if m_video_type is not None: found = filter_video(re.findall(r'<meta.*?property="og:(?:video|audio)".*?content="(.*?)"', webpage)) + if found: + self.report_detected('Open Graph video info') if not found: REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)' found = re.search( @@ -3700,6 +4032,7 @@ class GenericIE(InfoExtractor): # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser) embed_url = self._html_search_meta('twitter:player', webpage, default=None) if embed_url and embed_url != url: + self.report_detected('twitter:player iframe') return self.url_result(embed_url) if not found: @@ -3719,12 +4052,16 @@ class GenericIE(InfoExtractor): # here's a fun little line of code for you: video_id = os.path.splitext(video_id)[0] + headers = { + 'referer': full_response.geturl() + } entry_info_dict = { 'id': video_id, 'uploader': video_uploader, 'title': video_title, 'age_limit': age_limit, + 'http_headers': headers, } if RtmpIE.suitable(video_url): @@ -3742,11 +4079,11 @@ class GenericIE(InfoExtractor): elif ext == 'xspf': return self.playlist_result(self._extract_xspf_playlist(video_url, video_id), video_id) elif ext == 'm3u8': - entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(video_url, video_id, ext='mp4') + entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(video_url, video_id, ext='mp4', headers=headers) elif ext == 'mpd': - entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_mpd_formats_and_subtitles(video_url, video_id) + entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_mpd_formats_and_subtitles(video_url, video_id, headers=headers) elif ext == 'f4m': - entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id) + entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id, headers=headers) elif re.search(r'(?i)\.(?:ism|smil)/manifest', video_url) and video_url != url: # Just matching .ism/manifest is not enough to be reliably sure # whether it's actually an ISM manifest or some other streaming diff --git a/hypervideo_dl/extractor/gettr.py b/hypervideo_dl/extractor/gettr.py index aa50b2f..327a4d0 100644 --- a/hypervideo_dl/extractor/gettr.py +++ b/hypervideo_dl/extractor/gettr.py @@ -3,22 +3,30 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( + bool_or_none, ExtractorError, dict_get, float_or_none, int_or_none, - remove_end, str_or_none, + 
traverse_obj, try_get, url_or_none, urljoin, ) -class GettrIE(InfoExtractor): - _VALID_URL = r'https?://(www\.)?gettr\.com/post/(?P<id>[a-z0-9]+)' +class GettrBaseIE(InfoExtractor): + _BASE_REGEX = r'https?://(www\.)?gettr\.com/' _MEDIA_BASE_URL = 'https://media.gettr.com/' + def _call_api(self, path, video_id, *args, **kwargs): + return self._download_json(urljoin('https://api.gettr.com/u/', path), video_id, *args, **kwargs)['result'] + + +class GettrIE(GettrBaseIE): + _VALID_URL = GettrBaseIE._BASE_REGEX + r'post/(?P<id>[a-z0-9]+)' + _TESTS = [{ 'url': 'https://www.gettr.com/post/pcf6uv838f', 'info_dict': { @@ -28,9 +36,11 @@ class GettrIE(InfoExtractor): 'ext': 'mp4', 'uploader': 'EpochTV', 'uploader_id': 'epochtv', + 'upload_date': '20210927', 'thumbnail': r're:^https?://.+/out\.jpg', - 'timestamp': 1632782451058, + 'timestamp': 1632782451.058, 'duration': 58.5585, + 'tags': ['hornofafrica', 'explorations'], } }, { 'url': 'https://gettr.com/post/p4iahp', @@ -41,43 +51,69 @@ class GettrIE(InfoExtractor): 'ext': 'mp4', 'uploader': 'Neues Forum Freiheit', 'uploader_id': 'nf_freiheit', + 'upload_date': '20210718', 'thumbnail': r're:^https?://.+/out\.jpg', - 'timestamp': 1626594455017, + 'timestamp': 1626594455.017, 'duration': 23, + 'tags': 'count:12', } + }, { + # quote post + 'url': 'https://gettr.com/post/pxn5b743a9', + 'only_matching': True, + }, { + # quote with video + 'url': 'https://gettr.com/post/pxtiiz5ca2', + 'only_matching': True, + }, { + # streaming embed + 'url': 'https://gettr.com/post/pxlu8p3b13', + 'only_matching': True, + }, { + # youtube embed + 'url': 'https://gettr.com/post/pv6wp9e24c', + 'only_matching': True, + 'add_ie': ['Youtube'], }] def _real_extract(self, url): post_id = self._match_id(url) webpage = self._download_webpage(url, post_id) + api_data = self._call_api('post/%s?incl="poststats|userinfo"' % post_id, post_id) + + post_data = api_data.get('data') + user_data = try_get(api_data, lambda x: x['aux']['uinf'][post_data['uid']], dict) or {} + + vid = post_data.get('vid') + ovid = post_data.get('ovid') - api_data = self._download_json( - 'https://api.gettr.com/u/post/%s?incl="poststats|userinfo"' % post_id, post_id) + if post_data.get('p_type') == 'stream': + return self.url_result(f'https://gettr.com/streaming/{post_id}', ie='GettrStreaming', video_id=post_id) - post_data = try_get(api_data, lambda x: x['result']['data']) - user_data = try_get(api_data, lambda x: x['result']['aux']['uinf'][post_data['uid']]) or {} + if not (ovid or vid): + embed_url = url_or_none(post_data.get('prevsrc')) + shared_post_id = traverse_obj(api_data, ('aux', 'shrdpst', '_id'), ('data', 'rpstIds', 0), expected_type=str) - if post_data.get('nfound'): - raise ExtractorError(post_data.get('txt'), expected=True) + if embed_url: + return self.url_result(embed_url) + elif shared_post_id: + return self.url_result(f'https://gettr.com/post/{shared_post_id}', ie='Gettr', video_id=shared_post_id) + else: + raise ExtractorError('There\'s no video in this post.') title = description = str_or_none( post_data.get('txt') or self._og_search_description(webpage)) uploader = str_or_none( user_data.get('nickname') - or remove_end(self._og_search_title(webpage), ' on GETTR')) + or self._search_regex(r'^(.+?) 
on GETTR', self._og_search_title(webpage, default=''), 'uploader', fatal=False)) + if uploader: title = '%s - %s' % (uploader, title) - if not dict_get(post_data, ['vid', 'ovid']): - raise ExtractorError('There\'s no video in this post.') - - vid = post_data.get('vid') - ovid = post_data.get('ovid') - - formats = self._extract_m3u8_formats( + formats, subtitles = self._extract_m3u8_formats_and_subtitles( urljoin(self._MEDIA_BASE_URL, vid), post_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls') if vid else [] + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) if vid else ([], {}) if ovid: formats.append({ @@ -86,8 +122,6 @@ class GettrIE(InfoExtractor): 'ext': 'mp4', 'width': int_or_none(post_data.get('vid_wid')), 'height': int_or_none(post_data.get('vid_hgt')), - 'source_preference': 1, - 'quality': 1, }) self._sort_formats(formats) @@ -96,15 +130,84 @@ class GettrIE(InfoExtractor): 'id': post_id, 'title': title, 'description': description, - 'thumbnail': url_or_none( - urljoin(self._MEDIA_BASE_URL, post_data.get('main')) - or self._og_search_thumbnail(webpage)), - 'timestamp': int_or_none(post_data.get('cdate')), + 'formats': formats, + 'subtitles': subtitles, + 'uploader': uploader, 'uploader_id': str_or_none( dict_get(user_data, ['_id', 'username']) or post_data.get('uid')), - 'uploader': uploader, - 'formats': formats, + 'thumbnail': url_or_none( + urljoin(self._MEDIA_BASE_URL, post_data.get('main')) + or self._html_search_meta(['og:image', 'image'], webpage, 'thumbnail', fatal=False)), + 'timestamp': float_or_none(dict_get(post_data, ['cdate', 'udate']), scale=1000), 'duration': float_or_none(post_data.get('vid_dur')), 'tags': post_data.get('htgs'), } + + +class GettrStreamingIE(GettrBaseIE): + _VALID_URL = GettrBaseIE._BASE_REGEX + r'streaming/(?P<id>[a-z0-9]+)' + + _TESTS = [{ + 'url': 'https://gettr.com/streaming/psoiulc122', + 'info_dict': { + 'id': 'psoiulc122', + 'ext': 'mp4', + 'description': 'md5:56bca4b8f48f1743d9fd03d49c723017', + 'view_count': int, + 'uploader': 'Corona Investigative Committee', + 'uploader_id': 'coronacommittee', + 'duration': 5180.184, + 'thumbnail': r're:^https?://.+', + 'title': 'Day 1: Opening Session of the Grand Jury Proceeding', + 'timestamp': 1644080997.164, + 'upload_date': '20220205', + } + }, { + 'url': 'https://gettr.com/streaming/psfmeefcc1', + 'info_dict': { + 'id': 'psfmeefcc1', + 'ext': 'mp4', + 'title': 'Session 90: "The Virus Of Power"', + 'view_count': int, + 'uploader_id': 'coronacommittee', + 'description': 'md5:98986acdf656aa836bf36f9c9704c65b', + 'uploader': 'Corona Investigative Committee', + 'thumbnail': r're:^https?://.+', + 'duration': 21872.507, + 'timestamp': 1643976662.858, + 'upload_date': '20220204', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + video_info = self._call_api('live/join/%s' % video_id, video_id, data={}) + + live_info = video_info['broadcast'] + live_url = url_or_none(live_info.get('url')) + + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + live_url, video_id, ext='mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) if live_url else ([], {}) + + thumbnails = [{ + 'url': urljoin(self._MEDIA_BASE_URL, thumbnail), + } for thumbnail in try_get(video_info, lambda x: x['postData']['imgs'], list) or []] + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': try_get(video_info, lambda x: x['postData']['ttl'], str), + 'description': try_get(video_info, lambda x: x['postData']['dsc'], str), + 'formats': formats, + 
'subtitles': subtitles, + 'thumbnails': thumbnails, + 'uploader': try_get(video_info, lambda x: x['liveHostInfo']['nickname'], str), + 'uploader_id': try_get(video_info, lambda x: x['liveHostInfo']['_id'], str), + 'view_count': int_or_none(live_info.get('viewsCount')), + 'timestamp': float_or_none(live_info.get('startAt'), scale=1000), + 'duration': float_or_none(live_info.get('duration'), scale=1000), + 'is_live': bool_or_none(live_info.get('isLive')), + } diff --git a/hypervideo_dl/extractor/gfycat.py b/hypervideo_dl/extractor/gfycat.py index 18a30fe..2ad03e2 100644 --- a/hypervideo_dl/extractor/gfycat.py +++ b/hypervideo_dl/extractor/gfycat.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( int_or_none, @@ -11,7 +13,7 @@ from ..utils import ( class GfycatIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www|giant|thumbs)\.)?gfycat\.com/(?:ru/|ifr/|gifs/detail/)?(?P<id>[^-/?#\.]+)' + _VALID_URL = r'(?i)https?://(?:(?:www|giant|thumbs)\.)?gfycat\.com/(?:ru/|ifr/|gifs/detail/)?(?P<id>[^-/?#\."\']+)' _TESTS = [{ 'url': 'http://gfycat.com/DeadlyDecisiveGermanpinscher', 'info_dict': { @@ -24,9 +26,10 @@ class GfycatIE(InfoExtractor): 'duration': 10.4, 'view_count': int, 'like_count': int, - 'dislike_count': int, 'categories': list, 'age_limit': 0, + 'uploader_id': 'anonymous', + 'description': '', } }, { 'url': 'http://gfycat.com/ifr/JauntyTimelyAmazontreeboa', @@ -40,9 +43,27 @@ class GfycatIE(InfoExtractor): 'duration': 3.52, 'view_count': int, 'like_count': int, - 'dislike_count': int, 'categories': list, 'age_limit': 0, + 'uploader_id': 'anonymous', + 'description': '', + } + }, { + 'url': 'https://gfycat.com/alienatedsolidgreathornedowl', + 'info_dict': { + 'id': 'alienatedsolidgreathornedowl', + 'ext': 'mp4', + 'upload_date': '20211226', + 'uploader_id': 'reactions', + 'timestamp': 1640536930, + 'like_count': int, + 'description': '', + 'title': 'Ingrid Michaelson, Zooey Deschanel - Merry Christmas Happy New Year', + 'categories': list, + 'age_limit': 0, + 'duration': 2.9583333333333335, + 'uploader': 'Reaction GIFs', + 'view_count': int, } }, { 'url': 'https://gfycat.com/ru/RemarkableDrearyAmurstarfish', @@ -59,8 +80,19 @@ class GfycatIE(InfoExtractor): }, { 'url': 'https://giant.gfycat.com/acceptablehappygoluckyharborporpoise.mp4', 'only_matching': True + }, { + 'url': 'http://gfycat.com/IFR/JauntyTimelyAmazontreeboa', + 'only_matching': True }] + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r'<(?:iframe|source)[^>]+\bsrc=["\'](?P<url>%s)' % GfycatIE._VALID_URL, + webpage)] + def _real_extract(self, url): video_id = self._match_id(url) @@ -74,7 +106,7 @@ class GfycatIE(InfoExtractor): title = gfy.get('title') or gfy['gfyName'] description = gfy.get('description') timestamp = int_or_none(gfy.get('createDate')) - uploader = gfy.get('userName') + uploader = gfy.get('userName') or gfy.get('username') view_count = int_or_none(gfy.get('views')) like_count = int_or_none(gfy.get('likes')) dislike_count = int_or_none(gfy.get('dislikes')) @@ -114,7 +146,8 @@ class GfycatIE(InfoExtractor): 'title': title, 'description': description, 'timestamp': timestamp, - 'uploader': uploader, + 'uploader': gfy.get('userDisplayName') or uploader, + 'uploader_id': uploader, 'duration': duration, 'view_count': view_count, 'like_count': like_count, diff --git a/hypervideo_dl/extractor/glide.py b/hypervideo_dl/extractor/glide.py index d94dfbf..12af859 100644 --- 
a/hypervideo_dl/extractor/glide.py +++ b/hypervideo_dl/extractor/glide.py @@ -23,9 +23,7 @@ class GlideIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - title = self._html_search_regex( - r'<title>(.+?)</title>', webpage, - 'title', default=None) or self._og_search_title(webpage) + title = self._html_extract_title(webpage, default=None) or self._og_search_title(webpage) video_url = self._proto_relative_url(self._search_regex( r'<source[^>]+src=(["\'])(?P<url>.+?)\1', webpage, 'video URL', default=None, diff --git a/hypervideo_dl/extractor/globo.py b/hypervideo_dl/extractor/globo.py index a3f0241..f6aaae1 100644 --- a/hypervideo_dl/extractor/globo.py +++ b/hypervideo_dl/extractor/globo.py @@ -12,6 +12,7 @@ from ..compat import ( compat_str, ) from ..utils import ( + HEADRequest, ExtractorError, float_or_none, orderedSet, @@ -67,11 +68,28 @@ class GloboIE(InfoExtractor): }, { 'url': 'globo:3607726', 'only_matching': True, + }, { + 'url': 'https://globoplay.globo.com/v/10248083/', + 'info_dict': { + 'id': '10248083', + 'ext': 'mp4', + 'title': 'Melhores momentos: Equador 1 x 1 Brasil pelas Eliminatórias da Copa do Mundo 2022', + 'duration': 530.964, + 'uploader': 'SporTV', + 'uploader_id': '698', + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): video_id = self._match_id(url) + self._request_webpage( + HEADRequest('https://globo-ab.globo.com/v2/selected-alternatives?experiments=player-isolated-experiment-02&skipImpressions=true'), + video_id, 'Getting cookies') + video = self._download_json( 'http://api.globovideos.com/videos/%s/playlist' % video_id, video_id)['videos'][0] @@ -82,7 +100,7 @@ class GloboIE(InfoExtractor): formats = [] security = self._download_json( - 'https://playback.video.globo.com/v1/video-session', video_id, 'Downloading security hash for %s' % video_id, + 'https://playback.video.globo.com/v2/video-session', video_id, 'Downloading security hash for %s' % video_id, headers={'content-type': 'application/json'}, data=json.dumps({ "player_type": "desktop", "video_id": video_id, @@ -92,7 +110,9 @@ class GloboIE(InfoExtractor): "tz": "-3.0:00" }).encode()) - security_hash = security['source']['token'] + self._request_webpage(HEADRequest(security['sources'][0]['url_template']), video_id, 'Getting locksession cookie') + + security_hash = security['sources'][0]['token'] if not security_hash: message = security.get('message') if message: @@ -115,15 +135,15 @@ class GloboIE(InfoExtractor): md5_data = (received_md5 + padded_sign_time + '0xAC10FD').encode() signed_md5 = base64.urlsafe_b64encode(hashlib.md5(md5_data).digest()).decode().strip('=') signed_hash = hash_prefix + padded_sign_time + signed_md5 - source = security['source']['url_parts'] + source = security['sources'][0]['url_parts'] resource_url = source['scheme'] + '://' + source['domain'] + source['path'] signed_url = '%s?h=%s&k=html5&a=%s' % (resource_url, signed_hash, 'F' if video.get('subscriber_only') else 'A') - formats.extend(self._extract_m3u8_formats( - signed_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + fmts, subtitles = self._extract_m3u8_formats_and_subtitles( + signed_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) + formats.extend(fmts) self._sort_formats(formats) - subtitles = {} for resource in video['resources']: if resource.get('type') == 'subtitle': subtitles.setdefault(resource.get('language') or 'por', []).append({ @@ -166,6 +186,7 @@ class GloboArticleIE(InfoExtractor): 
r'\bvideosIDs\s*:\s*["\']?(\d{7,})', r'\bdata-id=["\'](\d{7,})', r'<div[^>]+\bid=["\'](\d{7,})', + r'<bs-player[^>]+\bvideoid=["\'](\d{8,})', ] _TESTS = [{ @@ -193,6 +214,14 @@ class GloboArticleIE(InfoExtractor): }, { 'url': 'http://oglobo.globo.com/rio/a-amizade-entre-um-entregador-de-farmacia-um-piano-19946271', 'only_matching': True, + }, { + 'url': 'https://ge.globo.com/video/ta-na-area-como-foi-assistir-ao-jogo-do-palmeiras-que-a-globo-nao-passou-10287094.ghtml', + 'info_dict': { + 'id': 'ta-na-area-como-foi-assistir-ao-jogo-do-palmeiras-que-a-globo-nao-passou-10287094', + 'title': 'Tá na Área: como foi assistir ao jogo do Palmeiras que a Globo não passou', + 'description': 'md5:2d089d036c4c9675117d3a56f8c61739', + }, + 'playlist_count': 1, }] @classmethod @@ -208,6 +237,6 @@ class GloboArticleIE(InfoExtractor): entries = [ self.url_result('globo:%s' % video_id, GloboIE.ie_key()) for video_id in orderedSet(video_ids)] - title = self._og_search_title(webpage, fatal=False) + title = self._og_search_title(webpage) description = self._html_search_meta('description', webpage) return self.playlist_result(entries, display_id, title, description) diff --git a/hypervideo_dl/extractor/glomex.py b/hypervideo_dl/extractor/glomex.py new file mode 100644 index 0000000..d9ef433 --- /dev/null +++ b/hypervideo_dl/extractor/glomex.py @@ -0,0 +1,220 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import urllib.parse + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + extract_attributes, + ExtractorError, + int_or_none, + parse_qs, + smuggle_url, + unescapeHTML, + unsmuggle_url, +) + + +class GlomexBaseIE(InfoExtractor): + _DEFAULT_ORIGIN_URL = 'https://player.glomex.com/' + _API_URL = 'https://integration-cloudfront-eu-west-1.mes.glomex.cloud/' + + @staticmethod + def _smuggle_origin_url(url, origin_url): + if origin_url is None: + return url + return smuggle_url(url, {'origin': origin_url}) + + @classmethod + def _unsmuggle_origin_url(cls, url, fallback_origin_url=None): + defaults = {'origin': fallback_origin_url or cls._DEFAULT_ORIGIN_URL} + unsmuggled_url, data = unsmuggle_url(url, default=defaults) + return unsmuggled_url, data['origin'] + + def _get_videoid_type(self, video_id): + _VIDEOID_TYPES = { + 'v': 'video', + 'pl': 'playlist', + 'rl': 'related videos playlist', + 'cl': 'curated playlist', + } + prefix = video_id.split('-')[0] + return _VIDEOID_TYPES.get(prefix, 'unknown type') + + def _download_api_data(self, video_id, integration, current_url=None): + query = { + 'integration_id': integration, + 'playlist_id': video_id, + 'current_url': current_url or self._DEFAULT_ORIGIN_URL, + } + video_id_type = self._get_videoid_type(video_id) + return self._download_json( + self._API_URL, + video_id, 'Downloading %s JSON' % video_id_type, + 'Unable to download %s JSON' % video_id_type, + query=query) + + def _download_and_extract_api_data(self, video_id, integration, current_url): + api_data = self._download_api_data(video_id, integration, current_url) + videos = api_data['videos'] + if not videos: + raise ExtractorError('no videos found for %s' % video_id) + videos = [self._extract_api_data(video, video_id) for video in videos] + return videos[0] if len(videos) == 1 else self.playlist_result(videos, video_id) + + def _extract_api_data(self, video, video_id): + if video.get('error_code') == 'contentGeoblocked': + self.raise_geo_restricted(countries=video['geo_locations']) + + formats, subs = [], {} + for format_id, format_url in 
video['source'].items(): + ext = determine_ext(format_url) + if ext == 'm3u8': + formats_, subs_ = self._extract_m3u8_formats_and_subtitles( + format_url, video_id, 'mp4', m3u8_id=format_id, + fatal=False) + formats.extend(formats_) + self._merge_subtitles(subs_, target=subs) + else: + formats.append({ + 'url': format_url, + 'format_id': format_id, + }) + if video.get('language'): + for fmt in formats: + fmt['language'] = video['language'] + self._sort_formats(formats) + + images = (video.get('images') or []) + [video.get('image') or {}] + thumbnails = [{ + 'id': image.get('id'), + 'url': f'{image["url"]}/profile:player-960x540', + 'width': 960, + 'height': 540, + } for image in images if image.get('url')] + self._remove_duplicate_formats(thumbnails) + + return { + 'id': video.get('clip_id') or video_id, + 'title': video.get('title'), + 'description': video.get('description'), + 'thumbnails': thumbnails, + 'duration': int_or_none(video.get('clip_duration')), + 'timestamp': video.get('created_at'), + 'formats': formats, + 'subtitles': subs, + } + + +class GlomexIE(GlomexBaseIE): + IE_NAME = 'glomex' + IE_DESC = 'Glomex videos' + _VALID_URL = r'https?://video\.glomex\.com/[^/]+/(?P<id>v-[^-]+)' + _INTEGRATION_ID = '19syy24xjn1oqlpc' + + _TESTS = [{ + 'url': 'https://video.glomex.com/sport/v-cb24uwg77hgh-nach-2-0-sieg-guardiola-mit-mancity-vor-naechstem-titel', + 'md5': 'cec33a943c4240c9cb33abea8c26242e', + 'info_dict': { + 'id': 'v-cb24uwg77hgh', + 'ext': 'mp4', + 'title': 'md5:38a90cedcfadd72982c81acf13556e0c', + 'description': 'md5:1ea6b6caff1443fcbbba159e432eedb8', + 'duration': 29600, + 'timestamp': 1619895017, + 'upload_date': '20210501', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result( + GlomexEmbedIE.build_player_url(video_id, self._INTEGRATION_ID, url), + GlomexEmbedIE.ie_key(), video_id) + + +class GlomexEmbedIE(GlomexBaseIE): + IE_NAME = 'glomex:embed' + IE_DESC = 'Glomex embedded videos' + _BASE_PLAYER_URL = '//player.glomex.com/integration/1/iframe-player.html' + _BASE_PLAYER_URL_RE = re.escape(_BASE_PLAYER_URL).replace('/1/', r'/[^/]/') + _VALID_URL = rf'https?:{_BASE_PLAYER_URL_RE}\?([^#]+&)?playlistId=(?P<id>[^#&]+)' + + _TESTS = [{ + 'url': 'https://player.glomex.com/integration/1/iframe-player.html?integrationId=4059a013k56vb2yd&playlistId=v-cfa6lye0dkdd-sf', + 'md5': '68f259b98cc01918ac34180142fce287', + 'info_dict': { + 'id': 'v-cfa6lye0dkdd-sf', + 'ext': 'mp4', + 'timestamp': 1635337199, + 'duration': 133080, + 'upload_date': '20211027', + 'description': 'md5:e741185fc309310ff5d0c789b437be66', + 'title': 'md5:35647293513a6c92363817a0fb0a7961', + }, + }, { + 'url': 'https://player.glomex.com/integration/1/iframe-player.html?origin=fullpage&integrationId=19syy24xjn1oqlpc&playlistId=rl-vcb49w1fb592p&playlistIndex=0', + 'info_dict': { + 'id': 'rl-vcb49w1fb592p', + }, + 'playlist_count': 100, + }, { + 'url': 'https://player.glomex.com/integration/1/iframe-player.html?playlistId=cl-bgqaata6aw8x&integrationId=19syy24xjn1oqlpc', + 'info_dict': { + 'id': 'cl-bgqaata6aw8x', + }, + 'playlist_mincount': 2, + }] + + @classmethod + def build_player_url(cls, video_id, integration, origin_url=None): + query_string = urllib.parse.urlencode({ + 'playlistId': video_id, + 'integrationId': integration, + }) + return cls._smuggle_origin_url(f'https:{cls._BASE_PLAYER_URL}?{query_string}', origin_url) + + @classmethod + def _extract_urls(cls, webpage, origin_url): + # https://docs.glomex.com/publisher/video-player-integration/javascript-api/ 
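+ # Three embed shapes are matched in turn below: an <iframe> whose src points at the iframe player, a <glomex-player> (or data-glomex-player div) carrying data-integration-id/data-playlist-id attributes, and inline scripts that hard-code integrationId/playlistId values.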
+ quot_re = r'["\']' + + regex = fr'''(?x) + <iframe[^>]+?src=(?P<q>{quot_re})(?P<url> + (?:https?:)?{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=q)).)+ + )(?P=q)''' + for mobj in re.finditer(regex, webpage): + url = unescapeHTML(mobj.group('url')) + if cls.suitable(url): + yield cls._smuggle_origin_url(url, origin_url) + + regex = fr'''(?x) + <glomex-player [^>]+?>| + <div[^>]* data-glomex-player=(?P<q>{quot_re})true(?P=q)[^>]*>''' + for mobj in re.finditer(regex, webpage): + attrs = extract_attributes(mobj.group(0)) + if attrs.get('data-integration-id') and attrs.get('data-playlist-id'): + yield cls.build_player_url(attrs['data-playlist-id'], attrs['data-integration-id'], origin_url) + + # naive parsing of inline scripts for hard-coded integration parameters + regex = fr'''(?x) + (?P<is_js>dataset\.)?%s\s*(?(is_js)=|:)\s* + (?P<q>{quot_re})(?P<id>(?:(?!(?P=q)).)+)(?P=q)\s''' + for mobj in re.finditer(r'(?x)<script[^<]*>.+?</script>', webpage): + script = mobj.group(0) + integration_id = re.search(regex % 'integrationId', script) + if not integration_id: + continue + playlist_id = re.search(regex % 'playlistId', script) + if playlist_id: + yield cls.build_player_url(playlist_id, integration_id, origin_url) + + def _real_extract(self, url): + url, origin_url = self._unsmuggle_origin_url(url) + playlist_id = self._match_id(url) + integration = parse_qs(url).get('integrationId', [None])[0] + if not integration: + raise ExtractorError('No integrationId in URL', expected=True) + return self._download_and_extract_api_data(playlist_id, integration, origin_url) diff --git a/hypervideo_dl/extractor/go.py b/hypervideo_dl/extractor/go.py index 2ccc6df..f92e166 100644 --- a/hypervideo_dl/extractor/go.py +++ b/hypervideo_dl/extractor/go.py @@ -217,6 +217,7 @@ class GoIE(AdobePassIE): title = video_data['title'] formats = [] + subtitles = {} for asset in video_data.get('assets', {}).get('asset', []): asset_url = asset.get('value') if not asset_url: @@ -256,8 +257,10 @@ class GoIE(AdobePassIE): error_message = ', '.join([error['message'] for error in errors]) raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message), expected=True) asset_url += '?' 
+ entitlement['uplynkData']['sessionKey'] - formats.extend(self._extract_m3u8_formats( - asset_url, video_id, 'mp4', m3u8_id=format_id or 'hls', fatal=False)) + fmts, subs = self._extract_m3u8_formats_and_subtitles( + asset_url, video_id, 'mp4', m3u8_id=format_id or 'hls', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) else: f = { 'format_id': format_id, @@ -281,7 +284,6 @@ class GoIE(AdobePassIE): formats.append(f) self._sort_formats(formats) - subtitles = {} for cc in video_data.get('closedcaption', {}).get('src', []): cc_url = cc.get('value') if not cc_url: diff --git a/hypervideo_dl/extractor/gofile.py b/hypervideo_dl/extractor/gofile.py new file mode 100644 index 0000000..62d778c --- /dev/null +++ b/hypervideo_dl/extractor/gofile.py @@ -0,0 +1,83 @@ +# coding: utf-8 +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + try_get +) + + +class GofileIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?gofile\.io/d/(?P<id>[^/]+)' + _TESTS = [{ + 'url': 'https://gofile.io/d/AMZyDw', + 'info_dict': { + 'id': 'AMZyDw', + }, + 'playlist_mincount': 2, + 'playlist': [{ + 'info_dict': { + 'id': 'de571ac1-5edc-42e2-8ec2-bdac83ad4a31', + 'filesize': 928116, + 'ext': 'mp4', + 'title': 'nuuh' + } + }] + }, { # URL to test mixed file types + 'url': 'https://gofile.io/d/avt34h', + 'info_dict': { + 'id': 'avt34h', + }, + 'playlist_mincount': 1, + }, { # URL to test no video/audio error + 'url': 'https://gofile.io/d/aB03lZ', + 'info_dict': { + 'id': 'aB03lZ', + }, + 'playlist_count': 0, + 'skip': 'No video/audio found at provided URL.', + }] + _TOKEN = None + + def _real_initialize(self): + token = self._get_cookies('https://gofile.io/').get('accountToken') + if token: + self._TOKEN = token.value + return + + account_data = self._download_json( + 'https://api.gofile.io/createAccount', None, note='Getting a new guest account') + self._TOKEN = account_data['data']['token'] + self._set_cookie('gofile.io', 'accountToken', self._TOKEN) + + def _entries(self, file_id): + files = self._download_json( + f'https://api.gofile.io/getContent?contentId={file_id}&token={self._TOKEN}&websiteToken=websiteToken&cache=true', + 'Gofile', note='Getting filelist') + + status = files['status'] + if status != 'ok': + raise ExtractorError(f'{self.IE_NAME} said: status {status}', expected=True) + + found_files = False + for file in (try_get(files, lambda x: x['data']['contents'], dict) or {}).values(): + file_type, file_format = file.get('mimetype').split('/', 1) + if file_type not in ('video', 'audio') and file_format != 'vnd.mts': + continue + + found_files = True + file_url = file.get('directLink') + if file_url: + yield { + 'id': file['id'], + 'title': file['name'].rsplit('.', 1)[0], + 'url': file_url, + 'filesize': file.get('size'), + 'release_timestamp': file.get('createTime') + } + + if not found_files: + raise ExtractorError('No video/audio found at provided URL.', expected=True) + + def _real_extract(self, url): + file_id = self._match_id(url) + return self.playlist_result(self._entries(file_id), playlist_id=file_id) diff --git a/hypervideo_dl/extractor/googlesearch.py b/hypervideo_dl/extractor/googlesearch.py index f605c0c..4b8b1bc 100644 --- a/hypervideo_dl/extractor/googlesearch.py +++ b/hypervideo_dl/extractor/googlesearch.py @@ -8,36 +8,33 @@ from .common import SearchInfoExtractor class GoogleSearchIE(SearchInfoExtractor): IE_DESC = 'Google Video search' - _MAX_RESULTS = 1000 IE_NAME = 'video.google:search' _SEARCH_KEY = 'gvsearch' - _WORKING = False 
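
The new GofileIE above bootstraps a guest account once, stores its token as a cookie, and then queries the content listing with that token. A rough standalone sketch of the same handshake, assuming the two api.gofile.io endpoints behave exactly as the extractor uses them (the content ID is taken from the test above):

    import requests

    token = requests.get('https://api.gofile.io/createAccount').json()['data']['token']
    listing = requests.get(
        'https://api.gofile.io/getContent',
        params={'contentId': 'AMZyDw', 'token': token,
                'websiteToken': 'websiteToken', 'cache': 'true'}).json()
    for item in (listing.get('data', {}).get('contents') or {}).values():
        # same video/audio filter as GofileIE._entries
        if (item.get('mimetype') or '').split('/')[0] in ('video', 'audio'):
            print(item['name'], item.get('directLink'))
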
- _TEST = { + _TESTS = [{ 'url': 'gvsearch15:python language', 'info_dict': { 'id': 'python language', 'title': 'python language', }, 'playlist_count': 15, - } + }] + _PAGE_SIZE = 100 def _search_results(self, query): for pagenum in itertools.count(): webpage = self._download_webpage( - 'http://www.google.com/search', - 'gvsearch:' + query, - note='Downloading result page %s' % (pagenum + 1), + 'http://www.google.com/search', f'gvsearch:{query}', + note=f'Downloading result page {pagenum + 1}', query={ 'tbm': 'vid', 'q': query, - 'start': pagenum * 10, + 'start': pagenum * self._PAGE_SIZE, + 'num': self._PAGE_SIZE, 'hl': 'en', }) - for hit_idx, mobj in enumerate(re.finditer( - r'<h3 class="r"><a href="([^"]+)"', webpage)): - if re.search(f'id="vidthumb{hit_idx + 1}"', webpage): - yield self.url_result(mobj.group(1)) + for url in re.findall(r'<div[^>]* class="dXiKIc"[^>]*><a href="([^"]+)"', webpage): + yield self.url_result(url) if not re.search(r'id="pnnext"', webpage): return diff --git a/hypervideo_dl/extractor/gronkh.py b/hypervideo_dl/extractor/gronkh.py index a7792a5..c9f1dd2 100644 --- a/hypervideo_dl/extractor/gronkh.py +++ b/hypervideo_dl/extractor/gronkh.py @@ -6,7 +6,7 @@ from ..utils import unified_strdate class GronkhIE(InfoExtractor): - _VALID_URL = r'(?:https?://)(?:www\.)?gronkh\.tv/stream/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?gronkh\.tv/(?:watch/)?stream/(?P<id>\d+)' _TESTS = [{ 'url': 'https://gronkh.tv/stream/536', @@ -19,6 +19,9 @@ class GronkhIE(InfoExtractor): 'upload_date': '20211001' }, 'params': {'skip_download': True} + }, { + 'url': 'https://gronkh.tv/watch/stream/546', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/hypervideo_dl/extractor/hellporno.py b/hypervideo_dl/extractor/hellporno.py index fae4251..92d32cd 100644 --- a/hypervideo_dl/extractor/hellporno.py +++ b/hypervideo_dl/extractor/hellporno.py @@ -38,8 +38,7 @@ class HellPornoIE(InfoExtractor): webpage = self._download_webpage(url, display_id) - title = remove_end(self._html_search_regex( - r'<title>([^<]+)</title>', webpage, 'title'), ' - Hell Porno') + title = remove_end(self._html_extract_title(webpage), ' - Hell Porno') info = self._parse_html5_media_entries(url, webpage, display_id)[0] self._sort_formats(info['formats']) diff --git a/hypervideo_dl/extractor/hidive.py b/hypervideo_dl/extractor/hidive.py index 15bd444..46d7d62 100644 --- a/hypervideo_dl/extractor/hidive.py +++ b/hypervideo_dl/extractor/hidive.py @@ -35,18 +35,14 @@ class HiDiveIE(InfoExtractor): 'skip': 'Requires Authentication', }] - def _real_initialize(self): - email, password = self._get_login_info() - if email is None: - return - + def _perform_login(self, username, password): webpage = self._download_webpage(self._LOGIN_URL, None) form = self._search_regex( r'(?s)<form[^>]+action="/account/login"[^>]*>(.+?)</form>', webpage, 'login form') data = self._hidden_inputs(form) data.update({ - 'Email': email, + 'Email': username, 'Password': password, }) self._download_webpage( diff --git a/hypervideo_dl/extractor/hitbox.py b/hypervideo_dl/extractor/hitbox.py index 3e5ff26..0470d0a 100644 --- a/hypervideo_dl/extractor/hitbox.py +++ b/hypervideo_dl/extractor/hitbox.py @@ -209,6 +209,6 @@ class HitboxLiveIE(HitboxIE): 'https://www.smashcast.tv/api/media/live', video_id) metadata['formats'] = formats metadata['is_live'] = True - metadata['title'] = self._live_title(metadata.get('title')) + metadata['title'] = metadata.get('title') return metadata diff --git a/hypervideo_dl/extractor/hotstar.py 
b/hypervideo_dl/extractor/hotstar.py
index 74e2728..d55a79b 100644
--- a/hypervideo_dl/extractor/hotstar.py
+++ b/hypervideo_dl/extractor/hotstar.py
@@ -203,6 +203,9 @@ class HotStarIE(HotStarBaseIE):
                 format_url = re.sub(
                     r'(?<=//staragvod)(\d)', r'web\1', format_url)
                 tags = str_or_none(playback_set.get('tagsCombination')) or ''
+                ignored_res, ignored_vcodec, ignored_dr = self._configuration_arg('res'), self._configuration_arg('vcodec'), self._configuration_arg('dr')
+                if any(f'resolution:{ig_res}' in tags for ig_res in ignored_res) or any(f'video_codec:{ig_vc}' in tags for ig_vc in ignored_vcodec) or any(f'dynamic_range:{ig_dr}' in tags for ig_dr in ignored_dr):
+                    continue
                 ext = determine_ext(format_url)
                 current_formats, current_subs = [], {}
                 try:
@@ -230,6 +233,11 @@ class HotStarIE(HotStarBaseIE):
                 if tags and 'encryption:plain' not in tags:
                     for f in current_formats:
                         f['has_drm'] = True
+                if tags and 'language' in tags:
+                    lang = re.search(r'language:(?P<lang>[a-z]+)', tags).group('lang')
+                    for f in current_formats:
+                        if not f.get('language'):
+                            f['language'] = lang
                 formats.extend(current_formats)
                 subs = self._merge_subtitles(subs, current_subs)
             if not formats and geo_restricted:
@@ -291,7 +299,7 @@ class HotStarPlaylistIE(HotStarBaseIE):
 
 class HotStarSeriesIE(HotStarBaseIE):
     IE_NAME = 'hotstar:series'
-    _VALID_URL = r'(?P<url>(?:https?://)(?:www\.)?hotstar\.com(?:/in)?/tv/[^/]+/(?P<id>\d+))'
+    _VALID_URL = r'(?P<url>https?://(?:www\.)?hotstar\.com(?:/in)?/tv/[^/]+/(?P<id>\d+))'
     _TESTS = [{
         'url': 'https://www.hotstar.com/in/tv/radhakrishn/1260000646',
         'info_dict': {
diff --git a/hypervideo_dl/extractor/hrfensehen.py b/hypervideo_dl/extractor/hrfensehen.py
index 2a994d4..e39ded2 100644
--- a/hypervideo_dl/extractor/hrfensehen.py
+++ b/hypervideo_dl/extractor/hrfensehen.py
@@ -26,13 +26,7 @@ class HRFernsehenIE(InfoExtractor):
         }]},
         'timestamp': 1598470200,
         'upload_date': '20200826',
-        'thumbnails': [{
-            'url': 'https://www.hessenschau.de/tv-sendung/hs_ganz-1554~_t-1598465545029_v-16to9.jpg',
-            'id': '0'
-        }, {
-            'url': 'https://www.hessenschau.de/tv-sendung/hs_ganz-1554~_t-1598465545029_v-16to9__medium.jpg',
-            'id': '1'
-        }],
+        'thumbnail': 'https://www.hessenschau.de/tv-sendung/hs_ganz-1554~_t-1598465545029_v-16to9__medium.jpg',
         'title': 'hessenschau vom 26.08.2020'
     }
 }, {
@@ -81,7 +75,7 @@ class HRFernsehenIE(InfoExtractor):
         description = self._html_search_meta(
             ['description'], webpage)
 
-        loader_str = unescapeHTML(self._search_regex(r"data-hr-mediaplayer-loader='([^']*)'", webpage, "ardloader"))
+        loader_str = unescapeHTML(self._search_regex(r"data-new-hr-mediaplayer-loader='([^']*)'", webpage, "ardloader"))
         loader_data = json.loads(loader_str)
 
         info = {
diff --git a/hypervideo_dl/extractor/hrti.py b/hypervideo_dl/extractor/hrti.py
index dc5b967..36d6007 100644
--- a/hypervideo_dl/extractor/hrti.py
+++ b/hypervideo_dl/extractor/hrti.py
@@ -27,8 +27,9 @@ class HRTiBaseIE(InfoExtractor):
     _APP_VERSION = '1.1'
     _APP_PUBLICATION_ID = 'all_in_one'
     _API_URL = 'http://clientapi.hrt.hr/client_api.php/config/identify/format/json'
+    _token = None
 
-    def _initialize_api(self):
+    def _initialize_pre_login(self):
         init_data = {
             'application_publication_id': self._APP_PUBLICATION_ID
         }
@@ -64,12 +65,7 @@ class HRTiBaseIE(InfoExtractor):
 
         self._logout_url = modules['user']['resources']['logout']['uri']
 
-    def _login(self):
-        username, password = self._get_login_info()
-        # TODO: figure out authentication with cookies
-        if username is None or password is None:
-            self.raise_login_required()
-
+    def 
_perform_login(self, username, password): auth_data = { 'username': username, 'password': password, @@ -94,8 +90,9 @@ class HRTiBaseIE(InfoExtractor): self._token = auth_info['secure_streaming_token'] def _real_initialize(self): - self._initialize_api() - self._login() + if not self._token: + # TODO: figure out authentication with cookies + self.raise_login_required(method='password') class HRTiIE(HRTiBaseIE): diff --git a/hypervideo_dl/extractor/hse.py b/hypervideo_dl/extractor/hse.py new file mode 100644 index 0000000..9144ff8 --- /dev/null +++ b/hypervideo_dl/extractor/hse.py @@ -0,0 +1,95 @@ +# coding: utf-8 +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + traverse_obj, + unified_timestamp, +) + + +class HSEShowBaseInfoExtractor(InfoExtractor): + _GEO_COUNTRIES = ['DE'] + + def _extract_redux_data(self, url, video_id): + webpage = self._download_webpage(url, video_id) + redux = self._html_search_regex( + r'window\.__REDUX_DATA__\s*=\s*({.*});?', webpage, 'redux data') + return self._parse_json(redux.replace('\n', ''), video_id) + + def _extract_formats_and_subtitles(self, sources, video_id): + if not sources: + raise ExtractorError('No video found', expected=True, video_id=video_id) + formats, subtitles = [], {} + for src in sources: + if src['mimetype'] != 'application/x-mpegURL': + continue + fmts, subs = self._extract_m3u8_formats_and_subtitles(src['url'], video_id, ext='mp4') + formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) + self._sort_formats(formats) + return formats, subtitles + + +class HSEShowIE(HSEShowBaseInfoExtractor): + _VALID_URL = r'https?://(?:www\.)?hse\.de/dpl/c/tv-shows/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'https://www.hse.de/dpl/c/tv-shows/505350', + 'info_dict': { + 'id': '505350', + 'ext': 'mp4', + 'title': 'Pfeffinger Mode & Accessoires', + 'timestamp': 1638810000, + 'upload_date': '20211206', + 'channel': 'HSE24', + 'uploader': 'Arina Pirayesh' + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + json_data = self._extract_redux_data(url, video_id) + formats, subtitles = self._extract_formats_and_subtitles( + traverse_obj(json_data, ('tvShowPage', 'tvShowVideo', 'sources')), video_id) + + show = traverse_obj(json_data, ('tvShowPage', 'tvShow')) or {} + return { + 'id': video_id, + 'title': show.get('title') or video_id, + 'formats': formats, + 'timestamp': unified_timestamp(f'{show.get("date")} {show.get("hour")}:00'), + 'thumbnail': traverse_obj(json_data, ('tvShowVideo', 'poster')), + 'channel': self._search_regex( + r'tvShow \| ([A-Z0-9]+)_', show.get('actionFieldText') or '', video_id, fatal=False), + 'uploader': show.get('presenter'), + 'subtitles': subtitles, + } + + +class HSEProductIE(HSEShowBaseInfoExtractor): + _VALID_URL = r'https?://(?:www\.)?hse\.de/dpl/p/product/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'https://www.hse.de/dpl/p/product/408630', + 'info_dict': { + 'id': '408630', + 'ext': 'mp4', + 'title': 'Hose im Ponte-Mix', + 'uploader': 'Judith Williams' + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + json_data = self._extract_redux_data(url, video_id) + video = traverse_obj(json_data, ('productContent', 'productContent', 'videos', 0)) or {} + formats, subtitles = self._extract_formats_and_subtitles(video.get('sources'), video_id) + + return { + 'id': video_id, + 'title': traverse_obj(json_data, ('productDetail', 'product', 'name', 'short')) or video_id, + 
'formats': formats,
+        'subtitles': subtitles,
+        'thumbnail': video.get('poster'),
+        'uploader': traverse_obj(json_data, ('productDetail', 'product', 'brand', 'brandName')),
+    }
diff --git a/hypervideo_dl/extractor/huffpost.py b/hypervideo_dl/extractor/huffpost.py
index 97e36f0..54385ba 100644
--- a/hypervideo_dl/extractor/huffpost.py
+++ b/hypervideo_dl/extractor/huffpost.py
@@ -80,9 +80,6 @@ class HuffPostIE(InfoExtractor):
                 'vcodec': 'none' if key.startswith('audio/') else None,
             })
 
-        if not formats and data.get('fivemin_id'):
-            return self.url_result('5min:%s' % data['fivemin_id'])
-
         self._sort_formats(formats)
 
         return {
diff --git a/hypervideo_dl/extractor/huya.py b/hypervideo_dl/extractor/huya.py
new file mode 100644
index 0000000..4e96f22
--- /dev/null
+++ b/hypervideo_dl/extractor/huya.py
@@ -0,0 +1,137 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import hashlib
+import random
+
+from ..compat import compat_urlparse, compat_b64decode
+
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    js_to_json,
+    str_or_none,
+    try_get,
+    unescapeHTML,
+    update_url_query,
+)
+
+from .common import InfoExtractor
+
+
+class HuyaLiveIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.|m\.)?huya\.com/(?P<id>[^/#?&]+)(?:\D|$)'
+    IE_NAME = 'huya:live'
+    IE_DESC = 'huya.com'
+    _TESTS = [{
+        'url': 'https://www.huya.com/572329',
+        'info_dict': {
+            'id': '572329',
+            'title': str,
+            'description': str,
+            'is_live': True,
+            'view_count': int,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www.huya.com/xiaoyugame',
+        'only_matching': True
+    }]
+
+    _RESOLUTION = {
+        '蓝光4M': {
+            'width': 1920,
+            'height': 1080,
+        },
+        '超清': {
+            'width': 1280,
+            'height': 720,
+        },
+        '流畅': {
+            'width': 800,
+            'height': 480
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id=video_id)
+        json_stream = self._search_regex(r'"stream":\s+"([a-zA-Z0-9+=/]+)"', webpage, 'stream', default=None)
+        if not json_stream:
+            raise ExtractorError('Video is offline', expected=True)
+        stream_data = self._parse_json(compat_b64decode(json_stream).decode(), video_id=video_id,
+                                       transform_source=js_to_json)
+        room_info = try_get(stream_data, lambda x: x['data'][0]['gameLiveInfo'])
+        if not room_info:
+            raise ExtractorError('Can not extract the room info', expected=True)
+        title = room_info.get('roomName') or room_info.get('introduction') or self._html_extract_title(webpage)
+        screen_type = room_info.get('screenType')
+        live_source_type = room_info.get('liveSourceType')
+        stream_info_list = stream_data['data'][0]['gameStreamInfoList']
+        formats = []
+        for stream_info in stream_info_list:
+            stream_url = stream_info.get('sFlvUrl')
+            if not stream_url:
+                continue
+            stream_name = stream_info.get('sStreamName')
+            re_secret = not screen_type and live_source_type in (0, 8, 13)
+            params = dict(compat_urlparse.parse_qsl(unescapeHTML(stream_info['sFlvAntiCode'])))
+            fm, ss = '', ''
+            if re_secret:
+                fm, ss = self.encrypt(params, stream_info, stream_name)
+            for si in stream_data.get('vMultiStreamInfo'):
+                rate = si.get('iBitRate')
+                if rate:
+                    params['ratio'] = rate
+                else:
+                    params.pop('ratio', None)
+                if re_secret:
+                    params['wsSecret'] = hashlib.md5(
+                        '_'.join([fm, params['u'], stream_name, ss, params['wsTime']]).encode()).hexdigest()
+                formats.append({
+                    'ext': stream_info.get('sFlvUrlSuffix'),
+                    'format_id': str_or_none(stream_info.get('iLineIndex')),
+                    'tbr': rate,
+                    'url': update_url_query(f'{stream_url}/{stream_name}.{stream_info.get("sFlvUrlSuffix")}',
+                                            query=params),
+                    **self._RESOLUTION.get(si.get('sDisplayName'), {}),
+                })
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'view_count': room_info.get('totalCount'),
+            'thumbnail': room_info.get('screenshot'),
+            'description': room_info.get('contentIntro'),
+            'http_headers': {
+                'Origin': 'https://www.huya.com',
+                'Referer': 'https://www.huya.com/',
+            },
+        }
+
+    def encrypt(self, params, stream_info, stream_name):
+        ct = int_or_none(params.get('wsTime'), 16) + random.random()
+        presenter_uid = stream_info['lPresenterUid']
+        if not stream_name.startswith(str(presenter_uid)):
+            uid = presenter_uid
+        else:
+            uid = int_or_none(ct % 1e7 * 1e6 % 0xffffffff)
+        u1 = uid & 0xffffffff00000000
+        u2 = uid & 0xffffffff
+        u3 = uid & 0xffffff
+        u = u1 | u2 >> 24 | u3 << 8
+        params.update({
+            'u': str_or_none(u),
+            'seqid': str_or_none(int_or_none(ct * 1000) + uid),
+            'ver': '1',
+            'uuid': int_or_none(ct % 1e7 * 1e6 % 0xffffffff),
+            't': '100',
+        })
+        fm = compat_b64decode(params['fm']).decode().split('_', 1)[0]
+        ss = hashlib.md5('|'.join([params['seqid'], params['ctype'], params['t']]).encode()).hexdigest()
+        return fm, ss
diff --git a/hypervideo_dl/extractor/imdb.py b/hypervideo_dl/extractor/imdb.py
index a313019..96cee2e 100644
--- a/hypervideo_dl/extractor/imdb.py
+++ b/hypervideo_dl/extractor/imdb.py
@@ -7,9 +7,10 @@ import re
 from .common import InfoExtractor
 from ..utils import (
     determine_ext,
+    int_or_none,
     mimetype2ext,
-    parse_duration,
     qualities,
+    traverse_obj,
     try_get,
     url_or_none,
 )
@@ -28,6 +29,17 @@ class ImdbIE(InfoExtractor):
             'title': 'No. 2',
             'description': 'md5:87bd0bdc61e351f21f20d2d7441cb4e7',
             'duration': 152,
+            'thumbnail': r're:^https?://.+\.jpg',
+        }
+    }, {
+        'url': 'https://www.imdb.com/video/vi3516832537',
+        'info_dict': {
+            'id': '3516832537',
+            'ext': 'mp4',
+            'title': 'Paul: U.S. 
Trailer #1', + 'description': 'md5:17fcc4fe11ec29b4399be9d4c5ef126c', + 'duration': 153, + 'thumbnail': r're:^https?://.+\.jpg', } }, { 'url': 'http://www.imdb.com/video/_/vi2524815897', @@ -51,8 +63,13 @@ class ImdbIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - - data = self._download_json( + webpage = self._download_webpage(f'https://www.imdb.com/video/vi{video_id}', video_id) + info = self._search_nextjs_data(webpage, video_id) + video_info = traverse_obj(info, ('props', 'pageProps', 'videoPlaybackData', 'video'), default={}) + title = (traverse_obj(video_info, ('name', 'value'), ('primaryTitle', 'titleText', 'text')) + or self._html_search_meta(('og:title', 'twitter:title'), webpage, default=None) + or self._html_extract_title(webpage)) + data = video_info.get('playbackURLs') or try_get(self._download_json( 'https://www.imdb.com/ve/data/VIDEO_PLAYBACK_DATA', video_id, query={ 'key': base64.b64encode(json.dumps({ @@ -60,11 +77,10 @@ class ImdbIE(InfoExtractor): 'subType': 'FORCE_LEGACY', 'id': 'vi%s' % video_id, }).encode()).decode(), - })[0] - + }), lambda x: x[0]['videoLegacyEncodings']) quality = qualities(('SD', '480p', '720p', '1080p')) - formats = [] - for encoding in data['videoLegacyEncodings']: + formats, subtitles = [], {} + for encoding in data: if not encoding or not isinstance(encoding, dict): continue video_url = url_or_none(encoding.get('url')) @@ -73,11 +89,13 @@ class ImdbIE(InfoExtractor): ext = mimetype2ext(encoding.get( 'mimeType')) or determine_ext(video_url) if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( + fmts, subs = self._extract_m3u8_formats_and_subtitles( video_url, video_id, 'mp4', entry_protocol='m3u8_native', - preference=1, m3u8_id='hls', fatal=False)) + preference=1, m3u8_id='hls', fatal=False) + subtitles = self._merge_subtitles(subtitles, subs) + formats.extend(fmts) continue - format_id = encoding.get('definition') + format_id = traverse_obj(encoding, ('displayName', 'value'), 'definition') formats.append({ 'format_id': format_id, 'url': video_url, @@ -86,33 +104,15 @@ class ImdbIE(InfoExtractor): }) self._sort_formats(formats) - webpage = self._download_webpage( - 'https://www.imdb.com/video/vi' + video_id, video_id) - video_metadata = self._parse_json(self._search_regex( - r'args\.push\(\s*({.+?})\s*\)\s*;', webpage, - 'video metadata'), video_id) - - video_info = video_metadata.get('VIDEO_INFO') - if video_info and isinstance(video_info, dict): - info = try_get( - video_info, lambda x: x[list(video_info.keys())[0]][0], dict) - else: - info = {} - - title = self._html_search_meta( - ['og:title', 'twitter:title'], webpage) or self._html_search_regex( - r'<title>(.+?)</title>', webpage, 'title', - default=None) or info['videoTitle'] - return { 'id': video_id, 'title': title, 'alt_title': info.get('videoSubTitle'), 'formats': formats, - 'description': info.get('videoDescription'), - 'thumbnail': url_or_none(try_get( - video_metadata, lambda x: x['videoSlate']['source'])), - 'duration': parse_duration(info.get('videoRuntime')), + 'description': try_get(video_info, lambda x: x['description']['value']), + 'thumbnail': url_or_none(try_get(video_info, lambda x: x['thumbnail']['url'])), + 'duration': int_or_none(try_get(video_info, lambda x: x['runtime']['value'])), + 'subtitles': subtitles, } diff --git a/hypervideo_dl/extractor/imggaming.py b/hypervideo_dl/extractor/imggaming.py index ef20a4b..ce7b21a 100644 --- a/hypervideo_dl/extractor/imggaming.py +++ b/hypervideo_dl/extractor/imggaming.py @@ -21,25 
+21,26 @@ class ImgGamingBaseIE(InfoExtractor): _REALM = None _VALID_URL_TEMPL = r'https?://(?P<domain>%s)/(?P<type>live|playlist|video)/(?P<id>\d+)(?:\?.*?\bplaylistId=(?P<playlist_id>\d+))?' - def _real_initialize(self): + def _initialize_pre_login(self): self._HEADERS = { 'Realm': 'dce.' + self._REALM, 'x-api-key': self._API_KEY, } - email, password = self._get_login_info() - if email is None: - self.raise_login_required() - + def _perform_login(self, username, password): p_headers = self._HEADERS.copy() p_headers['Content-Type'] = 'application/json' self._HEADERS['Authorization'] = 'Bearer ' + self._download_json( self._API_BASE + 'login', None, 'Logging in', data=json.dumps({ - 'id': email, + 'id': username, 'secret': password, }).encode(), headers=p_headers)['authorisationToken'] + def _real_initialize(self): + if not self._HEADERS.get('Authorization'): + self.raise_login_required(method='password') + def _call_api(self, path, media_id): return self._download_json( self._API_BASE + path + media_id, media_id, headers=self._HEADERS) @@ -64,10 +65,7 @@ class ImgGamingBaseIE(InfoExtractor): domain, media_type, media_id, playlist_id = self._match_valid_url(url).groups() if playlist_id: - if self.get_param('noplaylist'): - self.to_screen('Downloading just video %s because of --no-playlist' % media_id) - else: - self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % playlist_id) + if self._yes_playlist(playlist_id, media_id): media_type, media_id = 'playlist', playlist_id if media_type == 'playlist': @@ -88,7 +86,7 @@ class ImgGamingBaseIE(InfoExtractor): video_data = self._download_json(dve_api_url, media_id) is_live = media_type == 'live' if is_live: - title = self._live_title(self._call_api('event/', media_id)['title']) + title = self._call_api('event/', media_id)['title'] else: title = video_data['name'] @@ -99,7 +97,7 @@ class ImgGamingBaseIE(InfoExtractor): continue if proto == 'hls': m3u8_formats = self._extract_m3u8_formats( - media_url, media_id, 'mp4', 'm3u8' if is_live else 'm3u8_native', + media_url, media_id, 'mp4', live=is_live, m3u8_id='hls', fatal=False, headers=self._MANIFEST_HEADERS) for f in m3u8_formats: f.setdefault('http_headers', {}).update(self._MANIFEST_HEADERS) diff --git a/hypervideo_dl/extractor/infoq.py b/hypervideo_dl/extractor/infoq.py index 0a70a1f..347cc51 100644 --- a/hypervideo_dl/extractor/infoq.py +++ b/hypervideo_dl/extractor/infoq.py @@ -115,7 +115,7 @@ class InfoQIE(BokeCCBaseIE): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - video_title = self._html_search_regex(r'<title>(.*?)</title>', webpage, 'title') + video_title = self._html_extract_title(webpage) video_description = self._html_search_meta('description', webpage, 'description') if '/cn/' in url: diff --git a/hypervideo_dl/extractor/instagram.py b/hypervideo_dl/extractor/instagram.py index 3801c7a..970f2c8 100644 --- a/hypervideo_dl/extractor/instagram.py +++ b/hypervideo_dl/extractor/instagram.py @@ -1,32 +1,202 @@ -from __future__ import unicode_literals +# coding: utf-8 import itertools import hashlib import json import re +import time from .common import InfoExtractor from ..compat import ( - compat_str, compat_HTTPError, ) from ..utils import ( ExtractorError, + format_field, float_or_none, get_element_by_attribute, int_or_none, lowercase_escape, - std_headers, - try_get, + str_or_none, + str_to_int, + traverse_obj, url_or_none, - variadic, + urlencode_postdata, ) -class InstagramIE(InfoExtractor): - _VALID_URL = 
r'(?P<url>https?://(?:www\.)?instagram\.com/(?:p|tv|reel)/(?P<id>[^/?#&]+))' +class InstagramBaseIE(InfoExtractor): + _NETRC_MACHINE = 'instagram' + _IS_LOGGED_IN = False + + def _perform_login(self, username, password): + if self._IS_LOGGED_IN: + return + + login_webpage = self._download_webpage( + 'https://www.instagram.com/accounts/login/', None, + note='Downloading login webpage', errnote='Failed to download login webpage') + + shared_data = self._parse_json( + self._search_regex( + r'window\._sharedData\s*=\s*({.+?});', + login_webpage, 'shared data', default='{}'), + None) + + login = self._download_json('https://www.instagram.com/accounts/login/ajax/', None, note='Logging in', headers={ + 'Accept': '*/*', + 'X-IG-App-ID': '936619743392459', + 'X-ASBD-ID': '198387', + 'X-IG-WWW-Claim': '0', + 'X-Requested-With': 'XMLHttpRequest', + 'X-CSRFToken': shared_data['config']['csrf_token'], + 'X-Instagram-AJAX': shared_data['rollout_hash'], + 'Referer': 'https://www.instagram.com/', + }, data=urlencode_postdata({ + 'enc_password': f'#PWD_INSTAGRAM_BROWSER:0:{int(time.time())}:{password}', + 'username': username, + 'queryParams': '{}', + 'optIntoOneTap': 'false', + 'stopDeletionNonce': '', + 'trustedDeviceRecords': '{}', + })) + + if not login.get('authenticated'): + if login.get('message'): + raise ExtractorError(f'Unable to login: {login["message"]}') + elif login.get('user'): + raise ExtractorError('Unable to login: Sorry, your password was incorrect. Please double-check your password.', expected=True) + elif login.get('user') is False: + raise ExtractorError('Unable to login: The username you entered doesn\'t belong to an account. Please check your username and try again.', expected=True) + raise ExtractorError('Unable to login') + InstagramBaseIE._IS_LOGGED_IN = True + + def _get_count(self, media, kind, *keys): + return traverse_obj( + media, (kind, 'count'), *((f'edge_media_{key}', 'count') for key in keys), + expected_type=int_or_none) + + def _get_dimension(self, name, media, webpage=None): + return ( + traverse_obj(media, ('dimensions', name), expected_type=int_or_none) + or int_or_none(self._html_search_meta( + (f'og:video:{name}', f'video:{name}'), webpage or '', default=None))) + + def _extract_nodes(self, nodes, is_direct=False): + for idx, node in enumerate(nodes, start=1): + if node.get('__typename') != 'GraphVideo' and node.get('is_video') is not True: + continue + + video_id = node.get('shortcode') + + if is_direct: + info = { + 'id': video_id or node['id'], + 'url': node.get('video_url'), + 'width': self._get_dimension('width', node), + 'height': self._get_dimension('height', node), + 'http_headers': { + 'Referer': 'https://www.instagram.com/', + } + } + elif not video_id: + continue + else: + info = { + '_type': 'url', + 'ie_key': 'Instagram', + 'id': video_id, + 'url': f'https://instagram.com/p/{video_id}', + } + + yield { + **info, + 'title': node.get('title') or (f'Video {idx}' if is_direct else None), + 'description': traverse_obj( + node, ('edge_media_to_caption', 'edges', 0, 'node', 'text'), expected_type=str), + 'thumbnail': traverse_obj( + node, 'display_url', 'thumbnail_src', 'display_src', expected_type=url_or_none), + 'duration': float_or_none(node.get('video_duration')), + 'timestamp': int_or_none(node.get('taken_at_timestamp')), + 'view_count': int_or_none(node.get('video_view_count')), + 'comment_count': self._get_count(node, 'comments', 'preview_comment', 'to_comment', 'to_parent_comment'), + 'like_count': self._get_count(node, 'likes', 'preview_like'), + } 
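
For reference, the `enc_password` value that `_perform_login` above posts is the browser's plaintext envelope; a minimal sketch (version `0` marks the password as unencrypted — the numbered versions carry an encrypted payload instead, which is not shown here):

    import time

    def instagram_enc_password(password):
        # '#PWD_INSTAGRAM_BROWSER:<version>:<unix timestamp>:<password>'
        return f'#PWD_INSTAGRAM_BROWSER:0:{int(time.time())}:{password}'
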
+ + def _extract_product_media(self, product_media): + media_id = product_media.get('code') or product_media.get('id') + vcodec = product_media.get('video_codec') + dash_manifest_raw = product_media.get('video_dash_manifest') + videos_list = product_media.get('video_versions') + if not (dash_manifest_raw or videos_list): + return {} + + formats = [{ + 'format_id': format.get('id'), + 'url': format.get('url'), + 'width': format.get('width'), + 'height': format.get('height'), + 'vcodec': vcodec, + } for format in videos_list or []] + if dash_manifest_raw: + formats.extend(self._parse_mpd_formats(self._parse_xml(dash_manifest_raw, media_id), mpd_id='dash')) + self._sort_formats(formats) + + thumbnails = [{ + 'url': thumbnail.get('url'), + 'width': thumbnail.get('width'), + 'height': thumbnail.get('height') + } for thumbnail in traverse_obj(product_media, ('image_versions2', 'candidates')) or []] + return { + 'id': media_id, + 'duration': float_or_none(product_media.get('video_duration')), + 'formats': formats, + 'thumbnails': thumbnails + } + + def _extract_product(self, product_info): + if isinstance(product_info, list): + product_info = product_info[0] + + user_info = product_info.get('user') or {} + info_dict = { + 'id': product_info.get('code') or product_info.get('id'), + 'title': product_info.get('title') or f'Video by {user_info.get("username")}', + 'description': traverse_obj(product_info, ('caption', 'text'), expected_type=str_or_none), + 'timestamp': int_or_none(product_info.get('taken_at')), + 'channel': user_info.get('username'), + 'uploader': user_info.get('full_name'), + 'uploader_id': str_or_none(user_info.get('pk')), + 'view_count': int_or_none(product_info.get('view_count')), + 'like_count': int_or_none(product_info.get('like_count')), + 'comment_count': int_or_none(product_info.get('comment_count')), + 'http_headers': { + 'Referer': 'https://www.instagram.com/', + } + } + carousel_media = product_info.get('carousel_media') + if carousel_media: + return { + '_type': 'playlist', + **info_dict, + 'title': f'Post by {user_info.get("username")}', + 'entries': [{ + **info_dict, + **self._extract_product_media(product_media), + } for product_media in carousel_media], + } + + return { + **info_dict, + **self._extract_product_media(product_info) + } + + +class InstagramIOSIE(InfoExtractor): + IE_DESC = 'IOS instagram:// URL' + _VALID_URL = r'instagram://media\?id=(?P<id>[\d_]+)' _TESTS = [{ - 'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc', + 'url': 'instagram://media?id=482584233761418119', 'md5': '0d2da106a9d2631273e192b372806516', 'info_dict': { 'id': 'aye83DjauH', @@ -43,6 +213,49 @@ class InstagramIE(InfoExtractor): 'comment_count': int, 'comments': list, }, + 'add_ie': ['Instagram'] + }] + + def _get_id(self, id): + """Source: https://stackoverflow.com/questions/24437823/getting-instagram-post-url-from-media-id""" + chrs = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_' + media_id = int(id.split('_')[0]) + shortened_id = '' + while media_id > 0: + r = media_id % 64 + media_id = (media_id - r) // 64 + shortened_id = chrs[r] + shortened_id + return shortened_id + + def _real_extract(self, url): + return { + '_type': 'url_transparent', + 'url': f'http://instagram.com/tv/{self._get_id(self._match_id(url))}/', + 'ie_key': 'Instagram', + } + + +class InstagramIE(InstagramBaseIE): + _VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com(?:/[^/]+)?/(?:p|tv|reel)/(?P<id>[^/?#&]+))' + _TESTS = [{ + 'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc', + 
'md5': '0d2da106a9d2631273e192b372806516', + 'info_dict': { + 'id': 'aye83DjauH', + 'ext': 'mp4', + 'title': 'Video by naomipq', + 'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 0, + 'timestamp': 1371748545, + 'upload_date': '20130620', + 'uploader_id': '2815873', + 'uploader': 'B E A U T Y F O R A S H E S', + 'channel': 'naomipq', + 'like_count': int, + 'comment_count': int, + 'comments': list, + }, }, { # missing description 'url': 'https://www.instagram.com/p/BA-pQFBG8HZ/?taken-by=britneyspears', @@ -54,8 +267,9 @@ class InstagramIE(InfoExtractor): 'duration': 0, 'timestamp': 1453760977, 'upload_date': '20160125', - 'uploader_id': 'britneyspears', + 'uploader_id': '12246775', 'uploader': 'Britney Spears', + 'channel': 'britneyspears', 'like_count': int, 'comment_count': int, 'comments': list, @@ -101,8 +315,9 @@ class InstagramIE(InfoExtractor): 'duration': 53.83, 'timestamp': 1530032919, 'upload_date': '20180626', - 'uploader_id': 'instagram', + 'uploader_id': '25025320', 'uploader': 'Instagram', + 'channel': 'instagram', 'like_count': int, 'comment_count': int, 'comments': list, @@ -120,6 +335,9 @@ class InstagramIE(InfoExtractor): }, { 'url': 'https://www.instagram.com/reel/CDUMkliABpa/', 'only_matching': True, + }, { + 'url': 'https://www.instagram.com/marvelskies.fc/reel/CWqAgUZgCku/', + 'only_matching': True, }] @staticmethod @@ -141,154 +359,114 @@ class InstagramIE(InfoExtractor): return mobj.group('link') def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') - url = mobj.group('url') - + video_id, url = self._match_valid_url(url).group('id', 'url') webpage, urlh = self._download_webpage_handle(url, video_id) - if 'www.instagram.com/accounts/login' in urlh.geturl().rstrip('/'): - self.raise_login_required('You need to log in to access this content', method='cookies') - - (media, video_url, description, thumbnail, timestamp, uploader, - uploader_id, like_count, comment_count, comments, height, - width) = [None] * 12 + if 'www.instagram.com/accounts/login' in urlh.geturl(): + self.report_warning('Main webpage is locked behind the login page. 
' + 'Retrying with embed webpage (Note that some metadata might be missing)') + webpage = self._download_webpage( + 'https://www.instagram.com/p/%s/embed/' % video_id, video_id, note='Downloading embed webpage') shared_data = self._parse_json( self._search_regex( r'window\._sharedData\s*=\s*({.+?});', webpage, 'shared data', default='{}'), video_id, fatal=False) - if shared_data: - media = try_get( - shared_data, - (lambda x: x['entry_data']['PostPage'][0]['graphql']['shortcode_media'], - lambda x: x['entry_data']['PostPage'][0]['media']), - dict) + media = traverse_obj( + shared_data, + ('entry_data', 'PostPage', 0, 'graphql', 'shortcode_media'), + ('entry_data', 'PostPage', 0, 'media'), + expected_type=dict) + # _sharedData.entry_data.PostPage is empty when authenticated (see # https://github.com/ytdl-org/youtube-dl/pull/22880) if not media: additional_data = self._parse_json( self._search_regex( - r'window\.__additionalDataLoaded\s*\(\s*[^,]+,\s*({.+?})\s*\)\s*;', + r'window\.__additionalDataLoaded\s*\(\s*[^,]+,\s*({.+?})\s*\);', webpage, 'additional data', default='{}'), video_id, fatal=False) - if additional_data: - media = try_get( - additional_data, lambda x: x['graphql']['shortcode_media'], - dict) - if media: - video_url = media.get('video_url') - height = int_or_none(media.get('dimensions', {}).get('height')) - width = int_or_none(media.get('dimensions', {}).get('width')) - description = try_get( - media, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'], - compat_str) or media.get('caption') - title = media.get('title') - thumbnail = media.get('display_src') or media.get('display_url') - duration = float_or_none(media.get('video_duration')) - timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date')) - uploader = media.get('owner', {}).get('full_name') - uploader_id = media.get('owner', {}).get('username') - - def get_count(keys, kind): - for key in variadic(keys): - count = int_or_none(try_get( - media, (lambda x: x['edge_media_%s' % key]['count'], - lambda x: x['%ss' % kind]['count']))) - if count is not None: - return count - - like_count = get_count('preview_like', 'like') - comment_count = get_count( - ('preview_comment', 'to_comment', 'to_parent_comment'), 'comment') - - comments = [] - for comment in try_get(media, lambda x: x['edge_media_to_parent_comment']['edges']): - comment_dict = comment.get('node', {}) - comment_text = comment_dict.get('text') - if comment_text: - comments.append({ - 'author': try_get(comment_dict, lambda x: x['owner']['username']), - 'author_id': try_get(comment_dict, lambda x: x['owner']['id']), - 'id': comment_dict.get('id'), - 'text': comment_text, - 'timestamp': int_or_none(comment_dict.get('created_at')), - }) - if not video_url: - edges = try_get( - media, lambda x: x['edge_sidecar_to_children']['edges'], - list) or [] - if edges: - entries = [] - for edge_num, edge in enumerate(edges, start=1): - node = try_get(edge, lambda x: x['node'], dict) - if not node: - continue - node_video_url = url_or_none(node.get('video_url')) - if not node_video_url: - continue - entries.append({ - 'id': node.get('shortcode') or node['id'], - 'title': node.get('title') or 'Video %d' % edge_num, - 'url': node_video_url, - 'thumbnail': node.get('display_url'), - 'duration': float_or_none(node.get('video_duration')), - 'width': int_or_none(try_get(node, lambda x: x['dimensions']['width'])), - 'height': int_or_none(try_get(node, lambda x: x['dimensions']['height'])), - 'view_count': int_or_none(node.get('video_view_count')), - }) - 
return self.playlist_result( - entries, video_id, - 'Post by %s' % uploader_id if uploader_id else None, - description) + product_item = traverse_obj(additional_data, ('items', 0), expected_type=dict) + if product_item: + return self._extract_product(product_item) + media = traverse_obj(additional_data, ('graphql', 'shortcode_media'), 'shortcode_media', expected_type=dict) or {} - if not video_url: - video_url = self._og_search_video_url(webpage, secure=False) + if not media and 'www.instagram.com/accounts/login' in urlh.geturl(): + self.raise_login_required('You need to log in to access this content') - formats = [{ - 'url': video_url, - 'width': width, - 'height': height, - }] - - if not uploader_id: - uploader_id = self._search_regex( - r'"owner"\s*:\s*{\s*"username"\s*:\s*"(.+?)"', - webpage, 'uploader id', fatal=False) + username = traverse_obj(media, ('owner', 'username')) or self._search_regex( + r'"owner"\s*:\s*{\s*"username"\s*:\s*"(.+?)"', webpage, 'username', fatal=False) + description = ( + traverse_obj(media, ('edge_media_to_caption', 'edges', 0, 'node', 'text'), expected_type=str) + or media.get('caption')) if not description: description = self._search_regex( r'"caption"\s*:\s*"(.+?)"', webpage, 'description', default=None) if description is not None: description = lowercase_escape(description) - if not thumbnail: - thumbnail = self._og_search_thumbnail(webpage) + video_url = media.get('video_url') + if not video_url: + nodes = traverse_obj(media, ('edge_sidecar_to_children', 'edges', ..., 'node'), expected_type=dict) or [] + if nodes: + return self.playlist_result( + self._extract_nodes(nodes, True), video_id, + format_field(username, template='Post by %s'), description) + + video_url = self._og_search_video_url(webpage, secure=False) + + formats = [{ + 'url': video_url, + 'width': self._get_dimension('width', media, webpage), + 'height': self._get_dimension('height', media, webpage), + }] + dash = traverse_obj(media, ('dash_info', 'video_dash_manifest')) + if dash: + formats.extend(self._parse_mpd_formats(self._parse_xml(dash, video_id), mpd_id='dash')) + self._sort_formats(formats) + + comment_data = traverse_obj(media, ('edge_media_to_parent_comment', 'edges')) + comments = [{ + 'author': traverse_obj(comment_dict, ('node', 'owner', 'username')), + 'author_id': traverse_obj(comment_dict, ('node', 'owner', 'id')), + 'id': traverse_obj(comment_dict, ('node', 'id')), + 'text': traverse_obj(comment_dict, ('node', 'text')), + 'timestamp': traverse_obj(comment_dict, ('node', 'created_at'), expected_type=int_or_none), + } for comment_dict in comment_data] if comment_data else None + + display_resources = ( + media.get('display_resources') + or [{'src': media.get(key)} for key in ('display_src', 'display_url')] + or [{'src': self._og_search_thumbnail(webpage)}]) + thumbnails = [{ + 'url': thumbnail['src'], + 'width': thumbnail.get('config_width'), + 'height': thumbnail.get('config_height'), + } for thumbnail in display_resources if thumbnail.get('src')] return { 'id': video_id, 'formats': formats, - 'ext': 'mp4', - 'title': title or 'Video by %s' % uploader_id, + 'title': media.get('title') or 'Video by %s' % username, 'description': description, - 'duration': duration, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'uploader_id': uploader_id, - 'uploader': uploader, - 'like_count': like_count, - 'comment_count': comment_count, + 'duration': float_or_none(media.get('video_duration')), + 'timestamp': traverse_obj(media, 'taken_at_timestamp', 'date', 
expected_type=int_or_none), + 'uploader_id': traverse_obj(media, ('owner', 'id')), + 'uploader': traverse_obj(media, ('owner', 'full_name')), + 'channel': username, + 'like_count': self._get_count(media, 'likes', 'preview_like') or str_to_int(self._search_regex( + r'data-log-event="likeCountClick"[^>]*>[^\d]*([\d,\.]+)', webpage, 'like count', fatal=False)), + 'comment_count': self._get_count(media, 'comments', 'preview_comment', 'to_comment', 'to_parent_comment'), 'comments': comments, + 'thumbnails': thumbnails, 'http_headers': { 'Referer': 'https://www.instagram.com/', } } -class InstagramPlaylistIE(InfoExtractor): - # A superclass for handling any kind of query based on GraphQL which - # results in a playlist. - +class InstagramPlaylistBaseIE(InstagramBaseIE): _gis_tmpl = None # used to cache GIS request type def _parse_graphql(self, webpage, item_id): @@ -300,10 +478,6 @@ class InstagramPlaylistIE(InfoExtractor): def _extract_graphql(self, data, url): # Parses GraphQL queries containing videos and generates a playlist. - def get_count(suffix): - return int_or_none(try_get( - node, lambda x: x['edge_media_' + suffix]['count'])) - uploader_id = self._match_id(url) csrf_token = data['config']['csrf_token'] rhx_gis = data.get('rhx_gis') or '3c7ca9dcefcf966d11dacf1f151335e8' @@ -324,7 +498,7 @@ class InstagramPlaylistIE(InfoExtractor): '%s' % rhx_gis, '', '%s:%s' % (rhx_gis, csrf_token), - '%s:%s:%s' % (rhx_gis, csrf_token, std_headers['User-Agent']), + '%s:%s:%s' % (rhx_gis, csrf_token, self.get_param('http_headers')['User-Agent']), ] # try all of the ways to generate a GIS query, and not only use the @@ -352,55 +526,14 @@ class InstagramPlaylistIE(InfoExtractor): continue raise - edges = media.get('edges') - if not edges or not isinstance(edges, list): + nodes = traverse_obj(media, ('edges', ..., 'node'), expected_type=dict) or [] + if not nodes: break + yield from self._extract_nodes(nodes) - for edge in edges: - node = edge.get('node') - if not node or not isinstance(node, dict): - continue - if node.get('__typename') != 'GraphVideo' and node.get('is_video') is not True: - continue - video_id = node.get('shortcode') - if not video_id: - continue - - info = self.url_result( - 'https://instagram.com/p/%s/' % video_id, - ie=InstagramIE.ie_key(), video_id=video_id) - - description = try_get( - node, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'], - compat_str) - thumbnail = node.get('thumbnail_src') or node.get('display_src') - timestamp = int_or_none(node.get('taken_at_timestamp')) - - comment_count = get_count('to_comment') - like_count = get_count('preview_like') - view_count = int_or_none(node.get('video_view_count')) - - info.update({ - 'description': description, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'comment_count': comment_count, - 'like_count': like_count, - 'view_count': view_count, - }) - - yield info - - page_info = media.get('page_info') - if not page_info or not isinstance(page_info, dict): - break - - has_next_page = page_info.get('has_next_page') - if not has_next_page: - break - - cursor = page_info.get('end_cursor') - if not cursor or not isinstance(cursor, compat_str): + has_next_page = traverse_obj(media, ('page_info', 'has_next_page')) + cursor = traverse_obj(media, ('page_info', 'end_cursor'), expected_type=str) + if not has_next_page or not cursor: break def _real_extract(self, url): @@ -414,11 +547,11 @@ class InstagramPlaylistIE(InfoExtractor): self._extract_graphql(data, url), user_or_tag, user_or_tag) -class 
InstagramUserIE(InstagramPlaylistIE): +class InstagramUserIE(InstagramPlaylistBaseIE): _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P<id>[^/]{2,})/?(?:$|[?#])' IE_DESC = 'Instagram user profile' IE_NAME = 'instagram:user' - _TEST = { + _TESTS = [{ 'url': 'https://instagram.com/porsche', 'info_dict': { 'id': 'porsche', @@ -430,7 +563,7 @@ class InstagramUserIE(InstagramPlaylistIE): 'skip_download': True, 'playlistend': 5, } - } + }] _QUERY_HASH = '42323d64886122307be10013ad2dcc44', @@ -448,11 +581,11 @@ class InstagramUserIE(InstagramPlaylistIE): } -class InstagramTagIE(InstagramPlaylistIE): +class InstagramTagIE(InstagramPlaylistBaseIE): _VALID_URL = r'https?://(?:www\.)?instagram\.com/explore/tags/(?P<id>[^/]+)' - IE_DESC = 'Instagram hashtag search' + IE_DESC = 'Instagram hashtag search URLs' IE_NAME = 'instagram:tag' - _TEST = { + _TESTS = [{ 'url': 'https://instagram.com/explore/tags/lolcats', 'info_dict': { 'id': 'lolcats', @@ -464,7 +597,7 @@ class InstagramTagIE(InstagramPlaylistIE): 'skip_download': True, 'playlistend': 50, } - } + }] _QUERY_HASH = 'f92f56d47dc7a55b606908374b43a314', @@ -481,3 +614,58 @@ class InstagramTagIE(InstagramPlaylistIE): 'tag_name': data['entry_data']['TagPage'][0]['graphql']['hashtag']['name'] } + + +class InstagramStoryIE(InstagramBaseIE): + _VALID_URL = r'https?://(?:www\.)?instagram\.com/stories/(?P<user>[^/]+)/(?P<id>\d+)' + IE_NAME = 'instagram:story' + + _TESTS = [{ + 'url': 'https://www.instagram.com/stories/highlights/18090946048123978/', + 'info_dict': { + 'id': '18090946048123978', + 'title': 'Rare', + }, + 'playlist_mincount': 50 + }] + + def _real_extract(self, url): + username, story_id = self._match_valid_url(url).groups() + + story_info_url = f'{username}/{story_id}/?__a=1' if username == 'highlights' else f'{username}/?__a=1' + story_info = self._download_json(f'https://www.instagram.com/stories/{story_info_url}', story_id, headers={ + 'X-IG-App-ID': 936619743392459, + 'X-ASBD-ID': 198387, + 'X-IG-WWW-Claim': 0, + 'X-Requested-With': 'XMLHttpRequest', + 'Referer': url, + }) + user_id = story_info['user']['id'] + highlight_title = traverse_obj(story_info, ('highlight', 'title')) + + story_info_url = user_id if username != 'highlights' else f'highlight:{story_id}' + videos = self._download_json(f'https://i.instagram.com/api/v1/feed/reels_media/?reel_ids={story_info_url}', story_id, headers={ + 'X-IG-App-ID': 936619743392459, + 'X-ASBD-ID': 198387, + 'X-IG-WWW-Claim': 0, + })['reels'] + + full_name = traverse_obj(videos, ('user', 'full_name')) + + user_info = {} + if not (username and username != 'highlights' and full_name): + user_info = self._download_json( + f'https://i.instagram.com/api/v1/users/{user_id}/info/', story_id, headers={ + 'User-Agent': 'Mozilla/5.0 (Linux; Android 11; SM-A505F Build/RP1A.200720.012; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/96.0.4664.45 Mobile Safari/537.36 Instagram 214.1.0.29.120 Android (30/11; 450dpi; 1080x2122; samsung; SM-A505F; a50; exynos9610; en_US; 333717274)', + }, note='Downloading user info') + + username = traverse_obj(user_info, ('user', 'username')) or username + full_name = traverse_obj(user_info, ('user', 'full_name')) or full_name + + highlights = traverse_obj(videos, (f'highlight:{story_id}', 'items'), (str(user_id), 'items')) + return self.playlist_result([{ + **self._extract_product(highlight), + 'title': f'Story by {username}', + 'uploader': full_name, + 'uploader_id': user_id, + } for highlight in highlights], playlist_id=story_id, 
playlist_title=highlight_title)
diff --git a/hypervideo_dl/extractor/internazionale.py b/hypervideo_dl/extractor/internazionale.py
index 676e8e2..45e2af6 100644
--- a/hypervideo_dl/extractor/internazionale.py
+++ b/hypervideo_dl/extractor/internazionale.py
@@ -20,9 +20,6 @@ class InternazionaleIE(InfoExtractor):
         'upload_date': '20150219',
         'thumbnail': r're:^https?://.*\.jpg$',
     },
-    'params': {
-        'format': 'bestvideo',
-    },
 }, {
     'url': 'https://www.internazionale.it/video/2018/08/29/telefono-stare-con-noi-stessi',
     'md5': '9db8663704cab73eb972d1cee0082c79',
@@ -36,9 +33,6 @@
         'upload_date': '20180829',
         'thumbnail': r're:^https?://.*\.jpg$',
     },
-    'params': {
-        'format': 'bestvideo',
-    },
 }]
 
 def _real_extract(self, url):
diff --git a/hypervideo_dl/extractor/iprima.py b/hypervideo_dl/extractor/iprima.py
index 28e6609..1a20384 100644
--- a/hypervideo_dl/extractor/iprima.py
+++ b/hypervideo_dl/extractor/iprima.py
@@ -8,12 +8,19 @@ from .common import InfoExtractor
 from ..utils import (
     determine_ext,
     js_to_json,
+    urlencode_postdata,
+    ExtractorError,
+    parse_qs
 )
 
 
 class IPrimaIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:[^/]+)\.iprima\.cz/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+    _VALID_URL = r'https?://(?!cnn)(?:[^/]+)\.iprima\.cz/(?:[^/]+/)*(?P<id>[^/?#&]+)'
     _GEO_BYPASS = False
+    _NETRC_MACHINE = 'iprima'
+    _LOGIN_URL = 'https://auth.iprima.cz/oauth2/login'
+    _TOKEN_URL = 'https://auth.iprima.cz/oauth2/token'
+    access_token = None
 
     _TESTS = [{
         'url': 'https://prima.iprima.cz/particka/92-epizoda',
@@ -22,16 +29,8 @@
             'ext': 'mp4',
             'title': 'Partička (92)',
             'description': 'md5:859d53beae4609e6dd7796413f1b6cac',
-        },
-        'params': {
-            'skip_download': True,  # m3u8 download
-        },
-    }, {
-        'url': 'https://cnn.iprima.cz/videa/70-epizoda',
-        'info_dict': {
-            'id': 'p681554',
-            'ext': 'mp4',
-            'title': 'HLAVNÍ ZPRÁVY 3.5.2020',
+            'upload_date': '20201103',
+            'timestamp': 1604437480,
         },
         'params': {
             'skip_download': True,  # m3u8 download
@@ -44,11 +43,9 @@
         'url': 'http://play.iprima.cz/closer-nove-pripady/closer-nove-pripady-iv-1',
         'only_matching': True,
     }, {
-        # iframe api.play-backend.iprima.cz
         'url': 'https://prima.iprima.cz/my-little-pony/mapa-znameni-2-2',
         'only_matching': True,
     }, {
-        # iframe prima.iprima.cz
         'url': 'https://prima.iprima.cz/porady/jak-se-stavi-sen/rodina-rathousova-praha',
         'only_matching': True,
     }, {
@@ -66,9 +63,125 @@
     }, {
         'url': 'https://love.iprima.cz/laska-az-za-hrob/slib-dany-bratrovi',
         'only_matching': True,
-    }, {
-        'url': 'https://autosalon.iprima.cz/motorsport/7-epizoda-1',
-        'only_matching': True,
+    }]
+
+    def _perform_login(self, username, password):
+        if self.access_token:
+            return
+
+        login_page = self._download_webpage(
+            self._LOGIN_URL, None, note='Downloading login page',
+            errnote='Downloading login page failed')
+
+        login_form = self._hidden_inputs(login_page)
+
+        login_form.update({
+            '_email': username,
+            '_password': password})
+
+        _, login_handle = self._download_webpage_handle(
+            self._LOGIN_URL, None, data=urlencode_postdata(login_form),
+            note='Logging in')
+
+        code = parse_qs(login_handle.geturl()).get('code', [None])[0]
+        if not code:
+            raise ExtractorError('Login failed', expected=True)
+
+        token_request_data = {
+            'scope': 'openid+email+profile+phone+address+offline_access',
+            'client_id': 'prima_sso',
+            'grant_type': 'authorization_code',
+            'code': code,
+            'redirect_uri': 'https://auth.iprima.cz/sso/auth-check'}
+
+        token_data = 
self._download_json( + self._TOKEN_URL, None, + note='Downloading token', errnote='Downloading token failed', + data=urlencode_postdata(token_request_data)) + + self.access_token = token_data.get('access_token') + if self.access_token is None: + raise ExtractorError('Getting token failed', expected=True) + + def _real_initialize(self): + if not self.access_token: + self.raise_login_required('Login is required to access any iPrima content', method='password') + + def _raise_access_error(self, error_code): + if error_code == 'PLAY_GEOIP_DENIED': + self.raise_geo_restricted(countries=['CZ'], metadata_available=True) + elif error_code is not None: + self.raise_no_formats('Access to stream infos forbidden', expected=True) + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = self._html_search_meta( + ['og:title', 'twitter:title'], + webpage, 'title', default=None) + + video_id = self._search_regex(( + r'productId\s*=\s*([\'"])(?P<id>p\d+)\1', + r'pproduct_id\s*=\s*([\'"])(?P<id>p\d+)\1'), + webpage, 'real id', group='id') + + metadata = self._download_json( + f'https://api.play-backend.iprima.cz/api/v1//products/id-{video_id}/play', + video_id, note='Getting manifest URLs', errnote='Failed to get manifest URLs', + headers={'X-OTT-Access-Token': self.access_token}, + expected_status=403) + + self._raise_access_error(metadata.get('errorCode')) + + stream_infos = metadata.get('streamInfos') + formats = [] + if stream_infos is None: + self.raise_no_formats('Reading stream infos failed', expected=True) + else: + for manifest in stream_infos: + manifest_type = manifest.get('type') + manifest_url = manifest.get('url') + ext = determine_ext(manifest_url) + if manifest_type == 'HLS' or ext == 'm3u8': + formats += self._extract_m3u8_formats( + manifest_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False) + elif manifest_type == 'DASH' or ext == 'mpd': + formats += self._extract_mpd_formats( + manifest_url, video_id, mpd_id='dash', fatal=False) + self._sort_formats(formats) + + final_result = self._search_json_ld(webpage, video_id) or {} + final_result.update({ + 'id': video_id, + 'title': title, + 'thumbnail': self._html_search_meta( + ['thumbnail', 'og:image', 'twitter:image'], + webpage, 'thumbnail', default=None), + 'formats': formats, + 'description': self._html_search_meta( + ['description', 'og:description', 'twitter:description'], + webpage, 'description', default=None)}) + + return final_result + + +class IPrimaCNNIE(InfoExtractor): + _VALID_URL = r'https?://cnn\.iprima\.cz/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _GEO_BYPASS = False + + _TESTS = [{ + 'url': 'https://cnn.iprima.cz/porady/strunc/24072020-koronaviru-mam-plne-zuby-strasit-druhou-vlnou-je-absurdni-rika-senatorka-dernerova', + 'info_dict': { + 'id': 'p716177', + 'ext': 'mp4', + 'title': 'md5:277c6b1ed0577e51b40ddd35602ff43e', + }, + 'params': { + 'skip_download': 'm3u8' + } }] def _real_extract(self, url): diff --git a/hypervideo_dl/extractor/iqiyi.py b/hypervideo_dl/extractor/iqiyi.py index b13b9f4..d07b39d 100644 --- a/hypervideo_dl/extractor/iqiyi.py +++ b/hypervideo_dl/extractor/iqiyi.py @@ -9,14 +9,28 @@ import time from .common import InfoExtractor from ..compat import ( compat_str, + compat_urllib_parse_unquote ) +from .openload import PhantomJSwrapper from ..utils import ( clean_html, + ExtractorError, + float_or_none, + format_field, get_element_by_id, get_element_by_attribute, - ExtractorError, + int_or_none, + js_to_json, 
     ohdave_rsa_encrypt,
+    parse_age_limit,
+    parse_duration,
+    parse_iso8601,
+    parse_resolution,
+    qualities,
     remove_start,
+    str_or_none,
+    traverse_obj,
+    urljoin,
 )
 
 
@@ -96,9 +110,6 @@ class IqiyiIE(InfoExtractor):
         '18': 7,  # 1080p
     }
 
-    def _real_initialize(self):
-        self._login()
-
     @staticmethod
     def _rsa_fun(data):
         # public key extracted from http://static.iqiyi.com/js/qiyiV2/20160129180840/jobs/i18n/i18nIndex.js
@@ -107,7 +118,7 @@ class IqiyiIE(InfoExtractor):
 
         return ohdave_rsa_encrypt(data, e, N)
 
-    def _login(self):
+    def _perform_login(self):
         raise ExtractorError("iQiyi's non-free authentication algorithm has made login impossible", expected=True)
 
     def get_raw_data(self, tvid, video_id):
@@ -217,3 +228,359 @@ class IqiyiIE(InfoExtractor):
             'title': title,
             'formats': formats,
         }
+
+
+class IqIE(InfoExtractor):
+    IE_NAME = 'iq.com'
+    IE_DESC = 'International version of iQiyi'
+    _VALID_URL = r'https?://(?:www\.)?iq\.com/play/(?:[\w%-]*-)?(?P<id>\w+)'
+    _TESTS = [{
+        'url': 'https://www.iq.com/play/one-piece-episode-1000-1ma1i6ferf4',
+        'md5': '2d7caf6eeca8a32b407094b33b757d39',
+        'info_dict': {
+            'ext': 'mp4',
+            'id': '1ma1i6ferf4',
+            'title': '航海王 第1000集',
+            'description': 'Subtitle available on Sunday 4PM(GMT+8).',
+            'duration': 1430,
+            'timestamp': 1637488203,
+            'upload_date': '20211121',
+            'episode_number': 1000,
+            'episode': 'Episode 1000',
+            'series': 'One Piece',
+            'age_limit': 13,
+            'average_rating': float,
+        },
+        'params': {
+            'format': '500',
+        },
+        'expected_warnings': ['format is restricted']
+    }, {
+        # VIP-restricted video
+        'url': 'https://www.iq.com/play/mermaid-in-the-fog-2021-gbdpx13bs4',
+        'only_matching': True
+    }]
+    _BID_TAGS = {
+        '100': '240P',
+        '200': '360P',
+        '300': '480P',
+        '500': '720P',
+        '600': '1080P',
+        '610': '1080P50',
+        '700': '2K',
+        '800': '4K',
+    }
+    _LID_TAGS = {
+        '1': 'zh_CN',
+        '2': 'zh_TW',
+        '3': 'en',
+        '18': 'th',
+        '21': 'my',
+        '23': 'vi',
+        '24': 'id',
+        '26': 'es',
+        '28': 'ar',
+    }
+
+    _DASH_JS = '''
+        console.log(page.evaluate(function() {
+            var tvid = "%(tvid)s"; var vid = "%(vid)s"; var src = "%(src)s";
+            var uid = "%(uid)s"; var dfp = "%(dfp)s"; var mode = "%(mode)s"; var lang = "%(lang)s";
+            var bid_list = %(bid_list)s; var ut_list = %(ut_list)s; var tm = new Date().getTime();
+            var cmd5x_func = %(cmd5x_func)s; var cmd5x_exporter = {}; cmd5x_func({}, cmd5x_exporter, {}); var cmd5x = cmd5x_exporter.cmd5x;
+            var authKey = cmd5x(cmd5x('') + tm + '' + tvid);
+            var k_uid = Array.apply(null, Array(32)).map(function() {return Math.floor(Math.random() * 15).toString(16)}).join('');
+            var dash_paths = {};
+            bid_list.forEach(function(bid) {
+                var query = {
+                    'tvid': tvid,
+                    'bid': bid,
+                    'ds': 1,
+                    'vid': vid,
+                    'src': src,
+                    'vt': 0,
+                    'rs': 1,
+                    'uid': uid,
+                    'ori': 'pcw',
+                    'ps': 1,
+                    'k_uid': k_uid,
+                    'pt': 0,
+                    'd': 0,
+                    's': '',
+                    'lid': '',
+                    'slid': 0,
+                    'cf': '',
+                    'ct': '',
+                    'authKey': authKey,
+                    'k_tag': 1,
+                    'ost': 0,
+                    'ppt': 0,
+                    'dfp': dfp,
+                    'prio': JSON.stringify({
+                        'ff': 'f4v',
+                        'code': 2
+                    }),
+                    'k_err_retries': 0,
+                    'up': '',
+                    'su': 2,
+                    'applang': lang,
+                    'sver': 2,
+                    'X-USER-MODE': mode,
+                    'qd_v': 2,
+                    'tm': tm,
+                    'qdy': 'a',
+                    'qds': 0,
+                    'k_ft1': 141287244169348,
+                    'k_ft4': 34359746564,
+                    'k_ft5': 1,
+                    'bop': JSON.stringify({
+                        'version': '10.0',
+                        'dfp': dfp
+                    }),
+                };
+                var enc_params = [];
+                for (var prop in query) {
+                    enc_params.push(encodeURIComponent(prop) + '=' + encodeURIComponent(query[prop]));
+                }
+                ut_list.forEach(function(ut) {
+                    enc_params.push('ut=' + ut);
+                })
+                var dash_path = '/dash?' + enc_params.join('&'); dash_path += '&vf=' + cmd5x(dash_path);
+                dash_paths[bid] = dash_path;
+            });
+            return JSON.stringify(dash_paths);
+        }));
+        saveAndExit();
+    '''

+    def _extract_vms_player_js(self, webpage, video_id):
+        player_js_cache = self._downloader.cache.load('iq', 'player_js')
+        if player_js_cache:
+            return player_js_cache
+        webpack_js_url = self._proto_relative_url(self._search_regex(
+            r'<script src="((?:https?)?//stc.iqiyipic.com/_next/static/chunks/webpack-\w+\.js)"', webpage, 'webpack URL'))
+        webpack_js = self._download_webpage(webpack_js_url, video_id, note='Downloading webpack JS', errnote='Unable to download webpack JS')
+        webpack_map1, webpack_map2 = [self._parse_json(js_map, video_id, transform_source=js_to_json) for js_map in self._search_regex(
+            r'\(({[^}]*})\[\w+\][^\)]*\)\s*\+\s*["\']\.["\']\s*\+\s*({[^}]*})\[\w+\]\+["\']\.js', webpack_js, 'JS locations', group=(1, 2))]
+        for module_index in reversed(list(webpack_map2.keys())):
+            module_js = self._download_webpage(
+                f'https://stc.iqiyipic.com/_next/static/chunks/{webpack_map1.get(module_index, module_index)}.{webpack_map2[module_index]}.js',
+                video_id, note=f'Downloading #{module_index} module JS', errnote='Unable to download module JS', fatal=False) or ''
+            if 'vms request' in module_js:
+                self._downloader.cache.store('iq', 'player_js', module_js)
+                return module_js
+        raise ExtractorError('Unable to extract player JS')
+
+    def _extract_cmd5x_function(self, webpage, video_id):
+        return self._search_regex(r',\s*(function\s*\([^\)]*\)\s*{\s*var _qda.+_qdc\(\)\s*})\s*,',
+                                  self._extract_vms_player_js(webpage, video_id), 'signature function')
+
+    def _update_bid_tags(self, webpage, video_id):
+        extracted_bid_tags = self._parse_json(
+            self._search_regex(
+                r'arguments\[1\][^,]*,\s*function\s*\([^\)]*\)\s*{\s*"use strict";?\s*var \w=({.+}})\s*,\s*\w\s*=\s*{\s*getNewVd',
+                self._extract_vms_player_js(webpage, video_id), 'video tags', default=''),
+            video_id, transform_source=js_to_json, fatal=False)
+        if not extracted_bid_tags:
+            return
+        self._BID_TAGS = {
+            bid: traverse_obj(extracted_bid_tags, (bid, 'value'), expected_type=str, default=self._BID_TAGS.get(bid))
+            for bid in extracted_bid_tags.keys()
+        }
+
+    def _get_cookie(self, name, default=None):
+        cookie = self._get_cookies('https://iq.com/').get(name)
+        return cookie.value if cookie else default
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        self._update_bid_tags(webpage, video_id)
+
+        next_props = self._search_nextjs_data(webpage, video_id)['props']
+        page_data = next_props['initialState']['play']
+        video_info = page_data['curVideoInfo']
+
+        uid = traverse_obj(
+            self._parse_json(
+                self._get_cookie('I00002', '{}'), video_id, transform_source=compat_urllib_parse_unquote, fatal=False),
+            ('data', 'uid'), default=0)
+
+        if uid:
+            vip_data = self._download_json(
+                'https://pcw-api.iq.com/api/vtype', video_id, note='Downloading VIP data', errnote='Unable to download VIP data', query={
+                    'batch': 1,
+                    'platformId': 3,
+                    'modeCode': self._get_cookie('mod', 'intl'),
+                    'langCode': self._get_cookie('lang', 'en_us'),
+                    'deviceId': self._get_cookie('QC005', '')
+                }, fatal=False)
+            ut_list = traverse_obj(vip_data, ('data', 'all_vip', ..., 'vipType'), expected_type=str_or_none, default=[])
+        else:
+            ut_list = ['0']
+
+        # bid 0 as an initial format checker
+        dash_paths = self._parse_json(PhantomJSwrapper(self).get(
+            url, html='<!DOCTYPE html>', video_id=video_id, note2='Executing signature code',
+            jscode=self._DASH_JS % {
+                'tvid': video_info['tvId'],
+                'vid': video_info['vid'],
+                'src': traverse_obj(next_props, ('initialProps', 'pageProps', 'ptid'),
+                                    expected_type=str, default='04022001010011000000'),
+                'uid': uid,
+                'dfp': self._get_cookie('dfp', ''),
+                'mode': self._get_cookie('mod', 'intl'),
+                'lang': self._get_cookie('lang', 'en_us'),
+                'bid_list': '[' + ','.join(['0', *self._BID_TAGS.keys()]) + ']',
+                'ut_list': '[' + ','.join(ut_list) + ']',
+                'cmd5x_func': self._extract_cmd5x_function(webpage, video_id),
+            })[1].strip(), video_id)
+
+        formats, subtitles = [], {}
+        initial_format_data = self._download_json(
+            urljoin('https://cache-video.iq.com', dash_paths['0']), video_id,
+            note='Downloading initial video format info', errnote='Unable to download initial video format info')['data']
+
+        preview_time = traverse_obj(
+            initial_format_data, ('boss_ts', (None, 'data'), ('previewTime', 'rtime')), expected_type=float_or_none, get_all=False)
+        if traverse_obj(initial_format_data, ('boss_ts', 'data', 'prv'), expected_type=int_or_none):
+            self.report_warning('This preview video is limited%s' % format_field(preview_time, template=' to %s seconds'))
+
+        # TODO: Extract audio-only formats
+        for bid in set(traverse_obj(initial_format_data, ('program', 'video', ..., 'bid'), expected_type=str_or_none, default=[])):
+            dash_path = dash_paths.get(bid)
+            if not dash_path:
+                self.report_warning(f'Unknown format id: {bid}. It is currently not being extracted')
+                continue
+            format_data = traverse_obj(self._download_json(
+                urljoin('https://cache-video.iq.com', dash_path), video_id,
+                note=f'Downloading format data for {self._BID_TAGS[bid]}', errnote='Unable to download format data',
+                fatal=False), 'data', expected_type=dict)
+
+            video_format = traverse_obj(format_data, ('program', 'video', lambda _, v: str(v['bid']) == bid),
+                                        expected_type=dict, default=[], get_all=False) or {}
+            extracted_formats = []
+            if video_format.get('m3u8Url'):
+                extracted_formats.extend(self._extract_m3u8_formats(
+                    urljoin(format_data.get('dm3u8', 'https://cache-m.iq.com/dc/dt/'), video_format['m3u8Url']),
+                    video_id, 'mp4', m3u8_id=bid, fatal=False))
+            if video_format.get('mpdUrl'):
+                # TODO: Properly extract mpd hostname
+                extracted_formats.extend(self._extract_mpd_formats(
+                    urljoin(format_data.get('dm3u8', 'https://cache-m.iq.com/dc/dt/'), video_format['mpdUrl']),
+                    video_id, mpd_id=bid, fatal=False))
+            if video_format.get('m3u8'):
+                ff = video_format.get('ff', 'ts')
+                if ff == 'ts':
+                    m3u8_formats, _ = self._parse_m3u8_formats_and_subtitles(
+                        video_format['m3u8'], ext='mp4', m3u8_id=bid, fatal=False)
+                    extracted_formats.extend(m3u8_formats)
+                elif ff == 'm4s':
+                    mpd_data = traverse_obj(
+                        self._parse_json(video_format['m3u8'], video_id, fatal=False), ('payload', ..., 'data'), expected_type=str)
+                    if not mpd_data:
+                        continue
+                    mpd_formats, _ = self._parse_mpd_formats_and_subtitles(
+                        mpd_data, bid, format_data.get('dm3u8', 'https://cache-m.iq.com/dc/dt/'))
+                    extracted_formats.extend(mpd_formats)
+                else:
+                    self.report_warning(f'{ff} formats are currently not supported')
+
+            if not extracted_formats:
+                if video_format.get('s'):
+                    self.report_warning(f'{self._BID_TAGS[bid]} format is restricted')
+                else:
+                    self.report_warning(f'Unable to extract {self._BID_TAGS[bid]} format')
+            for f in extracted_formats:
+                f.update({
+                    'quality': qualities(list(self._BID_TAGS.keys()))(bid),
+                    'format_note': self._BID_TAGS[bid],
+                    **parse_resolution(video_format.get('scrsz'))
+                })
+            formats.extend(extracted_formats)
+
+        self._sort_formats(formats)
+
+        for sub_format in traverse_obj(initial_format_data, ('program', 'stl', ...), expected_type=dict, default=[]):
+            lang = self._LID_TAGS.get(str_or_none(sub_format.get('lid')), sub_format.get('_name'))
+            subtitles.setdefault(lang, []).extend([{
+                'ext': format_ext,
+                'url': urljoin(initial_format_data.get('dstl', 'http://meta.video.iqiyi.com'), sub_format[format_key])
+            } for format_key, format_ext in [('srt', 'srt'), ('webvtt', 'vtt')] if sub_format.get(format_key)])
+
+        extra_metadata = page_data.get('albumInfo') if video_info.get('albumId') and page_data.get('albumInfo') else video_info
+        return {
+            'id': video_id,
+            'title': video_info['name'],
+            'formats': formats,
+            'subtitles': subtitles,
+            'description': video_info.get('mergeDesc'),
+            'duration': parse_duration(video_info.get('len')),
+            'age_limit': parse_age_limit(video_info.get('rating')),
+            'average_rating': traverse_obj(page_data, ('playScoreInfo', 'score'), expected_type=float_or_none),
+            'timestamp': parse_iso8601(video_info.get('isoUploadDate')),
+            'categories': traverse_obj(extra_metadata, ('videoTagMap', ..., ..., 'name'), expected_type=str),
+            'cast': traverse_obj(extra_metadata, ('actorArr', ..., 'name'), expected_type=str),
+            'episode_number': int_or_none(video_info.get('order')) or None,
+            'series': video_info.get('albumName'),
+        }
+
+
+class IqAlbumIE(InfoExtractor):
+    IE_NAME = 'iq.com:album'
+    _VALID_URL = r'https?://(?:www\.)?iq\.com/album/(?:[\w%-]*-)?(?P<id>\w+)'
+    _TESTS = [{
+        'url': 'https://www.iq.com/album/one-piece-1999-1bk9icvr331',
+        'info_dict': {
+            'id': '1bk9icvr331',
+            'title': 'One Piece',
+            'description': 'Subtitle available on Sunday 4PM(GMT+8).'
+        },
+        'playlist_mincount': 238
+    }, {
+        # Movie/single video
+        'url': 'https://www.iq.com/album/九龙城寨-2021-22yjnij099k',
+        'info_dict': {
+            'ext': 'mp4',
+            'id': '22yjnij099k',
+            'title': '九龙城寨',
+            'description': 'md5:8a09f50b8ba0db4dc69bc7c844228044',
+            'duration': 5000,
+            'timestamp': 1641911371,
+            'upload_date': '20220111',
+            'series': '九龙城寨',
+            'cast': ['Shi Yan Neng', 'Yu Lang', 'Peter lv', 'Sun Zi Jun', 'Yang Xiao Bo'],
+            'age_limit': 13,
+            'average_rating': float,
+        },
+        'expected_warnings': ['format is restricted']
+    }]
+
+    def _entries(self, album_id_num, page_ranges, album_id=None, mode_code='intl', lang_code='en_us'):
+        for page_range in page_ranges:
+            page = self._download_json(
+                f'https://pcw-api.iq.com/api/episodeListSource/{album_id_num}', album_id,
+                note=f'Downloading video list episodes {page_range.get("msg", "")}',
+                errnote='Unable to download video list', query={
+                    'platformId': 3,
+                    'modeCode': mode_code,
+                    'langCode': lang_code,
+                    'endOrder': page_range['to'],
+                    'startOrder': page_range['from']
+                })
+            for video in page['data']['epg']:
+                yield self.url_result('https://www.iq.com/play/%s' % (video.get('playLocSuffix') or video['qipuIdStr']),
+                                      IqIE.ie_key(), video.get('qipuIdStr'), video.get('name'))
+
+    def _real_extract(self, url):
+        album_id = self._match_id(url)
+        webpage = self._download_webpage(url, album_id)
+        next_data = self._search_nextjs_data(webpage, album_id)
+        album_data = next_data['props']['initialState']['album']['videoAlbumInfo']
+
+        if album_data.get('videoType') == 'singleVideo':
+            return self.url_result('https://www.iq.com/play/%s' % album_id, IqIE.ie_key())
+        return self.playlist_result(
+            self._entries(album_data['albumId'], album_data['totalPageRange'], album_id,
+                          traverse_obj(next_data, ('props', 'initialProps', 'pageProps', 'modeCode')),
+                          traverse_obj(next_data, ('props', 'initialProps', 'pageProps', 'langCode'))),
+            album_id, album_data.get('name'), album_data.get('desc'))
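The `/dash` request built inside `_DASH_JS` above is signed twice with the site's obfuscated `cmd5x` function: once for the `authKey` field (over an empty-string digest, the timestamp and the tvid) and once for the trailing `vf` parameter (over the whole path). A reduced sketch of that scheme, with `cmd5x` passed in as a callable, since the real function only exists inside the player JS that the extractor runs under PhantomJS; only a few of the query fields are shown:

    import time
    from urllib.parse import urlencode

    def build_dash_path(tvid, vid, bid, ut_list, cmd5x):
        # cmd5x is the site's obfuscated digest function, extracted from the
        # player JS and executed under PhantomJS by the extractor above
        tm = int(time.time() * 1000)
        query = urlencode({
            'tvid': tvid,
            'bid': bid,  # quality id, e.g. '500' -> 720P (see _BID_TAGS)
            'vid': vid,
            'authKey': cmd5x(cmd5x('') + str(tm) + tvid),
            'tm': tm,
        })
        path = '/dash?' + query + ''.join(f'&ut={ut}' for ut in ut_list)
        # second signature, this time over the assembled path itself
        return path + '&vf=' + cmd5x(path)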
diff --git a/hypervideo_dl/extractor/itprotv.py b/hypervideo_dl/extractor/itprotv.py
new file mode 100644
index 0000000..64cb4e6
--- /dev/null
+++ b/hypervideo_dl/extractor/itprotv.py
@@ -0,0 +1,141 @@
+# coding: utf-8
+
+import re
+
+from .common import InfoExtractor
+
+from ..utils import (
+    int_or_none,
+    str_or_none,
+    traverse_obj,
+    urljoin
+)
+
+
+class ITProTVBaseIE(InfoExtractor):
+    _ENDPOINTS = {
+        'course': 'course?url={}&brand=00002560-0000-3fa9-0000-1d61000035f3',
+        'episode': 'brand/00002560-0000-3fa9-0000-1d61000035f3/episode?url={}'
+    }
+
+    def _call_api(self, ep, item_id, webpage):
+        return self._download_json(
+            f'https://api.itpro.tv/api/urza/v3/consumer-web/{self._ENDPOINTS[ep].format(item_id)}',
+            item_id, note=f'Fetching {ep} data API',
+            headers={'Authorization': f'Bearer {self._fetch_jwt(webpage)}'})[ep]
+
+    def _fetch_jwt(self, webpage):
+        return self._search_regex(r'{"passedToken":"([\w-]+\.[\w-]+\.[\w-]+)",', webpage, 'jwt')
+
+    def _check_if_logged_in(self, webpage):
+        if re.search(r'{\s*member\s*:\s*null', webpage):
+            self.raise_login_required()
+
+
+class ITProTVIE(ITProTVBaseIE):
+    _VALID_URL = r'https://app.itpro.tv/course/(?P<course>[\w-]+)/(?P<id>[\w-]+)'
+    _TESTS = [{
+        'url': 'https://app.itpro.tv/course/guided-tour/introductionitprotv',
+        'md5': 'bca4a28c2667fd1a63052e71a94bb88c',
+        'info_dict': {
+            'id': 'introductionitprotv',
+            'ext': 'mp4',
+            'title': 'An Introduction to ITProTV 101',
+            'thumbnail': 'https://itprotv-image-bucket.s3.amazonaws.com/getting-started/itprotv-101-introduction-PGM.11_39_56_02.Still001.png',
+            'description': 'md5:b175c2c3061ce35a4dd33865b2c1da4e',
+            'duration': 269,
+            'series': 'ITProTV 101',
+            'series_id': 'guided-tour',
+            'availability': 'needs_auth',
+            'chapter': 'ITProTV 101',
+            'chapter_number': 1,
+            'chapter_id': '5dbb3de426b46c0010b5d1b6'
+        },
+    }, {
+        'url': 'https://app.itpro.tv/course/beyond-tech/job-interview-tips',
+        'md5': '101a299b98c47ccf4c67f9f0951defa8',
+        'info_dict': {
+            'id': 'job-interview-tips',
+            'ext': 'mp4',
+            'title': 'Job Interview Tips',
+            'thumbnail': 'https://s3.amazonaws.com:443/production-itprotv-thumbnails/2f370bf5-294d-4bbe-ab80-c0b5781630ea.png',
+            'description': 'md5:30d8ba483febdf89ec85623aad3c3cb6',
+            'duration': 267,
+            'series': 'Beyond Tech',
+            'series_id': 'beyond-tech',
+            'availability': 'needs_auth',
+            'chapter': 'Job Development',
+            'chapter_number': 2,
+            'chapter_id': '5f7c78d424330c000edf04d9'
+        },
+    }]
+
+    def _real_extract(self, url):
+        episode_id, course_name = self._match_valid_url(url).group('id', 'course')
+        webpage = self._download_webpage(url, episode_id)
+        self._check_if_logged_in(webpage)
+        course = self._call_api('course', course_name, webpage)
+        episode = self._call_api('episode', episode_id, webpage)
+
+        chapter_number, chapter = next((
+            (i, topic) for i, topic in enumerate(course.get('topics') or [], 1)
+            if traverse_obj(topic, 'id') == episode.get('topic')), (None, {}))
+
+        return {
+            'id': episode_id,
+            'title': episode.get('title'),
+            'description': episode.get('description'),
+            'thumbnail': episode.get('thumbnail'),
+            'formats': [
+                {'url': episode[f'jwVideo{h}Embed'], 'height': h}
+                for h in (320, 480, 720, 1080) if episode.get(f'jwVideo{h}Embed')
+            ],
+            'duration': int_or_none(episode.get('length')),
+            'series': course.get('name'),
+            'series_id': course.get('url'),
+            'chapter': str_or_none(chapter.get('title')),
+            'chapter_number': chapter_number,
+            'chapter_id': str_or_none(chapter.get('id')),
+            'subtitles': {
+                'en': [{'ext': 'vtt', 'data': episode['enCaptionData']}]
+            } if episode.get('enCaptionData') else None,
+        }
+
+
+class ITProTVCourseIE(ITProTVBaseIE):
+    _VALID_URL = r'https?://app.itpro.tv/course/(?P<id>[\w-]+)/?(?:$|[#?])'
+    _TESTS = [{
+        'url': 'https://app.itpro.tv/course/guided-tour',
+        'info_dict': {
+            'id': 'guided-tour',
+            'description': 'md5:b175c2c3061ce35a4dd33865b2c1da4e',
+            'title': 'ITProTV 101',
+        },
+        'playlist_count': 6
+    }, {
+        'url': 'https://app.itpro.tv/course/beyond-tech',
+        'info_dict': {
+            'id': 'beyond-tech',
+            'description': 'md5:44cd99855e7f81a15ce1269bd0621fed',
+            'title': 'Beyond Tech'
+        },
+        'playlist_count': 15
+    }]
+
+    def _real_extract(self, url):
+        course_id = self._match_id(url)
+        webpage = self._download_webpage(url, course_id)
+        self._check_if_logged_in(webpage)
+        course = self._call_api('course', course_id, webpage)
+
+        entries = [self.url_result(
+            urljoin(url, f'{course_id}/{episode["url"]}'), ITProTVIE,
+            episode['url'], episode.get('title'), url_transparent=True)
+            for episode in course['episodes']]
+
+        return self.playlist_result(
+            entries, course_id, course.get('name'), course.get('description'))
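Both ITProTV endpoints funnel through `_call_api`, which templates a brand-scoped REST path and authenticates with the JWT the page embeds as `passedToken`. A rough `requests` equivalent, reusing the brand id and paths from the extractor above; anything beyond that is an assumption:

    import re
    import requests

    BRAND = '00002560-0000-3fa9-0000-1d61000035f3'

    def fetch_jwt(webpage_html):
        m = re.search(r'{"passedToken":"([\w-]+\.[\w-]+\.[\w-]+)",', webpage_html)
        if not m:
            raise RuntimeError('no JWT in page: probably not logged in')
        return m.group(1)

    def call_api(kind, item_id, jwt):
        path = {
            'course': f'course?url={item_id}&brand={BRAND}',
            'episode': f'brand/{BRAND}/episode?url={item_id}',
        }[kind]
        resp = requests.get(f'https://api.itpro.tv/api/urza/v3/consumer-web/{path}',
                            headers={'Authorization': f'Bearer {jwt}'})
        resp.raise_for_status()
        return resp.json()[kind]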
diff --git a/hypervideo_dl/extractor/itv.py b/hypervideo_dl/extractor/itv.py
index 4cd34a2..66705a2 100644
--- a/hypervideo_dl/extractor/itv.py
+++ b/hypervideo_dl/extractor/itv.py
@@ -117,7 +117,7 @@ class ITVIE(InfoExtractor):
         # See: https://github.com/hypervideo/hypervideo/issues/986
         platform_tag_subs, featureset_subs = next(
             ((platform_tag, featureset)
-             for platform_tag, featuresets in reversed(variants.items()) for featureset in featuresets
+             for platform_tag, featuresets in reversed(list(variants.items())) for featureset in featuresets
              if try_get(featureset, lambda x: x[2]) == 'outband-webvtt'),
             (None, None))
 
@@ -146,8 +146,8 @@ class ITVIE(InfoExtractor):
         # See: https://github.com/hypervideo/hypervideo/issues/986
         platform_tag_video, featureset_video = next(
             ((platform_tag, featureset)
-             for platform_tag, featuresets in reversed(variants.items()) for featureset in featuresets
-             if try_get(featureset, lambda x: x[:2]) == ['hls', 'aes']),
+             for platform_tag, featuresets in reversed(list(variants.items())) for featureset in featuresets
+             if set(try_get(featureset, lambda x: x[:2]) or []) == {'aes', 'hls'}),
             (None, None))
         if not platform_tag_video or not featureset_video:
             raise ExtractorError('No downloads available', expected=True, video_id=video_id)
@@ -220,35 +220,42 @@ class ITVIE(InfoExtractor):
 
 
 class ITVBTCCIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?itv\.com/btcc/(?:[^/]+/)*(?P<id>[^/?#&]+)'
-    _TEST = {
+    _VALID_URL = r'https?://(?:www\.)?itv\.com/(?:news|btcc)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+    _TESTS = [{
         'url': 'https://www.itv.com/btcc/articles/btcc-2019-brands-hatch-gp-race-action',
         'info_dict': {
             'id': 'btcc-2019-brands-hatch-gp-race-action',
             'title': 'BTCC 2019: Brands Hatch GP race action',
         },
         'playlist_count': 12,
-    }
-    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1582188683001/HkiHLnNRx_default/index.html?videoId=%s'
+    }, {
+        'url': 'https://www.itv.com/news/2021-10-27/i-have-to-protect-the-country-says-rishi-sunak-as-uk-faces-interest-rate-hike',
+        'info_dict': {
+            'id': 'i-have-to-protect-the-country-says-rishi-sunak-as-uk-faces-interest-rate-hike',
+            'title': 'md5:6ef054dd9f069330db3dcc66cb772d32'
+        },
+        'playlist_count': 4
+    }]
+    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s'
 
     def _real_extract(self, url):
         playlist_id = self._match_id(url)
 
         webpage = self._download_webpage(url, playlist_id)
 
-        json_map = try_get(self._parse_json(self._html_search_regex(
-            '(?s)<script[^>]+id=[\'"]__NEXT_DATA__[^>]*>([^<]+)</script>', webpage, 'json_map'), playlist_id),
+        json_map = try_get(
+            self._search_nextjs_data(webpage, playlist_id),
             lambda x: x['props']['pageProps']['article']['body']['content']) or []
 
-        # Discard empty objects
-        video_ids = []
+        entries = []
         for video in json_map:
-            if video['data'].get('id'):
-                video_ids.append(video['data']['id'])
-
-        entries = [
-            self.url_result(
-                smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, {
+            if not any(video['data'].get(attr) == 'Brightcove' for attr in ('name', 'type')):
+                continue
+            video_id = video['data']['id']
+            account_id = video['data']['accountId']
+            player_id = video['data']['playerId']
+            entries.append(self.url_result(
+                smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id), {
                     # ITV does not like some GB IP ranges, so here are some
                     # IP blocks it accepts
                     'geo_ip_blocks': [
@@ -256,8 +263,7 @@ class ITVBTCCIE(InfoExtractor):
                     ],
                     'referrer': url,
                 }),
-                ie=BrightcoveNewIE.ie_key(), video_id=video_id)
-            for video_id in video_ids]
+                ie=BrightcoveNewIE.ie_key(), video_id=video_id))
 
         title = self._og_search_title(webpage, fatal=False)
 
diff --git a/hypervideo_dl/extractor/ivideon.py b/hypervideo_dl/extractor/ivideon.py
index 01e7b22..44b2208 100644
--- a/hypervideo_dl/extractor/ivideon.py
+++ b/hypervideo_dl/extractor/ivideon.py
@@ -75,7 +75,7 @@ class IvideonIE(InfoExtractor):
 
         return {
             'id': server_id,
-            'title': self._live_title(camera_name or server_id),
+            'title': camera_name or server_id,
             'description': description,
             'is_live': True,
             'formats': formats,
diff --git a/hypervideo_dl/extractor/iwara.py b/hypervideo_dl/extractor/iwara.py
index 254d986..c0e01e3 100644
--- a/hypervideo_dl/extractor/iwara.py
+++ b/hypervideo_dl/extractor/iwara.py
@@ -76,8 +76,7 @@ class IwaraIE(InfoExtractor):
                 'age_limit': age_limit,
             }
 
-        title = remove_end(self._html_search_regex(
-            r'<title>([^<]+)</title>', webpage, 'title'), ' | Iwara')
+        title = remove_end(self._html_extract_title(webpage), ' | Iwara')
 
         thumbnail = self._html_search_regex(
             r'poster=[\'"]([^\'"]+)', webpage, 'thumbnail', default=None)
diff --git a/hypervideo_dl/extractor/jamendo.py b/hypervideo_dl/extractor/jamendo.py
index 1db7c64..755d970 100644
--- a/hypervideo_dl/extractor/jamendo.py
+++ b/hypervideo_dl/extractor/jamendo.py
@@ -59,7 +59,7 @@ class JamendoIE(InfoExtractor):
         })[0]
 
     def _real_extract(self, url):
-        track_id, display_id = self._VALID_URL_RE.match(url).groups()
+        track_id, display_id = self._match_valid_url(url).groups()
         # webpage = self._download_webpage(
         #     'https://www.jamendo.com/track/' + track_id, track_id)
         # models = self._parse_json(self._html_search_regex(
diff --git a/hypervideo_dl/extractor/joj.py b/hypervideo_dl/extractor/joj.py
index 6376181..7350f53 100644
--- a/hypervideo_dl/extractor/joj.py
+++ b/hypervideo_dl/extractor/joj.py
@@ -6,6 +6,7 @@ import re
 from .common import InfoExtractor
 from ..compat import compat_str
 from ..utils import (
+    format_field,
     int_or_none,
     js_to_json,
     try_get,
@@ -72,7 +73,7 @@ class JojIE(InfoExtractor):
                 r'(\d+)[pP]\.', format_url, 'height', default=None)
             formats.append({
                 'url': format_url,
-                'format_id': '%sp' % height if height else None,
+                'format_id': format_field(height, template='%sp'),
                 'height': int(height),
             })
         if not formats:
diff --git a/hypervideo_dl/extractor/kakao.py b/hypervideo_dl/extractor/kakao.py
index 97c986d..483ab71 100644
--- a/hypervideo_dl/extractor/kakao.py
+++ b/hypervideo_dl/extractor/kakao.py
@@ -3,10 +3,12 @@ from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..compat import compat_str
+from ..compat import compat_HTTPError
 from ..utils import (
+    ExtractorError,
     int_or_none,
     strip_or_none,
+    str_or_none,
     traverse_obj,
     unified_timestamp,
 )
@@ -24,10 +26,17 @@ class KakaoIE(InfoExtractor):
             'id': '301965083',
             'ext': 'mp4',
             'title': '乃木坂46 バナナマン 「3期生紹介コーナーが始動!顔高低差GPも!」 『乃木坂工事中』',
-            'uploader_id': 2671005,
+            'description': '',
+            'uploader_id': '2671005',
             'uploader': '그랑그랑이',
             'timestamp': 1488160199,
             'upload_date': '20170227',
+            'like_count': int,
+            'thumbnail': r're:http://.+/thumb\.png',
+            'tags': ['乃木坂'],
+            'view_count': int,
+            'duration': 1503,
+            'comment_count': int,
         }
     }, {
         'url': 'http://tv.kakao.com/channel/2653210/cliplink/300103180',
@@ -37,11 +46,21 @@ class KakaoIE(InfoExtractor):
             'ext': 'mp4',
             'description': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)\r\n\r\n[쇼! 음악중심] 20160611, 507회',
             'title': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)',
-            'uploader_id': 2653210,
+            'uploader_id': '2653210',
             'uploader': '쇼! 음악중심',
             'timestamp': 1485684628,
             'upload_date': '20170129',
+            'like_count': int,
+            'thumbnail': r're:http://.+/thumb\.png',
+            'tags': 'count:28',
+            'view_count': int,
+            'duration': 184,
+            'comment_count': int,
         }
+    }, {
+        # geo restricted
+        'url': 'https://tv.kakao.com/channel/3643855/cliplink/412069491',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
@@ -73,19 +92,24 @@ class KakaoIE(InfoExtractor):
         title = clip.get('title') or clip_link.get('displayTitle')
 
         formats = []
-        for fmt in clip.get('videoOutputList', []):
+        for fmt in clip.get('videoOutputList') or []:
             profile_name = fmt.get('profile')
             if not profile_name or profile_name == 'AUDIO':
                 continue
             query.update({
                 'profile': profile_name,
-                'fields': '-*,url',
+                'fields': '-*,code,message,url',
             })
+            try:
+                fmt_url_json = self._download_json(
+                    cdn_api_base, video_id, query=query,
+                    note='Downloading video URL for profile %s' % profile_name)
+            except ExtractorError as e:
+                if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+                    resp = self._parse_json(e.cause.read().decode(), video_id)
+                    if resp.get('code') == 'GeoBlocked':
+                        self.raise_geo_restricted()
+                raise
 
-            fmt_url_json = self._download_json(
-                cdn_api_base, video_id,
-                'Downloading video URL for profile %s' % profile_name,
-                query=query, fatal=False)
             fmt_url = traverse_obj(fmt_url_json, ('videoLocation', 'url'))
             if not fmt_url:
                 continue
@@ -105,7 +129,7 @@ class KakaoIE(InfoExtractor):
         for thumb in clip.get('clipChapterThumbnailList') or []:
             thumbs.append({
                 'url': thumb.get('thumbnailUrl'),
-                'id': compat_str(thumb.get('timeInSec')),
+                'id': str(thumb.get('timeInSec')),
                 'preference': -1 if thumb.get('isDefault') else 0
             })
         top_thumbnail = clip.get('thumbnailUrl')
@@ -120,7 +144,7 @@ class KakaoIE(InfoExtractor):
             'title': title,
             'description': strip_or_none(clip.get('description')),
             'uploader': traverse_obj(clip_link, ('channel', 'name')),
-            'uploader_id': clip_link.get('channelId'),
+            'uploader_id': str_or_none(clip_link.get('channelId')),
             'thumbnails': thumbs,
             'timestamp': unified_timestamp(clip_link.get('createTime')),
             'duration': int_or_none(clip.get('duration')),
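The kakao change is a pattern worth noting: `'fields': '-*,code,message,url'` asks the CDN API to include an error code in the body, and the `except` branch reads the 403 response body to distinguish geo-blocking from other failures before re-raising. The same pattern with plain urllib, whose `HTTPError` body is readable like a response; the `GeoBlocked` code is copied from the extractor, the rest of the JSON shape is an assumption:

    import json
    import urllib.error
    import urllib.request

    def fetch_json(url):
        try:
            with urllib.request.urlopen(url) as resp:
                return json.load(resp)
        except urllib.error.HTTPError as e:
            if e.code == 403:
                # the error body is itself JSON once 'code' is requested via fields=
                body = json.loads(e.read().decode())
                if body.get('code') == 'GeoBlocked':
                    raise RuntimeError('this video is geo-blocked') from e
            raise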
diff --git a/hypervideo_dl/extractor/kaltura.py b/hypervideo_dl/extractor/kaltura.py
index c8f60ef..f6dfc9c 100644
--- a/hypervideo_dl/extractor/kaltura.py
+++ b/hypervideo_dl/extractor/kaltura.py
@@ -12,6 +12,7 @@ from ..compat import (
 from ..utils import (
     clean_html,
     ExtractorError,
+    format_field,
     int_or_none,
     unsmuggle_url,
     smuggle_url,
@@ -300,6 +301,7 @@ class KalturaIE(InfoExtractor):
             data_url = re.sub(r'/flvclipper/.*', '/serveFlavor', data_url)
 
         formats = []
+        subtitles = {}
         for f in flavor_assets:
             # Continue if asset is not ready
             if f.get('status') != 2:
@@ -343,13 +345,14 @@ class KalturaIE(InfoExtractor):
         if '/playManifest/' in data_url:
             m3u8_url = sign_url(data_url.replace(
                 'format/url', 'format/applehttp'))
-            formats.extend(self._extract_m3u8_formats(
+            fmts, subs = self._extract_m3u8_formats_and_subtitles(
                 m3u8_url, entry_id, 'mp4', 'm3u8_native',
-                m3u8_id='hls', fatal=False))
+                m3u8_id='hls', fatal=False)
+            formats.extend(fmts)
+            self._merge_subtitles(subs, target=subtitles)
 
         self._sort_formats(formats)
 
-        subtitles = {}
         if captions:
             for caption in captions.get('objects', []):
                 # Continue if caption is not ready
@@ -372,6 +375,6 @@ class KalturaIE(InfoExtractor):
             'thumbnail': info.get('thumbnailUrl'),
             'duration': info.get('duration'),
             'timestamp': info.get('createdAt'),
-            'uploader_id': info.get('userId') if info.get('userId') != 'None' else None,
+            'uploader_id': format_field(info, 'userId', ignore=('None', None)),
             'view_count': info.get('plays'),
         }
diff --git a/hypervideo_dl/extractor/keezmovies.py b/hypervideo_dl/extractor/keezmovies.py
index 027f43c..06dbcbb 100644
--- a/hypervideo_dl/extractor/keezmovies.py
+++ b/hypervideo_dl/extractor/keezmovies.py
@@ -8,6 +8,7 @@ from ..compat import compat_urllib_parse_unquote
 from ..utils import (
     determine_ext,
     ExtractorError,
+    format_field,
     int_or_none,
     str_to_int,
     strip_or_none,
@@ -69,7 +70,7 @@ class KeezMoviesIE(InfoExtractor):
                         video_url, title, 32).decode('utf-8')
                 formats.append({
                     'url': format_url,
-                    'format_id': '%dp' % height if height else None,
+                    'format_id': format_field(height, template='%dp'),
                     'height': height,
                     'tbr': tbr,
                 })
diff --git a/hypervideo_dl/extractor/kelbyone.py b/hypervideo_dl/extractor/kelbyone.py
new file mode 100644
index 0000000..20c26cf
--- /dev/null
+++ b/hypervideo_dl/extractor/kelbyone.py
@@ -0,0 +1,84 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class KelbyOneIE(InfoExtractor):
+    _VALID_URL = r'https?://members\.kelbyone\.com/course/(?P<id>[^$&?#/]+)'
+
+    _TESTS = [{
+        'url': 'https://members.kelbyone.com/course/glyn-dewis-mastering-selections/',
+        'playlist_mincount': 1,
+        'info_dict': {
+            'id': 'glyn-dewis-mastering-selections',
+            'title': 'Trailer - Mastering Selections in Photoshop',
+        },
+        'playlist': [{
+            'info_dict': {
+                'id': 'MkiOnLqK',
+                'ext': 'mp4',
+                'title': 'Trailer - Mastering Selections in Photoshop',
+                'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+                'thumbnail': 'https://content.jwplatform.com/v2/media/MkiOnLqK/poster.jpg?width=720',
+                'timestamp': 1601568639,
+                'duration': 90,
+                'upload_date': '20201001',
+            },
+        }]
+    }]
+
+    def _entries(self, playlist):
+        for item in playlist:
+            video_id = item['mediaid']
+            thumbnails = [{
+                'url': image.get('src'),
+                'width': int_or_none(image.get('width')),
+            } for image in item.get('images') or []]
+            formats, subtitles = [], {}
+            for source in item.get('sources') or []:
+                if not source.get('file'):
+                    continue
+                if source.get('type') == 'application/vnd.apple.mpegurl':
+                    fmts, subs = self._extract_m3u8_formats_and_subtitles(source['file'], video_id)
+                    formats.extend(fmts)
+                    subtitles = self._merge_subtitles(subs, subtitles)
+                elif source.get('type') == 'audio/mp4':
+                    formats.append({
+                        'format_id': source.get('label'),
+                        'url': source['file'],
+                        'vcodec': 'none',
+                    })
+                else:
+                    formats.append({
+                        'format_id': source.get('label'),
+                        'height': source.get('height'),
+                        'width': source.get('width'),
+                        'url': source['file'],
+                    })
+            for track in item.get('tracks') or []:
+                if track.get('kind') == 'captions' and track.get('file'):
+                    subtitles.setdefault('en', []).append({
+                        'url': track['file'],
+                    })
+            self._sort_formats(formats)
+            yield {
+                'id': video_id,
+                'title': item['title'],
+                'description': item.get('description'),
+                'thumbnails': thumbnails,
+                'thumbnail': item.get('image'),
+                'timestamp': item.get('pubdate'),
+                'duration': item.get('duration'),
+                'formats': formats,
+                'subtitles': subtitles,
+            }
+
+    def _real_extract(self, url):
+        item_id = self._match_id(url)
+        webpage = self._download_webpage(url, item_id)
+        playlist_url = self._html_search_regex(r'playlist"\:"(https.*content\.jwplatform\.com.*json)"', webpage, 'playlist url').replace('\\', '')
+        course_data = self._download_json(playlist_url, item_id)
+        return self.playlist_result(self._entries(course_data['playlist']), item_id,
+                                    course_data.get('title'), course_data.get('description'))
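KelbyOne serves a stock JW Platform v2 playlist JSON, which is all `_entries` consumes. A minimal illustration of the fields it relies on; the URLs here are hypothetical:

    item = {
        'mediaid': 'MkiOnLqK',
        'title': 'Trailer - Mastering Selections in Photoshop',
        'sources': [
            # expanded into many formats via _extract_m3u8_formats_and_subtitles
            {'file': 'https://cdn.example/manifest.m3u8', 'type': 'application/vnd.apple.mpegurl'},
            # mapped to a single audio-only format (vcodec='none')
            {'file': 'https://cdn.example/audio.m4a', 'type': 'audio/mp4', 'label': 'AAC Audio'},
            # progressive video, mapped 1:1 with label/width/height
            {'file': 'https://cdn.example/720.mp4', 'label': '720p', 'width': 1280, 'height': 720},
        ],
        'tracks': [{'kind': 'captions', 'file': 'https://cdn.example/en.vtt'}],
    }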
diff --git a/hypervideo_dl/extractor/kinopoisk.py b/hypervideo_dl/extractor/kinopoisk.py
index 9e8d01f..cdbb642 100644
--- a/hypervideo_dl/extractor/kinopoisk.py
+++ b/hypervideo_dl/extractor/kinopoisk.py
@@ -23,9 +23,6 @@ class KinoPoiskIE(InfoExtractor):
             'duration': 4533,
             'age_limit': 12,
         },
-        'params': {
-            'format': 'bestvideo',
-        },
     }, {
         'url': 'https://www.kinopoisk.ru/film/81041',
         'only_matching': True,
diff --git a/hypervideo_dl/extractor/koo.py b/hypervideo_dl/extractor/koo.py
index 8154ba7..2d6ed3b 100644
--- a/hypervideo_dl/extractor/koo.py
+++ b/hypervideo_dl/extractor/koo.py
@@ -8,7 +8,7 @@ from ..utils import (
 
 
 class KooIE(InfoExtractor):
-    _VALID_URL = r'(?:https?://)(?:www\.)?kooapp\.com/koo/[^/]+/(?P<id>[^/&#$?]+)'
+    _VALID_URL = r'https?://(?:www\.)?kooapp\.com/koo/[^/]+/(?P<id>[^/&#$?]+)'
     _TESTS = [{  # Test for video in the comments
         'url': 'https://www.kooapp.com/koo/ytdlpTestAccount/946c4189-bc2d-4524-b95b-43f641e2adde',
         'info_dict': {
diff --git a/hypervideo_dl/extractor/la7.py b/hypervideo_dl/extractor/la7.py
index 363fbd6..de985e4 100644
--- a/hypervideo_dl/extractor/la7.py
+++ b/hypervideo_dl/extractor/la7.py
@@ -7,8 +7,9 @@ from .common import InfoExtractor
 from ..utils import (
     determine_ext,
     float_or_none,
+    HEADRequest,
+    int_or_none,
     parse_duration,
-    smuggle_url,
     unified_strdate,
 )
 
@@ -25,19 +26,38 @@ class LA7IE(InfoExtractor):
         'url': 'http://www.la7.it/crozza/video/inccool8-02-10-2015-163722',
         'md5': '8b613ffc0c4bf9b9e377169fc19c214c',
         'info_dict': {
-            'id': '0_42j6wd36',
+            'id': 'inccool8-02-10-2015-163722',
             'ext': 'mp4',
             'title': 'Inc.Cool8',
             'description': 'Benvenuti nell\'incredibile mondo della INC. COOL. 8. dove “INC.” sta per “Incorporated” “COOL” sta per “fashion” ed Eight sta per il gesto atletico',
             'thumbnail': 're:^https?://.*',
-            'uploader_id': 'kdla7pillole@iltrovatore.it',
-            'timestamp': 1443814869,
             'upload_date': '20151002',
         },
     }, {
         'url': 'http://www.la7.it/omnibus/rivedila7/omnibus-news-02-07-2016-189077',
         'only_matching': True,
     }]
+    _HOST = 'https://awsvodpkg.iltrovatore.it'
+
+    def _generate_mp4_url(self, quality, m3u8_formats):
+        for f in m3u8_formats:
+            if f['vcodec'] != 'none' and quality in f['url']:
+                http_url = '%s%s.mp4' % (self._HOST, quality)
+
+                urlh = self._request_webpage(
+                    HEADRequest(http_url), quality,
+                    note='Check filesize', fatal=False)
+                if urlh:
+                    http_f = f.copy()
+                    del http_f['manifest_url']
+                    http_f.update({
+                        'format_id': http_f['format_id'].replace('hls-', 'https-'),
+                        'url': http_url,
+                        'protocol': 'https',
+                        'filesize_approx': int_or_none(urlh.headers.get('Content-Length', None)),
+                    })
+                    return http_f
+        return None
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
@@ -46,22 +66,30 @@ class LA7IE(InfoExtractor):
             url = '%s//%s' % (self.http_scheme(), url)
 
         webpage = self._download_webpage(url, video_id)
+        video_path = self._search_regex(r'(/content/.*?)\.mp4', webpage, 'video_path')
 
-        player_data = self._search_regex(
-            [r'(?s)videoParams\s*=\s*({.+?});', r'videoLa7\(({[^;]+})\);'],
-            webpage, 'player data')
-        vid = self._search_regex(r'vid\s*:\s*"(.+?)",', player_data, 'vid')
+        formats = self._extract_mpd_formats(
+            f'{self._HOST}/local/dash/,{video_path}.mp4.urlset/manifest.mpd',
+            video_id, mpd_id='dash', fatal=False)
+        m3u8_formats = self._extract_m3u8_formats(
+            f'{self._HOST}/local/hls/,{video_path}.mp4.urlset/master.m3u8',
+            video_id, 'mp4', m3u8_id='hls', fatal=False)
+        formats.extend(m3u8_formats)
+
+        for q in filter(None, video_path.split(',')):
+            http_f = self._generate_mp4_url(q, m3u8_formats)
+            if http_f:
+                formats.append(http_f)
+
+        self._sort_formats(formats)
 
         return {
-            '_type': 'url_transparent',
-            'url': smuggle_url('kaltura:103:%s' % vid, {
-                'service_url': 'http://nkdam.iltrovatore.it',
-            }),
             'id': video_id,
             'title': self._og_search_title(webpage, default=None),
             'description': self._og_search_description(webpage, default=None),
             'thumbnail': self._og_search_thumbnail(webpage, default=None),
-            'ie_key': 'Kaltura',
+            'formats': formats,
+            'upload_date': unified_strdate(self._search_regex(r'datetime="(.+?)"', webpage, 'upload_date', fatal=False))
         }
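The LA7 rewrite derives the DASH and HLS manifests from the `/content/...` path embedded in the page, then probes for a progressive MP4 with a HEAD request so it can attach an approximate filesize without downloading anything. The probe on its own, assuming the same host and URL shape as `_generate_mp4_url` above:

    import urllib.error
    import urllib.request

    def probe_mp4(video_path, host='https://awsvodpkg.iltrovatore.it'):
        req = urllib.request.Request(host + video_path + '.mp4', method='HEAD')
        try:
            with urllib.request.urlopen(req) as resp:
                return int(resp.headers.get('Content-Length') or 0)
        except urllib.error.HTTPError:
            return None  # no progressive rendition for this path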
diff --git a/hypervideo_dl/extractor/laola1tv.py b/hypervideo_dl/extractor/laola1tv.py
index fa21736..b5d27c2 100644
--- a/hypervideo_dl/extractor/laola1tv.py
+++ b/hypervideo_dl/extractor/laola1tv.py
@@ -112,7 +112,7 @@ class Laola1TvEmbedIE(InfoExtractor):
 
         return {
             'id': video_id,
-            'title': self._live_title(title) if is_live else title,
+            'title': title,
             'upload_date': unified_strdate(_v('time_date')),
             'uploader': _v('meta_organisation'),
             'categories': categories,
@@ -161,7 +161,7 @@ class Laola1TvBaseIE(Laola1TvEmbedIE):
         return {
             'id': video_id,
             'display_id': display_id,
-            'title': self._live_title(title) if is_live else title,
+            'title': title,
             'description': video_data.get('description'),
             'thumbnail': video_data.get('image'),
             'categories': categories,
diff --git a/hypervideo_dl/extractor/lastfm.py b/hypervideo_dl/extractor/lastfm.py
new file mode 100644
index 0000000..5215717
--- /dev/null
+++ b/hypervideo_dl/extractor/lastfm.py
@@ -0,0 +1,129 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import int_or_none, format_field
+
+
+class LastFMPlaylistBaseIE(InfoExtractor):
+    def _entries(self, url, playlist_id):
+        webpage = self._download_webpage(url, playlist_id)
+        start_page_number = int_or_none(self._search_regex(
+            r'\bpage=(\d+)', url, 'page', default=None)) or 1
+        last_page_number = int_or_none(self._search_regex(
+            r'>(\d+)</a>[^<]*</li>[^<]*<li[^>]+class="pagination-next', webpage, 'last_page', default=None))
+
+        for page_number in range(start_page_number, (last_page_number or start_page_number) + 1):
+            webpage = self._download_webpage(
+                url, playlist_id,
+                note='Downloading page %d%s' % (page_number, format_field(last_page_number, template=' of %d')),
+                query={'page': page_number})
+            page_entries = [
+                self.url_result(player_url, 'Youtube')
+                for player_url in set(re.findall(r'data-youtube-url="([^"]+)"', webpage))
+            ]
+
+            for e in page_entries:
+                yield e
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+        return self.playlist_result(self._entries(url, playlist_id), playlist_id)
+
+
+class LastFMPlaylistIE(LastFMPlaylistBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?last\.fm/(music|tag)/(?P<id>[^/]+)(?:/[^/]+)?/?(?:[?#]|$)'
+    _TESTS = [{
+        'url': 'https://www.last.fm/music/Oasis/(What%27s+the+Story)+Morning+Glory%3F',
+        'info_dict': {
+            'id': 'Oasis',
+        },
+        'playlist_count': 11,
+    }, {
+        'url': 'https://www.last.fm/music/Oasis',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.last.fm/music/Oasis/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.last.fm/music/Oasis?top_tracks_date_preset=ALL#top-tracks',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.last.fm/music/Oasis/+tracks',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.last.fm/music/Oasis/+tracks?page=2',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.last.fm/music/Oasis/+tracks?date_preset=LAST_90_DAYS#top-tracks',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.last.fm/tag/rock',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.last.fm/tag/rock/tracks',
+        'only_matching': True,
+    }]
+
+
+class LastFMUserIE(LastFMPlaylistBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?last\.fm/user/[^/]+/playlists/(?P<id>[^/#?]+)'
+    _TESTS = [{
+        'url': 'https://www.last.fm/user/mehq/playlists/12319471',
+        'info_dict': {
+            'id': '12319471',
+        },
+        'playlist_count': 30,
+    }]
+
+
+class LastFMIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?last\.fm/music(?:/[^/]+){2}/(?P<id>[^/#?]+)'
+    _TESTS = [{
+        'url': 'https://www.last.fm/music/Oasis/_/Wonderwall',
+        'md5': '9c4a70c2e84c03d54fe24229b9e13b7b',
+        'info_dict': {
+            'id': '6hzrDeceEKc',
+            'ext': 'mp4',
+            'title': 'Oasis - Wonderwall (Official Video)',
+            'thumbnail': r're:^https?://i.ytimg.com/.*\.jpg$',
+            'description': 'md5:0848669853c10687cc28e88b5756738f',
+            'uploader': 'Oasis',
+            'uploader_id': 'oasisinetofficial',
+            'upload_date': '20080207',
+            'album': '(What\'s The Story) Morning Glory? (Remastered)',
+            'track': 'Wonderwall (Remastered)',
+            'channel_id': 'UCUDVBtnOQi4c7E8jebpjc9Q',
+            'view_count': int,
+            'live_status': 'not_live',
+            'channel_url': 'https://www.youtube.com/channel/UCUDVBtnOQi4c7E8jebpjc9Q',
+            'tags': 'count:39',
+            'creator': 'Oasis',
+            'uploader_url': 're:^https?://www.youtube.com/user/oasisinetofficial',
+            'duration': 279,
+            'alt_title': 'Wonderwall (Remastered)',
+            'age_limit': 0,
+            'channel': 'Oasis',
+            'channel_follower_count': int,
+            'categories': ['Music'],
+            'availability': 'public',
+            'like_count': int,
+            'playable_in_embed': True,
+            'artist': 'Oasis',
+        },
+        'add_ie': ['Youtube'],
+    }, {
+        'url': 'https://www.last.fm/music/Oasis/_/Don%27t+Look+Back+In+Anger+-+Remastered/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.last.fm/music/Guns+N%27+Roses/_/Sweet+Child+o%27+Mine',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        player_url = self._search_regex(r'(?s)class="header-new-playlink"\s+href="([^"]+)"', webpage, 'player_url')
+        return self.url_result(player_url, 'Youtube')
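The Last.fm playlist extractors never touch an API: they scrape `data-youtube-url` attributes page by page, bounding the loop with the last page number read from the paginator. The core of `_entries` as a standalone sketch, with `requests` for brevity:

    import re
    import requests

    def youtube_urls(playlist_url):
        html = requests.get(playlist_url).text
        last = re.search(
            r'>(\d+)</a>[^<]*</li>[^<]*<li[^>]+class="pagination-next', html)
        for page in range(1, (int(last.group(1)) if last else 1) + 1):
            html = requests.get(playlist_url, params={'page': page}).text
            yield from set(re.findall(r'data-youtube-url="([^"]+)"', html))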
diff --git a/hypervideo_dl/extractor/lbry.py b/hypervideo_dl/extractor/lbry.py
index 0f87bf1..5d5457c 100644
--- a/hypervideo_dl/extractor/lbry.py
+++ b/hypervideo_dl/extractor/lbry.py
@@ -17,6 +17,7 @@ from ..utils import (
     parse_qs,
     OnDemandPagedList,
     try_get,
+    UnsupportedError,
     urljoin,
 )
 
@@ -184,28 +185,38 @@ class LBRYIE(LBRYBaseIE):
             display_id = compat_urllib_parse_unquote(display_id)
         uri = 'lbry://' + display_id
         result = self._resolve_url(uri, display_id, 'stream')
-        result_value = result['value']
-        if result_value.get('stream_type') not in self._SUPPORTED_STREAM_TYPES:
-            raise ExtractorError('Unsupported URL', expected=True)
-        claim_id = result['claim_id']
-        title = result_value['title']
-        streaming_url = self._call_api_proxy(
-            'get', claim_id, {'uri': uri}, 'streaming url')['streaming_url']
+        if result['value'].get('stream_type') in self._SUPPORTED_STREAM_TYPES:
+            claim_id, is_live, headers = result['claim_id'], False, None
+            streaming_url = self._call_api_proxy(
+                'get', claim_id, {'uri': uri}, 'streaming url')['streaming_url']
+            final_url = self._request_webpage(
+                streaming_url, display_id, note='Downloading streaming redirect url info').geturl()
+        elif result.get('value_type') == 'stream':
+            claim_id, is_live = result['signing_channel']['claim_id'], True
+            headers = {'referer': 'https://player.odysee.live/'}
+            live_data = self._download_json(
+                f'https://api.live.odysee.com/v1/odysee/live/{claim_id}', claim_id,
+                note='Downloading livestream JSON metadata')['data']
+            streaming_url = final_url = live_data.get('url')
+            if not final_url and not live_data.get('live'):
+                self.raise_no_formats('This stream is not live', True, claim_id)
+        else:
+            raise UnsupportedError(url)
+
         info = self._parse_stream(result, url)
-        urlh = self._request_webpage(
-            streaming_url, display_id, note='Downloading streaming redirect url info')
-        if determine_ext(urlh.geturl()) == 'm3u8':
+        if determine_ext(final_url) == 'm3u8':
             info['formats'] = self._extract_m3u8_formats(
-                urlh.geturl(), display_id, 'mp4', entry_protocol='m3u8_native',
-                m3u8_id='hls')
+                final_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls', live=is_live, headers=headers)
             self._sort_formats(info['formats'])
         else:
             info['url'] = streaming_url
-        info.update({
+        return {
+            **info,
             'id': claim_id,
-            'title': title,
-        })
-        return info
+            'title': result['value']['title'],
+            'is_live': is_live,
+            'http_headers': headers,
+        }
 
 
 class LBRYChannelIE(LBRYBaseIE):
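The LBRY change splits extraction into two branches: ordinary streams still resolve a `streaming_url` through the SDK proxy, while livestreams map the signing channel's claim id to an HLS URL via the Odysee live API and need a player referer on every request. A sketch of the livestream half; the endpoint and header are copied from the code above, the rest of the response shape is assumed:

    import requests

    def odysee_live_hls(channel_claim_id):
        headers = {'referer': 'https://player.odysee.live/'}
        data = requests.get(
            f'https://api.live.odysee.com/v1/odysee/live/{channel_claim_id}',
            headers=headers).json()['data']
        if not data.get('live'):
            return None, headers  # channel exists but is not currently on air
        # the m3u8 (and its segments) must be fetched with the same referer
        return data.get('url'), headers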
diff --git a/hypervideo_dl/extractor/lecturio.py b/hypervideo_dl/extractor/lecturio.py
index 9d22287..0ee1eeb 100644
--- a/hypervideo_dl/extractor/lecturio.py
+++ b/hypervideo_dl/extractor/lecturio.py
@@ -22,14 +22,7 @@ class LecturioBaseIE(InfoExtractor):
     _LOGIN_URL = 'https://app.lecturio.com/en/login'
     _NETRC_MACHINE = 'lecturio'
 
-    def _real_initialize(self):
-        self._login()
-
-    def _login(self):
-        username, password = self._get_login_info()
-        if username is None:
-            return
-
+    def _perform_login(self, username, password):
         # Sets some cookies
         _, urlh = self._download_webpage_handle(
             self._LOGIN_URL, None, 'Downloading login popup')
diff --git a/hypervideo_dl/extractor/lego.py b/hypervideo_dl/extractor/lego.py
index b9d8b16..901f43b 100644
--- a/hypervideo_dl/extractor/lego.py
+++ b/hypervideo_dl/extractor/lego.py
@@ -8,6 +8,7 @@ from ..compat import compat_HTTPError
 from ..utils import (
     ExtractorError,
     int_or_none,
+    join_nonempty,
     qualities,
 )
 
@@ -102,12 +103,8 @@ class LEGOIE(InfoExtractor):
                     m3u8_id=video_source_format, fatal=False))
             else:
                 video_source_quality = video_source.get('Quality')
-                format_id = []
-                for v in (video_source_format, video_source_quality):
-                    if v:
-                        format_id.append(v)
                 f = {
-                    'format_id': '-'.join(format_id),
+                    'format_id': join_nonempty(video_source_format, video_source_quality),
                     'quality': q(video_source_quality),
                     'url': video_source_url,
                 }
diff --git a/hypervideo_dl/extractor/limelight.py b/hypervideo_dl/extractor/limelight.py
index 369141d..b20681a 100644
--- a/hypervideo_dl/extractor/limelight.py
+++ b/hypervideo_dl/extractor/limelight.py
@@ -194,7 +194,7 @@ class LimelightBaseIE(InfoExtractor):
                 cc_url = cc.get('webvttFileUrl')
                 if not cc_url:
                     continue
-                lang = cc.get('languageCode') or self._search_regex(r'/[a-z]{2}\.vtt', cc_url, 'lang', default='en')
+                lang = cc.get('languageCode') or self._search_regex(r'/([a-z]{2})\.vtt', cc_url, 'lang', default='en')
                 subtitles.setdefault(lang, []).append({
                     'url': cc_url,
                 })
diff --git a/hypervideo_dl/extractor/line.py b/hypervideo_dl/extractor/line.py
index d4bcae6..987c434 100644
--- a/hypervideo_dl/extractor/line.py
+++ b/hypervideo_dl/extractor/line.py
@@ -5,95 +5,12 @@ from __future__ import unicode_literals
 from .common import InfoExtractor
 from ..compat import compat_str
 from ..utils import (
+    format_field,
     int_or_none,
-    js_to_json,
     str_or_none,
 )
 
 
-class LineTVIE(InfoExtractor):
-    _VALID_URL = r'https?://tv\.line\.me/v/(?P<id>\d+)_[^/]+-(?P<segment>ep\d+-\d+)'
-
-    _TESTS = [{
-        'url': 'https://tv.line.me/v/793123_goodbye-mrblack-ep1-1/list/69246',
-        'info_dict': {
-            'id': '793123_ep1-1',
-            'ext': 'mp4',
-            'title': 'Goodbye Mr.Black | EP.1-1',
-            'thumbnail': r're:^https?://.*\.jpg$',
-            'duration': 998.509,
-            'view_count': int,
-        },
-    }, {
-        'url': 'https://tv.line.me/v/2587507_%E6%B4%BE%E9%81%A3%E5%A5%B3%E9%86%ABx-ep1-02/list/185245',
-        'only_matching': True,
-    }]
-
-    def _real_extract(self, url):
-        series_id, segment = self._match_valid_url(url).groups()
-        video_id = '%s_%s' % (series_id, segment)
-
-        webpage = self._download_webpage(url, video_id)
-
-        player_params = self._parse_json(self._search_regex(
-            r'naver\.WebPlayer\(({[^}]+})\)', webpage, 'player parameters'),
-            video_id, transform_source=js_to_json)
-
-        video_info = self._download_json(
-            'https://global-nvapis.line.me/linetv/rmcnmv/vod_play_videoInfo.json',
-            video_id, query={
-                'videoId': player_params['videoId'],
-                'key': player_params['key'],
-            })
-
-        stream = video_info['streams'][0]
-        extra_query = '?__gda__=' + stream['key']['value']
-        formats = self._extract_m3u8_formats(
-            stream['source'] + extra_query, video_id, ext='mp4',
-            entry_protocol='m3u8_native', m3u8_id='hls')
-
-        for a_format in formats:
-            a_format['url'] += extra_query
-
-        duration = None
-        for video in video_info.get('videos', {}).get('list', []):
-            encoding_option = video.get('encodingOption', {})
-            abr = video['bitrate']['audio']
-            vbr = video['bitrate']['video']
-            tbr = abr + vbr
-            formats.append({
-                'url': video['source'],
-                'format_id': 'http-%d' % int(tbr),
-                'height': encoding_option.get('height'),
-                'width': encoding_option.get('width'),
-                'abr': abr,
-                'vbr': vbr,
-                'filesize': video.get('size'),
-            })
-            if video.get('duration') and duration is None:
-                duration = video['duration']
-
-        self._sort_formats(formats)
-
-        if formats and not formats[0].get('width'):
-            formats[0]['vcodec'] = 'none'
-
-        title = self._og_search_title(webpage)
-
-        # like_count requires an additional API request https://tv.line.me/api/likeit/getCount
-
-        return {
-            'id': video_id,
-            'title': title,
-            'formats': formats,
-            'extra_param_to_segment_url': extra_query[1:],
-            'duration': duration,
-            'thumbnails': [{'url': thumbnail['source']}
-                           for thumbnail in video_info.get('thumbnails', {}).get('list', [])],
-            'view_count': video_info.get('meta', {}).get('count'),
-        }
-
-
 class LineLiveBaseIE(InfoExtractor):
     _API_BASE_URL = 'https://live-api.line-apps.com/web/v4.0/channel/'
 
@@ -116,12 +33,12 @@ class LineLiveBaseIE(InfoExtractor):
 
         return {
             'id': broadcast_id,
-            'title': self._live_title(title) if is_live else title,
+            'title': title,
             'thumbnails': thumbnails,
             'timestamp': int_or_none(item.get('createdAt')),
             'channel': channel.get('name'),
             'channel_id': channel_id,
-            'channel_url': 'https://live.line.me/channels/' + channel_id if channel_id else None,
+            'channel_url': format_field(channel_id, template='https://live.line.me/channels/%s'),
             'duration': int_or_none(item.get('archiveDuration')),
             'view_count': int_or_none(item.get('viewerCount')),
             'comment_count': int_or_none(item.get('chatCount')),
@@ -132,16 +49,19 @@ class LineLiveBaseIE(InfoExtractor):
 class LineLiveIE(LineLiveBaseIE):
     _VALID_URL = r'https?://live\.line\.me/channels/(?P<channel_id>\d+)/broadcast/(?P<id>\d+)'
     _TESTS = [{
-        'url': 'https://live.line.me/channels/4867368/broadcast/16331360',
-        'md5': 'bc931f26bf1d4f971e3b0982b3fab4a3',
+        'url': 'https://live.line.me/channels/5833718/broadcast/18373277',
+        'md5': '2c15843b8cb3acd55009ddcb2db91f7c',
         'info_dict': {
-            'id': '16331360',
-            'title': '振りコピ講座😙😙😙',
+            'id': '18373277',
+            'title': '2021/12/05 (15分犬)定例譲渡会🐶',
             'ext': 'mp4',
-            'timestamp': 1617095132,
-            'upload_date': '20210330',
-            'channel': '白川ゆめか',
-            'channel_id': '4867368',
+            'timestamp': 1638674925,
+            'upload_date': '20211205',
+            'thumbnail': 'md5:e1f5817e60f4a72b7e43377cf308d7ef',
+            'channel_url': 'https://live.line.me/channels/5833718',
+            'channel': 'Yahooニュース掲載🗞プロフ見てね🐕🐕',
+            'channel_id': '5833718',
+            'duration': 937,
             'view_count': int,
             'comment_count': int,
             'is_live': False,
@@ -193,8 +113,8 @@ class LineLiveChannelIE(LineLiveBaseIE):
         'url': 'https://live.line.me/channels/5893542',
         'info_dict': {
             'id': '5893542',
-            'title': 'いくらちゃん',
-            'description': 'md5:c3a4af801f43b2fac0b02294976580be',
+            'title': 'いくらちゃんだよぉ🦒',
+            'description': 'md5:4d418087973ad081ceb1b3481f0b1816',
         },
         'playlist_mincount': 29
     }
diff --git a/hypervideo_dl/extractor/linkedin.py b/hypervideo_dl/extractor/linkedin.py
index 3ce906e..0f57bfa 100644
--- a/hypervideo_dl/extractor/linkedin.py
+++ b/hypervideo_dl/extractor/linkedin.py
@@ -6,18 +6,51 @@ import re
 
 from .common import InfoExtractor
 from ..utils import (
+    clean_html,
+    extract_attributes,
     ExtractorError,
     float_or_none,
+    get_element_by_class,
     int_or_none,
     srt_subtitles_timecode,
+    strip_or_none,
+    mimetype2ext,
     try_get,
     urlencode_postdata,
     urljoin,
 )
 
 
-class LinkedInLearningBaseIE(InfoExtractor):
+class LinkedInBaseIE(InfoExtractor):
     _NETRC_MACHINE = 'linkedin'
+    _logged_in = False
+
+    def _perform_login(self, username, password):
+        if self._logged_in:
+            return
+
+        login_page = self._download_webpage(
+            self._LOGIN_URL, None, 'Downloading login page')
+        action_url = urljoin(self._LOGIN_URL, self._search_regex(
+            r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, 'post url',
+            default='https://www.linkedin.com/uas/login-submit', group='url'))
+        data = self._hidden_inputs(login_page)
+        data.update({
+            'session_key': username,
+            'session_password': password,
+        })
+        login_submit_page = self._download_webpage(
+            action_url, None, 'Logging in',
+            data=urlencode_postdata(data))
+        error = self._search_regex(
+            r'<span[^>]+class="error"[^>]*>\s*(.+?)\s*</span>',
+            login_submit_page, 'error', default=None)
+        if error:
+            raise ExtractorError(error, expected=True)
+        LinkedInBaseIE._logged_in = True
+
+
+class LinkedInLearningBaseIE(LinkedInBaseIE):
     _LOGIN_URL = 'https://www.linkedin.com/uas/login?trk=learning'
 
     def _call_api(self, course_slug, fields, video_slug=None, resolution=None):
@@ -34,6 +67,8 @@ class LinkedInLearningBaseIE(InfoExtractor):
             })
             sub = ' %dp' % resolution
         api_url = 'https://www.linkedin.com/learning-api/detailedCourses'
+        if not self._get_cookies(api_url).get('JSESSIONID'):
+            self.raise_login_required()
         return self._download_json(
             api_url, video_slug, 'Downloading%s JSON metadata' % sub, headers={
                 'Csrf-Token': self._get_cookies(api_url)['JSESSIONID'].value,
@@ -49,29 +84,47 @@ class LinkedInLearningBaseIE(InfoExtractor):
     def _get_video_id(self, video_data, course_slug, video_slug):
         return self._get_urn_id(video_data) or '%s/%s' % (course_slug, video_slug)
 
-    def _real_initialize(self):
-        email, password = self._get_login_info()
-        if email is None:
-            return
 
-        login_page = self._download_webpage(
-            self._LOGIN_URL, None, 'Downloading login page')
-        action_url = urljoin(self._LOGIN_URL, self._search_regex(
-            r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, 'post url',
-            default='https://www.linkedin.com/uas/login-submit', group='url'))
-        data = self._hidden_inputs(login_page)
-        data.update({
-            'session_key': email,
-            'session_password': password,
-        })
-        login_submit_page = self._download_webpage(
-            action_url, None, 'Logging in',
-            data=urlencode_postdata(data))
-        error = self._search_regex(
-            r'<span[^>]+class="error"[^>]*>\s*(.+?)\s*</span>',
-            login_submit_page, 'error', default=None)
-        if error:
-            raise ExtractorError(error, expected=True)
+class LinkedInIE(LinkedInBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?linkedin\.com/posts/.+?(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://www.linkedin.com/posts/mishalkhawaja_sendinblueviews-toronto-digitalmarketing-ugcPost-6850898786781339649-mM20',
+        'info_dict': {
+            'id': '6850898786781339649',
+            'ext': 'mp4',
+            'title': 'Mishal K. on LinkedIn: #sendinblueviews #toronto #digitalmarketing',
+            'description': 'md5:be125430bab1c574f16aeb186a4d5b19',
+            'creator': 'Mishal K.'
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._html_extract_title(webpage)
+        description = clean_html(get_element_by_class('share-update-card__update-text', webpage))
+        like_count = int_or_none(get_element_by_class('social-counts-reactions__social-counts-numRections', webpage))
+        creator = strip_or_none(clean_html(get_element_by_class('comment__actor-name', webpage)))
+
+        sources = self._parse_json(extract_attributes(self._search_regex(r'(<video[^>]+>)', webpage, 'video'))['data-sources'], video_id)
+        formats = [{
+            'url': source['src'],
+            'ext': mimetype2ext(source.get('type')),
+            'tbr': float_or_none(source.get('data-bitrate'), scale=1000),
+        } for source in sources]
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'title': title,
+            'like_count': like_count,
+            'creator': creator,
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'description': description,
+        }
 
 
 class LinkedInLearningIE(LinkedInLearningBaseIE):
@@ -102,7 +155,6 @@ class LinkedInLearningIE(LinkedInLearningBaseIE):
 
     def _real_extract(self, url):
         course_slug, video_slug = self._match_valid_url(url).groups()
 
-        video_data = None
         formats = []
         for width, height in ((640, 360), (960, 540), (1280, 720)):
             video_data = self._call_api(
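`LinkedInIE` needs no API at all: the formats are a JSON array serialized into the `data-sources` attribute of the post's `<video>` tag. The parsing step in isolation, with `html.unescape` standing in for yt-dlp's `extract_attributes`; the attribute layout is the assumption here:

    import html
    import json
    import re

    def video_sources(webpage):
        attr = re.search(r'<video[^>]+data-sources="([^"]+)"', webpage).group(1)
        # each entry carries src, a MIME type and a data-bitrate field
        return json.loads(html.unescape(attr))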
diff --git a/hypervideo_dl/extractor/linuxacademy.py b/hypervideo_dl/extractor/linuxacademy.py
index 2053970..6aff88e 100644
--- a/hypervideo_dl/extractor/linuxacademy.py
+++ b/hypervideo_dl/extractor/linuxacademy.py
@@ -75,14 +75,7 @@ class LinuxAcademyIE(InfoExtractor):
     _CLIENT_ID = 'KaWxNn1C2Gc7n83W9OFeXltd8Utb5vvx'
     _NETRC_MACHINE = 'linuxacademy'
 
-    def _real_initialize(self):
-        self._login()
-
-    def _login(self):
-        username, password = self._get_login_info()
-        if username is None:
-            return
-
+    def _perform_login(self, username, password):
         def random_string():
             return ''.join([
                 random.choice('0123456789ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz-._~')
diff --git a/hypervideo_dl/extractor/litv.py b/hypervideo_dl/extractor/litv.py
index 18d237e..16b475a 100644
--- a/hypervideo_dl/extractor/litv.py
+++ b/hypervideo_dl/extractor/litv.py
@@ -7,6 +7,7 @@ from .common import InfoExtractor
 from ..utils import (
     ExtractorError,
     int_or_none,
+    traverse_obj,
     smuggle_url,
     unsmuggle_url,
 )
@@ -55,9 +56,6 @@ class LiTVIE(InfoExtractor):
         episode_title = program_info['title']
         content_id = season_list['contentId']
 
-        if prompt:
-            self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (content_id, video_id))
-
         all_episodes = [
             self.url_result(smuggle_url(
                 self._URL_TEMPLATE % (program_info['contentType'], episode['contentId']),
@@ -67,16 +65,10 @@ class LiTVIE(InfoExtractor):
         return self.playlist_result(all_episodes, content_id, episode_title)
 
     def _real_extract(self, url):
-        url, data = unsmuggle_url(url, {})
+        url, smuggled_data = unsmuggle_url(url, {})
 
         video_id = self._match_id(url)
 
-        noplaylist = self.get_param('noplaylist')
-        noplaylist_prompt = True
-        if 'force_noplaylist' in data:
-            noplaylist = data['force_noplaylist']
-            noplaylist_prompt = False
-
         webpage = self._download_webpage(url, video_id)
 
         program_info = self._parse_json(self._search_regex(
@@ -84,14 +76,9 @@ class LiTVIE(InfoExtractor):
             video_id)
 
         season_list = list(program_info.get('seasonList', {}).values())
-        if season_list:
-            if not noplaylist:
-                return self._extract_playlist(
-                    season_list[0], video_id, program_info,
-                    prompt=noplaylist_prompt)
-
-            if noplaylist_prompt:
-                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
+        playlist_id = traverse_obj(season_list, 0, 'contentId')
+        if self._yes_playlist(playlist_id, video_id, smuggled_data):
+            return self._extract_playlist(season_list[0], video_id, program_info)
 
         # In browsers `getMainUrl` request is always issued. Usually this
         # endpoint gives the same result as the data embedded in the webpage.
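The LiTV refactor swaps the hand-rolled `--no-playlist` handling for the shared `_yes_playlist` helper. Roughly, its decision logic looks like the sketch below; this is a simplification from memory, not a verbatim copy, and the real method also prints the usual playlist/no-playlist hints:

    def yes_playlist(playlist_id, video_id, smuggled_data, noplaylist_opt=False):
        if not playlist_id or not video_id:
            return False
        force = smuggled_data.get('force_noplaylist')
        if force is not None:  # the smuggled override wins over the user option
            return not force
        return not noplaylist_opt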
self.to_screen('Downloading just video %s because of --no-playlist' % video_id) + playlist_id = traverse_obj(season_list, 0, 'contentId') + if self._yes_playlist(playlist_id, video_id, smuggled_data): + return self._extract_playlist(season_list[0], video_id, program_info) # In browsers `getMainUrl` request is always issued. Usually this # endpoint gives the same result as the data embedded in the webpage. diff --git a/hypervideo_dl/extractor/livestream.py b/hypervideo_dl/extractor/livestream.py index f591289..45bf26d 100644 --- a/hypervideo_dl/extractor/livestream.py +++ b/hypervideo_dl/extractor/livestream.py @@ -176,7 +176,7 @@ class LivestreamIE(InfoExtractor): return { 'id': broadcast_id, 'formats': formats, - 'title': self._live_title(stream_info['stream_title']) if is_live else stream_info['stream_title'], + 'title': stream_info['stream_title'], 'thumbnail': stream_info.get('thumbnail_url'), 'is_live': is_live, } @@ -344,7 +344,7 @@ class LivestreamOriginalIE(InfoExtractor): is_live = video_data.get('isLive') info.update({ 'id': content_id, - 'title': self._live_title(info['title']) if is_live else info['title'], + 'title': info['title'], 'formats': self._extract_video_formats(video_data, content_id), 'is_live': is_live, }) diff --git a/hypervideo_dl/extractor/lnkgo.py b/hypervideo_dl/extractor/lnkgo.py index 1467596..bd2dffa 100644 --- a/hypervideo_dl/extractor/lnkgo.py +++ b/hypervideo_dl/extractor/lnkgo.py @@ -6,8 +6,10 @@ from .common import InfoExtractor from ..utils import ( clean_html, compat_str, + format_field, int_or_none, parse_iso8601, + unified_strdate, ) @@ -71,17 +73,97 @@ class LnkGoIE(InfoExtractor): video_id, 'mp4', 'm3u8_native') self._sort_formats(formats) - poster_image = video_info.get('posterImage') - return { 'id': video_id, 'display_id': display_id, 'title': title, 'formats': formats, - 'thumbnail': 'https://lnk.lt/all-images/' + poster_image if poster_image else None, + 'thumbnail': format_field(video_info, 'posterImage', 'https://lnk.lt/all-images/%s'), 'duration': int_or_none(video_info.get('duration')), 'description': clean_html(video_info.get('htmlDescription')), 'age_limit': self._AGE_LIMITS.get(video_info.get('pgRating'), 0), 'timestamp': parse_iso8601(video_info.get('airDate')), 'view_count': int_or_none(video_info.get('viewsCount')), } + + +class LnkIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?lnk\.lt/[^/]+/(?P<id>\d+)' + + _TESTS = [{ + 'url': 'https://lnk.lt/zinios/79791', + 'info_dict': { + 'id': '79791', + 'ext': 'mp4', + 'title': 'LNK.lt: Viešintų gyventojai sukilo prieš radijo bangų siųstuvą', + 'description': 'Svarbiausios naujienos trumpai, LNK žinios ir Info dienos pokalbiai.', + 'view_count': int, + 'duration': 233, + 'upload_date': '20191123', + 'thumbnail': r're:^https?://.*\.jpg$', + 'episode_number': 13431, + 'series': 'Naujausi žinių reportažai', + 'episode': 'Episode 13431' + }, + 'params': {'skip_download': True} + }, { + 'url': 'https://lnk.lt/istorijos-trumpai/152546', + 'info_dict': { + 'id': '152546', + 'ext': 'mp4', + 'title': 'Radžio koncertas gaisre ', + 'description': 'md5:0666b5b85cb9fc7c1238dec96f71faba', + 'view_count': int, + 'duration': 54, + 'upload_date': '20220105', + 'thumbnail': r're:^https?://.*\.jpg$', + 'episode_number': 1036, + 'series': 'Istorijos trumpai', + 'episode': 'Episode 1036' + }, + 'params': {'skip_download': True} + }, { + 'url': 'https://lnk.lt/gyvunu-pasaulis/151549', + 'info_dict': { + 'id': '151549', + 'ext': 'mp4', + 'title': 'Gyvūnų pasaulis', + 'description': '', + 'view_count': int, 
+ 'duration': 1264, + 'upload_date': '20220108', + 'thumbnail': r're:^https?://.*\.jpg$', + 'episode_number': 16, + 'series': 'Gyvūnų pasaulis', + 'episode': 'Episode 16' + }, + 'params': {'skip_download': True} + }] + + def _real_extract(self, url): + id = self._match_id(url) + video_json = self._download_json(f'https://lnk.lt/api/video/video-config/{id}', id)['videoInfo'] + formats, subtitles = [], {} + if video_json.get('videoUrl'): + fmts, subs = self._extract_m3u8_formats_and_subtitles(video_json['videoUrl'], id) + formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) + if video_json.get('videoFairplayUrl') and not video_json.get('drm'): + fmts, subs = self._extract_m3u8_formats_and_subtitles(video_json['videoFairplayUrl'], id) + formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) + + self._sort_formats(formats) + return { + 'id': id, + 'title': video_json.get('title'), + 'description': video_json.get('description'), + 'view_count': video_json.get('viewsCount'), + 'duration': video_json.get('duration'), + 'upload_date': unified_strdate(video_json.get('airDate')), + 'thumbnail': format_field(video_json, 'posterImage', 'https://lnk.lt/all-images/%s'), + 'episode_number': int_or_none(video_json.get('episodeNumber')), + 'series': video_json.get('programTitle'), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/hypervideo_dl/extractor/lynda.py b/hypervideo_dl/extractor/lynda.py index 58cf172..ce30474 100644 --- a/hypervideo_dl/extractor/lynda.py +++ b/hypervideo_dl/extractor/lynda.py @@ -21,9 +21,6 @@ class LyndaBaseIE(InfoExtractor): _ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to provide lynda.com account credentials.' _NETRC_MACHINE = 'lynda' - def _real_initialize(self): - self._login() - @staticmethod def _check_error(json_string, key_or_keys): keys = [key_or_keys] if isinstance(key_or_keys, compat_str) else key_or_keys @@ -32,7 +29,7 @@ class LyndaBaseIE(InfoExtractor): if error: raise ExtractorError('Unable to login: %s' % error, expected=True) - def _login_step(self, form_html, fallback_action_url, extra_form_data, note, referrer_url): + def _perform_login_step(self, form_html, fallback_action_url, extra_form_data, note, referrer_url): action_url = self._search_regex( r'<form[^>]+action=(["\'])(?P<url>.+?)\1', form_html, 'post url', default=fallback_action_url, group='url') @@ -55,11 +52,7 @@ class LyndaBaseIE(InfoExtractor): return response, action_url - def _login(self): - username, password = self._get_login_info() - if username is None: - return - + def _perform_login(self, username, password): # Step 1: download signin page signin_page = self._download_webpage( self._SIGNIN_URL, None, 'Downloading signin page') diff --git a/hypervideo_dl/extractor/mainstreaming.py b/hypervideo_dl/extractor/mainstreaming.py new file mode 100644 index 0000000..0f349a7 --- /dev/null +++ b/hypervideo_dl/extractor/mainstreaming.py @@ -0,0 +1,219 @@ +# coding: utf-8 +import re + +from .common import InfoExtractor + +from ..utils import ( + int_or_none, + js_to_json, + parse_duration, + traverse_obj, + try_get, + urljoin +) + + +class MainStreamingIE(InfoExtractor): + _VALID_URL = r'https?://(?:webtools-?)?(?P<host>[A-Za-z0-9-]*\.msvdn.net)/(?:embed|amp_embed|content)/(?P<id>\w+)' + IE_DESC = 'MainStreaming Player' + + _TESTS = [ + { + # Live stream offline, has alternative content id + 'url': 'https://webtools-e18da6642b684f8aa9ae449862783a56.msvdn.net/embed/53EN6GxbWaJC', + 'info_dict': { + 'id': '53EN6GxbWaJC', + 
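# An offline live event still returns full metadata; the extractor reports
# live_status 'was_live' and warns about both the unused alternativeContentID
# and the OFFLINE heartbeat (see expected_warnings in this test).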
'title': 'Diretta homepage 2021-12-31 12:00', + 'description': '', + 'live_status': 'was_live', + 'ext': 'mp4', + 'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster', + }, + 'expected_warnings': [ + 'Ignoring alternative content ID: WDAF1KOWUpH3', + 'MainStreaming said: Live event is OFFLINE' + ], + 'skip': 'live stream offline' + }, { + # playlist + 'url': 'https://webtools-e18da6642b684f8aa9ae449862783a56.msvdn.net/embed/WDAF1KOWUpH3', + 'info_dict': { + 'id': 'WDAF1KOWUpH3', + 'title': 'Playlist homepage', + }, + 'playlist_mincount': 2 + }, { + # livestream + 'url': 'https://webtools-859c1818ed614cc5b0047439470927b0.msvdn.net/embed/tDoFkZD3T1Lw', + 'info_dict': { + 'id': 'tDoFkZD3T1Lw', + 'title': r're:Class CNBC Live \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'live_status': 'is_live', + 'ext': 'mp4', + 'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster', + }, + 'skip': 'live stream' + }, { + 'url': 'https://webtools-f5842579ff984c1c98d63b8d789673eb.msvdn.net/embed/EUlZfGWkGpOd?autoPlay=false', + 'info_dict': { + 'id': 'EUlZfGWkGpOd', + 'title': 'La Settimana ', + 'description': '03 Ottobre ore 02:00', + 'ext': 'mp4', + 'live_status': 'not_live', + 'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster', + 'duration': 1512 + } + }, { + # video without webtools- prefix + 'url': 'https://f5842579ff984c1c98d63b8d789673eb.msvdn.net/embed/MfuWmzL2lGkA?autoplay=false&T=1635860445', + 'info_dict': { + 'id': 'MfuWmzL2lGkA', + 'title': 'TG Mattina', + 'description': '06 Ottobre ore 08:00', + 'ext': 'mp4', + 'live_status': 'not_live', + 'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster', + 'duration': 789.04 + } + }, { + # always-on livestream with DVR + 'url': 'https://webtools-f5842579ff984c1c98d63b8d789673eb.msvdn.net/embed/HVvPMzy', + 'info_dict': { + 'id': 'HVvPMzy', + 'title': r're:^Diretta LaC News24 \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'description': 'canale all news', + 'live_status': 'is_live', + 'ext': 'mp4', + 'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster', + }, + 'params': { + 'skip_download': True, + }, + }, { + # no host + 'url': 'https://webtools.msvdn.net/embed/MfuWmzL2lGkA', + 'only_matching': True + }, { + 'url': 'https://859c1818ed614cc5b0047439470927b0.msvdn.net/amp_embed/tDoFkZD3T1Lw', + 'only_matching': True + }, { + 'url': 'https://859c1818ed614cc5b0047439470927b0.msvdn.net/content/tDoFkZD3T1Lw#', + 'only_matching': True + } + ] + + @staticmethod + def _extract_urls(webpage): + mobj = re.findall( + r'<iframe[^>]+?src=["\']?(?P<url>%s)["\']?' 
% MainStreamingIE._VALID_URL, webpage) + if mobj: + return [group[0] for group in mobj] + + def _playlist_entries(self, host, playlist_content): + for entry in playlist_content: + content_id = entry.get('contentID') + yield { + '_type': 'url', + 'ie_key': MainStreamingIE.ie_key(), + 'id': content_id, + 'duration': int_or_none(traverse_obj(entry, ('duration', 'totalSeconds'))), + 'title': entry.get('title'), + 'url': f'https://{host}/embed/{content_id}' + } + + @staticmethod + def _get_webtools_host(host): + if not host.startswith('webtools'): + host = 'webtools' + ('-' if not host.startswith('.') else '') + host + return host + + def _get_webtools_base_url(self, host): + return f'{self.http_scheme()}//{self._get_webtools_host(host)}' + + def _call_api(self, host: str, path: str, item_id: str, query=None, note='Downloading API JSON', fatal=False): + # JSON API, does not appear to be documented + return self._call_webtools_api(host, '/api/v2/' + path, item_id, query, note, fatal) + + def _call_webtools_api(self, host: str, path: str, item_id: str, query=None, note='Downloading webtools API JSON', fatal=False): + # webtools docs: https://webtools.msvdn.net/ + return self._download_json( + urljoin(self._get_webtools_base_url(host), path), item_id, query=query, note=note, fatal=fatal) + + def _real_extract(self, url): + host, video_id = self._match_valid_url(url).groups() + content_info = try_get( + self._call_api( + host, f'content/{video_id}', video_id, note='Downloading content info API JSON'), lambda x: x['playerContentInfo']) + # Fallback + if not content_info: + webpage = self._download_webpage(url, video_id) + player_config = self._parse_json( + self._search_regex( + r'config\s*=\s*({.+?})\s*;', webpage, 'mainstreaming player config', + default='{}', flags=re.DOTALL), + video_id, transform_source=js_to_json, fatal=False) or {} + content_info = player_config['contentInfo'] + + host = content_info.get('host') or host + video_id = content_info.get('contentID') or video_id + title = content_info.get('title') + description = traverse_obj(content_info, 'longDescription', 'shortDescription', expected_type=str) + live_status = 'not_live' + if content_info.get('drmEnabled'): + self.report_drm(video_id) + + alternative_content_id = content_info.get('alternativeContentID') + if alternative_content_id: + self.report_warning(f'Ignoring alternative content ID: {alternative_content_id}') + + content_type = int_or_none(content_info.get('contentType')) + format_base_url = None + formats = [] + subtitles = {} + # Live content + if content_type == 20: + dvr_enabled = traverse_obj(content_info, ('playerSettings', 'dvrEnabled'), expected_type=bool) + format_base_url = f"https://{host}/live/{content_info['liveSourceID']}/{video_id}/%s{'?DVR' if dvr_enabled else ''}" + live_status = 'is_live' + heartbeat = self._call_api(host, f'heartbeat/{video_id}', video_id, note='Checking stream status') or {} + if heartbeat.get('heartBeatUp') is False: + self.raise_no_formats(f'MainStreaming said: {heartbeat.get("responseMessage")}', expected=True) + live_status = 'was_live' + + # Playlist + elif content_type == 31: + return self.playlist_result( + self._playlist_entries(host, content_info.get('playlistContents')), video_id, title, description) + # Normal video content? 
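# Review sketch: the contentType branches here map 20 -> live, 31 -> playlist and
# 10 (just below) -> plain VOD. The non-playlist types build a single '%s' URL
# template and substitute concrete manifest names into it afterwards. A minimal,
# self-contained sketch of that templating — the manifest names are the ones used
# in this hunk; the host and video id in the usage note are made up for illustration:
def _sketch_manifest_urls(host, video_id, content_type, live_source_id=None, dvr_enabled=False):
    if content_type == 20:  # live: per-source path; '?DVR' enables the rewind window
        base = f"https://{host}/live/{live_source_id}/{video_id}/%s{'?DVR' if dvr_enabled else ''}"
    elif content_type == 10:  # plain VOD
        base = f'https://{host}/vod/{video_id}/%s'
    else:  # playlists (type 31) expose no manifests of their own
        return []
    names = ['playlist.m3u8', 'manifest.mpd']
    if content_type == 10:
        names.append('original.mp4')  # progressive download exists for VOD only
    # The extractor requests each of these non-fatally and merges whatever resolves.
    return [base % name for name in names]

# e.g. _sketch_manifest_urls('webtools-example.msvdn.net', 'MfuWmzL2lGkA', 10)
# -> ['https://webtools-example.msvdn.net/vod/MfuWmzL2lGkA/playlist.m3u8',
#     'https://webtools-example.msvdn.net/vod/MfuWmzL2lGkA/manifest.mpd',
#     'https://webtools-example.msvdn.net/vod/MfuWmzL2lGkA/original.mp4']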
+ elif content_type == 10: + format_base_url = f'https://{host}/vod/{video_id}/%s' + # Progressive format + # Note: in https://webtools.msvdn.net/loader/playerV2.js there is mention of original.mp3 format, + # however it seems to be the same as original.mp4? + formats.append({'url': format_base_url % 'original.mp4', 'format_note': 'original', 'quality': 1}) + else: + self.raise_no_formats(f'Unknown content type {content_type}') + + if format_base_url: + m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles( + format_base_url % 'playlist.m3u8', video_id=video_id, fatal=False) + mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles( + format_base_url % 'manifest.mpd', video_id=video_id, fatal=False) + + subtitles = self._merge_subtitles(m3u8_subs, mpd_subs) + formats.extend(m3u8_formats + mpd_formats) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'formats': formats, + 'live_status': live_status, + 'duration': parse_duration(content_info.get('duration')), + 'tags': content_info.get('tags'), + 'subtitles': subtitles, + 'thumbnail': urljoin(self._get_webtools_base_url(host), f'image/{video_id}/poster') + } diff --git a/hypervideo_dl/extractor/mangomolo.py b/hypervideo_dl/extractor/mangomolo.py index acee370..68ce138 100644 --- a/hypervideo_dl/extractor/mangomolo.py +++ b/hypervideo_dl/extractor/mangomolo.py @@ -33,7 +33,7 @@ class MangomoloBaseIE(InfoExtractor): return { 'id': page_id, - 'title': self._live_title(page_id) if self._IS_LIVE else page_id, + 'title': page_id, 'uploader_id': hidden_inputs.get('userid'), 'duration': int_or_none(hidden_inputs.get('duration')), 'is_live': self._IS_LIVE, diff --git a/hypervideo_dl/extractor/manyvids.py b/hypervideo_dl/extractor/manyvids.py index e8d7163..bd24f88 100644 --- a/hypervideo_dl/extractor/manyvids.py +++ b/hypervideo_dl/extractor/manyvids.py @@ -89,4 +89,5 @@ class ManyVidsIE(InfoExtractor): 'view_count': view_count, 'like_count': like_count, 'formats': formats, + 'uploader': self._html_search_regex(r'<meta[^>]+name="author"[^>]*>([^<]+)', webpage, 'uploader'), } diff --git a/hypervideo_dl/extractor/matchtv.py b/hypervideo_dl/extractor/matchtv.py index bc9933a..e003b8d 100644 --- a/hypervideo_dl/extractor/matchtv.py +++ b/hypervideo_dl/extractor/matchtv.py @@ -49,7 +49,7 @@ class MatchTVIE(InfoExtractor): self._sort_formats(formats) return { 'id': video_id, - 'title': self._live_title('Матч ТВ - Прямой эфир'), + 'title': 'Матч ТВ - Прямой эфир', 'is_live': True, 'formats': formats, } diff --git a/hypervideo_dl/extractor/mdr.py b/hypervideo_dl/extractor/mdr.py index 0bdd626..3ca174c 100644 --- a/hypervideo_dl/extractor/mdr.py +++ b/hypervideo_dl/extractor/mdr.py @@ -2,13 +2,11 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urlparse, -) +from ..compat import compat_urlparse from ..utils import ( determine_ext, int_or_none, + join_nonempty, parse_duration, parse_iso8601, url_or_none, @@ -148,13 +146,9 @@ class MDRIE(InfoExtractor): abr = int_or_none(xpath_text(asset, './bitrateAudio', 'abr'), 1000) filesize = int_or_none(xpath_text(asset, './fileSize', 'file size')) - format_id = [media_type] - if vbr or abr: - format_id.append(compat_str(vbr or abr)) - f = { 'url': video_url, - 'format_id': '-'.join(format_id), + 'format_id': join_nonempty(media_type, vbr or abr), 'filesize': filesize, 'abr': abr, 'vbr': vbr, diff --git a/hypervideo_dl/extractor/medaltv.py 
b/hypervideo_dl/extractor/medaltv.py index 2ece5aa..59cc307 100644 --- a/hypervideo_dl/extractor/medaltv.py +++ b/hypervideo_dl/extractor/medaltv.py @@ -7,6 +7,7 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( ExtractorError, + format_field, float_or_none, int_or_none, str_or_none, @@ -118,7 +119,7 @@ class MedalTVIE(InfoExtractor): author = try_get( hydration_data, lambda x: list(x['profiles'].values())[0], dict) or {} author_id = str_or_none(author.get('id')) - author_url = 'https://medal.tv/users/{0}'.format(author_id) if author_id else None + author_url = format_field(author_id, template='https://medal.tv/users/%s') return { 'id': video_id, diff --git a/hypervideo_dl/extractor/mediaklikk.py b/hypervideo_dl/extractor/mediaklikk.py index b9b6d73..18ff3be 100644 --- a/hypervideo_dl/extractor/mediaklikk.py +++ b/hypervideo_dl/extractor/mediaklikk.py @@ -12,8 +12,8 @@ from ..compat import ( class MediaKlikkIE(InfoExtractor): - _VALID_URL = r'''(?x)^https?:\/\/(?:www\.)? - (?:mediaklikk|m4sport|hirado|petofilive)\.hu\/.*?videok?\/ + _VALID_URL = r'''(?x)https?://(?:www\.)? + (?:mediaklikk|m4sport|hirado|petofilive)\.hu/.*?(?:videok?|cikk)/ (?:(?P<year>[0-9]{4})/(?P<month>[0-9]{1,2})/(?P<day>[0-9]{1,2})/)? (?P<id>[^/#?_]+)''' diff --git a/hypervideo_dl/extractor/mediaset.py b/hypervideo_dl/extractor/mediaset.py index 26e7abc..d6b456c 100644 --- a/hypervideo_dl/extractor/mediaset.py +++ b/hypervideo_dl/extractor/mediaset.py @@ -1,13 +1,18 @@ # coding: utf-8 from __future__ import unicode_literals +import functools import re from .theplatform import ThePlatformBaseIE from ..utils import ( ExtractorError, + GeoRestrictedError, int_or_none, + OnDemandPagedList, parse_qs, + try_get, + urljoin, update_url_query, ) @@ -33,7 +38,7 @@ class MediasetIE(ThePlatformBaseIE): 'id': 'F310575103000102', 'ext': 'mp4', 'title': 'Episodio 1', - 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'description': 'md5:e8017b7d7194e9bfb75299c2b8d81e02', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 2682.0, 'upload_date': '20210530', @@ -41,6 +46,11 @@ class MediasetIE(ThePlatformBaseIE): 'timestamp': 1622413946, 'uploader': 'Canale 5', 'uploader_id': 'C5', + 'season': 'Season 1', + 'episode': 'Episode 1', + 'season_number': 1, + 'episode_number': 1, + 'chapters': [{'start_time': 0.0, 'end_time': 439.88}, {'start_time': 439.88, 'end_time': 1685.84}, {'start_time': 1685.84, 'end_time': 2682.0}], }, }, { 'url': 'https://www.mediasetplay.mediaset.it/video/matrix/puntata-del-25-maggio_F309013801000501', @@ -49,7 +59,7 @@ class MediasetIE(ThePlatformBaseIE): 'id': 'F309013801000501', 'ext': 'mp4', 'title': 'Puntata del 25 maggio', - 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'description': 'md5:ee2e456e3eb1dba5e814596655bb5296', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 6565.008, 'upload_date': '20200903', @@ -57,6 +67,11 @@ class MediasetIE(ThePlatformBaseIE): 'timestamp': 1599172492, 'uploader': 'Canale 5', 'uploader_id': 'C5', + 'season': 'Season 5', + 'episode': 'Episode 5', + 'season_number': 5, + 'episode_number': 5, + 'chapters': [{'start_time': 0.0, 'end_time': 3409.08}, {'start_time': 3409.08, 'end_time': 6565.008}], }, }, { 'url': 'https://www.mediasetplay.mediaset.it/video/cameracafe5/episodio-69-pezzo-di-luna_F303843101017801', @@ -65,7 +80,7 @@ class MediasetIE(ThePlatformBaseIE): 'id': 'F303843101017801', 'ext': 'mp4', 'title': 'Episodio 69 - Pezzo di luna', - 'description': '', + 'description': 
'md5:7c32c8ec4118b72588b9412f11353f73', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 263.008, 'upload_date': '20200902', @@ -73,6 +88,11 @@ class MediasetIE(ThePlatformBaseIE): 'timestamp': 1599064700, 'uploader': 'Italia 1', 'uploader_id': 'I1', + 'season': 'Season 5', + 'episode': 'Episode 178', + 'season_number': 5, + 'episode_number': 178, + 'chapters': [{'start_time': 0.0, 'end_time': 261.88}, {'start_time': 261.88, 'end_time': 263.008}], }, }, { 'url': 'https://www.mediasetplay.mediaset.it/video/cameracafe5/episodio-51-tu-chi-sei_F303843107000601', @@ -81,7 +101,7 @@ class MediasetIE(ThePlatformBaseIE): 'id': 'F303843107000601', 'ext': 'mp4', 'title': 'Episodio 51 - Tu chi sei?', - 'description': '', + 'description': 'md5:42ef006e56824cc31787a547590923f4', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 367.021, 'upload_date': '20200902', @@ -89,6 +109,28 @@ class MediasetIE(ThePlatformBaseIE): 'timestamp': 1599069817, 'uploader': 'Italia 1', 'uploader_id': 'I1', + 'season': 'Season 5', + 'episode': 'Episode 6', + 'season_number': 5, + 'episode_number': 6, + 'chapters': [{'start_time': 0.0, 'end_time': 358.68}, {'start_time': 358.68, 'end_time': 367.021}], + }, + }, { + # movie + 'url': 'https://www.mediasetplay.mediaset.it/movie/selvaggi/selvaggi_F006474501000101', + 'md5': '720440187a2ae26af8148eb9e6b901ed', + 'info_dict': { + 'id': 'F006474501000101', + 'ext': 'mp4', + 'title': 'Selvaggi', + 'description': 'md5:cfdedbbfdd12d4d0e5dcf1fa1b75284f', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 5233.01, + 'upload_date': '20210729', + 'timestamp': 1627594716, + 'uploader': 'Cine34', + 'uploader_id': 'B6', + 'chapters': [{'start_time': 0.0, 'end_time': 1938.56}, {'start_time': 1938.56, 'end_time': 5233.01}], }, }, { # clip @@ -156,6 +198,22 @@ class MediasetIE(ThePlatformBaseIE): video.attrib['src'] = re.sub(r'(https?://vod05)t(-mediaset-it\.akamaized\.net/.+?.mpd)\?.+', r'\1\2', video.attrib['src']) return super(MediasetIE, self)._parse_smil_formats(smil, smil_url, video_id, namespace, f4m_params, transform_rtmp_url) + def _check_drm_formats(self, tp_formats, video_id): + has_nondrm, drm_manifest = False, '' + for f in tp_formats: + if '_sampleaes/' in (f.get('manifest_url') or ''): + drm_manifest = drm_manifest or f['manifest_url'] + f['has_drm'] = True + if not f.get('has_drm') and f.get('manifest_url'): + has_nondrm = True + + nodrm_manifest = re.sub(r'_sampleaes/(\w+)_fp_', r'/\1_no_', drm_manifest) + if has_nondrm or nodrm_manifest == drm_manifest: + return + + tp_formats.extend(self._extract_m3u8_formats( + nodrm_manifest, video_id, m3u8_id='hls', fatal=False) or []) + def _real_extract(self, url): guid = self._match_id(url) tp_path = 'PR1GhC/media/guid/2702976343/' + guid @@ -163,10 +221,10 @@ class MediasetIE(ThePlatformBaseIE): formats = [] subtitles = {} - first_e = None + first_e = geo_e = None asset_type = 'geoNo:HD,browser,geoIT|geoNo:HD,geoIT|geoNo:SD,browser,geoIT|geoNo:SD,geoIT|geoNo|HD|SD' # TODO: fixup ISM+none manifest URLs - for f in ('MPEG4', 'MPEG-DASH+none', 'M3U+none'): + for f in ('MPEG4', 'M3U'): try: tp_formats, tp_subtitles = self._extract_theplatform_smil( update_url_query('http://link.theplatform.%s/s/%s' % (self._TP_TLD, tp_path), { @@ -175,13 +233,19 @@ class MediasetIE(ThePlatformBaseIE): 'assetTypes': asset_type, }), guid, 'Downloading %s SMIL data' % (f.split('+')[0])) except ExtractorError as e: + if not geo_e and isinstance(e, GeoRestrictedError): + geo_e = e if not first_e: first_e = e - break + continue + 
self._check_drm_formats(tp_formats, guid) formats.extend(tp_formats) subtitles = self._merge_subtitles(subtitles, tp_subtitles) - if first_e and not formats: - raise first_e + + # check for errors and report them + if (first_e or geo_e) and not formats: + raise geo_e or first_e + self._sort_formats(formats) feed_data = self._download_json( @@ -197,18 +261,95 @@ class MediasetIE(ThePlatformBaseIE): break info.update({ - 'episode_number': int_or_none(feed_data.get('tvSeasonEpisodeNumber')), - 'season_number': int_or_none(feed_data.get('tvSeasonNumber')), - 'series': feed_data.get('mediasetprogram$brandTitle'), + 'description': info.get('description') or feed_data.get('description') or feed_data.get('longDescription'), 'uploader': publish_info.get('description'), 'uploader_id': publish_info.get('channel'), 'view_count': int_or_none(feed_data.get('mediasetprogram$numberOfViews')), 'thumbnail': thumbnail, }) + if feed_data.get('programType') == 'episode': + info.update({ + 'episode_number': int_or_none( + feed_data.get('tvSeasonEpisodeNumber')), + 'season_number': int_or_none( + feed_data.get('tvSeasonNumber')), + 'series': feed_data.get('mediasetprogram$brandTitle'), + }) + info.update({ 'id': guid, 'formats': formats, 'subtitles': subtitles, }) return info + + +class MediasetShowIE(MediasetIE): + _VALID_URL = r'''(?x) + (?: + https?:// + (?:(?:www|static3)\.)?mediasetplay\.mediaset\.it/ + (?: + (?:fiction|programmi-tv|serie-tv|kids)/(?:.+?/)? + (?:[a-z-]+)_SE(?P<id>\d{12}) + (?:,ST(?P<st>\d{12}))? + (?:,sb(?P<sb>\d{9}))?$ + ) + ) + ''' + _TESTS = [{ + # TV Show webpage (general webpage) + 'url': 'https://www.mediasetplay.mediaset.it/programmi-tv/leiene/leiene_SE000000000061', + 'info_dict': { + 'id': '000000000061', + 'title': 'Le Iene', + }, + 'playlist_mincount': 7, + }, { + # TV Show webpage (specific season) + 'url': 'https://www.mediasetplay.mediaset.it/programmi-tv/leiene/leiene_SE000000000061,ST000000002763', + 'info_dict': { + 'id': '000000002763', + 'title': 'Le Iene', + }, + 'playlist_mincount': 7, + }, { + # TV Show specific playlist (with multiple pages) + 'url': 'https://www.mediasetplay.mediaset.it/programmi-tv/leiene/iservizi_SE000000000061,ST000000002763,sb100013375', + 'info_dict': { + 'id': '100013375', + 'title': 'I servizi', + }, + 'playlist_mincount': 50, + }] + + _BY_SUBBRAND = 'https://feed.entertainment.tv.theplatform.eu/f/PR1GhC/mediaset-prod-all-programs-v2?byCustomValue={subBrandId}{%s}&sort=:publishInfo_lastPublished|desc,tvSeasonEpisodeNumber|desc&range=%d-%d' + _PAGE_SIZE = 25 + + def _fetch_page(self, sb, page): + lower_limit = page * self._PAGE_SIZE + 1 + upper_limit = lower_limit + self._PAGE_SIZE - 1 + content = self._download_json( + self._BY_SUBBRAND % (sb, lower_limit, upper_limit), sb) + for entry in content.get('entries') or []: + yield self.url_result( + 'mediaset:' + entry['guid'], + playlist_title=entry['mediasetprogram$subBrandDescription']) + + def _real_extract(self, url): + playlist_id, st, sb = self._match_valid_url(url).group('id', 'st', 'sb') + if not sb: + page = self._download_webpage(url, st or playlist_id) + entries = [self.url_result(urljoin('https://www.mediasetplay.mediaset.it', url)) + for url in re.findall(r'href="([^<>=]+SE\d{12},ST\d{12},sb\d{9})">[^<]+<', page)] + title = (self._html_search_regex(r'(?s)<h1[^>]*>(.+?)</h1>', page, 'title', default=None) + or self._og_search_title(page)) + return self.playlist_result(entries, st or playlist_id, title) + + entries = OnDemandPagedList( + functools.partial(self._fetch_page, sb), + 
self._PAGE_SIZE) + title = try_get(entries, lambda x: x[0]['playlist_title']) + + return self.playlist_result(entries, sb, title) diff --git a/hypervideo_dl/extractor/mediasite.py b/hypervideo_dl/extractor/mediasite.py index ace86c2..fbf9223 100644 --- a/hypervideo_dl/extractor/mediasite.py +++ b/hypervideo_dl/extractor/mediasite.py @@ -14,6 +14,7 @@ from ..utils import ( float_or_none, mimetype2ext, str_or_none, + try_call, try_get, unescapeHTML, unsmuggle_url, @@ -145,11 +146,11 @@ class MediasiteIE(InfoExtractor): 'duration': slide['Time'] / 1000, }) - next_time = try_get(None, [ - lambda _: Stream['Slides'][i + 1]['Time'], - lambda _: duration, - lambda _: slide['Time'], - ], expected_type=(int, float)) + next_time = try_call( + lambda: Stream['Slides'][i + 1]['Time'], + lambda: duration, + lambda: slide['Time'], + expected_type=(int, float)) fragments.append({ 'path': fname_template.format(slide.get('Number', i + 1)), diff --git a/hypervideo_dl/extractor/megatvcom.py b/hypervideo_dl/extractor/megatvcom.py new file mode 100644 index 0000000..0d6793a --- /dev/null +++ b/hypervideo_dl/extractor/megatvcom.py @@ -0,0 +1,173 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + determine_ext, + ExtractorError, + extract_attributes, + get_element_by_class, + get_element_html_by_id, + HEADRequest, + parse_qs, + unescapeHTML, + unified_timestamp, +) + + +class MegaTVComBaseIE(InfoExtractor): + _PLAYER_DIV_ID = 'player_div_id' + + def _extract_player_attrs(self, webpage): + player_el = get_element_html_by_id(self._PLAYER_DIV_ID, webpage) + return { + re.sub(r'^data-(?:kwik_)?', '', k): v + for k, v in extract_attributes(player_el).items() + if k not in ('id',) + } + + +class MegaTVComIE(MegaTVComBaseIE): + IE_NAME = 'megatvcom' + IE_DESC = 'megatv.com videos' + _VALID_URL = r'https?://(?:www\.)?megatv\.com/(?:\d{4}/\d{2}/\d{2}|[^/]+/(?P<id>\d+))/(?P<slug>[^/]+)' + + _TESTS = [{ + 'url': 'https://www.megatv.com/2021/10/23/egkainia-gia-ti-nea-skini-omega-tou-dimotikou-theatrou-peiraia/', + 'md5': '6546a1a37fff0dd51c9dce5f490b7d7d', + 'info_dict': { + 'id': '520979', + 'ext': 'mp4', + 'title': 'md5:70eef71a9cd2c1ecff7ee428354dded2', + 'description': 'md5:0209fa8d318128569c0d256a5c404db1', + 'timestamp': 1634975747, + 'upload_date': '20211023', + 'display_id': 'egkainia-gia-ti-nea-skini-omega-tou-dimotikou-theatrou-peiraia', + 'thumbnail': 'https://www.megatv.com/wp-content/uploads/2021/10/ΠΕΙΡΑΙΑΣ-1024x450.jpg', + }, + }, { + 'url': 'https://www.megatv.com/tvshows/527800/epeisodio-65-12/', + 'md5': 'cba2085d45c1abeb8e7e9b7e1d6c0072', + 'info_dict': { + 'id': '527800', + 'ext': 'mp4', + 'title': 'md5:fc322cb51f682eecfe2f54cd5ab3a157', + 'description': 'md5:b2b7ed3690a78f2a0156eb790fdc00df', + 'timestamp': 1636048859, + 'upload_date': '20211104', + 'display_id': 'epeisodio-65-12', + 'thumbnail': 'https://www.megatv.com/wp-content/uploads/2021/11/16-1-1.jpg', + }, + }] + + def _real_extract(self, url): + video_id, display_id = self._match_valid_url(url).group('id', 'slug') + _is_article = video_id is None + webpage = self._download_webpage(url, video_id or display_id) + if _is_article: + video_id = self._search_regex( + r'<article[^>]*\sid=["\']Article_(\d+)["\']', webpage, 'article id') + player_attrs = self._extract_player_attrs(webpage) + title = player_attrs.get('label') or self._og_search_title(webpage) + description = get_element_by_class( + 'article-wrapper' if _is_article else 'story_content', + 
webpage) + description = clean_html(re.sub(r'<script[^>]*>[^<]+</script>', '', description)) + if not description: + description = self._og_search_description(webpage) + thumbnail = player_attrs.get('image') or self._og_search_thumbnail(webpage) + timestamp = unified_timestamp(self._html_search_meta( + 'article:published_time', webpage)) + source = player_attrs.get('source') + if not source: + raise ExtractorError('No source found', video_id=video_id) + if determine_ext(source) == 'm3u8': + formats, subs = self._extract_m3u8_formats_and_subtitles(source, video_id, 'mp4') + else: + formats, subs = [{'url': source}], {} + if player_attrs.get('subs'): + self._merge_subtitles({'und': [{'url': player_attrs['subs']}]}, target=subs) + self._sort_formats(formats) + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'formats': formats, + 'subtitles': subs, + } + + +class MegaTVComEmbedIE(MegaTVComBaseIE): + IE_NAME = 'megatvcom:embed' + IE_DESC = 'megatv.com embedded videos' + _VALID_URL = r'(?:https?:)?//(?:www\.)?megatv\.com/embed/?\?p=(?P<id>\d+)' + _EMBED_RE = re.compile(rf'''<iframe[^>]+?src=(?P<_q1>["'])(?P<url>{_VALID_URL})(?P=_q1)''') + + _TESTS = [{ + 'url': 'https://www.megatv.com/embed/?p=2020520979', + 'md5': '6546a1a37fff0dd51c9dce5f490b7d7d', + 'info_dict': { + 'id': '520979', + 'ext': 'mp4', + 'title': 'md5:70eef71a9cd2c1ecff7ee428354dded2', + 'description': 'md5:0209fa8d318128569c0d256a5c404db1', + 'timestamp': 1634975747, + 'upload_date': '20211023', + 'display_id': 'egkainia-gia-ti-nea-skini-omega-tou-dimotikou-theatrou-peiraia', + 'thumbnail': 'https://www.megatv.com/wp-content/uploads/2021/10/ΠΕΙΡΑΙΑΣ-1024x450.jpg', + }, + }, { + 'url': 'https://www.megatv.com/embed/?p=2020534081', + 'md5': '6ac8b3ce4dc6120c802f780a1e6b3812', + 'info_dict': { + 'id': '534081', + 'ext': 'mp4', + 'title': 'md5:062e9d5976ef854d8bdc1f5724d9b2d0', + 'description': 'md5:36dbe4c3762d2ede9513eea8d07f6d52', + 'timestamp': 1636376351, + 'upload_date': '20211108', + 'display_id': 'neo-rekor-stin-timi-tou-ilektrikou-reymatos-pano-apo-ta-200e-i-xondriki-timi-tou-ilektrikou', + 'thumbnail': 'https://www.megatv.com/wp-content/uploads/2021/11/Capture-266.jpg', + }, + }] + + @classmethod + def _extract_urls(cls, webpage): + for mobj in cls._EMBED_RE.finditer(webpage): + yield unescapeHTML(mobj.group('url')) + + def _match_canonical_url(self, webpage): + LINK_RE = r'''(?x) + <link(?: + rel=(?P<_q1>["'])(?P<canonical>canonical)(?P=_q1)| + href=(?P<_q2>["'])(?P<href>(?:(?!(?P=_q2)).)+)(?P=_q2)| + [^>]*? 
+ )+> + ''' + for mobj in re.finditer(LINK_RE, webpage): + canonical, href = mobj.group('canonical', 'href') + if canonical and href: + return unescapeHTML(href) + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + player_attrs = self._extract_player_attrs(webpage) + canonical_url = player_attrs.get('share_url') or self._match_canonical_url(webpage) + if not canonical_url: + raise ExtractorError('canonical URL not found') + video_id = parse_qs(canonical_url)['p'][0] + + # Defer to megatvcom as the metadata extracted from the embeddable page some + # times are slightly different, for the same video + canonical_url = self._request_webpage( + HEADRequest(canonical_url), video_id, + note='Resolve canonical URL', + errnote='Could not resolve canonical URL').geturl() + return self.url_result(canonical_url, MegaTVComIE.ie_key(), video_id) diff --git a/hypervideo_dl/extractor/mgtv.py b/hypervideo_dl/extractor/mgtv.py index cab3aa0..4ac70ea 100644 --- a/hypervideo_dl/extractor/mgtv.py +++ b/hypervideo_dl/extractor/mgtv.py @@ -13,12 +13,15 @@ from ..compat import ( from ..utils import ( ExtractorError, int_or_none, + try_get, + url_or_none, ) class MGTVIE(InfoExtractor): _VALID_URL = r'https?://(?:w(?:ww)?\.)?mgtv\.com/(v|b)/(?:[^/]+/)*(?P<id>\d+)\.html' IE_DESC = '芒果TV' + IE_NAME = 'MangoTV' _TESTS = [{ 'url': 'http://www.mgtv.com/v/1/290525/f/3116640.html', @@ -31,6 +34,32 @@ class MGTVIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', }, }, { + 'url': 'https://w.mgtv.com/b/427837/15588271.html', + 'info_dict': { + 'id': '15588271', + 'ext': 'mp4', + 'title': '春日迟迟再出发 沉浸版', + 'description': 'md5:a7a05a05b1aa87bd50cae619b19bbca6', + 'thumbnail': r're:^https?://.+\.jpg', + 'duration': 4026, + }, + }, { + 'url': 'https://w.mgtv.com/b/333652/7329822.html', + 'info_dict': { + 'id': '7329822', + 'ext': 'mp4', + 'title': '拜托,请你爱我', + 'description': 'md5:cd81be6499bafe32e4d143abd822bf9c', + 'thumbnail': r're:^https?://.+\.jpg', + 'duration': 2656, + }, + }, { + 'url': 'https://w.mgtv.com/b/427837/15591647.html', + 'only_matching': True, + }, { + 'url': 'https://w.mgtv.com/b/388252/15634192.html?fpa=33318&fpos=4&lastp=ch_home', + 'only_matching': True, + }, { 'url': 'http://www.mgtv.com/b/301817/3826653.html', 'only_matching': True, }, { @@ -40,12 +69,14 @@ class MGTVIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - tk2 = base64.urlsafe_b64encode(b'did=%s|pno=1030|ver=0.3.0301|clit=%d' % (compat_str(uuid.uuid4()).encode(), time.time()))[::-1] + tk2 = base64.urlsafe_b64encode( + f'did={compat_str(uuid.uuid4()).encode()}|pno=1030|ver=0.3.0301|clit={int(time.time())}'.encode())[::-1] try: api_data = self._download_json( 'https://pcweb.api.mgtv.com/player/video', video_id, query={ 'tk2': tk2, 'video_id': video_id, + 'type': 'pch5' }, headers=self.geo_verification_headers())['data'] except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: @@ -61,6 +92,7 @@ class MGTVIE(InfoExtractor): 'pm2': api_data['atc']['pm2'], 'tk2': tk2, 'video_id': video_id, + 'src': 'intelmgtv', }, headers=self.geo_verification_headers())['data'] stream_domain = stream_data['stream_domain'][0] @@ -71,7 +103,7 @@ class MGTVIE(InfoExtractor): continue format_data = self._download_json( stream_domain + stream_path, video_id, - note='Download video info for format #%d' % idx) + note=f'Download video info for format #{idx}') format_url = format_data.get('info') if not format_url: continue @@ -79,7 
+111,7 @@ class MGTVIE(InfoExtractor): r'_(\d+)_mp4/', format_url, 'tbr', default=None)) formats.append({ 'format_id': compat_str(tbr or idx), - 'url': format_url, + 'url': url_or_none(format_url), 'ext': 'mp4', 'tbr': tbr, 'protocol': 'm3u8_native', @@ -97,4 +129,25 @@ class MGTVIE(InfoExtractor): 'description': info.get('desc'), 'duration': int_or_none(info.get('duration')), 'thumbnail': info.get('thumb'), + 'subtitles': self.extract_subtitles(video_id, stream_domain), } + + def _get_subtitles(self, video_id, domain): + info = self._download_json(f'https://pcweb.api.mgtv.com/video/title?videoId={video_id}', + video_id, fatal=False) or {} + subtitles = {} + for sub in try_get(info, lambda x: x['data']['title']) or []: + url_sub = sub.get('url') + if not url_sub: + continue + locale = sub.get('captionCountrySimpleName') + sub = self._download_json(f'{domain}{url_sub}', video_id, fatal=False, + note=f'Download subtitle for locale {sub.get("name")} ({locale})') or {} + sub_url = url_or_none(sub.get('info')) + if not sub_url: + continue + subtitles.setdefault(locale or 'en', []).append({ + 'url': sub_url, + 'ext': 'srt' + }) + return subtitles diff --git a/hypervideo_dl/extractor/miaopai.py b/hypervideo_dl/extractor/miaopai.py index f9e35ac..cf0610b 100644 --- a/hypervideo_dl/extractor/miaopai.py +++ b/hypervideo_dl/extractor/miaopai.py @@ -24,8 +24,7 @@ class MiaoPaiIE(InfoExtractor): webpage = self._download_webpage( url, video_id, headers={'User-Agent': self._USER_AGENT_IPAD}) - title = self._html_search_regex( - r'<title>([^<]+)</title>', webpage, 'title') + title = self._html_extract_title(webpage) thumbnail = self._html_search_regex( r'<div[^>]+class=(?P<q1>[\'"]).*\bvideo_img\b.*(?P=q1)[^>]+data-url=(?P<q2>[\'"])(?P<url>[^\'"]+)(?P=q2)', webpage, 'thumbnail', fatal=False, group='url') diff --git a/hypervideo_dl/extractor/microsoftstream.py b/hypervideo_dl/extractor/microsoftstream.py new file mode 100644 index 0000000..4d5a9df --- /dev/null +++ b/hypervideo_dl/extractor/microsoftstream.py @@ -0,0 +1,125 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from base64 import b64decode + +from .common import InfoExtractor +from ..utils import ( + merge_dicts, + parse_iso8601, + parse_duration, + parse_resolution, + try_get, + url_basename, +) + + +class MicrosoftStreamIE(InfoExtractor): + IE_NAME = 'microsoftstream' + IE_DESC = 'Microsoft Stream' + _VALID_URL = r'https?://(?:web|www|msit)\.microsoftstream\.com/video/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + + _TESTS = [{ + 'url': 'https://web.microsoftstream.com/video/6e51d928-4f46-4f1c-b141-369925e37b62?list=user&userId=f5491e02-e8fe-4e34-b67c-ec2e79a6ecc0', + 'only_matching': True, + }, { + 'url': 'https://msit.microsoftstream.com/video/b60f5987-aabd-4e1c-a42f-c559d138f2ca', + 'only_matching': True, + }] + + def _get_all_subtitles(self, api_url, video_id, headers): + subtitles = {} + automatic_captions = {} + text_tracks = self._download_json( + f'{api_url}/videos/{video_id}/texttracks', video_id, + note='Downloading subtitles JSON', fatal=False, headers=headers, + query={'api-version': '1.4-private'}).get('value') or [] + for track in text_tracks: + if not track.get('language') or not track.get('url'): + continue + sub_dict = automatic_captions if track.get('autoGenerated') else subtitles + sub_dict.setdefault(track['language'], []).append({ + 'ext': 'vtt', + 'url': track.get('url') + }) + return { + 'subtitles': subtitles, + 'automatic_captions': automatic_captions + } + + def 
extract_all_subtitles(self, *args, **kwargs): + if (self.get_param('writesubtitles', False) + or self.get_param('writeautomaticsub', False) + or self.get_param('listsubtitles')): + return self._get_all_subtitles(*args, **kwargs) + return {} + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + if '<title>Microsoft Stream</title>' not in webpage: + self.raise_login_required(method='cookies') + + access_token = self._html_search_regex(r'"AccessToken":"(.+?)"', webpage, 'access token') + api_url = self._html_search_regex(r'"ApiGatewayUri":"(.+?)"', webpage, 'api url') + + headers = {'Authorization': f'Bearer {access_token}'} + + video_data = self._download_json( + f'{api_url}/videos/{video_id}', video_id, + headers=headers, query={ + '$expand': 'creator,tokens,status,liveEvent,extensions', + 'api-version': '1.4-private' + }) + video_id = video_data.get('id') or video_id + language = video_data.get('language') + + thumbnails = [] + for thumbnail_id in ('extraSmall', 'small', 'medium', 'large'): + thumbnail_url = try_get(video_data, lambda x: x['posterImage'][thumbnail_id]['url'], str) + if not thumbnail_url: + continue + thumb = { + 'id': thumbnail_id, + 'url': thumbnail_url, + } + thumb_name = url_basename(thumbnail_url) + thumb_name = str(b64decode(thumb_name + '=' * (-len(thumb_name) % 4))) + thumb.update(parse_resolution(thumb_name)) + thumbnails.append(thumb) + + formats = [] + for playlist in video_data['playbackUrls']: + if playlist['mimeType'] == 'application/vnd.apple.mpegurl': + formats.extend(self._extract_m3u8_formats( + playlist['playbackUrl'], video_id, + ext='mp4', entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False, headers=headers)) + elif playlist['mimeType'] == 'application/dash+xml': + formats.extend(self._extract_mpd_formats( + playlist['playbackUrl'], video_id, mpd_id='dash', + fatal=False, headers=headers)) + elif playlist['mimeType'] == 'application/vnd.ms-sstr+xml': + formats.extend(self._extract_ism_formats( + playlist['playbackUrl'], video_id, ism_id='mss', + fatal=False, headers=headers)) + formats = [merge_dicts(f, {'language': language}) for f in formats] + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_data['name'], + 'description': video_data.get('description'), + 'uploader': try_get(video_data, lambda x: x['creator']['name'], str), + 'uploader_id': try_get(video_data, (lambda x: x['creator']['mail'], + lambda x: x['creator']['id']), str), + 'thumbnails': thumbnails, + **self.extract_all_subtitles(api_url, video_id, headers), + 'timestamp': parse_iso8601(video_data.get('created')), + 'duration': parse_duration(try_get(video_data, lambda x: x['media']['duration'])), + 'webpage_url': f'https://web.microsoftstream.com/video/{video_id}', + 'view_count': try_get(video_data, lambda x: x['metrics']['views'], int), + 'like_count': try_get(video_data, lambda x: x['metrics']['likes'], int), + 'comment_count': try_get(video_data, lambda x: x['metrics']['comments'], int), + 'formats': formats, + } diff --git a/hypervideo_dl/extractor/mildom.py b/hypervideo_dl/extractor/mildom.py index c147cbb..5f2df29 100644 --- a/hypervideo_dl/extractor/mildom.py +++ b/hypervideo_dl/extractor/mildom.py @@ -1,92 +1,42 @@ # coding: utf-8 from __future__ import unicode_literals -import base64 -from datetime import datetime -import itertools +import functools import json from .common import InfoExtractor from ..utils import ( - std_headers, - update_url_query, + determine_ext, + dict_get, + 
ExtractorError, + float_or_none, + OnDemandPagedList, random_uuidv4, - try_get, -) -from ..compat import ( - compat_str, + traverse_obj, ) class MildomBaseIE(InfoExtractor): _GUEST_ID = None - _DISPATCHER_CONFIG = None - - def _call_api(self, url, video_id, query={}, note='Downloading JSON metadata', init=False): - url = update_url_query(url, self._common_queries(query, init=init)) - return self._download_json(url, video_id, note=note)['body'] - - def _common_queries(self, query={}, init=False): - dc = self._fetch_dispatcher_config() - r = { - 'timestamp': self.iso_timestamp(), - '__guest_id': '' if init else self.guest_id(), - '__location': dc['location'], - '__country': dc['country'], - '__cluster': dc['cluster'], - '__platform': 'web', - '__la': self.lang_code(), - '__pcv': 'v2.9.44', - 'sfr': 'pc', - 'accessToken': '', - } - r.update(query) - return r - - def _fetch_dispatcher_config(self): - if not self._DISPATCHER_CONFIG: - tmp = self._download_json( - 'https://disp.mildom.com/serverListV2', 'initialization', - note='Downloading dispatcher_config', data=json.dumps({ - 'protover': 0, - 'data': base64.b64encode(json.dumps({ - 'fr': 'web', - 'sfr': 'pc', - 'devi': 'Windows', - 'la': 'ja', - 'gid': None, - 'loc': '', - 'clu': '', - 'wh': '1919*810', - 'rtm': self.iso_timestamp(), - 'ua': std_headers['User-Agent'], - }).encode('utf8')).decode('utf8').replace('\n', ''), - }).encode('utf8')) - self._DISPATCHER_CONFIG = self._parse_json(base64.b64decode(tmp['data']), 'initialization') - return self._DISPATCHER_CONFIG - - @staticmethod - def iso_timestamp(): - 'new Date().toISOString()' - return datetime.utcnow().isoformat()[0:-3] + 'Z' - - def guest_id(self): - 'getGuestId' - if self._GUEST_ID: - return self._GUEST_ID - self._GUEST_ID = try_get( - self, ( - lambda x: x._call_api( - 'https://cloudac.mildom.com/nonolive/gappserv/guest/h5init', 'initialization', - note='Downloading guest token', init=True)['guest_id'] or None, - lambda x: x._get_cookies('https://www.mildom.com').get('gid').value, - lambda x: x._get_cookies('https://m.mildom.com').get('gid').value, - ), compat_str) or '' - return self._GUEST_ID - - def lang_code(self): - 'getCurrentLangCode' - return 'ja' + + def _call_api(self, url, video_id, query=None, note='Downloading JSON metadata', body=None): + if not self._GUEST_ID: + self._GUEST_ID = f'pc-gp-{random_uuidv4()}' + + content = self._download_json( + url, video_id, note=note, data=json.dumps(body).encode() if body else None, + headers={'Content-Type': 'application/json'} if body else {}, + query={ + '__guest_id': self._GUEST_ID, + '__platform': 'web', + **(query or {}), + }) + + if content['code'] != 0: + raise ExtractorError( + f'Mildom says: {content["message"]} (code {content["code"]})', + expected=True) + return content['body'] class MildomIE(MildomBaseIE): @@ -96,31 +46,13 @@ class MildomIE(MildomBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - url = 'https://www.mildom.com/%s' % video_id - - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage(f'https://www.mildom.com/{video_id}', video_id) enterstudio = self._call_api( 'https://cloudac.mildom.com/nonolive/gappserv/live/enterstudio', video_id, note='Downloading live metadata', query={'user_id': video_id}) result_video_id = enterstudio.get('log_id', video_id) - title = try_get( - enterstudio, ( - lambda x: self._html_search_meta('twitter:description', webpage), - lambda x: x['anchor_intro'], - ), compat_str) - description = try_get( - enterstudio, ( - lambda x: 
x['intro'], - lambda x: x['live_intro'], - ), compat_str) - uploader = try_get( - enterstudio, ( - lambda x: self._html_search_meta('twitter:title', webpage), - lambda x: x['loginname'], - ), compat_str) - servers = self._call_api( 'https://cloudac.mildom.com/nonolive/gappserv/live/liveserver', result_video_id, note='Downloading live server list', query={ @@ -128,17 +60,20 @@ class MildomIE(MildomBaseIE): 'live_server_type': 'hls', }) - stream_query = self._common_queries({ - 'streamReqId': random_uuidv4(), - 'is_lhls': '0', - }) - m3u8_url = update_url_query(servers['stream_server'] + '/%s_master.m3u8' % video_id, stream_query) - formats = self._extract_m3u8_formats(m3u8_url, result_video_id, 'mp4', headers={ - 'Referer': 'https://www.mildom.com/', - 'Origin': 'https://www.mildom.com', - }, note='Downloading m3u8 information') - - del stream_query['streamReqId'], stream_query['timestamp'] + playback_token = self._call_api( + 'https://cloudac.mildom.com/nonolive/gappserv/live/token', result_video_id, + note='Obtaining live playback token', body={'host_id': video_id, 'type': 'hls'}) + playback_token = traverse_obj(playback_token, ('data', ..., 'token'), get_all=False) + if not playback_token: + raise ExtractorError('Failed to obtain live playback token') + + formats = self._extract_m3u8_formats( + f'{servers["stream_server"]}/{video_id}_master.m3u8?{playback_token}', + result_video_id, 'mp4', headers={ + 'Referer': 'https://www.mildom.com/', + 'Origin': 'https://www.mildom.com', + }) + for fmt in formats: fmt.setdefault('http_headers', {})['Referer'] = 'https://www.mildom.com/' @@ -146,9 +81,10 @@ class MildomIE(MildomBaseIE): return { 'id': result_video_id, - 'title': title, - 'description': description, - 'uploader': uploader, + 'title': self._html_search_meta('twitter:description', webpage, default=None) or traverse_obj(enterstudio, 'anchor_intro'), + 'description': traverse_obj(enterstudio, 'intro', 'live_intro', expected_type=str), + 'timestamp': float_or_none(enterstudio.get('live_start_ms'), scale=1000), + 'uploader': self._html_search_meta('twitter:title', webpage, default=None) or traverse_obj(enterstudio, 'loginname'), 'uploader_id': video_id, 'formats': formats, 'is_live': True, @@ -157,15 +93,55 @@ class MildomIE(MildomBaseIE): class MildomVodIE(MildomBaseIE): IE_NAME = 'mildom:vod' - IE_DESC = 'Download a VOD in Mildom' - _VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/playback/(?P<user_id>\d+)/(?P<id>(?P=user_id)-[a-zA-Z0-9]+)' + IE_DESC = 'VOD in Mildom' + _VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/playback/(?P<user_id>\d+)/(?P<id>(?P=user_id)-[a-zA-Z0-9]+-?[0-9]*)' + _TESTS = [{ + 'url': 'https://www.mildom.com/playback/10882672/10882672-1597662269', + 'info_dict': { + 'id': '10882672-1597662269', + 'ext': 'mp4', + 'title': '始めてのミルダム配信じゃぃ!', + 'thumbnail': r're:^https?://.*\.(png|jpg)$', + 'upload_date': '20200817', + 'duration': 4138.37, + 'description': 'ゲームをしたくて!', + 'timestamp': 1597662269.0, + 'uploader_id': '10882672', + 'uploader': 'kson組長(けいそん)', + }, + }, { + 'url': 'https://www.mildom.com/playback/10882672/10882672-1597758589870-477', + 'info_dict': { + 'id': '10882672-1597758589870-477', + 'ext': 'mp4', + 'title': '【kson】感染メイズ!麻酔銃で無双する', + 'thumbnail': r're:^https?://.*\.(png|jpg)$', + 'timestamp': 1597759093.0, + 'uploader': 'kson組長(けいそん)', + 'duration': 4302.58, + 'uploader_id': '10882672', + 'description': 'このステージ絶対乗り越えたい', + 'upload_date': '20200818', + }, + }, { + 'url': 'https://www.mildom.com/playback/10882672/10882672-buha9td2lrn97fk2jme0', + 
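# VOD ids come in three shapes — <user>-<unix_ts>, <user>-<ms_ts>-<n> and
# <user>-<opaque_token> — which is why _VALID_URL above was widened to end
# in [a-zA-Z0-9]+-?[0-9]*; the three tests here cover one id of each shape.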
'info_dict': {
+            'id': '10882672-buha9td2lrn97fk2jme0',
+            'ext': 'mp4',
+            'title': '【kson組長】CART RACER!!!',
+            'thumbnail': r're:^https?://.*\.(png|jpg)$',
+            'uploader_id': '10882672',
+            'uploader': 'kson組長(けいそん)',
+            'upload_date': '20201104',
+            'timestamp': 1604494797.0,
+            'duration': 4657.25,
+            'description': 'WTF',
+        },
+    }]
 
     def _real_extract(self, url):
-        m = self._match_valid_url(url)
-        user_id, video_id = m.group('user_id'), m.group('id')
-        url = 'https://www.mildom.com/playback/%s/%s' % (user_id, video_id)
-
-        webpage = self._download_webpage(url, video_id)
+        user_id, video_id = self._match_valid_url(url).group('user_id', 'id')
+        webpage = self._download_webpage(f'https://www.mildom.com/playback/{user_id}/{video_id}', video_id)
 
         autoplay = self._call_api(
             'https://cloudac.mildom.com/nonolive/videocontent/playback/getPlaybackDetail', video_id,
@@ -173,20 +149,6 @@ class MildomVodIE(MildomBaseIE):
                 'v_id': video_id,
             })['playback']
 
-        title = try_get(
-            autoplay, (
-                lambda x: self._html_search_meta('og:description', webpage),
-                lambda x: x['title'],
-            ), compat_str)
-        description = try_get(
-            autoplay, (
-                lambda x: x['video_intro'],
-            ), compat_str)
-        uploader = try_get(
-            autoplay, (
-                lambda x: x['author_info']['login_name'],
-            ), compat_str)
-
         formats = [{
             'url': autoplay['audio_url'],
             'format_id': 'audio',
@@ -211,14 +173,81 @@ class MildomVodIE(MildomBaseIE):
 
         return {
             'id': video_id,
-            'title': title,
-            'description': description,
-            'uploader': uploader,
+            'title': self._html_search_meta(('og:description', 'description'), webpage, default=None) or autoplay.get('title'),
+            'description': traverse_obj(autoplay, 'video_intro'),
+            'timestamp': float_or_none(autoplay.get('publish_time'), scale=1000),
+            'duration': float_or_none(autoplay.get('video_length'), scale=1000),
+            'thumbnail': dict_get(autoplay, ('upload_pic', 'video_pic')),
+            'uploader': traverse_obj(autoplay, ('author_info', 'login_name')),
             'uploader_id': user_id,
             'formats': formats,
         }
 
 
+class MildomClipIE(MildomBaseIE):
+    IE_NAME = 'mildom:clip'
+    IE_DESC = 'Clip in Mildom'
+    _VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/clip/(?P<id>(?P<user_id>\d+)-[a-zA-Z0-9]+)'
+    _TESTS = [{
+        'url': 'https://www.mildom.com/clip/10042245-63921673e7b147ebb0806d42b5ba5ce9',
+        'info_dict': {
+            'id': '10042245-63921673e7b147ebb0806d42b5ba5ce9',
+            'title': '全然違ったよ',
+            'timestamp': 1619181890,
+            'duration': 59,
+            'thumbnail': r're:https?://.+',
+            'uploader': 'ざきんぽ',
+            'uploader_id': '10042245',
+        },
+    }, {
+        'url': 'https://www.mildom.com/clip/10111524-ebf4036e5aa8411c99fb3a1ae0902864',
+        'info_dict': {
+            'id': '10111524-ebf4036e5aa8411c99fb3a1ae0902864',
+            'title': 'かっこいい',
+            'timestamp': 1621094003,
+            'duration': 59,
+            'thumbnail': r're:https?://.+',
+            'uploader': '(ルーキー',
+            'uploader_id': '10111524',
+        },
+    }, {
+        'url': 'https://www.mildom.com/clip/10660174-2c539e6e277c4aaeb4b1fbe8d22cb902',
+        'info_dict': {
+            'id': '10660174-2c539e6e277c4aaeb4b1fbe8d22cb902',
+            'title': 'あ',
+            'timestamp': 1614769431,
+            'duration': 31,
+            'thumbnail': r're:https?://.+',
+            'uploader': 'ドルゴルスレンギーン=ダグワドルジ',
+            'uploader_id': '10660174',
+        },
+    }]
+
+    def _real_extract(self, url):
+        user_id, video_id = self._match_valid_url(url).group('user_id', 'id')
+        webpage = self._download_webpage(f'https://www.mildom.com/clip/{video_id}', video_id)
+
+        clip_detail = self._call_api(
+            'https://cloudac-cf-jp.mildom.com/nonolive/videocontent/clip/detail', video_id,
+            note='Downloading playback metadata', query={
+                'clip_id': video_id,
+            })
+
+        return {
+            'id': video_id,
+            'title': self._html_search_meta(
+                ('og:description', 'description'), webpage, default=None) or clip_detail.get('title'),
+            'timestamp': float_or_none(clip_detail.get('create_time')),
+            'duration': float_or_none(clip_detail.get('length')),
+            'thumbnail': clip_detail.get('cover'),
+            'uploader': traverse_obj(clip_detail, ('user_info', 'loginname')),
+            'uploader_id': user_id,
+
+            'url': clip_detail['url'],
+            'ext': determine_ext(clip_detail.get('url'), 'mp4'),
+        }
+
+
 class MildomUserVodIE(MildomBaseIE):
     IE_NAME = 'mildom:user:vod'
     IE_DESC = 'Download all VODs from specific user in Mildom'
@@ -229,22 +258,32 @@ class MildomUserVodIE(MildomBaseIE):
             'id': '10093333',
             'title': 'Uploads from ねこばたけ',
         },
-        'playlist_mincount': 351,
+        'playlist_mincount': 732,
+    }, {
+        'url': 'https://www.mildom.com/profile/10882672',
+        'info_dict': {
+            'id': '10882672',
+            'title': 'Uploads from kson組長(けいそん)',
+        },
+        'playlist_mincount': 201,
     }]
 
-    def _entries(self, user_id):
-        for page in itertools.count(1):
-            reply = self._call_api(
-                'https://cloudac.mildom.com/nonolive/videocontent/profile/playbackList',
-                user_id, note='Downloading page %d' % page, query={
-                    'user_id': user_id,
-                    'page': page,
-                    'limit': '30',
-                })
-            if not reply:
-                break
-            for x in reply:
-                yield self.url_result('https://www.mildom.com/playback/%s/%s' % (user_id, x['v_id']))
+    def _fetch_page(self, user_id, page):
+        page += 1
+        reply = self._call_api(
+            'https://cloudac.mildom.com/nonolive/videocontent/profile/playbackList',
+            user_id, note=f'Downloading page {page}', query={
+                'user_id': user_id,
+                'page': page,
+                'limit': '30',
+            })
+        if not reply:
+            return
+        for x in reply:
+            v_id = x.get('v_id')
+            if not v_id:
+                continue
+            yield self.url_result(f'https://www.mildom.com/playback/{user_id}/{v_id}')
 
     def _real_extract(self, url):
         user_id = self._match_id(url)
@@ -255,4 +294,5 @@ class MildomUserVodIE(MildomBaseIE):
             query={'user_id': user_id}, note='Downloading user profile')['user_info']
 
         return self.playlist_result(
-            self._entries(user_id), user_id, 'Uploads from %s' % profile['loginname'])
+            OnDemandPagedList(functools.partial(self._fetch_page, user_id), 30),
+            user_id, f'Uploads from {profile["loginname"]}')
diff --git a/hypervideo_dl/extractor/minds.py b/hypervideo_dl/extractor/minds.py
index 8e9f0f8..9da0720 100644
--- a/hypervideo_dl/extractor/minds.py
+++ b/hypervideo_dl/extractor/minds.py
@@ -5,6 +5,7 @@ from .common import InfoExtractor
 from ..compat import compat_str
 from ..utils import (
     clean_html,
+    format_field,
     int_or_none,
     str_or_none,
     strip_or_none,
@@ -120,7 +121,7 @@ class MindsIE(MindsBaseIE):
             'timestamp': int_or_none(entity.get('time_created')),
             'uploader': strip_or_none(owner.get('name')),
             'uploader_id': uploader_id,
-            'uploader_url': 'https://www.minds.com/' + uploader_id if uploader_id else None,
+            'uploader_url': format_field(uploader_id, template='https://www.minds.com/%s'),
             'view_count': int_or_none(entity.get('play:count')),
             'like_count': int_or_none(entity.get('thumbs:up:count')),
             'dislike_count': int_or_none(entity.get('thumbs:down:count')),
diff --git a/hypervideo_dl/extractor/mirrativ.py b/hypervideo_dl/extractor/mirrativ.py
index 81aea54..2111de6 100644
--- a/hypervideo_dl/extractor/mirrativ.py
+++ b/hypervideo_dl/extractor/mirrativ.py
@@ -19,9 +19,25 @@ class MirrativBaseIE(InfoExtractor):
 class MirrativIE(MirrativBaseIE):
     IE_NAME = 'mirrativ'
     _VALID_URL = r'https?://(?:www\.)?mirrativ\.com/live/(?P<id>[^/?#&]+)'
-    LIVE_API_URL = 'https://www.mirrativ.com/api/live/live?live_id=%s'
 
     TESTS = [{
+        'url': 'https://mirrativ.com/live/UQomuS7EMgHoxRHjEhNiHw',
+        'info_dict': {
+            'id': 'UQomuS7EMgHoxRHjEhNiHw',
+            'title': 'ねむいぃ、。『参加型』🔰jcが初めてやるCOD✨初見さん大歓迎💗',
+            'is_live': True,
+            'description': 'md5:bfcd8f77f2fab24c3c672e5620f3f16e',
+            'thumbnail': r're:https?://.+',
+            'uploader': '# あ ち ゅ 。💡',
+            'uploader_id': '118572165',
+            'duration': None,
+            'view_count': 1241,
+            'release_timestamp': 1646229192,
+            'timestamp': 1646229167,
+            'was_live': False,
+        },
+        'skip': 'livestream',
+    }, {
         'url': 'https://mirrativ.com/live/POxyuG1KmW2982lqlDTuPw',
         'only_matching': True,
     }]
@@ -29,12 +45,11 @@ class MirrativIE(MirrativBaseIE):
     def _real_extract(self, url):
         video_id = self._match_id(url)
         webpage = self._download_webpage('https://www.mirrativ.com/live/%s' % video_id, video_id)
-        live_response = self._download_json(self.LIVE_API_URL % video_id, video_id)
+        live_response = self._download_json(f'https://www.mirrativ.com/api/live/live?live_id={video_id}', video_id)
         self.assert_error(live_response)
 
         hls_url = dict_get(live_response, ('archive_url_hls', 'streaming_url_hls'))
         is_live = bool(live_response.get('is_live'))
-        was_live = bool(live_response.get('is_archive'))
         if not hls_url:
             raise ExtractorError('Neither archive nor live is available.', expected=True)
 
@@ -42,55 +57,29 @@ class MirrativIE(MirrativBaseIE):
             hls_url, video_id,
             ext='mp4', entry_protocol='m3u8_native',
             m3u8_id='hls', live=is_live)
-        rtmp_url = live_response.get('streaming_url_edge')
-        if rtmp_url:
-            keys_to_copy = ('width', 'height', 'vcodec', 'acodec', 'tbr')
-            fmt = {
-                'format_id': 'rtmp',
-                'url': rtmp_url,
-                'protocol': 'rtmp',
-                'ext': 'mp4',
-            }
-            fmt.update({k: traverse_obj(formats, (0, k)) for k in keys_to_copy})
-            formats.append(fmt)
         self._sort_formats(formats)
 
-        title = self._og_search_title(webpage, default=None) or self._search_regex(
-            r'<title>\s*(.+?) - Mirrativ\s*</title>', webpage) or live_response.get('title')
-        description = live_response.get('description')
-        thumbnail = live_response.get('image_url')
-
-        duration = try_get(live_response, lambda x: x['ended_at'] - x['started_at'])
-        view_count = live_response.get('total_viewer_num')
-        release_timestamp = live_response.get('started_at')
-        timestamp = live_response.get('created_at')
-
-        owner = live_response.get('owner', {})
-        uploader = owner.get('name')
-        uploader_id = owner.get('user_id')
-
         return {
             'id': video_id,
-            'title': title,
+            'title': self._og_search_title(webpage, default=None) or self._search_regex(
+                r'<title>\s*(.+?) - Mirrativ\s*</title>', webpage) or live_response.get('title'),
             'is_live': is_live,
-            'description': description,
+            'description': live_response.get('description'),
             'formats': formats,
-            'thumbnail': thumbnail,
-            'uploader': uploader,
-            'uploader_id': uploader_id,
-            'duration': duration,
-            'view_count': view_count,
-            'release_timestamp': release_timestamp,
-            'timestamp': timestamp,
-            'was_live': was_live,
+            'thumbnail': live_response.get('image_url'),
+            'uploader': traverse_obj(live_response, ('owner', 'name')),
+            'uploader_id': traverse_obj(live_response, ('owner', 'user_id')),
+            'duration': try_get(live_response, lambda x: x['ended_at'] - x['started_at']) if not is_live else None,
+            'view_count': live_response.get('total_viewer_num'),
+            'release_timestamp': live_response.get('started_at'),
+            'timestamp': live_response.get('created_at'),
+            'was_live': bool(live_response.get('is_archive')),
         }
 
 
 class MirrativUserIE(MirrativBaseIE):
     IE_NAME = 'mirrativ:user'
     _VALID_URL = r'https?://(?:www\.)?mirrativ\.com/user/(?P<id>\d+)'
-    LIVE_HISTORY_API_URL = 'https://www.mirrativ.com/api/live/live_history?user_id=%s&page=%d'
-    USER_INFO_API_URL = 'https://www.mirrativ.com/api/user/profile?user_id=%s'
 
     _TESTS = [{
         # Live archive is available up to 3 days
@@ -104,8 +93,8 @@ class MirrativUserIE(MirrativBaseIE):
         page = 1
         while page is not None:
             api_response = self._download_json(
-                self.LIVE_HISTORY_API_URL % (user_id, page), user_id,
-                note='Downloading page %d' % page)
+                f'https://www.mirrativ.com/api/live/live_history?user_id={user_id}&page={page}', user_id,
+                note=f'Downloading page {page}')
             self.assert_error(api_response)
             lives = api_response.get('lives')
             if not lives:
@@ -123,12 +112,10 @@ class MirrativUserIE(MirrativBaseIE):
     def _real_extract(self, url):
         user_id = self._match_id(url)
         user_info = self._download_json(
-            self.USER_INFO_API_URL % user_id, user_id,
+            f'https://www.mirrativ.com/api/user/profile?user_id={user_id}', user_id,
             note='Downloading user info', fatal=False)
         self.assert_error(user_info)
 
-        uploader = user_info.get('name')
-        description = user_info.get('description')
-
-        entries = self._entries(user_id)
-        return self.playlist_result(entries, user_id, uploader, description)
+        return self.playlist_result(
+            self._entries(user_id), user_id,
+            user_info.get('name'), user_info.get('description'))
diff --git a/hypervideo_dl/extractor/mixch.py b/hypervideo_dl/extractor/mixch.py
new file mode 100644
index 0000000..31f450d
--- /dev/null
+++ b/hypervideo_dl/extractor/mixch.py
@@ -0,0 +1,85 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    traverse_obj,
+)
+
+
+class MixchIE(InfoExtractor):
+    IE_NAME = 'mixch'
+    _VALID_URL = r'https?://(?:www\.)?mixch\.tv/u/(?P<id>\d+)'
+
+    _TESTS = [{
+        'url': 'https://mixch.tv/u/16236849/live',
+        'skip': 'don\'t know if this live persists',
+        'info_dict': {
+            'id': '16236849',
+            'title': '24配信シェア⭕️投票🙏💦',
+            'comment_count': 13145,
+            'view_count': 28348,
+            'timestamp': 1636189377,
+            'uploader': '🦥伊咲👶🏻#フレアワ',
+            'uploader_id': '16236849',
+        }
+    }, {
+        'url': 'https://mixch.tv/u/16137876/live',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(f'https://mixch.tv/u/{video_id}/live', video_id)
+
+        initial_js_state = self._parse_json(self._search_regex(
+            r'(?m)^\s*window\.__INITIAL_JS_STATE__\s*=\s*(\{.+?\});\s*$', webpage, 'initial JS state'), video_id)
+        if not initial_js_state.get('liveInfo'):
+            raise ExtractorError('Livestream has ended.', expected=True)
+
+        return {
+            'id': video_id,
+            'title': traverse_obj(initial_js_state, ('liveInfo', 'title')),
+            'comment_count': traverse_obj(initial_js_state, ('liveInfo', 'comments')),
+            'view_count': traverse_obj(initial_js_state, ('liveInfo', 'visitor')),
+            'timestamp': traverse_obj(initial_js_state, ('liveInfo', 'created')),
+            'uploader': traverse_obj(initial_js_state, ('broadcasterInfo', 'name')),
+            'uploader_id': video_id,
+            'formats': [{
+                'format_id': 'hls',
+                'url': traverse_obj(initial_js_state, ('liveInfo', 'hls')) or 'https://d1hd0ww6piyb43.cloudfront.net/hls/torte_%s.m3u8' % video_id,
+                'ext': 'mp4',
+                'protocol': 'm3u8',
+            }],
+            'is_live': True,
+        }
+
+
+class MixchArchiveIE(InfoExtractor):
+    IE_NAME = 'mixch:archive'
+    _VALID_URL = r'https?://(?:www\.)?mixch\.tv/archive/(?P<id>\d+)'
+
+    _TESTS = [{
+        'url': 'https://mixch.tv/archive/421',
+        'skip': 'paid video, no DRM. expires at Jan 23',
+        'info_dict': {
+            'id': '421',
+            'title': '96NEKO SHOW TIME',
+        }
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        html5_videos = self._parse_html5_media_entries(
+            url, webpage.replace('video-js', 'video'), video_id, 'hls')
+        if not html5_videos:
+            self.raise_login_required(method='cookies')
+        infodict = html5_videos[0]
+        infodict.update({
+            'id': video_id,
+            'title': self._html_search_regex(r'class="archive-title">(.+?)</', webpage, 'title')
+        })
+
+        return infodict
diff --git a/hypervideo_dl/extractor/mixcloud.py b/hypervideo_dl/extractor/mixcloud.py
index a0c043d..c2dd078 100644
--- a/hypervideo_dl/extractor/mixcloud.py
+++ b/hypervideo_dl/extractor/mixcloud.py
@@ -12,6 +12,7 @@ from ..compat import (
     compat_zip
 )
 from ..utils import (
+    ExtractorError,
     int_or_none,
     parse_iso8601,
     strip_or_none,
@@ -125,7 +126,20 @@ class MixcloudIE(MixcloudBaseIE):
       tag {
         name
       }
-    }''', track_id, username, slug)
+    }
+    restrictedReason
+    id''', track_id, username, slug)
+
+        if not cloudcast:
+            raise ExtractorError('Track not found', expected=True)
+
+        reason = cloudcast.get('restrictedReason')
+        if reason == 'tracklist':
+            raise ExtractorError('Track unavailable in your country due to licensing restrictions', expected=True)
+        elif reason == 'repeat_play':
+            raise ExtractorError('You have reached your play limit for this track', expected=True)
+        elif reason:
+            raise ExtractorError('Track is restricted', expected=True)
 
         title = cloudcast['name']
diff --git a/hypervideo_dl/extractor/mlssoccer.py b/hypervideo_dl/extractor/mlssoccer.py
new file mode 100644
index 0000000..1d6d4b8
--- /dev/null
+++ b/hypervideo_dl/extractor/mlssoccer.py
@@ -0,0 +1,117 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class MLSSoccerIE(InfoExtractor):
+    _VALID_DOMAINS = r'(?:(?:cfmontreal|intermiamicf|lagalaxy|lafc|houstondynamofc|dcunited|atlutd|mlssoccer|fcdallas|columbuscrew|coloradorapids|fccincinnati|chicagofirefc|austinfc|nashvillesc|whitecapsfc|sportingkc|soundersfc|sjearthquakes|rsl|timbers|philadelphiaunion|orlandocitysc|newyorkredbulls|nycfc)\.com|(?:torontofc)\.ca|(?:revolutionsoccer)\.net)'
+    _VALID_URL = r'https?://(?:www\.)?%s/video/#?(?P<id>[^/&$#?]+)' % _VALID_DOMAINS
+
+    _TESTS = [{
+        'url': 'https://www.mlssoccer.com/video/the-octagon-can-alphonso-davies-lead-canada-to-first-world-cup-since-1986#the-octagon-can-alphonso-davies-lead-canada-to-first-world-cup-since-1986',
+        'info_dict': {
+            'id': '6276033198001',
+            'ext': 'mp4',
+            'title': 'The Octagon | Can Alphonso Davies lead Canada to first World Cup since 1986?',
+            'description': 'md5:f0a883ee33592a0221798f451a98be8f',
+            'thumbnail': 'https://cf-images.us-east-1.prod.boltdns.net/v1/static/5530036772001/1bbc44f6-c63c-4981-82fa-46b0c1f891e0/5c1ca44a-a033-4e98-b531-ff24c4947608/160x90/match/image.jpg',
+            'duration': 350.165,
+            'timestamp': 1633627291,
+            'uploader_id': '5530036772001',
+            'tags': ['club/canada'],
+            'is_live': False,
+            'upload_date': '20211007',
+            'filesize_approx': 255193528.83200002
+        },
+        'params': {'skip_download': True}
+    }, {
+        'url': 'https://www.whitecapsfc.com/video/highlights-san-jose-earthquakes-vs-vancouver-whitecaps-fc-october-23-2021#highlights-san-jose-earthquakes-vs-vancouver-whitecaps-fc-october-23-2021',
+        'only_matching': True
+    }, {
+        'url': 'https://www.torontofc.ca/video/highlights-toronto-fc-vs-cf-montreal-october-23-2021-x6733#highlights-toronto-fc-vs-cf-montreal-october-23-2021-x6733',
+        'only_matching': True
+    }, {
+        'url': 'https://www.sportingkc.com/video/post-match-press-conference-john-pulskamp-oct-27-2021#post-match-press-conference-john-pulskamp-oct-27-2021',
+        'only_matching': True
+    }, {
+        'url': 'https://www.soundersfc.com/video/highlights-seattle-sounders-fc-vs-sporting-kansas-city-october-23-2021',
+        'only_matching': True
+    }, {
+        'url': 'https://www.sjearthquakes.com/video/#highlights-austin-fc-vs-san-jose-earthquakes-june-19-2021',
+        'only_matching': True
+    }, {
+        'url': 'https://www.rsl.com/video/2021-u-of-u-health-mic-d-up-vs-colorado-10-16-21#2021-u-of-u-health-mic-d-up-vs-colorado-10-16-21',
+        'only_matching': True
+    }, {
+        'url': 'https://www.timbers.com/video/highlights-d-chara-asprilla-with-goals-in-portland-timbers-2-0-win-over-san-jose#highlights-d-chara-asprilla-with-goals-in-portland-timbers-2-0-win-over-san-jose',
+        'only_matching': True
+    }, {
+        'url': 'https://www.philadelphiaunion.com/video/highlights-torvphi',
+        'only_matching': True
+    }, {
+        'url': 'https://www.orlandocitysc.com/video/highlight-columbus-crew-vs-orlando-city-sc',
+        'only_matching': True
+    }, {
+        'url': 'https://www.newyorkredbulls.com/video/all-access-matchday-double-derby-week#all-access-matchday-double-derby-week',
+        'only_matching': True
+    }, {
+        'url': 'https://www.nycfc.com/video/highlights-nycfc-1-0-chicago-fire-fc#highlights-nycfc-1-0-chicago-fire-fc',
+        'only_matching': True
+    }, {
+        'url': 'https://www.revolutionsoccer.net/video/two-minute-highlights-revs-1-rapids-0-october-27-2021#two-minute-highlights-revs-1-rapids-0-october-27-2021',
+        'only_matching': True
+    }, {
+        'url': 'https://www.nashvillesc.com/video/goal-c-j-sapong-nashville-sc-92nd-minute',
+        'only_matching': True
+    }, {
+        'url': 'https://www.cfmontreal.com/video/faits-saillants-tor-v-mtl#faits-saillants-orl-v-mtl-x5645',
+        'only_matching': True
+    }, {
+        'url': 'https://www.intermiamicf.com/video/all-access-victory-vs-nashville-sc-by-ukg#all-access-victory-vs-nashville-sc-by-ukg',
+        'only_matching': True
+    }, {
+        'url': 'https://www.lagalaxy.com/video/#moment-of-the-month-presented-by-san-manuel-casino-rayan-raveloson-scores-his-se',
+        'only_matching': True
+    }, {
+        'url': 'https://www.lafc.com/video/breaking-down-lafc-s-final-6-matches-of-the-2021-mls-regular-season#breaking-down-lafc-s-final-6-matches-of-the-2021-mls-regular-season',
+        'only_matching': True
+    }, {
+        'url': 'https://www.houstondynamofc.com/video/postgame-press-conference-michael-nelson-presented-by-coushatta-casino-res-x9660#postgame-press-conference-michael-nelson-presented-by-coushatta-casino-res-x9660',
+        'only_matching': True
+    }, {
+        'url': 'https://www.dcunited.com/video/tony-alfaro-my-family-pushed-me-to-believe-everything-was-possible',
+        'only_matching': True
+    }, {
+        'url': 'https://www.fcdallas.com/video/highlights-fc-dallas-vs-minnesota-united-fc-october-02-2021#highlights-fc-dallas-vs-minnesota-united-fc-october-02-2021',
+        'only_matching': True
+    }, {
+        'url': 'https://www.columbuscrew.com/video/match-rewind-columbus-crew-vs-new-york-red-bulls-october-23-2021',
+        'only_matching': True
+    }, {
+        'url': 'https://www.coloradorapids.com/video/postgame-reaction-robin-fraser-october-27#postgame-reaction-robin-fraser-october-27',
+        'only_matching': True
+    }, {
+        'url': 'https://www.fccincinnati.com/video/#keeping-cincy-chill-presented-by-coors-lite',
+        'only_matching': True
+    }, {
+        'url': 'https://www.chicagofirefc.com/video/all-access-fire-score-dramatic-road-win-in-cincy#all-access-fire-score-dramatic-road-win-in-cincy',
+        'only_matching': True
+    }, {
+        'url': 'https://www.austinfc.com/video/highlights-colorado-rapids-vs-austin-fc-september-29-2021#highlights-colorado-rapids-vs-austin-fc-september-29-2021',
+        'only_matching': True
+    }, {
+        'url': 'https://www.atlutd.com/video/goal-josef-martinez-scores-in-the-73rd-minute#goal-josef-martinez-scores-in-the-73rd-minute',
+        'only_matching': True
+    }]
+
+    def _real_extract(self, url):
+        id = self._match_id(url)
+        webpage = self._download_webpage(url, id)
+        data_json = self._parse_json(self._html_search_regex(r'data-options\=\"([^\"]+)\"', webpage, 'json'), id)['videoList'][0]
+        return {
+            'id': id,
+            '_type': 'url',
+            'url': 'https://players.brightcove.net/%s/default_default/index.html?videoId=%s' % (data_json['accountId'], data_json['videoId']),
+            'ie_key': 'BrightcoveNew',
+        }
diff --git a/hypervideo_dl/extractor/mojvideo.py b/hypervideo_dl/extractor/mojvideo.py
index 0421f3f..16d9405 100644
--- a/hypervideo_dl/extractor/mojvideo.py
+++ b/hypervideo_dl/extractor/mojvideo.py
@@ -38,8 +38,7 @@ class MojvideoIE(InfoExtractor):
                 r'<errordesc>([^<]*)</errordesc>', playerapi, 'error description', fatal=False)
             raise ExtractorError('%s said: %s' % (self.IE_NAME, error_desc), expected=True)
 
-        title = self._html_search_regex(
-            r'<title>([^<]+)</title>', playerapi, 'title')
+        title = self._html_extract_title(playerapi)
         video_url = self._html_search_regex(
             r'<file>([^<]+)</file>', playerapi, 'video URL')
         thumbnail = self._html_search_regex(
diff --git a/hypervideo_dl/extractor/mtv.py b/hypervideo_dl/extractor/mtv.py
index e060884..be5de0a 100644
--- a/hypervideo_dl/extractor/mtv.py
+++ b/hypervideo_dl/extractor/mtv.py
@@ -15,6 +15,7 @@ from ..utils import (
     float_or_none,
     HEADRequest,
     int_or_none,
+    join_nonempty,
     RegexNotFoundError,
     sanitized_Request,
     strip_or_none,
@@ -99,9 +100,9 @@ class MTVServicesInfoExtractor(InfoExtractor):
                 formats.extend([{
                     'ext': 'flv' if rtmp_video_url.startswith('rtmp') else ext,
                     'url': rtmp_video_url,
-                    'format_id': '-'.join(filter(None, [
+                    'format_id': join_nonempty(
                         'rtmp' if rtmp_video_url.startswith('rtmp') else None,
-                        rendition.get('bitrate')])),
+                        rendition.get('bitrate')),
                     'width': int(rendition.get('width')),
                     'height': int(rendition.get('height')),
                 }])
@@ -311,11 +312,17 @@ class MTVServicesInfoExtractor(InfoExtractor):
             main_container = self._extract_child_with_type(data, 'MainContainer')
             ab_testing = self._extract_child_with_type(main_container, 'ABTesting')
             video_player = self._extract_child_with_type(ab_testing or main_container, 'VideoPlayer')
-            mgid = video_player['props']['media']['video']['config']['uri']
+            if video_player:
+                mgid = try_get(video_player, lambda x: x['props']['media']['video']['config']['uri'])
+            else:
+                flex_wrapper = self._extract_child_with_type(ab_testing or main_container, 'FlexWrapper')
+                auth_suite_wrapper = self._extract_child_with_type(flex_wrapper, 'AuthSuiteWrapper')
+                player = self._extract_child_with_type(auth_suite_wrapper or flex_wrapper, 'Player')
+                if player:
+                    mgid = try_get(player, lambda x: x['props']['videoDetail']['mgid'])
 
         if not mgid:
-            mgid = self._search_regex(
-                r'"media":{"video":{"config":{"uri":"(mgid:.*?)"', webpage, 'mgid', default=None)
+            raise ExtractorError('Could not extract mgid')
 
         return mgid
diff --git a/hypervideo_dl/extractor/muenchentv.py b/hypervideo_dl/extractor/muenchentv.py
index d256236..a53929e 100644
--- a/hypervideo_dl/extractor/muenchentv.py
+++ b/hypervideo_dl/extractor/muenchentv.py
@@ -33,7 +33,7 @@ class MuenchenTVIE(InfoExtractor):
         display_id = 'live'
         webpage = self._download_webpage(url, display_id)
 
-        title = self._live_title(self._og_search_title(webpage))
+        title = self._og_search_title(webpage)
 
         data_js = self._search_regex(
             r'(?s)\nplaylist:\s*(\[.*?}\]),',
diff --git a/hypervideo_dl/extractor/murrtube.py b/hypervideo_dl/extractor/murrtube.py
new file mode 100644
index 0000000..1eb5de6
--- /dev/null
+++ b/hypervideo_dl/extractor/murrtube.py
@@ -0,0 +1,165 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import functools
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    OnDemandPagedList,
+    determine_ext,
+    int_or_none,
+    try_get,
+)
+
+
+class MurrtubeIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                   (?:
+                       murrtube:|
+                       https?://murrtube\.net/videos/(?P<slug>[a-z0-9\-]+)\-
+                   )
+                   (?P<id>[a-f0-9]{8}\-[a-f0-9]{4}\-[a-f0-9]{4}\-[a-f0-9]{4}\-[a-f0-9]{12})
+                '''
+    _TEST = {
+        'url': 'https://murrtube.net/videos/inferno-x-skyler-148b6f2a-fdcc-4902-affe-9c0f41aaaca0',
+        'md5': '169f494812d9a90914b42978e73aa690',
+        'info_dict': {
+            'id': '148b6f2a-fdcc-4902-affe-9c0f41aaaca0',
+            'ext': 'mp4',
+            'title': 'Inferno X Skyler',
+            'description': 'Humping a very good slutty sheppy (roomate)',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 284,
+            'uploader': 'Inferno Wolf',
+            'age_limit': 18,
+            'comment_count': int,
+            'view_count': int,
+            'like_count': int,
+            'tags': ['hump', 'breed', 'Fursuit', 'murrsuit', 'bareback'],
+        }
+    }
+
+    def _download_gql(self, video_id, op, note=None, fatal=True):
+        result = self._download_json(
+            'https://murrtube.net/graphql',
+            video_id, note, data=json.dumps(op).encode(), fatal=fatal,
+            headers={'Content-Type': 'application/json'})
+        return result['data']
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        data = self._download_gql(video_id, {
+            'operationName': 'Medium',
+            'variables': {
+                'id': video_id,
+            },
+            'query': '''\
+query Medium($id: ID!) {
+  medium(id: $id) {
+    title
+    description
+    key
+    duration
+    commentsCount
+    likesCount
+    viewsCount
+    thumbnailKey
+    tagList
+    user {
+      name
+      __typename
+    }
+    __typename
+  }
+}'''})
+        meta = data['medium']
+
+        storage_url = 'https://storage.murrtube.net/murrtube/'
+        format_url = storage_url + meta.get('key', '')
+        thumbnail = storage_url + meta.get('thumbnailKey', '')
+
+        if determine_ext(format_url) == 'm3u8':
+            formats = self._extract_m3u8_formats(
+                format_url, video_id, 'mp4', entry_protocol='m3u8_native', fatal=False)
+        else:
+            formats = [{'url': format_url}]
+
+        return {
+            'id': video_id,
+            'title': meta.get('title'),
+            'description': meta.get('description'),
+            'formats': formats,
+            'thumbnail': thumbnail,
+            'duration': int_or_none(meta.get('duration')),
+            'uploader': try_get(meta, lambda x: x['user']['name']),
+            'view_count': meta.get('viewsCount'),
+            'like_count': meta.get('likesCount'),
+            'comment_count': meta.get('commentsCount'),
+            'tags': meta.get('tagList'),
+            'age_limit': 18,
+        }
+
+
+class MurrtubeUserIE(MurrtubeIE):
+    IE_DESC = 'Murrtube user profile'
+    _VALID_URL = r'https?://murrtube\.net/(?P<id>[^/]+)$'
+    _TEST = {
+        'url': 'https://murrtube.net/stormy',
+        'info_dict': {
+            'id': 'stormy',
+        },
+        'playlist_mincount': 27,
+    }
+    _PAGE_SIZE = 10
+
+    def _fetch_page(self, username, user_id, page):
+        data = self._download_gql(username, {
+            'operationName': 'Media',
+            'variables': {
+                'limit': self._PAGE_SIZE,
+                'offset': page * self._PAGE_SIZE,
+                'sort': 'latest',
+                'userId': user_id,
+            },
+            'query': '''\
+query Media($q: String, $sort: String, $userId: ID, $offset: Int!, $limit: Int!) {
+  media(q: $q, sort: $sort, userId: $userId, offset: $offset, limit: $limit) {
+    id
+    __typename
+  }
+}'''},
+            'Downloading page {0}'.format(page + 1))
+        if data is None:
+            raise ExtractorError(f'Failed to retrieve video list for page {page + 1}')
+
+        media = data['media']
+
+        for entry in media:
+            yield self.url_result('murrtube:{0}'.format(entry['id']), MurrtubeIE.ie_key())
+
+    def _real_extract(self, url):
+        username = self._match_id(url)
+        data = self._download_gql(username, {
+            'operationName': 'User',
+            'variables': {
+                'id': username,
+            },
+            'query': '''\
+query User($id: ID!) {
+  user(id: $id) {
+    id
+    __typename
+  }
+}'''},
+            'Downloading user info')
+        if data is None:
+            raise ExtractorError('Failed to fetch user info')
+
+        user = data['user']
+
+        entries = OnDemandPagedList(functools.partial(
+            self._fetch_page, username, user.get('id')), self._PAGE_SIZE)
+
+        return self.playlist_result(entries, username)
diff --git a/hypervideo_dl/extractor/musescore.py b/hypervideo_dl/extractor/musescore.py
index dcd2638..09fadf8 100644
--- a/hypervideo_dl/extractor/musescore.py
+++ b/hypervideo_dl/extractor/musescore.py
@@ -5,7 +5,7 @@ from .common import InfoExtractor
 
 
 class MuseScoreIE(InfoExtractor):
-    _VALID_URL = r'(?:https?://)(?:www\.)?musescore\.com/(?:user/\d+|[^/]+)(?:/scores)?/(?P<id>[^#&?]+)'
+    _VALID_URL = r'https?://(?:www\.)?musescore\.com/(?:user/\d+|[^/]+)(?:/scores)?/(?P<id>[^#&?]+)'
     _TESTS = [{
         'url': 'https://musescore.com/user/73797/scores/142975',
         'info_dict': {
@@ -13,7 +13,7 @@ class MuseScoreIE(InfoExtractor):
             'ext': 'mp3',
             'title': 'WA Mozart Marche Turque (Turkish March fingered)',
             'description': 'md5:7ede08230e4eaabd67a4a98bb54d07be',
-            'thumbnail': r're:(?:https?://)(?:www\.)?musescore\.com/.*\.png[^$]+',
+            'thumbnail': r're:https?://(?:www\.)?musescore\.com/.*\.png[^$]+',
             'uploader': 'PapyPiano',
             'creator': 'Wolfgang Amadeus Mozart',
         }
@@ -24,7 +24,7 @@ class MuseScoreIE(InfoExtractor):
             'ext': 'mp3',
             'title': 'Sweet Child O\' Mine  – Guns N\' Roses sweet child',
             'description': 'md5:4dca71191c14abc312a0a4192492eace',
-            'thumbnail': r're:(?:https?://)(?:www\.)?musescore\.com/.*\.png[^$]+',
+            'thumbnail': r're:https?://(?:www\.)?musescore\.com/.*\.png[^$]+',
             'uploader': 'roxbelviolin',
             'creator': 'Guns N´Roses Arr. Roxbel Violin',
         }
@@ -35,7 +35,7 @@ class MuseScoreIE(InfoExtractor):
             'ext': 'mp3',
             'title': 'Für Elise – Beethoven',
             'description': 'md5:49515a3556d5ecaf9fa4b2514064ac34',
-            'thumbnail': r're:(?:https?://)(?:www\.)?musescore\.com/.*\.png[^$]+',
+            'thumbnail': r're:https?://(?:www\.)?musescore\.com/.*\.png[^$]+',
             'uploader': 'ClassicMan',
             'creator': 'Ludwig van Beethoven (1770–1827)',
         }
diff --git a/hypervideo_dl/extractor/musicdex.py b/hypervideo_dl/extractor/musicdex.py
new file mode 100644
index 0000000..05f7220
--- /dev/null
+++ b/hypervideo_dl/extractor/musicdex.py
@@ -0,0 +1,175 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    date_from_str,
+    format_field,
+    try_get,
+    unified_strdate,
+)
+
+
+class MusicdexBaseIE(InfoExtractor):
+    def _return_info(self, track_json, album_json, id):
+        return {
+            'id': str(id),
+            'title': track_json.get('name'),
+            'track': track_json.get('name'),
+            'description': track_json.get('description'),
+            'track_number': track_json.get('number'),
+            'url': format_field(track_json, 'url', 'https://www.musicdex.org/%s'),
+            'duration': track_json.get('duration'),
+            'genre': [genre.get('name') for genre in track_json.get('genres') or []],
+            'like_count': track_json.get('likes_count'),
+            'view_count': track_json.get('plays'),
+            'artist': [artist.get('name') for artist in track_json.get('artists') or []],
+            'album_artist': [artist.get('name') for artist in album_json.get('artists') or []],
+            'thumbnail': format_field(album_json, 'image', 'https://www.musicdex.org/%s'),
+            'album': album_json.get('name'),
+            'release_year': try_get(album_json, lambda x: date_from_str(unified_strdate(x['release_date'])).year),
+            'extractor_key': MusicdexSongIE.ie_key(),
+            'extractor': 'MusicdexSong',
+        }
+
+
+class MusicdexSongIE(MusicdexBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?musicdex\.org/track/(?P<id>\d+)'
+
+    _TESTS = [{
+        'url': 'https://www.musicdex.org/track/306/dual-existence',
+        'info_dict': {
+            'id': '306',
+            'ext': 'mp3',
+            'title': 'dual existence',
+            'description': '#NIPPONSEI @ IRC.RIZON.NET',
+            'track': 'dual existence',
+            'track_number': 1,
+            'duration': 266000,
+            'genre': ['Anime'],
+            'like_count': int,
+            'view_count': int,
+            'artist': ['fripSide'],
+            'album_artist': ['fripSide'],
+            'thumbnail': 'https://www.musicdex.org/storage/album/9iDIam1DHTVqUG4UclFIEq1WAFGXfPW4y0TtZa91.png',
+            'album': 'To Aru Kagaku no Railgun T OP2 Single - dual existence',
+            'release_year': 2020
+        },
+        'params': {'skip_download': True}
+    }]
+
+    def _real_extract(self, url):
+        id = self._match_id(url)
+        data_json = self._download_json(f'https://www.musicdex.org/secure/tracks/{id}?defaultRelations=true', id)['track']
+        return self._return_info(data_json, data_json.get('album') or {}, id)
+
+
+class MusicdexAlbumIE(MusicdexBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?musicdex\.org/album/(?P<id>\d+)'
+
+    _TESTS = [{
+        'url': 'https://www.musicdex.org/album/56/tenmon-and-eiichiro-yanagi-minori/ef-a-tale-of-memories-original-soundtrack-2-fortissimo',
+        'playlist_mincount': 28,
+        'info_dict': {
+            'id': '56',
+            'genre': ['OST'],
+            'view_count': int,
+            'artist': ['TENMON & Eiichiro Yanagi / minori'],
+            'title': 'ef - a tale of memories Original Soundtrack 2 ~fortissimo~',
+            'release_year': 2008,
+            'thumbnail': 'https://www.musicdex.org/storage/album/2rSHkyYBYfB7sbvElpEyTMcUn6toY7AohOgJuDlE.jpg',
+        },
+    }]
+
+    def _real_extract(self, url):
+        id = self._match_id(url)
+        data_json = self._download_json(f'https://www.musicdex.org/secure/albums/{id}?defaultRelations=true', id)['album']
+        entries = [self._return_info(track, data_json, track['id']) for track in data_json.get('tracks') or [] if track.get('id')]
+
+        return {
+            '_type': 'playlist',
+            'id': id,
+            'title': data_json.get('name'),
+            'description': data_json.get('description'),
+            'genre': [genre.get('name') for genre in data_json.get('genres') or []],
+            'view_count': data_json.get('plays'),
+            'artist': [artist.get('name') for artist in data_json.get('artists') or []],
+            'thumbnail': format_field(data_json, 'image', 'https://www.musicdex.org/%s'),
+            'release_year': try_get(data_json, lambda x: date_from_str(unified_strdate(x['release_date'])).year),
+            'entries': entries,
+        }
+
+
+class MusicdexPageIE(MusicdexBaseIE):
+    def _entries(self, id):
+        next_page_url = self._API_URL % id
+        while next_page_url:
+            data_json = self._download_json(next_page_url, id)['pagination']
+            for data in data_json.get('data') or []:
+                yield data
+            next_page_url = data_json.get('next_page_url')
+
+
+class MusicdexArtistIE(MusicdexPageIE):
+    _VALID_URL = r'https?://(?:www\.)?musicdex\.org/artist/(?P<id>\d+)'
+    _API_URL = 'https://www.musicdex.org/secure/artists/%s/albums?page=1'
+
+    _TESTS = [{
+        'url': 'https://www.musicdex.org/artist/11/fripside',
+        'playlist_mincount': 28,
+        'info_dict': {
+            'id': '11',
+            'view_count': int,
+            'title': 'fripSide',
+            'thumbnail': 'https://www.musicdex.org/storage/artist/ZmOz0lN2vsweegB660em3xWffCjLPmTQHqJls5Xx.jpg',
+        },
+    }]
+
+    def _real_extract(self, url):
+        id = self._match_id(url)
+        data_json = self._download_json(f'https://www.musicdex.org/secure/artists/{id}', id)['artist']
+        entries = []
+        for album in self._entries(id):
+            entries.extend(self._return_info(track, album, track['id']) for track in album.get('tracks') or [] if track.get('id'))
+
+        return {
+            '_type': 'playlist',
+            'id': id,
+            'title': data_json.get('name'),
+            'view_count': data_json.get('plays'),
+            'thumbnail': format_field(data_json, 'image_small', 'https://www.musicdex.org/%s'),
+            'entries': entries,
+        }
+
+
+class MusicdexPlaylistIE(MusicdexPageIE):
+    _VALID_URL = r'https?://(?:www\.)?musicdex\.org/playlist/(?P<id>\d+)'
+    _API_URL = 'https://www.musicdex.org/secure/playlists/%s/tracks?perPage=10000&page=1'
+
+    _TESTS = [{
+        'url': 'https://www.musicdex.org/playlist/9/test',
+        'playlist_mincount': 73,
+        'info_dict': {
+            'id': '9',
+            'view_count': int,
+            'title': 'Test',
+            'thumbnail': 'https://www.musicdex.org/storage/album/jXATI79f0IbQ2sgsKYOYRCW3zRwF3XsfHhzITCuJ.jpg',
+            'description': 'Test 123 123 21312 32121321321321312',
+        },
+    }]
+
+    def _real_extract(self, url):
+        id = self._match_id(url)
+        data_json = self._download_json(f'https://www.musicdex.org/secure/playlists/{id}', id)['playlist']
+        entries = [self._return_info(track, track.get('album') or {}, track['id'])
+                   for track in self._entries(id) or [] if track.get('id')]
+
+        return {
+            '_type': 'playlist',
+            'id': id,
+            'title': data_json.get('name'),
+            'description': data_json.get('description'),
+            'view_count': data_json.get('plays'),
+            'thumbnail': format_field(data_json, 'image', 'https://www.musicdex.org/%s'),
+            'entries': entries,
+        }
diff --git a/hypervideo_dl/extractor/mxplayer.py b/hypervideo_dl/extractor/mxplayer.py
index 5874556..3c2afd8 100644
--- a/hypervideo_dl/extractor/mxplayer.py
+++ b/hypervideo_dl/extractor/mxplayer.py
@@ -180,7 +180,7 @@ class MxplayerIE(InfoExtractor):
 
 
 class MxplayerShowIE(InfoExtractor):
-    _VALID_URL = r'(?:https?://)(?:www\.)?mxplayer\.in/show/(?P<display_id>[-\w]+)-(?P<id>\w+)/?(?:$|[#?])'
+    _VALID_URL = r'https?://(?:www\.)?mxplayer\.in/show/(?P<display_id>[-\w]+)-(?P<id>\w+)/?(?:$|[#?])'
     _TESTS = [{
         'url': 'https://www.mxplayer.in/show/watch-chakravartin-ashoka-samrat-series-online-a8f44e3cc0814b5601d17772cedf5417',
         'playlist_mincount': 440,
diff --git a/hypervideo_dl/extractor/myspass.py b/hypervideo_dl/extractor/myspass.py
index db7ebc9..1775d5f 100644
--- a/hypervideo_dl/extractor/myspass.py
+++ b/hypervideo_dl/extractor/myspass.py
@@ -1,8 +1,6 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
 from ..compat import compat_str
 from ..utils import (
@@ -13,33 +11,74 @@ class MySpassIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?myspass\.de/([^/]+/)*(?P<id>\d+)'
-    _TEST = {
+    _VALID_URL = r'https?://(?:www\.)?myspass\.de/(?:[^/]+/)*(?P<id>\d+)/?[^/]*$'
+    _TESTS = [{
         'url': 'http://www.myspass.de/myspass/shows/tvshows/absolute-mehrheit/Absolute-Mehrheit-vom-17022013-Die-Highlights-Teil-2--/11741/',
         'md5': '0b49f4844a068f8b33f4b7c88405862b',
         'info_dict': {
             'id': '11741',
             'ext': 'mp4',
-            'description': 'Wer kann in die Fußstapfen von Wolfgang Kubicki treten und die Mehrheit der Zuschauer hinter sich versammeln? Wird vielleicht sogar die Absolute Mehrheit geknackt und der Jackpot von 200.000 Euro mit nach Hause genommen?',
+            'description': 'md5:9f0db5044c8fe73f528a390498f7ce9b',
             'title': '17.02.2013 - Die Highlights, Teil 2',
+            'thumbnail': r're:.*\.jpg',
+            'duration': 323.0,
+            'episode': '17.02.2013 - Die Highlights, Teil 2',
+            'season_id': '544',
+            'episode_number': 1,
+            'series': 'Absolute Mehrheit',
+            'season_number': 2,
+            'season': 'Season 2',
+        },
+    },
+    {
+        'url': 'https://www.myspass.de/shows/tvshows/tv-total/Novak-Puffovic-bei-bester-Laune--/44996/',
+        'md5': 'eb28b7c5e254192046e86ebaf7deac8f',
+        'info_dict': {
+            'id': '44996',
+            'ext': 'mp4',
+            'description': 'md5:74c7f886e00834417f1e427ab0da6121',
+            'title': 'Novak Puffovic bei bester Laune',
+            'thumbnail': r're:.*\.jpg',
+            'episode_number': 8,
+            'episode': 'Novak Puffovic bei bester Laune',
+            'series': 'TV total',
+            'season': 'Season 19',
+            'season_id': '987',
+            'duration': 2941.0,
+            'season_number': 19,
+        },
+    },
+    {
+        'url': 'https://www.myspass.de/channels/tv-total-raabigramm/17033/20831/',
+        'md5': '7b293a6b9f3a7acdd29304c8d0dbb7cc',
+        'info_dict': {
+            'id': '20831',
+            'ext': 'mp4',
+            'description': 'Gefühle pur: Schaut euch die ungeschnittene Version von Stefans Liebesbeweis an die Moderationsgrazie von Welt, Verona Feldbusch, an.',
+            'title': 'Raabigramm Verona Feldbusch',
+            'thumbnail': r're:.*\.jpg',
+            'episode_number': 6,
+            'episode': 'Raabigramm Verona Feldbusch',
+            'series': 'TV total',
+            'season': 'Season 1',
+            'season_id': '34',
+            'duration': 105.0,
+            'season_number': 1,
         },
-    }
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        metadata = self._download_xml(
-            'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=' + video_id,
-            video_id)
+        metadata = self._download_xml('http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=' + video_id, video_id)
 
         title = xpath_text(metadata, 'title', fatal=True)
         video_url = xpath_text(metadata, 'url_flv', 'download url', True)
         video_id_int = int(video_id)
-        for group in re.search(r'/myspass2009/\d+/(\d+)/(\d+)/(\d+)/', video_url).groups():
+        for group in self._search_regex(r'/myspass2009/\d+/(\d+)/(\d+)/(\d+)/', video_url, 'myspass', group=(1, 2, 3), default=[]):
             group_int = int(group)
             if group_int > video_id_int:
-                video_url = video_url.replace(
-                    group, compat_str(group_int // video_id_int))
+                video_url = video_url.replace(group, compat_str(group_int // video_id_int))
 
         return {
             'id': video_id,
diff --git a/hypervideo_dl/extractor/n1.py b/hypervideo_dl/extractor/n1.py
index 7a09c67..fdb7f32 100644
--- a/hypervideo_dl/extractor/n1.py
+++ b/hypervideo_dl/extractor/n1.py
@@ -3,8 +3,6 @@ from __future__ import unicode_literals
 
 import re
 
-from .youtube import YoutubeIE
-from .reddit import RedditRIE
 from .common import InfoExtractor
 from ..utils import (
     unified_timestamp,
@@ -40,7 +38,7 @@ class N1InfoAssetIE(InfoExtractor):
 
 class N1InfoIIE(InfoExtractor):
     IE_NAME = 'N1Info:article'
-    _VALID_URL = r'https?://(?:(?:ba|rs|hr)\.)?n1info\.(?:com|si)/(?:[^/]+/){1,2}(?P<id>[^/]+)'
+    _VALID_URL = r'https?://(?:(?:(?:ba|rs|hr)\.)?n1info\.(?:com|si)|nova\.rs)/(?:[^/]+/){1,2}(?P<id>[^/]+)'
     _TESTS = [{
         # Youtube embedded
         'url': 'https://rs.n1info.com/sport-klub/tenis/kako-je-djokovic-propustio-istorijsku-priliku-video/',
@@ -90,10 +88,18 @@ class N1InfoIIE(InfoExtractor):
             'uploader': 'YouLotWhatDontStop',
         },
         'params': {
-            'format': 'bestvideo',
             'skip_download': True,
         },
    }, {
+        'url': 'https://nova.rs/vesti/politika/zaklina-tatalovic-ani-brnabic-pricate-lazi-video/',
+        'info_dict': {
+            'id': 'tnjganabrnabicizaklinatatalovic100danavladegp-novas-worldwide',
+            'ext': 'mp4',
+            'title': 'Žaklina Tatalović Ani Brnabić: Pričate laži (VIDEO)',
+            'upload_date': '20211102',
+            'timestamp': 1635861677,
+        },
+    }, {
         'url': 'https://hr.n1info.com/vijesti/pravobraniteljica-o-ubojstvu-u-zagrebu-radi-se-o-doista-nezapamcenoj-situaciji/',
         'only_matching': True,
     }]
@@ -116,16 +122,16 @@ class N1InfoIIE(InfoExtractor):
                 'title': title,
                 'thumbnail': video_data.get('data-thumbnail'),
                 'timestamp': timestamp,
-                'ie_key': N1InfoAssetIE.ie_key()})
+                'ie_key': 'N1InfoAsset'})
 
         embedded_videos = re.findall(r'(<iframe[^>]+>)', webpage)
         for embedded_video in embedded_videos:
             video_data = extract_attributes(embedded_video)
-            url = video_data.get('src')
+            url = video_data.get('src') or ''
             if url.startswith('https://www.youtube.com'):
-                entries.append(self.url_result(url, ie=YoutubeIE.ie_key()))
+                entries.append(self.url_result(url, ie='Youtube'))
             elif url.startswith('https://www.redditmedia.com'):
-                entries.append(self.url_result(url, ie=RedditRIE.ie_key()))
+                entries.append(self.url_result(url, ie='RedditR'))
 
         return {
             '_type': 'playlist',
diff --git a/hypervideo_dl/extractor/nate.py b/hypervideo_dl/extractor/nate.py
new file mode 100644
index 0000000..072faf6
--- /dev/null
+++ b/hypervideo_dl/extractor/nate.py
@@ -0,0 +1,124 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import itertools
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    str_or_none,
+    traverse_obj,
+    unified_strdate,
+)
+
+
+class NateIE(InfoExtractor):
+    _VALID_URL = r'https?://tv\.nate\.com/clip/(?P<id>[0-9]+)'
+
+    _TESTS = [{
+        'url': 'https://tv.nate.com/clip/1848976',
+        'info_dict': {
+            'id': '1848976',
+            'ext': 'mp4',
+            'title': '[결승 오프닝 타이틀] 2018 LCK 서머 스플릿 결승전 kt Rolster VS Griffin',
+            'description': 'md5:e1b79a7dcf0d8d586443f11366f50e6f',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'upload_date': '20180908',
+            'age_limit': 15,
+            'duration': 73,
+            'uploader': '2018 LCK 서머 스플릿(롤챔스)',
+            'channel': '2018 LCK 서머 스플릿(롤챔스)',
+            'channel_id': '3606',
+            'uploader_id': '3606',
+            'tags': 'count:59',
+        },
+        'params': {'skip_download': True}
+    }, {
+        'url': 'https://tv.nate.com/clip/4300566',
+        'info_dict': {
+            'id': '4300566',
+            'ext': 'mp4',
+            'title': '[심쿵엔딩] 이준호x이세영, 서로를 기억하며 끌어안는 두 사람!💕, MBC 211204 방송',
+            'description': 'md5:be1653502d9c13ce344ddf7828e089fa',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'upload_date': '20211204',
+            'age_limit': 15,
+            'duration': 201,
+            'uploader': '옷소매 붉은 끝동',
+            'channel': '옷소매 붉은 끝동',
+            'channel_id': '27987',
+            'uploader_id': '27987',
+            'tags': 'count:20',
+        },
+        'params': {'skip_download': True}
+    }]
+
+    _QUALITY = {
+        '36': 2160,
+        '35': 1080,
+        '34': 720,
+        '33': 480,
+        '32': 360,
+        '31': 270,
+    }
+
+    def _real_extract(self, url):
+        id = self._match_id(url)
+        video_data = self._download_json(f'https://tv.nate.com/api/v1/clip/{id}', id)
+        formats = [{
+            'format_id': f_url[-2:],
+            'url': f_url,
+            'height': self._QUALITY.get(f_url[-2:]),
+            'quality': int_or_none(f_url[-2:]),
+        } for f_url in video_data.get('smcUriList') or []]
+        self._sort_formats(formats)
+        return {
+            'id': id,
+            'title': video_data.get('clipTitle'),
+            'description': video_data.get('synopsis'),
+            'thumbnail': video_data.get('contentImg'),
+            'upload_date': unified_strdate(traverse_obj(video_data, 'broadDate', 'regDate')),
+            'age_limit': video_data.get('targetAge'),
+            'duration': video_data.get('playTime'),
+            'formats': formats,
+            'uploader': video_data.get('programTitle'),
+            'channel': video_data.get('programTitle'),
+            'channel_id': str_or_none(video_data.get('programSeq')),
+            'uploader_id': str_or_none(video_data.get('programSeq')),
+            'tags': video_data['hashTag'].split(',') if video_data.get('hashTag') else None,
+        }
+
+
+class NateProgramIE(InfoExtractor):
+    _VALID_URL = r'https?://tv\.nate\.com/program/clips/(?P<id>[0-9]+)'
+
+    _TESTS = [{
+        'url': 'https://tv.nate.com/program/clips/27987',
+        'playlist_mincount': 191,
+        'info_dict': {
+            'id': '27987',
+        },
+    }, {
+        'url': 'https://tv.nate.com/program/clips/3606',
+        'playlist_mincount': 15,
+        'info_dict': {
+            'id': '3606',
+        },
+    }]
+
+    def _entries(self, id):
+        for page_num in itertools.count(1):
+            program_data = self._download_json(f'https://tv.nate.com/api/v1/program/{id}/clip/ranking?size=20&page={page_num}',
+                                               id, note=f'Downloading page {page_num}')
+            for clip in program_data.get('content') or []:
+                clip_id = clip.get('clipSeq')
+                if clip_id:
+                    yield self.url_result(
+                        'https://tv.nate.com/clip/%s' % clip_id,
+                        ie=NateIE.ie_key(), video_id=clip_id)
+            if program_data.get('last'):
+                break
+
+    def _real_extract(self, url):
+        id = self._match_id(url)
+        return self.playlist_result(self._entries(id), playlist_id=id)
diff --git a/hypervideo_dl/extractor/naver.py b/hypervideo_dl/extractor/naver.py
index acf53c1..a6821ba 100644
--- a/hypervideo_dl/extractor/naver.py
+++ b/hypervideo_dl/extractor/naver.py
@@ -40,6 +40,7 @@ class NaverBaseIE(InfoExtractor):
             formats.append({
                 'format_id': '%s_%s' % (stream.get('type') or stream_type, dict_get(encoding_option, ('name', 'id'))),
                 'url': stream_url,
+                'ext': 'mp4',
                 'width': int_or_none(encoding_option.get('width')),
                 'height': int_or_none(encoding_option.get('height')),
                 'vbr': int_or_none(bitrate.get('video')),
@@ -174,7 +175,7 @@ class NaverLiveIE(InfoExtractor):
         'url': 'https://tv.naver.com/l/52010',
         'info_dict': {
             'id': '52010',
-            'ext': 'm3u8',
+            'ext': 'mp4',
             'title': '[LIVE] 뉴스특보 : "수도권 거리두기, 2주간 2단계로 조정"',
             'description': 'md5:df7f0c237a5ed5e786ce5c91efbeaab3',
             'channel_id': 'NTV-ytnnews24-0',
@@ -184,7 +185,7 @@ class NaverLiveIE(InfoExtractor):
         'url': 'https://tv.naver.com/l/51549',
         'info_dict': {
             'id': '51549',
-            'ext': 'm3u8',
+            'ext': 'mp4',
             'title': '연합뉴스TV - 코로나19 뉴스특보',
             'description': 'md5:c655e82091bc21e413f549c0eaccc481',
             'channel_id': 'NTV-yonhapnewstv-0',
@@ -233,7 +234,7 @@ class NaverLiveIE(InfoExtractor):
                 continue
 
             formats.extend(self._extract_m3u8_formats(
-                quality.get('url'), video_id, 'm3u8',
+                quality.get('url'), video_id, 'mp4',
                 m3u8_id=quality.get('qualityId'), live=True
             ))
         self._sort_formats(formats)
diff --git a/hypervideo_dl/extractor/nba.py b/hypervideo_dl/extractor/nba.py
index 7390ef8..359cc52 100644
--- a/hypervideo_dl/extractor/nba.py
+++ b/hypervideo_dl/extractor/nba.py
@@ -165,14 +165,10 @@ class NBAWatchIE(NBAWatchBaseIE):
     def _real_extract(self, url):
         display_id = self._match_id(url)
         collection_id = parse_qs(url).get('collection', [None])[0]
-        if collection_id:
-            if self.get_param('noplaylist'):
-                self.to_screen('Downloading just video %s because of --no-playlist' % display_id)
-            else:
-                self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % collection_id)
-                return self.url_result(
-                    'https://www.nba.com/watch/list/collection/' + collection_id,
-                    NBAWatchCollectionIE.ie_key(), collection_id)
+        if self._yes_playlist(collection_id, display_id):
+            return self.url_result(
+                'https://www.nba.com/watch/list/collection/' + collection_id,
+                NBAWatchCollectionIE.ie_key(), collection_id)
         return self._extract_video('seoName', display_id)
diff --git a/hypervideo_dl/extractor/nbc.py b/hypervideo_dl/extractor/nbc.py
index f304f19..1094034 100644
--- a/hypervideo_dl/extractor/nbc.py
+++ b/hypervideo_dl/extractor/nbc.py
@@ -197,9 +197,12 @@ class NBCSportsVPlayerIE(InfoExtractor):
             'timestamp': 1426270238,
             'upload_date': '20150313',
             'uploader': 'NBCU-SPORTS',
+            'duration': 72.818,
+            'chapters': [],
+            'thumbnail': r're:^https?://.*\.jpg$'
         }
     }, {
-        'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/media/_hqLjQ95yx8Z',
+        'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/media/PEgOtlNcC_y2',
        'only_matching': True,
    }, {
        'url': 'https://www.nbcsports.com/vplayer/p/BxmELC/nbcsports/select/PHJSaFWbrTY9?form=html&autoPlay=true',
@@ -208,16 +211,15 @@ class NBCSportsVPlayerIE(InfoExtractor):
 
     @staticmethod
     def _extract_url(webpage):
-        iframe_m = re.search(
-            r'<(?:iframe[^>]+|div[^>]+data-(?:mpx-)?)src="(?P<url>%s[^"]+)"' % NBCSportsVPlayerIE._VALID_URL_BASE, webpage)
-        if iframe_m:
-            return iframe_m.group('url')
+        video_urls = re.search(
+            r'(?:iframe[^>]+|var video|div[^>]+data-(?:mpx-)?)[sS]rc\s?=\s?"(?P<url>%s[^\"]+)' % NBCSportsVPlayerIE._VALID_URL_BASE, webpage)
+        if video_urls:
+            return video_urls.group('url')
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
-        theplatform_url = self._og_search_video_url(webpage).replace(
-            'vplayer.nbcsports.com', 'player.theplatform.com')
+        theplatform_url = self._html_search_regex(r'tp:releaseUrl="(.+?)"', webpage, 'url')
         return self.url_result(theplatform_url, 'ThePlatform')
 
@@ -235,6 +237,9 @@ class NBCSportsIE(InfoExtractor):
             'uploader': 'NBCU-SPORTS',
             'upload_date': '20150330',
             'timestamp': 1427726529,
+            'chapters': [],
+            'thumbnail': 'https://hdliveextra-a.akamaihd.net/HD/image_sports/NBCU_Sports_Group_-_nbcsports/253/303/izzodps.jpg',
+            'duration': 528.395,
         }
     }, {
         # data-mpx-src
@@ -305,7 +310,7 @@ class NBCSportsStreamIE(AdobePassIE):
         self._sort_formats(formats)
         return {
             'id': video_id,
-            'title': self._live_title(title) if is_live else title,
+            'title': title,
             'description': live_source.get('description'),
             'formats': formats,
             'is_live': is_live,
@@ -403,9 +408,7 @@ class NBCNewsIE(ThePlatformIE):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
 
-        data = self._parse_json(self._search_regex(
-            r'<script[^>]+id="__NEXT_DATA__"[^>]*>({.+?})</script>',
-            webpage, 'bootstrap json'), video_id)['props']['initialState']
+        data = self._search_nextjs_data(webpage, video_id)['props']['initialState']
         video_data = try_get(data, lambda x: x['video']['current'], dict)
         if not video_data:
             video_data = data['article']['content'][0]['primaryMedia']['video']
@@ -545,8 +548,6 @@ class NBCOlympicsStreamIE(AdobePassIE):
         title = event_config['eventTitle']
         is_live = {'live': True, 'replay': False}.get(event_config.get('eventStatus'))
-        if is_live:
-            title = self._live_title(title)
 
         source_url = self._download_json(
             f'https://api-leap.nbcsports.com/feeds/assets/{pid}?application=NBCOlympics&platform=desktop&format=nbc-player&env=staging',
diff --git a/hypervideo_dl/extractor/ndr.py b/hypervideo_dl/extractor/ndr.py
index f2bae2c..1917254 100644
--- a/hypervideo_dl/extractor/ndr.py
+++ b/hypervideo_dl/extractor/ndr.py
@@ -245,8 +245,6 @@ class NDREmbedBaseIE(InfoExtractor):
         live = playlist.get('config', {}).get('streamType') in ['httpVideoLive', 'httpAudioLive']
 
         title = config['title']
-        if live:
-            title = self._live_title(title)
         uploader = ppjson.get('config', {}).get('branding')
         upload_date = ppjson.get('config', {}).get('publicationDate')
         duration = int_or_none(config.get('duration'))
diff --git a/hypervideo_dl/extractor/nebula.py b/hypervideo_dl/extractor/nebula.py
index 9698a35..77f2535 100644
--- a/hypervideo_dl/extractor/nebula.py
+++ b/hypervideo_dl/extractor/nebula.py
@@ -1,22 +1,161 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import itertools
 import json
 import time
+import urllib
 
-from urllib.error import HTTPError
-from .common import InfoExtractor
-from ..compat import compat_str, compat_urllib_parse_unquote, compat_urllib_parse_quote
 from ..utils import (
     ExtractorError,
     parse_iso8601,
     try_get,
-    urljoin,
 )
+from .common import InfoExtractor
+
+
+class NebulaBaseIE(InfoExtractor):
+    _NETRC_MACHINE = 'watchnebula'
+
+    _nebula_api_token = None
+    _nebula_bearer_token = None
+    _zype_access_token = None
+
+    def _perform_nebula_auth(self):
+        username, password = self._get_login_info()
+        if not (username and password):
+            self.raise_login_required()
+
+        data = json.dumps({'email': username, 'password': password}).encode('utf8')
+        response = self._download_json(
+            'https://api.watchnebula.com/api/v1/auth/login/',
+            data=data, fatal=False, video_id=None,
+            headers={
+                'content-type': 'application/json',
+                # Submitting the 'sessionid' cookie always causes a 403 on auth endpoint
+                'cookie': ''
+            },
+            note='Logging in to Nebula with supplied credentials',
+            errnote='Authentication failed or rejected')
+        if not response or not response.get('key'):
+            self.raise_login_required()
+
+        # save nebula token as cookie
+        self._set_cookie(
+            'nebula.app', 'nebula-auth',
+            urllib.parse.quote(
+                json.dumps({
+                    "apiToken": response["key"],
+                    "isLoggingIn": False,
+                    "isLoggingOut": False,
+                }, separators=(",", ":"))),
+            expire_time=int(time.time()) + 86400 * 365,
+        )
+
+        return response['key']
+
+    def _retrieve_nebula_api_token(self):
+        """
+        Check cookie jar for valid token. Try to authenticate using credentials if no valid token
+        can be found in the cookie jar.
+ """ + nebula_cookies = self._get_cookies('https://nebula.app') + nebula_cookie = nebula_cookies.get('nebula-auth') + if nebula_cookie: + self.to_screen('Authenticating to Nebula with token from cookie jar') + nebula_cookie_value = urllib.parse.unquote(nebula_cookie.value) + nebula_api_token = self._parse_json(nebula_cookie_value, None).get('apiToken') + if nebula_api_token: + return nebula_api_token + + return self._perform_nebula_auth() + + def _call_nebula_api(self, url, video_id=None, method='GET', auth_type='api', note=''): + assert method in ('GET', 'POST',) + assert auth_type in ('api', 'bearer',) + + def inner_call(): + authorization = f'Token {self._nebula_api_token}' if auth_type == 'api' else f'Bearer {self._nebula_bearer_token}' + return self._download_json( + url, video_id, note=note, headers={'Authorization': authorization}, + data=b'' if method == 'POST' else None) + + try: + return inner_call() + except ExtractorError as exc: + # if 401 or 403, attempt credential re-auth and retry + if exc.cause and isinstance(exc.cause, urllib.error.HTTPError) and exc.cause.code in (401, 403): + self.to_screen(f'Reauthenticating to Nebula and retrying, because last {auth_type} call resulted in error {exc.cause.code}') + self._perform_login() + return inner_call() + else: + raise + + def _fetch_nebula_bearer_token(self): + """ + Get a Bearer token for the Nebula API. This will be required to fetch video meta data. + """ + response = self._call_nebula_api('https://api.watchnebula.com/api/v1/authorization/', + method='POST', + note='Authorizing to Nebula') + return response['token'] + + def _fetch_zype_access_token(self): + """ + Get a Zype access token, which is required to access video streams -- in our case: to + generate video URLs. + """ + user_object = self._call_nebula_api('https://api.watchnebula.com/api/v1/auth/user/', note='Retrieving Zype access token') + + access_token = try_get(user_object, lambda x: x['zype_auth_info']['access_token'], str) + if not access_token: + if try_get(user_object, lambda x: x['is_subscribed'], bool): + # TODO: Reimplement the same Zype token polling the Nebula frontend implements + # see https://github.com/ytdl-org/youtube-dl/pull/24805#issuecomment-749231532 + raise ExtractorError( + 'Unable to extract Zype access token from Nebula API authentication endpoint. 
' + 'Open an arbitrary video in a browser with this account to generate a token', + expected=True) + raise ExtractorError('Unable to extract Zype access token from Nebula API authentication endpoint') + return access_token + + def _build_video_info(self, episode): + zype_id = episode['zype_id'] + zype_video_url = f'https://player.zype.com/embed/{zype_id}.html?access_token={self._zype_access_token}' + channel_slug = episode['channel_slug'] + return { + 'id': episode['zype_id'], + 'display_id': episode['slug'], + '_type': 'url_transparent', + 'ie_key': 'Zype', + 'url': zype_video_url, + 'title': episode['title'], + 'description': episode['description'], + 'timestamp': parse_iso8601(episode['published_at']), + 'thumbnails': [{ + # 'id': tn.get('name'), # this appears to be null + 'url': tn['original'], + 'height': key, + } for key, tn in episode['assets']['thumbnail'].items()], + 'duration': episode['duration'], + 'channel': episode['channel_title'], + 'channel_id': channel_slug, + 'channel_url': f'https://nebula.app/{channel_slug}', + 'uploader': episode['channel_title'], + 'uploader_id': channel_slug, + 'uploader_url': f'https://nebula.app/{channel_slug}', + 'series': episode['channel_title'], + 'creator': episode['channel_title'], + } + def _perform_login(self, username=None, password=None): + # FIXME: username should be passed from here to inner functions + self._nebula_api_token = self._retrieve_nebula_api_token() + self._nebula_bearer_token = self._fetch_nebula_bearer_token() + self._zype_access_token = self._fetch_zype_access_token() -class NebulaIE(InfoExtractor): +class NebulaIE(NebulaBaseIE): _VALID_URL = r'https?://(?:www\.)?(?:watchnebula\.com|nebula\.app)/videos/(?P<id>[-\w]+)' _TESTS = [ { @@ -30,12 +169,13 @@ class NebulaIE(InfoExtractor): 'upload_date': '20180731', 'timestamp': 1533009600, 'channel': 'Lindsay Ellis', + 'channel_id': 'lindsayellis', 'uploader': 'Lindsay Ellis', + 'uploader_id': 'lindsayellis', }, 'params': { 'usenetrc': True, }, - 'skip': 'All Nebula content requires authentication', }, { 'url': 'https://nebula.app/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore', @@ -47,13 +187,14 @@ class NebulaIE(InfoExtractor): 'description': r're:^In this episode we explore the unsung heroes of D-Day, the landing craft.', 'upload_date': '20200327', 'timestamp': 1585348140, - 'channel': 'The Logistics of D-Day', - 'uploader': 'The Logistics of D-Day', + 'channel': 'Real Engineering', + 'channel_id': 'realengineering', + 'uploader': 'Real Engineering', + 'uploader_id': 'realengineering', }, 'params': { 'usenetrc': True, }, - 'skip': 'All Nebula content requires authentication', }, { 'url': 'https://nebula.app/videos/money-episode-1-the-draw', @@ -66,173 +207,82 @@ class NebulaIE(InfoExtractor): 'upload_date': '20200323', 'timestamp': 1584980400, 'channel': 'Tom Scott Presents: Money', + 'channel_id': 'tom-scott-presents-money', 'uploader': 'Tom Scott Presents: Money', + 'uploader_id': 'tom-scott-presents-money', }, 'params': { 'usenetrc': True, }, - 'skip': 'All Nebula content requires authentication', }, { 'url': 'https://watchnebula.com/videos/money-episode-1-the-draw', 'only_matching': True, }, ] - _NETRC_MACHINE = 'watchnebula' - _nebula_token = None + def _fetch_video_metadata(self, slug): + return self._call_nebula_api(f'https://content.watchnebula.com/video/{slug}/', + video_id=slug, + auth_type='bearer', + note='Fetching video meta data') - def _retrieve_nebula_auth(self): - """ - Log in to Nebula, and returns a Nebula API token - """ + def 
_real_extract(self, url): + slug = self._match_id(url) + video = self._fetch_video_metadata(slug) + return self._build_video_info(video) - username, password = self._get_login_info() - if not (username and password): - self.raise_login_required() - self.report_login() - data = json.dumps({'email': username, 'password': password}).encode('utf8') - response = self._download_json( - 'https://api.watchnebula.com/api/v1/auth/login/', - data=data, fatal=False, video_id=None, - headers={ - 'content-type': 'application/json', - # Submitting the 'sessionid' cookie always causes a 403 on auth endpoint - 'cookie': '' +class NebulaCollectionIE(NebulaBaseIE): + IE_NAME = 'nebula:collection' + _VALID_URL = r'https?://(?:www\.)?(?:watchnebula\.com|nebula\.app)/(?!videos/)(?P<id>[-\w]+)' + _TESTS = [ + { + 'url': 'https://nebula.app/tom-scott-presents-money', + 'info_dict': { + 'id': 'tom-scott-presents-money', + 'title': 'Tom Scott Presents: Money', + 'description': 'Tom Scott hosts a series all about trust, negotiation and money.', }, - note='Authenticating to Nebula with supplied credentials', - errnote='Authentication failed or rejected') - if not response or not response.get('key'): - self.raise_login_required() - - # save nebula token as cookie - self._set_cookie( - 'nebula.app', 'nebula-auth', - compat_urllib_parse_quote( - json.dumps({ - "apiToken": response["key"], - "isLoggingIn": False, - "isLoggingOut": False, - }, separators=(",", ":"))), - expire_time=int(time.time()) + 86400 * 365, - ) - - return response['key'] - - def _retrieve_zype_api_key(self, page_url, display_id): - """ - Retrieves the Zype API key - """ - - # Find the js that has the API key from the webpage and download it - webpage = self._download_webpage(page_url, video_id=display_id) - main_script_relpath = self._search_regex( - r'<script[^>]*src="(?P<script_relpath>[^"]*main.[0-9a-f]*.chunk.js)"[^>]*>', webpage, - group='script_relpath', name='script relative path', fatal=True) - main_script_abspath = urljoin(page_url, main_script_relpath) - main_script = self._download_webpage(main_script_abspath, video_id=display_id, - note='Retrieving Zype API key') - - api_key = self._search_regex( - r'REACT_APP_ZYPE_API_KEY\s*:\s*"(?P<api_key>[\w-]*)"', main_script, - group='api_key', name='API key', fatal=True) - - return api_key - - def _call_zype_api(self, path, params, video_id, api_key, note): - """ - A helper for making calls to the Zype API. - """ - query = {'api_key': api_key, 'per_page': 1} - query.update(params) - return self._download_json('https://api.zype.com' + path, video_id, query=query, note=note) - - def _call_nebula_api(self, path, video_id, access_token, note): - """ - A helper for making calls to the Nebula API. 
- """ - return self._download_json('https://api.watchnebula.com/api/v1' + path, video_id, headers={ - 'Authorization': 'Token {access_token}'.format(access_token=access_token) - }, note=note) - - def _fetch_zype_access_token(self, video_id): - try: - user_object = self._call_nebula_api('/auth/user/', video_id, self._nebula_token, note='Retrieving Zype access token') - except ExtractorError as exc: - # if 401, attempt credential auth and retry - if exc.cause and isinstance(exc.cause, HTTPError) and exc.cause.code == 401: - self._nebula_token = self._retrieve_nebula_auth() - user_object = self._call_nebula_api('/auth/user/', video_id, self._nebula_token, note='Retrieving Zype access token') - else: - raise - - access_token = try_get(user_object, lambda x: x['zype_auth_info']['access_token'], compat_str) - if not access_token: - if try_get(user_object, lambda x: x['is_subscribed'], bool): - # TODO: Reimplement the same Zype token polling the Nebula frontend implements - # see https://github.com/ytdl-org/youtube-dl/pull/24805#issuecomment-749231532 - raise ExtractorError( - 'Unable to extract Zype access token from Nebula API authentication endpoint. ' - 'Open an arbitrary video in a browser with this account to generate a token', - expected=True) - raise ExtractorError('Unable to extract Zype access token from Nebula API authentication endpoint') - return access_token - - def _extract_channel_title(self, video_meta): - # TODO: Implement the API calls giving us the channel list, - # so that we can do the title lookup and then figure out the channel URL - categories = video_meta.get('categories', []) if video_meta else [] - # the channel name is the value of the first category - for category in categories: - if category.get('value'): - return category['value'][0] - - def _real_initialize(self): - # check cookie jar for valid token - nebula_cookies = self._get_cookies('https://nebula.app') - nebula_cookie = nebula_cookies.get('nebula-auth') - if nebula_cookie: - self.to_screen('Authenticating to Nebula with token from cookie jar') - nebula_cookie_value = compat_urllib_parse_unquote(nebula_cookie.value) - self._nebula_token = self._parse_json(nebula_cookie_value, None).get('apiToken') + 'playlist_count': 5, + 'params': { + 'usenetrc': True, + }, + }, { + 'url': 'https://nebula.app/lindsayellis', + 'info_dict': { + 'id': 'lindsayellis', + 'title': 'Lindsay Ellis', + 'description': 'Enjoy these hottest of takes on Disney, Transformers, and Musicals.', + }, + 'playlist_mincount': 100, + 'params': { + 'usenetrc': True, + }, + }, + ] - # try to authenticate using credentials if no valid token has been found - if not self._nebula_token: - self._nebula_token = self._retrieve_nebula_auth() + def _generate_playlist_entries(self, collection_id, channel): + episodes = channel['episodes']['results'] + for page_num in itertools.count(2): + for episode in episodes: + yield self._build_video_info(episode) + next_url = channel['episodes']['next'] + if not next_url: + break + channel = self._call_nebula_api(next_url, collection_id, auth_type='bearer', + note=f'Retrieving channel page {page_num}') + episodes = channel['episodes']['results'] def _real_extract(self, url): - display_id = self._match_id(url) - api_key = self._retrieve_zype_api_key(url, display_id) - - response = self._call_zype_api('/videos', {'friendly_title': display_id}, - display_id, api_key, note='Retrieving metadata from Zype') - if len(response.get('response') or []) != 1: - raise ExtractorError('Unable to find video on Zype API') - video_meta 
= response['response'][0] - - video_id = video_meta['_id'] - zype_access_token = self._fetch_zype_access_token(display_id) + collection_id = self._match_id(url) + channel_url = f'https://content.watchnebula.com/video/channels/{collection_id}/' + channel = self._call_nebula_api(channel_url, collection_id, auth_type='bearer', note='Retrieving channel') + channel_details = channel['details'] - channel_title = self._extract_channel_title(video_meta) - - return { - 'id': video_id, - 'display_id': display_id, - '_type': 'url_transparent', - 'ie_key': 'Zype', - 'url': 'https://player.zype.com/embed/%s.html?access_token=%s' % (video_id, zype_access_token), - 'title': video_meta.get('title'), - 'description': video_meta.get('description'), - 'timestamp': parse_iso8601(video_meta.get('published_at')), - 'thumbnails': [{ - 'id': tn.get('name'), # this appears to be null - 'url': tn['url'], - 'width': tn.get('width'), - 'height': tn.get('height'), - } for tn in video_meta.get('thumbnails', [])], - 'duration': video_meta.get('duration'), - 'channel': channel_title, - 'uploader': channel_title, # we chose uploader = channel name - # TODO: uploader_url, channel_id, channel_url - } + return self.playlist_result( + entries=self._generate_playlist_entries(collection_id, channel), + playlist_id=collection_id, + playlist_title=channel_details['title'], + playlist_description=channel_details['description'] + ) diff --git a/hypervideo_dl/extractor/neteasemusic.py b/hypervideo_dl/extractor/neteasemusic.py index 7652371..57b4774 100644 --- a/hypervideo_dl/extractor/neteasemusic.py +++ b/hypervideo_dl/extractor/neteasemusic.py @@ -405,17 +405,12 @@ class NetEaseMusicProgramIE(NetEaseMusicBaseIE): name = info['name'] description = info['description'] - if not info['songs'] or self.get_param('noplaylist'): - if info['songs']: - self.to_screen( - 'Downloading just the main audio %s because of --no-playlist' - % info['mainSong']['id']) - + if not self._yes_playlist(info['songs'] and program_id, info['mainSong']['id']): formats = self.extract_formats(info['mainSong']) self._sort_formats(formats) return { - 'id': program_id, + 'id': info['mainSong']['id'], 'title': name, 'description': description, 'creator': info['dj']['brand'], @@ -425,10 +420,6 @@ class NetEaseMusicProgramIE(NetEaseMusicBaseIE): 'formats': formats, } - self.to_screen( - 'Downloading playlist %s - add --no-playlist to just download the main audio %s' - % (program_id, info['mainSong']['id'])) - song_ids = [info['mainSong']['id']] song_ids.extend([song['id'] for song in info['songs']]) entries = [ diff --git a/hypervideo_dl/extractor/newgrounds.py b/hypervideo_dl/extractor/newgrounds.py index bbbd9e8..6525a6d 100644 --- a/hypervideo_dl/extractor/newgrounds.py +++ b/hypervideo_dl/extractor/newgrounds.py @@ -6,7 +6,9 @@ import re from .common import InfoExtractor from ..utils import ( + clean_html, extract_attributes, + get_element_by_id, int_or_none, parse_count, parse_duration, @@ -29,7 +31,8 @@ class NewgroundsIE(InfoExtractor): 'timestamp': 1378878540, 'upload_date': '20130911', 'duration': 143, - 'description': 'md5:6d885138814015dfd656c2ddb00dacfc', + 'view_count': int, + 'description': 'md5:b8b3c2958875189f07d8e313462e8c4f', }, }, { 'url': 'https://www.newgrounds.com/portal/view/1', @@ -41,6 +44,7 @@ class NewgroundsIE(InfoExtractor): 'uploader': 'Brian-Beaton', 'timestamp': 955064100, 'upload_date': '20000406', + 'view_count': int, 'description': 'Scrotum plays "catch."', 'age_limit': 17, }, @@ -54,7 +58,8 @@ class NewgroundsIE(InfoExtractor): 
'uploader': 'ZONE-SAMA', 'timestamp': 1487965140, 'upload_date': '20170224', - 'description': 'ZTV News Episode 8 (February 2017)', + 'view_count': int, + 'description': 'md5:aff9b330ec2e78ed93b1ad6d017accc6', 'age_limit': 17, }, 'params': { @@ -70,7 +75,8 @@ class NewgroundsIE(InfoExtractor): 'uploader': 'Egoraptor', 'timestamp': 1140663240, 'upload_date': '20060223', - 'description': 'Metal Gear is awesome is so is this movie.', + 'view_count': int, + 'description': 'md5:9246c181614e23754571995104da92e0', 'age_limit': 13, } }, { @@ -80,7 +86,7 @@ class NewgroundsIE(InfoExtractor): 'id': '297383', 'ext': 'swf', 'title': 'Metal Gear Awesome', - 'description': 'Metal Gear is awesome is so is this movie.', + 'description': 'Metal Gear Awesome', 'uploader': 'Egoraptor', 'upload_date': '20060223', 'timestamp': 1140663240, @@ -100,8 +106,7 @@ class NewgroundsIE(InfoExtractor): uploader = None webpage = self._download_webpage(url, media_id) - title = self._html_search_regex( - r'<title>(.+?)</title>', webpage, 'title') + title = self._html_extract_title(webpage) media_url_string = self._search_regex( r'"url"\s*:\s*("[^"]+"),', webpage, 'media url', default=None) @@ -145,10 +150,13 @@ class NewgroundsIE(InfoExtractor): (r'<dt>\s*Uploaded\s*</dt>\s*<dd>([^<]+</dd>\s*<dd>[^<]+)', r'<dt>\s*Uploaded\s*</dt>\s*<dd>([^<]+)'), webpage, 'timestamp', default=None)) + duration = parse_duration(self._html_search_regex( r'"duration"\s*:\s*["\']?(\d+)["\']?', webpage, 'duration', default=None)) + description = clean_html(get_element_by_id('author_comments', webpage)) or self._og_search_description(webpage) + view_count = parse_count(self._html_search_regex( r'(?s)<dt>\s*(?:Views|Listens)\s*</dt>\s*<dd>([\d\.,]+)</dd>', webpage, 'view count', default=None)) @@ -177,7 +185,7 @@ class NewgroundsIE(InfoExtractor): 'duration': duration, 'formats': formats, 'thumbnail': self._og_search_thumbnail(webpage), - 'description': self._og_search_description(webpage), + 'description': description, 'age_limit': age_limit, 'view_count': view_count, } @@ -210,8 +218,7 @@ class NewgroundsPlaylistIE(InfoExtractor): webpage = self._download_webpage(url, playlist_id) - title = self._search_regex( - r'<title>([^>]+)</title>', webpage, 'title', default=None) + title = self._html_extract_title(webpage, default=None) # cut left menu webpage = self._search_regex( diff --git a/hypervideo_dl/extractor/newstube.py b/hypervideo_dl/extractor/newstube.py index dab4aec..479141a 100644 --- a/hypervideo_dl/extractor/newstube.py +++ b/hypervideo_dl/extractor/newstube.py @@ -5,11 +5,9 @@ import base64 import hashlib from .common import InfoExtractor -from ..aes import aes_cbc_decrypt +from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7 from ..utils import ( - bytes_to_intlist, int_or_none, - intlist_to_bytes, parse_codecs, parse_duration, ) @@ -47,10 +45,8 @@ class NewstubeIE(InfoExtractor): })) key = hashlib.pbkdf2_hmac( 'sha1', video_guid.replace('-', '').encode(), enc_data[:16], 1)[:16] - dec_data = aes_cbc_decrypt( - bytes_to_intlist(enc_data[32:]), bytes_to_intlist(key), - bytes_to_intlist(enc_data[16:32])) - sources = self._parse_json(intlist_to_bytes(dec_data[:-dec_data[-1]]), video_guid) + dec_data = unpad_pkcs7(aes_cbc_decrypt_bytes(enc_data[32:], key, enc_data[16:32])) + sources = self._parse_json(dec_data, video_guid) formats = [] for source in sources: diff --git a/hypervideo_dl/extractor/newsy.py b/hypervideo_dl/extractor/newsy.py new file mode 100644 index 0000000..cf31641 --- /dev/null +++ b/hypervideo_dl/extractor/newsy.py @@ 
-0,0 +1,51 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + js_to_json, + merge_dicts, +) + + +class NewsyIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?newsy\.com/stories/(?P<id>[^/?#$&]+)' + + _TESTS = [{ + 'url': 'https://www.newsy.com/stories/nft-trend-leads-to-fraudulent-art-auctions/', + 'info_dict': { + 'id': '609d65125b086c24fb529312', + 'ext': 'mp4', + 'title': 'NFT Art Auctions Have A Piracy Problem', + 'description': 'md5:971e52ab8bc97e50305475cde8284c83', + 'display_id': 'nft-trend-leads-to-fraudulent-art-auctions', + 'timestamp': 1621339200, + 'duration': 339630, + 'thumbnail': 'https://cdn.newsy.com/images/videos/x/1620927824_xyrrP4.jpg', + 'upload_date': '20210518' + }, + 'params': {'skip_download': True} + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + data_json = self._parse_json(self._html_search_regex( + r'data-video-player\s?=\s?"({[^"]+})">', webpage, 'data'), display_id, js_to_json) + ld_json = self._search_json_ld(webpage, display_id, fatal=False) + + formats, subtitles = [], {} + if data_json.get('stream'): + fmts, subs = self._extract_m3u8_formats_and_subtitles(data_json['stream'], display_id) + formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) + self._sort_formats(formats) + return merge_dicts(ld_json, { + 'id': data_json['id'], + 'display_id': display_id, + 'title': data_json.get('headline'), + 'duration': data_json.get('duration'), + 'thumbnail': data_json.get('image'), + 'formats': formats, + 'subtitles': subtitles, + }) diff --git a/hypervideo_dl/extractor/nexx.py b/hypervideo_dl/extractor/nexx.py index 860d636..a521bb6 100644 --- a/hypervideo_dl/extractor/nexx.py +++ b/hypervideo_dl/extractor/nexx.py @@ -12,6 +12,8 @@ from ..utils import ( ExtractorError, int_or_none, parse_duration, + srt_subtitles_timecode, + traverse_obj, try_get, urlencode_postdata, ) @@ -20,7 +22,7 @@ from ..utils import ( class NexxIE(InfoExtractor): _VALID_URL = r'''(?x) (?: - https?://api\.nexx(?:\.cloud|cdn\.com)/v3/(?P<domain_id>\d+)/videos/byid/| + https?://api\.nexx(?:\.cloud|cdn\.com)/v3(?:\.\d)?/(?P<domain_id>\d+)/videos/byid/| nexx:(?:(?P<domain_id_s>\d+):)?| https?://arc\.nexx\.cloud/api/video/ ) @@ -42,35 +44,37 @@ class NexxIE(InfoExtractor): 'timestamp': 1384264416, 'upload_date': '20131112', }, + 'skip': 'Spiegel nexx CDNs are now disabled' }, { - # episode - 'url': 'https://api.nexx.cloud/v3/741/videos/byid/247858', + # episode with captions + 'url': 'https://api.nexx.cloud/v3.1/741/videos/byid/1701834', 'info_dict': { - 'id': '247858', + 'id': '1701834', 'ext': 'mp4', - 'title': 'Return of the Golden Child (OV)', - 'description': 'md5:5d969537509a92b733de21bae249dc63', - 'release_year': 2017, + 'title': 'Mein Leben mit \'nem TikTok E-Boy 😤', + 'alt_title': 'Mein Leben mit \'nem TikTok E-Boy 😤', + 'description': 'md5:f84f395a881fd143f952c892deab528d', 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 1397, - 'timestamp': 1495033267, - 'upload_date': '20170517', + 'duration': 770, + 'timestamp': 1595600027, + 'upload_date': '20200724', 'episode_number': 2, 'season_number': 2, + 'episode': 'Episode 2', + 'season': 'Season 2', }, 'params': { 'skip_download': True, }, - 'skip': 'HTTP Error 404: Not Found', }, { - # does not work via arc 'url': 'nexx:741:1269984', - 'md5': 'c714b5b238b2958dc8d5642addba6886', + 'md5': 'd5f14e14b592501e51addd5abef95a7f', 'info_dict': { 'id': '1269984', 'ext': 
'mp4',
-            'title': '1 TAG ohne KLO... wortwörtlich! 😑',
-            'alt_title': '1 TAG ohne KLO... wortwörtlich! 😑',
+            'title': '1 TAG ohne KLO... wortwörtlich! 😑',
+            'alt_title': '1 TAG ohne KLO... wortwörtlich! 😑',
+            'description': 'md5:2016393a31991a900946432ccdd09a6f',
             'thumbnail': r're:^https?://.*\.jpg$',
             'duration': 607,
             'timestamp': 1518614955,
@@ -91,6 +95,7 @@ class NexxIE(InfoExtractor):
             'timestamp': 1527874460,
             'upload_date': '20180601',
         },
+        'skip': 'Spiegel nexx CDNs are now disabled'
     }, {
         'url': 'https://api.nexxcdn.com/v3/748/videos/byid/128907',
         'only_matching': True,
@@ -138,6 +143,8 @@ class NexxIE(InfoExtractor):
         return NexxIE._extract_urls(webpage)[0]
 
     def _handle_error(self, response):
+        if traverse_obj(response, ('metadata', 'notice'), expected_type=str):
+            self.report_warning('%s said: %s' % (self.IE_NAME, response['metadata']['notice']))
         status = int_or_none(try_get(
             response, lambda x: x['metadata']['status']) or 200)
         if 200 <= status < 300:
@@ -220,6 +227,65 @@ class NexxIE(InfoExtractor):
 
         return formats
 
+    def _extract_3q_formats(self, video, video_id):
+        stream_data = video['streamdata']
+        cdn = stream_data['cdnType']
+        assert cdn == '3q'
+
+        q_acc, q_prefix, q_locator, q_hash = stream_data['qAccount'], stream_data['qPrefix'], stream_data['qLocator'], stream_data['qHash']
+        protection_key = traverse_obj(
+            video, ('protectiondata', 'key'), expected_type=str)
+
+        def get_cdn_shield_base(shield_type=''):
+            for secure in ('', 's'):
+                cdn_shield = stream_data.get('cdnShield%sHTTP%s' % (shield_type, secure.upper()))
+                if cdn_shield:
+                    return 'http%s://%s' % (secure, cdn_shield)
+            return f'http://sdn-global-{"prog" if shield_type.lower() == "prog" else "streaming"}-cache.3qsdn.com/' + (f's/{protection_key}/' if protection_key else '')
+
+        stream_base = get_cdn_shield_base()
+
+        formats = []
+        formats.extend(self._extract_m3u8_formats(
+            f'{stream_base}{q_acc}/files/{q_prefix}/{q_locator}/{q_acc}-{stream_data.get("qHEVCHash") or q_hash}.ism/manifest.m3u8',
+            video_id, 'mp4', m3u8_id=f'{cdn}-hls', fatal=False))
+        formats.extend(self._extract_mpd_formats(
+            f'{stream_base}{q_acc}/files/{q_prefix}/{q_locator}/{q_acc}-{q_hash}.ism/manifest.mpd',
+            video_id, mpd_id=f'{cdn}-dash', fatal=False))
+
+        progressive_base = get_cdn_shield_base('Prog')
+        q_references = stream_data.get('qReferences') or ''
+        fds = q_references.split(',')
+        for fd in fds:
+            ss = fd.split(':')
+            if len(ss) != 3:
+                continue
+            tbr = int_or_none(ss[1], scale=1000)
+            formats.append({
+                'url': f'{progressive_base}{q_acc}/uploads/{q_acc}-{ss[2]}.webm',
+                'format_id': f'{cdn}-{ss[0]}{"-%s" % tbr if tbr else ""}',
+                'tbr': tbr,
+            })
+
+        azure_file_distribution = stream_data.get('azureFileDistribution') or ''
+        fds = azure_file_distribution.split(',')
+        for fd in fds:
+            ss = fd.split(':')
+            if len(ss) != 3:
+                continue
+            tbr = int_or_none(ss[0])
+            width, height = ss[1].split('x') if len(ss[1].split('x')) == 2 else (None, None)
+            f = {
+                'url': f'{progressive_base}{q_acc}/files/{q_prefix}/{q_locator}/{ss[2]}.mp4',
+                'format_id': f'{cdn}-http-{"-%s" % tbr if tbr else ""}',
+                'tbr': tbr,
+                'width': int_or_none(width),
+                'height': int_or_none(height),
+            }
+            formats.append(f)
+
+        return formats
+
     def _extract_azure_formats(self, video, video_id):
         stream_data = video['streamdata']
         cdn = stream_data['cdnType']
@@ -345,10 +411,11 @@ class NexxIE(InfoExtractor):
         # md5( operation + domain_id + domain_secret )
         # where domain_secret is a static value that will be given by nexx.tv
         # as per [1]. 
Here is how this "secret" is generated (reversed - # from _play.api.init function, search for clienttoken). So it's - # actually not static and not that much of a secret. + # from _play._factory.data.getDomainData function, search for + # domaintoken or enableAPIAccess). So it's actually not static + # and not that much of a secret. # 1. https://nexxtvstorage.blob.core.windows.net/files/201610/27.pdf - secret = result['device']['clienttoken'][int(device_id[0]):] + secret = result['device']['domaintoken'][int(device_id[0]):] secret = secret[0:len(secret) - int(device_id[-1])] op = 'byid' @@ -360,15 +427,18 @@ class NexxIE(InfoExtractor): result = self._call_api( domain_id, 'videos/%s/%s' % (op, video_id), video_id, data={ - 'additionalfields': 'language,channel,actors,studio,licenseby,slug,subtitle,teaser,description', + 'additionalfields': 'language,channel,format,licenseby,slug,fileversion,episode,season', 'addInteractionOptions': '1', 'addStatusDetails': '1', 'addStreamDetails': '1', - 'addCaptions': '1', + 'addFeatures': '1', + # Caption format selection doesn't seem to be enforced? + 'addCaptions': 'vtt', 'addScenes': '1', + 'addChapters': '1', 'addHotSpots': '1', + 'addConnectedMedia': 'persons', 'addBumpers': '1', - 'captionFormat': 'data', }, headers={ 'X-Request-CID': cid, 'X-Request-Token': request_token, @@ -384,28 +454,48 @@ class NexxIE(InfoExtractor): formats = self._extract_azure_formats(video, video_id) elif cdn == 'free': formats = self._extract_free_formats(video, video_id) + elif cdn == '3q': + formats = self._extract_3q_formats(video, video_id) else: - # TODO: reverse more cdns - assert False + self.raise_no_formats(f'{cdn} formats are currently not supported', video_id) self._sort_formats(formats) + subtitles = {} + for sub in video.get('captiondata') or []: + if sub.get('data'): + subtitles.setdefault(sub.get('language', 'en'), []).append({ + 'ext': 'srt', + 'data': '\n\n'.join( + f'{i + 1}\n{srt_subtitles_timecode(line["fromms"] / 1000)} --> {srt_subtitles_timecode(line["toms"] / 1000)}\n{line["caption"]}' + for i, line in enumerate(sub['data'])), + 'name': sub.get('language_long') or sub.get('title') + }) + elif sub.get('url'): + subtitles.setdefault(sub.get('language', 'en'), []).append({ + 'url': sub['url'], + 'ext': sub.get('format'), + 'name': sub.get('language_long') or sub.get('title') + }) + return { 'id': video_id, 'title': title, 'alt_title': general.get('subtitle'), 'description': general.get('description'), 'release_year': int_or_none(general.get('year')), - 'creator': general.get('studio') or general.get('studio_adref'), + 'creator': general.get('studio') or general.get('studio_adref') or None, 'thumbnail': try_get( video, lambda x: x['imagedata']['thumb'], compat_str), 'duration': parse_duration(general.get('runtime')), 'timestamp': int_or_none(general.get('uploaded')), - 'episode_number': int_or_none(try_get( - video, lambda x: x['episodedata']['episode'])), - 'season_number': int_or_none(try_get( - video, lambda x: x['episodedata']['season'])), + 'episode_number': traverse_obj( + video, (('episodedata', 'general'), 'episode'), expected_type=int, get_all=False), + 'season_number': traverse_obj( + video, (('episodedata', 'general'), 'season'), expected_type=int, get_all=False), + 'cast': traverse_obj(video, ('connectedmedia', ..., 'title'), expected_type=str), 'formats': formats, + 'subtitles': subtitles, } @@ -427,7 +517,6 @@ class NexxEmbedIE(InfoExtractor): 'upload_date': '20140305', }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, }, 
{ diff --git a/hypervideo_dl/extractor/nfb.py b/hypervideo_dl/extractor/nfb.py new file mode 100644 index 0000000..a12e503 --- /dev/null +++ b/hypervideo_dl/extractor/nfb.py @@ -0,0 +1,62 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import int_or_none + + +class NFBIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?nfb\.ca/film/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.nfb.ca/film/trafficopter/', + 'info_dict': { + 'id': 'trafficopter', + 'ext': 'mp4', + 'title': 'Trafficopter', + 'description': 'md5:060228455eb85cf88785c41656776bc0', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Barrie Howells', + 'release_year': 1972, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage('https://www.nfb.ca/film/%s/' % video_id, video_id) + + iframe = self._html_search_regex( + r'<[^>]+\bid=["\']player-iframe["\'][^>]*src=["\']([^"\']+)', + webpage, 'iframe', default=None, fatal=True) + if iframe.startswith('/'): + iframe = f'https://www.nfb.ca{iframe}' + + player = self._download_webpage(iframe, video_id) + + source = self._html_search_regex( + r'source:\s*\'([^\']+)', + player, 'source', default=None, fatal=True) + + formats, subtitles = self._extract_m3u8_formats_and_subtitles(source, video_id, ext='mp4') + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': self._html_search_regex( + r'<[^>]+\bid=["\']titleHeader["\'][^>]*>\s*<h1[^>]*>\s*([^<]+?)\s*</h1>', + webpage, 'title', default=None), + 'description': self._html_search_regex( + r'<[^>]+\bid=["\']tabSynopsis["\'][^>]*>\s*<p[^>]*>\s*([^<]+)', + webpage, 'description', default=None), + 'thumbnail': self._html_search_regex( + r'poster:\s*\'([^\']+)', + player, 'thumbnail', default=None), + 'uploader': self._html_search_regex( + r'<[^>]+\bitemprop=["\']name["\'][^>]*>([^<]+)', + webpage, 'uploader', default=None), + 'release_year': int_or_none(self._html_search_regex( + r'<[^>]+\bitemprop=["\']datePublished["\'][^>]*>([^<]+)', + webpage, 'release_year', default=None)), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/hypervideo_dl/extractor/nfl.py b/hypervideo_dl/extractor/nfl.py index 871923e..821276a 100644 --- a/hypervideo_dl/extractor/nfl.py +++ b/hypervideo_dl/extractor/nfl.py @@ -89,7 +89,7 @@ class NFLBaseIE(InfoExtractor): 'ext': determine_ext(image_url, 'jpg'), }] info.update({ - 'title': self._live_title(title) if is_live else title, + 'title': title, 'is_live': is_live, 'description': clean_html(item.get('description')), 'thumbnails': thumbnails, diff --git a/hypervideo_dl/extractor/nhk.py b/hypervideo_dl/extractor/nhk.py index 950a3d0..3b8efc3 100644 --- a/hypervideo_dl/extractor/nhk.py +++ b/hypervideo_dl/extractor/nhk.py @@ -1,8 +1,15 @@ from __future__ import unicode_literals +import re from .common import InfoExtractor -from ..utils import urljoin +from ..utils import ( + parse_duration, + traverse_obj, + unescapeHTML, + unified_timestamp, + urljoin +) class NhkBaseIE(InfoExtractor): @@ -73,6 +80,7 @@ class NhkBaseIE(InfoExtractor): m3u8_id='hls', fatal=False) for f in info['formats']: f['language'] = lang + self._sort_formats(info['formats']) else: info.update({ '_type': 'url_transparent', @@ -175,3 +183,145 @@ class NhkVodProgramIE(NhkBaseIE): program_title = entries[0].get('series') return self.playlist_result(entries, program_id, program_title) + + +class NhkForSchoolBangumiIE(InfoExtractor): + _VALID_URL = 
r'https?://www2\.nhk\.or\.jp/school/movie/(?P<type>bangumi|clip)\.cgi\?das_id=(?P<id>[a-zA-Z0-9_-]+)'
+    _TESTS = [{
+        'url': 'https://www2.nhk.or.jp/school/movie/bangumi.cgi?das_id=D0005150191_00000',
+        'info_dict': {
+            'id': 'D0005150191_00003',
+            'title': 'にている かな',
+            'duration': 599.999,
+            'timestamp': 1396414800,
+
+            'upload_date': '20140402',
+            'ext': 'mp4',
+
+            'chapters': 'count:12'
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }]
+
+    def _real_extract(self, url):
+        program_type, video_id = self._match_valid_url(url).groups()
+
+        webpage = self._download_webpage(
+            f'https://www2.nhk.or.jp/school/movie/{program_type}.cgi?das_id={video_id}', video_id)
+
+        # searches all variables
+        base_values = {g.group(1): g.group(2) for g in re.finditer(r'var\s+([a-zA-Z_]+)\s*=\s*"([^"]+?)";', webpage)}
+        # and programObj values too
+        program_values = {g.group(1): g.group(3) for g in re.finditer(r'(?:program|clip)Obj\.([a-zA-Z_]+)\s*=\s*(["\'])([^"]+?)\2;', webpage)}
+        # extract all chapters
+        chapter_durations = [parse_duration(g.group(1)) for g in re.finditer(r'chapterTime\.push\(\'([0-9:]+?)\'\);', webpage)]
+        chapter_titles = [' '.join([g.group(1) or '', unescapeHTML(g.group(2))]).strip() for g in re.finditer(r'<div class="cpTitle"><span>(scene\s*\d+)?</span>([^<]+?)</div>', webpage)]
+
+        # this is what player_core.js actually does (!)
+        version = base_values.get('r_version') or program_values.get('version')
+        if version:
+            video_id = f'{video_id.split("_")[0]}_{version}'
+
+        formats = self._extract_m3u8_formats(
+            f'https://nhks-vh.akamaihd.net/i/das/{video_id[0:8]}/{video_id}_V_000.f4v/master.m3u8',
+            video_id, ext='mp4', m3u8_id='hls')
+        self._sort_formats(formats)
+
+        duration = parse_duration(base_values.get('r_duration'))
+
+        chapters = None
+        if chapter_durations and chapter_titles and len(chapter_durations) == len(chapter_titles):
+            start_time = chapter_durations
+            end_time = chapter_durations[1:] + [duration]
+            chapters = [{
+                'start_time': s,
+                'end_time': e,
+                'title': t,
+            } for s, e, t in zip(start_time, end_time, chapter_titles)]
+
+        return {
+            'id': video_id,
+            'title': program_values.get('name'),
+            'duration': parse_duration(base_values.get('r_duration')),
+            'timestamp': unified_timestamp(base_values['r_upload']),
+            'formats': formats,
+            'chapters': chapters,
+        }
+
+
+class NhkForSchoolSubjectIE(InfoExtractor):
+    IE_DESC = 'Portal page for each school subject, like Japanese (kokugo, 国語) or math (sansuu/suugaku or 算数・数学)'
+    KNOWN_SUBJECTS = (
+        'rika', 'syakai', 'kokugo',
+        'sansuu', 'seikatsu', 'doutoku',
+        'ongaku', 'taiiku', 'zukou',
+        'gijutsu', 'katei', 'sougou',
+        'eigo', 'tokkatsu',
+        'tokushi', 'sonota',
+    )
+    _VALID_URL = r'https?://www\.nhk\.or\.jp/school/(?P<id>%s)/?(?:[\?#].*)?$' % '|'.join(re.escape(s) for s in KNOWN_SUBJECTS)
+
+    _TESTS = [{
+        'url': 'https://www.nhk.or.jp/school/sougou/',
+        'info_dict': {
+            'id': 'sougou',
+            'title': '総合的な学習の時間',
+        },
+        'playlist_mincount': 16,
+    }, {
+        'url': 'https://www.nhk.or.jp/school/rika/',
+        'info_dict': {
+            'id': 'rika',
+            'title': '理科',
+        },
+        'playlist_mincount': 15,
+    }]
+
+    def _real_extract(self, url):
+        subject_id = self._match_id(url)
+        webpage = self._download_webpage(url, subject_id)
+
+        return self.playlist_from_matches(
+            re.finditer(rf'href="((?:https?://www\.nhk\.or\.jp)?/school/{re.escape(subject_id)}/[^/]+/)"', webpage),
+            subject_id,
+            self._html_search_regex(r'(?s)<span\s+class="subjectName">\s*<img\s*[^<]+>\s*([^<]+?)</span>', webpage, 'title', fatal=False),
+            lambda g: 
urljoin(url, g.group(1))) + + +class NhkForSchoolProgramListIE(InfoExtractor): + _VALID_URL = r'https?://www\.nhk\.or\.jp/school/(?P<id>(?:%s)/[a-zA-Z0-9_-]+)' % ( + '|'.join(re.escape(s) for s in NhkForSchoolSubjectIE.KNOWN_SUBJECTS) + ) + _TESTS = [{ + 'url': 'https://www.nhk.or.jp/school/sougou/q/', + 'info_dict': { + 'id': 'sougou/q', + 'title': 'Q~こどものための哲学', + }, + 'playlist_mincount': 20, + }] + + def _real_extract(self, url): + program_id = self._match_id(url) + + webpage = self._download_webpage(f'https://www.nhk.or.jp/school/{program_id}/', program_id) + + title = (self._og_search_title(webpage) + or self._html_extract_title(webpage) + or self._html_search_regex(r'<h3>([^<]+?)とは?\s*</h3>', webpage, 'title', fatal=False)) + title = re.sub(r'\s*\|\s*NHK\s+for\s+School\s*$', '', title) if title else None + description = self._html_search_regex( + r'(?s)<div\s+class="programDetail\s*">\s*<p>[^<]+</p>', + webpage, 'description', fatal=False, group=0) + + bangumi_list = self._download_json( + f'https://www.nhk.or.jp/school/{program_id}/meta/program.json', program_id) + # they're always bangumi + bangumis = [ + self.url_result(f'https://www2.nhk.or.jp/school/movie/bangumi.cgi?das_id={x}') + for x in traverse_obj(bangumi_list, ('part', ..., 'part-video-dasid')) or []] + + return self.playlist_result(bangumis, program_id, title, description) diff --git a/hypervideo_dl/extractor/niconico.py b/hypervideo_dl/extractor/niconico.py index 76f0870..4eb6ed0 100644 --- a/hypervideo_dl/extractor/niconico.py +++ b/hypervideo_dl/extractor/niconico.py @@ -2,32 +2,39 @@ from __future__ import unicode_literals import datetime +import functools import itertools import json import re +import time from .common import InfoExtractor, SearchInfoExtractor -from ..postprocessor.ffmpeg import FFmpegPostProcessor from ..compat import ( - compat_str, compat_parse_qs, compat_urllib_parse_urlparse, + compat_HTTPError, ) from ..utils import ( ExtractorError, - dict_get, + OnDemandPagedList, + bug_reports_message, + clean_html, float_or_none, int_or_none, - OnDemandPagedList, + join_nonempty, parse_duration, + parse_filesize, parse_iso8601, - PostProcessingError, + parse_resolution, + qualities, remove_start, str_or_none, + traverse_obj, try_get, - unified_timestamp, + unescapeHTML, + update_url_query, + url_or_none, urlencode_postdata, - xpath_text, ) @@ -37,7 +44,7 @@ class NiconicoIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.nicovideo.jp/watch/sm22312215', - 'md5': 'a5bad06f1347452102953f323c69da34s', + 'md5': 'd1a75c0823e2f629128c43e1212760f9', 'info_dict': { 'id': 'sm22312215', 'ext': 'mp4', @@ -160,35 +167,42 @@ class NiconicoIE(InfoExtractor): }, { 'url': 'http://sp.nicovideo.jp/watch/sm28964488?ss_pos=1&cp_in=wt_tg', 'only_matching': True, + }, { + 'note': 'a video that is only served as an ENCRYPTED HLS.', + 'url': 'https://www.nicovideo.jp/watch/so38016254', + 'only_matching': True, }] - _VALID_URL = r'https?://(?:www\.|secure\.|sp\.)?nicovideo\.jp/watch/(?P<id>(?:[a-z]{2})?[0-9]+)' + _VALID_URL = r'https?://(?:(?:www\.|secure\.|sp\.)?nicovideo\.jp/watch|nico\.ms)/(?P<id>(?:[a-z]{2})?[0-9]+)' _NETRC_MACHINE = 'niconico' - + _COMMENT_API_ENDPOINTS = ( + 'https://nvcomment.nicovideo.jp/legacy/api.json', + 'https://nmsg.nicovideo.jp/api.json',) _API_HEADERS = { 'X-Frontend-ID': '6', - 'X-Frontend-Version': '0' + 'X-Frontend-Version': '0', + 'X-Niconico-Language': 'en-us', + 'Referer': 'https://www.nicovideo.jp/', + 'Origin': 'https://www.nicovideo.jp', } - def _real_initialize(self): - self._login() - - 
def _login(self): - username, password = self._get_login_info() - # No authentication to be performed - if not username: - return True - - # Log in + def _perform_login(self, username, password): login_ok = True login_form_strs = { 'mail_tel': username, 'password': password, } + self._request_webpage( + 'https://account.nicovideo.jp/login', None, + note='Acquiring Login session') urlh = self._request_webpage( - 'https://account.nicovideo.jp/api/v1/login', None, + 'https://account.nicovideo.jp/login/redirector?show_button_twitter=1&site=niconico&show_button_facebook=1', None, note='Logging in', errnote='Unable to log in', - data=urlencode_postdata(login_form_strs)) + data=urlencode_postdata(login_form_strs), + headers={ + 'Referer': 'https://account.nicovideo.jp/login', + 'Content-Type': 'application/x-www-form-urlencoded', + }) if urlh is False: login_ok = False else: @@ -200,8 +214,8 @@ class NiconicoIE(InfoExtractor): return login_ok def _get_heartbeat_info(self, info_dict): - video_id, video_src_id, audio_src_id = info_dict['url'].split(':')[1].split('/') + dmc_protocol = info_dict['_expected_protocol'] api_data = ( info_dict.get('_api_data') @@ -216,49 +230,50 @@ class NiconicoIE(InfoExtractor): session_api_endpoint = try_get(session_api_data, lambda x: x['urls'][0]) def ping(): - status = try_get( - self._download_json( - 'https://nvapi.nicovideo.jp/v1/2ab0cbaa/watch', video_id, - query={'t': try_get(api_data, lambda x: x['media']['delivery']['trackingId'])}, - note='Acquiring permission for downloading video', - headers=self._API_HEADERS), - lambda x: x['meta']['status']) - if status != 200: - self.report_warning('Failed to acquire permission for playing video. The video may not download.') + tracking_id = traverse_obj(api_data, ('media', 'delivery', 'trackingId')) + if tracking_id: + tracking_url = update_url_query('https://nvapi.nicovideo.jp/v1/2ab0cbaa/watch', {'t': tracking_id}) + watch_request_response = self._download_json( + tracking_url, video_id, + note='Acquiring permission for downloading video', fatal=False, + headers=self._API_HEADERS) + if traverse_obj(watch_request_response, ('meta', 'status')) != 200: + self.report_warning('Failed to acquire permission for playing video. 
Video download may fail.') yesno = lambda x: 'yes' if x else 'no' - # m3u8 (encryption) - if try_get(api_data, lambda x: x['media']['delivery']['encryption']) is not None: + if dmc_protocol == 'http': + protocol = 'http' + protocol_parameters = { + 'http_output_download_parameters': { + 'use_ssl': yesno(session_api_data['urls'][0]['isSsl']), + 'use_well_known_port': yesno(session_api_data['urls'][0]['isWellKnownPort']), + } + } + elif dmc_protocol == 'hls': protocol = 'm3u8' - encryption = self._parse_json(session_api_data['token'], video_id)['hls_encryption'] - session_api_http_parameters = { - 'parameters': { - 'hls_parameters': { - 'encryption': { - encryption: { - 'encrypted_key': try_get(api_data, lambda x: x['media']['delivery']['encryption']['encryptedKey']), - 'key_uri': try_get(api_data, lambda x: x['media']['delivery']['encryption']['keyUri']) - } - }, - 'transfer_preset': '', - 'use_ssl': yesno(session_api_endpoint['isSsl']), - 'use_well_known_port': yesno(session_api_endpoint['isWellKnownPort']), - 'segment_duration': 6000, - } + segment_duration = try_get(self._configuration_arg('segment_duration'), lambda x: int(x[0])) or 6000 + parsed_token = self._parse_json(session_api_data['token'], video_id) + encryption = traverse_obj(api_data, ('media', 'delivery', 'encryption')) + protocol_parameters = { + 'hls_parameters': { + 'segment_duration': segment_duration, + 'transfer_preset': '', + 'use_ssl': yesno(session_api_data['urls'][0]['isSsl']), + 'use_well_known_port': yesno(session_api_data['urls'][0]['isWellKnownPort']), } } - # http - else: - protocol = 'http' - session_api_http_parameters = { - 'parameters': { - 'http_output_download_parameters': { - 'use_ssl': yesno(session_api_endpoint['isSsl']), - 'use_well_known_port': yesno(session_api_endpoint['isWellKnownPort']), + if 'hls_encryption' in parsed_token and encryption: + protocol_parameters['hls_parameters']['encryption'] = { + parsed_token['hls_encryption']: { + 'encrypted_key': encryption['encryptedKey'], + 'key_uri': encryption['keyUri'], } } - } + else: + protocol = 'm3u8_native' + else: + raise ExtractorError(f'Unsupported DMC protocol: {dmc_protocol}') session_response = self._download_json( session_api_endpoint['url'], video_id, @@ -292,11 +307,13 @@ class NiconicoIE(InfoExtractor): 'lifetime': session_api_data.get('heartbeatLifetime') } }, - 'priority': session_api_data.get('priority'), + 'priority': session_api_data['priority'], 'protocol': { 'name': 'http', 'parameters': { - 'http_parameters': session_api_http_parameters + 'http_parameters': { + 'parameters': protocol_parameters + } } }, 'recipe_id': session_api_data.get('recipeId'), @@ -324,36 +341,35 @@ class NiconicoIE(InfoExtractor): return info_dict, heartbeat_info_dict - def _extract_format_for_quality(self, api_data, video_id, audio_quality, video_quality): - def parse_format_id(id_code): - mobj = re.match(r'''(?x) - (?:archive_)? - (?:(?P<codec>[^_]+)_)? - (?:(?P<br>[\d]+)kbps_)? - (?:(?P<res>[\d+]+)p_)? 
- ''', '%s_' % id_code) - return mobj.groupdict() if mobj else {} - - protocol = 'niconico_dmc' - format_id = '-'.join(map(lambda s: remove_start(s['id'], 'archive_'), [video_quality, audio_quality])) - vdict = parse_format_id(video_quality['id']) - adict = parse_format_id(audio_quality['id']) - resolution = try_get(video_quality, lambda x: x['metadata']['resolution'], dict) or {'height': vdict.get('res')} - vbr = try_get(video_quality, lambda x: x['metadata']['bitrate'], float) + def _extract_format_for_quality(self, video_id, audio_quality, video_quality, dmc_protocol): + + if not audio_quality.get('isAvailable') or not video_quality.get('isAvailable'): + return None + + def extract_video_quality(video_quality): + return parse_filesize('%sB' % self._search_regex( + r'\| ([0-9]*\.?[0-9]*[MK])', video_quality, 'vbr', default='')) + + format_id = '-'.join( + [remove_start(s['id'], 'archive_') for s in (video_quality, audio_quality)] + [dmc_protocol]) + + vid_qual_label = traverse_obj(video_quality, ('metadata', 'label')) + vid_quality = traverse_obj(video_quality, ('metadata', 'bitrate')) return { - 'url': '%s:%s/%s/%s' % (protocol, video_id, video_quality['id'], audio_quality['id']), + 'url': 'niconico_dmc:%s/%s/%s' % (video_id, video_quality['id'], audio_quality['id']), 'format_id': format_id, - 'format_note': 'DMC %s' % try_get(video_quality, lambda x: x['metadata']['label'], compat_str), + 'format_note': join_nonempty('DMC', vid_qual_label, dmc_protocol.upper(), delim=' '), 'ext': 'mp4', # Session API are used in HTML5, which always serves mp4 - 'vcodec': vdict.get('codec'), - 'acodec': adict.get('codec'), - 'vbr': float_or_none(vbr, 1000) or float_or_none(vdict.get('br')), - 'abr': float_or_none(audio_quality.get('bitrate'), 1000) or float_or_none(adict.get('br')), - 'height': int_or_none(resolution.get('height', vdict.get('res'))), - 'width': int_or_none(resolution.get('width')), - 'quality': -2 if 'low' in format_id else -1, # Default quality value is -1 - 'protocol': protocol, + 'acodec': 'aac', + 'vcodec': 'h264', + 'abr': float_or_none(traverse_obj(audio_quality, ('metadata', 'bitrate')), 1000), + 'vbr': float_or_none(vid_quality if vid_quality > 0 else extract_video_quality(vid_qual_label), 1000), + 'height': traverse_obj(video_quality, ('metadata', 'resolution', 'height')), + 'width': traverse_obj(video_quality, ('metadata', 'resolution', 'width')), + 'quality': -2 if 'low' in video_quality['id'] else None, + 'protocol': 'niconico_dmc', + '_expected_protocol': dmc_protocol, 'http_headers': { 'Origin': 'https://www.nicovideo.jp', 'Referer': 'https://www.nicovideo.jp/watch/' + video_id, @@ -363,251 +379,220 @@ class NiconicoIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - # Get video webpage for API data. 
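
# Editorial aside, set off from the patch itself: the '-' block below is the
# legacy flow that scraped the watch page and then queried the getthumbinfo
# XML API. The '+' lines further down replace it with two steps, roughly
# (paraphrasing that added code, not extending it):
#
#   api_data = self._parse_json(self._html_search_regex(
#       'data-api-data="([^"]+)"', webpage, 'API data', default='{}'), video_id)
#   # and only when the watch page itself cannot be fetched:
#   api_data = self._download_json(
#       'https://www.nicovideo.jp/api/watch/v3/%s?_frontendId=6&_frontendVersion=0'
#       '&actionTrackId=AAAAAAAAAA_%d' % (video_id, round(time.time() * 1000)),
#       video_id, note='Downloading API JSON')['data']
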
- webpage, handle = self._download_webpage_handle( - 'http://www.nicovideo.jp/watch/' + video_id, video_id) - if video_id.startswith('so'): - video_id = self._match_id(handle.geturl()) - - api_data = self._parse_json(self._html_search_regex( - 'data-api-data="([^"]+)"', webpage, - 'API data', default='{}'), video_id) - - def get_video_info_web(items): - return dict_get(api_data['video'], items) - - # Get video info - video_info_xml = self._download_xml( - 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, - video_id, note='Downloading video info page') - - def get_video_info_xml(items): - if not isinstance(items, list): - items = [items] - for item in items: - ret = xpath_text(video_info_xml, './/' + item) - if ret: - return ret - - if get_video_info_xml('error'): - error_code = get_video_info_xml('code') - - if error_code == 'DELETED': - raise ExtractorError('The video has been deleted.', - expected=True) - elif error_code == 'NOT_FOUND': - raise ExtractorError('The video is not found.', - expected=True) - elif error_code == 'COMMUNITY': - self.to_screen('%s: The video is community members only.' % video_id) - else: - raise ExtractorError('%s reports error: %s' % (self.IE_NAME, error_code)) + try: + webpage, handle = self._download_webpage_handle( + 'http://www.nicovideo.jp/watch/' + video_id, video_id) + if video_id.startswith('so'): + video_id = self._match_id(handle.geturl()) - # Start extracting video formats - formats = [] - - # Get HTML5 videos info - quality_info = try_get(api_data, lambda x: x['media']['delivery']['movie']) - if not quality_info: - raise ExtractorError('The video can\'t be downloaded', expected=True) - - for audio_quality in quality_info.get('audios') or {}: - for video_quality in quality_info.get('videos') or {}: - if not audio_quality.get('isAvailable') or not video_quality.get('isAvailable'): - continue - formats.append(self._extract_format_for_quality( - api_data, video_id, audio_quality, video_quality)) - - # Get flv/swf info - timestamp = None - video_real_url = try_get(api_data, lambda x: x['video']['smileInfo']['url']) - if video_real_url: - is_economy = video_real_url.endswith('low') - - if is_economy: - self.report_warning('Site is currently in economy mode! 
You will only have access to lower quality streams') + api_data = self._parse_json(self._html_search_regex( + 'data-api-data="([^"]+)"', webpage, + 'API data', default='{}'), video_id) + except ExtractorError as e: + try: + api_data = self._download_json( + 'https://www.nicovideo.jp/api/watch/v3/%s?_frontendId=6&_frontendVersion=0&actionTrackId=AAAAAAAAAA_%d' % (video_id, round(time.time() * 1000)), video_id, + note='Downloading API JSON', errnote='Unable to fetch data')['data'] + except ExtractorError: + if not isinstance(e.cause, compat_HTTPError): + raise + webpage = e.cause.read().decode('utf-8', 'replace') + error_msg = self._html_search_regex( + r'(?s)<section\s+class="(?:(?:ErrorMessage|WatchExceptionPage-message)\s*)+">(.+?)</section>', + webpage, 'error reason', default=None) + if not error_msg: + raise + raise ExtractorError(re.sub(r'\s+', ' ', error_msg), expected=True) - # Invoking ffprobe to determine resolution - pp = FFmpegPostProcessor(self._downloader) - cookies = self._get_cookies('https://nicovideo.jp').output(header='', sep='; path=/; domain=nicovideo.jp;\n') + formats = [] - self.to_screen('%s: %s' % (video_id, 'Checking smile format with ffprobe')) + def get_video_info(*items, get_first=True, **kwargs): + return traverse_obj(api_data, ('video', *items), get_all=not get_first, **kwargs) - try: - metadata = pp.get_metadata_object(video_real_url, ['-cookies', cookies]) - except PostProcessingError as err: - raise ExtractorError(err.msg, expected=True) - - v_stream = a_stream = {} - - # Some complex swf files doesn't have video stream (e.g. nm4809023) - for stream in metadata['streams']: - if stream['codec_type'] == 'video': - v_stream = stream - elif stream['codec_type'] == 'audio': - a_stream = stream - - # Community restricted videos seem to have issues with the thumb API not returning anything at all - filesize = int( - (get_video_info_xml('size_high') if not is_economy else get_video_info_xml('size_low')) - or metadata['format']['size'] - ) - extension = ( - get_video_info_xml('movie_type') - or 'mp4' if 'mp4' in metadata['format']['format_name'] else metadata['format']['format_name'] - ) - - # 'creation_time' tag on video stream of re-encoded SMILEVIDEO mp4 files are '1970-01-01T00:00:00.000000Z'. - timestamp = ( - parse_iso8601(get_video_info_web('first_retrieve')) - or unified_timestamp(get_video_info_web('postedDateTime')) - ) - metadata_timestamp = ( - parse_iso8601(try_get(v_stream, lambda x: x['tags']['creation_time'])) - or timestamp if extension != 'mp4' else 0 - ) - - # According to compconf, smile videos from pre-2017 are always better quality than their DMC counterparts - smile_threshold_timestamp = parse_iso8601('2016-12-08T00:00:00+09:00') - - is_source = timestamp < smile_threshold_timestamp or metadata_timestamp > 0 - - # If movie file size is unstable, old server movie is not source movie. - if filesize > 1: - formats.append({ - 'url': video_real_url, - 'format_id': 'smile' if not is_economy else 'smile_low', - 'format_note': 'SMILEVIDEO source' if not is_economy else 'SMILEVIDEO low quality', - 'ext': extension, - 'container': extension, - 'vcodec': v_stream.get('codec_name'), - 'acodec': a_stream.get('codec_name'), - # Some complex swf files doesn't have total bit rate metadata (e.g. 
nm6049209) - 'tbr': int_or_none(metadata['format'].get('bit_rate'), scale=1000), - 'vbr': int_or_none(v_stream.get('bit_rate'), scale=1000), - 'abr': int_or_none(a_stream.get('bit_rate'), scale=1000), - 'height': int_or_none(v_stream.get('height')), - 'width': int_or_none(v_stream.get('width')), - 'source_preference': 5 if not is_economy else -2, - 'quality': 5 if is_source and not is_economy else None, - 'filesize': filesize - }) + quality_info = api_data['media']['delivery']['movie'] + session_api_data = quality_info['session'] + for (audio_quality, video_quality, protocol) in itertools.product(quality_info['audios'], quality_info['videos'], session_api_data['protocols']): + fmt = self._extract_format_for_quality(video_id, audio_quality, video_quality, protocol) + if fmt: + formats.append(fmt) self._sort_formats(formats) # Start extracting information - title = ( - get_video_info_xml('title') # prefer to get the untranslated original title - or get_video_info_web(['originalTitle', 'title']) - or self._og_search_title(webpage, default=None) - or self._html_search_regex( - r'<span[^>]+class="videoHeaderTitle"[^>]*>([^<]+)</span>', - webpage, 'video title')) - - watch_api_data_string = self._html_search_regex( - r'<div[^>]+id="watchAPIDataContainer"[^>]+>([^<]+)</div>', - webpage, 'watch api data', default=None) - watch_api_data = self._parse_json(watch_api_data_string, video_id) if watch_api_data_string else {} - video_detail = watch_api_data.get('videoDetail', {}) - - thumbnail = ( - self._html_search_regex(r'<meta property="og:image" content="([^"]+)">', webpage, 'thumbnail data', default=None) - or dict_get( # choose highest from 720p to 240p - get_video_info_web('thumbnail'), - ['ogp', 'player', 'largeUrl', 'middleUrl', 'url']) - or self._html_search_meta('image', webpage, 'thumbnail', default=None) - or video_detail.get('thumbnail')) - - description = get_video_info_web('description') - - if not timestamp: - match = self._html_search_meta('datePublished', webpage, 'date published', default=None) - if match: - timestamp = parse_iso8601(match.replace('+', ':00+')) - if not timestamp and video_detail.get('postedAt'): - timestamp = parse_iso8601( - video_detail['postedAt'].replace('/', '-'), - delimiter=' ', timezone=datetime.timedelta(hours=9)) - timestamp = timestamp or try_get(api_data, lambda x: parse_iso8601(x['video']['registeredAt'])) - - view_count = int_or_none(get_video_info_web(['view_counter', 'viewCount'])) - if not view_count: - match = self._html_search_regex( - r'>Views: <strong[^>]*>([^<]+)</strong>', - webpage, 'view count', default=None) - if match: - view_count = int_or_none(match.replace(',', '')) - view_count = ( - view_count - or video_detail.get('viewCount') - or try_get(api_data, lambda x: x['video']['count']['view'])) - - comment_count = ( - int_or_none(get_video_info_web('comment_num')) - or video_detail.get('commentCount') - or try_get(api_data, lambda x: x['video']['count']['comment'])) - - if not comment_count: - match = self._html_search_regex( - r'>Comments: <strong[^>]*>([^<]+)</strong>', - webpage, 'comment count', default=None) - if match: - comment_count = int_or_none(match.replace(',', '')) - - duration = (parse_duration( - get_video_info_web('length') - or self._html_search_meta( - 'video:duration', webpage, 'video duration', default=None)) - or video_detail.get('length') - or get_video_info_web('duration')) - - webpage_url = get_video_info_web('watch_url') or url - - # for channel movie and community movie - channel_id = try_get( - api_data, - 
(lambda x: x['channel']['globalId'], - lambda x: x['community']['globalId'])) - channel = try_get( - api_data, - (lambda x: x['channel']['name'], - lambda x: x['community']['name'])) - - # Note: cannot use api_data.get('owner', {}) because owner may be set to "null" - # in the JSON, which will cause None to be returned instead of {}. - owner = try_get(api_data, lambda x: x.get('owner'), dict) or {} - uploader_id = str_or_none( - get_video_info_web(['ch_id', 'user_id']) - or owner.get('id') - or channel_id - ) - uploader = ( - get_video_info_web(['ch_name', 'user_nickname']) - or owner.get('nickname') - or channel - ) + tags = None + if webpage: + # use og:video:tag (not logged in) + og_video_tags = re.finditer(r'<meta\s+property="og:video:tag"\s*content="(.*?)">', webpage) + tags = list(filter(None, (clean_html(x.group(1)) for x in og_video_tags))) + if not tags: + # use keywords and split with comma (not logged in) + kwds = self._html_search_meta('keywords', webpage, default=None) + if kwds: + tags = [x for x in kwds.split(',') if x] + if not tags: + # find in json (logged in) + tags = traverse_obj(api_data, ('tag', 'items', ..., 'name')) + + thumb_prefs = qualities(['url', 'middleUrl', 'largeUrl', 'player', 'ogp']) return { 'id': video_id, '_api_data': api_data, - 'title': title, + 'title': get_video_info(('originalTitle', 'title')) or self._og_search_title(webpage, default=None), 'formats': formats, - 'thumbnail': thumbnail, - 'description': description, - 'uploader': uploader, - 'timestamp': timestamp, - 'uploader_id': uploader_id, - 'channel': channel, - 'channel_id': channel_id, - 'view_count': view_count, - 'comment_count': comment_count, - 'duration': duration, - 'webpage_url': webpage_url, + 'thumbnails': [{ + 'id': key, + 'url': url, + 'ext': 'jpg', + 'preference': thumb_prefs(key), + **parse_resolution(url, lenient=True), + } for key, url in (get_video_info('thumbnail') or {}).items() if url], + 'description': clean_html(get_video_info('description')), + 'uploader': traverse_obj(api_data, ('owner', 'nickname'), ('channel', 'name'), ('community', 'name')), + 'uploader_id': str_or_none(traverse_obj(api_data, ('owner', 'id'), ('channel', 'id'), ('community', 'id'))), + 'timestamp': parse_iso8601(get_video_info('registeredAt')) or parse_iso8601( + self._html_search_meta('video:release_date', webpage, 'date published', default=None)), + 'channel': traverse_obj(api_data, ('channel', 'name'), ('community', 'name')), + 'channel_id': traverse_obj(api_data, ('channel', 'id'), ('community', 'id')), + 'view_count': int_or_none(get_video_info('count', 'view')), + 'tags': tags, + 'genre': traverse_obj(api_data, ('genre', 'label'), ('genre', 'key')), + 'comment_count': get_video_info('count', 'comment', expected_type=int), + 'duration': ( + parse_duration(self._html_search_meta('video:duration', webpage, 'video duration', default=None)) + or get_video_info('duration')), + 'webpage_url': url_or_none(url) or f'https://www.nicovideo.jp/watch/{video_id}', + 'subtitles': self.extract_subtitles(video_id, api_data, session_api_data), + } + + def _get_subtitles(self, video_id, api_data, session_api_data): + comment_user_key = traverse_obj(api_data, ('comment', 'keys', 'userKey')) + user_id_str = session_api_data.get('serviceUserId') + + thread_ids = traverse_obj(api_data, ('comment', 'threads', lambda _, v: v['isActive'])) + raw_danmaku = self._extract_all_comments(video_id, thread_ids, user_id_str, comment_user_key) + if not raw_danmaku: + self.report_warning(f'Failed to get comments. 
{bug_reports_message()}')
+            return
+        return {
+            'comments': [{
+                'ext': 'json',
+                'data': json.dumps(raw_danmaku),
+            }],
+        }
+
+    def _extract_all_comments(self, video_id, threads, user_id, user_key):
+        auth_data = {
+            'user_id': user_id,
+            'userkey': user_key,
+        } if user_id and user_key else {'user_id': ''}
+
+        # Request Start
+        post_data = [{'ping': {'content': 'rs:0'}}]
+        for i, thread in enumerate(threads):
+            thread_id = thread['id']
+            thread_fork = thread['fork']
+            # Post Start (2N)
+            post_data.append({'ping': {'content': f'ps:{i * 2}'}})
+            post_data.append({'thread': {
+                'fork': thread_fork,
+                'language': 0,
+                'nicoru': 3,
+                'scores': 1,
+                'thread': thread_id,
+                'version': '20090904',
+                'with_global': 1,
+                **auth_data,
+            }})
+            # Post Final (2N)
+            post_data.append({'ping': {'content': f'pf:{i * 2}'}})
+
+            # Post Start (2N+1)
+            post_data.append({'ping': {'content': f'ps:{i * 2 + 1}'}})
+            post_data.append({'thread_leaves': {
+                # format is '<bottom of minute range>-<top of minute range>:<comments per minute>,<total last comments>'
+                # unfortunately NND limits (deletes?) comment returns this way, so you're only able to grab the last 1000 per language
+                'content': '0-999999:999999,999999,nicoru:999999',
+                'fork': thread_fork,
+                'language': 0,
+                'nicoru': 3,
+                'scores': 1,
+                'thread': thread_id,
+                **auth_data,
+            }})
+            # Post Final (2N+1)
+            post_data.append({'ping': {'content': f'pf:{i * 2 + 1}'}})
+        # Request Final
+        post_data.append({'ping': {'content': 'rf:0'}})
+
+        for api_url in self._COMMENT_API_ENDPOINTS:
+            comments = self._download_json(
+                api_url, video_id, data=json.dumps(post_data).encode(), fatal=False,
+                headers={
+                    'Referer': 'https://www.nicovideo.jp/watch/%s' % video_id,
+                    'Origin': 'https://www.nicovideo.jp',
+                    'Content-Type': 'text/plain;charset=UTF-8',
+                },
+                note='Downloading comments', errnote=f'Failed to access endpoint {api_url}')
+            if comments:
+                return comments
+
+
+class NiconicoPlaylistBaseIE(InfoExtractor):
+    _PAGE_SIZE = 100
+
+    _API_HEADERS = {
+        'X-Frontend-ID': '6',
+        'X-Frontend-Version': '0',
+        'X-Niconico-Language': 'en-us'
+    }
+
+    def _call_api(self, list_id, resource, query):
+        "Implement this in child class"
+        pass
+
+    @staticmethod
+    def _parse_owner(item):
+        return {
+            'uploader': traverse_obj(item, ('owner', 'name')),
+            'uploader_id': traverse_obj(item, ('owner', 'id')),
+        }
+
+    def _fetch_page(self, list_id, page):
+        page += 1
+        resp = self._call_api(list_id, 'page %d' % page, {
+            'page': page,
+            'pageSize': self._PAGE_SIZE,
+        })
+        # this is needed to support both mylist and user
+        for video in traverse_obj(resp, ('items', ..., ('video', None))) or []:
+            video_id = video.get('id')
+            if not video_id:
+                # skip {"video": {"id": "blablabla", ...}}
+                continue
+            count = video.get('count') or {}
+            get_count = lambda x: int_or_none(count.get(x))
+            yield {
+                '_type': 'url',
+                'id': video_id,
+                'title': video.get('title'),
+                'url': f'https://www.nicovideo.jp/watch/{video_id}',
+                'description': video.get('shortDescription'),
+                'duration': int_or_none(video.get('duration')),
+                'view_count': get_count('view'),
+                'comment_count': get_count('comment'),
+                'thumbnail': traverse_obj(video, ('thumbnail', ('nHdUrl', 'largeUrl', 'listingUrl', 'url'))),
+                'ie_key': NiconicoIE.ie_key(),
+                **self._parse_owner(video),
+            }
+
+    def _entries(self, list_id):
+        return OnDemandPagedList(functools.partial(self._fetch_page, list_id), self._PAGE_SIZE)
+
-class NiconicoPlaylistIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/(?:user/\d+/|my/)?mylist/(?P<id>\d+)'
+class 
NiconicoPlaylistIE(NiconicoPlaylistBaseIE): + IE_NAME = 'niconico:playlist' + _VALID_URL = r'https?://(?:(?:www\.|sp\.)?nicovideo\.jp|nico\.ms)/(?:user/\d+/)?(?:my/)?mylist/(?:#/)?(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.nicovideo.jp/mylist/27411728', @@ -618,73 +603,115 @@ class NiconicoPlaylistIE(InfoExtractor): 'uploader': 'のっく', 'uploader_id': '805442', }, - 'playlist_mincount': 225, + 'playlist_mincount': 291, }, { 'url': 'https://www.nicovideo.jp/user/805442/mylist/27411728', 'only_matching': True, + }, { + 'url': 'https://www.nicovideo.jp/my/mylist/#/68048635', + 'only_matching': True, }] - _API_HEADERS = { - 'X-Frontend-ID': '6', - 'X-Frontend-Version': '0' - } + def _call_api(self, list_id, resource, query): + return self._download_json( + f'https://nvapi.nicovideo.jp/v2/mylists/{list_id}', list_id, + f'Downloading {resource}', query=query, + headers=self._API_HEADERS)['data']['mylist'] def _real_extract(self, url): list_id = self._match_id(url) + mylist = self._call_api(list_id, 'list', { + 'pageSize': 1, + }) + return self.playlist_result( + self._entries(list_id), list_id, + mylist.get('name'), mylist.get('description'), **self._parse_owner(mylist)) - def get_page_data(pagenum, pagesize): - return self._download_json( - 'http://nvapi.nicovideo.jp/v2/mylists/' + list_id, list_id, - query={'page': 1 + pagenum, 'pageSize': pagesize}, - headers=self._API_HEADERS).get('data').get('mylist') - - data = get_page_data(0, 1) - title = data.get('name') - description = data.get('description') - uploader = data.get('owner').get('name') - uploader_id = data.get('owner').get('id') - - def pagefunc(pagenum): - data = get_page_data(pagenum, 25) - return ({ - '_type': 'url', - 'url': 'http://www.nicovideo.jp/watch/' + item.get('watchId'), - } for item in data.get('items')) - - return { - '_type': 'playlist', - 'id': list_id, - 'title': title, - 'description': description, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'entries': OnDemandPagedList(pagefunc, 25), - } - - -NicovideoSearchIE_NAME = 'nicovideo:search' +class NiconicoSeriesIE(InfoExtractor): + IE_NAME = 'niconico:series' + _VALID_URL = r'https?://(?:(?:www\.|sp\.)?nicovideo\.jp|nico\.ms)/series/(?P<id>\d+)' -class NicovideoSearchURLIE(InfoExtractor): - IE_NAME = f'{NicovideoSearchIE_NAME}_url' - IE_DESC = 'Nico video search URLs' - _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/search/(?P<id>[^?#&]+)?' 
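
# Editorial aside: with NiconicoPlaylistBaseIE above, a list-style extractor
# only has to supply _call_api(); paging via OnDemandPagedList is inherited.
# A minimal hypothetical subclass (the URL and endpoint are illustrative,
# not part of the patch):
#
#   class NiconicoSomeListIE(NiconicoPlaylistBaseIE):
#       _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/somelist/(?P<id>\d+)'
#
#       def _call_api(self, list_id, resource, query):
#           return self._download_json(
#               f'https://nvapi.nicovideo.jp/v2/somelists/{list_id}', list_id,
#               f'Downloading {resource}', query=query,
#               headers=self._API_HEADERS)['data']['somelist']
#
#       def _real_extract(self, url):
#           list_id = self._match_id(url)
#           mylist = self._call_api(list_id, 'list', {'pageSize': 1})
#           return self.playlist_result(
#               self._entries(list_id), list_id, mylist.get('name'),
#               mylist.get('description'), **self._parse_owner(mylist))
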
_TESTS = [{ - 'url': 'http://www.nicovideo.jp/search/sm9', + 'url': 'https://www.nicovideo.jp/series/110226', 'info_dict': { - 'id': 'sm9', - 'title': 'sm9' + 'id': '110226', + 'title': 'ご立派ァ!のシリーズ', }, - 'playlist_mincount': 40, + 'playlist_mincount': 10, # as of 2021/03/17 }, { - 'url': 'https://www.nicovideo.jp/search/sm9?sort=h&order=d&end=2020-12-31&start=2020-01-01', + 'url': 'https://www.nicovideo.jp/series/12312/', 'info_dict': { - 'id': 'sm9', - 'title': 'sm9' + 'id': '12312', + 'title': 'バトルスピリッツ お勧めカード紹介(調整中)', }, - 'playlist_count': 31, + 'playlist_mincount': 97, # as of 2021/03/17 + }, { + 'url': 'https://nico.ms/series/203559', + 'only_matching': True, + }] + + def _real_extract(self, url): + list_id = self._match_id(url) + webpage = self._download_webpage(f'https://www.nicovideo.jp/series/{list_id}', list_id) + + title = self._search_regex( + (r'<title>「(.+)(全', + r'<div class="TwitterShareButton"\s+data-text="(.+)\s+https:'), + webpage, 'title', fatal=False) + if title: + title = unescapeHTML(title) + playlist = [ + self.url_result(f'https://www.nicovideo.jp/watch/{v_id}', video_id=v_id) + for v_id in re.findall(r'href="/watch/([a-z0-9]+)" data-href="/watch/\1', webpage)] + return self.playlist_result(playlist, list_id, title) + + +class NiconicoHistoryIE(NiconicoPlaylistBaseIE): + IE_NAME = 'niconico:history' + IE_DESC = 'NicoNico user history. Requires cookies.' + _VALID_URL = r'https?://(?:www\.|sp\.)?nicovideo\.jp/my/history' + + _TESTS = [{ + 'note': 'PC page, with /video', + 'url': 'https://www.nicovideo.jp/my/history/video', + 'only_matching': True, + }, { + 'note': 'PC page, without /video', + 'url': 'https://www.nicovideo.jp/my/history', + 'only_matching': True, + }, { + 'note': 'mobile page, with /video', + 'url': 'https://sp.nicovideo.jp/my/history/video', + 'only_matching': True, + }, { + 'note': 'mobile page, without /video', + 'url': 'https://sp.nicovideo.jp/my/history', + 'only_matching': True, }] + def _call_api(self, list_id, resource, query): + return self._download_json( + 'https://nvapi.nicovideo.jp/v1/users/me/watch/history', 'history', + f'Downloading {resource}', query=query, + headers=self._API_HEADERS)['data'] + + def _real_extract(self, url): + list_id = 'history' + try: + mylist = self._call_api(list_id, 'list', { + 'pageSize': 1, + }) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + self.raise_login_required('You have to be logged in to get your watch history') + raise + return self.playlist_result(self._entries(list_id), list_id, **self._parse_owner(mylist)) + + +class NicovideoSearchBaseIE(InfoExtractor): + _SEARCH_TYPE = 'search' + def _entries(self, url, item_id, query=None, note='Downloading page %(page)s'): query = query or {} pages = [query['page']] if 'page' in query else itertools.count(1) @@ -697,26 +724,45 @@ class NicovideoSearchURLIE(InfoExtractor): if not results: break - def _real_extract(self, url): - query = self._match_id(url) - return self.playlist_result(self._entries(url, query), query, query) + def _search_results(self, query): + return self._entries( + self._proto_relative_url(f'//www.nicovideo.jp/{self._SEARCH_TYPE}/{query}'), query) -class NicovideoSearchIE(SearchInfoExtractor, NicovideoSearchURLIE): - IE_DESC = 'Nico video searches' - _MAX_RESULTS = float('inf') - IE_NAME = NicovideoSearchIE_NAME +class NicovideoSearchIE(NicovideoSearchBaseIE, SearchInfoExtractor): + IE_DESC = 'Nico video search' + IE_NAME = 'nicovideo:search' _SEARCH_KEY = 'nicosearch' - _TESTS = [] - 
def _search_results(self, query): - return self._entries( - self._proto_relative_url(f'//www.nicovideo.jp/search/{query}'), query) + +class NicovideoSearchURLIE(NicovideoSearchBaseIE): + IE_NAME = f'{NicovideoSearchIE.IE_NAME}_url' + IE_DESC = 'Nico video search URLs' + _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/search/(?P<id>[^?#&]+)?' + _TESTS = [{ + 'url': 'http://www.nicovideo.jp/search/sm9', + 'info_dict': { + 'id': 'sm9', + 'title': 'sm9' + }, + 'playlist_mincount': 40, + }, { + 'url': 'https://www.nicovideo.jp/search/sm9?sort=h&order=d&end=2020-12-31&start=2020-01-01', + 'info_dict': { + 'id': 'sm9', + 'title': 'sm9' + }, + 'playlist_count': 31, + }] + + def _real_extract(self, url): + query = self._match_id(url) + return self.playlist_result(self._entries(url, query), query, query) -class NicovideoSearchDateIE(NicovideoSearchIE): - IE_DESC = 'Nico video searches, newest first' - IE_NAME = f'{NicovideoSearchIE_NAME}:date' +class NicovideoSearchDateIE(NicovideoSearchBaseIE, SearchInfoExtractor): + IE_DESC = 'Nico video search, newest first' + IE_NAME = f'{NicovideoSearchIE.IE_NAME}:date' _SEARCH_KEY = 'nicosearchdate' _TESTS = [{ 'url': 'nicosearchdateall:a', @@ -757,7 +803,26 @@ class NicovideoSearchDateIE(NicovideoSearchIE): if page_num: query['page'] = str(page_num) - yield from NicovideoSearchURLIE._entries(self, url, item_id, query=query, note=note) + yield from super()._entries(url, item_id, query=query, note=note) + + +class NicovideoTagURLIE(NicovideoSearchBaseIE): + IE_NAME = 'niconico:tag' + IE_DESC = 'NicoNico video tag URLs' + _SEARCH_TYPE = 'tag' + _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/tag/(?P<id>[^?#&]+)?' + _TESTS = [{ + 'url': 'https://www.nicovideo.jp/tag/ドキュメンタリー淫夢', + 'info_dict': { + 'id': 'ドキュメンタリー淫夢', + 'title': 'ドキュメンタリー淫夢' + }, + 'playlist_mincount': 400, + }] + + def _real_extract(self, url): + query = self._match_id(url) + return self.playlist_result(self._entries(url, query), query, query) class NiconicoUserIE(InfoExtractor): diff --git a/hypervideo_dl/extractor/ninecninemedia.py b/hypervideo_dl/extractor/ninecninemedia.py index 4aaf21a..7818427 100644 --- a/hypervideo_dl/extractor/ninecninemedia.py +++ b/hypervideo_dl/extractor/ninecninemedia.py @@ -1,7 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( float_or_none, @@ -99,3 +98,37 @@ class NineCNineMediaIE(InfoExtractor): } return info + + +class CPTwentyFourIE(InfoExtractor): + IE_NAME = 'cp24' + _GEO_COUNTRIES = ['CA'] + _VALID_URL = r'https?://(?:www\.)?cp24\.com/news/(?P<id>[^?#]+)' + + _TESTS = [{ + 'url': 'https://www.cp24.com/news/video-shows-atm-being-ripped-out-of-business-by-pickup-truck-driver-in-mississauga-1.5676877', + 'info_dict': { + 'id': '2328005', + 'ext': 'mp4', + 'title': 'WATCH: Truck rips ATM from Mississauga business', + 'description': 'md5:cf7498480885f080a754389a2b2f7073', + 'timestamp': 1637618377, + 'episode_number': None, + 'season': 'Season 0', + 'season_number': 0, + 'season_id': 57974, + 'series': 'CTV News Toronto', + 'duration': 26.86, + 'thumbnail': 'http://images2.9c9media.com/image_asset/2014_11_5_2eb609a0-475b-0132-fbd6-34b52f6f1279_jpg_2000x1125.jpg', + 'upload_date': '20211122', + }, + 'params': {'skip_download': True, 'format': 'bv'} + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + id, destination = self._search_regex( + r'getAuthStates\("(?P<id>[^"]+)",\s?"(?P<destination>[^"]+)"\);', + webpage, 
'video id and destination', group=('id', 'destination')) + return self.url_result(f'9c9media:{destination}:{id}', ie=NineCNineMediaIE.ie_key(), video_id=id) diff --git a/hypervideo_dl/extractor/nitter.py b/hypervideo_dl/extractor/nitter.py index a0546cd..8bb709c 100644 --- a/hypervideo_dl/extractor/nitter.py +++ b/hypervideo_dl/extractor/nitter.py @@ -5,7 +5,6 @@ from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( parse_count, - unified_strdate, unified_timestamp, remove_end, determine_ext, @@ -25,6 +24,16 @@ class NitterIE(InfoExtractor): 'nitter.v6vgyqpa7yefkorazmg5d5fimstmvm2vtbirt6676mt7qmllrcnwycqd.onion', 'i23nv6w3juvzlw32xzoxcqzktegd4i4fu3nmnc2ewv4ggiu4ledwklad.onion', '26oq3gioiwcmfojub37nz5gzbkdiqp7fue5kvye7d4txv4ny6fb4wwid.onion', + 'vfaomgh4jxphpbdfizkm5gbtjahmei234giqj4facbwhrfjtcldauqad.onion', + 'iwgu3cv7ywf3gssed5iqtavmrlszgsxazkmwwnt4h2kdait75thdyrqd.onion', + 'erpnncl5nhyji3c32dcfmztujtl3xaddqb457jsbkulq24zqq7ifdgad.onion', + 'ckzuw5misyahmg7j5t5xwwuj3bwy62jfolxyux4brfflramzsvvd3syd.onion', + 'jebqj47jgxleaiosfcxfibx2xdahjettuydlxbg64azd4khsxv6kawid.onion', + 'nttr2iupbb6fazdpr2rgbooon2tzbbsvvkagkgkwohhodjzj43stxhad.onion', + 'nitraeju2mipeziu2wtcrqsxg7h62v5y4eqgwi75uprynkj74gevvuqd.onion', + 'nitter.lqs5fjmajyp7rvp4qvyubwofzi6d4imua7vs237rkc4m5qogitqwrgyd.onion', + 'ibsboeui2im5o7dxnik3s5yghufumgy5abevtij5nbizequfpu4qi4ad.onion', + 'ec5nvbycpfa5k6ro77blxgkyrzbkv7uy6r5cngcbkadtjj2733nm3uyd.onion', 'nitter.i2p', 'u6ikd6zndl3c4dsdq4mmujpntgeevdk5qzkfb57r4tnfeccrn2qa.b32.i2p', @@ -36,28 +45,55 @@ class NitterIE(InfoExtractor): 'nitter.42l.fr', 'nitter.pussthecat.org', 'nitter.nixnet.services', - 'nitter.mastodont.cat', - 'nitter.tedomum.net', 'nitter.fdn.fr', 'nitter.1d4.us', 'nitter.kavin.rocks', - 'tweet.lambda.dance', - 'nitter.cc', - 'nitter.vxempire.xyz', 'nitter.unixfox.eu', 'nitter.domain.glass', - 'nitter.himiko.cloud', 'nitter.eu', 'nitter.namazso.eu', - 'nitter.mailstation.de', 'nitter.actionsack.com', - 'nitter.cattube.org', - 'nitter.dark.fail', 'birdsite.xanny.family', - 'nitter.40two.app', - 'nitter.skrep.in', + 'nitter.hu', + 'twitr.gq', + 'nitter.moomoo.me', + 'nittereu.moomoo.me', + 'bird.from.tf', + 'nitter.it', + 'twitter.censors.us', + 'twitter.grimneko.de', + 'nitter.alefvanoon.xyz', + 'n.hyperborea.cloud', + 'nitter.ca', + 'twitter.076.ne.jp', + 'twitter.mstdn.social', + 'nitter.fly.dev', + 'notabird.site', + 'nitter.weiler.rocks', + 'nitter.silkky.cloud', + 'nitter.sethforprivacy.com', + 'nttr.stream', + 'nitter.cutelab.space', + 'nitter.nl', + 'nitter.mint.lgbt', + 'nitter.bus-hit.me', + 'fuckthesacklers.network', + 'nitter.govt.land', + 'nitter.datatunnel.xyz', + 'nitter.esmailelbob.xyz', + 'tw.artemislena.eu', + 'de.nttr.stream', + 'nitter.winscloud.net', + 'nitter.tiekoetter.com', + 'nitter.spaceint.fr', + 'twtr.bch.bar', + 'nitter.exonip.de', + 'nitter.mastodon.pro', + 'nitter.notraxx.ch', + # not in the list anymore + 'nitter.skrep.in', 'nitter.snopyta.org', ) @@ -68,96 +104,121 @@ class NitterIE(InfoExtractor): # official, rate limited 'nitter.net', # offline + 'is-nitter.resolv.ee', + 'lu-nitter.resolv.ee', 'nitter.13ad.de', + 'nitter.40two.app', + 'nitter.cattube.org', + 'nitter.cc', + 'nitter.dark.fail', + 'nitter.himiko.cloud', + 'nitter.koyu.space', + 'nitter.mailstation.de', + 'nitter.mastodont.cat', + 'nitter.tedomum.net', + 'nitter.tokhmi.xyz', 'nitter.weaponizedhumiliation.com', + 'nitter.vxempire.xyz', + 'tweet.lambda.dance', ) INSTANCES = NON_HTTP_INSTANCES + HTTP_INSTANCES + DEAD_INSTANCES - 
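The hunk that follows rebuilds _INSTANCES_RE from the combined instance tuples. Escaping each hostname matters because the names contain dots, which are regex metacharacters. A self-contained sketch with a toy three-entry instance list (the real list is the one above):

    import re

    INSTANCES = ('nitter.net', 'nitter.it', 'twitr.gq')
    instances_re = '(?:%s)' % '|'.join(map(re.escape, INSTANCES))
    valid_url = rf'https?://{instances_re}/(?P<uploader_id>.+)/status/(?P<id>[0-9]+)'

    m = re.match(valid_url, 'https://nitter.net/firefox/status/1314279897502629888')
    print(m.group('id'))  # 1314279897502629888
    # with unescaped dots this would match; with re.escape it does not
    assert not re.match(valid_url, 'https://nitterXnet/firefox/status/1')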
_INSTANCES_RE = '(?:' + '|'.join([re.escape(instance) for instance in INSTANCES]) + ')' - _VALID_URL = r'https?://%(instance)s/(?P<uploader_id>.+)/status/(?P<id>[0-9]+)(#.)?' % {'instance': _INSTANCES_RE} + _INSTANCES_RE = f'(?:{"|".join(map(re.escape, INSTANCES))})' + _VALID_URL = fr'https?://{_INSTANCES_RE}/(?P<uploader_id>.+)/status/(?P<id>[0-9]+)(#.)?' current_instance = random.choice(HTTP_INSTANCES) _TESTS = [ { # GIF (wrapped in mp4) - 'url': 'https://%s/firefox/status/1314279897502629888#m' % current_instance, + 'url': f'https://{current_instance}/firefox/status/1314279897502629888#m', 'info_dict': { 'id': '1314279897502629888', 'ext': 'mp4', - 'title': 'Firefox 🔥 - You know the old saying, if you see something say something. Now you actually can with the YouTube regrets extension. \n\nReport harmful YouTube recommendations so others can avoid watching them. ➡️ https://mzl.la/3iFIiyg\n\n#UnfckTheInternet', - 'description': 'You know the old saying, if you see something say something. Now you actually can with the YouTube regrets extension. \n\nReport harmful YouTube recommendations so others can avoid watching them. ➡️ https://mzl.la/3iFIiyg\n\n#UnfckTheInternet', + 'title': 'md5:7890a9277da4639ab624dd899424c5d8', + 'description': 'md5:5fea96a4d3716c350f8b95b21b3111fe', 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'Firefox 🔥', 'uploader_id': 'firefox', - 'uploader_url': 'https://%s/firefox' % current_instance, + 'uploader_url': f'https://{current_instance}/firefox', 'upload_date': '20201008', 'timestamp': 1602183720, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, }, }, { # normal video - 'url': 'https://%s/Le___Doc/status/1299715685392756737#m' % current_instance, + 'url': f'https://{current_instance}/Le___Doc/status/1299715685392756737#m', 'info_dict': { 'id': '1299715685392756737', 'ext': 'mp4', - 'title': 'Le Doc - "Je ne prédis jamais rien"\nD Raoult, Août 2020...', + 'title': 're:^.* - "Je ne prédis jamais rien"\nD Raoult, Août 2020...', 'description': '"Je ne prédis jamais rien"\nD Raoult, Août 2020...', 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Le Doc', + 'uploader': 're:^Le *Doc', 'uploader_id': 'Le___Doc', - 'uploader_url': 'https://%s/Le___Doc' % current_instance, + 'uploader_url': f'https://{current_instance}/Le___Doc', 'upload_date': '20200829', - 'timestamp': 1598711341, + 'timestamp': 1598711340, 'view_count': int, 'like_count': int, 'repost_count': int, 'comment_count': int, }, }, { # video embed in a "Streaming Political Ads" box - 'url': 'https://%s/mozilla/status/1321147074491092994#m' % current_instance, + 'url': f'https://{current_instance}/mozilla/status/1321147074491092994#m', 'info_dict': { 'id': '1321147074491092994', 'ext': 'mp4', - 'title': "Mozilla - Are you being targeted with weird, ominous or just plain annoying political ads while streaming your favorite shows?\n\nThis isn't a real political ad, but if you're watching streaming TV in the U.S., chances are you've seen quite a few. \n\nLearn more ➡️ https://mzl.la/StreamingAds", - 'description': "Are you being targeted with weird, ominous or just plain annoying political ads while streaming your favorite shows?\n\nThis isn't a real political ad, but if you're watching streaming TV in the U.S., chances are you've seen quite a few. 
\n\nLearn more ➡️ https://mzl.la/StreamingAds", + 'title': 'md5:8290664aabb43b9189145c008386bf12', + 'description': 'md5:9cf2762d49674bc416a191a689fb2aaa', 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'Mozilla', 'uploader_id': 'mozilla', - 'uploader_url': 'https://%s/mozilla' % current_instance, + 'uploader_url': f'https://{current_instance}/mozilla', 'upload_date': '20201027', - 'timestamp': 1603820982 + 'timestamp': 1603820940, + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, }, + 'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'], }, { # not the first tweet but main-tweet - 'url': 'https://%s/TheNaturalNu/status/1379050895539724290#m' % current_instance, + 'url': f'https://{current_instance}/firefox/status/1354848277481414657#m', 'info_dict': { - 'id': '1379050895539724290', + 'id': '1354848277481414657', 'ext': 'mp4', - 'title': 'Dorothy Zbornak - This had me hollering!!', - 'description': 'This had me hollering!!', + 'title': 'md5:bef647f03bd1c6b15b687ea70dfc9700', + 'description': 'md5:5efba25e2f9dac85ebcd21160cb4341f', 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Dorothy Zbornak', - 'uploader_id': 'TheNaturalNu', - 'uploader_url': 'https://%s/TheNaturalNu' % current_instance, - 'timestamp': 1617626329, - 'upload_date': '20210405' + 'uploader': 'Firefox 🔥', + 'uploader_id': 'firefox', + 'uploader_url': f'https://{current_instance}/firefox', + 'upload_date': '20210128', + 'timestamp': 1611855960, + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, } } ] def _real_extract(self, url): - video_id = self._match_id(url) + video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id') parsed_url = compat_urlparse.urlparse(url) - base_url = '%s://%s' % (parsed_url.scheme, parsed_url.netloc) + base_url = f'{parsed_url.scheme}://{parsed_url.netloc}' self._set_cookie(parsed_url.netloc, 'hlsPlayback', 'on') - full_webpage = self._download_webpage(url, video_id) + full_webpage = webpage = self._download_webpage(url, video_id) main_tweet_start = full_webpage.find('class="main-tweet"') if main_tweet_start > 0: webpage = full_webpage[main_tweet_start:] - if not webpage: - webpage = full_webpage - video_url = '%s%s' % (base_url, self._html_search_regex(r'(?:<video[^>]+data-url|<source[^>]+src)="([^"]+)"', webpage, 'video url')) + video_url = '%s%s' % (base_url, self._html_search_regex( + r'(?:<video[^>]+data-url|<source[^>]+src)="([^"]+)"', webpage, 'video url')) ext = determine_ext(video_url) if ext == 'unknown_video': @@ -168,61 +229,49 @@ class NitterIE(InfoExtractor): 'ext': ext }] - title = self._og_search_description(full_webpage) - if not title: - title = self._html_search_regex(r'<div class="tweet-content[^>]+>([^<]+)</div>', webpage, 'title') - description = title + title = description = self._og_search_description(full_webpage) or self._html_search_regex( + r'<div class="tweet-content[^>]+>([^<]+)</div>', webpage, 'title', fatal=False) - mobj = self._match_valid_url(url) - uploader_id = ( - mobj.group('uploader_id') - or self._html_search_regex(r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False) - ) + uploader_id = self._html_search_regex( + r'<a class="username"[^>]+title="@([^"]+)"', webpage, 'uploader id', fatal=False) or uploader_id - if uploader_id: - uploader_url = '%s/%s' % (base_url, uploader_id) + uploader = self._html_search_regex( + r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False) + if 
uploader: + title = f'{uploader} - {title}' - uploader = self._html_search_regex(r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False) + counts = { + f'{x[0]}_count': self._html_search_regex( + fr'<span[^>]+class="icon-{x[1]}[^>]*></span>([^<]*)</div>', + webpage, f'{x[0]} count', fatal=False) + for x in (('view', 'play'), ('like', 'heart'), ('repost', 'retweet'), ('comment', 'comment')) + } + counts = {field: 0 if count == '' else parse_count(count) for field, count in counts.items()} - if uploader: - title = '%s - %s' % (uploader, title) - - view_count = parse_count(self._html_search_regex(r'<span[^>]+class="icon-play[^>]*></span>\s([^<]+)</div>', webpage, 'view count', fatal=False)) - like_count = parse_count(self._html_search_regex(r'<span[^>]+class="icon-heart[^>]*></span>\s([^<]+)</div>', webpage, 'like count', fatal=False)) - repost_count = parse_count(self._html_search_regex(r'<span[^>]+class="icon-retweet[^>]*></span>\s([^<]+)</div>', webpage, 'repost count', fatal=False)) - comment_count = parse_count(self._html_search_regex(r'<span[^>]+class="icon-comment[^>]*></span>\s([^<]+)</div>', webpage, 'repost count', fatal=False)) - - thumbnail = self._html_search_meta('og:image', full_webpage, 'thumbnail url') - if not thumbnail: - thumbnail = '%s%s' % (base_url, self._html_search_regex(r'<video[^>]+poster="([^"]+)"', webpage, 'thumbnail url', fatal=False)) - thumbnail = remove_end(thumbnail, '%3Asmall') - - thumbnails = [] - thumbnail_ids = ('thumb', 'small', 'large', 'medium', 'orig') - for id in thumbnail_ids: - thumbnails.append({ - 'id': id, - 'url': thumbnail + '%3A' + id, - }) - - date = self._html_search_regex(r'<span[^>]+class="tweet-date"[^>]*><a[^>]+title="([^"]+)"', webpage, 'upload date', fatal=False) - upload_date = unified_strdate(date) - timestamp = unified_timestamp(date) + thumbnail = ( + self._html_search_meta('og:image', full_webpage, 'thumbnail url') + or remove_end('%s%s' % (base_url, self._html_search_regex( + r'<video[^>]+poster="([^"]+)"', webpage, 'thumbnail url', fatal=False)), '%3Asmall')) + + thumbnails = [ + {'id': id, 'url': f'{thumbnail}%3A{id}'} + for id in ('thumb', 'small', 'large', 'medium', 'orig') + ] + + date = self._html_search_regex( + r'<span[^>]+class="tweet-date"[^>]*><a[^>]+title="([^"]+)"', + webpage, 'upload date', default='').replace('·', '') return { 'id': video_id, 'title': title, 'description': description, 'uploader': uploader, - 'timestamp': timestamp, + 'timestamp': unified_timestamp(date), 'uploader_id': uploader_id, - 'uploader_url': uploader_url, - 'view_count': view_count, - 'like_count': like_count, - 'repost_count': repost_count, - 'comment_count': comment_count, + 'uploader_url': f'{base_url}/{uploader_id}', 'formats': formats, 'thumbnails': thumbnails, 'thumbnail': thumbnail, - 'upload_date': upload_date, + **counts, } diff --git a/hypervideo_dl/extractor/njpwworld.py b/hypervideo_dl/extractor/njpwworld.py index 3639d14..68c8c8e 100644 --- a/hypervideo_dl/extractor/njpwworld.py +++ b/hypervideo_dl/extractor/njpwworld.py @@ -43,15 +43,7 @@ class NJPWWorldIE(InfoExtractor): _LOGIN_URL = 'https://front.njpwworld.com/auth/login' - def _real_initialize(self): - self._login() - - def _login(self): - username, password = self._get_login_info() - # No authentication to be performed - if not username: - return True - + def _perform_login(self, username, password): # Setup session (will set necessary cookies) self._request_webpage( 'https://njpwworld.com/', None, note='Setting up session') @@ -77,13 +69,8 
@@ class NJPWWorldIE(InfoExtractor): for kind, vid in re.findall(r'if\s+\(\s*imageQualityType\s*==\s*\'([^\']+)\'\s*\)\s*{\s*video_id\s*=\s*"(\d+)"', webpage): player_path = '/intent?id=%s&type=url' % vid player_url = compat_urlparse.urljoin(url, player_path) - formats.append({ - 'url': player_url, - 'format_id': kind, - 'ext': 'mp4', - 'protocol': 'm3u8', - 'quality': 2 if kind == 'high' else 1, - }) + formats += self._extract_m3u8_formats( + player_url, video_id, 'mp4', 'm3u8_native', m3u8_id=kind, fatal=False, quality=int(kind == 'high')) self._sort_formats(formats) diff --git a/hypervideo_dl/extractor/noco.py b/hypervideo_dl/extractor/noco.py index 78c4952..28af909 100644 --- a/hypervideo_dl/extractor/noco.py +++ b/hypervideo_dl/extractor/noco.py @@ -61,14 +61,7 @@ class NocoIE(InfoExtractor): } ] - def _real_initialize(self): - self._login() - - def _login(self): - username, password = self._get_login_info() - if username is None: - return - + def _perform_login(self, username, password): login = self._download_json( self._LOGIN_URL, None, 'Logging in', data=urlencode_postdata({ diff --git a/hypervideo_dl/extractor/noodlemagazine.py b/hypervideo_dl/extractor/noodlemagazine.py new file mode 100644 index 0000000..2f170bb --- /dev/null +++ b/hypervideo_dl/extractor/noodlemagazine.py @@ -0,0 +1,67 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + parse_duration, + parse_count, + unified_strdate +) + + +class NoodleMagazineIE(InfoExtractor): + _VALID_URL = r'https?://(?:www|adult\.)?noodlemagazine\.com/watch/(?P<id>[0-9-_]+)' + _TEST = { + 'url': 'https://adult.noodlemagazine.com/watch/-67421364_456239604', + 'md5': '9e02aa763612929d0b4b850591a9248b', + 'info_dict': { + 'id': '-67421364_456239604', + 'title': 'Aria alexander manojob', + 'thumbnail': r're:^https://.*\.jpg', + 'ext': 'mp4', + 'duration': 903, + 'view_count': int, + 'like_count': int, + 'description': 'Aria alexander manojob', + 'tags': ['aria', 'alexander', 'manojob'], + 'upload_date': '20190218', + 'age_limit': 18 + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + title = self._og_search_title(webpage) + duration = parse_duration(self._html_search_meta('video:duration', webpage, 'duration', default=None)) + description = self._og_search_property('description', webpage, default='').replace(' watch online hight quality video', '') + tags = self._html_search_meta('video:tag', webpage, default='').split(', ') + view_count = parse_count(self._html_search_meta('ya:ovs:views_total', webpage, default=None)) + like_count = parse_count(self._html_search_meta('ya:ovs:likes', webpage, default=None)) + upload_date = unified_strdate(self._html_search_meta('ya:ovs:upload_date', webpage, default='')) + + key = self._html_search_regex(rf'/{video_id}\?(?:.*&)?m=([^&"\'\s,]+)', webpage, 'key') + playlist_info = self._download_json(f'https://adult.noodlemagazine.com/playlist/{video_id}?m={key}', video_id) + thumbnail = self._og_search_property('image', webpage, default=None) or playlist_info.get('image') + + formats = [{ + 'url': source.get('file'), + 'quality': source.get('label'), + 'ext': source.get('type'), + } for source in playlist_info.get('sources')] + + self._sort_formats(formats) + + return { + 'id': video_id, + 'formats': formats, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'description': description, + 'tags': tags, + 'view_count': view_count, + 'like_count': 
like_count, + 'upload_date': upload_date, + 'age_limit': 18 + } diff --git a/hypervideo_dl/extractor/nova.py b/hypervideo_dl/extractor/nova.py index 3acb881..00a64f8 100644 --- a/hypervideo_dl/extractor/nova.py +++ b/hypervideo_dl/extractor/nova.py @@ -10,6 +10,7 @@ from ..utils import ( int_or_none, js_to_json, qualities, + traverse_obj, unified_strdate, url_or_none, ) @@ -17,30 +18,45 @@ from ..utils import ( class NovaEmbedIE(InfoExtractor): _VALID_URL = r'https?://media\.cms\.nova\.cz/embed/(?P<id>[^/?#&]+)' - _TEST = { + _TESTS = [{ 'url': 'https://media.cms.nova.cz/embed/8o0n0r?autoplay=1', - 'md5': 'ee009bafcc794541570edd44b71cbea3', 'info_dict': { 'id': '8o0n0r', - 'ext': 'mp4', 'title': '2180. díl', 'thumbnail': r're:^https?://.*\.jpg', 'duration': 2578, }, - } + 'params': { + 'skip_download': True, + 'ignore_no_formats_error': True, + }, + 'expected_warnings': ['DRM protected', 'Requested format is not available'], + }, { + 'url': 'https://media.cms.nova.cz/embed/KybpWYvcgOa', + 'info_dict': { + 'id': 'KybpWYvcgOa', + 'ext': 'mp4', + 'title': 'Borhyová oslavila 60? Soutěžící z pořadu odboural moderátora Ondřeje Sokola', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 114, + }, + 'params': {'skip_download': 'm3u8'}, + }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + has_drm = False duration = None formats = [] player = self._parse_json( self._search_regex( - r'Player\.init\s*\([^,]+,\s*(?:\w+\s*\?\s*{.+?}\s*:\s*)?({.+})\s*,\s*{.+?}\s*\)\s*;', - webpage, 'player', default='{}'), video_id, fatal=False) + (r'(?:(?:replacePlaceholders|processAdTagModifier).*?:\s*)?(?:replacePlaceholders|processAdTagModifier)\s*\(\s*(?P<json>{.*?})\s*\)(?:\s*\))?\s*,', + r'Player\.init\s*\([^,]+,(?P<cndn>\s*\w+\s*\?)?\s*(?P<json>{(?(cndn).+?|.+)})\s*(?(cndn):|,\s*{.+?}\s*\)\s*;)'), + webpage, 'player', default='{}', group='json'), video_id, fatal=False) if player: for format_id, format_list in player['tracks'].items(): if not isinstance(format_list, list): @@ -48,6 +64,10 @@ class NovaEmbedIE(InfoExtractor): for format_dict in format_list: if not isinstance(format_dict, dict): continue + if (not self.get_param('allow_unplayable_formats') + and traverse_obj(format_dict, ('drm', 'keySystem'))): + has_drm = True + continue format_url = url_or_none(format_dict.get('src')) format_type = format_dict.get('type') ext = determine_ext(format_url) @@ -104,6 +124,8 @@ class NovaEmbedIE(InfoExtractor): f['format_id'] = f_id formats.append(f) + if not formats and has_drm: + self.report_drm(video_id) self._sort_formats(formats) title = self._og_search_title( diff --git a/hypervideo_dl/extractor/novaplay.py b/hypervideo_dl/extractor/novaplay.py index 724986a..bfb2c87 100644 --- a/hypervideo_dl/extractor/novaplay.py +++ b/hypervideo_dl/extractor/novaplay.py @@ -41,9 +41,7 @@ class NovaPlayIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - video_props = self._parse_json(self._search_regex( - r'<script\s?id=\"__NEXT_DATA__\"\s?type=\"application/json\">({.+})</script>', - webpage, 'video_props'), video_id)['props']['pageProps']['video'] + video_props = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['video'] m3u8_url = self._download_json( f'https://nbg-api.fite.tv/api/v2/videos/{video_id}/streams', video_id, headers={'x-flipps-user-agent': 'Flipps/75/9.7'})[0]['url'] diff --git a/hypervideo_dl/extractor/npo.py b/hypervideo_dl/extractor/npo.py index 
ed547d0..a8aaef6 100644 --- a/hypervideo_dl/extractor/npo.py +++ b/hypervideo_dl/extractor/npo.py @@ -467,7 +467,7 @@ class NPOIE(NPOBaseIE): return { 'id': video_id, - 'title': self._live_title(title) if is_live else title, + 'title': title, 'description': metadata.get('info'), 'thumbnail': metadata.get('images', [{'url': None}])[-1]['url'], 'upload_date': unified_strdate(metadata.get('gidsdatum')), @@ -561,7 +561,7 @@ class NPORadioIE(InfoExtractor): return { 'id': video_id, 'url': stream['url'], - 'title': self._live_title(title), + 'title': title, 'acodec': codec, 'ext': codec, 'is_live': True, diff --git a/hypervideo_dl/extractor/npr.py b/hypervideo_dl/extractor/npr.py index 9d1122f..49f062d 100644 --- a/hypervideo_dl/extractor/npr.py +++ b/hypervideo_dl/extractor/npr.py @@ -91,7 +91,8 @@ class NprIE(InfoExtractor): elif format_id == 'smil': smil_formats = self._extract_smil_formats( format_url, media_id, transform_source=lambda s: s.replace( - 'rtmp://flash.npr.org/ondemand/', 'https://ondemand.npr.org/')) + 'rtmp://flash.npr.org/ondemand/', 'https://ondemand.npr.org/'), + fatal=False) self._check_formats(smil_formats, media_id) formats.extend(smil_formats) else: diff --git a/hypervideo_dl/extractor/nrk.py b/hypervideo_dl/extractor/nrk.py index b556bc6..4d723e8 100644 --- a/hypervideo_dl/extractor/nrk.py +++ b/hypervideo_dl/extractor/nrk.py @@ -8,6 +8,7 @@ import re from .common import InfoExtractor from ..compat import compat_str from ..utils import ( + compat_HTTPError, determine_ext, ExtractorError, int_or_none, @@ -147,10 +148,14 @@ class NRKIE(NRKBaseIE): def _real_extract(self, url): video_id = self._match_id(url).split('/')[-1] - path_templ = 'playback/%s/' + video_id - def call_playback_api(item, query=None): - return self._call_api(path_templ % item, video_id, item, query=query) + try: + return self._call_api(f'playback/{item}/program/{video_id}', video_id, item, query=query) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + return self._call_api(f'playback/{item}/{video_id}', video_id, item, query=query) + raise + # known values for preferredCdn: akamai, iponly, minicdn and telenor manifest = call_playback_api('manifest', {'preferredCdn': 'akamai'}) @@ -188,7 +193,7 @@ class NRKIE(NRKBaseIE): title = titles['title'] alt_title = titles.get('subtitle') - description = preplay.get('description') + description = try_get(preplay, lambda x: x['description'].replace('\r', '\n')) duration = parse_duration(playable.get('duration')) or parse_duration(data.get('duration')) thumbnails = [] diff --git a/hypervideo_dl/extractor/nrl.py b/hypervideo_dl/extractor/nrl.py index 22a2df8..0bd5086 100644 --- a/hypervideo_dl/extractor/nrl.py +++ b/hypervideo_dl/extractor/nrl.py @@ -16,7 +16,6 @@ class NRLTVIE(InfoExtractor): 'params': { # m3u8 download 'skip_download': True, - 'format': 'bestvideo', }, } diff --git a/hypervideo_dl/extractor/ntvcojp.py b/hypervideo_dl/extractor/ntvcojp.py index 0c8221b..c9af911 100644 --- a/hypervideo_dl/extractor/ntvcojp.py +++ b/hypervideo_dl/extractor/ntvcojp.py @@ -3,8 +3,9 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - js_to_json, + ExtractorError, smuggle_url, + traverse_obj, ) @@ -19,7 +20,7 @@ class NTVCoJpCUIE(InfoExtractor): 'ext': 'mp4', 'title': '桜エビと炒り卵がポイント! 
「中華風 エビチリおにぎり」──『美虎』五十嵐美幸', 'upload_date': '20181213', - 'description': 'md5:211b52f4fd60f3e0e72b68b0c6ba52a9', + 'description': 'md5:1985b51a9abc285df0104d982a325f2a', 'uploader_id': '3855502814001', 'timestamp': 1544669941, }, @@ -28,22 +29,30 @@ class NTVCoJpCUIE(InfoExtractor): 'skip_download': True, }, } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - player_config = self._parse_json(self._search_regex( - r'(?s)PLAYER_CONFIG\s*=\s*({.+?})', - webpage, 'player config'), display_id, js_to_json) - video_id = player_config['videoId'] - account_id = player_config.get('account') or '3855502814001' + player_config = self._search_nuxt_data(webpage, display_id) + video_id = traverse_obj(player_config, ('movie', 'video_id')) + if not video_id: + raise ExtractorError('Failed to extract video ID for Brightcove') + account_id = traverse_obj(player_config, ('player', 'account')) or '3855502814001' + title = traverse_obj(player_config, ('movie', 'name')) + if not title: + og_title = self._og_search_title(webpage, fatal=False) or traverse_obj(player_config, ('player', 'title')) + if og_title: + title = og_title.split('(', 1)[0].strip() + description = (traverse_obj(player_config, ('movie', 'description')) + or self._html_search_meta(['description', 'og:description'], webpage)) return { '_type': 'url_transparent', 'id': video_id, 'display_id': display_id, - 'title': self._search_regex(r'<h1[^>]+class="title"[^>]*>([^<]+)', webpage, 'title').strip(), - 'description': self._html_search_meta(['description', 'og:description'], webpage), + 'title': title, + 'description': description, 'url': smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % (account_id, video_id), {'geo_countries': ['JP']}), 'ie_key': 'BrightcoveNew', } diff --git a/hypervideo_dl/extractor/nuvid.py b/hypervideo_dl/extractor/nuvid.py index 7487824..84fb97d 100644 --- a/hypervideo_dl/extractor/nuvid.py +++ b/hypervideo_dl/extractor/nuvid.py @@ -1,11 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals +import re from .common import InfoExtractor from ..utils import ( parse_duration, int_or_none, - try_get, + strip_or_none, + traverse_obj, + url_or_none, ) @@ -20,14 +23,30 @@ class NuvidIE(InfoExtractor): 'title': 'italian babe', 'duration': 321.0, 'age_limit': 18, + 'thumbnail': r're:https?://.+\.jpg', } }, { 'url': 'https://m.nuvid.com/video/6523263', + 'md5': 'ebd22ce8e47e1d9a4d0756a15c67da52', 'info_dict': { 'id': '6523263', 'ext': 'mp4', - 'age_limit': 18, 'title': 'Slut brunette college student anal dorm', + 'duration': 421.0, + 'age_limit': 18, + 'thumbnail': r're:https?://.+\.jpg', + 'thumbnails': list, + } + }, { + 'url': 'http://m.nuvid.com/video/6415801/', + 'md5': '638d5ececb138d5753593f751ae3f697', + 'info_dict': { + 'id': '6415801', + 'ext': 'mp4', + 'title': 'My best friend wanted to fuck my wife for a long time', + 'duration': 1882, + 'age_limit': 18, + 'thumbnail': r're:https?://.+\.jpg', } }] @@ -46,6 +65,16 @@ class NuvidIE(InfoExtractor): 'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8', }) + webpage = self._download_webpage( + 'http://m.nuvid.com/video/%s' % (video_id, ), + video_id, 'Downloading video page', fatal=False) or '' + + title = strip_or_none(video_data.get('title') or self._html_search_regex( + (r'''<span\s[^>]*?\btitle\s*=\s*(?P<q>"|'|\b)(?P<title>[^"]+)(?P=q)\s*>''', + 
r'''<div\s[^>]*?\bclass\s*=\s*(?P<q>"|'|\b)thumb-holder video(?P=q)>\s*<h5\b[^>]*>(?P<title>[^<]+)</h5''', + r'''<span\s[^>]*?\bclass\s*=\s*(?P<q>"|'|\b)title_thumb(?P=q)>(?P<title>[^<]+)</span'''), + webpage, 'title', group='title')) + formats = [{ 'url': source, 'format_id': qualities.get(quality), @@ -55,19 +84,19 @@ class NuvidIE(InfoExtractor): self._check_formats(formats, video_id) self._sort_formats(formats) - title = video_data.get('title') - thumbnail_base_url = try_get(video_data, lambda x: x['thumbs']['url']) - thumbnail_extension = try_get(video_data, lambda x: x['thumbs']['extension']) - thumbnail_id = self._search_regex( - r'/media/videos/tmb/6523263/preview/(/d+)' + thumbnail_extension, video_data.get('poster', ''), 'thumbnail id', default=19) - thumbnail = f'{thumbnail_base_url}player/{thumbnail_id}{thumbnail_extension}' - duration = parse_duration(video_data.get('duration') or video_data.get('duration_format')) + duration = parse_duration(traverse_obj(video_data, 'duration', 'duration_format')) + thumbnails = [ + {'url': thumb_url} for thumb_url in re.findall( + r'<div\s+class\s*=\s*"video-tmb-wrap"\s*>\s*<img\s+src\s*=\s*"([^"]+)"\s*/>', webpage) + if url_or_none(thumb_url)] + if url_or_none(video_data.get('poster')): + thumbnails.append({'url': video_data['poster'], 'preference': 1}) return { 'id': video_id, 'formats': formats, 'title': title, - 'thumbnail': thumbnail, + 'thumbnails': thumbnails, 'duration': duration, 'age_limit': 18, } diff --git a/hypervideo_dl/extractor/odnoklassniki.py b/hypervideo_dl/extractor/odnoklassniki.py index 9cacd38..293f1aa 100644 --- a/hypervideo_dl/extractor/odnoklassniki.py +++ b/hypervideo_dl/extractor/odnoklassniki.py @@ -12,6 +12,7 @@ from ..compat import ( ) from ..utils import ( ExtractorError, + float_or_none, unified_strdate, int_or_none, qualities, @@ -34,6 +35,38 @@ class OdnoklassnikiIE(InfoExtractor): (?P<id>[\d-]+) ''' _TESTS = [{ + 'note': 'Coub embedded', + 'url': 'http://ok.ru/video/1484130554189', + 'info_dict': { + 'id': '1keok9', + 'ext': 'mp4', + 'timestamp': 1545580896, + 'view_count': int, + 'thumbnail': 'https://coub-anubis-a.akamaized.net/coub_storage/coub/simple/cw_image/c5ac87553bd/608e806a1239c210ab692/1545580913_00026.jpg', + 'title': 'Народная забава', + 'uploader': 'Nevata', + 'upload_date': '20181223', + 'age_limit': 0, + 'uploader_id': 'nevata.s', + 'like_count': int, + 'duration': 8.08, + 'repost_count': int, + }, + }, { + 'note': 'vk.com embedded', + 'url': 'https://ok.ru/video/3568183087575', + 'info_dict': { + 'id': '-165101755_456243749', + 'ext': 'mp4', + 'uploader_id': '-165101755', + 'duration': 132, + 'timestamp': 1642869935, + 'upload_date': '20220122', + 'thumbnail': str, + 'title': str, + 'uploader': str, + }, + }, { # metadata in JSON 'url': 'http://ok.ru/video/20079905452', 'md5': '0b62089b479e06681abaaca9d204f152', @@ -97,6 +130,14 @@ class OdnoklassnikiIE(InfoExtractor): }, 'skip': 'Video has not been found', }, { + 'note': 'Only available in mobile webpage', + 'url': 'https://m.ok.ru/video/2361249957145', + 'info_dict': { + 'id': '2361249957145', + 'title': 'Быковское крещение', + 'duration': 3038.181, + }, + }, { 'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452', 'only_matching': True, }, { @@ -131,13 +172,24 @@ class OdnoklassnikiIE(InfoExtractor): return mobj.group('url') def _real_extract(self, url): + try: + return self._extract_desktop(url) + except ExtractorError as e: + try: + return self._extract_mobile(url) + except ExtractorError: + # error message of desktop webpage is 
in English + raise e + + def _extract_desktop(self, url): start_time = int_or_none(compat_parse_qs( compat_urllib_parse_urlparse(url).query).get('fromTime', [None])[0]) video_id = self._match_id(url) webpage = self._download_webpage( - 'http://ok.ru/video/%s' % video_id, video_id) + 'http://ok.ru/video/%s' % video_id, video_id, + note='Downloading desktop webpage') error = self._search_regex( r'[^>]+class="vp_video_stub_txt"[^>]*>([^<]+)<', @@ -151,6 +203,10 @@ class OdnoklassnikiIE(InfoExtractor): webpage, 'player', group='player')), video_id) + # embedded external player + if player.get('isExternalPlayer') and player.get('url'): + return self.url_result(player['url']) + flashvars = player['flashvars'] metadata = flashvars.get('metadata') @@ -206,6 +262,14 @@ class OdnoklassnikiIE(InfoExtractor): 'start_time': start_time, } + # pladform + if provider == 'OPEN_GRAPH': + info.update({ + '_type': 'url_transparent', + 'url': movie['contentId'], + }) + return info + if provider == 'USER_YOUTUBE': info.update({ '_type': 'url_transparent', @@ -215,7 +279,7 @@ class OdnoklassnikiIE(InfoExtractor): assert title if provider == 'LIVE_TV_APP': - info['title'] = self._live_title(title) + info['title'] = title quality = qualities(('4', '0', '1', '2', '3', '5')) @@ -265,3 +329,32 @@ class OdnoklassnikiIE(InfoExtractor): info['formats'] = formats return info + + def _extract_mobile(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'http://m.ok.ru/video/%s' % video_id, video_id, + note='Downloading mobile webpage') + + error = self._search_regex( + r'видео</a>\s*<div\s+class="empty">(.+?)</div>', + webpage, 'error', default=None) + if error: + raise ExtractorError(error, expected=True) + + json_data = self._search_regex( + r'data-video="(.+?)"', webpage, 'json data') + json_data = self._parse_json(unescapeHTML(json_data), video_id) or {} + + return { + 'id': video_id, + 'title': json_data.get('videoName'), + 'duration': float_or_none(json_data.get('videoDuration'), scale=1000), + 'thumbnail': json_data.get('videoPosterSrc'), + 'formats': [{ + 'format_id': 'mobile', + 'url': json_data.get('videoSrc'), + 'ext': 'mp4', + }] + } diff --git a/hypervideo_dl/extractor/oktoberfesttv.py b/hypervideo_dl/extractor/oktoberfesttv.py index a914068..2765674 100644 --- a/hypervideo_dl/extractor/oktoberfesttv.py +++ b/hypervideo_dl/extractor/oktoberfesttv.py @@ -25,8 +25,8 @@ class OktoberfestTVIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._live_title(self._html_search_regex( - r'<h1><strong>.*?</strong>(.*?)</h1>', webpage, 'title')) + title = self._html_search_regex( + r'<h1><strong>.*?</strong>(.*?)</h1>', webpage, 'title') clip = self._search_regex( r"clip:\s*\{\s*url:\s*'([^']+)'", webpage, 'clip') diff --git a/hypervideo_dl/extractor/olympics.py b/hypervideo_dl/extractor/olympics.py index 0bc9206..784f282 100644 --- a/hypervideo_dl/extractor/olympics.py +++ b/hypervideo_dl/extractor/olympics.py @@ -2,22 +2,27 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import unified_strdate +from ..utils import ( + int_or_none, + try_get +) class OlympicsReplayIE(InfoExtractor): - _VALID_URL = r'(?:https?://)(?:www\.)?olympics\.com/tokyo-2020/(?:[a-z]{2}/)?replay/(?P<id>[^/#&?]+)' + _VALID_URL = r'https?://(?:www\.)?olympics\.com(?:/tokyo-2020)?/[a-z]{2}/(?:replay|video)/(?P<id>[^/#&?]+)' _TESTS = [{ - 'url': 
'https://olympics.com/tokyo-2020/en/replay/300622eb-abc0-43ea-b03b-c5f2d429ec7b/jumping-team-qualifier', + 'url': 'https://olympics.com/fr/video/men-s-109kg-group-a-weightlifting-tokyo-2020-replays', 'info_dict': { - 'id': '300622eb-abc0-43ea-b03b-c5f2d429ec7b', + 'id': 'f6a0753c-8e6f-4b7d-a435-027054a4f8e9', 'ext': 'mp4', - 'title': 'Jumping Team Qualifier', - 'release_date': '20210806', - 'upload_date': '20210713', + 'title': '+109kg (H) Groupe A - Haltérophilie | Replay de Tokyo 2020', + 'upload_date': '20210801', + 'timestamp': 1627783200, + 'description': 'md5:c66af4a5bc7429dbcc43d15845ff03b3', + 'uploader': 'International Olympic Committee', }, 'params': { - 'format': 'bv', + 'skip_download': True, }, }, { 'url': 'https://olympics.com/tokyo-2020/en/replay/bd242924-4b22-49a5-a846-f1d4c809250d/mens-bronze-medal-match-hun-esp', @@ -26,31 +31,39 @@ class OlympicsReplayIE(InfoExtractor): def _real_extract(self, url): id = self._match_id(url) - # The parameters are hardcoded in the webpage, it's not necessary to download the webpage just for these parameters. - # If in downloading webpage serves other functions aswell, then extract these parameters from it. - token_url = 'https://appovptok.ovpobs.tv/api/identity/app/token?api_key=OTk5NDcxOjpvY3N3LWFwaXVzZXI%3D&api_secret=ODY4ODM2MjE3ODMwYmVjNTAxMWZlMDJiMTYxZmY0MjFiMjMwMjllMjJmNDA1YWRiYzA5ODcxYTZjZTljZDkxOTo6NTM2NWIzNjRlMTM1ZmI2YWNjNmYzMGMzOGM3NzZhZTY%3D' - token = self._download_webpage(token_url, id) - headers = {'x-obs-app-token': token} - data_json = self._download_json(f'https://appocswtok.ovpobs.tv/api/schedule-sessions/{id}?include=stream', - id, headers=headers) - meta_data = data_json['data']['attributes'] - for t_dict in data_json['included']: - if t_dict.get('type') == 'Stream': - stream_data = t_dict['attributes'] + + webpage = self._download_webpage(url, id) + title = self._html_search_meta(('title', 'og:title', 'twitter:title'), webpage) + uuid = self._html_search_meta('episode_uid', webpage) + m3u8_url = self._html_search_meta('video_url', webpage) + json_ld = self._search_json_ld(webpage, uuid) + thumbnails_list = json_ld.get('image') + if not thumbnails_list: + thumbnails_list = self._html_search_regex( + r'["\']image["\']:\s*["\']([^"\']+)["\']', webpage, 'images', default='') + thumbnails_list = thumbnails_list.replace('[', '').replace(']', '').split(',') + thumbnails_list = [thumbnail.strip() for thumbnail in thumbnails_list] + thumbnails = [] + for thumbnail in thumbnails_list: + width_a, height_a, width = self._search_regex( + r'/images/image/private/t_(?P<width_a>\d+)-(?P<height_a>\d+)_(?P<width>\d+)/primary/[\W\w\d]+', + thumbnail, 'thumb', group=(1, 2, 3), default=(None, None, None)) + width_a, height_a, width = int_or_none(width_a), int_or_none(height_a), int_or_none(width) + thumbnails.append({ + 'url': thumbnail, + 'width': width, + 'height': int_or_none(try_get(width, lambda x: x * height_a / width_a)) + }) m3u8_url = self._download_json( - 'https://meteringtok.ovpobs.tv/api/playback-sessions', id, headers=headers, query={ - 'alias': stream_data['alias'], - 'stream': stream_data['stream'], - 'type': 'vod' - })['data']['attributes']['url'] - formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, id) + f'https://olympics.com/tokenGenerator?url={m3u8_url}', uuid, note='Downloading m3u8 url') + formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, uuid, m3u8_id='hls') self._sort_formats(formats) return { - 'id': id, - 'title': meta_data['title'], - 'release_date': 
unified_strdate(meta_data.get('start') or meta_data.get('broadcastPublished')), - 'upload_date': unified_strdate(meta_data.get('publishedAt')), + 'id': uuid, + 'title': title, + 'thumbnails': thumbnails, 'formats': formats, 'subtitles': subtitles, + **json_ld } diff --git a/hypervideo_dl/extractor/ondemandkorea.py b/hypervideo_dl/extractor/ondemandkorea.py index cc3c587..e933ea2 100644 --- a/hypervideo_dl/extractor/ondemandkorea.py +++ b/hypervideo_dl/extractor/ondemandkorea.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -71,8 +73,8 @@ class OnDemandKoreaIE(InfoExtractor): jw_config = self._parse_json( self._search_regex( - r'(?s)odkPlayer\.init.*?(?P<options>{[^;]+}).*?;', - webpage, 'jw config', group='options'), + r'playlist\s*=\s*\[(?P<options>.+)];?$', + webpage, 'jw config', flags=re.MULTILINE, group='options'), video_id, transform_source=js_to_json) info = self._parse_jwplayer_data( jw_config, video_id, require_title=False, m3u8_id='hls', diff --git a/hypervideo_dl/extractor/onefootball.py b/hypervideo_dl/extractor/onefootball.py new file mode 100644 index 0000000..826faad --- /dev/null +++ b/hypervideo_dl/extractor/onefootball.py @@ -0,0 +1,51 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class OneFootballIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?onefootball\.com/[a-z]{2}/video/[^/&?#]+-(?P<id>\d+)' + + _TESTS = [{ + 'url': 'https://onefootball.com/en/video/highlights-fc-zuerich-3-3-fc-basel-34012334', + 'info_dict': { + 'id': '34012334', + 'ext': 'mp4', + 'title': 'Highlights: FC Zürich 3-3 FC Basel', + 'description': 'md5:33d9855cb790702c4fe42a513700aba8', + 'thumbnail': 'https://photobooth-api.onefootball.com/api/screenshot/https:%2F%2Fperegrine-api.onefootball.com%2Fv2%2Fphotobooth%2Fcms%2Fen%2F34012334', + 'timestamp': 1635874604, + 'upload_date': '20211102' + }, + 'params': {'skip_download': True} + }, { + 'url': 'https://onefootball.com/en/video/klopp-fumes-at-var-decisions-in-west-ham-defeat-34041020', + 'info_dict': { + 'id': '34041020', + 'ext': 'mp4', + 'title': 'Klopp fumes at VAR decisions in West Ham defeat', + 'description': 'md5:9c50371095a01ad3f63311c73d8f51a5', + 'thumbnail': 'https://photobooth-api.onefootball.com/api/screenshot/https:%2F%2Fperegrine-api.onefootball.com%2Fv2%2Fphotobooth%2Fcms%2Fen%2F34041020', + 'timestamp': 1636314103, + 'upload_date': '20211107' + }, + 'params': {'skip_download': True} + }] + + def _real_extract(self, url): + id = self._match_id(url) + webpage = self._download_webpage(url, id) + data_json = self._search_json_ld(webpage, id) + m3u8_url = self._html_search_regex(r'(https://cdn\.jwplayer\.com/manifests/.+\.m3u8)', webpage, 'm3u8_url') + formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, id) + self._sort_formats(formats) + return { + 'id': id, + 'title': data_json.get('title'), + 'description': data_json.get('description'), + 'thumbnail': data_json.get('thumbnail'), + 'timestamp': data_json.get('timestamp'), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/hypervideo_dl/extractor/onet.py b/hypervideo_dl/extractor/onet.py index bf53ea0..95177a2 100644 --- a/hypervideo_dl/extractor/onet.py +++ b/hypervideo_dl/extractor/onet.py @@ -182,14 +182,9 @@ class OnetChannelIE(OnetBaseIE): video_id = remove_start(current_clip_info['ckmId'], 'mvp:') video_name = url_basename(current_clip_info['url']) - if 
self.get_param('noplaylist'): - self.to_screen( - 'Downloading just video %s because of --no-playlist' % video_name) + if not self._yes_playlist(channel_id, video_name, playlist_label='channel'): return self._extract_from_id(video_id, webpage) - self.to_screen( - 'Downloading channel %s - add --no-playlist to just download video %s' % ( - channel_id, video_name)) matches = re.findall( r'<a[^>]+href=[\'"](%s[a-z]+/[0-9a-z-]+/[0-9a-z]+)' % self._URL_BASE_RE, webpage) diff --git a/hypervideo_dl/extractor/opencast.py b/hypervideo_dl/extractor/opencast.py new file mode 100644 index 0000000..cf8d917 --- /dev/null +++ b/hypervideo_dl/extractor/opencast.py @@ -0,0 +1,177 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + parse_iso8601, + traverse_obj, + variadic, +) + + +class OpencastBaseIE(InfoExtractor): + _INSTANCES_RE = r'''(?: + opencast\.informatik\.kit\.edu| + electures\.uni-muenster\.de| + oc-presentation\.ltcc\.tuwien\.ac\.at| + medien\.ph-noe\.ac\.at| + oc-video\.ruhr-uni-bochum\.de| + oc-video1\.ruhr-uni-bochum\.de| + opencast\.informatik\.uni-goettingen\.de| + heicast\.uni-heidelberg\.de| + opencast\.hawk\.de:8080| + opencast\.hs-osnabrueck\.de| + video[0-9]+\.virtuos\.uni-osnabrueck\.de| + opencast\.uni-koeln\.de| + media\.opencast\.hochschule-rhein-waal\.de| + matterhorn\.dce\.harvard\.edu| + hs-harz\.opencast\.uni-halle\.de| + videocampus\.urz\.uni-leipzig\.de| + media\.uct\.ac\.za| + vid\.igb\.illinois\.edu| + cursosabertos\.c3sl\.ufpr\.br| + mcmedia\.missioncollege\.org| + clases\.odon\.edu\.uy + )''' + _UUID_RE = r'[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}' + + def _call_api(self, host, video_id, **kwargs): + return self._download_json(self._API_BASE % (host, video_id), video_id, **kwargs) + + def _parse_mediapackage(self, video): + video_id = video.get('id') + if video_id is None: + raise ExtractorError('Video id was not found') + + formats = [] + for track in variadic(traverse_obj(video, ('media', 'track')) or []): + href = track.get('url') + if href is None: + continue + ext = determine_ext(href, None) + + transport = track.get('transport') + + if transport == 'DASH' or ext == 'mpd': + formats.extend(self._extract_mpd_formats_and_subtitles(href, video_id, mpd_id='dash', fatal=False)) + elif transport == 'HLS' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats_and_subtitles( + href, video_id, m3u8_id='hls', entry_protocol='m3u8_native', fatal=False)) + elif transport == 'HDS' or ext == 'f4m': + formats.extend(self._extract_f4m_formats(href, video_id, f4m_id='hds', fatal=False)) + elif transport == 'SMOOTH': + formats.extend(self._extract_ism_formats(href, video_id, ism_id='smooth', fatal=False)) + elif ext == 'smil': + formats.extend(self._extract_smil_formats(href, video_id, fatal=False)) + else: + track_obj = { + 'url': href, + 'ext': ext, + 'format_note': track.get('transport'), + 'resolution': traverse_obj(track, ('video', 'resolution')), + 'fps': int_or_none(traverse_obj(track, ('video', 'framerate'))), + 'vbr': int_or_none(traverse_obj(track, ('video', 'bitrate')), scale=1000), + 'vcodec': traverse_obj(track, ('video', 'encoder', 'type')) if track.get('video') else 'none', + 'abr': int_or_none(traverse_obj(track, ('audio', 'bitrate')), scale=1000), + 'asr': int_or_none(traverse_obj(track, ('audio', 'samplingrate'))), + 'acodec': traverse_obj(track, ('audio', 'encoder', 'type')) if track.get('audio') 
else 'none', + } + + if transport == 'RTMP': + m_obj = re.search(r'(?:rtmp://[^/]+/(?P<app>[^/]+))/(?P<ext>.+):(?P<playpath>.+)', href) + if not m_obj: + continue + track_obj.update({ + 'app': m_obj.group('app'), + 'ext': m_obj.group('ext'), + 'play_path': m_obj.group('ext') + ':' + m_obj.group('playpath'), + 'rtmp_live': True, + 'preference': -2, + }) + formats.append(track_obj) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'formats': formats, + 'title': video.get('title'), + 'series': video.get('seriestitle'), + 'season_id': video.get('series'), + 'creator': traverse_obj(video, ('creators', 'creator')), + 'timestamp': parse_iso8601(video.get('start')), + 'thumbnail': traverse_obj(video, ('attachments', 'attachment', ..., 'url'), get_all=False), + } + + +class OpencastIE(OpencastBaseIE): + _VALID_URL = r'''(?x) + https?://(?P<host>%s)/paella/ui/watch.html\?.*? + id=(?P<id>%s) + ''' % (OpencastBaseIE._INSTANCES_RE, OpencastBaseIE._UUID_RE) + + _API_BASE = 'https://%s/search/episode.json?id=%s' + + _TESTS = [ + { + 'url': 'https://oc-video1.ruhr-uni-bochum.de/paella/ui/watch.html?id=ed063cd5-72c8-46b5-a60a-569243edcea8', + 'md5': '554c8e99a90f7be7e874619fcf2a3bc9', + 'info_dict': { + 'id': 'ed063cd5-72c8-46b5-a60a-569243edcea8', + 'ext': 'mp4', + 'title': '11 - Kryptographie - 24.11.2015', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1606208400, + 'upload_date': '20201124', + }, + } + ] + + def _real_extract(self, url): + host, video_id = self._match_valid_url(url).group('host', 'id') + return self._parse_mediapackage( + self._call_api(host, video_id)['search-results']['result']['mediapackage']) + + +class OpencastPlaylistIE(OpencastBaseIE): + _VALID_URL = r'''(?x) + https?://(?P<host>%s)/engage/ui/index.html\?.*? + epFrom=(?P<id>%s) + ''' % (OpencastBaseIE._INSTANCES_RE, OpencastBaseIE._UUID_RE) + + _API_BASE = 'https://%s/search/episode.json?sid=%s' + + _TESTS = [ + { + 'url': 'https://oc-video1.ruhr-uni-bochum.de/engage/ui/index.html?epFrom=cf68a4a1-36b1-4a53-a6ba-61af5705a0d0', + 'info_dict': { + 'id': 'cf68a4a1-36b1-4a53-a6ba-61af5705a0d0', + 'title': 'Kryptographie - WiSe 15/16', + }, + 'playlist_mincount': 28, + }, + { + 'url': 'https://oc-video.ruhr-uni-bochum.de/engage/ui/index.html?e=1&p=1&epFrom=b1a54262-3684-403f-9731-8e77c3766f9a', + 'info_dict': { + 'id': 'b1a54262-3684-403f-9731-8e77c3766f9a', + 'title': 'inSTUDIES-Social movements and prefigurative politics in a global perspective', + }, + 'playlist_mincount': 6, + }, + ] + + def _real_extract(self, url): + host, video_id = self._match_valid_url(url).group('host', 'id') + + entries = [ + self._parse_mediapackage(episode['mediapackage']) + for episode in variadic(self._call_api(host, video_id)['search-results']['result']) + if episode.get('mediapackage') + ] + + return self.playlist_result(entries, video_id, traverse_obj(entries, (0, 'series'))) diff --git a/hypervideo_dl/extractor/openload.py b/hypervideo_dl/extractor/openload.py index dfdd0e5..fe4740a 100644 --- a/hypervideo_dl/extractor/openload.py +++ b/hypervideo_dl/extractor/openload.py @@ -16,8 +16,7 @@ from ..utils import ( ExtractorError, get_exe_version, is_outdated_version, - std_headers, - process_communicate_or_kill, + Popen, ) @@ -208,7 +207,7 @@ class PhantomJSwrapper(object): replaces = self.options replaces['url'] = url - user_agent = headers.get('User-Agent') or std_headers['User-Agent'] + user_agent = headers.get('User-Agent') or self.extractor.get_param('http_headers')['User-Agent'] replaces['ua'] = user_agent.replace('"', 
'\\"') replaces['jscode'] = jscode @@ -223,11 +222,10 @@ class PhantomJSwrapper(object): else: self.extractor.to_screen('%s: %s' % (video_id, note2)) - p = subprocess.Popen([ - self.exe, '--ssl-protocol=any', - self._TMP_FILES['script'].name - ], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - out, err = process_communicate_or_kill(p) + p = Popen( + [self.exe, '--ssl-protocol=any', self._TMP_FILES['script'].name], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + out, err = p.communicate_or_kill() if p.returncode != 0: raise ExtractorError( 'Executing JS failed\n:' + encodeArgument(err)) diff --git a/hypervideo_dl/extractor/openrec.py b/hypervideo_dl/extractor/openrec.py index d7073ab..5eb1cdb 100644 --- a/hypervideo_dl/extractor/openrec.py +++ b/hypervideo_dl/extractor/openrec.py @@ -4,14 +4,71 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( ExtractorError, + get_first, + int_or_none, traverse_obj, try_get, - unified_strdate + unified_strdate, + unified_timestamp, ) from ..compat import compat_str -class OpenRecIE(InfoExtractor): +class OpenRecBaseIE(InfoExtractor): + def _extract_pagestore(self, webpage, video_id): + return self._parse_json( + self._search_regex(r'(?m)window\.pageStore\s*=\s*(\{.+?\});$', webpage, 'window.pageStore'), video_id) + + def _expand_media(self, video_id, media): + for name, m3u8_url in (media or {}).items(): + if not m3u8_url: + continue + yield from self._extract_m3u8_formats( + m3u8_url, video_id, ext='mp4', m3u8_id=name) + + def _extract_movie(self, webpage, video_id, name, is_live): + window_stores = self._extract_pagestore(webpage, video_id) + movie_stores = [ + # extract all three important data (most of data are duplicated each other, but slightly different!) 
+ traverse_obj(window_stores, ('v8', 'state', 'movie'), expected_type=dict), + traverse_obj(window_stores, ('v8', 'movie'), expected_type=dict), + traverse_obj(window_stores, 'movieStore', expected_type=dict), + ] + if not any(movie_stores): + raise ExtractorError(f'Failed to extract {name} info') + + formats = list(self._expand_media(video_id, get_first(movie_stores, 'media'))) + if not formats and is_live: + # archived livestreams + cookies = self._get_cookies('https://www.openrec.tv/') + detail = self._download_json( + f'https://apiv5.openrec.tv/api/v5/movies/{video_id}/detail', video_id, + headers={ + 'Origin': 'https://www.openrec.tv', + 'Referer': 'https://www.openrec.tv/', + 'access-token': try_get(cookies, lambda x: x.get('access_token').value), + 'uuid': try_get(cookies, lambda x: x.get('uuid').value), + }) + new_media = traverse_obj(detail, ('data', 'items', ..., 'media'), get_all=False) + formats = list(self._expand_media(video_id, new_media)) + is_live = False + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': get_first(movie_stores, 'title'), + 'description': get_first(movie_stores, 'introduction'), + 'thumbnail': get_first(movie_stores, 'thumbnailUrl'), + 'formats': formats, + 'uploader': get_first(movie_stores, ('channel', 'user', 'name')), + 'uploader_id': get_first(movie_stores, ('channel', 'user', 'id')), + 'timestamp': int_or_none(get_first(movie_stores, ['publishedAt', 'time']), scale=1000) or unified_timestamp(get_first(movie_stores, 'publishedAt')), + 'is_live': is_live, + } + + +class OpenRecIE(OpenRecBaseIE): IE_NAME = 'openrec' _VALID_URL = r'https?://(?:www\.)?openrec\.tv/live/(?P<id>[^/]+)' _TESTS = [{ @@ -24,53 +81,12 @@ class OpenRecIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage('https://www.openrec.tv/live/%s' % video_id, video_id) - - window_stores = self._parse_json( - self._search_regex(r'(?m)window\.pageStore\s*=\s*(\{.+?\});$', webpage, 'window.pageStore'), video_id) - movie_store = traverse_obj( - window_stores, - ('v8', 'state', 'movie'), - ('v8', 'movie'), - expected_type=dict) - if not movie_store: - raise ExtractorError('Failed to extract live info') - - title = movie_store.get('title') - description = movie_store.get('introduction') - thumbnail = movie_store.get('thumbnailUrl') - - channel_user = movie_store.get('channel', {}).get('user') - uploader = try_get(channel_user, lambda x: x['name'], compat_str) - uploader_id = try_get(channel_user, lambda x: x['id'], compat_str) - - timestamp = traverse_obj(movie_store, ('startedAt', 'time'), expected_type=int) - - m3u8_playlists = movie_store.get('media') - formats = [] - for (name, m3u8_url) in m3u8_playlists.items(): - if not m3u8_url: - continue - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, ext='mp4', entry_protocol='m3u8', - m3u8_id='hls-%s' % name, live=True)) - - self._sort_formats(formats) + webpage = self._download_webpage(f'https://www.openrec.tv/live/{video_id}', video_id) - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'formats': formats, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'timestamp': timestamp, - 'is_live': True, - } + return self._extract_movie(webpage, video_id, 'live', True) -class OpenRecCaptureIE(InfoExtractor): +class OpenRecCaptureIE(OpenRecBaseIE): IE_NAME = 'openrec:capture' _VALID_URL = r'https?://(?:www\.)?openrec\.tv/capture/(?P<id>[^/]+)' _TESTS = [{ @@ -89,38 +105,49 @@ class 
OpenRecCaptureIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage('https://www.openrec.tv/capture/%s' % video_id, video_id) + webpage = self._download_webpage(f'https://www.openrec.tv/capture/{video_id}', video_id) - window_stores = self._parse_json( - self._search_regex(r'(?m)window\.pageStore\s*=\s*(\{.+?\});$', webpage, 'window.pageStore'), video_id) + window_stores = self._extract_pagestore(webpage, video_id) movie_store = window_stores.get('movie') capture_data = window_stores.get('capture') if not capture_data: raise ExtractorError('Cannot extract title') - title = capture_data.get('title') - thumbnail = capture_data.get('thumbnailUrl') - upload_date = unified_strdate(capture_data.get('createdAt')) - - channel_info = movie_store.get('channel') or {} - uploader = channel_info.get('name') - uploader_id = channel_info.get('id') - m3u8_url = capture_data.get('source') - if not m3u8_url: - raise ExtractorError('Cannot extract m3u8 url') formats = self._extract_m3u8_formats( - m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id='hls') - + capture_data.get('source'), video_id, ext='mp4') self._sort_formats(formats) return { 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, + 'title': capture_data.get('title'), + 'thumbnail': capture_data.get('thumbnailUrl'), 'formats': formats, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'upload_date': upload_date, + 'timestamp': unified_timestamp(traverse_obj(movie_store, 'createdAt', expected_type=compat_str)), + 'uploader': traverse_obj(movie_store, ('channel', 'name'), expected_type=compat_str), + 'uploader_id': traverse_obj(movie_store, ('channel', 'id'), expected_type=compat_str), + 'upload_date': unified_strdate(capture_data.get('createdAt')), } + + +class OpenRecMovieIE(OpenRecBaseIE): + IE_NAME = 'openrec:movie' + _VALID_URL = r'https?://(?:www\.)?openrec\.tv/movie/(?P<id>[^/]+)' + _TESTS = [{ + 'url': 'https://www.openrec.tv/movie/nqz5xl5km8v', + 'info_dict': { + 'id': 'nqz5xl5km8v', + 'title': '限定コミュニティ(Discord)参加方法ご説明動画', + 'description': 'md5:ebd563e5f5b060cda2f02bf26b14d87f', + 'thumbnail': r're:https://.+', + 'uploader': 'タイキとカズヒロ', + 'uploader_id': 'taiki_to_kazuhiro', + 'timestamp': 1638856800, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(f'https://www.openrec.tv/movie/{video_id}', video_id) + + return self._extract_movie(webpage, video_id, 'movie', False) diff --git a/hypervideo_dl/extractor/orf.py b/hypervideo_dl/extractor/orf.py index 428ec97..0628977 100644 --- a/hypervideo_dl/extractor/orf.py +++ b/hypervideo_dl/extractor/orf.py @@ -1,22 +1,26 @@ # coding: utf-8 from __future__ import unicode_literals +import functools import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( clean_html, determine_ext, float_or_none, HEADRequest, + InAdvancePagedList, int_or_none, + join_nonempty, orderedSet, remove_end, + smuggle_url, str_or_none, strip_jsonp, unescapeHTML, unified_strdate, + unsmuggle_url, url_or_none, ) @@ -24,9 +28,40 @@ from ..utils import ( class ORFTVthekIE(InfoExtractor): IE_NAME = 'orf:tvthek' IE_DESC = 'ORF TVthek' - _VALID_URL = r'https?://tvthek\.orf\.at/(?:[^/]+/)+(?P<id>\d+)' + _VALID_URL = r'(?P<url>https?://tvthek\.orf\.at/(?:(?:[^/]+/){2}){1,2}(?P<id>\d+))(/[^/]+/(?P<vid>\d+))?(?:$|[?#])' _TESTS = [{ + 'url': 'https://tvthek.orf.at/profile/ZIB-2/1211/ZIB-2/14121079', + 'info_dict': { + 'id': '14121079', + }, + 
'playlist_count': 11, + 'params': {'noplaylist': True} + }, { + 'url': 'https://tvthek.orf.at/profile/ZIB-2/1211/ZIB-2/14121079/Umfrage-Welches-Tier-ist-Sebastian-Kurz/15083150', + 'info_dict': { + 'id': '14121079', + }, + 'playlist_count': 1, + 'params': {'playlist_items': '5'} + }, { + 'url': 'https://tvthek.orf.at/profile/ZIB-2/1211/ZIB-2/14121079/Umfrage-Welches-Tier-ist-Sebastian-Kurz/15083150', + 'info_dict': { + 'id': '14121079', + 'playlist_count': 1 + }, + 'playlist': [{ + 'info_dict': { + 'id': '15083150', + 'ext': 'mp4', + 'description': 'md5:7be1c485425f5f255a5e4e4815e77d04', + 'thumbnail': 'https://api-tvthek.orf.at/uploads/media/segments/0130/59/824271ea35cd8931a0fb08ab316a5b0a1562342c.jpeg', + 'title': 'Umfrage: Welches Tier ist Sebastian Kurz?', + } + }], + 'playlist_count': 1, + 'params': {'noplaylist': True, 'skip_download': 'm3u8'} + }, { 'url': 'http://tvthek.orf.at/program/Aufgetischt/2745173/Aufgetischt-Mit-der-Steirischen-Tafelrunde/8891389', 'playlist': [{ 'md5': '2942210346ed779588f428a92db88712', @@ -61,8 +96,90 @@ class ORFTVthekIE(InfoExtractor): 'only_matching': True, }] + def _pagefunc(self, url, data_jsb, n, *, image=None): + sd = data_jsb[n] + video_id, title = str(sd['id']), sd['title'] + formats = [] + for fd in sd['sources']: + src = url_or_none(fd.get('src')) + if not src: + continue + format_id = join_nonempty('delivery', 'quality', 'quality_string', from_dict=fd) + ext = determine_ext(src) + if ext == 'm3u8': + m3u8_formats = self._extract_m3u8_formats( + src, video_id, 'mp4', m3u8_id=format_id, fatal=False, note=f'Downloading {format_id} m3u8 manifest') + if any('/geoprotection' in f['url'] for f in m3u8_formats): + self.raise_geo_restricted() + formats.extend(m3u8_formats) + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + src, video_id, f4m_id=format_id, fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + src, video_id, mpd_id=format_id, fatal=False, note=f'Downloading {format_id} mpd manifest')) + else: + formats.append({ + 'format_id': format_id, + 'url': src, + 'protocol': fd.get('protocol'), + }) + + # Check for geoblocking. + # There is a property is_geoprotection, but that's always false + geo_str = sd.get('geoprotection_string') + http_url = next( + (f['url'] for f in formats if re.match(r'^https?://.*\.mp4$', f['url'])), + None) if geo_str else None + if http_url: + self._request_webpage( + HEADRequest(http_url), video_id, fatal=False, note='Testing for geoblocking', + errnote=f'This video seems to be blocked outside of {geo_str}. 
You may want to try the streaming-* formats') + + self._sort_formats(formats) + + subtitles = {} + for sub in sd.get('subtitles', []): + sub_src = sub.get('src') + if not sub_src: + continue + subtitles.setdefault(sub.get('lang', 'de-AT'), []).append({ + 'url': sub_src, + }) + + upload_date = unified_strdate(sd.get('created_date')) + + thumbnails = [] + preview = sd.get('preview_image_url') + if preview: + thumbnails.append({ + 'id': 'preview', + 'url': preview, + 'preference': 0, + }) + image = sd.get('image_full_url') or image + if image: + thumbnails.append({ + 'id': 'full', + 'url': image, + 'preference': 1, + }) + + yield { + 'id': video_id, + 'title': title, + 'webpage_url': smuggle_url(f'{url}/part/{video_id}', {'force_noplaylist': True}), + 'formats': formats, + 'subtitles': subtitles, + 'description': sd.get('description'), + 'duration': int_or_none(sd.get('duration_in_seconds')), + 'upload_date': upload_date, + 'thumbnails': thumbnails, + } + def _real_extract(self, url): - playlist_id = self._match_id(url) + url, smuggled_data = unsmuggle_url(url) + playlist_id, video_id, base_url = self._match_valid_url(url).group('id', 'vid', 'url') webpage = self._download_webpage(url, playlist_id) data_jsb = self._parse_json( @@ -71,112 +188,16 @@ class ORFTVthekIE(InfoExtractor): webpage, 'playlist', group='json'), playlist_id, transform_source=unescapeHTML)['playlist']['videos'] - entries = [] - for sd in data_jsb: - video_id, title = sd.get('id'), sd.get('title') - if not video_id or not title: - continue - video_id = compat_str(video_id) - formats = [] - for fd in sd['sources']: - src = url_or_none(fd.get('src')) - if not src: - continue - format_id_list = [] - for key in ('delivery', 'quality', 'quality_string'): - value = fd.get(key) - if value: - format_id_list.append(value) - format_id = '-'.join(format_id_list) - ext = determine_ext(src) - if ext == 'm3u8': - m3u8_formats = self._extract_m3u8_formats( - src, video_id, 'mp4', m3u8_id=format_id, fatal=False) - if any('/geoprotection' in f['url'] for f in m3u8_formats): - self.raise_geo_restricted() - formats.extend(m3u8_formats) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - src, video_id, f4m_id=format_id, fatal=False)) - elif ext == 'mpd': - formats.extend(self._extract_mpd_formats( - src, video_id, mpd_id=format_id, fatal=False)) - else: - formats.append({ - 'format_id': format_id, - 'url': src, - 'protocol': fd.get('protocol'), - }) + if not self._yes_playlist(playlist_id, video_id, smuggled_data): + data_jsb = [sd for sd in data_jsb if str(sd.get('id')) == video_id] - # Check for geoblocking. - # There is a property is_geoprotection, but that's always false - geo_str = sd.get('geoprotection_string') - if geo_str: - try: - http_url = next( - f['url'] - for f in formats - if re.match(r'^https?://.*\.mp4$', f['url'])) - except StopIteration: - pass - else: - req = HEADRequest(http_url) - self._request_webpage( - req, video_id, - note='Testing for geoblocking', - errnote=(( - 'This video seems to be blocked outside of %s. 
' - 'You may want to try the streaming-* formats.') - % geo_str), - fatal=False) - - self._check_formats(formats, video_id) - self._sort_formats(formats) - - subtitles = {} - for sub in sd.get('subtitles', []): - sub_src = sub.get('src') - if not sub_src: - continue - subtitles.setdefault(sub.get('lang', 'de-AT'), []).append({ - 'url': sub_src, - }) - - upload_date = unified_strdate(sd.get('created_date')) - - thumbnails = [] - preview = sd.get('preview_image_url') - if preview: - thumbnails.append({ - 'id': 'preview', - 'url': preview, - 'preference': 0, - }) - image = sd.get('image_full_url') - if not image and len(data_jsb) == 1: - image = self._og_search_thumbnail(webpage) - if image: - thumbnails.append({ - 'id': 'full', - 'url': image, - 'preference': 1, - }) - - entries.append({ - '_type': 'video', - 'id': video_id, - 'title': title, - 'formats': formats, - 'subtitles': subtitles, - 'description': sd.get('description'), - 'duration': int_or_none(sd.get('duration_in_seconds')), - 'upload_date': upload_date, - 'thumbnails': thumbnails, - }) + playlist_count = len(data_jsb) + image = self._og_search_thumbnail(webpage) if playlist_count == 1 else None + page_func = functools.partial(self._pagefunc, base_url, data_jsb, image=image) return { '_type': 'playlist', - 'entries': entries, + 'entries': InAdvancePagedList(page_func, playlist_count, 1), 'id': playlist_id, } diff --git a/hypervideo_dl/extractor/packtpub.py b/hypervideo_dl/extractor/packtpub.py index c06fca7..62c52cd 100644 --- a/hypervideo_dl/extractor/packtpub.py +++ b/hypervideo_dl/extractor/packtpub.py @@ -47,10 +47,7 @@ class PacktPubIE(PacktPubBaseIE): _NETRC_MACHINE = 'packtpub' _TOKEN = None - def _real_initialize(self): - username, password = self._get_login_info() - if username is None: - return + def _perform_login(self, username, password): try: self._TOKEN = self._download_json( 'https://services.packtpub.com/auth-v1/users/tokens', None, diff --git a/hypervideo_dl/extractor/panopto.py b/hypervideo_dl/extractor/panopto.py new file mode 100644 index 0000000..3388f7f --- /dev/null +++ b/hypervideo_dl/extractor/panopto.py @@ -0,0 +1,607 @@ +import re +import calendar +import json +import functools +from datetime import datetime +from random import random + +from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse_urlparse, + compat_urlparse +) + +from ..utils import ( + bug_reports_message, + ExtractorError, + get_first, + int_or_none, + OnDemandPagedList, + parse_qs, + srt_subtitles_timecode, + traverse_obj, +) + + +class PanoptoBaseIE(InfoExtractor): + BASE_URL_RE = r'(?P<base_url>https?://[\w.-]+\.panopto.(?:com|eu)/Panopto)' + + # see panopto core.js + _SUB_LANG_MAPPING = { + 0: 'en-US', + 1: 'en-GB', + 2: 'es-MX', + 3: 'es-ES', + 4: 'de-DE', + 5: 'fr-FR', + 6: 'nl-NL', + 7: 'th-TH', + 8: 'zh-CN', + 9: 'zh-TW', + 10: 'ko-KR', + 11: 'ja-JP', + 12: 'ru-RU', + 13: 'pt-PT', + 14: 'pl-PL', + 15: 'en-AU', + 16: 'da-DK', + 17: 'fi-FI', + 18: 'hu-HU', + 19: 'nb-NO', + 20: 'sv-SE', + 21: 'it-IT' + } + + def _call_api(self, base_url, path, video_id, data=None, fatal=True, **kwargs): + response = self._download_json( + base_url + path, video_id, data=json.dumps(data).encode('utf8') if data else None, + fatal=fatal, headers={'accept': 'application/json', 'content-type': 'application/json'}, **kwargs) + if not response: + return + error_code = traverse_obj(response, 'ErrorCode') + if error_code == 2: + self.raise_login_required(method='cookies') + elif error_code is not None: + msg = f'Panopto said: 
{response.get("ErrorMessage")}' + if fatal: + raise ExtractorError(msg, video_id=video_id, expected=True) + else: + self.report_warning(msg, video_id=video_id) + return response + + @staticmethod + def _parse_fragment(url): + return {k: json.loads(v[0]) for k, v in compat_urlparse.parse_qs(compat_urllib_parse_urlparse(url).fragment).items()} + + @staticmethod + def _extract_urls(webpage): + return [m.group('url') for m in re.finditer( + r'<iframe[^>]+src=["\'](?P<url>%s/Pages/(Viewer|Embed|Sessions/List)\.aspx[^"\']+)' % PanoptoIE.BASE_URL_RE, + webpage)] + + +class PanoptoIE(PanoptoBaseIE): + _VALID_URL = PanoptoBaseIE.BASE_URL_RE + r'/Pages/(Viewer|Embed)\.aspx.*(?:\?|&)id=(?P<id>[a-f0-9-]+)' + _TESTS = [ + { + 'url': 'https://demo.hosted.panopto.com/Panopto/Pages/Viewer.aspx?id=26b3ae9e-4a48-4dcc-96ba-0befba08a0fb', + 'info_dict': { + 'id': '26b3ae9e-4a48-4dcc-96ba-0befba08a0fb', + 'title': 'Panopto for Business - Use Cases', + 'timestamp': 1459184200, + 'thumbnail': r're:https://demo\.hosted\.panopto\.com/.+', + 'upload_date': '20160328', + 'ext': 'mp4', + 'cast': [], + 'chapters': [], + 'duration': 88.17099999999999, + 'average_rating': int, + 'uploader_id': '2db6b718-47a0-4b0b-9e17-ab0b00f42b1e', + 'channel_id': 'e4c6a2fc-1214-4ca0-8fb7-aef2e29ff63a', + 'channel': 'Showcase Videos' + }, + }, + { + 'url': 'https://demo.hosted.panopto.com/Panopto/Pages/Viewer.aspx?id=ed01b077-c9e5-4c7b-b8ff-15fa306d7a59', + 'info_dict': { + 'id': 'ed01b077-c9e5-4c7b-b8ff-15fa306d7a59', + 'title': 'Overcoming Top 4 Challenges of Enterprise Video', + 'uploader': 'Panopto Support', + 'timestamp': 1449409251, + 'thumbnail': r're:https://demo\.hosted\.panopto\.com/.+', + 'upload_date': '20151206', + 'ext': 'mp4', + 'chapters': 'count:12', + 'cast': ['Panopto Support'], + 'uploader_id': 'a96d1a31-b4de-489b-9eee-b4a5b414372c', + 'average_rating': int, + 'description': 'md5:4391837802b3fc856dadf630c4b375d1', + 'duration': 1088.2659999999998, + 'channel_id': '9f3c1921-43bb-4bda-8b3a-b8d2f05a8546', + 'channel': 'Webcasts', + }, + }, + { + # Extra params in URL + 'url': 'https://howtovideos.hosted.panopto.com/Panopto/Pages/Viewer.aspx?randomparam=thisisnotreal&id=5fa74e93-3d87-4694-b60e-aaa4012214ed&advance=true', + 'info_dict': { + 'id': '5fa74e93-3d87-4694-b60e-aaa4012214ed', + 'ext': 'mp4', + 'duration': 129.513, + 'cast': ['Kathryn Kelly'], + 'uploader_id': '316a0a58-7fa2-4cd9-be1c-64270d284a56', + 'timestamp': 1569845768, + 'tags': ['Viewer', 'Enterprise'], + 'chapters': [], + 'upload_date': '20190930', + 'thumbnail': r're:https://howtovideos\.hosted\.panopto\.com/.+', + 'description': 'md5:2d844aaa1b1a14ad0e2601a0993b431f', + 'title': 'Getting Started: View a Video', + 'average_rating': int, + 'uploader': 'Kathryn Kelly', + 'channel_id': 'fb93bc3c-6750-4b80-a05b-a921013735d3', + 'channel': 'Getting Started', + } + }, + { + # Does not allow normal Viewer.aspx. AUDIO livestream has no url, so should be skipped and only give one stream. 
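+ # (a stream with no StreamUrl or StreamHttpUrl produces no formats in _extract_streams_formats_and_subtitles, so the audio-only entry is dropped)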
+ 'url': 'https://unisa.au.panopto.com/Panopto/Pages/Embed.aspx?id=9d9a0fa3-e99a-4ebd-a281-aac2017f4da4', + 'info_dict': { + 'id': '9d9a0fa3-e99a-4ebd-a281-aac2017f4da4', + 'ext': 'mp4', + 'cast': ['LTS CLI Script'], + 'chapters': [], + 'duration': 2178.45, + 'description': 'md5:ee5cf653919f55b72bce2dbcf829c9fa', + 'channel_id': 'b23e673f-c287-4cb1-8344-aae9005a69f8', + 'average_rating': int, + 'uploader_id': '38377323-6a23-41e2-9ff6-a8e8004bf6f7', + 'uploader': 'LTS CLI Script', + 'timestamp': 1572458134, + 'title': 'WW2 Vets Interview 3 Ronald Stanley George', + 'thumbnail': r're:https://unisa\.au\.panopto\.com/.+', + 'channel': 'World War II Veteran Interviews', + 'upload_date': '20191030', + }, + }, + { + # Slides/storyboard + 'url': 'https://demo.hosted.panopto.com/Panopto/Pages/Viewer.aspx?id=a7f12f1d-3872-4310-84b0-f8d8ab15326b', + 'info_dict': { + 'id': 'a7f12f1d-3872-4310-84b0-f8d8ab15326b', + 'ext': 'mhtml', + 'timestamp': 1448798857, + 'duration': 4712.681, + 'title': 'Cache Memory - CompSci 15-213, Lecture 12', + 'channel_id': 'e4c6a2fc-1214-4ca0-8fb7-aef2e29ff63a', + 'uploader_id': 'a96d1a31-b4de-489b-9eee-b4a5b414372c', + 'upload_date': '20151129', + 'average_rating': 0, + 'uploader': 'Panopto Support', + 'channel': 'Showcase Videos', + 'description': 'md5:55e51d54233ddb0e6c2ed388ca73822c', + 'cast': ['ISR Videographer', 'Panopto Support'], + 'chapters': 'count:28', + 'thumbnail': r're:https://demo\.hosted\.panopto\.com/.+', + }, + 'params': {'format': 'mhtml', 'skip_download': True} + }, + { + 'url': 'https://na-training-1.hosted.panopto.com/Panopto/Pages/Viewer.aspx?id=8285224a-9a2b-4957-84f2-acb0000c4ea9', + 'info_dict': { + 'id': '8285224a-9a2b-4957-84f2-acb0000c4ea9', + 'ext': 'mp4', + 'chapters': [], + 'title': 'Company Policy', + 'average_rating': 0, + 'timestamp': 1615058901, + 'channel': 'Human Resources', + 'tags': ['HumanResources'], + 'duration': 1604.243, + 'thumbnail': r're:https://na-training-1\.hosted\.panopto\.com/.+', + 'uploader_id': '8e8ba0a3-424f-40df-a4f1-ab3a01375103', + 'uploader': 'Cait M.', + 'upload_date': '20210306', + 'cast': ['Cait M.'], + 'subtitles': {'en-US': [{'ext': 'srt', 'data': 'md5:a3f4d25963fdeace838f327097c13265'}], + 'es-ES': [{'ext': 'srt', 'data': 'md5:57e9dad365fd0fbaf0468eac4949f189'}]}, + }, + 'params': {'writesubtitles': True, 'skip_download': True} + }, { + # On Panopto there are two subs: "Default" and en-US. en-US is blank and should be skipped. 
+ 'url': 'https://na-training-1.hosted.panopto.com/Panopto/Pages/Viewer.aspx?id=940cbd41-f616-4a45-b13e-aaf1000c915b', + 'info_dict': { + 'id': '940cbd41-f616-4a45-b13e-aaf1000c915b', + 'ext': 'mp4', + 'subtitles': 'count:1', + 'title': 'HR Benefits Review Meeting*', + 'cast': ['Panopto Support'], + 'chapters': [], + 'timestamp': 1575024251, + 'thumbnail': r're:https://na-training-1\.hosted\.panopto\.com/.+', + 'channel': 'Zoom', + 'description': 'md5:04f90a9c2c68b7828144abfb170f0106', + 'uploader': 'Panopto Support', + 'average_rating': 0, + 'duration': 409.34499999999997, + 'uploader_id': 'b6ac04ad-38b8-4724-a004-a851004ea3df', + 'upload_date': '20191129', + + }, + 'params': {'writesubtitles': True, 'skip_download': True} + }, + { + 'url': 'https://ucc.cloud.panopto.eu/Panopto/Pages/Viewer.aspx?id=0e8484a4-4ceb-4d98-a63f-ac0200b455cb', + 'only_matching': True + }, + { + 'url': 'https://brown.hosted.panopto.com/Panopto/Pages/Embed.aspx?id=0b3ff73b-36a0-46c5-8455-aadf010a3638', + 'only_matching': True + }, + ] + + @classmethod + def suitable(cls, url): + return False if PanoptoPlaylistIE.suitable(url) else super().suitable(url) + + def _mark_watched(self, base_url, video_id, delivery_info): + duration = traverse_obj(delivery_info, ('Delivery', 'Duration'), expected_type=float) + invocation_id = delivery_info.get('InvocationId') + stream_id = traverse_obj(delivery_info, ('Delivery', 'Streams', ..., 'PublicID'), get_all=False, expected_type=str) + if invocation_id and stream_id and duration: + timestamp_str = f'/Date({calendar.timegm(datetime.utcnow().timetuple())}000)/' + data = { + 'streamRequests': [ + { + 'ClientTimeStamp': timestamp_str, + 'ID': 0, + 'InvocationID': invocation_id, + 'PlaybackSpeed': 1, + 'SecondsListened': duration - 1, + 'SecondsRejected': 0, + 'StartPosition': 0, + 'StartReason': 2, + 'StopReason': None, + 'StreamID': stream_id, + 'TimeStamp': timestamp_str, + 'UpdatesRejected': 0 + }, + ]} + + self._download_webpage( + base_url + '/Services/Analytics.svc/AddStreamRequests', video_id, + fatal=False, data=json.dumps(data).encode('utf8'), headers={'content-type': 'application/json'}, + note='Marking watched', errnote='Unable to mark watched') + + @staticmethod + def _extract_chapters(timestamps): + chapters = [] + for timestamp in timestamps or []: + caption = timestamp.get('Caption') + start, duration = int_or_none(timestamp.get('Time')), int_or_none(timestamp.get('Duration')) + if not caption or start is None or duration is None: + continue + chapters.append({ + 'start_time': start, + 'end_time': start + duration, + 'title': caption + }) + return chapters + + @staticmethod + def _extract_mhtml_formats(base_url, timestamps): + image_frags = {} + for timestamp in timestamps or []: + duration = timestamp.get('Duration') + obj_id, obj_sn = timestamp.get('ObjectIdentifier'), timestamp.get('ObjectSequenceNumber'), + if timestamp.get('EventTargetType') == 'PowerPoint' and obj_id is not None and obj_sn is not None: + image_frags.setdefault('slides', []).append({ + 'url': base_url + f'/Pages/Viewer/Image.aspx?id={obj_id}&number={obj_sn}', + 'duration': duration + }) + + obj_pid, session_id, abs_time = timestamp.get('ObjectPublicIdentifier'), timestamp.get('SessionID'), timestamp.get('AbsoluteTime') + if None not in (obj_pid, session_id, abs_time): + image_frags.setdefault('chapter', []).append({ + 'url': base_url + f'/Pages/Viewer/Thumb.aspx?eventTargetPID={obj_pid}&sessionPID={session_id}&number={obj_sn}&isPrimary=false&absoluteTime={abs_time}', + 'duration': duration, + }) + 
for name, fragments in image_frags.items(): + yield { + 'format_id': name, + 'ext': 'mhtml', + 'protocol': 'mhtml', + 'acodec': 'none', + 'vcodec': 'none', + 'url': 'about:invalid', + 'fragments': fragments + } + + @staticmethod + def _json2srt(data, delivery): + def _gen_lines(): + for i, line in enumerate(data): + start_time = line['Time'] + duration = line.get('Duration') + if duration: + end_time = start_time + duration + else: + end_time = traverse_obj(data, (i + 1, 'Time')) or delivery['Duration'] + yield f'{i + 1}\n{srt_subtitles_timecode(start_time)} --> {srt_subtitles_timecode(end_time)}\n{line["Caption"]}' + return '\n\n'.join(_gen_lines()) + + def _get_subtitles(self, base_url, video_id, delivery): + subtitles = {} + for lang in delivery.get('AvailableLanguages') or []: + response = self._call_api( + base_url, '/Pages/Viewer/DeliveryInfo.aspx', video_id, fatal=False, + note='Downloading captions JSON metadata', query={ + 'deliveryId': video_id, + 'getCaptions': True, + 'language': str(lang), + 'responseType': 'json' + } + ) + if not isinstance(response, list): + continue + subtitles.setdefault(self._SUB_LANG_MAPPING.get(lang) or 'default', []).append({ + 'ext': 'srt', + 'data': self._json2srt(response, delivery), + }) + return subtitles + + def _extract_streams_formats_and_subtitles(self, video_id, streams, **fmt_kwargs): + formats = [] + subtitles = {} + for stream in streams or []: + stream_formats = [] + http_stream_url = stream.get('StreamHttpUrl') + stream_url = stream.get('StreamUrl') + + if http_stream_url: + stream_formats.append({'url': http_stream_url}) + + if stream_url: + media_type = stream.get('ViewerMediaFileTypeName') + if media_type in ('hls', ): + m3u8_formats, stream_subtitles = self._extract_m3u8_formats_and_subtitles(stream_url, video_id) + stream_formats.extend(m3u8_formats) + subtitles = self._merge_subtitles(subtitles, stream_subtitles) + else: + stream_formats.append({ + 'url': stream_url + }) + for fmt in stream_formats: + fmt.update({ + 'format_note': stream.get('Tag'), + **fmt_kwargs + }) + formats.extend(stream_formats) + + return formats, subtitles + + def _real_extract(self, url): + base_url, video_id = self._match_valid_url(url).group('base_url', 'id') + delivery_info = self._call_api( + base_url, '/Pages/Viewer/DeliveryInfo.aspx', video_id, + query={ + 'deliveryId': video_id, + 'invocationId': '', + 'isLiveNotes': 'false', + 'refreshAuthCookie': 'true', + 'isActiveBroadcast': 'false', + 'isEditing': 'false', + 'isKollectiveAgentInstalled': 'false', + 'isEmbed': 'false', + 'responseType': 'json', + } + ) + + delivery = delivery_info['Delivery'] + session_start_time = int_or_none(delivery.get('SessionStartTime')) + timestamps = delivery.get('Timestamps') + + # Podcast stream is usually the combined streams. We will prefer that by default. 
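+ # The individual streams are extracted with preference=-10 below, so the combined podcast formats sort first.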
+ podcast_formats, podcast_subtitles = self._extract_streams_formats_and_subtitles( + video_id, delivery.get('PodcastStreams'), format_note='PODCAST') + + streams_formats, streams_subtitles = self._extract_streams_formats_and_subtitles( + video_id, delivery.get('Streams'), preference=-10) + + formats = podcast_formats + streams_formats + formats.extend(self._extract_mhtml_formats(base_url, timestamps)) + subtitles = self._merge_subtitles( + podcast_subtitles, streams_subtitles, self.extract_subtitles(base_url, video_id, delivery)) + + self._sort_formats(formats) + self.mark_watched(base_url, video_id, delivery_info) + + return { + 'id': video_id, + 'title': delivery.get('SessionName'), + 'cast': traverse_obj(delivery, ('Contributors', ..., 'DisplayName'), default=[], expected_type=lambda x: x or None), + 'timestamp': session_start_time - 11640000000 if session_start_time else None, + 'duration': delivery.get('Duration'), + 'thumbnail': base_url + f'/Services/FrameGrabber.svc/FrameRedirect?objectId={video_id}&mode=Delivery&random={random()}', + 'average_rating': delivery.get('AverageRating'), + 'chapters': self._extract_chapters(timestamps), + 'uploader': delivery.get('OwnerDisplayName') or None, + 'uploader_id': delivery.get('OwnerId'), + 'description': delivery.get('SessionAbstract'), + 'tags': traverse_obj(delivery, ('Tags', ..., 'Content')), + 'channel_id': delivery.get('SessionGroupPublicID'), + 'channel': traverse_obj(delivery, 'SessionGroupLongName', 'SessionGroupShortName', get_all=False), + 'formats': formats, + 'subtitles': subtitles + } + + +class PanoptoPlaylistIE(PanoptoBaseIE): + _VALID_URL = PanoptoBaseIE.BASE_URL_RE + r'/Pages/(Viewer|Embed)\.aspx.*(?:\?|&)pid=(?P<id>[a-f0-9-]+)' + _TESTS = [ + { + 'url': 'https://howtovideos.hosted.panopto.com/Panopto/Pages/Viewer.aspx?pid=f3b39fcf-882f-4849-93d6-a9f401236d36&id=5fa74e93-3d87-4694-b60e-aaa4012214ed&advance=true', + 'info_dict': { + 'title': 'Featured Video Tutorials', + 'id': 'f3b39fcf-882f-4849-93d6-a9f401236d36', + 'description': '', + }, + 'playlist_mincount': 36 + }, + { + 'url': 'https://utsa.hosted.panopto.com/Panopto/Pages/Viewer.aspx?pid=e2900555-3ad4-4bdb-854d-ad2401686190', + 'info_dict': { + 'title': 'Library Website Introduction Playlist', + 'id': 'e2900555-3ad4-4bdb-854d-ad2401686190', + 'description': 'md5:f958bca50a1cbda15fdc1e20d32b3ecb', + }, + 'playlist_mincount': 4 + }, + + ] + + def _entries(self, base_url, playlist_id, session_list_id): + session_list_info = self._call_api( + base_url, f'/Api/SessionLists/{session_list_id}?collections[0].maxCount=500&collections[0].name=items', playlist_id) + + items = session_list_info['Items'] + for item in items: + if item.get('TypeName') != 'Session': + self.report_warning('Got an item in the playlist that is not a Session' + bug_reports_message(), only_once=True) + continue + yield { + '_type': 'url', + 'id': item.get('Id'), + 'url': item.get('ViewerUri'), + 'title': item.get('Name'), + 'description': item.get('Description'), + 'duration': item.get('Duration'), + 'channel': traverse_obj(item, ('Parent', 'Name')), + 'channel_id': traverse_obj(item, ('Parent', 'Id')) + } + + def _real_extract(self, url): + base_url, playlist_id = self._match_valid_url(url).group('base_url', 'id') + + video_id = get_first(parse_qs(url), 'id') + if video_id: + if self.get_param('noplaylist'): + self.to_screen('Downloading just video %s because of --no-playlist' % video_id) + return self.url_result(base_url + f'/Pages/Viewer.aspx?id={video_id}', ie_key=PanoptoIE.ie_key(), 
video_id=video_id) + else: + self.to_screen(f'Downloading playlist {playlist_id}; add --no-playlist to just download video {video_id}') + + playlist_info = self._call_api(base_url, f'/Api/Playlists/{playlist_id}', playlist_id) + return self.playlist_result( + self._entries(base_url, playlist_id, playlist_info['SessionListId']), + playlist_id=playlist_id, playlist_title=playlist_info.get('Name'), + playlist_description=playlist_info.get('Description')) + + +class PanoptoListIE(PanoptoBaseIE): + _VALID_URL = PanoptoBaseIE.BASE_URL_RE + r'/Pages/Sessions/List\.aspx' + _PAGE_SIZE = 250 + _TESTS = [ + { + 'url': 'https://demo.hosted.panopto.com/Panopto/Pages/Sessions/List.aspx#folderID=%22e4c6a2fc-1214-4ca0-8fb7-aef2e29ff63a%22', + 'info_dict': { + 'id': 'e4c6a2fc-1214-4ca0-8fb7-aef2e29ff63a', + 'title': 'Showcase Videos' + }, + 'playlist_mincount': 140 + + }, + { + 'url': 'https://demo.hosted.panopto.com/Panopto/Pages/Sessions/List.aspx#view=2&maxResults=250', + 'info_dict': { + 'id': 'panopto_list', + 'title': 'panopto_list' + }, + 'playlist_mincount': 300 + }, + { + # Folder that contains 8 folders and a playlist + 'url': 'https://howtovideos.hosted.panopto.com/Panopto/Pages/Sessions/List.aspx?noredirect=true#folderID=%224b9de7ae-0080-4158-8496-a9ba01692c2e%22', + 'info_dict': { + 'id': '4b9de7ae-0080-4158-8496-a9ba01692c2e', + 'title': 'Video Tutorials' + }, + 'playlist_mincount': 9 + } + + ] + + def _fetch_page(self, base_url, query_params, display_id, page): + + params = { + 'sortColumn': 1, + 'getFolderData': True, + 'includePlaylists': True, + **query_params, + 'page': page, + 'maxResults': self._PAGE_SIZE, + } + + response = self._call_api( + base_url, '/Services/Data.svc/GetSessions', f'{display_id} page {page+1}', + data={'queryParameters': params}, fatal=False) + + for result in get_first(response, 'Results', default=[]): + # This could be a video, playlist (or maybe something else) + item_id = result.get('DeliveryID') + yield { + '_type': 'url', + 'id': item_id, + 'title': result.get('SessionName'), + 'url': traverse_obj(result, 'ViewerUrl', 'EmbedUrl', get_all=False) or (base_url + f'/Pages/Viewer.aspx?id={item_id}'), + 'duration': result.get('Duration'), + 'channel': result.get('FolderName'), + 'channel_id': result.get('FolderID'), + } + + for folder in get_first(response, 'Subfolders', default=[]): + folder_id = folder.get('ID') + yield self.url_result( + base_url + f'/Pages/Sessions/List.aspx#folderID="{folder_id}"', + ie_key=PanoptoListIE.ie_key(), video_id=folder_id, title=folder.get('Name')) + + def _extract_folder_metadata(self, base_url, folder_id): + response = self._call_api( + base_url, '/Services/Data.svc/GetFolderInfo', folder_id, + data={'folderID': folder_id}, fatal=False) + return { + 'title': get_first(response, 'Name', default=[]) + } + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + base_url = mobj.group('base_url') + + query_params = self._parse_fragment(url) + folder_id, display_id = query_params.get('folderID'), 'panopto_list' + + if query_params.get('isSubscriptionsPage'): + display_id = 'subscriptions' + if not query_params.get('subscribableTypes'): + query_params['subscribableTypes'] = [0, 1, 2] + elif query_params.get('isSharedWithMe'): + display_id = 'sharedwithme' + elif folder_id: + display_id = folder_id + + query = query_params.get('query') + if query: + display_id += f': query "{query}"' + + info = { + '_type': 'playlist', + 'id': display_id, + 'title': display_id, + } + if folder_id: + 
info.update(self._extract_folder_metadata(base_url, folder_id)) + + info['entries'] = OnDemandPagedList( + functools.partial(self._fetch_page, base_url, query_params, display_id), self._PAGE_SIZE) + + return info diff --git a/hypervideo_dl/extractor/paramountplus.py b/hypervideo_dl/extractor/paramountplus.py index 338b84d..94a9319 100644 --- a/hypervideo_dl/extractor/paramountplus.py +++ b/hypervideo_dl/extractor/paramountplus.py @@ -1,4 +1,5 @@ from __future__ import unicode_literals +import itertools from .common import InfoExtractor from .cbs import CBSBaseIE @@ -13,12 +14,12 @@ class ParamountPlusIE(CBSBaseIE): (?: paramountplus:| https?://(?:www\.)?(?: - paramountplus\.com/(?:shows/[^/]+/video|movies/[^/]+)/ + paramountplus\.com/(?:shows|movies)/(?:video|[^/]+/video|[^/]+)/ )(?P<id>[\w-]+))''' # All tests are blocked outside US _TESTS = [{ - 'url': 'https://www.paramountplus.com/shows/catdog/video/Oe44g5_NrlgiZE3aQVONleD6vXc8kP0k/catdog-climb-every-catdog-the-canine-mutiny/', + 'url': 'https://www.paramountplus.com/shows/video/Oe44g5_NrlgiZE3aQVONleD6vXc8kP0k/', 'info_dict': { 'id': 'Oe44g5_NrlgiZE3aQVONleD6vXc8kP0k', 'ext': 'mp4', @@ -33,7 +34,7 @@ class ParamountPlusIE(CBSBaseIE): 'skip_download': 'm3u8', }, }, { - 'url': 'https://www.paramountplus.com/shows/tooning-out-the-news/video/6hSWYWRrR9EUTz7IEe5fJKBhYvSUfexd/7-23-21-week-in-review-rep-jahana-hayes-howard-fineman-sen-michael-bennet-sheera-frenkel-cecilia-kang-/', + 'url': 'https://www.paramountplus.com/shows/video/6hSWYWRrR9EUTz7IEe5fJKBhYvSUfexd/', 'info_dict': { 'id': '6hSWYWRrR9EUTz7IEe5fJKBhYvSUfexd', 'ext': 'mp4', @@ -48,7 +49,7 @@ class ParamountPlusIE(CBSBaseIE): 'skip_download': 'm3u8', }, }, { - 'url': 'https://www.paramountplus.com/movies/daddys-home/vM2vm0kE6vsS2U41VhMRKTOVHyQAr6pC', + 'url': 'https://www.paramountplus.com/movies/video/vM2vm0kE6vsS2U41VhMRKTOVHyQAr6pC/', 'info_dict': { 'id': 'vM2vm0kE6vsS2U41VhMRKTOVHyQAr6pC', 'ext': 'mp4', @@ -60,11 +61,10 @@ class ParamountPlusIE(CBSBaseIE): }, 'params': { 'skip_download': 'm3u8', - 'format': 'bestvideo', }, 'expected_warnings': ['Ignoring subtitle tracks'], # TODO: Investigate this }, { - 'url': 'https://www.paramountplus.com/movies/sonic-the-hedgehog/5EKDXPOzdVf9voUqW6oRuocyAEeJGbEc', + 'url': 'https://www.paramountplus.com/movies/video/5EKDXPOzdVf9voUqW6oRuocyAEeJGbEc/', 'info_dict': { 'id': '5EKDXPOzdVf9voUqW6oRuocyAEeJGbEc', 'ext': 'mp4', @@ -76,14 +76,19 @@ class ParamountPlusIE(CBSBaseIE): }, 'params': { 'skip_download': 'm3u8', - 'format': 'bestvideo', }, 'expected_warnings': ['Ignoring subtitle tracks'], }, { - 'url': 'https://www.paramountplus.com/shows/all-rise/video/QmR1WhNkh1a_IrdHZrbcRklm176X_rVc/all-rise-space/', + 'url': 'https://www.paramountplus.com/shows/the-real-world/video/mOVeHeL9ub9yWdyzSZFYz8Uj4ZBkVzQg/the-real-world-reunion/', 'only_matching': True, }, { - 'url': 'https://www.paramountplus.com/movies/million-dollar-american-princesses-meghan-and-harry/C0LpgNwXYeB8txxycdWdR9TjxpJOsdCq', + 'url': 'https://www.paramountplus.com/shows/video/mOVeHeL9ub9yWdyzSZFYz8Uj4ZBkVzQg/', + 'only_matching': True, + }, { + 'url': 'https://www.paramountplus.com/movies/video/W0VyStQqUnqKzJkrpSAIARuCc9YuYGNy/', + 'only_matching': True, + }, { + 'url': 'https://www.paramountplus.com/movies/paw-patrol-the-movie/W0VyStQqUnqKzJkrpSAIARuCc9YuYGNy/', 'only_matching': True, }] @@ -130,11 +135,13 @@ class ParamountPlusSeriesIE(InfoExtractor): 'id': 'spongebob-squarepants', } }] - _API_URL = 
'https://www.paramountplus.com/shows/{}/xhr/episodes/page/0/size/100000/xs/0/season/0/' def _entries(self, show_name): - show_json = self._download_json(self._API_URL.format(show_name), video_id=show_name) - if show_json.get('success'): + for page in itertools.count(): + show_json = self._download_json( + f'https://www.paramountplus.com/shows/{show_name}/xhr/episodes/page/{page}/size/50/xs/0/season/0', show_name) + if not show_json.get('success'): + return for episode in show_json['result']['data']: yield self.url_result( 'https://www.paramountplus.com%s' % episode['url'], diff --git a/hypervideo_dl/extractor/parliamentliveuk.py b/hypervideo_dl/extractor/parliamentliveuk.py index 869ebd8..974d654 100644 --- a/hypervideo_dl/extractor/parliamentliveuk.py +++ b/hypervideo_dl/extractor/parliamentliveuk.py @@ -25,9 +25,6 @@ class ParliamentLiveUKIE(InfoExtractor): 'timestamp': 1395153872, 'upload_date': '20140318', }, - 'params': { - 'format': 'bestvideo', - }, }, { 'url': 'http://parliamentlive.tv/event/index/3f24936f-130f-40bf-9a5d-b3d6479da6a4', 'only_matching': True, diff --git a/hypervideo_dl/extractor/patreon.py b/hypervideo_dl/extractor/patreon.py index a189c02..963a0d6 100644 --- a/hypervideo_dl/extractor/patreon.py +++ b/hypervideo_dl/extractor/patreon.py @@ -88,11 +88,7 @@ class PatreonIE(InfoExtractor): # Currently Patreon exposes download URL via hidden CSS, so login is not # needed. Keeping this commented for when this inevitably changes. ''' - def _login(self): - username, password = self._get_login_info() - if username is None: - return - + def _perform_login(self, username, password): login_form = { 'redirectUrl': 'http://www.patreon.com/', 'email': username, @@ -108,8 +104,6 @@ class PatreonIE(InfoExtractor): if re.search(r'onLoginFailed', login_page): raise ExtractorError('Unable to login, incorrect username and/or password', expected=True) - def _real_initialize(self): - self._login() ''' def _real_extract(self, url): @@ -161,7 +155,7 @@ class PatreonIE(InfoExtractor): if try_get(attributes, lambda x: x['embed']['provider']) == 'Vimeo': embed_html = try_get(attributes, lambda x: x['embed']['html']) v_url = url_or_none(compat_urllib_parse_unquote( - self._search_regex(r'src=(https%3A%2F%2Fplayer\.vimeo\.com.+)%3F', embed_html, 'vimeo url', fatal=False))) + self._search_regex(r'(https(?:%3A%2F%2F|://)player\.vimeo\.com.+app_id(?:=|%3D)+\d+)', embed_html, 'vimeo url', fatal=False))) if v_url: info.update({ '_type': 'url_transparent', @@ -191,7 +185,7 @@ class PatreonIE(InfoExtractor): class PatreonUserIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?patreon\.com/(?P<id>[-_\w\d]+)/?(?:posts/?)?' 
+ _VALID_URL = r'https?://(?:www\.)?patreon\.com/(?!rss)(?P<id>[-\w]+)' _TESTS = [{ 'url': 'https://www.patreon.com/dissonancepod/', diff --git a/hypervideo_dl/extractor/pbs.py b/hypervideo_dl/extractor/pbs.py index 0eabf9b..e48a2b8 100644 --- a/hypervideo_dl/extractor/pbs.py +++ b/hypervideo_dl/extractor/pbs.py @@ -193,7 +193,7 @@ class PBSIE(InfoExtractor): # Article with embedded player (or direct video) (?:www\.)?pbs\.org/(?:[^/]+/){1,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) | # Player - (?:video|player)\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)/ + (?:video|player)\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+) ) ''' % '|'.join(list(zip(*_STATIONS))[0]) @@ -545,7 +545,7 @@ class PBSIE(InfoExtractor): for vid_id in video_id] return self.playlist_result(entries, display_id) - info = None + info = {} redirects = [] redirect_urls = set() @@ -660,6 +660,9 @@ class PBSIE(InfoExtractor): 'protocol': 'http', }) formats.append(f) + for f in formats: + if (f.get('format_note') or '').endswith(' AD'): # Audio description + f['language_preference'] = -10 self._sort_formats(formats) rating_str = info.get('rating') diff --git a/hypervideo_dl/extractor/peekvids.py b/hypervideo_dl/extractor/peekvids.py new file mode 100644 index 0000000..4bf6855 --- /dev/null +++ b/hypervideo_dl/extractor/peekvids.py @@ -0,0 +1,81 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class PeekVidsIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?://(?:www\.)?peekvids\.com/ + (?:(?:[^/?#]+/){2}|embed/?\?(?:[^#]*&)?v=) + (?P<id>[^/?&#]*) + ''' + _TESTS = [{ + 'url': 'https://peekvids.com/pc/dane-jones-cute-redhead-with-perfect-tits-with-mini-vamp/BSyLMbN0YCd', + 'md5': 'a00940646c428e232407e3e62f0e8ef5', + 'info_dict': { + 'id': 'BSyLMbN0YCd', + 'title': ' Dane Jones - Cute redhead with perfect tits with Mini Vamp, SEXYhub', + 'ext': 'mp4', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'Watch Dane Jones - Cute redhead with perfect tits with Mini Vamp (7 min), uploaded by SEXYhub.com', + 'timestamp': 1642579329, + 'upload_date': '20220119', + 'duration': 416, + 'view_count': int, + 'age_limit': 18, + }, + }] + _DOMAIN = 'www.peekvids.com' + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + short_video_id = self._html_search_regex(r'<video [^>]*data-id="(.+?)"', webpage, 'short video ID') + srcs = self._download_json( + f'https://{self._DOMAIN}/v-alt/{short_video_id}', video_id, + note='Downloading list of source files') + formats = [{ + 'url': url, + 'ext': 'mp4', + 'format_id': name[8:], + } for name, url in srcs.items() if len(name) > 8 and name.startswith('data-src')] + if not formats: + formats = [{'url': url} for url in srcs.values()] + self._sort_formats(formats) + + info = self._search_json_ld(webpage, video_id, expected_type='VideoObject') + info.update({ + 'id': video_id, + 'age_limit': 18, + 'formats': formats, + }) + return info + + +class PlayVidsIE(PeekVidsIE): + _VALID_URL = r'https?://(?:www\.)?playvids\.com/(?:embed/|[^/]{2}/)?(?P<id>[^/?#]*)' + _TESTS = [{ + 'url': 'https://www.playvids.com/U3pBrYhsjXM/pc/dane-jones-cute-redhead-with-perfect-tits-with-mini-vamp', + 'md5': 'cd7dfd8a2e815a45402369c76e3c1825', + 'info_dict': { + 'id': 'U3pBrYhsjXM', + 'title': ' Dane Jones - Cute redhead with perfect tits with Mini Vamp, SEXYhub', + 'ext': 'mp4', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'Watch Dane Jones - Cute redhead with 
perfect tits with Mini Vamp video in HD, uploaded by SEXYhub.com', + 'timestamp': 1640435839, + 'upload_date': '20211225', + 'duration': 416, + 'view_count': int, + 'age_limit': 18, + }, + }, { + 'url': 'https://www.playvids.com/es/U3pBrYhsjXM/pc/dane-jones-cute-redhead-with-perfect-tits-with-mini-vamp', + 'only_matching': True, + }, { + 'url': 'https://www.playvids.com/embed/U3pBrYhsjXM', + 'only_matching': True, + }] + _DOMAIN = 'www.playvids.com' diff --git a/hypervideo_dl/extractor/peertube.py b/hypervideo_dl/extractor/peertube.py index 1e22f24..9d6b821 100644 --- a/hypervideo_dl/extractor/peertube.py +++ b/hypervideo_dl/extractor/peertube.py @@ -7,6 +7,7 @@ import re from .common import InfoExtractor from ..compat import compat_str from ..utils import ( + format_field, int_or_none, parse_resolution, str_or_none, @@ -86,6 +87,7 @@ class PeerTubeIE(InfoExtractor): maindreieck-tv\.de| mani\.tube| manicphase\.me| + media\.fsfe\.org| media\.gzevd\.de| media\.inno3\.cricket| media\.kaitaia\.life| @@ -1386,8 +1388,7 @@ class PeerTubePlaylistIE(InfoExtractor): playlist_timestamp = unified_timestamp(info.get('createdAt')) channel = try_get(info, lambda x: x['ownerAccount']['name']) or info.get('displayName') channel_id = try_get(info, lambda x: x['ownerAccount']['id']) or info.get('id') - thumbnail = info.get('thumbnailPath') - thumbnail = f'https://{host}{thumbnail}' if thumbnail else None + thumbnail = format_field(info, 'thumbnailPath', f'https://{host}%s') entries = OnDemandPagedList(functools.partial( self.fetch_page, host, id, type), self._PAGE_SIZE) diff --git a/hypervideo_dl/extractor/peertv.py b/hypervideo_dl/extractor/peertv.py new file mode 100644 index 0000000..002d33a --- /dev/null +++ b/hypervideo_dl/extractor/peertv.py @@ -0,0 +1,57 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import js_to_json + + +class PeerTVIE(InfoExtractor): + IE_NAME = 'peer.tv' + _VALID_URL = r'https?://(?:www\.)?peer\.tv/(?:de|it|en)/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.peer.tv/de/841', + 'info_dict': { + 'id': '841', + 'ext': 'mp4', + 'title': 'Die Brunnenburg', + 'description': 'md5:4395f6142b090338340ab88a3aae24ed', + }, + }, { + 'url': 'https://www.peer.tv/it/404', + 'info_dict': { + 'id': '404', + 'ext': 'mp4', + 'title': 'Cascate di ghiaccio in Val Gardena', + 'description': 'md5:e8e5907f236171842674e8090e3577b8', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + video_key = self._html_search_regex(r'player\.peer\.tv/js/([a-zA-Z0-9]+)', webpage, 'video key') + + js = self._download_webpage(f'https://player.peer.tv/js/{video_key}/', video_id, + headers={'Referer': 'https://www.peer.tv/'}, note='Downloading session id') + + session_id = self._search_regex(r'["\']session_id["\']:\s*["\']([a-zA-Z0-9]+)["\']', js, 'session id') + + player_webpage = self._download_webpage( + f'https://player.peer.tv/jsc/{video_key}/{session_id}?jsr=aHR0cHM6Ly93d3cucGVlci50di9kZS84NDE=&cs=UTF-8&mq=2&ua=0&webm=p&mp4=p&hls=1', + video_id, note='Downloading player webpage') + + m3u8_url = self._search_regex(r'["\']playlist_url["\']:\s*(["\'][^"\']+["\'])', player_webpage, 'm3u8 url') + m3u8_url = self._parse_json(m3u8_url, video_id, transform_source=js_to_json) + + formats = self._extract_m3u8_formats(m3u8_url, video_id, m3u8_id='hls') + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': self._html_search_regex(r'<h1>(.+?)</h1>', webpage, 
'title').replace('\xa0', ' '), + 'formats': formats, + 'description': self._html_search_meta(('og:description', 'description'), webpage), + 'thumbnail': self._html_search_meta(('og:image', 'image'), webpage) + } diff --git a/hypervideo_dl/extractor/peloton.py b/hypervideo_dl/extractor/peloton.py index 287d341..7d83225 100644 --- a/hypervideo_dl/extractor/peloton.py +++ b/hypervideo_dl/extractor/peloton.py @@ -203,7 +203,6 @@ class PelotonLiveIE(InfoExtractor): 'chapters': 'count:3' }, 'params': { - 'format': 'bestvideo', 'skip_download': 'm3u8', }, '_skip': 'Account needed' diff --git a/hypervideo_dl/extractor/periscope.py b/hypervideo_dl/extractor/periscope.py index b93a02b..1a292b8 100644 --- a/hypervideo_dl/extractor/periscope.py +++ b/hypervideo_dl/extractor/periscope.py @@ -33,7 +33,7 @@ class PeriscopeBaseIE(InfoExtractor): return { 'id': broadcast.get('id') or video_id, - 'title': self._live_title(title) if is_live else title, + 'title': title, 'timestamp': parse_iso8601(broadcast.get('created_at')), 'uploader': uploader, 'uploader_id': broadcast.get('user_id') or broadcast.get('username'), diff --git a/hypervideo_dl/extractor/piapro.py b/hypervideo_dl/extractor/piapro.py new file mode 100644 index 0000000..c4eb491 --- /dev/null +++ b/hypervideo_dl/extractor/piapro.py @@ -0,0 +1,96 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + ExtractorError, + parse_duration, + parse_filesize, + str_to_int, + unified_timestamp, + urlencode_postdata, +) + + +class PiaproIE(InfoExtractor): + _NETRC_MACHINE = 'piapro' + _VALID_URL = r'https?://piapro\.jp/t/(?P<id>\w+)/?' + _TESTS = [{ + 'url': 'https://piapro.jp/t/NXYR', + 'md5': 'a9d52f27d13bafab7ee34116a7dcfa77', + 'info_dict': { + 'id': 'NXYR', + 'ext': 'mp3', + 'uploader': 'wowaka', + 'uploader_id': 'wowaka', + 'title': '裏表ラバーズ', + 'thumbnail': r're:^https?://.*\.jpg$', + } + }] + + _login_status = False + + def _perform_login(self, username, password): + login_ok = True + login_form_strs = { + '_username': username, + '_password': password, + '_remember_me': 'on', + 'login': 'ログイン' + } + self._request_webpage('https://piapro.jp/login/', None) + urlh = self._request_webpage( + 'https://piapro.jp/login/exe', None, + note='Logging in', errnote='Unable to log in', + data=urlencode_postdata(login_form_strs)) + if urlh is False: + login_ok = False + else: + parts = compat_urlparse.urlparse(urlh.geturl()) + if parts.path != '/': + login_ok = False + if not login_ok: + self.report_warning( + 'unable to log in: bad username or password') + self._login_status = login_ok + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + category_id = self._search_regex(r'categoryId=(.+)">', webpage, 'category ID') + if category_id not in ('1', '2', '21', '22', '23', '24', '25'): + raise ExtractorError('The URL does not contain audio.', expected=True) + + str_duration, str_filesize = self._search_regex( + r'サイズ:</span>(.+?)/\(([0-9,]+?[KMG]?B)\)', webpage, 'duration and size', + group=(1, 2), default=(None, None)) + str_viewcount = self._search_regex(r'閲覧数:</span>([0-9,]+)\s+', webpage, 'view count', fatal=False) + + uploader_id, uploader = self._search_regex( + r'<a\s+class="cd_user-name"\s+href="/(.*)">([^<]+)さん<', webpage, 'uploader', + group=(1, 2), default=(None, None)) + content_id = self._search_regex(r'contentId\:\'(.+)\'', webpage, 'content ID') + create_date =
self._search_regex(r'createDate\:\'(.+)\'', webpage, 'timestamp') + + player_webpage = self._download_webpage( + f'https://piapro.jp/html5_player_popup/?id={content_id}&cdate={create_date}', + video_id, note='Downloading player webpage') + + return { + 'id': video_id, + 'title': self._html_search_regex(r'<h1\s+class="cd_works-title">(.+?)</h1>', webpage, 'title', fatal=False), + 'description': self._html_search_regex(r'<p\s+class="cd_dtl_cap">(.+?)</p>\s*<div', webpage, 'description', fatal=False), + 'uploader': uploader, + 'uploader_id': uploader_id, + 'timestamp': unified_timestamp(create_date, False), + 'duration': parse_duration(str_duration), + 'view_count': str_to_int(str_viewcount), + 'thumbnail': self._html_search_meta('twitter:image', webpage), + + 'filesize_approx': parse_filesize(str_filesize.replace(',', '')), + 'url': self._search_regex(r'mp3:\s*\'(.*?)\'\}', player_webpage, 'url'), + 'ext': 'mp3', + 'vcodec': 'none', + } diff --git a/hypervideo_dl/extractor/picarto.py b/hypervideo_dl/extractor/picarto.py index e6c51e1..adf21fd 100644 --- a/hypervideo_dl/extractor/picarto.py +++ b/hypervideo_dl/extractor/picarto.py @@ -77,7 +77,7 @@ class PicartoIE(InfoExtractor): return { 'id': channel_id, - 'title': self._live_title(title.strip()), + 'title': title.strip(), 'is_live': True, 'channel': channel_id, 'channel_id': metadata.get('id'), @@ -111,7 +111,7 @@ class PicartoVodIE(InfoExtractor): vod_info = self._parse_json( self._search_regex( r'(?s)#vod-player["\']\s*,\s*(\{.+?\})\s*\)', webpage, - video_id), + 'vod player'), video_id, transform_source=js_to_json) formats = self._extract_m3u8_formats( diff --git a/hypervideo_dl/extractor/piksel.py b/hypervideo_dl/extractor/piksel.py index a362664..84c3de2 100644 --- a/hypervideo_dl/extractor/piksel.py +++ b/hypervideo_dl/extractor/piksel.py @@ -4,11 +4,11 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( dict_get, ExtractorError, int_or_none, + join_nonempty, parse_iso8601, try_get, unescapeHTML, @@ -116,12 +116,8 @@ class PikselIE(InfoExtractor): elif asset_type == 'audio': tbr = abr - format_id = ['http'] - if tbr: - format_id.append(compat_str(tbr)) - formats.append({ - 'format_id': '-'.join(format_id), + 'format_id': join_nonempty('http', tbr), 'url': unescapeHTML(http_url), 'vbr': vbr, 'abr': abr, @@ -167,7 +163,7 @@ class PikselIE(InfoExtractor): re.sub(r'/od/[^/]+/', '/od/http/', smil_url), video_id, transform_source=transform_source, fatal=False)) - self._sort_formats(formats) + self._sort_formats(formats, ('tbr', )) # Incomplete resolution information subtitles = {} for caption in video_data.get('captions', []): diff --git a/hypervideo_dl/extractor/pixivsketch.py b/hypervideo_dl/extractor/pixivsketch.py new file mode 100644 index 0000000..f0ad0b2 --- /dev/null +++ b/hypervideo_dl/extractor/pixivsketch.py @@ -0,0 +1,122 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + traverse_obj, + unified_timestamp, +) + + +class PixivSketchBaseIE(InfoExtractor): + def _call_api(self, video_id, path, referer, note='Downloading JSON metadata'): + response = self._download_json(f'https://sketch.pixiv.net/api/{path}', video_id, note=note, headers={ + 'Referer': referer, + 'X-Requested-With': referer, + }) + errors = traverse_obj(response, ('errors', ..., 'message')) + if errors: + raise ExtractorError(' '.join(f'{e}.' 
for e in errors)) + return response.get('data') or {} + + +class PixivSketchIE(PixivSketchBaseIE): + IE_NAME = 'pixiv:sketch' + _VALID_URL = r'https?://sketch\.pixiv\.net/@(?P<uploader_id>[a-zA-Z0-9_-]+)/lives/(?P<id>\d+)/?' + _TESTS = [{ + 'url': 'https://sketch.pixiv.net/@nuhutya/lives/3654620468641830507', + 'info_dict': { + 'id': '7370666691623196569', + 'title': 'まにあえクリスマス!', + 'uploader': 'ぬふちゃ', + 'uploader_id': 'nuhutya', + 'channel_id': '9844815', + 'age_limit': 0, + 'timestamp': 1640351536, + }, + 'skip': True, + }, { + # these two (age_limit > 0) require you to log in on the website, but it's actually not required for download + 'url': 'https://sketch.pixiv.net/@namahyou/lives/4393103321546851377', + 'info_dict': { + 'id': '4907995960957946943', + 'title': 'クリスマスなんて知らん🖕', + 'uploader': 'すゃもり', + 'uploader_id': 'suya2mori2', + 'channel_id': '31169300', + 'age_limit': 15, + 'timestamp': 1640347640, + }, + 'skip': True, + }, { + 'url': 'https://sketch.pixiv.net/@8aki/lives/3553803162487249670', + 'info_dict': { + 'id': '1593420639479156945', + 'title': 'おまけ本作業(リョナ有)', + 'uploader': 'おぶい / Obui', + 'uploader_id': 'oving', + 'channel_id': '17606', + 'age_limit': 18, + 'timestamp': 1640330263, + }, + 'skip': True, + }] + + def _real_extract(self, url): + video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id') + data = self._call_api(video_id, f'lives/{video_id}.json', url) + + if not traverse_obj(data, 'is_broadcasting'): + raise ExtractorError(f'This live is offline. Use https://sketch.pixiv.net/@{uploader_id} for ongoing live.', expected=True) + + m3u8_url = traverse_obj(data, ('owner', 'hls_movie', 'url')) + formats = self._extract_m3u8_formats( + m3u8_url, video_id, ext='mp4', + entry_protocol='m3u8_native', m3u8_id='hls') + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': data.get('name'), + 'formats': formats, + 'uploader': traverse_obj(data, ('user', 'name'), ('owner', 'user', 'name')), + 'uploader_id': traverse_obj(data, ('user', 'unique_name'), ('owner', 'user', 'unique_name')), + 'channel_id': str(traverse_obj(data, ('user', 'pixiv_user_id'), ('owner', 'user', 'pixiv_user_id'))), + 'age_limit': 18 if data.get('is_r18') else 15 if data.get('is_r15') else 0, + 'timestamp': unified_timestamp(data.get('created_at')), + 'is_live': True + } + + +class PixivSketchUserIE(PixivSketchBaseIE): + IE_NAME = 'pixiv:sketch:user' + _VALID_URL = r'https?://sketch\.pixiv\.net/@(?P<id>[a-zA-Z0-9_-]+)/?'
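+ # This pattern also matches live URLs; suitable() below defers those to PixivSketchIE.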
+ _TESTS = [{ + 'url': 'https://sketch.pixiv.net/@nuhutya', + 'only_matching': True, + }, { + 'url': 'https://sketch.pixiv.net/@namahyou', + 'only_matching': True, + }, { + 'url': 'https://sketch.pixiv.net/@8aki', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return super(PixivSketchUserIE, cls).suitable(url) and not PixivSketchIE.suitable(url) + + def _real_extract(self, url): + user_id = self._match_id(url) + data = self._call_api(user_id, f'lives/users/@{user_id}.json', url) + + if not traverse_obj(data, 'is_broadcasting'): + try: + self._call_api(user_id, 'users/current.json', url, 'Investigating reason for request failure') + except ExtractorError as ex: + if ex.cause and ex.cause.code == 401: + self.raise_login_required(f'Please log in, or use direct link like https://sketch.pixiv.net/@{user_id}/1234567890', method='cookies') + raise ExtractorError('This user is offline', expected=True) + + return self.url_result(f'https://sketch.pixiv.net/@{user_id}/lives/{data["id"]}') diff --git a/hypervideo_dl/extractor/pladform.py b/hypervideo_dl/extractor/pladform.py index dc20300..99ade85 100644 --- a/hypervideo_dl/extractor/pladform.py +++ b/hypervideo_dl/extractor/pladform.py @@ -28,6 +28,24 @@ class PladformIE(InfoExtractor): (?P<id>\d+) ''' _TESTS = [{ + 'url': 'http://out.pladform.ru/player?pl=18079&type=html5&videoid=100231282', + 'info_dict': { + 'id': '6216d548e755edae6e8280667d774791', + 'ext': 'mp4', + 'timestamp': 1406117012, + 'title': 'Гарик Мартиросян и Гарик Харламов - Кастинг на концерт ко Дню милиции', + 'age_limit': 0, + 'upload_date': '20140723', + 'thumbnail': str, + 'view_count': int, + 'description': str, + 'category': list, + 'uploader_id': '12082', + 'uploader': 'Comedy Club', + 'duration': 367, + }, + 'expected_warnings': ['HTTP Error 404: Not Found'] + }, { 'url': 'https://out.pladform.ru/player?pl=64471&videoid=3777899&vk_puid15=0&vk_puid34=0', 'md5': '53362fac3a27352da20fa2803cc5cd6f', 'info_dict': { @@ -63,13 +81,19 @@ class PladformIE(InfoExtractor): 'http://out.pladform.ru/getVideo', video_id, query={ 'pl': pl, 'videoid': video_id, - }) + }, fatal=False) def fail(text): raise ExtractorError( '%s returned error: %s' % (self.IE_NAME, text), expected=True) + if not video: + targetUrl = self._request_webpage(url, video_id, note='Resolving final URL').geturl() + if targetUrl == url: + raise ExtractorError('Can\'t parse page') + return self.url_result(targetUrl) + if video.tag == 'error': fail(video.text) diff --git a/hypervideo_dl/extractor/planetmarathi.py b/hypervideo_dl/extractor/planetmarathi.py new file mode 100644 index 0000000..07ac15b --- /dev/null +++ b/hypervideo_dl/extractor/planetmarathi.py @@ -0,0 +1,76 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + try_get, + unified_strdate, +) + + +class PlanetMarathiIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?planetmarathi\.com/titles/(?P<id>[^/#&?$]+)' + _TESTS = [{ + 'url': 'https://www.planetmarathi.com/titles/ek-unad-divas', + 'playlist_mincount': 2, + 'info_dict': { + 'id': 'ek-unad-divas', + }, + 'playlist': [{ + 'info_dict': { + 'id': 'ASSETS-MOVIE-ASSET-01_ek-unad-divas', + 'ext': 'mp4', + 'title': 'ek unad divas', + 'alt_title': 'चित्रपट', + 'description': 'md5:41c7ed6b041c2fea9820a3f3125bd881', + 'season_number': None, + 'episode_number': 1, + 'duration': 5539, + 'upload_date': '20210829', + }, + }] # Trailer skipped + }, { + 'url': 
'https://www.planetmarathi.com/titles/baap-beep-baap-season-1', + 'playlist_mincount': 10, + 'info_dict': { + 'id': 'baap-beep-baap-season-1', + }, + 'playlist': [{ + 'info_dict': { + 'id': 'ASSETS-CHARACTER-PROFILE-SEASON-01-ASSET-01_baap-beep-baap-season-1', + 'ext': 'mp4', + 'title': 'Manohar Kanhere', + 'alt_title': 'मनोहर कान्हेरे', + 'description': 'md5:285ed45d5c0ab5522cac9a043354ebc6', + 'season_number': 1, + 'episode_number': 1, + 'duration': 29, + 'upload_date': '20210829', + }, + }] # Trailers, Episodes, other Character profiles skipped + }] + + def _real_extract(self, url): + id = self._match_id(url) + entries = [] + json_data = self._download_json(f'https://www.planetmarathi.com/api/v1/titles/{id}/assets', id)['assets'] + for asset in json_data: + asset_title = asset['mediaAssetName']['en'] + if asset_title == 'Movie': + asset_title = id.replace('-', ' ') + asset_id = f'{asset["sk"]}_{id}'.replace('#', '-') + formats, subtitles = self._extract_m3u8_formats_and_subtitles(asset['mediaAssetURL'], asset_id) + self._sort_formats(formats) + entries.append({ + 'id': asset_id, + 'title': asset_title, + 'alt_title': try_get(asset, lambda x: x['mediaAssetName']['mr']), + 'description': try_get(asset, lambda x: x['mediaAssetDescription']['en']), + 'season_number': asset.get('mediaAssetSeason'), + 'episode_number': asset.get('mediaAssetIndexForAssetType'), + 'duration': asset.get('mediaAssetDurationInSeconds'), + 'upload_date': unified_strdate(asset.get('created')), + 'formats': formats, + 'subtitles': subtitles, + }) + return self.playlist_result(entries, playlist_id=id) diff --git a/hypervideo_dl/extractor/platzi.py b/hypervideo_dl/extractor/platzi.py index 23c8256..17f52e7 100644 --- a/hypervideo_dl/extractor/platzi.py +++ b/hypervideo_dl/extractor/platzi.py @@ -22,14 +22,7 @@ class PlatziBaseIE(InfoExtractor): _LOGIN_URL = 'https://platzi.com/login/' _NETRC_MACHINE = 'platzi' - def _real_initialize(self): - self._login() - - def _login(self): - username, password = self._get_login_info() - if username is None: - return - + def _perform_login(self, username, password): login_page = self._download_webpage( self._LOGIN_URL, None, 'Downloading login page') diff --git a/hypervideo_dl/extractor/playplustv.py b/hypervideo_dl/extractor/playplustv.py index fd72a37..cad2c3a 100644 --- a/hypervideo_dl/extractor/playplustv.py +++ b/hypervideo_dl/extractor/playplustv.py @@ -38,14 +38,10 @@ class PlayPlusTVIE(InfoExtractor): 'Authorization': 'Bearer ' + self._token, }, query=query) - def _real_initialize(self): - email, password = self._get_login_info() - if email is None: - self.raise_login_required() - + def _perform_login(self, username, password): req = PUTRequest( 'https://api.playplus.tv/api/web/login', json.dumps({ - 'email': email, + 'email': username, 'password': password, }).encode(), { 'Content-Type': 'application/json; charset=utf-8', @@ -61,6 +57,10 @@ class PlayPlusTVIE(InfoExtractor): self._profile = self._call_api('Profiles')['list'][0]['_id'] + def _real_initialize(self): + if not self._token: + self.raise_login_required(method='password') + def _real_extract(self, url): project_id, media_id = self._match_valid_url(url).groups() media = self._call_api( diff --git a/hypervideo_dl/extractor/playtvak.py b/hypervideo_dl/extractor/playtvak.py index 84e92dd..30c8a59 100644 --- a/hypervideo_dl/extractor/playtvak.py +++ b/hypervideo_dl/extractor/playtvak.py @@ -167,8 +167,6 @@ class PlaytvakIE(InfoExtractor): title = item['title'] is_live = item['type'] == 'stream' - if is_live: - title = 
self._live_title(title) description = self._og_search_description(webpage, default=None) or self._html_search_meta( 'description', webpage, 'description', default=None) timestamp = None diff --git a/hypervideo_dl/extractor/playvid.py b/hypervideo_dl/extractor/playvid.py index 4aef186..e1c406b 100644 --- a/hypervideo_dl/extractor/playvid.py +++ b/hypervideo_dl/extractor/playvid.py @@ -85,8 +85,7 @@ class PlayvidIE(InfoExtractor): # Extract title - should be in the flashvars; if not, look elsewhere if video_title is None: - video_title = self._html_search_regex( - r'<title>(.*?)</title', webpage, 'title') + video_title = self._html_extract_title(webpage) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/pluralsight.py b/hypervideo_dl/extractor/pluralsight.py index 801057e..2a5e0e4 100644 --- a/hypervideo_dl/extractor/pluralsight.py +++ b/hypervideo_dl/extractor/pluralsight.py @@ -162,14 +162,7 @@ query viewClip { } }''' - def _real_initialize(self): - self._login() - - def _login(self): - username, password = self._get_login_info() - if username is None: - return - + def _perform_login(self, username, password): login_page = self._download_webpage( self._LOGIN_URL, None, 'Downloading login page') diff --git a/hypervideo_dl/extractor/plutotv.py b/hypervideo_dl/extractor/plutotv.py index 0cf8246..26aff1a 100644 --- a/hypervideo_dl/extractor/plutotv.py +++ b/hypervideo_dl/extractor/plutotv.py @@ -20,11 +20,11 @@ from ..utils import ( class PlutoTVIE(InfoExtractor): _VALID_URL = r'''(?x) - https?://(?:www\.)?pluto\.tv(?:/en)?/on-demand + https?://(?:www\.)?pluto\.tv(?:/[^/]+)?/on-demand /(?P<video_type>movies|series) /(?P<series_or_movie_slug>[^/]+) (?: - /seasons?/(?P<season_no>\d+) + (?:/seasons?/(?P<season_no>\d+))? (?:/episode/(?P<episode_slug>[^/]+))? )? 
        /?(?:$|[#?])'''
 
@@ -84,6 +84,9 @@ class PlutoTVIE(InfoExtractor):
     }, {
         'url': 'https://pluto.tv/en/on-demand/series/manhunters-fugitive-task-force/seasons/1/episode/third-times-the-charm-1-1',
         'only_matching': True,
+    }, {
+        'url': 'https://pluto.tv/it/on-demand/series/csi-vegas/episode/legacy-2021-1-1',
+        'only_matching': True,
     }
 ]
diff --git a/hypervideo_dl/extractor/pokemon.py b/hypervideo_dl/extractor/pokemon.py
index 402b574..b411390 100644
--- a/hypervideo_dl/extractor/pokemon.py
+++ b/hypervideo_dl/extractor/pokemon.py
@@ -1,6 +1,7 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import re
 
 from .common import InfoExtractor
 from ..utils import (
@@ -138,3 +139,42 @@ class PokemonWatchIE(InfoExtractor):
             'episode': video_data.get('title'),
             'episode_number': int_or_none(video_data.get('episode')),
         })
+
+
+class PokemonSoundLibraryIE(InfoExtractor):
+    _VALID_URL = r'https?://soundlibrary\.pokemon\.co\.jp'
+
+    _TESTS = [{
+        'url': 'https://soundlibrary.pokemon.co.jp/',
+        'info_dict': {
+            'title': 'Pokémon Diamond and Pearl Sound Tracks',
+        },
+        'playlist_mincount': 149,
+    }]
+
+    def _real_extract(self, url):
+        musicbox_webpage = self._download_webpage(
+            'https://soundlibrary.pokemon.co.jp/musicbox', None,
+            'Downloading list of songs')
+        song_titles = [x.group(1) for x in re.finditer(r'<span>([^>]+?)</span><br/>をてもち曲に加えます。', musicbox_webpage)]
+        song_titles = song_titles[4::2]
+
+        # the songs don't have individual permalinks; instead we return all of them at once
+        song_entries = [{
+            'id': f'pokemon-soundlibrary-{song_id}',
+            'url': f'https://soundlibrary.pokemon.co.jp/api/assets/signing/sounds/wav/{song_id}.wav',
+            # note: the server always serves MP3 files, despite the .wav extension in the URL above
+            'ext': 'mp3',
+            'acodec': 'mp3',
+            'vcodec': 'none',
+            'title': song_title,
+            'track': song_title,
+            'artist': 'Nintendo / Creatures Inc. / GAME FREAK inc.',
+            'uploader': 'Pokémon',
+            'release_year': 2006,
+            'release_date': '20060928',
+            'track_number': song_id,
+            'album': 'Pokémon Diamond and Pearl',
+        } for song_id, song_title in enumerate(song_titles, 1)]
+
+        return self.playlist_result(song_entries, playlist_title='Pokémon Diamond and Pearl Sound Tracks')
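The PokerGo extractors below gate all API access behind a login: _perform_login exchanges HTTP Basic credentials for a bearer token once per run, and every later request reuses that token. A rough standalone sketch of the handshake, using the endpoint and field names from the diff (illustrative only, not library code):

import base64
import json
import urllib.request

def fetch_pokergo_token(property_id, username, password):
    # Basic credentials go in once; the response carries the reusable bearer token
    basic = base64.b64encode(f'{username}:{password}'.encode()).decode()
    req = urllib.request.Request(
        f'https://subscription.pokergo.com/properties/{property_id}/sign-in',
        data=b'',  # empty POST body, matching the extractor
        headers={'authorization': f'Basic {basic}'})
    with urllib.request.urlopen(req) as resp:
        return json.load(resp)['meta']['token']

# Later calls would then send: {'authorization': f'Bearer {token}'}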
diff --git a/hypervideo_dl/extractor/pokergo.py b/hypervideo_dl/extractor/pokergo.py
new file mode 100644
index 0000000..c9e2fed
--- /dev/null
+++ b/hypervideo_dl/extractor/pokergo.py
@@ -0,0 +1,109 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import base64
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    try_get,
+)
+
+
+class PokerGoBaseIE(InfoExtractor):
+    _NETRC_MACHINE = 'pokergo'
+    _AUTH_TOKEN = None
+    _PROPERTY_ID = '1dfb3940-7d53-4980-b0b0-f28b369a000d'
+
+    def _perform_login(self, username, password):
+        if self._AUTH_TOKEN:
+            return
+        self.report_login()
+        PokerGoBaseIE._AUTH_TOKEN = self._download_json(
+            f'https://subscription.pokergo.com/properties/{self._PROPERTY_ID}/sign-in', None,
+            headers={'authorization': f'Basic {base64.b64encode(f"{username}:{password}".encode()).decode()}'},
+            data=b'')['meta']['token']
+        if not self._AUTH_TOKEN:
+            raise ExtractorError('Unable to get Auth Token.', expected=True)
+
+    def _real_initialize(self):
+        if not self._AUTH_TOKEN:
+            self.raise_login_required(method='password')
+
+
+class PokerGoIE(PokerGoBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?pokergo\.com/videos/(?P<id>[^&$#/?]+)'
+
+    _TESTS = [{
+        'url': 'https://www.pokergo.com/videos/2a70ec4e-4a80-414b-97ec-725d9b72a7dc',
+        'info_dict': {
+            'id': 'aVLOxDzY',
+            'ext': 'mp4',
+            'title': 'Poker After Dark | Season 12 (2020) | Cry Me a River | Episode 2',
+            'description': 'md5:c7a8c29556cbfb6eb3c0d5d622251b71',
+            'thumbnail': 'https://cdn.jwplayer.com/v2/media/aVLOxDzY/poster.jpg?width=720',
+            'timestamp': 1608085715,
+            'duration': 2700.12,
+            'season_number': 12,
+            'episode_number': 2,
+            'series': 'poker after dark',
+            'upload_date': '20201216',
+            'season': 'Season 12',
+            'episode': 'Episode 2',
+            'display_id': '2a70ec4e-4a80-414b-97ec-725d9b72a7dc',
+        },
+        'params': {'skip_download': True}
+    }]
+
+    def _real_extract(self, url):
+        id = self._match_id(url)
+        data_json = self._download_json(f'https://api.pokergo.com/v2/properties/{self._PROPERTY_ID}/videos/{id}', id,
+                                        headers={'authorization': f'Bearer {self._AUTH_TOKEN}'})['data']
+        v_id = data_json['source']
+
+        thumbnails = [{
+            'url': image['url'],
+            'id': image.get('label'),
+            'width': image.get('width'),
+            'height': image.get('height')
+        } for image in data_json.get('images') or [] if image.get('url')]
+        series_json = next((dct for dct in data_json.get('show_tags') or [] if dct.get('video_id') == id), None) or {}
+
+        return {
+            '_type': 'url_transparent',
+            'display_id': id,
+            'title': data_json.get('title'),
+            'description': data_json.get('description'),
+            'duration': data_json.get('duration'),
+            'thumbnails': thumbnails,
+            'season_number': series_json.get('season'),
+            'episode_number': series_json.get('episode_number'),
+            'series': try_get(series_json, lambda x: x['tag']['name']),
+            'url': f'https://cdn.jwplayer.com/v2/media/{v_id}'
+        }
+
+
+class PokerGoCollectionIE(PokerGoBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?pokergo\.com/collections/(?P<id>[^&$#/?]+)'
+
+    _TESTS = [{
+        'url': 'https://www.pokergo.com/collections/19ffe481-5dae-481a-8869-75cc0e3c4700',
+        'playlist_mincount': 13,
+        'info_dict': {
+            'id': '19ffe481-5dae-481a-8869-75cc0e3c4700',
+        },
+    }]
+
+    def 
_entries(self, id): + data_json = self._download_json(f'https://api.pokergo.com/v2/properties/{self._PROPERTY_ID}/collections/{id}?include=entities', + id, headers={'authorization': f'Bearer {self._AUTH_TOKEN}'})['data'] + for video in data_json.get('collection_video') or []: + video_id = video.get('id') + if video_id: + yield self.url_result( + f'https://www.pokergo.com/videos/{video_id}', + ie=PokerGoIE.ie_key(), video_id=video_id) + + def _real_extract(self, url): + id = self._match_id(url) + return self.playlist_result(self._entries(id), playlist_id=id) diff --git a/hypervideo_dl/extractor/polsatgo.py b/hypervideo_dl/extractor/polsatgo.py new file mode 100644 index 0000000..1e3f46c --- /dev/null +++ b/hypervideo_dl/extractor/polsatgo.py @@ -0,0 +1,90 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from uuid import uuid4 +import json + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + try_get, + url_or_none, + ExtractorError, +) + + +class PolsatGoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?polsat(?:box)?go\.pl/.+/(?P<id>[0-9a-fA-F]+)(?:[/#?]|$)' + _TESTS = [{ + 'url': 'https://polsatgo.pl/wideo/seriale/swiat-wedlug-kiepskich/5024045/sezon-1/5028300/swiat-wedlug-kiepskich-odcinek-88/4121', + 'info_dict': { + 'id': '4121', + 'ext': 'mp4', + 'title': 'Świat według Kiepskich - Odcinek 88', + 'age_limit': 12, + }, + }] + + def _extract_formats(self, sources, video_id): + for source in sources or []: + if not source.get('id'): + continue + url = url_or_none(self._call_api( + 'drm', video_id, 'getPseudoLicense', + {'mediaId': video_id, 'sourceId': source['id']}).get('url')) + if not url: + continue + yield { + 'url': url, + 'height': int_or_none(try_get(source, lambda x: x['quality'][:-1])) + } + + def _real_extract(self, url): + video_id = self._match_id(url) + media = self._call_api('navigation', video_id, 'prePlayData', {'mediaId': video_id})['mediaItem'] + + formats = list(self._extract_formats( + try_get(media, lambda x: x['playback']['mediaSources']), video_id)) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': media['displayInfo']['title'], + 'formats': formats, + 'age_limit': int_or_none(media['displayInfo']['ageGroup']) + } + + def _call_api(self, endpoint, media_id, method, params): + rand_uuid = str(uuid4()) + res = self._download_json( + f'https://b2c-mobile.redefine.pl/rpc/{endpoint}/', media_id, + note=f'Downloading {method} JSON metadata', + data=json.dumps({ + 'method': method, + 'id': '2137', + 'jsonrpc': '2.0', + 'params': { + **params, + 'userAgentData': { + 'deviceType': 'mobile', + 'application': 'native', + 'os': 'android', + 'build': 10003, + 'widevine': False, + 'portal': 'pg', + 'player': 'cpplayer', + }, + 'deviceId': { + 'type': 'other', + 'value': rand_uuid, + }, + 'clientId': rand_uuid, + 'cpid': 1, + }, + }).encode('utf-8'), + headers={'Content-type': 'application/json'}) + if not res.get('result'): + if res['error']['code'] == 13404: + raise ExtractorError('This video is either unavailable in your region or is DRM protected', expected=True) + raise ExtractorError(f'Solorz said: {res["error"]["message"]} - {res["error"]["data"]["userMessage"]}') + return res['result'] diff --git a/hypervideo_dl/extractor/polskieradio.py b/hypervideo_dl/extractor/polskieradio.py index 53fe034..b2b3eb2 100644 --- a/hypervideo_dl/extractor/polskieradio.py +++ b/hypervideo_dl/extractor/polskieradio.py @@ -2,6 +2,8 @@ from __future__ import unicode_literals import itertools +import json +import math import re 
from .common import InfoExtractor @@ -12,15 +14,45 @@ from ..compat import ( ) from ..utils import ( extract_attributes, + ExtractorError, + InAdvancePagedList, int_or_none, + js_to_json, + parse_iso8601, strip_or_none, unified_timestamp, unescapeHTML, + url_or_none, ) -class PolskieRadioIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+/\d+/Artykul/(?P<id>[0-9]+)' +class PolskieRadioBaseExtractor(InfoExtractor): + def _extract_webpage_player_entries(self, webpage, playlist_id, base_data): + media_urls = set() + + for data_media in re.findall(r'<[^>]+data-media="?({[^>]+})"?', webpage): + media = self._parse_json(data_media, playlist_id, transform_source=unescapeHTML, fatal=False) + if not media.get('file') or not media.get('desc'): + continue + media_url = self._proto_relative_url(media['file']) + if media_url in media_urls: + continue + media_urls.add(media_url) + entry = base_data.copy() + entry.update({ + 'id': compat_str(media['id']), + 'url': media_url, + 'duration': int_or_none(media.get('length')), + 'vcodec': 'none' if media.get('provider') == 'audio' else None, + }) + entry_title = compat_urllib_parse_unquote(media['desc']) + if entry_title: + entry['title'] = entry_title + yield entry + + +class PolskieRadioIE(PolskieRadioBaseExtractor): + _VALID_URL = r'https?://(?:www\.)?polskieradio(?:24)?\.pl/\d+/\d+/Artykul/(?P<id>[0-9]+)' _TESTS = [{ # Old-style single broadcast. 'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943,Prof-Andrzej-Nowak-o-historii-nie-da-sie-myslec-beznamietnie', 'info_dict': { @@ -59,22 +91,14 @@ class PolskieRadioIE(InfoExtractor): 'thumbnail': r're:^https?://static\.prsa\.pl/images/.*\.jpg$' }, }], - }, { # Old-style multiple broadcast playlist. - 'url': 'https://www.polskieradio.pl/8/4346/Artykul/2487823,Marek-Kondrat-czyta-Mistrza-i-Malgorzate', - 'info_dict': { - 'id': '2487823', - 'title': 'Marek Kondrat czyta "Mistrza i Małgorzatę"', - 'description': 'md5:8422a95cc83834f2aaeff9d82e9c8f39', - }, - 'playlist_mincount': 50, - }, { # New-style multiple broadcast playlist. - 'url': 'https://www.polskieradio.pl/8/4346/Artykul/2541317,Czytamy-Kalendarz-i-klepsydre-Tadeusza-Konwickiego', + }, { + # PR4 audition - other frontend + 'url': 'https://www.polskieradio.pl/10/6071/Artykul/2610977,Poglos-29-pazdziernika-godz-2301', 'info_dict': { - 'id': '2541317', - 'title': 'Czytamy "Kalendarz i klepsydrę" Tadeusza Konwickiego', - 'description': 'md5:0baeaa46d877f1351fb2eeed3e871f9f', + 'id': '2610977', + 'ext': 'mp3', + 'title': 'Pogłos 29 października godz. 
23:01', }, - 'playlist_mincount': 15, }, { 'url': 'http://polskieradio.pl/9/305/Artykul/1632955,Bardzo-popularne-slowo-remis', 'only_matching': True, @@ -85,6 +109,9 @@ class PolskieRadioIE(InfoExtractor): # with mp4 video 'url': 'http://www.polskieradio.pl/9/299/Artykul/1634903,Brexit-Leszek-Miller-swiat-sie-nie-zawali-Europa-bedzie-trwac-dalej', 'only_matching': True, + }, { + 'url': 'https://polskieradio24.pl/130/4503/Artykul/2621876,Narusza-nasza-suwerennosc-Publicysci-o-uzaleznieniu-funduszy-UE-od-praworzadnosci', + 'only_matching': True, }] def _real_extract(self, url): @@ -94,39 +121,37 @@ class PolskieRadioIE(InfoExtractor): content = self._search_regex( r'(?s)<div[^>]+class="\s*this-article\s*"[^>]*>(.+?)<div[^>]+class="tags"[^>]*>', - webpage, 'content') + webpage, 'content', default=None) timestamp = unified_timestamp(self._html_search_regex( r'(?s)<span[^>]+id="datetime2"[^>]*>(.+?)</span>', - webpage, 'timestamp', fatal=False)) + webpage, 'timestamp', default=None)) - thumbnail_url = self._og_search_thumbnail(webpage) + thumbnail_url = self._og_search_thumbnail(webpage, default=None) - entries = [] + title = self._og_search_title(webpage).strip() - media_urls = set() + description = strip_or_none(self._og_search_description(webpage, default=None)) + description = description.replace('\xa0', ' ') if description is not None else None - for data_media in re.findall(r'<[^>]+data-media="?({[^>]+})"?', content): - media = self._parse_json(data_media, playlist_id, transform_source=unescapeHTML, fatal=False) - if not media.get('file') or not media.get('desc'): - continue - media_url = self._proto_relative_url(media['file'], 'http:') - if media_url in media_urls: - continue - media_urls.add(media_url) - entries.append({ - 'id': compat_str(media['id']), - 'url': media_url, - 'title': compat_urllib_parse_unquote(media['desc']), - 'duration': int_or_none(media.get('length')), - 'vcodec': 'none' if media.get('provider') == 'audio' else None, + if not content: + return { + 'id': playlist_id, + 'url': self._proto_relative_url( + self._search_regex( + r"source:\s*'(//static\.prsa\.pl/[^']+)'", + webpage, 'audition record url')), + 'title': title, + 'description': description, 'timestamp': timestamp, - 'thumbnail': thumbnail_url - }) + 'thumbnail': thumbnail_url, + } - title = self._og_search_title(webpage).strip() - description = strip_or_none(self._og_search_description(webpage)) - description = description.replace('\xa0', ' ') if description is not None else None + entries = self._extract_webpage_player_entries(content, playlist_id, { + 'title': title, + 'timestamp': timestamp, + 'thumbnail': thumbnail_url, + }) return self.playlist_result(entries, playlist_id, title, description) @@ -207,3 +232,201 @@ class PolskieRadioCategoryIE(InfoExtractor): return self.playlist_result( self._entries(url, webpage, category_id), category_id, title) + + +class PolskieRadioPlayerIE(InfoExtractor): + IE_NAME = 'polskieradio:player' + _VALID_URL = r'https?://player\.polskieradio\.pl/anteny/(?P<id>[^/]+)' + + _BASE_URL = 'https://player.polskieradio.pl' + _PLAYER_URL = 'https://player.polskieradio.pl/main.bundle.js' + _STATIONS_API_URL = 'https://apipr.polskieradio.pl/api/stacje' + + _TESTS = [{ + 'url': 'https://player.polskieradio.pl/anteny/trojka', + 'info_dict': { + 'id': '3', + 'ext': 'm4a', + 'title': 'Trójka', + }, + 'params': { + 'format': 'bestaudio', + 'skip_download': 'endless stream', + }, + }] + + def _get_channel_list(self, channel_url='no_channel'): + player_code = self._download_webpage( + 
self._PLAYER_URL, channel_url,
+            note='Downloading js player')
+        channel_list = js_to_json(self._search_regex(
+            r';var r="anteny",a=(\[.+?\])},', player_code, 'channel list'))
+        return self._parse_json(channel_list, channel_url)
+
+    def _real_extract(self, url):
+        channel_url = self._match_id(url)
+        channel_list = self._get_channel_list(channel_url)
+
+        channel = next((c for c in channel_list if c.get('url') == channel_url), None)
+
+        if not channel:
+            raise ExtractorError('Channel not found')
+
+        station_list = self._download_json(self._STATIONS_API_URL, channel_url,
+                                           note='Downloading stream url list',
+                                           headers={
+                                               'Accept': 'application/json',
+                                               'Referer': url,
+                                               'Origin': self._BASE_URL,
+                                           })
+        station = next((s for s in station_list
+                        if s.get('Name') == (channel.get('streamName') or channel.get('name'))), None)
+        if not station:
+            raise ExtractorError('Station not found even though the channel was extracted')
+
+        formats = []
+        for stream_url in station['Streams']:
+            stream_url = self._proto_relative_url(stream_url)
+            if stream_url.endswith('/playlist.m3u8'):
+                formats.extend(self._extract_m3u8_formats(stream_url, channel_url, live=True))
+            elif stream_url.endswith('/manifest.f4m'):
+                formats.extend(self._extract_f4m_formats(stream_url, channel_url))
+            elif stream_url.endswith('/Manifest'):
+                formats.extend(self._extract_ism_formats(stream_url, channel_url))
+            else:
+                formats.append({
+                    'url': stream_url,
+                })
+
+        self._sort_formats(formats)
+
+        return {
+            'id': compat_str(channel['id']),
+            'formats': formats,
+            'title': channel.get('name') or channel.get('streamName'),
+            'display_id': channel_url,
+            'thumbnail': f'{self._BASE_URL}/images/{channel_url}-color-logo.png',
+            'is_live': True,
+        }
+
+
+class PolskieRadioPodcastBaseExtractor(InfoExtractor):
+    _API_BASE = 'https://apipodcasts.polskieradio.pl/api'
+
+    def _parse_episode(self, data):
+        return {
+            'id': data['guid'],
+            'formats': [{
+                'url': data['url'],
+                'filesize': int_or_none(data.get('fileSize')),
+            }],
+            'title': data['title'],
+            'description': data.get('description'),
+            'duration': int_or_none(data.get('length')),
+            'timestamp': parse_iso8601(data.get('publishDate')),
+            'thumbnail': url_or_none(data.get('image')),
+            'series': data.get('podcastTitle'),
+            'episode': data['title'],
+        }
+
+
+class PolskieRadioPodcastListIE(PolskieRadioPodcastBaseExtractor):
+    IE_NAME = 'polskieradio:podcast:list'
+    _VALID_URL = r'https?://podcasty\.polskieradio\.pl/podcast/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://podcasty.polskieradio.pl/podcast/8/',
+        'info_dict': {
+            'id': '8',
+            'title': 'Śniadanie w Trójce',
+            'description': 'md5:57abcc27bc4c6a6b25baa3061975b9ef',
+            'uploader': 'Beata Michniewicz',
+        },
+        'playlist_mincount': 714,
+    }]
+    _PAGE_SIZE = 10
+
+    def _call_api(self, podcast_id, page):
+        return self._download_json(
+            f'{self._API_BASE}/Podcasts/{podcast_id}/?pageSize={self._PAGE_SIZE}&page={page}',
+            podcast_id, f'Downloading page {page}')
+
+    def _real_extract(self, url):
+        podcast_id = self._match_id(url)
+        data = self._call_api(podcast_id, 1)
+
+        def get_page(page_num):
+            page_data = self._call_api(podcast_id, page_num + 1) if page_num else data
+            yield from (self._parse_episode(ep) for ep in page_data['items'])
+
+        return {
+            '_type': 'playlist',
+            'entries': InAdvancePagedList(
+                get_page, math.ceil(data['itemCount'] / self._PAGE_SIZE), self._PAGE_SIZE),
+            'id': str(data['id']),
+            'title': data['title'],
+            'description': data.get('description'),
+            'uploader': data.get('announcer'),
+        }
+
+
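PolskieRadioPodcastListIE above learns the total item count from its first API call, so it can hand InAdvancePagedList a page function, a page count and a page size, and pages are only fetched when the playlist is actually walked that far. A self-contained sketch of the same idea (an illustrative stand-in, not yt-dlp's actual class):

import math

class LazyPages:
    # Stand-in for yt_dlp.utils.InAdvancePagedList: the page count is known
    # in advance, but each page is fetched lazily during iteration.
    def __init__(self, get_page, page_count, page_size):
        self._get_page = get_page
        self._page_count = page_count
        self._page_size = page_size

    def __iter__(self):
        for page_num in range(self._page_count):
            yield from self._get_page(page_num)

items = list(range(23))
pages = LazyPages(lambda n: items[n * 10:(n + 1) * 10], math.ceil(len(items) / 10), 10)
assert list(pages) == items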
+class PolskieRadioPodcastIE(PolskieRadioPodcastBaseExtractor):
+    IE_NAME = 'polskieradio:podcast'
+    _VALID_URL = r'https?://podcasty\.polskieradio\.pl/track/(?P<id>[a-f\d]{8}(?:-[a-f\d]{4}){4}[a-f\d]{8})'
+    _TESTS = [{
+        'url': 'https://podcasty.polskieradio.pl/track/6eafe403-cb8f-4756-b896-4455c3713c32',
+        'info_dict': {
+            'id': '6eafe403-cb8f-4756-b896-4455c3713c32',
+            'ext': 'mp3',
+            'title': 'Theresa May rezygnuje. Co dalej z brexitem?',
+            'description': 'md5:e41c409a29d022b70ef0faa61dbded60',
+        },
+    }]
+
+    def _real_extract(self, url):
+        podcast_id = self._match_id(url)
+        data = self._download_json(
+            f'{self._API_BASE}/audio',
+            podcast_id, 'Downloading podcast metadata',
+            data=json.dumps({
+                'guids': [podcast_id],
+            }).encode('utf-8'),
+            headers={
+                'Content-Type': 'application/json',
+            })
+        return self._parse_episode(data[0])
+
+
+class PolskieRadioRadioKierowcowIE(PolskieRadioBaseExtractor):
+    _VALID_URL = r'https?://(?:www\.)?radiokierowcow\.pl/artykul/(?P<id>[0-9]+)'
+    IE_NAME = 'polskieradio:kierowcow'
+
+    _TESTS = [{
+        'url': 'https://radiokierowcow.pl/artykul/2694529',
+        'info_dict': {
+            'id': '2694529',
+            'title': 'Zielona fala reliktem przeszłości?',
+            'description': 'md5:343950a8717c9818fdfd4bd2b8ca9ff2',
+        },
+        'playlist_count': 3,
+    }]
+
+    def _real_extract(self, url):
+        media_id = self._match_id(url)
+        webpage = self._download_webpage(url, media_id)
+        nextjs_build = self._search_nextjs_data(webpage, media_id)['buildId']
+        article = self._download_json(
+            f'https://radiokierowcow.pl/_next/data/{nextjs_build}/artykul/{media_id}.json?articleId={media_id}',
+            media_id)
+        data = article['pageProps']['data']
+        title = data['title']
+        entries = self._extract_webpage_player_entries(data['content'], media_id, {
+            'title': title,
+        })
+
+        return {
+            '_type': 'playlist',
+            'id': media_id,
+            'entries': entries,
+            'title': title,
+            'description': data.get('lead'),
+        }
diff --git a/hypervideo_dl/extractor/pornez.py b/hypervideo_dl/extractor/pornez.py
new file mode 100644
index 0000000..713dc00
--- /dev/null
+++ b/hypervideo_dl/extractor/pornez.py
@@ -0,0 +1,43 @@
+# coding: utf-8
+from __future__ import unicode_literals
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class PornezIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?pornez\.net/video(?P<id>[0-9]+)/'
+    _TEST = {
+        'url': 'https://pornez.net/video344819/mistresst-funny_penis_names-wmv/',
+        'md5': '2e19a0a1cff3a5dbea0ef1b9e80bcbbc',
+        'info_dict': {
+            'id': '344819',
+            'ext': 'mp4',
+            'title': r'mistresst funny_penis_names wmv',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'age_limit': 18,
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        iframe_src = self._html_search_regex(
+            r'<iframe[^>]+src="(https?://pornez\.net/player/\?[^"]+)"', webpage, 'iframe', fatal=True)
+        title = self._html_search_meta(['name', 'twitter:title', 'og:title'], webpage, 'title', default=None)
+        if title is None:
+            title = self._search_regex(r'<h1>(.*?)</h1>', webpage, 'title', fatal=True)
+        thumbnail = self._html_search_meta(['thumbnailUrl'], webpage, 'thumbnail', default=None)
+        webpage = self._download_webpage(iframe_src, video_id)
+        entries = self._parse_html5_media_entries(iframe_src, webpage, video_id)[0]
+        for fmt in entries['formats']:
+            height = self._search_regex(r'_(\d+)\.m3u8', fmt['url'], 'height')
+            fmt['format_id'] = '%sp' % height
+            fmt['height'] = int_or_none(height)
+
+        entries.update({
+            'id': video_id,
+            'title': title,
'thumbnail': thumbnail, + 'age_limit': 18 + }) + return entries diff --git a/hypervideo_dl/extractor/pornflip.py b/hypervideo_dl/extractor/pornflip.py index d0aefa2..accf452 100644 --- a/hypervideo_dl/extractor/pornflip.py +++ b/hypervideo_dl/extractor/pornflip.py @@ -29,7 +29,6 @@ class PornFlipIE(InfoExtractor): 'age_limit': 18, }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, }, diff --git a/hypervideo_dl/extractor/pornhub.py b/hypervideo_dl/extractor/pornhub.py index 6d894af..17c8c91 100644 --- a/hypervideo_dl/extractor/pornhub.py +++ b/hypervideo_dl/extractor/pornhub.py @@ -18,6 +18,7 @@ from ..utils import ( clean_html, determine_ext, ExtractorError, + format_field, int_or_none, merge_dicts, NO_DEFAULT, @@ -32,7 +33,7 @@ from ..utils import ( class PornHubBaseIE(InfoExtractor): _NETRC_MACHINE = 'pornhub' - _PORNHUB_HOST_RE = r'(?:(?P<host>pornhub(?:premium)?\.(?:com|net|org))|pornhubthbh7ap3u\.onion)' + _PORNHUB_HOST_RE = r'(?:(?P<host>pornhub(?:premium)?\.(?:com|net|org))|pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd\.onion)' def _download_webpage_handle(self, *args, **kwargs): def dl(*args, **kwargs): @@ -247,7 +248,7 @@ class PornHubIE(PornHubBaseIE): 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5a9813bfa7156', 'only_matching': True, }, { - 'url': 'http://pornhubthbh7ap3u.onion/view_video.php?viewkey=ph5a9813bfa7156', + 'url': 'http://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/view_video.php?viewkey=ph5a9813bfa7156', 'only_matching': True, }] @@ -258,8 +259,7 @@ class PornHubIE(PornHubBaseIE): webpage) def _extract_count(self, pattern, webpage, name): - return str_to_int(self._search_regex( - pattern, webpage, '%s count' % name, fatal=False)) + return str_to_int(self._search_regex(pattern, webpage, '%s count' % name, default=None)) def _real_extract(self, url): mobj = self._match_valid_url(url) @@ -432,7 +432,7 @@ class PornHubIE(PornHubBaseIE): default=None)) formats.append({ 'url': format_url, - 'format_id': '%dp' % height if height else None, + 'format_id': format_field(height, template='%dp'), 'height': height, }) @@ -562,7 +562,7 @@ class PornHubUserIE(PornHubPlaylistBaseIE): 'url': 'https://www.pornhubpremium.com/pornstar/lily-labeau', 'only_matching': True, }, { - 'url': 'https://pornhubthbh7ap3u.onion/model/zoe_ph', + 'url': 'https://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/model/zoe_ph', 'only_matching': True, }] @@ -733,7 +733,7 @@ class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE): 'url': 'https://www.pornhub.com/video/incategories/60fps-1/hd-porn', 'only_matching': True, }, { - 'url': 'https://pornhubthbh7ap3u.onion/model/zoe_ph/videos', + 'url': 'https://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/model/zoe_ph/videos', 'only_matching': True, }] @@ -756,7 +756,7 @@ class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE): 'url': 'https://www.pornhub.com/model/zoe_ph/videos/upload', 'only_matching': True, }, { - 'url': 'http://pornhubthbh7ap3u.onion/pornstar/jenny-blighe/videos/upload', + 'url': 'http://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/pornstar/jenny-blighe/videos/upload', 'only_matching': True, }] diff --git a/hypervideo_dl/extractor/projectveritas.py b/hypervideo_dl/extractor/projectveritas.py index 1d832a6..9e9867b 100644 --- a/hypervideo_dl/extractor/projectveritas.py +++ b/hypervideo_dl/extractor/projectveritas.py @@ -10,7 +10,7 @@ from ..utils import ( class ProjectVeritasIE(InfoExtractor): - _VALID_URL = 
r'(?:https?://)(?:www\.)?projectveritas\.com/(?P<type>news|video)/(?P<id>[^/?#]+)'
+    _VALID_URL = r'https?://(?:www\.)?projectveritas\.com/(?P<type>news|video)/(?P<id>[^/?#]+)'
     _TESTS = [{
         'url': 'https://www.projectveritas.com/news/exclusive-inside-the-new-york-and-new-jersey-hospitals-battling-coronavirus/',
         'info_dict': {
diff --git a/hypervideo_dl/extractor/prx.py b/hypervideo_dl/extractor/prx.py
new file mode 100644
index 0000000..80561b8
--- /dev/null
+++ b/hypervideo_dl/extractor/prx.py
@@ -0,0 +1,431 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import itertools
+from .common import InfoExtractor, SearchInfoExtractor
+from ..utils import (
+    urljoin,
+    traverse_obj,
+    int_or_none,
+    mimetype2ext,
+    clean_html,
+    url_or_none,
+    unified_timestamp,
+    str_or_none,
+)
+
+
+class PRXBaseIE(InfoExtractor):
+    PRX_BASE_URL_RE = r'https?://(?:(?:beta|listen)\.)?prx\.org/%s'
+
+    def _call_api(self, item_id, path, query=None, fatal=True, note='Downloading CMS API JSON'):
+        return self._download_json(
+            urljoin('https://cms.prx.org/api/v1/', path), item_id, query=query, fatal=fatal, note=note)
+
+    @staticmethod
+    def _get_prx_embed_response(response, section):
+        return traverse_obj(response, ('_embedded', f'prx:{section}'))
+
+    @staticmethod
+    def _extract_file_link(response):
+        return url_or_none(traverse_obj(
+            response, ('_links', 'enclosure', 'href'), expected_type=str))
+
+    @classmethod
+    def _extract_image(cls, image_response):
+        if not isinstance(image_response, dict):
+            return
+        return {
+            'id': str_or_none(image_response.get('id')),
+            'filesize': image_response.get('size'),
+            'width': image_response.get('width'),
+            'height': image_response.get('height'),
+            'url': cls._extract_file_link(image_response)
+        }
+
+    @classmethod
+    def _extract_base_info(cls, response):
+        if not isinstance(response, dict):
+            return
+        item_id = str_or_none(response.get('id'))
+        if not item_id:
+            return
+        thumbnail_dict = cls._extract_image(cls._get_prx_embed_response(response, 'image'))
+        description = (
+            clean_html(response.get('description'))
+            or response.get('shortDescription'))
+        return {
+            'id': item_id,
+            'title': response.get('title') or item_id,
+            'thumbnails': [thumbnail_dict] if thumbnail_dict else None,
+            'description': description,
+            'release_timestamp': unified_timestamp(response.get('releasedAt')),
+            'timestamp': unified_timestamp(response.get('createdAt')),
+            'modified_timestamp': unified_timestamp(response.get('updatedAt')),
+            'duration': int_or_none(response.get('duration')),
+            'tags': response.get('tags'),
+            'episode_number': int_or_none(response.get('episodeIdentifier')),
+            'season_number': int_or_none(response.get('seasonIdentifier'))
+        }
+
+    @classmethod
+    def _extract_series_info(cls, series_response):
+        base_info = cls._extract_base_info(series_response)
+        if not base_info:
+            return
+        account_info = cls._extract_account_info(
+            cls._get_prx_embed_response(series_response, 'account')) or {}
+        return {
+            **base_info,
+            'channel_id': account_info.get('channel_id'),
+            'channel_url': account_info.get('channel_url'),
+            'channel': account_info.get('channel'),
+            'series': base_info.get('title'),
+            'series_id': base_info.get('id'),
+        }
+
+    @classmethod
+    def _extract_account_info(cls, account_response):
+        base_info = cls._extract_base_info(account_response)
+        if not base_info:
+            return
+        name = account_response.get('name')
+        return {
+            **base_info,
+            'title': name,
+            'channel_id': base_info.get('id'),
+            'channel_url': 'https://beta.prx.org/accounts/%s' % base_info.get('id'),
+
'channel': name, + } + + @classmethod + def _extract_story_info(cls, story_response): + base_info = cls._extract_base_info(story_response) + if not base_info: + return + series = cls._extract_series_info( + cls._get_prx_embed_response(story_response, 'series')) or {} + account = cls._extract_account_info( + cls._get_prx_embed_response(story_response, 'account')) or {} + return { + **base_info, + 'series': series.get('series'), + 'series_id': series.get('series_id'), + 'channel_id': account.get('channel_id'), + 'channel_url': account.get('channel_url'), + 'channel': account.get('channel') + } + + def _entries(self, item_id, endpoint, entry_func, query=None): + """ + Extract entries from paginated list API + @param entry_func: Function to generate entry from response item + """ + total = 0 + for page in itertools.count(1): + response = self._call_api(f'{item_id}: page {page}', endpoint, query={ + **(query or {}), + 'page': page, + 'per': 100 + }) + items = self._get_prx_embed_response(response, 'items') + if not response or not items: + break + + yield from filter(None, map(entry_func, items)) + + total += response['count'] + if total >= response['total']: + break + + def _story_playlist_entry(self, response): + story = self._extract_story_info(response) + if not story: + return + story.update({ + '_type': 'url', + 'url': 'https://beta.prx.org/stories/%s' % story['id'], + 'ie_key': PRXStoryIE.ie_key() + }) + return story + + def _series_playlist_entry(self, response): + series = self._extract_series_info(response) + if not series: + return + series.update({ + '_type': 'url', + 'url': 'https://beta.prx.org/series/%s' % series['id'], + 'ie_key': PRXSeriesIE.ie_key() + }) + return series + + +class PRXStoryIE(PRXBaseIE): + _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'stories/(?P<id>\d+)' + + _TESTS = [ + { + # Story with season and episode details + 'url': 'https://beta.prx.org/stories/399200', + 'info_dict': { + 'id': '399200', + 'title': 'Fly Me To The Moon', + 'description': 'md5:43230168390b95d3322048d8a56bf2bb', + 'release_timestamp': 1640250000, + 'timestamp': 1640208972, + 'modified_timestamp': 1641318202, + 'duration': 1004, + 'tags': 'count:7', + 'episode_number': 8, + 'season_number': 5, + 'series': 'AirSpace', + 'series_id': '38057', + 'channel_id': '220986', + 'channel_url': 'https://beta.prx.org/accounts/220986', + 'channel': 'Air and Space Museum', + }, + 'playlist': [{ + 'info_dict': { + 'id': '399200_part1', + 'title': 'Fly Me To The Moon', + 'description': 'md5:43230168390b95d3322048d8a56bf2bb', + 'release_timestamp': 1640250000, + 'timestamp': 1640208972, + 'modified_timestamp': 1641318202, + 'duration': 530, + 'tags': 'count:7', + 'episode_number': 8, + 'season_number': 5, + 'series': 'AirSpace', + 'series_id': '38057', + 'channel_id': '220986', + 'channel_url': 'https://beta.prx.org/accounts/220986', + 'channel': 'Air and Space Museum', + 'ext': 'mp3', + 'upload_date': '20211222', + 'episode': 'Episode 8', + 'release_date': '20211223', + 'season': 'Season 5', + 'modified_date': '20220104' + } + }, { + 'info_dict': { + 'id': '399200_part2', + 'title': 'Fly Me To The Moon', + 'description': 'md5:43230168390b95d3322048d8a56bf2bb', + 'release_timestamp': 1640250000, + 'timestamp': 1640208972, + 'modified_timestamp': 1641318202, + 'duration': 474, + 'tags': 'count:7', + 'episode_number': 8, + 'season_number': 5, + 'series': 'AirSpace', + 'series_id': '38057', + 'channel_id': '220986', + 'channel_url': 'https://beta.prx.org/accounts/220986', + 'channel': 'Air and Space Museum', + 
'ext': 'mp3', + 'upload_date': '20211222', + 'episode': 'Episode 8', + 'release_date': '20211223', + 'season': 'Season 5', + 'modified_date': '20220104' + } + } + + ] + }, { + # Story with only split audio + 'url': 'https://beta.prx.org/stories/326414', + 'info_dict': { + 'id': '326414', + 'title': 'Massachusetts v EPA', + 'description': 'md5:744fffba08f19f4deab69fa8d49d5816', + 'timestamp': 1592509124, + 'modified_timestamp': 1592510457, + 'duration': 3088, + 'tags': 'count:0', + 'series': 'Outside/In', + 'series_id': '36252', + 'channel_id': '206', + 'channel_url': 'https://beta.prx.org/accounts/206', + 'channel': 'New Hampshire Public Radio', + }, + 'playlist_count': 4 + }, { + # Story with single combined audio + 'url': 'https://beta.prx.org/stories/400404', + 'info_dict': { + 'id': '400404', + 'title': 'Cafe Chill (Episode 2022-01)', + 'thumbnails': 'count:1', + 'description': 'md5:9f1b5a3cbd64fb159d08c3baa31f1539', + 'timestamp': 1641233952, + 'modified_timestamp': 1641234248, + 'duration': 3540, + 'series': 'Café Chill', + 'series_id': '37762', + 'channel_id': '5767', + 'channel_url': 'https://beta.prx.org/accounts/5767', + 'channel': 'C89.5 - KNHC Seattle', + 'ext': 'mp3', + 'tags': 'count:0', + 'thumbnail': r're:https?://cms\.prx\.org/pub/\w+/0/web/story_image/767965/medium/Aurora_Over_Trees\.jpg', + 'upload_date': '20220103', + 'modified_date': '20220103' + } + }, { + 'url': 'https://listen.prx.org/stories/399200', + 'only_matching': True + } + ] + + def _extract_audio_pieces(self, audio_response): + return [{ + 'format_id': str_or_none(piece_response.get('id')), + 'format_note': str_or_none(piece_response.get('label')), + 'filesize': int_or_none(piece_response.get('size')), + 'duration': int_or_none(piece_response.get('duration')), + 'ext': mimetype2ext(piece_response.get('contentType')), + 'asr': int_or_none(piece_response.get('frequency'), scale=1000), + 'abr': int_or_none(piece_response.get('bitRate')), + 'url': self._extract_file_link(piece_response), + 'vcodec': 'none' + } for piece_response in sorted( + self._get_prx_embed_response(audio_response, 'items') or [], + key=lambda p: int_or_none(p.get('position')))] + + def _extract_story(self, story_response): + info = self._extract_story_info(story_response) + if not info: + return + audio_pieces = self._extract_audio_pieces( + self._get_prx_embed_response(story_response, 'audio')) + if len(audio_pieces) == 1: + return { + 'formats': audio_pieces, + **info + } + + entries = [{ + **info, + 'id': '%s_part%d' % (info['id'], (idx + 1)), + 'formats': [fmt], + } for idx, fmt in enumerate(audio_pieces)] + return { + '_type': 'multi_video', + 'entries': entries, + **info + } + + def _real_extract(self, url): + story_id = self._match_id(url) + response = self._call_api(story_id, f'stories/{story_id}') + return self._extract_story(response) + + +class PRXSeriesIE(PRXBaseIE): + _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'series/(?P<id>\d+)' + _TESTS = [ + { + 'url': 'https://beta.prx.org/series/36252', + 'info_dict': { + 'id': '36252', + 'title': 'Outside/In', + 'thumbnails': 'count:1', + 'description': 'md5:a6bedc5f810777bcb09ab30ff9059114', + 'timestamp': 1470684964, + 'modified_timestamp': 1582308830, + 'channel_id': '206', + 'channel_url': 'https://beta.prx.org/accounts/206', + 'channel': 'New Hampshire Public Radio', + 'series': 'Outside/In', + 'series_id': '36252' + }, + 'playlist_mincount': 39 + }, { + # Blank series + 'url': 'https://beta.prx.org/series/25038', + 'info_dict': { + 'id': '25038', + 'title': '25038', + 'timestamp': 
1207612800,
+                'modified_timestamp': 1207612800,
+                'channel_id': '206',
+                'channel_url': 'https://beta.prx.org/accounts/206',
+                'channel': 'New Hampshire Public Radio',
+                'series': '25038',
+                'series_id': '25038'
+            },
+            'playlist_count': 0
+        }
+    ]
+
+    def _extract_series(self, series_response):
+        info = self._extract_series_info(series_response)
+        return {
+            '_type': 'playlist',
+            'entries': self._entries(info['id'], 'series/%s/stories' % info['id'], self._story_playlist_entry),
+            **info
+        }
+
+    def _real_extract(self, url):
+        series_id = self._match_id(url)
+        response = self._call_api(series_id, f'series/{series_id}')
+        return self._extract_series(response)
+
+
+class PRXAccountIE(PRXBaseIE):
+    _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'accounts/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://beta.prx.org/accounts/206',
+        'info_dict': {
+            'id': '206',
+            'title': 'New Hampshire Public Radio',
+            'description': 'md5:277f2395301d0aca563c80c70a18ee0a',
+            'channel_id': '206',
+            'channel_url': 'https://beta.prx.org/accounts/206',
+            'channel': 'New Hampshire Public Radio',
+            'thumbnails': 'count:1'
+        },
+        'playlist_mincount': 380
+    }]
+
+    def _extract_account(self, account_response):
+        info = self._extract_account_info(account_response)
+        series = self._entries(
+            info['id'], f'accounts/{info["id"]}/series', self._series_playlist_entry)
+        stories = self._entries(
+            info['id'], f'accounts/{info["id"]}/stories', self._story_playlist_entry)
+        return {
+            '_type': 'playlist',
+            'entries': itertools.chain(series, stories),
+            **info
+        }
+
+    def _real_extract(self, url):
+        account_id = self._match_id(url)
+        response = self._call_api(account_id, f'accounts/{account_id}')
+        return self._extract_account(response)
+
+
+class PRXStoriesSearchIE(PRXBaseIE, SearchInfoExtractor):
+    IE_DESC = 'PRX Stories Search'
+    IE_NAME = 'prxstories:search'
+    _SEARCH_KEY = 'prxstories'
+
+    def _search_results(self, query):
+        yield from self._entries(
+            f'query {query}', 'stories/search', self._story_playlist_entry, query={'q': query})
+
+
+class PRXSeriesSearchIE(PRXBaseIE, SearchInfoExtractor):
+    IE_DESC = 'PRX Series Search'
+    IE_NAME = 'prxseries:search'
+    _SEARCH_KEY = 'prxseries'
+
+    def _search_results(self, query):
+        yield from self._entries(
+            f'query {query}', 'series/search', self._series_playlist_entry, query={'q': query})
diff --git a/hypervideo_dl/extractor/radiode.py b/hypervideo_dl/extractor/radiode.py
index 2c06c8b..0382873 100644
--- a/hypervideo_dl/extractor/radiode.py
+++ b/hypervideo_dl/extractor/radiode.py
@@ -29,7 +29,7 @@ class RadioDeIE(InfoExtractor):
             webpage, 'broadcast')
         broadcast = self._parse_json(jscode, radio_id)
 
-        title = self._live_title(broadcast['name'])
+        title = broadcast['name']
         description = broadcast.get('description') or broadcast.get('shortDescription')
         thumbnail = broadcast.get('picture4Url') or broadcast.get('picture4TransUrl') or broadcast.get('logo100x100')
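The radiokapital extractor below never touches audio directly: _parse_episode returns a url_transparent result, which makes the downloader resolve the Mixcloud URL with the Mixcloud extractor and then overlay the metadata supplied here. A minimal sketch of that result shape (field values are illustrative):

# A url_transparent entry delegates format extraction to whichever extractor
# handles 'url' (Mixcloud here), then merges these fields over its result.
def episode_entry(data):
    return {
        '_type': 'url_transparent',
        'url': data['mixcloud_url'],   # resolved by the Mixcloud extractor
        'ie_key': 'Mixcloud',          # pin the extractor instead of guessing
        'title': data['title'],        # overrides the Mixcloud-derived title
    }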
diff --git a/hypervideo_dl/extractor/radiokapital.py b/hypervideo_dl/extractor/radiokapital.py
new file mode 100644
index 0000000..2e93e03
--- /dev/null
+++ b/hypervideo_dl/extractor/radiokapital.py
@@ -0,0 +1,99 @@
+# coding: utf-8
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    traverse_obj,
+    unescapeHTML,
+)
+
+import itertools
+from urllib.parse import urlencode
+
+
+class RadioKapitalBaseIE(InfoExtractor):
+    def _call_api(self, resource, video_id, note='Downloading JSON metadata', qs=None):
+        return self._download_json(
+            f'https://www.radiokapital.pl/wp-json/kapital/v1/{resource}?{urlencode(qs or {})}',
+            video_id, note=note)
+
+    def _parse_episode(self, data):
+        release = '%s%s%s' % (data['published'][6:10], data['published'][3:5], data['published'][:2])  # day-first 'dd-mm-yyyy...' rearranged to YYYYMMDD
+        return {
+            '_type': 'url_transparent',
+            'url': data['mixcloud_url'],
+            'ie_key': 'Mixcloud',
+            'title': unescapeHTML(data['title']),
+            'description': clean_html(data.get('content')),
+            'tags': traverse_obj(data, ('tags', ..., 'name')),
+            'release_date': release,
+            'series': traverse_obj(data, ('show', 'title')),
+        }
+
+
+class RadioKapitalIE(RadioKapitalBaseIE):
+    IE_NAME = 'radiokapital'
+    _VALID_URL = r'https?://(?:www\.)?radiokapital\.pl/shows/[a-z\d-]+/(?P<id>[a-z\d-]+)'
+
+    _TESTS = [{
+        'url': 'https://radiokapital.pl/shows/tutaj-sa-smoki/5-its-okay-to-be-immaterial',
+        'info_dict': {
+            'id': 'radiokapital_radio-kapitał-tutaj-są-smoki-5-its-okay-to-be-immaterial-2021-05-20',
+            'ext': 'm4a',
+            'title': '#5: It’s okay to\xa0be\xa0immaterial',
+            'description': 'md5:2499da5fbfb0e88333b7d37ec8e9e4c4',
+            'uploader': 'Radio Kapitał',
+            'uploader_id': 'radiokapital',
+            'timestamp': 1621640164,
+            'upload_date': '20210521',
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        episode = self._call_api('episodes/%s' % video_id, video_id)
+        return self._parse_episode(episode)
+
+
+class RadioKapitalShowIE(RadioKapitalBaseIE):
+    IE_NAME = 'radiokapital:show'
+    _VALID_URL = r'https?://(?:www\.)?radiokapital\.pl/shows/(?P<id>[a-z\d-]+)/?(?:$|[?#])'
+
+    _TESTS = [{
+        'url': 'https://radiokapital.pl/shows/wesz',
+        'info_dict': {
+            'id': '100',
+            'title': 'WĘSZ',
+            'description': 'md5:3a557a1e0f31af612b0dcc85b1e0ca5c',
+        },
+        'playlist_mincount': 17,
+    }]
+
+    def _get_episode_list(self, series_id, page_no):
+        return self._call_api(
+            'episodes', series_id,
+            f'Downloading episode list page #{page_no}', qs={
+                'show': series_id,
+                'page': page_no,
+            })
+
+    def _entries(self, series_id):
+        for page_no in itertools.count(1):
+            episode_list = self._get_episode_list(series_id, page_no)
+            yield from (self._parse_episode(ep) for ep in episode_list['items'])
+            if episode_list['next'] is None:
+                break
+
+    def _real_extract(self, url):
+        series_id = self._match_id(url)
+
+        show = self._call_api(f'shows/{series_id}', series_id, 'Downloading show metadata')
+        entries = self._entries(series_id)
+        return {
+            '_type': 'playlist',
+            'entries': entries,
+            'id': str(show['id']),
+            'title': show.get('title'),
+            'description': clean_html(show.get('content')),
+        }
diff --git a/hypervideo_dl/extractor/radiozet.py b/hypervideo_dl/extractor/radiozet.py
new file mode 100644
index 0000000..2e1ff36
--- /dev/null
+++ b/hypervideo_dl/extractor/radiozet.py
@@ -0,0 +1,51 @@
+# coding: utf-8
+from .common import InfoExtractor
+from ..utils import (
+    traverse_obj,
+    strip_or_none,
+)
+
+
+class RadioZetPodcastIE(InfoExtractor):
+    _VALID_URL = r'https?://player\.radiozet\.pl/Podcasty/.*?/(?P<id>.+)'
+    _TEST = {
+        'url': 'https://player.radiozet.pl/Podcasty/Nie-Ma-Za-Co/O-przedmiotach-szkolnych-ktore-przydaja-sie-w-zyciu',
+        'md5': 'e03665c316b4fbc5f6a8f232948bbba3',
+        'info_dict': {
+            'id': '42154',
+            'display_id': 'O-przedmiotach-szkolnych-ktore-przydaja-sie-w-zyciu',
+            'title': 'O przedmiotach szkolnych, które przydają się w życiu',
+            'description': 'md5:fa72bed49da334b09e5b2f79851f185c',
+            'release_timestamp': 1592985480,
+            'ext': 'mp3',
+            'thumbnail': r're:^https?://.*\.png$',
+            'duration': 83,
+            'series': 'Nie Ma Za Co',
+            'creator': 'Katarzyna Pakosińska',
+        }
+    }
+
+    def _call_api(self, podcast_id, display_id):
+        return self._download_json(
f'https://player.radiozet.pl/api/podcasts/getPodcast/(node)/{podcast_id}/(station)/radiozet', + display_id) + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + podcast_id = self._html_search_regex(r'<div.*?\sid="player".*?\sdata-id=[\'"]([^\'"]+)[\'"]', + webpage, 'podcast id') + data = self._call_api(podcast_id, display_id)['data'][0] + + return { + 'id': podcast_id, + 'display_id': display_id, + 'title': strip_or_none(data.get('title')), + 'description': strip_or_none(traverse_obj(data, ('program', 'desc'))), + 'release_timestamp': data.get('published_date'), + 'url': traverse_obj(data, ('player', 'stream')), + 'thumbnail': traverse_obj(data, ('program', 'image', 'original')), + 'duration': traverse_obj(data, ('player', 'duration')), + 'series': strip_or_none(traverse_obj(data, ('program', 'title'))), + 'creator': strip_or_none(traverse_obj(data, ('presenter', 0, 'title'))), + } diff --git a/hypervideo_dl/extractor/radlive.py b/hypervideo_dl/extractor/radlive.py index 2de7ab0..dc98973 100644 --- a/hypervideo_dl/extractor/radlive.py +++ b/hypervideo_dl/extractor/radlive.py @@ -1,6 +1,12 @@ import json -from ..utils import ExtractorError, traverse_obj, try_get, unified_timestamp +from ..utils import ( + ExtractorError, + format_field, + traverse_obj, + try_get, + unified_timestamp +) from .common import InfoExtractor @@ -74,7 +80,7 @@ class RadLiveIE(InfoExtractor): 'release_timestamp': release_date, 'channel': channel.get('name'), 'channel_id': channel_id, - 'channel_url': f'https://rad.live/content/channel/{channel_id}' if channel_id else None, + 'channel_url': format_field(channel_id, template='https://rad.live/content/channel/%s'), } if content_type == 'episode': diff --git a/hypervideo_dl/extractor/rai.py b/hypervideo_dl/extractor/rai.py index 27cd018..6864129 100644 --- a/hypervideo_dl/extractor/rai.py +++ b/hypervideo_dl/extractor/rai.py @@ -11,14 +11,17 @@ from ..compat import ( from ..utils import ( determine_ext, ExtractorError, + filter_dict, find_xpath_attr, fix_xml_ampersands, GeoRestrictedError, HEADRequest, int_or_none, + join_nonempty, parse_duration, remove_start, strip_or_none, + traverse_obj, try_get, unified_strdate, unified_timestamp, @@ -33,7 +36,7 @@ class RaiBaseIE(InfoExtractor): _GEO_COUNTRIES = ['IT'] _GEO_BYPASS = False - def _extract_relinker_info(self, relinker_url, video_id): + def _extract_relinker_info(self, relinker_url, video_id, audio_only=False): if not re.match(r'https?://', relinker_url): return {'formats': [{'url': relinker_url}]} @@ -76,7 +79,15 @@ class RaiBaseIE(InfoExtractor): if (ext == 'm3u8' and platform != 'mon') or (ext == 'f4m' and platform != 'flash'): continue - if ext == 'm3u8' or 'format=m3u8' in media_url or platform == 'mon': + if ext == 'mp3': + formats.append({ + 'url': media_url, + 'vcodec': 'none', + 'acodec': 'mp3', + 'format_id': 'http-mp3', + }) + break + elif ext == 'm3u8' or 'format=m3u8' in media_url or platform == 'mon': formats.extend(self._extract_m3u8_formats( media_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) @@ -97,16 +108,17 @@ class RaiBaseIE(InfoExtractor): if not formats and geoprotection is True: self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True) - formats.extend(self._create_http_urls(relinker_url, formats)) + if not audio_only: + formats.extend(self._create_http_urls(relinker_url, formats)) - return dict((k, v) for k, v in { + return filter_dict({ 'is_live': is_live, 'duration': duration, 
             'formats': formats,
-        }.items() if v is not None)
+        })
 
     def _create_http_urls(self, relinker_url, fmts):
-        _RELINKER_REG = r'https?://(?P<host>[^/]+?)/(?:i/)?(?P<extra>[^/]+?)/(?P<path>.+?)/(?P<id>\d+)(?:_(?P<quality>[\d\,]+))?(?:\.mp4|/playlist\.m3u8).+?'
+        _RELINKER_REG = r'https?://(?P<host>[^/]+?)/(?:i/)?(?P<extra>[^/]+?)/(?P<path>.+?)/(?P<id>\w+)(?:_(?P<quality>[\d\,]+))?(?:\.mp4|/playlist\.m3u8).+?'
         _MP4_TMPL = '%s&overrideUserAgentRule=mp4-%s'
         _QUALITY = {
             # tbr: w, h
@@ -135,6 +147,9 @@ class RaiBaseIE(InfoExtractor):
                 return False if resp.url == url else resp.url
             return None
 
+        # filter out audio-only formats
+        fmts = [f for f in fmts if not f.get('vcodec') == 'none']
+
         def get_format_info(tbr):
             import math
             br = int_or_none(tbr)
@@ -226,7 +241,7 @@ class RaiPlayIE(RaiBaseIE):
             'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391',
             'ext': 'mp4',
             'title': 'Report del 07/04/2014',
-            'alt_title': 'St 2013/14 - Espresso nel caffè - 07/04/2014',
+            'alt_title': 'St 2013/14 - Report - Espresso nel caffè - 07/04/2014',
             'description': 'md5:d730c168a58f4bb35600fc2f881ec04e',
             'thumbnail': r're:^https?://.*\.jpg$',
             'uploader': 'Rai Gulp',
@@ -234,7 +249,7 @@
             'series': 'Report',
             'season': '2013/14',
             'subtitles': {
-                'it': 'count:2',
+                'it': 'count:4',
             },
         },
         'params': {
@@ -242,18 +257,18 @@
         },
     }, {
         # 1080p direct mp4 url
-        'url': 'https://www.raiplay.it/video/2021/03/Leonardo-S1E1-b5703b02-82ee-475a-85b6-c9e4a8adf642.html',
-        'md5': '2e501e8651d72f05ffe8f5d286ad560b',
+        'url': 'https://www.raiplay.it/video/2021/11/Blanca-S1E1-Senza-occhi-b1255a4a-8e72-4a2f-b9f3-fc1308e00736.html',
+        'md5': 'aeda7243115380b2dd5e881fd42d949a',
         'info_dict': {
-            'id': 'b5703b02-82ee-475a-85b6-c9e4a8adf642',
+            'id': 'b1255a4a-8e72-4a2f-b9f3-fc1308e00736',
             'ext': 'mp4',
-            'title': 'Leonardo - S1E1',
-            'alt_title': 'St 1 Ep 1 - Episodio 1',
-            'description': 'md5:f5360cd267d2de146e4e3879a5a47d31',
+            'title': 'Blanca - S1E1 - Senza occhi',
+            'alt_title': 'St 1 Ep 1 - Blanca - Senza occhi',
+            'description': 'md5:75f95d5c030ec8bac263b1212322e28c',
             'thumbnail': r're:^https?://.*\.jpg$',
             'uploader': 'Rai 1',
-            'duration': 3229,
-            'series': 'Leonardo',
+            'duration': 6493,
+            'series': 'Blanca',
             'season': 'Season 1',
         },
     }, {
@@ -306,12 +321,13 @@ class RaiPlayIE(RaiBaseIE):
         program_info = media.get('program_info') or {}
         season = media.get('season')
 
+        alt_title = join_nonempty(media.get('subtitle'), media.get('toptitle'), delim=' - ')
+
         info = {
             'id': remove_start(media.get('id'), 'ContentItem-') or video_id,
             'display_id': video_id,
-            'title': self._live_title(title) if relinker_info.get(
-                'is_live') else title,
-            'alt_title': strip_or_none(media.get('subtitle')),
+            'title': title,
+            'alt_title': strip_or_none(alt_title),
             'description': media.get('description'),
             'uploader': strip_or_none(media.get('channel')),
             'creator': strip_or_none(media.get('editor') or None),
@@ -351,26 +367,44 @@ class RaiPlayLiveIE(RaiPlayIE):
 
 
 class RaiPlayPlaylistIE(InfoExtractor):
-    _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/programmi/(?P<id>[^/?#&]+))'
+    _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/programmi/(?P<id>[^/?#&]+))(?:/(?P<extra_id>[^?#&]+))?'
     _TESTS = [{
-        'url': 'http://www.raiplay.it/programmi/nondirloalmiocapo/',
+        'url': 'https://www.raiplay.it/programmi/nondirloalmiocapo/',
         'info_dict': {
             'id': 'nondirloalmiocapo',
             'title': 'Non dirlo al mio capo',
             'description': 'md5:98ab6b98f7f44c2843fd7d6f045f153b',
         },
         'playlist_mincount': 12,
+    }, {
+        'url': 'https://www.raiplay.it/programmi/nondirloalmiocapo/episodi/stagione-2/',
+        'info_dict': {
+            'id': 'nondirloalmiocapo',
+            'title': 'Non dirlo al mio capo - Stagione 2',
+            'description': 'md5:98ab6b98f7f44c2843fd7d6f045f153b',
+        },
+        'playlist_mincount': 12,
     }]
 
     def _real_extract(self, url):
-        base, playlist_id = self._match_valid_url(url).groups()
+        base, playlist_id, extra_id = self._match_valid_url(url).groups()
 
         program = self._download_json(
             base + '.json', playlist_id, 'Downloading program JSON')
 
+        if extra_id:
+            extra_id = extra_id.upper().rstrip('/')
+
+        playlist_title = program.get('name')
         entries = []
         for b in (program.get('blocks') or []):
             for s in (b.get('sets') or []):
+                if extra_id:
+                    if extra_id != join_nonempty(
+                            b.get('name'), s.get('name'), delim='/').replace(' ', '-').upper():
+                        continue
+                    playlist_title = join_nonempty(playlist_title, s.get('name'), delim=' - ')
+
                 s_id = s.get('id')
                 if not s_id:
                     continue
@@ -389,10 +423,128 @@ class RaiPlayPlaylistIE(InfoExtractor):
                     video_id=RaiPlayIE._match_id(video_url)))
 
         return self.playlist_result(
-            entries, playlist_id, program.get('name'),
+            entries, playlist_id, playlist_title,
             try_get(program, lambda x: x['program_info']['description']))
 
 
+class RaiPlaySoundIE(RaiBaseIE):
+    _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplaysound\.it/.+?-(?P<id>%s))\.(?:html|json)' % RaiBaseIE._UUID_RE
+    _TESTS = [{
+        'url': 'https://www.raiplaysound.it/audio/2021/12/IL-RUGGITO-DEL-CONIGLIO-1ebae2a7-7cdb-42bb-842e-fe0d193e9707.html',
+        'md5': '8970abf8caf8aef4696e7b1f2adfc696',
+        'info_dict': {
+            'id': '1ebae2a7-7cdb-42bb-842e-fe0d193e9707',
+            'ext': 'mp3',
+            'title': 'Il Ruggito del Coniglio del 10/12/2021',
+            'description': 'md5:2a17d2107e59a4a8faa0e18334139ee2',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'uploader': 'rai radio 2',
+            'duration': 5685,
+            'series': 'Il Ruggito del Coniglio',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    def _real_extract(self, url):
+        base, audio_id = self._match_valid_url(url).group('base', 'id')
+        media = self._download_json(f'{base}.json', audio_id, 'Downloading audio JSON')
+        uid = try_get(media, lambda x: remove_start(remove_start(x['uniquename'], 'ContentItem-'), 'Page-'))
+
+        info = {}
+        formats = []
+        relinkers = set(traverse_obj(media, (('downloadable_audio', 'audio', ('live', 'cards', 0, 'audio')), 'url')))
+        for r in relinkers:
+            info = self._extract_relinker_info(r, audio_id, True)
+            formats.extend(info.get('formats'))
+
+        date_published = try_get(media, (lambda x: f'{x["create_date"]} {x.get("create_time") or ""}',
+                                         lambda x: x['live']['create_date']))
+
+        podcast_info = traverse_obj(media, 'podcast_info', ('live', 'cards', 0)) or {}
+        thumbnails = [{
+            'url': urljoin(url, thumb_url),
+        } for thumb_url in (podcast_info.get('images') or {}).values() if thumb_url]
+
+        return {
+            **info,
+            'id': uid or audio_id,
+            'display_id': audio_id,
+            'title': traverse_obj(media, 'title', 'episode_title'),
+            'alt_title': traverse_obj(media, ('track_info', 'media_name')),
+            'description': media.get('description'),
+            'uploader': traverse_obj(media, ('track_info', 'channel'), expected_type=strip_or_none),
+            'creator': traverse_obj(media, ('track_info', 'editor'), expected_type=strip_or_none),
+            'timestamp': unified_timestamp(date_published),
+            'thumbnails': thumbnails,
+            'series': podcast_info.get('title'),
+            'season_number': int_or_none(media.get('season')),
+            'episode': media.get('episode_title'),
+            'episode_number': int_or_none(media.get('episode')),
+            'formats': formats,
+        }
+
+
+class RaiPlaySoundLiveIE(RaiPlaySoundIE):
+    _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplaysound\.it/(?P<id>[^/?#&]+)$)'
+    _TESTS = [{
+        'url': 'https://www.raiplaysound.it/radio2',
+        'info_dict': {
+            'id': 'b00a50e6-f404-4af6-8f8c-ff3b9af73a44',
+            'display_id': 'radio2',
+            'ext': 'mp4',
+            'title': 'Rai Radio 2',
+            'uploader': 'rai radio 2',
+            'creator': 'raiplaysound',
+            'is_live': True,
+        },
+        'params': {
+            'skip_download': 'live',
+        },
+    }]
+
+
+class RaiPlaySoundPlaylistIE(InfoExtractor):
+    _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplaysound\.it/(?:programmi|playlist|audiolibri)/(?P<id>[^/?#&]+))(?:/(?P<extra_id>[^?#&]+))?'
+    _TESTS = [{
+        'url': 'https://www.raiplaysound.it/programmi/ilruggitodelconiglio',
+        'info_dict': {
+            'id': 'ilruggitodelconiglio',
+            'title': 'Il Ruggito del Coniglio',
+            'description': 'md5:1bbaf631245a7ab1ec4d9fbb3c7aa8f3',
+        },
+        'playlist_mincount': 65,
+    }, {
+        'url': 'https://www.raiplaysound.it/programmi/ilruggitodelconiglio/puntate/prima-stagione-1995',
+        'info_dict': {
+            'id': 'ilruggitodelconiglio_puntate_prima-stagione-1995',
+            'title': 'Prima Stagione 1995',
+        },
+        'playlist_count': 1,
+    }]
+
+    def _real_extract(self, url):
+        base, playlist_id, extra_id = self._match_valid_url(url).group('base', 'id', 'extra_id')
+        url = f'{base}.json'
+        program = self._download_json(url, playlist_id, 'Downloading program JSON')
+
+        if extra_id:
+            extra_id = extra_id.rstrip('/')
+            playlist_id += '_' + extra_id.replace('/', '_')
+            path = next(c['path_id'] for c in program.get('filters') or [] if extra_id in c.get('weblink'))
+            program = self._download_json(
+                urljoin('https://www.raiplaysound.it', path), playlist_id, 'Downloading program secondary JSON')
+
+        entries = [
+            self.url_result(urljoin(base, c['path_id']), ie=RaiPlaySoundIE.ie_key())
+            for c in traverse_obj(program, 'cards', ('block', 'cards')) or []
+            if c.get('path_id')]
+
+        return self.playlist_result(entries, playlist_id, program.get('title'),
+                                    traverse_obj(program, ('podcast_info', 'description')))
+
+
 class RaiIE(RaiBaseIE):
     _VALID_URL = r'https?://[^/]+\.(?:rai\.(?:it|tv)|rainews\.it)/.+?-(?P<id>%s)(?:-.+?)?\.html' % RaiBaseIE._UUID_RE
     _TESTS = [{
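A note on the `_RELINKER_REG` change above: switching the id group from `\d+` to `\w+` lets the pattern also match alphanumeric relinker content IDs. A minimal sketch of what the groups capture, run against an invented relinker URL (the host, path and token are placeholders, not real Rai endpoints):

    import re

    _RELINKER_REG = r'https?://(?P<host>[^/]+?)/(?:i/)?(?P<extra>[^/]+?)/(?P<path>.+?)/(?P<id>\w+)(?:_(?P<quality>[\d\,]+))?(?:\.mp4|/playlist\.m3u8).+?'

    # Hypothetical relinker URL, for illustration only.
    url = 'https://example-host.akamaized.net/i/podcastcdn/raiplay/abc123DEF_1800,2400/playlist.m3u8?token=x'
    m = re.match(_RELINKER_REG, url)
    if m:
        print(m.group('id'), m.group('quality'))  # abc123DEF 1800,2400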
diff --git a/hypervideo_dl/extractor/rcti.py b/hypervideo_dl/extractor/rcti.py
index 31d9779..ac42e58 100644
--- a/hypervideo_dl/extractor/rcti.py
+++ b/hypervideo_dl/extractor/rcti.py
@@ -1,7 +1,6 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import itertools
 import json
 import random
 import time
@@ -12,6 +11,7 @@ from ..utils import (
     dict_get,
     ExtractorError,
     strip_or_none,
+    traverse_obj,
     try_get
 )
 
@@ -26,7 +26,7 @@ class RCTIPlusBaseIE(InfoExtractor):
         json = self._download_json(
             url, video_id, note=note, headers={'Authorization': self._AUTH_KEY})
         if json.get('status', {}).get('code', 0) != 0:
-            raise ExtractorError('%s said: %s' % (self.IE_NAME, json["status"]["message_client"]), cause=json)
+            raise ExtractorError(f'{self.IE_NAME} said: {json["status"]["message_client"]}', cause=json)
         return json.get('data'), json.get('meta')
 
@@ -85,9 +85,6 @@ class RCTIPlusIE(RCTIPlusBaseIE):
             'series': 'iNews Malam',
             'channel': 'INews',
         },
-        'params': {
-            'format': 'bestvideo',
-        },
     }, {  # Missed event/replay
         'url': 'https://www.rctiplus.com/missed-event/2507/mou-signing-ceremony-27-juli-2021-1400-wib',
         'md5': '649c5f27250faed1452ca8b91e06922d',
@@ -132,7 +129,6 @@ class RCTIPlusIE(RCTIPlusBaseIE):
         },
         'params': {
             'skip_download': True,
-            'format': 'bestvideo',
         },
     }]
     _CONVIVA_JSON_TEMPLATE = {
@@ -227,18 +223,30 @@ class RCTIPlusIE(RCTIPlusBaseIE):
 
 
 class RCTIPlusSeriesIE(RCTIPlusBaseIE):
-    _VALID_URL = r'https://www\.rctiplus\.com/programs/(?P<id>\d+)/(?P<display_id>[^/?#&]+)'
+    _VALID_URL = r'https://www\.rctiplus\.com/programs/(?P<id>\d+)/(?P<display_id>[^/?#&]+)(?:/(?P<type>episodes|extras|clips))?'
     _TESTS = [{
-        'url': 'https://www.rctiplus.com/programs/540/upin-ipin',
-        'playlist_mincount': 417,
+        'url': 'https://www.rctiplus.com/programs/829/putri-untuk-pangeran',
+        'playlist_mincount': 1019,
         'info_dict': {
-            'id': '540',
-            'title': 'Upin & Ipin',
-            'description': 'md5:22cc912381f389664416844e1ec4f86b',
+            'id': '829',
+            'title': 'Putri Untuk Pangeran',
+            'description': 'md5:aca7b54d05bd95a67d4f4613cc1d622d',
+            'age_limit': 2,
+            'cast': ['Verrel Bramasta', 'Ranty Maria', 'Riza Syah', 'Ivan Fadilla', 'Nicole Parham', 'Dll', 'Aviv Elham'],
+            'display_id': 'putri-untuk-pangeran',
+            'tag': 'count:18',
         },
-    }, {
-        'url': 'https://www.rctiplus.com/programs/540/upin-ipin/episodes?utm_source=Rplusdweb&utm_medium=share_copy&utm_campaign=programsupin-ipin',
-        'only_matching': True,
+    }, {  # No episodes
+        'url': 'https://www.rctiplus.com/programs/615/inews-pagi',
+        'playlist_mincount': 388,
+        'info_dict': {
+            'id': '615',
+            'title': 'iNews Pagi',
+            'description': 'md5:f18ee3d4643cfb41c358e5a9b693ee04',
+            'age_limit': 2,
+            'tag': 'count:11',
+            'display_id': 'inews-pagi',
+        }
     }]
     _AGE_RATINGS = {  # Based off https://id.wikipedia.org/wiki/Sistem_rating_konten_televisi with additional ratings
         'S-SU': 2,
@@ -273,47 +281,63 @@ class RCTIPlusSeriesIE(RCTIPlusBaseIE):
             display_id, '%s page %s' % (note, page_num))[0] or []
 
         for video_json in episode_list:
-            link = video_json['share_link']
-            url_res = self.url_result(link, 'RCTIPlus', video_json.get('product_id'), video_json.get('title'))
-            url_res.update(metadata)
-            yield url_res
+            yield {
+                '_type': 'url',
+                'url': video_json['share_link'],
+                'ie_key': RCTIPlusIE.ie_key(),
+                'id': video_json.get('product_id'),
+                'title': video_json.get('title'),
+                'display_id': video_json.get('title_code').replace('_', '-'),
+                'description': video_json.get('summary'),
+                'timestamp': video_json.get('release_date'),
+                'duration': video_json.get('duration'),
+                'season_number': video_json.get('season'),
+                'episode_number': video_json.get('episode'),
+                **metadata
+            }
+
+    def _series_entries(self, series_id, display_id=None, video_type=None, metadata={}):
+        if not video_type or video_type in 'episodes':
+            try:
+                seasons_list = self._call_api(
+                    f'https://api.rctiplus.com/api/v1/program/{series_id}/season',
+                    display_id, 'Downloading seasons list JSON')[0]
+            except ExtractorError as e:
+                if 'not found' not in str(e):
+                    raise
+                seasons_list = []
+            for season in seasons_list:
+                yield from self._entries(
+                    f'https://api.rctiplus.com/api/v2/program/{series_id}/episode?season={season["season"]}',
+                    display_id, f'Downloading season {season["season"]} episode entries', metadata)
+        if not video_type or video_type in 'extras':
+            yield from self._entries(
+                f'https://api.rctiplus.com/api/v2/program/{series_id}/extra?content_id=0',
+                display_id, 'Downloading extra entries', metadata)
+        if not video_type or video_type in 'clips':
+            yield from self._entries(
+                f'https://api.rctiplus.com/api/v2/program/{series_id}/clip?content_id=0',
+                display_id, 'Downloading clip entries', metadata)
 
     def _real_extract(self, url):
-        series_id, display_id = self._match_valid_url(url).groups()
+        series_id, display_id, video_type = self._match_valid_url(url).group('id', 'display_id', 'type')
+        if video_type:
+            self.report_warning(
+                f'Only {video_type} will be downloaded. '
+                f'To download everything from the series, remove "/{video_type}" from the URL')
 
         series_meta, meta_paths = self._call_api(
-            'https://api.rctiplus.com/api/v1/program/%s/detail' % series_id, display_id, 'Downloading series metadata')
+            f'https://api.rctiplus.com/api/v1/program/{series_id}/detail', display_id, 'Downloading series metadata')
         metadata = {
-            'age_limit': try_get(series_meta, lambda x: self._AGE_RATINGS[x['age_restriction'][0]['code']])
+            'age_limit': try_get(series_meta, lambda x: self._AGE_RATINGS[x['age_restriction'][0]['code']]),
+            'cast': traverse_obj(series_meta, (('starring', 'creator', 'writer'), ..., 'name'),
+                                 expected_type=lambda x: strip_or_none(x) or None),
+            'tag': traverse_obj(series_meta, ('tag', ..., 'name'),
+                                expected_type=lambda x: strip_or_none(x) or None),
         }
-
-        cast = []
-        for star in series_meta.get('starring', []):
-            cast.append(strip_or_none(star.get('name')))
-        for star in series_meta.get('creator', []):
-            cast.append(strip_or_none(star.get('name')))
-        for star in series_meta.get('writer', []):
-            cast.append(strip_or_none(star.get('name')))
-        metadata['cast'] = cast
-
-        tags = []
-        for tag in series_meta.get('tag', []):
-            tags.append(strip_or_none(tag.get('name')))
-        metadata['tag'] = tags
-
-        entries = []
-        seasons_list = self._call_api(
-            'https://api.rctiplus.com/api/v1/program/%s/season' % series_id, display_id, 'Downloading seasons list JSON')[0]
-        for season in seasons_list:
-            entries.append(self._entries('https://api.rctiplus.com/api/v2/program/%s/episode?season=%s' % (series_id, season['season']),
-                           display_id, 'Downloading season %s episode entries' % season['season'], metadata))
-
-        entries.append(self._entries('https://api.rctiplus.com/api/v2/program/%s/clip?content_id=0' % series_id,
-                       display_id, 'Downloading clip entries', metadata))
-        entries.append(self._entries('https://api.rctiplus.com/api/v2/program/%s/extra?content_id=0' % series_id,
-                       display_id, 'Downloading extra entries', metadata))
-
-        return self.playlist_result(itertools.chain(*entries), series_id, series_meta.get('title'), series_meta.get('summary'), **metadata)
+        return self.playlist_result(
+            self._series_entries(series_id, display_id, video_type, metadata), series_id,
+            series_meta.get('title'), series_meta.get('summary'), display_id=display_id, **metadata)
 
 
 class RCTIPlusTVIE(RCTIPlusBaseIE):
@@ -329,7 +353,6 @@ class RCTIPlusTVIE(RCTIPlusBaseIE):
         },
         'params': {
             'skip_download': True,
-            'format': 'bestvideo',
         }
     }, {
         # Returned video will always change
@@ -350,5 +373,6 @@ class RCTIPlusTVIE(RCTIPlusBaseIE):
         tv_id = match.get('tvname') or match.get('eventname')
         webpage = self._download_webpage(url, tv_id)
         video_type, video_id = self._search_regex(
-            r'url\s*:\s*["\']https://api\.rctiplus\.com/api/v./(?P<type>[^/]+)/(?P<id>\d+)/url', webpage, 'video link', group=('type', 'id'))
+            r'url\s*:\s*["\']https://api\.rctiplus\.com/api/v./(?P<type>[^/]+)/(?P<id>\d+)/url',
+            webpage, 'video link', group=('type', 'id'))
         return self.url_result(f'https://www.rctiplus.com/{video_type}/{video_id}/{tv_id}', 'RCTIPlus')
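The `traverse_obj` calls above collapse the old hand-written loops over `starring`, `creator` and `writer`. A plain-Python sketch of the equivalent collection, over an invented metadata shape (the keys mirror the API fields used above; the values are made up):

    series_meta = {
        'starring': [{'name': 'Verrel Bramasta '}, {'name': ''}],
        'creator': [{'name': 'Dll'}],
        'writer': [],
    }
    # Collect stripped, non-empty names across the three lists, in order.
    cast = [
        (person.get('name') or '').strip()
        for key in ('starring', 'creator', 'writer')
        for person in series_meta.get(key) or []
        if (person.get('name') or '').strip()
    ]
    print(cast)  # ['Verrel Bramasta', 'Dll']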
diff --git a/hypervideo_dl/extractor/redbulltv.py b/hypervideo_dl/extractor/redbulltv.py
index e7fdcce..756a366 100644
--- a/hypervideo_dl/extractor/redbulltv.py
+++ b/hypervideo_dl/extractor/redbulltv.py
@@ -81,12 +81,11 @@ class RedBullTVIE(InfoExtractor):
 
         title = video['title'].strip()
 
-        formats = self._extract_m3u8_formats(
+        formats, subtitles = self._extract_m3u8_formats_and_subtitles(
             'https://dms.redbull.tv/v3/%s/%s/playlist.m3u8' % (video_id, token),
             video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls')
         self._sort_formats(formats)
 
-        subtitles = {}
         for resource in video.get('resources', []):
             if resource.startswith('closed_caption_'):
                 splitted_resource = resource.split('_')
diff --git a/hypervideo_dl/extractor/reddit.py b/hypervideo_dl/extractor/reddit.py
index c75d95a..a042a59 100644
--- a/hypervideo_dl/extractor/reddit.py
+++ b/hypervideo_dl/extractor/reddit.py
@@ -8,46 +8,11 @@ from ..utils import (
     try_get,
     unescapeHTML,
     url_or_none,
+    traverse_obj
 )
 
 
 class RedditIE(InfoExtractor):
-    _VALID_URL = r'https?://v\.redd\.it/(?P<id>[^/?#&]+)'
-    _TEST = {
-        # from https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/
-        'url': 'https://v.redd.it/zv89llsvexdz',
-        'md5': '0a070c53eba7ec4534d95a5a1259e253',
-        'info_dict': {
-            'id': 'zv89llsvexdz',
-            'ext': 'mp4',
-            'title': 'zv89llsvexdz',
-        },
-        'params': {
-            'format': 'bestvideo',
-        },
-    }
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        formats = self._extract_m3u8_formats(
-            'https://v.redd.it/%s/HLSPlaylist.m3u8' % video_id, video_id,
-            'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)
-
-        formats.extend(self._extract_mpd_formats(
-            'https://v.redd.it/%s/DASHPlaylist.mpd' % video_id, video_id,
-            mpd_id='dash', fatal=False))
-
-        self._sort_formats(formats)
-
-        return {
-            'id': video_id,
-            'title': video_id,
-            'formats': formats,
-        }
-
-
-class RedditRIE(InfoExtractor):
     _VALID_URL = r'https?://(?P<subdomain>[^/]+\.)?reddit(?:media)?\.com/r/(?P<slug>[^/]+/comments/(?P<id>[^/?#&]+))'
     _TESTS = [{
         'url': 'https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/',
@@ -67,7 +32,6 @@ class RedditRIE(InfoExtractor):
             'age_limit': 0,
         },
         'params': {
-            'format': 'bestvideo',
             'skip_download': True,
         },
     }, {
@@ -151,19 +115,53 @@ class RedditRIE(InfoExtractor):
         for resolution in resolutions:
             add_thumbnail(resolution)
 
-        return {
-            '_type': 'url_transparent',
-            'url': video_url,
+        info = {
             'title': data.get('title'),
             'thumbnails': thumbnails,
             'timestamp': float_or_none(data.get('created_utc')),
             'uploader': data.get('author'),
-            'duration': int_or_none(try_get(
-                data,
-                (lambda x: x['media']['reddit_video']['duration'],
-                 lambda x: x['secure_media']['reddit_video']['duration']))),
             'like_count': int_or_none(data.get('ups')),
             'dislike_count': int_or_none(data.get('downs')),
             'comment_count': int_or_none(data.get('num_comments')),
             'age_limit': age_limit,
         }
+
+        # Check if media is hosted on reddit:
+        reddit_video = traverse_obj(data, (('media', 'secure_media'), 'reddit_video'), get_all=False)
+        if reddit_video:
+            playlist_urls = [
+                try_get(reddit_video, lambda x: unescapeHTML(x[y]))
+                for y in ('dash_url', 'hls_url')
+            ]
+
+            # Update video_id
+            display_id = video_id
+            video_id = self._search_regex(
+                r'https?://v\.redd\.it/(?P<id>[^/?#&]+)', reddit_video['fallback_url'],
+                'video_id', default=display_id)
+
+            dash_playlist_url = playlist_urls[0] or f'https://v.redd.it/{video_id}/DASHPlaylist.mpd'
+            hls_playlist_url = playlist_urls[1] or f'https://v.redd.it/{video_id}/HLSPlaylist.m3u8'
+
+            formats = self._extract_m3u8_formats(
+                hls_playlist_url, display_id, 'mp4',
+                entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)
+            formats.extend(self._extract_mpd_formats(
+                dash_playlist_url, display_id, mpd_id='dash', fatal=False))
+            self._sort_formats(formats)
+
+            return {
+                **info,
+                'id': video_id,
+                'display_id': display_id,
+                'formats': formats,
+                'duration': int_or_none(reddit_video.get('duration')),
+            }
+
+        # Not hosted on reddit, must continue extraction
+        return {
+            **info,
+            'display_id': video_id,
+            '_type': 'url_transparent',
+            'url': video_url,
+        }
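When the Reddit API omits `dash_url`/`hls_url`, the hunk above rebuilds the playlist URLs from the `v.redd.it` ID embedded in `fallback_url`. A sketch of that fallback with illustrative values:

    import re

    fallback_url = 'https://v.redd.it/zv89llsvexdz/DASH_720.mp4?source=fallback'  # illustrative
    display_id = '6rrwyj'  # the reddit post id, used when no v.redd.it id is found

    m = re.search(r'https?://v\.redd\.it/([^/?#&]+)', fallback_url)
    video_id = m.group(1) if m else display_id
    dash_playlist_url = f'https://v.redd.it/{video_id}/DASHPlaylist.mpd'
    hls_playlist_url = f'https://v.redd.it/{video_id}/HLSPlaylist.m3u8'
    print(dash_playlist_url, hls_playlist_url)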
diff --git a/hypervideo_dl/extractor/redgifs.py b/hypervideo_dl/extractor/redgifs.py
new file mode 100644
index 0000000..55196b7
--- /dev/null
+++ b/hypervideo_dl/extractor/redgifs.py
@@ -0,0 +1,232 @@
+# coding: utf-8
+import functools
+
+from .common import InfoExtractor
+from ..compat import compat_parse_qs
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    qualities,
+    try_get,
+    OnDemandPagedList,
+)
+
+
+class RedGifsBaseInfoExtractor(InfoExtractor):
+    _FORMATS = {
+        'gif': 250,
+        'sd': 480,
+        'hd': None,
+    }
+
+    def _parse_gif_data(self, gif_data):
+        video_id = gif_data.get('id')
+        quality = qualities(tuple(self._FORMATS.keys()))
+
+        orig_height = int_or_none(gif_data.get('height'))
+        aspect_ratio = try_get(gif_data, lambda x: orig_height / x['width'])
+
+        formats = []
+        for format_id, height in self._FORMATS.items():
+            video_url = gif_data['urls'].get(format_id)
+            if not video_url:
+                continue
+            height = min(orig_height, height or orig_height)
+            formats.append({
+                'url': video_url,
+                'format_id': format_id,
+                'width': height * aspect_ratio if aspect_ratio else None,
+                'height': height,
+                'quality': quality(format_id),
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'webpage_url': f'https://redgifs.com/watch/{video_id}',
+            'ie_key': RedGifsIE.ie_key(),
+            'extractor': 'RedGifs',
+            'title': ' '.join(gif_data.get('tags') or []) or 'RedGifs',
+            'timestamp': int_or_none(gif_data.get('createDate')),
+            'uploader': gif_data.get('userName'),
+            'duration': int_or_none(gif_data.get('duration')),
+            'view_count': int_or_none(gif_data.get('views')),
+            'like_count': int_or_none(gif_data.get('likes')),
+            'categories': gif_data.get('tags') or [],
+            'tags': gif_data.get('tags'),
+            'age_limit': 18,
+            'formats': formats,
+        }
+
+    def _call_api(self, ep, video_id, *args, **kwargs):
+        data = self._download_json(
+            f'https://api.redgifs.com/v2/{ep}', video_id, *args, **kwargs)
+        if 'error' in data:
+            raise ExtractorError(f'RedGifs said: {data["error"]}', expected=True, video_id=video_id)
+        return data
+
+    def _fetch_page(self, ep, video_id, query, page):
+        query['page'] = page + 1
+        data = self._call_api(
+            ep, video_id, query=query, note=f'Downloading JSON metadata page {page + 1}')
+
+        for entry in data['gifs']:
+            yield self._parse_gif_data(entry)
+
+    def _prepare_api_query(self, query, fields):
+        api_query = [
+            (field_name, query.get(field_name, (default,))[0])
+            for field_name, default in fields.items()]
+
+        return {key: val for key, val in api_query if val is not None}
+
+    def _paged_entries(self, ep, item_id, query, fields):
+        page = int_or_none(query.get('page', (None,))[0])
+        page_fetcher = functools.partial(
+            self._fetch_page, ep, item_id, self._prepare_api_query(query, fields))
+        return page_fetcher(page) if page else OnDemandPagedList(page_fetcher, self._PAGE_SIZE)
+
+
+class RedGifsIE(RedGifsBaseInfoExtractor):
+    _VALID_URL = r'https?://(?:(?:www\.)?redgifs\.com/watch/|thumbs2\.redgifs\.com/)(?P<id>[^-/?#\.]+)'
+    _TESTS = [{
+        'url': 'https://www.redgifs.com/watch/squeakyhelplesswisent',
+        'info_dict': {
+            'id': 'squeakyhelplesswisent',
+            'ext': 'mp4',
+            'title': 'Hotwife Legs Thick',
+            'timestamp': 1636287915,
+            'upload_date': '20211107',
+            'uploader': 'ignored52',
+            'duration': 16,
+            'view_count': int,
+            'like_count': int,
+            'categories': list,
+            'age_limit': 18,
+        }
+    }, {
+        'url': 'https://thumbs2.redgifs.com/SqueakyHelplessWisent-mobile.mp4#t=0',
+        'info_dict': {
+            'id': 'squeakyhelplesswisent',
+            'ext': 'mp4',
+            'title': 'Hotwife Legs Thick',
+            'timestamp': 1636287915,
+            'upload_date': '20211107',
+            'uploader': 'ignored52',
+            'duration': 16,
+            'view_count': int,
+            'like_count': int,
+            'categories': list,
+            'age_limit': 18,
+        }
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url).lower()
+        video_info = self._call_api(
+            f'gifs/{video_id}', video_id, note='Downloading video info')
+        return self._parse_gif_data(video_info['gif'])
+
+
+class RedGifsSearchIE(RedGifsBaseInfoExtractor):
+    IE_DESC = 'Redgifs search'
+    _VALID_URL = r'https?://(?:www\.)?redgifs\.com/browse\?(?P<query>[^#]+)'
+    _PAGE_SIZE = 80
+    _TESTS = [
+        {
+            'url': 'https://www.redgifs.com/browse?tags=Lesbian',
+            'info_dict': {
+                'id': 'tags=Lesbian',
+                'title': 'Lesbian',
+                'description': 'RedGifs search for Lesbian, ordered by trending'
+            },
+            'playlist_mincount': 100,
+        },
+        {
+            'url': 'https://www.redgifs.com/browse?type=g&order=latest&tags=Lesbian',
+            'info_dict': {
+                'id': 'type=g&order=latest&tags=Lesbian',
+                'title': 'Lesbian',
+                'description': 'RedGifs search for Lesbian, ordered by latest'
+            },
+            'playlist_mincount': 100,
+        },
+        {
+            'url': 'https://www.redgifs.com/browse?type=g&order=latest&tags=Lesbian&page=2',
+            'info_dict': {
+                'id': 'type=g&order=latest&tags=Lesbian&page=2',
+                'title': 'Lesbian',
+                'description': 'RedGifs search for Lesbian, ordered by latest'
+            },
+            'playlist_count': 80,
+        }
+    ]
+
+    def _real_extract(self, url):
+        query_str = self._match_valid_url(url).group('query')
+        query = compat_parse_qs(query_str)
+        if not query.get('tags'):
+            raise ExtractorError('Invalid query tags', expected=True)
+
+        tags = query.get('tags')[0]
+        order = query.get('order', ('trending',))[0]
+
+        query['search_text'] = [tags]
+        entries = self._paged_entries('gifs/search', query_str, query, {
+            'search_text': None,
+            'order': 'trending',
+            'type': None,
+        })
+
+        return self.playlist_result(
+            entries, query_str, tags, f'RedGifs search for {tags}, ordered by {order}')
+
+
+class RedGifsUserIE(RedGifsBaseInfoExtractor):
+    IE_DESC = 'Redgifs user'
+    _VALID_URL = r'https?://(?:www\.)?redgifs\.com/users/(?P<username>[^/?#]+)(?:\?(?P<query>[^#]+))?'
+    _PAGE_SIZE = 30
+    _TESTS = [
+        {
+            'url': 'https://www.redgifs.com/users/lamsinka89',
+            'info_dict': {
+                'id': 'lamsinka89',
+                'title': 'lamsinka89',
+                'description': 'RedGifs user lamsinka89, ordered by recent'
+            },
+            'playlist_mincount': 100,
+        },
+        {
+            'url': 'https://www.redgifs.com/users/lamsinka89?page=3',
+            'info_dict': {
+                'id': 'lamsinka89?page=3',
+                'title': 'lamsinka89',
+                'description': 'RedGifs user lamsinka89, ordered by recent'
+            },
+            'playlist_count': 30,
+        },
+        {
+            'url': 'https://www.redgifs.com/users/lamsinka89?order=best&type=g',
+            'info_dict': {
+                'id': 'lamsinka89?order=best&type=g',
+                'title': 'lamsinka89',
+                'description': 'RedGifs user lamsinka89, ordered by best'
+            },
+            'playlist_mincount': 100,
+        }
+    ]
+
+    def _real_extract(self, url):
+        username, query_str = self._match_valid_url(url).group('username', 'query')
+        playlist_id = f'{username}?{query_str}' if query_str else username
+
+        query = compat_parse_qs(query_str)
+        order = query.get('order', ('recent',))[0]
+
+        entries = self._paged_entries(f'users/{username}/search', playlist_id, query, {
+            'order': 'recent',
+            'type': None,
+        })
+
+        return self.playlist_result(
+            entries, playlist_id, username, f'RedGifs user {username}, ordered by {order}')
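`_prepare_api_query` above merges per-field defaults with whatever survives from the browse query string. A standalone sketch of the same merge using the stdlib `parse_qs` (the equivalent of `compat_parse_qs`):

    from urllib.parse import parse_qs

    query = parse_qs('type=g&order=latest&tags=Lesbian')
    fields = {'search_text': None, 'order': 'trending', 'type': None}

    # parse_qs values are lists; take the first value or fall back to the default,
    # then drop fields that ended up as None.
    api_query = [(name, query.get(name, (default,))[0]) for name, default in fields.items()]
    print({k: v for k, v in api_query if v is not None})
    # {'order': 'latest', 'type': 'g'} -- search_text is filled in separately from the tags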
diff --git a/hypervideo_dl/extractor/redtube.py b/hypervideo_dl/extractor/redtube.py
index 747ce51..7fee54f 100644
--- a/hypervideo_dl/extractor/redtube.py
+++ b/hypervideo_dl/extractor/redtube.py
@@ -17,17 +17,20 @@ from ..utils import (
 class RedTubeIE(InfoExtractor):
     _VALID_URL = r'https?://(?:(?:\w+\.)?redtube\.com/|embed\.redtube\.com/\?.*?\bid=)(?P<id>[0-9]+)'
     _TESTS = [{
-        'url': 'http://www.redtube.com/66418',
-        'md5': 'fc08071233725f26b8f014dba9590005',
+        'url': 'https://www.redtube.com/38864951',
+        'md5': '4fba70cbca3aefd25767ab4b523c9878',
         'info_dict': {
-            'id': '66418',
+            'id': '38864951',
             'ext': 'mp4',
-            'title': 'Sucked on a toilet',
-            'upload_date': '20110811',
-            'duration': 596,
+            'title': 'Public Sex on the Balcony in Freezing Paris! Amateur Couple LeoLulu',
+            'description': 'Watch video Public Sex on the Balcony in Freezing Paris! Amateur Couple LeoLulu on Redtube, home of free Blowjob porn videos and Blonde sex movies online. Video length: (10:46) - Uploaded by leolulu - Verified User - Starring Pornstar: Leolulu',
+            'upload_date': '20210111',
+            'timestamp': 1610343109,
+            'duration': 646,
             'view_count': int,
             'age_limit': 18,
-        }
+            'thumbnail': r're:https://\wi-ph\.rdtcdn\.com/videos/.+/.+\.jpg',
+        },
     }, {
         'url': 'http://embed.redtube.com/?bgcolor=000000&id=1443286',
         'only_matching': True,
@@ -84,15 +87,25 @@ class RedTubeIE(InfoExtractor):
                 r'mediaDefinition["\']?\s*:\s*(\[.+?}\s*\])', webpage,
                 'media definitions', default='{}'),
             video_id, fatal=False)
-        if medias and isinstance(medias, list):
-            for media in medias:
+        for media in medias if isinstance(medias, list) else []:
+            format_url = url_or_none(media.get('videoUrl'))
+            if not format_url:
+                continue
+            format_id = media.get('format')
+            quality = media.get('quality')
+            if format_id == 'hls' or (format_id == 'mp4' and not quality):
+                more_media = self._download_json(format_url, video_id, fatal=False)
+            else:
+                more_media = [media]
+            for media in more_media if isinstance(more_media, list) else []:
                 format_url = url_or_none(media.get('videoUrl'))
                 if not format_url:
                     continue
-                if media.get('format') == 'hls' or determine_ext(format_url) == 'm3u8':
+                format_id = media.get('format')
+                if format_id == 'hls' or determine_ext(format_url) == 'm3u8':
                     formats.extend(self._extract_m3u8_formats(
                         format_url, video_id, 'mp4',
-                        entry_protocol='m3u8_native', m3u8_id='hls',
+                        entry_protocol='m3u8_native', m3u8_id=format_id or 'hls',
                         fatal=False))
                     continue
                 format_id = media.get('quality')
diff --git a/hypervideo_dl/extractor/rmcdecouverte.py b/hypervideo_dl/extractor/rmcdecouverte.py
index 422d47a..8bfce34 100644
--- a/hypervideo_dl/extractor/rmcdecouverte.py
+++ b/hypervideo_dl/extractor/rmcdecouverte.py
@@ -26,7 +26,6 @@ class RMCDecouverteIE(InfoExtractor):
             'upload_date': '20210428',
         },
         'params': {
-            'format': 'bestvideo',
             'skip_download': True,
         },
     }, {
diff --git a/hypervideo_dl/extractor/rokfin.py b/hypervideo_dl/extractor/rokfin.py
new file mode 100644
index 0000000..0fd65db
--- /dev/null
+++ b/hypervideo_dl/extractor/rokfin.py
@@ -0,0 +1,256 @@
+# coding: utf-8
+import itertools
+from datetime import datetime
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    float_or_none,
+    format_field,
+    int_or_none,
+    str_or_none,
+    traverse_obj,
+    unified_timestamp,
+    url_or_none,
+)
+
+
+_API_BASE_URL = 'https://prod-api-v2.production.rokfin.com/api/v2/public/'
+
+
+class RokfinIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?rokfin\.com/(?P<id>(?P<type>post|stream)/\d+)'
+    _TESTS = [{
+        'url': 'https://www.rokfin.com/post/57548/Mitt-Romneys-Crazy-Solution-To-Climate-Change',
+        'info_dict': {
+            'id': 'post/57548',
+            'ext': 'mp4',
+            'title': 'Mitt Romney\'s Crazy Solution To Climate Change',
+            'thumbnail': r're:https://img\.production\.rokfin\.com/.+',
+            'upload_date': '20211023',
+            'timestamp': 1634998029,
+            'channel': 'Jimmy Dore',
+            'channel_id': 65429,
+            'channel_url': 'https://rokfin.com/TheJimmyDoreShow',
+            'duration': 213.0,
+            'availability': 'public',
+            'live_status': 'not_live',
+            'dislike_count': int,
+            'like_count': int,
+        }
+    }, {
+        'url': 'https://rokfin.com/post/223/Julian-Assange-Arrested-Streaming-In-Real-Time',
+        'info_dict': {
+            'id': 'post/223',
+            'ext': 'mp4',
+            'title': 'Julian Assange Arrested: Streaming In Real Time',
+            'thumbnail': r're:https://img\.production\.rokfin\.com/.+',
+            'upload_date': '20190412',
+            'timestamp': 1555052644,
+            'channel': 'Ron Placone',
+            'channel_id': 10,
+            'channel_url': 'https://rokfin.com/RonPlacone',
+            'availability': 'public',
+            'live_status': 'not_live',
+            'dislike_count': int,
+            'like_count': int,
+            'tags': ['FreeThinkingMedia^', 'RealProgressives^'],
+        }
+    }, {
+        'url': 'https://www.rokfin.com/stream/10543/Its-A-Crazy-Mess-Regional-Director-Blows-Whistle-On-Pfizers-Vaccine-Trial-Data',
+        'info_dict': {
+            'id': 'stream/10543',
+            'ext': 'mp4',
+            'title': '"It\'s A Crazy Mess" Regional Director Blows Whistle On Pfizer\'s Vaccine Trial Data',
+            'thumbnail': r're:https://img\.production\.rokfin\.com/.+',
+            'description': 'md5:324ce2d3e3b62e659506409e458b9d8e',
+            'channel': 'Ryan Cristián',
+            'channel_id': 53856,
+            'channel_url': 'https://rokfin.com/TLAVagabond',
+            'availability': 'public',
+            'is_live': False,
+            'was_live': True,
+            'live_status': 'was_live',
+            'timestamp': 1635874720,
+            'release_timestamp': 1635874720,
+            'release_date': '20211102',
+            'upload_date': '20211102',
+            'dislike_count': int,
+            'like_count': int,
+            'tags': ['FreeThinkingMedia^'],
+        }
+    }]
+
+    def _real_extract(self, url):
+        video_id, video_type = self._match_valid_url(url).group('id', 'type')
+
+        metadata = self._download_json(f'{_API_BASE_URL}{video_id}', video_id)
+
+        scheduled = unified_timestamp(metadata.get('scheduledAt'))
+        live_status = ('was_live' if metadata.get('stoppedAt')
+                       else 'is_upcoming' if scheduled
+                       else 'is_live' if video_type == 'stream'
+                       else 'not_live')
+
+        video_url = traverse_obj(metadata, 'url', ('content', 'contentUrl'), expected_type=url_or_none)
+        formats, subtitles = [{'url': video_url}] if video_url else [], {}
+        if determine_ext(video_url) == 'm3u8':
+            formats, subtitles = self._extract_m3u8_formats_and_subtitles(
+                video_url, video_id, fatal=False, live=live_status == 'is_live')
+
+        if not formats:
+            if traverse_obj(metadata, 'premiumPlan', 'premium'):
+                self.raise_login_required('This video is only available to premium users', True, method='cookies')
+            elif scheduled:
+                self.raise_no_formats(
+                    f'Stream is offline; scheduled for {datetime.fromtimestamp(scheduled).strftime("%Y-%m-%d %H:%M:%S")}',
+                    video_id=video_id, expected=True)
+        self._sort_formats(formats)
+
+        uploader = traverse_obj(metadata, ('createdBy', 'username'), ('creator', 'username'))
+        timestamp = (scheduled or float_or_none(metadata.get('postedAtMilli'), 1000)
+                     or unified_timestamp(metadata.get('creationDateTime')))
+        return {
+            'id': video_id,
+            'formats': formats,
+            'subtitles': subtitles,
+            'title': str_or_none(traverse_obj(metadata, 'title', ('content', 'contentTitle'))),
+            'duration': float_or_none(traverse_obj(metadata, ('content', 'duration'))),
+            'thumbnail': url_or_none(traverse_obj(metadata, 'thumbnail', ('content', 'thumbnailUrl1'))),
+            'description': str_or_none(traverse_obj(metadata, 'description', ('content', 'contentDescription'))),
+            'like_count': int_or_none(metadata.get('likeCount')),
+            'dislike_count': int_or_none(metadata.get('dislikeCount')),
+            'channel': str_or_none(traverse_obj(metadata, ('createdBy', 'name'), ('creator', 'name'))),
+            'channel_id': traverse_obj(metadata, ('createdBy', 'id'), ('creator', 'id')),
+            'channel_url': url_or_none(f'https://rokfin.com/{uploader}') if uploader else None,
+            'timestamp': timestamp,
+            'release_timestamp': timestamp if live_status != 'not_live' else None,
+            'tags': traverse_obj(metadata, ('tags', ..., 'title'), expected_type=str_or_none),
+            'live_status': live_status,
+            'availability': self._availability(
+                needs_premium=bool(traverse_obj(metadata, 'premiumPlan', 'premium')),
+                is_private=False, needs_subscription=False, needs_auth=False, is_unlisted=False),
+            # 'comment_count': metadata.get('numComments'),  # Data provided by website is wrong
+            '__post_extractor': self.extract_comments(video_id) if video_type == 'post' else None,
+        }
+
+    def _get_comments(self, video_id):
+        pages_total = None
+        for page_n in itertools.count():
+            raw_comments = self._download_json(
+                f'{_API_BASE_URL}comment?postId={video_id[5:]}&page={page_n}&size=50',
+                video_id, note=f'Downloading viewer comments page {page_n + 1}{format_field(pages_total, template=" of %s")}',
+                fatal=False) or {}
+
+            for comment in raw_comments.get('content') or []:
+                yield {
+                    'text': str_or_none(comment.get('comment')),
+                    'author': str_or_none(comment.get('name')),
+                    'id': comment.get('commentId'),
+                    'author_id': comment.get('userId'),
+                    'parent': 'root',
+                    'like_count': int_or_none(comment.get('numLikes')),
+                    'dislike_count': int_or_none(comment.get('numDislikes')),
+                    'timestamp': unified_timestamp(comment.get('postedAt'))
+                }
+
+            pages_total = int_or_none(raw_comments.get('totalPages')) or None
+            is_last = raw_comments.get('last')
+            if not raw_comments.get('content') or is_last or (page_n > pages_total if pages_total else is_last is not False):
+                return
+
+
+class RokfinPlaylistBaseIE(InfoExtractor):
+    _TYPES = {
+        'video': 'post',
+        'audio': 'post',
+        'stream': 'stream',
+        'dead_stream': 'stream',
+        'stack': 'stack',
+    }
+
+    def _get_video_data(self, metadata):
+        for content in metadata.get('content') or []:
+            media_type = self._TYPES.get(content.get('mediaType'))
+            video_id = content.get('id') if media_type == 'post' else content.get('mediaId')
+            if not media_type or not video_id:
+                continue
+
+            yield self.url_result(f'https://rokfin.com/{media_type}/{video_id}', video_id=f'{media_type}/{video_id}',
+                                  video_title=str_or_none(traverse_obj(content, ('content', 'contentTitle'))))
+
+
+class RokfinStackIE(RokfinPlaylistBaseIE):
+    IE_NAME = 'rokfin:stack'
+    _VALID_URL = r'https?://(?:www\.)?rokfin\.com/stack/(?P<id>[^/]+)'
+    _TESTS = [{
+        'url': 'https://www.rokfin.com/stack/271/Tulsi-Gabbard-Portsmouth-Townhall-FULL--Feb-9-2020',
+        'playlist_count': 8,
+        'info_dict': {
+            'id': '271',
+        },
+    }]
+
+    def _real_extract(self, url):
+        list_id = self._match_id(url)
+        return self.playlist_result(self._get_video_data(
+            self._download_json(f'{_API_BASE_URL}stack/{list_id}', list_id)), list_id)
+
+
+class RokfinChannelIE(RokfinPlaylistBaseIE):
+    IE_NAME = 'rokfin:channel'
+    _VALID_URL = r'https?://(?:www\.)?rokfin\.com/(?!((feed/?)|(discover/?)|(channels/?))$)(?P<id>[^/]+)/?$'
+    _TESTS = [{
+        'url': 'https://rokfin.com/TheConvoCouch',
+        'playlist_mincount': 100,
+        'info_dict': {
+            'id': '12071-new',
+            'title': 'TheConvoCouch - New',
+            'description': 'md5:bb622b1bca100209b91cd685f7847f06',
+        },
+    }]
+
+    _TABS = {
+        'new': 'posts',
+        'top': 'top',
+        'videos': 'video',
+        'podcasts': 'audio',
+        'streams': 'stream',
+        'stacks': 'stack',
+    }
+
+    def _real_initialize(self):
+        self._validate_extractor_args()
+
+    def _validate_extractor_args(self):
+        requested_tabs = self._configuration_arg('tab', None)
+        if requested_tabs is not None and (len(requested_tabs) > 1 or requested_tabs[0] not in self._TABS):
+            raise ExtractorError(f'Invalid extractor-arg "tab". Must be one of {", ".join(self._TABS)}', expected=True)
+
+    def _entries(self, channel_id, channel_name, tab):
+        pages_total = None
+        for page_n in itertools.count(0):
+            if tab in ('posts', 'top'):
+                data_url = f'{_API_BASE_URL}user/{channel_name}/{tab}?page={page_n}&size=50'
+            else:
+                data_url = f'{_API_BASE_URL}post/search/{tab}?page={page_n}&size=50&creator={channel_id}'
+            metadata = self._download_json(
+                data_url, channel_name,
+                note=f'Downloading video metadata page {page_n + 1}{format_field(pages_total, template=" of %s")}')
+
+            yield from self._get_video_data(metadata)
+            pages_total = int_or_none(metadata.get('totalPages')) or None
+            is_last = metadata.get('last')
+            if is_last or (page_n > pages_total if pages_total else is_last is not False):
+                return
+
+    def _real_extract(self, url):
+        channel_name = self._match_id(url)
+        channel_info = self._download_json(f'{_API_BASE_URL}user/{channel_name}', channel_name)
+        channel_id = channel_info['id']
+        tab = self._configuration_arg('tab', default=['new'])[0]
+
+        return self.playlist_result(
+            self._entries(channel_id, channel_name, self._TABS[tab]),
+            f'{channel_id}-{tab}', f'{channel_name} - {tab.title()}', str_or_none(channel_info.get('description')))
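The conditional chain above is the heart of Rokfin's live handling. The same cascade as a standalone function, with the metadata keys following the Rokfin fields used in the hunk:

    def live_status(metadata, video_type, scheduled):
        # A finished stream wins, then an upcoming one, then a running stream.
        return ('was_live' if metadata.get('stoppedAt')
                else 'is_upcoming' if scheduled
                else 'is_live' if video_type == 'stream'
                else 'not_live')

    print(live_status({'stoppedAt': '2021-11-02T18:18:40'}, 'stream', None))  # was_live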
diff --git a/hypervideo_dl/extractor/roosterteeth.py b/hypervideo_dl/extractor/roosterteeth.py
index 2c815bd..a55dd4f 100644
--- a/hypervideo_dl/extractor/roosterteeth.py
+++ b/hypervideo_dl/extractor/roosterteeth.py
@@ -1,25 +1,88 @@
 # coding: utf-8
-from __future__ import unicode_literals
-
 from .common import InfoExtractor
-from ..compat import (
-    compat_HTTPError,
-    compat_str,
-)
+from ..compat import compat_HTTPError
 from ..utils import (
     ExtractorError,
     int_or_none,
+    join_nonempty,
+    LazyList,
+    parse_qs,
     str_or_none,
+    traverse_obj,
+    url_or_none,
     urlencode_postdata,
+    urljoin,
+    update_url_query,
 )
 
 
-class RoosterTeethIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:.+?\.)?roosterteeth\.com/(?:episode|watch)/(?P<id>[^/?#&]+)'
+class RoosterTeethBaseIE(InfoExtractor):
     _NETRC_MACHINE = 'roosterteeth'
+    _API_BASE = 'https://svod-be.roosterteeth.com'
+    _API_BASE_URL = f'{_API_BASE}/api/v1'
+
+    def _perform_login(self, username, password):
+        if self._get_cookies(self._API_BASE_URL).get('rt_access_token'):
+            return
+
+        try:
+            self._download_json(
+                'https://auth.roosterteeth.com/oauth/token',
+                None, 'Logging in', data=urlencode_postdata({
+                    'client_id': '4338d2b4bdc8db1239360f28e72f0d9ddb1fd01e7a38fbb07b4b1f4ba4564cc5',
+                    'grant_type': 'password',
+                    'username': username,
+                    'password': password,
+                }))
+        except ExtractorError as e:
+            msg = 'Unable to login'
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+                resp = self._parse_json(e.cause.read().decode(), None, fatal=False)
+                if resp:
+                    error = resp.get('extra_info') or resp.get('error_description') or resp.get('error')
+                    if error:
+                        msg += ': ' + error
+            self.report_warning(msg)
+
+    def _extract_video_info(self, data):
+        thumbnails = []
+        for image in traverse_obj(data, ('included', 'images')):
+            if image.get('type') not in ('episode_image', 'bonus_feature_image'):
+                continue
+            thumbnails.extend([{
+                'id': name,
+                'url': url,
+            } for name, url in (image.get('attributes') or {}).items() if url_or_none(url)])
+
+        attributes = data.get('attributes') or {}
+        title = traverse_obj(attributes, 'title', 'display_title')
+        sub_only = attributes.get('is_sponsors_only')
+
+        return {
+            'id': str(data.get('id')),
+            'display_id': attributes.get('slug'),
+            'title': title,
+            'description': traverse_obj(attributes, 'description', 'caption'),
+            'series': attributes.get('show_title'),
+            'season_number': int_or_none(attributes.get('season_number')),
+            'season_id': attributes.get('season_id'),
+            'episode': title,
+            'episode_number': int_or_none(attributes.get('number')),
+            'episode_id': str_or_none(data.get('uuid')),
+            'channel_id': attributes.get('channel_id'),
+            'duration': int_or_none(attributes.get('length')),
+            'thumbnails': thumbnails,
+            'availability': self._availability(
+                needs_premium=sub_only, needs_subscription=sub_only, needs_auth=sub_only,
+                is_private=False, is_unlisted=False),
+            'tags': attributes.get('genres')
+        }
+
+
+class RoosterTeethIE(RoosterTeethBaseIE):
+    _VALID_URL = r'https?://(?:.+?\.)?roosterteeth\.com/(?:episode|watch)/(?P<id>[^/?#&]+)'
     _TESTS = [{
         'url': 'http://roosterteeth.com/episode/million-dollars-but-season-2-million-dollars-but-the-game-announcement',
-        'md5': 'e2bd7764732d785ef797700a2489f212',
         'info_dict': {
             'id': '9156',
             'display_id': 'million-dollars-but-season-2-million-dollars-but-the-game-announcement',
@@ -30,19 +93,20 @@ class RoosterTeethIE(InfoExtractor):
             'series': 'Million Dollars, But...',
             'episode': 'Million Dollars, But... The Game Announcement',
         },
+        'params': {'skip_download': True},
     }, {
         'url': 'https://roosterteeth.com/watch/rwby-bonus-25',
-        'md5': 'fe8d9d976b272c18a24fe7f1f5830084',
         'info_dict': {
-            'id': '31',
+            'id': '40432',
             'display_id': 'rwby-bonus-25',
-            'title': 'Volume 2, World of Remnant 3',
-            'description': 'md5:8d58d3270292ea11da00ea712bbfb009',
-            'episode': 'Volume 2, World of Remnant 3',
-            'channel_id': 'fab60c1c-29cb-43bc-9383-5c3538d9e246',
+            'title': 'Grimm',
+            'description': 'md5:f30ff570741213418a8d2c19868b93ab',
+            'episode': 'Grimm',
+            'channel_id': '92f780eb-ebfe-4bf5-a3b5-c6ad5460a5f1',
             'thumbnail': r're:^https?://.*\.(png|jpe?g)$',
             'ext': 'mp4',
         },
+        'params': {'skip_download': True},
     }, {
         'url': 'http://achievementhunter.roosterteeth.com/episode/off-topic-the-achievement-hunter-podcast-2016-i-didn-t-think-it-would-pass-31',
         'only_matching': True,
@@ -63,40 +127,10 @@ class RoosterTeethIE(InfoExtractor):
         'url': 'https://roosterteeth.com/watch/million-dollars-but-season-2-million-dollars-but-the-game-announcement',
         'only_matching': True,
     }]
-    _EPISODE_BASE_URL = 'https://svod-be.roosterteeth.com/api/v1/watch/'
-
-    def _login(self):
-        username, password = self._get_login_info()
-        if username is None:
-            return
-
-        try:
-            self._download_json(
-                'https://auth.roosterteeth.com/oauth/token',
-                None, 'Logging in', data=urlencode_postdata({
-                    'client_id': '4338d2b4bdc8db1239360f28e72f0d9ddb1fd01e7a38fbb07b4b1f4ba4564cc5',
-                    'grant_type': 'password',
-                    'username': username,
-                    'password': password,
-                }))
-        except ExtractorError as e:
-            msg = 'Unable to login'
-            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
-                resp = self._parse_json(e.cause.read().decode(), None, fatal=False)
-                if resp:
-                    error = resp.get('extra_info') or resp.get('error_description') or resp.get('error')
-                    if error:
-                        msg += ': ' + error
-            self.report_warning(msg)
-
-    def _real_initialize(self):
-        if self._get_cookies(self._EPISODE_BASE_URL).get('rt_access_token'):
-            return
-        self._login()
 
     def _real_extract(self, url):
         display_id = self._match_id(url)
-        api_episode_url = self._EPISODE_BASE_URL + display_id
+        api_episode_url = f'{self._API_BASE_URL}/watch/{display_id}'
 
         try:
             video_data = self._download_json(
@@ -118,36 +152,62 @@ class RoosterTeethIE(InfoExtractor):
         episode = self._download_json(
             api_episode_url, display_id,
             'Downloading episode JSON metadata')['data'][0]
-        attributes = episode['attributes']
-        title = attributes.get('title') or attributes['display_title']
-        video_id = compat_str(episode['id'])
-
-        thumbnails = []
-        for image in episode.get('included', {}).get('images', []):
-            if image.get('type') in ('episode_image', 'bonus_feature_image'):
-                img_attributes = image.get('attributes') or {}
-                for k in ('thumb', 'small', 'medium', 'large'):
-                    img_url = img_attributes.get(k)
-                    if img_url:
-                        thumbnails.append({
-                            'id': k,
-                            'url': img_url,
-                        })
 
         return {
-            'id': video_id,
             'display_id': display_id,
-            'title': title,
-            'description': attributes.get('description') or attributes.get('caption'),
-            'thumbnails': thumbnails,
-            'series': attributes.get('show_title'),
-            'season_number': int_or_none(attributes.get('season_number')),
-            'season_id': attributes.get('season_id'),
-            'episode': title,
-            'episode_number': int_or_none(attributes.get('number')),
-            'episode_id': str_or_none(episode.get('uuid')),
             'formats': formats,
-            'channel_id': attributes.get('channel_id'),
-            'duration': int_or_none(attributes.get('length')),
-            'subtitles': subtitles
+            'subtitles': subtitles,
+            **self._extract_video_info(episode)
         }
+
+
+class RoosterTeethSeriesIE(RoosterTeethBaseIE):
+    _VALID_URL = r'https?://(?:.+?\.)?roosterteeth\.com/series/(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'https://roosterteeth.com/series/rwby?season=7',
+        'playlist_count': 13,
+        'info_dict': {
+            'id': 'rwby-7',
+            'title': 'RWBY - Season 7',
+        }
+    }, {
+        'url': 'https://roosterteeth.com/series/role-initiative',
+        'playlist_mincount': 16,
+        'info_dict': {
+            'id': 'role-initiative',
+            'title': 'Role Initiative',
+        }
+    }, {
+        'url': 'https://roosterteeth.com/series/let-s-play-minecraft?season=9',
+        'playlist_mincount': 50,
+        'info_dict': {
+            'id': 'let-s-play-minecraft-9',
+            'title': 'Let\'s Play Minecraft - Season 9',
+        }
+    }]
+
+    def _entries(self, series_id, season_number):
+        display_id = join_nonempty(series_id, season_number)
+        # TODO: extract bonus material
+        for data in self._download_json(
+                f'{self._API_BASE_URL}/shows/{series_id}/seasons?order=asc&order_by', display_id)['data']:
+            idx = traverse_obj(data, ('attributes', 'number'))
+            if season_number and idx != season_number:
+                continue
+            season_url = update_url_query(urljoin(self._API_BASE, data['links']['episodes']), {'per_page': 1000})
+            season = self._download_json(season_url, display_id, f'Downloading season {idx} JSON metadata')['data']
+            for episode in season:
+                yield self.url_result(
+                    f'https://www.roosterteeth.com{episode["canonical_links"]["self"]}',
+                    RoosterTeethIE.ie_key(),
+                    **self._extract_video_info(episode))
+
+    def _real_extract(self, url):
+        series_id = self._match_id(url)
+        season_number = traverse_obj(parse_qs(url), ('season', 0), expected_type=int_or_none)
+
+        entries = LazyList(self._entries(series_id, season_number))
+        return self.playlist_result(
+            entries,
+            join_nonempty(series_id, season_number),
+            join_nonempty(entries[0].get('series'), season_number, delim=' - Season '))
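The series extractor above reads an optional `?season=N` from the page URL to filter seasons. A stdlib sketch of that query handling, using `urllib.parse` in place of the `parse_qs`/`traverse_obj` helpers:

    from urllib.parse import urlparse, parse_qs

    url = 'https://roosterteeth.com/series/rwby?season=7'
    qs = parse_qs(urlparse(url).query)
    # Take the first 'season' value if present; None means "all seasons".
    season_number = int(qs['season'][0]) if qs.get('season') else None
    print(season_number)  # 7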
diff --git a/hypervideo_dl/extractor/rtbf.py b/hypervideo_dl/extractor/rtbf.py
index f9979d0..4b61fdb 100644
--- a/hypervideo_dl/extractor/rtbf.py
+++ b/hypervideo_dl/extractor/rtbf.py
@@ -85,8 +85,6 @@ class RTBFIE(InfoExtractor):
 
         title = data['title']
         is_live = data.get('isLive')
-        if is_live:
-            title = self._live_title(title)
 
         height_re = r'-(\d+)p\.'
         formats = []
diff --git a/hypervideo_dl/extractor/rtl2.py b/hypervideo_dl/extractor/rtl2.py
index 4e3aa03..e291714 100644
--- a/hypervideo_dl/extractor/rtl2.py
+++ b/hypervideo_dl/extractor/rtl2.py
@@ -4,16 +4,13 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..aes import aes_cbc_decrypt
+from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7
 from ..compat import (
     compat_b64decode,
-    compat_ord,
     compat_str,
 )
 from ..utils import (
-    bytes_to_intlist,
     ExtractorError,
-    intlist_to_bytes,
     int_or_none,
     strip_or_none,
 )
@@ -142,17 +139,12 @@ class RTL2YouIE(RTL2YouBaseIE):
             self._BACKWERK_BASE_URL + 'stream/video/' + video_id, video_id)
 
         data, iv = compat_b64decode(stream_data['streamUrl']).decode().split(':')
-        stream_url = intlist_to_bytes(aes_cbc_decrypt(
-            bytes_to_intlist(compat_b64decode(data)),
-            bytes_to_intlist(self._AES_KEY),
-            bytes_to_intlist(compat_b64decode(iv))
-        ))
+        stream_url = unpad_pkcs7(aes_cbc_decrypt_bytes(
+            compat_b64decode(data), self._AES_KEY, compat_b64decode(iv)))
         if b'rtl2_you_video_not_found' in stream_url:
             raise ExtractorError('video not found', expected=True)
 
-        formats = self._extract_m3u8_formats(
-            stream_url[:-compat_ord(stream_url[-1])].decode(),
-            video_id, 'mp4', 'm3u8_native')
+        formats = self._extract_m3u8_formats(stream_url.decode(), video_id, 'mp4', 'm3u8_native')
         self._sort_formats(formats)
 
         video_data = self._download_json(
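The rtl2 hunk above swaps the old int-list AES helpers for `aes_cbc_decrypt_bytes` plus `unpad_pkcs7`; the manual `stream_url[:-compat_ord(stream_url[-1])]` slice it removes was doing exactly the PKCS#7 unpad. A self-contained sketch of that unpadding step (the AES decryption itself is elided; the payload is invented):

    # PKCS#7 unpadding: the last byte gives the pad length.
    def unpad_pkcs7(data: bytes) -> bytes:
        return data[:-data[-1]]

    # 26-byte plaintext padded with six 0x06 bytes up to the 32-byte block boundary.
    padded = b'https://example.com/x.m3u8' + bytes([6]) * 6
    print(unpad_pkcs7(padded).decode())  # https://example.com/x.m3u8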
diff --git a/hypervideo_dl/extractor/rtnews.py b/hypervideo_dl/extractor/rtnews.py
new file mode 100644
index 0000000..68b6044
--- /dev/null
+++ b/hypervideo_dl/extractor/rtnews.py
@@ -0,0 +1,199 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import js_to_json
+
+
+class RTNewsIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?rt\.com/[^/]+/(?:[^/]+/)?(?P<id>\d+)'
+
+    _TESTS = [{
+        'url': 'https://www.rt.com/sport/546301-djokovic-arrives-belgrade-crowds/',
+        'playlist_mincount': 2,
+        'info_dict': {
+            'id': '546301',
+            'title': 'Crowds gather to greet deported Djokovic as he returns to Serbia (VIDEO)',
+            'description': 'md5:1d5bfe1a988d81fd74227cfdf93d314d',
+            'thumbnail': 'https://cdni.rt.com/files/2022.01/article/61e587a085f540102c3386c1.png'
+        },
+    }, {
+        'url': 'https://www.rt.com/shows/in-question/535980-plot-to-assassinate-julian-assange/',
+        'playlist_mincount': 1,
+        'info_dict': {
+            'id': '535980',
+            'title': 'The plot to assassinate Julian Assange',
+            'description': 'md5:55279ce5e4441dc1d16e2e4a730152cd',
+            'thumbnail': 'https://cdni.rt.com/files/2021.09/article/615226f42030274e8879b53d.png'
+        },
+        'playlist': [{
+            'info_dict': {
+                'id': '6152271d85f5400464496162',
+                'ext': 'mp4',
+                'title': '6152271d85f5400464496162',
+            },
+        }]
+    }]
+
+    def _entries(self, webpage):
+        video_urls = set(re.findall(r'https://cdnv\.rt\.com/.*[a-f0-9]+\.mp4', webpage))
+        for v_url in video_urls:
+            v_id = re.search(r'([a-f0-9]+)\.mp4', v_url).group(1)
+            if v_id:
+                yield {
+                    'id': v_id,
+                    'title': v_id,
+                    'url': v_url,
+                }
+
+    def _real_extract(self, url):
+        id = self._match_id(url)
+        webpage = self._download_webpage(url, id)
+
+        return {
+            '_type': 'playlist',
+            'id': id,
+            'entries': self._entries(webpage),
+            'title': self._og_search_title(webpage),
+            'description': self._og_search_description(webpage),
+            'thumbnail': self._og_search_thumbnail(webpage),
+        }
+
+
+class RTDocumentryIE(InfoExtractor):
+    _VALID_URL = r'https?://rtd\.rt\.com/(?:(?:series|shows)/[^/]+|films)/(?P<id>[^/?$&#]+)'
+
+    _TESTS = [{
+        'url': 'https://rtd.rt.com/films/escobars-hitman/',
+        'info_dict': {
+            'id': 'escobars-hitman',
+            'ext': 'mp4',
+            'title': "Escobar's Hitman. Former drug-gang killer, now loved and loathed in Colombia",
+            'description': 'md5:647c76984b7cb9a8b52a567e87448d88',
+            'thumbnail': 'https://cdni.rt.com/rtd-files/films/escobars-hitman/escobars-hitman_11.jpg',
+            'average_rating': 8.53,
+            'duration': 3134.0
+        },
+        'params': {'skip_download': True}
+    }, {
+        'url': 'https://rtd.rt.com/shows/the-kalashnikova-show-military-secrets-anna-knishenko/iskander-tactical-system-natos-headache/',
+        'info_dict': {
+            'id': 'iskander-tactical-system-natos-headache',
+            'ext': 'mp4',
+            'title': "Iskander tactical system. NATO's headache | The Kalashnikova Show. Episode 10",
+            'description': 'md5:da7c24a0aa67bc2bb88c86658508ca87',
+            'thumbnail': 'md5:89de8ce38c710b7c501ff02d47e2aa89',
+            'average_rating': 9.27,
+            'duration': 274.0,
+            'timestamp': 1605726000,
+            'view_count': int,
+            'upload_date': '20201118'
+        },
+        'params': {'skip_download': True}
+    }, {
+        'url': 'https://rtd.rt.com/series/i-am-hacked-trailer/introduction-to-safe-digital-life-ep2/',
+        'info_dict': {
+            'id': 'introduction-to-safe-digital-life-ep2',
+            'ext': 'mp4',
+            'title': 'How to Keep your Money away from Hackers | I am Hacked. Episode 2',
+            'description': 'md5:c46fa9a5af86c0008c45a3940a8cce87',
+            'thumbnail': 'md5:a5e81b9bf5aed8f5e23d9c053601b825',
+            'average_rating': 10.0,
+            'duration': 1524.0,
+            'timestamp': 1636977600,
+            'view_count': int,
+            'upload_date': '20211115'
+        },
+        'params': {'skip_download': True}
+    }]
+
+    def _real_extract(self, url):
+        id = self._match_id(url)
+        webpage = self._download_webpage(url, id)
+        ld_json = self._search_json_ld(webpage, None, fatal=False)
+        if not ld_json:
+            self.raise_no_formats('No video/audio found at the provided url.', expected=True)
+        media_json = self._parse_json(
+            self._search_regex(r'(?s)\'Med\'\s*:\s*\[\s*({.+})\s*\]\s*};', webpage, 'media info'),
+            id, transform_source=js_to_json)
+        if 'title' not in ld_json and 'title' in media_json:
+            ld_json['title'] = media_json['title']
+        formats = [{'url': src['file']} for src in media_json.get('sources') or [] if src.get('file')]
+
+        return {
+            'id': id,
+            'thumbnail': media_json.get('image'),
+            'formats': formats,
+            **ld_json
+        }
+
+
+class RTDocumentryPlaylistIE(InfoExtractor):
+    _VALID_URL = r'https?://rtd\.rt\.com/(?:series|shows)/(?P<id>[^/]+)/$'
+
+    _TESTS = [{
+        'url': 'https://rtd.rt.com/series/i-am-hacked-trailer/',
+        'playlist_mincount': 6,
+        'info_dict': {
+            'id': 'i-am-hacked-trailer',
+        },
+    }, {
+        'url': 'https://rtd.rt.com/shows/the-kalashnikova-show-military-secrets-anna-knishenko/',
+        'playlist_mincount': 34,
+        'info_dict': {
+            'id': 'the-kalashnikova-show-military-secrets-anna-knishenko',
+        },
+    }]
+
+    def _entries(self, webpage, id):
+        video_urls = set(re.findall(r'list-2__link\s*"\s*href="([^"]+)"', webpage))
+        for v_url in video_urls:
+            if id not in v_url:
+                continue
+            yield self.url_result(
+                'https://rtd.rt.com%s' % v_url,
+                ie=RTDocumentryIE.ie_key())
+
+    def _real_extract(self, url):
+        id = self._match_id(url)
+        webpage = self._download_webpage(url, id)
+
+        return {
+            '_type': 'playlist',
+            'id': id,
+            'entries': self._entries(webpage, id),
+        }
+
+
+class RuptlyIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?ruptly\.tv/[a-z]{2}/videos/(?P<id>\d+-\d+)'
+
+    _TESTS = [{
+        'url': 'https://www.ruptly.tv/en/videos/20220112-020-Japan-Double-trouble-Tokyo-zoo-presents-adorable-panda-twins',
+        'info_dict': {
+            'id': '20220112-020',
+            'ext': 'mp4',
+            'title': 'Japan: Double trouble! Tokyo zoo presents adorable panda twins | Video Ruptly',
+            'description': 'md5:85a8da5fdb31486f0562daf4360ce75a',
+            'thumbnail': 'https://storage.ruptly.tv/thumbnails/20220112-020/i6JQKnTNpYuqaXsR/i6JQKnTNpYuqaXsR.jpg'
+        },
+        'params': {'skip_download': True}
+    }]
+
+    def _real_extract(self, url):
+        id = self._match_id(url)
+        webpage = self._download_webpage(url, id)
+        m3u8_url = self._search_regex(r'preview_url"\s?:\s?"(https?://storage\.ruptly\.tv/video_projects/.+\.m3u8)"', webpage, 'm3u8 url', fatal=False)
+        if not m3u8_url:
+            self.raise_no_formats('No video/audio found at the provided url.', expected=True)
+        formats, subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, id, ext='mp4')
+        return {
+            'id': id,
+            'formats': formats,
+            'subtitles': subs,
+            'title': self._og_search_title(webpage),
+            'description': self._og_search_description(webpage),
+            'thumbnail': self._og_search_thumbnail(webpage),
+        }
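`RTNewsIE._entries` above scrapes `cdnv.rt.com` mp4 links straight out of the page. The same two regexes run over a hypothetical page snippet:

    import re

    webpage = '<source src="https://cdnv.rt.com/files/2022.01/5d1c.mp4">'  # invented markup
    for v_url in set(re.findall(r'https://cdnv\.rt\.com/.*[a-f0-9]+\.mp4', webpage)):
        v_id = re.search(r'([a-f0-9]+)\.mp4', v_url).group(1)
        print(v_id, v_url)  # 5d1c https://cdnv.rt.com/files/2022.01/5d1c.mp4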
diff --git a/hypervideo_dl/extractor/rtrfm.py b/hypervideo_dl/extractor/rtrfm.py
new file mode 100644
index 0000000..93d51e8
--- /dev/null
+++ b/hypervideo_dl/extractor/rtrfm.py
@@ -0,0 +1,67 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class RTRFMIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?rtrfm\.com\.au/(?:shows|show-episode)/(?P<id>[^/?\#&]+)'
+    _TESTS = [
+        {
+            'url': 'https://rtrfm.com.au/shows/breakfast/',
+            'md5': '46168394d3a5ce237cf47e85d0745413',
+            'info_dict': {
+                'id': 'breakfast-2021-11-16',
+                'ext': 'mp3',
+                'series': 'Breakfast with Taylah',
+                'title': r're:^Breakfast with Taylah \d{4}-\d{2}-\d{2}$',
+                'description': 'md5:0979c3ab1febfbec3f1ccb743633c611',
+            },
+            'skip': 'ID and md5 changes daily',
+        },
+        {
+            'url': 'https://rtrfm.com.au/show-episode/breakfast-2021-11-11/',
+            'md5': '396bedf1e40f96c62b30d4999202a790',
+            'info_dict': {
+                'id': 'breakfast-2021-11-11',
+                'ext': 'mp3',
+                'series': 'Breakfast with Taylah',
+                'title': 'Breakfast with Taylah 2021-11-11',
+                'description': 'md5:0979c3ab1febfbec3f1ccb743633c611',
+            },
+        },
+        {
+            'url': 'https://rtrfm.com.au/show-episode/breakfast-2020-06-01/',
+            'md5': '594027f513ec36a24b15d65007a24dff',
+            'info_dict': {
+                'id': 'breakfast-2020-06-01',
+                'ext': 'mp3',
+                'series': 'Breakfast with Taylah',
+                'title': 'Breakfast with Taylah 2020-06-01',
+                'description': r're:^Breakfast with Taylah ',
+            },
+            'skip': 'This audio has expired',
+        },
+    ]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        show, date, title = self._search_regex(
+            r'''\.playShow(?:From)?\(['"](?P<show>[^'"]+)['"],\s*['"](?P<date>[0-9]{4}-[0-9]{2}-[0-9]{2})['"],\s*['"](?P<title>[^'"]+)['"]''',
+            webpage, 'details', group=('show', 'date', 'title'))
+        url = self._download_json(
+            'https://restreams.rtrfm.com.au/rzz',
+            show, 'Downloading MP3 URL', query={'n': show, 'd': date})['u']
+        # This is the only indicator of an error until trying to download the URL;
+        # downloads of mp4 URLs always fail (403 for current episodes, 404 for missing).
+        if '.mp4' in url:
+            url = None
+            self.raise_no_formats('Expired or no episode on this date', expected=True)
+        return {
+            'id': '%s-%s' % (show, date),
+            'title': '%s %s' % (title, date),
+            'series': title,
+            'url': url,
+            'release_date': date,
+            'description': self._og_search_description(webpage),
+        }
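The RTRFM extractor above pulls show, date and title out of a `.playShow(...)` call embedded in the page. The same regex run over an invented snippet:

    import re

    webpage = ".playShow('breakfast', '2021-11-11', 'Breakfast with Taylah')"  # illustrative
    show, date, title = re.search(
        r'''\.playShow(?:From)?\(['"](?P<show>[^'"]+)['"],\s*['"](?P<date>[0-9]{4}-[0-9]{2}-[0-9]{2})['"],\s*['"](?P<title>[^'"]+)['"]''',
        webpage).group('show', 'date', 'title')
    print(show, date, title)  # breakfast 2021-11-11 Breakfast with Taylah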
diff --git a/hypervideo_dl/extractor/rtve.py b/hypervideo_dl/extractor/rtve.py
index 59832ee..7a1dc6f 100644
--- a/hypervideo_dl/extractor/rtve.py
+++ b/hypervideo_dl/extractor/rtve.py
@@ -17,7 +17,7 @@ from ..utils import (
     qualities,
     remove_end,
     remove_start,
-    std_headers,
+    try_get,
 )
 
 _bytes_to_chr = (lambda x: x) if sys.version_info[0] == 2 else (lambda x: map(chr, x))
@@ -70,7 +70,7 @@
     }]
 
     def _real_initialize(self):
-        user_agent_b64 = base64.b64encode(std_headers['User-Agent'].encode('utf-8')).decode('utf-8')
+        user_agent_b64 = base64.b64encode(self.get_param('http_headers')['User-Agent'].encode('utf-8')).decode('utf-8')
         self._manager = self._download_json(
             'http://www.rtve.es/odin/loki/' + user_agent_b64,
             None, 'Fetching manager info')['manager']
@@ -160,7 +160,7 @@ class RTVEALaCartaIE(InfoExtractor):
         return {
             'id': video_id,
-            'title': self._live_title(title) if is_live else title,
+            'title': title,
             'formats': formats,
             'thumbnail': info.get('image'),
             'subtitles': subtitles,
@@ -178,6 +178,93 @@ class RTVEALaCartaIE(InfoExtractor):
             for s in subs)
 
 
+class RTVEAudioIE(RTVEALaCartaIE):
+    IE_NAME = 'rtve.es:audio'
+    IE_DESC = 'RTVE audio'
+    _VALID_URL = r'https?://(?:www\.)?rtve\.es/(alacarta|play)/audios/[^/]+/[^/]+/(?P<id>[0-9]+)'
+
+    _TESTS = [{
+        'url': 'https://www.rtve.es/alacarta/audios/a-hombros-de-gigantes/palabra-ingeniero-codigos-informaticos-27-04-21/5889192/',
+        'md5': 'ae06d27bff945c4e87a50f89f6ce48ce',
+        'info_dict': {
+            'id': '5889192',
+            'ext': 'mp3',
+            'title': 'Códigos informáticos',
+            'thumbnail': r're:https?://.+/1598856591583.jpg',
+            'duration': 349.440,
+            'series': 'A hombros de gigantes',
+        },
+    }, {
+        'url': 'https://www.rtve.es/play/audios/en-radio-3/ignatius-farray/5791165/',
+        'md5': '072855ab89a9450e0ba314c717fa5ebc',
+        'info_dict': {
+            'id': '5791165',
+            'ext': 'mp3',
+            'title': 'Ignatius Farray',
+            'thumbnail': r're:https?://.+/1613243011863.jpg',
+            'duration': 3559.559,
+            'series': 'En Radio 3'
+        },
+    }, {
+        'url': 'https://www.rtve.es/play/audios/frankenstein-o-el-moderno-prometeo/capitulo-26-ultimo-muerte-victor-juan-jose-plans-mary-shelley/6082623/',
+        'md5': '0eadab248cc8dd193fa5765712e84d5c',
+        'info_dict': {
+            'id': '6082623',
+            'ext': 'mp3',
+            'title': 'Capítulo 26 y último: La muerte de Victor',
+            'thumbnail': r're:https?://.+/1632147445707.jpg',
+            'duration': 3174.086,
+            'series': 'Frankenstein o el moderno Prometeo'
+        },
+    }]
+
+    def _extract_png_formats(self, audio_id):
+        """
+        Retrieve the media-related PNG thumbnail, which obfuscates
+        valuable information about the media. That information is
+        decrypted via the base class's _decrypt_url function, which
+        yields the media quality and the media URL.
+        """
+        png = self._download_webpage(
+            'http://www.rtve.es/ztnr/movil/thumbnail/%s/audios/%s.png' %
+            (self._manager, audio_id),
+            audio_id, 'Downloading url information', query={'q': 'v2'})
+        q = qualities(['Media', 'Alta', 'HQ', 'HD_READY', 'HD_FULL'])
+        formats = []
+        for quality, audio_url in self._decrypt_url(png):
+            ext = determine_ext(audio_url)
+            if ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    audio_url, audio_id, 'mp4', 'm3u8_native',
+                    m3u8_id='hls', fatal=False))
+            elif ext == 'mpd':
+                formats.extend(self._extract_mpd_formats(
+                    audio_url, audio_id, 'dash', fatal=False))
+            else:
+                formats.append({
+                    'format_id': quality,
+                    'quality': q(quality),
+                    'url': audio_url,
+                })
+        self._sort_formats(formats)
+        return formats
+
+    def _real_extract(self, url):
+        audio_id = self._match_id(url)
+        info = self._download_json(
+            'https://www.rtve.es/api/audios/%s.json' % audio_id,
+            audio_id)['page']['items'][0]
+
+        return {
+            'id': audio_id,
+            'title': info['title'].strip(),
+            'thumbnail': info.get('thumbnail'),
+            'duration': float_or_none(info.get('duration'), 1000),
+            'series': try_get(info, lambda x: x['programInfo']['title']),
+            'formats': self._extract_png_formats(audio_id),
+        }
+
+
 class RTVEInfantilIE(RTVEALaCartaIE):
     IE_NAME = 'rtve.es:infantil'
     IE_DESC = 'RTVE infantil'
@@ -230,7 +317,7 @@ class RTVELiveIE(RTVEALaCartaIE):
 
         return {
             'id': video_id,
-            'title': self._live_title(title),
+            'title': title,
             'formats': self._extract_png_formats(vidplayer_id),
             'is_live': True,
         }
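`qualities([...])` above turns the RTVE quality ladder into a preference function for the format sorter; a plain-Python equivalent of what it provides:

    ladder = ['Media', 'Alta', 'HQ', 'HD_READY', 'HD_FULL']
    # Higher index in the ladder means a more preferred format; unknown names rank lowest.
    q = lambda name: ladder.index(name) if name in ladder else -1
    print(q('HD_FULL') > q('Alta'))  # True: HD_FULL is preferred over Alta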
- }, - 'params': { - 'skip_download': True, + 'description': 'Galavečer pri príležitosti Medzinárodného dňa Rómov.', + 'thumbnail': 'https://www.rtvs.sk/media/a501/image/file/2/0031/L7Qm.amaro_dzives_png.jpg', + 'timestamp': 1428555900, + 'upload_date': '20150409', + 'duration': 4986, + } + }, { + # tv archive + 'url': 'https://www.rtvs.sk/televizia/archiv/18083?utm_source=web&utm_medium=rozcestnik&utm_campaign=Robin', + 'info_dict': { + 'id': '18083', + 'ext': 'mp4', + 'title': 'Robin', + 'description': 'md5:2f70505a7b8364491003d65ff7a0940a', + 'timestamp': 1636652760, + 'display_id': '307655', + 'duration': 831, + 'upload_date': '20211111', + 'thumbnail': 'https://www.rtvs.sk/media/a501/image/file/2/0916/robin.jpg', } }] @@ -37,11 +59,31 @@ class RTVSIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + iframe_id = self._search_regex( + r'<iframe[^>]+id\s*=\s*"player_[^_]+_([0-9]+)"', webpage, 'Iframe ID') + iframe_url = self._search_regex( + fr'<iframe[^>]+id\s*=\s*"player_[^_]+_{re.escape(iframe_id)}"[^>]+src\s*=\s*"([^"]+)"', webpage, 'Iframe URL') + + webpage = self._download_webpage(iframe_url, video_id, 'Downloading iframe') + json_url = self._search_regex(r'var\s+url\s*=\s*"([^"]+)"\s*\+\s*ruurl', webpage, 'json URL') + data = self._download_json(f'https:{json_url}b=mozilla&p=win&v=97&f=0&d=1', video_id) - playlist_url = self._search_regex( - r'playlist["\']?\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, - 'playlist url', group='url') + if data.get('clip'): + data['playlist'] = [data['clip']] - data = self._download_json( - playlist_url, video_id, 'Downloading playlist')[0] - return self._parse_jwplayer_data(data, video_id=video_id) + if traverse_obj(data, ('playlist', 0, 'sources', 0, 'type')) == 'audio/mp3': + formats = [{'url': traverse_obj(data, ('playlist', 0, 'sources', 0, 'src'))}] + else: + formats = self._extract_m3u8_formats(traverse_obj(data, ('playlist', 0, 'sources', 0, 'src')), video_id) + self._sort_formats(formats) + + return { + 'id': video_id, + 'display_id': iframe_id, + 'title': traverse_obj(data, ('playlist', 0, 'title')), + 'description': traverse_obj(data, ('playlist', 0, 'description')), + 'duration': parse_duration(traverse_obj(data, ('playlist', 0, 'length'))), + 'thumbnail': traverse_obj(data, ('playlist', 0, 'image')), + 'timestamp': unified_timestamp(traverse_obj(data, ('playlist', 0, 'datetime_create'))), + 'formats': formats + } diff --git a/hypervideo_dl/extractor/rule34video.py b/hypervideo_dl/extractor/rule34video.py new file mode 100644 index 0000000..a602a9f --- /dev/null +++ b/hypervideo_dl/extractor/rule34video.py @@ -0,0 +1,65 @@ +# coding: utf-8 +from __future__ import unicode_literals +import re + +from ..utils import parse_duration +from .common import InfoExtractor + + +class Rule34VideoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rule34video\.com/videos/(?P<id>\d+)' + _TESTS = [ + { + 'url': 'https://rule34video.com/videos/3065157/shot-it-mmd-hmv/', + 'md5': 'ffccac2c23799dabbd192621ae4d04f3', + 'info_dict': { + 'id': '3065157', + 'ext': 'mp4', + 'title': 'Shot It-(mmd hmv)', + 'thumbnail': 'https://rule34video.com/contents/videos_screenshots/3065000/3065157/preview.jpg', + 'duration': 347.0, + 'age_limit': 18 + } + }, + { + 'url': 'https://rule34video.com/videos/3065296/lara-in-trouble-ep-7-wildeerstudio/', + 'md5': '6bb5169f9f6b38cd70882bf2e64f6b86', + 'info_dict': { + 'id': '3065296', + 'ext': 'mp4', + 'title': 'Lara in Trouble Ep. 
7 [WildeerStudio]', + 'thumbnail': 'https://rule34video.com/contents/videos_screenshots/3065000/3065296/preview.jpg', + 'duration': 938.0, + 'age_limit': 18 + } + }, + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + formats = [] + + for mobj in re.finditer(r'<a[^>]+href="(?P<video_url>[^"]+download=true[^"]+)".*>(?P<ext>[^\s]+) (?P<quality>[^<]+)p</a>', webpage): + url, ext, quality = mobj.groups() + formats.append({ + 'url': url, + 'ext': ext.lower(), + 'quality': quality, + }) + + title = self._html_extract_title(webpage) + thumbnail = self._html_search_regex(r'preview_url:\s+\'([^\']+)\'', webpage, 'thumbnail', default=None) + duration = self._html_search_regex(r'"icon-clock"></i>\s+<span>((?:\d+:?)+)', webpage, 'duration', default=None) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'formats': formats, + 'title': title, + 'thumbnail': thumbnail, + 'duration': parse_duration(duration), + 'age_limit': 18 + } diff --git a/hypervideo_dl/extractor/rumble.py b/hypervideo_dl/extractor/rumble.py index 49c1f44..a0d5f88 100644 --- a/hypervideo_dl/extractor/rumble.py +++ b/hypervideo_dl/extractor/rumble.py @@ -11,6 +11,7 @@ from ..utils import ( int_or_none, parse_iso8601, try_get, + unescapeHTML, ExtractorError, ) @@ -28,6 +29,20 @@ class RumbleEmbedIE(InfoExtractor): 'upload_date': '20191020', } }, { + 'url': 'https://rumble.com/embed/vslb7v', + 'md5': '7418035de1a30a178b8af34dc2b6a52b', + 'info_dict': { + 'id': 'vslb7v', + 'ext': 'mp4', + 'title': 'Defense Sec. says US Commitment to NATO Defense \'Ironclad\'', + 'timestamp': 1645142135, + 'upload_date': '20220217', + 'channel_url': 'https://rumble.com/c/CyberTechNews', + 'channel': 'CTNews', + 'thumbnail': 'https://sp.rmbl.ws/s8/6/7/i/9/h/7i9hd.OvCc.jpg', + 'duration': 901, + } + }, { 'url': 'https://rumble.com/embed/ufe9n.v5pv5f', 'only_matching': True, }] @@ -45,7 +60,7 @@ class RumbleEmbedIE(InfoExtractor): video = self._download_json( 'https://rumble.com/embedJS/', video_id, query={'request': 'video', 'v': video_id}) - title = video['title'] + title = unescapeHTML(video['title']) formats = [] for height, ua in (video.get('ua') or {}).items(): diff --git a/hypervideo_dl/extractor/rutube.py b/hypervideo_dl/extractor/rutube.py index d027412..2f753b4 100644 --- a/hypervideo_dl/extractor/rutube.py +++ b/hypervideo_dl/extractor/rutube.py @@ -230,9 +230,9 @@ class RutubePlaylistBaseIE(RutubeBaseIE): return self._extract_playlist(self._match_id(url)) -class RutubeChannelIE(RutubePlaylistBaseIE): - IE_NAME = 'rutube:channel' - IE_DESC = 'Rutube channels' +class RutubeTagsIE(RutubePlaylistBaseIE): + IE_NAME = 'rutube:tags' + IE_DESC = 'Rutube tags' _VALID_URL = r'https?://rutube\.ru/tags/video/(?P<id>\d+)' _TESTS = [{ 'url': 'http://rutube.ru/tags/video/1800/', @@ -312,3 +312,18 @@ class RutubePlaylistIE(RutubePlaylistBaseIE): playlist_kind = qs['pl_type'][0] playlist_id = qs['pl_id'][0] return self._extract_playlist(playlist_id, item_kind=playlist_kind) + + +class RutubeChannelIE(RutubePlaylistBaseIE): + IE_NAME = 'rutube:channel' + IE_DESC = 'Rutube channel' + _VALID_URL = r'https?://rutube\.ru/channel/(?P<id>\d+)/videos' + _TESTS = [{ + 'url': 'https://rutube.ru/channel/639184/videos/', + 'info_dict': { + 'id': '639184', + }, + 'playlist_mincount': 133, + }] + + _PAGE_TEMPLATE = 'http://rutube.ru/api/video/person/%s/?page=%s&format=json' diff --git a/hypervideo_dl/extractor/rutv.py b/hypervideo_dl/extractor/rutv.py index 7e0de99..0ea8253 100644 --- 
a/hypervideo_dl/extractor/rutv.py +++ b/hypervideo_dl/extractor/rutv.py @@ -6,7 +6,8 @@ import re from .common import InfoExtractor from ..utils import ( ExtractorError, - int_or_none + int_or_none, + str_to_int ) @@ -179,8 +180,7 @@ class RUTVIE(InfoExtractor): 'player_url': 'http://player.rutv.ru/flash3v/osmf.swf?i=22', 'rtmp_live': True, 'ext': 'flv', - 'vbr': int(quality), - 'quality': preference, + 'vbr': str_to_int(quality), } elif transport == 'm3u8': formats.extend(self._extract_m3u8_formats( @@ -191,9 +191,10 @@ class RUTVIE(InfoExtractor): 'url': url } fmt.update({ - 'width': width, - 'height': height, + 'width': int_or_none(quality, default=height, invscale=width, scale=height), + 'height': int_or_none(quality, default=height), 'format_id': '%s-%s' % (transport, quality), + 'source_preference': preference, }) formats.append(fmt) @@ -201,7 +202,7 @@ class RUTVIE(InfoExtractor): return { 'id': video_id, - 'title': self._live_title(title) if is_live else title, + 'title': title, 'description': description, 'thumbnail': thumbnail, 'view_count': view_count, diff --git a/hypervideo_dl/extractor/ruutu.py b/hypervideo_dl/extractor/ruutu.py index d9cf39d..5a30e33 100644 --- a/hypervideo_dl/extractor/ruutu.py +++ b/hypervideo_dl/extractor/ruutu.py @@ -1,6 +1,9 @@ # coding: utf-8 from __future__ import unicode_literals +import json +import re + from .common import InfoExtractor from ..compat import compat_urllib_parse_urlparse from ..utils import ( @@ -8,6 +11,8 @@ from ..utils import ( ExtractorError, find_xpath_attr, int_or_none, + traverse_obj, + try_call, unified_strdate, url_or_none, xpath_attr, @@ -123,6 +128,16 @@ class RuutuIE(InfoExtractor): ] _API_BASE = 'https://gatling.nelonenmedia.fi' + @classmethod + def _extract_url(cls, webpage): + settings = try_call( + lambda: json.loads(re.search( + r'jQuery\.extend\(Drupal\.settings, ({.+?})\);', webpage).group(1), strict=False)) + video_id = traverse_obj(settings, ( + 'mediaCrossbowSettings', 'file', 'field_crossbow_video_id', 'und', 0, 'value')) + if video_id: + return f'http://www.ruutu.fi/video/{video_id}' + def _real_extract(self, url): video_id = self._match_id(url) diff --git a/hypervideo_dl/extractor/ruv.py b/hypervideo_dl/extractor/ruv.py index 8f3cc40..d806ed0 100644 --- a/hypervideo_dl/extractor/ruv.py +++ b/hypervideo_dl/extractor/ruv.py @@ -4,6 +4,8 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( determine_ext, + parse_duration, + traverse_obj, unified_timestamp, ) @@ -99,3 +101,89 @@ class RuvIE(InfoExtractor): 'timestamp': timestamp, 'formats': formats, } + + +class RuvSpilaIE(InfoExtractor): + IE_NAME = 'ruv.is:spila' + _VALID_URL = r'https?://(?:www\.)?ruv\.is/(?:(?:sjon|ut)varp|(?:krakka|ung)ruv)/spila/.+/(?P<series_id>[0-9]+)/(?P<id>[a-z0-9]+)' + _TESTS = [{ + 'url': 'https://www.ruv.is/sjonvarp/spila/ithrottir/30657/9jcnd4', + 'info_dict': { + 'id': '9jcnd4', + 'ext': 'mp4', + 'title': '01.02.2022', + 'chapters': 'count:4', + 'timestamp': 1643743500, + 'upload_date': '20220201', + 'thumbnail': 'https://d38kdhuogyllre.cloudfront.net/fit-in/1960x/filters:quality(65)/hd_posters/94boog-iti3jg.jpg', + 'description': 'Íþróttafréttir.', + 'age_limit': 0, + }, + }, { + 'url': 'https://www.ruv.is/utvarp/spila/i-ljosi-sogunnar/23795/7hqkre', + 'info_dict': { + 'id': '7hqkre', + 'ext': 'mp3', + 'thumbnail': 'https://d38kdhuogyllre.cloudfront.net/fit-in/1960x/filters:quality(65)/hd_posters/7hqkre-7uepao.jpg', + 'description': 'md5:8d7046549daff35e9a3190dc9901a120', + 'chapters': 
[], + 'upload_date': '20220204', + 'timestamp': 1643965500, + 'title': 'Nellie Bly II', + 'age_limit': 0, + }, + }, { + 'url': 'https://www.ruv.is/ungruv/spila/ungruv/28046/8beuph', + 'only_matching': True + }, { + 'url': 'https://www.ruv.is/krakkaruv/spila/krakkafrettir/30712/9jbgb0', + 'only_matching': True + }] + + def _real_extract(self, url): + display_id, series_id = self._match_valid_url(url).group('id', 'series_id') + program = self._download_json( + 'https://www.ruv.is/gql/', display_id, query={'query': '''{ + Program(id: %s){ + title image description short_description + episodes(id: {value: "%s"}) { + rating title duration file image firstrun description + clips { + time text + } + subtitles { + name value + } + } + } + }''' % (series_id, display_id)})['data']['Program'] + episode = program['episodes'][0] + + subs = {} + for trk in episode.get('subtitles'): + if trk.get('name') and trk.get('value'): + subs.setdefault(trk['name'], []).append({'url': trk['value'], 'ext': 'vtt'}) + + media_url = episode['file'] + if determine_ext(media_url) == 'm3u8': + formats = self._extract_m3u8_formats(media_url, display_id) + else: + formats = [{'url': media_url}] + + clips = [ + {'start_time': parse_duration(c.get('time')), 'title': c.get('text')} + for c in episode.get('clips') or []] + + return { + 'id': display_id, + 'title': traverse_obj(program, ('episodes', 0, 'title'), 'title'), + 'description': traverse_obj( + program, ('episodes', 0, 'description'), 'description', 'short_description', + expected_type=lambda x: x or None), + 'subtitles': subs, + 'thumbnail': episode.get('image', '').replace('$$IMAGESIZE$$', '1960') or None, + 'timestamp': unified_timestamp(episode.get('firstrun')), + 'formats': formats, + 'age_limit': episode.get('rating'), + 'chapters': clips + } diff --git a/hypervideo_dl/extractor/safari.py b/hypervideo_dl/extractor/safari.py index cca4464..7b4571d 100644 --- a/hypervideo_dl/extractor/safari.py +++ b/hypervideo_dl/extractor/safari.py @@ -25,14 +25,7 @@ class SafariBaseIE(InfoExtractor): LOGGED_IN = False - def _real_initialize(self): - self._login() - - def _login(self): - username, password = self._get_login_info() - if username is None: - return - + def _perform_login(self, username, password): _, urlh = self._download_webpage_handle( 'https://learning.oreilly.com/accounts/login-check/', None, 'Downloading login page') diff --git a/hypervideo_dl/extractor/sbs.py b/hypervideo_dl/extractor/sbs.py index 0a806ee..4090f63 100644 --- a/hypervideo_dl/extractor/sbs.py +++ b/hypervideo_dl/extractor/sbs.py @@ -10,7 +10,14 @@ from ..utils import ( class SBSIE(InfoExtractor): IE_DESC = 'sbs.com.au' - _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/(?:ondemand(?:/video/(?:single/)?|.*?\bplay=|/watch/)|news/(?:embeds/)?video/)(?P<id>[0-9]+)' + _VALID_URL = r'''(?x) + https?://(?:www\.)?sbs\.com\.au/(?: + ondemand(?: + /video/(?:single/)?| + /movie/[^/]+/| + .*?\bplay=|/watch/ + )|news/(?:embeds/)?video/ + )(?P<id>[0-9]+)''' _TESTS = [{ # Original URL is handled by the generic IE which finds the iframe: @@ -46,6 +53,13 @@ class SBSIE(InfoExtractor): }, { 'url': 'https://www.sbs.com.au/ondemand/watch/1698704451971', 'only_matching': True, + }, { + 'url': 'https://www.sbs.com.au/ondemand/movie/coherence/1469404227931', + 'only_matching': True, + }, { + 'note': 'Live stream', + 'url': 'https://www.sbs.com.au/ondemand/video/1726824003663/sbs-24x7-live-stream-nsw', + 'only_matching': True, }] def _real_extract(self, url): @@ -75,4 +89,5 @@ class SBSIE(InfoExtractor): 'ie_key': 
'ThePlatform', 'id': video_id, 'url': smuggle_url(self._proto_relative_url(theplatform_url), {'force_smil_url': True}), + 'is_live': player_params.get('streamType') == 'live', } diff --git a/hypervideo_dl/extractor/scte.py b/hypervideo_dl/extractor/scte.py index ca1de63..7215cf5 100644 --- a/hypervideo_dl/extractor/scte.py +++ b/hypervideo_dl/extractor/scte.py @@ -14,14 +14,7 @@ class SCTEBaseIE(InfoExtractor): _LOGIN_URL = 'https://www.scte.org/SCTE/Sign_In.aspx' _NETRC_MACHINE = 'scte' - def _real_initialize(self): - self._login() - - def _login(self): - username, password = self._get_login_info() - if username is None: - return - + def _perform_login(self, username, password): login_popup = self._download_webpage( self._LOGIN_URL, None, 'Downloading login popup') diff --git a/hypervideo_dl/extractor/senategov.py b/hypervideo_dl/extractor/senategov.py new file mode 100644 index 0000000..b295184 --- /dev/null +++ b/hypervideo_dl/extractor/senategov.py @@ -0,0 +1,213 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urlparse, +) +from ..utils import ( + ExtractorError, + parse_qs, + unsmuggle_url, +) + +_COMMITTEES = { + 'ag': ('76440', 'http://ag-f.akamaihd.net'), + 'aging': ('76442', 'http://aging-f.akamaihd.net'), + 'approps': ('76441', 'http://approps-f.akamaihd.net'), + 'arch': ('', 'http://ussenate-f.akamaihd.net'), + 'armed': ('76445', 'http://armed-f.akamaihd.net'), + 'banking': ('76446', 'http://banking-f.akamaihd.net'), + 'budget': ('76447', 'http://budget-f.akamaihd.net'), + 'cecc': ('76486', 'http://srs-f.akamaihd.net'), + 'commerce': ('80177', 'http://commerce1-f.akamaihd.net'), + 'csce': ('75229', 'http://srs-f.akamaihd.net'), + 'dpc': ('76590', 'http://dpc-f.akamaihd.net'), + 'energy': ('76448', 'http://energy-f.akamaihd.net'), + 'epw': ('76478', 'http://epw-f.akamaihd.net'), + 'ethics': ('76449', 'http://ethics-f.akamaihd.net'), + 'finance': ('76450', 'http://finance-f.akamaihd.net'), + 'foreign': ('76451', 'http://foreign-f.akamaihd.net'), + 'govtaff': ('76453', 'http://govtaff-f.akamaihd.net'), + 'help': ('76452', 'http://help-f.akamaihd.net'), + 'indian': ('76455', 'http://indian-f.akamaihd.net'), + 'intel': ('76456', 'http://intel-f.akamaihd.net'), + 'intlnarc': ('76457', 'http://intlnarc-f.akamaihd.net'), + 'jccic': ('85180', 'http://jccic-f.akamaihd.net'), + 'jec': ('76458', 'http://jec-f.akamaihd.net'), + 'judiciary': ('76459', 'http://judiciary-f.akamaihd.net'), + 'rpc': ('76591', 'http://rpc-f.akamaihd.net'), + 'rules': ('76460', 'http://rules-f.akamaihd.net'), + 'saa': ('76489', 'http://srs-f.akamaihd.net'), + 'smbiz': ('76461', 'http://smbiz-f.akamaihd.net'), + 'srs': ('75229', 'http://srs-f.akamaihd.net'), + 'uscc': ('76487', 'http://srs-f.akamaihd.net'), + 'vetaff': ('76462', 'http://vetaff-f.akamaihd.net'), +} + + +class SenateISVPIE(InfoExtractor): + _IE_NAME = 'senate.gov:isvp' + _VALID_URL = r'https?://(?:www\.)?senate\.gov/isvp/?\?(?P<qs>.+)' + + _TESTS = [{ + 'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png', + 'info_dict': { + 'id': 'judiciary031715', + 'ext': 'mp4', + 'title': 'Integrated Senate Video Player', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 
'http://www.senate.gov/isvp/?type=live&comm=commerce&filename=commerce011514.mp4&auto_play=false', + 'info_dict': { + 'id': 'commerce011514', + 'ext': 'mp4', + 'title': 'Integrated Senate Video Player' + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://www.senate.gov/isvp/?type=arch&comm=intel&filename=intel090613&hc_location=ufi', + # checksum differs each time + 'info_dict': { + 'id': 'intel090613', + 'ext': 'mp4', + 'title': 'Integrated Senate Video Player' + } + }, { + # From http://www.c-span.org/video/?96791-1 + 'url': 'http://www.senate.gov/isvp?type=live&comm=banking&filename=banking012715', + 'only_matching': True, + }] + + @staticmethod + def _search_iframe_url(webpage): + mobj = re.search( + r"<iframe[^>]+src=['\"](?P<url>https?://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]", + webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + + qs = compat_parse_qs(self._match_valid_url(url).group('qs')) + if not qs.get('filename') or not qs.get('type') or not qs.get('comm'): + raise ExtractorError('Invalid URL', expected=True) + + video_id = re.sub(r'\.mp4$', '', qs['filename'][0]) + + webpage = self._download_webpage(url, video_id) + + if smuggled_data.get('force_title'): + title = smuggled_data['force_title'] + else: + title = self._html_extract_title(webpage) + poster = qs.get('poster') + thumbnail = poster[0] if poster else None + + video_type = qs['type'][0] + committee = video_type if video_type == 'arch' else qs['comm'][0] + + stream_num, domain = _COMMITTEES[committee] + + formats = [] + if video_type == 'arch': + filename = video_id if '.' in video_id else video_id + '.mp4' + m3u8_url = compat_urlparse.urljoin(domain, 'i/' + filename + '/master.m3u8') + formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', m3u8_id='m3u8') + else: + hdcore_sign = 'hdcore=3.1.0' + url_params = (domain, video_id, stream_num) + f4m_url = f'%s/z/%s_1@%s/manifest.f4m?{hdcore_sign}' % url_params + m3u8_url = '%s/i/%s_1@%s/master.m3u8' % url_params + for entry in self._extract_f4m_formats(f4m_url, video_id, f4m_id='f4m'): + # URLs without the extra param induce a 404 error + entry.update({'extra_param_to_segment_url': hdcore_sign}) + formats.append(entry) + for entry in self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', m3u8_id='m3u8'): + mobj = re.search(r'(?P<tag>(?:-p|-b))\.m3u8', entry['url']) + if mobj: + entry['format_id'] += mobj.group('tag') + formats.append(entry) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + } + + +class SenateGovIE(InfoExtractor): + _IE_NAME = 'senate.gov' + _VALID_URL = r'https?:\/\/(?:www\.)?(help|appropriations|judiciary|banking|armed-services|finance)\.senate\.gov' + _TESTS = [{ + 'url': 'https://www.help.senate.gov/hearings/vaccines-saving-lives-ensuring-confidence-and-protecting-public-health', + 'info_dict': { + 'id': 'help090920', + 'display_id': 'vaccines-saving-lives-ensuring-confidence-and-protecting-public-health', + 'title': 'Vaccines: Saving Lives, Ensuring Confidence, and Protecting Public Health', + 'description': 'The U.S. 
Senate Committee on Health, Education, Labor & Pensions', + 'ext': 'mp4', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.appropriations.senate.gov/hearings/watch?hearingid=B8A25434-5056-A066-6020-1F68CB75F0CD', + 'info_dict': { + 'id': 'appropsA051518', + 'display_id': 'watch?hearingid=B8A25434-5056-A066-6020-1F68CB75F0CD', + 'title': 'Review of the FY2019 Budget Request for the U.S. Army', + 'ext': 'mp4', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.banking.senate.gov/hearings/21st-century-communities-public-transportation-infrastructure-investment-and-fast-act-reauthorization', + 'info_dict': { + 'id': 'banking041521', + 'display_id': '21st-century-communities-public-transportation-infrastructure-investment-and-fast-act-reauthorization', + 'title': '21st Century Communities: Public Transportation Infrastructure Investment and FAST Act Reauthorization', + 'description': 'The Official website of The United States Committee on Banking, Housing, and Urban Affairs', + 'ext': 'mp4', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + display_id = self._generic_id(url) + webpage = self._download_webpage(url, display_id) + parse_info = parse_qs(self._search_regex( + r'<iframe class="[^>"]*streaminghearing[^>"]*"\s[^>]*\bsrc="([^">]*)', webpage, 'hearing URL')) + + stream_num, stream_domain = _COMMITTEES[parse_info['comm'][-1]] + filename = parse_info['filename'][-1] + + formats = self._extract_m3u8_formats( + f'{stream_domain}/i/{filename}_1@{stream_num}/master.m3u8', + display_id, ext='mp4') + self._sort_formats(formats) + + title = self._html_search_regex( + (*self._og_regexes('title'), r'(?s)<title>([^<]*?)</title>'), webpage, 'video title') + + return { + 'id': re.sub(r'\.mp4$', '', filename), + 'display_id': display_id, + 'title': re.sub(r'\s+', ' ', title.split('|')[0]).strip(), + 'description': self._og_search_description(webpage, default=None), + 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'age_limit': self._rta_search(webpage), + 'formats': formats + } diff --git a/hypervideo_dl/extractor/sendtonews.py b/hypervideo_dl/extractor/sendtonews.py index bc38a0f..858547b 100644 --- a/hypervideo_dl/extractor/sendtonews.py +++ b/hypervideo_dl/extractor/sendtonews.py @@ -80,7 +80,7 @@ class SendtoNewsIE(InfoExtractor): 'format_id': '%s-%d' % (determine_protocol(f), tbr), 'tbr': tbr, }) - # 'tbr' was explicitly set to be prefered over 'height' originally, + # 'tbr' was explicitly set to be preferred over 'height' originally, # So this is being kept unless someone can confirm this is unnecessary self._sort_formats(info_dict['formats'], ('tbr', 'res')) diff --git a/hypervideo_dl/extractor/sevenplus.py b/hypervideo_dl/extractor/sevenplus.py index 210c44a..9867961 100644 --- a/hypervideo_dl/extractor/sevenplus.py +++ b/hypervideo_dl/extractor/sevenplus.py @@ -35,7 +35,6 @@ class SevenPlusIE(BrightcoveNewIE): 'episode': 'Wind Surf', }, 'params': { - 'format': 'bestvideo', 'skip_download': True, } }, { diff --git a/hypervideo_dl/extractor/shahid.py b/hypervideo_dl/extractor/shahid.py index 42de41a..ab45d9c 100644 --- a/hypervideo_dl/extractor/shahid.py +++ b/hypervideo_dl/extractor/shahid.py @@ -79,16 +79,12 @@ class ShahidIE(ShahidBaseIE): 'only_matching': True }] - def _real_initialize(self): - email, password = self._get_login_info() - if email is None: - return - + def _perform_login(self, username, password): try: user_data = self._download_json( 
'https://shahid.mbc.net/wd/service/users/login', None, 'Logging in', data=json.dumps({ - 'email': email, + 'email': username, 'password': password, 'basic': 'false', }).encode('utf-8'), headers={ diff --git a/hypervideo_dl/extractor/shemaroome.py b/hypervideo_dl/extractor/shemaroome.py index 142d5dc..45c1291 100644 --- a/hypervideo_dl/extractor/shemaroome.py +++ b/hypervideo_dl/extractor/shemaroome.py @@ -2,10 +2,9 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..aes import aes_cbc_decrypt +from ..aes import aes_cbc_decrypt, unpad_pkcs7 from ..compat import ( compat_b64decode, - compat_ord, ) from ..utils import ( bytes_to_intlist, @@ -16,7 +15,7 @@ from ..utils import ( class ShemarooMeIE(InfoExtractor): - _VALID_URL = r'(?:https?://)(?:www\.)?shemaroome\.com/(?:movies|shows)/(?P<id>[^?#]+)' + _VALID_URL = r'https?://(?:www\.)?shemaroome\.com/(?:movies|shows)/(?P<id>[^?#]+)' _TESTS = [{ 'url': 'https://www.shemaroome.com/movies/dil-hai-tumhaara', 'info_dict': { @@ -76,9 +75,8 @@ class ShemarooMeIE(InfoExtractor): url_data = bytes_to_intlist(compat_b64decode(data_json['new_play_url'])) key = bytes_to_intlist(compat_b64decode(data_json['key'])) iv = [0] * 16 - m3u8_url = intlist_to_bytes(aes_cbc_decrypt(url_data, key, iv)) - m3u8_url = m3u8_url[:-compat_ord((m3u8_url[-1]))].decode('ascii') - formats = self._extract_m3u8_formats(m3u8_url, video_id, fatal=False, headers={'stream_key': data_json['stream_key']}) + m3u8_url = unpad_pkcs7(intlist_to_bytes(aes_cbc_decrypt(url_data, key, iv))).decode('ascii') + formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, fatal=False, headers={'stream_key': data_json['stream_key']}) self._sort_formats(formats) release_date = self._html_search_regex( @@ -91,6 +89,7 @@ class ShemarooMeIE(InfoExtractor): subtitles.setdefault('EN', []).append({ 'url': self._proto_relative_url(sub_url), }) + subtitles = self._merge_subtitles(subtitles, m3u8_subs) description = self._html_search_regex(r'(?s)>Synopsis(</.+?)</', webpage, 'description', fatal=False) return { diff --git a/hypervideo_dl/extractor/showroomlive.py b/hypervideo_dl/extractor/showroomlive.py index efd9d56..1aada69 100644 --- a/hypervideo_dl/extractor/showroomlive.py +++ b/hypervideo_dl/extractor/showroomlive.py @@ -73,7 +73,7 @@ class ShowRoomLiveIE(InfoExtractor): return { 'id': compat_str(room.get('live_id') or broadcaster_id), - 'title': self._live_title(title), + 'title': title, 'description': room.get('description'), 'timestamp': int_or_none(room.get('current_live_started_at')), 'uploader': uploader, diff --git a/hypervideo_dl/extractor/skeb.py b/hypervideo_dl/extractor/skeb.py new file mode 100644 index 0000000..81aecb3 --- /dev/null +++ b/hypervideo_dl/extractor/skeb.py @@ -0,0 +1,143 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ExtractorError, determine_ext, parse_qs, traverse_obj + + +class SkebIE(InfoExtractor): + _VALID_URL = r'https?://skeb\.jp/@[^/]+/works/(?P<id>\d+)' + + _TESTS = [{ + 'url': 'https://skeb.jp/@riiru_wm/works/10', + 'info_dict': { + 'id': '466853', + 'title': '内容はおまかせします! 
by 姫ノ森りぃる@一周年', + 'description': 'md5:1ec50901efc3437cfbfe3790468d532d', + 'uploader': '姫ノ森りぃる@一周年', + 'uploader_id': 'riiru_wm', + 'age_limit': 0, + 'tags': [], + 'url': r're:https://skeb.+', + 'thumbnail': r're:https://skeb.+', + 'subtitles': { + 'jpn': [{ + 'url': r're:https://skeb.+', + 'ext': 'vtt' + }] + }, + 'width': 720, + 'height': 405, + 'duration': 313, + 'fps': 30, + 'ext': 'mp4', + }, + }, { + 'url': 'https://skeb.jp/@furukawa_nob/works/3', + 'info_dict': { + 'id': '489408', + 'title': 'いつもお世話になってお... by 古川ノブ@音楽とVlo...', + 'description': 'md5:5adc2e41d06d33b558bf7b1faeb7b9c2', + 'uploader': '古川ノブ@音楽とVlogのVtuber', + 'uploader_id': 'furukawa_nob', + 'age_limit': 0, + 'tags': [ + 'よろしく', '大丈夫', 'お願い', 'でした', + '是非', 'O', 'バー', '遊び', 'おはよう', + 'オーバ', 'ボイス', + ], + 'url': r're:https://skeb.+', + 'thumbnail': r're:https://skeb.+', + 'subtitles': { + 'jpn': [{ + 'url': r're:https://skeb.+', + 'ext': 'vtt' + }] + }, + 'duration': 98, + 'ext': 'mp3', + 'vcodec': 'none', + 'abr': 128, + }, + }, { + 'url': 'https://skeb.jp/@mollowmollow/works/6', + 'info_dict': { + 'id': '6', + 'title': 'ヒロ。\n\n私のキャラク... by 諸々', + 'description': 'md5:aa6cbf2ba320b50bce219632de195f07', + '_type': 'playlist', + 'entries': [{ + 'id': '486430', + 'title': 'ヒロ。\n\n私のキャラク... by 諸々', + 'description': 'md5:aa6cbf2ba320b50bce219632de195f07', + }, { + 'id': '486431', + 'title': 'ヒロ。\n\n私のキャラク... by 諸々', + }] + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + nuxt_data = self._search_nuxt_data(self._download_webpage(url, video_id), video_id) + + parent = { + 'id': video_id, + 'title': nuxt_data.get('title'), + 'description': nuxt_data.get('description'), + 'uploader': traverse_obj(nuxt_data, ('creator', 'name')), + 'uploader_id': traverse_obj(nuxt_data, ('creator', 'screen_name')), + 'age_limit': 18 if nuxt_data.get('nsfw') else 0, + 'tags': nuxt_data.get('tag_list'), + } + + entries = [] + for item in nuxt_data.get('previews') or []: + vid_url = item.get('url') + given_ext = traverse_obj(item, ('information', 'extension')) + preview_ext = determine_ext(vid_url, default_ext=None) + if not preview_ext: + content_disposition = parse_qs(vid_url)['response-content-disposition'][0] + preview_ext = self._search_regex( + r'filename="[^"]+\.([^\.]+?)"', content_disposition, + 'preview file extension', fatal=False, group=1) + if preview_ext not in ('mp4', 'mp3'): + continue + if not vid_url or not item.get('id'): + continue + width, height = traverse_obj(item, ('information', 'width')), traverse_obj(item, ('information', 'height')) + if width is not None and height is not None: + # the longest side is at most 720px for non-client viewers + max_size = max(width, height) + width, height = list(x * 720 // max_size for x in (width, height)) + entries.append({ + **parent, + 'id': str(item['id']), + 'url': vid_url, + 'thumbnail': item.get('poster_url'), + 'subtitles': { + 'jpn': [{ + 'url': item.get('vtt_url'), + 'ext': 'vtt', + }] + } if item.get('vtt_url') else None, + 'width': width, + 'height': height, + 'duration': traverse_obj(item, ('information', 'duration')), + 'fps': traverse_obj(item, ('information', 'frame_rate')), + 'ext': preview_ext or given_ext, + 'vcodec': 'none' if preview_ext == 'mp3' else None, + # you'll always get 128kbps MP3 for non-client viewers + 'abr': 128 if preview_ext == 'mp3' else None, + }) + + if not entries: + raise ExtractorError('No video/audio attachment found in this commission.', expected=True) + elif len(entries) == 1: + return entries[0] + else: + parent.update({ + 
'_type': 'playlist', + 'entries': entries, + }) + return parent diff --git a/hypervideo_dl/extractor/sky.py b/hypervideo_dl/extractor/sky.py index ff2c977..ad1e62d 100644 --- a/hypervideo_dl/extractor/sky.py +++ b/hypervideo_dl/extractor/sky.py @@ -105,6 +105,34 @@ class SkyNewsIE(SkyBaseIE): } +class SkyNewsStoryIE(SkyBaseIE): + IE_NAME = 'sky:news:story' + _VALID_URL = r'https?://news\.sky\.com/story/[0-9a-z-]+-(?P<id>[0-9]+)' + _TEST = { + 'url': 'https://news.sky.com/story/budget-2021-chancellor-rishi-sunak-vows-address-will-deliver-strong-economy-fit-for-a-new-age-of-optimism-12445425', + 'info_dict': { + 'id': 'ref:0714acb9-123d-42c8-91b8-5c1bc6c73f20', + 'title': 'md5:e408dd7aad63f31a1817bbe40c7d276f', + 'description': 'md5:a881e12f49212f92be2befe4a09d288a', + 'ext': 'mp4', + 'upload_date': '20211027', + 'timestamp': 1635317494, + 'uploader_id': '6058004172001', + } + } + + def _real_extract(self, url): + article_id = self._match_id(url) + webpage = self._download_webpage(url, article_id) + + entries = [self._process_ooyala_element(webpage, sdc_el, url) + for sdc_el in re.findall(self._SDC_EL_REGEX, webpage)] + + return self.playlist_result( + entries, article_id, self._og_search_title(webpage), + self._html_search_meta(['og:description', 'description'], webpage)) + + class SkySportsNewsIE(SkyBaseIE): IE_NAME = 'sky:sports:news' _VALID_URL = r'https?://(?:www\.)?skysports\.com/([^/]+/)*news/\d+/(?P<id>\d+)' diff --git a/hypervideo_dl/extractor/skyit.py b/hypervideo_dl/extractor/skyit.py index 14a4d8d..ddb43c0 100644 --- a/hypervideo_dl/extractor/skyit.py +++ b/hypervideo_dl/extractor/skyit.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import ( - compat_str, compat_parse_qs, compat_urllib_parse_urlparse, ) @@ -55,7 +54,7 @@ class SkyItPlayerIE(InfoExtractor): return { 'id': video_id, - 'title': self._live_title(title) if is_live else title, + 'title': title, 'formats': formats, 'thumbnail': dict_get(video, ('video_still', 'video_still_medium', 'thumb')), 'description': video.get('short_desc') or None, @@ -125,9 +124,7 @@ class SkyItVideoLiveIE(SkyItPlayerIE): def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - asset_id = compat_str(self._parse_json(self._search_regex( - r'<script[^>]+id="__NEXT_DATA__"[^>]*>({.+?})</script>', - webpage, 'next data'), display_id)['props']['initialState']['livePage']['content']['asset_id']) + asset_id = str(self._search_nextjs_data(webpage, display_id)['props']['initialState']['livePage']['content']['asset_id']) livestream = self._download_json( 'https://apid.sky.it/vdp/v1/getLivestream', asset_id, query={'id': asset_id}) diff --git a/hypervideo_dl/extractor/skylinewebcams.py b/hypervideo_dl/extractor/skylinewebcams.py index b7f8ac7..47bbb76 100644 --- a/hypervideo_dl/extractor/skylinewebcams.py +++ b/hypervideo_dl/extractor/skylinewebcams.py @@ -36,7 +36,7 @@ class SkylineWebcamsIE(InfoExtractor): 'id': video_id, 'url': stream_url, 'ext': 'mp4', - 'title': self._live_title(title), + 'title': title, 'description': description, 'is_live': True, } diff --git a/hypervideo_dl/extractor/skynewsau.py b/hypervideo_dl/extractor/skynewsau.py index b1d7795..8e079ee 100644 --- a/hypervideo_dl/extractor/skynewsau.py +++ b/hypervideo_dl/extractor/skynewsau.py @@ -9,7 +9,7 @@ from ..utils import ( class SkyNewsAUIE(InfoExtractor): - _VALID_URL = r'(?:https?://)(?:www\.)?skynews\.com\.au/[^/]+/[^/]+/[^/]+/video/(?P<id>[a-z0-9]+)' + _VALID_URL = 
r'https?://(?:www\.)?skynews\.com\.au/[^/]+/[^/]+/[^/]+/video/(?P<id>[a-z0-9]+)' _TESTS = [{ 'url': 'https://www.skynews.com.au/world-news/united-states/incredible-vision-shows-lava-overflowing-from-spains-la-palma-volcano/video/0f4c6243d6903502c01251f228b91a71', diff --git a/hypervideo_dl/extractor/slideslive.py b/hypervideo_dl/extractor/slideslive.py index 9409a01..df60846 100644 --- a/hypervideo_dl/extractor/slideslive.py +++ b/hypervideo_dl/extractor/slideslive.py @@ -35,9 +35,6 @@ class SlidesLiveIE(InfoExtractor): 'ext': 'mp4', 'title': 'Offline Reinforcement Learning: From Algorithms to Practical Challenges', }, - 'params': { - 'format': 'bestvideo', - }, }, { # video_service_name = youtube 'url': 'https://slideslive.com/38903721/magic-a-scientific-resurrection-of-an-esoteric-legend', diff --git a/hypervideo_dl/extractor/sonyliv.py b/hypervideo_dl/extractor/sonyliv.py index c3ed442..5b6849f 100644 --- a/hypervideo_dl/extractor/sonyliv.py +++ b/hypervideo_dl/extractor/sonyliv.py @@ -1,6 +1,9 @@ # coding: utf-8 from __future__ import unicode_literals +import datetime +import math +import random import time import uuid @@ -56,17 +59,60 @@ class SonyLIVIE(InfoExtractor): 'only_matching': True, }] _GEO_COUNTRIES = ['IN'] - _TOKEN = None + _HEADERS = {} + _LOGIN_HINT = 'Use "--username <mobile_number>" to login using OTP or "--username token --password <auth_token>" to login using auth token.' + _NETRC_MACHINE = 'sonyliv' + + def _get_device_id(self): + e = int(time.time() * 1000) + t = list('xxxxxxxxxxxx4xxxyxxxxxxxxxxxxxxx') + for i, c in enumerate(t): + n = int((e + 16 * random.random()) % 16) | 0 + e = math.floor(e / 16) + if c == 'x': + t[i] = str(n) + elif c == 'y': + t[i] = '{:x}'.format(3 & n | 8) + return ''.join(t) + '-' + str(int(time.time() * 1000)) + + def _perform_login(self, username, password): + self._HEADERS['device_id'] = self._get_device_id() + self._HEADERS['content-type'] = 'application/json' + + if username.lower() == 'token' and len(password) > 1198: + self._HEADERS['authorization'] = password + elif len(username) != 10 or not username.isdigit(): + raise ExtractorError(f'Invalid username/password; {self._LOGIN_HINT}') + + self.report_login() + data = '''{"mobileNumber":"%s","channelPartnerID":"MSMIND","country":"IN","timestamp":"%s", + "otpSize":6,"loginType":"REGISTERORSIGNIN","isMobileMandatory":true} + ''' % (username, datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%MZ")) + otp_request_json = self._download_json( + 'https://apiv2.sonyliv.com/AGL/1.6/A/ENG/WEB/IN/HR/CREATEOTP-V2', + None, note='Sending OTP', data=data.encode(), headers=self._HEADERS) + if otp_request_json['resultCode'] == 'KO': + raise ExtractorError(otp_request_json['message'], expected=True) + otp_code = self._get_tfa_info('OTP') + data = '''{"channelPartnerID":"MSMIND","mobileNumber":"%s","country":"IN","otp":"%s", + "dmaId":"IN","ageConfirmation":true,"timestamp":"%s","isMobileMandatory":true} + ''' % (username, otp_code, datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%MZ")) + otp_verify_json = self._download_json( + 'https://apiv2.sonyliv.com/AGL/2.0/A/ENG/WEB/IN/HR/CONFIRMOTP-V2', + None, note='Verifying OTP', data=data.encode(), headers=self._HEADERS) + if otp_verify_json['resultCode'] == 'KO': + raise ExtractorError(otp_verify_json['message'], expected=True) + self._HEADERS['authorization'] = otp_verify_json['resultObj']['accessToken'] def _call_api(self, version, path, video_id): - headers = {} - if self._TOKEN: - headers['security_token'] = self._TOKEN try: return 
self._download_json( 'https://apiv2.sonyliv.com/AGL/%s/A/ENG/WEB/%s' % (version, path), - video_id, headers=headers)['resultObj'] + video_id, headers=self._HEADERS)['resultObj'] except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 406 and self._parse_json( + e.cause.read().decode(), video_id)['message'] == 'Please subscribe to watch this content': + self.raise_login_required(self._LOGIN_HINT, method=None) if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: message = self._parse_json( e.cause.read().decode(), video_id)['message'] @@ -75,8 +121,8 @@ class SonyLIVIE(InfoExtractor): raise ExtractorError(message) raise - def _real_initialize(self): - self._TOKEN = self._call_api('1.4', 'ALL/GETTOKEN', None) + def _initialize_pre_login(self): + self._HEADERS['security_token'] = self._call_api('1.4', 'ALL/GETTOKEN', None) def _real_extract(self, url): video_id = self._match_id(url) diff --git a/hypervideo_dl/extractor/soundcloud.py b/hypervideo_dl/extractor/soundcloud.py index 78fecd1..92535f7 100644 --- a/hypervideo_dl/extractor/soundcloud.py +++ b/hypervideo_dl/extractor/soundcloud.py @@ -58,7 +58,143 @@ class SoundcloudEmbedIE(InfoExtractor): return self.url_result(api_url) -class SoundcloudIE(InfoExtractor): +class SoundcloudBaseIE(InfoExtractor): + _NETRC_MACHINE = 'soundcloud' + + _API_V2_BASE = 'https://api-v2.soundcloud.com/' + _BASE_URL = 'https://soundcloud.com/' + _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36' + _API_AUTH_QUERY_TEMPLATE = '?client_id=%s' + _API_AUTH_URL_PW = 'https://api-auth.soundcloud.com/web-auth/sign-in/password%s' + _API_VERIFY_AUTH_TOKEN = 'https://api-auth.soundcloud.com/connect/session%s' + _access_token = None + _HEADERS = {} + + def _store_client_id(self, client_id): + self._downloader.cache.store('soundcloud', 'client_id', client_id) + + def _update_client_id(self): + webpage = self._download_webpage('https://soundcloud.com/', None) + for src in reversed(re.findall(r'<script[^>]+src="([^"]+)"', webpage)): + script = self._download_webpage(src, None, fatal=False) + if script: + client_id = self._search_regex( + r'client_id\s*:\s*"([0-9a-zA-Z]{32})"', + script, 'client id', default=None) + if client_id: + self._CLIENT_ID = client_id + self._store_client_id(client_id) + return + raise ExtractorError('Unable to extract client id') + + def _download_json(self, *args, **kwargs): + non_fatal = kwargs.get('fatal') is False + if non_fatal: + del kwargs['fatal'] + query = kwargs.get('query', {}).copy() + for _ in range(2): + query['client_id'] = self._CLIENT_ID + kwargs['query'] = query + try: + return super()._download_json(*args, **compat_kwargs(kwargs)) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 403): + self._store_client_id(None) + self._update_client_id() + continue + elif non_fatal: + self.report_warning(error_to_compat_str(e)) + return False + raise + + def _initialize_pre_login(self): + self._CLIENT_ID = self._downloader.cache.load('soundcloud', 'client_id') or 'a3e059563d7fd3372b49b37f00a00bcf' + + def _perform_login(self, username, password): + if username != 'oauth': + self.report_warning( + 'Login using username and password is not currently supported. 
' + 'Use "--username oauth --password <oauth_token>" to login using an oauth token') + self._access_token = password + query = self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID + payload = {'session': {'access_token': self._access_token}} + token_verification = sanitized_Request(self._API_VERIFY_AUTH_TOKEN % query, json.dumps(payload).encode('utf-8')) + response = self._download_json(token_verification, None, note='Verifying login token...', fatal=False) + if response is not False: + self._HEADERS = {'Authorization': 'OAuth ' + self._access_token} + self.report_login() + else: + self.report_warning('Provided authorization token seems to be invalid. Continue as guest') + + r''' + def genDevId(): + def genNumBlock(): + return ''.join([str(random.randrange(10)) for i in range(6)]) + return '-'.join([genNumBlock() for i in range(4)]) + + payload = { + 'client_id': self._CLIENT_ID, + 'recaptcha_pubkey': 'null', + 'recaptcha_response': 'null', + 'credentials': { + 'identifier': username, + 'password': password + }, + 'signature': self.sign(username, password, self._CLIENT_ID), + 'device_id': genDevId(), + 'user_agent': self._USER_AGENT + } + + query = self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID + login = sanitized_Request(self._API_AUTH_URL_PW % query, json.dumps(payload).encode('utf-8')) + response = self._download_json(login, None) + self._access_token = response.get('session').get('access_token') + if not self._access_token: + self.report_warning('Unable to get access token, login may has failed') + else: + self._HEADERS = {'Authorization': 'OAuth ' + self._access_token} + ''' + + # signature generation + def sign(self, user, pw, clid): + a = 33 + i = 1 + s = 440123 + w = 117 + u = 1800000 + l = 1042 + b = 37 + k = 37 + c = 5 + n = '0763ed7314c69015fd4a0dc16bbf4b90' # _KEY + y = '8' # _REV + r = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36' # _USER_AGENT + e = user # _USERNAME + t = clid # _CLIENT_ID + + d = '-'.join([str(mInt) for mInt in [a, i, s, w, u, l, b, k]]) + p = n + y + d + r + e + t + d + n + h = p + + m = 8011470 + f = 0 + + for f in range(f, len(h)): + m = (m >> 1) + ((1 & m) << 23) + m += ord(h[f]) + m &= 16777215 + + # c is not even needed + out = str(y) + ':' + str(d) + ':' + format(m, 'x') + ':' + str(c) + + return out + + @classmethod + def _resolv_url(cls, url): + return cls._API_V2_BASE + 'resolve?url=' + url + + +class SoundcloudIE(SoundcloudBaseIE): """Information extractor for soundcloud.com To access the media, the uid of the song and a stream token must be extracted from the page source and the script must make @@ -72,8 +208,9 @@ class SoundcloudIE(InfoExtractor): (?!stations/track) (?P<uploader>[\w\d-]+)/ (?!(?:tracks|albums|sets(?:/.+?)?|reposts|likes|spotlight)/?(?:$|[?#])) - (?P<title>[\w\d-]+)/? - (?P<token>[^?]+?)?(?:[?].*)?$) + (?P<title>[\w\d-]+) + (?:/(?P<token>(?!(?:albums|sets|recommended))[^?]+?))? + (?:[?].*)?$) |(?:api(?:-v2)?\.soundcloud\.com/tracks/(?P<track_id>\d+) (?:/?\?secret_token=(?P<secret_token>[^&]+))?) 
) @@ -250,8 +387,6 @@ class SoundcloudIE(InfoExtractor): }, ] - _API_V2_BASE = 'https://api-v2.soundcloud.com/' - _BASE_URL = 'https://soundcloud.com/' _IMAGE_REPL_RE = r'-([0-9a-z]+)\.jpg' _ARTWORK_MAP = { @@ -267,143 +402,6 @@ class SoundcloudIE(InfoExtractor): 'original': 0, } - def _store_client_id(self, client_id): - self._downloader.cache.store('soundcloud', 'client_id', client_id) - - def _update_client_id(self): - webpage = self._download_webpage('https://soundcloud.com/', None) - for src in reversed(re.findall(r'<script[^>]+src="([^"]+)"', webpage)): - script = self._download_webpage(src, None, fatal=False) - if script: - client_id = self._search_regex( - r'client_id\s*:\s*"([0-9a-zA-Z]{32})"', - script, 'client id', default=None) - if client_id: - self._CLIENT_ID = client_id - self._store_client_id(client_id) - return - raise ExtractorError('Unable to extract client id') - - def _download_json(self, *args, **kwargs): - non_fatal = kwargs.get('fatal') is False - if non_fatal: - del kwargs['fatal'] - query = kwargs.get('query', {}).copy() - for _ in range(2): - query['client_id'] = self._CLIENT_ID - kwargs['query'] = query - try: - return super(SoundcloudIE, self)._download_json(*args, **compat_kwargs(kwargs)) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 403): - self._store_client_id(None) - self._update_client_id() - continue - elif non_fatal: - self.report_warning(error_to_compat_str(e)) - return False - raise - - def _real_initialize(self): - self._CLIENT_ID = self._downloader.cache.load('soundcloud', 'client_id') or 'a3e059563d7fd3372b49b37f00a00bcf' - self._login() - - _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36' - _API_AUTH_QUERY_TEMPLATE = '?client_id=%s' - _API_AUTH_URL_PW = 'https://api-auth.soundcloud.com/web-auth/sign-in/password%s' - _API_VERIFY_AUTH_TOKEN = 'https://api-auth.soundcloud.com/connect/session%s' - _access_token = None - _HEADERS = {} - _NETRC_MACHINE = 'soundcloud' - - def _login(self): - username, password = self._get_login_info() - if username is None: - return - - if username == 'oauth' and password is not None: - self._access_token = password - query = self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID - payload = {'session': {'access_token': self._access_token}} - token_verification = sanitized_Request(self._API_VERIFY_AUTH_TOKEN % query, json.dumps(payload).encode('utf-8')) - response = self._download_json(token_verification, None, note='Verifying login token...', fatal=False) - if response is not False: - self._HEADERS = {'Authorization': 'OAuth ' + self._access_token} - self.report_login() - else: - self.report_warning('Provided authorization token seems to be invalid. Continue as guest') - elif username is not None: - self.report_warning( - 'Login using username and password is not currently supported. 
' - 'Use "--user oauth --password <oauth_token>" to login using an oauth token') - - r''' - def genDevId(): - def genNumBlock(): - return ''.join([str(random.randrange(10)) for i in range(6)]) - return '-'.join([genNumBlock() for i in range(4)]) - - payload = { - 'client_id': self._CLIENT_ID, - 'recaptcha_pubkey': 'null', - 'recaptcha_response': 'null', - 'credentials': { - 'identifier': username, - 'password': password - }, - 'signature': self.sign(username, password, self._CLIENT_ID), - 'device_id': genDevId(), - 'user_agent': self._USER_AGENT - } - - query = self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID - login = sanitized_Request(self._API_AUTH_URL_PW % query, json.dumps(payload).encode('utf-8')) - response = self._download_json(login, None) - self._access_token = response.get('session').get('access_token') - if not self._access_token: - self.report_warning('Unable to get access token, login may has failed') - else: - self._HEADERS = {'Authorization': 'OAuth ' + self._access_token} - ''' - - # signature generation - def sign(self, user, pw, clid): - a = 33 - i = 1 - s = 440123 - w = 117 - u = 1800000 - l = 1042 - b = 37 - k = 37 - c = 5 - n = '0763ed7314c69015fd4a0dc16bbf4b90' # _KEY - y = '8' # _REV - r = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36' # _USER_AGENT - e = user # _USERNAME - t = clid # _CLIENT_ID - - d = '-'.join([str(mInt) for mInt in [a, i, s, w, u, l, b, k]]) - p = n + y + d + r + e + t + d + n - h = p - - m = 8011470 - f = 0 - - for f in range(f, len(h)): - m = (m >> 1) + ((1 & m) << 23) - m += ord(h[f]) - m &= 16777215 - - # c is not even needed - out = str(y) + ':' + str(d) + ':' + format(m, 'x') + ':' + str(c) - - return out - - @classmethod - def _resolv_url(cls, url): - return SoundcloudIE._API_V2_BASE + 'resolve?url=' + url - def _extract_info_dict(self, info, full_title=None, secret_token=None): track_id = compat_str(info['id']) title = info['title'] @@ -581,7 +579,7 @@ class SoundcloudIE(InfoExtractor): return self._extract_info_dict(info, full_title, token) -class SoundcloudPlaylistBaseIE(SoundcloudIE): +class SoundcloudPlaylistBaseIE(SoundcloudBaseIE): def _extract_set(self, playlist, token=None): playlist_id = compat_str(playlist['id']) tracks = playlist.get('tracks') or [] @@ -654,7 +652,7 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE): return self._extract_set(info, token) -class SoundcloudPagedPlaylistBaseIE(SoundcloudIE): +class SoundcloudPagedPlaylistBaseIE(SoundcloudBaseIE): def _extract_playlist(self, base_url, playlist_id, playlist_title): return { '_type': 'playlist', @@ -824,6 +822,54 @@ class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE): track_id, 'Track station: %s' % track['title']) +class SoundcloudRelatedIE(SoundcloudPagedPlaylistBaseIE): + _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<slug>[\w\d-]+/[\w\d-]+)/(?P<relation>albums|sets|recommended)' + IE_NAME = 'soundcloud:related' + _TESTS = [{ + 'url': 'https://soundcloud.com/wajang/sexapil-pingers-5/recommended', + 'info_dict': { + 'id': '1084577272', + 'title': 'Sexapil - Pingers 5 (Recommended)', + }, + 'playlist_mincount': 50, + }, { + 'url': 'https://soundcloud.com/wajang/sexapil-pingers-5/albums', + 'info_dict': { + 'id': '1084577272', + 'title': 'Sexapil - Pingers 5 (Albums)', + }, + 'playlist_mincount': 1, + }, { + 'url': 'https://soundcloud.com/wajang/sexapil-pingers-5/sets', + 'info_dict': { + 'id': '1084577272', + 'title': 'Sexapil - Pingers 5 (Sets)', + }, + 'playlist_mincount': 
4, + }] + + _BASE_URL_MAP = { + 'albums': 'tracks/%s/albums', + 'sets': 'tracks/%s/playlists_without_albums', + 'recommended': 'tracks/%s/related', + } + + def _real_extract(self, url): + slug, relation = self._match_valid_url(url).group('slug', 'relation') + + track = self._download_json( + self._resolv_url(self._BASE_URL + slug), + slug, 'Downloading track info', headers=self._HEADERS) + + if track.get('errors'): + raise ExtractorError(f'{self.IE_NAME} said: %s' % ','.join( + str(err['error_message']) for err in track['errors']), expected=True) + + return self._extract_playlist( + self._API_V2_BASE + self._BASE_URL_MAP[relation] % track['id'], str(track['id']), + '%s (%s)' % (track.get('title') or slug, relation.capitalize())) + + class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE): _VALID_URL = r'https?://api(?:-v2)?\.soundcloud\.com/playlists/(?P<id>[0-9]+)(?:/?\?secret_token=(?P<token>[^&]+?))?$' IE_NAME = 'soundcloud:playlist' @@ -853,10 +899,10 @@ class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE): return self._extract_set(data, token) -class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE): +class SoundcloudSearchIE(SoundcloudBaseIE, SearchInfoExtractor): IE_NAME = 'soundcloud:search' IE_DESC = 'Soundcloud search' - _MAX_RESULTS = float('inf') + _SEARCH_KEY = 'scsearch' _TESTS = [{ 'url': 'scsearch15:post-avant jazzcore', 'info_dict': { @@ -865,7 +911,6 @@ class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE): 'playlist_count': 15, }] - _SEARCH_KEY = 'scsearch' _MAX_RESULTS_PER_PAGE = 200 _DEFAULT_RESULTS_PER_PAGE = 50 @@ -894,5 +939,6 @@ class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE): break def _get_n_results(self, query, n): - tracks = self._get_collection('search/tracks', query, limit=n, q=query) - return self.playlist_result(tracks, query, query) + return self.playlist_result(itertools.islice( + self._get_collection('search/tracks', query, limit=n, q=query), + 0, None if n == float('inf') else n), query, query) diff --git a/hypervideo_dl/extractor/southpark.py b/hypervideo_dl/extractor/southpark.py index d497494..942a52d 100644 --- a/hypervideo_dl/extractor/southpark.py +++ b/hypervideo_dl/extractor/southpark.py @@ -6,19 +6,18 @@ from .mtv import MTVServicesInfoExtractor class SouthParkIE(MTVServicesInfoExtractor): IE_NAME = 'southpark.cc.com' - _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark(?:\.cc|studios)\.com/(?:clips|(?:full-)?episodes|collections)/(?P<id>.+?)(\?|#|$))' + _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark(?:\.cc|studios)\.com/((?:video-)?clips|(?:full-)?episodes|collections)/(?P<id>.+?)(\?|#|$))' _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed' _TESTS = [{ - 'url': 'http://southpark.cc.com/clips/104437/bat-daded#tab=featured', + 'url': 'https://southpark.cc.com/video-clips/d7wr06/south-park-you-all-agreed-to-counseling', 'info_dict': { - 'id': 'a7bff6c2-ed00-11e0-aca6-0026b9414f30', 'ext': 'mp4', - 'title': 'South Park|Bat Daded', - 'description': 'Randy disqualifies South Park by getting into a fight with Bat Dad.', - 'timestamp': 1112760000, - 'upload_date': '20050406', + 'title': 'You All Agreed to Counseling', + 'description': 'Kenny, Cartman, Stan, and Kyle visit Mr. Mackey and ask for his help getting Mrs. Nelson to come back. Mr. 
Mackey reveals the only way to get things back to normal is to get the teachers vaccinated.', + 'timestamp': 1615352400, + 'upload_date': '20210310', }, }, { 'url': 'http://southpark.cc.com/collections/7758/fan-favorites/1', @@ -40,11 +39,11 @@ class SouthParkIE(MTVServicesInfoExtractor): class SouthParkEsIE(SouthParkIE): IE_NAME = 'southpark.cc.com:español' - _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.cc\.com/episodios-en-espanol/(?P<id>.+?)(\?|#|$))' + _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.cc\.com/es/episodios/(?P<id>.+?)(\?|#|$))' _LANG = 'es' _TESTS = [{ - 'url': 'http://southpark.cc.com/episodios-en-espanol/s01e01-cartman-consigue-una-sonda-anal#source=351c1323-0b96-402d-a8b9-40d01b2e9bde&position=1&sort=!airdate', + 'url': 'http://southpark.cc.com/es/episodios/s01e01-cartman-consigue-una-sonda-anal#source=351c1323-0b96-402d-a8b9-40d01b2e9bde&position=1&sort=!airdate', 'info_dict': { 'title': 'Cartman Consigue Una Sonda Anal', 'description': 'Cartman Consigue Una Sonda Anal', diff --git a/hypervideo_dl/extractor/sovietscloset.py b/hypervideo_dl/extractor/sovietscloset.py index 7df2375..4bc2263 100644 --- a/hypervideo_dl/extractor/sovietscloset.py +++ b/hypervideo_dl/extractor/sovietscloset.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - js_to_json, try_get, unified_timestamp ) @@ -14,17 +13,7 @@ class SovietsClosetBaseIE(InfoExtractor): def parse_nuxt_jsonp(self, nuxt_jsonp_url, video_id, name): nuxt_jsonp = self._download_webpage(nuxt_jsonp_url, video_id, note=f'Downloading {name} __NUXT_JSONP__') - js, arg_keys, arg_vals = self._search_regex( - r'__NUXT_JSONP__\(.*?\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)', - nuxt_jsonp, '__NUXT_JSONP__', group=['js', 'arg_keys', 'arg_vals']) - - args = dict(zip(arg_keys.split(','), arg_vals.split(','))) - - for key, val in args.items(): - if val in ('undefined', 'void 0'): - args[key] = 'null' - - return self._parse_json(js_to_json(js, args), video_id)['data'][0] + return self._search_nuxt_data(nuxt_jsonp, video_id, '__NUXT_JSONP__') def video_meta(self, video_id, game_name, category_name, episode_number, stream_date): title = game_name @@ -78,6 +67,7 @@ class SovietsClosetIE(SovietsClosetBaseIE): 'series': 'The Witcher', 'season': 'Misc', 'episode_number': 13, + 'episode': 'Episode 13', }, }, { @@ -103,6 +93,7 @@ class SovietsClosetIE(SovietsClosetBaseIE): 'series': 'Arma 3', 'season': 'Zeus Games', 'episode_number': 3, + 'episode': 'Episode 3', }, }, ] diff --git a/hypervideo_dl/extractor/spiegel.py b/hypervideo_dl/extractor/spiegel.py index 2da32b9..58f2ed3 100644 --- a/hypervideo_dl/extractor/spiegel.py +++ b/hypervideo_dl/extractor/spiegel.py @@ -7,7 +7,7 @@ from .jwplatform import JWPlatformIE class SpiegelIE(InfoExtractor): _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' - _VALID_URL = r'https?://(?:www\.)?(?:spiegel|manager-magazin)\.de(?:/[^/]+)+/[^/]*-(?P<id>[0-9]+|%s)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$' % _UUID_RE + _VALID_URL = r'https?://(?:www\.)?(?:spiegel|manager-magazin)\.de(?:/[^/]+)+/[^/]*-(?P<id>[0-9]+|%s)(?:-embed|-iframe)?(?:\.html)?(?:$|[#?])' % _UUID_RE _TESTS = [{ 'url': 'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html', 'md5': '50c7948883ec85a3e431a0a44b7ad1d6', diff --git a/hypervideo_dl/extractor/sportdeutschland.py b/hypervideo_dl/extractor/sportdeutschland.py index 94bcaba..15b488a 100644 --- 
a/hypervideo_dl/extractor/sportdeutschland.py +++ b/hypervideo_dl/extractor/sportdeutschland.py @@ -59,12 +59,8 @@ class SportDeutschlandIE(InfoExtractor): videos = asset.get('videos') or [] if len(videos) > 1: playlist_id = parse_qs(url).get('playlistId', [None])[0] - if playlist_id: - if self.get_param('noplaylist'): - videos = [videos[int(playlist_id)]] - self.to_screen('Downloading just a single video because of --no-playlist') - else: - self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % asset_id) + if not self._yes_playlist(playlist_id, asset_id): + videos = [videos[int(playlist_id)]] def entries(): for i, video in enumerate(videos, 1): diff --git a/hypervideo_dl/extractor/srgssr.py b/hypervideo_dl/extractor/srgssr.py index cbc1c47..f991981 100644 --- a/hypervideo_dl/extractor/srgssr.py +++ b/hypervideo_dl/extractor/srgssr.py @@ -7,6 +7,7 @@ from ..utils import ( ExtractorError, float_or_none, int_or_none, + join_nonempty, parse_iso8601, qualities, try_get, @@ -94,11 +95,7 @@ class SRGSSRIE(InfoExtractor): continue protocol = source.get('protocol') quality = source.get('quality') - format_id = [] - for e in (protocol, source.get('encoding'), quality): - if e: - format_id.append(e) - format_id = '-'.join(format_id) + format_id = join_nonempty(protocol, source.get('encoding'), quality) if protocol in ('HDS', 'HLS'): if source.get('tokenType') == 'AKAMAI': diff --git a/hypervideo_dl/extractor/steam.py b/hypervideo_dl/extractor/steam.py index 7f777c4..4ed0fb5 100644 --- a/hypervideo_dl/extractor/steam.py +++ b/hypervideo_dl/extractor/steam.py @@ -7,14 +7,13 @@ from ..utils import ( extract_attributes, ExtractorError, get_element_by_class, - js_to_json, ) class SteamIE(InfoExtractor): _VALID_URL = r"""(?x) - https?://store\.steampowered\.com/ - (agecheck/)? + https?://(?:store\.steampowered|steamcommunity)\.com/ + (?:agecheck/)? (?P<urltype>video|app)/ #If the page is only for videos or for a game (?P<gameID>\d+)/? (?P<videoID>\d*)(?P<extra>\??) 
# For urltype == video we sometimes get the videoID @@ -27,21 +26,24 @@ class SteamIE(InfoExtractor): 'url': 'http://store.steampowered.com/video/105600/', 'playlist': [ { - 'md5': '6a294ee0c4b1f47f5bb76a65e31e3592', + 'md5': '695242613303ffa2a4c44c9374ddc067', 'info_dict': { - 'id': '2040428', + 'id': '256785003', 'ext': 'mp4', - 'title': 'Terraria 1.3 Trailer', - 'playlist_index': 1, + 'title': 'Terraria video 256785003', + 'thumbnail': r're:^https://cdn\.[^\.]+\.steamstatic\.com', + 'n_entries': 2, } }, { - 'md5': '911672b20064ca3263fa89650ba5a7aa', + 'md5': '6a294ee0c4b1f47f5bb76a65e31e3592', 'info_dict': { - 'id': '2029566', + 'id': '2040428', 'ext': 'mp4', - 'title': 'Terraria 1.2 Trailer', + 'title': 'Terraria video 2040428', 'playlist_index': 2, + 'thumbnail': r're:^https://cdn\.[^\.]+\.steamstatic\.com', + 'n_entries': 2, } } ], @@ -53,96 +55,76 @@ class SteamIE(InfoExtractor): 'playlistend': 2, } }, { - 'url': 'http://steamcommunity.com/sharedfiles/filedetails/?id=242472205', + 'url': 'https://store.steampowered.com/app/271590/Grand_Theft_Auto_V/', 'info_dict': { - 'id': 'X8kpJBlzD2E', + 'id': '256757115', + 'title': 'Grand Theft Auto V video 256757115', 'ext': 'mp4', - 'upload_date': '20140617', - 'title': 'FRONTIERS - Trapping', - 'description': 'md5:bf6f7f773def614054089e5769c12a6e', - 'uploader': 'AAD Productions', - 'uploader_id': 'AtomicAgeDogGames', - } + 'thumbnail': r're:^https://cdn\.[^\.]+\.steamstatic\.com', + 'n_entries': 20, + }, }] def _real_extract(self, url): m = self._match_valid_url(url) fileID = m.group('fileID') if fileID: - videourl = url + video_url = url playlist_id = fileID else: gameID = m.group('gameID') playlist_id = gameID - videourl = self._VIDEO_PAGE_TEMPLATE % playlist_id + video_url = self._VIDEO_PAGE_TEMPLATE % playlist_id - self._set_cookie('steampowered.com', 'mature_content', '1') + self._set_cookie('steampowered.com', 'wants_mature_content', '1') + self._set_cookie('steampowered.com', 'birthtime', '944006401') + self._set_cookie('steampowered.com', 'lastagecheckage', '1-0-2000') - webpage = self._download_webpage(videourl, playlist_id) + webpage = self._download_webpage(video_url, playlist_id) - if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None: - videourl = self._AGECHECK_TEMPLATE % playlist_id + if re.search('<div[^>]+>Please enter your birth date to continue:</div>', webpage) is not None: + video_url = self._AGECHECK_TEMPLATE % playlist_id self.report_age_confirmation() - webpage = self._download_webpage(videourl, playlist_id) - - flash_vars = self._parse_json(self._search_regex( - r'(?s)rgMovieFlashvars\s*=\s*({.+?});', webpage, - 'flash vars'), playlist_id, js_to_json) + webpage = self._download_webpage(video_url, playlist_id) - playlist_title = None + videos = re.findall(r'(<div[^>]+id=[\'"]highlight_movie_(\d+)[\'"][^>]+>)', webpage) entries = [] - if fileID: - playlist_title = get_element_by_class('workshopItemTitle', webpage) - for movie in flash_vars.values(): - if not movie: - continue - youtube_id = movie.get('YOUTUBE_VIDEO_ID') - if not youtube_id: - continue + playlist_title = get_element_by_class('apphub_AppName', webpage) + for movie, movie_id in videos: + if not movie: + continue + movie = extract_attributes(movie) + if not movie_id: + continue + entry = { + 'id': movie_id, + 'title': f'{playlist_title} video {movie_id}', + } + formats = [] + if movie: + entry['thumbnail'] = movie.get('data-poster') + for quality in ('', '-hd'): + for ext in ('webm', 'mp4'): + video_url = 
movie.get('data-%s%s-source' % (ext, quality)) + if video_url: + formats.append({ + 'format_id': ext + quality, + 'url': video_url, + }) + self._sort_formats(formats) + entry['formats'] = formats + entries.append(entry) + embedded_videos = re.findall(r'(<iframe[^>]+>)', webpage) + for evideos in embedded_videos: + evideos = extract_attributes(evideos).get('src') + video_id = self._search_regex(r'youtube\.com/embed/([0-9A-Za-z_-]{11})', evideos, 'youtube_video_id', default=None) + if video_id: entries.append({ - '_type': 'url', - 'url': youtube_id, + '_type': 'url_transparent', + 'id': video_id, + 'url': video_id, 'ie_key': 'Youtube', }) - else: - playlist_title = get_element_by_class('apphub_AppName', webpage) - for movie_id, movie in flash_vars.items(): - if not movie: - continue - video_id = self._search_regex(r'movie_(\d+)', movie_id, 'video id', fatal=False) - title = movie.get('MOVIE_NAME') - if not title or not video_id: - continue - entry = { - 'id': video_id, - 'title': title.replace('+', ' '), - } - formats = [] - flv_url = movie.get('FILENAME') - if flv_url: - formats.append({ - 'format_id': 'flv', - 'url': flv_url, - }) - highlight_element = self._search_regex( - r'(<div[^>]+id="highlight_movie_%s"[^>]+>)' % video_id, - webpage, 'highlight element', fatal=False) - if highlight_element: - highlight_attribs = extract_attributes(highlight_element) - if highlight_attribs: - entry['thumbnail'] = highlight_attribs.get('data-poster') - for quality in ('', '-hd'): - for ext in ('webm', 'mp4'): - video_url = highlight_attribs.get('data-%s%s-source' % (ext, quality)) - if video_url: - formats.append({ - 'format_id': ext + quality, - 'url': video_url, - }) - if not formats and not self.get_param('ignore_no_formats'): - continue - entry['formats'] = formats - entries.append(entry) if not entries: raise ExtractorError('Could not find any videos') diff --git a/hypervideo_dl/extractor/storyfire.py b/hypervideo_dl/extractor/storyfire.py index 9c69862..e18a59a 100644 --- a/hypervideo_dl/extractor/storyfire.py +++ b/hypervideo_dl/extractor/storyfire.py @@ -5,7 +5,7 @@ import functools from .common import InfoExtractor from ..utils import ( - # HEADRequest, + format_field, int_or_none, OnDemandPagedList, smuggle_url, @@ -26,18 +26,6 @@ class StoryFireBaseIE(InfoExtractor): r'https?://player\.vimeo\.com/external/(\d+)', video['vimeoVideoURL'], 'vimeo id') - # video_url = self._request_webpage( - # HEADRequest(video['vimeoVideoURL']), video_id).geturl() - # formats = [] - # for v_url, suffix in [(video_url, '_sep'), (video_url.replace('/sep/video/', '/video/'), '')]: - # formats.extend(self._extract_m3u8_formats( - # v_url, video_id, 'mp4', 'm3u8_native', - # m3u8_id='hls' + suffix, fatal=False)) - # formats.extend(self._extract_mpd_formats( - # v_url.replace('.m3u8', '.mpd'), video_id, - # mpd_id='dash' + suffix, fatal=False)) - # self._sort_formats(formats) - uploader_id = video.get('hostID') return { @@ -51,7 +39,6 @@ class StoryFireBaseIE(InfoExtractor): 'Referer': 'https://storyfire.com/', } }), - # 'formats': formats, 'thumbnail': video.get('storyImage'), 'view_count': int_or_none(video.get('views')), 'like_count': int_or_none(video.get('likesCount')), @@ -60,7 +47,7 @@ class StoryFireBaseIE(InfoExtractor): 'timestamp': int_or_none(video.get('publishDate')), 'uploader': video.get('username'), 'uploader_id': uploader_id, - 'uploader_url': 'https://storyfire.com/user/%s/video' % uploader_id if uploader_id else None, + 'uploader_url': format_field(uploader_id, 
template='https://storyfire.com/user/%s/video'), 'episode_number': int_or_none(video.get('episodeNumber') or video.get('episode_number')), } diff --git a/hypervideo_dl/extractor/streamcz.py b/hypervideo_dl/extractor/streamcz.py index 58e0b4c..4cb9923 100644 --- a/hypervideo_dl/extractor/streamcz.py +++ b/hypervideo_dl/extractor/streamcz.py @@ -1,105 +1,124 @@ # coding: utf-8 -from __future__ import unicode_literals - -import hashlib -import time +import json from .common import InfoExtractor from ..utils import ( + float_or_none, int_or_none, - sanitized_Request, + parse_codecs, + traverse_obj, + urljoin, ) -def _get_api_key(api_path): - if api_path.endswith('?'): - api_path = api_path[:-1] - - api_key = 'fb5f58a820353bd7095de526253c14fd' - a = '{0:}{1:}{2:}'.format(api_key, api_path, int(round(time.time() / 24 / 3600))) - return hashlib.md5(a.encode('ascii')).hexdigest() - - class StreamCZIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?stream\.cz/.+/(?P<id>[0-9]+)' - _API_URL = 'http://www.stream.cz/API' - + _VALID_URL = r'https?://(?:www\.)?(?:stream|televizeseznam)\.cz/[^?#]+/(?P<display_id>[^?#]+)-(?P<id>[0-9]+)' _TESTS = [{ - 'url': 'http://www.stream.cz/peklonataliri/765767-ecka-pro-deti', - 'md5': '934bb6a6d220d99c010783c9719960d5', + 'url': 'https://www.televizeseznam.cz/video/lajna/buh-57953890', + 'md5': '40c41ade1464a390a0b447e333df4239', 'info_dict': { - 'id': '765767', + 'id': '57953890', 'ext': 'mp4', - 'title': 'Peklo na talíři: Éčka pro děti', - 'description': 'Taška s grónskou pomazánkou a další pekelnosti ZDE', - 'thumbnail': 're:^http://im.stream.cz/episode/52961d7e19d423f8f06f0100', - 'duration': 256, - }, + 'title': 'Bůh', + 'display_id': 'buh', + 'description': 'md5:8f5f09b9b7bc67df910486cdd88f7165', + 'duration': 1369.6, + 'view_count': int, + } }, { - 'url': 'http://www.stream.cz/blanik/10002447-tri-roky-pro-mazanka', - 'md5': '849a88c1e1ca47d41403c2ba5e59e261', + 'url': 'https://www.stream.cz/kdo-to-mluvi/kdo-to-mluvi-velke-odhaleni-prinasi-novy-porad-uz-od-25-srpna-64087937', + 'md5': '41fd358000086a1ccdb068c77809b158', 'info_dict': { - 'id': '10002447', + 'id': '64087937', 'ext': 'mp4', - 'title': 'Kancelář Blaník: Tři roky pro Mazánka', - 'description': 'md5:3862a00ba7bf0b3e44806b544032c859', - 'thumbnail': 're:^http://im.stream.cz/episode/537f838c50c11f8d21320000', - 'duration': 368, - }, + 'title': 'Kdo to mluví? Velké odhalení přináší nový pořad už od 25. srpna', + 'display_id': 'kdo-to-mluvi-velke-odhaleni-prinasi-novy-porad-uz-od-25-srpna', + 'description': 'md5:97a811000a6460266029d6c1c2ebcd59', + 'duration': 50.2, + 'view_count': int, + } + }, { + 'url': 'https://www.stream.cz/tajemno/znicehonic-jim-skrz-strechu-prolitnul-zahadny-predmet-badatele-vse-objasnili-64147267', + 'md5': '3ee4d0be040e8f4a543e67e509d55e3f', + 'info_dict': { + 'id': '64147267', + 'ext': 'mp4', + 'title': 'Zničehonic jim skrz střechu prolítnul záhadný předmět. 
Badatelé vše objasnili', + 'display_id': 'znicehonic-jim-skrz-strechu-prolitnul-zahadny-predmet-badatele-vse-objasnili', + 'description': 'md5:4b8ada6718d34bb011c4e04ca4bc19bf', + 'duration': 442.84, + 'view_count': int, + } }] - def _real_extract(self, url): - video_id = self._match_id(url) - api_path = '/episode/%s' % video_id - - req = sanitized_Request(self._API_URL + api_path) - req.add_header('Api-Password', _get_api_key(api_path)) - data = self._download_json(req, video_id) + def _extract_formats(self, spl_url, video): + for ext, pref, streams in ( + ('ts', -1, traverse_obj(video, ('http_stream', 'qualities'))), + ('mp4', 1, video.get('mp4'))): + for format_id, stream in streams.items(): + if not stream.get('url'): + continue + yield { + 'format_id': f'{format_id}-{ext}', + 'ext': ext, + 'source_preference': pref, + 'url': urljoin(spl_url, stream['url']), + 'tbr': float_or_none(stream.get('bandwidth'), scale=1000), + 'duration': float_or_none(stream.get('duration'), scale=1000), + 'width': traverse_obj(stream, ('resolution', 0)), + 'height': traverse_obj(stream, ('resolution', 1)) or int_or_none(format_id.replace('p', '')), + **parse_codecs(stream.get('codec')), + } - formats = [] - for quality, video in enumerate(data['video_qualities']): - for f in video['formats']: - typ = f['type'].partition('/')[2] - qlabel = video.get('quality_label') - formats.append({ - 'format_note': '%s-%s' % (qlabel, typ) if qlabel else typ, - 'format_id': '%s-%s' % (typ, f['quality']), - 'url': f['source'], - 'height': int_or_none(f['quality'].rstrip('p')), - 'quality': quality, - }) - self._sort_formats(formats) + def _real_extract(self, url): + display_id, video_id = self._match_valid_url(url).groups() - image = data.get('image') - if image: - thumbnail = self._proto_relative_url( - image.replace('{width}', '1240').replace('{height}', '697'), - scheme='http:', - ) - else: - thumbnail = None + data = self._download_json( + 'https://www.televizeseznam.cz/api/graphql', video_id, 'Downloading GraphQL result', + data=json.dumps({ + 'variables': {'urlName': video_id}, + 'query': ''' + query LoadEpisode($urlName : String){ episode(urlName: $urlName){ ...VideoDetailFragmentOnEpisode } } + fragment VideoDetailFragmentOnEpisode on Episode { + id + spl + urlName + name + perex + duration + views + }''' + }).encode('utf-8'), + headers={'Content-Type': 'application/json;charset=UTF-8'} + )['data']['episode'] - stream = data.get('_embedded', {}).get('stream:show', {}).get('name') - if stream: - title = '%s: %s' % (stream, data['name']) - else: - title = data['name'] + spl_url = data['spl'] + 'spl2,3' + metadata = self._download_json(spl_url, video_id, 'Downloading playlist') + if 'Location' in metadata and 'data' not in metadata: + spl_url = metadata['Location'] + metadata = self._download_json(spl_url, video_id, 'Downloading redirected playlist') + video = metadata['data'] subtitles = {} - srt_url = data.get('subtitles_srt') - if srt_url: - subtitles['cs'] = [{ - 'ext': 'srt', - 'url': srt_url, - }] + for subs in video.get('subtitles', {}).values(): + if not subs.get('language'): + continue + for ext, sub_url in subs.get('urls').items(): + subtitles.setdefault(subs['language'], []).append({ + 'ext': ext, + 'url': urljoin(spl_url, sub_url) + }) + + formats = list(self._extract_formats(spl_url, video)) + self._sort_formats(formats) return { 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'formats': formats, - 'description': data.get('web_site_text'), - 'duration': int_or_none(data.get('duration')), + 
'display_id': display_id, + 'title': data.get('name'), + 'description': data.get('perex'), + 'duration': float_or_none(data.get('duration')), 'view_count': int_or_none(data.get('views')), + 'formats': formats, 'subtitles': subtitles, } diff --git a/hypervideo_dl/extractor/streamff.py b/hypervideo_dl/extractor/streamff.py new file mode 100644 index 0000000..6b190bb --- /dev/null +++ b/hypervideo_dl/extractor/streamff.py @@ -0,0 +1,31 @@ +# coding: utf-8 +from .common import InfoExtractor +from ..utils import int_or_none, parse_iso8601 + + +class StreamFFIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?streamff\.com/v/(?P<id>[a-zA-Z0-9]+)' + + _TESTS = [{ + 'url': 'https://streamff.com/v/55cc94', + 'md5': '8745a67bb5e5c570738efe7983826370', + 'info_dict': { + 'id': '55cc94', + 'ext': 'mp4', + 'title': '55cc94', + 'timestamp': 1634764643, + 'upload_date': '20211020', + 'view_count': int, + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + json_data = self._download_json(f'https://streamff.com/api/videos/{video_id}', video_id) + return { + 'id': video_id, + 'title': json_data.get('name') or video_id, + 'url': 'https://streamff.com/%s' % json_data['videoLink'], + 'view_count': int_or_none(json_data.get('views')), + 'timestamp': parse_iso8601(json_data.get('date')), + } diff --git a/hypervideo_dl/extractor/stripchat.py b/hypervideo_dl/extractor/stripchat.py new file mode 100644 index 0000000..0d4a0ce --- /dev/null +++ b/hypervideo_dl/extractor/stripchat.py @@ -0,0 +1,66 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_str, +) +from ..utils import ( + ExtractorError, + lowercase_escape, + try_get, +) + + +class StripchatIE(InfoExtractor): + _VALID_URL = r'https?://stripchat\.com/(?P<id>[0-9A-Za-z-_]+)' + _TESTS = [{ + 'url': 'https://stripchat.com/feel_me', + 'info_dict': { + 'id': 'feel_me', + 'ext': 'mp4', + 'title': 're:^feel_me [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': str, + 'is_live': True, + 'age_limit': 18, + }, + 'skip': 'Room is offline', + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage( + 'https://stripchat.com/%s/' % video_id, video_id, + headers=self.geo_verification_headers()) + + data = self._parse_json( + self._search_regex( + r'<script\b[^>]*>\s*window\.__PRELOADED_STATE__\s*=(?P<value>.*?)<\/script>', + webpage, 'data', default='{}', group='value'), + video_id, transform_source=lowercase_escape, fatal=False) + if not data: + raise ExtractorError('Unable to find configuration for stream.') + + if try_get(data, lambda x: x['viewCam']['show'], dict): + raise ExtractorError('Model is in private show', expected=True) + elif not try_get(data, lambda x: x['viewCam']['model']['isLive'], bool): + raise ExtractorError('Model is offline', expected=True) + + server = try_get(data, lambda x: x['viewCam']['viewServers']['flashphoner-hls'], compat_str) + host = try_get(data, lambda x: x['config']['data']['hlsStreamHost'], compat_str) + model_id = try_get(data, lambda x: x['viewCam']['model']['id'], int) + + formats = self._extract_m3u8_formats( + 'https://b-%s.%s/hls/%d/%d.m3u8' % (server, host, model_id, model_id), + video_id, ext='mp4', m3u8_id='hls', fatal=False, live=True) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_id, + 'description': self._og_search_description(webpage), + 'is_live': True, + 'formats': formats, + # Stripchat declares the RTA meta-tag, but in an 
unconventional format so _rta_search() can't be used + 'age_limit': 18, + } diff --git a/hypervideo_dl/extractor/stv.py b/hypervideo_dl/extractor/stv.py index d36a4b6..ba5661d 100644 --- a/hypervideo_dl/extractor/stv.py +++ b/hypervideo_dl/extractor/stv.py @@ -45,10 +45,7 @@ class STVPlayerIE(InfoExtractor): ptype, video_id = self._match_valid_url(url).groups() webpage = self._download_webpage(url, video_id, fatal=False) or '' - props = (self._parse_json(self._search_regex( - r'<script[^>]+id="__NEXT_DATA__"[^>]*>({.+?})</script>', - webpage, 'next data', default='{}'), video_id, - fatal=False) or {}).get('props') or {} + props = self._search_nextjs_data(webpage, video_id, default='{}').get('props') or {} player_api_cache = try_get( props, lambda x: x['initialReduxState']['playerApiCache']) or {} diff --git a/hypervideo_dl/extractor/sunporno.py b/hypervideo_dl/extractor/sunporno.py index 6805116..59b77bf 100644 --- a/hypervideo_dl/extractor/sunporno.py +++ b/hypervideo_dl/extractor/sunporno.py @@ -36,8 +36,7 @@ class SunPornoIE(InfoExtractor): webpage = self._download_webpage( 'http://www.sunporno.com/videos/%s' % video_id, video_id) - title = self._html_search_regex( - r'<title>([^<]+)</title>', webpage, 'title') + title = self._html_extract_title(webpage) description = self._html_search_meta( 'description', webpage, 'description') thumbnail = self._html_search_regex( diff --git a/hypervideo_dl/extractor/svt.py b/hypervideo_dl/extractor/svt.py index 38e0086..8ca62e3 100644 --- a/hypervideo_dl/extractor/svt.py +++ b/hypervideo_dl/extractor/svt.py @@ -23,23 +23,27 @@ class SVTBaseIE(InfoExtractor): is_live = dict_get(video_info, ('live', 'simulcast'), default=False) m3u8_protocol = 'm3u8' if is_live else 'm3u8_native' formats = [] + subtitles = {} for vr in video_info['videoReferences']: player_type = vr.get('playerType') or vr.get('format') vurl = vr['url'] ext = determine_ext(vurl) if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( + fmts, subs = self._extract_m3u8_formats_and_subtitles( vurl, video_id, ext='mp4', entry_protocol=m3u8_protocol, - m3u8_id=player_type, fatal=False)) + m3u8_id=player_type, fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) elif ext == 'f4m': formats.extend(self._extract_f4m_formats( vurl + '?hdcore=3.3.0', video_id, f4m_id=player_type, fatal=False)) elif ext == 'mpd': - if player_type == 'dashhbbtv': - formats.extend(self._extract_mpd_formats( - vurl, video_id, mpd_id=player_type, fatal=False)) + fmts, subs = self._extract_mpd_formats_and_subtitles( + vurl, video_id, mpd_id=player_type, fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) else: formats.append({ 'format_id': player_type, @@ -52,18 +56,19 @@ class SVTBaseIE(InfoExtractor): countries=self._GEO_COUNTRIES, metadata_available=True) self._sort_formats(formats) - subtitles = {} subtitle_references = dict_get(video_info, ('subtitles', 'subtitleReferences')) if isinstance(subtitle_references, list): for sr in subtitle_references: subtitle_url = sr.get('url') subtitle_lang = sr.get('language', 'sv') if subtitle_url: + sub = { + 'url': subtitle_url, + } if determine_ext(subtitle_url) == 'm3u8': - # TODO(yan12125): handle WebVTT in m3u8 manifests - continue - - subtitles.setdefault(subtitle_lang, []).append({'url': subtitle_url}) + # XXX: no way of testing, is it ever hit?
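+ # presumably the m3u8 subtitle reference resolves to WebVTT segments, so tag it as vtt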
+ sub['ext'] = 'vtt' + subtitles.setdefault(subtitle_lang, []).append(sub) title = video_info.get('title') @@ -168,7 +173,6 @@ class SVTPlayIE(SVTPlayBaseIE): }, }, 'params': { - 'format': 'bestvideo', # skip for now due to download test asserts that segment is > 10000 bytes and svt uses # init segments that are smaller # AssertionError: Expected test_SVTPlay_jNwpV9P.mp4 to be at least 9.77KiB, but it's only 864.00B @@ -204,10 +208,6 @@ class SVTPlayIE(SVTPlayBaseIE): 'only_matching': True, }] - def _adjust_title(self, info): - if info['is_live']: - info['title'] = self._live_title(info['title']) - def _extract_by_video_id(self, video_id, webpage=None): data = self._download_json( 'https://api.svt.se/videoplayer-api/video/%s' % video_id, @@ -221,7 +221,6 @@ class SVTPlayIE(SVTPlayBaseIE): if not title: title = video_id info_dict['title'] = title - self._adjust_title(info_dict) return info_dict def _real_extract(self, url): @@ -252,7 +251,6 @@ class SVTPlayIE(SVTPlayBaseIE): 'title': data['context']['dispatcher']['stores']['MetaStore']['title'], 'thumbnail': thumbnail, }) - self._adjust_title(info_dict) return info_dict svt_id = try_get( diff --git a/hypervideo_dl/extractor/tagesschau.py b/hypervideo_dl/extractor/tagesschau.py index 25c2004..6e03d0a 100644 --- a/hypervideo_dl/extractor/tagesschau.py +++ b/hypervideo_dl/extractor/tagesschau.py @@ -5,177 +5,63 @@ import re from .common import InfoExtractor from ..utils import ( - determine_ext, js_to_json, - parse_iso8601, - parse_filesize, + extract_attributes, + try_get, + int_or_none, ) -class TagesschauPlayerIE(InfoExtractor): - IE_NAME = 'tagesschau:player' - _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?P<kind>audio|video)/(?P=kind)-(?P<id>\d+)~player(?:_[^/?#&]+)?\.html' - - _TESTS = [{ - 'url': 'http://www.tagesschau.de/multimedia/video/video-179517~player.html', - 'md5': '8d09548d5c15debad38bee3a4d15ca21', - 'info_dict': { - 'id': '179517', - 'ext': 'mp4', - 'title': 'Marie Kristin Boese, ARD Berlin, über den zukünftigen Kurs der AfD', - 'thumbnail': r're:^https?:.*\.jpg$', - 'formats': 'mincount:6', - }, - }, { - 'url': 'https://www.tagesschau.de/multimedia/audio/audio-29417~player.html', - 'md5': '76e6eec6ebd40740671cf0a2c88617e5', - 'info_dict': { - 'id': '29417', - 'ext': 'mp3', - 'title': 'Trabi - Bye, bye Rennpappe', - 'thumbnail': r're:^https?:.*\.jpg$', - 'formats': 'mincount:2', - }, - }, { - 'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417~player_autoplay-true.html', - 'only_matching': True, - }] - - _FORMATS = { - 'xs': {'quality': 0}, - 's': {'width': 320, 'height': 180, 'quality': 1}, - 'm': {'width': 512, 'height': 288, 'quality': 2}, - 'l': {'width': 960, 'height': 540, 'quality': 3}, - 'xl': {'width': 1280, 'height': 720, 'quality': 4}, - 'xxl': {'quality': 5}, - } - - def _extract_via_api(self, kind, video_id): - info = self._download_json( - 'https://www.tagesschau.de/api/multimedia/{0}/{0}-{1}.json'.format(kind, video_id), - video_id) - title = info['headline'] - formats = [] - for media in info['mediadata']: - for format_id, format_url in media.items(): - if determine_ext(format_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls')) - else: - formats.append({ - 'url': format_url, - 'format_id': format_id, - 'vcodec': 'none' if kind == 'audio' else None, - }) - self._sort_formats(formats) - timestamp = parse_iso8601(info.get('date')) - return { - 'id': video_id, - 'title': title, - 'timestamp': 
timestamp, - 'formats': formats, - } - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') - - # kind = mobj.group('kind').lower() - # if kind == 'video': - # return self._extract_via_api(kind, video_id) - - # JSON api does not provide some audio formats (e.g. ogg) thus - # extracting audio via webpage - - webpage = self._download_webpage(url, video_id) - - title = self._og_search_title(webpage).strip() - formats = [] - - for media_json in re.findall(r'({src\s*:\s*["\']http[^}]+type\s*:[^}]+})', webpage): - media = self._parse_json(js_to_json(media_json), video_id, fatal=False) - if not media: - continue - src = media.get('src') - if not src: - return - quality = media.get('quality') - kind = media.get('type', '').split('/')[0] - ext = determine_ext(src) - f = { - 'url': src, - 'format_id': '%s_%s' % (quality, ext) if quality else ext, - 'ext': ext, - 'vcodec': 'none' if kind == 'audio' else None, - } - f.update(self._FORMATS.get(quality, {})) - formats.append(f) - - self._sort_formats(formats) - - thumbnail = self._og_search_thumbnail(webpage) - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'formats': formats, - } - - class TagesschauIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/(?P<path>[^/]+/(?:[^/]+/)*?(?P<id>[^/#?]+?(?:-?[0-9]+)?))(?:~_?[^/#?]+?)?\.html' _TESTS = [{ 'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html', - 'md5': 'f7c27a0eff3bfe8c7727e65f8fe1b1e6', + 'md5': '7a7287612fa881a1ae1d087df45c2fd6', 'info_dict': { - 'id': 'video-102143', + 'id': 'video-102143-1', 'ext': 'mp4', 'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt', - 'description': '18.07.2015 20:10 Uhr', - 'thumbnail': r're:^https?:.*\.jpg$', }, }, { 'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html', 'md5': '3c54c1f6243d279b706bde660ceec633', 'info_dict': { - 'id': 'ts-5727', + 'id': 'ts-5727-1', 'ext': 'mp4', - 'title': 'Sendung: tagesschau \t04.12.2014 20:00 Uhr', - 'description': 'md5:695c01bfd98b7e313c501386327aea59', - 'thumbnail': r're:^https?:.*\.jpg$', + 'title': 'Ganze Sendung', }, }, { # exclusive audio 'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417.html', - 'md5': '76e6eec6ebd40740671cf0a2c88617e5', + 'md5': '4cf22023c285f35e99c24d290ba58cc9', 'info_dict': { - 'id': 'audio-29417', + 'id': 'audio-29417-1', 'ext': 'mp3', - 'title': 'Trabi - Bye, bye Rennpappe', - 'description': 'md5:8687dda862cbbe2cfb2df09b56341317', - 'thumbnail': r're:^https?:.*\.jpg$', + 'title': 'Brasilianischer Präsident Bolsonaro unter Druck: Corona-Bericht wird vorgestellt', }, }, { - # audio in article 'url': 'http://www.tagesschau.de/inland/bnd-303.html', - 'md5': 'e0916c623e85fc1d2b26b78f299d3958', + 'md5': '12cfb212d9325b5ba0d52b625f1aa61c', 'info_dict': { - 'id': 'bnd-303', - 'ext': 'mp3', - 'title': 'Viele Baustellen für neuen BND-Chef', - 'description': 'md5:1e69a54be3e1255b2b07cdbce5bcd8b4', - 'thumbnail': r're:^https?:.*\.jpg$', + 'id': 'bnd-303-1', + 'ext': 'mp4', + 'title': 'SPD-Gruppenbild mit Bärbel Bas nach der Fraktionssitzung | dpa', }, }, { 'url': 'http://www.tagesschau.de/inland/afd-parteitag-135.html', 'info_dict': { 'id': 'afd-parteitag-135', - 'title': 'Möchtegern-Underdog mit Machtanspruch', + 'title': 'AfD', + }, + 'playlist_count': 20, + }, { + 'url': 'https://www.tagesschau.de/multimedia/audio/audio-29417~player.html', + 'info_dict': { + 'id': 'audio-29417-1', + 'ext': 'mp3', + 'title': 'Brasilianischer Präsident Bolsonaro unter Druck: 
Corona-Bericht wird vorgestellt', }, - 'playlist_count': 2, }, { 'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html', 'only_matching': True, @@ -206,62 +92,6 @@ class TagesschauIE(InfoExtractor): 'only_matching': True, }] - @classmethod - def suitable(cls, url): - return False if TagesschauPlayerIE.suitable(url) else super(TagesschauIE, cls).suitable(url) - - def _extract_formats(self, download_text, media_kind): - links = re.finditer( - r'<div class="button" title="(?P<title>[^"]*)"><a href="(?P<url>[^"]+)">(?P<name>.+?)</a></div>', - download_text) - formats = [] - for l in links: - link_url = l.group('url') - if not link_url: - continue - format_id = self._search_regex( - r'.*/[^/.]+\.([^/]+)\.[^/.]+$', link_url, 'format ID', - default=determine_ext(link_url)) - format = { - 'format_id': format_id, - 'url': l.group('url'), - 'format_name': l.group('name'), - } - title = l.group('title') - if title: - if media_kind.lower() == 'video': - m = re.match( - r'''(?x) - Video:\s*(?P<vcodec>[a-zA-Z0-9/._-]+)\s*&\#10; - (?P<width>[0-9]+)x(?P<height>[0-9]+)px&\#10; - (?P<vbr>[0-9]+)kbps&\#10; - Audio:\s*(?P<abr>[0-9]+)kbps,\s*(?P<audio_desc>[A-Za-z\.0-9]+)&\#10; - Größe:\s*(?P<filesize_approx>[0-9.,]+\s+[a-zA-Z]*B)''', - title) - if m: - format.update({ - 'format_note': m.group('audio_desc'), - 'vcodec': m.group('vcodec'), - 'width': int(m.group('width')), - 'height': int(m.group('height')), - 'abr': int(m.group('abr')), - 'vbr': int(m.group('vbr')), - 'filesize_approx': parse_filesize(m.group('filesize_approx')), - }) - else: - m = re.match( - r'(?P<format>.+?)-Format\s*:\s*(?P<abr>\d+)kbps\s*,\s*(?P<note>.+)', - title) - if m: - format.update({ - 'format_note': '%s, %s' % (m.group('format'), m.group('note')), - 'vcodec': 'none', - 'abr': int(m.group('abr')), - }) - formats.append(format) - self._sort_formats(formats) - return formats - def _real_extract(self, url): mobj = self._match_valid_url(url) video_id = mobj.group('id') or mobj.group('path') @@ -271,34 +101,46 @@ class TagesschauIE(InfoExtractor): title = self._html_search_regex( r'<span[^>]*class="headline"[^>]*>(.+?)</span>', - webpage, 'title', default=None) or self._og_search_title(webpage) - - DOWNLOAD_REGEX = r'(?s)<p>Wir bieten dieses (?P<kind>Video|Audio) in folgenden Formaten zum Download an:</p>\s*<div class="controls">(?P<links>.*?)</div>\s*<p>' - - webpage_type = self._og_search_property('type', webpage, default=None) - if webpage_type == 'website': # Article - entries = [] - for num, (entry_title, media_kind, download_text) in enumerate(re.findall( - r'(?s)<p[^>]+class="infotext"[^>]*>\s*(?:<a[^>]+>)?\s*<strong>(.+?)</strong>.*?</p>.*?%s' % DOWNLOAD_REGEX, - webpage), 1): + webpage, 'title', default=None) or self._og_search_title(webpage, fatal=False) + + entries = [] + videos = re.findall(r'<div[^>]+>', webpage) + num = 0 + for video in videos: + video = extract_attributes(video).get('data-config') + if not video: + continue + video = self._parse_json(video, video_id, transform_source=js_to_json, fatal=False) + video_formats = try_get(video, lambda x: x['mc']['_mediaArray'][0]['_mediaStreamArray']) + if not video_formats: + continue + num += 1 + for video_format in video_formats: + media_url = video_format.get('_stream') or '' + formats = [] + if media_url.endswith('master.m3u8'): + formats = self._extract_m3u8_formats(media_url, video_id, 'mp4', m3u8_id='hls') + elif media_url.endswith('.hi.mp3') and media_url.startswith('https://download'): + formats = [{ + 'url': media_url, + 'vcodec': 'none', + }] + if not 
formats: + continue entries.append({ 'id': '%s-%d' % (display_id, num), - 'title': '%s' % entry_title, - 'formats': self._extract_formats(download_text, media_kind), + 'title': try_get(video, lambda x: x['mc']['_title']), + 'duration': int_or_none(try_get(video, lambda x: x['mc']['_duration'])), + 'formats': formats }) - if len(entries) > 1: - return self.playlist_result(entries, display_id, title) - formats = entries[0]['formats'] - else: # Assume single video - download_text = self._search_regex( - DOWNLOAD_REGEX, webpage, 'download links', group='links') - media_kind = self._search_regex( - DOWNLOAD_REGEX, webpage, 'media kind', default='Video', group='kind') - formats = self._extract_formats(download_text, media_kind) - thumbnail = self._og_search_thumbnail(webpage) - description = self._html_search_regex( - r'(?s)<p class="teasertext">(.*?)</p>', - webpage, 'description', default=None) + if len(entries) > 1: + return self.playlist_result(entries, display_id, title) + formats = entries[0]['formats'] + video_info = self._search_json_ld(webpage, video_id) + description = video_info.get('description') + thumbnail = self._og_search_thumbnail(webpage) or video_info.get('thumbnail') + timestamp = video_info.get('timestamp') + title = title or video_info.get('description') self._sort_formats(formats) @@ -307,5 +149,6 @@ class TagesschauIE(InfoExtractor): 'title': title, 'thumbnail': thumbnail, 'formats': formats, + 'timestamp': timestamp, 'description': description, } diff --git a/hypervideo_dl/extractor/teachable.py b/hypervideo_dl/extractor/teachable.py index 37eae82..232eaa5 100644 --- a/hypervideo_dl/extractor/teachable.py +++ b/hypervideo_dl/extractor/teachable.py @@ -40,8 +40,7 @@ class TeachableBaseIE(InfoExtractor): if self._logged_in: return - username, password = self._get_login_info( - netrc_machine=self._SITES.get(site, site)) + username, password = self._get_login_info(netrc_machine=self._SITES.get(site, site)) if username is None: return diff --git a/hypervideo_dl/extractor/teamtreehouse.py b/hypervideo_dl/extractor/teamtreehouse.py index d347e97..64522ec 100644 --- a/hypervideo_dl/extractor/teamtreehouse.py +++ b/hypervideo_dl/extractor/teamtreehouse.py @@ -51,17 +51,14 @@ class TeamTreeHouseIE(InfoExtractor): }] _NETRC_MACHINE = 'teamtreehouse' - def _real_initialize(self): - email, password = self._get_login_info() - if email is None: - return + def _perform_login(self, username, password): signin_page = self._download_webpage( 'https://teamtreehouse.com/signin', None, 'Downloading signin page') data = self._form_hidden_inputs('new_user_session', signin_page) data.update({ - 'user_session[email]': email, + 'user_session[email]': username, 'user_session[password]': password, }) error_message = get_element_by_class('error-message', self._download_webpage( diff --git a/hypervideo_dl/extractor/ted.py b/hypervideo_dl/extractor/ted.py index f09f1a3..b5c7e35 100644 --- a/hypervideo_dl/extractor/ted.py +++ b/hypervideo_dl/extractor/ted.py @@ -1,274 +1,105 @@ -from __future__ import unicode_literals - -import json +import itertools import re from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urlparse -) from ..utils import ( - extract_attributes, - float_or_none, int_or_none, + str_to_int, try_get, url_or_none, + unified_strdate, + parse_duration, ) -class TEDIE(InfoExtractor): - IE_NAME = 'ted' - _VALID_URL = r'''(?x) - (?P<proto>https?://) - (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/ - ( - (?P<type_playlist>playlists(?:/(?P<playlist_id>\d+))?) 
# We have a playlist - | - ((?P<type_talk>talks)) # We have a simple talk - | - (?P<type_watch>watch)/[^/]+/[^/]+ - ) - (/lang/(.*?))? # The url may contain the language - /(?P<name>[\w-]+) # Here goes the name and then ".html" - .*)$ - ''' +class TedBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://www\.ted\.com/(?:{type})(?:/lang/[^/#?]+)?/(?P<id>[\w-]+)' + + def _parse_playlist(self, playlist): + for entry in try_get(playlist, lambda x: x['videos']['nodes'], list): + if entry.get('__typename') == 'Video' and entry.get('canonicalUrl'): + yield self.url_result(entry['canonicalUrl'], TedTalkIE.ie_key()) + + +class TedTalkIE(TedBaseIE): + _VALID_URL = TedBaseIE._VALID_URL_BASE.format(type='talks') _TESTS = [{ - 'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html', - 'md5': 'b0ce2b05ca215042124fbc9e3886493a', - 'info_dict': { - 'id': '102', - 'ext': 'mp4', - 'title': 'The illusion of consciousness', - 'description': ('Philosopher Dan Dennett makes a compelling ' - 'argument that not only don\'t we understand our own ' - 'consciousness, but that half the time our brains are ' - 'actively fooling us.'), - 'uploader': 'Dan Dennett', - 'width': 853, - 'duration': 1308, - 'view_count': int, - 'comment_count': int, - 'tags': list, - }, - 'params': { - 'skip_download': True, - }, - }, { - # missing HTTP bitrates - 'url': 'https://www.ted.com/talks/vishal_sikka_the_beauty_and_power_of_algorithms', - 'info_dict': { - 'id': '6069', - 'ext': 'mp4', - 'title': 'The beauty and power of algorithms', - 'thumbnail': r're:^https?://.+\.jpg', - 'description': 'md5:734e352710fb00d840ab87ae31aaf688', - 'uploader': 'Vishal Sikka', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best', - 'md5': 'e6b9617c01a7970ceac8bb2c92c346c0', - 'info_dict': { - 'id': '1972', - 'ext': 'mp4', - 'title': 'Be passionate. Be courageous. 
Be your best.', - 'uploader': 'Gabby Giffords and Mark Kelly', - 'description': 'md5:5174aed4d0f16021b704120360f72b92', - 'duration': 1128, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://www.ted.com/playlists/who_are_the_hackers', - 'info_dict': { - 'id': '10', - 'title': 'Who are the hackers?', - 'description': 'md5:49a0dbe8fb76d81a0e64b4a80af7f15a' - }, - 'playlist_mincount': 6, - }, { - # contains a youtube video - 'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything', - 'add_ie': ['Youtube'], - 'info_dict': { - 'id': '_ZG8HBuDjgc', - 'ext': 'webm', - 'title': 'Douglas Adams: Parrots the Universe and Everything', - 'description': 'md5:01ad1e199c49ac640cb1196c0e9016af', - 'uploader': 'University of California Television (UCTV)', - 'uploader_id': 'UCtelevision', - 'upload_date': '20080522', - }, - 'params': { - 'skip_download': True, - }, - }, { - # no nativeDownloads - 'url': 'https://www.ted.com/talks/tom_thum_the_orchestra_in_my_mouth', + 'url': 'https://www.ted.com/talks/candace_parker_how_to_break_down_barriers_and_not_accept_limits', + 'md5': '47e82c666d9c3261d4fe74748a90aada', 'info_dict': { - 'id': '1792', + 'id': '86532', 'ext': 'mp4', - 'title': 'The orchestra in my mouth', - 'description': 'md5:5d1d78650e2f8dfcbb8ebee2951ac29a', - 'uploader': 'Tom Thum', + 'title': 'How to break down barriers and not accept limits', + 'description': 'md5:000707cece219d1e165b11550d612331', 'view_count': int, - 'comment_count': int, - 'tags': list, + 'tags': ['personal growth', 'equality', 'activism', 'motivation', 'social change', 'sports'], + 'uploader': 'Candace Parker', + 'duration': 676.0, + 'upload_date': '20220114', + 'release_date': '20211201', + 'thumbnail': r're:http.*\.jpg', }, - 'params': { - 'skip_download': True, - }, - }, { - # with own formats and private Youtube external - 'url': 'https://www.ted.com/talks/spencer_wells_a_family_tree_for_humanity', - 'only_matching': True, }] - _NATIVE_FORMATS = { - 'low': {'width': 320, 'height': 180}, - 'medium': {'width': 512, 'height': 288}, - 'high': {'width': 854, 'height': 480}, - } - - def _extract_info(self, webpage): - info_json = self._search_regex( - r'(?s)q\(\s*"\w+.init"\s*,\s*({.+?})\)\s*</script>', - webpage, 'info json') - return json.loads(info_json) - def _real_extract(self, url): - m = re.match(self._VALID_URL, url, re.VERBOSE) - if m.group('type').startswith('embed'): - desktop_url = m.group('proto') + 'www' + m.group('urlmain') - return self.url_result(desktop_url, 'TED') - name = m.group('name') - if m.group('type_talk'): - return self._talk_info(url, name) - elif m.group('type_watch'): - return self._watch_info(url, name) - else: - return self._playlist_videos_info(url, name) - - def _playlist_videos_info(self, url, name): - '''Returns the videos of the playlist''' - - webpage = self._download_webpage(url, name, - 'Downloading playlist webpage') - - playlist_entries = [] - for entry in re.findall(r'(?s)<[^>]+data-ga-context=["\']playlist["\'][^>]*>', webpage): - attrs = extract_attributes(entry) - entry_url = compat_urlparse.urljoin(url, attrs['href']) - playlist_entries.append(self.url_result(entry_url, self.ie_key())) - - final_url = self._og_search_url(webpage, fatal=False) - playlist_id = ( - re.match(self._VALID_URL, final_url).group('playlist_id') - if final_url else None) - - return self.playlist_result( - playlist_entries, playlist_id=playlist_id, - playlist_title=self._og_search_title(webpage, fatal=False), - 
playlist_description=self._og_search_description(webpage)) - - def _talk_info(self, url, video_name): - webpage = self._download_webpage(url, video_name) - - info = self._extract_info(webpage) - - data = try_get(info, lambda x: x['__INITIAL_DATA__'], dict) or info - talk_info = data['talks'][0] - - title = talk_info['title'].strip() - - downloads = talk_info.get('downloads') or {} - native_downloads = downloads.get('nativeDownloads') or talk_info.get('nativeDownloads') or {} - - formats = [{ - 'url': format_url, - 'format_id': format_id, - } for (format_id, format_url) in native_downloads.items() if format_url is not None] - - subtitled_downloads = downloads.get('subtitledDownloads') or {} - for lang, subtitled_download in subtitled_downloads.items(): - for q in self._NATIVE_FORMATS: - q_url = subtitled_download.get(q) - if not q_url: - continue - formats.append({ - 'url': q_url, - 'format_id': '%s-%s' % (q, lang), - 'language': lang, - }) - - if formats: - for f in formats: - finfo = self._NATIVE_FORMATS.get(f['format_id'].split('-')[0]) - if finfo: - f.update(finfo) - - player_talk = talk_info['player_talks'][0] - - resources_ = player_talk.get('resources') or talk_info.get('resources') + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + talk_info = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['videoData'] + video_id = talk_info['id'] + playerData = self._parse_json(talk_info.get('playerData'), video_id) http_url = None - for format_id, resources in resources_.items(): + formats, subtitles = [], {} + for format_id, resources in (playerData.get('resources') or {}).items(): if format_id == 'hls': - if not isinstance(resources, dict): - continue - stream_url = url_or_none(resources.get('stream')) + stream_url = url_or_none(try_get(resources, lambda x: x['stream'])) if not stream_url: continue - formats.extend(self._extract_m3u8_formats( - stream_url, video_name, 'mp4', m3u8_id=format_id, - fatal=False)) - else: - if not isinstance(resources, list): - continue - if format_id == 'h264': - for resource in resources: - h264_url = resource.get('file') - if not h264_url: - continue - bitrate = int_or_none(resource.get('bitrate')) - formats.append({ - 'url': h264_url, - 'format_id': '%s-%sk' % (format_id, bitrate), - 'tbr': bitrate, - }) - if re.search(r'\d+k', h264_url): - http_url = h264_url - elif format_id == 'rtmp': - streamer = talk_info.get('streamer') - if not streamer: + m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles( + stream_url, video_id, 'mp4', m3u8_id=format_id, fatal=False) + formats.extend(m3u8_formats) + subtitles = self._merge_subtitles(subtitles, m3u8_subs) + continue + + if not isinstance(resources, list): + continue + if format_id == 'h264': + for resource in resources: + h264_url = resource.get('file') + if not h264_url: continue - for resource in resources: - formats.append({ - 'format_id': '%s-%s' % (format_id, resource.get('name')), - 'url': streamer, - 'play_path': resource['file'], - 'ext': 'flv', - 'width': int_or_none(resource.get('width')), - 'height': int_or_none(resource.get('height')), - 'tbr': int_or_none(resource.get('bitrate')), - }) + bitrate = int_or_none(resource.get('bitrate')) + formats.append({ + 'url': h264_url, + 'format_id': '%s-%sk' % (format_id, bitrate), + 'tbr': bitrate, + }) + if re.search(r'\d+k', h264_url): + http_url = h264_url + elif format_id == 'rtmp': + streamer = talk_info.get('streamer') + if not streamer: + continue + formats.extend({ + 'format_id': '%s-%s' % 
(format_id, resource.get('name')), + 'url': streamer, + 'play_path': resource['file'], + 'ext': 'flv', + 'width': int_or_none(resource.get('width')), + 'height': int_or_none(resource.get('height')), + 'tbr': int_or_none(resource.get('bitrate')), + } for resource in resources if resource.get('file')) - m3u8_formats = list(filter( - lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none', - formats)) if http_url: + m3u8_formats = [f for f in formats if f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none'] for m3u8_format in m3u8_formats: bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None) if not bitrate: continue bitrate_url = re.sub(r'\d+k', bitrate, http_url) if not self._is_valid_url( - bitrate_url, video_name, '%s bitrate' % bitrate): + bitrate_url, video_id, '%s bitrate' % bitrate): continue f = m3u8_format.copy() f.update({ @@ -289,79 +120,123 @@ class TEDIE(InfoExtractor): }) if not formats: - external = player_talk.get('external') - if isinstance(external, dict): - service = external.get('service') - if isinstance(service, compat_str): - ext_url = None - if service.lower() == 'youtube': - ext_url = external.get('code') - return self.url_result(ext_url or external['uri']) + external = playerData.get('external') or {} + service = external.get('service') or '' + ext_url = external.get('code') if service.lower() == 'youtube' else None + return self.url_result(ext_url or external['uri']) self._sort_formats(formats) - video_id = compat_str(talk_info['id']) + thumbnail = playerData.get('thumb') or self._og_search_property('image', webpage) + if thumbnail: + # trim thumbnail resize parameters + thumbnail = thumbnail.split('?')[0] return { 'id': video_id, - 'title': title, - 'uploader': player_talk.get('speaker') or talk_info.get('speaker'), - 'thumbnail': player_talk.get('thumb') or talk_info.get('thumb'), - 'description': self._og_search_description(webpage), - 'subtitles': self._get_subtitles(video_id, talk_info), + 'title': talk_info.get('title') or self._og_search_title(webpage), + 'uploader': talk_info.get('presenterDisplayName'), + 'thumbnail': thumbnail, + 'description': talk_info.get('description') or self._og_search_description(webpage), + 'subtitles': subtitles, 'formats': formats, - 'duration': float_or_none(talk_info.get('duration')), - 'view_count': int_or_none(data.get('viewed_count')), - 'comment_count': int_or_none( - try_get(data, lambda x: x['comments']['count'])), - 'tags': try_get(talk_info, lambda x: x['tags'], list), + 'duration': talk_info.get('duration') or parse_duration(self._og_search_property('video:duration', webpage)), + 'view_count': str_to_int(talk_info.get('viewedCount')), + 'upload_date': unified_strdate(talk_info.get('publishedAt')), + 'release_date': unified_strdate(talk_info.get('recordedOn')), + 'tags': try_get(playerData, lambda x: x['targeting']['tag'].split(',')), } - def _get_subtitles(self, video_id, talk_info): - sub_lang_list = {} - for language in try_get( - talk_info, - (lambda x: x['downloads']['languages'], - lambda x: x['languages']), list): - lang_code = language.get('languageCode') or language.get('ianaCode') - if not lang_code: - continue - sub_lang_list[lang_code] = [ - { - 'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, lang_code, ext), - 'ext': ext, - } - for ext in ['ted', 'srt'] - ] - return sub_lang_list - def _watch_info(self, url, name): - webpage = self._download_webpage(url, name) +class TedSeriesIE(TedBaseIE): + _VALID_URL = 
fr'{TedBaseIE._VALID_URL_BASE.format(type=r"series")}(?:#season_(?P<season>\d+))?' + _TESTS = [{ + 'url': 'https://www.ted.com/series/small_thing_big_idea', + 'info_dict': { + 'id': '3', + 'title': 'Small Thing Big Idea', + 'series': 'Small Thing Big Idea', + 'description': 'md5:6869ca52cec661aef72b3e9f7441c55c' + }, + 'playlist_mincount': 16, + }, { + 'url': 'https://www.ted.com/series/the_way_we_work#season_2', + 'info_dict': { + 'id': '8_2', + 'title': 'The Way We Work Season 2', + 'series': 'The Way We Work', + 'description': 'md5:59469256e533e1a48c4aa926a382234c', + 'season_number': 2 + }, + 'playlist_mincount': 8, + }] - config_json = self._html_search_regex( - r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>', - webpage, 'config', default=None) - if not config_json: - embed_url = self._search_regex( - r"<iframe[^>]+class='pages-video-embed__video__object'[^>]+src='([^']+)'", webpage, 'embed url') - return self.url_result(self._proto_relative_url(embed_url)) - config = json.loads(config_json)['config'] - video_url = config['video']['url'] - thumbnail = config.get('image', {}).get('url') + def _real_extract(self, url): + display_id, season = self._match_valid_url(url).group('id', 'season') + webpage = self._download_webpage(url, display_id, 'Downloading series webpage') + info = self._search_nextjs_data(webpage, display_id)['props']['pageProps'] - title = self._html_search_regex( - r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title') - description = self._html_search_regex( - [ - r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>', - r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>', - ], - webpage, 'description', fatal=False) + entries = itertools.chain.from_iterable( + self._parse_playlist(s) for s in info['seasons'] if season in [None, s.get('seasonNumber')]) - return { - 'id': name, - 'url': video_url, - 'title': title, - 'thumbnail': thumbnail, - 'description': description, - } + series_id = try_get(info, lambda x: x['series']['id']) + series_name = try_get(info, lambda x: x['series']['name']) or self._og_search_title(webpage, fatal=False) + + return self.playlist_result( + entries, + f'{series_id}_{season}' if season and series_id else series_id, + f'{series_name} Season {season}' if season else series_name, + self._og_search_description(webpage), + series=series_name, season_number=int_or_none(season)) + + +class TedPlaylistIE(TedBaseIE): + _VALID_URL = TedBaseIE._VALID_URL_BASE.format(type=r'playlists(?:/\d+)?') + _TESTS = [{ + 'url': 'https://www.ted.com/playlists/171/the_most_popular_talks_of_all', + 'info_dict': { + 'id': '171', + 'title': 'The most popular talks of all time', + 'description': 'md5:d2f22831dc86c7040e733a3cb3993d78' + }, + 'playlist_mincount': 25, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + playlist = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['playlist'] + + return self.playlist_result( + self._parse_playlist(playlist), playlist.get('id'), + playlist.get('title') or self._og_search_title(webpage, default='').replace(' | TED Talks', '') or None, + self._og_search_description(webpage)) + + +class TedEmbedIE(InfoExtractor): + _VALID_URL = r'https?://embed(?:-ssl)?\.ted\.com/' + + _TESTS = [{ + 'url': 'https://embed.ted.com/talks/janet_stovall_how_to_get_serious_about_diversity_and_inclusion_in_the_workplace', + 'info_dict': { + 'id': '21802', + 'ext': 'mp4', + 'title': 'How to get serious about diversity and inclusion 
in the workplace', + 'description': 'md5:0978aafe396e05341f8ecc795d22189d', + 'view_count': int, + 'tags': list, + 'uploader': 'Janet Stovall', + 'duration': 664.0, + 'upload_date': '20180822', + 'release_date': '20180719', + 'thumbnail': r're:http.*\.jpg', + }, + }] + + @classmethod + def _extract_urls(cls, webpage): + return [mobj.group('url') for mobj in re.finditer( + fr'<iframe[^>]+?src=(["\'])(?P<url>{cls._VALID_URL}.+?)\1', webpage)] + + def _real_extract(self, url): + return self.url_result(re.sub(r'://embed(-ssl)?', '://www', url), TedTalkIE.ie_key()) diff --git a/hypervideo_dl/extractor/tele5.py b/hypervideo_dl/extractor/tele5.py index 0d9cf75..c7beee1 100644 --- a/hypervideo_dl/extractor/tele5.py +++ b/hypervideo_dl/extractor/tele5.py @@ -1,19 +1,15 @@ # coding: utf-8 from __future__ import unicode_literals -import re - -from .common import InfoExtractor -from .jwplatform import JWPlatformIE -from .nexx import NexxIE +from .dplay import DPlayIE +from ..compat import compat_urlparse from ..utils import ( - NO_DEFAULT, - parse_qs, - smuggle_url, + ExtractorError, + extract_attributes, ) -class Tele5IE(InfoExtractor): +class Tele5IE(DPlayIE): _VALID_URL = r'https?://(?:www\.)?tele5\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)' _GEO_COUNTRIES = ['DE'] _TESTS = [{ @@ -28,6 +24,7 @@ class Tele5IE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'No longer available: "404 Seite nicht gefunden"', }, { # jwplatform, nexx unavailable 'url': 'https://www.tele5.de/filme/ghoul-das-geheimnis-des-friedhofmonsters/', @@ -42,7 +39,20 @@ class Tele5IE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': [JWPlatformIE.ie_key()], + 'skip': 'No longer available, redirects to Filme page', + }, { + 'url': 'https://tele5.de/mediathek/angel-of-mine/', + 'info_dict': { + 'id': '1252360', + 'ext': 'mp4', + 'upload_date': '20220109', + 'timestamp': 1641762000, + 'title': 'Angel of Mine', + 'description': 'md5:a72546a175e1286eb3251843a52d1ad7', + }, + 'params': { + 'format': 'bestvideo', + }, }, { 'url': 'https://www.tele5.de/kalkofes-mattscheibe/video-clips/politik-und-gesellschaft?ve_id=1551191', 'only_matching': True, @@ -64,45 +74,18 @@ class Tele5IE(InfoExtractor): }] def _real_extract(self, url): - qs = parse_qs(url) - video_id = (qs.get('vid') or qs.get('ve_id') or [None])[0] - - NEXX_ID_RE = r'\d{6,}' - JWPLATFORM_ID_RE = r'[a-zA-Z0-9]{8}' - - def nexx_result(nexx_id): - return self.url_result( - 'https://api.nexx.cloud/v3/759/videos/byid/%s' % nexx_id, - ie=NexxIE.ie_key(), video_id=nexx_id) - - nexx_id = jwplatform_id = None - - if video_id: - if re.match(NEXX_ID_RE, video_id): - return nexx_result(video_id) - elif re.match(JWPLATFORM_ID_RE, video_id): - jwplatform_id = video_id - - if not nexx_id: - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - def extract_id(pattern, name, default=NO_DEFAULT): - return self._html_search_regex( - (r'id\s*=\s*["\']video-player["\'][^>]+data-id\s*=\s*["\'](%s)' % pattern, - r'\s+id\s*=\s*["\']player_(%s)' % pattern, - r'\bdata-id\s*=\s*["\'](%s)' % pattern), webpage, name, - default=default) - - nexx_id = extract_id(NEXX_ID_RE, 'nexx id', default=None) - if nexx_id: - return nexx_result(nexx_id) - - if not jwplatform_id: - jwplatform_id = extract_id(JWPLATFORM_ID_RE, 'jwplatform id') - - return self.url_result( - smuggle_url( - 'jwplatform:%s' % jwplatform_id, - {'geo_countries': self._GEO_COUNTRIES}), - ie=JWPlatformIE.ie_key(), video_id=jwplatform_id) + video_id = self._match_id(url) + webpage = 
self._download_webpage(url, video_id) + player_element = self._search_regex(r'(<hyoga-player\b[^>]+?>)', webpage, 'video player') + player_info = extract_attributes(player_element) + asset_id, country, realm = (player_info[x] for x in ('assetid', 'locale', 'realm', )) + endpoint = compat_urlparse.urlparse(player_info['endpoint']).hostname + source_type = player_info.get('sourcetype') + if source_type: + endpoint = '%s-%s' % (source_type, endpoint) + try: + return self._get_disco_api_info(url, asset_id, endpoint, realm, country) + except ExtractorError as e: + if getattr(e, 'message', '') == 'Missing deviceId in context': + self.report_drm(video_id) + raise diff --git a/hypervideo_dl/extractor/telebruxelles.py b/hypervideo_dl/extractor/telebruxelles.py index a0353fe..9e8c89b 100644 --- a/hypervideo_dl/extractor/telebruxelles.py +++ b/hypervideo_dl/extractor/telebruxelles.py @@ -69,7 +69,7 @@ class TeleBruxellesIE(InfoExtractor): return { 'id': article_id or display_id, 'display_id': display_id, - 'title': self._live_title(title) if is_live else title, + 'title': title, 'description': description, 'formats': formats, 'is_live': is_live, diff --git a/hypervideo_dl/extractor/telegram.py b/hypervideo_dl/extractor/telegram.py new file mode 100644 index 0000000..2dfa261 --- /dev/null +++ b/hypervideo_dl/extractor/telegram.py @@ -0,0 +1,37 @@ +from .common import InfoExtractor + + +class TelegramEmbedIE(InfoExtractor): + IE_NAME = 'telegram:embed' + _VALID_URL = r'https?://t\.me/(?P<channel_name>[^/]+)/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://t.me/europa_press/613', + 'info_dict': { + 'id': '613', + 'ext': 'mp4', + 'title': 'Europa Press', + 'description': '6ce2d7e8d56eda16d80607b23db7b252', + 'thumbnail': r're:^https?:\/\/cdn.*?telesco\.pe\/file\/\w+', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + webpage_embed = self._download_webpage(f'{url}?embed=1', video_id) + + formats = [{ + 'url': self._proto_relative_url(self._search_regex( + '<video[^>]+src="([^"]+)"', webpage_embed, 'source')), + 'ext': 'mp4', + }] + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': self._html_search_meta(['og:title', 'twitter:title'], webpage, fatal=True), + 'description': self._html_search_meta(['og:description', 'twitter:description'], webpage, fatal=True), + 'thumbnail': self._search_regex(r'tgme_widget_message_video_thumb"[^>]+background-image:url\(\'([^\']+)\'\)', + webpage_embed, 'thumbnail'), + 'formats': formats, + } diff --git a/hypervideo_dl/extractor/telemundo.py b/hypervideo_dl/extractor/telemundo.py index 18552a0..ebcecf5 100644 --- a/hypervideo_dl/extractor/telemundo.py +++ b/hypervideo_dl/extractor/telemundo.py @@ -1,4 +1,4 @@ -# coding=utf-8 +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor @@ -34,8 +34,7 @@ class TelemundoIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - metadata = self._parse_json( - self._search_regex(r'<[^>]+id="__NEXT_DATA__"[^>]+>([^<]+)', webpage, 'JSON metadata'), video_id) + metadata = self._search_nextjs_data(webpage, video_id) redirect_url = try_get( metadata, lambda x: x['props']['initialState']['video']['associatedPlaylists'][0]['videos'][0]['videoAssets'][0]['publicUrl']) diff --git a/hypervideo_dl/extractor/telequebec.py b/hypervideo_dl/extractor/telequebec.py index 800d87b..4bef2fe 100644 --- a/hypervideo_dl/extractor/telequebec.py +++ 
b/hypervideo_dl/extractor/telequebec.py @@ -43,9 +43,6 @@ class TeleQuebecIE(TeleQuebecBaseIE): 'uploader_id': '6150020952001', 'upload_date': '20200512', }, - 'params': { - 'format': 'bestvideo', - }, 'add_ie': ['BrightcoveNew'], }, { 'url': 'https://zonevideo.telequebec.tv/media/55267/le-soleil/passe-partout', @@ -58,9 +55,6 @@ class TeleQuebecIE(TeleQuebecBaseIE): 'upload_date': '20200625', 'timestamp': 1593090307, }, - 'params': { - 'format': 'bestvideo', - }, 'add_ie': ['BrightcoveNew'], }, { # no description @@ -157,9 +151,6 @@ class TeleQuebecEmissionIE(InfoExtractor): 'timestamp': 1588713424, 'uploader_id': '6150020952001', }, - 'params': { - 'format': 'bestvideo', - }, }, { 'url': 'http://bancpublic.telequebec.tv/emissions/emission-49/31986/jeunes-meres-sous-pression', 'only_matching': True, @@ -220,9 +211,6 @@ class TeleQuebecVideoIE(TeleQuebecBaseIE): 'timestamp': 1603115930, 'uploader_id': '6101674910001', }, - 'params': { - 'format': 'bestvideo', - }, }, { 'url': 'https://video.telequebec.tv/player-live/28527', 'only_matching': True, diff --git a/hypervideo_dl/extractor/tennistv.py b/hypervideo_dl/extractor/tennistv.py index a39a2fc..58fdece 100644 --- a/hypervideo_dl/extractor/tennistv.py +++ b/hypervideo_dl/extractor/tennistv.py @@ -30,11 +30,9 @@ class TennisTVIE(InfoExtractor): 'skip': 'Requires email and password of a subscribed account', } _NETRC_MACHINE = 'tennistv' + _session_token = None - def _login(self): - username, password = self._get_login_info() - if not username or not password: - raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True) + def _perform_login(self, username, password): login_form = { 'Email': username, @@ -63,7 +61,8 @@ class TennisTVIE(InfoExtractor): self._session_token = login_result['sessionToken'] def _real_initialize(self): - self._login() + if not self._session_token: + raise self.raise_login_required('Login info is needed for this website', method='password') def _real_extract(self, url): video_id = self._match_id(url) diff --git a/hypervideo_dl/extractor/tenplay.py b/hypervideo_dl/extractor/tenplay.py index c810cfd..5c7b545 100644 --- a/hypervideo_dl/extractor/tenplay.py +++ b/hypervideo_dl/extractor/tenplay.py @@ -7,6 +7,7 @@ import base64 from .common import InfoExtractor from ..utils import ( HEADRequest, + int_or_none, urlencode_postdata, ) @@ -15,6 +16,28 @@ class TenPlayIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?10play\.com\.au/(?:[^/]+/)+(?P<id>tpv\d{6}[a-z]{5})' _NETRC_MACHINE = '10play' _TESTS = [{ + 'url': 'https://10play.com.au/neighbours/web-extras/season-39/nathan-borg-is-the-first-aussie-actor-with-a-cochlear-implant-to-join-neighbours/tpv210128qupwd', + 'info_dict': { + 'id': '6226844312001', + 'ext': 'mp4', + 'title': 'Nathan Borg Is The First Aussie Actor With A Cochlear Implant To Join Neighbours', + 'alt_title': 'Nathan Borg Is The First Aussie Actor With A Cochlear Implant To Join Neighbours', + 'description': 'md5:a02d0199c901c2dd4c796f1e7dd0de43', + 'duration': 186, + 'season': 39, + 'series': 'Neighbours', + 'thumbnail': r're:https://.*\.jpg', + 'uploader': 'Channel 10', + 'age_limit': 15, + 'timestamp': 1611810000, + 'upload_date': '20210128', + 'uploader_id': '2199827728001', + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Only available in Australia', + }, { 'url': 'https://10play.com.au/todd-sampsons-body-hack/episodes/season-4/episode-7/tpv200921kvngh', 'info_dict': { 'id': '6192880312001', @@ -58,16 +81,21 @@ class TenPlayIE(InfoExtractor): 
'email': username, 'password': password, })) - return "Bearer " + data['jwt']['accessToken'] + return 'Bearer ' + data['jwt']['accessToken'] def _real_extract(self, url): content_id = self._match_id(url) - _token = self._get_bearer_token(content_id) data = self._download_json( 'https://10play.com.au/api/v1/videos/' + content_id, content_id) + headers = {} + + if data.get('memberGated') is True: + _token = self._get_bearer_token(content_id) + headers = {'Authorization': _token} + _video_url = self._download_json( data.get('playbackApiEndpoint'), content_id, 'Downloading video JSON', - headers={'Authorization': _token}).get('source') + headers=headers).get('source') m3u8_url = self._request_webpage(HEADRequest( _video_url), content_id).geturl() if '10play-not-in-oz' in m3u8_url: @@ -77,12 +105,16 @@ class TenPlayIE(InfoExtractor): return { 'formats': formats, + 'subtitles': {'en': [{'url': data.get('captionUrl')}]} if data.get('captionUrl') else None, 'id': data.get('altId') or content_id, - 'title': data.get('title'), + 'duration': data.get('duration'), + 'title': data.get('subtitle'), + 'alt_title': data.get('title'), 'description': data.get('description'), 'age_limit': self._AUS_AGES.get(data.get('classification')), - 'series': data.get('showName'), - 'season': data.get('showContentSeason'), + 'series': data.get('tvShow'), + 'season': int_or_none(data.get('season')), + 'episode_number': int_or_none(data.get('episode')), 'timestamp': data.get('published'), 'thumbnail': data.get('imageUrl'), 'uploader': 'Channel 10', diff --git a/hypervideo_dl/extractor/tf1.py b/hypervideo_dl/extractor/tf1.py index 669eb50..44785bc 100644 --- a/hypervideo_dl/extractor/tf1.py +++ b/hypervideo_dl/extractor/tf1.py @@ -29,7 +29,6 @@ class TF1IE(InfoExtractor): 'params': { # Sometimes wat serves the whole file with the --test option 'skip_download': True, - 'format': 'bestvideo', }, }, { 'url': 'http://www.tf1.fr/tf1/koh-lanta/videos/replay-koh-lanta-22-mai-2015.html', diff --git a/hypervideo_dl/extractor/theta.py b/hypervideo_dl/extractor/theta.py index 3b65436..8b6d70a 100644 --- a/hypervideo_dl/extractor/theta.py +++ b/hypervideo_dl/extractor/theta.py @@ -6,7 +6,7 @@ from ..utils import try_get class ThetaStreamIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?theta\.tv/(?!video/)(?P<id>[a-z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?theta\.tv/(?!video/)(?P<id>[a-z0-9-]+)' _TESTS = [{ 'url': 'https://www.theta.tv/davirus', 'skip': 'The live may have ended', @@ -25,6 +25,14 @@ class ThetaStreamIE(InfoExtractor): 'title': 'Mystery Science Theatre 3000 24/7 Powered by the THETA Network.', 'thumbnail': r're:https://user-prod-theta-tv\.imgix\.net/.+\.jpg', } + }, { + 'url': 'https://www.theta.tv/contv-anime', + 'info_dict': { + 'id': 'ConTVAnime', + 'ext': 'mp4', + 'title': 'CONTV ANIME 24/7. 
Powered by THETA Network.', + 'thumbnail': r're:https://user-prod-theta-tv\.imgix\.net/.+\.jpg', + } }] def _real_extract(self, url): diff --git a/hypervideo_dl/extractor/thisav.py b/hypervideo_dl/extractor/thisav.py index 4af286e..6bb00b3 100644 --- a/hypervideo_dl/extractor/thisav.py +++ b/hypervideo_dl/extractor/thisav.py @@ -37,9 +37,7 @@ class ThisAVIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - title = remove_end(self._html_search_regex( - r'<title>([^<]+)</title>', webpage, 'title'), - ' - 視頻 - ThisAV.com-世界第一中文成人娛樂網站') + title = remove_end(self._html_extract_title(webpage), ' - 視頻 - ThisAV.com-世界第一中文成人娛樂網站') video_url = self._html_search_regex( r"addVariable\('file','([^']+)'\);", webpage, 'video url', default=None) if video_url: diff --git a/hypervideo_dl/extractor/thisoldhouse.py b/hypervideo_dl/extractor/thisoldhouse.py index a3d9b40..8a1d173 100644 --- a/hypervideo_dl/extractor/thisoldhouse.py +++ b/hypervideo_dl/extractor/thisoldhouse.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import HEADRequest class ThisOldHouseIE(InfoExtractor): @@ -15,6 +16,11 @@ class ThisOldHouseIE(InfoExtractor): 'description': 'In the workshop, Tom Silva and Kevin O\'Connor build a storage bench for an entryway.', 'timestamp': 1442548800, 'upload_date': '20150918', + 'duration': 674, + 'view_count': int, + 'average_rating': 0, + 'thumbnail': r're:^https?://.*\.jpg\?\d+$', + 'display_id': 'how-to-build-a-storage-bench', }, 'params': { 'skip_download': True, @@ -41,7 +47,12 @@ class ThisOldHouseIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - r'<iframe[^>]+src=[\'"](?:https?:)?//(?:www\.)?thisoldhouse\.(?:chorus\.build|com)/videos/zype/([0-9a-f]{24})', - webpage, 'video id') + if 'To Unlock This content' in webpage: + self.raise_login_required(method='cookies') + video_url = self._search_regex( + r'<iframe[^>]+src=[\'"]((?:https?:)?//(?:www\.)?thisoldhouse\.(?:chorus\.build|com)/videos/zype/([0-9a-f]{24})[^\'"]*)[\'"]', + webpage, 'video url') + if 'subscription_required=true' in video_url or 'c-entry-group-labels__image' in webpage: + return self.url_result(self._request_webpage(HEADRequest(video_url), display_id).geturl(), 'Zype', display_id) + video_id = self._search_regex(r'(?:https?:)?//(?:www\.)?thisoldhouse\.(?:chorus\.build|com)/videos/zype/([0-9a-f]{24})', video_url, 'video id') return self.url_result(self._ZYPE_TMPL % video_id, 'Zype', video_id) diff --git a/hypervideo_dl/extractor/threeqsdn.py b/hypervideo_dl/extractor/threeqsdn.py index bb76103..00a51dc 100644 --- a/hypervideo_dl/extractor/threeqsdn.py +++ b/hypervideo_dl/extractor/threeqsdn.py @@ -9,6 +9,7 @@ from ..utils import ( ExtractorError, float_or_none, int_or_none, + join_nonempty, parse_iso8601, ) @@ -110,8 +111,7 @@ class ThreeQSDNIE(InfoExtractor): subtitles = self._merge_subtitles(subtitles, subs) elif source_type == 'hls': fmts, subs = self._extract_m3u8_formats_and_subtitles( - source, video_id, 'mp4', 'm3u8' if live else 'm3u8_native', - m3u8_id='hls', fatal=False) + source, video_id, 'mp4', live=live, m3u8_id='hls', fatal=False) formats.extend(fmts) subtitles = self._merge_subtitles(subtitles, subs) elif source_type == 'progressive': @@ -119,24 +119,16 @@ class ThreeQSDNIE(InfoExtractor): src = s.get('src') if not (src and self._is_valid_url(src, video_id)): continue - width = None - format_id = 
['http']
                 ext = determine_ext(src)
-                if ext:
-                    format_id.append(ext)
                 height = int_or_none(s.get('height'))
-                if height:
-                    format_id.append('%dp' % height)
-                if aspect:
-                    width = int(height * aspect)
                 formats.append({
                     'ext': ext,
-                    'format_id': '-'.join(format_id),
+                    'format_id': join_nonempty('http', ext, height and '%dp' % height),
                     'height': height,
                     'source_preference': 0,
                     'url': src,
                     'vcodec': 'none' if height == 0 else None,
-                    'width': width,
+                    'width': int(height * aspect) if height and aspect else None,
                 })
         # It seems like this would be correctly handled by default
         # However, unless someone can confirm this, the old
@@ -155,7 +147,7 @@
 
         return {
             'id': video_id,
-            'title': self._live_title(title) if live else title,
+            'title': title,
             'thumbnail': config.get('poster') or None,
             'description': config.get('description') or None,
             'timestamp': parse_iso8601(config.get('upload_date')),
diff --git a/hypervideo_dl/extractor/threespeak.py b/hypervideo_dl/extractor/threespeak.py
new file mode 100644
index 0000000..fe6a955
--- /dev/null
+++ b/hypervideo_dl/extractor/threespeak.py
@@ -0,0 +1,97 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    try_get,
+    unified_strdate,
+)
+
+
+class ThreeSpeakIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?3speak\.tv/watch\?v\=[^/]+/(?P<id>[^/$&#?]+)'
+
+    _TESTS = [{
+        'url': 'https://3speak.tv/watch?v=dannyshine/wjgoxyfy',
+        'info_dict': {
+            'id': 'wjgoxyfy',
+            'ext': 'mp4',
+            'title': 'Can People who took the Vax think Critically',
+            'uploader': 'dannyshine',
+            'description': 'md5:181aa7ccb304afafa089b5af3bca7a10',
+            'tags': ['sex', 'covid', 'antinatalism', 'comedy', 'vaccines'],
+            'thumbnail': 'https://img.3speakcontent.co/wjgoxyfy/thumbnails/default.png',
+            'upload_date': '20211021',
+            'duration': 2703.867833,
+            'filesize': 1620054781,
+        },
+        'params': {'skip_download': True}
+    }]
+
+    def _real_extract(self, url):
+        id = self._match_id(url)
+        webpage = self._download_webpage(url, id)
+        json_str = self._html_search_regex(r'JSON\.parse\(\'([^\']+)\'\)', webpage, 'json')
+        # The json string itself is escaped.
Hence the double parsing + data_json = self._parse_json(self._parse_json(f'"{json_str}"', id), id) + video_json = self._parse_json(data_json['json_metadata'], id) + formats, subtitles = [], {} + og_m3u8 = self._html_search_regex(r'<meta\s?property=\"ogvideo\"\s?content=\"([^\"]+)\">', webpage, 'og m3u8', fatal=False) + if og_m3u8: + https_frmts, https_subs = self._extract_m3u8_formats_and_subtitles(og_m3u8, id, fatal=False, m3u8_id='https') + formats.extend(https_frmts) + subtitles = self._merge_subtitles(subtitles, https_subs) + ipfs_m3u8 = try_get(video_json, lambda x: x['video']['info']['ipfs']) + if ipfs_m3u8: + ipfs_frmts, ipfs_subs = self._extract_m3u8_formats_and_subtitles(f'https://ipfs.3speak.tv/ipfs/{ipfs_m3u8}', + id, fatal=False, m3u8_id='ipfs') + formats.extend(ipfs_frmts) + subtitles = self._merge_subtitles(subtitles, ipfs_subs) + mp4_file = try_get(video_json, lambda x: x['video']['info']['file']) + if mp4_file: + formats.append({ + 'url': f'https://threespeakvideo.b-cdn.net/{id}/{mp4_file}', + 'ext': 'mp4', + 'format_id': 'https-mp4', + 'duration': try_get(video_json, lambda x: x['video']['info']['duration']), + 'filesize': try_get(video_json, lambda x: x['video']['info']['filesize']), + 'quality': 11, + 'format_note': 'Original file', + }) + self._sort_formats(formats) + return { + 'id': id, + 'title': data_json.get('title') or data_json.get('root_title'), + 'uploader': data_json.get('author'), + 'description': try_get(video_json, lambda x: x['video']['content']['description']), + 'tags': try_get(video_json, lambda x: x['video']['content']['tags']), + 'thumbnail': try_get(video_json, lambda x: x['image'][0]), + 'upload_date': unified_strdate(data_json.get('created')), + 'formats': formats, + 'subtitles': subtitles, + } + + +class ThreeSpeakUserIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?3speak\.tv/user/(?P<id>[^/$&?#]+)' + + _TESTS = [{ + 'url': 'https://3speak.tv/user/theycallmedan', + 'info_dict': { + 'id': 'theycallmedan', + }, + 'playlist_mincount': 115, + }] + + def _real_extract(self, url): + id = self._match_id(url) + webpage = self._download_webpage(url, id) + entries = [ + self.url_result( + 'https://3speak.tv/watch?v=%s' % video, + ie=ThreeSpeakIE.ie_key()) + for video in re.findall(r'data-payout\s?\=\s?\"([^\"]+)\"', webpage) if video + ] + return self.playlist_result(entries, id) diff --git a/hypervideo_dl/extractor/tiktok.py b/hypervideo_dl/extractor/tiktok.py index 1db6327..c1d6c54 100644 --- a/hypervideo_dl/extractor/tiktok.py +++ b/hypervideo_dl/extractor/tiktok.py @@ -8,10 +8,18 @@ import time import json from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote +from ..compat import ( + compat_urllib_parse_unquote, + compat_urllib_parse_urlparse +) from ..utils import ( ExtractorError, + HEADRequest, + get_first, int_or_none, + join_nonempty, + LazyList, + srt_subtitles_timecode, str_or_none, traverse_obj, try_get, @@ -21,25 +29,38 @@ from ..utils import ( class TikTokBaseIE(InfoExtractor): - _APP_VERSION = '20.9.3' - _MANIFEST_APP_VERSION = '291' + _APP_VERSIONS = [('20.9.3', '293'), ('20.4.3', '243'), ('20.2.1', '221'), ('20.1.2', '212'), ('20.0.4', '204')] + _WORKING_APP_VERSION = None _APP_NAME = 'trill' _AID = 1180 - _API_HOSTNAME = 'api-t2.tiktokv.com' + _API_HOSTNAME = 'api-h2.tiktokv.com' _UPLOADER_URL_FORMAT = 'https://www.tiktok.com/@%s' - QUALITIES = ('360p', '540p', '720p') + _WEBPAGE_HOST = 'https://www.tiktok.com/' + QUALITIES = ('360p', '540p', '720p', '1080p') + + def _call_api_impl(self, ep, query, 
manifest_app_version, video_id, fatal=True,
+                       note='Downloading API JSON', errnote='Unable to download API page'):
+        self._set_cookie(self._API_HOSTNAME, 'odin_tt', ''.join(random.choice('0123456789abcdef') for _ in range(160)))
+        webpage_cookies = self._get_cookies(self._WEBPAGE_HOST)
+        if webpage_cookies.get('sid_tt'):
+            self._set_cookie(self._API_HOSTNAME, 'sid_tt', webpage_cookies['sid_tt'].value)
+        return self._download_json(
+            'https://%s/aweme/v1/%s/' % (self._API_HOSTNAME, ep), video_id=video_id,
+            fatal=fatal, note=note, errnote=errnote, headers={
+                'User-Agent': f'com.ss.android.ugc.trill/{manifest_app_version} (Linux; U; Android 10; en_US; Pixel 4; Build/QQ3A.200805.001; Cronet/58.0.2991.0)',
+                'Accept': 'application/json',
+            }, query=query)
 
-    def _call_api(self, ep, query, video_id, fatal=True,
-                  note='Downloading API JSON', errnote='Unable to download API page'):
-        real_query = {
+    def _build_api_query(self, query, app_version, manifest_app_version):
+        return {
             **query,
-            'version_name': self._APP_VERSION,
-            'version_code': self._MANIFEST_APP_VERSION,
-            'build_number': self._APP_VERSION,
-            'manifest_version_code': self._MANIFEST_APP_VERSION,
-            'update_version_code': self._MANIFEST_APP_VERSION,
-            'openudid': ''.join(random.choice('0123456789abcdef') for i in range(16)),
-            'uuid': ''.join([random.choice(string.digits) for num in range(16)]),
+            'version_name': app_version,
+            'version_code': manifest_app_version,
+            'build_number': app_version,
+            'manifest_version_code': manifest_app_version,
+            'update_version_code': manifest_app_version,
+            'openudid': ''.join(random.choice('0123456789abcdef') for _ in range(16)),
+            'uuid': ''.join([random.choice(string.digits) for _ in range(16)]),
             '_rticket': int(time.time() * 1000),
             'ts': int(time.time()),
             'device_brand': 'Google',
@@ -66,13 +87,61 @@ class TikTokBaseIE(InfoExtractor):
             'as': 'a1qwert123',
             'cp': 'cbfhckdckkde1',
         }
-        self._set_cookie(self._API_HOSTNAME, 'odin_tt', ''.join(random.choice('0123456789abcdef') for i in range(160)))
-        return self._download_json(
-            'https://%s/aweme/v1/%s/' % (self._API_HOSTNAME, ep), video_id=video_id,
-            fatal=fatal, note=note, errnote=errnote, headers={
-                'User-Agent': f'com.ss.android.ugc.trill/{self._MANIFEST_APP_VERSION} (Linux; U; Android 10; en_US; Pixel 4; Build/QQ3A.200805.001; Cronet/58.0.2991.0)',
-                'Accept': 'application/json',
-            }, query=real_query)
+
+    def _call_api(self, ep, query, video_id, fatal=True,
+                  note='Downloading API JSON', errnote='Unable to download API page'):
+        if not self._WORKING_APP_VERSION:
+            app_version = self._configuration_arg('app_version', [''], ie_key=TikTokIE.ie_key())[0]
+            manifest_app_version = self._configuration_arg('manifest_app_version', [''], ie_key=TikTokIE.ie_key())[0]
+            if app_version and manifest_app_version:
+                self._WORKING_APP_VERSION = (app_version, manifest_app_version)
+                self.write_debug('Imported app version combo from extractor arguments')
+            elif app_version or manifest_app_version:
+                self.report_warning('Only one of the two required version params is passed as an extractor argument', only_once=True)
+
+        if self._WORKING_APP_VERSION:
+            app_version, manifest_app_version = self._WORKING_APP_VERSION
+            real_query = self._build_api_query(query, app_version, manifest_app_version)
+            return self._call_api_impl(ep, real_query, manifest_app_version, video_id, fatal, note, errnote)
+
+        for count, (app_version, manifest_app_version) in enumerate(self._APP_VERSIONS, start=1):
+            real_query = self._build_api_query(query, app_version, manifest_app_version)
+            try:
+ res = self._call_api_impl(ep, real_query, manifest_app_version, video_id, fatal, note, errnote) + self._WORKING_APP_VERSION = (app_version, manifest_app_version) + return res + except ExtractorError as e: + if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0: + if count == len(self._APP_VERSIONS): + if fatal: + raise e + else: + self.report_warning(str(e.cause or e.msg)) + return + self.report_warning('%s. Retrying... (attempt %s of %s)' % (str(e.cause or e.msg), count, len(self._APP_VERSIONS))) + continue + raise e + + def _get_subtitles(self, aweme_detail, aweme_id): + # TODO: Extract text positioning info + subtitles = {} + captions_info = traverse_obj( + aweme_detail, ('interaction_stickers', ..., 'auto_video_caption_info', 'auto_captions', ...), expected_type=dict, default=[]) + for caption in captions_info: + caption_url = traverse_obj(caption, ('url', 'url_list', ...), expected_type=url_or_none, get_all=False) + if not caption_url: + continue + caption_json = self._download_json( + caption_url, aweme_id, note='Downloading captions', errnote='Unable to download captions', fatal=False) + if not caption_json: + continue + subtitles.setdefault(caption.get('language', 'en'), []).append({ + 'ext': 'srt', + 'data': '\n\n'.join( + f'{i + 1}\n{srt_subtitles_timecode(line["start_time"] / 1000)} --> {srt_subtitles_timecode(line["end_time"] / 1000)}\n{line["text"]}' + for i, line in enumerate(caption_json['utterances']) if line.get('text')) + }) + return subtitles def _parse_aweme_video_app(self, aweme_detail): aweme_id = aweme_detail['aweme_id'] @@ -107,8 +176,8 @@ class TikTokBaseIE(InfoExtractor): 'acodec': 'aac', 'source_preference': -2 if 'aweme/v1' in url else -1, # Downloads from API might get blocked **add_meta, **parsed_meta, - 'format_note': ' '.join(filter(None, ( - add_meta.get('format_note'), '(API)' if 'aweme/v1' in url else ''))) + 'format_note': join_nonempty( + add_meta.get('format_note'), '(API)' if 'aweme/v1' in url else None, delim=' ') } for url in addr.get('url_list') or []] # Hack: Add direct video links first to prioritize them when removing duplicate formats @@ -118,7 +187,7 @@ class TikTokBaseIE(InfoExtractor): 'format_id': 'play_addr', 'format_note': 'Direct video', 'vcodec': 'h265' if traverse_obj( - video_info, 'is_bytevc1', 'is_h265') else 'h264', # Always h264? 
+ video_info, 'is_bytevc1', 'is_h265') else 'h264', # TODO: Check for "direct iOS" videos, like https://www.tiktok.com/@cookierun_dev/video/7039716639834656002 'width': video_info.get('width'), 'height': video_info.get('height'), })) @@ -156,6 +225,10 @@ class TikTokBaseIE(InfoExtractor): })) self._remove_duplicate_formats(formats) + auth_cookie = self._get_cookies(self._WEBPAGE_HOST).get('sid_tt') + if auth_cookie: + for f in formats: + self._set_cookie(compat_urllib_parse_urlparse(f['url']).hostname, 'sid_tt', auth_cookie.value) self._sort_formats(formats, ('quality', 'codec', 'size', 'br')) thumbnails = [] @@ -175,6 +248,7 @@ class TikTokBaseIE(InfoExtractor): user_url = self._UPLOADER_URL_FORMAT % (traverse_obj(author_info, 'sec_uid', 'id', 'uid', 'unique_id', expected_type=str_or_none, get_all=False)) + labels = traverse_obj(aweme_detail, ('hybrid_label', ..., 'text'), expected_type=str, default=[]) contained_music_track = traverse_obj( music_info, ('matched_song', 'title'), ('matched_pgc_sound', 'title'), expected_type=str) @@ -189,8 +263,8 @@ class TikTokBaseIE(InfoExtractor): return { 'id': aweme_id, - 'title': aweme_detail['desc'], - 'description': aweme_detail['desc'], + 'title': aweme_detail.get('desc'), + 'description': aweme_detail.get('desc'), 'view_count': int_or_none(stats_info.get('play_count')), 'like_count': int_or_none(stats_info.get('digg_count')), 'repost_count': int_or_none(stats_info.get('share_count')), @@ -204,18 +278,24 @@ class TikTokBaseIE(InfoExtractor): 'artist': music_author, 'timestamp': int_or_none(aweme_detail.get('create_time')), 'formats': formats, + 'subtitles': self.extract_subtitles(aweme_detail, aweme_id), 'thumbnails': thumbnails, - 'duration': int_or_none(traverse_obj(video_info, 'duration', ('download_addr', 'duration')), scale=1000) + 'duration': int_or_none(traverse_obj(video_info, 'duration', ('download_addr', 'duration')), scale=1000), + 'availability': self._availability( + is_private='Private' in labels, + needs_subscription='Friends only' in labels, + is_unlisted='Followers only' in labels) } def _parse_aweme_video_web(self, aweme_detail, webpage_url): video_info = aweme_detail['video'] - author_info = traverse_obj(aweme_detail, 'author', 'authorInfo', default={}) + author_info = traverse_obj(aweme_detail, 'authorInfo', 'author', expected_type=dict, default={}) music_info = aweme_detail.get('music') or {} stats_info = aweme_detail.get('stats') or {} user_url = self._UPLOADER_URL_FORMAT % (traverse_obj(author_info, 'secUid', 'id', 'uid', 'uniqueId', - expected_type=str_or_none, get_all=False)) + expected_type=str_or_none, get_all=False) + or aweme_detail.get('authorSecId')) formats = [] play_url = video_info.get('playAddr') @@ -267,8 +347,8 @@ class TikTokBaseIE(InfoExtractor): 'comment_count': int_or_none(stats_info.get('commentCount')), 'timestamp': int_or_none(aweme_detail.get('createTime')), 'creator': str_or_none(author_info.get('nickname')), - 'uploader': str_or_none(author_info.get('uniqueId')), - 'uploader_id': str_or_none(author_info.get('id')), + 'uploader': str_or_none(author_info.get('uniqueId') or aweme_detail.get('author')), + 'uploader_id': str_or_none(author_info.get('id') or aweme_detail.get('authorId')), 'uploader_url': user_url, 'track': str_or_none(music_info.get('title')), 'album': str_or_none(music_info.get('album')) or None, @@ -307,6 +387,9 @@ class TikTokIE(TikTokBaseIE): 'like_count': int, 'repost_count': int, 'comment_count': int, + 'artist': 'Ysrbeats', + 'album': 'Lehanga', + 'track': 'Lehanga', } }, { 'url': 
'https://www.tiktok.com/@patroxofficial/video/6742501081818877190?langCountry=en', @@ -330,16 +413,98 @@ class TikTokIE(TikTokBaseIE): 'like_count': int, 'repost_count': int, 'comment_count': int, + 'artist': 'Evan Todd, Jessica Keenan Wynn, Alice Lee, Barrett Wilbert Weed & Jon Eidson', + 'track': 'Big Fun', } }, { - # Promoted content/ad - 'url': 'https://www.tiktok.com/@MS4wLjABAAAAAR29F6J2Ktu0Daw03BJyXPNoRQ-W7U5a0Mn3lVCq2rQhjOd_WNLclHUoFgwX8Eno/video/6932675057474981122', - 'only_matching': True, + # Banned audio, only available on the app + 'url': 'https://www.tiktok.com/@barudakhb_/video/6984138651336838402', + 'info_dict': { + 'id': '6984138651336838402', + 'ext': 'mp4', + 'title': 'Balas @yolaaftwsr hayu yu ? #SquadRandom_ 🔥', + 'description': 'Balas @yolaaftwsr hayu yu ? #SquadRandom_ 🔥', + 'uploader': 'barudakhb_', + 'creator': 'md5:29f238c49bc0c176cb3cef1a9cea9fa6', + 'uploader_id': '6974687867511718913', + 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAbhBwQC-R1iKoix6jDFsF-vBdfx2ABoDjaZrM9fX6arU3w71q3cOWgWuTXn1soZ7d', + 'track': 'Boka Dance', + 'artist': 'md5:29f238c49bc0c176cb3cef1a9cea9fa6', + 'timestamp': 1626121503, + 'duration': 18, + 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?', + 'upload_date': '20210712', + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + } + }, { + # Sponsored video, only available with feed workaround + 'url': 'https://www.tiktok.com/@MS4wLjABAAAATh8Vewkn0LYM7Fo03iec3qKdeCUOcBIouRk1mkiag6h3o_pQu_dUXvZ2EZlGST7_/video/7042692929109986561', + 'info_dict': { + 'id': '7042692929109986561', + 'ext': 'mp4', + 'title': 'Slap and Run!', + 'description': 'Slap and Run!', + 'uploader': 'user440922249', + 'creator': 'Slap And Run', + 'uploader_id': '7036055384943690754', + 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAATh8Vewkn0LYM7Fo03iec3qKdeCUOcBIouRk1mkiag6h3o_pQu_dUXvZ2EZlGST7_', + 'track': 'Promoted Music', + 'timestamp': 1639754738, + 'duration': 30, + 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?', + 'upload_date': '20211217', + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + }, + 'expected_warnings': ['Video not available'] + }, { + # Video without title and description + 'url': 'https://www.tiktok.com/@pokemonlife22/video/7059698374567611694', + 'info_dict': { + 'id': '7059698374567611694', + 'ext': 'mp4', + 'title': 'tiktok video #7059698374567611694', + 'description': '', + 'uploader': 'pokemonlife22', + 'creator': 'Pokemon', + 'uploader_id': '6820838815978423302', + 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAA0tF1nBwQVVMyrGu3CqttkNgM68Do1OXUFuCY0CRQk8fEtSVDj89HqoqvbSTmUP2W', + 'track': 'original sound', + 'timestamp': 1643714123, + 'duration': 6, + 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?', + 'upload_date': '20220201', + 'artist': 'Pokemon', + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + }, + 'expected_warnings': ['Video not available', 'Creating a generic title'] + }, { + # Auto-captions available + 'url': 'https://www.tiktok.com/@hankgreen1/video/7047596209028074758', + 'only_matching': True }] def _extract_aweme_app(self, aweme_id): - aweme_detail = self._call_api('aweme/detail', {'aweme_id': aweme_id}, aweme_id, - note='Downloading video details', errnote='Unable to download video details')['aweme_detail'] + try: + aweme_detail = self._call_api('aweme/detail', {'aweme_id': aweme_id}, aweme_id, + note='Downloading video details', errnote='Unable to download 
video details').get('aweme_detail')
+            if not aweme_detail:
+                raise ExtractorError('Video not available', video_id=aweme_id)
+        except ExtractorError as e:
+            self.report_warning(f'{e}; Retrying with feed workaround')
+            feed_list = self._call_api('feed', {'aweme_id': aweme_id}, aweme_id,
+                                       note='Downloading video feed', errnote='Unable to download video feed').get('aweme_list') or []
+            aweme_detail = next((aweme for aweme in feed_list if str(aweme.get('aweme_id')) == aweme_id), None)
+            if not aweme_detail:
+                raise ExtractorError('Unable to find video in feed', video_id=aweme_id)
         return self._parse_aweme_video_app(aweme_detail)
 
     def _real_extract(self, url):
@@ -353,19 +518,23 @@
         # If we only call once, we get a 403 when downloading the video.
         self._download_webpage(url, video_id)
         webpage = self._download_webpage(url, video_id, note='Downloading video webpage')
-        json_string = self._search_regex(
-            r'id=\"__NEXT_DATA__\"\s+type=\"application\/json\"\s*[^>]+>\s*(?P<json_string_ld>[^<]+)',
-            webpage, 'json_string', group='json_string_ld')
-        json_data = self._parse_json(json_string, video_id)
-        props_data = try_get(json_data, lambda x: x['props'], expected_type=dict)
-
-        # Check statusCode for success
-        status = props_data.get('pageProps').get('statusCode')
+        next_data = self._search_nextjs_data(webpage, video_id, default='{}')
+
+        if next_data:
+            status = traverse_obj(next_data, ('props', 'pageProps', 'statusCode'), expected_type=int) or 0
+            video_data = traverse_obj(next_data, ('props', 'pageProps', 'itemInfo', 'itemStruct'), expected_type=dict)
+        else:
+            sigi_json = self._search_regex(
+                r'>\s*window\[[\'"]SIGI_STATE[\'"]\]\s*=\s*(?P<sigi_state>{.+});',
+                webpage, 'sigi data', group='sigi_state')
+            sigi_data = self._parse_json(sigi_json, video_id)
+            status = traverse_obj(sigi_data, ('VideoPage', 'statusCode'), expected_type=int) or 0
+            video_data = traverse_obj(sigi_data, ('ItemModule', video_id), expected_type=dict)
+
         if status == 0:
-            return self._parse_aweme_video_web(props_data['pageProps']['itemInfo']['itemStruct'], url)
+            return self._parse_aweme_video_web(video_data, url)
         elif status == 10216:
             raise ExtractorError('This video is private', expected=True)
-
         raise ExtractorError('Video not available', video_id=video_id)
@@ -378,6 +547,16 @@ class TikTokUserIE(TikTokBaseIE):
         'info_dict': {
             'id': '6935371178089399301',
             'title': 'corgibobaa',
+            'thumbnail': r're:https://.+_1080x1080\.webp'
+        },
+        'expected_warnings': ['Retrying']
+    }, {
+        'url': 'https://www.tiktok.com/@6820838815978423302',
+        'playlist_mincount': 5,
+        'info_dict': {
+            'id': '6820838815978423302',
+            'title': '6820838815978423302',
+            'thumbnail': r're:https://.+_1080x1080\.webp'
         },
         'expected_warnings': ['Retrying']
     }, {
@@ -386,6 +565,7 @@
         'info_dict': {
             'id': '79005827461758976',
             'title': 'meme',
+            'thumbnail': r're:https://.+_1080x1080\.webp'
         },
         'expected_warnings': ['Retrying']
     }]
@@ -409,14 +589,14 @@
             cursor = data_json['cursor']
     '''
 
-    def _entries_api(self, webpage, user_id, username):
+    def _video_entries_api(self, webpage, user_id, username):
         query = {
             'user_id': user_id,
             'count': 21,
             'max_cursor': 0,
             'min_cursor': 0,
             'retry_type': 'no_retry',
-            'device_id': ''.join(random.choice(string.digits) for i in range(19)),  # Some endpoints don't like randomized device_id, so it isn't directly set in _call_api.
+ 'device_id': ''.join(random.choice(string.digits) for _ in range(19)), # Some endpoints don't like randomized device_id, so it isn't directly set in _call_api. } max_retries = self.get_param('extractor_retries', 3) @@ -432,23 +612,139 @@ class TikTokUserIE(TikTokBaseIE): continue raise break + yield from post_list.get('aweme_list', []) + if not post_list.get('has_more'): + break + query['max_cursor'] = post_list['max_cursor'] + + def _entries_api(self, user_id, videos): + for video in videos: + yield { + **self._parse_aweme_video_app(video), + 'extractor_key': TikTokIE.ie_key(), + 'extractor': 'TikTok', + 'webpage_url': f'https://tiktok.com/@{user_id}/video/{video["aweme_id"]}', + } + + def _real_extract(self, url): + user_name = self._match_id(url) + webpage = self._download_webpage(url, user_name, headers={ + 'User-Agent': 'facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)' + }) + user_id = self._html_search_regex(r'snssdk\d*://user/profile/(\d+)', webpage, 'user ID', default=None) or user_name + + videos = LazyList(self._video_entries_api(webpage, user_id, user_name)) + thumbnail = traverse_obj(videos, (0, 'author', 'avatar_larger', 'url_list', 0)) + + return self.playlist_result(self._entries_api(user_id, videos), user_id, user_name, thumbnail=thumbnail) + + +class TikTokBaseListIE(TikTokBaseIE): + def _entries(self, list_id, display_id): + query = { + self._QUERY_NAME: list_id, + 'cursor': 0, + 'count': 20, + 'type': 5, + 'device_id': ''.join(random.choice(string.digits) for i in range(19)) + } + + max_retries = self.get_param('extractor_retries', 3) + for page in itertools.count(1): + for retries in itertools.count(): + try: + post_list = self._call_api(self._API_ENDPOINT, query, display_id, + note='Downloading video list page %d%s' % (page, f' (attempt {retries})' if retries != 0 else ''), + errnote='Unable to download video list') + except ExtractorError as e: + if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0 and retries != max_retries: + self.report_warning('%s. Retrying...' % str(e.cause or e.msg)) + continue + raise + break for video in post_list.get('aweme_list', []): yield { **self._parse_aweme_video_app(video), - 'ie_key': TikTokIE.ie_key(), + 'extractor_key': TikTokIE.ie_key(), 'extractor': 'TikTok', + 'webpage_url': f'https://tiktok.com/@_/video/{video["aweme_id"]}', } if not post_list.get('has_more'): break - query['max_cursor'] = post_list['max_cursor'] + query['cursor'] = post_list['cursor'] def _real_extract(self, url): - user_name = self._match_id(url) - webpage = self._download_webpage(url, user_name, headers={ + list_id = self._match_id(url) + return self.playlist_result(self._entries(list_id, list_id), list_id) + + +class TikTokSoundIE(TikTokBaseListIE): + IE_NAME = 'tiktok:sound' + _VALID_URL = r'https?://(?:www\.)?tiktok\.com/music/[\w\.-]+-(?P<id>[\d]+)[/?#&]?' 
+ _QUERY_NAME = 'music_id' + _API_ENDPOINT = 'music/aweme' + _TESTS = [{ + 'url': 'https://www.tiktok.com/music/Build-a-Btch-6956990112127585029?lang=en', + 'playlist_mincount': 100, + 'info_dict': { + 'id': '6956990112127585029' + }, + 'expected_warnings': ['Retrying'] + }, { + # Actual entries are less than listed video count + 'url': 'https://www.tiktok.com/music/jiefei-soap-remix-7036843036118469381', + 'playlist_mincount': 2182, + 'info_dict': { + 'id': '7036843036118469381' + }, + 'expected_warnings': ['Retrying'] + }] + + +class TikTokEffectIE(TikTokBaseListIE): + IE_NAME = 'tiktok:effect' + _VALID_URL = r'https?://(?:www\.)?tiktok\.com/sticker/[\w\.-]+-(?P<id>[\d]+)[/?#&]?' + _QUERY_NAME = 'sticker_id' + _API_ENDPOINT = 'sticker/aweme' + _TESTS = [{ + 'url': 'https://www.tiktok.com/sticker/MATERIAL-GWOOORL-1258156', + 'playlist_mincount': 100, + 'info_dict': { + 'id': '1258156', + }, + 'expected_warnings': ['Retrying'] + }, { + # Different entries between mobile and web, depending on region + 'url': 'https://www.tiktok.com/sticker/Elf-Friend-479565', + 'only_matching': True + }] + + +class TikTokTagIE(TikTokBaseListIE): + IE_NAME = 'tiktok:tag' + _VALID_URL = r'https?://(?:www\.)?tiktok\.com/tag/(?P<id>[^/?#&]+)' + _QUERY_NAME = 'ch_id' + _API_ENDPOINT = 'challenge/aweme' + _TESTS = [{ + 'url': 'https://tiktok.com/tag/hello2018', + 'playlist_mincount': 39, + 'info_dict': { + 'id': '46294678', + 'title': 'hello2018', + }, + 'expected_warnings': ['Retrying'] + }, { + 'url': 'https://tiktok.com/tag/fypシ?is_copy_url=0&is_from_webapp=v1', + 'only_matching': True + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id, headers={ 'User-Agent': 'facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)' }) - user_id = self._html_search_regex(r'snssdk\d*://user/profile/(\d+)', webpage, 'user ID') - return self.playlist_result(self._entries_api(webpage, user_id, user_name), user_id, user_name) + tag_id = self._html_search_regex(r'snssdk\d*://challenge/detail/(\d+)', webpage, 'tag ID') + return self.playlist_result(self._entries(tag_id, display_id), tag_id, display_id) class DouyinIE(TikTokIE): @@ -534,12 +830,12 @@ class DouyinIE(TikTokIE): 'comment_count': int, } }] - _APP_VERSION = '9.6.0' - _MANIFEST_APP_VERSION = '960' + _APP_VERSIONS = [('9.6.0', '960')] _APP_NAME = 'aweme' _AID = 1128 _API_HOSTNAME = 'aweme.snssdk.com' _UPLOADER_URL_FORMAT = 'https://www.douyin.com/user/%s' + _WEBPAGE_HOST = 'https://www.douyin.com/' def _real_extract(self, url): video_id = self._match_id(url) @@ -559,5 +855,40 @@ class DouyinIE(TikTokIE): render_data = self._parse_json( render_data_json, video_id, transform_source=compat_urllib_parse_unquote) - return self._parse_aweme_video_web( - traverse_obj(render_data, (..., 'aweme', 'detail'), get_all=False), url) + return self._parse_aweme_video_web(get_first(render_data, ('aweme', 'detail')), url) + + +class TikTokVMIE(InfoExtractor): + _VALID_URL = r'https?://(?:vm|vt)\.tiktok\.com/(?P<id>\w+)' + IE_NAME = 'vm.tiktok' + + _TESTS = [{ + 'url': 'https://vm.tiktok.com/ZSe4FqkKd', + 'info_dict': { + 'id': '7023491746608712966', + 'ext': 'mp4', + 'title': 'md5:5607564db90271abbbf8294cca77eddd', + 'description': 'md5:5607564db90271abbbf8294cca77eddd', + 'duration': 11, + 'upload_date': '20211026', + 'uploader_id': '7007385080558846981', + 'creator': 'Memes', + 'artist': 'Memes', + 'track': 'original sound', + 'uploader': 'susmandem', + 'timestamp': 1635284105, + 'thumbnail': 
r're:https://.+\.webp.*', + 'like_count': int, + 'view_count': int, + 'comment_count': int, + 'repost_count': int, + 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAXcNoOEOxVyBzuII_E--T0MeCrLP0ay1Sm6x_n3dluiWEoWZD0VlQOytwad4W0i0n', + } + }, { + 'url': 'https://vt.tiktok.com/ZSe4FqkKd', + 'only_matching': True, + }] + + def _real_extract(self, url): + return self.url_result(self._request_webpage( + HEADRequest(url), self._match_id(url), headers={'User-Agent': 'facebookexternalhit/1.1'}).geturl(), TikTokIE) diff --git a/hypervideo_dl/extractor/toggo.py b/hypervideo_dl/extractor/toggo.py new file mode 100644 index 0000000..da5f0c4 --- /dev/null +++ b/hypervideo_dl/extractor/toggo.py @@ -0,0 +1,73 @@ +from .common import InfoExtractor +from ..utils import int_or_none, parse_qs + + +class ToggoIE(InfoExtractor): + IE_NAME = 'toggo' + _VALID_URL = r'https?://(?:www\.)?toggo\.de/[\w-]+/folge/(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://www.toggo.de/weihnachtsmann--co-kg/folge/ein-geschenk-fuer-zwei', + 'info_dict': { + 'id': 'VEP2977', + 'ext': 'mp4', + 'title': 'Ein Geschenk für zwei', + 'display_id': 'ein-geschenk-fuer-zwei', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'description': 'md5:b7715915bfa47824b4e4ad33fb5962f8', + 'release_timestamp': 1637259179, + 'series': 'Weihnachtsmann & Co. KG', + 'season': 'Weihnachtsmann & Co. KG', + 'season_number': 1, + 'season_id': 'VST118', + 'episode': 'Ein Geschenk für zwei', + 'episode_number': 7, + 'episode_id': 'VEP2977', + 'timestamp': 1581935960, + 'uploader_id': '6057955896001', + 'upload_date': '20200217', + }, + 'params': {'skip_download': True}, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + data = self._download_json( + f'https://production-n.toggo.de/api/assetstore/vod/asset/{display_id}', display_id)['data'] + + brightcove_id = next( + x['value'] for x in data['custom_fields'] if x.get('key') == 'video-cloud-id') + info = self._downloader.get_info_extractor('BrightcoveNew').extract( + f'http://players.brightcove.net/6057955896001/default_default/index.html?videoId={brightcove_id}') + + for f in info['formats']: + if '/dash/live/cenc/' in f.get('fragment_base_url', ''): + # Get hidden non-DRM format + f['fragment_base_url'] = f['fragment_base_url'].replace('/cenc/', '/clear/') + f['has_drm'] = False + + if '/fairplay/' in f.get('manifest_url', ''): + f['has_drm'] = True + + thumbnails = [{ + 'id': name, + 'url': url, + 'width': int_or_none(next(iter(parse_qs(url).get('width', [])), None)), + } for name, url in (data.get('images') or {}).items()] + + return { + **info, + 'id': data.get('id'), + 'display_id': display_id, + 'title': data.get('title'), + 'language': data.get('language'), + 'thumbnails': thumbnails, + 'description': data.get('description'), + 'release_timestamp': data.get('earliest_start_date'), + 'series': data.get('series_title'), + 'season': data.get('season_title'), + 'season_number': data.get('season_no'), + 'season_id': data.get('season_id'), + 'episode': data.get('title'), + 'episode_number': data.get('episode_no'), + 'episode_id': data.get('id'), + } diff --git a/hypervideo_dl/extractor/tokentube.py b/hypervideo_dl/extractor/tokentube.py index d636211..579623f 100644 --- a/hypervideo_dl/extractor/tokentube.py +++ b/hypervideo_dl/extractor/tokentube.py @@ -6,7 +6,10 @@ import re from .common import InfoExtractor from ..utils import ( + clean_html, + get_element_by_class, parse_count, + remove_end, unified_strdate, js_to_json, OnDemandPagedList, @@ -35,7 +38,7 @@ class 
TokentubeIE(InfoExtractor): 'id': '3950239124', 'ext': 'mp4', 'title': 'Linux Ubuntu Studio perus käyttö', - 'description': 'md5:854ff1dc732ff708976de2880ea32050', + 'description': 'md5:46077d0daaba1974f2dc381257f9d64c', 'uploader': 'jyrilehtonen', 'upload_date': '20210825', }, @@ -45,7 +48,7 @@ class TokentubeIE(InfoExtractor): 'id': '3582463289', 'ext': 'mp4', 'title': 'Police for Freedom - toiminta aloitetaan Suomessa ❤️??', - 'description': 'md5:cd92e620d7f5fa162e8410d0fc9a08be', + 'description': 'md5:37ebf1cb44264e0bf23ed98b337ee63e', 'uploader': 'Voitontie', 'upload_date': '20210428', } @@ -90,7 +93,10 @@ class TokentubeIE(InfoExtractor): r'<a\s*class="place-left"[^>]+>(.+?)</a>', webpage, 'uploader', fatal=False) - description = self._html_search_meta('description', webpage) + description = (clean_html(get_element_by_class('p-d-txt', webpage)) + or self._html_search_meta(('og:description', 'description', 'twitter:description'), webpage)) + + description = remove_end(description, 'Category') self._sort_formats(formats) diff --git a/hypervideo_dl/extractor/tonline.py b/hypervideo_dl/extractor/tonline.py index cc11eae..9b6a40d 100644 --- a/hypervideo_dl/extractor/tonline.py +++ b/hypervideo_dl/extractor/tonline.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import int_or_none, join_nonempty class TOnlineIE(InfoExtractor): @@ -30,13 +30,8 @@ class TOnlineIE(InfoExtractor): asset_source = asset.get('source') or asset.get('source2') if not asset_source: continue - formats_id = [] - for field_key in ('type', 'profile'): - field_value = asset.get(field_key) - if field_value: - formats_id.append(field_value) formats.append({ - 'format_id': '-'.join(formats_id), + 'format_id': join_nonempty('type', 'profile', from_dict=asset), 'url': asset_source, }) diff --git a/hypervideo_dl/extractor/toutv.py b/hypervideo_dl/extractor/toutv.py index 6c84c21..1d5da10 100644 --- a/hypervideo_dl/extractor/toutv.py +++ b/hypervideo_dl/extractor/toutv.py @@ -40,17 +40,14 @@ class TouTvIE(RadioCanadaIE): }] _CLIENT_KEY = '90505c8d-9c34-4f34-8da1-3a85bdc6d4f4' - def _real_initialize(self): - email, password = self._get_login_info() - if email is None: - return + def _perform_login(self, username, password): try: self._access_token = self._download_json( 'https://services.radio-canada.ca/toutv/profiling/accounts/login', None, 'Logging in', data=json.dumps({ 'ClientId': self._CLIENT_KEY, 'ClientSecret': '34026772-244b-49b6-8b06-317b30ac9a20', - 'Email': email, + 'Email': username, 'Password': password, 'Scope': 'id.write media-validation.read', }).encode(), headers={ diff --git a/hypervideo_dl/extractor/traileraddict.py b/hypervideo_dl/extractor/traileraddict.py index 10100fb..514f479 100644 --- a/hypervideo_dl/extractor/traileraddict.py +++ b/hypervideo_dl/extractor/traileraddict.py @@ -24,8 +24,7 @@ class TrailerAddictIE(InfoExtractor): name = mobj.group('movie') + '/' + mobj.group('trailer_name') webpage = self._download_webpage(url, name) - title = self._search_regex(r'<title>(.+?)</title>', - webpage, 'video title').replace(' - Trailer Addict', '') + title = self._html_extract_title(webpage, 'video title').replace(' - Trailer Addict', '') view_count_str = self._search_regex( r'<span class="views_n">([0-9,.]+)</span>', webpage, 'view count', fatal=False) diff --git a/hypervideo_dl/extractor/trovo.py b/hypervideo_dl/extractor/trovo.py index ec55f41..65ea13d 100644 --- a/hypervideo_dl/extractor/trovo.py +++ 
b/hypervideo_dl/extractor/trovo.py @@ -7,6 +7,7 @@ import json from .common import InfoExtractor from ..utils import ( ExtractorError, + format_field, int_or_none, str_or_none, try_get, @@ -17,13 +18,18 @@ class TrovoBaseIE(InfoExtractor): _VALID_URL_BASE = r'https?://(?:www\.)?trovo\.live/' _HEADERS = {'Origin': 'https://trovo.live'} + def _call_api(self, video_id, query=None, data=None): + return self._download_json( + 'https://gql.trovo.live/', video_id, query=query, data=data, + headers={'Accept': 'application/json'}) + def _extract_streamer_info(self, data): streamer_info = data.get('streamerInfo') or {} username = streamer_info.get('userName') return { 'uploader': streamer_info.get('nickName'), 'uploader_id': str_or_none(streamer_info.get('uid')), - 'uploader_url': 'https://trovo.live/' + username if username else None, + 'uploader_url': format_field(username, template='https://trovo.live/%s'), } @@ -32,9 +38,8 @@ class TrovoIE(TrovoBaseIE): def _real_extract(self, url): username = self._match_id(url) - live_info = self._download_json( - 'https://gql.trovo.live/', username, query={ - 'query': '''{ + live_info = self._call_api(username, query={ + 'query': '''{ getLiveInfo(params: {userName: "%s"}) { isLive programInfo { @@ -53,12 +58,12 @@ class TrovoIE(TrovoBaseIE): } } }''' % username, - })['data']['getLiveInfo'] + })['data']['getLiveInfo'] if live_info.get('isLive') == 0: raise ExtractorError('%s is offline' % username, expected=True) program_info = live_info['programInfo'] program_id = program_info['id'] - title = self._live_title(program_info['title']) + title = program_info['title'] formats = [] for stream_info in (program_info.get('streamInfo') or []): @@ -104,6 +109,7 @@ class TrovoVodIE(TrovoBaseIE): 'comments': 'mincount:8', 'categories': ['Grand Theft Auto V'], }, + 'skip': '404' }, { 'url': 'https://trovo.live/clip/lc-5285890810184026005', 'only_matching': True, @@ -111,15 +117,14 @@ class TrovoVodIE(TrovoBaseIE): def _real_extract(self, url): vid = self._match_id(url) - resp = self._download_json( - 'https://gql.trovo.live/', vid, data=json.dumps([{ - 'query': '''{ + resp = self._call_api(vid, data=json.dumps([{ + 'query': '''{ batchGetVodDetailInfo(params: {vids: ["%s"]}) { VodDetailInfos } }''' % vid, - }, { - 'query': '''{ + }, { + 'query': '''{ getCommentList(params: {appInfo: {postID: "%s"}, pageSize: 1000000000, preview: {}}) { commentList { author { @@ -133,9 +138,7 @@ class TrovoVodIE(TrovoBaseIE): } } }''' % vid, - }]).encode(), headers={ - 'Content-Type': 'application/json', - }) + }]).encode()) vod_detail_info = resp[0]['data']['batchGetVodDetailInfo']['VodDetailInfos'][vid] vod_info = vod_detail_info['vodInfo'] title = vod_info['title'] @@ -197,7 +200,7 @@ class TrovoVodIE(TrovoBaseIE): return info -class TrovoChannelBaseIE(InfoExtractor): +class TrovoChannelBaseIE(TrovoBaseIE): def _get_vod_json(self, page, uid): raise NotImplementedError('This method must be implemented by subclasses') @@ -215,7 +218,7 @@ class TrovoChannelBaseIE(InfoExtractor): def _real_extract(self, url): id = self._match_id(url) - uid = str(self._download_json('https://gql.trovo.live/', id, query={ + uid = str(self._call_api(id, query={ 'query': '{getLiveInfo(params:{userName:"%s"}){streamerInfo{uid}}}' % id })['data']['getLiveInfo']['streamerInfo']['uid']) return self.playlist_result(self._entries(uid), playlist_id=uid) @@ -223,7 +226,7 @@ class TrovoChannelBaseIE(InfoExtractor): class TrovoChannelVodIE(TrovoChannelBaseIE): _VALID_URL = r'trovovod:(?P<id>[^\s]+)' - IE_DESC = 'All VODs 
of a trovo.live channel, "trovovod" keyword' + IE_DESC = 'All VODs of a trovo.live channel; "trovovod:" prefix' _TESTS = [{ 'url': 'trovovod:OneTappedYou', @@ -237,14 +240,14 @@ class TrovoChannelVodIE(TrovoChannelBaseIE): _TYPE = 'video' def _get_vod_json(self, page, uid): - return self._download_json('https://gql.trovo.live/', uid, query={ + return self._call_api(uid, query={ 'query': self._QUERY % (page, uid) })['data']['getChannelLtvVideoInfos'] class TrovoChannelClipIE(TrovoChannelBaseIE): _VALID_URL = r'trovoclip:(?P<id>[^\s]+)' - IE_DESC = 'All Clips of a trovo.live channel, "trovoclip" keyword' + IE_DESC = 'All Clips of a trovo.live channel; "trovoclip:" prefix' _TESTS = [{ 'url': 'trovoclip:OneTappedYou', @@ -258,6 +261,6 @@ class TrovoChannelClipIE(TrovoChannelBaseIE): _TYPE = 'clip' def _get_vod_json(self, page, uid): - return self._download_json('https://gql.trovo.live/', uid, query={ + return self._call_api(uid, query={ 'query': self._QUERY % (page, uid) })['data']['getChannelClipVideoInfos'] diff --git a/hypervideo_dl/extractor/trueid.py b/hypervideo_dl/extractor/trueid.py new file mode 100644 index 0000000..fc98303 --- /dev/null +++ b/hypervideo_dl/extractor/trueid.py @@ -0,0 +1,139 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + parse_age_limit, + traverse_obj, + unified_timestamp, + url_or_none +) + + +class TrueIDIE(InfoExtractor): + _VALID_URL = r'https?://(?P<domain>vn\.trueid\.net|trueid\.(?:id|ph))/(?:movie|series/[^/]+)/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://trueid.id/movie/XYNlDOZZJzL6/pengabdi-setan/', + 'md5': '2552c7535125885901f1a2a4bcf32ca3', + 'info_dict': { + 'id': 'XYNlDOZZJzL6', + 'ext': 'mp4', + 'title': 'Pengabdi Setan', + 'display_id': 'pengabdi-setan', + 'description': 'md5:b0b41df08601e85e5291496c9bbe52cd', + 'timestamp': 1600243511, + 'categories': ['Film Indonesia', 'Horror', 'Mystery'], + 'release_timestamp': 1593536400, + 'release_year': 1982, + 'cast': list, + 'thumbnail': 'https://cms.dmpcdn.com/movie/2020/09/18/8b6e35c0-f97f-11ea-81fe-c52fc9dd314f_original.png', + 'upload_date': '20200916', + 'release_date': '20200630', + }, + 'expected_warnings': ['Video is geo restricted.'] + }, { + 'url': 'https://trueid.id/series/zZOBVPb62EwR/qXY73rwyl7oj/one-piece-ep-1/', + 'md5': '1c6d976049bc3c89a8a25aed2c3fb081', + 'info_dict': { + 'id': 'qXY73rwyl7oj', + 'ext': 'mp4', + 'title': 'One Piece Ep. 
1',
+            'display_id': 'one-piece-ep-1',
+            'description': 'md5:13226d603bd03c4150a1cf5758e842ea',
+            'timestamp': 1610421085,
+            'categories': ['Animation & Cartoon', 'Kids & Family', 'Adventure'],
+            'release_timestamp': 1612112400,
+            'release_year': 1999,
+            'age_limit': 7,
+            'cast': ['Kounosuke Uda', 'Junji Shimizu'],
+            'thumbnail': 'https://cms.dmpcdn.com/movie/2021/01/13/f84e9e70-5562-11eb-9fe2-dd6c2099a468_original.png',
+            'upload_date': '20210112',
+            'release_date': '20210131',
+        },
+        'expected_warnings': ['Video is geo restricted.']
+    }, {
+        'url': 'https://vn.trueid.net/series/7DNPM7Bpa9wv/pwLgEQ4Xbda2/haikyu-vua-bong-chuyen-phan-1/',
+        'info_dict': {
+            'id': 'pwLgEQ4Xbda2',
+            'ext': 'mp4',
+            'title': 'Haikyu!!: Vua Bóng Chuyền Phần 1 - Tập 1',
+            'display_id': 'haikyu-vua-bong-chuyen-phan-1-tap-1',
+            'description': 'md5:0374dd44d247799169449ee30cca963a',
+            'timestamp': 1629270901,
+            'categories': ['Anime', 'Phim Hài', 'Phim Học Đường', 'Phim Thể Thao', 'Shounen'],
+            'release_timestamp': 1629270720,
+            'release_year': 2014,
+            'age_limit': 13,
+            'thumbnail': 'https://cms.dmpcdn.com/movie/2021/09/28/b6e7ec00-2039-11ec-8436-974544e5841f_webp_original.jpg',
+            'upload_date': '20210818',
+            'release_date': '20210818',
+        },
+        'expected_warnings': ['Video is geo restricted.']
+    }, {
+        'url': 'https://trueid.ph/series/l8rvvAw7Jwv8/l8rvvAw7Jwv8/naruto-trailer/',
+        'only_matching': True,
+    }]
+    _CUSTOM_RATINGS = {
+        'PG': 7,
+    }
+
+    def _real_extract(self, url):
+        domain, video_id = self._match_valid_url(url).group('domain', 'id')
+        webpage = self._download_webpage(url, video_id)
+        initial_data = traverse_obj(
+            self._search_nextjs_data(webpage, video_id, fatal=False), ('props', 'pageProps', 'initialContentData'), default={})
+
+        try:
+            stream_data = self._download_json(
+                f'https://{domain}/cmsPostProxy/contents/video/{video_id}/streamer?os=android', video_id, data=b'')['data']
+        except ExtractorError as e:
+            if not isinstance(e.cause, compat_HTTPError):
+                raise e
+            errmsg = self._parse_json(e.cause.read().decode(), video_id)['meta']['message']
+            if 'country' in errmsg:
+                self.raise_geo_restricted(
+                    errmsg, [initial_data['display_country']] if initial_data.get('display_country') else None, True)
+            else:
+                self.raise_no_formats(errmsg, video_id=video_id)
+
+        if stream_data:
+            stream_url = stream_data['stream']['stream_url']
+            stream_ext = determine_ext(stream_url)
+            if stream_ext == 'm3u8':
+                formats, subs = self._extract_m3u8_formats_and_subtitles(stream_url, video_id, 'mp4')
+            elif stream_ext == 'mpd':
+                formats, subs = self._extract_mpd_formats_and_subtitles(stream_url, video_id)
+            else:
+                formats = [{'url': stream_url}]
+
+        thumbnails = [
+            {'id': thumb_key, 'url': thumb_url}
+            for thumb_key, thumb_url in (initial_data.get('thumb_list') or {}).items()
+            if url_or_none(thumb_url)]
+
+        return {
+            'id': video_id,
+            'title': initial_data.get('title') or self._html_search_regex(
+                [r'Nonton (?P<name>.+) Gratis',
+                 r'Xem (?P<name>.+) Miễn phí',
+                 r'Watch (?P<name>.+) Free'], webpage, 'title', group='name'),
+            'display_id': initial_data.get('slug_title'),
+            'description': initial_data.get('synopsis'),
+            'timestamp': unified_timestamp(initial_data.get('create_date')),
+            # 'duration': int_or_none(initial_data.get('duration'), invscale=60),  # duration field must at least be accurate to the second
+            'categories': traverse_obj(initial_data, ('article_category_details', ..., 'name')),
+            'release_timestamp': unified_timestamp(initial_data.get('publish_date')),
+            'release_year':
int_or_none(initial_data.get('release_year')), + 'formats': formats, + 'subtitles': subs, + 'thumbnails': thumbnails, + 'age_limit': self._CUSTOM_RATINGS.get(initial_data.get('rate')) or parse_age_limit(initial_data.get('rate')), + 'cast': traverse_obj(initial_data, (('actor', 'director'), ...)), + 'view_count': int_or_none(initial_data.get('count_views')), + 'like_count': int_or_none(initial_data.get('count_likes')), + 'average_rating': int_or_none(initial_data.get('count_ratings')), + } diff --git a/hypervideo_dl/extractor/tubitv.py b/hypervideo_dl/extractor/tubitv.py index 2e9b325..31feb9a 100644 --- a/hypervideo_dl/extractor/tubitv.py +++ b/hypervideo_dl/extractor/tubitv.py @@ -54,10 +54,7 @@ class TubiTvIE(InfoExtractor): }, }] - def _login(self): - username, password = self._get_login_info() - if username is None: - return + def _perform_login(self, username, password): self.report_login() form_data = { 'username': username, @@ -72,9 +69,6 @@ class TubiTvIE(InfoExtractor): raise ExtractorError( 'Login failed (invalid username/password)', expected=True) - def _real_initialize(self): - self._login() - def _real_extract(self, url): video_id = self._match_id(url) video_data = self._download_json( @@ -107,6 +101,9 @@ class TubiTvIE(InfoExtractor): 'url': self._proto_relative_url(sub_url), }) + season_number, episode_number, episode_title = self._search_regex( + r'^S(\d+):E(\d+) - (.+)', title, 'episode info', fatal=False, group=(1, 2, 3), default=(None, None, None)) + return { 'id': video_id, 'title': title, @@ -117,6 +114,9 @@ class TubiTvIE(InfoExtractor): 'duration': int_or_none(video_data.get('duration')), 'uploader_id': video_data.get('publisher_id'), 'release_year': int_or_none(video_data.get('year')), + 'season_number': int_or_none(season_number), + 'episode_number': int_or_none(episode_number), + 'episode_title': episode_title } @@ -132,9 +132,11 @@ class TubiTvShowIE(InfoExtractor): def _entries(self, show_url, show_name): show_webpage = self._download_webpage(show_url, show_name) + show_json = self._parse_json(self._search_regex( - r"window\.__data\s*=\s*({.+?});\s*</script>", - show_webpage, 'data',), show_name, transform_source=js_to_json)['video'] + r'window\.__data\s*=\s*({[^<]+});\s*</script>', + show_webpage, 'data'), show_name, transform_source=js_to_json)['video'] + for episode_id in show_json['fullContentById'].keys(): yield self.url_result( 'tubitv:%s' % episode_id, diff --git a/hypervideo_dl/extractor/tumblr.py b/hypervideo_dl/extractor/tumblr.py index adc3701..8086f61 100644 --- a/hypervideo_dl/extractor/tumblr.py +++ b/hypervideo_dl/extractor/tumblr.py @@ -6,6 +6,7 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, + traverse_obj, urlencode_postdata ) @@ -14,39 +15,130 @@ class TumblrIE(InfoExtractor): _VALID_URL = r'https?://(?P<blog_name>[^/?#&]+)\.tumblr\.com/(?:post|video)/(?P<id>[0-9]+)(?:$|[/?#])' _NETRC_MACHINE = 'tumblr' _LOGIN_URL = 'https://www.tumblr.com/login' + _OAUTH_URL = 'https://www.tumblr.com/api/v2/oauth2/token' _TESTS = [{ 'url': 'http://tatianamaslanydaily.tumblr.com/post/54196191430/orphan-black-dvd-extra-behind-the-scenes', 'md5': '479bb068e5b16462f5176a6828829767', 'info_dict': { 'id': '54196191430', 'ext': 'mp4', - 'title': 'tatiana maslany news, Orphan Black || DVD extra - behind the scenes ↳...', - 'description': 'md5:37db8211e40b50c7c44e95da14f630b7', - 'thumbnail': r're:http://.*\.jpg', + 'title': 'md5:dfac39636969fe6bf1caa2d50405f069', + 'description': 'md5:390ab77358960235b6937ab3b8528956', + 
'uploader_id': 'tatianamaslanydaily', + 'uploader_url': 'https://tatianamaslanydaily.tumblr.com/', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 127, + 'like_count': int, + 'repost_count': int, + 'age_limit': 0, + 'tags': ['Orphan Black', 'Tatiana Maslany', 'Interview', 'Video', 'OB S1 DVD Extras'], + } + }, { + 'note': 'multiple formats', + 'url': 'https://maskofthedragon.tumblr.com/post/626907179849564160/mona-talking-in-english', + 'md5': 'f43ff8a8861712b6cf0e0c2bd84cfc68', + 'info_dict': { + 'id': '626907179849564160', + 'ext': 'mp4', + 'title': 'Mona\xa0“talking” in\xa0“english”', + 'description': 'md5:082a3a621530cb786ad2b7592a6d9e2c', + 'uploader_id': 'maskofthedragon', + 'uploader_url': 'https://maskofthedragon.tumblr.com/', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 7, + 'like_count': int, + 'repost_count': int, + 'age_limit': 0, + 'tags': 'count:19', + }, + 'params': { + 'format': 'hd', + }, + }, { + 'note': 'non-iframe video (with related posts)', + 'url': 'https://shieldfoss.tumblr.com/post/675519763813908480', + 'md5': '12bdb75661ef443bffe5a4dac1dbf118', + 'info_dict': { + 'id': '675519763813908480', + 'ext': 'mp4', + 'title': 'Shieldfoss', + 'uploader_id': 'nerviovago', + 'uploader_url': 'https://nerviovago.tumblr.com/', + 'thumbnail': r're:^https?://.*\.jpg', + 'like_count': int, + 'repost_count': int, + 'age_limit': 0, + 'tags': [], } }, { - 'url': 'http://5sostrum.tumblr.com/post/90208453769/yall-forgetting-the-greatest-keek-of-them-all', - 'md5': 'bf348ef8c0ef84fbf1cbd6fa6e000359', + 'note': 'dashboard only (original post)', + 'url': 'https://jujanon.tumblr.com/post/159704441298/my-baby-eating', + 'md5': '029f7c91ab386701b211e3d494d2d95e', 'info_dict': { - 'id': '90208453769', + 'id': '159704441298', 'ext': 'mp4', - 'title': '5SOS STRUM ;]', - 'description': 'md5:dba62ac8639482759c8eb10ce474586a', - 'thumbnail': r're:http://.*\.jpg', + 'title': 'md5:ba79365861101f4911452728d2950561', + 'description': 'md5:773738196cea76b6996ec71e285bdabc', + 'uploader_id': 'jujanon', + 'uploader_url': 'https://jujanon.tumblr.com/', + 'thumbnail': r're:^https?://.*\.jpg', + 'like_count': int, + 'repost_count': int, + 'age_limit': 0, + 'tags': ['crabs', 'my video', 'my pets'], } }, { - 'url': 'http://hdvideotest.tumblr.com/post/130323439814/test-description-for-my-hd-video', - 'md5': '7ae503065ad150122dc3089f8cf1546c', + 'note': 'dashboard only (reblog)', + 'url': 'https://bartlebyshop.tumblr.com/post/180294460076/duality-of-bird', + 'md5': '04334e7cadb1af680d162912559f51a5', 'info_dict': { - 'id': '130323439814', + 'id': '180294460076', 'ext': 'mp4', - 'title': 'HD Video Testing \u2014 Test description for my HD video', - 'description': 'md5:97cc3ab5fcd27ee4af6356701541319c', - 'thumbnail': r're:http://.*\.jpg', - }, - 'params': { - 'format': 'hd', + 'title': 'duality of bird', + 'description': 'duality of bird', + 'uploader_id': 'todaysbird', + 'uploader_url': 'https://todaysbird.tumblr.com/', + 'thumbnail': r're:^https?://.*\.jpg', + 'like_count': int, + 'repost_count': int, + 'age_limit': 0, + 'tags': [], + } + }, { + 'note': 'dashboard only (external)', + 'url': 'https://afloweroutofstone.tumblr.com/post/675661759168823296/the-blues-remembers-everything-the-country-forgot', + 'info_dict': { + 'id': 'q67_fd7b8SU', + 'ext': 'mp4', + 'title': 'The Blues Remembers Everything the Country Forgot', + 'alt_title': 'The Blues Remembers Everything the Country Forgot', + 'description': 'md5:1a6b4097e451216835a24c1023707c79', + 'release_date': '20201224', + 'creator': 
'md5:c2239ba15430e87c3b971ba450773272', + 'uploader': 'Moor Mother - Topic', + 'upload_date': '20201223', + 'uploader_id': 'UCxrMtFBRkFvQJ_vVM4il08w', + 'uploader_url': 'http://www.youtube.com/channel/UCxrMtFBRkFvQJ_vVM4il08w', + 'thumbnail': r're:^https?://i.ytimg.com/.*', + 'channel': 'Moor Mother - Topic', + 'channel_id': 'UCxrMtFBRkFvQJ_vVM4il08w', + 'channel_url': 'https://www.youtube.com/channel/UCxrMtFBRkFvQJ_vVM4il08w', + 'channel_follower_count': int, + 'duration': 181, + 'view_count': int, + 'like_count': int, + 'age_limit': 0, + 'categories': ['Music'], + 'tags': 'count:7', + 'live_status': 'not_live', + 'playable_in_embed': True, + 'availability': 'public', + 'track': 'The Blues Remembers Everything the Country Forgot', + 'artist': 'md5:c2239ba15430e87c3b971ba450773272', + 'album': 'Brass', + 'release_year': 2020, }, + 'add_ie': ['Youtube'], }, { 'url': 'http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching', 'md5': 'de07e5211d60d4f3a2c3df757ea9f6ab', @@ -60,16 +152,51 @@ class TumblrIE(InfoExtractor): 'uploader_id': '1638622', 'uploader': 'naked-yogi', }, - 'add_ie': ['Vidme'], + # 'add_ie': ['Vidme'], + 'skip': 'dead embedded video host' }, { - 'url': 'http://camdamage.tumblr.com/post/98846056295/', - 'md5': 'a9e0c8371ea1ca306d6554e3fecf50b6', + 'url': 'https://prozdvoices.tumblr.com/post/673201091169681408/what-recording-voice-acting-sounds-like', + 'md5': 'a0063fc8110e6c9afe44065b4ea68177', 'info_dict': { - 'id': '105463834', + 'id': 'eomhW5MLGWA', 'ext': 'mp4', - 'title': 'Cam Damage-HD 720p', - 'uploader': 'John Moyer', - 'uploader_id': 'user32021558', + 'title': 'what recording voice acting sounds like', + 'description': 'md5:1da3faa22d0e0b1d8b50216c284ee798', + 'uploader': 'ProZD', + 'upload_date': '20220112', + 'uploader_id': 'ProZD', + 'uploader_url': 'http://www.youtube.com/user/ProZD', + 'thumbnail': r're:^https?://i.ytimg.com/.*', + 'channel': 'ProZD', + 'channel_id': 'UC6MFZAOHXlKK1FI7V0XQVeA', + 'channel_url': 'https://www.youtube.com/channel/UC6MFZAOHXlKK1FI7V0XQVeA', + 'channel_follower_count': int, + 'duration': 20, + 'view_count': int, + 'like_count': int, + 'age_limit': 0, + 'categories': ['Film & Animation'], + 'tags': [], + 'live_status': 'not_live', + 'playable_in_embed': True, + 'availability': 'public', + }, + 'add_ie': ['Youtube'], + }, { + 'url': 'https://dominustempori.tumblr.com/post/673572712813297664/youtubes-all-right-for-some-pretty-cool', + 'md5': '203e9eb8077e3f45bfaeb4c86c1467b8', + 'info_dict': { + 'id': '87816359', + 'ext': 'mov', + 'title': 'Harold Ramis', + 'description': 'md5:be8e68cbf56ce0785c77f0c6c6dfaf2c', + 'uploader': 'Resolution Productions Group', + 'uploader_id': 'resolutionproductions', + 'uploader_url': 'https://vimeo.com/resolutionproductions', + 'upload_date': '20140227', + 'thumbnail': r're:^https?://i.vimeocdn.com/video/.*', + 'timestamp': 1393523719, + 'duration': 291, }, 'add_ie': ['Vimeo'], }, { @@ -86,127 +213,180 @@ class TumblrIE(InfoExtractor): 'like_count': int, 'comment_count': int, 'repost_count': int, + 'thumbnail': r're:^https?://.*\.jpg', + 'timestamp': 1455940159, + 'view_count': int, }, 'add_ie': ['Vine'], }, { - 'url': 'http://vitasidorkina.tumblr.com/post/134652425014/joskriver-victoriassecret-invisibility-or', - 'md5': '01c12ceb82cbf6b2fe0703aa56b3ad72', + 'url': 'https://silami.tumblr.com/post/84250043974/my-bad-river-flows-in-you-impression-on-maschine', + 'md5': '3c92d7c3d867f14ccbeefa2119022277', 'info_dict': { - 'id': '-7LnUPGlSo', + 'id': 'nYtvtTPuTl', 'ext': 'mp4', - 'title': 
'Video by victoriassecret', - 'description': 'Invisibility or flight…which superpower would YOU choose? #VSFashionShow #ThisOrThat', - 'uploader_id': 'victoriassecret', - 'thumbnail': r're:^https?://.*\.jpg' + 'title': 'Video by silbulterman', + 'description': '#maschine', + 'uploader_id': '242859024', + 'thumbnail': r're:^https?://.*\.jpg', + 'timestamp': 1398801174, + 'like_count': int, + 'uploader': 'Sil', + 'channel': 'silbulterman', + 'comment_count': int, + 'upload_date': '20140429', }, 'add_ie': ['Instagram'], }] - def _real_initialize(self): - self._login() + _providers = { + 'instagram': 'Instagram', + 'vimeo': 'Vimeo', + 'vine': 'Vine', + 'youtube': 'Youtube', + } - def _login(self): - username, password = self._get_login_info() - if username is None: - return + _ACCESS_TOKEN = None + def _initialize_pre_login(self): login_page = self._download_webpage( - self._LOGIN_URL, None, 'Downloading login page') - - login_form = self._hidden_inputs(login_page) - login_form.update({ - 'user[email]': username, - 'user[password]': password - }) - - response, urlh = self._download_webpage_handle( - self._LOGIN_URL, None, 'Logging in', - data=urlencode_postdata(login_form), headers={ - 'Content-Type': 'application/x-www-form-urlencoded', - 'Referer': self._LOGIN_URL, - }) + self._LOGIN_URL, None, 'Downloading login page', fatal=False) + if login_page: + self._ACCESS_TOKEN = self._search_regex( + r'"API_TOKEN":\s*"(\w+)"', login_page, 'API access token', fatal=False) + if not self._ACCESS_TOKEN: + self.report_warning('Failed to get access token; metadata will be missing and some videos may not work') - # Successful login - if '/dashboard' in urlh.geturl(): + def _perform_login(self, username, password): + if not self._ACCESS_TOKEN: return - login_errors = self._parse_json( - self._search_regex( - r'RegistrationForm\.errors\s*=\s*(\[.+?\])\s*;', response, - 'login errors', default='[]'), - None, fatal=False) - if login_errors: - raise ExtractorError( - 'Unable to login: %s' % login_errors[0], expected=True) - - self.report_warning('Login has probably failed') + self._download_json( + self._OAUTH_URL, None, 'Logging in', + data=urlencode_postdata({ + 'password': password, + 'grant_type': 'password', + 'username': username, + }), headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Authorization': f'Bearer {self._ACCESS_TOKEN}', + }, + errnote='Login failed', fatal=False) def _real_extract(self, url): - m_url = self._match_valid_url(url) - video_id = m_url.group('id') - blog = m_url.group('blog_name') + blog, video_id = self._match_valid_url(url).groups() - url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id) + url = f'http://{blog}.tumblr.com/post/{video_id}/' webpage, urlh = self._download_webpage_handle(url, video_id) redirect_url = urlh.geturl() - if 'tumblr.com/safe-mode' in redirect_url or redirect_url.startswith('/safe-mode'): - raise ExtractorError( - 'This Tumblr may contain sensitive media. 
' - 'Disable safe mode in your account settings ' - 'at https://www.tumblr.com/settings/account#safe_mode', - expected=True) + api_only = bool(self._search_regex( + r'(tumblr.com|^)/(safe-mode|login_required|blog/view)', + redirect_url, 'redirect', default=None)) + + if api_only and not self._ACCESS_TOKEN: + raise ExtractorError('Cannot get data for dashboard-only post without access token') + + post_json = {} + if self._ACCESS_TOKEN: + post_json = traverse_obj( + self._download_json( + f'https://www.tumblr.com/api/v2/blog/{blog}/posts/{video_id}/permalink', + video_id, headers={'Authorization': f'Bearer {self._ACCESS_TOKEN}'}, fatal=False), + ('response', 'timeline', 'elements', 0)) or {} + content_json = traverse_obj(post_json, ('trail', 0, 'content'), ('content')) or [] + video_json = next( + (item for item in content_json if item.get('type') == 'video'), {}) + media_json = video_json.get('media') or {} + if api_only and not media_json.get('url') and not video_json.get('url'): + raise ExtractorError('Failed to find video data for dashboard-only post') + + if not media_json.get('url') and video_json.get('url'): + # external video host + return self.url_result( + video_json['url'], + self._providers.get(video_json.get('provider'), 'Generic')) + + video_url = self._og_search_video_url(webpage, default=None) + duration = None + formats = [] + + # iframes can supply duration and sometimes additional formats, so check for one iframe_url = self._search_regex( - r'src=\'(https?://www\.tumblr\.com/video/[^\']+)\'', + fr'src=\'(https?://www\.tumblr\.com/video/{blog}/{video_id}/[^\']+)\'', webpage, 'iframe url', default=None) - if iframe_url is None: - return self.url_result(redirect_url, 'Generic') + if iframe_url: + iframe = self._download_webpage( + iframe_url, video_id, 'Downloading iframe page', + headers={'Referer': redirect_url}) - iframe = self._download_webpage(iframe_url, video_id, 'Downloading iframe page') + options = self._parse_json( + self._search_regex( + r'data-crt-options=(["\'])(?P<options>.+?)\1', iframe, + 'hd video url', default='', group='options'), + video_id, fatal=False) + if options: + duration = int_or_none(options.get('duration')) - duration = None - sources = [] - - sd_url = self._search_regex( - r'<source[^>]+src=(["\'])(?P<url>.+?)\1', iframe, - 'sd video url', default=None, group='url') - if sd_url: - sources.append((sd_url, 'sd')) - - options = self._parse_json( - self._search_regex( - r'data-crt-options=(["\'])(?P<options>.+?)\1', iframe, - 'hd video url', default='', group='options'), - video_id, fatal=False) - if options: - duration = int_or_none(options.get('duration')) - hd_url = options.get('hdUrl') - if hd_url: - sources.append((hd_url, 'hd')) - - formats = [{ - 'url': video_url, - 'ext': 'mp4', - 'format_id': format_id, - 'height': int_or_none(self._search_regex( - r'/(\d{3,4})$', video_url, 'height', default=None)), - 'quality': quality, - } for quality, (video_url, format_id) in enumerate(sources)] + hd_url = options.get('hdUrl') + if hd_url: + # there are multiple formats; extract them + # ignore other sources of width/height data as they may be wrong + sources = [] + sd_url = self._search_regex( + r'<source[^>]+src=(["\'])(?P<url>.+?)\1', iframe, + 'sd video url', default=None, group='url') + if sd_url: + sources.append((sd_url, 'sd')) + sources.append((hd_url, 'hd')) + + formats = [{ + 'url': video_url, + 'format_id': format_id, + 'height': int_or_none(self._search_regex( + r'_(\d+)\.\w+$', video_url, 'height', default=None)), + 'quality': quality, + 
} for quality, (video_url, format_id) in enumerate(sources)] + + if not media_json.get('url') and not video_url and not iframe_url: + # external video host (but we weren't able to figure it out from the api) + iframe_url = self._search_regex( + r'src=["\'](https?://safe\.txmblr\.com/svc/embed/inline/[^"\']+)["\']', + webpage, 'embed iframe url', default=None) + return self.url_result(iframe_url or redirect_url, 'Generic') + formats = formats or [{ + 'url': media_json.get('url') or video_url, + 'width': int_or_none( + media_json.get('width') or self._og_search_property('video:width', webpage, default=None)), + 'height': int_or_none( + media_json.get('height') or self._og_search_property('video:height', webpage, default=None)), + }] self._sort_formats(formats) - # The only place where you can get a title, it's not complete, - # but searching in other places doesn't work for all videos - video_title = self._html_search_regex( - r'(?s)<title>(?P<title>.*?)(?: \| Tumblr)?</title>', - webpage, 'title') + # the url we're extracting from might be an original post or it might be a reblog. + # if it's a reblog, og:description will be the reblogger's comment, not the uploader's. + # content_json is always the op, so if it exists but has no text, there's no description + if content_json: + description = '\n\n'.join(( + item.get('text') for item in content_json if item.get('type') == 'text')) or None + else: + description = self._og_search_description(webpage, default=None) + uploader_id = traverse_obj(post_json, 'reblogged_root_name', 'blog_name') return { 'id': video_id, - 'title': video_title, - 'description': self._og_search_description(webpage, default=None), - 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'title': post_json.get('summary') or (blog if api_only else self._html_search_regex( + r'(?s)<title>(?P<title>.*?)(?: \| Tumblr)?</title>', webpage, 'title')), + 'description': description, + 'thumbnail': (traverse_obj(video_json, ('poster', 0, 'url')) + or self._og_search_thumbnail(webpage, default=None)), + 'uploader_id': uploader_id, + 'uploader_url': f'https://{uploader_id}.tumblr.com/' if uploader_id else None, 'duration': duration, + 'like_count': post_json.get('like_count'), + 'repost_count': post_json.get('reblog_count'), + 'age_limit': {True: 18, False: 0}.get(post_json.get('is_nsfw')), + 'tags': post_json.get('tags'), 'formats': formats, } diff --git a/hypervideo_dl/extractor/tunein.py b/hypervideo_dl/extractor/tunein.py index c7a5f5a..7e51de8 100644 --- a/hypervideo_dl/extractor/tunein.py +++ b/hypervideo_dl/extractor/tunein.py @@ -62,7 +62,7 @@ class TuneInBaseIE(InfoExtractor): return { 'id': content_id, - 'title': self._live_title(title) if is_live else title, + 'title': title, 'formats': formats, 'thumbnail': thumbnail, 'location': location, diff --git a/hypervideo_dl/extractor/turner.py b/hypervideo_dl/extractor/turner.py index 32125bc..519dc32 100644 --- a/hypervideo_dl/extractor/turner.py +++ b/hypervideo_dl/extractor/turner.py @@ -205,7 +205,7 @@ class TurnerBaseIE(AdobePassIE): return { 'id': video_id, - 'title': self._live_title(title) if is_live else title, + 'title': title, 'formats': formats, 'subtitles': subtitles, 'thumbnails': thumbnails, diff --git a/hypervideo_dl/extractor/tv2.py b/hypervideo_dl/extractor/tv2.py index e085153..977da30 100644 --- a/hypervideo_dl/extractor/tv2.py +++ b/hypervideo_dl/extractor/tv2.py @@ -19,7 +19,7 @@ from ..utils import ( class TV2IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tv2\.no/v/(?P<id>\d+)' + 
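As a quick sanity check on the loosened _VALID_URL pattern added on the following line (a standalone sketch, not part of the patch, using only Python's re module):

import re

_VALID_URL = r'https?://(?:www\.)?tv2\.no/v\d*/(?P<id>\d+)'
for test_url in ('http://www.tv2.no/v/916509/', 'http://www.tv2.no/v2/916509'):
    # 916509 is printed for both the bare /v/ path and the versioned /v2/ path
    print(re.match(_VALID_URL, test_url).group('id'))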
_VALID_URL = r'https?://(?:www\.)?tv2\.no/v\d*/(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.tv2.no/v/916509/', 'info_dict': { @@ -33,6 +33,9 @@ class TV2IE(InfoExtractor): 'view_count': int, 'categories': list, }, + }, { + 'url': 'http://www.tv2.no/v2/916509', + 'only_matching': True, }] _PROTOCOLS = ('HLS', 'DASH') _GEO_COUNTRIES = ['NO'] @@ -78,9 +81,7 @@ class TV2IE(InfoExtractor): elif ext == 'm3u8': if not data.get('drmProtected'): formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', - 'm3u8' if is_live else 'm3u8_native', - m3u8_id=format_id, fatal=False)) + video_url, video_id, 'mp4', live=is_live, m3u8_id=format_id, fatal=False)) elif ext == 'mpd': formats.extend(self._extract_mpd_formats( video_url, video_id, format_id, fatal=False)) @@ -103,7 +104,7 @@ class TV2IE(InfoExtractor): return { 'id': video_id, 'url': video_url, - 'title': self._live_title(title) if is_live else title, + 'title': title, 'description': strip_or_none(asset.get('description')), 'thumbnails': thumbnails, 'timestamp': parse_iso8601(asset.get('live_broadcast_time') or asset.get('update_time')), @@ -241,9 +242,7 @@ class KatsomoIE(InfoExtractor): elif ext == 'm3u8': if not data.get('drmProtected'): formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', - 'm3u8' if is_live else 'm3u8_native', - m3u8_id=format_id, fatal=False)) + video_url, video_id, 'mp4', live=is_live, m3u8_id=format_id, fatal=False)) elif ext == 'mpd': formats.extend(self._extract_mpd_formats( video_url, video_id, format_id, fatal=False)) @@ -268,7 +267,7 @@ class KatsomoIE(InfoExtractor): return { 'id': video_id, 'url': video_url, - 'title': self._live_title(title) if is_live else title, + 'title': title, 'description': strip_or_none(asset.get('description')), 'thumbnails': thumbnails, 'timestamp': parse_iso8601(asset.get('createTime')), diff --git a/hypervideo_dl/extractor/tv2dk.py b/hypervideo_dl/extractor/tv2dk.py index 8bd5fd6..ec5cbdf 100644 --- a/hypervideo_dl/extractor/tv2dk.py +++ b/hypervideo_dl/extractor/tv2dk.py @@ -41,8 +41,16 @@ class TV2DKIE(InfoExtractor): 'duration': 1347, 'view_count': int, }, - 'params': { - 'skip_download': True, + 'add_ie': ['Kaltura'], + }, { + 'url': 'https://www.tv2lorry.dk/gadekamp/gadekamp-6-hoejhuse-i-koebenhavn', + 'info_dict': { + 'id': '1_7iwll9n0', + 'ext': 'mp4', + 'upload_date': '20211027', + 'title': 'Gadekamp #6 - Højhuse i København', + 'uploader_id': 'tv2lorry', + 'timestamp': 1635345229, }, 'add_ie': ['Kaltura'], }, { @@ -91,11 +99,14 @@ class TV2DKIE(InfoExtractor): add_entry(partner_id, kaltura_id) if not entries: kaltura_id = self._search_regex( - r'entry_id\s*:\s*["\']([0-9a-z_]+)', webpage, 'kaltura id') + (r'entry_id\s*:\s*["\']([0-9a-z_]+)', + r'\\u002FentryId\\u002F(\w+)\\u002F'), webpage, 'kaltura id') partner_id = self._search_regex( (r'\\u002Fp\\u002F(\d+)\\u002F', r'/p/(\d+)/'), webpage, 'partner id') add_entry(partner_id, kaltura_id) + if len(entries) == 1: + return entries[0] return self.playlist_result(entries) diff --git a/hypervideo_dl/extractor/tver.py b/hypervideo_dl/extractor/tver.py index 943b3eb..9ff3136 100644 --- a/hypervideo_dl/extractor/tver.py +++ b/hypervideo_dl/extractor/tver.py @@ -5,15 +5,16 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_str from ..utils import ( + ExtractorError, int_or_none, remove_start, smuggle_url, - try_get, + traverse_obj, ) class TVerIE(InfoExtractor): - _VALID_URL = 
r'https?://(?:www\.)?tver\.jp/(?P<path>(?:corner|episode|feature)/(?P<id>f?\d+))' + _VALID_URL = r'https?://(?:www\.)?tver\.jp/(?P<path>corner|episode|feature|lp|tokyo2020/video)/(?P<id>[fc]?\d+)' # videos are only available for 7 days _TESTS = [{ 'url': 'https://tver.jp/corner/f0062178', @@ -28,6 +29,15 @@ class TVerIE(InfoExtractor): # subtitle = ' ' 'url': 'https://tver.jp/corner/f0068870', 'only_matching': True, + }, { + 'url': 'https://tver.jp/lp/f0009694', + 'only_matching': True, + }, { + 'url': 'https://tver.jp/lp/c0000239', + 'only_matching': True, + }, { + 'url': 'https://tver.jp/tokyo2020/video/6264525510001', + 'only_matching': True, }] _TOKEN = None BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' @@ -38,13 +48,20 @@ class TVerIE(InfoExtractor): def _real_extract(self, url): path, video_id = self._match_valid_url(url).groups() - main = self._download_json( - 'https://api.tver.jp/v4/' + path, video_id, - query={'token': self._TOKEN})['main'] - p_id = main['publisher_id'] - service = remove_start(main['service'], 'ts_') + if path == 'lp': + webpage = self._download_webpage(url, video_id) + redirect_path = self._search_regex(r'to_href="([^"]+)', webpage, 'redirect path') + path, video_id = self._match_valid_url(f'https://tver.jp{redirect_path}').groups() + api_response = self._download_json(f'https://api.tver.jp/v4/{path}/{video_id}', video_id, query={'token': self._TOKEN}) + p_id = traverse_obj(api_response, ('main', 'publisher_id')) + if not p_id: + error_msg, expected = traverse_obj(api_response, ('episode', 0, 'textbar', 0, ('text', 'longer')), get_all=False), True + if not error_msg: + error_msg, expected = 'Failed to extract publisher ID', False + raise ExtractorError(error_msg, expected=expected) + service = remove_start(traverse_obj(api_response, ('main', 'service')), 'ts_') - r_id = main['reference_id'] + r_id = traverse_obj(api_response, ('main', 'reference_id')) if service not in ('tx', 'russia2018', 'sebare2018live', 'gorin'): r_id = 'ref:' + r_id bc_url = smuggle_url( @@ -53,8 +70,8 @@ class TVerIE(InfoExtractor): return { '_type': 'url_transparent', - 'description': try_get(main, lambda x: x['note'][0]['text'], compat_str), - 'episode_number': int_or_none(try_get(main, lambda x: x['ext']['episode_number'])), + 'description': traverse_obj(api_response, ('main', 'note', 0, 'text'), expected_type=compat_str), + 'episode_number': int_or_none(traverse_obj(api_response, ('main', 'ext', 'episode_number'), expected_type=compat_str)), 'url': bc_url, 'ie_key': 'BrightcoveNew', } diff --git a/hypervideo_dl/extractor/tvnet.py b/hypervideo_dl/extractor/tvnet.py index 4222ff9..aa1e9d9 100644 --- a/hypervideo_dl/extractor/tvnet.py +++ b/hypervideo_dl/extractor/tvnet.py @@ -111,9 +111,7 @@ class TVNetIE(InfoExtractor): continue stream_urls.add(stream_url) formats.extend(self._extract_m3u8_formats( - stream_url, video_id, 'mp4', - entry_protocol='m3u8' if is_live else 'm3u8_native', - m3u8_id='hls', fatal=False)) + stream_url, video_id, 'mp4', live=is_live, m3u8_id='hls', fatal=False)) self._sort_formats(formats) # better support for radio streams @@ -130,9 +128,6 @@ class TVNetIE(InfoExtractor): r'data-image=(["\'])(?P<url>(?:https?:)?//.+?)\1', webpage, 'thumbnail', default=None, group='url')) - if is_live: - title = self._live_title(title) - view_count = int_or_none(self._search_regex( r'(?s)<div[^>]+\bclass=["\'].*?view-count[^>]+>.*?(\d+).*?</div>', webpage, 'view count', default=None)) diff --git 
a/hypervideo_dl/extractor/tvopengr.py b/hypervideo_dl/extractor/tvopengr.py new file mode 100644 index 0000000..a11cdc6 --- /dev/null +++ b/hypervideo_dl/extractor/tvopengr.py @@ -0,0 +1,128 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + get_elements_text_and_html_by_attribute, + scale_thumbnails_to_max_format_width, + unescapeHTML, +) + + +class TVOpenGrBaseIE(InfoExtractor): + def _return_canonical_url(self, url, video_id): + webpage = self._download_webpage(url, video_id) + canonical_url = self._og_search_url(webpage) + title = self._og_search_title(webpage) + return self.url_result(canonical_url, ie=TVOpenGrWatchIE.ie_key(), video_id=video_id, video_title=title) + + +class TVOpenGrWatchIE(TVOpenGrBaseIE): + IE_NAME = 'tvopengr:watch' + IE_DESC = 'tvopen.gr (and ethnos.gr) videos' + _VALID_URL = r'https?://(?P<netloc>(?:www\.)?(?:tvopen|ethnos)\.gr)/watch/(?P<id>\d+)/(?P<slug>[^/]+)' + _API_ENDPOINT = 'https://www.tvopen.gr/templates/data/player' + + _TESTS = [{ + 'url': 'https://www.ethnos.gr/watch/101009/nikoskaprabelosdenexoymekanenanasthenhsemethmethmetallaxhomikron', + 'md5': '8728570e3a72e0f8d9475ba94859fdc1', + 'info_dict': { + 'id': '101009', + 'title': 'md5:51f68773dcb6c70498cd326f45fefdf0', + 'display_id': 'nikoskaprabelosdenexoymekanenanasthenhsemethmethmetallaxhomikron', + 'description': 'md5:78fff49f18fb3effe41b070e5c7685d6', + 'thumbnail': 'https://opentv-static.siliconweb.com/imgHandler/1920/d573ba71-ec5f-43c6-b4cb-d181f327d3a8.jpg', + 'ext': 'mp4', + 'upload_date': '20220109', + 'timestamp': 1641686400, + }, + }, { + 'url': 'https://www.tvopen.gr/watch/100979/se28099agapaomenalla7cepeisodio267cmhthrargiapashskakias', + 'md5': '38f98a1be0c577db4ea2d1b1c0770c48', + 'info_dict': { + 'id': '100979', + 'title': 'md5:e021f3001e16088ee40fa79b20df305b', + 'display_id': 'se28099agapaomenalla7cepeisodio267cmhthrargiapashskakias', + 'description': 'md5:ba17db53954134eb8d625d199e2919fb', + 'thumbnail': 'https://opentv-static.siliconweb.com/imgHandler/1920/9bb71cf1-21da-43a9-9d65-367950fde4e3.jpg', + 'ext': 'mp4', + 'upload_date': '20220108', + 'timestamp': 1641600000, + }, + }] + + def _extract_formats_and_subs(self, response, video_id): + formats, subs = [], {} + for format_id, format_url in response.items(): + if format_id not in ('stream', 'httpstream', 'mpegdash'): + continue + ext = determine_ext(format_url) + if ext == 'm3u8': + formats_, subs_ = self._extract_m3u8_formats_and_subtitles( + format_url, video_id, 'mp4', m3u8_id=format_id, + fatal=False) + elif ext == 'mpd': + formats_, subs_ = self._extract_mpd_formats_and_subtitles( + format_url, video_id, 'mp4', fatal=False) + else: + formats.append({ + 'url': format_url, + 'format_id': format_id, + }) + continue + formats.extend(formats_) + self._merge_subtitles(subs_, target=subs) + self._sort_formats(formats) + return formats, subs + + def _real_extract(self, url): + netloc, video_id, display_id = self._match_valid_url(url).group('netloc', 'id', 'slug') + if netloc.find('tvopen.gr') == -1: + return self._return_canonical_url(url, video_id) + webpage = self._download_webpage(url, video_id) + info = self._search_json_ld(webpage, video_id, expected_type='VideoObject') + info['formats'], info['subtitles'] = self._extract_formats_and_subs( + self._download_json(self._API_ENDPOINT, video_id, query={'cid': video_id}), + video_id) + info['thumbnails'] = scale_thumbnails_to_max_format_width( + info['formats'], 
info['thumbnails'], r'(?<=/imgHandler/)\d+') + description, _html = next(get_elements_text_and_html_by_attribute('class', 'description', webpage)) + if description and _html.startswith('<span '): + info['description'] = description + info['id'] = video_id + info['display_id'] = display_id + return info + + +class TVOpenGrEmbedIE(TVOpenGrBaseIE): + IE_NAME = 'tvopengr:embed' + IE_DESC = 'tvopen.gr embedded videos' + _VALID_URL = r'(?:https?:)?//(?:www\.|cdn\.|)(?:tvopen|ethnos).gr/embed/(?P<id>\d+)' + _EMBED_RE = re.compile(rf'''<iframe[^>]+?src=(?P<_q1>["'])(?P<url>{_VALID_URL})(?P=_q1)''') + + _TESTS = [{ + 'url': 'https://cdn.ethnos.gr/embed/100963', + 'md5': '2da147881f45571d81662d94d086628b', + 'info_dict': { + 'id': '100963', + 'display_id': 'koronoiosapotoysdieythyntestonsxoleionselftestgiaosoysdenbrhkan', + 'title': 'md5:2c71876fadf0cda6043da0da5fca2936', + 'description': 'md5:17482b4432e5ed30eccd93b05d6ea509', + 'thumbnail': 'https://opentv-static.siliconweb.com/imgHandler/1920/5804e07f-799a-4247-a696-33842c94ca37.jpg', + 'ext': 'mp4', + 'upload_date': '20220108', + 'timestamp': 1641600000, + }, + }] + + @classmethod + def _extract_urls(cls, webpage): + for mobj in cls._EMBED_RE.finditer(webpage): + yield unescapeHTML(mobj.group('url')) + + def _real_extract(self, url): + video_id = self._match_id(url) + return self._return_canonical_url(url, video_id) diff --git a/hypervideo_dl/extractor/tvp.py b/hypervideo_dl/extractor/tvp.py index 1e42b33..48e2c6e 100644 --- a/hypervideo_dl/extractor/tvp.py +++ b/hypervideo_dl/extractor/tvp.py @@ -2,35 +2,40 @@ from __future__ import unicode_literals import itertools +import random import re from .common import InfoExtractor from ..utils import ( - clean_html, determine_ext, + dict_get, ExtractorError, - get_element_by_attribute, + int_or_none, + js_to_json, orderedSet, + str_or_none, + try_get, ) class TVPIE(InfoExtractor): IE_NAME = 'tvp' IE_DESC = 'Telewizja Polska' - _VALID_URL = r'https?://[^/]+\.tvp\.(?:pl|info)/(?:video/(?:[^,\s]*,)*|(?:(?!\d+/)[^/]+/)*)(?P<id>\d+)' + _VALID_URL = r'https?://(?:[^/]+\.)?(?:tvp(?:parlament)?\.(?:pl|info)|polandin\.com)/(?:video/(?:[^,\s]*,)*|(?:(?!\d+/)[^/]+/)*)(?P<id>\d+)' _TESTS = [{ + # TVPlayer 2 in js wrapper 'url': 'https://vod.tvp.pl/video/czas-honoru,i-seria-odc-13,194536', - 'md5': 'a21eb0aa862f25414430f15fdfb9e76c', 'info_dict': { 'id': '194536', 'ext': 'mp4', 'title': 'Czas honoru, odc. 13 – Władek', 'description': 'md5:437f48b93558370b031740546b696e24', + 'age_limit': 12, }, }, { + # TVPlayer legacy 'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176', - 'md5': 'b0005b542e5b4de643a9690326ab1257', 'info_dict': { 'id': '17916176', 'ext': 'mp4', @@ -38,16 +43,63 @@ class TVPIE(InfoExtractor): 'description': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata', }, }, { - # page id is not the same as video id(#7799) - 'url': 'https://wiadomosci.tvp.pl/33908820/28092017-1930', - 'md5': '84cd3c8aec4840046e5ab712416b73d0', + # TVPlayer 2 in iframe + 'url': 'https://wiadomosci.tvp.pl/50725617/dzieci-na-sprzedaz-dla-homoseksualistow', 'info_dict': { - 'id': '33908820', + 'id': '50725617', 'ext': 'mp4', - 'title': 'Wiadomości, 28.09.2017, 19:30', - 'description': 'Wydanie główne codziennego serwisu informacyjnego.' 
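For illustration, the client-side rendered TVP pages exercised by the tests below expose their metadata as an inline window.__newsData/__videoData object; a minimal standalone sketch of pulling it out (the helper name is hypothetical, and real pages may need yt-dlp's js_to_json() before parsing, so plain json.loads() is a simplification):

import json
import re
import urllib.request

def fetch_tvp_video_data(page_url):
    # locate the inline state object; same regex as the extractor below
    html = urllib.request.urlopen(page_url).read().decode('utf-8')
    mobj = re.search(r'window\.__(?:news|video)Data\s*=\s*({(?:.|\s)+?})\s*;', html)
    return json.loads(mobj.group(1)) if mobj else None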
+ 'title': 'Dzieci na sprzedaż dla homoseksualistów', + 'description': 'md5:7d318eef04e55ddd9f87a8488ac7d590', + 'age_limit': 12, }, - 'skip': 'HTTP Error 404: Not Found', + }, { + # TVPlayer 2 in client-side rendered website (regional; window.__newsData) + 'url': 'https://warszawa.tvp.pl/25804446/studio-yayo', + 'info_dict': { + 'id': '25804446', + 'ext': 'mp4', + 'title': 'Studio Yayo', + 'upload_date': '20160616', + 'timestamp': 1466075700, + } + }, { + # TVPlayer 2 in client-side rendered website (tvp.info; window.__videoData) + 'url': 'https://www.tvp.info/52880236/09042021-0800', + 'info_dict': { + 'id': '52880236', + 'ext': 'mp4', + 'title': '09.04.2021, 08:00', + }, + }, { + # client-side rendered (regional) program (playlist) page + 'url': 'https://opole.tvp.pl/9660819/rozmowa-dnia', + 'info_dict': { + 'id': '9660819', + 'description': 'Od poniedziałku do piątku o 18:55', + 'title': 'Rozmowa dnia', + }, + 'playlist_mincount': 1800, + 'params': { + 'skip_download': True, + } + }, { + # ABC-specific video embeding + # moved to https://bajkowakraina.tvp.pl/wideo/50981130,teleranek,51027049,zubr,51116450 + 'url': 'https://abc.tvp.pl/48636269/zubry-odc-124', + 'info_dict': { + 'id': '48320456', + 'ext': 'mp4', + 'title': 'Teleranek, Żubr', + }, + 'skip': 'unavailable', + }, { + # yet another vue page + 'url': 'https://jp2.tvp.pl/46925618/filmy', + 'info_dict': { + 'id': '46925618', + 'title': 'Filmy', + }, + 'playlist_mincount': 19, }, { 'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272', 'only_matching': True, @@ -66,137 +118,344 @@ class TVPIE(InfoExtractor): }, { 'url': 'http://www.tvp.info/25511919/trwa-rewolucja-wladza-zdecydowala-sie-na-pogwalcenie-konstytucji', 'only_matching': True, + }, { + 'url': 'https://tvp.info/49193823/teczowe-flagi-na-pomnikach-prokuratura-wszczela-postepowanie-wieszwiecej', + 'only_matching': True, + }, { + 'url': 'https://www.tvpparlament.pl/retransmisje-vod/inne/wizyta-premiera-mateusza-morawieckiego-w-firmie-berotu-sp-z-oo/48857277', + 'only_matching': True, + }, { + 'url': 'https://polandin.com/47942651/pln-10-billion-in-subsidies-transferred-to-companies-pm', + 'only_matching': True, }] + def _parse_vue_website_data(self, webpage, page_id): + website_data = self._search_regex([ + # website - regiony, tvp.info + # directory - jp2.tvp.pl + r'window\.__(?:website|directory)Data\s*=\s*({(?:.|\s)+?});', + ], webpage, 'website data') + if not website_data: + return None + return self._parse_json(website_data, page_id, transform_source=js_to_json) + + def _extract_vue_video(self, video_data, page_id=None): + if isinstance(video_data, str): + video_data = self._parse_json(video_data, page_id, transform_source=js_to_json) + thumbnails = [] + image = video_data.get('image') + if image: + for thumb in (image if isinstance(image, list) else [image]): + thmb_url = str_or_none(thumb.get('url')) + if thmb_url: + thumbnails.append({ + 'url': thmb_url, + }) + is_website = video_data.get('type') == 'website' + if is_website: + url = video_data['url'] + fucked_up_url_parts = re.match(r'https?://vod\.tvp\.pl/(\d+)/([^/?#]+)', url) + if fucked_up_url_parts: + url = f'https://vod.tvp.pl/website/{fucked_up_url_parts.group(2)},{fucked_up_url_parts.group(1)}' + else: + url = 'tvp:' + str_or_none(video_data.get('_id') or page_id) + return { + '_type': 'url_transparent', + 'id': str_or_none(video_data.get('_id') or page_id), + 'url': url, + 'ie_key': 'TVPEmbed' if not is_website else 'TVPWebsite', + 'title': 
str_or_none(video_data.get('title')), + 'description': str_or_none(video_data.get('lead')), + 'timestamp': int_or_none(video_data.get('release_date_long')), + 'duration': int_or_none(video_data.get('duration')), + 'thumbnails': thumbnails, + } + + def _handle_vuejs_page(self, url, webpage, page_id): + # vue client-side rendered sites (all regional pages + tvp.info) + video_data = self._search_regex([ + r'window\.__(?:news|video)Data\s*=\s*({(?:.|\s)+?})\s*;', + ], webpage, 'video data', default=None) + if video_data: + return self._extract_vue_video(video_data, page_id=page_id) + # paged playlists + website_data = self._parse_vue_website_data(webpage, page_id) + if website_data: + entries = self._vuejs_entries(url, website_data, page_id) + + return { + '_type': 'playlist', + 'id': page_id, + 'title': str_or_none(website_data.get('title')), + 'description': str_or_none(website_data.get('lead')), + 'entries': entries, + } + raise ExtractorError('Could not extract video/website data') + + def _vuejs_entries(self, url, website_data, page_id): + + def extract_videos(wd): + if wd.get('latestVideo'): + yield self._extract_vue_video(wd['latestVideo']) + for video in wd.get('videos') or []: + yield self._extract_vue_video(video) + for video in wd.get('items') or []: + yield self._extract_vue_video(video) + + yield from extract_videos(website_data) + + if website_data.get('items_total_count') > website_data.get('items_per_page'): + for page in itertools.count(2): + page_website_data = self._parse_vue_website_data( + self._download_webpage(url, page_id, note='Downloading page #%d' % page, + query={'page': page}), + page_id) + if not page_website_data.get('videos') and not page_website_data.get('items'): + break + yield from extract_videos(page_website_data) + def _real_extract(self, url): page_id = self._match_id(url) - webpage = self._download_webpage(url, page_id) + webpage, urlh = self._download_webpage_handle(url, page_id) + + # The URL may redirect to a VOD + # example: https://vod.tvp.pl/48463890/wadowickie-spotkania-z-janem-pawlem-ii + if TVPWebsiteIE.suitable(urlh.url): + return self.url_result(urlh.url, ie=TVPWebsiteIE.ie_key(), video_id=page_id) + + if re.search( + r'window\.__(?:video|news|website|directory)Data\s*=', + webpage): + return self._handle_vuejs_page(url, webpage, page_id) + + # classic server-side rendered sites video_id = self._search_regex([ + r'<iframe[^>]+src="[^"]*?embed\.php\?(?:[^&]+&)*ID=(\d+)', r'<iframe[^>]+src="[^"]*?object_id=(\d+)', r"object_id\s*:\s*'(\d+)'", - r'data-video-id="(\d+)"'], webpage, 'video id', default=page_id) + r'data-video-id="(\d+)"', + + # abc.tvp.pl - somehow there are more than one video IDs that seem to be the same video? 
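A standalone sketch of the vod.tvp.pl URL canonicalisation performed by _extract_vue_video above (the helper name is hypothetical; the regex and target format are taken from the patch):

import re

def canonicalize_vod_url(url):
    # https://vod.tvp.pl/48463890/some-slug -> https://vod.tvp.pl/website/some-slug,48463890
    mobj = re.match(r'https?://vod\.tvp\.pl/(\d+)/([^/?#]+)', url)
    if not mobj:
        return url
    return f'https://vod.tvp.pl/website/{mobj.group(2)},{mobj.group(1)}'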
+ # the first one is referenced to as "copyid", and seems to be unused by the website + r'<script>\s*tvpabc\.video\.init\(\s*\d+,\s*(\d+)\s*\)\s*</script>', + ], webpage, 'video id', default=page_id) return { '_type': 'url_transparent', 'url': 'tvp:' + video_id, 'description': self._og_search_description( - webpage, default=None) or self._html_search_meta( - 'description', webpage, default=None), + webpage, default=None) or (self._html_search_meta( + 'description', webpage, default=None) + if '//s.tvp.pl/files/portal/v' in webpage else None), 'thumbnail': self._og_search_thumbnail(webpage, default=None), 'ie_key': 'TVPEmbed', } +class TVPStreamIE(InfoExtractor): + IE_NAME = 'tvp:stream' + _VALID_URL = r'(?:tvpstream:|https?://tvpstream\.vod\.tvp\.pl/(?:\?(?:[^&]+[&;])*channel_id=)?)(?P<id>\d*)' + _TESTS = [{ + # untestable as "video" id changes many times across a day + 'url': 'https://tvpstream.vod.tvp.pl/?channel_id=1455', + 'only_matching': True, + }, { + 'url': 'tvpstream:39821455', + 'only_matching': True, + }, { + # the default stream when you provide no channel_id, most probably TVP Info + 'url': 'tvpstream:', + 'only_matching': True, + }, { + 'url': 'https://tvpstream.vod.tvp.pl/', + 'only_matching': True, + }] + + _PLAYER_BOX_RE = r'<div\s[^>]*id\s*=\s*["\']?tvp_player_box["\']?[^>]+data-%s-id\s*=\s*["\']?(\d+)' + _BUTTON_RE = r'<div\s[^>]*data-channel-id=["\']?%s["\']?[^>]*\sdata-title=(?:"([^"]*)"|\'([^\']*)\')[^>]*\sdata-stationname=(?:"([^"]*)"|\'([^\']*)\')' + + def _real_extract(self, url): + channel_id = self._match_id(url) + channel_url = self._proto_relative_url('//tvpstream.vod.tvp.pl/?channel_id=%s' % channel_id or 'default') + webpage = self._download_webpage(channel_url, channel_id, 'Downloading channel webpage') + if not channel_id: + channel_id = self._search_regex(self._PLAYER_BOX_RE % 'channel', + webpage, 'default channel id') + video_id = self._search_regex(self._PLAYER_BOX_RE % 'video', + webpage, 'video id') + audition_title, station_name = self._search_regex( + self._BUTTON_RE % (re.escape(channel_id)), webpage, + 'audition title and station name', + group=(1, 2)) + return { + '_type': 'url_transparent', + 'id': channel_id, + 'url': 'tvp:%s' % video_id, + 'title': audition_title, + 'alt_title': station_name, + 'is_live': True, + 'ie_key': 'TVPEmbed', + } + + class TVPEmbedIE(InfoExtractor): IE_NAME = 'tvp:embed' IE_DESC = 'Telewizja Polska' - _VALID_URL = r'(?:tvp:|https?://[^/]+\.tvp\.(?:pl|info)/sess/tvplayer\.php\?.*?object_id=)(?P<id>\d+)' + _VALID_URL = r'''(?x) + (?: + tvp: + |https?:// + (?:[^/]+\.)? + (?:tvp(?:parlament)?\.pl|tvp\.info|polandin\.com)/ + (?:sess/ + (?:tvplayer\.php\?.*?object_id + |TVPlayer2/(?:embed|api)\.php\?.*[Ii][Dd]) + |shared/details\.php\?.*?object_id) + =) + (?P<id>\d+) + ''' _TESTS = [{ 'url': 'tvp:194536', - 'md5': 'a21eb0aa862f25414430f15fdfb9e76c', 'info_dict': { 'id': '194536', 'ext': 'mp4', 'title': 'Czas honoru, odc. 
13 – Władek', + 'description': 'md5:76649d2014f65c99477be17f23a4dead', + 'age_limit': 12, }, }, { - # not available - 'url': 'http://www.tvp.pl/sess/tvplayer.php?object_id=22670268', - 'md5': '8c9cd59d16edabf39331f93bf8a766c7', + 'url': 'https://www.tvp.pl/sess/tvplayer.php?object_id=51247504&autoplay=false', 'info_dict': { - 'id': '22670268', + 'id': '51247504', 'ext': 'mp4', - 'title': 'Panorama, 07.12.2015, 15:40', + 'title': 'Razmova 091220', }, - 'skip': 'Transmisja została zakończona lub materiał niedostępny', }, { - 'url': 'tvp:22670268', + # TVPlayer2 embed URL + 'url': 'https://tvp.info/sess/TVPlayer2/embed.php?ID=50595757', + 'only_matching': True, + }, { + 'url': 'https://wiadomosci.tvp.pl/sess/TVPlayer2/api.php?id=51233452', + 'only_matching': True, + }, { + # pulsembed on dziennik.pl + 'url': 'https://www.tvp.pl/shared/details.php?copy_id=52205981&object_id=52204505&autoplay=false&is_muted=false&allowfullscreen=true&template=external-embed/video/iframe-video.html', 'only_matching': True, }] + @staticmethod + def _extract_urls(webpage, **kw): + return [m.group('embed') for m in re.finditer( + r'(?x)<iframe[^>]+?src=(["\'])(?P<embed>%s)' % TVPEmbedIE._VALID_URL[4:], + webpage)] + def _real_extract(self, url): video_id = self._match_id(url) + # it could be anything that is a valid JS function name + callback = random.choice(( + 'jebac_pis', + 'jebacpis', + 'ziobro', + 'sasin70', + 'sasin_przejebal_70_milionow_PLN', + 'tvp_is_a_state_propaganda_service', + )) + webpage = self._download_webpage( - 'http://www.tvp.pl/sess/tvplayer.php?object_id=%s' % video_id, video_id) - - error = self._html_search_regex( - r'(?s)<p[^>]+\bclass=["\']notAvailable__text["\'][^>]*>(.+?)</p>', - webpage, 'error', default=None) or clean_html( - get_element_by_attribute('class', 'msg error', webpage)) - if error: - raise ExtractorError('%s said: %s' % ( - self.IE_NAME, clean_html(error)), expected=True) - - title = self._search_regex( - r'name\s*:\s*([\'"])Title\1\s*,\s*value\s*:\s*\1(?P<title>.+?)\1', - webpage, 'title', group='title') - series_title = self._search_regex( - r'name\s*:\s*([\'"])SeriesTitle\1\s*,\s*value\s*:\s*\1(?P<series>.+?)\1', - webpage, 'series', group='series', default=None) - if series_title: - title = '%s, %s' % (series_title, title) - - thumbnail = self._search_regex( - r"poster\s*:\s*'([^']+)'", webpage, 'thumbnail', default=None) - - video_url = self._search_regex( - r'0:{src:([\'"])(?P<url>.*?)\1', webpage, - 'formats', group='url', default=None) - if not video_url or 'material_niedostepny.mp4' in video_url: - video_url = self._download_json( - 'http://www.tvp.pl/pub/stat/videofileinfo?video_id=%s' % video_id, - video_id)['video_url'] + ('https://www.tvp.pl/sess/TVPlayer2/api.php?id=%s' + + '&@method=getTvpConfig&@callback=%s') % (video_id, callback), video_id) + + # stripping JSONP padding + datastr = webpage[15 + len(callback):-3] + if datastr.startswith('null,'): + error = self._parse_json(datastr[5:], video_id) + raise ExtractorError(error[0]['desc']) + + content = self._parse_json(datastr, video_id)['content'] + info = content['info'] + is_live = try_get(info, lambda x: x['isLive'], bool) formats = [] - video_url_base = self._search_regex( - r'(https?://.+?/video)(?:\.(?:ism|f4m|m3u8)|-\d+\.mp4)', - video_url, 'video base url', default=None) - if video_url_base: - # TODO: <Group> found instead of <AdaptationSet> in MPD manifest. - # It's not mentioned in MPEG-DASH standard. Figure that out. 
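The TVPlayer2 api.php endpoint above answers with JSONP, which the extractor strips by fixed offsets (webpage[15 + len(callback):-3]); a generic unwrapping helper might look like this sketch (an assumption, not TVP's exact padding):

import json
import re

def unwrap_jsonp(payload, callback):
    # accept payloads shaped like `callback(<json>);` and return the decoded body
    mobj = re.search(rf'{re.escape(callback)}\((.*)\)\s*;?\s*$', payload, re.DOTALL)
    return json.loads(mobj.group(1)) if mobj else None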
- # formats.extend(self._extract_mpd_formats( - # video_url_base + '.ism/video.mpd', - # video_id, mpd_id='dash', fatal=False)) - formats.extend(self._extract_ism_formats( - video_url_base + '.ism/Manifest', - video_id, 'mss', fatal=False)) - formats.extend(self._extract_f4m_formats( - video_url_base + '.ism/video.f4m', - video_id, f4m_id='hds', fatal=False)) - m3u8_formats = self._extract_m3u8_formats( - video_url_base + '.ism/video.m3u8', video_id, - 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) - self._sort_formats(m3u8_formats) - m3u8_formats = list(filter( - lambda f: f.get('vcodec') != 'none', m3u8_formats)) - formats.extend(m3u8_formats) - for i, m3u8_format in enumerate(m3u8_formats, 2): - http_url = '%s-%d.mp4' % (video_url_base, i) - if self._is_valid_url(http_url, video_id): - f = m3u8_format.copy() - f.update({ - 'url': http_url, - 'format_id': f['format_id'].replace('hls', 'http'), - 'protocol': 'http', - }) - formats.append(f) - else: - formats = [{ - 'format_id': 'direct', - 'url': video_url, - 'ext': determine_ext(video_url, 'mp4'), - }] + for file in content['files']: + video_url = file.get('url') + if not video_url: + continue + if video_url.endswith('.m3u8'): + formats.extend(self._extract_m3u8_formats(video_url, video_id, m3u8_id='hls', fatal=False, live=is_live)) + elif video_url.endswith('.mpd'): + if is_live: + # doesn't work with either ffmpeg or native downloader + continue + formats.extend(self._extract_mpd_formats(video_url, video_id, mpd_id='dash', fatal=False)) + elif video_url.endswith('.f4m'): + formats.extend(self._extract_f4m_formats(video_url, video_id, f4m_id='hds', fatal=False)) + elif video_url.endswith('.ism/manifest'): + formats.extend(self._extract_ism_formats(video_url, video_id, ism_id='mss', fatal=False)) + else: + # mp4, wmv or something + quality = file.get('quality', {}) + formats.append({ + 'format_id': 'direct', + 'url': video_url, + 'ext': determine_ext(video_url, file['type']), + 'fps': int_or_none(quality.get('fps')), + 'tbr': int_or_none(quality.get('bitrate')), + 'width': int_or_none(quality.get('width')), + 'height': int_or_none(quality.get('height')), + }) self._sort_formats(formats) - return { + title = dict_get(info, ('subtitle', 'title', 'seoTitle')) + description = dict_get(info, ('description', 'seoDescription')) + thumbnails = [] + for thumb in content.get('posters') or (): + thumb_url = thumb.get('src') + if not thumb_url or '{width}' in thumb_url or '{height}' in thumb_url: + continue + thumbnails.append({ + 'url': thumb.get('src'), + 'width': thumb.get('width'), + 'height': thumb.get('height'), + }) + age_limit = try_get(info, lambda x: x['ageGroup']['minAge'], int) + if age_limit == 1: + age_limit = 0 + duration = try_get(info, lambda x: x['duration'], int) if not is_live else None + + subtitles = {} + for sub in content.get('subtitles') or []: + if not sub.get('url'): + continue + subtitles.setdefault(sub['lang'], []).append({ + 'url': sub['url'], + 'ext': sub.get('type'), + }) + + info_dict = { 'id': video_id, 'title': title, - 'thumbnail': thumbnail, + 'description': description, + 'thumbnails': thumbnails, + 'age_limit': age_limit, + 'is_live': is_live, + 'duration': duration, 'formats': formats, + 'subtitles': subtitles, } + # vod.tvp.pl + if info.get('vortalName') == 'vod': + info_dict.update({ + 'title': '%s, %s' % (info.get('title'), info.get('subtitle')), + 'series': info.get('title'), + 'season': info.get('season'), + 'episode_number': info.get('episode'), + }) + + return info_dict + class 
TVPWebsiteIE(InfoExtractor): IE_NAME = 'tvp:series' @@ -204,18 +463,20 @@ class TVPWebsiteIE(InfoExtractor): _TESTS = [{ # series - 'url': 'https://vod.tvp.pl/website/lzy-cennet,38678312/video', + 'url': 'https://vod.tvp.pl/website/wspaniale-stulecie,17069012/video', 'info_dict': { - 'id': '38678312', + 'id': '17069012', }, - 'playlist_count': 115, + 'playlist_count': 312, }, { # film - 'url': 'https://vod.tvp.pl/website/gloria,35139666', + 'url': 'https://vod.tvp.pl/website/krzysztof-krawczyk-cale-moje-zycie,51374466', 'info_dict': { - 'id': '36637049', + 'id': '51374509', 'ext': 'mp4', - 'title': 'Gloria, Gloria', + 'title': 'Krzysztof Krawczyk – całe moje życie, Krzysztof Krawczyk – całe moje życie', + 'description': 'md5:2e80823f00f5fc263555482f76f8fa42', + 'age_limit': 12, }, 'params': { 'skip_download': True, diff --git a/hypervideo_dl/extractor/tvplay.py b/hypervideo_dl/extractor/tvplay.py index fbafb41..b5dbc55 100644 --- a/hypervideo_dl/extractor/tvplay.py +++ b/hypervideo_dl/extractor/tvplay.py @@ -12,9 +12,9 @@ from ..utils import ( determine_ext, ExtractorError, int_or_none, - parse_duration, parse_iso8601, qualities, + traverse_obj, try_get, update_url_query, url_or_none, @@ -369,7 +369,6 @@ class ViafreeIE(InfoExtractor): 'upload_date': '20201217' }, 'params': { - 'format': 'bestvideo', 'skip_download': True } }, { @@ -432,77 +431,96 @@ class ViafreeIE(InfoExtractor): class TVPlayHomeIE(InfoExtractor): - _VALID_URL = r'https?://(?:tv3?)?play\.(?:tv3\.lt|skaties\.lv|tv3\.ee)/(?:[^/]+/)*[^/?#&]+-(?P<id>\d+)' + _VALID_URL = r'''(?x) + https?:// + (?:tv3?)? + play\.(?:tv3|skaties)\.(?P<country>lv|lt|ee)/ + (?P<live>lives/)? + [^?#&]+(?:episode|programme|clip)-(?P<id>\d+) + ''' _TESTS = [{ - 'url': 'https://tvplay.tv3.lt/aferistai-n-7/aferistai-10047125/', + 'url': 'https://play.tv3.lt/series/gauju-karai-karveliai,serial-2343791/serija-8,episode-2343828', 'info_dict': { - 'id': '366367', + 'id': '2343828', 'ext': 'mp4', - 'title': 'Aferistai', - 'description': 'Aferistai. Kalėdinė pasaka.', - 'series': 'Aferistai [N-7]', - 'season': '1 sezonas', + 'title': 'Gaujų karai. Karveliai (2021) | S01E08: Serija 8', + 'description': 'md5:f6fcfbb236429f05531131640dfa7c81', + 'duration': 2710, + 'season': 'Gaujų karai. 
Karveliai', 'season_number': 1, - 'duration': 464, - 'timestamp': 1394209658, - 'upload_date': '20140307', - 'age_limit': 18, + 'release_year': 2021, + 'episode': 'Serija 8', + 'episode_number': 8, }, 'params': { - 'skip_download': True, + 'skip_download': 'm3u8', }, }, { - 'url': 'https://tvplay.skaties.lv/vinas-melo-labak/vinas-melo-labak-10280317/', - 'only_matching': True, + 'url': 'https://play.tv3.lt/series/moterys-meluoja-geriau-n-7,serial-2574652/serija-25,episode-3284937', + 'info_dict': { + 'id': '3284937', + 'ext': 'mp4', + 'season': 'Moterys meluoja geriau [N-7]', + 'season_number': 14, + 'release_year': 2021, + 'episode': 'Serija 25', + 'episode_number': 25, + 'title': 'Moterys meluoja geriau [N-7] (2021) | S14|E25: Serija 25', + 'description': 'md5:c6926e9710f1a126f028fbe121eddb79', + 'duration': 2440, + }, + 'skip': '404' }, { - 'url': 'https://tvplay.tv3.ee/cool-d-ga-mehhikosse/cool-d-ga-mehhikosse-10044354/', + 'url': 'https://play.tv3.lt/lives/tv6-lt,live-2838694/optibet-a-lygos-rungtynes-marijampoles-suduva--vilniaus-riteriai,programme-3422014', 'only_matching': True, }, { - 'url': 'https://play.tv3.lt/aferistai-10047125', + 'url': 'https://tv3play.skaties.lv/series/women-lie-better-lv,serial-1024464/women-lie-better-lv,episode-1038762', 'only_matching': True, }, { - 'url': 'https://tv3play.skaties.lv/vinas-melo-labak-10280317', + 'url': 'https://play.tv3.ee/series/_,serial-2654462/_,episode-2654474', 'only_matching': True, }, { - 'url': 'https://play.tv3.ee/cool-d-ga-mehhikosse-10044354', + 'url': 'https://tv3play.skaties.lv/clips/tv3-zinas-valsti-lidz-15novembrim-bus-majsede,clip-3464509', 'only_matching': True, }] def _real_extract(self, url): - video_id = self._match_id(url) + country, is_live, video_id = self._match_valid_url(url).groups() - asset = self._download_json( - urljoin(url, '/sb/public/asset/' + video_id), video_id) + api_path = 'lives/programmes' if is_live else 'vods' + data = self._download_json( + urljoin(url, f'/api/products/{api_path}/{video_id}?platform=BROWSER&lang={country.upper()}'), + video_id) - m3u8_url = asset['movie']['contentUrl'] - video_id = asset['assetId'] - asset_title = asset['title'] - title = asset_title['title'] - - formats = self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls') + video_type = 'CATCHUP' if is_live else 'MOVIE' + stream_id = data['programRecordingId'] if is_live else video_id + stream = self._download_json( + urljoin(url, f'/api/products/{stream_id}/videos/playlist?videoType={video_type}&platform=BROWSER'), video_id) + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + stream['sources']['HLS'][0]['src'], video_id, 'mp4', 'm3u8_native', m3u8_id='hls') self._sort_formats(formats) - thumbnails = None - image_url = asset.get('imageUrl') - if image_url: - thumbnails = [{ - 'url': urljoin(url, image_url), - 'ext': 'jpg', - }] - - metadata = asset.get('metadata') or {} + thumbnails = set(traverse_obj( + data, (('galary', 'images', 'artworks'), ..., ..., ('miniUrl', 'mainUrl')), expected_type=url_or_none)) return { 'id': video_id, - 'title': title, - 'description': asset_title.get('summaryLong') or asset_title.get('summaryShort'), - 'thumbnails': thumbnails, - 'duration': parse_duration(asset_title.get('runTime')), - 'series': asset.get('tvSeriesTitle'), - 'season': asset.get('tvSeasonTitle'), - 'season_number': int_or_none(metadata.get('seasonNumber')), - 'episode': asset_title.get('titleBrief'), - 'episode_number': int_or_none(metadata.get('episodeNumber')), + 'title': 
self._resolve_title(data), + 'description': traverse_obj(data, 'description', 'lead'), + 'duration': int_or_none(data.get('duration')), + 'season': traverse_obj(data, ('season', 'serial', 'title')), + 'season_number': int_or_none(traverse_obj(data, ('season', 'number'))), + 'episode': data.get('title'), + 'episode_number': int_or_none(data.get('episode')), + 'release_year': int_or_none(traverse_obj(data, ('season', 'serial', 'year'))), + 'thumbnails': [{'url': url, 'ext': 'jpg'} for url in thumbnails], 'formats': formats, + 'subtitles': subtitles, } + + @staticmethod + def _resolve_title(data): + return try_get(data, lambda x: ( + f'{data["season"]["serial"]["title"]} ({data["season"]["serial"]["year"]}) | ' + f'S{data["season"]["number"]:02d}E{data["episode"]:02d}: {data["title"]}' + )) or data.get('title') diff --git a/hypervideo_dl/extractor/tvplayer.py b/hypervideo_dl/extractor/tvplayer.py index 8f8686a..5970596 100644 --- a/hypervideo_dl/extractor/tvplayer.py +++ b/hypervideo_dl/extractor/tvplayer.py @@ -80,7 +80,7 @@ class TVPlayerIE(InfoExtractor): return { 'id': resource_id, 'display_id': display_id, - 'title': self._live_title(title), + 'title': title, 'formats': formats, 'is_live': True, } diff --git a/hypervideo_dl/extractor/twitcasting.py b/hypervideo_dl/extractor/twitcasting.py index 3acf1b1..5c4d26c 100644 --- a/hypervideo_dl/extractor/twitcasting.py +++ b/hypervideo_dl/extractor/twitcasting.py @@ -8,22 +8,27 @@ from .common import InfoExtractor from ..downloader.websocket import has_websockets from ..utils import ( clean_html, + ExtractorError, float_or_none, get_element_by_class, get_element_by_id, parse_duration, qualities, str_to_int, + traverse_obj, try_get, unified_timestamp, urlencode_postdata, urljoin, - ExtractorError, ) class TwitCastingIE(InfoExtractor): _VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P<uploader_id>[^/]+)/(?:movie|twplayer)/(?P<id>\d+)' + _M3U8_HEADERS = { + 'Origin': 'https://twitcasting.tv', + 'Referer': 'https://twitcasting.tv/', + } _TESTS = [{ 'url': 'https://twitcasting.tv/ivetesangalo/movie/2357609', 'md5': '745243cad58c4681dc752490f7540d7f', @@ -60,6 +65,16 @@ class TwitCastingIE(InfoExtractor): 'skip_download': True, 'videopassword': 'abc', }, + }, { + 'note': 'archive is split in 2 parts', + 'url': 'https://twitcasting.tv/loft_heaven/movie/685979292', + 'info_dict': { + 'id': '685979292', + 'ext': 'mp4', + 'title': '南波一海のhear_here “ナタリー望月哲さんに聞く編集と「渋谷系狂騒曲」”', + 'duration': 6964.599334, + }, + 'playlist_mincount': 2, }] def _real_extract(self, url): @@ -70,66 +85,49 @@ class TwitCastingIE(InfoExtractor): if video_password: request_data = urlencode_postdata({ 'password': video_password, - }) - webpage = self._download_webpage( + }, encoding='utf-8') + webpage, urlh = self._download_webpage_handle( url, video_id, data=request_data, headers={'Origin': 'https://twitcasting.tv'}) + if urlh.geturl() != url and request_data: + webpage = self._download_webpage( + urlh.geturl(), video_id, data=request_data, + headers={'Origin': 'https://twitcasting.tv'}, + note='Retrying authentication') + # has to check here as the first request can contain password input form even if the password is correct + if re.search(r'<form\s+method="POST">\s*<input\s+[^>]+?name="password"', webpage): + raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True) title = (clean_html(get_element_by_id('movietitle', webpage)) or self._html_search_meta(['og:title', 'twitter:title'], webpage, fatal=True)) - video_js_data = 
{} - m3u8_url = self._search_regex( - r'data-movie-url=(["\'])(?P<url>(?:(?!\1).)+)\1', - webpage, 'm3u8 url', group='url', default=None) - if not m3u8_url: - video_js_data = self._parse_json(self._search_regex( - r'data-movie-playlist=(["\'])(?P<url>(?:(?!\1).)+)', - webpage, 'movie playlist', group='url', default='[{}]'), video_id) - if isinstance(video_js_data, dict): - video_js_data = list(video_js_data.values())[0] - video_js_data = video_js_data[0] - m3u8_url = try_get(video_js_data, lambda x: x['source']['url']) - - stream_server_data = self._download_json( - 'https://twitcasting.tv/streamserver.php?target=%s&mode=client' % uploader_id, video_id, - 'Downloading live info', fatal=False) - - is_live = 'data-status="online"' in webpage - formats = [] - if is_live and not m3u8_url: - m3u8_url = 'https://twitcasting.tv/%s/metastream.m3u8' % uploader_id - if is_live and has_websockets and stream_server_data: - qq = qualities(['base', 'mobilesource', 'main']) - for mode, ws_url in stream_server_data['llfmp4']['streams'].items(): - formats.append({ - 'url': ws_url, - 'format_id': 'ws-%s' % mode, - 'ext': 'mp4', - 'quality': qq(mode), - 'protocol': 'websocket_frag', # TwitCasting simply sends moof atom directly over WS - }) + video_js_data = try_get( + webpage, + lambda x: self._parse_json(self._search_regex( + r'data-movie-playlist=\'([^\']+?)\'', + x, 'movie playlist', default=None), video_id)['2'], list) - thumbnail = video_js_data.get('thumbnailUrl') or self._og_search_thumbnail(webpage) + thumbnail = traverse_obj(video_js_data, (0, 'thumbnailUrl')) or self._og_search_thumbnail(webpage) description = clean_html(get_element_by_id( 'authorcomment', webpage)) or self._html_search_meta( ['description', 'og:description', 'twitter:description'], webpage) - duration = float_or_none(video_js_data.get( - 'duration'), 1000) or parse_duration(clean_html( - get_element_by_class('tw-player-duration-time', webpage))) + duration = (try_get(video_js_data, lambda x: sum(float_or_none(y.get('duration')) for y in x) / 1000) + or parse_duration(clean_html(get_element_by_class('tw-player-duration-time', webpage)))) view_count = str_to_int(self._search_regex( - r'Total\s*:\s*([\d,]+)\s*Views', webpage, 'views', None)) + (r'Total\s*:\s*([\d,]+)\s*Views', r'総視聴者\s*:\s*([\d,]+)\s*</'), webpage, 'views', None)) timestamp = unified_timestamp(self._search_regex( r'data-toggle="true"[^>]+datetime="([^"]+)"', webpage, 'datetime', None)) - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', live=is_live)) - self._sort_formats(formats) + stream_server_data = self._download_json( + 'https://twitcasting.tv/streamserver.php?target=%s&mode=client' % uploader_id, video_id, + 'Downloading live info', fatal=False) - return { - 'id': video_id, + is_live = 'data-status="online"' in webpage + if not traverse_obj(stream_server_data, 'llfmp4') and is_live: + self.raise_login_required(method='cookies') + + base_dict = { 'title': title, 'description': description, 'thumbnail': thumbnail, @@ -137,10 +135,75 @@ class TwitCastingIE(InfoExtractor): 'uploader_id': uploader_id, 'duration': duration, 'view_count': view_count, - 'formats': formats, 'is_live': is_live, } + def find_dmu(x): + data_movie_url = self._search_regex( + r'data-movie-url=(["\'])(?P<url>(?:(?!\1).)+)\1', + x, 'm3u8 url', group='url', default=None) + if data_movie_url: + return [data_movie_url] + + m3u8_urls = (try_get(webpage, find_dmu, list) + or traverse_obj(video_js_data, (..., 'source', 'url')) + or 
([f'https://twitcasting.tv/{uploader_id}/metastream.m3u8'] if is_live else None)) + if not m3u8_urls: + raise ExtractorError('Failed to get m3u8 playlist') + + if is_live: + m3u8_url = m3u8_urls[0] + formats = self._extract_m3u8_formats( + m3u8_url, video_id, ext='mp4', m3u8_id='hls', + live=True, headers=self._M3U8_HEADERS) + + if traverse_obj(stream_server_data, ('hls', 'source')): + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, ext='mp4', m3u8_id='source', + live=True, query={'mode': 'source'}, + note='Downloading source quality m3u8', + headers=self._M3U8_HEADERS, fatal=False)) + + if has_websockets: + qq = qualities(['base', 'mobilesource', 'main']) + streams = traverse_obj(stream_server_data, ('llfmp4', 'streams')) or {} + for mode, ws_url in streams.items(): + formats.append({ + 'url': ws_url, + 'format_id': 'ws-%s' % mode, + 'ext': 'mp4', + 'quality': qq(mode), + 'source_preference': -10, + # TwitCasting simply sends moof atom directly over WS + 'protocol': 'websocket_frag', + }) + + self._sort_formats(formats, ('source',)) + + infodict = { + 'formats': formats + } + else: + infodict = { + '_type': 'multi_video', + 'entries': [{ + 'id': f'{video_id}-{num}', + 'url': m3u8_url, + 'ext': 'mp4', + # Requesting the manifests here will cause download to fail. + # So use ffmpeg instead. See: https://github.com/hypervideo/hypervideo/issues/382 + 'protocol': 'm3u8', + 'http_headers': self._M3U8_HEADERS, + **base_dict, + } for (num, m3u8_url) in enumerate(m3u8_urls)], + } + + return { + 'id': video_id, + **base_dict, + **infodict, + } + class TwitCastingLiveIE(InfoExtractor): _VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P<id>[^/]+)/?(?:[#?]|$)' @@ -161,6 +224,17 @@ class TwitCastingLiveIE(InfoExtractor): r'tw-sound-flag-open-link" data-id="(\d+)" style=',), webpage, 'current live ID', default=None) if not current_live: + # fetch unfiltered /show to find running livestreams; we can't get the ID of the password-protected livestream above + webpage = self._download_webpage( + f'https://twitcasting.tv/{uploader_id}/show/', uploader_id, + note='Downloading live history') + is_live = self._search_regex(r'(?s)(<span\s*class="tw-movie-thumbnail-badge"\s*data-status="live">\s*LIVE)', webpage, 'is live?', default=None) + if is_live: + # get the first live; the running live is always listed first + current_live = self._search_regex( + r'(?s)<a\s+class="tw-movie-thumbnail"\s*href="/[^/]+/movie/(?P<video_id>\d+)"\s*>.+?</a>', + webpage, 'current live ID 2', default=None, group='video_id') + if not current_live: raise ExtractorError('The user is not currently live') return self.url_result('https://twitcasting.tv/%s/movie/%s' % (uploader_id, current_live)) diff --git a/hypervideo_dl/extractor/twitch.py b/hypervideo_dl/extractor/twitch.py index be70bee..10de74c 100644 --- a/hypervideo_dl/extractor/twitch.py +++ b/hypervideo_dl/extractor/twitch.py @@ -24,6 +24,8 @@ from ..utils import ( parse_iso8601, parse_qs, qualities, + str_or_none, + traverse_obj, try_get, unified_timestamp, update_url_query, @@ -52,16 +54,10 @@ class TwitchBaseIE(InfoExtractor): 'VideoAccessToken_Clip': '36b89d2507fce29e5ca551df756d27c1cfe079e2609642b4390aa4c35796eb11', 'VideoPreviewOverlay': '3006e77e51b128d838fa4e835723ca4dc9a05c5efd4466c1085215c6e437e65c', 'VideoMetadata': '226edb3e692509f727fd56821f5653c05740242c82b0388883e0c0e75dcbf687', + 'VideoPlayer_ChapterSelectButtonVideo': '8d2793384aac3773beab5e59bd5d6f585aedb923d292800119e03d40cd0f9b41', } - def _real_initialize(self): - self._login() - - def 
_login(self): - username, password = self._get_login_info() - if username is None: - return - + def _perform_login(self, username, password): def fail(message): raise ExtractorError( 'Unable to login. Twitch said: %s' % message, expected=True) @@ -249,6 +245,38 @@ class TwitchVodIE(TwitchBaseIE): }, { 'url': 'https://player.twitch.tv/?video=480452374', 'only_matching': True, + }, { + 'url': 'https://www.twitch.tv/videos/635475444', + 'info_dict': { + 'id': 'v635475444', + 'ext': 'mp4', + 'title': 'Riot Games', + 'duration': 11643, + 'uploader': 'Riot Games', + 'uploader_id': 'riotgames', + 'timestamp': 1590770569, + 'upload_date': '20200529', + 'chapters': [ + { + 'start_time': 0, + 'end_time': 573, + 'title': 'League of Legends' + }, + { + 'start_time': 573, + 'end_time': 3922, + 'title': 'Legends of Runeterra' + }, + { + 'start_time': 3922, + 'end_time': 11643, + 'title': 'Art' + } + ], + }, + 'params': { + 'skip_download': True + } }] def _download_info(self, item_id): @@ -259,16 +287,24 @@ class TwitchVodIE(TwitchBaseIE): 'channelLogin': '', 'videoID': item_id, }, + }, { + 'operationName': 'VideoPlayer_ChapterSelectButtonVideo', + 'variables': { + 'includePrivate': False, + 'videoID': item_id, + }, }], - 'Downloading stream metadata GraphQL')[0]['data'] - video = data.get('video') + 'Downloading stream metadata GraphQL') + + video = traverse_obj(data, (0, 'data', 'video')) + video['moments'] = traverse_obj(data, (1, 'data', 'video', 'moments', 'edges', ..., 'node')) + if video is None: raise ExtractorError( 'Video %s does not exist' % item_id, expected=True) return self._extract_info_gql(video, item_id) - @staticmethod - def _extract_info(info): + def _extract_info(self, info): status = info.get('status') if status == 'recording': is_live = True @@ -302,18 +338,39 @@ class TwitchVodIE(TwitchBaseIE): 'timestamp': parse_iso8601(info.get('recorded_at')), 'view_count': int_or_none(info.get('views')), 'is_live': is_live, + 'was_live': True, } - @staticmethod - def _extract_info_gql(info, item_id): + def _extract_moments(self, info, item_id): + for moment in info.get('moments') or []: + start_time = int_or_none(moment.get('positionMilliseconds'), 1000) + duration = int_or_none(moment.get('durationMilliseconds'), 1000) + name = str_or_none(moment.get('description')) + + if start_time is None or duration is None: + self.report_warning(f'Important chapter information missing for chapter {name}', item_id) + continue + yield { + 'start_time': start_time, + 'end_time': start_time + duration, + 'title': name, + } + + def _extract_info_gql(self, info, item_id): vod_id = info.get('id') or item_id # id backward compatibility for download archives if vod_id[0] != 'v': vod_id = 'v%s' % vod_id thumbnail = url_or_none(info.get('previewThumbnailURL')) + is_live = None if thumbnail: - for p in ('width', 'height'): - thumbnail = thumbnail.replace('{%s}' % p, '0') + if thumbnail.endswith('/404_processing_{width}x{height}.png'): + is_live, thumbnail = True, None + else: + is_live = False + for p in ('width', 'height'): + thumbnail = thumbnail.replace('{%s}' % p, '0') + return { 'id': vod_id, 'title': info.get('title') or 'Untitled Broadcast', @@ -324,6 +381,9 @@ class TwitchVodIE(TwitchBaseIE): 'uploader_id': try_get(info, lambda x: x['owner']['login'], compat_str), 'timestamp': unified_timestamp(info.get('publishedAt')), 'view_count': int_or_none(info.get('viewCount')), + 'chapters': list(self._extract_moments(info, item_id)), + 'is_live': is_live, + 'was_live': True, } def _real_extract(self, url): @@ -836,7 
+896,7 @@ class TwitchStreamIE(TwitchBaseIE): return { 'id': stream_id, 'display_id': channel_name, - 'title': self._live_title(title), + 'title': title, 'description': description, 'thumbnail': thumbnail, 'uploader': uploader, @@ -981,7 +1041,7 @@ class TwitchClipsIE(TwitchBaseIE): 'title': clip.get('title') or video_id, 'formats': formats, 'duration': int_or_none(clip.get('durationSeconds')), - 'views': int_or_none(clip.get('viewCount')), + 'view_count': int_or_none(clip.get('viewCount')), 'timestamp': unified_timestamp(clip.get('createdAt')), 'thumbnails': thumbnails, 'creator': try_get(clip, lambda x: x['broadcaster']['displayName'], compat_str), diff --git a/hypervideo_dl/extractor/twitter.py b/hypervideo_dl/extractor/twitter.py index 485b781..8ccc38e 100644 --- a/hypervideo_dl/extractor/twitter.py +++ b/hypervideo_dl/extractor/twitter.py @@ -13,8 +13,10 @@ from ..compat import ( from ..utils import ( dict_get, ExtractorError, + format_field, float_or_none, int_or_none, + traverse_obj, try_get, strip_or_none, unified_timestamp, @@ -55,7 +57,7 @@ class TwitterBaseIE(InfoExtractor): def _extract_formats_from_vmap_url(self, vmap_url, video_id): vmap_url = url_or_none(vmap_url) if not vmap_url: - return [] + return [], {} vmap_data = self._download_xml(vmap_url, video_id) formats = [] subtitles = {} @@ -88,6 +90,9 @@ class TwitterBaseIE(InfoExtractor): headers = { 'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw', } + token = self._get_cookies(self._API_BASE).get('ct0') + if token: + headers['x-csrf-token'] = token.value if not self._GUEST_TOKEN: self._GUEST_TOKEN = self._download_json( self._API_BASE + 'guest/activate.json', video_id, @@ -468,7 +473,7 @@ class TwitterIE(TwitterBaseIE): 'uploader': uploader, 'timestamp': unified_timestamp(status.get('created_at')), 'uploader_id': uploader_id, - 'uploader_url': 'https://twitter.com/' + uploader_id if uploader_id else None, + 'uploader_url': format_field(uploader_id, template='https://twitter.com/%s'), 'like_count': int_or_none(status.get('favorite_count')), 'repost_count': int_or_none(status.get('retweet_count')), 'comment_count': int_or_none(status.get('reply_count')), @@ -485,7 +490,7 @@ class TwitterIE(TwitterBaseIE): fmts, subs = self._extract_variant_formats(variant, twid) subtitles = self._merge_subtitles(subtitles, subs) formats.extend(fmts) - self._sort_formats(formats) + self._sort_formats(formats, ('res', 'br', 'size', 'proto')) # The codecs of http formats are unknown thumbnails = [] media_url = media.get('media_url_https') or media.get('media_url') @@ -508,7 +513,7 @@ class TwitterIE(TwitterBaseIE): 'duration': float_or_none(video_info.get('duration_millis'), 1000), }) - media = try_get(status, lambda x: x['extended_entities']['media'][0]) + media = traverse_obj(status, ((None, 'quoted_status'), 'extended_entities', 'media', 0), get_all=False) if media and media.get('type') != 'photo': extract_from_video_info(media) else: diff --git a/hypervideo_dl/extractor/udemy.py b/hypervideo_dl/extractor/udemy.py index 74f638e..88b2310 100644 --- a/hypervideo_dl/extractor/udemy.py +++ b/hypervideo_dl/extractor/udemy.py @@ -168,14 +168,7 @@ class UdemyIE(InfoExtractor): self._handle_error(response) return response - def _real_initialize(self): - self._login() - - def _login(self): - username, password = self._get_login_info() - if username is None: - return - + def _perform_login(self, username, password): login_popup = self._download_webpage( 
self._LOGIN_URL, None, 'Downloading login popup') diff --git a/hypervideo_dl/extractor/uol.py b/hypervideo_dl/extractor/uol.py index 4a2a97f..1baee0b 100644 --- a/hypervideo_dl/extractor/uol.py +++ b/hypervideo_dl/extractor/uol.py @@ -95,7 +95,6 @@ class UOLIE(InfoExtractor): if v: query[k] = v f_url = update_url_query(f_url, query) - format_id = format_id if format_id == 'HLS': m3u8_formats = self._extract_m3u8_formats( f_url, media_id, 'mp4', 'm3u8_native', diff --git a/hypervideo_dl/extractor/urplay.py b/hypervideo_dl/extractor/urplay.py index 753ffa4..eb2ab26 100644 --- a/hypervideo_dl/extractor/urplay.py +++ b/hypervideo_dl/extractor/urplay.py @@ -4,7 +4,11 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( dict_get, + ExtractorError, int_or_none, + ISO639Utils, + parse_age_limit, + try_get, unified_timestamp, ) @@ -23,9 +27,10 @@ class URPlayIE(InfoExtractor): 'upload_date': '20171214', 'series': 'UR Samtiden - Livet, universum och rymdens märkliga musik', 'duration': 2269, - 'categories': ['Kultur & historia'], + 'categories': ['Vetenskap & teknik'], 'tags': ['Kritiskt tänkande', 'Vetenskap', 'Vetenskaplig verksamhet'], 'episode': 'Om vetenskap, kritiskt tänkande och motstånd', + 'age_limit': 15, }, }, { 'url': 'https://urskola.se/Produkter/190031-Tripp-Trapp-Trad-Sovkudde', @@ -50,11 +55,16 @@ class URPlayIE(InfoExtractor): video_id = self._match_id(url) url = url.replace('skola.se/Produkter', 'play.se/program') webpage = self._download_webpage(url, video_id) - vid = int(video_id) - accessible_episodes = self._parse_json(self._html_search_regex( - r'data-react-class="routes/Product/components/ProgramContainer/ProgramContainer"[^>]+data-react-props="({.+?})"', - webpage, 'urplayer data'), video_id)['accessibleEpisodes'] - urplayer_data = next(e for e in accessible_episodes if e.get('id') == vid) + urplayer_data = self._search_nextjs_data(webpage, video_id, fatal=False) or {} + if urplayer_data: + urplayer_data = try_get(urplayer_data, lambda x: x['props']['pageProps']['program'], dict) + if not urplayer_data: + raise ExtractorError('Unable to parse __NEXT_DATA__') + else: + accessible_episodes = self._parse_json(self._html_search_regex( + r'data-react-class="routes/Product/components/ProgramContainer/ProgramContainer"[^>]+data-react-props="({.+?})"', + webpage, 'urplayer data'), video_id)['accessibleEpisodes'] + urplayer_data = next(e for e in accessible_episodes if e.get('id') == int_or_none(video_id)) episode = urplayer_data['title'] host = self._download_json('http://streaming-loadbalancer.ur.se/loadbalancer.json', video_id)['redirect'] @@ -72,11 +82,28 @@ class URPlayIE(InfoExtractor): self._sort_formats(formats) subtitles = {} - subs = urplayer_streams.get("sweComplete", {}).get("tt", {}).get("location") - if subs: - subtitles.setdefault('Svenska', []).append({ - 'url': subs, - }) + + def parse_lang_code(code): + "3-character language code or None (utils candidate)" + if code is None: + return + lang = code.lower() + if not ISO639Utils.long2short(lang): + lang = ISO639Utils.short2long(lang) + return lang or None + + for k, v in (urplayer_data['streamingInfo'].get('sweComplete') or {}).items(): + if (k in ('sd', 'hd') or not isinstance(v, dict)): + continue + lang, sttl_url = (v.get(kk) for kk in ('language', 'location', )) + if not sttl_url: + continue + lang = parse_lang_code(lang) + if not lang: + continue + sttl = subtitles.get(lang) or [] + sttl.append({'ext': k, 'url': sttl_url, }) + subtitles[lang] = sttl image = 
urplayer_data.get('image') or {} thumbnails = [] @@ -98,7 +125,6 @@ class URPlayIE(InfoExtractor): return { 'id': video_id, - 'subtitles': subtitles, 'title': '%s : %s' % (series_title, episode) if series_title else episode, 'description': urplayer_data.get('description'), 'thumbnails': thumbnails, @@ -111,4 +137,7 @@ class URPlayIE(InfoExtractor): 'season': series.get('label'), 'episode': episode, 'episode_number': int_or_none(urplayer_data.get('episodeNumber')), + 'age_limit': parse_age_limit(min(try_get(a, lambda x: x['from'], int) or 0 + for a in urplayer_data.get('ageRanges', []))), + 'subtitles': subtitles, } diff --git a/hypervideo_dl/extractor/ustream.py b/hypervideo_dl/extractor/ustream.py index 8b75879..4a7a8f8 100644 --- a/hypervideo_dl/extractor/ustream.py +++ b/hypervideo_dl/extractor/ustream.py @@ -13,6 +13,7 @@ from ..utils import ( ExtractorError, int_or_none, float_or_none, + join_nonempty, mimetype2ext, str_or_none, ) @@ -139,8 +140,8 @@ class UstreamIE(InfoExtractor): content_type = stream['contentType'] kind = content_type.split('/')[0] f = { - 'format_id': '-'.join(filter(None, [ - 'dash', kind, str_or_none(stream.get('bitrate'))])), + 'format_id': join_nonempty( + 'dash', kind, str_or_none(stream.get('bitrate'))), 'protocol': 'http_dash_segments', # TODO: generate a MPD doc for external players? 'url': encode_data_uri(b'<MPD/>', 'text/xml'), diff --git a/hypervideo_dl/extractor/utreon.py b/hypervideo_dl/extractor/utreon.py index 4a25f0c..4986635 100644 --- a/hypervideo_dl/extractor/utreon.py +++ b/hypervideo_dl/extractor/utreon.py @@ -13,7 +13,7 @@ from ..utils import ( class UtreonIE(InfoExtractor): - _VALID_URL = r'(?:https?://)(?:www\.)?utreon.com/v/(?P<id>[a-zA-Z0-9_-]+)' + _VALID_URL = r'https?://(?:www\.)?utreon.com/v/(?P<id>[a-zA-Z0-9_-]+)' _TESTS = [{ 'url': 'https://utreon.com/v/z_I7ikQbuDw', 'info_dict': { diff --git a/hypervideo_dl/extractor/varzesh3.py b/hypervideo_dl/extractor/varzesh3.py index 81313dc..32655b9 100644 --- a/hypervideo_dl/extractor/varzesh3.py +++ b/hypervideo_dl/extractor/varzesh3.py @@ -42,8 +42,7 @@ class Varzesh3IE(InfoExtractor): video_url = self._search_regex( r'<source[^>]+src="([^"]+)"', webpage, 'video url') - title = remove_start(self._html_search_regex( - r'<title>([^<]+)</title>', webpage, 'title'), 'ویدیو ورزش 3 | ') + title = remove_start(self._html_extract_title(webpage), 'ویدیو ورزش 3 | ') description = self._html_search_regex( r'(?s)<div class="matn">(.+?)</div>', diff --git a/hypervideo_dl/extractor/veo.py b/hypervideo_dl/extractor/veo.py index 4e57a52..d87bb5b 100644 --- a/hypervideo_dl/extractor/veo.py +++ b/hypervideo_dl/extractor/veo.py @@ -6,13 +6,14 @@ from .common import InfoExtractor from ..utils import ( int_or_none, mimetype2ext, + str_or_none, unified_timestamp, url_or_none, ) class VeoIE(InfoExtractor): - _VALID_URL = r'https?://app\.veo\.co/matches/(?P<id>[0-9A-Za-z-]+)' + _VALID_URL = r'https?://app\.veo\.co/matches/(?P<id>[0-9A-Za-z-_]+)' _TESTS = [{ 'url': 'https://app.veo.co/matches/20201027-last-period/', @@ -24,7 +25,11 @@ class VeoIE(InfoExtractor): 'upload_date': '20201028', 'timestamp': 1603847208, 'duration': 1916, + 'view_count': int, } + }, { + 'url': 'https://app.veo.co/matches/20220313-2022-03-13_u15m-plsjq-vs-csl/', + 'only_matching': True, }] def _real_extract(self, url): @@ -36,39 +41,41 @@ class VeoIE(InfoExtractor): video_data = self._download_json( 'https://app.veo.co/api/app/matches/%s/videos' % video_id, video_id, 'Downloading video data') - title = metadata.get('title') - thumbnail = 
url_or_none(metadata.get('thumbnail')) - - timestamp = unified_timestamp(metadata.get('created')) - duration = int_or_none(metadata.get('duration')) - view_count = int_or_none(metadata.get('view_count')) - formats = [] for fmt in video_data: - mimetype = fmt.get('mime_type') + mimetype = str_or_none(fmt.get('mime_type')) + format_url = url_or_none(fmt.get('url')) # skip configuration file for panoramic video - if mimetype == 'video/mp2t': + if not format_url or mimetype == 'video/mp2t': continue + height = int_or_none(fmt.get('height')) - bitrate = int_or_none(fmt.get('bit_rate'), scale=1000) - render_type = fmt.get('render_type') + render_type = str_or_none(fmt.get('render_type')) + format_id = f'{render_type}-{height}p' if render_type and height else None + + # Veo returns panoramic video information even if panoramic video is not available. + # e.g. https://app.veo.co/matches/20201027-last-period/ + if render_type == 'panorama': + if not self._is_valid_url(format_url, video_id, format_id): + continue + formats.append({ - 'url': url_or_none(fmt.get('url')), - 'format_id': '%s-%sp' % (render_type, height), + 'url': format_url, + 'format_id': format_id, 'ext': mimetype2ext(mimetype), 'width': int_or_none(fmt.get('width')), 'height': height, - 'vbr': bitrate + 'vbr': int_or_none(fmt.get('bit_rate'), scale=1000), }) self._sort_formats(formats) return { 'id': video_id, - 'title': title, + 'title': str_or_none(metadata.get('title')), 'formats': formats, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'view_count': view_count, - 'duration': duration + 'thumbnail': url_or_none(metadata.get('thumbnail')), + 'timestamp': unified_timestamp(metadata.get('created')), + 'view_count': int_or_none(metadata.get('view_count')), + 'duration': int_or_none(metadata.get('duration')), } diff --git a/hypervideo_dl/extractor/veoh.py b/hypervideo_dl/extractor/veoh.py index 1c44c14..d9afb56 100644 --- a/hypervideo_dl/extractor/veoh.py +++ b/hypervideo_dl/extractor/veoh.py @@ -5,21 +5,30 @@ from ..utils import ( int_or_none, parse_duration, qualities, + try_get ) class VeohIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?veoh\.com/(?:watch|embed|iphone/#_Watch)/(?P<id>(?:v|e|yapi-)[\da-zA-Z]+)' + _VALID_URL = r'https?://(?:www\.)?veoh\.com/(?:watch|videos|embed|iphone/#_Watch)/(?P<id>(?:v|e|yapi-)[\da-zA-Z]+)' _TESTS = [{ 'url': 'http://www.veoh.com/watch/v56314296nk7Zdmz3', - 'md5': '9e7ecc0fd8bbee7a69fe38953aeebd30', + 'md5': '620e68e6a3cff80086df3348426c9ca3', 'info_dict': { 'id': 'v56314296nk7Zdmz3', 'ext': 'mp4', 'title': 'Straight Backs Are Stronger', + 'description': 'md5:203f976279939a6dc664d4001e13f5f4', + 'thumbnail': 're:https://fcache\\.veoh\\.com/file/f/th56314296\\.jpg(\\?.*)?', 'uploader': 'LUMOback', - 'description': 'At LUMOback, we believe straight backs are stronger. The LUMOback Posture & Movement Sensor: It gently vibrates when you slouch, inspiring improved posture and mobility. Use the app to track your data and improve your posture over time. 
', + 'duration': 46, + 'view_count': int, + 'average_rating': int, + 'comment_count': int, + 'age_limit': 0, + 'categories': ['technology_and_gaming'], + 'tags': ['posture', 'posture', 'sensor', 'back', 'pain', 'wearable', 'tech', 'lumo'], }, }, { 'url': 'http://www.veoh.com/embed/v56314296nk7Zdmz3', @@ -51,30 +60,36 @@ class VeohIE(InfoExtractor): }, { 'url': 'http://www.veoh.com/watch/e152215AJxZktGS', 'only_matching': True, - }] - - def _extract_video(self, source): - return { - 'id': source.get('videoId'), - 'title': source.get('title'), - 'description': source.get('description'), - 'thumbnail': source.get('highResImage') or source.get('medResImage'), - 'uploader': source.get('username'), - 'duration': int_or_none(source.get('length')), - 'view_count': int_or_none(source.get('views')), - 'age_limit': 18 if source.get('isMature') == 'true' or source.get('isSexy') == 'true' else 0, - 'formats': self._extract_formats(source), + }, { + 'url': 'https://www.veoh.com/videos/v16374379WA437rMH', + 'md5': 'cceb73f3909063d64f4b93d4defca1b3', + 'info_dict': { + 'id': 'v16374379WA437rMH', + 'ext': 'mp4', + 'title': 'Phantasmagoria 2, pt. 1-3', + 'description': 'Phantasmagoria: a Puzzle of Flesh', + 'thumbnail': 're:https://fcache\\.veoh\\.com/file/f/th16374379\\.jpg(\\?.*)?', + 'uploader': 'davidspackage', + 'duration': 968, + 'view_count': int, + 'average_rating': int, + 'comment_count': int, + 'age_limit': 18, + 'categories': ['technology_and_gaming', 'gaming'], + 'tags': ['puzzle', 'of', 'flesh'], } + }] def _real_extract(self, url): video_id = self._match_id(url) - video = self._download_json( + metadata = self._download_json( 'https://www.veoh.com/watch/getVideo/' + video_id, - video_id)['video'] + video_id) + video = metadata['video'] title = video['title'] thumbnail_url = None - q = qualities(['HQ', 'Regular']) + q = qualities(['Regular', 'HQ']) formats = [] for f_id, f_url in video.get('src', {}).items(): if not f_url: @@ -89,6 +104,12 @@ class VeohIE(InfoExtractor): }) self._sort_formats(formats) + categories = metadata.get('categoryPath') + if not categories: + category = try_get(video, lambda x: x['category'].strip().removeprefix('category_')) + categories = [category] if category else None + tags = video.get('tags') + return { 'id': video_id, 'title': title, @@ -100,4 +121,7 @@ class VeohIE(InfoExtractor): 'formats': formats, 'average_rating': int_or_none(video.get('rating')), 'comment_count': int_or_none(video.get('numOfComments')), + 'age_limit': 18 if video.get('contentRatingId') == 2 else 0, + 'categories': categories, + 'tags': tags.split(', ') if tags else None, } diff --git a/hypervideo_dl/extractor/vgtv.py b/hypervideo_dl/extractor/vgtv.py index b6131ff..9d6090b 100644 --- a/hypervideo_dl/extractor/vgtv.py +++ b/hypervideo_dl/extractor/vgtv.py @@ -195,9 +195,7 @@ class VGTVIE(XstreamIE): hls_url = streams.get('hls') if hls_url: formats.extend(self._extract_m3u8_formats( - hls_url, video_id, 'mp4', - entry_protocol='m3u8' if is_live else 'm3u8_native', - m3u8_id='hls', fatal=False)) + hls_url, video_id, 'mp4', live=is_live, m3u8_id='hls', fatal=False)) hds_url = streams.get('hds') if hds_url: @@ -242,7 +240,7 @@ class VGTVIE(XstreamIE): info.update({ 'id': video_id, - 'title': self._live_title(data['title']) if is_live else data['title'], + 'title': data['title'], 'description': data['description'], 'thumbnail': data['images']['main'] + '?t[]=900x506q80', 'timestamp': data['published'], diff --git a/hypervideo_dl/extractor/vice.py b/hypervideo_dl/extractor/vice.py index 
ca4d3ed..c8c3055 100644 --- a/hypervideo_dl/extractor/vice.py +++ b/hypervideo_dl/extractor/vice.py @@ -290,7 +290,6 @@ class ViceArticleIE(ViceBaseIE): }, 'params': { 'skip_download': True, - 'format': 'bestvideo', }, 'add_ie': [ViceIE.ie_key()], }, { diff --git a/hypervideo_dl/extractor/videa.py b/hypervideo_dl/extractor/videa.py index 512ade7..90d7050 100644 --- a/hypervideo_dl/extractor/videa.py +++ b/hypervideo_dl/extractor/videa.py @@ -111,7 +111,6 @@ class VideaIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - video_page = self._download_webpage(url, video_id) if 'videa.hu/player' in url: @@ -146,7 +145,7 @@ class VideaIE(InfoExtractor): compat_b64decode(b64_info), key), video_id) video = xpath_element(info, './video', 'video') - if not video: + if video is None: raise ExtractorError(xpath_element( info, './error', fatal=True), expected=True) sources = xpath_element( @@ -163,9 +162,9 @@ class VideaIE(InfoExtractor): source_exp = source.get('exp') if not (source_url and source_name): continue - hash_value = None - if hash_values: - hash_value = xpath_text(hash_values, 'hash_value_' + source_name) + hash_value = ( + xpath_text(hash_values, 'hash_value_' + source_name) + if hash_values is not None else None) if hash_value and source_exp: source_url = update_url_query(source_url, { 'md5': hash_value, diff --git a/hypervideo_dl/extractor/videocampus_sachsen.py b/hypervideo_dl/extractor/videocampus_sachsen.py new file mode 100644 index 0000000..96e9857 --- /dev/null +++ b/hypervideo_dl/extractor/videocampus_sachsen.py @@ -0,0 +1,96 @@ +# coding: utf-8 +from .common import InfoExtractor + + +class VideocampusSachsenIE(InfoExtractor): + _VALID_URL = r'''(?x)https?://videocampus\.sachsen\.de/(?: + m/(?P<tmp_id>[0-9a-f]+)| + (?:category/)?video/(?P<display_id>[\w-]+)/(?P<id>[0-9a-f]{32}) + )''' + + _TESTS = [ + { + 'url': 'https://videocampus.sachsen.de/m/e0d6c8ce6e394c188f1342f1ab7c50ed6fc4490b808699801def5cb2e46d76ca7367f622a9f516c542ffb805b24d6b643bd7c81f385acaac4c59081b87a2767b', + 'info_dict': { + 'id': 'e6b9349905c1628631f175712250f2a1', + 'title': 'Konstruktiver Entwicklungsprozess Vorlesung 7', + 'ext': 'mp4', + }, + }, + { + 'url': 'https://videocampus.sachsen.de/video/Was-ist-selbstgesteuertes-Lernen/fc99c527e4205b121cb7c74433469262', + 'info_dict': { + 'id': 'fc99c527e4205b121cb7c74433469262', + 'title': 'Was ist selbstgesteuertes Lernen?', + 'display_id': 'Was-ist-selbstgesteuertes-Lernen', + 'ext': 'mp4', + }, + }, + { + 'url': 'https://videocampus.sachsen.de/category/video/Tutorial-zur-Nutzung-von-Adobe-Connect-aus-Veranstalter-Sicht/09d4ed029002eb1bdda610f1103dd54c/100', + 'info_dict': { + 'id': '09d4ed029002eb1bdda610f1103dd54c', + 'title': 'Tutorial zur Nutzung von Adobe Connect aus Veranstalter-Sicht', + 'display_id': 'Tutorial-zur-Nutzung-von-Adobe-Connect-aus-Veranstalter-Sicht', + 'ext': 'mp4', + }, + }, + ] + + def _real_extract(self, url): + video_id, tmp_id, display_id = self._match_valid_url(url).group('id', 'tmp_id', 'display_id') + webpage = self._download_webpage(url, video_id or tmp_id, fatal=False) or '' + + if not tmp_id: + video_id = self._html_search_regex( + r'src="https?://videocampus\.sachsen\.de/media/embed\?key=([0-9a-f]+)&', + webpage, 'video_id') + + title = self._html_search_regex( + (r'<h1>(?P<content>[^<]+)</h1>', *self._meta_regex('title')), + webpage, 'title', group='content', fatal=False) + + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + 
f'https://videocampus.sachsen.de/media/hlsMedium/key/{video_id}/format/auto/ext/mp4/learning/0/path/m3u8', + video_id, 'mp4', 'm3u8_native', m3u8_id='hls') + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'display_id': display_id, + 'formats': formats, + 'subtitles': subtitles + } + + +class VideocampusSachsenEmbedIE(InfoExtractor): + _VALID_URL = r'https?://videocampus.sachsen.de/media/embed\?key=(?P<id>[0-9a-f]+)' + + _TESTS = [ + { + 'url': 'https://videocampus.sachsen.de/media/embed?key=fc99c527e4205b121cb7c74433469262', + 'info_dict': { + 'id': 'fc99c527e4205b121cb7c74433469262', + 'title': 'Was ist selbstgesteuertes Lernen?', + 'ext': 'mp4', + }, + } + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + title = self._html_search_regex(r'<img[^>]*title="([^"<]+)"', webpage, 'title', fatal=False) + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + f'https://videocampus.sachsen.de/media/hlsMedium/key/{video_id}/format/auto/ext/mp4/learning/0/path/m3u8', + video_id, 'mp4', 'm3u8_native', m3u8_id='hls') + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/hypervideo_dl/extractor/vidio.py b/hypervideo_dl/extractor/vidio.py index 571448b..6bfb8d4 100644 --- a/hypervideo_dl/extractor/vidio.py +++ b/hypervideo_dl/extractor/vidio.py @@ -6,6 +6,7 @@ from .common import InfoExtractor from ..utils import ( clean_html, ExtractorError, + format_field, get_element_by_class, int_or_none, parse_iso8601, @@ -22,11 +23,7 @@ class VidioBaseIE(InfoExtractor): _LOGIN_URL = 'https://www.vidio.com/users/login' _NETRC_MACHINE = 'vidio' - def _login(self): - username, password = self._get_login_info() - if username is None: - return - + def _perform_login(self, username, password): def is_logged_in(): res = self._download_json( 'https://www.vidio.com/interactions.json', None, 'Checking if logged in', fatal=False) or {} @@ -62,10 +59,9 @@ class VidioBaseIE(InfoExtractor): 'Unable to log in: %s. 
%s' % (reason, clean_html(subreason)), expected=True) raise ExtractorError('Unable to log in') - def _real_initialize(self): + def _initialize_pre_login(self): self._api_key = self._download_json( 'https://www.vidio.com/auth', None, data=b'')['api_key'] - self._login() def _call_api(self, url, video_id, note=None): return self._download_json(url, video_id, note=note, headers={ @@ -160,7 +156,7 @@ class VidioIE(VidioBaseIE): 'uploader': user.get('name'), 'timestamp': parse_iso8601(video.get('created_at')), 'uploader_id': username, - 'uploader_url': 'https://www.vidio.com/@' + username if username else None, + 'uploader_url': format_field(username, template='https://www.vidio.com/@%s'), 'channel': channel.get('name'), 'channel_id': str_or_none(channel.get('id')), 'view_count': get_count('view_count'), @@ -291,5 +287,5 @@ class VidioLiveIE(VidioBaseIE): 'uploader': user.get('name'), 'timestamp': parse_iso8601(stream_meta.get('start_time')), 'uploader_id': username, - 'uploader_url': 'https://www.vidio.com/@' + username if username else None, + 'uploader_url': format_field(username, template='https://www.vidio.com/@%s'), } diff --git a/hypervideo_dl/extractor/vidlii.py b/hypervideo_dl/extractor/vidlii.py index f477425..a63919f 100644 --- a/hypervideo_dl/extractor/vidlii.py +++ b/hypervideo_dl/extractor/vidlii.py @@ -5,9 +5,12 @@ import re from .common import InfoExtractor from ..utils import ( + HEADRequest, + format_field, float_or_none, get_element_by_id, int_or_none, + str_to_int, strip_or_none, unified_strdate, urljoin, @@ -36,6 +39,25 @@ class VidLiiIE(InfoExtractor): 'tags': ['Vidlii', 'Jan', 'Videogames'], } }, { + 'url': 'https://www.vidlii.com/watch?v=zTAtaAgOLKt', + 'md5': '5778f7366aa4c569b77002f8bf6b614f', + 'info_dict': { + 'id': 'zTAtaAgOLKt', + 'ext': 'mp4', + 'title': 'FULPTUBE SUCKS.', + 'description': 'md5:087b2ca355d4c8f8f77e97c43e72d711', + 'thumbnail': 'https://www.vidlii.com/usfi/thmp/zTAtaAgOLKt.jpg', + 'uploader': 'Homicide', + 'uploader_url': 'https://www.vidlii.com/user/Homicide', + 'upload_date': '20210612', + 'duration': 89, + 'view_count': int, + 'comment_count': int, + 'average_rating': float, + 'categories': ['News & Politics'], + 'tags': ['fulp', 'tube', 'sucks', 'bad', 'fulptube'], + }, + }, { 'url': 'https://www.vidlii.com/embed?v=tJluaH4BJ3v&a=0', 'only_matching': True, }] @@ -45,10 +67,20 @@ class VidLiiIE(InfoExtractor): webpage = self._download_webpage( 'https://www.vidlii.com/watch?v=%s' % video_id, video_id) - - video_url = self._search_regex( - r'src\s*:\s*(["\'])(?P<url>(?:https?://)?(?:(?!\1).)+)\1', webpage, - 'video url', group='url') + formats = [] + + sources = [source[1] for source in re.findall( + r'src\s*:\s*(["\'])(?P<url>(?:https?://)?(?:(?!\1).)+)\1', + webpage) or []] + for source in sources: + height = int(self._search_regex(r'(\d+).mp4', source, 'height', default=360)) + if self._request_webpage(HEADRequest(source), video_id, f'Checking {height}p url', errnote=False): + formats.append({ + 'url': source, + 'format_id': f'{height}p', + 'height': height, + }) + self._sort_formats(formats) title = self._search_regex( (r'<h1>([^<]+)</h1>', r'<title>([^<]+) - VidLii<'), webpage, @@ -71,7 +103,7 @@ class VidLiiIE(InfoExtractor): uploader = self._search_regex( r'<div[^>]+class=["\']wt_person[^>]+>\s*<a[^>]+\bhref=["\']/user/[^>]+>([^<]+)', webpage, 'uploader', fatal=False) - uploader_url = 'https://www.vidlii.com/user/%s' % uploader if uploader else None + uploader_url = format_field(uploader, template='https://www.vidlii.com/user/%s') 
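# Several hunks above (twitter.py, vidio.py, and the vidlii statement just before
# this point) replace the `'prefix' + value if value else None` idiom with
# format_field(). What follows is a minimal sketch of the behaviour those call
# sites rely on -- an illustration only, not the actual utils.format_field
# implementation (which also accepts a field name and other options):

def format_field(value, template='%s', ignore=(None, ''), default=None):
    # Apply `template` to `value`, falling back to `default` for empty values
    if value in ignore:
        return default
    return template % value

# e.g. format_field('Homicide', template='https://www.vidlii.com/user/%s')
# returns 'https://www.vidlii.com/user/Homicide', while format_field(None,
# template='...') returns None instead of raising a TypeError.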
upload_date = unified_strdate(self._html_search_meta( 'datePublished', webpage, default=None) or self._search_regex( @@ -82,9 +114,9 @@ class VidLiiIE(InfoExtractor): default=None) or self._search_regex( r'duration\s*:\s*(\d+)', webpage, 'duration', fatal=False)) - view_count = int_or_none(self._search_regex( - (r'<strong>(\d+)</strong> views', - r'Views\s*:\s*<strong>(\d+)</strong>'), + view_count = str_to_int(self._search_regex( + (r'<strong>([,0-9]+)</strong> views', + r'Views\s*:\s*<strong>([,0-9]+)</strong>'), webpage, 'view count', fatal=False)) comment_count = int_or_none(self._search_regex( @@ -109,11 +141,11 @@ class VidLiiIE(InfoExtractor): return { 'id': video_id, - 'url': video_url, 'title': title, 'description': description, 'thumbnail': thumbnail, 'uploader': uploader, + 'formats': formats, 'uploader_url': uploader_url, 'upload_date': upload_date, 'duration': duration, diff --git a/hypervideo_dl/extractor/viewlift.py b/hypervideo_dl/extractor/viewlift.py index c3b2e86..4627f66 100644 --- a/hypervideo_dl/extractor/viewlift.py +++ b/hypervideo_dl/extractor/viewlift.py @@ -9,6 +9,7 @@ from ..utils import ( ExtractorError, int_or_none, parse_age_limit, + traverse_obj, ) @@ -32,26 +33,33 @@ class ViewLiftBaseIE(InfoExtractor): } _TOKENS = {} - def _call_api(self, site, path, video_id, query): - token = self._TOKENS.get(site) - if not token: - token_query = {'site': site} - email, password = self._get_login_info(netrc_machine=site) - if email: - resp = self._download_json( - self._API_BASE + 'identity/signin', video_id, - 'Logging in', query=token_query, data=json.dumps({ - 'email': email, - 'password': password, - }).encode()) - else: - resp = self._download_json( - self._API_BASE + 'identity/anonymous-token', video_id, - 'Downloading authorization token', query=token_query) - self._TOKENS[site] = token = resp['authorizationToken'] - return self._download_json( - self._API_BASE + path, video_id, - headers={'Authorization': token}, query=query) + def _fetch_token(self, site, url): + if self._TOKENS.get(site): + return + + cookies = self._get_cookies(url) + if cookies and cookies.get('token'): + self._TOKENS[site] = self._search_regex(r'22authorizationToken\%22:\%22([^\%]+)\%22', cookies['token'].value, 'token') + if not self._TOKENS.get(site): + self.raise_login_required('Cookies (not necessarily logged in) are needed to download from this website', method='cookies') + + def _call_api(self, site, path, video_id, url, query): + self._fetch_token(site, url) + try: + return self._download_json( + self._API_BASE + path, video_id, headers={'Authorization': self._TOKENS.get(site)}, query=query) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + webpage = e.cause.read().decode() + try: + error_message = traverse_obj(json.loads(webpage), 'errorMessage', 'message') + except json.JSONDecodeError: + raise ExtractorError(f'{site} said: {webpage}', cause=e.cause) + if error_message: + if 'has not purchased' in error_message: + self.raise_login_required(method='cookies') + raise ExtractorError(error_message, expected=True) + raise class ViewLiftEmbedIE(ViewLiftBaseIE): @@ -96,27 +104,24 @@ class ViewLiftEmbedIE(ViewLiftBaseIE): site = domain.split('.')[-2] if site in self._SITE_MAP: site = self._SITE_MAP[site] - try: - content_data = self._call_api( - site, 'entitlement/video/status', film_id, { - 'id': film_id - })['video'] - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - error_message = 
self._parse_json(e.cause.read().decode(), film_id).get('errorMessage') - if error_message == 'User does not have a valid subscription or has not purchased this content.': - self.raise_login_required() - raise ExtractorError(error_message, expected=True) - raise + + content_data = self._call_api( + site, 'entitlement/video/status', film_id, url, { + 'id': film_id + })['video'] gist = content_data['gist'] title = gist['title'] video_assets = content_data['streamingInfo']['videoAssets'] - formats = [] - mpeg_video_assets = video_assets.get('mpeg') or [] - for video_asset in mpeg_video_assets: + hls_url = video_assets.get('hls') + formats, subtitles = [], {} + if hls_url: + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + hls_url, film_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) + + for video_asset in video_assets.get('mpeg') or []: video_asset_url = video_asset.get('url') - if not video_asset: + if not video_asset_url: continue bitrate = int_or_none(video_asset.get('bitrate')) height = int_or_none(self._search_regex( @@ -130,13 +135,17 @@ class ViewLiftEmbedIE(ViewLiftBaseIE): 'vcodec': video_asset.get('codec'), }) - hls_url = video_assets.get('hls') - if hls_url: - formats.extend(self._extract_m3u8_formats( - hls_url, film_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - self._sort_formats(formats) + subs = {} + for sub in traverse_obj(content_data, ('contentDetails', 'closedCaptions')) or []: + sub_url = sub.get('url') + if not sub_url: + continue + subs.setdefault(sub.get('language', 'English'), []).append({ + 'url': sub_url, + }) - info = { + self._sort_formats(formats) + return { 'id': film_id, 'title': title, 'description': gist.get('description'), @@ -145,14 +154,15 @@ class ViewLiftEmbedIE(ViewLiftBaseIE): 'age_limit': parse_age_limit(content_data.get('parentalRating')), 'timestamp': int_or_none(gist.get('publishDate'), 1000), 'formats': formats, + 'subtitles': self._merge_subtitles(subs, subtitles), + 'categories': traverse_obj(content_data, ('categories', ..., 'title')), + 'tags': traverse_obj(content_data, ('tags', ..., 'title')), } - for k in ('categories', 'tags'): - info[k] = [v['title'] for v in content_data.get(k, []) if v.get('title')] - return info class ViewLiftIE(ViewLiftBaseIE): IE_NAME = 'viewlift' + _API_BASE = 'https://prod-api-cached-2.viewlift.com/' _VALID_URL = r'https?://(?:www\.)?(?P<domain>%s)(?P<path>(?:/(?:films/title|show|(?:news/)?videos?|watch))?/(?P<id>[^?#]+))' % ViewLiftBaseIE._DOMAINS_REGEX _TESTS = [{ 'url': 'http://www.snagfilms.com/films/title/lost_for_life', @@ -222,24 +232,111 @@ }, { 'url': 'https://www.marquee.tv/watch/sadlerswells-sacredmonsters', 'only_matching': True, + }, { # Free film with language code + 'url': 'https://www.hoichoi.tv/bn/films/title/shuyopoka', + 'info_dict': { + 'id': '7a7a9d33-1f4c-4771-9173-ee4fb6dbf196', + 'ext': 'mp4', + 'title': 'Shuyopoka', + 'description': 'md5:e28f2fb8680096a69c944d37c1fa5ffc', + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20211006', + 'series': None + }, + 'params': {'skip_download': True}, + }, { # Free film + 'url': 'https://www.hoichoi.tv/films/title/dadu-no1', + 'info_dict': { + 'id': '0000015b-b009-d126-a1db-b81ff3780000', + 'ext': 'mp4', + 'title': 'Dadu No.1', + 'description': 'md5:605cba408e51a79dafcb824bdeded51e', + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20210827', + 'series': None + }, + 'params': {'skip_download': True}, + }, { # Free episode + 'url': 
'https://www.hoichoi.tv/webseries/case-jaundice-s01-e01', + 'info_dict': { + 'id': 'f779e07c-30c8-459c-8612-5a834ab5e5ba', + 'ext': 'mp4', + 'title': 'Humans Vs. Corona', + 'description': 'md5:ca30a682b4528d02a3eb6d0427dd0f87', + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20210830', + 'series': 'Case Jaundice' + }, + 'params': {'skip_download': True}, + }, { # Free video + 'url': 'https://www.hoichoi.tv/videos/1549072415320-six-episode-02-hindi', + 'info_dict': { + 'id': 'b41fa1ce-aca6-47b6-b208-283ff0a2de30', + 'ext': 'mp4', + 'title': 'Woman in red - Hindi', + 'description': 'md5:9d21edc1827d32f8633eb67c2054fc31', + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20211006', + 'series': 'Six (Hindi)' + }, + 'params': {'skip_download': True}, + }, { # Free episode + 'url': 'https://www.hoichoi.tv/shows/watch-asian-paints-moner-thikana-online-season-1-episode-1', + 'info_dict': { + 'id': '1f45d185-8500-455c-b88d-13252307c3eb', + 'ext': 'mp4', + 'title': 'Jisshu Sengupta', + 'description': 'md5:ef6ffae01a3d83438597367400f824ed', + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20211004', + 'series': 'Asian Paints Moner Thikana' + }, + 'params': {'skip_download': True}, + }, { # Free series + 'url': 'https://www.hoichoi.tv/shows/watch-moner-thikana-bengali-web-series-online', + 'playlist_mincount': 5, + 'info_dict': { + 'id': 'watch-moner-thikana-bengali-web-series-online', + }, + }, { # Premium series + 'url': 'https://www.hoichoi.tv/shows/watch-byomkesh-bengali-web-series-online', + 'playlist_mincount': 14, + 'info_dict': { + 'id': 'watch-byomkesh-bengali-web-series-online', + }, + }, { # Premium movie + 'url': 'https://www.hoichoi.tv/movies/detective-2020', + 'only_matching': True }] @classmethod def suitable(cls, url): return False if ViewLiftEmbedIE.suitable(url) else super(ViewLiftIE, cls).suitable(url) + def _show_entries(self, domain, seasons): + for season in seasons: + for episode in season.get('episodes') or []: + path = traverse_obj(episode, ('gist', 'permalink')) + if path: + yield self.url_result(f'https://www.{domain}{path}', ie=self.ie_key()) + def _real_extract(self, url): domain, path, display_id = self._match_valid_url(url).groups() site = domain.split('.')[-2] if site in self._SITE_MAP: site = self._SITE_MAP[site] modules = self._call_api( - site, 'content/pages', display_id, { + site, 'content/pages', display_id, url, { 'includeContent': 'true', 'moduleOffset': 1, 'path': path, 'site': site, })['modules'] + + seasons = next((m['contentData'][0]['seasons'] for m in modules if m.get('moduleType') == 'ShowDetailModule'), None) + if seasons: + return self.playlist_result(self._show_entries(domain, seasons), display_id) + film_id = next(m['contentData'][0]['gist']['id'] for m in modules if m.get('moduleType') == 'VideoDetailModule') return { '_type': 'url_transparent', diff --git a/hypervideo_dl/extractor/viki.py b/hypervideo_dl/extractor/viki.py index acb5ae5..8a93079 100644 --- a/hypervideo_dl/extractor/viki.py +++ b/hypervideo_dl/extractor/viki.py @@ -19,7 +19,7 @@ class VikiBaseIE(InfoExtractor): _VALID_URL_BASE = r'https?://(?:www\.)?viki\.(?:com|net|mx|jp|fr)/' _API_URL_TEMPLATE = 'https://api.viki.io%s' - _DEVICE_ID = '86085977d' # used for android api + _DEVICE_ID = '112395910d' _APP = '100005a' _APP_VERSION = '6.11.3' _APP_SECRET = 'd96704b180208dbb2efa30fe44c48bd8690441af9f567ba8fd710a72badc85198f7472' @@ -99,14 +99,7 @@ class VikiBaseIE(InfoExtractor): self.raise_login_required(message) self._raise_error(message) - def 
_real_initialize(self): - self._login() - - def _login(self): - username, password = self._get_login_info() - if username is None: - return - + def _perform_login(self, username, password): self._token = self._call_api( 'sessions.json', None, 'Logging in', fatal=False, data={'username': username, 'password': password}).get('token') @@ -135,9 +128,6 @@ class VikiIE(VikiBaseIE): 'uploader': 'FCC', 'upload_date': '20201127', }, - 'params': { - 'format': 'bestvideo', - }, }, { 'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14', 'info_dict': { @@ -151,9 +141,6 @@ class VikiIE(VikiBaseIE): 'duration': 3570, 'episode_number': 14, }, - 'params': { - 'format': 'bestvideo', - }, 'skip': 'Blocked in the US', }, { # clip @@ -203,9 +190,6 @@ class VikiIE(VikiBaseIE): 'age_limit': 13, 'episode_number': 1, }, - 'params': { - 'format': 'bestvideo', - }, }, { # youtube external 'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1', @@ -241,9 +225,6 @@ class VikiIE(VikiBaseIE): 'title': 'Love In Magic', 'age_limit': 13, }, - 'params': { - 'format': 'bestvideo', - }, }] def _real_extract(self, url): @@ -265,7 +246,7 @@ class VikiIE(VikiBaseIE): } for thumbnail_id, thumbnail in (video.get('images') or {}).items() if thumbnail.get('url')] resp = self._call_api( - 'playback_streams/%s.json?drms=dt1,dt2&device_id=%s' % (video_id, self._DEVICE_ID), + 'playback_streams/%s.json?drms=dt3&device_id=%s' % (video_id, self._DEVICE_ID), video_id, 'Downloading video streams JSON')['main'][0] stream_id = try_get(resp, lambda x: x['properties']['track']['stream_id']) @@ -276,10 +257,13 @@ class VikiIE(VikiBaseIE): } for ext in ('srt', 'vtt')]) for lang in (video.get('subtitle_completions') or {}).keys()) mpd_url = resp['url'] - # 1080p is hidden in another mpd which can be found in the current manifest content + # 720p is hidden in another MPD which can be found in the current manifest content mpd_content = self._download_webpage(mpd_url, video_id, note='Downloading initial MPD manifest') mpd_url = self._search_regex( r'(?mi)<BaseURL>(http.+.mpd)', mpd_content, 'new manifest', default=mpd_url) + if 'mpdhd_high' not in mpd_url and 'sig=' not in mpd_url: + # Modify the URL to get 1080p + mpd_url = mpd_url.replace('mpdhd', 'mpdhd_high') formats = self._extract_mpd_formats(mpd_url, video_id) self._sort_formats(formats) diff --git a/hypervideo_dl/extractor/vimeo.py b/hypervideo_dl/extractor/vimeo.py index 9fb5475..4f025a5 100644 --- a/hypervideo_dl/extractor/vimeo.py +++ b/hypervideo_dl/extractor/vimeo.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals import base64 import functools -import json import re import itertools @@ -17,8 +16,9 @@ from ..compat import ( from ..utils import ( clean_html, determine_ext, - dict_get, ExtractorError, + get_element_by_class, + HEADRequest, js_to_json, int_or_none, merge_dicts, @@ -26,10 +26,8 @@ from ..utils import ( parse_filesize, parse_iso8601, parse_qs, - RegexNotFoundError, sanitized_Request, smuggle_url, - std_headers, str_or_none, try_get, unified_timestamp, @@ -37,6 +35,7 @@ from ..utils import ( urlencode_postdata, urljoin, unescapeHTML, + urlhandle_detect_ext, ) @@ -45,12 +44,7 @@ class VimeoBaseInfoExtractor(InfoExtractor): _LOGIN_REQUIRED = False _LOGIN_URL = 'https://vimeo.com/log_in' - def _login(self): - username, password = self._get_login_info() - if username is None: - if self._LOGIN_REQUIRED: - raise ExtractorError('No login info available, needed for using %s.' 
% self.IE_NAME, expected=True) - return + def _perform_login(self, username, password): webpage = self._download_webpage( self._LOGIN_URL, None, 'Downloading login page') token, vuid = self._extract_xsrft_and_vuid(webpage) @@ -76,6 +70,10 @@ class VimeoBaseInfoExtractor(InfoExtractor): expected=True) raise ExtractorError('Unable to log in') + def _real_initialize(self): + if self._LOGIN_REQUIRED and not self._get_cookies('https://vimeo.com').get('vuid'): + self._raise_login_required() + def _get_video_password(self): password = self.get_param('videopassword') if password is None: @@ -119,26 +117,29 @@ class VimeoBaseInfoExtractor(InfoExtractor): self._set_cookie('vimeo.com', name, value) def _vimeo_sort_formats(self, formats): - # Bitrates are completely broken. Single m3u8 may contain entries in kbps and bps - # at the same time without actual units specified. This lead to wrong sorting. - # But since hypervideo prefers 'res,fps' anyway, 'field_preference' is not needed - self._sort_formats(formats) + # Note: Bitrates are completely broken. Single m3u8 may contain entries in kbps and bps + # at the same time without actual units specified. + self._sort_formats(formats, ('quality', 'res', 'fps', 'hdr:12', 'source')) def _parse_config(self, config, video_id): video_data = config['video'] video_title = video_data['title'] live_event = video_data.get('live_event') or {} is_live = live_event.get('status') == 'started' + request = config.get('request') or {} formats = [] - config_files = video_data.get('files') or config['request'].get('files', {}) - for f in config_files.get('progressive', []): + subtitles = {} + + config_files = video_data.get('files') or request.get('files') or {} + for f in (config_files.get('progressive') or []): video_url = f.get('url') if not video_url: continue formats.append({ 'url': video_url, 'format_id': 'http-%s' % f.get('quality'), + 'source_preference': 10, 'width': int_or_none(f.get('width')), 'height': int_or_none(f.get('height')), 'fps': int_or_none(f.get('fps')), @@ -148,7 +149,7 @@ class VimeoBaseInfoExtractor(InfoExtractor): # TODO: fix handling of 308 status code returned for live archive manifest requests sep_pattern = r'/sep/video/' for files_type in ('hls', 'dash'): - for cdn_name, cdn_data in config_files.get(files_type, {}).get('cdns', {}).items(): + for cdn_name, cdn_data in (try_get(config_files, lambda x: x[files_type]['cdns']) or {}).items(): manifest_url = cdn_data.get('url') if not manifest_url: continue @@ -162,21 +163,23 @@ class VimeoBaseInfoExtractor(InfoExtractor): sep_manifest_urls = [(format_id, manifest_url)] for f_id, m_url in sep_manifest_urls: if files_type == 'hls': - formats.extend(self._extract_m3u8_formats( - m_url, video_id, 'mp4', - 'm3u8' if is_live else 'm3u8_native', m3u8_id=f_id, + fmts, subs = self._extract_m3u8_formats_and_subtitles( + m_url, video_id, 'mp4', live=is_live, m3u8_id=f_id, note='Downloading %s m3u8 information' % cdn_name, - fatal=False)) + fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) elif files_type == 'dash': if 'json=1' in m_url: real_m_url = (self._download_json(m_url, video_id, fatal=False) or {}).get('url') if real_m_url: m_url = real_m_url - mpd_formats = self._extract_mpd_formats( + fmts, subs = self._extract_mpd_formats_and_subtitles( m_url.replace('/master.json', '/master.mpd'), video_id, f_id, 'Downloading %s MPD information' % cdn_name, fatal=False) - formats.extend(mpd_formats) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) 
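# The hunk above now collects subtitles from both the HLS and DASH manifests and
# folds them into a single dict keyed by language. Below is a minimal sketch of
# that merge pattern -- illustrative only; the real helper is
# InfoExtractor._merge_subtitles in common.py, and the track URLs here are made up:

def merge_subtitles(*dicts, target=None):
    # Subtitle dicts map a language code to a list of track dicts ({'ext', 'url'}).
    if target is None:
        target = {}
    for d in dicts:
        for lang, tracks in (d or {}).items():
            existing = target.setdefault(lang, [])
            for track in tracks:
                if track not in existing:  # skip tracks already merged in
                    existing.append(track)
    return target

# Usage sketch: the same English track from HLS and DASH is kept only once,
# while the DASH-only German track is added alongside it.
hls_subs = {'en': [{'ext': 'vtt', 'url': 'https://example.com/en.vtt'}]}
dash_subs = {'en': [{'ext': 'vtt', 'url': 'https://example.com/en.vtt'}],
             'de': [{'ext': 'vtt', 'url': 'https://example.com/de.vtt'}]}
assert merge_subtitles(hls_subs, dash_subs) == {
    'en': [{'ext': 'vtt', 'url': 'https://example.com/en.vtt'}],
    'de': [{'ext': 'vtt', 'url': 'https://example.com/de.vtt'}],
}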
live_archive = live_event.get('archive') or {} live_archive_source_url = live_archive.get('source_url') @@ -187,18 +190,15 @@ class VimeoBaseInfoExtractor(InfoExtractor): 'quality': 10, }) - subtitles = {} - text_tracks = config['request'].get('text_tracks') - if text_tracks: - for tt in text_tracks: - subtitles[tt['lang']] = [{ - 'ext': 'vtt', - 'url': urljoin('https://vimeo.com', tt['url']), - }] + for tt in (request.get('text_tracks') or []): + subtitles.setdefault(tt['lang'], []).append({ + 'ext': 'vtt', + 'url': urljoin('https://vimeo.com', tt['url']), + }) thumbnails = [] if not is_live: - for key, thumb in video_data.get('thumbs', {}).items(): + for key, thumb in (video_data.get('thumbs') or {}).items(): thumbnails.append({ 'id': key, 'width': int_or_none(key), @@ -213,14 +213,25 @@ class VimeoBaseInfoExtractor(InfoExtractor): owner = video_data.get('owner') or {} video_uploader_url = owner.get('url') + duration = int_or_none(video_data.get('duration')) + chapter_data = try_get(config, lambda x: x['embed']['chapters']) or [] + chapters = [{ + 'title': current_chapter.get('title'), + 'start_time': current_chapter.get('timecode'), + 'end_time': next_chapter.get('timecode'), + } for current_chapter, next_chapter in zip(chapter_data, chapter_data[1:] + [{'timecode': duration}])] + if chapters and chapters[0]['start_time']: # Chapters may not start from 0 + chapters[:0] = [{'title': '<Untitled>', 'start_time': 0, 'end_time': chapters[0]['start_time']}] + return { 'id': str_or_none(video_data.get('id')) or video_id, - 'title': self._live_title(video_title) if is_live else video_title, + 'title': video_title, 'uploader': owner.get('name'), 'uploader_id': video_uploader_url.split('/')[-1] if video_uploader_url else None, 'uploader_url': video_uploader_url, 'thumbnails': thumbnails, - 'duration': int_or_none(video_data.get('duration')), + 'duration': duration, + 'chapters': chapters or None, 'formats': formats, 'subtitles': subtitles, 'is_live': is_live, @@ -232,27 +243,26 @@ class VimeoBaseInfoExtractor(InfoExtractor): query['unlisted_hash'] = unlisted_hash download_data = self._download_json( url, video_id, fatal=False, query=query, - headers={'X-Requested-With': 'XMLHttpRequest'}) - if download_data: - source_file = download_data.get('source_file') - if isinstance(source_file, dict): - download_url = source_file.get('download_url') - if download_url and not source_file.get('is_cold') and not source_file.get('is_defrosting'): - source_name = source_file.get('public_name', 'Original') - if self._is_valid_url(download_url, video_id, '%s video' % source_name): - ext = (try_get( - source_file, lambda x: x['extension'], - compat_str) or determine_ext( - download_url, None) or 'mp4').lower() - return { - 'url': download_url, - 'ext': ext, - 'width': int_or_none(source_file.get('width')), - 'height': int_or_none(source_file.get('height')), - 'filesize': parse_filesize(source_file.get('size')), - 'format_id': source_name, - 'quality': 1, - } + headers={'X-Requested-With': 'XMLHttpRequest'}, + expected_status=(403, 404)) or {} + source_file = download_data.get('source_file') + download_url = try_get(source_file, lambda x: x['download_url']) + if download_url and not source_file.get('is_cold') and not source_file.get('is_defrosting'): + source_name = source_file.get('public_name', 'Original') + if self._is_valid_url(download_url, video_id, '%s video' % source_name): + ext = (try_get( + source_file, lambda x: x['extension'], + compat_str) or determine_ext( + download_url, None) or 'mp4').lower() + 
return { + 'url': download_url, + 'ext': ext, + 'width': int_or_none(source_file.get('width')), + 'height': int_or_none(source_file.get('height')), + 'filesize': parse_filesize(source_file.get('size')), + 'format_id': source_name, + 'quality': 1, + } jwt_response = self._download_json( 'https://vimeo.com/_rv/viewer', video_id, note='Downloading jwt token', fatal=False) or {} @@ -261,15 +271,19 @@ class VimeoBaseInfoExtractor(InfoExtractor): headers = {'Authorization': 'jwt %s' % jwt_response['jwt']} original_response = self._download_json( f'https://api.vimeo.com/videos/{video_id}', video_id, - headers=headers, fatal=False) or {} - for download_data in original_response.get('download') or {}: + headers=headers, fatal=False, expected_status=(403, 404)) or {} + for download_data in original_response.get('download') or []: download_url = download_data.get('link') if not download_url or download_data.get('quality') != 'source': continue - query = parse_qs(download_url) + ext = determine_ext(parse_qs(download_url).get('filename', [''])[0].lower(), default_ext=None) + if not ext: + urlh = self._request_webpage( + HEADRequest(download_url), video_id, fatal=False, note='Determining source extension') + ext = urlh and urlhandle_detect_ext(urlh) return { 'url': download_url, - 'ext': determine_ext(query.get('filename', [''])[0].lower()), + 'ext': ext or 'unknown_video', 'format_id': download_data.get('public_name', 'Original'), 'width': int_or_none(download_data.get('width')), 'height': int_or_none(download_data.get('height')), @@ -294,7 +308,7 @@ class VimeoIE(VimeoBaseInfoExtractor): )? vimeo(?:pro)?\.com/ (?!(?:channels|album|showcase)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/) - (?:.*?/)? + (?:[^/]+/)*? (?: (?: play_redirect_hls| @@ -313,7 +327,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'info_dict': { 'id': '56015672', 'ext': 'mp4', - 'title': "youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550", + 'title': "youtube-dl test video '' ä↭𝕐-BaW jenozKc", 'description': 'md5:2d3305bad981a06ff79f027f19865021', 'timestamp': 1355990239, 'upload_date': '20121220', @@ -326,6 +340,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'params': { 'format': 'best[protocol=https]', }, + 'skip': 'No longer available' }, { 'url': 'http://vimeopro.com/openstreetmapus/state-of-the-map-us-2013/video/68093876', @@ -342,6 +357,11 @@ class VimeoIE(VimeoBaseInfoExtractor): 'duration': 1595, 'upload_date': '20130610', 'timestamp': 1370893156, + 'license': 'by', + 'thumbnail': 'https://i.vimeocdn.com/video/440260469-19b0d92fca3bd84066623b53f1eb8aaa3980c6c809e2d67b6b39ab7b4a77a344-d_960', + 'view_count': int, + 'comment_count': int, + 'like_count': int, }, 'params': { 'format': 'best[protocol=https]', @@ -349,7 +369,7 @@ class VimeoIE(VimeoBaseInfoExtractor): }, { 'url': 'http://player.vimeo.com/video/54469442', - 'md5': '619b811a4417aa4abe78dc653becf511', + 'md5': 'b3e7f4d2cbb53bd7dc3bb6ff4ed5cfbd', 'note': 'Videos that embed the url in the player page', 'info_dict': { 'id': '54469442', @@ -360,11 +380,11 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader_id': 'businessofsoftware', 'duration': 3610, 'description': None, + 'thumbnail': 'https://i.vimeocdn.com/video/376682406-f34043e7b766af6bef2af81366eacd6724f3fc3173179a11a97a1e26587c9529-d_1280', }, 'params': { 'format': 'best[protocol=https]', }, - 'expected_warnings': ['Unable to download JSON metadata'], }, { 'url': 'http://vimeo.com/68375962', @@ -381,6 +401,10 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader': 'Jaime Marquínez Ferrándiz', 
             'duration': 10,
             'description': 'md5:dca3ea23adb29ee387127bc4ddfce63f',
+            'thumbnail': 'https://i.vimeocdn.com/video/440665496-b2c5aee2b61089442c794f64113a8e8f7d5763c3e6b3ebfaf696ae6413f8b1f4-d_960',
+            'view_count': int,
+            'comment_count': int,
+            'like_count': int,
         },
         'params': {
             'format': 'best[protocol=https]',
@@ -403,15 +427,19 @@
             'timestamp': 1380339469,
             'upload_date': '20130928',
             'duration': 187,
+            'thumbnail': 'https://i.vimeocdn.com/video/450239872-a05512d9b1e55d707a7c04365c10980f327b06d966351bc403a5d5d65c95e572-d_1280',
+            'view_count': int,
+            'comment_count': int,
+            'like_count': int,
         },
-        'expected_warnings': ['Unable to download JSON metadata'],
+        'params': {'format': 'http-1080p'},
     },
     {
         'url': 'http://vimeo.com/76979871',
         'note': 'Video with subtitles',
         'info_dict': {
             'id': '76979871',
-            'ext': 'mp4',
+            'ext': 'mov',
             'title': 'The New Vimeo Player (You Know, For Videos)',
             'description': 'md5:2ec900bf97c3f389378a96aee11260ea',
             'timestamp': 1381846109,
@@ -420,7 +448,14 @@
             'uploader_id': 'staff',
             'uploader': 'Vimeo Staff',
             'duration': 62,
-        }
+            'subtitles': {
+                'de': [{'ext': 'vtt'}],
+                'en': [{'ext': 'vtt'}],
+                'es': [{'ext': 'vtt'}],
+                'fr': [{'ext': 'vtt'}],
+            },
+        },
+        'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'],
     },
     {
         # from https://www.ouya.tv/game/Pier-Solar-and-the-Great-Architects/
@@ -433,6 +468,8 @@
             'uploader': 'Tulio Gonçalves',
             'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user28849593',
             'uploader_id': 'user28849593',
+            'duration': 118,
+            'thumbnail': 'https://i.vimeocdn.com/video/478636036-c18440305ef3df9decfb6bf207a61fe39d2d17fa462a96f6f2d93d30492b037d-d_1280',
         },
     },
     {
@@ -449,6 +486,11 @@
             'timestamp': 1324343742,
             'upload_date': '20111220',
             'description': 'md5:ae23671e82d05415868f7ad1aec21147',
+            'duration': 60,
+            'comment_count': int,
+            'view_count': int,
+            'thumbnail': 'https://i.vimeocdn.com/video/231174622-dd07f015e9221ff529d451e1cc31c982b5d87bfafa48c4189b1da72824ee289a-d_1280',
+            'like_count': int,
         },
     },
     {
@@ -464,8 +506,10 @@
             'uploader': 'Framework Studio',
             'description': 'md5:f2edc61af3ea7a5592681ddbb683db73',
             'upload_date': '20200225',
+            'duration': 176,
+            'thumbnail': 'https://i.vimeocdn.com/video/859377297-836494a4ef775e9d4edbace83937d9ad34dc846c688c0c419c0e87f7ab06c4b3-d_1280',
+            'uploader_url': 'https://vimeo.com/frameworkla',
         },
-        'expected_warnings': ['Unable to download JSON metadata'],
     },
     {
         # only available via https://vimeo.com/channels/tributes/6213729 and
@@ -483,11 +527,15 @@
             'timestamp': 1250886430,
             'upload_date': '20090821',
             'description': 'md5:bdbf314014e58713e6e5b66eb252f4a6',
+            'duration': 321,
+            'comment_count': int,
+            'view_count': int,
+            'thumbnail': 'https://i.vimeocdn.com/video/22728298-bfc22146f930de7cf497821c7b0b9f168099201ecca39b00b6bd31fcedfca7a6-d_1280',
+            'like_count': int,
         },
         'params': {
             'skip_download': True,
         },
-        'expected_warnings': ['Unable to download JSON metadata'],
     },
     {
         # redirects to ondemand extractor and should be passed through it
@@ -507,7 +555,6 @@
         'params': {
             'skip_download': True,
         },
-        'expected_warnings': ['Unable to download JSON metadata'],
         'skip': 'this page is no longer available.',
     },
     {
@@ -517,10 +564,17 @@
             'id': '68375962',
             'ext': 'mp4',
             'title': 'youtube-dl password protected test video',
+            'timestamp': 1371200155,
+            'upload_date': '20130614',
             'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user18948128',
             'uploader_id': 'user18948128',
             'uploader': 'Jaime Marquínez Ferrándiz',
             'duration': 10,
+            'description': 'md5:dca3ea23adb29ee387127bc4ddfce63f',
+            'thumbnail': 'https://i.vimeocdn.com/video/440665496-b2c5aee2b61089442c794f64113a8e8f7d5763c3e6b3ebfaf696ae6413f8b1f4-d_960',
+            'view_count': int,
+            'comment_count': int,
+            'like_count': int,
         },
         'params': {
             'format': 'best[protocol=https]',
@@ -550,12 +604,18 @@
         'info_dict': {
             'id': '119195465',
             'ext': 'mp4',
-            'title': 'youtube-dl test video \'ä"BaW_jenozKc',
+            'title': "youtube-dl test video '' ä↭𝕐-BaW jenozKc",
             'uploader': 'Philipp Hagemeister',
             'uploader_id': 'user20132939',
             'description': 'md5:fa7b6c6d8db0bdc353893df2f111855b',
             'upload_date': '20150209',
             'timestamp': 1423518307,
+            'thumbnail': 'https://i.vimeocdn.com/video/default_1280',
+            'duration': 10,
+            'like_count': int,
+            'uploader_url': 'https://vimeo.com/user20132939',
+            'view_count': int,
+            'comment_count': int,
         },
         'params': {
             'format': 'best[protocol=https]',
@@ -568,13 +628,94 @@
         'only_matching': True,
     },
     {
+        'note': 'Direct URL with hash',
         'url': 'https://vimeo.com/160743502/abd0e13fb4',
-        'only_matching': True,
+        'info_dict': {
+            'id': '160743502',
+            'ext': 'mp4',
+            'uploader': 'Julian Tryba',
+            'uploader_id': 'aliniamedia',
+            'title': 'Harrisville New Hampshire',
+            'timestamp': 1459259666,
+            'upload_date': '20160329',
+            'release_timestamp': 1459259666,
+            'license': 'by-nc',
+            'duration': 159,
+            'comment_count': int,
+            'thumbnail': 'https://i.vimeocdn.com/video/562802436-585eeb13b5020c6ac0f171a2234067938098f84737787df05ff0d767f6d54ee9-d_1280',
+            'like_count': int,
+            'uploader_url': 'https://vimeo.com/aliniamedia',
+            'release_date': '20160329',
+        },
+        'params': {'skip_download': True},
+    },
+    {
+        'url': 'https://vimeo.com/138909882',
+        'info_dict': {
+            'id': '138909882',
+            'ext': 'mp4',
+            'title': 'Eastnor Castle 2015 Firework Champions - The Promo!',
+            'description': 'md5:5967e090768a831488f6e74b7821b3c1',
+            'uploader_id': 'fireworkchampions',
+            'uploader': 'Firework Champions',
+            'upload_date': '20150910',
+            'timestamp': 1441901895,
+        },
+        'params': {
+            'skip_download': True,
+            'format': 'Original',
+        },
+    },
+    {
+        'url': 'https://vimeo.com/channels/staffpicks/143603739',
+        'info_dict': {
+            'id': '143603739',
+            'ext': 'mp4',
+            'uploader': 'Karim Huu Do',
+            'timestamp': 1445846953,
+            'upload_date': '20151026',
+            'title': 'The Shoes - Submarine Feat. Blaine Harrison',
+            'uploader_id': 'karimhd',
+            'description': 'md5:8e2eea76de4504c2e8020a9bcfa1e843',
+            'channel_id': 'staffpicks',
+            'duration': 336,
+            'comment_count': int,
+            'view_count': int,
+            'thumbnail': 'https://i.vimeocdn.com/video/541243181-b593db36a16db2f0096f655da3f5a4dc46b8766d77b0f440df937ecb0c418347-d_1280',
+            'like_count': int,
+            'uploader_url': 'https://vimeo.com/karimhd',
+            'channel_url': 'https://vimeo.com/channels/staffpicks',
+        },
+        'params': {'skip_download': 'm3u8'},
     },
     {
         # requires passing unlisted_hash(a52724358e) to load_download_config request
         'url': 'https://vimeo.com/392479337/a52724358e',
         'only_matching': True,
+    },
+    {
+        # similar, but all numeric: ID must be 581039021, not 9603038895
+        # issue #29690
+        'url': 'https://vimeo.com/581039021/9603038895',
+        'info_dict': {
+            'id': '581039021',
+            'ext': 'mp4',
+            'timestamp': 1627621014,
+            'release_timestamp': 1627621014,
+            'duration': 976,
+            'comment_count': int,
+            'thumbnail': 'https://i.vimeocdn.com/video/1202249320-4ddb2c30398c0dc0ee059172d1bd5ea481ad12f0e0e3ad01d2266f56c744b015-d_1280',
+            'like_count': int,
+            'uploader_url': 'https://vimeo.com/txwestcapital',
+            'release_date': '20210730',
+            'uploader': 'Christopher Inks',
+            'title': 'Thursday, July 29, 2021 BMA Evening Video Update',
+            'uploader_id': 'txwestcapital',
+            'upload_date': '20210730',
+        },
+        'params': {
+            'skip_download': True,
+        },
     }
     # https://gettingthingsdone.com/workflowmap/
     # vimeo embed with check-password page protected by Referer header
@@ -623,8 +764,36 @@ class VimeoIE(VimeoBaseInfoExtractor):
             raise ExtractorError('Wrong video password', expected=True)
         return checked

-    def _real_initialize(self):
-        self._login()
+    def _extract_from_api(self, video_id, unlisted_hash=None):
+        token = self._download_json(
+            'https://vimeo.com/_rv/jwt', video_id, headers={
+                'X-Requested-With': 'XMLHttpRequest'
+            })['token']
+        api_url = 'https://api.vimeo.com/videos/' + video_id
+        if unlisted_hash:
+            api_url += ':' + unlisted_hash
+        video = self._download_json(
+            api_url, video_id, headers={
+                'Authorization': 'jwt ' + token,
+            }, query={
+                'fields': 'config_url,created_time,description,license,metadata.connections.comments.total,metadata.connections.likes.total,release_time,stats.plays',
+            })
+        info = self._parse_config(self._download_json(
+            video['config_url'], video_id), video_id)
+        self._vimeo_sort_formats(info['formats'])
+        get_timestamp = lambda x: parse_iso8601(video.get(x + '_time'))
+        info.update({
+            'description': video.get('description'),
+            'license': video.get('license'),
+            'release_timestamp': get_timestamp('release'),
+            'timestamp': get_timestamp('created'),
+            'view_count': int_or_none(try_get(video, lambda x: x['stats']['plays'])),
+        })
+        connections = try_get(
+            video, lambda x: x['metadata']['connections'], dict) or {}
+        for k in ('comment', 'like'):
+            info[k + '_count'] = int_or_none(try_get(connections, lambda x: x[k + 's']['total']))
+        return info

     def _try_album_password(self, url):
         album_id = self._search_regex(
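
For context, the new _extract_from_api helper follows Vimeo's viewer-JWT pattern: fetch a short-lived token from the session endpoint, then call the public API with an 'Authorization: jwt ...' header, trimming the response via 'fields'. A rough standalone sketch using only the endpoints visible in the patch (session cookies and error handling omitted; this is not the extractor's actual plumbing):

    import json
    import urllib.request

    def fetch_vimeo_metadata(video_id):
        # 1) Short-lived JWT for the (possibly anonymous) session.
        req = urllib.request.Request(
            'https://vimeo.com/_rv/jwt',
            headers={'X-Requested-With': 'XMLHttpRequest'})
        token = json.load(urllib.request.urlopen(req))['token']
        # 2) Public API call authorized with that JWT.
        req = urllib.request.Request(
            f'https://api.vimeo.com/videos/{video_id}?fields=config_url,created_time',
            headers={'Authorization': f'jwt {token}'})
        return json.load(urllib.request.urlopen(req))
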
@@ -666,54 +835,26 @@ class VimeoIE(VimeoBaseInfoExtractor):
     def _real_extract(self, url):
         url, data = unsmuggle_url(url, {})
-        headers = std_headers.copy()
+        headers = self.get_param('http_headers').copy()
         if 'http_headers' in data:
             headers.update(data['http_headers'])
         if 'Referer' not in headers:
             headers['Referer'] = url

         # Extract ID from URL
-        video_id, unlisted_hash = self._match_valid_url(url).groups()
+        mobj = self._match_valid_url(url).groupdict()
+        video_id, unlisted_hash = mobj['id'], mobj.get('unlisted_hash')
         if unlisted_hash:
-            token = self._download_json(
-                'https://vimeo.com/_rv/jwt', video_id, headers={
-                    'X-Requested-With': 'XMLHttpRequest'
-                })['token']
-            video = self._download_json(
-                'https://api.vimeo.com/videos/%s:%s' % (video_id, unlisted_hash),
-                video_id, headers={
-                    'Authorization': 'jwt ' + token,
-                }, query={
-                    'fields': 'config_url,created_time,description,license,metadata.connections.comments.total,metadata.connections.likes.total,release_time,stats.plays',
-                })
-            info = self._parse_config(self._download_json(
-                video['config_url'], video_id), video_id)
-            self._vimeo_sort_formats(info['formats'])
-            get_timestamp = lambda x: parse_iso8601(video.get(x + '_time'))
-            info.update({
-                'description': video.get('description'),
-                'license': video.get('license'),
-                'release_timestamp': get_timestamp('release'),
-                'timestamp': get_timestamp('created'),
-                'view_count': int_or_none(try_get(video, lambda x: x['stats']['plays'])),
-            })
-            connections = try_get(
-                video, lambda x: x['metadata']['connections'], dict) or {}
-            for k in ('comment', 'like'):
-                info[k + '_count'] = int_or_none(try_get(connections, lambda x: x[k + 's']['total']))
-            return info
+            return self._extract_from_api(video_id, unlisted_hash)

         orig_url = url
         is_pro = 'vimeopro.com/' in url
-        is_player = '://player.vimeo.com/video/' in url
         if is_pro:
             # some videos require portfolio_id to be present in player url
             # https://github.com/ytdl-org/youtube-dl/issues/20070
             url = self._extract_url(url, self._download_webpage(url, video_id))
             if not url:
                 url = 'https://vimeo.com/' + video_id
-        elif is_player:
-            url = 'https://player.vimeo.com/video/' + video_id
         elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')):
             url = 'https://vimeo.com/' + video_id

@@ -734,14 +875,25 @@ class VimeoIE(VimeoBaseInfoExtractor):
                         expected=True)
                 raise

-        # Now we begin extracting as much information as we can from what we
-        # retrieved. First we extract the information common to all extractors,
-        # and latter we extract those that are Vimeo specific.
-        self.report_extraction(video_id)
+        if '://player.vimeo.com/video/' in url:
+            config = self._parse_json(self._search_regex(
+                r'\bconfig\s*=\s*({.+?})\s*;', webpage, 'info section'), video_id)
+            if config.get('view') == 4:
+                config = self._verify_player_video_password(
+                    redirect_url, video_id, headers)
+            info = self._parse_config(config, video_id)
+            self._vimeo_sort_formats(info['formats'])
+            return info
+
+        if re.search(r'<form[^>]+?id="pw_form"', webpage):
+            video_password = self._get_video_password()
+            token, vuid = self._extract_xsrft_and_vuid(webpage)
+            webpage = self._verify_video_password(
+                redirect_url, video_id, video_password, token, vuid)

         vimeo_config = self._extract_vimeo_config(webpage, video_id, default=None)
         if vimeo_config:
-            seed_status = vimeo_config.get('seed_status', {})
+            seed_status = vimeo_config.get('seed_status') or {}
             if seed_status.get('state') == 'failed':
                 raise ExtractorError(
                     '%s said: %s' % (self.IE_NAME, seed_status['title']),
@@ -750,70 +902,41 @@
         cc_license = None
         timestamp = None
         video_description = None
+        info_dict = {}
+        config_url = None

-        # Extract the config JSON
-        try:
-            try:
-                config_url = self._html_search_regex(
-                    r' data-config-url="(.+?)"', webpage,
-                    'config URL', default=None)
-                if not config_url:
-                    # Sometimes new react-based page is served instead of old one that require
-                    # different config URL extraction approach (see
-                    # https://github.com/ytdl-org/youtube-dl/pull/7209)
-                    page_config = self._parse_json(self._search_regex(
-                        r'vimeo\.(?:clip|vod_title)_page_config\s*=\s*({.+?});',
-                        webpage, 'page config'), video_id)
-                    config_url = page_config['player']['config_url']
-                    cc_license = page_config.get('cc_license')
-                    timestamp = try_get(
-                        page_config, lambda x: x['clip']['uploaded_on'],
-                        compat_str)
-                    video_description = clean_html(dict_get(
-                        page_config, ('description', 'description_html_escaped')))
-                config = self._download_json(config_url, video_id)
-            except RegexNotFoundError:
-                # For pro videos or player.vimeo.com urls
-                # We try to find out to which variable is assigned the config dic
-                m_variable_name = re.search(r'(\w)\.video\.id', webpage)
-                if m_variable_name is not None:
-                    config_re = [r'%s=({[^}].+?});' % re.escape(m_variable_name.group(1))]
-                else:
-                    config_re = [r' = {config:({.+?}),assets:', r'(?:[abc])=({.+?});']
-                config_re.append(r'\bvar\s+r\s*=\s*({.+?})\s*;')
-                config_re.append(r'\bconfig\s*=\s*({.+?})\s*;')
-                config = self._search_regex(config_re, webpage, 'info section',
-                                            flags=re.DOTALL)
-                config = json.loads(config)
-        except Exception as e:
-            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
-                raise ExtractorError('The author has restricted the access to this video, try with the "--referer" option')
-
-            if re.search(r'<form[^>]+?id="pw_form"', webpage) is not None:
-                if '_video_password_verified' in data:
-                    raise ExtractorError('video password verification failed!')
-                video_password = self._get_video_password()
-                token, vuid = self._extract_xsrft_and_vuid(webpage)
-                self._verify_video_password(
-                    redirect_url, video_id, video_password, token, vuid)
-                return self._real_extract(
-                    smuggle_url(redirect_url, {'_video_password_verified': 'verified'}))
-            else:
-                raise ExtractorError('Unable to extract info section',
-                                     cause=e)
-        else:
-            if config.get('view') == 4:
-                config = self._verify_player_video_password(redirect_url, video_id, headers)
-
+        channel_id = self._search_regex(
+            r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None)
+        if channel_id:
+            config_url = self._html_search_regex(
+                r'\bdata-config-url="([^"]+)"', webpage, 'config URL', default=None)
+            video_description = clean_html(get_element_by_class('description', webpage))
+            info_dict.update({
+                'channel_id': channel_id,
+                'channel_url': 'https://vimeo.com/channels/' + channel_id,
+            })
+        if not config_url:
+            page_config = self._parse_json(self._search_regex(
+                r'vimeo\.(?:clip|vod_title)_page_config\s*=\s*({.+?});',
+                webpage, 'page config', default='{}'), video_id, fatal=False)
+            if not page_config:
+                return self._extract_from_api(video_id)
+            config_url = page_config['player']['config_url']
+            cc_license = page_config.get('cc_license')
+            clip = page_config.get('clip') or {}
+            timestamp = clip.get('uploaded_on')
+            video_description = clean_html(
+                clip.get('description') or page_config.get('description_html_escaped'))
+        config = self._download_json(config_url, video_id)
         video = config.get('video') or {}
         vod = video.get('vod') or {}

         def is_rented():
             if '>You rented this title.<' in webpage:
                 return True
-            if config.get('user', {}).get('purchased'):
+            if try_get(config, lambda x: x['user']['purchased']):
                 return True
-            for purchase_option in vod.get('purchase_options', []):
+            for purchase_option in (vod.get('purchase_options') or []):
                 if purchase_option.get('purchased'):
                     return True
                 label = purchase_option.get('label_string')
@@ -828,14 +951,14 @@
                     'https://player.vimeo.com/player/%s' % feature_id,
                     {'force_feature_id': True}), 'Vimeo')

-        # Extract video description
         if not video_description:
             video_description = self._html_search_regex(
                 r'(?s)<div\s+class="[^"]*description[^"]*"[^>]*>(.*?)</div>',
                 webpage, 'description', default=None)
         if not video_description:
             video_description = self._html_search_meta(
-                'description', webpage, default=None)
+                ['description', 'og:description', 'twitter:description'],
+                webpage, default=None)
         if not video_description and is_pro:
             orig_webpage = self._download_webpage(
                 orig_url, video_id,
@@ -844,24 +967,17 @@
             if orig_webpage:
                 video_description = self._html_search_meta(
                     'description', orig_webpage, default=None)
-        if not video_description and not is_player:
+        if not video_description:
             self.report_warning('Cannot find video description')

-        # Extract upload date
         if not timestamp:
             timestamp = self._search_regex(
                 r'<time[^>]+datetime="([^"]+)"', webpage,
                 'timestamp', default=None)

-        try:
-            view_count = int(self._search_regex(r'UserPlays:(\d+)', webpage, 'view count'))
-            like_count = int(self._search_regex(r'UserLikes:(\d+)', webpage, 'like count'))
-            comment_count = int(self._search_regex(r'UserComments:(\d+)', webpage, 'comment count'))
-        except RegexNotFoundError:
-            # This info is only available in vimeo.com/{id} urls
-            view_count = None
-            like_count = None
-            comment_count = None
+        view_count = int_or_none(self._search_regex(r'UserPlays:(\d+)', webpage, 'view count', default=None))
+        like_count = int_or_none(self._search_regex(r'UserLikes:(\d+)', webpage, 'like count', default=None))
+        comment_count = int_or_none(self._search_regex(r'UserComments:(\d+)', webpage, 'comment count', default=None))

         formats = []

@@ -881,11 +997,7 @@
             r'<link[^>]+rel=["\']license["\'][^>]+href=(["\'])(?P<license>(?:(?!\1).)+)\1',
             webpage, 'license', default=None, group='license')

-        channel_id = self._search_regex(
-            r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None)
-        channel_url = 'https://vimeo.com/channels/%s' % channel_id if channel_id else None
-
-        info_dict = {
+        info_dict.update({
             'formats': formats,
             'timestamp': unified_timestamp(timestamp),
             'description': video_description,
@@ -894,18 +1006,14 @@
             'like_count': like_count,
             'comment_count': comment_count,
             'license': cc_license,
-            'channel_id': channel_id,
-            'channel_url': channel_url,
-        }
-
-        info_dict = merge_dicts(info_dict, info_dict_config, json_ld)
+        })

-        return info_dict
+        return merge_dicts(info_dict, info_dict_config, json_ld)


 class VimeoOndemandIE(VimeoIE):
     IE_NAME = 'vimeo:ondemand'
-    _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/([^/]+/)?(?P<id>[^/?#&]+)'
+    _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/(?:[^/]+/)?(?P<id>[^/?#&]+)'
     _TESTS = [{
         # ondemand video not available via https://vimeo.com/id
         'url': 'https://vimeo.com/ondemand/20704',
@@ -917,9 +1025,15 @@ class VimeoOndemandIE(VimeoIE):
             'uploader': 'גם סרטים',
             'uploader_url': r're:https?://(?:www\.)?vimeo\.com/gumfilms',
             'uploader_id': 'gumfilms',
-            'description': 'md5:4c027c965e439de4baab621e48b60791',
+            'description': 'md5:aeeba3dbd4d04b0fa98a4fdc9c639998',
             'upload_date': '20140906',
             'timestamp': 1410032453,
+            'thumbnail': 'https://i.vimeocdn.com/video/488238335-d7bf151c364cff8d467f1b73784668fe60aae28a54573a35d53a1210ae283bd8-d_1280',
+            'comment_count': int,
+            'license': 'https://creativecommons.org/licenses/by-nc-nd/3.0/',
+            'duration': 53,
+            'view_count': int,
+            'like_count': int,
         },
         'params': {
             'format': 'best[protocol=https]',
@@ -938,6 +1052,11 @@ class VimeoOndemandIE(VimeoIE):
             'description': 'md5:c3c46a90529612c8279fb6af803fc0df',
             'upload_date': '20150502',
             'timestamp': 1430586422,
+            'duration': 121,
+            'comment_count': int,
+            'view_count': int,
+            'thumbnail': 'https://i.vimeocdn.com/video/517077723-7066ae1d9a79d3eb361334fb5d58ec13c8f04b52f8dd5eadfbd6fb0bcf11f613-d_1280',
+            'like_count': int,
         },
         'params': {
             'skip_download': True,
@@ -967,7 +1086,7 @@ class VimeoChannelIE(VimeoBaseInfoExtractor):
             'id': 'tributes',
             'title': 'Vimeo Tributes',
         },
-        'playlist_mincount': 25,
+        'playlist_mincount': 22,
     }]
     _BASE_URL_TEMPL = 'https://vimeo.com/channels/%s'

@@ -1128,10 +1247,10 @@ class VimeoGroupsIE(VimeoChannelIE):
     IE_NAME = 'vimeo:group'
     _VALID_URL = r'https://vimeo\.com/groups/(?P<id>[^/]+)(?:/(?!videos?/\d+)|$)'
     _TESTS = [{
-        'url': 'https://vimeo.com/groups/kattykay',
+        'url': 'https://vimeo.com/groups/meetup',
         'info_dict': {
-            'id': 'kattykay',
-            'title': 'Katty Kay',
+            'id': 'meetup',
+            'title': 'Vimeo Meetup!',
         },
         'playlist_mincount': 27,
     }]
@@ -1152,8 +1271,10 @@ class VimeoReviewIE(VimeoBaseInfoExtractor):
             'uploader': 'Richard Hardwick',
             'uploader_id': 'user21297594',
             'description': "Comedian Dick Hardwick's five minute demo filmed in front of a live theater audience.\nEdit by Doug Mattocks",
+            'duration': 304,
+            'thumbnail': 'https://i.vimeocdn.com/video/450115033-43303819d9ebe24c2630352e18b7056d25197d09b3ae901abdac4c4f1d68de71-d_1280',
+            'uploader_url': 'https://vimeo.com/user21297594',
         },
-        'expected_warnings': ['Unable to download JSON metadata'],
     }, {
         'note': 'video player needs Referer',
         'url': 'https://vimeo.com/user22258446/review/91613211/13f927e053',
@@ -1184,9 +1305,6 @@ class VimeoReviewIE(VimeoBaseInfoExtractor):
         'skip': 'video gone',
     }]

-    def _real_initialize(self):
-        self._login()
-
     def _real_extract(self, url):
         page_url, video_id = self._match_valid_url(url).groups()
         data = self._download_json(
@@ -1228,9 +1346,6 @@ class VimeoWatchLaterIE(VimeoChannelIE):
         'only_matching': True,
     }]

-    def _real_initialize(self):
-        self._login()
-
     def _page_url(self, base_url, pagenum):
         url = '%s/page:%d/' % (base_url, pagenum)
         request = sanitized_Request(url)
diff --git a/hypervideo_dl/extractor/vimm.py b/hypervideo_dl/extractor/vimm.py
new file mode 100644
index 0000000..060b92b
--- /dev/null
+++ b/hypervideo_dl/extractor/vimm.py
@@ -0,0 +1,69 @@
+# coding: utf-8
+from .common import InfoExtractor
+
+
+class VimmIE(InfoExtractor):
+    IE_NAME = 'Vimm:stream'
+    _VALID_URL = r'https?://(?:www\.)?vimm\.tv/(?:c/)?(?P<id>[0-9a-z-]+)$'
+    _TESTS = [{
+        'url': 'https://www.vimm.tv/c/calimeatwagon',
+        'info_dict': {
+            'id': 'calimeatwagon',
+            'ext': 'mp4',
+            'title': 're:^calimeatwagon [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'live_status': 'is_live',
+        },
+        'skip': 'Live',
+    }, {
+        'url': 'https://www.vimm.tv/octaafradio',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        channel_id = self._match_id(url)
+
+        formats, subs = self._extract_m3u8_formats_and_subtitles(
+            f'https://www.vimm.tv/hls/{channel_id}.m3u8', channel_id, 'mp4', m3u8_id='hls', live=True)
+        self._sort_formats(formats)
+
+        return {
+            'id': channel_id,
+            'title': channel_id,
+            'is_live': True,
+            'formats': formats,
+            'subtitles': subs,
+        }
+
+
+class VimmRecordingIE(InfoExtractor):
+    IE_NAME = 'Vimm:recording'
+    _VALID_URL = r'https?://(?:www\.)?vimm\.tv/c/(?P<channel_id>[0-9a-z-]+)\?v=(?P<video_id>[0-9A-Za-z]+)'
+    _TESTS = [{
+        'url': 'https://www.vimm.tv/c/kaldewei?v=2JZsrPTFxsSz',
+        'md5': '15122ee95baa32a548e4a3e120b598f1',
+        'info_dict': {
+            'id': '2JZsrPTFxsSz',
+            'ext': 'mp4',
+            'title': 'VIMM - [DE/GER] Kaldewei Live - In Farbe und Bunt',
+            'uploader_id': 'kaldewei',
+        },
+    }]
+
+    def _real_extract(self, url):
+        channel_id, video_id = self._match_valid_url(url).groups()
+
+        webpage = self._download_webpage(url, video_id)
+        title = self._og_search_title(webpage)
+
+        formats, subs = self._extract_m3u8_formats_and_subtitles(
+            f'https://d211qfrkztakg3.cloudfront.net/{channel_id}/{video_id}/index.m3u8', video_id, 'mp4', m3u8_id='hls', live=False)
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'is_live': False,
+            'uploader_id': channel_id,
+            'formats': formats,
+            'subtitles': subs,
+        }
diff --git a/hypervideo_dl/extractor/vine.py b/hypervideo_dl/extractor/vine.py
index 07fce0d..e59b103 100644
--- a/hypervideo_dl/extractor/vine.py
+++ b/hypervideo_dl/extractor/vine.py
@@ -6,6 +6,7 @@ from .common import InfoExtractor
 from ..compat import compat_str
 from ..utils import (
     determine_ext,
+    format_field,
     int_or_none,
     unified_timestamp,
 )
@@ -92,7 +93,7 @@ class VineIE(InfoExtractor):

         username = data.get('username')

-        alt_title = 'Vine by %s' % username if username else None
+        alt_title = format_field(username, template='Vine by %s')

         return {
             'id': video_id,
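
The format_field helper adopted in the vine.py hunk above replaces a hand-rolled ternary. Roughly, it formats a value through a template unless the value is empty; the sketch below is a simplified model of that behavior (the real utils.format_field takes additional parameters such as field, func and default), not its actual implementation:

    def format_field(variable, template='%s', ignore=(None, ''), default=''):
        # Apply the template only when the value is meaningful.
        return template % variable if variable not in ignore else default

    format_field('bob', template='Vine by %s')  # -> 'Vine by bob'
    format_field(None, template='Vine by %s')   # -> ''
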
diff --git a/hypervideo_dl/extractor/viu.py b/hypervideo_dl/extractor/viu.py
index 1b34c52..3cfca89 100644
--- a/hypervideo_dl/extractor/viu.py
+++ b/hypervideo_dl/extractor/viu.py
@@ -1,55 +1,32 @@
 # coding: utf-8
 from __future__ import unicode_literals

-import json
 import re
+import json
+import uuid
+import random
+import urllib.parse

 from .common import InfoExtractor
-from ..compat import (
-    compat_kwargs,
-    compat_str,
-    compat_urlparse,
-    compat_urllib_request,
-)
+from ..compat import compat_str
 from ..utils import (
     ExtractorError,
     int_or_none,
+    strip_or_none,
     try_get,
     smuggle_url,
     unsmuggle_url,
+    url_or_none,
 )


 class ViuBaseIE(InfoExtractor):
-    def _real_initialize(self):
-        viu_auth_res = self._request_webpage(
-            'https://www.viu.com/api/apps/v2/authenticate', None,
-            'Requesting Viu auth', query={
-                'acct': 'test',
-                'appid': 'viu_desktop',
-                'fmt': 'json',
-                'iid': 'guest',
-                'languageid': 'default',
-                'platform': 'desktop',
-                'userid': 'guest',
-                'useridtype': 'guest',
-                'ver': '1.0'
-            }, headers=self.geo_verification_headers())
-        self._auth_token = viu_auth_res.info()['X-VIU-AUTH']
-
-    def _call_api(self, path, *args, **kwargs):
-        headers = self.geo_verification_headers()
-        headers.update({
-            'X-VIU-AUTH': self._auth_token
-        })
-        headers.update(kwargs.get('headers', {}))
-        kwargs['headers'] = headers
+    def _call_api(self, path, *args, headers={}, **kwargs):
         response = self._download_json(
-            'https://www.viu.com/api/' + path, *args,
-            **compat_kwargs(kwargs))['response']
+            f'https://www.viu.com/api/{path}', *args, **kwargs,
+            headers={**self.geo_verification_headers(), **headers})['response']
         if response.get('status') != 'success':
-            raise ExtractorError('%s said: %s' % (
-                self.IE_NAME, response['message']), expected=True)
+            raise ExtractorError(f'{self.IE_NAME} said: {response["message"]}', expected=True)
         return response

@@ -101,6 +78,7 @@ class ViuIE(ViuBaseIE):
         tdirforwhole = video_data.get('tdirforwhole')
         # #EXT-X-BYTERANGE is not supported by native hls downloader
         # and ffmpeg (#10955)
+        # FIXME: It is supported in hypervideo
         # hls_file = video_data.get('hlsfile')
         hls_file = video_data.get('jwhlsfile')
         if url_path and tdirforwhole and hls_file:
@@ -110,10 +88,9 @@ class ViuIE(ViuBaseIE):
             #     r'(/hlsc_)[a-z]+(\d+\.m3u8)',
             #     r'\1whe\2', video_data['href'])
             m3u8_url = video_data['href']
-        formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4')
+        formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4')
         self._sort_formats(formats)

-        subtitles = {}
         for key, value in video_data.items():
             mobj = re.match(r'^subtitle_(?P<lang>[^_]+)_(?P<ext>(vtt|srt))', key)
             if not mobj:
@@ -227,42 +204,63 @@ class ViuOTTIE(InfoExtractor):
         'zh-cn': 2,
         'en-us': 3,
     }
-    _user_info = None
+
+    _user_token = None
+    _auth_codes = {}

     def _detect_error(self, response):
-        code = response.get('status', {}).get('code')
-        if code > 0:
+        code = try_get(response, lambda x: x['status']['code'])
+        if code and code > 0:
             message = try_get(response, lambda x: x['status']['message'])
-            raise ExtractorError('%s said: %s (%s)' % (
-                self.IE_NAME, message, code), expected=True)
-        return response['data']
-
-    def _raise_login_required(self):
-        raise ExtractorError(
-            'This video requires login. '
-            'Specify --username and --password or --netrc (machine: %s) '
-            'to provide account credentials.' % self._NETRC_MACHINE,
-            expected=True)
+            raise ExtractorError(f'{self.IE_NAME} said: {message} ({code})', expected=True)
+        return response.get('data') or {}

     def _login(self, country_code, video_id):
-        if not self._user_info:
+        if self._user_token is None:
             username, password = self._get_login_info()
-            if username is None or password is None:
+            if username is None:
                 return
+            headers = {
+                'Authorization': f'Bearer {self._auth_codes[country_code]}',
+                'Content-Type': 'application/json'
+            }
+            data = self._download_json(
+                'https://api-gateway-global.viu.com/api/account/validate',
+                video_id, 'Validating email address', headers=headers,
+                data=json.dumps({
+                    'principal': username,
+                    'provider': 'email'
+                }).encode())
+            if not data.get('exists'):
+                raise ExtractorError('Invalid email address')

             data = self._download_json(
-                compat_urllib_request.Request(
-                    'https://www.viu.com/ott/%s/index.php' % country_code, method='POST'),
-                video_id, 'Logging in', errnote=False, fatal=False,
-                query={'r': 'user/login'},
+                'https://api-gateway-global.viu.com/api/auth/login',
+                video_id, 'Logging in', headers=headers,
                 data=json.dumps({
-                    'username': username,
+                    'email': username,
                     'password': password,
-                    'platform_flag_label': 'web',
+                    'provider': 'email',
                 }).encode())
-            self._user_info = self._detect_error(data)['user']
-
-        return self._user_info
+            self._detect_error(data)
+            self._user_token = data.get('identity')
+            # need to update with valid user's token else will throw an error again
+            self._auth_codes[country_code] = data.get('token')
+        return self._user_token
+
+    def _get_token(self, country_code, video_id):
+        rand = ''.join(random.choice('0123456789') for _ in range(10))
+        return self._download_json(
+            f'https://api-gateway-global.viu.com/api/auth/token?v={rand}000', video_id,
+            headers={'Content-Type': 'application/json'}, note='Getting bearer token',
+            data=json.dumps({
+                'countryCode': country_code.upper(),
+                'platform': 'browser',
+                'platformFlagLabel': 'web',
+                'language': 'en',
+                'uuid': str(uuid.uuid4()),
+                'carrierId': '0'
+            }).encode('utf-8'))['token']

     def _real_extract(self, url):
         url, idata = unsmuggle_url(url, {})
@@ -279,17 +277,16 @@
             query['area_id'] = area_id

         product_data = self._download_json(
-            'http://www.viu.com/ott/%s/index.php' % country_code, video_id,
+            f'http://www.viu.com/ott/{country_code}/index.php', video_id,
             'Downloading video info', query=query)['data']

         video_data = product_data.get('current_product')
         if not video_data:
-            raise ExtractorError('This video is not available in your region.', expected=True)
+            self.raise_geo_restricted()

         series_id = video_data.get('series_id')
-        if not self.get_param('noplaylist') and not idata.get('force_noplaylist'):
-            self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % series_id)
-            series = product_data.get('series', {})
+        if self._yes_playlist(series_id, video_id, idata):
+            series = product_data.get('series') or {}
             product = series.get('product')
             if product:
                 entries = []
@@ -297,88 +294,78 @@
                     item_id = entry.get('product_id')
                     if not item_id:
                         continue
-                    item_id = compat_str(item_id)
                     entries.append(self.url_result(
-                        smuggle_url(
-                            'http://www.viu.com/ott/%s/%s/vod/%s/' % (country_code, lang_code, item_id),
-                            {'force_noplaylist': True}),  # prevent infinite recursion
-                        'ViuOTT',
-                        item_id,
-                        entry.get('synopsis', '').strip()))
+                        smuggle_url(f'http://www.viu.com/ott/{country_code}/{lang_code}/vod/{item_id}/',
+                                    {'force_noplaylist': True}),
+                        ViuOTTIE, str(item_id), entry.get('synopsis', '').strip()))

                 return self.playlist_result(entries, series_id, series.get('name'), series.get('description'))

-        if self.get_param('noplaylist'):
-            self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
-
         duration_limit = False
         query = {
             'ccs_product_id': video_data['ccs_product_id'],
             'language_flag_id': self._LANGUAGE_FLAG.get(lang_code.lower()) or '3',
         }
-        headers = {
-            'Referer': url,
-            'Origin': url,
-        }
-        try:
+
+        def download_playback():
             stream_data = self._download_json(
-                'https://d1k2us671qcoau.cloudfront.net/distribute_web_%s.php' % country_code,
-                video_id, 'Downloading stream info', query=query, headers=headers)
-            stream_data = self._detect_error(stream_data)['stream']
-        except (ExtractorError, KeyError):
-            stream_data = None
-            if video_data.get('user_level', 0) > 0:
-                user = self._login(country_code, video_id)
-                if user:
-                    query['identity'] = user['identity']
-                    stream_data = self._download_json(
-                        'https://d1k2us671qcoau.cloudfront.net/distribute_web_%s.php' % country_code,
-                        video_id, 'Downloading stream info', query=query, headers=headers)
-                    stream_data = self._detect_error(stream_data).get('stream')
-            else:
-                # preview is limited to 3min for non-members
-                # try to bypass the duration limit
-                duration_limit = True
-                query['duration'] = '180'
-                stream_data = self._download_json(
-                    'https://d1k2us671qcoau.cloudfront.net/distribute_web_%s.php' % country_code,
-                    video_id, 'Downloading stream info', query=query, headers=headers)
-                try:
-                    stream_data = self._detect_error(stream_data)['stream']
-                except (ExtractorError, KeyError):  # if still not working, give up
-                    self._raise_login_required()
+                'https://api-gateway-global.viu.com/api/playback/distribute',
+                video_id=video_id, query=query, fatal=False, note='Downloading stream info',
+                headers={
+                    'Authorization': f'Bearer {self._auth_codes[country_code]}',
+                    'Referer': url,
+                    'Origin': url
+                })
+            return self._detect_error(stream_data).get('stream')
+
+        if not self._auth_codes.get(country_code):
+            self._auth_codes[country_code] = self._get_token(country_code, video_id)
+        stream_data = None
+        try:
+            stream_data = download_playback()
+        except (ExtractorError, KeyError):
+            token = self._login(country_code, video_id)
+            if token is not None:
+                query['identity'] = token
+            else:
+                # The content is Preview or for VIP only.
+                # We can try to bypass the duration which is limited to 3mins only
+                duration_limit, query['duration'] = True, '180'
+            try:
+                stream_data = download_playback()
+            except (ExtractorError, KeyError):
+                if token is not None:
+                    raise
+                self.raise_login_required(method='password')
         if not stream_data:
             raise ExtractorError('Cannot get stream info', expected=True)

-        stream_sizes = stream_data.get('size', {})
         formats = []
-        for vid_format, stream_url in stream_data.get('url', {}).items():
-            height = int_or_none(self._search_regex(
-                r's(\d+)p', vid_format, 'height', default=None))
+        for vid_format, stream_url in (stream_data.get('url') or {}).items():
+            height = int(self._search_regex(r's(\d+)p', vid_format, 'height', default=None))

             # bypass preview duration limit
             if duration_limit:
-                stream_url = compat_urlparse.urlparse(stream_url)
-                query = dict(compat_urlparse.parse_qsl(stream_url.query, keep_blank_values=True))
-                time_duration = int_or_none(video_data.get('time_duration'))
+                old_stream_url = urllib.parse.urlparse(stream_url)
+                query = dict(urllib.parse.parse_qsl(old_stream_url.query, keep_blank_values=True))
                 query.update({
-                    'duration': time_duration if time_duration > 0 else '9999999',
+                    'duration': video_data.get('time_duration') or '9999999',
                     'duration_start': '0',
                 })
-                stream_url = stream_url._replace(query=compat_urlparse.urlencode(query)).geturl()
+                stream_url = old_stream_url._replace(query=urllib.parse.urlencode(query)).geturl()

             formats.append({
                 'format_id': vid_format,
                 'url': stream_url,
                 'height': height,
                 'ext': 'mp4',
-                'filesize': int_or_none(stream_sizes.get(vid_format))
+                'filesize': try_get(stream_data, lambda x: x['size'][vid_format], int)
             })
         self._sort_formats(formats)

         subtitles = {}
-        for sub in video_data.get('subtitle', []):
+        for sub in video_data.get('subtitle') or []:
             sub_url = sub.get('url')
             if not sub_url:
                 continue
@@ -387,17 +374,16 @@
                 'ext': 'srt',
             })

-        title = video_data['synopsis'].strip()
-
+        title = strip_or_none(video_data.get('synopsis'))
         return {
             'id': video_id,
             'title': title,
             'description': video_data.get('description'),
-            'series': product_data.get('series', {}).get('name'),
+            'series': try_get(product_data, lambda x: x['series']['name']),
             'episode': title,
             'episode_number': int_or_none(video_data.get('number')),
             'duration': int_or_none(stream_data.get('duration')),
-            'thumbnail': video_data.get('cover_image_url'),
+            'thumbnail': url_or_none(video_data.get('cover_image_url')),
             'formats': formats,
             'subtitles': subtitles,
         }
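
To summarize the reworked ViuOTT playback flow above: an anonymous bearer token is fetched once per country code, playback is attempted with it, and only on failure does the extractor either log in (promoting the stored token to the user's) or fall back to the 3-minute preview request. Schematically, with a hypothetical _download_playback helper standing in for the inline download_playback closure:

    def _get_stream(self, country_code, video_id, query):
        if not self._auth_codes.get(country_code):
            # An anonymous bearer token is enough for free content.
            self._auth_codes[country_code] = self._get_token(country_code, video_id)
        try:
            return self._download_playback(video_id, query)
        except ExtractorError:
            token = self._login(country_code, video_id)  # replaces the anonymous token
            if token is not None:
                query['identity'] = token
            else:
                query['duration'] = '180'  # anonymous preview fallback
            return self._download_playback(video_id, query)
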
diff --git a/hypervideo_dl/extractor/vk.py b/hypervideo_dl/extractor/vk.py
index d8a9b9a..cbc3159 100644
--- a/hypervideo_dl/extractor/vk.py
+++ b/hypervideo_dl/extractor/vk.py
@@ -2,7 +2,6 @@
 from __future__ import unicode_literals

 import collections
-import functools
 import re

 from .common import InfoExtractor
@@ -12,7 +11,6 @@ from ..utils import (
     ExtractorError,
     get_element_by_class,
     int_or_none,
-    OnDemandPagedList,
     orderedSet,
     str_or_none,
     str_to_int,
@@ -31,11 +29,7 @@ from .youtube import YoutubeIE
 class VKBaseIE(InfoExtractor):
     _NETRC_MACHINE = 'vk'

-    def _login(self):
-        username, password = self._get_login_info()
-        if username is None:
-            return
-
+    def _perform_login(self, username, password):
         login_page, url_handle = self._download_webpage_handle(
             'https://vk.com', None, 'Downloading login page')

@@ -51,7 +45,7 @@ class VKBaseIE(InfoExtractor):
         self._apply_first_set_cookie_header(url_handle, 'remixlhk')

         login_page = self._download_webpage(
-            'https://login.vk.com/?act=login', None,
+            'https://vk.com/login', None,
             note='Logging in',
             data=urlencode_postdata(login_form))

@@ -59,9 +53,6 @@
             raise ExtractorError(
                 'Unable to login, incorrect username and/or password', expected=True)

-    def _real_initialize(self):
-        self._login()
-
     def _download_payload(self, path, video_id, data, fatal=True):
         data['al'] = 1
         code, payload = self._download_json(
@@ -87,10 +78,10 @@ class VKIE(VKBaseIE):
                     )
                     ext\.php\?(?P<embed_query>.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+).*)|
                     (?:
-                        (?:(?:m|new)\.)?vk\.com/(?:.+?\?.*?z=)?video|
+                        (?:(?:m|new)\.)?vk\.com/(?:.+?\?.*?z=)?(?:video|clip)|
                        (?:www\.)?daxab.com/embed/
                     )
-                    (?P<videoid>-?\d+_\d+)(?:.*\blist=(?P<list_id>[\da-f]+))?
+                    (?P<videoid>-?\d+_\d+)(?:.*\blist=(?P<list_id>([\da-f]+)|(ln-[\da-zA-Z]+)))?
                 )
             '''
     _TESTS = [
@@ -182,6 +173,17 @@
             'skip': 'Removed',
         },
         {
+            'url': 'https://vk.com/video-93049196_456239755?list=ln-cBjJ7S4jYYx3ADnmDT',
+            'info_dict': {
+                'id': '-93049196_456239755',
+                'ext': 'mp4',
+                'title': '8 серия (озвучка)',
+                'duration': 8383,
+                'upload_date': '20211222',
+                'view_count': int,
+            },
+        },
+        {
             # video (removed?) only available with list id
             'url': 'https://vk.com/video30481095_171201961?list=8764ae2d21f14088d4',
             'md5': '091287af5402239a1051c37ec7b92913',
@@ -298,6 +300,10 @@
             # The video is not available in your region.
             'url': 'https://vk.com/video-51812607_171445436',
             'only_matching': True,
+        },
+        {
+            'url': 'https://vk.com/clip30014565_456240946',
+            'only_matching': True,
         }]

     @staticmethod
@@ -434,8 +440,6 @@
         # 2 = live
         # 3 = post live (finished live)
         is_live = data.get('live') == 2
-        if is_live:
-            title = self._live_title(title)

         timestamp = unified_timestamp(self._html_search_regex(
             r'class=["\']mv_info_date[^>]+>([^<]+)(?:<|from)', info_page,
@@ -471,6 +475,13 @@
             })
         self._sort_formats(formats)

+        subtitles = {}
+        for sub in data.get('subs') or {}:
+            subtitles.setdefault(sub.get('lang', 'en'), []).append({
+                'ext': sub.get('title', '.srt').split('.')[-1],
+                'url': url_or_none(sub.get('url')),
+            })
+
         return {
             'id': video_id,
             'formats': formats,
@@ -484,69 +495,66 @@
             'like_count': int_or_none(mv_data.get('likes')),
             'comment_count': int_or_none(mv_data.get('commcount')),
             'is_live': is_live,
+            'subtitles': subtitles,
         }


 class VKUserVideosIE(VKBaseIE):
     IE_NAME = 'vk:uservideos'
     IE_DESC = "VK - User's Videos"
-    _VALID_URL = r'https?://(?:(?:m|new)\.)?vk\.com/videos(?P<id>-?[0-9]+)(?!\?.*\bz=video)(?:[/?#&](?:.*?\bsection=(?P<section>\w+))?|$)'
+    _VALID_URL = r'https?://(?:(?:m|new)\.)?vk\.com/video/@(?P<id>[^?$#/&]+)(?!\?.*\bz=video)(?:[/?#&](?:.*?\bsection=(?P<section>\w+))?|$)'
     _TEMPLATE_URL = 'https://vk.com/videos'
     _TESTS = [{
-        'url': 'https://vk.com/videos-767561',
+        'url': 'https://vk.com/video/@mobidevices',
         'info_dict': {
-            'id': '-767561_all',
+            'id': '-17892518_all',
         },
-        'playlist_mincount': 1150,
+        'playlist_mincount': 1355,
    }, {
-        'url': 'https://vk.com/videos-767561?section=uploaded',
+        'url': 'https://vk.com/video/@mobidevices?section=uploaded',
         'info_dict': {
-            'id': '-767561_uploaded',
+            'id': '-17892518_uploaded',
         },
-        'playlist_mincount': 425,
-    }, {
-        'url': 'http://vk.com/videos205387401',
-        'only_matching': True,
-    }, {
-        'url': 'http://vk.com/videos-77521',
-        'only_matching': True,
-    }, {
-        'url': 'http://vk.com/videos-97664626?section=all',
-        'only_matching': True,
-    }, {
-        'url': 'http://m.vk.com/videos205387401',
-        'only_matching': True,
-    }, {
-        'url': 'http://new.vk.com/videos205387401',
-        'only_matching': True,
+        'playlist_mincount': 182,
     }]
-    _PAGE_SIZE = 1000
     _VIDEO = collections.namedtuple('Video', ['owner_id', 'id'])

-    def _fetch_page(self, page_id, section, page):
-        l = self._download_payload('al_video', page_id, {
+    def _entries(self, page_id, section):
+        video_list_json = self._download_payload('al_video', page_id, {
             'act': 'load_videos_silent',
-            'offset': page * self._PAGE_SIZE,
+            'offset': 0,
             'oid': page_id,
             'section': section,
-        })[0][section]['list']
-
-        for video in l:
-            v = self._VIDEO._make(video[:2])
-            video_id = '%d_%d' % (v.owner_id, v.id)
-            yield self.url_result(
-                'http://vk.com/video' + video_id, VKIE.ie_key(), video_id)
+        })[0][section]
+        count = video_list_json['count']
+        total = video_list_json['total']
+        video_list = video_list_json['list']
+
+        while True:
+            for video in video_list:
+                v = self._VIDEO._make(video[:2])
+                video_id = '%d_%d' % (v.owner_id, v.id)
+                yield self.url_result(
+                    'http://vk.com/video' + video_id, VKIE.ie_key(), video_id)
+            if count >= total:
+                break
+            video_list_json = self._download_payload('al_video', page_id, {
+                'act': 'load_videos_silent',
+                'offset': count,
+                'oid': page_id,
+                'section': section,
+            })[0][section]
+            count += video_list_json['count']
+            video_list = video_list_json['list']

     def _real_extract(self, url):
-        page_id, section = self._match_valid_url(url).groups()
+        u_id, section = self._match_valid_url(url).groups()
+        webpage = self._download_webpage(url, u_id)
+        page_id = self._search_regex(r'data-owner-id\s?=\s?"([^"]+)"', webpage, 'page_id')
         if not section:
             section = 'all'

-        entries = OnDemandPagedList(
-            functools.partial(self._fetch_page, page_id, section),
-            self._PAGE_SIZE)
-
-        return self.playlist_result(entries, '%s_%s' % (page_id, section))
+        return self.playlist_result(self._entries(page_id, section), '%s_%s' % (page_id, section))


 class VKWallPostIE(VKBaseIE):
@@ -673,7 +681,7 @@
                     'artist': performer,
                     'track': title,
                     'ext': 'mp4',
-                    'protocol': 'm3u8',
+                    'protocol': 'm3u8_native',
                 })

         for video in re.finditer(
diff --git a/hypervideo_dl/extractor/vlive.py b/hypervideo_dl/extractor/vlive.py
index 84f51a5..ae35c97 100644
--- a/hypervideo_dl/extractor/vlive.py
+++ b/hypervideo_dl/extractor/vlive.py
@@ -12,22 +12,65 @@ from ..compat import (
 from ..utils import (
     ExtractorError,
     int_or_none,
+    LazyList,
     merge_dicts,
     str_or_none,
     strip_or_none,
     try_get,
     urlencode_postdata,
+    url_or_none,
 )


 class VLiveBaseIE(NaverBaseIE):
-    _APP_ID = '8c6cc7b45d2568fb668be6e05b6e5a3b'
+    _NETRC_MACHINE = 'vlive'
+    _logged_in = False
+
+    def _perform_login(self, username, password):
+        if self._logged_in:
+            return
+        LOGIN_URL = 'https://www.vlive.tv/auth/email/login'
+        self._request_webpage(
+            LOGIN_URL, None, note='Downloading login cookies')
+
+        self._download_webpage(
+            LOGIN_URL, None, note='Logging in',
+            data=urlencode_postdata({'email': username, 'pwd': password}),
+            headers={
+                'Referer': LOGIN_URL,
+                'Content-Type': 'application/x-www-form-urlencoded'
+            })
+
+        login_info = self._download_json(
+            'https://www.vlive.tv/auth/loginInfo', None,
+            note='Checking login status',
+            headers={'Referer': 'https://www.vlive.tv/home'})
+
+        if not try_get(login_info, lambda x: x['message']['login'], bool):
+            raise ExtractorError('Unable to log in', expected=True)
+        VLiveBaseIE._logged_in = True
+
+    def _call_api(self, path_template, video_id, fields=None, query_add={}, note=None):
+        if note is None:
+            note = 'Downloading %s JSON metadata' % path_template.split('/')[-1].split('-')[0]
+        query = {'appId': '8c6cc7b45d2568fb668be6e05b6e5a3b', 'gcc': 'KR', 'platformType': 'PC'}
+        if fields:
+            query['fields'] = fields
+        if query_add:
+            query.update(query_add)
+        try:
+            return self._download_json(
+                'https://www.vlive.tv/globalv-web/vam-web/' + path_template % video_id, video_id,
+                note, headers={'Referer': 'https://www.vlive.tv/'}, query=query)
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+                self.raise_login_required(json.loads(e.cause.read().decode('utf-8'))['message'])
+            raise


 class VLiveIE(VLiveBaseIE):
     IE_NAME = 'vlive'
     _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/(?:video|embed)/(?P<id>[0-9]+)'
-    _NETRC_MACHINE = 'vlive'
     _TESTS = [{
         'url': 'http://www.vlive.tv/video/1326',
         'md5': 'cc7314812855ce56de70a06a27314983',
@@ -38,6 +81,12 @@ class VLiveIE(VLiveBaseIE):
             'creator': "Girl's Day",
             'view_count': int,
             'uploader_id': 'muploader_a',
+            'upload_date': '20150817',
+            'thumbnail': r're:^https?://.*\.(?:jpg|png)$',
+            'timestamp': 1439816449,
+        },
+        'params': {
+            'skip_download': True,
         },
     }, {
         'url': 'http://www.vlive.tv/video/16937',
@@ -49,6 +98,9 @@ class VLiveIE(VLiveBaseIE):
             'view_count': int,
             'subtitles': 'mincount:12',
             'uploader_id': 'muploader_j',
+            'upload_date': '20161112',
+            'thumbnail': r're:^https?://.*\.(?:jpg|png)$',
+            'timestamp': 1478923074,
         },
         'params': {
             'skip_download': True,
@@ -81,53 +133,6 @@ class VLiveIE(VLiveBaseIE):
         'playlist_mincount': 120
     }]

-    def _real_initialize(self):
-        self._login()
-
-    def _login(self):
-        email, password = self._get_login_info()
-        if None in (email, password):
-            return
-
-        def is_logged_in():
-            login_info = self._download_json(
-                'https://www.vlive.tv/auth/loginInfo', None,
-                note='Downloading login info',
-                headers={'Referer': 'https://www.vlive.tv/home'})
-            return try_get(
-                login_info, lambda x: x['message']['login'], bool) or False
-
-        LOGIN_URL = 'https://www.vlive.tv/auth/email/login'
-        self._request_webpage(
-            LOGIN_URL, None, note='Downloading login cookies')
-
-        self._download_webpage(
-            LOGIN_URL, None, note='Logging in',
-            data=urlencode_postdata({'email': email, 'pwd': password}),
-            headers={
-                'Referer': LOGIN_URL,
-                'Content-Type': 'application/x-www-form-urlencoded'
-            })
-
-        if not is_logged_in():
-            raise ExtractorError('Unable to log in', expected=True)
-
-    def _call_api(self, path_template, video_id, fields=None, limit=None):
-        query = {'appId': self._APP_ID, 'gcc': 'KR', 'platformType': 'PC'}
-        if fields:
-            query['fields'] = fields
-        if limit:
-            query['limit'] = limit
-        try:
-            return self._download_json(
-                'https://www.vlive.tv/globalv-web/vam-web/' + path_template % video_id, video_id,
-                'Downloading %s JSON metadata' % path_template.split('/')[-1].split('-')[0],
-                headers={'Referer': 'https://www.vlive.tv/'}, query=query)
-        except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
-                self.raise_login_required(json.loads(e.cause.read().decode('utf-8'))['message'])
-            raise
-
     def _real_extract(self, url):
         video_id = self._match_id(url)

@@ -135,30 +140,24 @@
             'post/v1.0/officialVideoPost-%s', video_id,
             'author{nickname},channel{channelCode,channelName},officialVideo{commentCount,exposeStatus,likeCount,playCount,playTime,status,title,type,vodId},playlist{playlistSeq,totalCount,name}')

-        playlist = post.get('playlist')
-        if not playlist or self.get_param('noplaylist'):
-            if playlist:
-                self.to_screen(
-                    'Downloading just video %s because of --no-playlist'
-                    % video_id)
-
+        playlist_id = str_or_none(try_get(post, lambda x: x['playlist']['playlistSeq']))
+        if not self._yes_playlist(playlist_id, video_id):
             video = post['officialVideo']
             return self._get_vlive_info(post, video, video_id)
-        else:
-            playlist_name = playlist.get('name')
-            playlist_id = str_or_none(playlist.get('playlistSeq'))
-            playlist_count = str_or_none(playlist.get('totalCount'))

-            playlist = self._call_api(
-                'playlist/v1.0/playlist-%s/posts', playlist_id, 'data', limit=playlist_count)
+        playlist_name = str_or_none(try_get(post, lambda x: x['playlist']['name']))
+        playlist_count = str_or_none(try_get(post, lambda x: x['playlist']['totalCount']))

-            entries = []
-            for video_data in playlist['data']:
-                video = video_data.get('officialVideo')
-                video_id = str_or_none(video.get('videoSeq'))
-                entries.append(self._get_vlive_info(video_data, video, video_id))
+        playlist = self._call_api(
+            'playlist/v1.0/playlist-%s/posts', playlist_id, 'data', {'limit': playlist_count})

-            return self.playlist_result(entries, playlist_id, playlist_name)
+        entries = []
+        for video_data in playlist['data']:
+            video = video_data.get('officialVideo')
+            video_id = str_or_none(video.get('videoSeq'))
+            entries.append(self._get_vlive_info(video_data, video, video_id))
+
+        return self.playlist_result(entries, playlist_id, playlist_name)

     def _get_vlive_info(self, post, video, video_id):
         def get_common_fields():
@@ -172,6 +171,8 @@
                 'view_count': int_or_none(video.get('playCount')),
                 'like_count': int_or_none(video.get('likeCount')),
                 'comment_count': int_or_none(video.get('commentCount')),
+                'timestamp': int_or_none(video.get('createdAt'), scale=1000),
+                'thumbnail': video.get('thumb'),
             }

         video_type = video.get('type')
@@ -197,7 +198,7 @@
             self._sort_formats(formats)
             info = get_common_fields()
             info.update({
-                'title': self._live_title(video['title']),
+                'title': video['title'],
                 'id': video_id,
                 'formats': formats,
                 'is_live': True,
@@ -216,7 +217,7 @@
             raise ExtractorError('Unknown status ' + status)


-class VLivePostIE(VLiveIE):
+class VLivePostIE(VLiveBaseIE):
     IE_NAME = 'vlive:post'
     _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/post/(?P<id>\d-\d+)'
     _TESTS = [{
@@ -238,8 +239,6 @@
         'playlist_count': 1,
     }]
     _FVIDEO_TMPL = 'fvideo/v1.0/fvideo-%%s/%s'
-    _SOS_TMPL = _FVIDEO_TMPL % 'sosPlayInfo'
-    _INKEY_TMPL = _FVIDEO_TMPL % 'inKey'

     def _real_extract(self, url):
         post_id = self._match_id(url)
@@ -266,7 +265,7 @@
             entry = None
             if upload_type == 'SOS':
                 download = self._call_api(
-                    self._SOS_TMPL, video_id)['videoUrl']['download']
+                    self._FVIDEO_TMPL % 'sosPlayInfo', video_id)['videoUrl']['download']
                 formats = []
                 for f_id, f_url in download.items():
                     formats.append({
@@ -284,7 +283,7 @@
                 vod_id = upload_info.get('videoId')
                 if not vod_id:
                     continue
-                inkey = self._call_api(self._INKEY_TMPL, video_id)['inKey']
+                inkey = self._call_api(self._FVIDEO_TMPL % 'inKey', video_id)['inKey']
                 entry = self._extract_video_info(video_id, vod_id, inkey)
             if entry:
                 entry['title'] = '%s_part%s' % (title, idx)
@@ -295,7 +294,7 @@

 class VLiveChannelIE(VLiveBaseIE):
     IE_NAME = 'vlive:channel'
-    _VALID_URL = r'https?://(?:channels\.vlive\.tv|(?:(?:www|m)\.)?vlive\.tv/channel)/(?P<id>[0-9A-Z]+)'
+    _VALID_URL = r'https?://(?:channels\.vlive\.tv|(?:(?:www|m)\.)?vlive\.tv/channel)/(?P<channel_id>[0-9A-Z]+)(?:/board/(?P<posts_id>\d+))?'
     _TESTS = [{
         'url': 'http://channels.vlive.tv/FCD4B',
         'info_dict': {
@@ -306,78 +305,57 @@
     }, {
         'url': 'https://www.vlive.tv/channel/FCD4B',
         'only_matching': True,
+    }, {
+        'url': 'https://www.vlive.tv/channel/FCD4B/board/3546',
+        'info_dict': {
+            'id': 'FCD4B-3546',
+            'title': 'MAMAMOO - Star Board',
+        },
+        'playlist_mincount': 880
     }]

-    def _call_api(self, path, channel_key_suffix, channel_value, note, query):
-        q = {
-            'app_id': self._APP_ID,
-            'channel' + channel_key_suffix: channel_value,
-        }
-        q.update(query)
-        return self._download_json(
-            'http://api.vfan.vlive.tv/vproxy/channelplus/' + path,
-            channel_value, note='Downloading ' + note, query=q)['result']
-
-    def _real_extract(self, url):
-        channel_code = self._match_id(url)
-
-        channel_seq = self._call_api(
-            'decodeChannelCode', 'Code', channel_code,
-            'decode channel code', {})['channelSeq']
-
-        channel_name = None
-        entries = []
+    def _entries(self, posts_id, board_name):
+        if board_name:
+            posts_path = 'post/v1.0/board-%s/posts'
+            query_add = {'limit': 100, 'sortType': 'LATEST'}
+        else:
+            posts_path = 'post/v1.0/channel-%s/starPosts'
+            query_add = {'limit': 100}

         for page_num in itertools.count(1):
             video_list = self._call_api(
-                'getChannelVideoList', 'Seq', channel_seq,
-                'channel list page #%d' % page_num, {
-                    # Large values of maxNumOfRows (~300 or above) may cause
-                    # empty responses (see [1]), e.g. this happens for [2] that
-                    # has more than 300 videos.
-                    # 1. https://github.com/ytdl-org/youtube-dl/issues/13830
-                    # 2. http://channels.vlive.tv/EDBF.
-                    'maxNumOfRows': 100,
-                    'pageNo': page_num
-                }
-            )
-
-            if not channel_name:
-                channel_name = try_get(
-                    video_list,
-                    lambda x: x['channelInfo']['channelName'],
-                    compat_str)
+                posts_path, posts_id, 'channel{channelName},contentType,postId,title,url', query_add,
+                note=f'Downloading playlist page {page_num}')
+
+            for video in try_get(video_list, lambda x: x['data'], list) or []:
+                video_id = str(video.get('postId'))
+                video_title = str_or_none(video.get('title'))
+                video_url = url_or_none(video.get('url'))
+                if not all((video_id, video_title, video_url)) or video.get('contentType') != 'VIDEO':
+                    continue
+                channel_name = try_get(video, lambda x: x['channel']['channelName'], compat_str)
+                yield self.url_result(video_url, VLivePostIE.ie_key(), video_id, video_title, channel=channel_name)

-            videos = try_get(
-                video_list, lambda x: x['videoList'], list)
-            if not videos:
+            after = try_get(video_list, lambda x: x['paging']['nextParams']['after'], compat_str)
+            if not after:
                 break
+            query_add['after'] = after
+
+    def _real_extract(self, url):
+        channel_id, posts_id = self._match_valid_url(url).groups()

-            for video in videos:
-                video_id = video.get('videoSeq')
-                video_type = video.get('videoType')
+        board_name = None
+        if posts_id:
+            board = self._call_api(
+                'board/v1.0/board-%s', posts_id, 'title,boardType')
+            board_name = board.get('title') or 'Unknown'
+            if board.get('boardType') not in ('STAR', 'VLIVE_PLUS'):
+                raise ExtractorError(f'Board {board_name!r} is not supported', expected=True)

-                if not video_id or not video_type:
-                    continue
-                video_id = compat_str(video_id)
-
-                if video_type in ('PLAYLIST'):
-                    first_video_id = try_get(
-                        video,
-                        lambda x: x['videoPlaylist']['videoList'][0]['videoSeq'], int)
-
-                    if not first_video_id:
-                        continue
-
-                    entries.append(
-                        self.url_result(
-                            'http://www.vlive.tv/video/%s' % first_video_id,
-                            ie=VLiveIE.ie_key(), video_id=first_video_id))
-                else:
-                    entries.append(
-                        self.url_result(
-                            'http://www.vlive.tv/video/%s' % video_id,
-                            ie=VLiveIE.ie_key(), video_id=video_id))
+        entries = LazyList(self._entries(posts_id or channel_id, board_name))
+        channel_name = entries[0]['channel']

         return self.playlist_result(
-            entries, channel_code, channel_name)
+            entries,
+            f'{channel_id}-{posts_id}' if posts_id else channel_id,
+            f'{channel_name} - {board_name}' if channel_name and board_name else channel_name)
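
The LazyList wrapper in the VLiveChannelIE hunk above is what makes entries[0]['channel'] safe: items pulled from the underlying generator are memoized, so peeking at the first entry for the channel name does not consume it before playlist_result iterates the rest. A minimal model of that behavior (the real utils.LazyList also supports slicing, reversal and more):

    class LazyList:
        def __init__(self, iterable):
            self._iter = iter(iterable)
            self._cache = []

        def __getitem__(self, index):
            # Materialize just enough items to serve the index.
            while len(self._cache) <= index:
                self._cache.append(next(self._iter))
            return self._cache[index]

        def __iter__(self):
            # Replay cached items first, then keep draining the source.
            yield from self._cache
            for item in self._iter:
                self._cache.append(item)
                yield item
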
self._TOKEN_SECRET = token_credentials['oauth_token_secret'] - def _extract_vrv_formats(self, url, video_id, stream_format, audio_lang, hardsub_lang): - if not url or stream_format not in ('hls', 'dash', 'adaptive_hls'): - return [] - stream_id_list = [] - if audio_lang: - stream_id_list.append('audio-%s' % audio_lang) - if hardsub_lang: - stream_id_list.append('hardsub-%s' % hardsub_lang) - format_id = stream_format - if stream_id_list: - format_id += '-' + '-'.join(stream_id_list) - if 'hls' in stream_format: - adaptive_formats = self._extract_m3u8_formats( - url, video_id, 'mp4', m3u8_id=format_id, - note='Downloading %s information' % format_id, - fatal=False) - elif stream_format == 'dash': - adaptive_formats = self._extract_mpd_formats( - url, video_id, mpd_id=format_id, - note='Downloading %s information' % format_id, - fatal=False) - if audio_lang: - for f in adaptive_formats: - if f.get('acodec') != 'none': - f['language'] = audio_lang - return adaptive_formats + def _initialize_pre_login(self): + return self._set_api_params() def _real_extract(self, url): video_id = self._match_id(url) @@ -258,6 +252,9 @@ class VRVSeriesIE(VRVBaseIE): 'playlist_mincount': 11, } + def _initialize_pre_login(self): + return self._set_api_params() + def _real_extract(self, url): series_id = self._match_id(url) diff --git a/hypervideo_dl/extractor/vshare.py b/hypervideo_dl/extractor/vshare.py index c631ac1..b4874ac 100644 --- a/hypervideo_dl/extractor/vshare.py +++ b/hypervideo_dl/extractor/vshare.py @@ -50,8 +50,7 @@ class VShareIE(InfoExtractor): 'https://vshare.io/v/%s/width-650/height-430/1' % video_id, video_id, headers={'Referer': url}) - title = self._html_search_regex( - r'<title>([^<]+)</title>', webpage, 'title') + title = self._html_extract_title(webpage) title = title.split(' - ')[0] error = self._html_search_regex( diff --git a/hypervideo_dl/extractor/vupload.py b/hypervideo_dl/extractor/vupload.py index 9846aba..b561f63 100644 --- a/hypervideo_dl/extractor/vupload.py +++ b/hypervideo_dl/extractor/vupload.py @@ -7,6 +7,7 @@ from ..utils import ( parse_filesize, extract_attributes, int_or_none, + js_to_json ) @@ -27,9 +28,12 @@ class VuploadIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title') - video_e = self._html_search_regex(r'\|([a-z0-9]{60})\|', webpage, 'video') - video_url = f'https://wurize.megaupload.to/{video_e}/v.mp4' + title = self._html_extract_title(webpage) + video_json = self._parse_json(self._html_search_regex(r'sources:\s*(.+?]),', webpage, 'video'), video_id, transform_source=js_to_json) + formats = [] + for source in video_json: + if source['src'].endswith('.m3u8'): + formats.extend(self._extract_m3u8_formats(source['src'], video_id, m3u8_id='hls')) duration = parse_duration(self._html_search_regex( r'<i\s*class=["\']fad\s*fa-clock["\']></i>\s*([\d:]+)\s*</div>', webpage, 'duration', fatal=False)) filesize_approx = parse_filesize(self._html_search_regex( @@ -40,7 +44,7 @@ class VuploadIE(InfoExtractor): return { 'id': video_id, - 'url': video_url, + 'formats': formats, 'duration': duration, 'filesize_approx': filesize_approx, 'width': int_or_none(extra_video_info.get('width')), diff --git a/hypervideo_dl/extractor/vyborymos.py b/hypervideo_dl/extractor/vyborymos.py index 9e703c4..4d93666 100644 --- a/hypervideo_dl/extractor/vyborymos.py +++ b/hypervideo_dl/extractor/vyborymos.py @@ -44,11 +44,11 @@ class VyboryMosIE(InfoExtractor): info = 
self._download_json( 'http://vybory.mos.ru/json/voting_stations/%s/%s.json' % (compat_str(station_id)[:3], station_id), - station_id, 'Downloading station JSON', fatal=False) + station_id, 'Downloading station JSON', fatal=False) or {} return { 'id': station_id, - 'title': self._live_title(info['name'] if info else station_id), + 'title': info.get('name') or station_id, 'description': info.get('address'), 'is_live': True, 'formats': formats, diff --git a/hypervideo_dl/extractor/wakanim.py b/hypervideo_dl/extractor/wakanim.py index c956d61..a70a719 100644 --- a/hypervideo_dl/extractor/wakanim.py +++ b/hypervideo_dl/extractor/wakanim.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +from urllib.parse import unquote + from .common import InfoExtractor from ..utils import ( merge_dicts, @@ -23,7 +25,6 @@ class WakanimIE(InfoExtractor): 'episode_number': 2, }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, }, { @@ -31,26 +32,37 @@ class WakanimIE(InfoExtractor): 'url': 'https://www.wakanim.tv/de/v2/catalogue/episode/7843/sword-art-online-alicization-omu-arc-2-folge-15-omu', 'only_matching': True, }] + _GEO_BYPASS = False def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - m3u8_url = urljoin(url, self._search_regex( - r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, 'm3u8 url', + if 'Geoblocking' in webpage: + if '/de/' in url: + self.raise_geo_restricted(countries=['DE', 'AT', 'CH']) + else: + self.raise_geo_restricted(countries=['RU']) + + manifest_url = urljoin(url, self._search_regex( + r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, 'manifest url', group='url')) if not self.get_param('allow_unplayable_formats'): # https://docs.microsoft.com/en-us/azure/media-services/previous/media-services-content-protection-overview#streaming-urls encryption = self._search_regex( r'encryption%3D(c(?:enc|bc(?:s-aapl)?))', - m3u8_url, 'encryption', default=None) + manifest_url, 'encryption', default=None) if encryption in ('cenc', 'cbcs-aapl'): self.report_drm(video_id) - formats = self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls') + if 'format=mpd-time-cmaf' in unquote(manifest_url): + formats = self._extract_mpd_formats( + manifest_url, video_id, mpd_id='dash') + else: + formats = self._extract_m3u8_formats( + manifest_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') info = self._search_json_ld(webpage, video_id, default={}) diff --git a/hypervideo_dl/extractor/wasdtv.py b/hypervideo_dl/extractor/wasdtv.py new file mode 100644 index 0000000..38c10dc --- /dev/null +++ b/hypervideo_dl/extractor/wasdtv.py @@ -0,0 +1,161 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + parse_iso8601, + traverse_obj, + try_get, +) + + +class WASDTVBaseIE(InfoExtractor): + + def _fetch(self, path, video_id, description, query={}): + response = self._download_json( + f'https://wasd.tv/api/{path}', video_id, query=query, + note=f'Downloading {description} metadata', + errnote=f'Unable to download {description} metadata') + error = response.get('error') + if error: + raise ExtractorError(f'{self.IE_NAME} returned error: {error}', expected=True) + return response.get('result') + + def _extract_thumbnails(self, thumbnails_dict): + return [{ + 'url': url, + 'preference': index, + } for index, url in enumerate( + traverse_obj(thumbnails_dict, 
(('small', 'medium', 'large'),))) if url] + + def _real_extract(self, url): + container = self._get_container(url) + stream = traverse_obj(container, ('media_container_streams', 0)) + media = try_get(stream, lambda x: x['stream_media'][0]) + if not media: + raise ExtractorError('Can not extract media data.', expected=True) + media_meta = media.get('media_meta') + media_url, is_live = self._get_media_url(media_meta) + video_id = media.get('media_id') or container.get('media_container_id') + formats, subtitles = self._extract_m3u8_formats_and_subtitles(media_url, video_id, 'mp4') + self._sort_formats(formats) + return { + 'id': str(video_id), + 'title': container.get('media_container_name') or self._og_search_title(self._download_webpage(url, video_id)), + 'description': container.get('media_container_description'), + 'thumbnails': self._extract_thumbnails(media_meta.get('media_preview_images')), + 'timestamp': parse_iso8601(container.get('created_at')), + 'view_count': int_or_none(stream.get('stream_current_viewers' if is_live else 'stream_total_viewers')), + 'is_live': is_live, + 'formats': formats, + 'subtitles': subtitles, + } + + def _get_container(self, url): + raise NotImplementedError('Subclass for get media container') + + def _get_media_url(self, media_meta): + raise NotImplementedError('Subclass for get media url') + + +class WASDTVStreamIE(WASDTVBaseIE): + IE_NAME = 'wasdtv:stream' + _VALID_URL = r'https?://wasd\.tv/(?P<id>[^/#?]+)$' + _TESTS = [{ + 'url': 'https://wasd.tv/24_7', + 'info_dict': { + 'id': '559738', + 'ext': 'mp4', + 'title': 'Live 24/7 Music', + 'description': '24/7 Music', + 'timestamp': int, + 'upload_date': r're:^\d{8}$', + 'is_live': True, + 'view_count': int, + }, + }] + + def _get_container(self, url): + nickname = self._match_id(url) + channel = self._fetch(f'channels/nicknames/{nickname}', video_id=nickname, description='channel') + channel_id = channel.get('channel_id') + containers = self._fetch( + 'v2/media-containers', channel_id, 'running media containers', + query={ + 'channel_id': channel_id, + 'media_container_type': 'SINGLE', + 'media_container_status': 'RUNNING', + }) + if not containers: + raise ExtractorError(f'{nickname} is offline', expected=True) + return containers[0] + + def _get_media_url(self, media_meta): + return media_meta['media_url'], True + + +class WASDTVRecordIE(WASDTVBaseIE): + IE_NAME = 'wasdtv:record' + _VALID_URL = r'https?://wasd\.tv/[^/#?]+/videos\?record=(?P<id>\d+)$' + _TESTS = [{ + 'url': 'https://wasd.tv/spacemita/videos?record=907755', + 'md5': 'c9899dd85be4cc997816ff9f9ca516ce', + 'info_dict': { + 'id': '906825', + 'ext': 'mp4', + 'title': 'Музыкальный', + 'description': 'md5:f510388d929ff60ae61d4c3cab3137cc', + 'timestamp': 1645812079, + 'upload_date': '20220225', + 'thumbnail': r're:^https?://.+\.jpg', + 'is_live': False, + 'view_count': int, + }, + }] + + def _get_container(self, url): + container_id = self._match_id(url) + return self._fetch( + f'v2/media-containers/{container_id}', container_id, 'media container') + + def _get_media_url(self, media_meta): + media_archive_url = media_meta.get('media_archive_url') + if media_archive_url: + return media_archive_url, False + return media_meta['media_url'], True + + +class WASDTVClipIE(WASDTVBaseIE): + IE_NAME = 'wasdtv:clip' + _VALID_URL = r'https?://wasd\.tv/[^/#?]+/clips\?clip=(?P<id>\d+)$' + _TESTS = [{ + 'url': 'https://wasd.tv/spacemita/clips?clip=26804', + 'md5': '818885e720143d7a4e776ff66fcff148', + 'info_dict': { + 'id': '26804', + 'ext': 'mp4', + 'title': 
'Пуш флексит на голове стримера', + 'timestamp': 1646682908, + 'upload_date': '20220307', + 'thumbnail': r're:^https?://.+\.jpg', + 'view_count': int, + }, + }] + + def _real_extract(self, url): + clip_id = self._match_id(url) + clip = self._fetch(f'v2/clips/{clip_id}', video_id=clip_id, description='clip') + clip_data = clip.get('clip_data') + formats, subtitles = self._extract_m3u8_formats_and_subtitles(clip_data.get('url'), video_id=clip_id, ext='mp4') + self._sort_formats(formats) + return { + 'id': clip_id, + 'title': clip.get('clip_title') or self._og_search_title(self._download_webpage(url, clip_id, fatal=False)), + 'thumbnails': self._extract_thumbnails(clip_data.get('preview')), + 'timestamp': parse_iso8601(clip.get('created_at')), + 'view_count': int_or_none(clip.get('clip_views_count')), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/hypervideo_dl/extractor/washingtonpost.py b/hypervideo_dl/extractor/washingtonpost.py index 8afb1af..9d6ae28 100644 --- a/hypervideo_dl/extractor/washingtonpost.py +++ b/hypervideo_dl/extractor/washingtonpost.py @@ -5,6 +5,8 @@ import re from .common import InfoExtractor +from ..utils import traverse_obj + class WashingtonPostIE(InfoExtractor): IE_NAME = 'washingtonpost' @@ -50,7 +52,7 @@ class WashingtonPostArticleIE(InfoExtractor): 'title': 'Sinkhole of bureaucracy', }, 'playlist': [{ - 'md5': 'b9be794ceb56c7267d410a13f99d801a', + 'md5': '7ccf53ea8cbb77de5f570242b3b21a59', 'info_dict': { 'id': 'fc433c38-b146-11e3-b8b3-44b1d1cd4c1f', 'ext': 'mp4', @@ -59,9 +61,10 @@ class WashingtonPostArticleIE(InfoExtractor): 'description': 'Overly complicated paper pushing is nothing new to government bureaucracy. But the way federal retirement applications are filed may be the most outdated. David Fahrenthold explains.', 'timestamp': 1395440416, 'upload_date': '20140321', + 'thumbnail': r're:https://[^\.]+.cloudfront\.net/PAPERMINESplash\.jpg', }, }, { - 'md5': '1fff6a689d8770966df78c8cb6c8c17c', + 'md5': '7ccf53ea8cbb77de5f570242b3b21a59', 'info_dict': { 'id': '41255e28-b14a-11e3-b8b3-44b1d1cd4c1f', 'ext': 'mp4', @@ -70,6 +73,7 @@ class WashingtonPostArticleIE(InfoExtractor): 'duration': 2220, 'timestamp': 1395441819, 'upload_date': '20140321', + 'thumbnail': r're:https://[^\.]+.cloudfront\.net/BoyersSplash\.jpeg', }, }], }, { @@ -88,7 +92,11 @@ class WashingtonPostArticleIE(InfoExtractor): 'timestamp': 1419972442, 'title': 'Why black boxes don’t transmit data in real time', } - }] + }], + 'skip': 'Doesnt have a video anymore', + }, { + 'url': 'https://www.washingtonpost.com/nation/2021/08/05/dixie-river-fire-california-climate/', + 'only_matching': True, }] @classmethod @@ -106,6 +114,13 @@ class WashingtonPostArticleIE(InfoExtractor): <div\s+class="posttv-video-embed[^>]*?data-uuid=| data-video-uuid= )"([^"]+)"''', webpage) + + if not uuids: + json_data = self._search_nextjs_data(webpage, page_id) + for content_element in traverse_obj(json_data, ('props', 'pageProps', 'globalContent', 'content_elements')): + if content_element.get('type') == 'video': + uuids.append(content_element.get('_id')) + entries = [self.url_result('washingtonpost:%s' % uuid, 'WashingtonPost', uuid) for uuid in uuids] return { diff --git a/hypervideo_dl/extractor/watchbox.py b/hypervideo_dl/extractor/watchbox.py index 7469fe9..d19d801 100644 --- a/hypervideo_dl/extractor/watchbox.py +++ b/hypervideo_dl/extractor/watchbox.py @@ -30,7 +30,6 @@ class WatchBoxIE(InfoExtractor): 'release_year': 2009, }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, 
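The three WASD.TV extractors above share a single `_real_extract` pipeline and override only how the media container is located and which stream URL is chosen, a small template-method split. A toy model of that shape (class and field names here are illustrative, not part of the extractor):

    class MediaSource:
        # Shared pipeline; subclasses fill in the two lookups.
        def extract(self, url):
            container = self._get_container(url)
            media_url, is_live = self._get_media_url(container)
            return {'url': media_url, 'is_live': is_live}

        def _get_container(self, url):
            raise NotImplementedError('subclasses locate the media container')

        def _get_media_url(self, container):
            raise NotImplementedError('subclasses choose archive vs. live URL')

    class RecordSource(MediaSource):
        def _get_container(self, url):
            return {'archive': url + '/archive.m3u8', 'live': None}

        def _get_media_url(self, container):
            if container.get('archive'):  # finished VODs prefer the archive copy
                return container['archive'], False
            return container['live'], True

    assert RecordSource().extract('https://host/vod')['is_live'] is False
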
'expected_warnings': ['Failed to download m3u8 information'], @@ -52,7 +51,6 @@ class WatchBoxIE(InfoExtractor): 'episode_number': 1, }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, 'expected_warnings': ['Failed to download m3u8 information'], diff --git a/hypervideo_dl/extractor/wdr.py b/hypervideo_dl/extractor/wdr.py index f54aa6f..ef58a66 100644 --- a/hypervideo_dl/extractor/wdr.py +++ b/hypervideo_dl/extractor/wdr.py @@ -10,6 +10,7 @@ from ..compat import ( ) from ..utils import ( determine_ext, + dict_get, ExtractorError, js_to_json, strip_jsonp, @@ -22,9 +23,14 @@ from ..utils import ( class WDRIE(InfoExtractor): - _VALID_URL = r'https?://deviceids-medp\.wdr\.de/ondemand/\d+/(?P<id>\d+)\.js' + __API_URL_TPL = '//deviceids-medp.wdr.de/ondemand/%s/%s' + _VALID_URL = r'''(?x)https?:// + (?:deviceids-medp\.wdr\.de/ondemand/\d+/| + kinder\.wdr\.de/(?!mediathek/)[^#?]+-) + (?P<id>\d+)\.(?:js|assetjsonp) + ''' _GEO_COUNTRIES = ['DE'] - _TEST = { + _TESTS = [{ 'url': 'http://deviceids-medp.wdr.de/ondemand/155/1557833.js', 'info_dict': { 'id': 'mdb-1557833', @@ -32,11 +38,19 @@ class WDRIE(InfoExtractor): 'title': 'Biathlon-Staffel verpasst Podest bei Olympia-Generalprobe', 'upload_date': '20180112', }, - } + }] + + def _asset_url(self, wdr_id): + id_len = max(len(wdr_id), 5) + return ''.join(('https:', self.__API_URL_TPL % (wdr_id[:id_len - 4], wdr_id, ), '.js')) def _real_extract(self, url): video_id = self._match_id(url) + if url.startswith('wdr:'): + video_id = url[4:] + url = self._asset_url(video_id) + metadata = self._download_json( url, video_id, transform_source=strip_jsonp) @@ -113,7 +127,7 @@ class WDRIE(InfoExtractor): return { 'id': tracker_data.get('trackerClipId', video_id), - 'title': self._live_title(title) if is_live else title, + 'title': title, 'alt_title': tracker_data.get('trackerClipSubcategory'), 'formats': formats, 'subtitles': subtitles, @@ -122,10 +136,10 @@ class WDRIE(InfoExtractor): } -class WDRPageIE(InfoExtractor): - _CURRENT_MAUS_URL = r'https?://(?:www\.)wdrmaus.de/(?:[^/]+/){1,2}[^/?#]+\.php5' +class WDRPageIE(WDRIE): + _MAUS_REGEX = r'https?://(?:www\.)wdrmaus.de/(?:[^/]+/)*?(?P<maus_id>[^/?#.]+)(?:/?|/index\.php5|\.php5)$' _PAGE_REGEX = r'/(?:mediathek/)?(?:[^/]+/)*(?P<display_id>[^/]+)\.html' - _VALID_URL = r'https?://(?:www\d?\.)?(?:(?:kinder\.)?wdr\d?|sportschau)\.de' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL + _VALID_URL = r'https?://(?:www\d?\.)?(?:(?:kinder\.)?wdr\d?|sportschau)\.de' + _PAGE_REGEX + '|' + _MAUS_REGEX _TESTS = [ { @@ -166,11 +180,11 @@ class WDRPageIE(InfoExtractor): { 'url': 'http://www1.wdr.de/mediathek/video/live/index.html', 'info_dict': { - 'id': 'mdb-1406149', + 'id': 'mdb-2296252', 'ext': 'mp4', - 'title': r're:^WDR Fernsehen im Livestream \(nur in Deutschland erreichbar\) [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'title': r're:^WDR Fernsehen im Livestream (?:\(nur in Deutschland erreichbar\) )?[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'alt_title': 'WDR Fernsehen Live', - 'upload_date': '20150101', + 'upload_date': '20201112', 'is_live': True, }, 'params': { @@ -179,7 +193,7 @@ class WDRPageIE(InfoExtractor): }, { 'url': 'http://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html', - 'playlist_mincount': 7, + 'playlist_mincount': 6, 'info_dict': { 'id': 'aktuelle-stunde-120', }, @@ -187,10 +201,10 @@ class WDRPageIE(InfoExtractor): { 'url': 'http://www.wdrmaus.de/aktuelle-sendung/index.php5', 'info_dict': { - 'id': 'mdb-1552552', + 'id': 'mdb-2627637', 'ext': 'mp4', 
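`WDRIE._asset_url` above rebuilds the deviceids API path from the media id alone: the directory component is the id with its last four digits stripped, with a floor of five digits so at least one prefix character remains. A standalone check of that slicing, using the template string from the diff:

    API_URL_TPL = '//deviceids-medp.wdr.de/ondemand/%s/%s'

    def asset_url(wdr_id):
        # Prefix = id minus its last 4 digits (min length 5 keeps >= 1 digit).
        id_len = max(len(wdr_id), 5)
        return ''.join(('https:', API_URL_TPL % (wdr_id[:id_len - 4], wdr_id), '.js'))

    assert asset_url('1557833') == 'https://deviceids-medp.wdr.de/ondemand/155/1557833.js'
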
'upload_date': 're:^[0-9]{8}$', - 'title': 're:^Die Sendung mit der Maus vom [0-9.]{10}$', + 'title': 're:^Die Sendung (?:mit der Maus )?vom [0-9.]{10}$', }, 'skip': 'The id changes from week to week because of the new episode' }, @@ -203,6 +217,7 @@ class WDRPageIE(InfoExtractor): 'upload_date': '20130919', 'title': 'Sachgeschichte - Achterbahn ', }, + 'skip': 'HTTP Error 404: Not Found', }, { 'url': 'http://www1.wdr.de/radio/player/radioplayer116~_layout-popupVersion.html', @@ -228,6 +243,7 @@ class WDRPageIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'HTTP Error 404: Not Found', }, { 'url': 'http://www.sportschau.de/handballem2018/audio-vorschau---die-handball-em-startet-mit-grossem-favoritenfeld-100.html', @@ -241,7 +257,7 @@ class WDRPageIE(InfoExtractor): def _real_extract(self, url): mobj = self._match_valid_url(url) - display_id = mobj.group('display_id') + display_id = dict_get(mobj.groupdict(), ('display_id', 'maus_id'), 'wdrmaus') webpage = self._download_webpage(url, display_id) entries = [] @@ -267,6 +283,14 @@ class WDRPageIE(InfoExtractor): jsonp_url = try_get( media_link_obj, lambda x: x['mediaObj']['url'], compat_str) if jsonp_url: + # metadata, or player JS with ['ref'] giving WDR id, or just media, perhaps + clip_id = media_link_obj['mediaObj'].get('ref') + if jsonp_url.endswith('.assetjsonp'): + asset = self._download_json( + jsonp_url, display_id, fatal=False, transform_source=strip_jsonp) + clip_id = try_get(asset, lambda x: x['trackerData']['trackerClipId'], compat_str) + if clip_id: + jsonp_url = self._asset_url(clip_id[4:]) entries.append(self.url_result(jsonp_url, ie=WDRIE.ie_key())) # Playlist (e.g. https://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html) @@ -286,16 +310,14 @@ class WDRPageIE(InfoExtractor): class WDRElefantIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)wdrmaus\.de/elefantenseite/#(?P<id>.+)' _TEST = { - 'url': 'http://www.wdrmaus.de/elefantenseite/#folge_ostern_2015', + 'url': 'http://www.wdrmaus.de/elefantenseite/#elefantenkino_wippe', + # adaptive stream: unstable file MD5 'info_dict': { - 'title': 'Folge Oster-Spezial 2015', - 'id': 'mdb-1088195', + 'title': 'Wippe', + 'id': 'mdb-1198320', 'ext': 'mp4', 'age_limit': None, - 'upload_date': '20150406' - }, - 'params': { - 'skip_download': True, + 'upload_date': '20071003' }, } @@ -330,6 +352,7 @@ class WDRMobileIE(InfoExtractor): /[0-9]+/[0-9]+/ (?P<id>[0-9]+)_(?P<title>[0-9]+)''' IE_NAME = 'wdr:mobile' + _WORKING = False # no such domain _TEST = { 'url': 'http://mobile-ondemand.wdr.de/CMS2010/mdb/ondemand/weltweit/fsk0/42/421735/421735_4283021.mp4', 'info_dict': { diff --git a/hypervideo_dl/extractor/webcaster.py b/hypervideo_dl/extractor/webcaster.py index e4b65f5..a858e99 100644 --- a/hypervideo_dl/extractor/webcaster.py +++ b/hypervideo_dl/extractor/webcaster.py @@ -6,6 +6,7 @@ import re from .common import InfoExtractor from ..utils import ( determine_ext, + join_nonempty, xpath_text, ) @@ -34,12 +35,9 @@ class WebcasterIE(InfoExtractor): title = xpath_text(video, './/event_name', 'event name', fatal=True) - def make_id(parts, separator): - return separator.join(filter(None, parts)) - formats = [] for format_id in (None, 'noise'): - track_tag = make_id(('track', format_id), '_') + track_tag = join_nonempty('track', format_id, delim='_') for track in video.findall('.//iphone/%s' % track_tag): track_url = track.text if not track_url: @@ -48,7 +46,7 @@ class WebcasterIE(InfoExtractor): m3u8_formats = self._extract_m3u8_formats( 
track_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id=make_id(('hls', format_id), '-'), fatal=False) + m3u8_id=join_nonempty('hls', format_id, delim='-'), fatal=False) for f in m3u8_formats: f.update({ 'source_preference': 0 if format_id == 'noise' else 1, diff --git a/hypervideo_dl/extractor/weibo.py b/hypervideo_dl/extractor/weibo.py index 621df5b..dafa2af 100644 --- a/hypervideo_dl/extractor/weibo.py +++ b/hypervideo_dl/extractor/weibo.py @@ -73,8 +73,7 @@ class WeiboIE(InfoExtractor): webpage = self._download_webpage( url, video_id, note='Revisiting webpage') - title = self._html_search_regex( - r'<title>(.+?)</title>', webpage, 'title') + title = self._html_extract_title(webpage) video_formats = compat_parse_qs(self._search_regex( r'video-sources=\\\"(.+?)\"', webpage, 'video_sources')) diff --git a/hypervideo_dl/extractor/whowatch.py b/hypervideo_dl/extractor/whowatch.py index f8bc2e7..e4b610d 100644 --- a/hypervideo_dl/extractor/whowatch.py +++ b/hypervideo_dl/extractor/whowatch.py @@ -5,6 +5,7 @@ from .common import InfoExtractor from ..utils import ( int_or_none, qualities, + try_call, try_get, ExtractorError, ) @@ -26,10 +27,10 @@ class WhoWatchIE(InfoExtractor): metadata = self._download_json('https://api.whowatch.tv/lives/%s' % video_id, video_id) live_data = self._download_json('https://api.whowatch.tv/lives/%s/play' % video_id, video_id) - title = try_get(None, ( - lambda x: live_data['share_info']['live_title'][1:-1], - lambda x: metadata['live']['title'], - ), compat_str) + title = try_call( + lambda: live_data['share_info']['live_title'][1:-1], + lambda: metadata['live']['title'], + expected_type=str) hls_url = live_data.get('hls_url') if not hls_url: diff --git a/hypervideo_dl/extractor/willow.py b/hypervideo_dl/extractor/willow.py new file mode 100644 index 0000000..4d3d62f --- /dev/null +++ b/hypervideo_dl/extractor/willow.py @@ -0,0 +1,58 @@ +# coding: utf-8 +from ..utils import ExtractorError +from .common import InfoExtractor + + +class WillowIE(InfoExtractor): + _VALID_URL = r'https?://(www\.)?willow\.tv/videos/(?P<id>[0-9a-z-_]+)' + _GEO_COUNTRIES = ['US'] + + _TESTS = [{ + 'url': 'http://willow.tv/videos/d5winning-moment-eng-vs-ind-streaming-online-4th-test-india-tour-of-england-2021', + 'info_dict': { + 'id': '169662', + 'display_id': 'd5winning-moment-eng-vs-ind-streaming-online-4th-test-india-tour-of-england-2021', + 'ext': 'mp4', + 'title': 'Winning Moment: 4th Test, England vs India', + 'thumbnail': 'https://aimages.willow.tv/ytThumbnails/6748_D5winning_moment.jpg', + 'duration': 233, + 'timestamp': 1630947954, + 'upload_date': '20210906', + 'location': 'Kennington Oval, London', + 'series': 'India tour of England 2021', + }, + 'params': { + 'skip_download': True, # AES-encrypted m3u8 + }, + }, { + 'url': 'http://willow.tv/videos/highlights-short-ind-vs-nz-streaming-online-2nd-t20i-new-zealand-tour-of-india-2021', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + video_data = self._parse_json(self._html_search_regex( + r'var\s+data_js\s*=\s*JSON\.parse\(\'(.+)\'\)', webpage, + 'data_js'), video_id) + + video = next((v for v in video_data.get('trending_videos') or [] + if v.get('secureurl')), None) + if not video: + raise ExtractorError('No videos found') + + formats = self._extract_m3u8_formats(video['secureurl'], video_id, 'mp4') + self._sort_formats(formats) + + return { + 'id': str(video.get('content_id')), + 'display_id': 
video.get('video_slug'), + 'title': video.get('video_name') or self._html_search_meta('twitter:title', webpage), + 'formats': formats, + 'thumbnail': video.get('yt_thumb_url') or self._html_search_meta( + 'twitter:image', webpage, default=None), + 'duration': video.get('duration_seconds'), + 'timestamp': video.get('created_date'), + 'location': video.get('venue'), + 'series': video.get('series_name'), + } diff --git a/hypervideo_dl/extractor/wppilot.py b/hypervideo_dl/extractor/wppilot.py new file mode 100644 index 0000000..3003a0f --- /dev/null +++ b/hypervideo_dl/extractor/wppilot.py @@ -0,0 +1,177 @@ +# coding: utf-8 + +from .common import InfoExtractor +from ..utils import ( + try_get, + ExtractorError, +) + +import json +import random +import re + + +class WPPilotBaseIE(InfoExtractor): + _VIDEO_URL = 'https://pilot.wp.pl/api/v1/channel/%s' + _VIDEO_GUEST_URL = 'https://pilot.wp.pl/api/v1/guest/channel/%s' + + _HEADERS_WEB = { + 'Content-Type': 'application/json; charset=UTF-8', + 'Referer': 'https://pilot.wp.pl/tv/', + } + + def _get_channel_list(self, cache=True): + if cache is True: + cache_res = self._downloader.cache.load('wppilot', 'channel-list') + if cache_res: + return cache_res, True + webpage = self._download_webpage('https://pilot.wp.pl/tv/', None, 'Downloading webpage') + page_data_base_url = self._search_regex( + r'<script src="(https://wp-pilot-gatsby\.wpcdn\.pl/v[\d.-]+/desktop)', + webpage, 'gatsby build version') + '/page-data' + page_data = self._download_json(f'{page_data_base_url}/tv/page-data.json', None, 'Downloading page data') + for qhash in page_data['staticQueryHashes']: + qhash_content = self._download_json( + f'{page_data_base_url}/sq/d/{qhash}.json', None, + 'Searching for channel list') + channel_list = try_get(qhash_content, lambda x: x['data']['allChannels']['nodes']) + if channel_list is None: + continue + self._downloader.cache.store('wppilot', 'channel-list', channel_list) + return channel_list, False + raise ExtractorError('Unable to find the channel list') + + def _parse_channel(self, chan): + return { + 'id': str(chan['id']), + 'title': chan['name'], + 'is_live': True, + 'thumbnails': [{ + 'id': key, + 'url': chan[key], + } for key in ('thumbnail', 'thumbnail_mobile', 'icon') if chan.get(key)], + } + + +class WPPilotIE(WPPilotBaseIE): + _VALID_URL = r'(?:https?://pilot\.wp\.pl/tv/?#|wppilot:)(?P<id>[a-z\d-]+)' + IE_NAME = 'wppilot' + + _TESTS = [{ + 'url': 'https://pilot.wp.pl/tv/#telewizja-wp-hd', + 'info_dict': { + 'id': '158', + 'ext': 'mp4', + 'title': 'Telewizja WP HD', + }, + 'params': { + 'format': 'bestvideo', + }, + }, { + # audio only + 'url': 'https://pilot.wp.pl/tv/#radio-nowy-swiat', + 'info_dict': { + 'id': '238', + 'ext': 'm4a', + 'title': 'Radio Nowy Świat', + }, + 'params': { + 'format': 'bestaudio', + }, + }, { + 'url': 'wppilot:9', + 'only_matching': True, + }] + + def _get_channel(self, id_or_slug): + video_list, is_cached = self._get_channel_list(cache=True) + key = 'id' if re.match(r'^\d+$', id_or_slug) else 'slug' + for video in video_list: + if video.get(key) == id_or_slug: + return self._parse_channel(video) + # if cached channel not found, download and retry + if is_cached: + video_list, _ = self._get_channel_list(cache=False) + for video in video_list: + if video.get(key) == id_or_slug: + return self._parse_channel(video) + raise ExtractorError('Channel not found') + + def _real_extract(self, url): + video_id = self._match_id(url) + + channel = self._get_channel(video_id) + video_id = str(channel['id']) + + is_authorized 
= next((c for c in self._downloader.cookiejar if c.name == 'netviapisessid'), None) + # cookies starting with "g:" are assigned to guests + is_authorized = True if is_authorized is not None and not is_authorized.value.startswith('g:') else False + + video = self._download_json( + (self._VIDEO_URL if is_authorized else self._VIDEO_GUEST_URL) % video_id, + video_id, query={ + 'device_type': 'web', + }, headers=self._HEADERS_WEB, + expected_status=(200, 422)) + + stream_token = try_get(video, lambda x: x['_meta']['error']['info']['stream_token']) + if stream_token: + close = self._download_json( + 'https://pilot.wp.pl/api/v1/channels/close', video_id, + 'Invalidating previous stream session', headers=self._HEADERS_WEB, + data=json.dumps({ + 'channelId': video_id, + 't': stream_token, + }).encode('utf-8')) + if try_get(close, lambda x: x['data']['status']) == 'ok': + return self.url_result(url, ie=WPPilotIE.ie_key()) + + formats = [] + + for fmt in video['data']['stream_channel']['streams']: + # live DASH does not work for now + # if fmt['type'] == 'dash@live:abr': + # formats.extend( + # self._extract_mpd_formats( + # random.choice(fmt['url']), video_id)) + if fmt['type'] == 'hls@live:abr': + formats.extend( + self._extract_m3u8_formats( + random.choice(fmt['url']), + video_id, live=True)) + + self._sort_formats(formats) + + channel['formats'] = formats + return channel + + +class WPPilotChannelsIE(WPPilotBaseIE): + _VALID_URL = r'(?:https?://pilot\.wp\.pl/(?:tv/?)?(?:\?[^#]*)?#?|wppilot:)$' + IE_NAME = 'wppilot:channels' + + _TESTS = [{ + 'url': 'wppilot:', + 'info_dict': { + 'id': 'wppilot', + 'title': 'WP Pilot', + }, + 'playlist_mincount': 100, + }, { + 'url': 'https://pilot.wp.pl/', + 'only_matching': True, + }] + + def _entries(self): + channel_list, _ = self._get_channel_list() + for chan in channel_list: + entry = self._parse_channel(chan) + entry.update({ + '_type': 'url_transparent', + 'url': f'wppilot:{chan["id"]}', + 'ie_key': WPPilotIE.ie_key(), + }) + yield entry + + def _real_extract(self, url): + return self.playlist_result(self._entries(), 'wppilot', 'WP Pilot') diff --git a/hypervideo_dl/extractor/xinpianchang.py b/hypervideo_dl/extractor/xinpianchang.py new file mode 100644 index 0000000..9832d23 --- /dev/null +++ b/hypervideo_dl/extractor/xinpianchang.py @@ -0,0 +1,95 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + try_get, + update_url_query, + url_or_none, +) + + +class XinpianchangIE(InfoExtractor): + _VALID_URL = r'https?://www\.xinpianchang\.com/(?P<id>[^/]+?)(?:\D|$)' + IE_NAME = 'xinpianchang' + IE_DESC = 'xinpianchang.com' + _TESTS = [{ + 'url': 'https://www.xinpianchang.com/a11766551', + 'info_dict': { + 'id': 'a11766551', + 'ext': 'mp4', + 'title': '北京2022冬奥会闭幕式再见短片-冰墩墩下班了', + 'description': 'md5:4a730c10639a82190fabe921c0fa4b87', + 'duration': 151, + 'thumbnail': r're:^https?://oss-xpc0\.xpccdn\.com.+/assets/', + 'uploader': '正时文创', + 'uploader_id': 10357277, + 'categories': ['宣传片', '国家城市', '广告', '其他'], + 'keywords': ['北京冬奥会', '冰墩墩', '再见', '告别', '冰墩墩哭了', '感动', '闭幕式', '熄火'] + }, + }, { + 'url': 'https://www.xinpianchang.com/a11762904', + 'info_dict': { + 'id': 'a11762904', + 'ext': 'mp4', + 'title': '冬奥会决胜时刻《法国派出三只鸡?》', + 'description': 'md5:55cb139ef8f48f0c877932d1f196df8b', + 'duration': 136, + 'thumbnail': r're:^https?://oss-xpc0\.xpccdn\.com.+/assets/', + 'uploader': '精品动画', + 'uploader_id': 10858927, + 'categories': ['动画', '三维CG'], + 'keywords': ['France Télévisions', 
'法国3台', '蠢萌', '冬奥会'] + }, + }, { + 'url': 'https://www.xinpianchang.com/a11779743?from=IndexPick&part=%E7%BC%96%E8%BE%91%E7%B2%BE%E9%80%89&index=2', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id=video_id) + domain = self.find_value_with_regex(var='requireNewDomain', webpage=webpage) + vid = self.find_value_with_regex(var='vid', webpage=webpage) + app_key = self.find_value_with_regex(var='modeServerAppKey', webpage=webpage) + api = update_url_query(f'{domain}/mod/api/v2/media/{vid}', {'appKey': app_key}) + data = self._download_json(api, video_id=video_id)['data'] + formats, subtitles = [], {} + for k, v in data.get('resource').items(): + if k in ('dash', 'hls'): + v_url = v.get('url') + if not v_url: + continue + if k == 'dash': + fmts, subs = self._extract_mpd_formats_and_subtitles(v_url, video_id=video_id) + elif k == 'hls': + fmts, subs = self._extract_m3u8_formats_and_subtitles(v_url, video_id=video_id) + formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) + elif k == 'progressive': + formats.extend([{ + 'url': url_or_none(prog.get('url')), + 'width': int_or_none(prog.get('width')), + 'height': int_or_none(prog.get('height')), + 'ext': 'mp4', + } for prog in v if prog.get('url') or []]) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': data.get('title'), + 'description': data.get('description'), + 'duration': int_or_none(data.get('duration')), + 'categories': data.get('categories'), + 'keywords': data.get('keywords'), + 'thumbnail': data.get('cover'), + 'uploader': try_get(data, lambda x: x['owner']['username']), + 'uploader_id': try_get(data, lambda x: x['owner']['id']), + 'formats': formats, + 'subtitles': subtitles, + } + + def find_value_with_regex(self, var, webpage): + return self._search_regex(rf'var\s{var}\s=\s\"(?P<vid>[^\"]+)\"', webpage, name=var) diff --git a/hypervideo_dl/extractor/xnxx.py b/hypervideo_dl/extractor/xnxx.py index dd4fb54..27f9916 100644 --- a/hypervideo_dl/extractor/xnxx.py +++ b/hypervideo_dl/extractor/xnxx.py @@ -13,7 +13,7 @@ from ..utils import ( class XNXXIE(InfoExtractor): - _VALID_URL = r'https?://(?:video|www)\.xnxx\.com/video-?(?P<id>[0-9a-z]+)/' + _VALID_URL = r'https?://(?:video|www)\.xnxx3?\.com/video-?(?P<id>[0-9a-z]+)/' _TESTS = [{ 'url': 'http://www.xnxx.com/video-55awb78/skyrim_test_video', 'md5': '7583e96c15c0f21e9da3453d9920fbba', @@ -32,6 +32,9 @@ class XNXXIE(InfoExtractor): }, { 'url': 'http://www.xnxx.com/video-55awb78/', 'only_matching': True, + }, { + 'url': 'http://www.xnxx3.com/video-55awb78/', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/hypervideo_dl/extractor/xvideos.py b/hypervideo_dl/extractor/xvideos.py index 8fc6491..d5261b6 100644 --- a/hypervideo_dl/extractor/xvideos.py +++ b/hypervideo_dl/extractor/xvideos.py @@ -19,25 +19,41 @@ class XVideosIE(InfoExtractor): (?: (?:[^/]+\.)?xvideos2?\.com/video| (?:www\.)?xvideos\.es/video| - flashservice\.xvideos\.com/embedframe/| + (?:www|flashservice)\.xvideos\.com/embedframe/| static-hw\.xvideos\.com/swf/xv-player\.swf\?.*?\bid_video= ) (?P<id>[0-9]+) ''' _TESTS = [{ - 'url': 'http://www.xvideos.com/video4588838/biker_takes_his_girl', + 'url': 'https://www.xvideos.com/video4588838/motorcycle_guy_cucks_influencer_steals_his_gf', 'md5': '14cea69fcb84db54293b1e971466c2e1', 'info_dict': { 'id': '4588838', 'ext': 'mp4', - 'title': 'Biker Takes his Girl', + 'title': 'Motorcycle Guy Cucks Influencer, Steals his GF', 
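`XinpianchangIE` above scrapes three inline JavaScript variables from the page and then assembles the media API request with `update_url_query`. The same flow in miniature, using a fabricated page snippet and plain `urllib` in place of the yt-dlp helpers:

    import re
    from urllib.parse import urlencode

    def find_js_var(webpage, var):
        # Mirrors find_value_with_regex(): var <name> = "<value>"
        return re.search(r'var\s%s\s=\s"([^"]+)"' % var, webpage).group(1)

    webpage = ('var requireNewDomain = "https://mod-api.example.com"; '
               'var vid = "abc123"; var modeServerAppKey = "key0";')
    api = '%s/mod/api/v2/media/%s?%s' % (
        find_js_var(webpage, 'requireNewDomain'),
        find_js_var(webpage, 'vid'),
        urlencode({'appKey': find_js_var(webpage, 'modeServerAppKey')}))
    assert api == 'https://mod-api.example.com/mod/api/v2/media/abc123?appKey=key0'
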
'duration': 108, 'age_limit': 18, + 'thumbnail': r're:^https://img-hw.xvideos-cdn.com/.+\.jpg', + } + }, { + # Broken HLS formats + 'url': 'https://www.xvideos.com/video65982001/what_s_her_name', + 'md5': 'b82d7d7ef7d65a84b1fa6965f81f95a5', + 'info_dict': { + 'id': '65982001', + 'ext': 'mp4', + 'title': 'what\'s her name?', + 'duration': 120, + 'age_limit': 18, + 'thumbnail': r're:^https://img-hw.xvideos-cdn.com/.+\.jpg', } }, { 'url': 'https://flashservice.xvideos.com/embedframe/4588838', 'only_matching': True, }, { + 'url': 'https://www.xvideos.com/embedframe/4588838', + 'only_matching': True, + }, { 'url': 'http://static-hw.xvideos.com/swf/xv-player.swf?id_video=4588838', 'only_matching': True, }, { @@ -80,9 +96,7 @@ class XVideosIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - - webpage = self._download_webpage( - 'https://www.xvideos.com/video%s/' % video_id, video_id) + webpage = self._download_webpage(url, video_id) mobj = re.search(r'<h1 class="inlineError">(.+?)</h1>', webpage) if mobj: @@ -125,9 +139,11 @@ class XVideosIE(InfoExtractor): r'setVideo([^(]+)\((["\'])(http.+?)\2\)', webpage): format_id = kind.lower() if format_id == 'hls': - formats.extend(self._extract_m3u8_formats( + hls_formats = self._extract_m3u8_formats( format_url, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) + self._check_formats(hls_formats, video_id) + formats.extend(hls_formats) elif format_id in ('urllow', 'urlhigh'): formats.append({ 'url': format_url, diff --git a/hypervideo_dl/extractor/yahoo.py b/hypervideo_dl/extractor/yahoo.py index 53556de..20504de 100644 --- a/hypervideo_dl/extractor/yahoo.py +++ b/hypervideo_dl/extractor/yahoo.py @@ -264,7 +264,7 @@ class YahooIE(InfoExtractor): return { 'id': video_id, - 'title': self._live_title(title) if is_live else title, + 'title': title, 'formats': formats, 'thumbnails': thumbnails, 'description': clean_html(video.get('description')), @@ -414,11 +414,14 @@ class YahooGyaOIE(InfoExtractor): IE_NAME = 'yahoo:gyao' _VALID_URL = r'https?://(?:gyao\.yahoo\.co\.jp/(?:p|title(?:/[^/]+)?)|streaming\.yahoo\.co\.jp/p/y)/(?P<id>\d+/v\d+|[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' _TESTS = [{ - 'url': 'https://gyao.yahoo.co.jp/p/00449/v03102/', + 'url': 'https://gyao.yahoo.co.jp/title/%E3%82%BF%E3%82%A4%E3%83%A0%E3%83%9C%E3%82%AB%E3%83%B3%E3%82%B7%E3%83%AA%E3%83%BC%E3%82%BA%20%E3%83%A4%E3%83%83%E3%82%BF%E3%83%BC%E3%83%9E%E3%83%B3/5f60ceb3-6e5e-40ef-ba40-d68b598d067f', 'info_dict': { - 'id': '00449:v03102', + 'id': '5f60ceb3-6e5e-40ef-ba40-d68b598d067f', }, - 'playlist_count': 2, + 'playlist_mincount': 80, + }, { + 'url': 'https://gyao.yahoo.co.jp/p/00449/v03102/', + 'only_matching': True, }, { 'url': 'https://streaming.yahoo.co.jp/p/y/01034/v00133/', 'only_matching': True, @@ -430,19 +433,30 @@ class YahooGyaOIE(InfoExtractor): 'only_matching': True, }] + def _entries(self, program_id): + page = 1 + while True: + playlist = self._download_json( + f'https://gyao.yahoo.co.jp/api/programs/{program_id}/videos?page={page}', program_id, + note=f'Downloading JSON metadata page {page}') + if not playlist: + break + for video in playlist['videos']: + video_id = video.get('id') + if not video_id: + continue + if video.get('streamingAvailability') == 'notYet': + continue + yield self.url_result( + 'https://gyao.yahoo.co.jp/player/%s/' % video_id.replace(':', '/'), + YahooGyaOPlayerIE.ie_key(), video_id) + if playlist.get('ended'): + break + 
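The rewritten `YahooGyaOIE._entries` above replaces the one-shot video list with a generator that walks `?page=N` until the API reports `ended`, skipping entries whose `streamingAvailability` is still `notYet`. The pagination skeleton, reduced to a pure function (the `fetch_page` callable stands in for the JSON download; field names follow the diff):

    import itertools

    def paged_entries(fetch_page):
        # fetch_page(n) -> {'videos': [...], 'ended': bool} or None
        for page in itertools.count(1):
            playlist = fetch_page(page)
            if not playlist:
                break
            for video in playlist.get('videos') or []:
                if video.get('id') and video.get('streamingAvailability') != 'notYet':
                    yield video['id']
            if playlist.get('ended'):
                break

    pages = {1: {'videos': [{'id': 'a'}], 'ended': False},
             2: {'videos': [{'id': 'b'}], 'ended': True}}
    assert list(paged_entries(lambda n: pages.get(n))) == ['a', 'b']
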
page += 1 + def _real_extract(self, url): program_id = self._match_id(url).replace('/', ':') - videos = self._download_json( - 'https://gyao.yahoo.co.jp/api/programs/%s/videos' % program_id, program_id)['videos'] - entries = [] - for video in videos: - video_id = video.get('id') - if not video_id: - continue - entries.append(self.url_result( - 'https://gyao.yahoo.co.jp/player/%s/' % video_id.replace(':', '/'), - YahooGyaOPlayerIE.ie_key(), video_id)) - return self.playlist_result(entries, program_id) + return self.playlist_result(self._entries(program_id), program_id) class YahooJapanNewsIE(InfoExtractor): @@ -519,7 +533,7 @@ class YahooJapanNewsIE(InfoExtractor): title = self._html_search_meta( ['og:title', 'twitter:title'], webpage, 'title', default=None - ) or self._html_search_regex('<title>([^<]+)</title>', webpage, 'title') + ) or self._html_extract_title(webpage) if display_id == host: # Headline page (w/ multiple BC playlists) ('news.yahoo.co.jp', 'headlines.yahoo.co.jp/videonews/', ...) diff --git a/hypervideo_dl/extractor/yandexvideo.py b/hypervideo_dl/extractor/yandexvideo.py index 9974d65..7d3966b 100644 --- a/hypervideo_dl/extractor/yandexvideo.py +++ b/hypervideo_dl/extractor/yandexvideo.py @@ -7,9 +7,11 @@ import re from .common import InfoExtractor from ..utils import ( determine_ext, + extract_attributes, int_or_none, try_get, url_or_none, + lowercase_escape, ) @@ -147,8 +149,46 @@ class YandexVideoIE(InfoExtractor): } +class YandexVideoPreviewIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?yandex\.ru/video/preview(?:/?\?.*?filmId=|/)(?P<id>\d+)' + _TESTS = [{ # Odnoklassniki + 'url': 'https://yandex.ru/video/preview/?filmId=10682852472978372885&text=summer', + 'info_dict': { + 'id': '1352565459459', + 'ext': 'mp4', + 'like_count': int, + 'upload_date': '20191202', + 'age_limit': 0, + 'duration': 196, + 'thumbnail': 'https://i.mycdn.me/videoPreview?id=544866765315&type=37&idx=13&tkn=TY5qjLYZHxpmcnK8U2LgzYkgmaU&fn=external_8', + 'uploader_id': '481054701571', + 'title': 'LOFT - summer, summer, summer HD', + 'uploader': 'АРТЁМ КУДРОВ', + }, + }, { # youtube + 'url': 'https://yandex.ru/video/preview/?filmId=4479424425337895262&source=main_redirect&text=видео&utm_source=main_stripe_big', + 'only_matching': True, + }, { # YandexVideo + 'url': 'https://yandex.ru/video/preview/5275069442094787341', + 'only_matching': True, + }, { # youtube + 'url': 'https://yandex.ru/video/preview/?filmId=16658118429797832897&from=tabbar&p=1&text=%D0%BF%D1%80%D0%BE%D1%81%D0%BC%D0%BE%D1%82%D1%80+%D1%84%D1%80%D0%B0%D0%B3%D0%BC%D0%B5%D0%BD%D1%82%D0%B0+%D0%BC%D0%B0%D0%BB%D0%B5%D0%BD%D1%8C%D0%BA%D0%B8%D0%B9+%D0%BF%D1%80%D0%B8%D0%BD%D1%86+%D0%BC%D1%8B+%D0%B2+%D0%BE%D1%82%D0%B2%D0%B5%D1%82%D0%B5+%D0%B7%D0%B0+%D1%82%D0%B5%D1%85+%D0%BA%D0%BE%D0%B3%D0%BE+%D0%BF%D1%80%D0%B8%D1%80%D1%83%D1%87%D0%B8%D0%BB%D0%B8', + 'only_matching': True, + }, { # Odnoklassniki + 'url': 'https://yandex.ru/video/preview/?text=Francis%20Lai%20-%20Le%20Bon%20Et%20Les%20MC)chants&path=wizard&parent-reqid=1643208087979310-1481782809207673478-sas3-0931-2f9-sas-l7-balancer-8080-BAL-9380&wiz_type=vital&filmId=12508152936505397283', + 'only_matching': True, + }] + + def _real_extract(self, url): + id = self._match_id(url) + webpage = self._download_webpage(url, id) + data_raw = self._search_regex(r'window.Ya.__inline_params__\s*=\s*JSON.parse\(\'([^"]+?\\u0022video\\u0022:[^"]+?})\'\);', webpage, 'data_raw') + data_json = self._parse_json(data_raw, id, transform_source=lowercase_escape) + return 
self.url_result(data_json['video']['url']) + + class ZenYandexIE(InfoExtractor): - _VALID_URL = r'https?://zen\.yandex\.ru/media/(?:id/[^/]+/|[^/]+/)(?:[a-z0-9-]+)-(?P<id>[a-z0-9-]+)' + _VALID_URL = r'https?://zen\.yandex\.ru(?:/video)?/(media|watch)/(?:(?:id/[^/]+/|[^/]+/)(?:[a-z0-9-]+)-)?(?P<id>[a-z0-9-]+)' _TESTS = [{ 'url': 'https://zen.yandex.ru/media/popmech/izverjenie-vulkana-iz-spichek-zreliscnyi-opyt-6002240ff8b1af50bb2da5e3', 'info_dict': { @@ -156,19 +196,38 @@ class ZenYandexIE(InfoExtractor): 'ext': 'mp4', 'title': 'Извержение вулкана из спичек: зрелищный опыт', 'description': 'md5:053ad3c61b5596d510c9a199dc8ee633', - 'thumbnail': 'https://avatars.mds.yandex.net/get-zen-pub-og/3558619/pub_6002240ff8b1af50bb2da5e3_600bad814d953e4132a30b5e/orig', + 'thumbnail': 're:^https://avatars.mds.yandex.net/', 'uploader': 'Популярная механика', }, + 'params': { + 'skip_download': 'm3u8', + }, }, { 'url': 'https://zen.yandex.ru/media/id/606fd806cc13cb3c58c05cf5/vot-eto-focus-dedy-morozy-na-gidrociklah-60c7c443da18892ebfe85ed7', 'info_dict': { 'id': '60c7c443da18892ebfe85ed7', 'ext': 'mp4', 'title': 'ВОТ ЭТО Focus. Деды Морозы на гидроциклах', - 'description': 'md5:8684912f6086f298f8078d4af0e8a600', - 'thumbnail': 'https://avatars.mds.yandex.net/get-zen-pub-og/4410519/pub_60c7c443da18892ebfe85ed7_60c7c48e060a163121f42cc3/orig', + 'description': 'md5:f3db3d995763b9bbb7b56d4ccdedea89', + 'thumbnail': 're:^https://avatars.mds.yandex.net/', 'uploader': 'AcademeG DailyStream' }, + 'params': { + 'skip_download': 'm3u8', + 'format': 'bestvideo', + }, + }, { + 'url': 'https://zen.yandex.ru/video/watch/6002240ff8b1af50bb2da5e3', + 'info_dict': { + 'id': '6002240ff8b1af50bb2da5e3', + 'ext': 'mp4', + 'title': 'Извержение вулкана из спичек: зрелищный опыт', + 'description': 'md5:053ad3c61b5596d510c9a199dc8ee633', + 'uploader': 'Популярная механика', + }, + 'params': { + 'skip_download': 'm3u8', + }, }, { 'url': 'https://zen.yandex.ru/media/id/606fd806cc13cb3c58c05cf5/novyi-samsung-fold-3-moskvich-barahlit-612f93b7f8d48e7e945792a2?from=channel&rid=2286618386.482.1630817595976.42360', 'only_matching': True, @@ -177,23 +236,37 @@ class ZenYandexIE(InfoExtractor): def _real_extract(self, url): id = self._match_id(url) webpage = self._download_webpage(url, id) - data_json = self._parse_json(self._search_regex(r'w\._data\s?=\s?({.+?});', webpage, 'metadata'), id) - stream_json = try_get(data_json, lambda x: x['publication']['content']['gifContent'], dict) - stream_url = stream_json.get('stream') or try_get(stream_json, lambda x: x['streams']['url']) - formats = self._extract_m3u8_formats(stream_url, id) + data_json = self._parse_json( + self._search_regex(r'data\s*=\s*({["\']_*serverState_*video.+?});', webpage, 'metadata'), id) + serverstate = self._search_regex(r'(_+serverState_+video-site_[^_]+_+)', + webpage, 'server state').replace('State', 'Settings') + uploader = self._search_regex(r'(<a\s*class=["\']card-channel-link[^"\']+["\'][^>]+>)', + webpage, 'uploader', default='<a>') + uploader_name = extract_attributes(uploader).get('aria-label') + video_json = try_get(data_json, lambda x: x[serverstate]['exportData']['video'], dict) + stream_urls = try_get(video_json, lambda x: x['video']['streams']) + formats = [] + for s_url in stream_urls: + ext = determine_ext(s_url) + if ext == 'mpd': + formats.extend(self._extract_mpd_formats(s_url, id, mpd_id='dash')) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats(s_url, id, 'mp4')) self._sort_formats(formats) return { 'id': id, - 'title': 
try_get(data_json, (lambda x: x['og']['title'], lambda x: x['publication']['content']['preview']['title'])), - 'uploader': data_json.get('authorName') or try_get(data_json, lambda x: x['publisher']['name']), - 'description': try_get(data_json, lambda x: x['og']['description']), - 'thumbnail': try_get(data_json, lambda x: x['og']['imageUrl']), + 'title': video_json.get('title') or self._og_search_title(webpage), 'formats': formats, + 'duration': int_or_none(video_json.get('duration')), + 'view_count': int_or_none(video_json.get('views')), + 'uploader': uploader_name or data_json.get('authorName') or try_get(data_json, lambda x: x['publisher']['name']), + 'description': self._og_search_description(webpage) or try_get(data_json, lambda x: x['og']['description']), + 'thumbnail': self._og_search_thumbnail(webpage) or try_get(data_json, lambda x: x['og']['imageUrl']), } class ZenYandexChannelIE(InfoExtractor): - _VALID_URL = r'https?://zen\.yandex\.ru/(?!media)(?:id/)?(?P<id>[a-z0-9-_]+)' + _VALID_URL = r'https?://zen\.yandex\.ru/(?!media|video)(?:id/)?(?P<id>[a-z0-9-_]+)' _TESTS = [{ 'url': 'https://zen.yandex.ru/tok_media', 'info_dict': { diff --git a/hypervideo_dl/extractor/youjizz.py b/hypervideo_dl/extractor/youjizz.py index 5f5fbf2..111623f 100644 --- a/hypervideo_dl/extractor/youjizz.py +++ b/hypervideo_dl/extractor/youjizz.py @@ -36,8 +36,7 @@ class YouJizzIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - title = self._html_search_regex( - r'<title>(.+?)</title>', webpage, 'title') + title = self._html_extract_title(webpage) formats = [] diff --git a/hypervideo_dl/extractor/younow.py b/hypervideo_dl/extractor/younow.py index 04dbc87..583aea3 100644 --- a/hypervideo_dl/extractor/younow.py +++ b/hypervideo_dl/extractor/younow.py @@ -7,6 +7,7 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( ExtractorError, + format_field, int_or_none, try_get, ) @@ -58,7 +59,7 @@ class YouNowLiveIE(InfoExtractor): return { 'id': uploader, 'is_live': True, - 'title': self._live_title(uploader), + 'title': uploader, 'thumbnail': data.get('awsUrl'), 'tags': data.get('tags'), 'categories': data.get('tags'), @@ -93,7 +94,7 @@ def _extract_moment(item, fatal=True): uploader = try_get(item, lambda x: x['owner']['name'], compat_str) uploader_id = try_get(item, lambda x: x['owner']['userId']) - uploader_url = 'https://www.younow.com/%s' % uploader if uploader else None + uploader_url = format_field(uploader, template='https://www.younow.com/%s') entry = { 'extractor_key': 'YouNowMoment', diff --git a/hypervideo_dl/extractor/youtube.py b/hypervideo_dl/extractor/youtube.py index dc5ee63..dec3b14 100644 --- a/hypervideo_dl/extractor/youtube.py +++ b/hypervideo_dl/extractor/youtube.py @@ -2,18 +2,21 @@ from __future__ import unicode_literals -import base64 import calendar import copy import datetime +import functools import hashlib import itertools import json +import math import os.path import random import re +import sys import time import traceback +import threading from .common import InfoExtractor, SearchInfoExtractor from ..compat import ( @@ -28,7 +31,7 @@ from ..compat import ( ) from ..jsinterp import JSInterpreter from ..utils import ( - bytes_to_intlist, + bug_reports_message, clean_html, datetime_from_str, dict_get, @@ -36,11 +39,14 @@ from ..utils import ( ExtractorError, float_or_none, format_field, + get_first, int_or_none, - intlist_to_bytes, is_html, + join_nonempty, + js_to_json, mimetype2ext, network_exceptions, + NO_DEFAULT, orderedSet, 
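The Zen Yandex rewrite above, like the Wakanim change earlier, stops assuming HLS and instead dispatches on the manifest extension. The idiom in isolation, with a deliberately simplified `determine_ext` (the real utility also handles query strings and a default value):

    import re

    def determine_ext(url):
        # Take the final dot-suffix only if it looks like an extension.
        ext = url.rpartition('.')[2]
        return ext if re.fullmatch(r'[A-Za-z0-9]+', ext) else None

    def pick_formats(manifest_url):
        ext = determine_ext(manifest_url)
        if ext == 'mpd':
            return 'dash'   # would call _extract_mpd_formats(...)
        if ext == 'm3u8':
            return 'hls'    # would call _extract_m3u8_formats(...)
        return 'progressive'

    assert pick_formats('https://cdn.example/manifest.mpd') == 'dash'
    assert pick_formats('https://cdn.example/master.m3u8') == 'hls'
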
parse_codecs, parse_count, @@ -53,10 +59,12 @@ from ..utils import ( smuggle_url, str_or_none, str_to_int, + strftime_or_none, traverse_obj, try_get, unescapeHTML, unified_strdate, + unified_timestamp, unsmuggle_url, update_url_query, url_or_none, @@ -72,7 +80,7 @@ INNERTUBE_CLIENTS = { 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'WEB', - 'clientVersion': '2.20210622.10.00', + 'clientVersion': '2.20211221.00.00', } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 1 @@ -82,7 +90,7 @@ INNERTUBE_CLIENTS = { 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'WEB_EMBEDDED_PLAYER', - 'clientVersion': '1.20210620.0.1', + 'clientVersion': '1.20211215.00.01', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 56 @@ -93,96 +101,96 @@ INNERTUBE_CLIENTS = { 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'WEB_REMIX', - 'clientVersion': '1.20210621.00.00', + 'clientVersion': '1.20211213.00.00', } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 67, }, 'web_creator': { - 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + 'INNERTUBE_API_KEY': 'AIzaSyBUPetSUmoZL-OhlxA7wSac5XinrygCqMo', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'WEB_CREATOR', - 'clientVersion': '1.20210621.00.00', + 'clientVersion': '1.20211220.02.00', } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 62, }, 'android': { - 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + 'INNERTUBE_API_KEY': 'AIzaSyA8eiZmM1FaDVjRy-df2KTyQ_vz_yYM39w', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID', - 'clientVersion': '16.20', + 'clientVersion': '16.49', } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 3, 'REQUIRE_JS_PLAYER': False }, 'android_embedded': { - 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + 'INNERTUBE_API_KEY': 'AIzaSyCjc_pVEDi4qsv5MtC2dMXzpIaDoRFLsxw', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID_EMBEDDED_PLAYER', - 'clientVersion': '16.20', + 'clientVersion': '16.49', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 55, 'REQUIRE_JS_PLAYER': False }, 'android_music': { - 'INNERTUBE_API_KEY': 'AIzaSyC9XL3ZjWddXya6X74dJoCTL-WEYFDNX30', - 'INNERTUBE_HOST': 'music.youtube.com', + 'INNERTUBE_API_KEY': 'AIzaSyAOghZGza2MQSZkY_zfZ370N-PUdXEo8AI', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID_MUSIC', - 'clientVersion': '4.32', + 'clientVersion': '4.57', } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 21, 'REQUIRE_JS_PLAYER': False }, 'android_creator': { + 'INNERTUBE_API_KEY': 'AIzaSyD_qjV8zaaUMehtLkrKFgVeSX_Iqbtyws8', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID_CREATOR', - 'clientVersion': '21.24.100', + 'clientVersion': '21.47', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 14, 'REQUIRE_JS_PLAYER': False }, - # ios has HLS live streams - # See: https://github.com/TeamNewPipe/NewPipeExtractor/issues/680 + # iOS clients have HLS live streams. Setting device model to get 60fps formats. 
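Each entry in the reworked INNERTUBE_CLIENTS table above is the static half of an Innertube call: the per-client API key travels in the query string and the context block in the JSON body of a POST to `/youtubei/v1/player`. A rough sketch with a placeholder key (the real code layers headers, visitor data and authentication on top):

    import json
    import urllib.request

    CLIENT = {
        'INNERTUBE_API_KEY': 'AIzaSy...placeholder',   # per-client key from the table
        'INNERTUBE_HOST': 'www.youtube.com',
        'INNERTUBE_CONTEXT': {'client': {'clientName': 'WEB',
                                         'clientVersion': '2.20211221.00.00',
                                         'hl': 'en'}},
    }

    def player_request(video_id):
        url = 'https://%s/youtubei/v1/player?key=%s' % (
            CLIENT['INNERTUBE_HOST'], CLIENT['INNERTUBE_API_KEY'])
        body = json.dumps({'context': CLIENT['INNERTUBE_CONTEXT'],
                           'videoId': video_id}).encode()
        return urllib.request.Request(
            url, data=body, headers={'Content-Type': 'application/json'})
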
+ # See: https://github.com/TeamNewPipe/NewPipeExtractor/issues/680#issuecomment-1002724558 'ios': { - 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + 'INNERTUBE_API_KEY': 'AIzaSyB-63vPrdThhKuerbB2N_l7Kwwcxj6yUAc', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'IOS', - 'clientVersion': '16.20', + 'clientVersion': '16.46', + 'deviceModel': 'iPhone14,3', } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 5, 'REQUIRE_JS_PLAYER': False }, 'ios_embedded': { - 'INNERTUBE_API_KEY': 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'IOS_MESSAGES_EXTENSION', - 'clientVersion': '16.20', + 'clientVersion': '16.46', + 'deviceModel': 'iPhone14,3', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 66, 'REQUIRE_JS_PLAYER': False }, 'ios_music': { - 'INNERTUBE_API_KEY': 'AIzaSyDK3iBpDP9nHVTk2qL73FLJICfOC3c51Og', - 'INNERTUBE_HOST': 'music.youtube.com', + 'INNERTUBE_API_KEY': 'AIzaSyBAETezhkwP0ZWA02RsqT1zu78Fpt0bC_s', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'IOS_MUSIC', - 'clientVersion': '4.32', + 'clientVersion': '4.57', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 26, @@ -192,7 +200,7 @@ INNERTUBE_CLIENTS = { 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'IOS_CREATOR', - 'clientVersion': '21.24.100', + 'clientVersion': '21.47', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 15, @@ -201,39 +209,61 @@ INNERTUBE_CLIENTS = { # mweb has 'ultralow' formats # See: https://github.com/hypervideo/hypervideo/pull/557 'mweb': { - 'INNERTUBE_API_KEY': 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8', + 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'MWEB', - 'clientVersion': '2.20210721.07.00', + 'clientVersion': '2.20211221.01.00', } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 2 }, + # This client can access age restricted videos (unless the uploader has disabled the 'allow embedding' option) + # See: https://github.com/zerodytrash/YouTube-Internal-Clients + 'tv_embedded': { + 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'TVHTML5_SIMPLY_EMBEDDED_PLAYER', + 'clientVersion': '2.0', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 85 + }, } +def _split_innertube_client(client_name): + variant, *base = client_name.rsplit('.', 1) + if base: + return variant, base[0], variant + base, *variant = client_name.split('_', 1) + return client_name, base, variant[0] if variant else None + + def build_innertube_clients(): - third_party = { - 'embedUrl': 'https://google.com', # Can be any valid URL + THIRD_PARTY = { + 'embedUrl': 'https://www.youtube.com/', # Can be any valid URL } - base_clients = ('android', 'web', 'ios', 'mweb') - priority = qualities(base_clients[::-1]) + BASE_CLIENTS = ('android', 'web', 'tv', 'ios', 'mweb') + priority = qualities(BASE_CLIENTS[::-1]) for client, ytcfg in tuple(INNERTUBE_CLIENTS.items()): ytcfg.setdefault('INNERTUBE_API_KEY', 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8') ytcfg.setdefault('INNERTUBE_HOST', 'www.youtube.com') ytcfg.setdefault('REQUIRE_JS_PLAYER', True) ytcfg['INNERTUBE_CONTEXT']['client'].setdefault('hl', 'en') - ytcfg['priority'] = 10 * priority(client.split('_', 1)[0]) - - if client in base_clients: - INNERTUBE_CLIENTS[f'{client}_agegate'] = agegate_ytcfg = copy.deepcopy(ytcfg) - agegate_ytcfg['INNERTUBE_CONTEXT']['client']['clientScreen'] = 'EMBED' - agegate_ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = third_party - agegate_ytcfg['priority'] -= 1 - elif client.endswith('_embedded'): - ytcfg['INNERTUBE_CONTEXT']['thirdParty'] 
= third_party + + _, base_client, variant = _split_innertube_client(client) + ytcfg['priority'] = 10 * priority(base_client) + + if not variant: + INNERTUBE_CLIENTS[f'{client}_embedscreen'] = embedscreen = copy.deepcopy(ytcfg) + embedscreen['INNERTUBE_CONTEXT']['client']['clientScreen'] = 'EMBED' + embedscreen['INNERTUBE_CONTEXT']['thirdParty'] = THIRD_PARTY + embedscreen['priority'] -= 3 + elif variant == 'embedded': + ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = THIRD_PARTY ytcfg['priority'] -= 2 else: ytcfg['priority'] -= 3 @@ -247,31 +277,82 @@ class YoutubeBaseInfoExtractor(InfoExtractor): _RESERVED_NAMES = ( r'channel|c|user|playlist|watch|w|v|embed|e|watch_popup|clip|' - r'shorts|movies|results|shared|hashtag|trending|feed|feeds|' + r'shorts|movies|results|search|shared|hashtag|trending|explore|feed|feeds|' r'browse|oembed|get_video_info|iframe_api|s/player|' r'storefront|oops|index|account|reporthistory|t/terms|about|upload|signin|logout') _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)' - _NETRC_MACHINE = 'youtube' + # _NETRC_MACHINE = 'youtube' # If True it will raise an error if no login info is provided _LOGIN_REQUIRED = False - def _login(self): - """ - Attempt to log in to YouTube. - If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised. - """ - - if (self._LOGIN_REQUIRED - and self.get_param('cookiefile') is None - and self.get_param('cookiesfrombrowser') is None): - self.raise_login_required( - 'Login details are needed to download this content', method='cookies') - username, password = self._get_login_info() - if username: - self.report_warning(f'Cannot login to YouTube using username and password. {self._LOGIN_HINTS["cookies"]}') + _INVIDIOUS_SITES = ( + # invidious-redirect websites + r'(?:www\.)?redirect\.invidious\.io', + r'(?:(?:www|dev)\.)?invidio\.us', + # Invidious instances taken from https://github.com/iv-org/documentation/blob/master/Invidious-Instances.md + r'(?:www\.)?invidious\.pussthecat\.org', + r'(?:www\.)?invidious\.zee\.li', + r'(?:www\.)?invidious\.ethibox\.fr', + r'(?:www\.)?invidious\.3o7z6yfxhbw7n3za4rss6l434kmv55cgw2vuziwuigpwegswvwzqipyd\.onion', + r'(?:www\.)?osbivz6guyeahrwp2lnwyjk2xos342h4ocsxyqrlaopqjuhwn2djiiyd\.onion', + r'(?:www\.)?u2cvlit75owumwpy4dj2hsmvkq7nvrclkpht7xgyye2pyoxhpmclkrad\.onion', + # youtube-dl invidious instances list + r'(?:(?:www|no)\.)?invidiou\.sh', + r'(?:(?:www|fi)\.)?invidious\.snopyta\.org', + r'(?:www\.)?invidious\.kabi\.tk', + r'(?:www\.)?invidious\.mastodon\.host', + r'(?:www\.)?invidious\.zapashcanon\.fr', + r'(?:www\.)?(?:invidious(?:-us)?|piped)\.kavin\.rocks', + r'(?:www\.)?invidious\.tinfoil-hat\.net', + r'(?:www\.)?invidious\.himiko\.cloud', + r'(?:www\.)?invidious\.reallyancient\.tech', + r'(?:www\.)?invidious\.tube', + r'(?:www\.)?invidiou\.site', + r'(?:www\.)?invidious\.site', + r'(?:www\.)?invidious\.xyz', + r'(?:www\.)?invidious\.nixnet\.xyz', + r'(?:www\.)?invidious\.048596\.xyz', + r'(?:www\.)?invidious\.drycat\.fr', + r'(?:www\.)?inv\.skyn3t\.in', + r'(?:www\.)?tube\.poal\.co', + r'(?:www\.)?tube\.connect\.cafe', + r'(?:www\.)?vid\.wxzm\.sx', + r'(?:www\.)?vid\.mint\.lgbt', + r'(?:www\.)?vid\.puffyan\.us', + r'(?:www\.)?yewtu\.be', + r'(?:www\.)?yt\.elukerio\.org', + r'(?:www\.)?yt\.lelux\.fi', + r'(?:www\.)?invidious\.ggc-project\.de', + r'(?:www\.)?yt\.maisputain\.ovh', + r'(?:www\.)?ytprivate\.com', + r'(?:www\.)?invidious\.13ad\.de', + r'(?:www\.)?invidious\.toot\.koeln', + r'(?:www\.)?invidious\.fdn\.fr', + 
r'(?:www\.)?watch\.nettohikari\.com', + r'(?:www\.)?invidious\.namazso\.eu', + r'(?:www\.)?invidious\.silkky\.cloud', + r'(?:www\.)?invidious\.exonip\.de', + r'(?:www\.)?invidious\.riverside\.rocks', + r'(?:www\.)?invidious\.blamefran\.net', + r'(?:www\.)?invidious\.moomoo\.de', + r'(?:www\.)?ytb\.trom\.tf', + r'(?:www\.)?yt\.cyberhost\.uk', + r'(?:www\.)?kgg2m7yk5aybusll\.onion', + r'(?:www\.)?qklhadlycap4cnod\.onion', + r'(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion', + r'(?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion', + r'(?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion', + r'(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion', + r'(?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p', + r'(?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion', + r'(?:www\.)?w6ijuptxiku4xpnnaetxvnkc5vqcdu7mgns2u77qefoixi63vbvnpnqd\.onion', + r'(?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion', + r'(?:www\.)?grwp24hodrefzvjjuccrkw3mjq4tzhaaq32amf33dzpmuxe7ilepcmad\.onion', + r'(?:www\.)?hpniueoejy4opn7bc4ftgazyqjoeqwlvh2uiku2xqku6zpoa4bf5ruid\.onion', + ) def _initialize_consent(self): cookies = self._get_cookies('https://www.youtube.com/') @@ -288,9 +369,25 @@ class YoutubeBaseInfoExtractor(InfoExtractor): consent_id = random.randint(100, 999) self._set_cookie('.youtube.com', 'CONSENT', 'YES+cb.20210328-17-p0.en+FX+%s' % consent_id) + def _initialize_pref(self): + cookies = self._get_cookies('https://www.youtube.com/') + pref_cookie = cookies.get('PREF') + pref = {} + if pref_cookie: + try: + pref = dict(compat_urlparse.parse_qsl(pref_cookie.value)) + except ValueError: + self.report_warning('Failed to parse user PREF cookie' + bug_reports_message()) + pref.update({'hl': 'en', 'tz': 'UTC'}) + self._set_cookie('.youtube.com', name='PREF', value=compat_urllib_parse_urlencode(pref)) + def _real_initialize(self): + self._initialize_pref() self._initialize_consent() - self._login() + if (self._LOGIN_REQUIRED + and self.get_param('cookiefile') is None + and self.get_param('cookiesfrombrowser') is None): + self.raise_login_required('Login details are needed to download this content', method='cookies') _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;' _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;' @@ -321,23 +418,11 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_API_KEY'], compat_str, default_client) def _extract_context(self, ytcfg=None, default_client='web'): - _get_context = lambda y: try_get(y, lambda x: x['INNERTUBE_CONTEXT'], dict) - context = _get_context(ytcfg) - if context: - return context - - context = _get_context(self._get_default_ytcfg(default_client)) - if not ytcfg: - return context - - # Recreate the client context (required) - context['client'].update({ - 'clientVersion': self._extract_client_version(ytcfg, default_client), - 'clientName': self._extract_client_name(ytcfg, default_client), - }) - visitor_data = try_get(ytcfg, lambda x: x['VISITOR_DATA'], compat_str) - if visitor_data: - context['client']['visitorData'] = visitor_data + context = get_first( + (ytcfg, self._get_default_ytcfg(default_client)), 'INNERTUBE_CONTEXT', expected_type=dict) + # Enforce language and tz for extraction + client_context = traverse_obj(context, 'client', expected_type=dict, default={}) + 
client_context.update({'hl': 'en', 'timeZone': 'UTC', 'utcOffsetMinutes': 0}) return context _SAPISID = None @@ -381,7 +466,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): 'https://%s/youtubei/v1/%s' % (api_hostname or self._get_innertube_host(default_client), ep), video_id=video_id, fatal=fatal, note=note, errnote=errnote, data=json.dumps(data).encode('utf8'), headers=real_headers, - query={'key': api_key or self._extract_api_key()}) + query={'key': api_key or self._extract_api_key(), 'prettyPrint': 'false'}) def extract_yt_initial_data(self, item_id, webpage, fatal=True): data = self._search_regex( @@ -437,9 +522,9 @@ class YoutubeBaseInfoExtractor(InfoExtractor): Extracts visitorData from an API response or ytcfg Appears to be used to track session state """ - return traverse_obj( - args, (..., ('VISITOR_DATA', ('INNERTUBE_CONTEXT', 'client', 'visitorData'), ('responseContext', 'visitorData'))), - expected_type=compat_str, get_all=False) + return get_first( + args, [('VISITOR_DATA', ('INNERTUBE_CONTEXT', 'client', 'visitorData'), ('responseContext', 'visitorData'))], + expected_type=str) @property def is_authenticated(self): @@ -594,6 +679,72 @@ class YoutubeBaseInfoExtractor(InfoExtractor): if text: return text + def _get_count(self, data, *path_list): + count_text = self._get_text(data, *path_list) or '' + count = parse_count(count_text) + if count is None: + count = str_to_int( + self._search_regex(r'^([\d,]+)', re.sub(r'\s', '', count_text), 'count', default=None)) + return count + + @staticmethod + def _extract_thumbnails(data, *path_list): + """ + Extract thumbnails from thumbnails dict + @param path_list: path list to level that contains 'thumbnails' key + """ + thumbnails = [] + for path in path_list or [()]: + for thumbnail in traverse_obj(data, (*variadic(path), 'thumbnails', ...), default=[]): + thumbnail_url = url_or_none(thumbnail.get('url')) + if not thumbnail_url: + continue + # Sometimes youtube gives a wrong thumbnail URL. See: + # https://github.com/hypervideo/hypervideo/issues/233 + # https://github.com/ytdl-org/youtube-dl/issues/28023 + if 'maxresdefault' in thumbnail_url: + thumbnail_url = thumbnail_url.split('?')[0] + thumbnails.append({ + 'url': thumbnail_url, + 'height': int_or_none(thumbnail.get('height')), + 'width': int_or_none(thumbnail.get('width')), + }) + return thumbnails + + @staticmethod + def extract_relative_time(relative_time_text): + """ + Extracts a relative time from string and converts to dt object + e.g. 
'streamed 6 days ago', '5 seconds ago (edited)', 'updated today' + """ + mobj = re.search(r'(?P<start>today|yesterday|now)|(?P<time>\d+)\s*(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?\s*ago', relative_time_text) + if mobj: + start = mobj.group('start') + if start: + return datetime_from_str(start) + try: + return datetime_from_str('now-%s%s' % (mobj.group('time'), mobj.group('unit'))) + except ValueError: + return None + + def _extract_time_text(self, renderer, *path_list): + text = self._get_text(renderer, *path_list) or '' + dt = self.extract_relative_time(text) + timestamp = None + if isinstance(dt, datetime.datetime): + timestamp = calendar.timegm(dt.timetuple()) + + if timestamp is None: + timestamp = ( + unified_timestamp(text) or unified_timestamp( + self._search_regex( + (r'([a-z]+\s*\d{1,2},?\s*20\d{2})', r'(?:.+|^)(?:live|premieres|ed|ing)(?:\s*(?:on|for))?\s*(.+\d)'), + text.lower(), 'time text', default=None))) + + if text and timestamp is None: + self.report_warning(f"Cannot parse localized time text '{text}'" + bug_reports_message(), only_once=True) + return timestamp, text + def _extract_response(self, item_id, query, note='Downloading API JSON', headers=None, ytcfg=None, check_get_keys=None, ep='browse', fatal=True, api_hostname=None, default_client='web'): @@ -617,13 +768,15 @@ class YoutubeBaseInfoExtractor(InfoExtractor): note='%s%s' % (note, ' (retry #%d)' % count if count else '')) except ExtractorError as e: if isinstance(e.cause, network_exceptions): - if isinstance(e.cause, compat_HTTPError) and not is_html(e.cause.read(512)): - e.cause.seek(0) - yt_error = try_get( - self._parse_json(e.cause.read().decode(), item_id, fatal=False), - lambda x: x['error']['message'], compat_str) - if yt_error: - self._report_alerts([('ERROR', yt_error)], fatal=False) + if isinstance(e.cause, compat_HTTPError): + first_bytes = e.cause.read(512) + if not is_html(first_bytes): + yt_error = try_get( + self._parse_json( + self._webpage_read_content(e.cause, None, item_id, prefix=first_bytes) or '{}', item_id, fatal=False), + lambda x: x['error']['message'], compat_str) + if yt_error: + self._report_alerts([('ERROR', yt_error)], fatal=False) # Downloading page may result in intermittent 5xx HTTP error # Sometimes a 404 is also recieved. 
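
[Editor's note] The extract_relative_time()/_extract_time_text() helpers above reduce every localized time string to either a relative offset or an absolute date. Below is a minimal, self-contained sketch of the relative branch; the real code delegates to the datetime_from_str() utility, and the seconds-per-unit table here is an assumption made only to keep the example runnable on its own.

import calendar
import datetime
import re

# Assumed unit table (approximation; the real helper uses datetime_from_str)
_UNITS = {'second': 1, 'minute': 60, 'hour': 3600, 'day': 86400,
          'week': 7 * 86400, 'month': 30 * 86400, 'year': 365 * 86400}

def approx_timestamp_from_relative(text, now=None):
    """Return an estimated UTC timestamp for strings like '5 seconds ago (edited)'
    or 'Streamed 6 days ago', or None when no relative time is found."""
    now = now or datetime.datetime.utcnow()
    mobj = re.search(
        r'(?P<start>today|yesterday|now)'
        r'|(?P<time>\d+)\s*(?P<unit>second|minute|hour|day|week|month|year)s?\s*ago',
        text.lower())
    if not mobj:
        return None
    if mobj.group('start'):
        delta = {'now': 0, 'today': 0, 'yesterday': 86400}[mobj.group('start')]
    else:
        delta = int(mobj.group('time')) * _UNITS[mobj.group('unit')]
    # Convert the anchored datetime back to a POSIX timestamp, as the
    # extractor does with calendar.timegm(dt.timetuple())
    return calendar.timegm((now - datetime.timedelta(seconds=delta)).timetuple())

# e.g. approx_timestamp_from_relative('Streamed 6 days ago') is roughly now - 518400
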
See: https://github.com/ytdl-org/youtube-dl/issues/28289 # We also want to catch all other network exceptions since errors in later pages can be troublesome @@ -674,91 +827,58 @@ class YoutubeBaseInfoExtractor(InfoExtractor): description = self._get_text(renderer, 'descriptionSnippet') duration = parse_duration(self._get_text( renderer, 'lengthText', ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'text'))) - view_count_text = self._get_text(renderer, 'viewCountText') or '' - view_count = str_to_int(self._search_regex( - r'^([\d,]+)', re.sub(r'\s', '', view_count_text), - 'view count', default=None)) + if duration is None: + duration = parse_duration(self._search_regex( + r'(?i)(ago)(?!.*\1)\s+(?P<duration>[a-z0-9 ,]+?)(?:\s+[\d,]+\s+views)?(?:\s+-\s+play\s+short)?$', + traverse_obj(renderer, ('title', 'accessibility', 'accessibilityData', 'label'), default='', expected_type=str), + video_id, default=None, group='duration')) + + view_count = self._get_count(renderer, 'viewCountText') uploader = self._get_text(renderer, 'ownerText', 'shortBylineText') + channel_id = traverse_obj( + renderer, ('shortBylineText', 'runs', ..., 'navigationEndpoint', 'browseEndpoint', 'browseId'), + expected_type=str, get_all=False) + timestamp, time_text = self._extract_time_text(renderer, 'publishedTimeText') + scheduled_timestamp = str_to_int(traverse_obj(renderer, ('upcomingEventData', 'startTime'), get_all=False)) + overlay_style = traverse_obj( + renderer, ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'style'), + get_all=False, expected_type=str) + badges = self._extract_badges(renderer) + thumbnails = self._extract_thumbnails(renderer, 'thumbnail') + navigation_url = urljoin('https://www.youtube.com/', traverse_obj( + renderer, ('navigationEndpoint', 'commandMetadata', 'webCommandMetadata', 'url'), + expected_type=str)) or '' + url = f'https://www.youtube.com/watch?v={video_id}' + if overlay_style == 'SHORTS' or '/shorts/' in navigation_url: + url = f'https://www.youtube.com/shorts/{video_id}' return { '_type': 'url', 'ie_key': YoutubeIE.ie_key(), 'id': video_id, - 'url': f'https://www.youtube.com/watch?v={video_id}', + 'url': url, 'title': title, 'description': description, 'duration': duration, 'view_count': view_count, 'uploader': uploader, + 'channel_id': channel_id, + 'thumbnails': thumbnails, + 'upload_date': (strftime_or_none(timestamp, '%Y%m%d') + if self._configuration_arg('approximate_date', ie_key='youtubetab') + else None), + 'live_status': ('is_upcoming' if scheduled_timestamp is not None + else 'was_live' if 'streamed' in time_text.lower() + else 'is_live' if overlay_style is not None and overlay_style == 'LIVE' or 'live now' in badges + else None), + 'release_timestamp': scheduled_timestamp, + 'availability': self._availability(needs_premium='premium' in badges, needs_subscription='members only' in badges) } class YoutubeIE(YoutubeBaseInfoExtractor): - IE_DESC = 'YouTube.com' - _INVIDIOUS_SITES = ( - # invidious-redirect websites - r'(?:www\.)?redirect\.invidious\.io', - r'(?:(?:www|dev)\.)?invidio\.us', - # Invidious instances taken from https://github.com/iv-org/documentation/blob/master/Invidious-Instances.md - r'(?:www\.)?invidious\.pussthecat\.org', - r'(?:www\.)?invidious\.zee\.li', - r'(?:www\.)?invidious\.ethibox\.fr', - r'(?:www\.)?invidious\.3o7z6yfxhbw7n3za4rss6l434kmv55cgw2vuziwuigpwegswvwzqipyd\.onion', - # youtube-dl invidious instances list - r'(?:(?:www|no)\.)?invidiou\.sh', - r'(?:(?:www|fi)\.)?invidious\.snopyta\.org', - 
r'(?:www\.)?invidious\.kabi\.tk', - r'(?:www\.)?invidious\.mastodon\.host', - r'(?:www\.)?invidious\.zapashcanon\.fr', - r'(?:www\.)?(?:invidious(?:-us)?|piped)\.kavin\.rocks', - r'(?:www\.)?invidious\.tinfoil-hat\.net', - r'(?:www\.)?invidious\.himiko\.cloud', - r'(?:www\.)?invidious\.reallyancient\.tech', - r'(?:www\.)?invidious\.tube', - r'(?:www\.)?invidiou\.site', - r'(?:www\.)?invidious\.site', - r'(?:www\.)?invidious\.xyz', - r'(?:www\.)?invidious\.nixnet\.xyz', - r'(?:www\.)?invidious\.048596\.xyz', - r'(?:www\.)?invidious\.drycat\.fr', - r'(?:www\.)?inv\.skyn3t\.in', - r'(?:www\.)?tube\.poal\.co', - r'(?:www\.)?tube\.connect\.cafe', - r'(?:www\.)?vid\.wxzm\.sx', - r'(?:www\.)?vid\.mint\.lgbt', - r'(?:www\.)?vid\.puffyan\.us', - r'(?:www\.)?yewtu\.be', - r'(?:www\.)?yt\.elukerio\.org', - r'(?:www\.)?yt\.lelux\.fi', - r'(?:www\.)?invidious\.ggc-project\.de', - r'(?:www\.)?yt\.maisputain\.ovh', - r'(?:www\.)?ytprivate\.com', - r'(?:www\.)?invidious\.13ad\.de', - r'(?:www\.)?invidious\.toot\.koeln', - r'(?:www\.)?invidious\.fdn\.fr', - r'(?:www\.)?watch\.nettohikari\.com', - r'(?:www\.)?invidious\.namazso\.eu', - r'(?:www\.)?invidious\.silkky\.cloud', - r'(?:www\.)?invidious\.exonip\.de', - r'(?:www\.)?invidious\.riverside\.rocks', - r'(?:www\.)?invidious\.blamefran\.net', - r'(?:www\.)?invidious\.moomoo\.de', - r'(?:www\.)?ytb\.trom\.tf', - r'(?:www\.)?yt\.cyberhost\.uk', - r'(?:www\.)?kgg2m7yk5aybusll\.onion', - r'(?:www\.)?qklhadlycap4cnod\.onion', - r'(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion', - r'(?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion', - r'(?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion', - r'(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion', - r'(?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p', - r'(?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion', - r'(?:www\.)?w6ijuptxiku4xpnnaetxvnkc5vqcdu7mgns2u77qefoixi63vbvnpnqd\.onion', - r'(?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion', - r'(?:www\.)?grwp24hodrefzvjjuccrkw3mjq4tzhaaq32amf33dzpmuxe7ilepcmad\.onion', - r'(?:www\.)?hpniueoejy4opn7bc4ftgazyqjoeqwlvh2uiku2xqku6zpoa4bf5ruid\.onion', - ) + IE_DESC = 'YouTube' _VALID_URL = r"""(?x)^ ( (?:https?://|//) # http(s):// or protocol-independent URL @@ -772,7 +892,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): youtube\.googleapis\.com)/ # the various hostnames, with wildcard subdomains (?:.*?\#/)? # handle anchor (#/) redirect urls (?: # the various things that can precede the ID: - (?:(?:v|embed|e|shorts)/(?!videoseries)) # v/ or embed/ or e/ or shorts/ + (?:(?:v|embed|e|shorts)/(?!videoseries|live_stream)) # v/ or embed/ or e/ or shorts/ |(?: # or the v= param in all its forms (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) (?:\?|\#!?) # the params delimiter ? or # or #! @@ -792,7 +912,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?P<id>[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID (?(1).+)? 
# if we found the ID, everything can follow (?:\#|$)""" % { - 'invidious': '|'.join(_INVIDIOUS_SITES), + 'invidious': '|'.join(YoutubeBaseInfoExtractor._INVIDIOUS_SITES), } _PLAYER_INFO_RE = ( r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})/player', @@ -923,18 +1043,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Philipp Hagemeister', 'uploader_id': 'phihag', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', + 'channel': 'Philipp Hagemeister', 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q', 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q', 'upload_date': '20121002', - 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', + 'description': 'md5:8fb536f4877b8a7455c2ec23794dbc22', 'categories': ['Science & Technology'], 'tags': ['youtube-dl'], 'duration': 10, 'view_count': int, 'like_count': int, - 'dislike_count': int, + 'availability': 'public', + 'playable_in_embed': True, + 'thumbnail': 'https://i.ytimg.com/vi/BaW_jenozKc/maxresdefault.jpg', + 'live_status': 'not_live', + 'age_limit': 0, 'start_time': 1, 'end_time': 9, + 'channel_follower_count': int } }, { @@ -963,14 +1089,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Philipp Hagemeister', 'uploader_id': 'phihag', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', + 'channel': 'Philipp Hagemeister', + 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q', + 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q', 'upload_date': '20121002', - 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', + 'description': 'md5:8fb536f4877b8a7455c2ec23794dbc22', 'categories': ['Science & Technology'], 'tags': ['youtube-dl'], 'duration': 10, 'view_count': int, 'like_count': int, - 'dislike_count': int, + 'availability': 'public', + 'playable_in_embed': True, + 'thumbnail': 'https://i.ytimg.com/vi/BaW_jenozKc/maxresdefault.jpg', + 'live_status': 'not_live', + 'age_limit': 0, + 'channel_follower_count': int }, 'params': { 'skip_download': True, @@ -1008,6 +1142,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': 'AfrojackVEVO', 'upload_date': '20131011', 'abr': 129.495, + 'like_count': int, + 'channel_id': 'UChuZAo1RKL85gev3Eal9_zg', + 'playable_in_embed': True, + 'channel_url': 'https://www.youtube.com/channel/UChuZAo1RKL85gev3Eal9_zg', + 'view_count': int, + 'track': 'The Spark', + 'live_status': 'not_live', + 'thumbnail': 'https://i.ytimg.com/vi_webp/IB3lcPjvWLA/maxresdefault.webp', + 'channel': 'Afrojack', + 'uploader_url': 'http://www.youtube.com/user/AfrojackVEVO', + 'tags': 'count:19', + 'availability': 'public', + 'categories': ['Music'], + 'age_limit': 0, + 'alt_title': 'The Spark', + 'channel_follower_count': int }, 'params': { 'youtube_include_dash_manifest': True, @@ -1029,6 +1179,18 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame', 'upload_date': '20140605', 'age_limit': 18, + 'categories': ['Gaming'], + 'thumbnail': 'https://i.ytimg.com/vi_webp/HtVdAasjOgU/maxresdefault.webp', + 'availability': 'needs_auth', + 'channel_url': 'https://www.youtube.com/channel/UCzybXLxv08IApdjdN0mJhEg', + 'like_count': int, + 'channel': 'The Witcher', + 'live_status': 'not_live', + 'tags': 'count:17', + 'channel_id': 
'UCzybXLxv08IApdjdN0mJhEg', + 'playable_in_embed': True, + 'view_count': int, + 'channel_follower_count': int }, }, { @@ -1043,6 +1205,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': 'FlyingKitty900', 'uploader': 'FlyingKitty', 'age_limit': 18, + 'availability': 'needs_auth', + 'channel_id': 'UCYQT13AtrJC0gsM1far_zJg', + 'uploader_url': 'http://www.youtube.com/user/FlyingKitty900', + 'channel': 'FlyingKitty', + 'channel_url': 'https://www.youtube.com/channel/UCYQT13AtrJC0gsM1far_zJg', + 'view_count': int, + 'categories': ['Entertainment'], + 'live_status': 'not_live', + 'tags': ['Flyingkitty', 'godzilla 2'], + 'thumbnail': 'https://i.ytimg.com/vi/HsUATh_Nc2U/maxresdefault.jpg', + 'like_count': int, + 'duration': 177, + 'playable_in_embed': True, + 'channel_follower_count': int }, }, { @@ -1052,11 +1228,25 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'Tq92D6wQ1mg', 'title': '[MMD] Adios - EVERGLOW [+Motion DL]', 'ext': 'mp4', - 'upload_date': '20191227', + 'upload_date': '20191228', 'uploader_id': 'UC1yoRdFoFJaCY-AGfD9W0wQ', 'uploader': 'Projekt Melody', 'description': 'md5:17eccca93a786d51bc67646756894066', 'age_limit': 18, + 'like_count': int, + 'availability': 'needs_auth', + 'uploader_url': 'http://www.youtube.com/channel/UC1yoRdFoFJaCY-AGfD9W0wQ', + 'channel_id': 'UC1yoRdFoFJaCY-AGfD9W0wQ', + 'view_count': int, + 'thumbnail': 'https://i.ytimg.com/vi_webp/Tq92D6wQ1mg/sddefault.webp', + 'channel': 'Projekt Melody', + 'live_status': 'not_live', + 'tags': ['mmd', 'dance', 'mikumikudance', 'kpop', 'vtuber'], + 'playable_in_embed': True, + 'categories': ['Entertainment'], + 'duration': 106, + 'channel_url': 'https://www.youtube.com/channel/UC1yoRdFoFJaCY-AGfD9W0wQ', + 'channel_follower_count': int }, }, { @@ -1070,6 +1260,25 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': 'st3in234', 'description': 'Fan Video. 
Music & Lyrics by OOMPH!.', 'upload_date': '20130730', + 'track': 'Such mich find mich', + 'age_limit': 0, + 'tags': ['oomph', 'such mich find mich', 'lyrics', 'german industrial', 'musica industrial'], + 'like_count': int, + 'playable_in_embed': False, + 'creator': 'OOMPH!', + 'thumbnail': 'https://i.ytimg.com/vi/MeJVWBSsPAY/sddefault.jpg', + 'view_count': int, + 'alt_title': 'Such mich find mich', + 'duration': 210, + 'channel': 'Herr Lurik', + 'channel_id': 'UCdR3RSDPqub28LjZx0v9-aA', + 'categories': ['Music'], + 'availability': 'public', + 'uploader_url': 'http://www.youtube.com/user/st3in234', + 'channel_url': 'https://www.youtube.com/channel/UCdR3RSDPqub28LjZx0v9-aA', + 'live_status': 'not_live', + 'artist': 'OOMPH!', + 'channel_follower_count': int }, }, { @@ -1093,6 +1302,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'deadmau5', 'title': 'Deadmau5 - Some Chords (HD)', 'alt_title': 'Some Chords', + 'availability': 'public', + 'tags': 'count:14', + 'channel_id': 'UCYEK6xds6eo-3tr4xRdflmQ', + 'view_count': int, + 'live_status': 'not_live', + 'channel': 'deadmau5', + 'thumbnail': 'https://i.ytimg.com/vi_webp/__2ABJjxzNo/maxresdefault.webp', + 'like_count': int, + 'track': 'Some Chords', + 'artist': 'deadmau5', + 'playable_in_embed': True, + 'age_limit': 0, + 'channel_url': 'https://www.youtube.com/channel/UCYEK6xds6eo-3tr4xRdflmQ', + 'categories': ['Music'], + 'album': 'Some Chords', + 'channel_follower_count': int }, 'expected_warnings': [ 'DASH manifest missing', @@ -1111,6 +1336,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games', 'uploader': 'Olympics', 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games', + 'like_count': int, + 'release_timestamp': 1343767800, + 'playable_in_embed': True, + 'categories': ['Sports'], + 'release_date': '20120731', + 'channel': 'Olympics', + 'tags': ['Hockey', '2012-07-31', '31 July 2012', 'Riverbank Arena', 'Session', 'Olympics', 'Olympic Games', 'London 2012', '2012 Summer Olympics', 'Summer Games'], + 'channel_id': 'UCTl3QQTvqHFjurroKxexy2Q', + 'thumbnail': 'https://i.ytimg.com/vi/lqQg6PlCWgI/maxresdefault.jpg', + 'age_limit': 0, + 'availability': 'public', + 'live_status': 'was_live', + 'view_count': int, + 'channel_url': 'https://www.youtube.com/channel/UCTl3QQTvqHFjurroKxexy2Q', + 'channel_follower_count': int }, 'params': { 'skip_download': 'requires avconv', @@ -1130,6 +1370,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯', 'uploader': '孫ᄋᄅ', 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人', + 'playable_in_embed': True, + 'channel': '孫ᄋᄅ', + 'age_limit': 0, + 'tags': 'count:11', + 'channel_url': 'https://www.youtube.com/channel/UCS-xxCmRaA6BFdmgDPA_BIw', + 'channel_id': 'UCS-xxCmRaA6BFdmgDPA_BIw', + 'thumbnail': 'https://i.ytimg.com/vi/_b-2C3KPAM0/maxresdefault.jpg', + 'view_count': int, + 'categories': ['People & Blogs'], + 'like_count': int, + 'live_status': 'not_live', + 'availability': 'unlisted', + 'channel_follower_count': int }, }, # url_encoded_fmt_stream_map is empty string @@ -1286,6 +1539,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'track': 'Dark Walk', 'artist': 'Todd Haberman;\nDaniel Law Heath and Aaron Kaplan', 'album': 'Position Music - Production Music Vol. 
143 - Dark Walk', + 'thumbnail': 'https://i.ytimg.com/vi_webp/lsguqyKfVQg/maxresdefault.webp', + 'categories': ['Film & Animation'], + 'view_count': int, + 'live_status': 'not_live', + 'channel_url': 'https://www.youtube.com/channel/UCTSRgz5jylBvFt_S7wnsqLQ', + 'channel_id': 'UCTSRgz5jylBvFt_S7wnsqLQ', + 'tags': 'count:13', + 'availability': 'public', + 'channel': 'IronSoulElf', + 'playable_in_embed': True, + 'like_count': int, + 'age_limit': 0, + 'channel_follower_count': int }, 'params': { 'skip_download': True, @@ -1327,11 +1593,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': 'md5:e41008789470fc2533a3252216f1c1d1', 'description': 'md5:a677553cf0840649b731a3024aeff4cc', 'duration': 721, - 'upload_date': '20150127', + 'upload_date': '20150128', 'uploader_id': 'BerkmanCenter', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter', 'uploader': 'The Berkman Klein Center for Internet & Society', 'license': 'Creative Commons Attribution license (reuse allowed)', + 'channel_id': 'UCuLGmD72gJDBwmLw06X58SA', + 'channel_url': 'https://www.youtube.com/channel/UCuLGmD72gJDBwmLw06X58SA', + 'like_count': int, + 'age_limit': 0, + 'tags': ['Copyright (Legal Subject)', 'Law (Industry)', 'William W. Fisher (Author)'], + 'channel': 'The Berkman Klein Center for Internet & Society', + 'availability': 'public', + 'view_count': int, + 'categories': ['Education'], + 'thumbnail': 'https://i.ytimg.com/vi_webp/M4gD1WSo5mA/maxresdefault.webp', + 'live_status': 'not_live', + 'playable_in_embed': True, + 'channel_follower_count': int }, 'params': { 'skip_download': True, @@ -1346,11 +1625,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders', 'description': 'md5:13a2503d7b5904ef4b223aa101628f39', 'duration': 4060, - 'upload_date': '20151119', + 'upload_date': '20151120', 'uploader': 'Bernie Sanders', 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg', 'license': 'Creative Commons Attribution license (reuse allowed)', + 'playable_in_embed': True, + 'tags': 'count:12', + 'like_count': int, + 'channel_id': 'UCH1dpzjCEiGAt8CXkryhkZg', + 'age_limit': 0, + 'availability': 'public', + 'categories': ['News & Politics'], + 'channel': 'Bernie Sanders', + 'thumbnail': 'https://i.ytimg.com/vi_webp/eQcmzGIKrzg/maxresdefault.webp', + 'view_count': int, + 'live_status': 'not_live', + 'channel_url': 'https://www.youtube.com/channel/UCH1dpzjCEiGAt8CXkryhkZg', + 'channel_follower_count': int }, 'params': { 'skip_download': True, @@ -1400,6 +1692,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'series': 'Mind Field', 'season_number': 1, 'episode_number': 1, + 'thumbnail': 'https://i.ytimg.com/vi_webp/iqKdEhx-dD4/maxresdefault.webp', + 'tags': 'count:12', + 'view_count': int, + 'availability': 'public', + 'age_limit': 0, + 'channel': 'Vsauce', + 'episode': 'Episode 1', + 'categories': ['Entertainment'], + 'season': 'Season 1', + 'channel_id': 'UC6nSFpj9HTCZ5t-N3Rm3-HA', + 'channel_url': 'https://www.youtube.com/channel/UC6nSFpj9HTCZ5t-N3Rm3-HA', + 'like_count': int, + 'playable_in_embed': True, + 'live_status': 'not_live', + 'channel_follower_count': int }, 'params': { 'skip_download': True, @@ -1493,6 +1800,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'album': 'it\'s too much love to know my dear', 'release_date': '20190313', 'release_year': 2019, + 'alt_title': 'Voyeur Girl', + 'view_count': int, + 'uploader_url': 
'http://www.youtube.com/channel/UC-pWHpBjdGG69N9mM2auIAA', + 'playable_in_embed': True, + 'like_count': int, + 'categories': ['Music'], + 'channel_url': 'https://www.youtube.com/channel/UC-pWHpBjdGG69N9mM2auIAA', + 'channel': 'Stephen', + 'availability': 'public', + 'creator': 'Stephen', + 'duration': 169, + 'thumbnail': 'https://i.ytimg.com/vi_webp/MgNrAu2pzNs/maxresdefault.webp', + 'age_limit': 0, + 'channel_id': 'UC-pWHpBjdGG69N9mM2auIAA', + 'tags': 'count:11', + 'live_status': 'not_live', + 'channel_follower_count': int }, 'params': { 'skip_download': True, @@ -1534,6 +1858,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'upload_date': '20170613', 'uploader_id': 'ElevageOrVert', 'uploader': 'ElevageOrVert', + 'view_count': int, + 'thumbnail': 'https://i.ytimg.com/vi_webp/x41yOUIvK2k/maxresdefault.webp', + 'uploader_url': 'http://www.youtube.com/user/ElevageOrVert', + 'like_count': int, + 'channel_id': 'UCo03ZQPBW5U4UC3regpt1nw', + 'tags': [], + 'channel_url': 'https://www.youtube.com/channel/UCo03ZQPBW5U4UC3regpt1nw', + 'availability': 'public', + 'age_limit': 0, + 'categories': ['Pets & Animals'], + 'duration': 7, + 'playable_in_embed': True, + 'live_status': 'not_live', + 'channel': 'ElevageOrVert', + 'channel_follower_count': int }, 'params': { 'skip_download': True, @@ -1553,6 +1892,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'upload_date': '20130831', 'uploader_id': 'kudvenkat', 'uploader': 'kudvenkat', + 'channel_id': 'UCCTVrRB5KpIiK6V2GGVsR1Q', + 'like_count': int, + 'uploader_url': 'http://www.youtube.com/user/kudvenkat', + 'channel_url': 'https://www.youtube.com/channel/UCCTVrRB5KpIiK6V2GGVsR1Q', + 'live_status': 'not_live', + 'categories': ['Education'], + 'availability': 'public', + 'thumbnail': 'https://i.ytimg.com/vi/CHqg6qOn4no/sddefault.jpg', + 'tags': 'count:12', + 'playable_in_embed': True, + 'age_limit': 0, + 'view_count': int, + 'duration': 522, + 'channel': 'kudvenkat', + 'channel_follower_count': int }, 'params': { 'skip_download': True, @@ -1582,8 +1936,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'artist': 'The Cinematic Orchestra', 'track': 'Burn Out', 'album': 'Every Day', - 'release_data': None, - 'release_year': None, + 'like_count': int, + 'live_status': 'not_live', + 'alt_title': 'Burn Out', + 'duration': 614, + 'age_limit': 0, + 'view_count': int, + 'channel_url': 'https://www.youtube.com/channel/UCIzsJBIyo8hhpFm1NK0uLgw', + 'creator': 'The Cinematic Orchestra', + 'channel': 'The Cinematic Orchestra', + 'tags': ['The Cinematic Orchestra', 'Every Day', 'Burn Out'], + 'channel_id': 'UCIzsJBIyo8hhpFm1NK0uLgw', + 'availability': 'public', + 'thumbnail': 'https://i.ytimg.com/vi/OtqTfy26tG0/maxresdefault.jpg', + 'categories': ['Music'], + 'playable_in_embed': True, + 'channel_follower_count': int }, 'params': { 'skip_download': True, @@ -1602,10 +1970,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'title': 'San Diego teen commits suicide after bullying over embarrassing video', 'channel_id': 'UC-SJ6nODDmufqBzPBwCvYvQ', - 'uploader': 'CBS This Morning', + 'uploader': 'CBS Mornings', 'uploader_id': 'CBSThisMorning', 'upload_date': '20140716', - 'description': 'md5:acde3a73d3f133fc97e837a9f76b53b7' + 'description': 'md5:acde3a73d3f133fc97e837a9f76b53b7', + 'duration': 170, + 'categories': ['News & Politics'], + 'uploader_url': 'http://www.youtube.com/user/CBSThisMorning', + 'view_count': int, + 'channel': 'CBS Mornings', + 'tags': ['suicide', 'bullying', 'video', 'cbs', 'news'], + 'thumbnail': 
'https://i.ytimg.com/vi/SZJvDhaSDnc/hqdefault.jpg', + 'age_limit': 18, + 'availability': 'needs_auth', + 'channel_url': 'https://www.youtube.com/channel/UC-SJ6nODDmufqBzPBwCvYvQ', + 'like_count': int, + 'live_status': 'not_live', + 'playable_in_embed': True, + 'channel_follower_count': int } }, { @@ -1620,6 +2002,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Walk around Japan', 'uploader_id': 'UC3o_t8PzBmXf5S9b7GLx1Mw', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UC3o_t8PzBmXf5S9b7GLx1Mw', + 'duration': 1456, + 'categories': ['Travel & Events'], + 'channel_id': 'UC3o_t8PzBmXf5S9b7GLx1Mw', + 'view_count': int, + 'channel': 'Walk around Japan', + 'tags': ['Ueno Tokyo', 'Okachimachi Tokyo', 'Ameyoko Street', 'Tokyo attraction', 'Travel in Tokyo'], + 'thumbnail': 'https://i.ytimg.com/vi_webp/cBvYw8_A0vQ/hqdefault.webp', + 'age_limit': 0, + 'availability': 'public', + 'channel_url': 'https://www.youtube.com/channel/UC3o_t8PzBmXf5S9b7GLx1Mw', + 'live_status': 'not_live', + 'playable_in_embed': True, + 'channel_follower_count': int }, 'params': { 'skip_download': True, @@ -1648,7 +2043,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'colinfurze', 'uploader_id': 'colinfurze', 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCp68_FLety0O-n9QU6phsgw', - 'description': 'md5:b5096f56af7ccd7a555c84db81738b22' + 'description': 'md5:5d5991195d599b56cd0c4148907eec50', + 'duration': 596, + 'categories': ['Entertainment'], + 'uploader_url': 'http://www.youtube.com/user/colinfurze', + 'view_count': int, + 'channel': 'colinfurze', + 'tags': ['Colin', 'furze', 'Terry', 'tunnel', 'underground', 'bunker'], + 'thumbnail': 'https://i.ytimg.com/vi/YOelRv7fMxY/maxresdefault.jpg', + 'age_limit': 0, + 'availability': 'public', + 'like_count': int, + 'live_status': 'not_live', + 'playable_in_embed': True, + 'channel_follower_count': int }, 'params': { 'format': '17', # 3gp format available on android @@ -1666,6 +2074,120 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # shorts 'url': 'https://www.youtube.com/shorts/BGQWPY4IigY', 'only_matching': True, + }, { + 'note': 'Storyboards', + 'url': 'https://www.youtube.com/watch?v=5KLPxDtMqe8', + 'info_dict': { + 'id': '5KLPxDtMqe8', + 'ext': 'mhtml', + 'format_id': 'sb0', + 'title': 'Your Brain is Plastic', + 'uploader_id': 'scishow', + 'description': 'md5:89cd86034bdb5466cd87c6ba206cd2bc', + 'upload_date': '20140324', + 'uploader': 'SciShow', + 'like_count': int, + 'channel_id': 'UCZYTClx2T1of7BRZ86-8fow', + 'channel_url': 'https://www.youtube.com/channel/UCZYTClx2T1of7BRZ86-8fow', + 'view_count': int, + 'thumbnail': 'https://i.ytimg.com/vi/5KLPxDtMqe8/maxresdefault.jpg', + 'playable_in_embed': True, + 'tags': 'count:12', + 'uploader_url': 'http://www.youtube.com/user/scishow', + 'availability': 'public', + 'channel': 'SciShow', + 'live_status': 'not_live', + 'duration': 248, + 'categories': ['Education'], + 'age_limit': 0, + 'channel_follower_count': int + }, 'params': {'format': 'mhtml', 'skip_download': True} + }, { + # Ensure video upload_date is in UTC timezone (video was uploaded 1641170939) + 'url': 'https://www.youtube.com/watch?v=2NUZ8W2llS4', + 'info_dict': { + 'id': '2NUZ8W2llS4', + 'ext': 'mp4', + 'title': 'The NP that test your phone performance 🙂', + 'description': 'md5:144494b24d4f9dfacb97c1bbef5de84d', + 'uploader': 'Leon Nguyen', + 'uploader_id': 'VNSXIII', + 'uploader_url': 'http://www.youtube.com/user/VNSXIII', + 'channel_id': 'UCRqNBSOHgilHfAczlUmlWHA', + 'channel_url': 
'https://www.youtube.com/channel/UCRqNBSOHgilHfAczlUmlWHA', + 'duration': 21, + 'view_count': int, + 'age_limit': 0, + 'categories': ['Gaming'], + 'tags': 'count:23', + 'playable_in_embed': True, + 'live_status': 'not_live', + 'upload_date': '20220103', + 'like_count': int, + 'availability': 'public', + 'channel': 'Leon Nguyen', + 'thumbnail': 'https://i.ytimg.com/vi_webp/2NUZ8W2llS4/maxresdefault.webp', + 'channel_follower_count': int + } + }, { + # date text is premiered video, ensure upload date in UTC (published 1641172509) + 'url': 'https://www.youtube.com/watch?v=mzZzzBU6lrM', + 'info_dict': { + 'id': 'mzZzzBU6lrM', + 'ext': 'mp4', + 'title': 'I Met GeorgeNotFound In Real Life...', + 'description': 'md5:cca98a355c7184e750f711f3a1b22c84', + 'uploader': 'Quackity', + 'uploader_id': 'QuackityHQ', + 'uploader_url': 'http://www.youtube.com/user/QuackityHQ', + 'channel_id': 'UC_8NknAFiyhOUaZqHR3lq3Q', + 'channel_url': 'https://www.youtube.com/channel/UC_8NknAFiyhOUaZqHR3lq3Q', + 'duration': 955, + 'view_count': int, + 'age_limit': 0, + 'categories': ['Entertainment'], + 'tags': 'count:26', + 'playable_in_embed': True, + 'live_status': 'not_live', + 'release_timestamp': 1641172509, + 'release_date': '20220103', + 'upload_date': '20220103', + 'like_count': int, + 'availability': 'public', + 'channel': 'Quackity', + 'thumbnail': 'https://i.ytimg.com/vi/mzZzzBU6lrM/maxresdefault.jpg', + 'channel_follower_count': int + } + }, + { # continuous livestream. Microformat upload date should be preferred. + # Upload date was 2021-06-19 (not UTC), while stream start is 2021-11-27 + 'url': 'https://www.youtube.com/watch?v=kgx4WGK0oNU', + 'info_dict': { + 'id': 'kgx4WGK0oNU', + 'title': r're:jazz\/lofi hip hop radio🌱chill beats to relax\/study to \[LIVE 24\/7\] \d{4}-\d{2}-\d{2} \d{2}:\d{2}', + 'ext': 'mp4', + 'channel_id': 'UC84whx2xxsiA1gXHXXqKGOA', + 'availability': 'public', + 'age_limit': 0, + 'release_timestamp': 1637975704, + 'upload_date': '20210619', + 'channel_url': 'https://www.youtube.com/channel/UC84whx2xxsiA1gXHXXqKGOA', + 'live_status': 'is_live', + 'thumbnail': 'https://i.ytimg.com/vi/kgx4WGK0oNU/maxresdefault.jpg', + 'uploader': '阿鲍Abao', + 'uploader_url': 'http://www.youtube.com/channel/UC84whx2xxsiA1gXHXXqKGOA', + 'channel': 'Abao in Tokyo', + 'channel_follower_count': int, + 'release_date': '20211127', + 'tags': 'count:39', + 'categories': ['People & Blogs'], + 'like_count': int, + 'uploader_id': 'UC84whx2xxsiA1gXHXXqKGOA', + 'view_count': int, + 'playable_in_embed': True, + 'description': 'md5:2ef1d002cad520f65825346e2084e49d', + }, + 'params': {'skip_download': True} }, ] @@ -1683,18 +2205,158 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self._code_cache = {} self._player_cache = {} + def _prepare_live_from_start_formats(self, formats, video_id, live_start_time, url, webpage_url, smuggled_data): + lock = threading.Lock() + + is_live = True + start_time = time.time() + formats = [f for f in formats if f.get('is_from_start')] + + def refetch_manifest(format_id, delay): + nonlocal formats, start_time, is_live + if time.time() <= start_time + delay: + return + + _, _, prs, player_url = self._download_player_responses(url, smuggled_data, video_id, webpage_url) + video_details = traverse_obj( + prs, (..., 'videoDetails'), expected_type=dict, default=[]) + microformats = traverse_obj( + prs, (..., 'microformat', 'playerMicroformatRenderer'), + expected_type=dict, default=[]) + _, is_live, _, formats = self._list_formats(video_id, microformats, video_details, prs, player_url) + start_time 
= time.time() + + def mpd_feed(format_id, delay): + """ + @returns (manifest_url, manifest_stream_number, is_live) or None + """ + with lock: + refetch_manifest(format_id, delay) + + f = next((f for f in formats if f['format_id'] == format_id), None) + if not f: + if not is_live: + self.to_screen(f'{video_id}: Video is no longer live') + else: + self.report_warning( + f'Cannot find refreshed manifest for format {format_id}{bug_reports_message()}') + return None + return f['manifest_url'], f['manifest_stream_number'], is_live + + for f in formats: + f['is_live'] = True + f['protocol'] = 'http_dash_segments_generator' + f['fragments'] = functools.partial( + self._live_dash_fragments, f['format_id'], live_start_time, mpd_feed) + + def _live_dash_fragments(self, format_id, live_start_time, mpd_feed, ctx): + FETCH_SPAN, MAX_DURATION = 5, 432000 + + mpd_url, stream_number, is_live = None, None, True + + begin_index = 0 + download_start_time = ctx.get('start') or time.time() + + lack_early_segments = download_start_time - (live_start_time or download_start_time) > MAX_DURATION + if lack_early_segments: + self.report_warning(bug_reports_message( + 'Starting download from the last 120 hours of the live stream since ' + 'YouTube does not have data before that. If you think this is wrong,'), only_once=True) + lack_early_segments = True + + known_idx, no_fragment_score, last_segment_url = begin_index, 0, None + fragments, fragment_base_url = None, None + + def _extract_sequence_from_mpd(refresh_sequence, immediate): + nonlocal mpd_url, stream_number, is_live, no_fragment_score, fragments, fragment_base_url + # Obtain from MPD's maximum seq value + old_mpd_url = mpd_url + last_error = ctx.pop('last_error', None) + expire_fast = immediate or last_error and isinstance(last_error, compat_HTTPError) and last_error.code == 403 + mpd_url, stream_number, is_live = (mpd_feed(format_id, 5 if expire_fast else 18000) + or (mpd_url, stream_number, False)) + if not refresh_sequence: + if expire_fast and not is_live: + return False, last_seq + elif old_mpd_url == mpd_url: + return True, last_seq + try: + fmts, _ = self._extract_mpd_formats_and_subtitles( + mpd_url, None, note=False, errnote=False, fatal=False) + except ExtractorError: + fmts = None + if not fmts: + no_fragment_score += 2 + return False, last_seq + fmt_info = next(x for x in fmts if x['manifest_stream_number'] == stream_number) + fragments = fmt_info['fragments'] + fragment_base_url = fmt_info['fragment_base_url'] + assert fragment_base_url + + _last_seq = int(re.search(r'(?:/|^)sq/(\d+)', fragments[-1]['path']).group(1)) + return True, _last_seq + + while is_live: + fetch_time = time.time() + if no_fragment_score > 30: + return + if last_segment_url: + # Obtain from "X-Head-Seqnum" header value from each segment + try: + urlh = self._request_webpage( + last_segment_url, None, note=False, errnote=False, fatal=False) + except ExtractorError: + urlh = None + last_seq = try_get(urlh, lambda x: int_or_none(x.headers['X-Head-Seqnum'])) + if last_seq is None: + no_fragment_score += 2 + last_segment_url = None + continue + else: + should_continue, last_seq = _extract_sequence_from_mpd(True, no_fragment_score > 15) + no_fragment_score += 2 + if not should_continue: + continue + + if known_idx > last_seq: + last_segment_url = None + continue + + last_seq += 1 + + if begin_index < 0 and known_idx < 0: + # skip from the start when it's negative value + known_idx = last_seq + begin_index + if lack_early_segments: + known_idx = max(known_idx, last_seq - 
int(MAX_DURATION // fragments[-1]['duration'])) + try: + for idx in range(known_idx, last_seq): + # do not update sequence here or you'll get skipped some part of it + should_continue, _ = _extract_sequence_from_mpd(False, False) + if not should_continue: + known_idx = idx - 1 + raise ExtractorError('breaking out of outer loop') + last_segment_url = urljoin(fragment_base_url, 'sq/%d' % idx) + yield { + 'url': last_segment_url, + } + if known_idx == last_seq: + no_fragment_score += 5 + else: + no_fragment_score = 0 + known_idx = last_seq + except ExtractorError: + continue + + time.sleep(max(0, FETCH_SPAN + fetch_time - time.time())) + def _extract_player_url(self, *ytcfgs, webpage=None): player_url = traverse_obj( ytcfgs, (..., 'PLAYER_JS_URL'), (..., 'WEB_PLAYER_CONTEXT_CONFIGS', ..., 'jsUrl'), get_all=False, expected_type=compat_str) if not player_url: return - if player_url.startswith('//'): - player_url = 'https:' + player_url - elif not re.match(r'https?://', player_url): - player_url = compat_urlparse.urljoin( - 'https://www.youtube.com', player_url) - return player_url + return urljoin('https://www.youtube.com', player_url) def _download_player_url(self, video_id, fatal=False): res = self._download_webpage( @@ -1720,7 +2382,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): raise ExtractorError('Cannot identify player %r' % player_url) return id_m.group('id') - def _load_player(self, video_id, player_url, fatal=True) -> bool: + def _load_player(self, video_id, player_url, fatal=True): player_id = self._extract_player_info(player_url) if player_id not in self._code_cache: code = self._download_webpage( @@ -1729,7 +2391,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): errnote='Download of %s failed' % player_url) if code: self._code_cache[player_id] = code - return player_id in self._code_cache + return self._code_cache.get(player_id) def _extract_signature_function(self, video_id, player_url, example_sig): player_id = self._extract_player_info(player_url) @@ -1743,8 +2405,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if cache_spec is not None: return lambda s: ''.join(s[i] for i in cache_spec) - if self._load_player(video_id, player_url): - code = self._code_cache[player_id] + code = self._load_player(video_id, player_url) + if code: res = self._parse_sig_js(code) test_string = ''.join(map(compat_chr, range(len(example_sig)))) @@ -1755,6 +2417,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return res def _print_sig_code(self, func, example_sig): + if not self.get_param('youtube_print_sig_code'): + return + def gen_sig_code(idxs): def _genslice(start, end, step): starts = '' if start == 0 else str(start) @@ -1831,13 +2496,59 @@ class YoutubeIE(YoutubeBaseInfoExtractor): ) self._player_cache[player_id] = func func = self._player_cache[player_id] - if self.get_param('youtube_print_sig_code'): - self._print_sig_code(func, s) + self._print_sig_code(func, s) return func(s) except Exception as e: - tb = traceback.format_exc() - raise ExtractorError( - 'Signature extraction failed: ' + tb, cause=e) + raise ExtractorError('Signature extraction failed: ' + traceback.format_exc(), cause=e) + + def _decrypt_nsig(self, s, video_id, player_url): + """Turn the encrypted n field into a working signature""" + if player_url is None: + raise ExtractorError('Cannot decrypt nsig without player_url') + player_url = urljoin('https://www.youtube.com', player_url) + + sig_id = ('nsig_value', s) + if sig_id in self._player_cache: + return self._player_cache[sig_id] + + try: + player_id = ('nsig', player_url) + 
if player_id not in self._player_cache: + self._player_cache[player_id] = self._extract_n_function(video_id, player_url) + func = self._player_cache[player_id] + self._player_cache[sig_id] = func(s) + self.write_debug(f'Decrypted nsig {s} => {self._player_cache[sig_id]}') + return self._player_cache[sig_id] + except Exception as e: + raise ExtractorError(traceback.format_exc(), cause=e, video_id=video_id) + + def _extract_n_function_name(self, jscode): + nfunc, idx = self._search_regex( + r'\.get\("n"\)\)&&\(b=(?P<nfunc>[a-zA-Z0-9$]+)(?:\[(?P<idx>\d+)\])?\([a-zA-Z0-9]\)', + jscode, 'Initial JS player n function name', group=('nfunc', 'idx')) + if not idx: + return nfunc + return json.loads(js_to_json(self._search_regex( + rf'var {re.escape(nfunc)}\s*=\s*(\[.+?\]);', jscode, + f'Initial JS player n function list ({nfunc}.{idx})')))[int(idx)] + + def _extract_n_function(self, video_id, player_url): + player_id = self._extract_player_info(player_url) + func_code = self._downloader.cache.load('youtube-nsig', player_id) + + if func_code: + jsi = JSInterpreter(func_code) + else: + jscode = self._load_player(video_id, player_url) + funcname = self._extract_n_function_name(jscode) + jsi = JSInterpreter(jscode) + func_code = jsi.extract_function_code(funcname) + self._downloader.cache.store('youtube-nsig', player_id, func_code) + + if self.get_param('youtube_print_sig_code'): + self.to_screen(f'Extracted nsig function from {player_id}:\n{func_code[1]}\n') + + return lambda s: jsi.extract_function_from_code(*func_code)([s]) def _extract_signature_timestamp(self, video_id, player_url, ytcfg=None, fatal=False): """ @@ -1856,18 +2567,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor): raise ExtractorError(error_msg) self.report_warning(error_msg) return - if self._load_player(video_id, player_url, fatal=fatal): - player_id = self._extract_player_info(player_url) - code = self._code_cache[player_id] + code = self._load_player(video_id, player_url, fatal=fatal) + if code: sts = int_or_none(self._search_regex( r'(?:signatureTimestamp|sts)\s*:\s*(?P<sts>[0-9]{5})', code, 'JS player signature timestamp', group='sts', fatal=fatal)) return sts def _mark_watched(self, video_id, player_responses): - playback_url = traverse_obj( - player_responses, (..., 'playbackTracking', 'videostatsPlaybackUrl', 'baseUrl'), - expected_type=url_or_none, get_all=False) + playback_url = get_first( + player_responses, ('playbackTracking', 'videostatsPlaybackUrl', 'baseUrl'), + expected_type=url_or_none) if not playback_url: self.report_warning('Unable to mark watched') return @@ -1991,19 +2701,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE), regex), webpage, name, default='{}'), video_id, fatal=False) - @staticmethod - def parse_time_text(time_text): - """ - Parse the comment time text - time_text is in the format 'X units ago (edited)' - """ - time_text_split = time_text.split(' ') - if len(time_text_split) >= 3: - try: - return datetime_from_str('now-%s%s' % (time_text_split[0], time_text_split[1]), precision='auto') - except ValueError: - return None - def _extract_comment(self, comment_renderer, parent=None): comment_id = comment_renderer.get('commentId') if not comment_id: @@ -2012,10 +2709,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): text = self._get_text(comment_renderer, 'contentText') # note: timestamp is an estimate calculated from the current time and time_text - time_text = self._get_text(comment_renderer, 'publishedTimeText') or '' - time_text_dt = 
self.parse_time_text(time_text) - if isinstance(time_text_dt, datetime.datetime): - timestamp = calendar.timegm(time_text_dt.timetuple()) + timestamp, time_text = self._extract_time_text(comment_renderer, 'publishedTimeText') author = self._get_text(comment_renderer, 'authorText') author_id = try_get(comment_renderer, lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], compat_str) @@ -2042,20 +2736,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'parent': parent or 'root' } - def _comment_entries(self, root_continuation_data, ytcfg, video_id, parent=None, comment_counts=None): + def _comment_entries(self, root_continuation_data, ytcfg, video_id, parent=None, tracker=None): + + get_single_config_arg = lambda c: self._configuration_arg(c, [''])[0] def extract_header(contents): _continuation = None for content in contents: - comments_header_renderer = try_get(content, lambda x: x['commentsHeaderRenderer']) - expected_comment_count = parse_count(self._get_text( - comments_header_renderer, 'countText', 'commentsCount', max_runs=1)) + comments_header_renderer = traverse_obj(content, 'commentsHeaderRenderer') + expected_comment_count = self._get_count( + comments_header_renderer, 'countText', 'commentsCount') if expected_comment_count: - comment_counts[1] = expected_comment_count - self.to_screen('Downloading ~%d comments' % expected_comment_count) - sort_mode_str = self._configuration_arg('comment_sort', [''])[0] - comment_sort_index = int(sort_mode_str != 'top') # 1 = new, 0 = top + tracker['est_total'] = expected_comment_count + self.to_screen(f'Downloading ~{expected_comment_count} comments') + comment_sort_index = int(get_single_config_arg('comment_sort') != 'top') # 1 = new, 0 = top sort_menu_item = try_get( comments_header_renderer, @@ -2066,76 +2761,84 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if not _continuation: continue - sort_text = sort_menu_item.get('title') - if isinstance(sort_text, compat_str): - sort_text = sort_text.lower() - else: + sort_text = str_or_none(sort_menu_item.get('title')) + if not sort_text: sort_text = 'top comments' if comment_sort_index == 0 else 'newest first' - self.to_screen('Sorting comments by %s' % sort_text) + self.to_screen('Sorting comments by %s' % sort_text.lower()) break return _continuation def extract_thread(contents): if not parent: - comment_counts[2] = 0 + tracker['current_page_thread'] = 0 for content in contents: + if not parent and tracker['total_parent_comments'] >= max_parents: + yield comment_thread_renderer = try_get(content, lambda x: x['commentThreadRenderer']) - comment_renderer = try_get( - comment_thread_renderer, (lambda x: x['comment']['commentRenderer'], dict)) or try_get( - content, (lambda x: x['commentRenderer'], dict)) + comment_renderer = get_first( + (comment_thread_renderer, content), [['commentRenderer', ('comment', 'commentRenderer')]], + expected_type=dict, default={}) - if not comment_renderer: - continue comment = self._extract_comment(comment_renderer, parent) if not comment: continue - comment_counts[0] += 1 + + tracker['running_total'] += 1 + tracker['total_reply_comments' if parent else 'total_parent_comments'] += 1 yield comment + # Attempt to get the replies comment_replies_renderer = try_get( comment_thread_renderer, lambda x: x['replies']['commentRepliesRenderer'], dict) if comment_replies_renderer: - comment_counts[2] += 1 + tracker['current_page_thread'] += 1 comment_entries_iter = self._comment_entries( comment_replies_renderer, ytcfg, video_id, - parent=comment.get('id'), 
comment_counts=comment_counts) - - for reply_comment in comment_entries_iter: + parent=comment.get('id'), tracker=tracker) + for reply_comment in itertools.islice(comment_entries_iter, min(max_replies_per_thread, max(0, max_replies - tracker['total_reply_comments']))): yield reply_comment + # Keeps track of counts across recursive calls + if not tracker: + tracker = dict( + running_total=0, + est_total=0, + current_page_thread=0, + total_parent_comments=0, + total_reply_comments=0) + + # TODO: Deprecated # YouTube comments have a max depth of 2 - max_depth = int_or_none(self._configuration_arg('max_comment_depth', [''])[0]) or float('inf') + max_depth = int_or_none(get_single_config_arg('max_comment_depth')) + if max_depth: + self._downloader.deprecation_warning( + '[youtube] max_comment_depth extractor argument is deprecated. Set max replies in the max-comments extractor argument instead.') if max_depth == 1 and parent: return - if not comment_counts: - # comment so far, est. total comments, current comment thread # - comment_counts = [0, 0, 0] - continuation = self._extract_continuation(root_continuation_data) - if continuation and len(continuation['continuation']) < 27: - self.write_debug('Detected old API continuation token. Generating new API compatible token.') - continuation_token = self._generate_comment_continuation(video_id) - continuation = self._build_api_continuation_query(continuation_token, None) + max_comments, max_parents, max_replies, max_replies_per_thread, *_ = map( + lambda p: int_or_none(p, default=sys.maxsize), self._configuration_arg('max_comments', ) + [''] * 4) + continuation = self._extract_continuation(root_continuation_data) message = self._get_text(root_continuation_data, ('contents', ..., 'messageRenderer', 'text'), max_runs=1) if message and not parent: self.report_warning(message, video_id=video_id) - visitor_data = None + response = None is_first_continuation = parent is None for page_num in itertools.count(0): if not continuation: break - headers = self.generate_api_headers(ytcfg=ytcfg, visitor_data=visitor_data) - comment_prog_str = '(%d/%d)' % (comment_counts[0], comment_counts[1]) + headers = self.generate_api_headers(ytcfg=ytcfg, visitor_data=self._extract_visitor_data(response)) + comment_prog_str = f"({tracker['running_total']}/{tracker['est_total']})" if page_num == 0: if is_first_continuation: note_prefix = 'Downloading comment section API JSON' else: note_prefix = ' Downloading comment API JSON reply thread %d %s' % ( - comment_counts[2], comment_prog_str) + tracker['current_page_thread'], comment_prog_str) else: note_prefix = '%sDownloading comment%s API JSON page %d %s' % ( ' ' if parent else '', ' replies' if parent else '', @@ -2144,83 +2847,32 @@ class YoutubeIE(YoutubeBaseInfoExtractor): response = self._extract_response( item_id=None, query=continuation, ep='next', ytcfg=ytcfg, headers=headers, note=note_prefix, - check_get_keys=('onResponseReceivedEndpoints', 'continuationContents')) - if not response: - break - visitor_data = try_get( - response, - lambda x: x['responseContext']['webResponseContextExtensionData']['ytConfigData']['visitorData'], - compat_str) or visitor_data + check_get_keys='onResponseReceivedEndpoints') - continuation_contents = dict_get(response, ('onResponseReceivedEndpoints', 'continuationContents')) + continuation_contents = traverse_obj( + response, 'onResponseReceivedEndpoints', expected_type=list, default=[]) continuation = None - if isinstance(continuation_contents, list): - for continuation_section in 
continuation_contents: - if not isinstance(continuation_section, dict): - continue - continuation_items = try_get( - continuation_section, - (lambda x: x['reloadContinuationItemsCommand']['continuationItems'], - lambda x: x['appendContinuationItemsAction']['continuationItems']), - list) or [] - if is_first_continuation: - continuation = extract_header(continuation_items) - is_first_continuation = False - if continuation: - break - continue - count = 0 - for count, entry in enumerate(extract_thread(continuation_items)): - yield entry - continuation = self._extract_continuation({'contents': continuation_items}) + for continuation_section in continuation_contents: + continuation_items = traverse_obj( + continuation_section, + (('reloadContinuationItemsCommand', 'appendContinuationItemsAction'), 'continuationItems'), + get_all=False, expected_type=list) or [] + if is_first_continuation: + continuation = extract_header(continuation_items) + is_first_continuation = False if continuation: - # Sometimes YouTube provides a continuation without any comments - # In most cases we end up just downloading these with very little comments to come. - if count == 0: - if not parent: - self.report_warning('No comments received - assuming end of comments') - continuation = None break + continue - # Deprecated response structure - elif isinstance(continuation_contents, dict): - known_continuation_renderers = ('itemSectionContinuation', 'commentRepliesContinuation') - for key, continuation_renderer in continuation_contents.items(): - if key not in known_continuation_renderers: - continue - if not isinstance(continuation_renderer, dict): - continue - if is_first_continuation: - header_continuation_items = [continuation_renderer.get('header') or {}] - continuation = extract_header(header_continuation_items) - is_first_continuation = False - if continuation: - break - - # Sometimes YouTube provides a continuation without any comments - # In most cases we end up just downloading these with very little comments to come. 
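
[Editor's note] The rewritten loop above boils down to a token walk over the 'next' endpoint: each response's onResponseReceivedEndpoints carries reload/append commands whose continuationItems hold both the renderers and the token for the following page. A hedged sketch of that walk follows; fetch_page is a hypothetical stand-in for self._extract_response(ep='next', ...), and the continuationItemRenderer token path is assumed from the structures this code traverses.

def iter_comment_pages(first_token, fetch_page):
    """Yield the continuationItems list of each successive comment page."""
    token = first_token
    while token:
        response = fetch_page(token)  # dict parsed from the 'next' API JSON
        items = []
        for endpoint in response.get('onResponseReceivedEndpoints') or []:
            for key in ('reloadContinuationItemsCommand',
                        'appendContinuationItemsAction'):
                items.extend((endpoint.get(key) or {}).get('continuationItems') or [])
        yield items
        # The last item usually embeds the next page's token; stop when absent
        token = next(
            (item['continuationItemRenderer']['continuationEndpoint']
                 ['continuationCommand']['token']
             for item in items if 'continuationItemRenderer' in item),
            None)
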
- count = 0 - for count, entry in enumerate(extract_thread(continuation_renderer.get('contents') or {})): - yield entry - continuation = self._extract_continuation(continuation_renderer) - if count == 0: - if not parent: - self.report_warning('No comments received - assuming end of comments') - continuation = None + for entry in extract_thread(continuation_items): + if not entry: + return + yield entry + continuation = self._extract_continuation({'contents': continuation_items}) + if continuation: break - @staticmethod - def _generate_comment_continuation(video_id): - """ - Generates initial comment section continuation token from given video id - """ - b64_vid_id = base64.b64encode(bytes(video_id.encode('utf-8'))) - parts = ('Eg0SCw==', b64_vid_id, 'GAYyJyIRIgs=', b64_vid_id, 'MAB4AjAAQhBjb21tZW50cy1zZWN0aW9u') - new_continuation_intlist = list(itertools.chain.from_iterable( - [bytes_to_intlist(base64.b64decode(part)) for part in parts])) - return base64.b64encode(intlist_to_bytes(new_continuation_intlist)).decode('utf-8') - def _get_comments(self, ytcfg, video_id, contents, webpage): """Entry for comment extraction""" def _real_comment_extract(contents): @@ -2230,11 +2882,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): yield from self._comment_entries(renderer, ytcfg, video_id) max_comments = int_or_none(self._configuration_arg('max_comments', [''])[0]) - # Force English regardless of account setting to prevent parsing issues - # See: https://github.com/hypervideo/hypervideo/issues/532 - ytcfg = copy.deepcopy(ytcfg) - traverse_obj( - ytcfg, ('INNERTUBE_CONTEXT', 'client'), expected_type=dict, default={})['hl'] = 'en' return itertools.islice(_real_comment_extract(contents), 0, max_comments) @staticmethod @@ -2290,18 +2937,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _get_requested_clients(self, url, smuggled_data): requested_clients = [] + default = ['android', 'web'] allowed_clients = sorted( [client for client in INNERTUBE_CLIENTS.keys() if client[:1] != '_'], key=lambda client: INNERTUBE_CLIENTS[client]['priority'], reverse=True) for client in self._configuration_arg('player_client'): if client in allowed_clients: requested_clients.append(client) + elif client == 'default': + requested_clients.extend(default) elif client == 'all': requested_clients.extend(allowed_clients) else: self.report_warning(f'Skipping unsupported client {client}') if not requested_clients: - requested_clients = ['android', 'web'] + requested_clients = default if smuggled_data.get('is_music_url') or self.is_music_url(url): requested_clients.extend( @@ -2316,7 +2966,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }.get(client) if not url: return {} - webpage = self._download_webpage(url, video_id, fatal=False, note=f'Downloading {client} config') + webpage = self._download_webpage(url, video_id, fatal=False, note='Downloading %s config' % client.replace('_', ' ').strip()) return self.extract_ytcfg(video_id, webpage) or {} def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg): @@ -2326,13 +2976,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor): webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE, video_id, 'initial player response') - original_clients = clients + all_clients = set(clients) clients = clients[::-1] prs = [] - def append_client(client_name): - if client_name in INNERTUBE_CLIENTS and client_name not in original_clients: - clients.append(client_name) + def append_client(*client_names): + """ Append the first client name that exists but not already used """ + for client_name in 
client_names: + actual_client = _split_innertube_client(client_name)[0] + if actual_client in INNERTUBE_CLIENTS: + if actual_client not in all_clients: + clients.append(client_name) + all_clients.add(actual_client) + return # Android player_response does not have microFormats which are needed for # extraction of some data. So we return the initial_pr with formats @@ -2347,7 +3003,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): tried_iframe_fallback = False player_url = None while clients: - client = clients.pop() + client, base_client, variant = _split_innertube_client(clients.pop()) player_ytcfg = master_ytcfg if client == 'web' else {} if 'configs' not in self._configuration_arg('player_skip'): player_ytcfg = self._extract_player_ytcfg(client, video_id) or player_ytcfg @@ -2375,10 +3031,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): prs.append(pr) # creator clients can bypass AGE_VERIFICATION_REQUIRED if logged in - if client.endswith('_agegate') and self._is_unplayable(pr) and self.is_authenticated: - append_client(client.replace('_agegate', '_creator')) + if variant == 'embedded' and self._is_unplayable(pr) and self.is_authenticated: + append_client(f'{base_client}_creator') elif self._is_agegated(pr): - append_client(f'{client}_agegate') + if variant == 'tv_embedded': + append_client(f'{base_client}_embedded') + elif not variant: + append_client(f'tv_embedded.{base_client}', f'{base_client}_embedded') if last_error: if not len(prs): @@ -2386,8 +3045,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self.report_warning(last_error) return prs, player_url - def _extract_formats(self, streaming_data, video_id, player_url, is_live): - itags, stream_ids = [], [] + def _extract_formats(self, streaming_data, video_id, player_url, is_live, duration): + itags, stream_ids = {}, [] itag_qualities, res_qualities = {}, {} q = qualities([ # Normally tiny is the smallest video-only formats. But @@ -2399,7 +3058,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): streaming_formats = traverse_obj(streaming_data, (..., ('formats', 'adaptiveFormats'), ...), default=[]) for fmt in streaming_formats: - if fmt.get('targetDurationSec') or fmt.get('drmFamilies'): + if fmt.get('targetDurationSec'): continue itag = str_or_none(fmt.get('itag')) @@ -2440,28 +3099,56 @@ class YoutubeIE(YoutubeBaseInfoExtractor): sp = try_get(sc, lambda x: x['sp'][0]) or 'signature' fmt_url += '&' + sp + '=' + signature + query = parse_qs(fmt_url) + throttled = False + if query.get('n'): + try: + fmt_url = update_url_query(fmt_url, { + 'n': self._decrypt_nsig(query['n'][0], video_id, player_url)}) + except ExtractorError as e: + self.report_warning( + f'nsig extraction failed: You may experience throttling for some formats\n' + f'n = {query["n"][0]} ; player = {player_url}\n{e}', only_once=True) + throttled = True + if itag: - itags.append(itag) + itags[itag] = 'https' stream_ids.append(stream_id) - tbr = float_or_none( - fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) + tbr = float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) + language_preference = ( + 10 if audio_track.get('audioIsDefault') and 10 + else -10 if 'descriptive' in (audio_track.get('displayName') or '').lower() and -10 + else -1) + # Some formats may have much smaller duration than others (possibly damaged during encoding) + # Eg: 2-nOtRESiUc Ref: https://github.com/hypervideo/hypervideo/issues/2823 + # Make sure to avoid false positives with small duration differences. 
+ # Eg: __2ABJjxzNo, ySuUZEjARPY + is_damaged = try_get(fmt, lambda x: float(x['approxDurationMs']) / duration < 500) + if is_damaged: + self.report_warning(f'{video_id}: Some formats are possibly damaged. They will be deprioritized', only_once=True) dct = { 'asr': int_or_none(fmt.get('audioSampleRate')), 'filesize': int_or_none(fmt.get('contentLength')), 'format_id': itag, - 'format_note': ', '.join(filter(None, ( + 'format_note': join_nonempty( '%s%s' % (audio_track.get('displayName') or '', - ' (default)' if audio_track.get('audioIsDefault') else ''), - fmt.get('qualityLabel') or quality.replace('audio_quality_', '')))), - 'fps': int_or_none(fmt.get('fps')), + ' (default)' if language_preference > 0 else ''), + fmt.get('qualityLabel') or quality.replace('audio_quality_', ''), + throttled and 'THROTTLED', is_damaged and 'DAMAGED', delim=', '), + 'source_preference': -10 if throttled else -1, + 'fps': int_or_none(fmt.get('fps')) or None, 'height': height, 'quality': q(quality), + 'has_drm': bool(fmt.get('drmFamilies')), 'tbr': tbr, 'url': fmt_url, 'width': int_or_none(fmt.get('width')), - 'language': audio_track.get('id', '').split('.')[0], - 'language_preference': 1 if audio_track.get('audioIsDefault') else -1, + 'language': join_nonempty(audio_track.get('id', '').split('.')[0], + 'desc' if language_preference < -1 else ''), + 'language_preference': language_preference, + # Strictly de-prioritize damaged and 3gp formats + 'preference': -10 if is_damaged else -2 if itag == '17' else None, } mime_mobj = re.match( r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', fmt.get('mimeType') or '') @@ -2483,59 +3170,84 @@ class YoutubeIE(YoutubeBaseInfoExtractor): dct['container'] = dct['ext'] + '_dash' yield dct + live_from_start = is_live and self.get_param('live_from_start') skip_manifests = self._configuration_arg('skip') - get_dash = ( - (not is_live or self._configuration_arg('include_live_dash')) - and 'dash' not in skip_manifests and self.get_param('youtube_include_dash_manifest', True)) - get_hls = 'hls' not in skip_manifests and self.get_param('youtube_include_hls_manifest', True) + if not self.get_param('youtube_include_hls_manifest', True): + skip_manifests.append('hls') + get_dash = 'dash' not in skip_manifests and ( + not is_live or live_from_start or self._configuration_arg('include_live_dash')) + get_hls = not live_from_start and 'hls' not in skip_manifests + + def process_manifest_format(f, proto, itag): + if itag in itags: + if itags[itag] == proto or f'{itag}-{proto}' in itags: + return False + itag = f'{itag}-{proto}' + if itag: + f['format_id'] = itag + itags[itag] = proto - def guess_quality(f): - for val, qdict in ((f.get('format_id'), itag_qualities), (f.get('height'), res_qualities)): - if val in qdict: - return q(qdict[val]) - return -1 + f['quality'] = next(( + q(qdict[val]) + for val, qdict in ((f.get('format_id', '').split('-')[0], itag_qualities), (f.get('height'), res_qualities)) + if val in qdict), -1) + return True for sd in streaming_data: hls_manifest_url = get_hls and sd.get('hlsManifestUrl') if hls_manifest_url: for f in self._extract_m3u8_formats(hls_manifest_url, video_id, 'mp4', fatal=False): - itag = self._search_regex( - r'/itag/(\d+)', f['url'], 'itag', default=None) - if itag in itags: - itag += '-hls' - if itag in itags: - continue - if itag: - f['format_id'] = itag - itags.append(itag) - f['quality'] = guess_quality(f) - yield f + if process_manifest_format(f, 'hls', self._search_regex( + r'/itag/(\d+)', f['url'], 'itag', default=None)): + yield f 
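A standalone sketch (toy code, not part of the diff) of the itag/protocol de-duplication rule that process_manifest_format above applies when the same itag arrives over https, HLS and DASH: the first sighting keeps the bare itag, a second protocol gets an itag-proto suffix, and an exact repeat is dropped.

    itags = {}  # itag -> protocol, as populated earlier by _extract_formats

    def dedupe_itag(itag, proto):
        if itag in itags:
            if itags[itag] == proto or f'{itag}-{proto}' in itags:
                return None  # exact duplicate, reject the format
            itag = f'{itag}-{proto}'  # same itag via another protocol
        itags[itag] = proto
        return itag

    assert dedupe_itag('137', 'https') == '137'
    assert dedupe_itag('137', 'hls') == '137-hls'
    assert dedupe_itag('137', 'hls') is None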
dash_manifest_url = get_dash and sd.get('dashManifestUrl') if dash_manifest_url: for f in self._extract_mpd_formats(dash_manifest_url, video_id, fatal=False): - itag = f['format_id'] - if itag in itags: - itag += '-dash' - if itag in itags: - continue - if itag: - f['format_id'] = itag - itags.append(itag) - f['quality'] = guess_quality(f) - filesize = int_or_none(self._search_regex( - r'/clen/(\d+)', f.get('fragment_base_url') - or f['url'], 'file size', default=None)) - if filesize: - f['filesize'] = filesize - yield f - - def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url, {}) - video_id = self._match_id(url) + if process_manifest_format(f, 'dash', f['format_id']): + f['filesize'] = int_or_none(self._search_regex( + r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None)) + if live_from_start: + f['is_from_start'] = True + + yield f + + def _extract_storyboard(self, player_responses, duration): + spec = get_first( + player_responses, ('storyboards', 'playerStoryboardSpecRenderer', 'spec'), default='').split('|')[::-1] + base_url = url_or_none(urljoin('https://i.ytimg.com/', spec.pop() or None)) + if not base_url: + return + L = len(spec) - 1 + for i, args in enumerate(spec): + args = args.split('#') + counts = list(map(int_or_none, args[:5])) + if len(args) != 8 or not all(counts): + self.report_warning(f'Malformed storyboard {i}: {"#".join(args)}{bug_reports_message()}') + continue + width, height, frame_count, cols, rows = counts + N, sigh = args[6:] + + url = base_url.replace('$L', str(L - i)).replace('$N', N) + f'&sigh={sigh}' + fragment_count = frame_count / (cols * rows) + fragment_duration = duration / fragment_count + yield { + 'format_id': f'sb{i}', + 'format_note': 'storyboard', + 'ext': 'mhtml', + 'protocol': 'mhtml', + 'acodec': 'none', + 'vcodec': 'none', + 'url': url, + 'width': width, + 'height': height, + 'fragments': [{ + 'url': url.replace('$M', str(j)), + 'duration': min(fragment_duration, duration - (j * fragment_duration)), + } for j in range(math.ceil(fragment_count))], + } - base_url = self.http_scheme() + '//www.youtube.com/' - webpage_url = base_url + 'watch?v=' + video_id + def _download_player_responses(self, url, smuggled_data, video_id, webpage_url): webpage = None if 'webpage' not in self._configuration_arg('player_skip'): webpage = self._download_webpage( @@ -2547,7 +3259,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self._get_requested_clients(url, smuggled_data), video_id, webpage, master_ytcfg) - get_first = lambda obj, keys, **kwargs: traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False) + return webpage, master_ytcfg, player_responses, player_url + + def _list_formats(self, video_id, microformats, video_details, player_responses, player_url, duration=None): + live_broadcast_details = traverse_obj(microformats, (..., 'liveBroadcastDetails')) + is_live = get_first(video_details, 'isLive') + if is_live is None: + is_live = get_first(live_broadcast_details, 'isLiveNow') + + streaming_data = traverse_obj(player_responses, (..., 'streamingData'), default=[]) + formats = list(self._extract_formats(streaming_data, video_id, player_url, is_live, duration)) + + return live_broadcast_details, is_live, streaming_data, formats + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + video_id = self._match_id(url) + + base_url = self.http_scheme() + '//www.youtube.com/' + webpage_url = base_url + 'watch?v=' + video_id + + webpage, master_ytcfg, player_responses, player_url = 
self._download_player_responses(url, smuggled_data, video_id, webpage_url) playability_statuses = traverse_obj( player_responses, (..., 'playabilityStatus'), expected_type=dict, default=[]) @@ -2574,57 +3306,56 @@ class YoutubeIE(YoutubeBaseInfoExtractor): or search_meta(['og:title', 'twitter:title', 'title'])) video_description = get_first(video_details, 'shortDescription') - if not smuggled_data.get('force_singlefeed', False): - if not self.get_param('noplaylist'): - multifeed_metadata_list = get_first( - player_responses, - ('multicamera', 'playerLegacyMulticameraRenderer', 'metadataList'), - expected_type=str) - if multifeed_metadata_list: - entries = [] - feed_ids = [] - for feed in multifeed_metadata_list.split(','): - # Unquote should take place before split on comma (,) since textual - # fields may contain comma as well (see - # https://github.com/ytdl-org/youtube-dl/issues/8536) - feed_data = compat_parse_qs( - compat_urllib_parse_unquote_plus(feed)) - - def feed_entry(name): - return try_get( - feed_data, lambda x: x[name][0], compat_str) - - feed_id = feed_entry('id') - if not feed_id: - continue - feed_title = feed_entry('title') - title = video_title - if feed_title: - title += ' (%s)' % feed_title - entries.append({ - '_type': 'url_transparent', - 'ie_key': 'Youtube', - 'url': smuggle_url( - '%swatch?v=%s' % (base_url, feed_data['id'][0]), - {'force_singlefeed': True}), - 'title': title, - }) - feed_ids.append(feed_id) - self.to_screen( - 'Downloading multifeed video (%s) - add --no-playlist to just download video %s' - % (', '.join(feed_ids), video_id)) - return self.playlist_result( - entries, video_id, video_title, video_description) - else: + multifeed_metadata_list = get_first( + player_responses, + ('multicamera', 'playerLegacyMulticameraRenderer', 'metadataList'), + expected_type=str) + if multifeed_metadata_list and not smuggled_data.get('force_singlefeed'): + if self.get_param('noplaylist'): self.to_screen('Downloading just video %s because of --no-playlist' % video_id) + else: + entries = [] + feed_ids = [] + for feed in multifeed_metadata_list.split(','): + # Unquote should take place before split on comma (,) since textual + # fields may contain comma as well (see + # https://github.com/ytdl-org/youtube-dl/issues/8536) + feed_data = compat_parse_qs( + compat_urllib_parse_unquote_plus(feed)) + + def feed_entry(name): + return try_get( + feed_data, lambda x: x[name][0], compat_str) + + feed_id = feed_entry('id') + if not feed_id: + continue + feed_title = feed_entry('title') + title = video_title + if feed_title: + title += ' (%s)' % feed_title + entries.append({ + '_type': 'url_transparent', + 'ie_key': 'Youtube', + 'url': smuggle_url( + '%swatch?v=%s' % (base_url, feed_data['id'][0]), + {'force_singlefeed': True}), + 'title': title, + }) + feed_ids.append(feed_id) + self.to_screen( + 'Downloading multifeed video (%s) - add --no-playlist to just download video %s' + % (', '.join(feed_ids), video_id)) + return self.playlist_result( + entries, video_id, video_title, video_description) - live_broadcast_details = traverse_obj(microformats, (..., 'liveBroadcastDetails')) - is_live = get_first(video_details, 'isLive') - if is_live is None: - is_live = get_first(live_broadcast_details, 'isLiveNow') + duration = int_or_none( + get_first(video_details, 'lengthSeconds') + or get_first(microformats, 'lengthSeconds') + or parse_duration(search_meta('duration'))) or None - streaming_data = traverse_obj(player_responses, (..., 'streamingData'), default=[]) - formats = 
list(self._extract_formats(streaming_data, video_id, player_url, is_live)) + live_broadcast_details, is_live, streaming_data, formats = self._list_formats( + video_id, microformats, video_details, player_responses, player_url, duration) if not formats: if not self.get_param('allow_unplayable_formats') and traverse_obj(streaming_data, (..., 'licenseInfos')): @@ -2645,16 +3376,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if reason: self.raise_no_formats(reason, expected=True) - for f in formats: - if '&c=WEB&' in f['url'] and '&ratebypass=yes&' not in f['url']: # throttled - f['source_preference'] = -10 - # TODO: this method is not reliable - f['format_note'] = format_field(f, 'format_note', '%s ') + '(maybe throttled)' - - # Source is given priority since formats that throttle are given lower source_preference - # When throttling issue is fully fixed, remove this - self._sort_formats(formats, ('quality', 'res', 'fps', 'hdr:12', 'source', 'codec:vp9.2', 'lang')) - keywords = get_first(video_details, 'keywords', expected_type=list) or [] if not keywords and webpage: keywords = [ @@ -2672,30 +3393,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if f.get('vcodec') != 'none': f['stretched_ratio'] = ratio break - - thumbnails = [] - thumbnail_dicts = traverse_obj( - (video_details, microformats), (..., ..., 'thumbnail', 'thumbnails', ...), - expected_type=dict, default=[]) - for thumbnail in thumbnail_dicts: - thumbnail_url = thumbnail.get('url') - if not thumbnail_url: - continue - # Sometimes youtube gives a wrong thumbnail URL. See: - # https://github.com/hypervideo/hypervideo/issues/233 - # https://github.com/ytdl-org/youtube-dl/issues/28023 - if 'maxresdefault' in thumbnail_url: - thumbnail_url = thumbnail_url.split('?')[0] - thumbnails.append({ - 'url': thumbnail_url, - 'height': int_or_none(thumbnail.get('height')), - 'width': int_or_none(thumbnail.get('width')), - }) + thumbnails = self._extract_thumbnails((video_details, microformats), (..., ..., 'thumbnail')) thumbnail_url = search_meta(['og:image', 'twitter:image']) if thumbnail_url: thumbnails.append({ 'url': thumbnail_url, }) + original_thumbnails = thumbnails.copy() + # The best resolution thumbnails sometimes does not appear in the webpage # See: https://github.com/ytdl-org/youtube-dl/issues/29049, https://github.com/hypervideo/hypervideo/issues/340 # List of possible thumbnails - Ref: <https://stackoverflow.com/a/20542029> @@ -2706,7 +3411,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'default', '1', '2', '3' ] n_thumbnail_names = len(thumbnail_names) - thumbnails.extend({ 'url': 'https://i.ytimg.com/vi{webp}/{video_id}/{name}{live}.{ext}'.format( video_id=video_id, name=name, ext=ext, @@ -2716,16 +3420,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): i = next((i for i, t in enumerate(thumbnail_names) if f'/{video_id}/{t}' in thumb['url']), n_thumbnail_names) thumb['preference'] = (0 if '.webp' in thumb['url'] else -1) - (2 * i) self._remove_duplicate_formats(thumbnails) + self._downloader._sort_thumbnails(original_thumbnails) category = get_first(microformats, 'category') or search_meta('genre') channel_id = str_or_none( get_first(video_details, 'channelId') or get_first(microformats, 'externalChannelId') or search_meta('channelId')) - duration = int_or_none( - get_first(video_details, 'lengthSeconds') - or get_first(microformats, 'lengthSeconds') - or parse_duration(search_meta('duration'))) or None owner_profile_url = get_first(microformats, 'ownerProfileUrl') live_content = get_first(video_details, 'isLiveContent') @@ 
-2735,25 +3436,34 @@ class YoutubeIE(YoutubeBaseInfoExtractor): is_live = False if is_upcoming is None and (live_content or is_live): is_upcoming = False - live_starttime = parse_iso8601(get_first(live_broadcast_details, 'startTimestamp')) - live_endtime = parse_iso8601(get_first(live_broadcast_details, 'endTimestamp')) - if not duration and live_endtime and live_starttime: - duration = live_endtime - live_starttime + live_start_time = parse_iso8601(get_first(live_broadcast_details, 'startTimestamp')) + live_end_time = parse_iso8601(get_first(live_broadcast_details, 'endTimestamp')) + if not duration and live_end_time and live_start_time: + duration = live_end_time - live_start_time + + if is_live and self.get_param('live_from_start'): + self._prepare_live_from_start_formats(formats, video_id, live_start_time, url, webpage_url, smuggled_data) + + formats.extend(self._extract_storyboard(player_responses, duration)) + + # Source is given priority since formats that throttle are given lower source_preference + # When throttling issue is fully fixed, remove this + self._sort_formats(formats, ('quality', 'res', 'fps', 'hdr:12', 'source', 'codec:vp9.2', 'lang', 'proto')) info = { 'id': video_id, - 'title': self._live_title(video_title) if is_live else video_title, + 'title': video_title, 'formats': formats, 'thumbnails': thumbnails, + # The best thumbnail that we are sure exists. Prevents unnecessary + # URL checking if user don't care about getting the best possible thumbnail + 'thumbnail': traverse_obj(original_thumbnails, (-1, 'url')), 'description': video_description, - 'upload_date': unified_strdate( - get_first(microformats, 'uploadDate') - or search_meta('uploadDate')), 'uploader': get_first(video_details, 'author'), 'uploader_id': self._search_regex(r'/(?:channel|user)/([^/?&#]+)', owner_profile_url, 'uploader id') if owner_profile_url else None, 'uploader_url': owner_profile_url, 'channel_id': channel_id, - 'channel_url': f'https://www.youtube.com/channel/{channel_id}' if channel_id else None, + 'channel_url': format_field(channel_id, template='https://www.youtube.com/channel/%s'), 'duration': duration, 'view_count': int_or_none( get_first((video_details, microformats), (..., 'viewCount')) @@ -2772,7 +3482,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else None if is_live is None or is_upcoming is None else live_content), 'live_status': 'is_upcoming' if is_upcoming else None, # rest will be set by YoutubeDL - 'release_timestamp': live_starttime, + 'release_timestamp': live_start_time, } pctr = traverse_obj(player_responses, (..., 'captions', 'playerCaptionsTracklistRenderer'), expected_type=dict) @@ -2797,13 +3507,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }) lang_subs.append({ 'ext': fmt, - 'url': update_url_query(base_url, query), + 'url': urljoin('https://www.youtube.com', update_url_query(base_url, query)), 'name': sub_name, }) subtitles, automatic_captions = {}, {} for lang_code, caption_track in captions.items(): base_url = caption_track.get('baseUrl') + orig_lang = parse_qs(base_url).get('lang', [None])[-1] if not base_url: continue lang_name = self._get_text(caption_track, 'name', max_runs=1) @@ -2817,11 +3528,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor): for trans_code, trans_name in translation_languages.items(): if not trans_code: continue + orig_trans_code = trans_code if caption_track.get('kind') != 'asr': + if 'translated_subs' in self._configuration_arg('skip'): + continue trans_code += f'-{lang_code}' trans_name += format_field(lang_name, template=' from %s') 
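The replacement just below only sends a tlang= parameter when the requested language differs from the track's original language, since asking YouTube to "translate" a track into its own language returns damaged subtitles; it also registers the untranslated track a second time under an "-orig" suffix so it can be told apart. A minimal sketch of the query rule (simplified, assuming plain un-suffixed language codes):

    def caption_query(orig_lang, trans_code):
        # request a server-side translation only for a genuinely different target
        return {} if orig_lang == trans_code else {'tlang': trans_code}

    assert caption_query('en', 'en') == {}
    assert caption_query('en', 'de') == {'tlang': 'de'}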
- process_language( - automatic_captions, base_url, trans_code, trans_name, {'tlang': trans_code}) + # Add an "-orig" label to the original language so that it can be distinguished. + # The subs are returned without "-orig" as well for compatibility + if lang_code == f'a-{orig_trans_code}': + process_language( + automatic_captions, base_url, f'{trans_code}-orig', f'{trans_name} (Original)', {}) + # Setting tlang=lang returns damaged subtitles. + process_language(automatic_captions, base_url, trans_code, trans_name, + {} if orig_lang == orig_trans_code else {'tlang': trans_code}) info['automatic_captions'] = automatic_captions info['subtitles'] = subtitles @@ -2884,87 +3604,101 @@ class YoutubeIE(YoutubeBaseInfoExtractor): or self._extract_chapters_from_engagement_panel(initial_data, duration) or None) - contents = try_get( - initial_data, - lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'], - list) or [] - for content in contents: - vpir = content.get('videoPrimaryInfoRenderer') - if vpir: - stl = vpir.get('superTitleLink') - if stl: - stl = self._get_text(stl) - if try_get( - vpir, - lambda x: x['superTitleIcon']['iconType']) == 'LOCATION_PIN': - info['location'] = stl - else: - mobj = re.search(r'(.+?)\s*S(\d+)\s*•\s*E(\d+)', stl) - if mobj: - info.update({ - 'series': mobj.group(1), - 'season_number': int(mobj.group(2)), - 'episode_number': int(mobj.group(3)), - }) - for tlb in (try_get( - vpir, - lambda x: x['videoActions']['menuRenderer']['topLevelButtons'], - list) or []): - tbr = tlb.get('toggleButtonRenderer') or {} - for getter, regex in [( - lambda x: x['defaultText']['accessibility']['accessibilityData'], - r'(?P<count>[\d,]+)\s*(?P<type>(?:dis)?like)'), ([ - lambda x: x['accessibility'], - lambda x: x['accessibilityData']['accessibilityData'], - ], r'(?P<type>(?:dis)?like) this video along with (?P<count>[\d,]+) other people')]: - label = (try_get(tbr, getter, dict) or {}).get('label') - if label: - mobj = re.match(regex, label) - if mobj: - info[mobj.group('type') + '_count'] = str_to_int(mobj.group('count')) - break - sbr_tooltip = try_get( - vpir, lambda x: x['sentimentBar']['sentimentBarRenderer']['tooltip']) - if sbr_tooltip: - like_count, dislike_count = sbr_tooltip.split(' / ') + contents = traverse_obj( + initial_data, ('contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents'), + expected_type=list, default=[]) + + vpir = get_first(contents, 'videoPrimaryInfoRenderer') + if vpir: + stl = vpir.get('superTitleLink') + if stl: + stl = self._get_text(stl) + if try_get( + vpir, + lambda x: x['superTitleIcon']['iconType']) == 'LOCATION_PIN': + info['location'] = stl + else: + mobj = re.search(r'(.+?)\s*S(\d+)\s*•\s*E(\d+)', stl) + if mobj: info.update({ - 'like_count': str_to_int(like_count), - 'dislike_count': str_to_int(dislike_count), + 'series': mobj.group(1), + 'season_number': int(mobj.group(2)), + 'episode_number': int(mobj.group(3)), }) - vsir = content.get('videoSecondaryInfoRenderer') - if vsir: - info['channel'] = self._get_text(vsir, ('owner', 'videoOwnerRenderer', 'title')) - rows = try_get( - vsir, - lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'], - list) or [] - multiple_songs = False - for row in rows: - if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True: - multiple_songs = True + for tlb in (try_get( + vpir, + lambda x: x['videoActions']['menuRenderer']['topLevelButtons'], + list) or []): + tbr = tlb.get('toggleButtonRenderer') or {} + for getter, 
regex in [( + lambda x: x['defaultText']['accessibility']['accessibilityData'], + r'(?P<count>[\d,]+)\s*(?P<type>(?:dis)?like)'), ([ + lambda x: x['accessibility'], + lambda x: x['accessibilityData']['accessibilityData'], + ], r'(?P<type>(?:dis)?like) this video along with (?P<count>[\d,]+) other people')]: + label = (try_get(tbr, getter, dict) or {}).get('label') + if label: + mobj = re.match(regex, label) + if mobj: + info[mobj.group('type') + '_count'] = str_to_int(mobj.group('count')) break - for row in rows: - mrr = row.get('metadataRowRenderer') or {} - mrr_title = mrr.get('title') - if not mrr_title: - continue - mrr_title = self._get_text(mrr, 'title') - mrr_contents_text = self._get_text(mrr, ('contents', 0)) - if mrr_title == 'License': - info['license'] = mrr_contents_text - elif not multiple_songs: - if mrr_title == 'Album': - info['album'] = mrr_contents_text - elif mrr_title == 'Artist': - info['artist'] = mrr_contents_text - elif mrr_title == 'Song': - info['track'] = mrr_contents_text + sbr_tooltip = try_get( + vpir, lambda x: x['sentimentBar']['sentimentBarRenderer']['tooltip']) + if sbr_tooltip: + like_count, dislike_count = sbr_tooltip.split(' / ') + info.update({ + 'like_count': str_to_int(like_count), + 'dislike_count': str_to_int(dislike_count), + }) + vsir = get_first(contents, 'videoSecondaryInfoRenderer') + if vsir: + vor = traverse_obj(vsir, ('owner', 'videoOwnerRenderer')) + info.update({ + 'channel': self._get_text(vor, 'title'), + 'channel_follower_count': self._get_count(vor, 'subscriberCountText')}) + + rows = try_get( + vsir, + lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'], + list) or [] + multiple_songs = False + for row in rows: + if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True: + multiple_songs = True + break + for row in rows: + mrr = row.get('metadataRowRenderer') or {} + mrr_title = mrr.get('title') + if not mrr_title: + continue + mrr_title = self._get_text(mrr, 'title') + mrr_contents_text = self._get_text(mrr, ('contents', 0)) + if mrr_title == 'License': + info['license'] = mrr_contents_text + elif not multiple_songs: + if mrr_title == 'Album': + info['album'] = mrr_contents_text + elif mrr_title == 'Artist': + info['artist'] = mrr_contents_text + elif mrr_title == 'Song': + info['track'] = mrr_contents_text fallbacks = { 'channel': 'uploader', 'channel_id': 'uploader_id', 'channel_url': 'uploader_url', } + + # The upload date for scheduled, live and past live streams / premieres in microformats + # may be different from the stream date. Although not in UTC, we will prefer it in this case. + # See: https://github.com/hypervideo/hypervideo/pull/2223#issuecomment-1008485139 + upload_date = ( + unified_strdate(get_first(microformats, 'uploadDate')) + or unified_strdate(search_meta('uploadDate'))) + if not upload_date or (not info.get('is_live') and not info.get('was_live') and info.get('live_status') != 'is_upcoming'): + upload_date = strftime_or_none(self._extract_time_text(vpir, 'dateText')[0], '%Y%m%d') + info['upload_date'] = upload_date + for to, frm in fallbacks.items(): if not info.get(to): info[to] = info.get(frm) @@ -3009,494 +3743,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return info -class YoutubeTabIE(YoutubeBaseInfoExtractor): - IE_DESC = 'YouTube.com tab' - _VALID_URL = r'''(?x) - https?:// - (?:\w+\.)? 
- (?: - youtube(?:kids)?\.com| - invidio\.us - )/ - (?: - (?P<channel_type>channel|c|user|browse)/| - (?P<not_channel> - feed/|hashtag/| - (?:playlist|watch)\?.*?\blist= - )| - (?!(?:%s)\b) # Direct URLs - ) - (?P<id>[^/?\#&]+) - ''' % YoutubeBaseInfoExtractor._RESERVED_NAMES - IE_NAME = 'youtube:tab' +class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): - _TESTS = [{ - 'note': 'playlists, multipage', - 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid', - 'playlist_mincount': 94, - 'info_dict': { - 'id': 'UCqj7Cz7revf5maW9g5pgNcg', - 'title': 'Игорь Клейнер - Playlists', - 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2', - 'uploader': 'Игорь Клейнер', - 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg', - }, - }, { - 'note': 'playlists, multipage, different order', - 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd', - 'playlist_mincount': 94, - 'info_dict': { - 'id': 'UCqj7Cz7revf5maW9g5pgNcg', - 'title': 'Игорь Клейнер - Playlists', - 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2', - 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg', - 'uploader': 'Игорь Клейнер', - }, - }, { - 'note': 'playlists, series', - 'url': 'https://www.youtube.com/c/3blue1brown/playlists?view=50&sort=dd&shelf_id=3', - 'playlist_mincount': 5, - 'info_dict': { - 'id': 'UCYO_jab_esuFRV4b17AJtAw', - 'title': '3Blue1Brown - Playlists', - 'description': 'md5:e1384e8a133307dd10edee76e875d62f', - 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw', - 'uploader': '3Blue1Brown', - }, - }, { - 'note': 'playlists, singlepage', - 'url': 'https://www.youtube.com/user/ThirstForScience/playlists', - 'playlist_mincount': 4, - 'info_dict': { - 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ', - 'title': 'ThirstForScience - Playlists', - 'description': 'md5:609399d937ea957b0f53cbffb747a14c', - 'uploader': 'ThirstForScience', - 'uploader_id': 'UCAEtajcuhQ6an9WEzY9LEMQ', - } - }, { - 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists', - 'only_matching': True, - }, { - 'note': 'basic, single video playlist', - 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', - 'info_dict': { - 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', - 'uploader': 'Sergey M.', - 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', - 'title': 'youtube-dl public playlist', - }, - 'playlist_count': 1, - }, { - 'note': 'empty playlist', - 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', - 'info_dict': { - 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', - 'uploader': 'Sergey M.', - 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', - 'title': 'youtube-dl empty playlist', - }, - 'playlist_count': 0, - }, { - 'note': 'Home tab', - 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured', - 'info_dict': { - 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'lex will - Home', - 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', - 'uploader': 'lex will', - 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - }, - 'playlist_mincount': 2, - }, { - 'note': 'Videos tab', - 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos', - 'info_dict': { - 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'lex will - Videos', - 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', - 'uploader': 'lex will', - 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - }, - 'playlist_mincount': 975, - }, { - 'note': 'Videos tab, sorted by popular', - 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid', - 'info_dict': { - 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 
'title': 'lex will - Videos', - 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', - 'uploader': 'lex will', - 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - }, - 'playlist_mincount': 199, - }, { - 'note': 'Playlists tab', - 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists', - 'info_dict': { - 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'lex will - Playlists', - 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', - 'uploader': 'lex will', - 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - }, - 'playlist_mincount': 17, - }, { - 'note': 'Community tab', - 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community', - 'info_dict': { - 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'lex will - Community', - 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', - 'uploader': 'lex will', - 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - }, - 'playlist_mincount': 18, - }, { - 'note': 'Channels tab', - 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels', - 'info_dict': { - 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'lex will - Channels', - 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', - 'uploader': 'lex will', - 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - }, - 'playlist_mincount': 12, - }, { - 'note': 'Search tab', - 'url': 'https://www.youtube.com/c/3blue1brown/search?query=linear%20algebra', - 'playlist_mincount': 40, - 'info_dict': { - 'id': 'UCYO_jab_esuFRV4b17AJtAw', - 'title': '3Blue1Brown - Search - linear algebra', - 'description': 'md5:e1384e8a133307dd10edee76e875d62f', - 'uploader': '3Blue1Brown', - 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw', - }, - }, { - 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA', - 'only_matching': True, - }, { - 'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', - 'only_matching': True, - }, { - 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', - 'only_matching': True, - }, { - 'note': 'Playlist with deleted videos (#651). 
As a bonus, the video #51 is also twice in this list.', - 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', - 'info_dict': { - 'title': '29C3: Not my department', - 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', - 'uploader': 'Christiaan008', - 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg', - 'description': 'md5:a14dc1a8ef8307a9807fe136a0660268', - }, - 'playlist_count': 96, - }, { - 'note': 'Large playlist', - 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q', - 'info_dict': { - 'title': 'Uploads from Cauchemar', - 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q', - 'uploader': 'Cauchemar', - 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q', - }, - 'playlist_mincount': 1123, - }, { - 'note': 'even larger playlist, 8832 videos', - 'url': 'http://www.youtube.com/user/NASAgovVideo/videos', - 'only_matching': True, - }, { - 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos', - 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA', - 'info_dict': { - 'title': 'Uploads from Interstellar Movie', - 'id': 'UUXw-G3eDE9trcvY2sBMM_aA', - 'uploader': 'Interstellar Movie', - 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA', - }, - 'playlist_mincount': 21, - }, { - 'note': 'Playlist with "show unavailable videos" button', - 'url': 'https://www.youtube.com/playlist?list=UUTYLiWFZy8xtPwxFwX9rV7Q', - 'info_dict': { - 'title': 'Uploads from Phim Siêu Nhân Nhật Bản', - 'id': 'UUTYLiWFZy8xtPwxFwX9rV7Q', - 'uploader': 'Phim Siêu Nhân Nhật Bản', - 'uploader_id': 'UCTYLiWFZy8xtPwxFwX9rV7Q', - }, - 'playlist_mincount': 200, - }, { - 'note': 'Playlist with unavailable videos in page 7', - 'url': 'https://www.youtube.com/playlist?list=UU8l9frL61Yl5KFOl87nIm2w', - 'info_dict': { - 'title': 'Uploads from BlankTV', - 'id': 'UU8l9frL61Yl5KFOl87nIm2w', - 'uploader': 'BlankTV', - 'uploader_id': 'UC8l9frL61Yl5KFOl87nIm2w', - }, - 'playlist_mincount': 1000, - }, { - 'note': 'https://github.com/ytdl-org/youtube-dl/issues/21844', - 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', - 'info_dict': { - 'title': 'Data Analysis with Dr Mike Pound', - 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', - 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA', - 'uploader': 'Computerphile', - 'description': 'md5:7f567c574d13d3f8c0954d9ffee4e487', - }, - 'playlist_mincount': 11, - }, { - 'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', - 'only_matching': True, - }, { - 'note': 'Playlist URL that does not actually serve a playlist', - 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4', - 'info_dict': { - 'id': 'FqZTN594JQw', - 'ext': 'webm', - 'title': "Smiley's People 01 detective, Adventure Series, Action", - 'uploader': 'STREEM', - 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng', - 'upload_date': '20150526', - 'license': 'Standard YouTube License', - 'description': 'md5:507cdcb5a49ac0da37a920ece610be80', - 'categories': ['People & Blogs'], - 'tags': list, - 'view_count': int, - 'like_count': int, - 'dislike_count': int, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'This video is not available.', - 'add_ie': [YoutubeIE.ie_key()], - }, { - 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM', - 'only_matching': True, - }, { - 'url': 
'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live', - 'info_dict': { - 'id': '3yImotZU3tw', # This will keep changing - 'ext': 'mp4', - 'title': compat_str, - 'uploader': 'Sky News', - 'uploader_id': 'skynews', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews', - 'upload_date': r're:\d{8}', - 'description': compat_str, - 'categories': ['News & Politics'], - 'tags': list, - 'like_count': int, - 'dislike_count': int, - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': ['Downloading just video ', 'Ignoring subtitle tracks found in '], - }, { - 'url': 'https://www.youtube.com/user/TheYoungTurks/live', - 'info_dict': { - 'id': 'a48o2S1cPoo', - 'ext': 'mp4', - 'title': 'The Young Turks - Live Main Show', - 'uploader': 'The Young Turks', - 'uploader_id': 'TheYoungTurks', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks', - 'upload_date': '20150715', - 'license': 'Standard YouTube License', - 'description': 'md5:438179573adcdff3c97ebb1ee632b891', - 'categories': ['News & Politics'], - 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'], - 'like_count': int, - 'dislike_count': int, - }, - 'params': { - 'skip_download': True, - }, - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/c/CommanderVideoHq/live', - 'only_matching': True, - }, { - 'note': 'A channel that is not live. Should raise error', - 'url': 'https://www.youtube.com/user/numberphile/live', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/feed/trending', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/feed/library', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/feed/history', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/feed/subscriptions', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/feed/watch_later', - 'only_matching': True, - }, { - 'note': 'Recommended - redirects to home page.', - 'url': 'https://www.youtube.com/feed/recommended', - 'only_matching': True, - }, { - 'note': 'inline playlist with not always working continuations', - 'url': 'https://www.youtube.com/watch?v=UC6u0Tct-Fo&list=PL36D642111D65BE7C', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/course', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/zsecurity', - 'only_matching': True, - }, { - 'url': 'http://www.youtube.com/NASAgovVideo/videos', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/TheYoungTurks/live', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/hashtag/cctv9', - 'info_dict': { - 'id': 'cctv9', - 'title': '#cctv9', - }, - 'playlist_mincount': 350, - }, { - 'url': 'https://www.youtube.com/watch?list=PLW4dVinRY435CBE_JD3t-0SRXKfnZHS1P&feature=youtu.be&v=M9cJMXmQ_ZU', - 'only_matching': True, - }, { - 'note': 'Requires Premium: should request additional YTM-info webpage (and have format 141) for videos in playlist', - 'url': 'https://music.youtube.com/playlist?list=PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq', - 'only_matching': True - }, { - 'note': '/browse/ should redirect to /channel/', - 'url': 'https://music.youtube.com/browse/UC1a8OFewdjuLq6KlF8M_8Ng', - 'only_matching': True - }, { - 'note': 'VLPL, should redirect to 
playlist?list=PL...', - 'url': 'https://music.youtube.com/browse/VLPLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq', - 'info_dict': { - 'id': 'PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq', - 'uploader': 'NoCopyrightSounds', - 'description': 'Providing you with copyright free / safe music for gaming, live streaming, studying and more!', - 'uploader_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg', - 'title': 'NCS Releases', - }, - 'playlist_mincount': 166, - }, { - 'note': 'Topic, should redirect to playlist?list=UU...', - 'url': 'https://music.youtube.com/browse/UC9ALqqC4aIeG5iDs7i90Bfw', - 'info_dict': { - 'id': 'UU9ALqqC4aIeG5iDs7i90Bfw', - 'uploader_id': 'UC9ALqqC4aIeG5iDs7i90Bfw', - 'title': 'Uploads from Royalty Free Music - Topic', - 'uploader': 'Royalty Free Music - Topic', - }, - 'expected_warnings': [ - 'A channel/user page was given', - 'The URL does not have a videos tab', - ], - 'playlist_mincount': 101, - }, { - 'note': 'Topic without a UU playlist', - 'url': 'https://www.youtube.com/channel/UCtFRv9O2AHqOZjjynzrv-xg', - 'info_dict': { - 'id': 'UCtFRv9O2AHqOZjjynzrv-xg', - 'title': 'UCtFRv9O2AHqOZjjynzrv-xg', - }, - 'expected_warnings': [ - 'A channel/user page was given', - 'The URL does not have a videos tab', - 'Falling back to channel URL', - ], - 'playlist_mincount': 9, - }, { - 'note': 'Youtube music Album', - 'url': 'https://music.youtube.com/browse/MPREb_gTAcphH99wE', - 'info_dict': { - 'id': 'OLAK5uy_l1m0thk3g31NmIIz_vMIbWtyv7eZixlH0', - 'title': 'Album - Royalty Free Music Library V2 (50 Songs)', - }, - 'playlist_count': 50, - }, { - 'note': 'unlisted single video playlist', - 'url': 'https://www.youtube.com/playlist?list=PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf', - 'info_dict': { - 'uploader_id': 'UC9zHu_mHU96r19o-wV5Qs1Q', - 'uploader': 'colethedj', - 'id': 'PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf', - 'title': 'hypervideo unlisted playlist test', - 'availability': 'unlisted' - }, - 'playlist_count': 1, - }, { - 'note': 'API Fallback: Recommended - redirects to home page. Requires visitorData', - 'url': 'https://www.youtube.com/feed/recommended', - 'info_dict': { - 'id': 'recommended', - 'title': 'recommended', - }, - 'playlist_mincount': 50, - 'params': { - 'skip_download': True, - 'extractor_args': {'youtubetab': {'skip': ['webpage']}} - }, - }, { - 'note': 'API Fallback: /videos tab, sorted by oldest first', - 'url': 'https://www.youtube.com/user/theCodyReeder/videos?view=0&sort=da&flow=grid', - 'info_dict': { - 'id': 'UCu6mSoMNzHQiBIOCkHUa2Aw', - 'title': 'Cody\'sLab - Videos', - 'description': 'md5:d083b7c2f0c67ee7a6c74c3e9b4243fa', - 'uploader': 'Cody\'sLab', - 'uploader_id': 'UCu6mSoMNzHQiBIOCkHUa2Aw', - }, - 'playlist_mincount': 650, - 'params': { - 'skip_download': True, - 'extractor_args': {'youtubetab': {'skip': ['webpage']}} - }, - }, { - 'note': 'API Fallback: Topic, should redirect to playlist?list=UU...', - 'url': 'https://music.youtube.com/browse/UC9ALqqC4aIeG5iDs7i90Bfw', - 'info_dict': { - 'id': 'UU9ALqqC4aIeG5iDs7i90Bfw', - 'uploader_id': 'UC9ALqqC4aIeG5iDs7i90Bfw', - 'title': 'Uploads from Royalty Free Music - Topic', - 'uploader': 'Royalty Free Music - Topic', - }, - 'expected_warnings': [ - 'A channel/user page was given', - 'The URL does not have a videos tab', - ], - 'playlist_mincount': 101, - 'params': { - 'skip_download': True, - 'extractor_args': {'youtubetab': {'skip': ['webpage']}} - }, - }] + @staticmethod + def passthrough_smuggled_data(func): + def _smuggle(entries, smuggled_data): + for entry in entries: + # TODO: Convert URL to music.youtube instead. 
+ # Do we need to passthrough any other smuggled_data? + entry['url'] = smuggle_url(entry['url'], smuggled_data) + yield entry - @classmethod - def suitable(cls, url): - return False if YoutubeIE.suitable(url) else super( - YoutubeTabIE, cls).suitable(url) + @functools.wraps(func) + def wrapper(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + if self.is_music_url(url): + smuggled_data['is_music_url'] = True + info_dict = func(self, url, smuggled_data) + if smuggled_data and info_dict.get('entries'): + info_dict['entries'] = _smuggle(info_dict['entries'], smuggled_data) + return info_dict + return wrapper def _extract_channel_id(self, webpage): channel_id = self._html_search_meta( @@ -3515,7 +3782,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): def _extract_basic_item_renderer(item): # Modified from _extract_grid_item_renderer known_basic_renderers = ( - 'playlistRenderer', 'videoRenderer', 'channelRenderer', 'showRenderer' + 'playlistRenderer', 'videoRenderer', 'channelRenderer', 'showRenderer', 'reelItemRenderer' ) for key, renderer in item.items(): if not isinstance(renderer, dict): @@ -3565,6 +3832,24 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): ep_url, ie=ie.ie_key(), video_id=ie._match_id(ep_url), video_title=title) break + def _music_reponsive_list_entry(self, renderer): + video_id = traverse_obj(renderer, ('playlistItemData', 'videoId')) + if video_id: + return self.url_result(f'https://music.youtube.com/watch?v={video_id}', + ie=YoutubeIE.ie_key(), video_id=video_id) + playlist_id = traverse_obj(renderer, ('navigationEndpoint', 'watchEndpoint', 'playlistId')) + if playlist_id: + video_id = traverse_obj(renderer, ('navigationEndpoint', 'watchEndpoint', 'videoId')) + if video_id: + return self.url_result(f'https://music.youtube.com/watch?v={video_id}&list={playlist_id}', + ie=YoutubeTabIE.ie_key(), video_id=playlist_id) + return self.url_result(f'https://music.youtube.com/playlist?list={playlist_id}', + ie=YoutubeTabIE.ie_key(), video_id=playlist_id) + browse_id = traverse_obj(renderer, ('navigationEndpoint', 'browseEndpoint', 'browseId')) + if browse_id: + return self.url_result(f'https://music.youtube.com/browse/{browse_id}', + ie=YoutubeTabIE.ie_key(), video_id=browse_id) + def _shelf_entries_from_content(self, shelf_renderer): content = shelf_renderer.get('content') if not isinstance(content, dict): @@ -3623,6 +3908,13 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): if video_id: return self._extract_video(video_renderer) + def _hashtag_tile_entry(self, hashtag_tile_renderer): + url = urljoin('https://youtube.com', traverse_obj( + hashtag_tile_renderer, ('onTapCommand', 'commandMetadata', 'webCommandMetadata', 'url'))) + if url: + return self.url_result( + url, ie=YoutubeTabIE.ie_key(), title=self._get_text(hashtag_tile_renderer, 'hashtag')) + def _post_thread_entries(self, post_thread_renderer): post_renderer = try_get( post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict) @@ -3679,49 +3971,59 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): if entry: yield entry ''' - def _entries(self, tab, item_id, ytcfg, account_syncid, visitor_data): - def extract_entries(parent_renderer): # this needs to called again for continuation to work with feeds - contents = try_get(parent_renderer, lambda x: x['contents'], list) or [] - for content in contents: - if not isinstance(content, dict): - continue - is_renderer = try_get(content, lambda x: x['itemSectionRenderer'], dict) - if not is_renderer: - renderer = content.get('richItemRenderer') - if renderer: 
- for entry in self._rich_entries(renderer): - yield entry - continuation_list[0] = self._extract_continuation(parent_renderer) + def _extract_entries(self, parent_renderer, continuation_list): + # continuation_list is modified in-place with continuation_list = [continuation_token] + continuation_list[:] = [None] + contents = try_get(parent_renderer, lambda x: x['contents'], list) or [] + for content in contents: + if not isinstance(content, dict): + continue + is_renderer = traverse_obj( + content, 'itemSectionRenderer', 'musicShelfRenderer', 'musicShelfContinuation', + expected_type=dict) + if not is_renderer: + renderer = content.get('richItemRenderer') + if renderer: + for entry in self._rich_entries(renderer): + yield entry + continuation_list[0] = self._extract_continuation(parent_renderer) + continue + isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or [] + for isr_content in isr_contents: + if not isinstance(isr_content, dict): continue - isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or [] - for isr_content in isr_contents: - if not isinstance(isr_content, dict): - continue - - known_renderers = { - 'playlistVideoListRenderer': self._playlist_entries, - 'gridRenderer': self._grid_entries, - 'shelfRenderer': lambda x: self._shelf_entries(x, tab.get('title') != 'Channels'), - 'backstagePostThreadRenderer': self._post_thread_entries, - 'videoRenderer': lambda x: [self._video_entry(x)], - } - for key, renderer in isr_content.items(): - if key not in known_renderers: - continue - for entry in known_renderers[key](renderer): - if entry: - yield entry - continuation_list[0] = self._extract_continuation(renderer) - break - if not continuation_list[0]: - continuation_list[0] = self._extract_continuation(is_renderer) + known_renderers = { + 'playlistVideoListRenderer': self._playlist_entries, + 'gridRenderer': self._grid_entries, + 'reelShelfRenderer': self._grid_entries, + 'shelfRenderer': self._shelf_entries, + 'musicResponsiveListItemRenderer': lambda x: [self._music_reponsive_list_entry(x)], + 'backstagePostThreadRenderer': self._post_thread_entries, + 'videoRenderer': lambda x: [self._video_entry(x)], + 'playlistRenderer': lambda x: self._grid_entries({'items': [{'playlistRenderer': x}]}), + 'channelRenderer': lambda x: self._grid_entries({'items': [{'channelRenderer': x}]}), + 'hashtagTileRenderer': lambda x: [self._hashtag_tile_entry(x)] + } + for key, renderer in isr_content.items(): + if key not in known_renderers: + continue + for entry in known_renderers[key](renderer): + if entry: + yield entry + continuation_list[0] = self._extract_continuation(renderer) + break if not continuation_list[0]: - continuation_list[0] = self._extract_continuation(parent_renderer) + continuation_list[0] = self._extract_continuation(is_renderer) + + if not continuation_list[0]: + continuation_list[0] = self._extract_continuation(parent_renderer) - continuation_list = [None] # Python 2 does not support nonlocal + def _entries(self, tab, item_id, ytcfg, account_syncid, visitor_data): + continuation_list = [None] + extract_entries = lambda x: self._extract_entries(x, continuation_list) tab_content = try_get(tab, lambda x: x['content'], dict) if not tab_content: return @@ -3770,6 +4072,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): continue known_renderers = { + 'videoRenderer': (self._grid_entries, 'items'), # for membership tab 'gridPlaylistRenderer': (self._grid_entries, 'items'), 'gridVideoRenderer': (self._grid_entries, 'items'), 'gridChannelRenderer': 
(self._grid_entries, 'items'), @@ -3797,13 +4100,14 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): break @staticmethod - def _extract_selected_tab(tabs): + def _extract_selected_tab(tabs, fatal=True): for tab in tabs: renderer = dict_get(tab, ('tabRenderer', 'expandableTabRenderer')) or {} if renderer.get('selected') is True: return renderer else: - raise ExtractorError('Unable to find selected tab') + if fatal: + raise ExtractorError('Unable to find selected tab') @classmethod def _extract_uploader(cls, data): @@ -3822,10 +4126,10 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): def _extract_from_tabs(self, item_id, ytcfg, data, tabs): playlist_id = title = description = channel_url = channel_name = channel_id = None - thumbnails_list = [] tags = [] selected_tab = self._extract_selected_tab(tabs) + primary_sidebar_renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer') renderer = try_get( data, lambda x: x['metadata']['channelMetadataRenderer'], dict) if renderer: @@ -3841,34 +4145,49 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): description = renderer.get('description', '') playlist_id = channel_id tags = renderer.get('keywords', '').split() - thumbnails_list = ( - try_get(renderer, lambda x: x['avatar']['thumbnails'], list) - or try_get( - self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer'), - lambda x: x['thumbnailRenderer']['playlistVideoThumbnailRenderer']['thumbnail']['thumbnails'], - list) - or []) - thumbnails = [] - for t in thumbnails_list: - if not isinstance(t, dict): - continue - thumbnail_url = url_or_none(t.get('url')) - if not thumbnail_url: - continue - thumbnails.append({ - 'url': thumbnail_url, - 'width': int_or_none(t.get('width')), - 'height': int_or_none(t.get('height')), - }) + # We can get the uncropped banner/avatar by replacing the crop params with '=s0' + # See: https://github.com/hypervideo/hypervideo/issues/2237#issuecomment-1013694714 + def _get_uncropped(url): + return url_or_none((url or '').split('=')[0] + '=s0') + + avatar_thumbnails = self._extract_thumbnails(renderer, 'avatar') + if avatar_thumbnails: + uncropped_avatar = _get_uncropped(avatar_thumbnails[0]['url']) + if uncropped_avatar: + avatar_thumbnails.append({ + 'url': uncropped_avatar, + 'id': 'avatar_uncropped', + 'preference': 1 + }) + + channel_banners = self._extract_thumbnails( + data, ('header', ..., ['banner', 'mobileBanner', 'tvBanner'])) + for banner in channel_banners: + banner['preference'] = -10 + + if channel_banners: + uncropped_banner = _get_uncropped(channel_banners[0]['url']) + if uncropped_banner: + channel_banners.append({ + 'url': uncropped_banner, + 'id': 'banner_uncropped', + 'preference': -5 + }) + + primary_thumbnails = self._extract_thumbnails( + primary_sidebar_renderer, ('thumbnailRenderer', ('playlistVideoThumbnailRenderer', 'playlistCustomThumbnailRenderer'), 'thumbnail')) + if playlist_id is None: playlist_id = item_id + + playlist_stats = traverse_obj(primary_sidebar_renderer, 'stats') + last_updated_unix, _ = self._extract_time_text(playlist_stats, 2) if title is None: - title = ( - try_get(data, lambda x: x['header']['hashtagHeaderRenderer']['hashtag']['simpleText']) - or playlist_id) + title = self._get_text(data, ('header', 'hashtagHeaderRenderer', 'hashtag')) or playlist_id title += format_field(selected_tab, 'title', ' - %s') title += format_field(selected_tab, 'expandedText', ' - %s') + metadata = { 'playlist_id': playlist_id, 'playlist_title': title, @@ -3876,12 +4195,14 @@ class 
YoutubeTabIE(YoutubeBaseInfoExtractor): 'uploader': channel_name, 'uploader_id': channel_id, 'uploader_url': channel_url, - 'thumbnails': thumbnails, + 'thumbnails': primary_thumbnails + avatar_thumbnails + channel_banners, 'tags': tags, + 'view_count': self._get_count(playlist_stats, 1), + 'availability': self._extract_availability(data), + 'modified_date': strftime_or_none(last_updated_unix, '%Y%m%d'), + 'playlist_count': self._get_count(playlist_stats, 0), + 'channel_follower_count': self._get_count(data, ('header', ..., 'subscriberCountText')), } - availability = self._extract_availability(data) - if availability: - metadata['availability'] = availability if not channel_id: metadata.update(self._extract_uploader(data)) metadata.update({ @@ -4059,7 +4380,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): self.report_warning(error_to_compat_str(e)) break - if dict_get(data, ('contents', 'currentVideoEndpoint')): + if dict_get(data, ('contents', 'currentVideoEndpoint', 'onResponseReceivedActions')): break last_error = 'Incomplete yt initial data received' @@ -4076,6 +4397,16 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): if 'webpage' not in self._configuration_arg('skip'): webpage, data = self._extract_webpage(url, item_id, fatal=webpage_fatal) ytcfg = ytcfg or self.extract_ytcfg(item_id, webpage) + # Reject webpage data if redirected to home page without explicitly requesting + selected_tab = self._extract_selected_tab(traverse_obj( + data, ('contents', 'twoColumnBrowseResultsRenderer', 'tabs'), expected_type=list, default=[]), fatal=False) or {} + if (url != 'https://www.youtube.com/feed/recommended' + and selected_tab.get('tabIdentifier') == 'FEwhat_to_watch' # Home page + and 'no-youtube-channel-redirect' not in self.get_param('compat_opts', [])): + msg = 'The channel/playlist does not exist and the URL redirected to youtube.com home page' + if fatal: + raise ExtractorError(msg, expected=True) + self.report_warning(msg, only_once=True) if not data: if not ytcfg and self.is_authenticated: msg = 'Playlists that require authentication may not extract correctly without a successful webpage download.' 
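Two details of the refactored tab extraction above deserve a note. First, `_extract_entries` is a generator, so it cannot hand the continuation token back through a return value while entries are still being yielded; instead the caller passes in a one-element list that the generator rewrites in place (`continuation_list[:] = [None]`, then `continuation_list[0] = ...`). A minimal self-contained sketch of the same pattern — the page payloads here are invented for illustration:

def extract_entries(page, continuation_list):
    # Reset the shared slot, yield the entries, then publish the next token.
    continuation_list[:] = [None]
    for item in page.get('items', []):
        yield item
    continuation_list[0] = page.get('continuation')

def all_entries(pages):
    # pages: dict mapping continuation token -> page payload
    continuation_list = [None]
    token = 'start'
    while token:
        yield from extract_entries(pages[token], continuation_list)
        token = continuation_list[0]

pages = {
    'start': {'items': [1, 2], 'continuation': 'next'},
    'next': {'items': [3], 'continuation': None},
}
assert list(all_entries(pages)) == [1, 2, 3]

The same one-element-list convention is reused by `_search_results` further below. Second, the `_get_uncropped` helper relies on Google-hosted channel art carrying its crop/size parameters after an '=' in the URL, so truncating at the '=' and appending '=s0' requests the original, uncropped image (per the comment and issue link in the hunk above). A standalone version — the sample URL is illustrative, not taken from a real channel:

def get_uncropped(url):
    # '=s0' asks the image server for the uncropped original
    return url.split('=')[0] + '=s0' if url else None

assert (get_uncropped('https://yt3.ggpht.com/abc123=s176-c-k-c0x00ffffff-no-rj')
        == 'https://yt3.ggpht.com/abc123=s0')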
@@ -4100,67 +4431,756 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): return self._extract_response( item_id=item_id, query=params, ep=ep, headers=headers, ytcfg=ytcfg, fatal=fatal, default_client=default_client, - check_get_keys=('contents', 'currentVideoEndpoint')) + check_get_keys=('contents', 'currentVideoEndpoint', 'onResponseReceivedActions')) err_note = 'Failed to resolve url (does the playlist exist?)' if fatal: raise ExtractorError(err_note, expected=True) self.report_warning(err_note, item_id) - @staticmethod - def _smuggle_data(entries, data): - for entry in entries: - if data: - entry['url'] = smuggle_url(entry['url'], data) - yield entry + _SEARCH_PARAMS = None + + def _search_results(self, query, params=NO_DEFAULT, default_client='web'): + data = {'query': query} + if params is NO_DEFAULT: + params = self._SEARCH_PARAMS + if params: + data['params'] = params + + content_keys = ( + ('contents', 'twoColumnSearchResultsRenderer', 'primaryContents', 'sectionListRenderer', 'contents'), + ('onResponseReceivedCommands', 0, 'appendContinuationItemsAction', 'continuationItems'), + # ytmusic search + ('contents', 'tabbedSearchResultsRenderer', 'tabs', 0, 'tabRenderer', 'content', 'sectionListRenderer', 'contents'), + ('continuationContents', ), + ) + check_get_keys = tuple(set(keys[0] for keys in content_keys)) + + continuation_list = [None] + for page_num in itertools.count(1): + data.update(continuation_list[0] or {}) + search = self._extract_response( + item_id='query "%s" page %s' % (query, page_num), ep='search', query=data, + default_client=default_client, check_get_keys=check_get_keys) + slr_contents = traverse_obj(search, *content_keys) + yield from self._extract_entries({'contents': list(variadic(slr_contents))}, continuation_list) + if not continuation_list[0]: + break - def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url, {}) - if self.is_music_url(url): - smuggled_data['is_music_url'] = True - info_dict = self.__real_extract(url, smuggled_data) - if info_dict.get('entries'): - info_dict['entries'] = self._smuggle_data(info_dict['entries'], smuggled_data) - return info_dict - _url_re = re.compile(r'(?P<pre>%s)(?(channel_type)(?P<tab>/\w+))?(?P<post>.*)$' % _VALID_URL) +class YoutubeTabIE(YoutubeTabBaseInfoExtractor): + IE_DESC = 'YouTube Tabs' + _VALID_URL = r'''(?x: + https?:// + (?:\w+\.)? 
+ (?: + youtube(?:kids)?\.com| + %(invidious)s + )/ + (?: + (?P<channel_type>channel|c|user|browse)/| + (?P<not_channel> + feed/|hashtag/| + (?:playlist|watch)\?.*?\blist= + )| + (?!(?:%(reserved_names)s)\b) # Direct URLs + ) + (?P<id>[^/?\#&]+) + )''' % { + 'reserved_names': YoutubeBaseInfoExtractor._RESERVED_NAMES, + 'invidious': '|'.join(YoutubeBaseInfoExtractor._INVIDIOUS_SITES), + } + IE_NAME = 'youtube:tab' - def __real_extract(self, url, smuggled_data): + _TESTS = [{ + 'note': 'playlists, multipage', + 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid', + 'playlist_mincount': 94, + 'info_dict': { + 'id': 'UCqj7Cz7revf5maW9g5pgNcg', + 'title': 'Igor Kleiner - Playlists', + 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2', + 'uploader': 'Igor Kleiner', + 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg', + 'channel': 'Igor Kleiner', + 'channel_id': 'UCqj7Cz7revf5maW9g5pgNcg', + 'tags': ['"критическое', 'мышление"', '"наука', 'просто"', 'математика', '"анализ', 'данных"'], + 'channel_url': 'https://www.youtube.com/channel/UCqj7Cz7revf5maW9g5pgNcg', + 'uploader_url': 'https://www.youtube.com/channel/UCqj7Cz7revf5maW9g5pgNcg', + 'channel_follower_count': int + }, + }, { + 'note': 'playlists, multipage, different order', + 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd', + 'playlist_mincount': 94, + 'info_dict': { + 'id': 'UCqj7Cz7revf5maW9g5pgNcg', + 'title': 'Igor Kleiner - Playlists', + 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2', + 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg', + 'uploader': 'Igor Kleiner', + 'uploader_url': 'https://www.youtube.com/channel/UCqj7Cz7revf5maW9g5pgNcg', + 'tags': ['"критическое', 'мышление"', '"наука', 'просто"', 'математика', '"анализ', 'данных"'], + 'channel_id': 'UCqj7Cz7revf5maW9g5pgNcg', + 'channel': 'Igor Kleiner', + 'channel_url': 'https://www.youtube.com/channel/UCqj7Cz7revf5maW9g5pgNcg', + 'channel_follower_count': int + }, + }, { + 'note': 'playlists, series', + 'url': 'https://www.youtube.com/c/3blue1brown/playlists?view=50&sort=dd&shelf_id=3', + 'playlist_mincount': 5, + 'info_dict': { + 'id': 'UCYO_jab_esuFRV4b17AJtAw', + 'title': '3Blue1Brown - Playlists', + 'description': 'md5:e1384e8a133307dd10edee76e875d62f', + 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw', + 'uploader': '3Blue1Brown', + 'channel_url': 'https://www.youtube.com/channel/UCYO_jab_esuFRV4b17AJtAw', + 'uploader_url': 'https://www.youtube.com/channel/UCYO_jab_esuFRV4b17AJtAw', + 'channel': '3Blue1Brown', + 'channel_id': 'UCYO_jab_esuFRV4b17AJtAw', + 'tags': ['Mathematics'], + 'channel_follower_count': int + }, + }, { + 'note': 'playlists, singlepage', + 'url': 'https://www.youtube.com/user/ThirstForScience/playlists', + 'playlist_mincount': 4, + 'info_dict': { + 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ', + 'title': 'ThirstForScience - Playlists', + 'description': 'md5:609399d937ea957b0f53cbffb747a14c', + 'uploader': 'ThirstForScience', + 'uploader_id': 'UCAEtajcuhQ6an9WEzY9LEMQ', + 'uploader_url': 'https://www.youtube.com/channel/UCAEtajcuhQ6an9WEzY9LEMQ', + 'channel_url': 'https://www.youtube.com/channel/UCAEtajcuhQ6an9WEzY9LEMQ', + 'channel_id': 'UCAEtajcuhQ6an9WEzY9LEMQ', + 'tags': 'count:13', + 'channel': 'ThirstForScience', + 'channel_follower_count': int + } + }, { + 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists', + 'only_matching': True, + }, { + 'note': 'basic, single video playlist', + 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', + 'info_dict': { + 'uploader_id': 
'UCmlqkdCBesrv2Lak1mF_MxA', + 'uploader': 'Sergey M.', + 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', + 'title': 'youtube-dl public playlist', + 'description': '', + 'tags': [], + 'view_count': int, + 'modified_date': '20201130', + 'channel': 'Sergey M.', + 'channel_id': 'UCmlqkdCBesrv2Lak1mF_MxA', + 'uploader_url': 'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', + 'channel_url': 'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', + }, + 'playlist_count': 1, + }, { + 'note': 'empty playlist', + 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', + 'info_dict': { + 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', + 'uploader': 'Sergey M.', + 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', + 'title': 'youtube-dl empty playlist', + 'tags': [], + 'channel': 'Sergey M.', + 'description': '', + 'modified_date': '20160902', + 'channel_id': 'UCmlqkdCBesrv2Lak1mF_MxA', + 'channel_url': 'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', + 'uploader_url': 'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', + }, + 'playlist_count': 0, + }, { + 'note': 'Home tab', + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Home', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + 'uploader': 'lex will', + 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'channel': 'lex will', + 'tags': ['bible', 'history', 'prophesy'], + 'uploader_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', + 'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', + 'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'channel_follower_count': int + }, + 'playlist_mincount': 2, + }, { + 'note': 'Videos tab', + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Videos', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + 'uploader': 'lex will', + 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'tags': ['bible', 'history', 'prophesy'], + 'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', + 'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'uploader_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', + 'channel': 'lex will', + 'channel_follower_count': int + }, + 'playlist_mincount': 975, + }, { + 'note': 'Videos tab, sorted by popular', + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Videos', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + 'uploader': 'lex will', + 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'uploader_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', + 'channel': 'lex will', + 'tags': ['bible', 'history', 'prophesy'], + 'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', + 'channel_follower_count': int + }, + 'playlist_mincount': 199, + }, { + 'note': 'Playlists tab', + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Playlists', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + 'uploader': 'lex will', + 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'uploader_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', + 'channel': 'lex will', + 'channel_url': 
'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', + 'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'tags': ['bible', 'history', 'prophesy'], + 'channel_follower_count': int + }, + 'playlist_mincount': 17, + }, { + 'note': 'Community tab', + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Community', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + 'uploader': 'lex will', + 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'uploader_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', + 'channel': 'lex will', + 'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', + 'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'tags': ['bible', 'history', 'prophesy'], + 'channel_follower_count': int + }, + 'playlist_mincount': 18, + }, { + 'note': 'Channels tab', + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Channels', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + 'uploader': 'lex will', + 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'uploader_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', + 'channel': 'lex will', + 'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', + 'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'tags': ['bible', 'history', 'prophesy'], + 'channel_follower_count': int + }, + 'playlist_mincount': 12, + }, { + 'note': 'Search tab', + 'url': 'https://www.youtube.com/c/3blue1brown/search?query=linear%20algebra', + 'playlist_mincount': 40, + 'info_dict': { + 'id': 'UCYO_jab_esuFRV4b17AJtAw', + 'title': '3Blue1Brown - Search - linear algebra', + 'description': 'md5:e1384e8a133307dd10edee76e875d62f', + 'uploader': '3Blue1Brown', + 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw', + 'channel_url': 'https://www.youtube.com/channel/UCYO_jab_esuFRV4b17AJtAw', + 'uploader_url': 'https://www.youtube.com/channel/UCYO_jab_esuFRV4b17AJtAw', + 'tags': ['Mathematics'], + 'channel': '3Blue1Brown', + 'channel_id': 'UCYO_jab_esuFRV4b17AJtAw', + 'channel_follower_count': int + }, + }, { + 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA', + 'only_matching': True, + }, { + 'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', + 'only_matching': True, + }, { + 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', + 'only_matching': True, + }, { + 'note': 'Playlist with deleted videos (#651). 
As a bonus, the video #51 is also twice in this list.', + 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', + 'info_dict': { + 'title': '29C3: Not my department', + 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', + 'uploader': 'Christiaan008', + 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg', + 'description': 'md5:a14dc1a8ef8307a9807fe136a0660268', + 'tags': [], + 'uploader_url': 'https://www.youtube.com/c/ChRiStIaAn008', + 'view_count': int, + 'modified_date': '20150605', + 'channel_id': 'UCEPzS1rYsrkqzSLNp76nrcg', + 'channel_url': 'https://www.youtube.com/c/ChRiStIaAn008', + 'channel': 'Christiaan008', + }, + 'playlist_count': 96, + }, { + 'note': 'Large playlist', + 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q', + 'info_dict': { + 'title': 'Uploads from Cauchemar', + 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q', + 'uploader': 'Cauchemar', + 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q', + 'channel_url': 'https://www.youtube.com/c/Cauchemar89', + 'tags': [], + 'modified_date': r're:\d{8}', + 'channel': 'Cauchemar', + 'uploader_url': 'https://www.youtube.com/c/Cauchemar89', + 'view_count': int, + 'description': '', + 'channel_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q', + }, + 'playlist_mincount': 1123, + 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], + }, { + 'note': 'even larger playlist, 8832 videos', + 'url': 'http://www.youtube.com/user/NASAgovVideo/videos', + 'only_matching': True, + }, { + 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos', + 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA', + 'info_dict': { + 'title': 'Uploads from Interstellar Movie', + 'id': 'UUXw-G3eDE9trcvY2sBMM_aA', + 'uploader': 'Interstellar Movie', + 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA', + 'uploader_url': 'https://www.youtube.com/c/InterstellarMovie', + 'tags': [], + 'view_count': int, + 'channel_id': 'UCXw-G3eDE9trcvY2sBMM_aA', + 'channel_url': 'https://www.youtube.com/c/InterstellarMovie', + 'channel': 'Interstellar Movie', + 'description': '', + 'modified_date': r're:\d{8}', + }, + 'playlist_mincount': 21, + }, { + 'note': 'Playlist with "show unavailable videos" button', + 'url': 'https://www.youtube.com/playlist?list=UUTYLiWFZy8xtPwxFwX9rV7Q', + 'info_dict': { + 'title': 'Uploads from Phim Siêu Nhân Nhật Bản', + 'id': 'UUTYLiWFZy8xtPwxFwX9rV7Q', + 'uploader': 'Phim Siêu Nhân Nhật Bản', + 'uploader_id': 'UCTYLiWFZy8xtPwxFwX9rV7Q', + 'view_count': int, + 'channel': 'Phim Siêu Nhân Nhật Bản', + 'tags': [], + 'uploader_url': 'https://www.youtube.com/channel/UCTYLiWFZy8xtPwxFwX9rV7Q', + 'description': '', + 'channel_url': 'https://www.youtube.com/channel/UCTYLiWFZy8xtPwxFwX9rV7Q', + 'channel_id': 'UCTYLiWFZy8xtPwxFwX9rV7Q', + 'modified_date': r're:\d{8}', + }, + 'playlist_mincount': 200, + 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], + }, { + 'note': 'Playlist with unavailable videos in page 7', + 'url': 'https://www.youtube.com/playlist?list=UU8l9frL61Yl5KFOl87nIm2w', + 'info_dict': { + 'title': 'Uploads from BlankTV', + 'id': 'UU8l9frL61Yl5KFOl87nIm2w', + 'uploader': 'BlankTV', + 'uploader_id': 'UC8l9frL61Yl5KFOl87nIm2w', + 'channel': 'BlankTV', + 'channel_url': 'https://www.youtube.com/c/blanktv', + 'channel_id': 'UC8l9frL61Yl5KFOl87nIm2w', + 'view_count': int, + 'tags': [], + 'uploader_url': 'https://www.youtube.com/c/blanktv', + 'modified_date': r're:\d{8}', + 'description': '', + }, + 'playlist_mincount': 1000, + 'expected_warnings': [r'[Uu]navailable videos 
(are|will be) hidden'], + }, { + 'note': 'https://github.com/ytdl-org/youtube-dl/issues/21844', + 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', + 'info_dict': { + 'title': 'Data Analysis with Dr Mike Pound', + 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', + 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA', + 'uploader': 'Computerphile', + 'description': 'md5:7f567c574d13d3f8c0954d9ffee4e487', + 'uploader_url': 'https://www.youtube.com/user/Computerphile', + 'tags': [], + 'view_count': int, + 'channel_id': 'UC9-y-6csu5WGm29I7JiwpnA', + 'channel_url': 'https://www.youtube.com/user/Computerphile', + 'channel': 'Computerphile', + }, + 'playlist_mincount': 11, + }, { + 'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', + 'only_matching': True, + }, { + 'note': 'Playlist URL that does not actually serve a playlist', + 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4', + 'info_dict': { + 'id': 'FqZTN594JQw', + 'ext': 'webm', + 'title': "Smiley's People 01 detective, Adventure Series, Action", + 'uploader': 'STREEM', + 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng', + 'upload_date': '20150526', + 'license': 'Standard YouTube License', + 'description': 'md5:507cdcb5a49ac0da37a920ece610be80', + 'categories': ['People & Blogs'], + 'tags': list, + 'view_count': int, + 'like_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'This video is not available.', + 'add_ie': [YoutubeIE.ie_key()], + }, { + 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live', + 'info_dict': { + 'id': 'GgL890LIznQ', # This will keep changing + 'ext': 'mp4', + 'title': str, + 'uploader': 'Sky News', + 'uploader_id': 'skynews', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews', + 'upload_date': r're:\d{8}', + 'description': str, + 'categories': ['News & Politics'], + 'tags': list, + 'like_count': int, + 'release_timestamp': 1642502819, + 'channel': 'Sky News', + 'channel_id': 'UCoMdktPbSTixAyNGwb-UYkQ', + 'age_limit': 0, + 'view_count': int, + 'thumbnail': 'https://i.ytimg.com/vi/GgL890LIznQ/maxresdefault_live.jpg', + 'playable_in_embed': True, + 'release_date': '20220118', + 'availability': 'public', + 'live_status': 'is_live', + 'channel_url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ', + 'channel_follower_count': int + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Ignoring subtitle tracks found in '], + }, { + 'url': 'https://www.youtube.com/user/TheYoungTurks/live', + 'info_dict': { + 'id': 'a48o2S1cPoo', + 'ext': 'mp4', + 'title': 'The Young Turks - Live Main Show', + 'uploader': 'The Young Turks', + 'uploader_id': 'TheYoungTurks', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks', + 'upload_date': '20150715', + 'license': 'Standard YouTube License', + 'description': 'md5:438179573adcdff3c97ebb1ee632b891', + 'categories': ['News & Politics'], + 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'], + 'like_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'only_matching': True, + }, { + 'url': 
'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/c/CommanderVideoHq/live', + 'only_matching': True, + }, { + 'note': 'A channel that is not live. Should raise error', + 'url': 'https://www.youtube.com/user/numberphile/live', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/feed/trending', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/feed/library', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/feed/history', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/feed/subscriptions', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/feed/watch_later', + 'only_matching': True, + }, { + 'note': 'Recommended - redirects to home page.', + 'url': 'https://www.youtube.com/feed/recommended', + 'only_matching': True, + }, { + 'note': 'inline playlist with not always working continuations', + 'url': 'https://www.youtube.com/watch?v=UC6u0Tct-Fo&list=PL36D642111D65BE7C', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/course', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/zsecurity', + 'only_matching': True, + }, { + 'url': 'http://www.youtube.com/NASAgovVideo/videos', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/TheYoungTurks/live', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/hashtag/cctv9', + 'info_dict': { + 'id': 'cctv9', + 'title': '#cctv9', + 'tags': [], + }, + 'playlist_mincount': 350, + }, { + 'url': 'https://www.youtube.com/watch?list=PLW4dVinRY435CBE_JD3t-0SRXKfnZHS1P&feature=youtu.be&v=M9cJMXmQ_ZU', + 'only_matching': True, + }, { + 'note': 'Requires Premium: should request additional YTM-info webpage (and have format 141) for videos in playlist', + 'url': 'https://music.youtube.com/playlist?list=PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq', + 'only_matching': True + }, { + 'note': '/browse/ should redirect to /channel/', + 'url': 'https://music.youtube.com/browse/UC1a8OFewdjuLq6KlF8M_8Ng', + 'only_matching': True + }, { + 'note': 'VLPL, should redirect to playlist?list=PL...', + 'url': 'https://music.youtube.com/browse/VLPLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq', + 'info_dict': { + 'id': 'PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq', + 'uploader': 'NoCopyrightSounds', + 'description': 'Providing you with copyright free / safe music for gaming, live streaming, studying and more!', + 'uploader_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg', + 'title': 'NCS Releases', + 'uploader_url': 'https://www.youtube.com/c/NoCopyrightSounds', + 'channel_url': 'https://www.youtube.com/c/NoCopyrightSounds', + 'modified_date': r're:\d{8}', + 'view_count': int, + 'channel_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg', + 'tags': [], + 'channel': 'NoCopyrightSounds', + }, + 'playlist_mincount': 166, + 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], + }, { + 'note': 'Topic, should redirect to playlist?list=UU...', + 'url': 'https://music.youtube.com/browse/UC9ALqqC4aIeG5iDs7i90Bfw', + 'info_dict': { + 'id': 'UU9ALqqC4aIeG5iDs7i90Bfw', + 'uploader_id': 'UC9ALqqC4aIeG5iDs7i90Bfw', + 'title': 'Uploads from Royalty Free Music - Topic', + 'uploader': 'Royalty Free Music - Topic', + 'tags': [], + 'channel_id': 'UC9ALqqC4aIeG5iDs7i90Bfw', + 'channel': 'Royalty Free Music - Topic', + 'view_count': int, + 'channel_url': 'https://www.youtube.com/channel/UC9ALqqC4aIeG5iDs7i90Bfw', + 'channel_url': 'https://www.youtube.com/channel/UC9ALqqC4aIeG5iDs7i90Bfw', + 'modified_date': r're:\d{8}', + 'uploader_url': 
'https://www.youtube.com/channel/UC9ALqqC4aIeG5iDs7i90Bfw', + 'description': '', + }, + 'expected_warnings': [ + 'The URL does not have a videos tab', + r'[Uu]navailable videos (are|will be) hidden', + ], + 'playlist_mincount': 101, + }, { + 'note': 'Topic without a UU playlist', + 'url': 'https://www.youtube.com/channel/UCtFRv9O2AHqOZjjynzrv-xg', + 'info_dict': { + 'id': 'UCtFRv9O2AHqOZjjynzrv-xg', + 'title': 'UCtFRv9O2AHqOZjjynzrv-xg', + 'tags': [], + }, + 'expected_warnings': [ + 'the playlist redirect gave error', + ], + 'playlist_mincount': 9, + }, { + 'note': 'Youtube music Album', + 'url': 'https://music.youtube.com/browse/MPREb_gTAcphH99wE', + 'info_dict': { + 'id': 'OLAK5uy_l1m0thk3g31NmIIz_vMIbWtyv7eZixlH0', + 'title': 'Album - Royalty Free Music Library V2 (50 Songs)', + 'tags': [], + 'view_count': int, + 'description': '', + 'availability': 'unlisted', + 'modified_date': r're:\d{8}', + }, + 'playlist_count': 50, + }, { + 'note': 'unlisted single video playlist', + 'url': 'https://www.youtube.com/playlist?list=PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf', + 'info_dict': { + 'uploader_id': 'UC9zHu_mHU96r19o-wV5Qs1Q', + 'uploader': 'colethedj', + 'id': 'PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf', + 'title': 'hypervideo unlisted playlist test', + 'availability': 'unlisted', + 'tags': [], + 'modified_date': '20211208', + 'channel': 'colethedj', + 'view_count': int, + 'description': '', + 'uploader_url': 'https://www.youtube.com/channel/UC9zHu_mHU96r19o-wV5Qs1Q', + 'channel_id': 'UC9zHu_mHU96r19o-wV5Qs1Q', + 'channel_url': 'https://www.youtube.com/channel/UC9zHu_mHU96r19o-wV5Qs1Q', + }, + 'playlist_count': 1, + }, { + 'note': 'API Fallback: Recommended - redirects to home page. Requires visitorData', + 'url': 'https://www.youtube.com/feed/recommended', + 'info_dict': { + 'id': 'recommended', + 'title': 'recommended', + 'tags': [], + }, + 'playlist_mincount': 50, + 'params': { + 'skip_download': True, + 'extractor_args': {'youtubetab': {'skip': ['webpage']}} + }, + }, { + 'note': 'API Fallback: /videos tab, sorted by oldest first', + 'url': 'https://www.youtube.com/user/theCodyReeder/videos?view=0&sort=da&flow=grid', + 'info_dict': { + 'id': 'UCu6mSoMNzHQiBIOCkHUa2Aw', + 'title': 'Cody\'sLab - Videos', + 'description': 'md5:d083b7c2f0c67ee7a6c74c3e9b4243fa', + 'uploader': 'Cody\'sLab', + 'uploader_id': 'UCu6mSoMNzHQiBIOCkHUa2Aw', + 'channel': 'Cody\'sLab', + 'channel_id': 'UCu6mSoMNzHQiBIOCkHUa2Aw', + 'tags': [], + 'channel_url': 'https://www.youtube.com/channel/UCu6mSoMNzHQiBIOCkHUa2Aw', + 'uploader_url': 'https://www.youtube.com/channel/UCu6mSoMNzHQiBIOCkHUa2Aw', + 'channel_follower_count': int + }, + 'playlist_mincount': 650, + 'params': { + 'skip_download': True, + 'extractor_args': {'youtubetab': {'skip': ['webpage']}} + }, + }, { + 'note': 'API Fallback: Topic, should redirect to playlist?list=UU...', + 'url': 'https://music.youtube.com/browse/UC9ALqqC4aIeG5iDs7i90Bfw', + 'info_dict': { + 'id': 'UU9ALqqC4aIeG5iDs7i90Bfw', + 'uploader_id': 'UC9ALqqC4aIeG5iDs7i90Bfw', + 'title': 'Uploads from Royalty Free Music - Topic', + 'uploader': 'Royalty Free Music - Topic', + 'modified_date': r're:\d{8}', + 'channel_id': 'UC9ALqqC4aIeG5iDs7i90Bfw', + 'description': '', + 'channel_url': 'https://www.youtube.com/channel/UC9ALqqC4aIeG5iDs7i90Bfw', + 'tags': [], + 'channel': 'Royalty Free Music - Topic', + 'view_count': int, + 'uploader_url': 'https://www.youtube.com/channel/UC9ALqqC4aIeG5iDs7i90Bfw', + }, + 'expected_warnings': [ + 'does not have a videos tab', + r'[Uu]navailable videos (are|will be) hidden', 
+ ], + 'playlist_mincount': 101, + 'params': { + 'skip_download': True, + 'extractor_args': {'youtubetab': {'skip': ['webpage']}} + }, + }, { + 'note': 'non-standard redirect to regional channel', + 'url': 'https://www.youtube.com/channel/UCwVVpHQ2Cs9iGJfpdFngePQ', + 'only_matching': True + }] + + @classmethod + def suitable(cls, url): + return False if YoutubeIE.suitable(url) else super( + YoutubeTabIE, cls).suitable(url) + + _URL_RE = re.compile(rf'(?P<pre>{_VALID_URL})(?(not_channel)|(?P<tab>/\w+))?(?P<post>.*)$') + + @YoutubeTabBaseInfoExtractor.passthrough_smuggled_data + def _real_extract(self, url, smuggled_data): item_id = self._match_id(url) url = compat_urlparse.urlunparse( compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com')) compat_opts = self.get_param('compat_opts', []) def get_mobj(url): - mobj = self._url_re.match(url).groupdict() + mobj = self._URL_RE.match(url).groupdict() mobj.update((k, '') for k, v in mobj.items() if v is None) return mobj - mobj = get_mobj(url) + mobj, redirect_warning = get_mobj(url), None # Youtube returns incomplete data if tabname is not lower case pre, tab, post, is_channel = mobj['pre'], mobj['tab'].lower(), mobj['post'], not mobj['not_channel'] if is_channel: if smuggled_data.get('is_music_url'): - if item_id[:2] == 'VL': - # Youtube music VL channels have an equivalent playlist + if item_id[:2] == 'VL': # Youtube music VL channels have an equivalent playlist item_id = item_id[2:] - pre, tab, post, is_channel = 'https://www.youtube.com/playlist?list=%s' % item_id, '', '', False - elif item_id[:2] == 'MP': - # Resolve albums (/[channel/browse]/MP...) to their equivalent playlist + pre, tab, post, is_channel = f'https://www.youtube.com/playlist?list={item_id}', '', '', False + elif item_id[:2] == 'MP': # Resolve albums (/[channel/browse]/MP...) to their equivalent playlist mdata = self._extract_tab_endpoint( - 'https://music.youtube.com/channel/%s' % item_id, item_id, default_client='web_music') - murl = traverse_obj( - mdata, ('microformat', 'microformatDataRenderer', 'urlCanonical'), get_all=False, expected_type=compat_str) + f'https://music.youtube.com/channel/{item_id}', item_id, default_client='web_music') + murl = traverse_obj(mdata, ('microformat', 'microformatDataRenderer', 'urlCanonical'), + get_all=False, expected_type=compat_str) if not murl: - raise ExtractorError('Failed to resolve album to playlist.') + raise ExtractorError('Failed to resolve album to playlist') return self.url_result(murl, ie=YoutubeTabIE.ie_key()) - elif mobj['channel_type'] == 'browse': - # Youtube music /browse/ should be changed to /channel/ - pre = 'https://www.youtube.com/channel/%s' % item_id + elif mobj['channel_type'] == 'browse': # Youtube music /browse/ should be changed to /channel/ + pre = f'https://www.youtube.com/channel/{item_id}' + + original_tab_name = tab if is_channel and not tab and 'no-youtube-channel-redirect' not in compat_opts: # Home URLs should redirect to /videos/ - self.report_warning( - 'A channel/user page was given. All the channel\'s videos will be downloaded. ' - 'To download only the videos in the home page, add a "/featured" to the URL') + redirect_warning = ('A channel/user page was given. All the channel\'s videos will be downloaded. 
' + 'To download only the videos in the home page, add a "/featured" to the URL') tab = '/videos' url = ''.join((pre, tab, post)) @@ -4168,89 +5188,111 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): # Handle both video/playlist URLs qs = parse_qs(url) - video_id = qs.get('v', [None])[0] - playlist_id = qs.get('list', [None])[0] + video_id, playlist_id = [qs.get(key, [None])[0] for key in ('v', 'list')] if not video_id and mobj['not_channel'].startswith('watch'): if not playlist_id: # If there is neither video or playlist ids, youtube redirects to home page, which is undesirable raise ExtractorError('Unable to recognize tab page') # Common mistake: https://www.youtube.com/watch?list=playlist_id - self.report_warning('A video URL was given without video ID. Trying to download playlist %s' % playlist_id) - url = 'https://www.youtube.com/playlist?list=%s' % playlist_id + self.report_warning(f'A video URL was given without video ID. Trying to download playlist {playlist_id}') + url = f'https://www.youtube.com/playlist?list={playlist_id}' mobj = get_mobj(url) if video_id and playlist_id: if self.get_param('noplaylist'): - self.to_screen('Downloading just video %s because of --no-playlist' % video_id) - return self.url_result(f'https://www.youtube.com/watch?v={video_id}', ie=YoutubeIE.ie_key(), video_id=video_id) - self.to_screen('Downloading playlist %s; add --no-playlist to just download video %s' % (playlist_id, video_id)) + self.to_screen(f'Downloading just video {video_id} because of --no-playlist') + return self.url_result(f'https://www.youtube.com/watch?v={video_id}', + ie=YoutubeIE.ie_key(), video_id=video_id) + self.to_screen(f'Downloading playlist {playlist_id}; add --no-playlist to just download video {video_id}') data, ytcfg = self._extract_data(url, item_id) - tabs = try_get( - data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list) + # YouTube may provide a non-standard redirect to the regional channel + # See: https://github.com/hypervideo/hypervideo/issues/2694 + redirect_url = traverse_obj( + data, ('onResponseReceivedActions', ..., 'navigateAction', 'endpoint', 'commandMetadata', 'webCommandMetadata', 'url'), get_all=False) + if redirect_url and 'no-youtube-channel-redirect' not in compat_opts: + redirect_url = ''.join(( + urljoin('https://www.youtube.com', redirect_url), mobj['tab'], mobj['post'])) + self.to_screen(f'This playlist is likely not available in your region. Following redirect to regional playlist {redirect_url}') + return self.url_result(redirect_url, ie=YoutubeTabIE.ie_key()) + + tabs = traverse_obj(data, ('contents', 'twoColumnBrowseResultsRenderer', 'tabs'), expected_type=list) if tabs: selected_tab = self._extract_selected_tab(tabs) - tab_name = selected_tab.get('title', '') + selected_tab_name = selected_tab.get('title', '').lower() + if selected_tab_name == 'home': + selected_tab_name = 'featured' + requested_tab_name = mobj['tab'][1:] if 'no-youtube-channel-redirect' not in compat_opts: - if mobj['tab'] == '/live': + if requested_tab_name == 'live': # Live tab should have redirected to the video raise ExtractorError('The channel is not currently live', expected=True) - if mobj['tab'] == '/videos' and tab_name.lower() != mobj['tab'][1:]: - if not mobj['not_channel'] and item_id[:2] == 'UC': - # Topic channels don't have /videos. Use the equivalent playlist instead - self.report_warning('The URL does not have a %s tab. 
Trying to redirect to playlist UU%s instead' % (mobj['tab'][1:], item_id[2:])) - pl_id = 'UU%s' % item_id[2:] - pl_url = 'https://www.youtube.com/playlist?list=%s%s' % (pl_id, mobj['post']) - try: - data, ytcfg, item_id, url = *self._extract_data(pl_url, pl_id, ytcfg=ytcfg, fatal=True), pl_id, pl_url - except ExtractorError: - self.report_warning('The playlist gave error. Falling back to channel URL') + if requested_tab_name not in ('', selected_tab_name): + redirect_warning = f'The channel does not have a {requested_tab_name} tab' + if not original_tab_name: + if item_id[:2] == 'UC': + # Topic channels don't have /videos. Use the equivalent playlist instead + pl_id = f'UU{item_id[2:]}' + pl_url = f'https://www.youtube.com/playlist?list={pl_id}' + try: + data, ytcfg = self._extract_data(pl_url, pl_id, ytcfg=ytcfg, fatal=True, webpage_fatal=True) + except ExtractorError: + redirect_warning += ' and the playlist redirect gave error' + else: + item_id, url, selected_tab_name = pl_id, pl_url, requested_tab_name + redirect_warning += f'. Redirecting to playlist {pl_id} instead' + if selected_tab_name and selected_tab_name != requested_tab_name: + redirect_warning += f'. {selected_tab_name} tab is being downloaded instead' else: - self.report_warning('The URL does not have a %s tab. %s is being downloaded instead' % (mobj['tab'][1:], tab_name)) + raise ExtractorError(redirect_warning, expected=True) - self.write_debug('Final URL: %s' % url) + if redirect_warning: + self.to_screen(redirect_warning) + self.write_debug(f'Final URL: {url}') # YouTube sometimes provides a button to reload playlist with unavailable videos. if 'no-youtube-unavailable-videos' not in compat_opts: data = self._reload_with_unavailable_videos(item_id, data, ytcfg) or data self._extract_and_report_alerts(data, only_once=True) - tabs = try_get( - data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list) + tabs = traverse_obj(data, ('contents', 'twoColumnBrowseResultsRenderer', 'tabs'), expected_type=list) if tabs: return self._extract_from_tabs(item_id, ytcfg, data, tabs) - playlist = try_get( - data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict) + playlist = traverse_obj( + data, ('contents', 'twoColumnWatchNextResults', 'playlist', 'playlist'), expected_type=dict) if playlist: return self._extract_from_playlist(item_id, url, data, playlist, ytcfg) - video_id = try_get( - data, lambda x: x['currentVideoEndpoint']['watchEndpoint']['videoId'], - compat_str) or video_id + video_id = traverse_obj( + data, ('currentVideoEndpoint', 'watchEndpoint', 'videoId'), expected_type=str) or video_id if video_id: if mobj['tab'] != '/live': # live tab is expected to redirect to video - self.report_warning('Unable to recognize playlist. Downloading just video %s' % video_id) - return self.url_result(f'https://www.youtube.com/watch?v={video_id}', ie=YoutubeIE.ie_key(), video_id=video_id) + self.report_warning(f'Unable to recognize playlist. Downloading just video {video_id}') + return self.url_result(f'https://www.youtube.com/watch?v={video_id}', + ie=YoutubeIE.ie_key(), video_id=video_id) raise ExtractorError('Unable to recognize tab page') class YoutubePlaylistIE(InfoExtractor): - IE_DESC = 'YouTube.com playlists' + IE_DESC = 'YouTube playlists' _VALID_URL = r'''(?x)(?: (?:https?://)? (?:\w+\.)? (?: (?: youtube(?:kids)?\.com| - invidio\.us + %(invidious)s ) /.*?\?.*?\blist= )? 
(?P<id>%(playlist_id)s) - )''' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} + )''' % { + 'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE, + 'invidious': '|'.join(YoutubeBaseInfoExtractor._INVIDIOUS_SITES), + } IE_NAME = 'youtube:playlist' _TESTS = [{ 'note': 'issue #673', @@ -4258,9 +5300,16 @@ class YoutubePlaylistIE(InfoExtractor): 'info_dict': { 'title': '[OLD]Team Fortress 2 (Class-based LP)', 'id': 'PLBB231211A4F62143', - 'uploader': 'Wickydoo', + 'uploader': 'Wickman', 'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q', 'description': 'md5:8fa6f52abb47a9552002fa3ddfc57fc2', + 'view_count': int, + 'uploader_url': 'https://www.youtube.com/user/Wickydoo', + 'modified_date': r're:\d{8}', + 'channel_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q', + 'channel': 'Wickman', + 'tags': [], + 'channel_url': 'https://www.youtube.com/user/Wickydoo', }, 'playlist_mincount': 29, }, { @@ -4280,7 +5329,16 @@ class YoutubePlaylistIE(InfoExtractor): 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', 'uploader': 'milan', 'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw', - } + 'description': '', + 'channel_url': 'https://www.youtube.com/channel/UCEI1-PVPcYXjB73Hfelbmaw', + 'tags': [], + 'modified_date': '20140919', + 'view_count': int, + 'channel': 'milan', + 'channel_id': 'UCEI1-PVPcYXjB73Hfelbmaw', + 'uploader_url': 'https://www.youtube.com/channel/UCEI1-PVPcYXjB73Hfelbmaw', + }, + 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], }, { 'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', 'playlist_mincount': 654, @@ -4290,7 +5348,15 @@ class YoutubePlaylistIE(InfoExtractor): 'uploader': 'LBK', 'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA', 'description': 'md5:da521864744d60a198e3a88af4db0d9d', - } + 'channel': 'LBK', + 'view_count': int, + 'channel_url': 'https://www.youtube.com/c/愛低音的國王', + 'tags': [], + 'uploader_url': 'https://www.youtube.com/c/愛低音的國王', + 'channel_id': 'UC21nz3_MesPLqtDqwdvnoxA', + 'modified_date': r're:\d{8}', + }, + 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], }, { 'url': 'TLGGrESM50VT6acwMjAyMjAxNw', 'only_matching': True, @@ -4304,9 +5370,7 @@ class YoutubePlaylistIE(InfoExtractor): def suitable(cls, url): if YoutubeTabIE.suitable(url): return False - # Hack for lazy extractors until more generic solution is implemented - # (see #28780) - from .youtube import parse_qs + from ..utils import parse_qs qs = parse_qs(url) if qs.get('v', [None])[0]: return False @@ -4340,7 +5404,16 @@ class YoutubeYtBeIE(InfoExtractor): 'categories': ['Nonprofits & Activism'], 'tags': list, 'like_count': int, - 'dislike_count': int, + 'age_limit': 0, + 'playable_in_embed': True, + 'thumbnail': 'https://i.ytimg.com/vi_webp/yeWKywCrFtk/maxresdefault.webp', + 'channel': 'Backus-Page House Museum', + 'channel_id': 'UCEfMCQ9bs3tjvjy1s451zaw', + 'live_status': 'not_live', + 'view_count': int, + 'channel_url': 'https://www.youtube.com/channel/UCEfMCQ9bs3tjvjy1s451zaw', + 'availability': 'public', + 'duration': 59, }, 'params': { 'noplaylist': True, @@ -4363,8 +5436,24 @@ class YoutubeYtBeIE(InfoExtractor): }), ie=YoutubeTabIE.ie_key(), video_id=playlist_id) +class YoutubeLivestreamEmbedIE(InfoExtractor): + IE_DESC = 'YouTube livestream embeds' + _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/embed/live_stream/?\?(?:[^#]+&)?channel=(?P<id>[^&#]+)' + _TESTS = [{ + 'url': 'https://www.youtube.com/embed/live_stream?channel=UC2_KI6RB__jGdlnK6dvFEZA', + 'only_matching': True, + }] + + def _real_extract(self, url): + channel_id = self._match_id(url) + 
return self.url_result( + f'https://www.youtube.com/channel/{channel_id}/live', + ie=YoutubeTabIE.ie_key(), video_id=channel_id) + + class YoutubeYtUserIE(InfoExtractor): - IE_DESC = 'YouTube.com user videos, URL or "ytuser" keyword' + IE_DESC = 'YouTube user videos; "ytuser:" prefix' + IE_NAME = 'youtube:user' _VALID_URL = r'ytuser:(?P<id>.+)' _TESTS = [{ 'url': 'ytuser:phihag', @@ -4374,13 +5463,13 @@ class YoutubeYtUserIE(InfoExtractor): def _real_extract(self, url): user_id = self._match_id(url) return self.url_result( - 'https://www.youtube.com/user/%s' % user_id, + 'https://www.youtube.com/user/%s/videos' % user_id, ie=YoutubeTabIE.ie_key(), video_id=user_id) class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): IE_NAME = 'youtube:favorites' - IE_DESC = 'YouTube.com liked videos, ":ytfav" for short (requires authentication)' + IE_DESC = 'YouTube liked videos; ":ytfav" keyword (requires cookies)' _VALID_URL = r':ytfav(?:ou?rite)?s?' _LOGIN_REQUIRED = True _TESTS = [{ @@ -4397,79 +5486,40 @@ class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): ie=YoutubeTabIE.ie_key()) -class YoutubeSearchIE(SearchInfoExtractor, YoutubeTabIE): - IE_DESC = 'YouTube.com searches, "ytsearch" keyword' - # there doesn't appear to be a real limit, for example if you search for - # 'python' you get more than 8.000.000 results - _MAX_RESULTS = float('inf') +class YoutubeSearchIE(YoutubeTabBaseInfoExtractor, SearchInfoExtractor): + IE_DESC = 'YouTube search' IE_NAME = 'youtube:search' _SEARCH_KEY = 'ytsearch' - _SEARCH_PARAMS = None - _TESTS = [] - - def _search_results(self, query): - data = {'query': query} - if self._SEARCH_PARAMS: - data['params'] = self._SEARCH_PARAMS - continuation = {} - for page_num in itertools.count(1): - data.update(continuation) - search = self._extract_response( - item_id='query "%s" page %s' % (query, page_num), ep='search', query=data, - check_get_keys=('contents', 'onResponseReceivedCommands') - ) - if not search: - break - slr_contents = try_get( - search, - (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'], - lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']), - list) - if not slr_contents: - break - - # Youtube sometimes adds promoted content to searches, - # changing the index location of videos and token. - # So we search through all entries till we find them. 
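The comment above, from the removed `_search_results`, describes a defensive scan: promoted results shift the position of both the videos and the continuation token, so every section is searched instead of indexing into a fixed slot. A self-contained sketch of that scan — renderer key names follow the old code, and the payload is invented:

def iter_search_videos(sections):
    # Sections may mix ads, shelves and videos; take video ids wherever
    # they appear instead of assuming fixed positions.
    for section in sections:
        for item in section.get('itemSectionRenderer', {}).get('contents', []):
            video = item.get('videoRenderer')
            if isinstance(video, dict) and video.get('videoId'):
                yield video['videoId']

sections = [
    {'itemSectionRenderer': {'contents': [
        {'promotedSparklesWebRenderer': {}},   # ad, skipped
        {'videoRenderer': {'videoId': 'abc'}},
        {'videoRenderer': {}},                 # no id, skipped
    ]}},
    {'continuationItemRenderer': {}},          # token lives in its own section
]
assert list(iter_search_videos(sections)) == ['abc']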
- continuation = None - for slr_content in slr_contents: - if not continuation: - continuation = self._extract_continuation({'contents': [slr_content]}) - - isr_contents = try_get( - slr_content, - lambda x: x['itemSectionRenderer']['contents'], - list) - if not isr_contents: - continue - for content in isr_contents: - if not isinstance(content, dict): - continue - video = content.get('videoRenderer') - if not isinstance(video, dict): - continue - video_id = video.get('videoId') - if not video_id: - continue - - yield self._extract_video(video) - - if not continuation: - break + _SEARCH_PARAMS = 'EgIQAQ%3D%3D' # Videos only + _TESTS = [{ + 'url': 'ytsearch5:youtube-dl test video', + 'playlist_count': 5, + 'info_dict': { + 'id': 'youtube-dl test video', + 'title': 'youtube-dl test video', + } + }] -class YoutubeSearchDateIE(YoutubeSearchIE): +class YoutubeSearchDateIE(YoutubeTabBaseInfoExtractor, SearchInfoExtractor): IE_NAME = YoutubeSearchIE.IE_NAME + ':date' _SEARCH_KEY = 'ytsearchdate' - IE_DESC = 'YouTube.com searches, newest videos first, "ytsearchdate" keyword' - _SEARCH_PARAMS = 'CAI%3D' + IE_DESC = 'YouTube search, newest videos first' + _SEARCH_PARAMS = 'CAISAhAB' # Videos only, sorted by date + _TESTS = [{ + 'url': 'ytsearchdate5:youtube-dl test video', + 'playlist_count': 5, + 'info_dict': { + 'id': 'youtube-dl test video', + 'title': 'youtube-dl test video', + } + }] -class YoutubeSearchURLIE(YoutubeSearchIE): - IE_DESC = 'YouTube.com search URLs' +class YoutubeSearchURLIE(YoutubeTabBaseInfoExtractor): + IE_DESC = 'YouTube search URLs with sorting and filter support' IE_NAME = YoutubeSearchIE.IE_NAME + '_url' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?:[^&]+)(?:[&]|$)' - # _MAX_RESULTS = 100 + _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:results|search)\?([^#]+&)?(?:search_query|q)=(?:[^&]+)(?:[&#]|$)' _TESTS = [{ 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', 'playlist_mincount': 5, @@ -4478,22 +5528,88 @@ class YoutubeSearchURLIE(YoutubeSearchIE): 'title': 'youtube-dl test video', } }, { + 'url': 'https://www.youtube.com/results?search_query=python&sp=EgIQAg%253D%253D', + 'playlist_mincount': 5, + 'info_dict': { + 'id': 'python', + 'title': 'python', + } + }, { + 'url': 'https://www.youtube.com/results?search_query=%23cats', + 'playlist_mincount': 1, + 'info_dict': { + 'id': '#cats', + 'title': '#cats', + 'entries': [{ + 'url': r're:https://(www\.)?youtube\.com/hashtag/cats', + 'title': '#cats', + }], + }, + }, { 'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB', 'only_matching': True, }] - @classmethod - def _make_valid_url(cls): - return cls._VALID_URL + def _real_extract(self, url): + qs = parse_qs(url) + query = (qs.get('search_query') or qs.get('q'))[0] + return self.playlist_result(self._search_results(query, qs.get('sp', (None,))[0]), query, query) + + +class YoutubeMusicSearchURLIE(YoutubeTabBaseInfoExtractor): + IE_DESC = 'YouTube music search URLs with selectable sections (Eg: #songs)' + IE_NAME = 'youtube:music:search_url' + _VALID_URL = r'https?://music\.youtube\.com/search\?([^#]+&)?(?:search_query|q)=(?:[^&]+)(?:[&#]|$)' + _TESTS = [{ + 'url': 'https://music.youtube.com/search?q=royalty+free+music', + 'playlist_count': 16, + 'info_dict': { + 'id': 'royalty free music', + 'title': 'royalty free music', + } + }, { + 'url': 'https://music.youtube.com/search?q=royalty+free+music&sp=EgWKAQIIAWoKEAoQAxAEEAkQBQ%3D%3D', + 'playlist_mincount': 30, + 
'info_dict': { + 'id': 'royalty free music - songs', + 'title': 'royalty free music - songs', + }, + 'params': {'extract_flat': 'in_playlist'} + }, { + 'url': 'https://music.youtube.com/search?q=royalty+free+music#community+playlists', + 'playlist_mincount': 30, + 'info_dict': { + 'id': 'royalty free music - community playlists', + 'title': 'royalty free music - community playlists', + }, + 'params': {'extract_flat': 'in_playlist'} + }] + + _SECTIONS = { + 'albums': 'EgWKAQIYAWoKEAoQAxAEEAkQBQ==', + 'artists': 'EgWKAQIgAWoKEAoQAxAEEAkQBQ==', + 'community playlists': 'EgeKAQQoAEABagoQChADEAQQCRAF', + 'featured playlists': 'EgeKAQQoADgBagwQAxAJEAQQDhAKEAU==', + 'songs': 'EgWKAQIIAWoKEAoQAxAEEAkQBQ==', + 'videos': 'EgWKAQIQAWoKEAoQAxAEEAkQBQ==', + } def _real_extract(self, url): qs = parse_qs(url) query = (qs.get('search_query') or qs.get('q'))[0] - self._SEARCH_PARAMS = qs.get('sp', ('',))[0] - return self._get_n_results(query, self._MAX_RESULTS) + params = qs.get('sp', (None,))[0] + if params: + section = next((k for k, v in self._SECTIONS.items() if v == params), params) + else: + section = compat_urllib_parse_unquote_plus((url.split('#') + [''])[1]).lower() + params = self._SECTIONS.get(section) + if not params: + section = None + title = join_nonempty(query, section, delim=' - ') + return self.playlist_result(self._search_results(query, params, default_client='web_music'), title, title) -class YoutubeFeedsInfoExtractor(YoutubeTabIE): +class YoutubeFeedsInfoExtractor(InfoExtractor): """ Base class for feed extractors Subclasses must define the _FEED_NAME property. @@ -4507,13 +5623,12 @@ class YoutubeFeedsInfoExtractor(YoutubeTabIE): def _real_extract(self, url): return self.url_result( - 'https://www.youtube.com/feed/%s' % self._FEED_NAME, - ie=YoutubeTabIE.ie_key()) + f'https://www.youtube.com/feed/{self._FEED_NAME}', ie=YoutubeTabIE.ie_key()) class YoutubeWatchLaterIE(InfoExtractor): IE_NAME = 'youtube:watchlater' - IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)' + IE_DESC = 'Youtube watch later list; ":ytwatchlater" keyword (requires cookies)' _VALID_URL = r':ytwatchlater' _TESTS = [{ 'url': ':ytwatchlater', @@ -4526,7 +5641,7 @@ class YoutubeWatchLaterIE(InfoExtractor): class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): - IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)' + IE_DESC = 'YouTube recommended videos; ":ytrec" keyword' _VALID_URL = r'https?://(?:www\.)?youtube\.com/?(?:[?#]|$)|:ytrec(?:ommended)?' _FEED_NAME = 'recommended' _LOGIN_REQUIRED = False @@ -4543,7 +5658,7 @@ class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): - IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)' + IE_DESC = 'YouTube subscriptions feed; ":ytsubs" keyword (requires cookies)' _VALID_URL = r':ytsub(?:scription)?s?' _FEED_NAME = 'subscriptions' _TESTS = [{ @@ -4556,7 +5671,7 @@ class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): - IE_DESC = 'Youtube watch history, ":ythis" for short (requires authentication)' + IE_DESC = 'Youtube watch history; ":ythis" keyword (requires cookies)' _VALID_URL = r':ythis(?:tory)?' 
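The `_SEARCH_PARAMS` blobs used by the search extractors above ('EgIQAQ%3D%3D', 'CAISAhAB') and the `_SECTIONS` table of `YoutubeMusicSearchURLIE` are URL-encoded, base64-encoded protobuf messages describing search filters. They can be inspected with the standard library; the field interpretation in the comments is an inference from the protobuf wire format, not a documented API:

import base64
from urllib.parse import unquote

def decode_sp(sp):
    # Undo URL-encoding, then base64 (re-pad in case '=' was stripped).
    raw = unquote(sp)
    raw += '=' * (-len(raw) % 4)
    return base64.b64decode(raw)

assert decode_sp('EgIQAQ%3D%3D').hex() == '12021001'
# 0x12 0x02 0x10 0x01: field 2 (filter submessage) { field 2 = 1 }, "videos only"
assert decode_sp('CAISAhAB').hex() == '080212021001'
# 0x08 0x02: field 1 (sort) = 2 (date), followed by the same filter submessage

Section resolution in `YoutubeMusicSearchURLIE._real_extract` works in both directions: a known `sp` value is reverse-mapped to its section name, while a bare `#fragment` is mapped forward to its params. A condensed sketch with a trimmed copy of the table; note that `parse_qs` already URL-decodes the '%3D' padding, which is why the comparison against the table works:

from urllib.parse import parse_qs, unquote_plus, urlparse

SECTIONS = {
    'songs': 'EgWKAQIIAWoKEAoQAxAEEAkQBQ==',
    'videos': 'EgWKAQIQAWoKEAoQAxAEEAkQBQ==',
}

def resolve_section(url):
    qs = parse_qs(urlparse(url).query)
    params = qs.get('sp', [None])[0]
    if params:
        # Reverse lookup: known blob -> section name, else keep the raw blob
        return next((k for k, v in SECTIONS.items() if v == params), params), params
    # No sp: take the section name from the URL fragment, if any
    section = unquote_plus((url.split('#') + [''])[1]).lower()
    return (section or None), SECTIONS.get(section)

assert resolve_section(
    'https://music.youtube.com/search?q=test#songs') == ('songs', SECTIONS['songs'])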
_FEED_NAME = 'history' _TESTS = [{ diff --git a/hypervideo_dl/extractor/zattoo.py b/hypervideo_dl/extractor/zattoo.py index a13d124..c02b4ca 100644 --- a/hypervideo_dl/extractor/zattoo.py +++ b/hypervideo_dl/extractor/zattoo.py @@ -12,6 +12,7 @@ from ..compat import ( from ..utils import ( ExtractorError, int_or_none, + join_nonempty, try_get, url_or_none, urlencode_postdata, @@ -24,13 +25,11 @@ class ZattooPlatformBaseIE(InfoExtractor): def _host_url(self): return 'https://%s' % (self._API_HOST if hasattr(self, '_API_HOST') else self._HOST) - def _login(self): - username, password = self._get_login_info() - if not username or not password: - self.raise_login_required( - 'A valid %s account is needed to access this media.' - % self._NETRC_MACHINE) + def _real_initialize(self): + if not self._power_guide_hash: + self.raise_login_required('An account is needed to access this media', method='password') + def _perform_login(self, username, password): try: data = self._download_json( '%s/zapi/v2/account/login' % self._host_url(), None, 'Logging in', @@ -51,7 +50,7 @@ class ZattooPlatformBaseIE(InfoExtractor): self._power_guide_hash = data['session']['power_guide_hash'] - def _real_initialize(self): + def _initialize_pre_login(self): webpage = self._download_webpage( self._host_url(), None, 'Downloading app token') app_token = self._html_search_regex( @@ -71,8 +70,6 @@ class ZattooPlatformBaseIE(InfoExtractor): 'format': 'json', })) - self._login() - def _extract_cid(self, video_id, channel_name): channel_groups = self._download_json( '%s/zapi/v2/cached/channels/%s' % (self._host_url(), @@ -156,15 +153,9 @@ class ZattooPlatformBaseIE(InfoExtractor): watch_url = url_or_none(watch.get('url')) if not watch_url: continue - format_id_list = [stream_type] - maxrate = watch.get('maxrate') - if maxrate: - format_id_list.append(compat_str(maxrate)) audio_channel = watch.get('audio_channel') - if audio_channel: - format_id_list.append(compat_str(audio_channel)) preference = 1 if audio_channel == 'A' else None - format_id = '-'.join(format_id_list) + format_id = join_nonempty(stream_type, watch.get('maxrate'), audio_channel) if stream_type in ('dash', 'dash_widevine', 'dash_playready'): this_formats = self._extract_mpd_formats( watch_url, video_id, mpd_id=format_id, fatal=False) @@ -192,7 +183,7 @@ class ZattooPlatformBaseIE(InfoExtractor): cid = self._extract_cid(video_id, channel_name) info_dict = { 'id': channel_name, - 'title': self._live_title(channel_name), + 'title': channel_name, 'is_live': True, } else: diff --git a/hypervideo_dl/extractor/zdf.py b/hypervideo_dl/extractor/zdf.py index 8c279c5..5f4d266 100644 --- a/hypervideo_dl/extractor/zdf.py +++ b/hypervideo_dl/extractor/zdf.py @@ -9,12 +9,13 @@ from ..utils import ( determine_ext, float_or_none, int_or_none, + join_nonempty, merge_dicts, NO_DEFAULT, orderedSet, parse_codecs, qualities, - str_or_none, + traverse_obj, try_get, unified_timestamp, update_url_query, @@ -70,11 +71,11 @@ class ZDFBaseIE(InfoExtractor): f = {'vcodec': data[0], 'acodec': data[1]} f.update({ 'url': format_url, - 'format_id': '-'.join(filter(str_or_none, ('http', meta.get('type'), meta.get('quality')))), + 'format_id': join_nonempty('http', meta.get('type'), meta.get('quality')), }) new_formats = [f] formats.extend(merge_dicts(f, { - 'format_note': ', '.join(filter(None, (meta.get('quality'), meta.get('class')))), + 'format_note': join_nonempty('quality', 'class', from_dict=meta, delim=', '), 'language': meta.get('language'), 'language_preference': 10 if 
meta.get('class') == 'main' else -10 if meta.get('class') == 'ad' else -1, 'quality': qualities(self._QUALITIES)(meta.get('quality')), @@ -147,6 +148,7 @@ class ZDFIE(ZDFBaseIE): 'timestamp': 1613948400, 'upload_date': '20210221', }, + 'skip': 'No longer available: "Diese Seite wurde leider nicht gefunden"', }, { # Same as https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html 'url': 'https://www.zdf.de/dokumentation/ab-18/10-wochen-sommer-102.html', @@ -160,6 +162,20 @@ class ZDFIE(ZDFBaseIE): 'timestamp': 1608604200, 'upload_date': '20201222', }, + 'skip': 'No longer available: "Diese Seite wurde leider nicht gefunden"', + }, { + 'url': 'https://www.zdf.de/nachrichten/heute-journal/heute-journal-vom-30-12-2021-100.html', + 'info_dict': { + 'id': '211230_sendung_hjo', + 'ext': 'mp4', + 'description': 'md5:47dff85977bde9fb8cba9e9c9b929839', + 'duration': 1890.0, + 'upload_date': '20211230', + 'chapters': list, + 'thumbnail': 'md5:e65f459f741be5455c952cd820eb188e', + 'title': 'heute journal vom 30.12.2021', + 'timestamp': 1640897100, + } }, { 'url': 'https://www.zdf.de/dokumentation/terra-x/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100.html', 'info_dict': { @@ -170,6 +186,20 @@ class ZDFIE(ZDFBaseIE): 'duration': 2615, 'timestamp': 1465021200, 'upload_date': '20160604', + 'thumbnail': 'https://www.zdf.de/assets/mauve-im-labor-100~768x432?cb=1464909117806', + }, + }, { + 'url': 'https://www.zdf.de/funk/druck-11790/funk-alles-ist-verzaubert-102.html', + 'md5': '3d6f1049e9682178a11c54b91f3dd065', + 'info_dict': { + 'ext': 'mp4', + 'id': 'video_funk_1770473', + 'duration': 1278, + 'description': 'Die Neue an der Schule verdreht Ismail den Kopf.', + 'title': 'Alles ist verzaubert', + 'timestamp': 1635520560, + 'upload_date': '20211029', + 'thumbnail': 'https://www.zdf.de/assets/teaser-funk-alles-ist-verzaubert-100~1920x1080?cb=1636466431799', }, }, { # Same as https://www.phoenix.de/sendungen/dokumentationen/gesten-der-maechtigen-i-a-89468.html?ref=suche @@ -192,6 +222,17 @@ class ZDFIE(ZDFBaseIE): }, { 'url': 'https://www.zdf.de/dokumentation/planet-e/planet-e-uebersichtsseite-weitere-dokumentationen-von-planet-e-100.html', 'only_matching': True, + }, { + 'url': 'https://www.zdf.de/arte/todliche-flucht/page-video-artede-toedliche-flucht-16-100.html', + 'info_dict': { + 'id': 'video_artede_083871-001-A', + 'ext': 'mp4', + 'title': 'Tödliche Flucht (1/6)', + 'description': 'md5:e34f96a9a5f8abd839ccfcebad3d5315', + 'duration': 3193.0, + 'timestamp': 1641355200, + 'upload_date': '20220105', + }, }] def _extract_entry(self, url, player, content, video_id): @@ -202,8 +243,9 @@ class ZDFIE(ZDFBaseIE): ptmd_path = t.get('http://zdf.de/rels/streams/ptmd') if not ptmd_path: - ptmd_path = t[ - 'http://zdf.de/rels/streams/ptmd-template'].replace( + ptmd_path = traverse_obj( + t, ('streams', 'default', 'http://zdf.de/rels/streams/ptmd-template'), + 'http://zdf.de/rels/streams/ptmd-template').replace( '{playerId}', 'ngplayer_2_4') info = self._extract_ptmd( @@ -229,12 +271,21 @@ class ZDFIE(ZDFBaseIE): }) thumbnails.append(thumbnail) + chapter_marks = t.get('streamAnchorTag') or [] + chapter_marks.append({'anchorOffset': int_or_none(t.get('duration'))}) + chapters = [{ + 'start_time': chap.get('anchorOffset'), + 'end_time': next_chap.get('anchorOffset'), + 'title': chap.get('anchorLabel') + } for chap, next_chap in zip(chapter_marks, chapter_marks[1:])] + return merge_dicts(info, { 'title': title, 'description': content.get('leadParagraph') or content.get('teasertext'), 'duration': 
diff --git a/hypervideo_dl/extractor/zee5.py b/hypervideo_dl/extractor/zee5.py
index 5366041..3e3f11b 100644
--- a/hypervideo_dl/extractor/zee5.py
+++ b/hypervideo_dl/extractor/zee5.py
@@ -21,9 +21,9 @@ class Zee5IE(InfoExtractor):
     _VALID_URL = r'''(?x)
                      (?:
                         zee5:|
-                        (?:https?://)(?:www\.)?zee5\.com/(?:[^#?]+/)?
+                        https?://(?:www\.)?zee5\.com/(?:[^#?]+/)?
                         (?:
-                            (?:tvshows|kids|zee5originals)(?:/[^#/?]+){3}
+                            (?:tv-shows|kids|web-series|zee5originals)(?:/[^#/?]+){3}
                             |movies/[^#/?]+
                         )/(?P<display_id>[^#/?]+)/
                      )
@@ -37,48 +37,53 @@ class Zee5IE(InfoExtractor):
             'display_id': 'krishna-the-birth',
             'title': 'Krishna - The Birth',
             'duration': 4368,
-            'average_rating': 4,
             'description': compat_str,
             'alt_title': 'Krishna - The Birth',
             'uploader': 'Zee Entertainment Enterprises Ltd',
             'release_date': '20060101',
             'upload_date': '20060101',
             'timestamp': 1136073600,
-            'thumbnail': 'https://akamaividz.zee5.com/resources/0-0-63098/list/270x152/0063098_list_80888170.jpg',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'episode_number': 0,
+            'episode': 'Episode 0',
             'tags': list
         },
         'params': {
             'format': 'bv',
         },
     }, {
-        'url': 'https://zee5.com/tvshows/details/krishna-balram/0-6-1871/episode-1-the-test-of-bramha/0-1-233402',
+        'url': 'https://www.zee5.com/kids/kids-shows/bandbudh-aur-budbak/0-6-1899/yoga-se-hoga-bandbudh-aur-budbak/0-1-239839',
         'info_dict': {
-            'id': '0-1-233402',
+            'id': '0-1-239839',
             'ext': 'mp4',
-            'display_id': 'episode-1-the-test-of-bramha',
-            'title': 'Episode 1 - The Test Of Bramha',
-            'duration': 1336,
-            'average_rating': 4,
+            'display_id': 'yoga-se-hoga-bandbudh-aur-budbak',
+            'title': 'Yoga Se Hoga-Bandbudh aur Budbak',
+            'duration': 659,
             'description': compat_str,
-            'alt_title': 'Episode 1 - The Test Of Bramha',
+            'alt_title': 'Yoga Se Hoga-Bandbudh aur Budbak',
             'uploader': 'Zee Entertainment Enterprises Ltd',
-            'release_date': '20090101',
-            'upload_date': '20090101',
-            'timestamp': 1230768000,
-            'thumbnail': 'https://akamaividz.zee5.com/resources/0-1-233402/list/270x152/01233402_list.jpg',
-            'series': 'Krishna Balram',
+            'release_date': '20150101',
+            'upload_date': '20150101',
+            'timestamp': 1420070400,
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'series': 'Bandbudh Aur Budbak',
             'season_number': 1,
             'episode_number': 1,
+            'episode': 'Episode 1',
+            'season': 'Season 1',
             'tags': list,
         },
         'params': {
             'format': 'bv',
         },
     }, {
-        'url': 'https://www.zee5.com/hi/tvshows/details/kundali-bhagya/0-6-366/kundali-bhagya-march-08-2021/0-1-manual_7g9jv1os7730?country=IN',
+        'url': 'https://www.zee5.com/hi/tv-shows/details/kundali-bhagya/0-6-366/kundali-bhagya-march-08-2021/0-1-manual_7g9jv1os7730?country=IN',
         'only_matching': True
     }, {
-        'url': 'https://www.zee5.com/global/hi/tvshows/details/kundali-bhagya/0-6-366/kundali-bhagya-march-08-2021/0-1-manual_7g9jv1os7730',
+        'url': 'https://www.zee5.com/global/hi/tv-shows/details/kundali-bhagya/0-6-366/kundali-bhagya-march-08-2021/0-1-manual_7g9jv1os7730',
+        'only_matching': True
+    }, {
+        'url': 'https://www.zee5.com/web-series/details/mithya/0-6-4z587408/maine-dekhi-hai-uski-mrityu/0-1-6z587412',
         'only_matching': True
     }]
     _DETAIL_API_URL = 'https://spapi.zee5.com/singlePlayback/getDetails?content_id={}&device_id={}&platform_name=desktop_web&country=IN&check_parental_control=false'
@@ -86,31 +91,29 @@ class Zee5IE(InfoExtractor):
     _USER_TOKEN = None
     _LOGIN_HINT = 'Use "--username <mobile_number>" to login using otp or "--username token" and "--password <user_token>" to login using user token.'
     _NETRC_MACHINE = 'zee5'
+    _GEO_COUNTRIES = ['IN']

-    def _login(self):
-        username, password = self._get_login_info()
-        if username:
-            if len(username) == 10 and username.isdigit() and self._USER_TOKEN is None:
-                self.report_login()
-                otp_request_json = self._download_json('https://b2bapi.zee5.com/device/sendotp_v1.php?phoneno=91{}'.format(username),
-                                                       None, note='Sending OTP')
-                if otp_request_json['code'] == 0:
-                    self.to_screen(otp_request_json['message'])
-                else:
-                    raise ExtractorError(otp_request_json['message'], expected=True)
-                otp_code = self._get_tfa_info('OTP')
-                otp_verify_json = self._download_json('https://b2bapi.zee5.com/device/verifyotp_v1.php?phoneno=91{}&otp={}&guest_token={}&platform=web'.format(username, otp_code, self._DEVICE_ID),
-                                                      None, note='Verifying OTP', fatal=False)
-                if not otp_verify_json:
-                    raise ExtractorError('Unable to verify OTP.', expected=True)
-                self._USER_TOKEN = otp_verify_json.get('token')
-                if not self._USER_TOKEN:
-                    raise ExtractorError(otp_request_json['message'], expected=True)
-            elif username.lower() == 'token' and len(password) > 1198:
-                self._USER_TOKEN = password
-
-    def _real_initialize(self):
-        self._login()
+    def _perform_login(self, username, password):
+        if len(username) == 10 and username.isdigit() and self._USER_TOKEN is None:
+            self.report_login()
+            otp_request_json = self._download_json('https://b2bapi.zee5.com/device/sendotp_v1.php?phoneno=91{}'.format(username),
+                                                   None, note='Sending OTP')
+            if otp_request_json['code'] == 0:
+                self.to_screen(otp_request_json['message'])
+            else:
+                raise ExtractorError(otp_request_json['message'], expected=True)
+            otp_code = self._get_tfa_info('OTP')
+            otp_verify_json = self._download_json('https://b2bapi.zee5.com/device/verifyotp_v1.php?phoneno=91{}&otp={}&guest_token={}&platform=web'.format(username, otp_code, self._DEVICE_ID),
+                                                  None, note='Verifying OTP', fatal=False)
+            if not otp_verify_json:
+                raise ExtractorError('Unable to verify OTP.', expected=True)
+            self._USER_TOKEN = otp_verify_json.get('token')
+            if not self._USER_TOKEN:
+                raise ExtractorError(otp_request_json['message'], expected=True)
+        elif username.lower() == 'token' and len(password) > 1198:
+            self._USER_TOKEN = password
+        else:
+            raise ExtractorError(self._LOGIN_HINT, expected=True)

     def _real_extract(self, url):
         video_id, display_id = self._match_valid_url(url).group('id', 'display_id')
@@ -153,7 +156,6 @@ class Zee5IE(InfoExtractor):
             'formats': formats,
             'subtitles': subtitles,
             'duration': int_or_none(asset_data.get('duration')),
-            'average_rating': int_or_none(asset_data.get('rating')),
             'description': str_or_none(asset_data.get('description')),
             'alt_title': str_or_none(asset_data.get('original_title')),
             'uploader': str_or_none(asset_data.get('content_owner')),
@@ -174,43 +176,48 @@ class Zee5SeriesIE(InfoExtractor):
     _VALID_URL = r'''(?x)
                      (?:
                         zee5:series:|
-                        (?:https?://)(?:www\.)?zee5\.com/(?:[^#?]+/)?
-                        (?:tvshows|kids|zee5originals)(?:/[^#/?]+){2}/
+                        https?://(?:www\.)?zee5\.com/(?:[^#?]+/)?
+                        (?:tv-shows|web-series|kids|zee5originals)(?:/[^#/?]+){2}/
                      )
-                     (?P<id>[^#/?]+)/?(?:$|[?#])
+                     (?P<id>[^#/?]+)(?:/episodes)?/?(?:$|[?#])
                      '''
     _TESTS = [{
-        'url': 'https://www.zee5.com/kids/kids-shows/krishna-balram/0-6-1871',
-        'playlist_mincount': 43,
+        'url': 'https://www.zee5.com/kids/kids-shows/bandbudh-aur-budbak/0-6-1899',
+        'playlist_mincount': 156,
         'info_dict': {
-            'id': '0-6-1871',
+            'id': '0-6-1899',
         },
     }, {
-        'url': 'https://www.zee5.com/tvshows/details/bhabi-ji-ghar-par-hai/0-6-199',
+        'url': 'https://www.zee5.com/tv-shows/details/bhabi-ji-ghar-par-hai/0-6-199',
         'playlist_mincount': 1500,
         'info_dict': {
            'id': '0-6-199',
        },
    }, {
-        'url': 'https://www.zee5.com/tvshows/details/agent-raghav-crime-branch/0-6-965',
+        'url': 'https://www.zee5.com/tv-shows/details/agent-raghav-crime-branch/0-6-965',
        'playlist_mincount': 24,
        'info_dict': {
            'id': '0-6-965',
        },
    }, {
-        'url': 'https://www.zee5.com/ta/tvshows/details/nagabhairavi/0-6-3201',
+        'url': 'https://www.zee5.com/ta/tv-shows/details/nagabhairavi/0-6-3201',
        'playlist_mincount': 3,
        'info_dict': {
            'id': '0-6-3201',
        },
    }, {
-        'url': 'https://www.zee5.com/global/hi/tvshows/details/khwaabon-ki-zamin-par/0-6-270',
+        'url': 'https://www.zee5.com/global/hi/tv-shows/details/khwaabon-ki-zamin-par/0-6-270',
        'playlist_mincount': 150,
        'info_dict': {
            'id': '0-6-270',
        },
-    }
-    ]
+    }, {
+        'url': 'https://www.zee5.com/tv-shows/details/chala-hawa-yeu-dya-ladies-zindabaad/0-6-2943/episodes',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.zee5.com/web-series/details/mithya/0-6-4z587408',
+        'only_matching': True,
+    }]

     def _entries(self, show_id):
         access_token_request = self._download_json(
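Both the zattoo and zee5 hunks migrate from a hand-rolled _login() called out of _real_initialize() to the framework's _perform_login() hook. Roughly, the base class sequences the hooks as below (a simplified sketch of the dispatch, not the actual InfoExtractor code):

```python
class InfoExtractorSketch:
    def initialize(self):
        self._initialize_pre_login()     # unauthenticated setup, e.g. Zattoo's app-token fetch
        username, password = self._get_login_info()
        if username:
            # Only invoked when credentials were supplied, so extractors no
            # longer need their own "if not username" guards.
            self._perform_login(username, password)
        self._real_initialize()          # post-login checks, e.g. raise_login_required
```

This is why zee5's new _perform_login can raise its login hint unconditionally in the final else branch: reaching it means credentials were given but matched neither the OTP nor the token form.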
diff --git a/hypervideo_dl/extractor/zhihu.py b/hypervideo_dl/extractor/zhihu.py
index d1ed55b..278a943 100644
--- a/hypervideo_dl/extractor/zhihu.py
+++ b/hypervideo_dl/extractor/zhihu.py
@@ -2,7 +2,7 @@ from __future__ import unicode_literals

 from .common import InfoExtractor
-from ..utils import float_or_none, int_or_none
+from ..utils import format_field, float_or_none, int_or_none


 class ZhihuIE(InfoExtractor):
@@ -61,7 +61,7 @@ class ZhihuIE(InfoExtractor):
             'uploader': author.get('name'),
             'timestamp': int_or_none(zvideo.get('published_at')),
             'uploader_id': author.get('id'),
-            'uploader_url': 'https://www.zhihu.com/people/' + url_token if url_token else None,
+            'uploader_url': format_field(url_token, template='https://www.zhihu.com/people/%s'),
             'duration': float_or_none(video.get('duration')),
             'view_count': int_or_none(zvideo.get('play_count')),
             'like_count': int_or_none(zvideo.get('liked_count')),
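format_field replaces the `'prefix' + x if x else None` pattern used before. A minimal stand-in for the template mode used here (the real utility has a fuller signature with field lookup, ignore values and a default):

```python
def format_field(obj, template='%s', default=None):
    # Apply the template only when a value is present.
    return template % obj if obj else default


assert format_field('ada', template='https://www.zhihu.com/people/%s') \
    == 'https://www.zhihu.com/people/ada'
assert format_field(None, template='https://www.zhihu.com/people/%s') is None
```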
diff --git a/hypervideo_dl/extractor/zingmp3.py b/hypervideo_dl/extractor/zingmp3.py
index a3edc15..419bf30 100644
--- a/hypervideo_dl/extractor/zingmp3.py
+++ b/hypervideo_dl/extractor/zingmp3.py
@@ -1,22 +1,46 @@
 # coding: utf-8
 from __future__ import unicode_literals

+import hashlib
+import hmac
+import urllib.parse
+
 from .common import InfoExtractor
 from ..utils import (
     int_or_none,
+    traverse_obj,
 )


 class ZingMp3BaseIE(InfoExtractor):
-    _VALID_URL_TMPL = r'https?://(?:mp3\.zing|zingmp3)\.vn/(?:%s)/[^/]+/(?P<id>\w+)\.html'
+    _VALID_URL_TMPL = r'https?://(?:mp3\.zing|zingmp3)\.vn/(?P<type>(?:%s))/[^/]+/(?P<id>\w+)(?:\.html|\?)'
     _GEO_COUNTRIES = ['VN']
+    _DOMAIN = 'https://zingmp3.vn'
+    _SLUG_API = {
+        'bai-hat': '/api/v2/page/get/song',
+        'embed': '/api/v2/page/get/song',
+        'video-clip': '/api/v2/page/get/video',
+        'playlist': '/api/v2/page/get/playlist',
+        'album': '/api/v2/page/get/playlist',
+        'lyric': '/api/v2/lyric/get/lyric',
+        'song_streaming': '/api/v2/song/get/streaming',
+    }
+
+    _API_KEY = '88265e23d4284f25963e6eedac8fbfa3'
+    _SECRET_KEY = b'2aa2d1c561e809b267f3638c4a307aab'
+
+    def _extract_item(self, item, song_id, type_url, fatal):
+        item_id = item.get('encodeId') or song_id
+        title = item.get('title') or item.get('alias')

-    def _extract_item(self, item, fatal):
-        item_id = item['id']
-        title = item.get('name') or item['title']
+        if type_url == 'video-clip':
+            source = item.get('streaming')
+        else:
+            api = self.get_api_with_signature(name_api=self._SLUG_API.get('song_streaming'), param={'id': item_id})
+            source = self._download_json(api, video_id=item_id).get('data')

         formats = []
-        for k, v in (item.get('source') or {}).items():
+        for k, v in (source or {}).items():
             if not v:
                 continue
             if k in ('mp4', 'hls'):
@@ -34,31 +58,35 @@ class ZingMp3BaseIE(InfoExtractor):
                         'height': int_or_none(self._search_regex(
                             r'^(\d+)p', res, 'resolution', default=None)),
                     })
-            else:
-                formats.append({
-                    'ext': 'mp3',
-                    'format_id': k,
-                    'tbr': int_or_none(k),
-                    'url': self._proto_relative_url(v),
-                    'vcodec': 'none',
-                })
+                continue
+            elif v == 'VIP':
+                continue
+            formats.append({
+                'ext': 'mp3',
+                'format_id': k,
+                'tbr': int_or_none(k),
+                'url': self._proto_relative_url(v),
+                'vcodec': 'none',
+            })
         if not formats:
             if not fatal:
                 return
-            msg = item['msg']
+            msg = item.get('msg')
             if msg == 'Sorry, this content is not available in your country.':
                 self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True)
             self.raise_no_formats(msg, expected=True)
         self._sort_formats(formats)

-        subtitles = None
         lyric = item.get('lyric')
-        if lyric:
-            subtitles = {
-                'origin': [{
-                    'url': lyric,
-                }],
-            }
+        if not lyric:
+            api = self.get_api_with_signature(name_api=self._SLUG_API.get("lyric"), param={'id': item_id})
+            info_lyric = self._download_json(api, video_id=item_id)
+            lyric = traverse_obj(info_lyric, ('data', 'file'))
+        subtitles = {
+            'origin': [{
+                'url': lyric,
+            }],
+        } if lyric else None

         album = item.get('album') or {}
@@ -66,30 +94,40 @@ class ZingMp3BaseIE(InfoExtractor):
             'id': item_id,
             'title': title,
             'formats': formats,
-            'thumbnail': item.get('thumbnail'),
+            'thumbnail': traverse_obj(item, 'thumbnail', 'thumbnailM'),
             'subtitles': subtitles,
             'duration': int_or_none(item.get('duration')),
             'track': title,
-            'artist': item.get('artists_names'),
-            'album': album.get('name') or album.get('title'),
-            'album_artist': album.get('artists_names'),
+            'artist': traverse_obj(item, 'artistsNames', 'artists_names'),
+            'album': traverse_obj(album, 'name', 'title'),
+            'album_artist': traverse_obj(album, 'artistsNames', 'artists_names'),
         }

+    def _real_initialize(self):
+        if not self.get_param('cookiefile') and not self.get_param('cookiesfrombrowser'):
+            self._request_webpage(self.get_api_with_signature(name_api=self._SLUG_API['bai-hat'], param={'id': ''}),
+                                  None, note='Updating cookies')
+
     def _real_extract(self, url):
-        page_id = self._match_id(url)
-        webpage = self._download_webpage(
-            url.replace('://zingmp3.vn/', '://mp3.zing.vn/'),
-            page_id, query={'play_song': 1})
-        data_path = self._search_regex(
-            r'data-xml="([^"]+)', webpage, 'data path')
-        return self._process_data(self._download_json(
-            'https://mp3.zing.vn/xhr' + data_path, page_id)['data'])
+        song_id, type_url = self._match_valid_url(url).group('id', 'type')
+        api = self.get_api_with_signature(name_api=self._SLUG_API[type_url], param={'id': song_id})
+        return self._process_data(self._download_json(api, song_id)['data'], song_id, type_url)
+
+    def get_api_with_signature(self, name_api, param):
+        param.update({'ctime': '1'})
+        sha256 = hashlib.sha256(''.join(f'{i}={param[i]}' for i in sorted(param)).encode('utf-8')).hexdigest()
+        data = {
+            'apiKey': self._API_KEY,
+            'sig': hmac.new(self._SECRET_KEY, f'{name_api}{sha256}'.encode('utf-8'), hashlib.sha512).hexdigest(),
+            **param,
+        }
+        return f'{self._DOMAIN}{name_api}?{urllib.parse.urlencode(data)}'


 class ZingMp3IE(ZingMp3BaseIE):
-    _VALID_URL = ZingMp3BaseIE._VALID_URL_TMPL % 'bai-hat|video-clip'
+    _VALID_URL = ZingMp3BaseIE._VALID_URL_TMPL % 'bai-hat|video-clip|embed'
     _TESTS = [{
-        'url': 'http://mp3.zing.vn/bai-hat/Xa-Mai-Xa-Bao-Thy/ZWZB9WAB.html',
+        'url': 'https://mp3.zing.vn/bai-hat/Xa-Mai-Xa-Bao-Thy/ZWZB9WAB.html',
         'md5': 'ead7ae13693b3205cbc89536a077daed',
         'info_dict': {
             'id': 'ZWZB9WAB',
@@ -108,8 +146,8 @@ class ZingMp3IE(ZingMp3BaseIE):
             'album_artist': 'Bảo Thy',
         },
     }, {
-        'url': 'https://mp3.zing.vn/video-clip/Suong-Hoa-Dua-Loi-K-ICM-RYO/ZO8ZF7C7.html',
-        'md5': 'e9c972b693aa88301ef981c8151c4343',
+        'url': 'https://zingmp3.vn/video-clip/Suong-Hoa-Dua-Loi-K-ICM-RYO/ZO8ZF7C7.html',
+        'md5': 'c7f23d971ac1a4f675456ed13c9b9612',
         'info_dict': {
             'id': 'ZO8ZF7C7',
             'title': 'Sương Hoa Đưa Lối',
@@ -118,16 +156,35 @@ class ZingMp3IE(ZingMp3BaseIE):
             'duration': 207,
             'track': 'Sương Hoa Đưa Lối',
             'artist': 'K-ICM, RYO',
+            'album': 'Sương Hoa Đưa Lối (Single)',
+            'album_artist': 'K-ICM, RYO',
         },
     }, {
+        'url': 'https://zingmp3.vn/bai-hat/Nguoi-Yeu-Toi-Lanh-Lung-Sat-Da-Mr-Siro/ZZ6IW7OU.html',
+        'md5': '3e9f7a9bd0d965573dbff8d7c68b629d',
+        'info_dict': {
+            'id': 'ZZ6IW7OU',
+            'title': 'Người Yêu Tôi Lạnh Lùng Sắt Đá',
+            'ext': 'mp3',
+            'thumbnail': r're:^https?://.+\.jpg',
+            'duration': 303,
+            'track': 'Người Yêu Tôi Lạnh Lùng Sắt Đá',
+            'artist': 'Mr. Siro',
+            'album': 'Người Yêu Tôi Lạnh Lùng Sắt Đá (Single)',
+            'album_artist': 'Mr. Siro',
+        },
+    }, {
+        'url': 'https://zingmp3.vn/embed/song/ZWZEI76B?start=false',
+        'only_matching': True,
+    }, {
         'url': 'https://zingmp3.vn/bai-hat/Xa-Mai-Xa-Bao-Thy/ZWZB9WAB.html',
         'only_matching': True,
     }]
     IE_NAME = 'zingmp3'
-    IE_DESC = 'mp3.zing.vn'
+    IE_DESC = 'zingmp3.vn'

-    def _process_data(self, data):
-        return self._extract_item(data, True)
+    def _process_data(self, data, song_id, type_url):
+        return self._extract_item(data, song_id, type_url, True)


 class ZingMp3AlbumIE(ZingMp3BaseIE):
@@ -139,7 +196,15 @@ class ZingMp3AlbumIE(ZingMp3BaseIE):
             'id': 'ZWZBWDAF',
             'title': 'Lâu Đài Tình Ái',
         },
-        'playlist_count': 10,
+        'playlist_count': 9,
+    }, {
+        'url': 'https://zingmp3.vn/album/Nhung-Bai-Hat-Hay-Nhat-Cua-Mr-Siro-Mr-Siro/ZWZAEZZD.html',
+        'info_dict': {
+            '_type': 'playlist',
+            'id': 'ZWZAEZZD',
+            'title': 'Những Bài Hát Hay Nhất Của Mr. Siro',
+        },
+        'playlist_count': 49,
     }, {
         'url': 'http://mp3.zing.vn/playlist/Duong-Hong-Loan-apollobee/IWCAACCB.html',
         'only_matching': True,
@@ -149,12 +214,12 @@ class ZingMp3AlbumIE(ZingMp3BaseIE):
     }]
     IE_NAME = 'zingmp3:album'

-    def _process_data(self, data):
+    def _process_data(self, data, song_id, type_url):
         def entries():
-            for item in (data.get('items') or []):
-                entry = self._extract_item(item, False)
+            for item in traverse_obj(data, ('song', 'items')) or []:
+                entry = self._extract_item(item, song_id, type_url, False)
                 if entry:
                     yield entry
-        info = data.get('info') or {}
-        return self.playlist_result(
-            entries(), info.get('id'), info.get('name') or info.get('title'))
+
+        return self.playlist_result(entries(), traverse_obj(data, 'id', 'encodeId'),
+                                    traverse_obj(data, 'name', 'title'))
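The request signing introduced in get_api_with_signature above works in two steps: the sorted query parameters are condensed to a SHA-256 hex digest, and that digest, concatenated after the API path, is signed with HMAC-SHA512. A standalone version for illustration, using the key constants from the diff:

```python
import hashlib
import hmac
import urllib.parse

API_KEY = '88265e23d4284f25963e6eedac8fbfa3'
SECRET_KEY = b'2aa2d1c561e809b267f3638c4a307aab'


def signed_api_url(name_api, param):
    param = {**param, 'ctime': '1'}
    # Step 1: hash 'ctime=1id=...' (key=value pairs in sorted key order).
    digest = hashlib.sha256(
        ''.join(f'{k}={param[k]}' for k in sorted(param)).encode()).hexdigest()
    # Step 2: HMAC-SHA512 over '<api path><sha256 digest>'.
    sig = hmac.new(SECRET_KEY, f'{name_api}{digest}'.encode(), hashlib.sha512).hexdigest()
    return f'https://zingmp3.vn{name_api}?' + urllib.parse.urlencode(
        {'apiKey': API_KEY, 'sig': sig, **param})


print(signed_api_url('/api/v2/page/get/song', {'id': 'ZWZB9WAB'}))
```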
diff --git a/hypervideo_dl/extractor/zoom.py b/hypervideo_dl/extractor/zoom.py
index 25a0902..c005488 100644
--- a/hypervideo_dl/extractor/zoom.py
+++ b/hypervideo_dl/extractor/zoom.py
@@ -6,6 +6,7 @@ from .common import InfoExtractor
 from ..utils import (
     ExtractorError,
     int_or_none,
+    str_or_none,
     js_to_json,
     parse_filesize,
     urlencode_postdata,
@@ -23,7 +24,8 @@ class ZoomIE(InfoExtractor):
             'id': 'dUk_CNBETmZ5VA2BwEl-jjakPpJ3M1pcfVYAPRsoIbEByGsLjUZtaa4yCATQuOL3der8BlTwxQePl_j0.EImBkXzTIaPvdZO5',
             'ext': 'mp4',
             'title': 'China\'s "two sessions" and the new five-year plan',
-        }
+        },
+        'skip': 'Recording requires email authentication to access',
     }

     def _real_extract(self, url):
@@ -56,22 +58,46 @@
             webpage, 'data'), play_id, js_to_json)

         subtitles = {}
-        for _type in ('transcript', 'cc'):
+        for _type in ('transcript', 'cc', 'chapter'):
             if data.get('%sUrl' % _type):
                 subtitles[_type] = [{
                     'url': urljoin(base_url, data['%sUrl' % _type]),
                     'ext': 'vtt',
                 }]
+        formats = []
+
+        if data.get('viewMp4Url'):
+            formats.append({
+                'format_note': 'Camera stream',
+                'url': str_or_none(data.get('viewMp4Url')),
+                'width': int_or_none(data.get('viewResolvtionsWidth')),
+                'height': int_or_none(data.get('viewResolvtionsHeight')),
+                'format_id': str_or_none(data.get('recordingId')),
+                'ext': 'mp4',
+                'filesize_approx': parse_filesize(data.get('fileSize')),
+                'preference': 0
+            })
+
+        if data.get('shareMp4Url'):
+            formats.append({
+                'format_note': 'Screen share stream',
+                'url': str_or_none(data.get('shareMp4Url')),
+                'width': int_or_none(data.get('shareResolvtionsWidth')),
+                'height': int_or_none(data.get('shareResolvtionsHeight')),
+                'format_id': str_or_none(data.get('shareVideoId')),
+                'ext': 'mp4',
+                'preference': -1
+            })
+
+        self._sort_formats(formats)
+
         return {
             'id': play_id,
-            'title': data['topic'],
-            'url': data['viewMp4Url'],
+            'title': data.get('topic'),
             'subtitles': subtitles,
-            'width': int_or_none(data.get('viewResolvtionsWidth')),
-            'height': int_or_none(data.get('viewResolvtionsHeight')),
+            'formats': formats,
             'http_headers': {
                 'Referer': base_url,
             },
-            'filesize_approx': parse_filesize(data.get('fileSize')),
         }
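The zoom rewrite stops returning a single 'url' and instead exposes both recordings as formats: the camera stream gets preference 0 and the screen share -1, so default format selection favours the camera view while both remain listed. A minimal illustration of how such preferences order the list (yt-dlp lists formats worst to best, so the preferred one sorts last; the real _sort_formats weighs many more fields):

```python
formats = [
    {'format_id': 'share', 'format_note': 'Screen share stream', 'preference': -1},
    {'format_id': 'view', 'format_note': 'Camera stream', 'preference': 0},
]
# Simplified ordering criterion: higher preference wins.
formats.sort(key=lambda f: f.get('preference', 0))
assert [f['format_id'] for f in formats] == ['share', 'view']  # 'view' is preferred
```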