-rw-r--r--  yt_dlp/downloader/__init__.py  |   2
-rw-r--r--  yt_dlp/downloader/fc2.py       |  41
-rw-r--r--  yt_dlp/downloader/fragment.py  |  39
-rw-r--r--  yt_dlp/extractor/abematv.py    | 488
-rw-r--r--  yt_dlp/extractor/extractors.py |   5
-rw-r--r--  yt_dlp/extractor/fc2.py        | 150
-rw-r--r--  yt_dlp/extractor/youtube.py    |  14
-rw-r--r--  yt_dlp/utils.py                | 448
8 files changed, 1101 insertions, 86 deletions
diff --git a/yt_dlp/downloader/__init__.py b/yt_dlp/downloader/__init__.py index 76841993b..96d484dee 100644 --- a/yt_dlp/downloader/__init__.py +++ b/yt_dlp/downloader/__init__.py @@ -30,6 +30,7 @@ def get_suitable_downloader(info_dict, params={}, default=NO_DEFAULT, protocol=N from .common import FileDownloader from .dash import DashSegmentsFD from .f4m import F4mFD +from .fc2 import FC2LiveFD from .hls import HlsFD from .http import HttpFD from .rtmp import RtmpFD @@ -58,6 +59,7 @@ PROTOCOL_MAP = { 'ism': IsmFD, 'mhtml': MhtmlFD, 'niconico_dmc': NiconicoDmcFD, + 'fc2_live': FC2LiveFD, 'websocket_frag': WebSocketFragmentFD, 'youtube_live_chat': YoutubeLiveChatFD, 'youtube_live_chat_replay': YoutubeLiveChatFD, diff --git a/yt_dlp/downloader/fc2.py b/yt_dlp/downloader/fc2.py new file mode 100644 index 000000000..157bcf23e --- /dev/null +++ b/yt_dlp/downloader/fc2.py @@ -0,0 +1,41 @@ +from __future__ import division, unicode_literals + +import threading + +from .common import FileDownloader +from .external import FFmpegFD + + +class FC2LiveFD(FileDownloader): + """ + Downloads FC2 live without being stopped. <br> + Note, this is not a part of public API, and will be removed without notice. + DO NOT USE + """ + + def real_download(self, filename, info_dict): + ws = info_dict['ws'] + + heartbeat_lock = threading.Lock() + heartbeat_state = [None, 1] + + def heartbeat(): + try: + heartbeat_state[1] += 1 + ws.send('{"name":"heartbeat","arguments":{},"id":%d}' % heartbeat_state[1]) + except Exception: + self.to_screen('[fc2:live] Heartbeat failed') + + with heartbeat_lock: + heartbeat_state[0] = threading.Timer(30, heartbeat) + heartbeat_state[0]._daemonic = True + heartbeat_state[0].start() + + heartbeat() + + new_info_dict = info_dict.copy() + new_info_dict.update({ + 'ws': None, + 'protocol': 'live_ffmpeg', + }) + return FFmpegFD(self.ydl, self.params or {}).download(filename, new_info_dict) diff --git a/yt_dlp/downloader/fragment.py b/yt_dlp/downloader/fragment.py index 19c0990d3..7b213cd5f 100644 --- a/yt_dlp/downloader/fragment.py +++ b/yt_dlp/downloader/fragment.py @@ -25,6 +25,7 @@ from ..utils import ( error_to_compat_str, encodeFilename, sanitized_Request, + traverse_obj, ) @@ -382,6 +383,7 @@ class FragmentFD(FileDownloader): max_workers = self.params.get('concurrent_fragment_downloads', 1) if max_progress > 1: self._prepare_multiline_status(max_progress) + is_live = any(traverse_obj(args, (..., 2, 'is_live'), default=[])) def thread_func(idx, ctx, fragments, info_dict, tpe): ctx['max_progress'] = max_progress @@ -395,25 +397,43 @@ class FragmentFD(FileDownloader): def __exit__(self, exc_type, exc_val, exc_tb): pass - spins = [] if compat_os_name == 'nt': - self.report_warning('Ctrl+C does not work on Windows when used with parallel threads. 
' - 'This is a known issue and patches are welcome') + def bindoj_result(future): + while True: + try: + return future.result(0.1) + except KeyboardInterrupt: + raise + except concurrent.futures.TimeoutError: + continue + else: + def bindoj_result(future): + return future.result() + + def interrupt_trigger_iter(fg): + for f in fg: + if not interrupt_trigger[0]: + break + yield f + + spins = [] for idx, (ctx, fragments, info_dict) in enumerate(args): tpe = FTPE(math.ceil(max_workers / max_progress)) - job = tpe.submit(thread_func, idx, ctx, fragments, info_dict, tpe) + job = tpe.submit(thread_func, idx, ctx, interrupt_trigger_iter(fragments), info_dict, tpe) spins.append((tpe, job)) result = True for tpe, job in spins: try: - result = result and job.result() + result = result and bindoj_result(job) except KeyboardInterrupt: interrupt_trigger[0] = False finally: tpe.shutdown(wait=True) - if not interrupt_trigger[0]: + if not interrupt_trigger[0] and not is_live: raise KeyboardInterrupt() + # we expect the user wants to stop and DOES WANT the preceding postprocessors to run; + # so returning an intermediate result here instead of KeyboardInterrupt on live return result def download_and_append_fragments( @@ -431,10 +451,11 @@ class FragmentFD(FileDownloader): pack_func = lambda frag_content, _: frag_content def download_fragment(fragment, ctx): + if not interrupt_trigger[0]: + return False, fragment['frag_index'] + frag_index = ctx['fragment_index'] = fragment['frag_index'] ctx['last_error'] = None - if not interrupt_trigger[0]: - return False, frag_index headers = info_dict.get('http_headers', {}).copy() byte_range = fragment.get('byte_range') if byte_range: @@ -500,8 +521,6 @@ class FragmentFD(FileDownloader): self.report_warning('The download speed shown is only of one thread. 
This is a known issue and patches are welcome') with tpe or concurrent.futures.ThreadPoolExecutor(max_workers) as pool: for fragment, frag_content, frag_index, frag_filename in pool.map(_download_fragment, fragments): - if not interrupt_trigger[0]: - break ctx['fragment_filename_sanitized'] = frag_filename ctx['fragment_index'] = frag_index result = append_fragment(decrypt_fragment(fragment, frag_content), frag_index, ctx) diff --git a/yt_dlp/extractor/abematv.py b/yt_dlp/extractor/abematv.py new file mode 100644 index 000000000..66b12c72f --- /dev/null +++ b/yt_dlp/extractor/abematv.py @@ -0,0 +1,488 @@ +import io +import json +import time +import hashlib +import hmac +import re +import struct +from base64 import urlsafe_b64encode +from binascii import unhexlify + +import typing +if typing.TYPE_CHECKING: + from ..YoutubeDL import YoutubeDL + +from .common import InfoExtractor +from ..aes import aes_ecb_decrypt +from ..compat import ( + compat_urllib_response, + compat_urllib_parse_urlparse, + compat_urllib_request, +) +from ..utils import ( + ExtractorError, + decode_base, + int_or_none, + random_uuidv4, + request_to_url, + time_seconds, + update_url_query, + traverse_obj, + intlist_to_bytes, + bytes_to_intlist, + urljoin, +) + + +# NOTE: network handler-related code is a temporary thing until the network stack overhaul PRs are merged (#2861/#2862) + +def add_opener(self: 'YoutubeDL', handler): + ''' Add a handler for opening URLs, like _download_webpage ''' + # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L426 + # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L605 + assert isinstance(self._opener, compat_urllib_request.OpenerDirector) + self._opener.add_handler(handler) + + +def remove_opener(self: 'YoutubeDL', handler): + ''' + Remove handler(s) for opening URLs + @param handler Either the handler object itself or a handler type. + Specifying a handler type will remove all handlers for which isinstance returns True. 
+ ''' + # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L426 + # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L605 + opener = self._opener + assert isinstance(self._opener, compat_urllib_request.OpenerDirector) + if isinstance(handler, (type, tuple)): + find_cp = lambda x: isinstance(x, handler) + else: + find_cp = lambda x: x is handler + + removed = [] + for meth in dir(handler): + if meth in ["redirect_request", "do_open", "proxy_open"]: + # oops, coincidental match + continue + + i = meth.find("_") + protocol = meth[:i] + condition = meth[i + 1:] + + if condition.startswith("error"): + j = condition.find("_") + i + 1 + kind = meth[j + 1:] + try: + kind = int(kind) + except ValueError: + pass + lookup = opener.handle_error.get(protocol, {}) + opener.handle_error[protocol] = lookup + elif condition == "open": + kind = protocol + lookup = opener.handle_open + elif condition == "response": + kind = protocol + lookup = opener.process_response + elif condition == "request": + kind = protocol + lookup = opener.process_request + else: + continue + + handlers = lookup.setdefault(kind, []) + if handlers: + handlers[:] = [x for x in handlers if not find_cp(x)] + + removed.append(x for x in handlers if find_cp(x)) + + if removed: + for x in opener.handlers: + if find_cp(x): + x.add_parent(None) + opener.handlers[:] = [x for x in opener.handlers if not find_cp(x)] + + +class AbemaLicenseHandler(compat_urllib_request.BaseHandler): + handler_order = 499 + STRTABLE = '123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz' + HKEY = b'3AF0298C219469522A313570E8583005A642E73EDD58E3EA2FB7339D3DF1597E' + + def __init__(self, ie: 'AbemaTVIE'): + # the protocol that this should really handle is 'abematv-license://' + # abematv_license_open is just a placeholder for development purposes + # ref. 
https://github.com/python/cpython/blob/f4c03484da59049eb62a9bf7777b963e2267d187/Lib/urllib/request.py#L510 + setattr(self, 'abematv-license_open', getattr(self, 'abematv_license_open')) + self.ie = ie + + def _get_videokey_from_ticket(self, ticket): + to_show = self.ie._downloader.params.get('verbose', False) + media_token = self.ie._get_media_token(to_show=to_show) + + license_response = self.ie._download_json( + 'https://license.abema.io/abematv-hls', None, note='Requesting playback license' if to_show else False, + query={'t': media_token}, + data=json.dumps({ + 'kv': 'a', + 'lt': ticket + }).encode('utf-8'), + headers={ + 'Content-Type': 'application/json', + }) + + res = decode_base(license_response['k'], self.STRTABLE) + encvideokey = bytes_to_intlist(struct.pack('>QQ', res >> 64, res & 0xffffffffffffffff)) + + h = hmac.new( + unhexlify(self.HKEY), + (license_response['cid'] + self.ie._DEVICE_ID).encode('utf-8'), + digestmod=hashlib.sha256) + enckey = bytes_to_intlist(h.digest()) + + return intlist_to_bytes(aes_ecb_decrypt(encvideokey, enckey)) + + def abematv_license_open(self, url): + url = request_to_url(url) + ticket = compat_urllib_parse_urlparse(url).netloc + response_data = self._get_videokey_from_ticket(ticket) + return compat_urllib_response.addinfourl(io.BytesIO(response_data), headers={ + 'Content-Length': len(response_data), + }, url=url, code=200) + + +class AbemaTVBaseIE(InfoExtractor): + def _extract_breadcrumb_list(self, webpage, video_id): + for jld in re.finditer( + r'(?is)</span></li></ul><script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>', + webpage): + jsonld = self._parse_json(jld.group('json_ld'), video_id, fatal=False) + if jsonld: + if jsonld.get('@type') != 'BreadcrumbList': + continue + trav = traverse_obj(jsonld, ('itemListElement', ..., 'name')) + if trav: + return trav + return [] + + +class AbemaTVIE(AbemaTVBaseIE): + _VALID_URL = r'https?://abema\.tv/(?P<type>now-on-air|video/episode|channels/.+?/slots)/(?P<id>[^?/]+)' + _NETRC_MACHINE = 'abematv' + _TESTS = [{ + 'url': 'https://abema.tv/video/episode/194-25_s2_p1', + 'info_dict': { + 'id': '194-25_s2_p1', + 'title': '第1話 「チーズケーキ」 「モーニング再び」', + 'series': '異世界食堂2', + 'series_number': 2, + 'episode': '第1話 「チーズケーキ」 「モーニング再び」', + 'episode_number': 1, + }, + 'skip': 'expired', + }, { + 'url': 'https://abema.tv/channels/anime-live2/slots/E8tvAnMJ7a9a5d', + 'info_dict': { + 'id': 'E8tvAnMJ7a9a5d', + 'title': 'ゆるキャン△ SEASON2 全話一挙【無料ビデオ72時間】', + 'series': 'ゆるキャン△ SEASON2', + 'episode': 'ゆるキャン△ SEASON2 全話一挙【無料ビデオ72時間】', + 'series_number': 2, + 'episode_number': 1, + 'description': 'md5:9c5a3172ae763278f9303922f0ea5b17', + }, + 'skip': 'expired', + }, { + 'url': 'https://abema.tv/video/episode/87-877_s1282_p31047', + 'info_dict': { + 'id': 'E8tvAnMJ7a9a5d', + 'title': '第5話『光射す』', + 'description': 'md5:56d4fc1b4f7769ded5f923c55bb4695d', + 'thumbnail': r're:https://hayabusa\.io/.+', + 'series': '相棒', + 'episode': '第5話『光射す』', + }, + 'skip': 'expired', + }, { + 'url': 'https://abema.tv/now-on-air/abema-anime', + 'info_dict': { + 'id': 'abema-anime', + # this varies + # 'title': '女子高生の無駄づかい 全話一挙【無料ビデオ72時間】', + 'description': 'md5:55f2e61f46a17e9230802d7bcc913d5f', + 'is_live': True, + }, + 'skip': 'Not supported until yt-dlp implements native live downloader OR AbemaTV can start a local HTTP server', + }] + _USERTOKEN = None + _DEVICE_ID = None + _TIMETABLE = None + _MEDIATOKEN = None + + _SECRETKEY = 
b'v+Gjs=25Aw5erR!J8ZuvRrCx*rGswhB&qdHd_SYerEWdU&a?3DzN9BRbp5KwY4hEmcj5#fykMjJ=AuWz5GSMY-d@H7DMEh3M@9n2G552Us$$k9cD=3TxwWe86!x#Zyhe' + + def _generate_aks(self, deviceid): + deviceid = deviceid.encode('utf-8') + # add 1 hour and then drop minutes and secs + ts_1hour = int((time_seconds(hours=9) // 3600 + 1) * 3600) + time_struct = time.gmtime(ts_1hour) + ts_1hour_str = str(ts_1hour).encode('utf-8') + + tmp = None + + def mix_once(nonce): + nonlocal tmp + h = hmac.new(self._SECRETKEY, digestmod=hashlib.sha256) + h.update(nonce) + tmp = h.digest() + + def mix_tmp(count): + nonlocal tmp + for i in range(count): + mix_once(tmp) + + def mix_twist(nonce): + nonlocal tmp + mix_once(urlsafe_b64encode(tmp).rstrip(b'=') + nonce) + + mix_once(self._SECRETKEY) + mix_tmp(time_struct.tm_mon) + mix_twist(deviceid) + mix_tmp(time_struct.tm_mday % 5) + mix_twist(ts_1hour_str) + mix_tmp(time_struct.tm_hour % 5) + + return urlsafe_b64encode(tmp).rstrip(b'=').decode('utf-8') + + def _get_device_token(self): + if self._USERTOKEN: + return self._USERTOKEN + + self._DEVICE_ID = random_uuidv4() + aks = self._generate_aks(self._DEVICE_ID) + user_data = self._download_json( + 'https://api.abema.io/v1/users', None, note='Authorizing', + data=json.dumps({ + 'deviceId': self._DEVICE_ID, + 'applicationKeySecret': aks, + }).encode('utf-8'), + headers={ + 'Content-Type': 'application/json', + }) + self._USERTOKEN = user_data['token'] + + # don't allow adding it two or more times, though it's guarded + remove_opener(self._downloader, AbemaLicenseHandler) + add_opener(self._downloader, AbemaLicenseHandler(self)) + + return self._USERTOKEN + + def _get_media_token(self, invalidate=False, to_show=True): + if not invalidate and self._MEDIATOKEN: + return self._MEDIATOKEN + + self._MEDIATOKEN = self._download_json( + 'https://api.abema.io/v1/media/token', None, note='Fetching media token' if to_show else False, + query={ + 'osName': 'android', + 'osVersion': '6.0.1', + 'osLang': 'ja_JP', + 'osTimezone': 'Asia/Tokyo', + 'appId': 'tv.abema', + 'appVersion': '3.27.1' + }, headers={ + 'Authorization': 'bearer ' + self._get_device_token() + })['token'] + + return self._MEDIATOKEN + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + # No authentication to be performed + if not username: + return True + + if '@' in username: # don't strictly check if it's an email address or not + ep, method = 'user/email', 'email' + else: + ep, method = 'oneTimePassword', 'userId' + + login_response = self._download_json( + f'https://api.abema.io/v1/auth/{ep}', None, note='Logging in', + data=json.dumps({ + method: username, + 'password': password + }).encode('utf-8'), headers={ + 'Authorization': 'bearer ' + self._get_device_token(), + 'Origin': 'https://abema.tv', + 'Referer': 'https://abema.tv/', + 'Content-Type': 'application/json', + }) + + self._USERTOKEN = login_response['token'] + self._get_media_token(True) + + def _real_extract(self, url): + # starting download using infojson from this extractor is undefined behavior, + # and will never be fixed in the future; you must trigger downloads by directly specifying the URL. 
+ # (unless there's a way to hook before downloading by extractor) + video_id, video_type = self._match_valid_url(url).group('id', 'type') + headers = { + 'Authorization': 'Bearer ' + self._get_device_token(), + } + video_type = video_type.split('/')[-1] + + webpage = self._download_webpage(url, video_id) + canonical_url = self._search_regex( + r'<link\s+rel="canonical"\s*href="(.+?)"', webpage, 'canonical URL', + default=url) + info = self._search_json_ld(webpage, video_id, default={}) + + title = self._search_regex( + r'<span\s*class=".+?EpisodeTitleBlock__title">(.+?)</span>', webpage, 'title', default=None) + if not title: + jsonld = None + for jld in re.finditer( + r'(?is)<span\s*class="com-m-Thumbnail__image">(?:</span>)?<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>', + webpage): + jsonld = self._parse_json(jld.group('json_ld'), video_id, fatal=False) + if jsonld: + break + if jsonld: + title = jsonld.get('caption') + if not title and video_type == 'now-on-air': + if not self._TIMETABLE: + # cache the timetable because it goes to 5MiB in size (!!) + self._TIMETABLE = self._download_json( + 'https://api.abema.io/v1/timetable/dataSet?debug=false', video_id, + headers=headers) + now = time_seconds(hours=9) + for slot in self._TIMETABLE.get('slots', []): + if slot.get('channelId') != video_id: + continue + if slot['startAt'] <= now and now < slot['endAt']: + title = slot['title'] + break + + # read breadcrumb on top of page + breadcrumb = self._extract_breadcrumb_list(webpage, video_id) + if breadcrumb: + # breadcrumb list translates to: (example is 1st test for this IE) + # Home > Anime (genre) > Isekai Shokudo 2 (series name) > Episode 1 "Cheese cakes" "Morning again" (episode title) + # hence this works + info['series'] = breadcrumb[-2] + info['episode'] = breadcrumb[-1] + if not title: + title = info['episode'] + + description = self._html_search_regex( + (r'<p\s+class="com-video-EpisodeDetailsBlock__content"><span\s+class=".+?">(.+?)</span></p><div', + r'<span\s+class=".+?SlotSummary.+?">(.+?)</span></div><div',), + webpage, 'description', default=None, group=1) + if not description: + og_desc = self._html_search_meta( + ('description', 'og:description', 'twitter:description'), webpage) + if og_desc: + description = re.sub(r'''(?sx) + ^(.+?)(?: + アニメの動画を無料で見るならABEMA!| # anime + 等、.+ # applies for most of categories + )? 
+ ''', r'\1', og_desc) + + # canonical URL may contain series and episode number + mobj = re.search(r's(\d+)_p(\d+)$', canonical_url) + if mobj: + seri = int_or_none(mobj.group(1), default=float('inf')) + epis = int_or_none(mobj.group(2), default=float('inf')) + info['series_number'] = seri if seri < 100 else None + # some anime like Detective Conan (though not available in AbemaTV) + # has more than 1000 episodes (1026 as of 2021/11/15) + info['episode_number'] = epis if epis < 2000 else None + + is_live, m3u8_url = False, None + if video_type == 'now-on-air': + is_live = True + channel_url = 'https://api.abema.io/v1/channels' + if video_id == 'news-global': + channel_url = update_url_query(channel_url, {'division': '1'}) + onair_channels = self._download_json(channel_url, video_id) + for ch in onair_channels['channels']: + if video_id == ch['id']: + m3u8_url = ch['playback']['hls'] + break + else: + raise ExtractorError(f'Cannot find on-air {video_id} channel.', expected=True) + elif video_type == 'episode': + api_response = self._download_json( + f'https://api.abema.io/v1/video/programs/{video_id}', video_id, + note='Checking playability', + headers=headers) + ondemand_types = traverse_obj(api_response, ('terms', ..., 'onDemandType'), default=[]) + if 3 not in ondemand_types: + # cannot acquire decryption key for these streams + self.report_warning('This is a premium-only stream') + + m3u8_url = f'https://vod-abematv.akamaized.net/program/{video_id}/playlist.m3u8' + elif video_type == 'slots': + api_response = self._download_json( + f'https://api.abema.io/v1/media/slots/{video_id}', video_id, + note='Checking playability', + headers=headers) + if not traverse_obj(api_response, ('slot', 'flags', 'timeshiftFree'), default=False): + self.report_warning('This is a premium-only stream') + + m3u8_url = f'https://vod-abematv.akamaized.net/slot/{video_id}/playlist.m3u8' + else: + raise ExtractorError('Unreachable') + + if is_live: + self.report_warning("This is a livestream; yt-dlp doesn't support downloading natively, but FFmpeg cannot handle m3u8 manifests from AbemaTV") + self.report_warning('Please consider using Streamlink to download these streams (https://github.com/streamlink/streamlink)') + formats = self._extract_m3u8_formats( + m3u8_url, video_id, ext='mp4', live=is_live) + + info.update({ + 'id': video_id, + 'title': title, + 'description': description, + 'formats': formats, + 'is_live': is_live, + }) + return info + + +class AbemaTVTitleIE(AbemaTVBaseIE): + _VALID_URL = r'https?://abema\.tv/video/title/(?P<id>[^?/]+)' + + _TESTS = [{ + 'url': 'https://abema.tv/video/title/90-1597', + 'info_dict': { + 'id': '90-1597', + 'title': 'シャッフルアイランド', + }, + 'playlist_mincount': 2, + }, { + 'url': 'https://abema.tv/video/title/193-132', + 'info_dict': { + 'id': '193-132', + 'title': '真心が届く~僕とスターのオフィス・ラブ!?~', + }, + 'playlist_mincount': 16, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + playlist_title, breadcrumb = None, self._extract_breadcrumb_list(webpage, video_id) + if breadcrumb: + playlist_title = breadcrumb[-1] + + playlist = [ + self.url_result(urljoin('https://abema.tv/', mobj.group(1))) + for mobj in re.finditer(r'<li\s*class=".+?EpisodeList.+?"><a\s*href="(/[^"]+?)"', webpage)] + + return self.playlist_result(playlist, playlist_title=playlist_title, playlist_id=video_id) diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py index 7d4262acf..ef1d6c14d 100644 --- 
a/yt_dlp/extractor/extractors.py +++ b/yt_dlp/extractor/extractors.py @@ -14,6 +14,10 @@ from .abcotvs import ( ABCOTVSIE, ABCOTVSClipsIE, ) +from .abematv import ( + AbemaTVIE, + AbemaTVTitleIE, +) from .academicearth import AcademicEarthCourseIE from .acast import ( ACastIE, @@ -474,6 +478,7 @@ from .faz import FazIE from .fc2 import ( FC2IE, FC2EmbedIE, + FC2LiveIE, ) from .fczenit import FczenitIE from .filmmodu import FilmmoduIE diff --git a/yt_dlp/extractor/fc2.py b/yt_dlp/extractor/fc2.py index 2c19a0c6e..7fc6b0e3d 100644 --- a/yt_dlp/extractor/fc2.py +++ b/yt_dlp/extractor/fc2.py @@ -1,14 +1,21 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..compat import ( compat_parse_qs, ) from ..utils import ( ExtractorError, + WebSocketsWrapper, + has_websockets, + js_to_json, sanitized_Request, + std_headers, traverse_obj, + update_url_query, urlencode_postdata, urljoin, ) @@ -147,3 +154,146 @@ class FC2EmbedIE(InfoExtractor): 'title': title, 'thumbnail': thumbnail, } + + +class FC2LiveIE(InfoExtractor): + _VALID_URL = r'https?://live\.fc2\.com/(?P<id>\d+)' + IE_NAME = 'fc2:live' + + _TESTS = [{ + 'url': 'https://live.fc2.com/57892267/', + 'info_dict': { + 'id': '57892267', + 'title': 'どこまで・・・', + 'uploader': 'あつあげ', + 'uploader_id': '57892267', + 'thumbnail': r're:https?://.+fc2.+', + }, + 'skip': 'livestream', + }] + + def _real_extract(self, url): + if not has_websockets: + raise ExtractorError('websockets library is not available. Please install it.', expected=True) + video_id = self._match_id(url) + webpage = self._download_webpage('https://live.fc2.com/%s/' % video_id, video_id) + + self._set_cookie('live.fc2.com', 'js-player_size', '1') + + member_api = self._download_json( + 'https://live.fc2.com/api/memberApi.php', video_id, data=urlencode_postdata({ + 'channel': '1', + 'profile': '1', + 'user': '1', + 'streamid': video_id + }), note='Requesting member info') + + control_server = self._download_json( + 'https://live.fc2.com/api/getControlServer.php', video_id, note='Downloading ControlServer data', + data=urlencode_postdata({ + 'channel_id': video_id, + 'mode': 'play', + 'orz': '', + 'channel_version': member_api['data']['channel_data']['version'], + 'client_version': '2.1.0\n [1]', + 'client_type': 'pc', + 'client_app': 'browser_hls', + 'ipv6': '', + }), headers={'X-Requested-With': 'XMLHttpRequest'}) + self._set_cookie('live.fc2.com', 'l_ortkn', control_server['orz_raw']) + + ws_url = update_url_query(control_server['url'], {'control_token': control_server['control_token']}) + playlist_data = None + + self.to_screen('%s: Fetching HLS playlist info via WebSocket' % video_id) + ws = WebSocketsWrapper(ws_url, { + 'Cookie': str(self._get_cookies('https://live.fc2.com/'))[12:], + 'Origin': 'https://live.fc2.com', + 'Accept': '*/*', + 'User-Agent': std_headers['User-Agent'], + }) + ws.__enter__() + + self.write_debug('[debug] Sending HLS server request') + + while True: + recv = ws.recv() + if not recv: + continue + data = self._parse_json(recv, video_id, fatal=False) + if not data or not isinstance(data, dict): + continue + + if data.get('name') == 'connect_complete': + break + ws.send(r'{"name":"get_hls_information","arguments":{},"id":1}') + + while True: + recv = ws.recv() + if not recv: + continue + data = self._parse_json(recv, video_id, fatal=False) + if not data or not isinstance(data, dict): + continue + if data.get('name') == '_response_' and data.get('id') == 1: + self.write_debug('[debug] Goodbye.') + 
playlist_data = data + break + elif self._downloader.params.get('verbose', False): + if len(recv) > 100: + recv = recv[:100] + '...' + self.to_screen('[debug] Server said: %s' % recv) + + if not playlist_data: + raise ExtractorError('Unable to fetch HLS playlist info via WebSocket') + + formats = [] + for name, playlists in playlist_data['arguments'].items(): + if not isinstance(playlists, list): + continue + for pl in playlists: + if pl.get('status') == 0 and 'master_playlist' in pl.get('url'): + formats.extend(self._extract_m3u8_formats( + pl['url'], video_id, ext='mp4', m3u8_id=name, live=True, + headers={ + 'Origin': 'https://live.fc2.com', + 'Referer': url, + })) + + self._sort_formats(formats) + for fmt in formats: + fmt.update({ + 'protocol': 'fc2_live', + 'ws': ws, + }) + + title = self._html_search_meta(('og:title', 'twitter:title'), webpage, 'live title', fatal=False) + if not title: + title = self._html_extract_title(webpage, 'html title', fatal=False) + if title: + # remove service name in <title> + title = re.sub(r'\s+-\s+.+$', '', title) + uploader = None + if title: + match = self._search_regex(r'^(.+?)\s*\[(.+?)\]$', title, 'title and uploader', default=None, group=(1, 2)) + if match and all(match): + title, uploader = match + + live_info_view = self._search_regex(r'(?s)liveInfoView\s*:\s*({.+?}),\s*premiumStateView', webpage, 'user info', fatal=False) or None + if live_info_view: + # remove jQuery code from object literal + live_info_view = re.sub(r'\$\(.+?\)[^,]+,', '"",', live_info_view) + live_info_view = self._parse_json(js_to_json(live_info_view), video_id) + + return { + 'id': video_id, + 'title': title or traverse_obj(live_info_view, 'title'), + 'description': self._html_search_meta( + ('og:description', 'twitter:description'), + webpage, 'live description', fatal=False) or traverse_obj(live_info_view, 'info'), + 'formats': formats, + 'uploader': uploader or traverse_obj(live_info_view, 'name'), + 'uploader_id': video_id, + 'thumbnail': traverse_obj(live_info_view, 'thumb'), + 'is_live': True, + } diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 636bf42b6..47b3c5a85 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -2135,6 +2135,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return f['manifest_url'], f['manifest_stream_number'], is_live for f in formats: + f['is_live'] = True f['protocol'] = 'http_dash_segments_generator' f['fragments'] = functools.partial( self._live_dash_fragments, f['format_id'], live_start_time, mpd_feed) @@ -2157,12 +2158,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): known_idx, no_fragment_score, last_segment_url = begin_index, 0, None fragments, fragment_base_url = None, None - def _extract_sequence_from_mpd(refresh_sequence): + def _extract_sequence_from_mpd(refresh_sequence, immediate): nonlocal mpd_url, stream_number, is_live, no_fragment_score, fragments, fragment_base_url # Obtain from MPD's maximum seq value old_mpd_url = mpd_url last_error = ctx.pop('last_error', None) - expire_fast = last_error and isinstance(last_error, compat_HTTPError) and last_error.code == 403 + expire_fast = immediate or last_error and isinstance(last_error, compat_HTTPError) and last_error.code == 403 mpd_url, stream_number, is_live = (mpd_feed(format_id, 5 if expire_fast else 18000) or (mpd_url, stream_number, False)) if not refresh_sequence: @@ -2176,7 +2177,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): except ExtractorError: fmts = None if not fmts: - no_fragment_score += 1 + no_fragment_score += 2 
return False, last_seq fmt_info = next(x for x in fmts if x['manifest_stream_number'] == stream_number) fragments = fmt_info['fragments'] @@ -2199,11 +2200,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): urlh = None last_seq = try_get(urlh, lambda x: int_or_none(x.headers['X-Head-Seqnum'])) if last_seq is None: - no_fragment_score += 1 + no_fragment_score += 2 last_segment_url = None continue else: - should_continue, last_seq = _extract_sequence_from_mpd(True) + should_continue, last_seq = _extract_sequence_from_mpd(True, no_fragment_score > 15) + no_fragment_score += 2 if not should_continue: continue @@ -2221,7 +2223,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): try: for idx in range(known_idx, last_seq): # do not update sequence here or you'll get some part of it skipped - should_continue, _ = _extract_sequence_from_mpd(False) + should_continue, _ = _extract_sequence_from_mpd(False, False) if not should_continue: known_idx = idx - 1 raise ExtractorError('breaking out of outer loop') diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 90502dbc0..6ec8da11b 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -3,6 +3,8 @@ from __future__ import unicode_literals +import asyncio +import atexit import base64 import binascii import calendar @@ -58,6 +60,7 @@ from .compat import ( compat_kwargs, compat_os_name, compat_parse_qs, + compat_shlex_split, compat_shlex_quote, compat_str, compat_struct_pack, @@ -72,6 +75,7 @@ from .compat import ( compat_urllib_parse_unquote_plus, compat_urllib_request, compat_urlparse, + compat_websockets, compat_xpath, ) @@ -144,6 +148,7 @@ std_headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'en-us,en;q=0.5', + 'Sec-Fetch-Mode': 'navigate', } @@ -415,17 +420,33 @@ def get_element_by_id(id, html): return get_element_by_attribute('id', id, html) +def get_element_html_by_id(id, html): + """Return the html of the tag with the specified ID in the passed HTML document""" + return get_element_html_by_attribute('id', id, html) + + def get_element_by_class(class_name, html): """Return the content of the first tag with the specified class in the passed HTML document""" retval = get_elements_by_class(class_name, html) return retval[0] if retval else None +def get_element_html_by_class(class_name, html): + """Return the html of the first tag with the specified class in the passed HTML document""" + retval = get_elements_html_by_class(class_name, html) + return retval[0] if retval else None + + def get_element_by_attribute(attribute, value, html, escape_value=True): retval = get_elements_by_attribute(attribute, value, html, escape_value) return retval[0] if retval else None +def get_element_html_by_attribute(attribute, value, html, escape_value=True): + retval = get_elements_html_by_attribute(attribute, value, html, escape_value) + return retval[0] if retval else None + + def get_elements_by_class(class_name, html): """Return the content of all tags with the specified class in the passed HTML document as a list""" return get_elements_by_attribute( @@ -433,29 +454,123 @@ def get_elements_by_class(class_name, html): html, escape_value=False) -def get_elements_by_attribute(attribute, value, html, escape_value=True): +def get_elements_html_by_class(class_name, html): + """Return the html of all tags with the specified class in the passed HTML document as a list""" + return get_elements_html_by_attribute( + 'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name), + html, 
escape_value=False) + + +def get_elements_by_attribute(*args, **kwargs): """Return the content of the tag with the specified attribute in the passed HTML document""" + return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)] + + +def get_elements_html_by_attribute(*args, **kwargs): + """Return the html of the tag with the specified attribute in the passed HTML document""" + return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)] + + +def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True): + """ + Return the text (content) and the html (whole) of the tag with the specified + attribute in the passed HTML document + """ + + value_quote_optional = '' if re.match(r'''[\s"'`=<>]''', value) else '?' value = re.escape(value) if escape_value else value - retlist = [] - for m in re.finditer(r'''(?xs) - <([a-zA-Z0-9:._-]+) - (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*? - \s+%s=['"]?%s['"]? - (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*? - \s*> - (?P<content>.*?) - </\1> - ''' % (re.escape(attribute), value), html): - res = m.group('content') + partial_element_re = r'''(?x) + <(?P<tag>[a-zA-Z0-9:._-]+) + (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)? + \s%(attribute)s\s*=\s*(?P<_q>['"]%(vqo)s)(?-x:%(value)s)(?P=_q) + ''' % {'attribute': re.escape(attribute), 'value': value, 'vqo': value_quote_optional} + + for m in re.finditer(partial_element_re, html): + content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():]) + + yield ( + unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)), + whole + ) + + +class HTMLBreakOnClosingTagParser(compat_HTMLParser): + """ + HTML parser which raises HTMLBreakOnClosingTagException upon reaching the + closing tag for the first opening tag it has encountered, and can be used + as a context manager + """ + + class HTMLBreakOnClosingTagException(Exception): + pass + + def __init__(self): + self.tagstack = collections.deque() + compat_HTMLParser.__init__(self) + + def __enter__(self): + return self + + def __exit__(self, *_): + self.close() - if res.startswith('"') or res.startswith("'"): - res = res[1:-1] + def close(self): + # handle_endtag does not return upon raising HTMLBreakOnClosingTagException, + # so data remains buffered; we no longer have any interest in it, thus + # override this method to discard it + pass - retlist.append(unescapeHTML(res)) + def handle_starttag(self, tag, _): + self.tagstack.append(tag) - return retlist + def handle_endtag(self, tag): + if not self.tagstack: + raise compat_HTMLParseError('no tags in the stack') + while self.tagstack: + inner_tag = self.tagstack.pop() + if inner_tag == tag: + break + else: + raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found') + if not self.tagstack: + raise self.HTMLBreakOnClosingTagException() + + +def get_element_text_and_html_by_tag(tag, html): + """ + For the first element with the specified tag in the passed HTML document + return its content (text) and the whole element (html) + """ + def find_or_raise(haystack, needle, exc): + try: + return haystack.index(needle) + except ValueError: + raise exc + closing_tag = f'</{tag}>' + whole_start = find_or_raise( + html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found')) + content_start = find_or_raise( + html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag')) + content_start += whole_start + 
1 + with HTMLBreakOnClosingTagParser() as parser: + parser.feed(html[whole_start:content_start]) + if not parser.tagstack or parser.tagstack[0] != tag: + raise compat_HTMLParseError(f'parser did not match opening {tag} tag') + offset = content_start + while offset < len(html): + next_closing_tag_start = find_or_raise( + html[offset:], closing_tag, + compat_HTMLParseError(f'closing {tag} tag not found')) + next_closing_tag_end = next_closing_tag_start + len(closing_tag) + try: + parser.feed(html[offset:offset + next_closing_tag_end]) + offset += next_closing_tag_end + except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException: + return html[content_start:offset + next_closing_tag_start], \ + html[whole_start:offset + next_closing_tag_end] + raise compat_HTMLParseError('unexpected end of html') class HTMLAttributeParser(compat_HTMLParser): @@ -527,10 +642,9 @@ def clean_html(html): if html is None: # Convenience for sanitizing descriptions etc. return html - # Newline vs <br /> - html = html.replace('\n', ' ') - html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html) - html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html) + html = re.sub(r'\s+', ' ', html) + html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html) + html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html) # Strip html tags html = re.sub('<.*?>', '', html) # Replace html entities @@ -554,7 +668,7 @@ def sanitize_open(filename, open_mode): import msvcrt msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY) return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename) - stream = open(encodeFilename(filename), open_mode) + stream = locked_file(filename, open_mode, block=False).open() return (stream, filename) except (IOError, OSError) as err: if err.errno in (errno.EACCES,): @@ -566,7 +680,7 @@ def sanitize_open(filename, open_mode): raise else: # An exception here should be caught in the caller - stream = open(encodeFilename(alt_filename), open_mode) + stream = locked_file(filename, open_mode, block=False).open() return (stream, alt_filename) @@ -885,6 +999,8 @@ def make_HTTPS_handler(params, **kwargs): opts_check_certificate = not params.get('nocheckcertificate') context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) context.check_hostname = opts_check_certificate + if params.get('legacyserverconnect'): + context.options |= 4 # SSL_OP_LEGACY_SERVER_CONNECT context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE if opts_check_certificate: try: @@ -905,13 +1021,9 @@ def make_HTTPS_handler(params, **kwargs): def bug_reports_message(before=';'): - if ytdl_is_updateable(): - update_cmd = 'type doas pacman -Sy hypervideo to update' - else: - update_cmd = 'see https://git.conocimientoslibres.ga/software/hypervideo.git/about/#how-do-i-update-hypervideo' - msg = 'please report this issue on https://github.com/yt-dlp/yt-dlp .' - msg += ' Make sure you are using the latest version; %s.' % update_cmd - msg += ' Be sure to call yt-dlp with the --verbose flag and include its complete output.' + msg = ('please report this issue on https://github.com/yt-dlp/yt-dlp , ' + 'filling out the "Broken site" issue template properly. 
' + 'Confirm you are on the latest version using -U') before = before.rstrip() if not before or before.endswith(('.', '!', '?')): @@ -1734,7 +1846,7 @@ def datetime_from_str(date_str, precision='auto', format='%Y%m%d'): if precision == 'auto': auto_precision = True precision = 'microsecond' - today = datetime_round(datetime.datetime.now(), precision) + today = datetime_round(datetime.datetime.utcnow(), precision) if date_str in ('now', 'today'): return today if date_str == 'yesterday': @@ -2010,7 +2122,7 @@ if sys.platform == 'win32': whole_low = 0xffffffff whole_high = 0x7fffffff - def _lock_file(f, exclusive): + def _lock_file(f, exclusive, block): # todo: block unused on win32 overlapped = OVERLAPPED() overlapped.Offset = 0 overlapped.OffsetHigh = 0 @@ -2033,15 +2145,19 @@ else: try: import fcntl - def _lock_file(f, exclusive): - fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH) + def _lock_file(f, exclusive, block): + fcntl.flock(f, + fcntl.LOCK_SH if not exclusive + else fcntl.LOCK_EX if block + else fcntl.LOCK_EX | fcntl.LOCK_NB) def _unlock_file(f): fcntl.flock(f, fcntl.LOCK_UN) + except ImportError: UNSUPPORTED_MSG = 'file locking is not supported on this platform' - def _lock_file(f, exclusive): + def _lock_file(f, exclusive, block): raise IOError(UNSUPPORTED_MSG) def _unlock_file(f): @@ -2049,15 +2165,16 @@ else: class locked_file(object): - def __init__(self, filename, mode, encoding=None): - assert mode in ['r', 'a', 'w'] + def __init__(self, filename, mode, block=True, encoding=None): + assert mode in ['r', 'rb', 'a', 'ab', 'w', 'wb'] self.f = io.open(filename, mode, encoding=encoding) self.mode = mode + self.block = block def __enter__(self): - exclusive = self.mode != 'r' + exclusive = 'r' not in self.mode try: - _lock_file(self.f, exclusive) + _lock_file(self.f, exclusive, self.block) except IOError: self.f.close() raise @@ -2078,6 +2195,15 @@ class locked_file(object): def read(self, *args): return self.f.read(*args) + def flush(self): + self.f.flush() + + def open(self): + return self.__enter__() + + def close(self, *args): + self.__exit__(self, *args, value=False, traceback=False) + def get_filesystem_encoding(): encoding = sys.getfilesystemencoding() @@ -2120,9 +2246,11 @@ def format_decimal_suffix(num, fmt='%d%s', *, factor=1000): if num is None: return None exponent = 0 if num == 0 else int(math.log(num, factor)) - suffix = ['', *'KMGTPEZY'][exponent] + suffix = ['', *'kMGTPEZY'][exponent] + if factor == 1024: + suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i') converted = num / (factor ** exponent) - return fmt % (converted, f'{suffix}i' if suffix and factor == 1024 else suffix) + return fmt % (converted, suffix) def format_bytes(bytes): @@ -2382,13 +2510,8 @@ class PUTRequest(compat_urllib_request.Request): def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1): - if get_attr: - if v is not None: - v = getattr(v, get_attr, None) - if v == '': - v = None - if v is None: - return default + if get_attr and v is not None: + v = getattr(v, get_attr, None) try: return int(v) * invscale // scale except (ValueError, TypeError, OverflowError): @@ -2432,6 +2555,13 @@ def url_or_none(url): return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None +def request_to_url(req): + if isinstance(req, compat_urllib_request.Request): + return req.get_full_url() + else: + return req + + def strftime_or_none(timestamp, date_format, default=None): datetime_object = None try: @@ -2452,9 +2582,14 @@ def 
parse_duration(s): return None days, hours, mins, secs, ms = [None] * 5 - m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s) + m = re.match(r'''(?x) + (?P<before_secs> + (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)? + (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+)) + (?P<ms>[.:][0-9]+)?Z?$ + ''', s) if m: - days, hours, mins, secs, ms = m.groups() + days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms') else: m = re.match( r'''(?ix)(?:P? @@ -2499,7 +2634,7 @@ def parse_duration(s): if days: duration += float(days) * 24 * 60 * 60 if ms: - duration += float(ms) + duration += float(ms.replace(':', '.')) return duration @@ -2733,8 +2868,7 @@ class InAdvancePagedList(PagedList): def _getslice(self, start, end): start_page = start // self._pagesize - end_page = ( - self._pagecount if end is None else (end // self._pagesize + 1)) + end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1) skip_elems = start - start_page * self._pagesize only_more = None if end is None else end - start for pagenum in range(start_page, end_page): @@ -3055,6 +3189,7 @@ OUTTMPL_TYPES = { 'annotation': 'annotations.xml', 'infojson': 'info.json', 'link': None, + 'pl_video': None, 'pl_thumbnail': None, 'pl_description': 'description', 'pl_infojson': 'info.json', @@ -3203,7 +3338,7 @@ def parse_codecs(codecs_str): return {} split_codecs = list(filter(None, map( str.strip, codecs_str.strip().strip(',').split(',')))) - vcodec, acodec, hdr = None, None, None + vcodec, acodec, tcodec, hdr = None, None, None, None for full_codec in split_codecs: parts = full_codec.split('.') codec = parts[0].replace('0', '') @@ -3220,13 +3355,17 @@ def parse_codecs(codecs_str): elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'): if not acodec: acodec = full_codec + elif codec in ('stpp', 'wvtt',): + if not tcodec: + tcodec = full_codec else: write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr) - if vcodec or acodec: + if vcodec or acodec or tcodec: return { 'vcodec': vcodec or 'none', 'acodec': acodec or 'none', 'dynamic_range': hdr, + **({'tcodec': tcodec} if tcodec is not None else {}), } elif len(split_codecs) == 2: return { @@ -3316,12 +3455,11 @@ def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False): return [max(width(str(v)) for v in col) for col in zip(*table)] def filter_using_list(row, filterArray): - return [col for (take, col) in zip(filterArray, row) if take] + return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take] - if hide_empty: - max_lens = get_max_lens(data) - header_row = filter_using_list(header_row, max_lens) - data = [filter_using_list(row, max_lens) for row in data] + max_lens = get_max_lens(data) if hide_empty else [] + header_row = filter_using_list(header_row, max_lens) + data = [filter_using_list(row, max_lens) for row in data] table = [header_row] + data max_lens = get_max_lens(table) @@ -4860,13 +4998,10 @@ def to_high_limit_path(path): def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None): - if field is None: - val = obj if obj is not None else default - else: - val = obj.get(field, default) - if func and val not in ignore: - val = func(val) - return template % val if val not in ignore else default + val = traverse_obj(obj, *variadic(field)) + if val in ignore: + return default + return 
template % (func(val) if func else val) def clean_podcast_url(url): @@ -4942,11 +5077,12 @@ def traverse_obj( ''' Traverse nested list/dict/tuple @param path_list A list of paths which are checked one by one. Each path is a list of keys where each key is a string, - a function, a tuple of strings or "...". + a function, a tuple of strings/None or "...". When a function is given, it takes the key as argument and returns whether the key matches or not. When a tuple is given, all the keys given in the tuple are traversed, and "..." traverses all the keys in the object + "None" returns the object without traversal @param default Default value to return @param expected_type Only accept final value of this type (Can also be any callable) @param get_all Return all the values obtained from a path or only the first one @@ -4965,8 +5101,8 @@ def traverse_obj( nonlocal depth path = tuple(variadic(path)) for i, key in enumerate(path): - if obj is None: - return None + if None in (key, obj): + return obj if isinstance(key, (list, tuple)): obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key] key = ... @@ -5034,7 +5170,6 @@ def traverse_obj( return default -# Deprecated def traverse_dict(dictn, keys, casesense=True): write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated ' 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead') @@ -5045,6 +5180,22 @@ def variadic(x, allowed_types=(str, bytes, dict)): return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,) +def decode_base(value, digits): + # This will convert the given base-x string to a scalar (long or int) + table = {char: index for index, char in enumerate(digits)} + result = 0 + base = len(digits) + for chr in value: + result *= base + result += table[chr] + return result + + +def time_seconds(**kwargs): + t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs))) + return t.timestamp() + + # create a JSON Web Signature (jws) with HS256 algorithm # the resulting format is in JWS Compact Serialization # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html @@ -5099,3 +5250,160 @@ def join_nonempty(*values, delim='-', from_dict=None): if from_dict is not None: values = map(from_dict.get, values) return delim.join(map(str, filter(None, values))) + + +class Config: + own_args = None + filename = None + __initialized = False + + def __init__(self, parser, label=None): + self._parser, self.label = parser, label + self._loaded_paths, self.configs = set(), [] + + def init(self, args=None, filename=None): + assert not self.__initialized + directory = '' + if filename: + location = os.path.realpath(filename) + directory = os.path.dirname(location) + if location in self._loaded_paths: + return False + self._loaded_paths.add(location) + + self.__initialized = True + self.own_args, self.filename = args, filename + for location in self._parser.parse_args(args)[0].config_locations or []: + location = os.path.join(directory, expand_path(location)) + if os.path.isdir(location): + location = os.path.join(location, 'yt-dlp.conf') + if not os.path.exists(location): + self._parser.error(f'config location {location} does not exist') + self.append_config(self.read_file(location), location) + return True + + def __str__(self): + label = join_nonempty( + self.label, 'config', f'"{self.filename}"' if self.filename else '', + delim=' ') + return join_nonempty( + self.own_args is not None and f'{label[0].upper()}{label[1:]}: 
{self.hide_login_info(self.own_args)}', + *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs), + delim='\n') + + @staticmethod + def read_file(filename, default=[]): + try: + optionf = open(filename) + except IOError: + return default # silently skip if file is not present + try: + # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56 + contents = optionf.read() + if sys.version_info < (3,): + contents = contents.decode(preferredencoding()) + res = compat_shlex_split(contents, comments=True) + finally: + optionf.close() + return res + + @staticmethod + def hide_login_info(opts): + PRIVATE_OPTS = set(['-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username']) + eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$') + + def _scrub_eq(o): + m = eqre.match(o) + if m: + return m.group('key') + '=PRIVATE' + else: + return o + + opts = list(map(_scrub_eq, opts)) + for idx, opt in enumerate(opts): + if opt in PRIVATE_OPTS and idx + 1 < len(opts): + opts[idx + 1] = 'PRIVATE' + return opts + + def append_config(self, *args, label=None): + config = type(self)(self._parser, label) + config._loaded_paths = self._loaded_paths + if config.init(*args): + self.configs.append(config) + + @property + def all_args(self): + for config in reversed(self.configs): + yield from config.all_args + yield from self.own_args or [] + + def parse_args(self): + return self._parser.parse_args(list(self.all_args)) + + +class WebSocketsWrapper(): + """Wraps websockets module to use in non-async scopes""" + + def __init__(self, url, headers=None): + self.loop = asyncio.events.new_event_loop() + self.conn = compat_websockets.connect( + url, extra_headers=headers, ping_interval=None, + close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf')) + atexit.register(self.__exit__, None, None, None) + + def __enter__(self): + self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop) + return self + + def send(self, *args): + self.run_with_loop(self.pool.send(*args), self.loop) + + def recv(self, *args): + return self.run_with_loop(self.pool.recv(*args), self.loop) + + def __exit__(self, type, value, traceback): + try: + return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop) + finally: + self.loop.close() + self._cancel_all_tasks(self.loop) + + # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications + # for contributors: if any new library that uses asyncio needs to be run in non-async scopes, move these functions out of this class + @staticmethod + def run_with_loop(main, loop): + if not asyncio.coroutines.iscoroutine(main): + raise ValueError(f'a coroutine was expected, got {main!r}') + + try: + return loop.run_until_complete(main) + finally: + loop.run_until_complete(loop.shutdown_asyncgens()) + if hasattr(loop, 'shutdown_default_executor'): + loop.run_until_complete(loop.shutdown_default_executor()) + + @staticmethod + def _cancel_all_tasks(loop): + to_cancel = asyncio.tasks.all_tasks(loop) + + if not to_cancel: + return + + for task in to_cancel: + task.cancel() + + loop.run_until_complete( + asyncio.tasks.gather(*to_cancel, loop=loop, return_exceptions=True)) + + for task in to_cancel: + if task.cancelled(): + continue + if task.exception() is not None: + loop.call_exception_handler({ + 'message': 'unhandled exception during asyncio.run() shutdown', + 'exception': task.exception(), + 'task': task, + }) + + +has_websockets = 
bool(compat_websockets)
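
As context for the license handling added above: a minimal, standalone sketch (not part of the commit) of what the new decode_base utility computes and how _get_videokey_from_ticket packs its result. STRTABLE is the base58-style alphabet from AbemaLicenseHandler; the input string '21' is an illustrative value only:

import struct

STRTABLE = '123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz'

def decode_base(value, digits):
    # Interpret `value` as an integer written in base len(digits) (here: 58),
    # most significant character first, using `digits` as the alphabet.
    table = {char: index for index, char in enumerate(digits)}
    result = 0
    for char in value:
        result = result * len(digits) + table[char]
    return result

res = decode_base('21', STRTABLE)
assert res == 58  # '2' -> 1, '1' -> 0, so 1 * 58 + 0 == 58

# The handler then splits the decoded integer into a 128-bit big-endian blob,
# which becomes the encrypted video key fed to aes_ecb_decrypt:
encvideokey = struct.pack('>QQ', res >> 64, res & 0xffffffffffffffff)
assert len(encvideokey) == 16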