| field | value | date |
|---|---|---|
| author | Jesús <heckyel@hyperbola.info> | 2022-04-06 00:09:08 +0800 |
| committer | Jesús <heckyel@hyperbola.info> | 2022-04-06 00:09:08 +0800 |
| commit | 0150bfdaba1b5b92521dea896f810083dbfed417 (patch) | |
| tree | 98f604716f3abfe031f84e0dbb63db82c00e4dbb | |
| parent | 950cc067b8c41ac246deb4725177a372c95d8341 (diff) | |
| parent | a44ca5a470e09b5170fc9c3a46733f050fadbfae (diff) | |
| download | hypervideo-pre-0150bfdaba1b5b92521dea896f810083dbfed417.tar.lz, hypervideo-pre-0150bfdaba1b5b92521dea896f810083dbfed417.tar.xz, hypervideo-pre-0150bfdaba1b5b92521dea896f810083dbfed417.zip | |
updated from upstream | 06/04/2022 at 00:09
75 files changed, 1277 insertions, 426 deletions
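One representative fix in the diff below is the missing `f` prefix in `YoutubeDL.deprecation_warning`: the old code passed the literal string `'DeprecationWarning: {message}'` to the logger, so the placeholder was never interpolated. A minimal standalone sketch of the before/after behaviour (illustrative only, not yt-dlp code):

```python
# Minimal sketch of the f-string fix in YoutubeDL.deprecation_warning (not yt-dlp code).
message = 'this option is deprecated'

# Before: a plain string literal -- the braces are printed verbatim.
print('DeprecationWarning: {message}')   # -> DeprecationWarning: {message}

# After: the f prefix makes Python interpolate the local variable.
print(f'DeprecationWarning: {message}')  # -> DeprecationWarning: this option is deprecated
```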
diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 000000000..40c19fa66 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,8 @@ +root = true + +[**.py] +charset = utf-8 +indent_size = 4 +indent_style = space +trim_trailing_whitespace = true +insert_final_newline = true diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index e57716e00..6d8018690 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -516,7 +516,7 @@ class YoutubeDL(object): _format_fields = { # NB: Keep in sync with the docstring of extractor/common.py - 'url', 'manifest_url', 'ext', 'format', 'format_id', 'format_note', + 'url', 'manifest_url', 'manifest_stream_number', 'ext', 'format', 'format_id', 'format_note', 'width', 'height', 'resolution', 'dynamic_range', 'tbr', 'abr', 'acodec', 'asr', 'vbr', 'fps', 'vcodec', 'container', 'filesize', 'filesize_approx', 'player_url', 'protocol', 'fragment_base_url', 'fragments', 'is_from_start', @@ -937,7 +937,7 @@ class YoutubeDL(object): def deprecation_warning(self, message): if self.params.get('logger') is not None: - self.params['logger'].warning('DeprecationWarning: {message}') + self.params['logger'].warning(f'DeprecationWarning: {message}') else: self.to_stderr(f'{self._format_err("DeprecationWarning:", self.Styles.ERROR)} {message}', True) @@ -1239,18 +1239,21 @@ class YoutubeDL(object): outtmpl, info_dict = self.prepare_outtmpl(outtmpl, info_dict, *args, **kwargs) return self.escape_outtmpl(outtmpl) % info_dict - def _prepare_filename(self, info_dict, tmpl_type='default'): + def _prepare_filename(self, info_dict, *, outtmpl=None, tmpl_type=None): + assert None in (outtmpl, tmpl_type), 'outtmpl and tmpl_type are mutually exclusive' + if outtmpl is None: + outtmpl = self.outtmpl_dict.get(tmpl_type or 'default', self.outtmpl_dict['default']) try: - outtmpl = self._outtmpl_expandpath(self.outtmpl_dict.get(tmpl_type, self.outtmpl_dict['default'])) + outtmpl = self._outtmpl_expandpath(outtmpl) filename = self.evaluate_outtmpl(outtmpl, info_dict, True) if not filename: return None - if tmpl_type in ('default', 'temp'): + if tmpl_type in ('', 'temp'): final_ext, ext = self.params.get('final_ext'), info_dict.get('ext') if final_ext and ext and final_ext != ext and filename.endswith(f'.{final_ext}'): filename = replace_extension(filename, ext, final_ext) - else: + elif tmpl_type: force_ext = OUTTMPL_TYPES[tmpl_type] if force_ext: filename = replace_extension(filename, force_ext, info_dict.get('ext')) @@ -1266,10 +1269,12 @@ class YoutubeDL(object): self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')') return None - def prepare_filename(self, info_dict, dir_type='', warn=False): - """Generate the output filename.""" - - filename = self._prepare_filename(info_dict, dir_type or 'default') + def prepare_filename(self, info_dict, dir_type='', *, outtmpl=None, warn=False): + """Generate the output filename""" + if outtmpl: + assert not dir_type, 'outtmpl and dir_type are mutually exclusive' + dir_type = None + filename = self._prepare_filename(info_dict, tmpl_type=dir_type, outtmpl=outtmpl) if not filename and dir_type not in ('', 'temp'): return '' @@ -2182,7 +2187,7 @@ class YoutubeDL(object): yield merged_format else: - format_fallback, format_reverse, format_idx = False, True, 1 + format_fallback, seperate_fallback, format_reverse, format_idx = False, None, True, 1 mobj = re.match( r'(?P<bw>best|worst|b|w)(?P<type>video|audio|v|a)?(?P<mod>\*)?(?:\.(?P<n>[1-9]\d*))?$', format_spec) @@ -2209,6 +2214,7 @@ 
class YoutubeDL(object): filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') != 'none' elif format_spec in self._format_selection_exts['video']: filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') != 'none' and f.get('vcodec') != 'none' + seperate_fallback = lambda f: f.get('ext') == format_spec and f.get('vcodec') != 'none' elif format_spec in self._format_selection_exts['storyboards']: filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') == 'none' and f.get('vcodec') == 'none' else: @@ -2217,11 +2223,15 @@ class YoutubeDL(object): def selector_function(ctx): formats = list(ctx['formats']) matches = list(filter(filter_f, formats)) if filter_f is not None else formats - if format_fallback and ctx['incomplete_formats'] and not matches: - # for extractors with incomplete formats (audio only (soundcloud) - # or video only (imgur)) best/worst will fallback to - # best/worst {video,audio}-only format - matches = formats + if not matches: + if format_fallback and ctx['incomplete_formats']: + # for extractors with incomplete formats (audio only (soundcloud) + # or video only (imgur)) best/worst will fallback to + # best/worst {video,audio}-only format + matches = formats + elif seperate_fallback and not ctx['has_merged_format']: + # for compatibility with youtube-dl when there is no pre-merged format + matches = list(filter(seperate_fallback, formats)) matches = LazyList(_check_formats(matches[::-1 if format_reverse else 1])) try: yield matches[format_idx - 1] @@ -2467,8 +2477,9 @@ class YoutubeDL(object): if info_dict.get('is_live') and formats: formats = [f for f in formats if bool(f.get('is_from_start')) == get_from_start] if get_from_start and not formats: - self.raise_no_formats(info_dict, msg='--live-from-start is passed, but there are no formats that can be downloaded from the start. ' - 'If you want to download from the current time, pass --no-live-from-start') + self.raise_no_formats(info_dict, msg=( + '--live-from-start is passed, but there are no formats that can be downloaded from the start. ' + 'If you want to download from the current time, use --no-live-from-start')) if not formats: self.raise_no_formats(info_dict) @@ -2598,33 +2609,15 @@ class YoutubeDL(object): self.report_error(err, tb=False, is_error=False) continue - # While in format selection we may need to have an access to the original - # format set in order to calculate some metrics or do some processing. - # For now we need to be able to guess whether original formats provided - # by extractor are incomplete or not (i.e. whether extractor provides only - # video-only or audio-only formats) for proper formats selection for - # extractors with such incomplete formats (see - # https://github.com/ytdl-org/youtube-dl/pull/5556). - # Since formats may be filtered during format selection and may not match - # the original formats the results may be incorrect. Thus original formats - # or pre-calculated metrics should be passed to format selection routines - # as well. - # We will pass a context object containing all necessary additional data - # instead of just formats. - # This fixes incorrect format selection issue (see - # https://github.com/ytdl-org/youtube-dl/issues/10083). 
- incomplete_formats = ( - # All formats are video-only or - all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats) - # all formats are audio-only - or all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats)) - - ctx = { + formats_to_download = list(format_selector({ 'formats': formats, - 'incomplete_formats': incomplete_formats, - } - - formats_to_download = list(format_selector(ctx)) + 'has_merged_format': any('none' not in (f.get('acodec'), f.get('vcodec')) for f in formats), + 'incomplete_formats': ( + # All formats are video-only or + all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats) + # all formats are audio-only + or all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats)), + })) if interactive_format_selection and not formats_to_download: self.report_error('Requested format is not available', tb=False, is_error=False) continue @@ -2766,7 +2759,7 @@ class YoutubeDL(object): self.to_stdout(self.evaluate_outtmpl(format_tmpl(tmpl), info_copy)) for tmpl, file_tmpl in self.params['print_to_file'].get(key, []): - filename = self.evaluate_outtmpl(file_tmpl, info_dict) + filename = self.prepare_filename(info_dict, outtmpl=file_tmpl) tmpl = format_tmpl(tmpl) self.to_screen(f'[info] Writing {tmpl!r} to: {filename}') if self._ensure_dir_exists(filename): diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 6d5a64336..0599af92c 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -378,7 +378,7 @@ def validate_options(opts): 'To let yt-dlp download and merge the best available formats, simply do not pass any format selection', 'If you know what you are doing and want only the best pre-merged format, use "-f b" instead to suppress this warning'))) - # --(post-processor/downloader)-args without name + # --(postprocessor/downloader)-args without name def report_args_compat(name, value, key1, key2=None): if key1 in value and key2 not in value: warnings.append(f'{name} arguments given without specifying name. 
The arguments will be given to all {name}s') diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index 7265cad81..1f08a3664 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -21,6 +21,7 @@ from .compat import ( compat_cookiejar_Cookie, ) from .utils import ( + error_to_str, expand_path, Popen, YoutubeDLCookieJar, @@ -721,7 +722,7 @@ def _get_kwallet_network_wallet(logger): network_wallet = stdout.decode('utf-8').strip() logger.debug('NetworkWallet = "{}"'.format(network_wallet)) return network_wallet - except BaseException as e: + except Exception as e: logger.warning('exception while obtaining NetworkWallet: {}'.format(e)) return default_wallet @@ -766,8 +767,8 @@ def _get_kwallet_password(browser_keyring_name, logger): if stdout[-1:] == b'\n': stdout = stdout[:-1] return stdout - except BaseException as e: - logger.warning(f'exception running kwallet-query: {type(e).__name__}({e})') + except Exception as e: + logger.warning(f'exception running kwallet-query: {error_to_str(e)}') return b'' @@ -823,8 +824,8 @@ def _get_mac_keyring_password(browser_keyring_name, logger): if stdout[-1:] == b'\n': stdout = stdout[:-1] return stdout - except BaseException as e: - logger.warning(f'exception running find-generic-password: {type(e).__name__}({e})') + except Exception as e: + logger.warning(f'exception running find-generic-password: {error_to_str(e)}') return None diff --git a/yt_dlp/downloader/fragment.py b/yt_dlp/downloader/fragment.py index 6b75dfc62..c45a8a476 100644 --- a/yt_dlp/downloader/fragment.py +++ b/yt_dlp/downloader/fragment.py @@ -403,7 +403,7 @@ class FragmentFD(FileDownloader): pass if compat_os_name == 'nt': - def bindoj_result(future): + def future_result(future): while True: try: return future.result(0.1) @@ -412,7 +412,7 @@ class FragmentFD(FileDownloader): except concurrent.futures.TimeoutError: continue else: - def bindoj_result(future): + def future_result(future): return future.result() def interrupt_trigger_iter(fg): @@ -430,7 +430,7 @@ class FragmentFD(FileDownloader): result = True for tpe, job in spins: try: - result = result and bindoj_result(job) + result = result and future_result(job) except KeyboardInterrupt: interrupt_trigger[0] = False finally: @@ -494,16 +494,14 @@ class FragmentFD(FileDownloader): self.report_error('Giving up after %s fragment retries' % fragment_retries) def append_fragment(frag_content, frag_index, ctx): - if not frag_content: - if not is_fatal(frag_index - 1): - self.report_skip_fragment(frag_index, 'fragment not found') - return True - else: - ctx['dest_stream'].close() - self.report_error( - 'fragment %s not found, unable to continue' % frag_index) - return False - self._append_fragment(ctx, pack_func(frag_content, frag_index)) + if frag_content: + self._append_fragment(ctx, pack_func(frag_content, frag_index)) + elif not is_fatal(frag_index - 1): + self.report_skip_fragment(frag_index, 'fragment not found') + else: + ctx['dest_stream'].close() + self.report_error(f'fragment {frag_index} not found, unable to continue') + return False return True decrypt_fragment = self.decrypter(info_dict) diff --git a/yt_dlp/downloader/http.py b/yt_dlp/downloader/http.py index 8e096b76b..591a9b08d 100644 --- a/yt_dlp/downloader/http.py +++ b/yt_dlp/downloader/http.py @@ -7,7 +7,6 @@ import random from .common import FileDownloader from ..compat import ( - compat_str, compat_urllib_error, compat_http_client ) @@ -18,7 +17,7 @@ from ..utils import ( parse_http_range, sanitized_Request, ThrottledDownload, - try_get, + try_call, write_xattr, 
XAttrMetadataError, XAttrUnavailableError, @@ -58,8 +57,6 @@ class HttpFD(FileDownloader): ctx.resume_len = 0 ctx.block_size = self.params.get('buffersize', 1024) ctx.start_time = time.time() - ctx.chunk_size = None - throttle_start = None # parse given Range req_start, req_end, _ = parse_http_range(headers.get('Range')) @@ -85,12 +82,6 @@ class HttpFD(FileDownloader): class NextFragment(Exception): pass - def set_range(req, start, end): - range_header = 'bytes=%d-' % start - if end: - range_header += compat_str(end) - req.add_header('Range', range_header) - def establish_connection(): ctx.chunk_size = (random.randint(int(chunk_size * 0.95), chunk_size) if not is_test and chunk_size else chunk_size) @@ -120,18 +111,18 @@ class HttpFD(FileDownloader): else: range_end = None - if try_get(None, lambda _: range_start > range_end): + if try_call(lambda: range_start > range_end): ctx.resume_len = 0 ctx.open_mode = 'wb' raise RetryDownload(Exception(f'Conflicting range. (start={range_start} > end={range_end})')) - if try_get(None, lambda _: range_end >= ctx.content_len): + if try_call(lambda: range_end >= ctx.content_len): range_end = ctx.content_len - 1 request = sanitized_Request(url, request_data, headers) has_range = range_start is not None if has_range: - set_range(request, range_start, range_end) + request.add_header('Range', f'bytes={int(range_start)}-{int_or_none(range_end) or ""}') # Establish connection try: ctx.data = self.ydl.urlopen(request) @@ -214,7 +205,6 @@ class HttpFD(FileDownloader): raise RetryDownload(err) def download(): - nonlocal throttle_start data_len = ctx.data.info().get('Content-length', None) # Range HTTP header may be ignored/unsupported by a webserver @@ -329,14 +319,14 @@ class HttpFD(FileDownloader): if speed and speed < (self.params.get('throttledratelimit') or 0): # The speed must stay below the limit for 3 seconds # This prevents raising error when the speed temporarily goes down - if throttle_start is None: - throttle_start = now - elif now - throttle_start > 3: + if ctx.throttle_start is None: + ctx.throttle_start = now + elif now - ctx.throttle_start > 3: if ctx.stream is not None and ctx.tmpfilename != '-': ctx.stream.close() raise ThrottledDownload() elif speed: - throttle_start = None + ctx.throttle_start = None if not is_test and ctx.chunk_size and ctx.content_len is not None and byte_counter < ctx.content_len: ctx.resume_len = byte_counter diff --git a/yt_dlp/extractor/adobeconnect.py b/yt_dlp/extractor/adobeconnect.py index e688dddcb..e2e6f93f3 100644 --- a/yt_dlp/extractor/adobeconnect.py +++ b/yt_dlp/extractor/adobeconnect.py @@ -14,7 +14,7 @@ class AdobeConnectIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title') + title = self._html_extract_title(webpage) qs = compat_parse_qs(self._search_regex(r"swfUrl\s*=\s*'([^']+)'", webpage, 'swf url').split('?')[1]) is_live = qs.get('isLive', ['false'])[0] == 'true' formats = [] diff --git a/yt_dlp/extractor/allocine.py b/yt_dlp/extractor/allocine.py index cd533acfc..403a277e9 100644 --- a/yt_dlp/extractor/allocine.py +++ b/yt_dlp/extractor/allocine.py @@ -7,6 +7,7 @@ from ..utils import ( int_or_none, qualities, remove_end, + strip_or_none, try_get, unified_timestamp, url_basename, @@ -102,10 +103,7 @@ class AllocineIE(InfoExtractor): video_id = display_id media_data = self._download_json( 'http://www.allocine.fr/ws/AcVisiondataV5.ashx?media=%s' % video_id, 
display_id) - title = remove_end( - self._html_search_regex( - r'(?s)<title>(.+?)</title>', webpage, 'title').strip(), - ' - AlloCiné') + title = remove_end(strip_or_none(self._html_extract_title(webpage), ' - AlloCiné')) for key, value in media_data['video'].items(): if not key.endswith('Path'): continue diff --git a/yt_dlp/extractor/archiveorg.py b/yt_dlp/extractor/archiveorg.py index 2a25c0713..2ab3c1beb 100644 --- a/yt_dlp/extractor/archiveorg.py +++ b/yt_dlp/extractor/archiveorg.py @@ -457,7 +457,7 @@ class YoutubeWebArchiveIE(InfoExtractor): _OLDEST_CAPTURE_DATE = 20050214000000 _NEWEST_CAPTURE_DATE = 20500101000000 - def _call_cdx_api(self, item_id, url, filters: list = None, collapse: list = None, query: dict = None, note='Downloading CDX API JSON'): + def _call_cdx_api(self, item_id, url, filters: list = None, collapse: list = None, query: dict = None, note=None, fatal=False): # CDX docs: https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md query = { 'url': url, @@ -468,7 +468,9 @@ class YoutubeWebArchiveIE(InfoExtractor): 'collapse': collapse or [], **(query or {}) } - res = self._download_json('https://web.archive.org/cdx/search/cdx', item_id, note, query=query) + res = self._download_json( + 'https://web.archive.org/cdx/search/cdx', item_id, + note or 'Downloading CDX API JSON', query=query, fatal=fatal) if isinstance(res, list) and len(res) >= 2: # format response to make it easier to use return list(dict(zip(res[0], v)) for v in res[1:]) @@ -481,8 +483,7 @@ class YoutubeWebArchiveIE(InfoExtractor): regex), webpage, name, default='{}'), video_id, fatal=False) def _extract_webpage_title(self, webpage): - page_title = self._html_search_regex( - r'<title>([^<]*)</title>', webpage, 'title', default='') + page_title = self._html_extract_title(webpage, default='') # YouTube video pages appear to always have either 'YouTube -' as prefix or '- YouTube' as suffix. return self._html_search_regex( r'(?:YouTube\s*-\s*(.*)$)|(?:(.*)\s*-\s*YouTube$)', diff --git a/yt_dlp/extractor/asiancrush.py b/yt_dlp/extractor/asiancrush.py index 75a632958..7f1940fca 100644 --- a/yt_dlp/extractor/asiancrush.py +++ b/yt_dlp/extractor/asiancrush.py @@ -181,8 +181,7 @@ class AsianCrushPlaylistIE(AsianCrushBaseIE): 'title', default=None) or self._og_search_title( webpage, default=None) or self._html_search_meta( 'twitter:title', webpage, 'title', - default=None) or self._search_regex( - r'<title>([^<]+)</title>', webpage, 'title', fatal=False) + default=None) or self._html_extract_title(webpage) if title: title = re.sub(r'\s*\|\s*.+?$', '', title) diff --git a/yt_dlp/extractor/azmedien.py b/yt_dlp/extractor/azmedien.py index b3cabbf94..0168340b9 100644 --- a/yt_dlp/extractor/azmedien.py +++ b/yt_dlp/extractor/azmedien.py @@ -11,7 +11,7 @@ class AZMedienIE(InfoExtractor): IE_DESC = 'AZ Medien videos' _VALID_URL = r'''(?x) https?:// - (?:www\.)? + (?:www\.|tv\.)? 
(?P<host> telezueri\.ch| telebaern\.tv| @@ -31,7 +31,7 @@ class AZMedienIE(InfoExtractor): ''' _TESTS = [{ - 'url': 'https://www.telezueri.ch/sonntalk/bundesrats-vakanzen-eu-rahmenabkommen-133214569', + 'url': 'https://tv.telezueri.ch/sonntalk/bundesrats-vakanzen-eu-rahmenabkommen-133214569', 'info_dict': { 'id': '1_anruz3wy', 'ext': 'mp4', @@ -39,6 +39,9 @@ class AZMedienIE(InfoExtractor): 'uploader_id': 'TVOnline', 'upload_date': '20180930', 'timestamp': 1538328802, + 'view_count': int, + 'thumbnail': 'http://cfvod.kaltura.com/p/1719221/sp/171922100/thumbnail/entry_id/1_anruz3wy/version/100031', + 'duration': 1930 }, 'params': { 'skip_download': True, diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index 823155730..29ad7ded7 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -906,9 +906,8 @@ class BBCIE(BBCCoUkIE): playlist_title = json_ld_info.get('title') if not playlist_title: - playlist_title = self._og_search_title( - webpage, default=None) or self._html_search_regex( - r'<title>(.+?)</title>', webpage, 'playlist title', default=None) + playlist_title = (self._og_search_title(webpage, default=None) + or self._html_extract_title(webpage, 'playlist title', default=None)) if playlist_title: playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip() diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index dd1ff512e..3212f3328 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -926,9 +926,9 @@ class BiliIntlIE(BiliIntlBaseIE): if season_id and not video_data: # Non-Bstation layout, read through episode list season_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={season_id}&platform=web', video_id) - video_data = next( - episode for episode in traverse_obj(season_json, ('sections', ..., 'episodes', ...), expected_type=dict) - if str(episode.get('episode_id')) == ep_id) + video_data = traverse_obj(season_json, + ('sections', ..., 'episodes', lambda _, v: str(v['episode_id']) == ep_id), + expected_type=dict, get_all=False) return self._extract_video_info(video_data, ep_id=ep_id, aid=aid) diff --git a/yt_dlp/extractor/br.py b/yt_dlp/extractor/br.py index 7169eceb6..0155827d8 100644 --- a/yt_dlp/extractor/br.py +++ b/yt_dlp/extractor/br.py @@ -175,7 +175,7 @@ class BRIE(InfoExtractor): class BRMediathekIE(InfoExtractor): IE_DESC = 'Bayerischer Rundfunk Mediathek' - _VALID_URL = r'https?://(?:www\.)?br\.de/mediathek/video/[^/?&#]*?-(?P<id>av:[0-9a-f]{24})' + _VALID_URL = r'https?://(?:www\.)?br\.de/mediathek//?video/(?:[^/?&#]+?-)?(?P<id>av:[0-9a-f]{24})' _TESTS = [{ 'url': 'https://www.br.de/mediathek/video/gesundheit-die-sendung-vom-28112017-av:5a1e6a6e8fce6d001871cc8e', @@ -188,6 +188,9 @@ class BRMediathekIE(InfoExtractor): 'timestamp': 1511942766, 'upload_date': '20171129', } + }, { + 'url': 'https://www.br.de/mediathek//video/av:61b0db581aed360007558c12', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/breitbart.py b/yt_dlp/extractor/breitbart.py index f50f719dc..e029aa627 100644 --- a/yt_dlp/extractor/breitbart.py +++ b/yt_dlp/extractor/breitbart.py @@ -29,9 +29,8 @@ class BreitBartIE(InfoExtractor): self._sort_formats(formats) return { 'id': video_id, - 'title': self._og_search_title( - webpage, default=None) or self._html_search_regex( - r'(?s)<title>(.*?)</title>', webpage, 'video title'), + 'title': (self._og_search_title(webpage, default=None) + or self._html_extract_title(webpage, 'video title')), 'description': 
self._og_search_description(webpage), 'thumbnail': self._og_search_thumbnail(webpage), 'age_limit': self._rta_search(webpage), diff --git a/yt_dlp/extractor/callin.py b/yt_dlp/extractor/callin.py index acf327ace..1f3b7cfff 100644 --- a/yt_dlp/extractor/callin.py +++ b/yt_dlp/extractor/callin.py @@ -54,7 +54,7 @@ class CallinIE(InfoExtractor): id = episode['id'] title = (episode.get('title') or self._og_search_title(webpage, fatal=False) - or self._html_search_regex('<title>(.*?)</title>', webpage, 'title')) + or self._html_extract_title(webpage)) url = episode['m3u8'] formats = self._extract_m3u8_formats(url, display_id, ext='ts') self._sort_formats(formats) diff --git a/yt_dlp/extractor/canvas.py b/yt_dlp/extractor/canvas.py index 31e7d7de6..8b9903774 100644 --- a/yt_dlp/extractor/canvas.py +++ b/yt_dlp/extractor/canvas.py @@ -245,10 +245,6 @@ class VrtNUIE(GigyaBaseIE): 'upload_date': '20200727', }, 'skip': 'This video is only available for registered users', - 'params': { - 'username': '<snip>', - 'password': '<snip>', - }, 'expected_warnings': ['is not a supported codec'], }, { # Only available via new API endpoint @@ -264,10 +260,6 @@ class VrtNUIE(GigyaBaseIE): 'episode_number': 5, }, 'skip': 'This video is only available for registered users', - 'params': { - 'username': '<snip>', - 'password': '<snip>', - }, 'expected_warnings': ['Unable to download asset JSON', 'is not a supported codec', 'Unknown MIME type'], }] _NETRC_MACHINE = 'vrtnu' diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py index ac1272f7b..fba8bf965 100644 --- a/yt_dlp/extractor/cbc.py +++ b/yt_dlp/extractor/cbc.py @@ -127,9 +127,9 @@ class CBCIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - title = self._og_search_title(webpage, default=None) or self._html_search_meta( - 'twitter:title', webpage, 'title', default=None) or self._html_search_regex( - r'<title>([^<]+)</title>', webpage, 'title', fatal=False) + title = (self._og_search_title(webpage, default=None) + or self._html_search_meta('twitter:title', webpage, 'title', default=None) + or self._html_extract_title(webpage)) entries = [ self._extract_player_init(player_init, display_id) for player_init in re.findall(r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)] diff --git a/yt_dlp/extractor/closertotruth.py b/yt_dlp/extractor/closertotruth.py index 26243d52d..517e121e0 100644 --- a/yt_dlp/extractor/closertotruth.py +++ b/yt_dlp/extractor/closertotruth.py @@ -54,8 +54,7 @@ class CloserToTruthIE(InfoExtractor): r'<script[^>]+src=["\'].*?\b(?:partner_id|p)/(\d+)', webpage, 'kaltura partner_id') - title = self._search_regex( - r'<title>(.+?)\s*\|\s*.+?</title>', webpage, 'video title') + title = self._html_extract_title(webpage, 'video title') select = self._search_regex( r'(?s)<select[^>]+id="select-version"[^>]*>(.+?)</select>', diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index d0e57da23..e2605c1f4 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -139,6 +139,8 @@ class InfoExtractor(object): for HDS - URL of the F4M manifest, for DASH - URL of the MPD manifest, for MSS - URL of the ISM manifest. + * manifest_stream_number (For internal use only) + The index of the stream in the manifest file * ext Will be calculated from URL if missing * format A human-readable description of the format ("mp4 container with h264/opus"). @@ -215,7 +217,7 @@ class InfoExtractor(object): (HTTP or RTMP) download. Boolean. 
* has_drm The format has DRM and cannot be downloaded. Boolean * downloader_options A dictionary of downloader options as - described in FileDownloader + described in FileDownloader (For internal use only) RTMP formats can also have the additional fields: page_url, app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn, rtmp_protocol, rtmp_real_time @@ -1297,8 +1299,8 @@ class InfoExtractor(object): @staticmethod def _og_regexes(prop): content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))' - property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)' - % {'prop': re.escape(prop)}) + property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)' + % {'prop': re.escape(prop), 'sep': '(?::|[:-])'}) template = r'<meta[^>]+?%s[^>]+?%s' return [ template % (property_re, content_re), @@ -1329,9 +1331,8 @@ class InfoExtractor(object): def _og_search_description(self, html, **kargs): return self._og_search_property('description', html, fatal=False, **kargs) - def _og_search_title(self, html, **kargs): - kargs.setdefault('fatal', False) - return self._og_search_property('title', html, **kargs) + def _og_search_title(self, html, *, fatal=False, **kargs): + return self._og_search_property('title', html, fatal=fatal, **kargs) def _og_search_video_url(self, html, name='video url', secure=True, **kargs): regexes = self._og_regexes('video') + self._og_regexes('video:url') @@ -1342,9 +1343,8 @@ class InfoExtractor(object): def _og_search_url(self, html, **kargs): return self._og_search_property('url', html, **kargs) - def _html_extract_title(self, html, name, **kwargs): - return self._html_search_regex( - r'(?s)<title>(.*?)</title>', html, name, **kwargs) + def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs): + return self._html_search_regex(r'(?s)<title>([^<]+)</title>', html, name, fatal=fatal, **kwargs) def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs): name = variadic(name) @@ -3686,9 +3686,9 @@ class InfoExtractor(object): def _merge_subtitle_items(subtitle_list1, subtitle_list2): """ Merge subtitle items for one language. Items with duplicated URLs/data will be dropped. 
""" - list1_data = set([item.get('url') or item['data'] for item in subtitle_list1]) + list1_data = set((item.get('url'), item.get('data')) for item in subtitle_list1) ret = list(subtitle_list1) - ret.extend([item for item in subtitle_list2 if (item.get('url') or item['data']) not in list1_data]) + ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data) return ret @classmethod diff --git a/yt_dlp/extractor/craftsy.py b/yt_dlp/extractor/craftsy.py new file mode 100644 index 000000000..ed2f4420e --- /dev/null +++ b/yt_dlp/extractor/craftsy.py @@ -0,0 +1,71 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .brightcove import BrightcoveNewIE +from .common import InfoExtractor + +from ..utils import ( + dict_get, + get_element_by_id, + js_to_json, + traverse_obj, +) + + +class CraftsyIE(InfoExtractor): + _VALID_URL = r'https?://www.craftsy.com/class/(?P<id>[a-z0-9_-]+)/' + _TESTS = [{ + 'url': 'https://www.craftsy.com/class/the-midnight-quilt-show-season-5/', + 'info_dict': { + 'id': 'the-midnight-quilt-show-season-5', + 'title': 'The Midnight Quilt Show Season 5', + 'description': 'md5:113eda818e985d1a566625fb2f833b7a', + }, + 'playlist_count': 10, + }, { + 'url': 'https://www.craftsy.com/class/sew-your-own-designer-handbag/', + 'info_dict': { + 'id': 'sew-your-own-designer-handbag', + 'title': 'Sew Your Own Designer Handbag', + 'description': 'md5:8270d0ef5427d3c895a27351aeaac276', + }, + 'playlist_mincount': 1, + }, { + 'url': 'https://www.craftsy.com/class/all-access-estes-park-wool-market/', + 'info_dict': { + 'id': 'all-access-estes-park-wool-market', + 'title': 'All Access: Estes Park Wool Market', + 'description': 'md5:aded1bd8d38ae2fae4dae936c0ae01e7', + }, + 'playlist_count': 6, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + video_data = self._parse_json(self._search_regex( + r'class_video_player_vars\s*=\s*({.*})\s*;', + get_element_by_id('vidstore-classes_class-video-player-js-extra', webpage), + 'video data'), video_id, transform_source=js_to_json) + + account_id = traverse_obj(video_data, ('video_player', 'bc_account_id')) + + entries = [] + class_preview = traverse_obj(video_data, ('video_player', 'class_preview')) + if class_preview: + v_id = class_preview.get('video_id') + entries.append(self.url_result( + f'http://players.brightcove.net/{account_id}/default_default/index.html?videoId={v_id}', + BrightcoveNewIE, v_id, class_preview.get('title'))) + + if dict_get(video_data, ('is_free', 'user_has_access')): + entries += [ + self.url_result( + f'http://players.brightcove.net/{account_id}/default_default/index.html?videoId={lesson["video_id"]}', + BrightcoveNewIE, lesson['video_id'], lesson.get('title')) + for lesson in video_data['lessons']] + + return self.playlist_result( + entries, video_id, video_data.get('class_title'), + self._html_search_meta(('og:description', 'description'), webpage, default=None)) diff --git a/yt_dlp/extractor/crunchyroll.py b/yt_dlp/extractor/crunchyroll.py index bf1bf8c1c..7edb645f8 100644 --- a/yt_dlp/extractor/crunchyroll.py +++ b/yt_dlp/extractor/crunchyroll.py @@ -9,7 +9,7 @@ import zlib from hashlib import sha1 from math import pow, sqrt, floor from .common import InfoExtractor -from .vrv import VRVIE +from .vrv import VRVBaseIE from ..compat import ( compat_b64decode, compat_etree_Element, @@ -86,6 +86,22 @@ class CrunchyrollBaseIE(InfoExtractor): if not self._get_cookies(self._LOGIN_URL).get('etp_rt'): 
raise ExtractorError('Login succeeded but did not set etp_rt cookie') + # Beta-specific, but needed for redirects + def _get_beta_embedded_json(self, webpage, display_id): + initial_state = self._parse_json(self._search_regex( + r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'initial state'), display_id) + app_config = self._parse_json(self._search_regex( + r'__APP_CONFIG__\s*=\s*({.+?})\s*;', webpage, 'app config'), display_id) + return initial_state, app_config + + def _redirect_to_beta(self, webpage, iekey, video_id): + if not self._get_cookies(self._LOGIN_URL).get('etp_rt'): + raise ExtractorError('Received a beta page from non-beta url when not logged in.') + initial_state, app_config = self._get_beta_embedded_json(webpage, video_id) + url = app_config['baseSiteUrl'] + initial_state['router']['locations']['current']['pathname'] + self.to_screen(f'{video_id}: Redirected to beta site - {url}') + return self.url_result(f'{url}', iekey, video_id) + @staticmethod def _add_skip_wall(url): parsed_url = compat_urlparse.urlparse(url) @@ -100,7 +116,7 @@ class CrunchyrollBaseIE(InfoExtractor): parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True))) -class CrunchyrollIE(CrunchyrollBaseIE, VRVIE): +class CrunchyrollIE(CrunchyrollBaseIE, VRVBaseIE): IE_NAME = 'crunchyroll' _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|(?:[^/]*/){1,2}[^/?&]*?)(?P<id>[0-9]+))(?:[/?&]|$)' _TESTS = [{ @@ -406,6 +422,8 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text webpage = self._download_webpage( self._add_skip_wall(webpage_url), video_id, headers=self.geo_verification_headers()) + if re.search(r'<div id="preload-data">', webpage): + return self._redirect_to_beta(webpage, CrunchyrollBetaIE.ie_key(), video_id) note_m = self._html_search_regex( r'<div class="showmedia-trailer-notice">(.+?)</div>', webpage, 'trailer-notice', default='') @@ -670,6 +688,8 @@ class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE): # https:// gives a 403, but http:// does not self._add_skip_wall(url).replace('https://', 'http://'), show_id, headers=self.geo_verification_headers()) + if re.search(r'<div id="preload-data">', webpage): + return self._redirect_to_beta(webpage, CrunchyrollBetaShowIE.ie_key(), show_id) title = self._html_search_meta('name', webpage, default=None) episode_re = r'<li id="showview_videos_media_(\d+)"[^>]+>.*?<a href="([^"]+)"' @@ -692,9 +712,56 @@ class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE): } -class CrunchyrollBetaIE(CrunchyrollBaseIE): +class CrunchyrollBetaBaseIE(CrunchyrollBaseIE): + params = None + + def _get_params(self, lang): + if not CrunchyrollBetaBaseIE.params: + initial_state, app_config = self._get_beta_embedded_json(self._download_webpage( + f'https://beta.crunchyroll.com/{lang}', None, note='Retrieving main page'), None) + api_domain = app_config['cxApiParams']['apiDomain'] + basic_token = str(base64.b64encode(('%s:' % app_config['cxApiParams']['accountAuthClientId']).encode('ascii')), 'ascii') + auth_response = self._download_json( + f'{api_domain}/auth/v1/token', None, note='Authenticating with cookie', + headers={ + 'Authorization': 'Basic ' + basic_token + }, data='grant_type=etp_rt_cookie'.encode('ascii')) + policy_response = self._download_json( + f'{api_domain}/index/v2', None, note='Retrieving signed policy', + headers={ + 'Authorization': auth_response['token_type'] + ' ' + auth_response['access_token'] + }) + bucket = policy_response['cms']['bucket'] + params = { + 'Policy': 
policy_response['cms']['policy'], + 'Signature': policy_response['cms']['signature'], + 'Key-Pair-Id': policy_response['cms']['key_pair_id'] + } + locale = traverse_obj(initial_state, ('localization', 'locale')) + if locale: + params['locale'] = locale + CrunchyrollBetaBaseIE.params = (api_domain, bucket, params) + return CrunchyrollBetaBaseIE.params + + def _redirect_from_beta(self, url, lang, internal_id, display_id, is_episode, iekey): + initial_state, app_config = self._get_beta_embedded_json(self._download_webpage(url, display_id), display_id) + content_data = initial_state['content']['byId'][internal_id] + if is_episode: + video_id = content_data['external_id'].split('.')[1] + series_id = content_data['episode_metadata']['series_slug_title'] + else: + series_id = content_data['slug_title'] + series_id = re.sub(r'-{2,}', '-', series_id) + url = f'https://www.crunchyroll.com/{lang}{series_id}' + if is_episode: + url = url + f'/{display_id}-{video_id}' + self.to_screen(f'{display_id}: Not logged in. Redirecting to non-beta site - {url}') + return self.url_result(url, iekey, display_id) + + +class CrunchyrollBetaIE(CrunchyrollBetaBaseIE): IE_NAME = 'crunchyroll:beta' - _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)watch/(?P<internal_id>\w+)/(?P<id>[\w\-]+)/?(?:\?|$)' + _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)watch/(?P<id>\w+)/(?P<display_id>[\w\-]*)/?(?:\?|$)' _TESTS = [{ 'url': 'https://beta.crunchyroll.com/watch/GY2P1Q98Y/to-the-future', 'info_dict': { @@ -705,51 +772,49 @@ class CrunchyrollBetaIE(CrunchyrollBaseIE): 'uploader': 'Toei Animation', 'title': 'World Trigger Episode 73 – To the Future', 'upload_date': '20160402', + 'episode_number': 73, + 'series': 'World Trigger', + 'average_rating': 4.9, + 'episode': 'To the Future', + 'season': 'World Trigger', + 'thumbnail': 'https://img1.ak.crunchyroll.com/i/spire3-tmb/c870dedca1a83137c2d3d144984155ed1459527119_main.jpg', + 'season_number': 1, + }, + 'params': {'skip_download': 'm3u8'}, + 'expected_warnings': ['Unable to download XML'] + }, { + 'url': 'https://beta.crunchyroll.com/watch/GYK53DMPR/wicked-lord-shingan-reborn', + 'info_dict': { + 'id': '648781', + 'ext': 'mp4', + 'episode_number': 1, + 'timestamp': 1389173400, + 'series': 'Love, Chunibyo & Other Delusions - Heart Throb -', + 'description': 'md5:5579d1a0355cc618558ba23d27067a62', + 'uploader': 'TBS', + 'episode': 'Wicked Lord Shingan... Reborn', + 'average_rating': 4.9, + 'season': 'Love, Chunibyo & Other Delusions - Heart Throb -', + 'thumbnail': 'https://img1.ak.crunchyroll.com/i/spire3-tmb/2ba0384e225a5370d5f0ee9496d91ea51389046521_main.jpg', + 'title': 'Love, Chunibyo & Other Delusions - Heart Throb - Episode 1 – Wicked Lord Shingan... 
Reborn', + 'season_number': 2, + 'upload_date': '20140108', }, 'params': {'skip_download': 'm3u8'}, 'expected_warnings': ['Unable to download XML'] + }, { + 'url': 'https://beta.crunchyroll.com/watch/GY2P1Q98Y/', + 'only_matching': True, }] def _real_extract(self, url): - lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'internal_id', 'id') - webpage = self._download_webpage(url, display_id) - initial_state = self._parse_json( - self._search_regex(r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'initial state'), - display_id) - episode_data = initial_state['content']['byId'][internal_id] + lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id') + if not self._get_cookies(url).get('etp_rt'): - video_id = episode_data['external_id'].split('.')[1] - series_id = episode_data['episode_metadata']['series_slug_title'] - return self.url_result(f'https://www.crunchyroll.com/{lang}{series_id}/{display_id}-{video_id}', - CrunchyrollIE.ie_key(), video_id) - - app_config = self._parse_json( - self._search_regex(r'__APP_CONFIG__\s*=\s*({.+?})\s*;', webpage, 'app config'), - display_id) - client_id = app_config['cxApiParams']['accountAuthClientId'] - api_domain = app_config['cxApiParams']['apiDomain'] - basic_token = str(base64.b64encode(('%s:' % client_id).encode('ascii')), 'ascii') - auth_response = self._download_json( - f'{api_domain}/auth/v1/token', display_id, - note='Authenticating with cookie', - headers={ - 'Authorization': 'Basic ' + basic_token - }, data='grant_type=etp_rt_cookie'.encode('ascii')) - policy_response = self._download_json( - f'{api_domain}/index/v2', display_id, - note='Retrieving signed policy', - headers={ - 'Authorization': auth_response['token_type'] + ' ' + auth_response['access_token'] - }) - bucket = policy_response['cms']['bucket'] - params = { - 'Policy': policy_response['cms']['policy'], - 'Signature': policy_response['cms']['signature'], - 'Key-Pair-Id': policy_response['cms']['key_pair_id'] - } - locale = traverse_obj(initial_state, ('localization', 'locale')) - if locale: - params['locale'] = locale + return self._redirect_from_beta(url, lang, internal_id, display_id, True, CrunchyrollIE.ie_key()) + + api_domain, bucket, params = self._get_params(lang) + episode_response = self._download_json( f'{api_domain}/cms/v2{bucket}/episodes/{internal_id}', display_id, note='Retrieving episode metadata', @@ -827,9 +892,9 @@ class CrunchyrollBetaIE(CrunchyrollBaseIE): } -class CrunchyrollBetaShowIE(CrunchyrollBaseIE): +class CrunchyrollBetaShowIE(CrunchyrollBetaBaseIE): IE_NAME = 'crunchyroll:playlist:beta' - _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)series/\w+/(?P<id>[\w\-]+)/?(?:\?|$)' + _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)series/(?P<id>\w+)/(?P<display_id>[\w\-]*)/?(?:\?|$)' _TESTS = [{ 'url': 'https://beta.crunchyroll.com/series/GY19NQ2QR/Girl-Friend-BETA', 'info_dict': { @@ -838,11 +903,56 @@ class CrunchyrollBetaShowIE(CrunchyrollBaseIE): }, 'playlist_mincount': 10, }, { + 'url': 'https://beta.crunchyroll.com/series/GYJQV73V6/love-chunibyo--other-delusions---heart-throb--', + 'info_dict': { + 'id': 'love-chunibyo-other-delusions-heart-throb-', + 'title': 'Love, Chunibyo & Other Delusions - Heart Throb -', + }, + 'playlist_mincount': 10, + }, { 'url': 'https://beta.crunchyroll.com/it/series/GY19NQ2QR/Girl-Friend-BETA', 'only_matching': True, }] def _real_extract(self, url): - lang, series_id = self._match_valid_url(url).group('lang', 'id') - return 
self.url_result(f'https://www.crunchyroll.com/{lang}{series_id.lower()}', - CrunchyrollShowPlaylistIE.ie_key(), series_id) + lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id') + + if not self._get_cookies(url).get('etp_rt'): + return self._redirect_from_beta(url, lang, internal_id, display_id, False, CrunchyrollShowPlaylistIE.ie_key()) + + api_domain, bucket, params = self._get_params(lang) + + series_response = self._download_json( + f'{api_domain}/cms/v2{bucket}/series/{internal_id}', display_id, + note='Retrieving series metadata', query=params) + + seasons_response = self._download_json( + f'{api_domain}/cms/v2{bucket}/seasons?series_id={internal_id}', display_id, + note='Retrieving season list', query=params) + + def entries(): + for season in seasons_response['items']: + episodes_response = self._download_json( + f'{api_domain}/cms/v2{bucket}/episodes?season_id={season["id"]}', display_id, + note=f'Retrieving episode list for {season.get("slug_title")}', query=params) + for episode in episodes_response['items']: + episode_id = episode['id'] + episode_display_id = episode['slug_title'] + yield { + '_type': 'url', + 'url': f'https://beta.crunchyroll.com/{lang}watch/{episode_id}/{episode_display_id}', + 'ie_key': CrunchyrollBetaIE.ie_key(), + 'id': episode_id, + 'title': '%s Episode %s – %s' % (episode.get('season_title'), episode.get('episode'), episode.get('title')), + 'description': try_get(episode, lambda x: x['description'].replace(r'\r\n', '\n')), + 'duration': float_or_none(episode.get('duration_ms'), 1000), + 'series': episode.get('series_title'), + 'series_id': episode.get('series_id'), + 'season': episode.get('season_title'), + 'season_id': episode.get('season_id'), + 'season_number': episode.get('season_number'), + 'episode': episode.get('title'), + 'episode_number': episode.get('sequence_number') + } + + return self.playlist_result(entries(), internal_id, series_response.get('title')) diff --git a/yt_dlp/extractor/cspan.py b/yt_dlp/extractor/cspan.py index d29b58ba6..f51159bbe 100644 --- a/yt_dlp/extractor/cspan.py +++ b/yt_dlp/extractor/cspan.py @@ -278,7 +278,7 @@ class CSpanCongressIE(InfoExtractor): video_id, transform_source=js_to_json) title = (self._og_search_title(webpage, default=None) - or self._html_search_regex(r'(?s)<title>(.*?)</title>', webpage, 'video title')) + or self._html_extract_title(webpage, 'video title')) description = (self._og_search_description(webpage, default=None) or self._html_search_meta('description', webpage, 'description', default=None)) diff --git a/yt_dlp/extractor/cybrary.py b/yt_dlp/extractor/cybrary.py new file mode 100644 index 000000000..c278f0fe0 --- /dev/null +++ b/yt_dlp/extractor/cybrary.py @@ -0,0 +1,146 @@ +# coding: utf-8 +from .common import InfoExtractor + +from ..utils import ( + ExtractorError, + smuggle_url, + str_or_none, + traverse_obj, + urlencode_postdata +) + + +class CybraryBaseIE(InfoExtractor): + _API_KEY = 'AIzaSyCX9ru6j70PX2My1Eq6Q1zoMAhuTdXlzSw' + _ENDPOINTS = { + 'course': 'https://app.cybrary.it/courses/api/catalog/browse/course/{}', + 'course_enrollment': 'https://app.cybrary.it/courses/api/catalog/{}/enrollment', + 'enrollment': 'https://app.cybrary.it/courses/api/enrollment/{}', + 'launch': 'https://app.cybrary.it/courses/api/catalog/{}/launch', + 'vimeo_oembed': 'https://vimeo.com/api/oembed.json?url=https://vimeo.com/{}', + } + _NETRC_MACHINE = 'cybrary' + _TOKEN = None + + def _perform_login(self, username, password): + CybraryBaseIE._TOKEN = 
self._download_json( + f'https://identitytoolkit.googleapis.com/v1/accounts:signInWithPassword?key={self._API_KEY}', + None, data=urlencode_postdata({'email': username, 'password': password, 'returnSecureToken': True}), + note='Logging in')['idToken'] + + def _real_initialize(self): + if not self._TOKEN: + self.raise_login_required(method='password') + + def _call_api(self, endpoint, item_id): + return self._download_json( + self._ENDPOINTS[endpoint].format(item_id), item_id, + note=f'Downloading {endpoint} JSON metadata', + headers={'Authorization': f'Bearer {self._TOKEN}'}) + + def _get_vimeo_id(self, activity_id): + launch_api = self._call_api('launch', activity_id) + + if launch_api.get('url'): + return self._search_regex(r'https?://player\.vimeo\.com/video/(?P<vimeo_id>[0-9]+)', launch_api['url'], 'vimeo_id') + return traverse_obj(launch_api, ('vendor_data', 'content', ..., 'videoId'), get_all=False) + + +class CybraryIE(CybraryBaseIE): + _VALID_URL = r'https?://app.cybrary.it/immersive/(?P<enrollment>[0-9]+)/activity/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'https://app.cybrary.it/immersive/12487950/activity/63102', + 'md5': '9ae12d37e555cb2ed554223a71a701d0', + 'info_dict': { + 'id': '646609770', + 'ext': 'mp4', + 'title': 'Getting Started', + 'thumbnail': 'https://i.vimeocdn.com/video/1301817996-76a268f0c56cff18a5cecbbdc44131eb9dda0c80eb0b3a036_1280', + 'series_id': '63111', + 'uploader_url': 'https://vimeo.com/user30867300', + 'duration': 88, + 'uploader_id': 'user30867300', + 'series': 'Cybrary Orientation', + 'uploader': 'Cybrary', + 'chapter': 'Cybrary Orientation Series', + 'chapter_id': '63110' + }, + 'expected_warnings': ['No authenticators for vimeo'] + }, { + 'url': 'https://app.cybrary.it/immersive/12747143/activity/52686', + 'md5': '62f26547dccc59c44363e2a13d4ad08d', + 'info_dict': { + 'id': '445638073', + 'ext': 'mp4', + 'title': 'Azure Virtual Network IP Addressing', + 'thumbnail': 'https://i.vimeocdn.com/video/936667051-1647ace66c627d4a2382185e0dae8deb830309bfddd53f8b2367b2f91e92ed0e-d_1280', + 'series_id': '52733', + 'uploader_url': 'https://vimeo.com/user30867300', + 'duration': 426, + 'uploader_id': 'user30867300', + 'series': 'AZ-500: Microsoft Azure Security Technologies', + 'uploader': 'Cybrary', + 'chapter': 'Implement Network Security', + 'chapter_id': '52693' + }, + 'expected_warnings': ['No authenticators for vimeo'] + }] + + def _real_extract(self, url): + activity_id, enrollment_id = self._match_valid_url(url).group('id', 'enrollment') + course = self._call_api('enrollment', enrollment_id)['content'] + activity = traverse_obj(course, ('learning_modules', ..., 'activities', lambda _, v: int(activity_id) == v['id']), get_all=False) + + if activity.get('type') not in ['Video Activity', 'Lesson Activity']: + raise ExtractorError('The activity is not a video', expected=True) + + module = next((m for m in course.get('learning_modules') or [] + if int(activity_id) in traverse_obj(m, ('activities', ..., 'id') or [])), None) + + vimeo_id = self._get_vimeo_id(activity_id) + + return { + '_type': 'url_transparent', + 'series': traverse_obj(course, ('content_description', 'title')), + 'series_id': str_or_none(traverse_obj(course, ('content_description', 'id'))), + 'id': vimeo_id, + 'chapter': module.get('title'), + 'chapter_id': str_or_none(module.get('id')), + 'title': activity.get('title'), + 'url': smuggle_url(f'https://player.vimeo.com/video/{vimeo_id}', {'http_headers': {'Referer': 'https://api.cybrary.it'}}) + } + + +class CybraryCourseIE(CybraryBaseIE): + 
_VALID_URL = r'https://app.cybrary.it/browse/course/(?P<id>[\w-]+)/?(?:$|[#?])' + _TESTS = [{ + 'url': 'https://app.cybrary.it/browse/course/az-500-microsoft-azure-security-technologies', + 'info_dict': { + 'id': 898, + 'title': 'AZ-500: Microsoft Azure Security Technologies', + 'description': 'md5:69549d379c0fc1dec92926d4e8b6fbd4' + }, + 'playlist_count': 59 + }, { + 'url': 'https://app.cybrary.it/browse/course/cybrary-orientation', + 'info_dict': { + 'id': 1245, + 'title': 'Cybrary Orientation', + 'description': 'md5:9e69ff66b32fe78744e0ad4babe2e88e' + }, + 'playlist_count': 4 + }] + + def _real_extract(self, url): + course_id = self._match_id(url) + course = self._call_api('course', course_id) + enrollment_info = self._call_api('course_enrollment', course['id']) + + entries = [self.url_result( + f'https://app.cybrary.it/immersive/{enrollment_info["id"]}/activity/{activity["id"]}') + for activity in traverse_obj(course, ('content_item', 'learning_modules', ..., 'activities', ...))] + + return self.playlist_result( + entries, + traverse_obj(course, ('content_item', 'id'), expected_type=str_or_none), + course.get('title'), course.get('short_description')) diff --git a/yt_dlp/extractor/dailymotion.py b/yt_dlp/extractor/dailymotion.py index 95589d53a..9cb56185b 100644 --- a/yt_dlp/extractor/dailymotion.py +++ b/yt_dlp/extractor/dailymotion.py @@ -94,10 +94,10 @@ class DailymotionIE(DailymotionBaseInfoExtractor): _VALID_URL = r'''(?ix) https?:// (?: - (?:(?:www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:embed|swf|\#)/)?video|swf)| + (?:(?:www|touch|geo)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:(?:embed|swf|\#)/)|player\.html\?)?video|swf)| (?:www\.)?lequipe\.fr/video ) - /(?P<id>[^/?_]+)(?:.+?\bplaylist=(?P<playlist_id>x[0-9a-z]+))? + [/=](?P<id>[^/?_&]+)(?:.+?\bplaylist=(?P<playlist_id>x[0-9a-z]+))? 
''' IE_NAME = 'dailymotion' _TESTS = [{ @@ -116,6 +116,25 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'age_limit': 0, }, }, { + 'url': 'https://geo.dailymotion.com/player.html?video=x89eyek&mute=true', + 'md5': 'e2f9717c6604773f963f069ca53a07f8', + 'info_dict': { + 'id': 'x89eyek', + 'ext': 'mp4', + 'title': "En quête d'esprit du 27/03/2022", + 'description': 'md5:66542b9f4df2eb23f314fc097488e553', + 'duration': 2756, + 'timestamp': 1648383669, + 'upload_date': '20220327', + 'uploader': 'CNEWS', + 'uploader_id': 'x24vth', + 'age_limit': 0, + 'view_count': int, + 'like_count': int, + 'tags': ['en_quete_d_esprit'], + 'thumbnail': 'https://s2.dmcdn.net/v/Tncwi1YGKdvFbDuDY/x1080', + } + }, { 'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames', 'md5': '2137c41a8e78554bb09225b8eb322406', 'info_dict': { diff --git a/yt_dlp/extractor/dropout.py b/yt_dlp/extractor/dropout.py index a7442d8f0..2fa61950c 100644 --- a/yt_dlp/extractor/dropout.py +++ b/yt_dlp/extractor/dropout.py @@ -123,7 +123,7 @@ class DropoutIE(InfoExtractor): self._login(display_id) webpage = self._download_webpage(url, display_id, note='Downloading video webpage') finally: - self._download_webpage('https://www.dropout.tv/logout', display_id, note='Logging out') + self._download_webpage('https://www.dropout.tv/logout', display_id, note='Logging out', fatal=False) embed_url = self._search_regex(r'embed_url:\s*["\'](.+?)["\']', webpage, 'embed url') thumbnail = self._og_search_thumbnail(webpage) @@ -139,7 +139,7 @@ class DropoutIE(InfoExtractor): '_type': 'url_transparent', 'ie_key': VHXEmbedIE.ie_key(), 'url': embed_url, - 'id': self._search_regex(r'embed.vhx.tv/videos/(.+?)\?', embed_url, 'id'), + 'id': self._search_regex(r'embed\.vhx\.tv/videos/(.+?)\?', embed_url, 'id'), 'display_id': display_id, 'title': title, 'description': self._html_search_meta('description', webpage, fatal=False), diff --git a/yt_dlp/extractor/elonet.py b/yt_dlp/extractor/elonet.py index eefba4e24..9c6aea28e 100644 --- a/yt_dlp/extractor/elonet.py +++ b/yt_dlp/extractor/elonet.py @@ -1,30 +1,22 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..utils import ( - base_url, - ExtractorError, - try_get, -) -from ..compat import compat_str +from ..utils import determine_ext class ElonetIE(InfoExtractor): _VALID_URL = r'https?://elonet\.finna\.fi/Record/kavi\.elonet_elokuva_(?P<id>[0-9]+)' _TESTS = [{ - # m3u8 with subtitles 'url': 'https://elonet.finna.fi/Record/kavi.elonet_elokuva_107867', - 'md5': '8efc954b96c543711707f87de757caea', 'info_dict': { 'id': '107867', 'ext': 'mp4', 'title': 'Valkoinen peura', - 'description': 'Valkoinen peura (1952) on Erik Blombergin ohjaama ja yhdessä Mirjami Kuosmasen kanssa käsikirjoittama tarunomainen kertomus valkoisen peuran hahmossa lii...', - 'thumbnail': 'https://elonet.finna.fi/Cover/Show?id=kavi.elonet_elokuva_107867&index=0&size=large', + 'thumbnail': r're:^https?://elonet\.finna\.fi/Cover/Show\?id=kavi\.elonet_elokuva_107867.+', + 'description': 'md5:bded4201c9677fab10854884fe8f7312', }, + 'params': {'skip_download': 'dash'}, }, { # DASH with subtitles 'url': 'https://elonet.finna.fi/Record/kavi.elonet_elokuva_116539', @@ -32,58 +24,45 @@ class ElonetIE(InfoExtractor): 'id': '116539', 'ext': 'mp4', 'title': 'Minulla on tiikeri', - 'description': 'Pienellä pojalla, joka asuu kerrostalossa, on kotieläimenä tiikeri. Se on kuitenkin salaisuus. 
Kerrostalon räpätäti on Kotilaisen täti, joka on aina vali...', - 'thumbnail': 'https://elonet.finna.fi/Cover/Show?id=kavi.elonet_elokuva_116539&index=0&size=large&source=Solr', - } + 'thumbnail': r're:^https?://elonet\.finna\.fi/Cover/Show\?id=kavi\.elonet_elokuva_116539.+', + 'description': 'md5:5ab72b3fe76d3414e46cc8f277104419', + }, + 'params': {'skip_download': 'dash'}, + }, { + # Page with multiple videos, download the main one + 'url': 'https://elonet.finna.fi/Record/kavi.elonet_elokuva_117396', + 'info_dict': { + 'id': '117396', + 'ext': 'mp4', + 'title': 'Sampo', + 'thumbnail': r're:^https?://elonet\.finna\.fi/Cover/Show\?id=kavi\.elonet_elokuva_117396.+', + 'description': 'md5:ec69572a5b054d0ecafe8086b1fa96f7', + }, + 'params': {'skip_download': 'dash'}, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._html_search_regex( - r'<meta .*property="og:title" .*content="(.+?)"', webpage, 'title') - description = self._html_search_regex( - r'<meta .*property="og:description" .*content="(.+?)"', webpage, 'description') - thumbnail = self._html_search_regex( - r'<meta .*property="og:image" .*content="(.+?)"', webpage, 'thumbnail') + src = self._parse_json(self._html_search_regex( + r'id=\'video-data\'[^>]+data-video-sources="([^"]+)"', webpage, 'json'), video_id)[0]['src'] + ext = determine_ext(src) - json_s = self._html_search_regex( - r'data-video-sources="(.+?)"', webpage, 'json') - src = try_get( - self._parse_json(json_s, video_id), - lambda x: x[0]["src"], compat_str) - formats = [] - subtitles = {} - if re.search(r'\.m3u8\??', src): - res = self._download_webpage_handle( - # elonet servers have certificate problems - src.replace('https:', 'http:'), video_id, - note='Downloading m3u8 information', - errnote='Failed to download m3u8 information') - if res: - doc, urlh = res - url = urlh.geturl() - formats, subtitles = self._parse_m3u8_formats_and_subtitles(doc, url) - for f in formats: - f['ext'] = 'mp4' - elif re.search(r'\.mpd\??', src): - res = self._download_xml_handle( - src, video_id, - note='Downloading MPD manifest', - errnote='Failed to download MPD manifest') - if res: - doc, urlh = res - url = base_url(urlh.geturl()) - formats, subtitles = self._parse_mpd_formats_and_subtitles(doc, mpd_base_url=url) + if ext == 'm3u8': + formats, subtitles = self._extract_m3u8_formats_and_subtitles(src, video_id, fatal=False) + elif ext == 'mpd': + formats, subtitles = self._extract_mpd_formats_and_subtitles(src, video_id, fatal=False) else: - raise ExtractorError("Unknown streaming format") + formats, subtitles = [], {} + self.raise_no_formats(f'Unknown streaming format {ext}') + self._sort_formats(formats) return { 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), 'formats': formats, 'subtitles': subtitles, } diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py index e5ae12a7d..457f4c2aa 100644 --- a/yt_dlp/extractor/extractors.py +++ b/yt_dlp/extractor/extractors.py @@ -322,6 +322,7 @@ from .cpac import ( from .cozytv import CozyTVIE from .cracked import CrackedIE from .crackle import CrackleIE +from .craftsy import CraftsyIE from .crooksandliars import CrooksAndLiarsIE from .crowdbunker import ( CrowdBunkerIE, @@ -344,6 +345,10 @@ from .curiositystream import ( CuriosityStreamSeriesIE, ) from .cwtv 
import CWTVIE +from .cybrary import ( + CybraryIE, + CybraryCourseIE +) from .daftsex import DaftsexIE from .dailymail import DailyMailIE from .dailymotion import ( diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py index 2deed585f..5e0e2facf 100644 --- a/yt_dlp/extractor/facebook.py +++ b/yt_dlp/extractor/facebook.py @@ -397,8 +397,10 @@ class FacebookIE(InfoExtractor): r'handleWithCustomApplyEach\(\s*ScheduledApplyEach\s*,\s*(\{.+?\})\s*\);', webpage)] post = traverse_obj(post_data, ( ..., 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or [] - media = [m for m in traverse_obj(post, (..., 'attachments', ..., 'media'), expected_type=dict) or [] - if str(m.get('id')) == video_id and m.get('__typename') == 'Video'] + media = traverse_obj( + post, + (..., 'attachments', ..., 'media', lambda _, m: str(m['id']) == video_id and m['__typename'] == 'Video'), + expected_type=dict) title = get_first(media, ('title', 'text')) description = get_first(media, ('creation_story', 'comet_sections', 'message', 'story', 'message', 'text')) uploader_data = get_first(media, 'owner') or get_first(post, ('node', 'actors', ...)) or {} diff --git a/yt_dlp/extractor/fivetv.py b/yt_dlp/extractor/fivetv.py index be81fccb8..d6bebd19b 100644 --- a/yt_dlp/extractor/fivetv.py +++ b/yt_dlp/extractor/fivetv.py @@ -75,8 +75,7 @@ class FiveTVIE(InfoExtractor): r'<a[^>]+?href="([^"]+)"[^>]+?class="videoplayer"'], webpage, 'video url') - title = self._og_search_title(webpage, default=None) or self._search_regex( - r'<title>([^<]+)</title>', webpage, 'title') + title = self._og_search_title(webpage, default=None) or self._html_extract_title(webpage) duration = int_or_none(self._og_search_property( 'video:duration', webpage, 'duration', default=None)) diff --git a/yt_dlp/extractor/foxgay.py b/yt_dlp/extractor/foxgay.py index 512a10645..1c53e0642 100644 --- a/yt_dlp/extractor/foxgay.py +++ b/yt_dlp/extractor/foxgay.py @@ -29,8 +29,7 @@ class FoxgayIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = remove_end(self._html_search_regex( - r'<title>([^<]+)</title>', webpage, 'title'), ' - Foxgay.com') + title = remove_end(self._html_extract_title(webpage), ' - Foxgay.com') description = get_element_by_id('inf_tit', webpage) # The default user-agent with foxgay cookies leads to pages without videos diff --git a/yt_dlp/extractor/funimation.py b/yt_dlp/extractor/funimation.py index 36a9c4772..6aa9bc9ce 100644 --- a/yt_dlp/extractor/funimation.py +++ b/yt_dlp/extractor/funimation.py @@ -333,7 +333,7 @@ class FunimationShowIE(FunimationBaseIE): 'https://prod-api-funimationnow.dadcdigital.com/api/funimation/episodes/?limit=99999&title_id=%s' % show_info.get('id'), display_id) - vod_items = traverse_obj(items_info, ('items', ..., re.compile('(?i)mostRecent[AS]vod').match, 'item')) + vod_items = traverse_obj(items_info, ('items', ..., lambda k, _: re.match(r'(?i)mostRecent[AS]vod', k), 'item')) return { '_type': 'playlist', diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 4a2e30158..bd56ad289 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -149,6 +149,7 @@ from .blogger import BloggerIE from .mainstreaming import MainStreamingIE from .gfycat import GfycatIE from .panopto import PanoptoBaseIE +from .ruutu import RuutuIE class GenericIE(InfoExtractor): @@ -2511,7 +2512,24 @@ class GenericIE(InfoExtractor): 'id': 'insert-a-quiz-into-a-panopto-video' }, 'playlist_count': 1 - } + }, + 
{ + # Ruutu embed + 'url': 'https://www.nelonen.fi/ohjelmat/madventures-suomi/2160731-riku-ja-tunna-lahtevat-peurajahtiin-tv-sta-tutun-biologin-kanssa---metsastysreissu-huipentuu-kasvissyojan-painajaiseen', + 'md5': 'a2513a98d3496099e6eced40f7e6a14b', + 'info_dict': { + 'id': '4044426', + 'ext': 'mp4', + 'title': 'Riku ja Tunna lähtevät peurajahtiin tv:stä tutun biologin kanssa – metsästysreissu huipentuu kasvissyöjän painajaiseen!', + 'thumbnail': r're:^https?://.+\.jpg$', + 'duration': 108, + 'series': 'Madventures Suomi', + 'description': 'md5:aa55b44bd06a1e337a6f1d0b46507381', + 'categories': ['Matkailu', 'Elämäntyyli'], + 'age_limit': 0, + 'upload_date': '20220308', + }, + }, ] def report_following_redirect(self, new_url): @@ -2873,10 +2891,8 @@ class GenericIE(InfoExtractor): # Site Name | Video Title # Video Title - Tagline | Site Name # and so on and so forth; it's just not practical - video_title = self._og_search_title( - webpage, default=None) or self._html_search_regex( - r'(?s)<title>(.*?)</title>', webpage, 'video title', - default='video') + video_title = (self._og_search_title(webpage, default=None) + or self._html_extract_title(webpage, 'video title', default='video')) # Try to detect age limit automatically age_limit = self._rta_search(webpage) @@ -3739,6 +3755,12 @@ class GenericIE(InfoExtractor): panopto_urls = PanoptoBaseIE._extract_urls(webpage) if panopto_urls: return self.playlist_from_matches(panopto_urls, video_id, video_title) + + # Look for Ruutu embeds + ruutu_url = RuutuIE._extract_url(webpage) + if ruutu_url: + return self.url_result(ruutu_url, RuutuIE) + # Look for HTML5 media entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') if entries: @@ -3864,8 +3886,8 @@ class GenericIE(InfoExtractor): if RtmpIE.suitable(vurl): return True vpath = compat_urlparse.urlparse(vurl).path - vext = determine_ext(vpath) - return '.' 
in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js', 'xml') + vext = determine_ext(vpath, None) + return vext not in (None, 'swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js', 'xml') def filter_video(urls): return list(filter(check_video, urls)) diff --git a/yt_dlp/extractor/glide.py b/yt_dlp/extractor/glide.py index d94dfbf09..12af859be 100644 --- a/yt_dlp/extractor/glide.py +++ b/yt_dlp/extractor/glide.py @@ -23,9 +23,7 @@ class GlideIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - title = self._html_search_regex( - r'<title>(.+?)</title>', webpage, - 'title', default=None) or self._og_search_title(webpage) + title = self._html_extract_title(webpage, default=None) or self._og_search_title(webpage) video_url = self._proto_relative_url(self._search_regex( r'<source[^>]+src=(["\'])(?P<url>.+?)\1', webpage, 'video URL', default=None, diff --git a/yt_dlp/extractor/hellporno.py b/yt_dlp/extractor/hellporno.py index fae425103..92d32cdcc 100644 --- a/yt_dlp/extractor/hellporno.py +++ b/yt_dlp/extractor/hellporno.py @@ -38,8 +38,7 @@ class HellPornoIE(InfoExtractor): webpage = self._download_webpage(url, display_id) - title = remove_end(self._html_search_regex( - r'<title>([^<]+)</title>', webpage, 'title'), ' - Hell Porno') + title = remove_end(self._html_extract_title(webpage), ' - Hell Porno') info = self._parse_html5_media_entries(url, webpage, display_id)[0] self._sort_formats(info['formats']) diff --git a/yt_dlp/extractor/huya.py b/yt_dlp/extractor/huya.py index b81439682..4e96f22fa 100644 --- a/yt_dlp/extractor/huya.py +++ b/yt_dlp/extractor/huya.py @@ -66,8 +66,7 @@ class HuyaLiveIE(InfoExtractor): room_info = try_get(stream_data, lambda x: x['data'][0]['gameLiveInfo']) if not room_info: raise ExtractorError('Can not extract the room info', expected=True) - title = room_info.get('roomName') or room_info.get('introduction') or self._html_search_regex( - r'<title>([^<]+)</title>', webpage, 'title') + title = room_info.get('roomName') or room_info.get('introduction') or self._html_extract_title(webpage) screen_type = room_info.get('screenType') live_source_type = room_info.get('liveSourceType') stream_info_list = stream_data['data'][0]['gameStreamInfoList'] diff --git a/yt_dlp/extractor/imdb.py b/yt_dlp/extractor/imdb.py index 24f1fde64..96cee2e2f 100644 --- a/yt_dlp/extractor/imdb.py +++ b/yt_dlp/extractor/imdb.py @@ -7,9 +7,10 @@ import re from .common import InfoExtractor from ..utils import ( determine_ext, + int_or_none, mimetype2ext, - parse_duration, qualities, + traverse_obj, try_get, url_or_none, ) @@ -28,6 +29,17 @@ class ImdbIE(InfoExtractor): 'title': 'No. 2', 'description': 'md5:87bd0bdc61e351f21f20d2d7441cb4e7', 'duration': 152, + 'thumbnail': r're:^https?://.+\.jpg', + } + }, { + 'url': 'https://www.imdb.com/video/vi3516832537', + 'info_dict': { + 'id': '3516832537', + 'ext': 'mp4', + 'title': 'Paul: U.S. 
Trailer #1', + 'description': 'md5:17fcc4fe11ec29b4399be9d4c5ef126c', + 'duration': 153, + 'thumbnail': r're:^https?://.+\.jpg', } }, { 'url': 'http://www.imdb.com/video/_/vi2524815897', @@ -51,8 +63,13 @@ class ImdbIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - - data = self._download_json( + webpage = self._download_webpage(f'https://www.imdb.com/video/vi{video_id}', video_id) + info = self._search_nextjs_data(webpage, video_id) + video_info = traverse_obj(info, ('props', 'pageProps', 'videoPlaybackData', 'video'), default={}) + title = (traverse_obj(video_info, ('name', 'value'), ('primaryTitle', 'titleText', 'text')) + or self._html_search_meta(('og:title', 'twitter:title'), webpage, default=None) + or self._html_extract_title(webpage)) + data = video_info.get('playbackURLs') or try_get(self._download_json( 'https://www.imdb.com/ve/data/VIDEO_PLAYBACK_DATA', video_id, query={ 'key': base64.b64encode(json.dumps({ @@ -60,11 +77,10 @@ class ImdbIE(InfoExtractor): 'subType': 'FORCE_LEGACY', 'id': 'vi%s' % video_id, }).encode()).decode(), - })[0] - + }), lambda x: x[0]['videoLegacyEncodings']) quality = qualities(('SD', '480p', '720p', '1080p')) - formats = [] - for encoding in data['videoLegacyEncodings']: + formats, subtitles = [], {} + for encoding in data: if not encoding or not isinstance(encoding, dict): continue video_url = url_or_none(encoding.get('url')) @@ -73,11 +89,13 @@ class ImdbIE(InfoExtractor): ext = mimetype2ext(encoding.get( 'mimeType')) or determine_ext(video_url) if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( + fmts, subs = self._extract_m3u8_formats_and_subtitles( video_url, video_id, 'mp4', entry_protocol='m3u8_native', - preference=1, m3u8_id='hls', fatal=False)) + preference=1, m3u8_id='hls', fatal=False) + subtitles = self._merge_subtitles(subtitles, subs) + formats.extend(fmts) continue - format_id = encoding.get('definition') + format_id = traverse_obj(encoding, ('displayName', 'value'), 'definition') formats.append({ 'format_id': format_id, 'url': video_url, @@ -86,33 +104,15 @@ class ImdbIE(InfoExtractor): }) self._sort_formats(formats) - webpage = self._download_webpage( - 'https://www.imdb.com/video/vi' + video_id, video_id) - video_metadata = self._parse_json(self._search_regex( - r'args\.push\(\s*({.+?})\s*\)\s*;', webpage, - 'video metadata'), video_id) - - video_info = video_metadata.get('VIDEO_INFO') - if video_info and isinstance(video_info, dict): - info = try_get( - video_info, lambda x: x[list(video_info.keys())[0]][0], dict) - else: - info = {} - - title = self._html_search_meta( - ['og:title', 'twitter:title'], webpage) or self._html_search_regex( - r'<title>(.+?)</title>', webpage, 'title', - default=None) or info['videoTitle'] - return { 'id': video_id, 'title': title, 'alt_title': info.get('videoSubTitle'), 'formats': formats, - 'description': info.get('videoDescription'), - 'thumbnail': url_or_none(try_get( - info, lambda x: x['videoSlate']['source'])), - 'duration': parse_duration(info.get('videoRuntime')), + 'description': try_get(video_info, lambda x: x['description']['value']), + 'thumbnail': url_or_none(try_get(video_info, lambda x: x['thumbnail']['url'])), + 'duration': int_or_none(try_get(video_info, lambda x: x['runtime']['value'])), + 'subtitles': subtitles, } diff --git a/yt_dlp/extractor/infoq.py b/yt_dlp/extractor/infoq.py index 0a70a1fb4..347cc5154 100644 --- a/yt_dlp/extractor/infoq.py +++ b/yt_dlp/extractor/infoq.py @@ -115,7 +115,7 @@ class InfoQIE(BokeCCBaseIE): video_id = 
self._match_id(url) webpage = self._download_webpage(url, video_id) - video_title = self._html_search_regex(r'<title>(.*?)</title>', webpage, 'title') + video_title = self._html_extract_title(webpage) video_description = self._html_search_meta('description', webpage, 'description') if '/cn/' in url: diff --git a/yt_dlp/extractor/iqiyi.py b/yt_dlp/extractor/iqiyi.py index b13b9f4cf..d07b39d48 100644 --- a/yt_dlp/extractor/iqiyi.py +++ b/yt_dlp/extractor/iqiyi.py @@ -9,14 +9,28 @@ import time from .common import InfoExtractor from ..compat import ( compat_str, + compat_urllib_parse_unquote ) +from .openload import PhantomJSwrapper from ..utils import ( clean_html, + ExtractorError, + float_or_none, + format_field, get_element_by_id, get_element_by_attribute, - ExtractorError, + int_or_none, + js_to_json, ohdave_rsa_encrypt, + parse_age_limit, + parse_duration, + parse_iso8601, + parse_resolution, + qualities, remove_start, + str_or_none, + traverse_obj, + urljoin, ) @@ -96,9 +110,6 @@ class IqiyiIE(InfoExtractor): '18': 7, # 1080p } - def _real_initialize(self): - self._login() - @staticmethod def _rsa_fun(data): # public key extracted from http://static.iqiyi.com/js/qiyiV2/20160129180840/jobs/i18n/i18nIndex.js @@ -107,7 +118,7 @@ class IqiyiIE(InfoExtractor): return ohdave_rsa_encrypt(data, e, N) - def _login(self): + def _perform_login(self): raise ExtractorError("iQiyi's non-free authentication algorithm has made login impossible", expected=True) def get_raw_data(self, tvid, video_id): @@ -217,3 +228,359 @@ class IqiyiIE(InfoExtractor): 'title': title, 'formats': formats, } + + +class IqIE(InfoExtractor): + IE_NAME = 'iq.com' + IE_DESC = 'International version of iQiyi' + _VALID_URL = r'https?://(?:www\.)?iq\.com/play/(?:[\w%-]*-)?(?P<id>\w+)' + _TESTS = [{ + 'url': 'https://www.iq.com/play/one-piece-episode-1000-1ma1i6ferf4', + 'md5': '2d7caf6eeca8a32b407094b33b757d39', + 'info_dict': { + 'ext': 'mp4', + 'id': '1ma1i6ferf4', + 'title': '航海王 第1000集', + 'description': 'Subtitle available on Sunday 4PM(GMT+8).', + 'duration': 1430, + 'timestamp': 1637488203, + 'upload_date': '20211121', + 'episode_number': 1000, + 'episode': 'Episode 1000', + 'series': 'One Piece', + 'age_limit': 13, + 'average_rating': float, + }, + 'params': { + 'format': '500', + }, + 'expected_warnings': ['format is restricted'] + }, { + # VIP-restricted video + 'url': 'https://www.iq.com/play/mermaid-in-the-fog-2021-gbdpx13bs4', + 'only_matching': True + }] + _BID_TAGS = { + '100': '240P', + '200': '360P', + '300': '480P', + '500': '720P', + '600': '1080P', + '610': '1080P50', + '700': '2K', + '800': '4K', + } + _LID_TAGS = { + '1': 'zh_CN', + '2': 'zh_TW', + '3': 'en', + '18': 'th', + '21': 'my', + '23': 'vi', + '24': 'id', + '26': 'es', + '28': 'ar', + } + + _DASH_JS = ''' + console.log(page.evaluate(function() { + var tvid = "%(tvid)s"; var vid = "%(vid)s"; var src = "%(src)s"; + var uid = "%(uid)s"; var dfp = "%(dfp)s"; var mode = "%(mode)s"; var lang = "%(lang)s"; + var bid_list = %(bid_list)s; var ut_list = %(ut_list)s; var tm = new Date().getTime(); + var cmd5x_func = %(cmd5x_func)s; var cmd5x_exporter = {}; cmd5x_func({}, cmd5x_exporter, {}); var cmd5x = cmd5x_exporter.cmd5x; + var authKey = cmd5x(cmd5x('') + tm + '' + tvid); + var k_uid = Array.apply(null, Array(32)).map(function() {return Math.floor(Math.random() * 15).toString(16)}).join(''); + var dash_paths = {}; + bid_list.forEach(function(bid) { + var query = { + 'tvid': tvid, + 'bid': bid, + 'ds': 1, + 'vid': vid, + 'src': src, + 'vt': 0, + 'rs': 1, + 
'uid': uid, + 'ori': 'pcw', + 'ps': 1, + 'k_uid': k_uid, + 'pt': 0, + 'd': 0, + 's': '', + 'lid': '', + 'slid': 0, + 'cf': '', + 'ct': '', + 'authKey': authKey, + 'k_tag': 1, + 'ost': 0, + 'ppt': 0, + 'dfp': dfp, + 'prio': JSON.stringify({ + 'ff': 'f4v', + 'code': 2 + }), + 'k_err_retries': 0, + 'up': '', + 'su': 2, + 'applang': lang, + 'sver': 2, + 'X-USER-MODE': mode, + 'qd_v': 2, + 'tm': tm, + 'qdy': 'a', + 'qds': 0, + 'k_ft1': 141287244169348, + 'k_ft4': 34359746564, + 'k_ft5': 1, + 'bop': JSON.stringify({ + 'version': '10.0', + 'dfp': dfp + }), + }; + var enc_params = []; + for (var prop in query) { + enc_params.push(encodeURIComponent(prop) + '=' + encodeURIComponent(query[prop])); + } + ut_list.forEach(function(ut) { + enc_params.push('ut=' + ut); + }) + var dash_path = '/dash?' + enc_params.join('&'); dash_path += '&vf=' + cmd5x(dash_path); + dash_paths[bid] = dash_path; + }); + return JSON.stringify(dash_paths); + })); + saveAndExit(); + ''' + + def _extract_vms_player_js(self, webpage, video_id): + player_js_cache = self._downloader.cache.load('iq', 'player_js') + if player_js_cache: + return player_js_cache + webpack_js_url = self._proto_relative_url(self._search_regex( + r'<script src="((?:https?)?//stc.iqiyipic.com/_next/static/chunks/webpack-\w+\.js)"', webpage, 'webpack URL')) + webpack_js = self._download_webpage(webpack_js_url, video_id, note='Downloading webpack JS', errnote='Unable to download webpack JS') + webpack_map1, webpack_map2 = [self._parse_json(js_map, video_id, transform_source=js_to_json) for js_map in self._search_regex( + r'\(({[^}]*})\[\w+\][^\)]*\)\s*\+\s*["\']\.["\']\s*\+\s*({[^}]*})\[\w+\]\+["\']\.js', webpack_js, 'JS locations', group=(1, 2))] + for module_index in reversed(list(webpack_map2.keys())): + module_js = self._download_webpage( + f'https://stc.iqiyipic.com/_next/static/chunks/{webpack_map1.get(module_index, module_index)}.{webpack_map2[module_index]}.js', + video_id, note=f'Downloading #{module_index} module JS', errnote='Unable to download module JS', fatal=False) or '' + if 'vms request' in module_js: + self._downloader.cache.store('iq', 'player_js', module_js) + return module_js + raise ExtractorError('Unable to extract player JS') + + def _extract_cmd5x_function(self, webpage, video_id): + return self._search_regex(r',\s*(function\s*\([^\)]*\)\s*{\s*var _qda.+_qdc\(\)\s*})\s*,', + self._extract_vms_player_js(webpage, video_id), 'signature function') + + def _update_bid_tags(self, webpage, video_id): + extracted_bid_tags = self._parse_json( + self._search_regex( + r'arguments\[1\][^,]*,\s*function\s*\([^\)]*\)\s*{\s*"use strict";?\s*var \w=({.+}})\s*,\s*\w\s*=\s*{\s*getNewVd', + self._extract_vms_player_js(webpage, video_id), 'video tags', default=''), + video_id, transform_source=js_to_json, fatal=False) + if not extracted_bid_tags: + return + self._BID_TAGS = { + bid: traverse_obj(extracted_bid_tags, (bid, 'value'), expected_type=str, default=self._BID_TAGS.get(bid)) + for bid in extracted_bid_tags.keys() + } + + def _get_cookie(self, name, default=None): + cookie = self._get_cookies('https://iq.com/').get(name) + return cookie.value if cookie else default + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + self._update_bid_tags(webpage, video_id) + + next_props = self._search_nextjs_data(webpage, video_id)['props'] + page_data = next_props['initialState']['play'] + video_info = page_data['curVideoInfo'] + + uid = traverse_obj( + self._parse_json( + 
self._get_cookie('I00002', '{}'), video_id, transform_source=compat_urllib_parse_unquote, fatal=False), + ('data', 'uid'), default=0) + + if uid: + vip_data = self._download_json( + 'https://pcw-api.iq.com/api/vtype', video_id, note='Downloading VIP data', errnote='Unable to download VIP data', query={ + 'batch': 1, + 'platformId': 3, + 'modeCode': self._get_cookie('mod', 'intl'), + 'langCode': self._get_cookie('lang', 'en_us'), + 'deviceId': self._get_cookie('QC005', '') + }, fatal=False) + ut_list = traverse_obj(vip_data, ('data', 'all_vip', ..., 'vipType'), expected_type=str_or_none, default=[]) + else: + ut_list = ['0'] + + # bid 0 as an initial format checker + dash_paths = self._parse_json(PhantomJSwrapper(self).get( + url, html='<!DOCTYPE html>', video_id=video_id, note2='Executing signature code', jscode=self._DASH_JS % { + 'tvid': video_info['tvId'], + 'vid': video_info['vid'], + 'src': traverse_obj(next_props, ('initialProps', 'pageProps', 'ptid'), + expected_type=str, default='04022001010011000000'), + 'uid': uid, + 'dfp': self._get_cookie('dfp', ''), + 'mode': self._get_cookie('mod', 'intl'), + 'lang': self._get_cookie('lang', 'en_us'), + 'bid_list': '[' + ','.join(['0', *self._BID_TAGS.keys()]) + ']', + 'ut_list': '[' + ','.join(ut_list) + ']', + 'cmd5x_func': self._extract_cmd5x_function(webpage, video_id), + })[1].strip(), video_id) + + formats, subtitles = [], {} + initial_format_data = self._download_json( + urljoin('https://cache-video.iq.com', dash_paths['0']), video_id, + note='Downloading initial video format info', errnote='Unable to download initial video format info')['data'] + + preview_time = traverse_obj( + initial_format_data, ('boss_ts', (None, 'data'), ('previewTime', 'rtime')), expected_type=float_or_none, get_all=False) + if traverse_obj(initial_format_data, ('boss_ts', 'data', 'prv'), expected_type=int_or_none): + self.report_warning('This preview video is limited%s' % format_field(preview_time, template=' to %s seconds')) + + # TODO: Extract audio-only formats + for bid in set(traverse_obj(initial_format_data, ('program', 'video', ..., 'bid'), expected_type=str_or_none, default=[])): + dash_path = dash_paths.get(bid) + if not dash_path: + self.report_warning(f'Unknown format id: {bid}. 
It is currently not being extracted') + continue + format_data = traverse_obj(self._download_json( + urljoin('https://cache-video.iq.com', dash_path), video_id, + note=f'Downloading format data for {self._BID_TAGS[bid]}', errnote='Unable to download format data', + fatal=False), 'data', expected_type=dict) + + video_format = traverse_obj(format_data, ('program', 'video', lambda _, v: str(v['bid']) == bid), + expected_type=dict, default=[], get_all=False) or {} + extracted_formats = [] + if video_format.get('m3u8Url'): + extracted_formats.extend(self._extract_m3u8_formats( + urljoin(format_data.get('dm3u8', 'https://cache-m.iq.com/dc/dt/'), video_format['m3u8Url']), + 'mp4', m3u8_id=bid, fatal=False)) + if video_format.get('mpdUrl'): + # TODO: Properly extract mpd hostname + extracted_formats.extend(self._extract_mpd_formats( + urljoin(format_data.get('dm3u8', 'https://cache-m.iq.com/dc/dt/'), video_format['mpdUrl']), + mpd_id=bid, fatal=False)) + if video_format.get('m3u8'): + ff = video_format.get('ff', 'ts') + if ff == 'ts': + m3u8_formats, _ = self._parse_m3u8_formats_and_subtitles( + video_format['m3u8'], ext='mp4', m3u8_id=bid, fatal=False) + extracted_formats.extend(m3u8_formats) + elif ff == 'm4s': + mpd_data = traverse_obj( + self._parse_json(video_format['m3u8'], video_id, fatal=False), ('payload', ..., 'data'), expected_type=str) + if not mpd_data: + continue + mpd_formats, _ = self._parse_mpd_formats_and_subtitles( + mpd_data, bid, format_data.get('dm3u8', 'https://cache-m.iq.com/dc/dt/')) + extracted_formats.extend(mpd_formats) + else: + self.report_warning(f'{ff} formats are currently not supported') + + if not extracted_formats: + if video_format.get('s'): + self.report_warning(f'{self._BID_TAGS[bid]} format is restricted') + else: + self.report_warning(f'Unable to extract {self._BID_TAGS[bid]} format') + for f in extracted_formats: + f.update({ + 'quality': qualities(list(self._BID_TAGS.keys()))(bid), + 'format_note': self._BID_TAGS[bid], + **parse_resolution(video_format.get('scrsz')) + }) + formats.extend(extracted_formats) + + self._sort_formats(formats) + + for sub_format in traverse_obj(initial_format_data, ('program', 'stl', ...), expected_type=dict, default=[]): + lang = self._LID_TAGS.get(str_or_none(sub_format.get('lid')), sub_format.get('_name')) + subtitles.setdefault(lang, []).extend([{ + 'ext': format_ext, + 'url': urljoin(initial_format_data.get('dstl', 'http://meta.video.iqiyi.com'), sub_format[format_key]) + } for format_key, format_ext in [('srt', 'srt'), ('webvtt', 'vtt')] if sub_format.get(format_key)]) + + extra_metadata = page_data.get('albumInfo') if video_info.get('albumId') and page_data.get('albumInfo') else video_info + return { + 'id': video_id, + 'title': video_info['name'], + 'formats': formats, + 'subtitles': subtitles, + 'description': video_info.get('mergeDesc'), + 'duration': parse_duration(video_info.get('len')), + 'age_limit': parse_age_limit(video_info.get('rating')), + 'average_rating': traverse_obj(page_data, ('playScoreInfo', 'score'), expected_type=float_or_none), + 'timestamp': parse_iso8601(video_info.get('isoUploadDate')), + 'categories': traverse_obj(extra_metadata, ('videoTagMap', ..., ..., 'name'), expected_type=str), + 'cast': traverse_obj(extra_metadata, ('actorArr', ..., 'name'), expected_type=str), + 'episode_number': int_or_none(video_info.get('order')) or None, + 'series': video_info.get('albumName'), + } + + +class IqAlbumIE(InfoExtractor): + IE_NAME = 'iq.com:album' + _VALID_URL = 
r'https?://(?:www\.)?iq\.com/album/(?:[\w%-]*-)?(?P<id>\w+)' + _TESTS = [{ + 'url': 'https://www.iq.com/album/one-piece-1999-1bk9icvr331', + 'info_dict': { + 'id': '1bk9icvr331', + 'title': 'One Piece', + 'description': 'Subtitle available on Sunday 4PM(GMT+8).' + }, + 'playlist_mincount': 238 + }, { + # Movie/single video + 'url': 'https://www.iq.com/album/九龙城寨-2021-22yjnij099k', + 'info_dict': { + 'ext': 'mp4', + 'id': '22yjnij099k', + 'title': '九龙城寨', + 'description': 'md5:8a09f50b8ba0db4dc69bc7c844228044', + 'duration': 5000, + 'timestamp': 1641911371, + 'upload_date': '20220111', + 'series': '九龙城寨', + 'cast': ['Shi Yan Neng', 'Yu Lang', 'Peter lv', 'Sun Zi Jun', 'Yang Xiao Bo'], + 'age_limit': 13, + 'average_rating': float, + }, + 'expected_warnings': ['format is restricted'] + }] + + def _entries(self, album_id_num, page_ranges, album_id=None, mode_code='intl', lang_code='en_us'): + for page_range in page_ranges: + page = self._download_json( + f'https://pcw-api.iq.com/api/episodeListSource/{album_id_num}', album_id, + note=f'Downloading video list episodes {page_range.get("msg", "")}', + errnote='Unable to download video list', query={ + 'platformId': 3, + 'modeCode': mode_code, + 'langCode': lang_code, + 'endOrder': page_range['to'], + 'startOrder': page_range['from'] + }) + for video in page['data']['epg']: + yield self.url_result('https://www.iq.com/play/%s' % (video.get('playLocSuffix') or video['qipuIdStr']), + IqIE.ie_key(), video.get('qipuIdStr'), video.get('name')) + + def _real_extract(self, url): + album_id = self._match_id(url) + webpage = self._download_webpage(url, album_id) + next_data = self._search_nextjs_data(webpage, album_id) + album_data = next_data['props']['initialState']['album']['videoAlbumInfo'] + + if album_data.get('videoType') == 'singleVideo': + return self.url_result('https://www.iq.com/play/%s' % album_id, IqIE.ie_key()) + return self.playlist_result( + self._entries(album_data['albumId'], album_data['totalPageRange'], album_id, + traverse_obj(next_data, ('props', 'initialProps', 'pageProps', 'modeCode')), + traverse_obj(next_data, ('props', 'initialProps', 'pageProps', 'langCode'))), + album_id, album_data.get('name'), album_data.get('desc')) diff --git a/yt_dlp/extractor/iwara.py b/yt_dlp/extractor/iwara.py index 254d98692..c0e01e352 100644 --- a/yt_dlp/extractor/iwara.py +++ b/yt_dlp/extractor/iwara.py @@ -76,8 +76,7 @@ class IwaraIE(InfoExtractor): 'age_limit': age_limit, } - title = remove_end(self._html_search_regex( - r'<title>([^<]+)</title>', webpage, 'title'), ' | Iwara') + title = remove_end(self._html_extract_title(webpage), ' | Iwara') thumbnail = self._html_search_regex( r'poster=[\'"]([^\'"]+)', webpage, 'thumbnail', default=None) diff --git a/yt_dlp/extractor/limelight.py b/yt_dlp/extractor/limelight.py index 369141d67..b20681ad1 100644 --- a/yt_dlp/extractor/limelight.py +++ b/yt_dlp/extractor/limelight.py @@ -194,7 +194,7 @@ class LimelightBaseIE(InfoExtractor): cc_url = cc.get('webvttFileUrl') if not cc_url: continue - lang = cc.get('languageCode') or self._search_regex(r'/[a-z]{2}\.vtt', cc_url, 'lang', default='en') + lang = cc.get('languageCode') or self._search_regex(r'/([a-z]{2})\.vtt', cc_url, 'lang', default='en') subtitles.setdefault(lang, []).append({ 'url': cc_url, }) diff --git a/yt_dlp/extractor/linkedin.py b/yt_dlp/extractor/linkedin.py index bf549e164..0f57bfa06 100644 --- a/yt_dlp/extractor/linkedin.py +++ b/yt_dlp/extractor/linkedin.py @@ -102,7 +102,7 @@ class LinkedInIE(LinkedInBaseIE): video_id = 
self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r'<title>([^<]+)</title>', webpage, 'title') + title = self._html_extract_title(webpage) description = clean_html(get_element_by_class('share-update-card__update-text', webpage)) like_count = int_or_none(get_element_by_class('social-counts-reactions__social-counts-numRections', webpage)) creator = strip_or_none(clean_html(get_element_by_class('comment__actor-name', webpage))) diff --git a/yt_dlp/extractor/mediasite.py b/yt_dlp/extractor/mediasite.py index ace86c2fd..fbf9223b2 100644 --- a/yt_dlp/extractor/mediasite.py +++ b/yt_dlp/extractor/mediasite.py @@ -14,6 +14,7 @@ from ..utils import ( float_or_none, mimetype2ext, str_or_none, + try_call, try_get, unescapeHTML, unsmuggle_url, @@ -145,11 +146,11 @@ class MediasiteIE(InfoExtractor): 'duration': slide['Time'] / 1000, }) - next_time = try_get(None, [ - lambda _: Stream['Slides'][i + 1]['Time'], - lambda _: duration, - lambda _: slide['Time'], - ], expected_type=(int, float)) + next_time = try_call( + lambda: Stream['Slides'][i + 1]['Time'], + lambda: duration, + lambda: slide['Time'], + expected_type=(int, float)) fragments.append({ 'path': fname_template.format(slide.get('Number', i + 1)), diff --git a/yt_dlp/extractor/miaopai.py b/yt_dlp/extractor/miaopai.py index f9e35ac7f..cf0610bdf 100644 --- a/yt_dlp/extractor/miaopai.py +++ b/yt_dlp/extractor/miaopai.py @@ -24,8 +24,7 @@ class MiaoPaiIE(InfoExtractor): webpage = self._download_webpage( url, video_id, headers={'User-Agent': self._USER_AGENT_IPAD}) - title = self._html_search_regex( - r'<title>([^<]+)</title>', webpage, 'title') + title = self._html_extract_title(webpage) thumbnail = self._html_search_regex( r'<div[^>]+class=(?P<q1>[\'"]).*\bvideo_img\b.*(?P=q1)[^>]+data-url=(?P<q2>[\'"])(?P<url>[^\'"]+)(?P=q2)', webpage, 'thumbnail', fatal=False, group='url') diff --git a/yt_dlp/extractor/mojvideo.py b/yt_dlp/extractor/mojvideo.py index 0421f3f44..16d94052b 100644 --- a/yt_dlp/extractor/mojvideo.py +++ b/yt_dlp/extractor/mojvideo.py @@ -38,8 +38,7 @@ class MojvideoIE(InfoExtractor): r'<errordesc>([^<]*)</errordesc>', playerapi, 'error description', fatal=False) raise ExtractorError('%s said: %s' % (self.IE_NAME, error_desc), expected=True) - title = self._html_search_regex( - r'<title>([^<]+)</title>', playerapi, 'title') + title = self._html_extract_title(playerapi) video_url = self._html_search_regex( r'<file>([^<]+)</file>', playerapi, 'video URL') thumbnail = self._html_search_regex( diff --git a/yt_dlp/extractor/nebula.py b/yt_dlp/extractor/nebula.py index b77ef5f28..77f253519 100644 --- a/yt_dlp/extractor/nebula.py +++ b/yt_dlp/extractor/nebula.py @@ -86,7 +86,7 @@ class NebulaBaseIE(InfoExtractor): # if 401 or 403, attempt credential re-auth and retry if exc.cause and isinstance(exc.cause, urllib.error.HTTPError) and exc.cause.code in (401, 403): self.to_screen(f'Reauthenticating to Nebula and retrying, because last {auth_type} call resulted in error {exc.cause.code}') - self._login() + self._perform_login() return inner_call() else: raise diff --git a/yt_dlp/extractor/newgrounds.py b/yt_dlp/extractor/newgrounds.py index 1e1274ef0..6525a6d8a 100644 --- a/yt_dlp/extractor/newgrounds.py +++ b/yt_dlp/extractor/newgrounds.py @@ -106,8 +106,7 @@ class NewgroundsIE(InfoExtractor): uploader = None webpage = self._download_webpage(url, media_id) - title = self._html_search_regex( - r'<title>(.+?)</title>', webpage, 'title') + title = self._html_extract_title(webpage) 
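A recurring cleanup in this commit replaces one-off '<title>' regexes with the shared _html_extract_title helper. The sketch below is an approximation for illustration only (the exact regex and the strip() are assumptions; the real method lives on InfoExtractor and reuses its default/fatal handling):

import re

def html_extract_title(html, default=None):
    # Approximate the helper: take the text of the first <title> element,
    # tolerating attributes and embedded newlines, else fall back to default.
    match = re.search(r'(?s)<title\b[^>]*>([^<]+)</title>', html)
    return match.group(1).strip() if match else default

assert html_extract_title('<html><title>\nClip 42 - Site</title></html>') == 'Clip 42 - Site'
assert html_extract_title('<html><body>untitled</body></html>', default='video') == 'video'

Call sites that previously trimmed a site-name suffix keep doing so around the helper, as in remove_end(self._html_extract_title(webpage), ' | Iwara') above.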
media_url_string = self._search_regex( r'"url"\s*:\s*("[^"]+"),', webpage, 'media url', default=None) @@ -219,8 +218,7 @@ class NewgroundsPlaylistIE(InfoExtractor): webpage = self._download_webpage(url, playlist_id) - title = self._search_regex( - r'<title>([^>]+)</title>', webpage, 'title', default=None) + title = self._html_extract_title(webpage, default=None) # cut left menu webpage = self._search_regex( diff --git a/yt_dlp/extractor/nhk.py b/yt_dlp/extractor/nhk.py index 626c6379b..3b8efc3e6 100644 --- a/yt_dlp/extractor/nhk.py +++ b/yt_dlp/extractor/nhk.py @@ -309,7 +309,9 @@ class NhkForSchoolProgramListIE(InfoExtractor): webpage = self._download_webpage(f'https://www.nhk.or.jp/school/{program_id}/', program_id) - title = self._og_search_title(webpage, fatal=False) or self._html_extract_title(webpage, fatal=False) or self._html_search_regex(r'<h3>([^<]+?)とは?\s*</h3>', webpage, 'title', fatal=False) + title = (self._og_search_title(webpage) + or self._html_extract_title(webpage) + or self._html_search_regex(r'<h3>([^<]+?)とは?\s*</h3>', webpage, 'title', fatal=False)) title = re.sub(r'\s*\|\s*NHK\s+for\s+School\s*$', '', title) if title else None description = self._html_search_regex( r'(?s)<div\s+class="programDetail\s*">\s*<p>[^<]+</p>', diff --git a/yt_dlp/extractor/niconico.py b/yt_dlp/extractor/niconico.py index 74828f833..4eb6ed070 100644 --- a/yt_dlp/extractor/niconico.py +++ b/yt_dlp/extractor/niconico.py @@ -25,7 +25,10 @@ from ..utils import ( parse_duration, parse_filesize, parse_iso8601, + parse_resolution, + qualities, remove_start, + str_or_none, traverse_obj, try_get, unescapeHTML, @@ -430,18 +433,25 @@ class NiconicoIE(InfoExtractor): # find in json (logged in) tags = traverse_obj(api_data, ('tag', 'items', ..., 'name')) + thumb_prefs = qualities(['url', 'middleUrl', 'largeUrl', 'player', 'ogp']) + return { 'id': video_id, '_api_data': api_data, 'title': get_video_info(('originalTitle', 'title')) or self._og_search_title(webpage, default=None), 'formats': formats, - 'thumbnail': get_video_info('thumbnail', 'url') or self._html_search_meta( - ('image', 'og:image'), webpage, 'thumbnail', default=None), + 'thumbnails': [{ + 'id': key, + 'url': url, + 'ext': 'jpg', + 'preference': thumb_prefs(key), + **parse_resolution(url, lenient=True), + } for key, url in (get_video_info('thumbnail') or {}).items() if url], 'description': clean_html(get_video_info('description')), - 'uploader': traverse_obj(api_data, ('owner', 'nickname')), + 'uploader': traverse_obj(api_data, ('owner', 'nickname'), ('channel', 'name'), ('community', 'name')), + 'uploader_id': str_or_none(traverse_obj(api_data, ('owner', 'id'), ('channel', 'id'), ('community', 'id'))), 'timestamp': parse_iso8601(get_video_info('registeredAt')) or parse_iso8601( self._html_search_meta('video:release_date', webpage, 'date published', default=None)), - 'uploader_id': traverse_obj(api_data, ('owner', 'id')), 'channel': traverse_obj(api_data, ('channel', 'name'), ('community', 'name')), 'channel_id': traverse_obj(api_data, ('channel', 'id'), ('community', 'id')), 'view_count': int_or_none(get_video_info('count', 'view')), @@ -459,7 +469,7 @@ class NiconicoIE(InfoExtractor): comment_user_key = traverse_obj(api_data, ('comment', 'keys', 'userKey')) user_id_str = session_api_data.get('serviceUserId') - thread_ids = [x for x in traverse_obj(api_data, ('comment', 'threads')) or [] if x['isActive']] + thread_ids = traverse_obj(api_data, ('comment', 'threads', lambda _, v: v['isActive'])) raw_danmaku = 
self._extract_all_comments(video_id, thread_ids, user_id_str, comment_user_key) if not raw_danmaku: self.report_warning(f'Failed to get comments. {bug_reports_message()}') diff --git a/yt_dlp/extractor/openrec.py b/yt_dlp/extractor/openrec.py index b476c0986..5eb1cdbad 100644 --- a/yt_dlp/extractor/openrec.py +++ b/yt_dlp/extractor/openrec.py @@ -7,6 +7,7 @@ from ..utils import ( get_first, int_or_none, traverse_obj, + try_get, unified_strdate, unified_timestamp, ) @@ -18,6 +19,13 @@ class OpenRecBaseIE(InfoExtractor): return self._parse_json( self._search_regex(r'(?m)window\.pageStore\s*=\s*(\{.+?\});$', webpage, 'window.pageStore'), video_id) + def _expand_media(self, video_id, media): + for name, m3u8_url in (media or {}).items(): + if not m3u8_url: + continue + yield from self._extract_m3u8_formats( + m3u8_url, video_id, ext='mp4', m3u8_id=name) + def _extract_movie(self, webpage, video_id, name, is_live): window_stores = self._extract_pagestore(webpage, video_id) movie_stores = [ @@ -29,13 +37,21 @@ class OpenRecBaseIE(InfoExtractor): if not any(movie_stores): raise ExtractorError(f'Failed to extract {name} info') - m3u8_playlists = get_first(movie_stores, 'media') or {} - formats = [] - for name, m3u8_url in m3u8_playlists.items(): - if not m3u8_url: - continue - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, ext='mp4', live=is_live, m3u8_id=name)) + formats = list(self._expand_media(video_id, get_first(movie_stores, 'media'))) + if not formats and is_live: + # archived livestreams + cookies = self._get_cookies('https://www.openrec.tv/') + detail = self._download_json( + f'https://apiv5.openrec.tv/api/v5/movies/{video_id}/detail', video_id, + headers={ + 'Origin': 'https://www.openrec.tv', + 'Referer': 'https://www.openrec.tv/', + 'access-token': try_get(cookies, lambda x: x.get('access_token').value), + 'uuid': try_get(cookies, lambda x: x.get('uuid').value), + }) + new_media = traverse_obj(detail, ('data', 'items', ..., 'media'), get_all=False) + formats = list(self._expand_media(video_id, new_media)) + is_live = False self._sort_formats(formats) diff --git a/yt_dlp/extractor/playvid.py b/yt_dlp/extractor/playvid.py index 4aef186ea..e1c406b6c 100644 --- a/yt_dlp/extractor/playvid.py +++ b/yt_dlp/extractor/playvid.py @@ -85,8 +85,7 @@ class PlayvidIE(InfoExtractor): # Extract title - should be in the flashvars; if not, look elsewhere if video_title is None: - video_title = self._html_search_regex( - r'<title>(.*?)</title', webpage, 'title') + video_title = self._html_extract_title(webpage) return { 'id': video_id, diff --git a/yt_dlp/extractor/rai.py b/yt_dlp/extractor/rai.py index 9d243b2be..6864129c6 100644 --- a/yt_dlp/extractor/rai.py +++ b/yt_dlp/extractor/rai.py @@ -118,7 +118,7 @@ class RaiBaseIE(InfoExtractor): }) def _create_http_urls(self, relinker_url, fmts): - _RELINKER_REG = r'https?://(?P<host>[^/]+?)/(?:i/)?(?P<extra>[^/]+?)/(?P<path>.+?)/(?P<id>\d+)(?:_(?P<quality>[\d\,]+))?(?:\.mp4|/playlist\.m3u8).+?' + _RELINKER_REG = r'https?://(?P<host>[^/]+?)/(?:i/)?(?P<extra>[^/]+?)/(?P<path>.+?)/(?P<id>\w+)(?:_(?P<quality>[\d\,]+))?(?:\.mp4|/playlist\.m3u8).+?' 
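The rai.py change just above widens the id group of _RELINKER_REG from \d+ to \w+, so relinker URLs whose content ID contains letters still match. A trimmed-down illustration of the difference, keeping only the tail of the pattern and using hypothetical paths:

import re

# Simplified tails of the old and new patterns; only the (?P<id>...) class differs.
OLD_REG = re.compile(r'/(?P<id>\d+)(?:_(?P<quality>[\d,]+))?\.mp4')
NEW_REG = re.compile(r'/(?P<id>\w+)(?:_(?P<quality>[\d,]+))?\.mp4')

numeric = '/i/extra/path/123456_1800.mp4'  # hypothetical numeric ID
alnum = '/i/extra/path/Video1a2b3c.mp4'    # hypothetical alphanumeric ID

assert OLD_REG.search(numeric).group('id') == '123456'
assert OLD_REG.search(alnum) is None  # digits-only class rejects it
assert NEW_REG.search(alnum).group('id') == 'Video1a2b3c'
# Since \w also matches '_', the greedy match folds a _quality suffix into id
# when nothing else disambiguates it:
assert NEW_REG.search(numeric).group('id') == '123456_1800'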
_MP4_TMPL = '%s&overrideUserAgentRule=mp4-%s' _QUALITY = { # tbr: w, h diff --git a/yt_dlp/extractor/rule34video.py b/yt_dlp/extractor/rule34video.py index 522d4ccd5..a602a9f33 100644 --- a/yt_dlp/extractor/rule34video.py +++ b/yt_dlp/extractor/rule34video.py @@ -49,7 +49,7 @@ class Rule34VideoIE(InfoExtractor): 'quality': quality, }) - title = self._html_search_regex(r'<title>([^<]+)</title>', webpage, 'title') + title = self._html_extract_title(webpage) thumbnail = self._html_search_regex(r'preview_url:\s+\'([^\']+)\'', webpage, 'thumbnail', default=None) duration = self._html_search_regex(r'"icon-clock"></i>\s+<span>((?:\d+:?)+)', webpage, 'duration', default=None) diff --git a/yt_dlp/extractor/ruutu.py b/yt_dlp/extractor/ruutu.py index d9cf39d71..5a30e3360 100644 --- a/yt_dlp/extractor/ruutu.py +++ b/yt_dlp/extractor/ruutu.py @@ -1,6 +1,9 @@ # coding: utf-8 from __future__ import unicode_literals +import json +import re + from .common import InfoExtractor from ..compat import compat_urllib_parse_urlparse from ..utils import ( @@ -8,6 +11,8 @@ from ..utils import ( ExtractorError, find_xpath_attr, int_or_none, + traverse_obj, + try_call, unified_strdate, url_or_none, xpath_attr, @@ -123,6 +128,16 @@ class RuutuIE(InfoExtractor): ] _API_BASE = 'https://gatling.nelonenmedia.fi' + @classmethod + def _extract_url(cls, webpage): + settings = try_call( + lambda: json.loads(re.search( + r'jQuery\.extend\(Drupal\.settings, ({.+?})\);', webpage).group(1), strict=False)) + video_id = traverse_obj(settings, ( + 'mediaCrossbowSettings', 'file', 'field_crossbow_video_id', 'und', 0, 'value')) + if video_id: + return f'http://www.ruutu.fi/video/{video_id}' + def _real_extract(self, url): video_id = self._match_id(url) diff --git a/yt_dlp/extractor/senategov.py b/yt_dlp/extractor/senategov.py index 6f4240422..b295184a1 100644 --- a/yt_dlp/extractor/senategov.py +++ b/yt_dlp/extractor/senategov.py @@ -112,7 +112,7 @@ class SenateISVPIE(InfoExtractor): if smuggled_data.get('force_title'): title = smuggled_data['force_title'] else: - title = self._html_search_regex(r'<title>([^<]+)</title>', webpage, video_id) + title = self._html_extract_title(webpage) poster = qs.get('poster') thumbnail = poster[0] if poster else None diff --git a/yt_dlp/extractor/sunporno.py b/yt_dlp/extractor/sunporno.py index 68051169b..59b77bf92 100644 --- a/yt_dlp/extractor/sunporno.py +++ b/yt_dlp/extractor/sunporno.py @@ -36,8 +36,7 @@ class SunPornoIE(InfoExtractor): webpage = self._download_webpage( 'http://www.sunporno.com/videos/%s' % video_id, video_id) - title = self._html_search_regex( - r'<title>([^<]+)</title>', webpage, 'title') + title = self._html_extract_title(webpage) description = self._html_search_meta( 'description', webpage, 'description') thumbnail = self._html_search_regex( diff --git a/yt_dlp/extractor/tenplay.py b/yt_dlp/extractor/tenplay.py index 5b3222ecf..5c7b54531 100644 --- a/yt_dlp/extractor/tenplay.py +++ b/yt_dlp/extractor/tenplay.py @@ -7,6 +7,7 @@ import base64 from .common import InfoExtractor from ..utils import ( HEADRequest, + int_or_none, urlencode_postdata, ) @@ -15,6 +16,28 @@ class TenPlayIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?10play\.com\.au/(?:[^/]+/)+(?P<id>tpv\d{6}[a-z]{5})' _NETRC_MACHINE = '10play' _TESTS = [{ + 'url': 'https://10play.com.au/neighbours/web-extras/season-39/nathan-borg-is-the-first-aussie-actor-with-a-cochlear-implant-to-join-neighbours/tpv210128qupwd', + 'info_dict': { + 'id': '6226844312001', + 'ext': 'mp4', + 'title': 'Nathan Borg Is The First Aussie 
Actor With A Cochlear Implant To Join Neighbours', + 'alt_title': 'Nathan Borg Is The First Aussie Actor With A Cochlear Implant To Join Neighbours', + 'description': 'md5:a02d0199c901c2dd4c796f1e7dd0de43', + 'duration': 186, + 'season': 39, + 'series': 'Neighbours', + 'thumbnail': r're:https://.*\.jpg', + 'uploader': 'Channel 10', + 'age_limit': 15, + 'timestamp': 1611810000, + 'upload_date': '20210128', + 'uploader_id': '2199827728001', + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Only available in Australia', + }, { 'url': 'https://10play.com.au/todd-sampsons-body-hack/episodes/season-4/episode-7/tpv200921kvngh', 'info_dict': { 'id': '6192880312001', @@ -62,12 +85,17 @@ class TenPlayIE(InfoExtractor): def _real_extract(self, url): content_id = self._match_id(url) - _token = self._get_bearer_token(content_id) data = self._download_json( 'https://10play.com.au/api/v1/videos/' + content_id, content_id) + headers = {} + + if data.get('memberGated') is True: + _token = self._get_bearer_token(content_id) + headers = {'Authorization': _token} + _video_url = self._download_json( data.get('playbackApiEndpoint'), content_id, 'Downloading video JSON', - headers={'Authorization': _token}).get('source') + headers=headers).get('source') m3u8_url = self._request_webpage(HEADRequest( _video_url), content_id).geturl() if '10play-not-in-oz' in m3u8_url: @@ -77,12 +105,16 @@ class TenPlayIE(InfoExtractor): return { 'formats': formats, + 'subtitles': {'en': [{'url': data.get('captionUrl')}]} if data.get('captionUrl') else None, 'id': data.get('altId') or content_id, - 'title': data.get('title'), + 'duration': data.get('duration'), + 'title': data.get('subtitle'), + 'alt_title': data.get('title'), 'description': data.get('description'), 'age_limit': self._AUS_AGES.get(data.get('classification')), - 'series': data.get('showName'), - 'season': data.get('showContentSeason'), + 'series': data.get('tvShow'), + 'season': int_or_none(data.get('season')), + 'episode_number': int_or_none(data.get('episode')), 'timestamp': data.get('published'), 'thumbnail': data.get('imageUrl'), 'uploader': 'Channel 10', diff --git a/yt_dlp/extractor/thisav.py b/yt_dlp/extractor/thisav.py index 4af286e6d..6bb00b3ab 100644 --- a/yt_dlp/extractor/thisav.py +++ b/yt_dlp/extractor/thisav.py @@ -37,9 +37,7 @@ class ThisAVIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - title = remove_end(self._html_search_regex( - r'<title>([^<]+)</title>', webpage, 'title'), - ' - 視頻 - ThisAV.com-世界第一中文成人娛樂網站') + title = remove_end(self._html_extract_title(webpage), ' - 視頻 - ThisAV.com-世界第一中文成人娛樂網站') video_url = self._html_search_regex( r"addVariable\('file','([^']+)'\);", webpage, 'video url', default=None) if video_url: diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index 56cc2dcc6..c1d6c5477 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -263,8 +263,8 @@ class TikTokBaseIE(InfoExtractor): return { 'id': aweme_id, - 'title': aweme_detail['desc'], - 'description': aweme_detail['desc'], + 'title': aweme_detail.get('desc'), + 'description': aweme_detail.get('desc'), 'view_count': int_or_none(stats_info.get('play_count')), 'like_count': int_or_none(stats_info.get('digg_count')), 'repost_count': int_or_none(stats_info.get('share_count')), @@ -387,6 +387,9 @@ class TikTokIE(TikTokBaseIE): 'like_count': int, 'repost_count': int, 'comment_count': int, + 'artist': 'Ysrbeats', + 'album': 'Lehanga', + 'track': 'Lehanga', } }, { 'url': 
'https://www.tiktok.com/@patroxofficial/video/6742501081818877190?langCountry=en', @@ -410,6 +413,8 @@ class TikTokIE(TikTokBaseIE): 'like_count': int, 'repost_count': int, 'comment_count': int, + 'artist': 'Evan Todd, Jessica Keenan Wynn, Alice Lee, Barrett Wilbert Weed & Jon Eidson', + 'track': 'Big Fun', } }, { # Banned audio, only available on the app @@ -458,6 +463,30 @@ class TikTokIE(TikTokBaseIE): }, 'expected_warnings': ['Video not available'] }, { + # Video without title and description + 'url': 'https://www.tiktok.com/@pokemonlife22/video/7059698374567611694', + 'info_dict': { + 'id': '7059698374567611694', + 'ext': 'mp4', + 'title': 'tiktok video #7059698374567611694', + 'description': '', + 'uploader': 'pokemonlife22', + 'creator': 'Pokemon', + 'uploader_id': '6820838815978423302', + 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAA0tF1nBwQVVMyrGu3CqttkNgM68Do1OXUFuCY0CRQk8fEtSVDj89HqoqvbSTmUP2W', + 'track': 'original sound', + 'timestamp': 1643714123, + 'duration': 6, + 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?', + 'upload_date': '20220201', + 'artist': 'Pokemon', + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + }, + 'expected_warnings': ['Video not available', 'Creating a generic title'] + }, { # Auto-captions available 'url': 'https://www.tiktok.com/@hankgreen1/video/7047596209028074758', 'only_matching': True @@ -522,6 +551,15 @@ class TikTokUserIE(TikTokBaseIE): }, 'expected_warnings': ['Retrying'] }, { + 'url': 'https://www.tiktok.com/@6820838815978423302', + 'playlist_mincount': 5, + 'info_dict': { + 'id': '6820838815978423302', + 'title': '6820838815978423302', + 'thumbnail': r're:https://.+_1080x1080\.webp' + }, + 'expected_warnings': ['Retrying'] + }, { 'url': 'https://www.tiktok.com/@meme', 'playlist_mincount': 593, 'info_dict': { @@ -593,7 +631,7 @@ class TikTokUserIE(TikTokBaseIE): webpage = self._download_webpage(url, user_name, headers={ 'User-Agent': 'facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)' }) - user_id = self._html_search_regex(r'snssdk\d*://user/profile/(\d+)', webpage, 'user ID') + user_id = self._html_search_regex(r'snssdk\d*://user/profile/(\d+)', webpage, 'user ID', default=None) or user_name videos = LazyList(self._video_entries_api(webpage, user_id, user_name)) thumbnail = traverse_obj(videos, (0, 'author', 'avatar_larger', 'url_list', 0)) diff --git a/yt_dlp/extractor/traileraddict.py b/yt_dlp/extractor/traileraddict.py index 10100fbcf..514f4793e 100644 --- a/yt_dlp/extractor/traileraddict.py +++ b/yt_dlp/extractor/traileraddict.py @@ -24,8 +24,7 @@ class TrailerAddictIE(InfoExtractor): name = mobj.group('movie') + '/' + mobj.group('trailer_name') webpage = self._download_webpage(url, name) - title = self._search_regex(r'<title>(.+?)</title>', - webpage, 'video title').replace(' - Trailer Addict', '') + title = self._html_extract_title(webpage, 'video title').replace(' - Trailer Addict', '') view_count_str = self._search_regex( r'<span class="views_n">([0-9,.]+)</span>', webpage, 'view count', fatal=False) diff --git a/yt_dlp/extractor/varzesh3.py b/yt_dlp/extractor/varzesh3.py index 81313dc9d..32655b96d 100644 --- a/yt_dlp/extractor/varzesh3.py +++ b/yt_dlp/extractor/varzesh3.py @@ -42,8 +42,7 @@ class Varzesh3IE(InfoExtractor): video_url = self._search_regex( r'<source[^>]+src="([^"]+)"', webpage, 'video url') - title = remove_start(self._html_search_regex( - r'<title>([^<]+)</title>', webpage, 'title'), 'ویدیو ورزش 3 | ') + title = 
remove_start(self._html_extract_title(webpage), 'ویدیو ورزش 3 | ') description = self._html_search_regex( r'(?s)<div class="matn">(.+?)</div>', diff --git a/yt_dlp/extractor/viu.py b/yt_dlp/extractor/viu.py index b0a1fca68..ba627ca5b 100644 --- a/yt_dlp/extractor/viu.py +++ b/yt_dlp/extractor/viu.py @@ -329,7 +329,8 @@ class ViuOTTIE(InfoExtractor): if token is not None: query['identity'] = token else: - # preview is limited to 3min for non-members. But we can try to bypass it + # The content is Preview or for VIP only. + # We can try to bypass the duration which is limited to 3mins only duration_limit, query['duration'] = True, '180' try: stream_data = download_playback() @@ -346,13 +347,13 @@ class ViuOTTIE(InfoExtractor): # bypass preview duration limit if duration_limit: - stream_url = urllib.parse.urlparse(stream_url) + old_stream_url = urllib.parse.urlparse(stream_url) + query = dict(urllib.parse.parse_qsl(old_stream_url.query, keep_blank_values=True)) query.update({ 'duration': video_data.get('time_duration') or '9999999', 'duration_start': '0', }) - stream_url = stream_url._replace(query=urllib.parse.urlencode(dict( - urllib.parse.parse_qsl(stream_url.query, keep_blank_values=True)))).geturl() + stream_url = old_stream_url._replace(query=urllib.parse.urlencode(query)).geturl() formats.append({ 'format_id': vid_format, diff --git a/yt_dlp/extractor/vrv.py b/yt_dlp/extractor/vrv.py index 10e6be7ed..00e1006c4 100644 --- a/yt_dlp/extractor/vrv.py +++ b/yt_dlp/extractor/vrv.py @@ -85,7 +85,30 @@ class VRVBaseIE(InfoExtractor): 'resource_key': resource_key, })['__links__']['cms_resource']['href'] - def _initialize_pre_login(self): + def _extract_vrv_formats(self, url, video_id, stream_format, audio_lang, hardsub_lang): + if not url or stream_format not in ('hls', 'dash', 'adaptive_hls'): + return [] + format_id = join_nonempty( + stream_format, + audio_lang and 'audio-%s' % audio_lang, + hardsub_lang and 'hardsub-%s' % hardsub_lang) + if 'hls' in stream_format: + adaptive_formats = self._extract_m3u8_formats( + url, video_id, 'mp4', m3u8_id=format_id, + note='Downloading %s information' % format_id, + fatal=False) + elif stream_format == 'dash': + adaptive_formats = self._extract_mpd_formats( + url, video_id, mpd_id=format_id, + note='Downloading %s information' % format_id, + fatal=False) + if audio_lang: + for f in adaptive_formats: + if f.get('acodec') != 'none': + f['language'] = audio_lang + return adaptive_formats + + def _set_api_params(self): webpage = self._download_webpage( 'https://vrv.co/', None, headers=self.geo_verification_headers()) self._API_PARAMS = self._parse_json(self._search_regex( @@ -133,28 +156,8 @@ class VRVIE(VRVBaseIE): self._TOKEN = token_credentials['oauth_token'] self._TOKEN_SECRET = token_credentials['oauth_token_secret'] - def _extract_vrv_formats(self, url, video_id, stream_format, audio_lang, hardsub_lang): - if not url or stream_format not in ('hls', 'dash', 'adaptive_hls'): - return [] - format_id = join_nonempty( - stream_format, - audio_lang and 'audio-%s' % audio_lang, - hardsub_lang and 'hardsub-%s' % hardsub_lang) - if 'hls' in stream_format: - adaptive_formats = self._extract_m3u8_formats( - url, video_id, 'mp4', m3u8_id=format_id, - note='Downloading %s information' % format_id, - fatal=False) - elif stream_format == 'dash': - adaptive_formats = self._extract_mpd_formats( - url, video_id, mpd_id=format_id, - note='Downloading %s information' % format_id, - fatal=False) - if audio_lang: - for f in adaptive_formats: - if f.get('acodec') != 
'none': - f['language'] = audio_lang - return adaptive_formats + def _initialize_pre_login(self): + return self._set_api_params() def _real_extract(self, url): video_id = self._match_id(url) @@ -249,6 +252,9 @@ class VRVSeriesIE(VRVBaseIE): 'playlist_mincount': 11, } + def _initialize_pre_login(self): + return self._set_api_params() + def _real_extract(self, url): series_id = self._match_id(url) diff --git a/yt_dlp/extractor/vshare.py b/yt_dlp/extractor/vshare.py index c631ac1fa..b4874ac39 100644 --- a/yt_dlp/extractor/vshare.py +++ b/yt_dlp/extractor/vshare.py @@ -50,8 +50,7 @@ class VShareIE(InfoExtractor): 'https://vshare.io/v/%s/width-650/height-430/1' % video_id, video_id, headers={'Referer': url}) - title = self._html_search_regex( - r'<title>([^<]+)</title>', webpage, 'title') + title = self._html_extract_title(webpage) title = title.split(' - ')[0] error = self._html_search_regex( diff --git a/yt_dlp/extractor/vupload.py b/yt_dlp/extractor/vupload.py index 2229a6591..b561f63f7 100644 --- a/yt_dlp/extractor/vupload.py +++ b/yt_dlp/extractor/vupload.py @@ -28,7 +28,7 @@ class VuploadIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title') + title = self._html_extract_title(webpage) video_json = self._parse_json(self._html_search_regex(r'sources:\s*(.+?]),', webpage, 'video'), video_id, transform_source=js_to_json) formats = [] for source in video_json: diff --git a/yt_dlp/extractor/weibo.py b/yt_dlp/extractor/weibo.py index 621df5b54..dafa2af3b 100644 --- a/yt_dlp/extractor/weibo.py +++ b/yt_dlp/extractor/weibo.py @@ -73,8 +73,7 @@ class WeiboIE(InfoExtractor): webpage = self._download_webpage( url, video_id, note='Revisiting webpage') - title = self._html_search_regex( - r'<title>(.+?)</title>', webpage, 'title') + title = self._html_extract_title(webpage) video_formats = compat_parse_qs(self._search_regex( r'video-sources=\\\"(.+?)\"', webpage, 'video_sources')) diff --git a/yt_dlp/extractor/whowatch.py b/yt_dlp/extractor/whowatch.py index f8bc2e73a..e4b610d00 100644 --- a/yt_dlp/extractor/whowatch.py +++ b/yt_dlp/extractor/whowatch.py @@ -5,6 +5,7 @@ from .common import InfoExtractor from ..utils import ( int_or_none, qualities, + try_call, try_get, ExtractorError, ) @@ -26,10 +27,10 @@ class WhoWatchIE(InfoExtractor): metadata = self._download_json('https://api.whowatch.tv/lives/%s' % video_id, video_id) live_data = self._download_json('https://api.whowatch.tv/lives/%s/play' % video_id, video_id) - title = try_get(None, ( - lambda x: live_data['share_info']['live_title'][1:-1], - lambda x: metadata['live']['title'], - ), compat_str) + title = try_call( + lambda: live_data['share_info']['live_title'][1:-1], + lambda: metadata['live']['title'], + expected_type=str) hls_url = live_data.get('hls_url') if not hls_url: diff --git a/yt_dlp/extractor/xnxx.py b/yt_dlp/extractor/xnxx.py index dd4fb54d4..27f991627 100644 --- a/yt_dlp/extractor/xnxx.py +++ b/yt_dlp/extractor/xnxx.py @@ -13,7 +13,7 @@ from ..utils import ( class XNXXIE(InfoExtractor): - _VALID_URL = r'https?://(?:video|www)\.xnxx\.com/video-?(?P<id>[0-9a-z]+)/' + _VALID_URL = r'https?://(?:video|www)\.xnxx3?\.com/video-?(?P<id>[0-9a-z]+)/' _TESTS = [{ 'url': 'http://www.xnxx.com/video-55awb78/skyrim_test_video', 'md5': '7583e96c15c0f21e9da3453d9920fbba', @@ -32,6 +32,9 @@ class XNXXIE(InfoExtractor): }, { 'url': 'http://www.xnxx.com/video-55awb78/', 'only_matching': True, + }, { + 'url': 
'http://www.xnxx3.com/video-55awb78/', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/yahoo.py b/yt_dlp/extractor/yahoo.py index 6cf3b1de2..20504de2c 100644 --- a/yt_dlp/extractor/yahoo.py +++ b/yt_dlp/extractor/yahoo.py @@ -533,7 +533,7 @@ class YahooJapanNewsIE(InfoExtractor): title = self._html_search_meta( ['og:title', 'twitter:title'], webpage, 'title', default=None - ) or self._html_search_regex('<title>([^<]+)</title>', webpage, 'title') + ) or self._html_extract_title(webpage) if display_id == host: # Headline page (w/ multiple BC playlists) ('news.yahoo.co.jp', 'headlines.yahoo.co.jp/videonews/', ...) diff --git a/yt_dlp/extractor/yandexvideo.py b/yt_dlp/extractor/yandexvideo.py index a101af67e..7d3966bf1 100644 --- a/yt_dlp/extractor/yandexvideo.py +++ b/yt_dlp/extractor/yandexvideo.py @@ -163,7 +163,6 @@ class YandexVideoPreviewIE(InfoExtractor): 'thumbnail': 'https://i.mycdn.me/videoPreview?id=544866765315&type=37&idx=13&tkn=TY5qjLYZHxpmcnK8U2LgzYkgmaU&fn=external_8', 'uploader_id': '481054701571', 'title': 'LOFT - summer, summer, summer HD', - 'manifest_stream_number': 0, 'uploader': 'АРТЁМ КУДРОВ', }, }, { # youtube diff --git a/yt_dlp/extractor/youjizz.py b/yt_dlp/extractor/youjizz.py index 5f5fbf21c..111623ffe 100644 --- a/yt_dlp/extractor/youjizz.py +++ b/yt_dlp/extractor/youjizz.py @@ -36,8 +36,7 @@ class YouJizzIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - title = self._html_search_regex( - r'<title>(.+?)</title>', webpage, 'title') + title = self._html_extract_title(webpage) formats = [] diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 19b4985f6..017554c88 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -837,17 +837,20 @@ class YoutubeBaseInfoExtractor(InfoExtractor): uploader = self._get_text(renderer, 'ownerText', 'shortBylineText') channel_id = traverse_obj( - renderer, ('shortBylineText', 'runs', ..., 'navigationEndpoint', 'browseEndpoint', 'browseId'), expected_type=str, get_all=False) + renderer, ('shortBylineText', 'runs', ..., 'navigationEndpoint', 'browseEndpoint', 'browseId'), + expected_type=str, get_all=False) timestamp, time_text = self._extract_time_text(renderer, 'publishedTimeText') scheduled_timestamp = str_to_int(traverse_obj(renderer, ('upcomingEventData', 'startTime'), get_all=False)) overlay_style = traverse_obj( - renderer, ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'style'), get_all=False, expected_type=str) + renderer, ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'style'), + get_all=False, expected_type=str) badges = self._extract_badges(renderer) thumbnails = self._extract_thumbnails(renderer, 'thumbnail') navigation_url = urljoin('https://www.youtube.com/', traverse_obj( - renderer, ('navigationEndpoint', 'commandMetadata', 'webCommandMetadata', 'url'), expected_type=str)) + renderer, ('navigationEndpoint', 'commandMetadata', 'webCommandMetadata', 'url'), + expected_type=str)) or '' url = f'https://www.youtube.com/watch?v={video_id}' - if overlay_style == 'SHORTS' or (navigation_url and '/shorts/' in navigation_url): + if overlay_style == 'SHORTS' or '/shorts/' in navigation_url: url = f'https://www.youtube.com/shorts/{video_id}' return { @@ -862,7 +865,9 @@ class YoutubeBaseInfoExtractor(InfoExtractor): 'uploader': uploader, 'channel_id': channel_id, 'thumbnails': thumbnails, - 'upload_date': strftime_or_none(timestamp, '%Y%m%d') if self._configuration_arg('approximate_date', 
diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py
index 19b4985f6..017554c88 100644
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -837,17 +837,20 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
         uploader = self._get_text(renderer, 'ownerText', 'shortBylineText')
         channel_id = traverse_obj(
-            renderer, ('shortBylineText', 'runs', ..., 'navigationEndpoint', 'browseEndpoint', 'browseId'), expected_type=str, get_all=False)
+            renderer, ('shortBylineText', 'runs', ..., 'navigationEndpoint', 'browseEndpoint', 'browseId'),
+            expected_type=str, get_all=False)
         timestamp, time_text = self._extract_time_text(renderer, 'publishedTimeText')
         scheduled_timestamp = str_to_int(traverse_obj(renderer, ('upcomingEventData', 'startTime'), get_all=False))
         overlay_style = traverse_obj(
-            renderer, ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'style'), get_all=False, expected_type=str)
+            renderer, ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'style'),
+            get_all=False, expected_type=str)
         badges = self._extract_badges(renderer)
         thumbnails = self._extract_thumbnails(renderer, 'thumbnail')
         navigation_url = urljoin('https://www.youtube.com/', traverse_obj(
-            renderer, ('navigationEndpoint', 'commandMetadata', 'webCommandMetadata', 'url'), expected_type=str))
+            renderer, ('navigationEndpoint', 'commandMetadata', 'webCommandMetadata', 'url'),
+            expected_type=str)) or ''
         url = f'https://www.youtube.com/watch?v={video_id}'
-        if overlay_style == 'SHORTS' or (navigation_url and '/shorts/' in navigation_url):
+        if overlay_style == 'SHORTS' or '/shorts/' in navigation_url:
             url = f'https://www.youtube.com/shorts/{video_id}'

         return {
@@ -862,7 +865,9 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
             'uploader': uploader,
             'channel_id': channel_id,
             'thumbnails': thumbnails,
-            'upload_date': strftime_or_none(timestamp, '%Y%m%d') if self._configuration_arg('approximate_date', ie_key='youtubetab') else None,
+            'upload_date': (strftime_or_none(timestamp, '%Y%m%d')
+                            if self._configuration_arg('approximate_date', ie_key='youtubetab')
+                            else None),
             'live_status': ('is_upcoming' if scheduled_timestamp is not None
                             else 'was_live' if 'streamed' in time_text.lower()
                             else 'is_live' if overlay_style is not None and overlay_style == 'LIVE' or 'live now' in badges
@@ -3777,7 +3782,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
     def _extract_basic_item_renderer(item):
         # Modified from _extract_grid_item_renderer
         known_basic_renderers = (
-            'playlistRenderer', 'videoRenderer', 'channelRenderer', 'showRenderer'
+            'playlistRenderer', 'videoRenderer', 'channelRenderer', 'showRenderer', 'reelItemRenderer'
         )
         for key, renderer in item.items():
             if not isinstance(renderer, dict):
@@ -3903,6 +3908,13 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
         if video_id:
             return self._extract_video(video_renderer)

+    def _hashtag_tile_entry(self, hashtag_tile_renderer):
+        url = urljoin('https://youtube.com', traverse_obj(
+            hashtag_tile_renderer, ('onTapCommand', 'commandMetadata', 'webCommandMetadata', 'url')))
+        if url:
+            return self.url_result(
+                url, ie=YoutubeTabIE.ie_key(), title=self._get_text(hashtag_tile_renderer, 'hashtag'))
+
     def _post_thread_entries(self, post_thread_renderer):
         post_renderer = try_get(
             post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict)
@@ -3985,12 +3997,14 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
         known_renderers = {
             'playlistVideoListRenderer': self._playlist_entries,
             'gridRenderer': self._grid_entries,
-            'shelfRenderer': lambda x: self._shelf_entries(x),
+            'reelShelfRenderer': self._grid_entries,
+            'shelfRenderer': self._shelf_entries,
             'musicResponsiveListItemRenderer': lambda x: [self._music_reponsive_list_entry(x)],
             'backstagePostThreadRenderer': self._post_thread_entries,
             'videoRenderer': lambda x: [self._video_entry(x)],
             'playlistRenderer': lambda x: self._grid_entries({'items': [{'playlistRenderer': x}]}),
             'channelRenderer': lambda x: self._grid_entries({'items': [{'channelRenderer': x}]}),
+            'hashtagTileRenderer': lambda x: [self._hashtag_tile_entry(x)]
         }
         for key, renderer in isr_content.items():
             if key not in known_renderers:
@@ -4162,7 +4176,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
             })

         primary_thumbnails = self._extract_thumbnails(
-            primary_sidebar_renderer, ('thumbnailRenderer', 'playlistVideoThumbnailRenderer', 'thumbnail'))
+            primary_sidebar_renderer, ('thumbnailRenderer', ('playlistVideoThumbnailRenderer', 'playlistCustomThumbnailRenderer'), 'thumbnail'))

         if playlist_id is None:
             playlist_id = item_id
@@ -5520,7 +5534,17 @@ class YoutubeSearchURLIE(YoutubeTabBaseInfoExtractor):
             'id': 'python',
             'title': 'python',
         }
-
+    }, {
+        'url': 'https://www.youtube.com/results?search_query=%23cats',
+        'playlist_mincount': 1,
+        'info_dict': {
+            'id': '#cats',
+            'title': '#cats',
+            'entries': [{
+                'url': r're:https://(www\.)?youtube\.com/hashtag/cats',
+                'title': '#cats',
+            }],
+        },
     }, {
         'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
         'only_matching': True,
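One detail worth noting in the first youtube.py hunk: yt-dlp's own `urljoin` (unlike `urllib.parse.urljoin`) yields `None` when the path is missing, so appending `or ''` lets the Shorts check drop its `navigation_url and` guard. A sketch of the pattern, assuming that `None`-returning behaviour:

    from urllib.parse import urljoin

    def urljoin_or_none(base, path):
        # Mimics yt_dlp.utils.urljoin, which yields None for a missing path
        # (stdlib urljoin would return the base URL instead).
        if not isinstance(path, str) or not path:
            return None
        return urljoin(base, path)

    # With `or ''`, the substring test needs no separate truthiness guard:
    navigation_url = urljoin_or_none('https://www.youtube.com/', None) or ''
    assert '/shorts/' not in navigation_url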
diff --git a/yt_dlp/options.py b/yt_dlp/options.py
index 936cc8b6f..c23395671 100644
--- a/yt_dlp/options.py
+++ b/yt_dlp/options.py
@@ -163,6 +163,8 @@ def create_parser():
             values = [process(value)] if delim is None else list(map(process, value.split(delim)[::-1]))
             while values:
                 actual_val = val = values.pop()
+                if not val:
+                    raise optparse.OptionValueError(f'Invalid {option.metavar} for {opt_str}: {value}')
                 if val == 'all':
                     current.update(allowed_values)
                 elif val == '-all':
@@ -1307,7 +1309,7 @@ def create_parser():
         '--audio-format', metavar='FORMAT', dest='audioformat', default='best',
         help=(
             'Specify audio format to convert the audio to when -x is used. Currently supported formats are: '
-            'best (default) or one of %s' % '|'.join(FFmpegExtractAudioPP.SUPPORTED_EXTS)))
+            'best (default) or one of %s' % ', '.join(FFmpegExtractAudioPP.SUPPORTED_EXTS)))
     postproc.add_option(
         '--audio-quality', metavar='QUALITY', dest='audioquality', default='5',
@@ -1319,7 +1321,7 @@ def create_parser():
             'Remux the video into another container if necessary (currently supported: %s). '
             'If target container does not support the video/audio codec, remuxing will fail. '
             'You can specify multiple rules; Eg. "aac>m4a/mov>mp4/mkv" will remux aac to m4a, mov to mp4 '
-            'and anything else to mkv.' % '|'.join(FFmpegVideoRemuxerPP.SUPPORTED_EXTS)))
+            'and anything else to mkv.' % ', '.join(FFmpegVideoRemuxerPP.SUPPORTED_EXTS)))
     postproc.add_option(
         '--recode-video', metavar='FORMAT', dest='recodevideo', default=None,
@@ -1434,7 +1436,7 @@ def create_parser():
             '"multi_video" (default; only when the videos form a single show). '
             'All the video files must have same codecs and number of streams to be concatable. '
             'The "pl_video:" prefix can be used with "--paths" and "--output" to '
-            'set the output filename for the split files. See "OUTPUT TEMPLATE" for details'))
+            'set the output filename for the concatenated files. See "OUTPUT TEMPLATE" for details'))
     postproc.add_option(
         '--fixup', metavar='POLICY', dest='fixup', default=None,
@@ -1482,20 +1484,20 @@ def create_parser():
         help=optparse.SUPPRESS_HELP)
     postproc.add_option(
         '--no-exec-before-download',
-        action='store_const', dest='exec_before_dl_cmd', const=[],
+        action='store_const', dest='exec_before_dl_cmd', const=None,
         help=optparse.SUPPRESS_HELP)
     postproc.add_option(
         '--convert-subs', '--convert-sub', '--convert-subtitles',
         metavar='FORMAT', dest='convertsubtitles', default=None,
         help=(
             'Convert the subtitles to another format (currently supported: %s) '
-            '(Alias: --convert-subtitles)' % '|'.join(FFmpegSubtitlesConvertorPP.SUPPORTED_EXTS)))
+            '(Alias: --convert-subtitles)' % ', '.join(FFmpegSubtitlesConvertorPP.SUPPORTED_EXTS)))
     postproc.add_option(
         '--convert-thumbnails', metavar='FORMAT', dest='convertthumbnails', default=None,
         help=(
             'Convert the thumbnails to another format '
-            '(currently supported: %s) ' % '|'.join(FFmpegThumbnailsConvertorPP.SUPPORTED_EXTS)))
+            '(currently supported: %s) ' % ', '.join(FFmpegThumbnailsConvertorPP.SUPPORTED_EXTS)))
     postproc.add_option(
         '--split-chapters', '--split-tracks',
         dest='split_chapters', action='store_true', default=False,
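The first options.py hunk rejects empty elements in delimited option values instead of silently accepting them. The same guard in a standalone optparse callback (the option name and delimiter here are illustrative, not yt-dlp's real flags):

    import optparse

    def comma_list_callback(option, opt_str, value, parser):
        # Mirrors the guard added in this diff: reject empty elements,
        # e.g. the trailing comma in `--langs en,fr,`.
        values = value.split(',')
        for val in values:
            if not val:
                raise optparse.OptionValueError(
                    f'Invalid {option.metavar} for {opt_str}: {value}')
        setattr(parser.values, option.dest, values)

    parser = optparse.OptionParser()
    parser.add_option('--langs', metavar='LANGS', dest='langs', type='str',
                      action='callback', callback=comma_list_callback)

    opts, _ = parser.parse_args(['--langs', 'en,fr'])
    assert opts.langs == ['en', 'fr']
    # parser.parse_args(['--langs', 'en,,fr'])  # exits: Invalid LANGS for --langs: en,,fr

optparse catches `OptionValueError` during parsing and reports it via `parser.error()`, so the user sees a clean message rather than a traceback.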
diff --git a/yt_dlp/postprocessor/ffmpeg.py b/yt_dlp/postprocessor/ffmpeg.py
index 0b18e8774..643290286 100644
--- a/yt_dlp/postprocessor/ffmpeg.py
+++ b/yt_dlp/postprocessor/ffmpeg.py
@@ -95,7 +95,7 @@ class FFmpegPostProcessor(PostProcessor):
         def get_ffmpeg_version(path, prog):
             if path in self._version_cache:
-                self._versions[path], self._features = self._version_cache[path], self._features_cache.get(path, {})
+                self._versions[prog], self._features = self._version_cache[path], self._features_cache.get(path, {})
                 return
             out = _get_exe_version_output(path, ['-bsfs'], to_screen=self.write_debug)
             ver = detect_exe_version(out) if out else False
@@ -500,6 +500,9 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor):
             temp_path = new_path = prefix + sep + extension

         if new_path == path:
+            if acodec == 'copy':
+                self.to_screen(f'File is already in target format {self._preferredcodec}, skipping')
+                return [], information
             orig_path = prepend_extension(path, 'orig')
             temp_path = prepend_extension(path, 'temp')
         if (self._nopostoverwrites and os.path.exists(encodeFilename(new_path))
@@ -1122,6 +1125,11 @@ class FFmpegConcatPP(FFmpegPostProcessor):
         self._only_multi_video = only_multi_video
         super().__init__(downloader)

+    def _get_codecs(self, file):
+        codecs = traverse_obj(self.get_metadata_object(file), ('streams', ..., 'codec_name'))
+        self.write_debug(f'Codecs = {", ".join(codecs)}')
+        return tuple(codecs)
+
     def concat_files(self, in_files, out_file):
         if not self._downloader._ensure_dir_exists(out_file):
             return
@@ -1131,8 +1139,7 @@ class FFmpegConcatPP(FFmpegPostProcessor):
             os.replace(in_files[0], out_file)
             return []

-        codecs = [traverse_obj(self.get_metadata_object(file), ('streams', ..., 'codec_name')) for file in in_files]
-        if len(set(map(tuple, codecs))) > 1:
+        if len(set(map(self._get_codecs, in_files))) > 1:
             raise PostProcessingError(
                 'The files have different streams/codecs and cannot be concatenated. '
                 'Either select different formats or --recode-video them to a common format')
@@ -1146,7 +1153,7 @@ class FFmpegConcatPP(FFmpegPostProcessor):
         entries = info.get('entries') or []
         if not any(entries) or (self._only_multi_video and info['_type'] != 'multi_video'):
             return [], info
-        elif any(len(entry) > 1 for entry in traverse_obj(entries, (..., 'requested_downloads')) or []):
+        elif traverse_obj(entries, (..., 'requested_downloads', lambda _, v: len(v) > 1)):
             raise PostProcessingError('Concatenation is not supported when downloading multiple separate formats')

         in_files = traverse_obj(entries, (..., 'requested_downloads', 0, 'filepath')) or []
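In the FFmpegConcatPP change, the new `_get_codecs` helper returns a tuple rather than a list: tuples are hashable, so the per-file codec layouts can be collected into a set and compared in one step, and the helper can log each file's codecs while it is at it. A toy illustration with fake ffprobe-style metadata (the `files` dict stands in for the real `get_metadata_object` API):

    # Fake per-file metadata; in the real postprocessor this comes from ffprobe.
    files = {
        'a.mp4': {'streams': [{'codec_name': 'h264'}, {'codec_name': 'aac'}]},
        'b.mp4': {'streams': [{'codec_name': 'h264'}, {'codec_name': 'aac'}]},
        'c.mkv': {'streams': [{'codec_name': 'vp9'}, {'codec_name': 'opus'}]},
    }

    def get_codecs(file):
        # Tuples (unlike lists) are hashable, so the codec layouts can be
        # gathered into a set to test that all inputs match.
        return tuple(s['codec_name'] for s in files[file]['streams'])

    assert len(set(map(get_codecs, ['a.mp4', 'b.mp4']))) == 1   # concatable
    assert len(set(map(get_codecs, ['a.mp4', 'c.mkv']))) > 1    # would raise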
diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py
index 62a1800d4..6663583fc 100644
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@@ -1040,7 +1040,7 @@ def make_HTTPS_handler(params, **kwargs):

 def bug_reports_message(before=';'):
-    msg = ('please report this issue on https://github.com/yt-dlp/yt-dlp , '
+    msg = ('please report this issue on https://github.com/yt-dlp/yt-dlp/issues?q= , '
           'filling out the appropriate issue template. '
           'Confirm you are on the latest version using yt-dlp -U')
@@ -2418,11 +2418,14 @@ def parse_count(s):
         return str_to_int(mobj.group(1))


-def parse_resolution(s):
+def parse_resolution(s, *, lenient=False):
     if s is None:
         return {}

-    mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
+    if lenient:
+        mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
+    else:
+        mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
     if mobj:
         return {
             'width': int(mobj.group('w')),
@@ -2880,6 +2883,7 @@ class PagedList:


 class OnDemandPagedList(PagedList):
+    """Download pages until a page with less than maximum results"""
     def _getslice(self, start, end):
         for pagenum in itertools.count(start // self._pagesize):
             firstid = pagenum * self._pagesize
@@ -2919,6 +2923,7 @@ class OnDemandPagedList(PagedList):


 class InAdvancePagedList(PagedList):
+    """PagedList with total number of pages known in advance"""
     def __init__(self, pagefunc, pagecount, pagesize):
         PagedList.__init__(self, pagefunc, pagesize, True)
         self._pagecount = pagecount
@@ -3087,24 +3092,25 @@ def multipart_encode(data, boundary=None):


 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
-    if isinstance(key_or_keys, (list, tuple)):
-        for key in key_or_keys:
-            if key not in d or d[key] is None or skip_false_values and not d[key]:
-                continue
-            return d[key]
-        return default
-    return d.get(key_or_keys, default)
+    for val in map(d.get, variadic(key_or_keys)):
+        if val is not None and (val or not skip_false_values):
+            return val
+    return default


-def try_get(src, getter, expected_type=None):
-    for get in variadic(getter):
+def try_call(*funcs, expected_type=None, args=[], kwargs={}):
+    for f in funcs:
         try:
-            v = get(src)
-        except (AttributeError, KeyError, TypeError, IndexError):
+            val = f(*args, **kwargs)
+        except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
             pass
         else:
-            if expected_type is None or isinstance(v, expected_type):
-                return v
+            if expected_type is None or isinstance(val, expected_type):
+                return val
+
+
+def try_get(src, getter, expected_type=None):
+    return try_call(*variadic(getter), args=(src,), expected_type=expected_type)


 def filter_dict(dct, cndn=lambda _, v: v is not None):
@@ -3317,6 +3323,10 @@ def error_to_compat_str(err):
     return err_str


+def error_to_str(err):
+    return f'{type(err).__name__}: {err}'
+
+
 def mimetype2ext(mt):
     if mt is None:
         return None
@@ -5148,8 +5158,8 @@ def traverse_obj(
     @param path_list        A list of paths which are checked one by one.
                             Each path is a list of keys where each key is a
                             string, a function, a tuple of strings/None or "...".
-                            When a fuction is given, it takes the key as argument and
-                            returns whether the key matches or not. When a tuple is given,
+                            When a fuction is given, it takes the key and value as arguments
+                            and returns whether the key matches or not. When a tuple is given,
                             all the keys given in the tuple are traversed, and
                             "..." traverses all the keys in the object
                             "None" returns the object without traversal
@@ -5194,7 +5204,7 @@ def traverse_obj(
                 obj = str(obj)
             _current_depth += 1
             depth = max(depth, _current_depth)
-            return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if key(k)]
+            return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
         elif isinstance(obj, dict) and not (is_user_input and key == ':'):
             obj = (obj.get(key)
                    if casesense or (key in obj)
                    else next((v for k, v in obj.items() if _lower(k) == key), None))
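A quick sanity demo of the rewritten utils, using the definitions from this diff (`variadic` is inlined in simplified form to keep the sketch self-contained):

    import re

    def variadic(x):
        # Simplified stand-in for yt_dlp.utils.variadic: wrap bare values.
        return x if isinstance(x, (list, tuple)) else (x,)

    def dict_get(d, key_or_keys, default=None, skip_false_values=True):
        # Rewritten form from this diff: one loop handles single keys and key lists.
        for val in map(d.get, variadic(key_or_keys)):
            if val is not None and (val or not skip_false_values):
                return val
        return default

    def parse_resolution(s, *, lenient=False):
        # As in the diff: the lenient form drops the word-boundary guards,
        # so values embedded in longer tokens still match.
        if s is None:
            return {}
        if lenient:
            mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
        else:
            mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
        return {'width': int(mobj.group('w')), 'height': int(mobj.group('h'))} if mobj else {}

    assert dict_get({'a': '', 'b': 'x'}, ('a', 'b')) == 'x'          # '' is skipped by default
    assert dict_get({'a': ''}, 'a', skip_false_values=False) == ''
    assert parse_resolution('format: 1920x1080p') == {}              # strict: trailing 'p' blocks the match
    assert parse_resolution('format: 1920x1080p', lenient=True) == {'width': 1920, 'height': 1080}

Note also that `try_get` is now a thin wrapper over `try_call`, so both share one exception list, and that `traverse_obj` now invokes key functions through `try_call` with `(key, value)` arguments, which is what makes filter paths like `lambda _, v: len(v) > 1` in the ffmpeg.py hunk work.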