author | Jesús <heckyel@hyperbola.info> | 2022-03-30 01:24:15 +0800 |
---|---|---|
committer | Jesús <heckyel@hyperbola.info> | 2022-03-30 01:24:15 +0800 |
commit | 950cc067b8c41ac246deb4725177a372c95d8341 (patch) | |
tree | 66d5284ff86faec8c3000be1e7d0bd856b4c4dbe | |
parent | 7a74bc5d1e54299e51b73492e09c70da994f4b35 (diff) | |
parent | e7870111e83033e0ac728d5a2d565d1eb146c335 (diff) | |
download | hypervideo-pre-950cc067b8c41ac246deb4725177a372c95d8341.tar.lz hypervideo-pre-950cc067b8c41ac246deb4725177a372c95d8341.tar.xz hypervideo-pre-950cc067b8c41ac246deb4725177a372c95d8341.zip |
updated from upstream | 30/03/2022 at 01:24
32 files changed, 1229 insertions, 386 deletions
diff --git a/devscripts/make_supportedsites.py b/devscripts/make_supportedsites.py
index 4c11e25f2..729f60a0e 100644
--- a/devscripts/make_supportedsites.py
+++ b/devscripts/make_supportedsites.py
@@ -24,10 +24,9 @@ def main():
     def gen_ies_md(ies):
         for ie in ies:
             ie_md = '**{0}**'.format(ie.IE_NAME)
-            ie_desc = getattr(ie, 'IE_DESC', None)
-            if ie_desc is False:
+            if ie.IE_DESC is False:
                 continue
-            if ie_desc is not None:
+            if ie.IE_DESC is not None:
                 ie_md += ': {0}'.format(ie.IE_DESC)
             search_key = getattr(ie, 'SEARCH_KEY', None)
             if search_key is not None:
@@ -74,7 +74,7 @@ def version_to_list(version):


 def dependency_options():
-    dependencies = [pycryptodome_module(), 'mutagen', 'brotli'] + collect_submodules('websockets')
+    dependencies = [pycryptodome_module(), 'mutagen', 'brotli', 'certifi'] + collect_submodules('websockets')
     excluded_modules = ['test', 'ytdlp_plugins', 'youtube-dl', 'youtube-dlc']

     yield from (f'--hidden-import={module}' for module in dependencies)
diff --git a/requirements.txt b/requirements.txt
index 7818aca78..b65d25456 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,4 +2,5 @@ mutagen
 pycryptodome
 websockets
 brotli; platform_python_implementation=='CPython'
-brotlicffi; platform_python_implementation!='CPython'
\ No newline at end of file
+brotlicffi; platform_python_implementation!='CPython'
+certifi
\ No newline at end of file
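Since certifi is now an optional dependency, the simplest check that it is being picked up is the `has_certifi` flag this commit imports in `yt_dlp/YoutubeDL.py` (a minimal sketch, assuming a checkout of this repository on the path):

```python
# Sketch: confirm the optional certifi dependency is detected.
# has_certifi is truthy when `import certifi` succeeded inside yt_dlp.utils.
from yt_dlp.utils import has_certifi

print('certifi available:', bool(has_certifi))
```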
diff --git a/test/helper.py b/test/helper.py
index 1070e0668..28c21b2eb 100644
--- a/test/helper.py
+++ b/test/helper.py
@@ -196,15 +196,7 @@ def expect_dict(self, got_dict, expected_dict):

 def sanitize_got_info_dict(got_dict):
     IGNORED_FIELDS = (
-        # Format keys
-        'url', 'manifest_url', 'format', 'format_id', 'format_note', 'width', 'height', 'resolution',
-        'dynamic_range', 'tbr', 'abr', 'acodec', 'asr', 'vbr', 'fps', 'vcodec', 'container', 'filesize',
-        'filesize_approx', 'player_url', 'protocol', 'fragment_base_url', 'fragments', 'preference',
-        'language', 'language_preference', 'quality', 'source_preference', 'http_headers',
-        'stretched_ratio', 'no_resume', 'has_drm', 'downloader_options',
-
-        # RTMP formats
-        'page_url', 'app', 'play_path', 'tc_url', 'flash_version', 'rtmp_live', 'rtmp_conn', 'rtmp_protocol', 'rtmp_real_time',
+        *YoutubeDL._format_fields,

         # Lists
         'formats', 'thumbnails', 'subtitles', 'automatic_captions', 'comments', 'entries',
diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py
index 7637297be..c9108c5b6 100644
--- a/test/test_YoutubeDL.py
+++ b/test/test_YoutubeDL.py
@@ -818,6 +818,8 @@ class TestYoutubeDL(unittest.TestCase):
         test('%(id&foo)s.bar', 'foo.bar')
         test('%(title&foo)s.bar', 'NA.bar')
         test('%(title&foo|baz)s.bar', 'baz.bar')
+        test('%(x,id&foo|baz)s.bar', 'foo.bar')
+        test('%(x,title&foo|baz)s.bar', 'baz.bar')

         # Laziness
         def gen():
@@ -931,7 +933,7 @@ class TestYoutubeDL(unittest.TestCase):
         res = get_videos()
         self.assertEqual(res, ['1', '2'])

-        def f(v):
+        def f(v, incomplete):
             if v['id'] == '1':
                 return None
             else:
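The two new template tests pin down how an alternate field combines with `&` replacement and `|` default. A minimal sketch of that behaviour (the field names are arbitrary):

```python
import yt_dlp

# '%(x,id&foo|baz)s': 'x' is missing, the alternate 'id' exists, so the
# '&foo' replacement applies; if neither field existed, '|baz' would be used.
ydl = yt_dlp.YoutubeDL({'outtmpl': '%(x,id&foo|baz)s.bar'})
print(ydl.prepare_filename({'id': 'video1', 'ext': 'mp4'}))  # foo.bar
```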
diff --git a/test/test_utils.py b/test/test_utils.py
index a7f1b0e94..31f168998 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -160,10 +160,12 @@ class TestUtil(unittest.TestCase):
             sanitize_filename('New World record at 0:12:34'),
             'New World record at 0_12_34')

-        self.assertEqual(sanitize_filename('--gasdgf'), '_-gasdgf')
+        self.assertEqual(sanitize_filename('--gasdgf'), '--gasdgf')
         self.assertEqual(sanitize_filename('--gasdgf', is_id=True), '--gasdgf')
-        self.assertEqual(sanitize_filename('.gasdgf'), 'gasdgf')
+        self.assertEqual(sanitize_filename('--gasdgf', is_id=False), '_-gasdgf')
+        self.assertEqual(sanitize_filename('.gasdgf'), '.gasdgf')
         self.assertEqual(sanitize_filename('.gasdgf', is_id=True), '.gasdgf')
+        self.assertEqual(sanitize_filename('.gasdgf', is_id=False), 'gasdgf')

         forbidden = '"\0\\/'
         for fc in forbidden:
@@ -625,6 +627,8 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(parse_duration('3h 11m 53s'), 11513)
         self.assertEqual(parse_duration('3 hours 11 minutes 53 seconds'), 11513)
         self.assertEqual(parse_duration('3 hours 11 mins 53 secs'), 11513)
+        self.assertEqual(parse_duration('3 hours, 11 minutes, 53 seconds'), 11513)
+        self.assertEqual(parse_duration('3 hours, 11 mins, 53 secs'), 11513)
         self.assertEqual(parse_duration('62m45s'), 3765)
         self.assertEqual(parse_duration('6m59s'), 419)
         self.assertEqual(parse_duration('49s'), 49)
diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py
index a5c7348b2..e57716e00 100644
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@@ -65,6 +65,7 @@ from .utils import (
     ExistingVideoReached,
     expand_path,
     ExtractorError,
+    filter_dict,
     float_or_none,
     format_bytes,
     format_field,
@@ -72,6 +73,7 @@ from .utils import (
     formatSeconds,
     GeoRestrictedError,
     get_domain,
+    has_certifi,
     HEADRequest,
     InAdvancePagedList,
     int_or_none,
@@ -86,6 +88,7 @@ from .utils import (
     MaxDownloadsReached,
     merge_headers,
     network_exceptions,
+    NO_DEFAULT,
     number_of_digits,
     orderedSet,
     OUTTMPL_TYPES,
@@ -511,6 +514,16 @@ class YoutubeDL(object):
         'track_number', 'disc_number', 'release_year',
     ))

+    _format_fields = {
+        # NB: Keep in sync with the docstring of extractor/common.py
+        'url', 'manifest_url', 'ext', 'format', 'format_id', 'format_note',
+        'width', 'height', 'resolution', 'dynamic_range', 'tbr', 'abr', 'acodec', 'asr',
+        'vbr', 'fps', 'vcodec', 'container', 'filesize', 'filesize_approx',
+        'player_url', 'protocol', 'fragment_base_url', 'fragments', 'is_from_start',
+        'preference', 'language', 'language_preference', 'quality', 'source_preference',
+        'http_headers', 'stretched_ratio', 'no_resume', 'has_drm', 'downloader_options',
+        'page_url', 'app', 'play_path', 'tc_url', 'flash_version', 'rtmp_live', 'rtmp_conn', 'rtmp_protocol', 'rtmp_real_time'
+    }
     _format_selection_exts = {
         'audio': {'m4a', 'mp3', 'ogg', 'aac'},
         'video': {'mp4', 'flv', 'webm', '3gp'},
@@ -786,7 +799,7 @@ class YoutubeDL(object):
     def to_stdout(self, message, skip_eol=False, quiet=None):
         """Print message to stdout"""
         if quiet is not None:
-            self.deprecation_warning('"ydl.to_stdout" no longer accepts the argument quiet. Use "ydl.to_screen" instead')
+            self.deprecation_warning('"YoutubeDL.to_stdout" no longer accepts the argument quiet. Use "YoutubeDL.to_screen" instead')
         self._write_string(
             '%s%s' % (self._bidi_workaround(message), ('' if skip_eol else '\n')),
             self._out_files['print'])
@@ -1087,10 +1100,11 @@ class YoutubeDL(object):
             (?P<fields>{field})
             (?P<maths>(?:{math_op}{math_field})*)
             (?:>(?P<strf_format>.+?))?
-            (?P<alternate>(?<!\\),[^|&)]+)?
-            (?:&(?P<replacement>.*?))?
-            (?:\|(?P<default>.*?))?
-            $'''.format(field=FIELD_RE, math_op=MATH_OPERATORS_RE, math_field=MATH_FIELD_RE))
+            (?P<remaining>
+                (?P<alternate>(?<!\\),[^|&)]+)?
+                (?:&(?P<replacement>.*?))?
+                (?:\|(?P<default>.*?))?
+            )$'''.format(field=FIELD_RE, math_op=MATH_OPERATORS_RE, math_field=MATH_FIELD_RE))

         def _traverse_infodict(k):
             k = k.split('.')
@@ -1137,8 +1151,10 @@ class YoutubeDL(object):
         na = self.params.get('outtmpl_na_placeholder', 'NA')

         def filename_sanitizer(key, value, restricted=self.params.get('restrictfilenames')):
-            return sanitize_filename(str(value), restricted=restricted,
-                                     is_id=re.search(r'(^|[_.])id(\.|$)', key))
+            return sanitize_filename(str(value), restricted=restricted, is_id=(
+                bool(re.search(r'(^|[_.])id(\.|$)', key))
+                if 'filename-sanitization' in self.params.get('compat_opts', [])
+                else NO_DEFAULT))

         sanitizer = sanitize if callable(sanitize) else filename_sanitizer
         sanitize = bool(sanitize)
@@ -1161,7 +1177,7 @@ class YoutubeDL(object):
                 value = get_value(mobj)
                 replacement = mobj['replacement']
                 if value is None and mobj['alternate']:
-                    mobj = re.match(INTERNAL_FORMAT_RE, mobj['alternate'][1:])
+                    mobj = re.match(INTERNAL_FORMAT_RE, mobj['remaining'][1:])
                 else:
                     break

@@ -1558,13 +1574,9 @@ class YoutubeDL(object):
         if not info:
             return info

-        force_properties = dict(
-            (k, v) for k, v in ie_result.items() if v is not None)
-        for f in ('_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'):
-            if f in force_properties:
-                del force_properties[f]
         new_result = info.copy()
-        new_result.update(force_properties)
+        new_result.update(filter_dict(ie_result, lambda k, v: (
+            v is not None and k not in {'_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'})))

         # Extracted info may not be a video result (i.e.
         # info.get('_type', 'video') != video) but rather an url or
@@ -1802,7 +1814,7 @@ class YoutubeDL(object):
         ie_result['entries'] = playlist_results

         # Write the updated info to json
-        if _infojson_written and self._write_info_json(
+        if _infojson_written is True and self._write_info_json(
                 'updated playlist', ie_result,
                 self.prepare_filename(ie_copy, 'pl_infojson'), overwrite=True) is None:
             return
@@ -2443,6 +2455,11 @@ class YoutubeDL(object):
         info_dict['__has_drm'] = any(f.get('has_drm') for f in formats)
         if not self.params.get('allow_unplayable_formats'):
             formats = [f for f in formats if not f.get('has_drm')]
+            if info_dict['__has_drm'] and all(
+                    f.get('acodec') == f.get('vcodec') == 'none' for f in formats):
+                self.report_warning(
+                    'This video is DRM protected and only images are available for download. '
+                    'Use --list-formats to see them')

         get_from_start = not info_dict.get('is_live') or bool(self.params.get('live_from_start'))
         if not get_from_start:
@@ -2539,7 +2556,7 @@ class YoutubeDL(object):

         info_dict, _ = self.pre_process(info_dict)

-        if self._match_entry(info_dict) is not None:
+        if self._match_entry(info_dict, incomplete=self._format_fields) is not None:
             return info_dict

         self.post_extract(info_dict)
@@ -2615,8 +2632,9 @@ class YoutubeDL(object):

         if not formats_to_download:
             if not self.params.get('ignore_no_formats_error'):
-                raise ExtractorError('Requested format is not available', expected=True,
-                                     video_id=info_dict['id'], ie=info_dict['extractor'])
+                raise ExtractorError(
+                    'Requested format is not available. Use --list-formats for a list of available formats',
+                    expected=True, video_id=info_dict['id'], ie=info_dict['extractor'])
             self.report_warning('Requested format is not available')
             # Process what we can, even without any available formats.
             formats_to_download = [{}]
@@ -3675,6 +3693,7 @@ class YoutubeDL(object):

         lib_str = join_nonempty(
             compat_brotli and compat_brotli.__name__,
+            has_certifi and 'certifi',
             compat_pycrypto_AES and compat_pycrypto_AES.__name__.split('.')[0],
             SECRETSTORAGE_AVAILABLE and 'secretstorage',
             has_mutagen and 'mutagen',
@@ -3766,7 +3785,7 @@ class YoutubeDL(object):
         return encoding

     def _write_info_json(self, label, ie_result, infofn, overwrite=None):
-        ''' Write infojson and returns True = written, False = skip, None = error '''
+        ''' Write infojson and returns True = written, 'exists' = Already exists, False = skip, None = error '''
         if overwrite is None:
             overwrite = self.params.get('overwrites', True)
         if not self.params.get('writeinfojson'):
@@ -3778,14 +3797,15 @@ class YoutubeDL(object):
             return None
         elif not overwrite and os.path.exists(infofn):
             self.to_screen(f'[info] {label.title()} metadata is already present')
-        else:
-            self.to_screen(f'[info] Writing {label} metadata as JSON to: {infofn}')
-            try:
-                write_json_file(self.sanitize_info(ie_result, self.params.get('clean_infojson', True)), infofn)
-            except (OSError, IOError):
-                self.report_error(f'Cannot write {label} metadata to JSON file {infofn}')
-                return None
-        return True
+            return 'exists'
+
+        self.to_screen(f'[info] Writing {label} metadata as JSON to: {infofn}')
+        try:
+            write_json_file(self.sanitize_info(ie_result, self.params.get('clean_infojson', True)), infofn)
+            return True
+        except (OSError, IOError):
+            self.report_error(f'Cannot write {label} metadata to JSON file {infofn}')
+            return None

     def _write_description(self, label, ie_result, descfn):
         ''' Write description and returns True = written, False = skip, None = error '''
@@ -3856,9 +3876,12 @@ class YoutubeDL(object):
                     sub_info['filepath'] = sub_filename
                     ret.append((sub_filename, sub_filename_final))
                 except (DownloadError, ExtractorError, IOError, OSError, ValueError) + network_exceptions as err:
+                    msg = f'Unable to download video subtitles for {sub_lang!r}: {err}'
                     if self.params.get('ignoreerrors') is not True:  # False or 'only_download'
-                        raise DownloadError(f'Unable to download video subtitles for {sub_lang!r}: {err}', err)
-                    self.report_warning(f'Unable to download video subtitles for {sub_lang!r}: {err}')
+                        if not self.params.get('ignoreerrors'):
+                            self.report_error(msg)
+                        raise DownloadError(msg)
+                    self.report_warning(msg)
         return ret

     def _write_thumbnails(self, label, info_dict, filename, thumb_filename_base=None):
diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py
index 6c6ac7adf..6d5a64336 100644
--- a/yt_dlp/__init__.py
+++ b/yt_dlp/__init__.py
@@ -93,9 +93,9 @@ def print_extractor_information(opts, urls):
         for ie in list_extractors(opts.age_limit):
             if not ie.working():
                 continue
-            desc = getattr(ie, 'IE_DESC', ie.IE_NAME)
-            if desc is False:
+            if ie.IE_DESC is False:
                 continue
+            desc = ie.IE_DESC or ie.IE_NAME
             if getattr(ie, 'SEARCH_KEY', None) is not None:
                 _SEARCHES = ('cute kittens', 'slithering pythons', 'falling cat', 'angry poodle', 'purple fish', 'running tortoise', 'sleeping bunny', 'burping cow')
                 _COUNTS = ('', '5', '10', 'all')
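Taken together, the `test_utils.py` and `YoutubeDL.py` changes above make the stricter ID-style sanitization opt-in via the `filename-sanitization` compat option; by default `sanitize_filename` no longer mangles a leading `-` or `.`. A sketch mirroring the updated tests:

```python
from yt_dlp.utils import parse_duration, sanitize_filename

# New default (is_id=NO_DEFAULT) keeps leading '-' / '.';
# is_id=False restores the old, stricter behaviour.
assert sanitize_filename('--gasdgf') == '--gasdgf'
assert sanitize_filename('--gasdgf', is_id=False) == '_-gasdgf'
assert sanitize_filename('.gasdgf') == '.gasdgf'
assert sanitize_filename('.gasdgf', is_id=False) == 'gasdgf'

# parse_duration now also accepts comma-separated components.
assert parse_duration('3 hours, 11 minutes, 53 seconds') == 11513
```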
diff --git a/yt_dlp/downloader/external.py b/yt_dlp/downloader/external.py
index be6202eef..71af705ea 100644
--- a/yt_dlp/downloader/external.py
+++ b/yt_dlp/downloader/external.py
@@ -13,6 +13,7 @@ from ..compat import (
 )
 from ..postprocessor.ffmpeg import FFmpegPostProcessor, EXT_TO_OUT_FORMATS
 from ..utils import (
+    classproperty,
     cli_option,
     cli_valueless_option,
     cli_bool_option,
@@ -73,17 +74,23 @@ class ExternalFD(FragmentFD):
     def get_basename(cls):
         return cls.__name__[:-2].lower()

+    @classproperty
+    def EXE_NAME(cls):
+        return cls.get_basename()
+
     @property
     def exe(self):
-        return self.get_basename()
+        return self.EXE_NAME

     @classmethod
     def available(cls, path=None):
-        path = check_executable(path or cls.get_basename(), [cls.AVAILABLE_OPT])
-        if path:
-            cls.exe = path
-            return path
-        return False
+        path = check_executable(
+            cls.EXE_NAME if path in (None, cls.get_basename()) else path,
+            [cls.AVAILABLE_OPT])
+        if not path:
+            return False
+        cls.exe = path
+        return path

     @classmethod
     def supports(cls, info_dict):
@@ -106,7 +113,7 @@ class ExternalFD(FragmentFD):

     def _configuration_args(self, keys=None, *args, **kwargs):
         return _configuration_args(
-            self.get_basename(), self.params.get('external_downloader_args'), self.get_basename(),
+            self.get_basename(), self.params.get('external_downloader_args'), self.EXE_NAME,
             keys, *args, **kwargs)

     def _call_downloader(self, tmpfilename, info_dict):
@@ -169,7 +176,7 @@ class CurlFD(ExternalFD):
     AVAILABLE_OPT = '-V'

     def _make_cmd(self, tmpfilename, info_dict):
-        cmd = [self.exe, '--location', '-o', tmpfilename]
+        cmd = [self.exe, '--location', '-o', tmpfilename, '--compressed']
         if info_dict.get('http_headers') is not None:
             for key, val in info_dict['http_headers'].items():
                 cmd += ['--header', '%s: %s' % (key, val)]
@@ -219,7 +226,7 @@ class WgetFD(ExternalFD):
     AVAILABLE_OPT = '--version'

     def _make_cmd(self, tmpfilename, info_dict):
-        cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies']
+        cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies', '--compression=auto']
         if info_dict.get('http_headers') is not None:
             for key, val in info_dict['http_headers'].items():
                 cmd += ['--header', '%s: %s' % (key, val)]
@@ -230,7 +237,10 @@ class WgetFD(ExternalFD):
                 retry[1] = '0'
             cmd += retry
         cmd += self._option('--bind-address', 'source_address')
-        cmd += self._option('--proxy', 'proxy')
+        proxy = self.params.get('proxy')
+        if proxy:
+            for var in ('http_proxy', 'https_proxy'):
+                cmd += ['--execute', '%s=%s' % (var, proxy)]
         cmd += self._valueless_option('--no-check-certificate', 'nocheckcertificate')
         cmd += self._configuration_args()
         cmd += ['--', info_dict['url']]
@@ -303,10 +313,7 @@ class Aria2cFD(ExternalFD):

 class HttpieFD(ExternalFD):
     AVAILABLE_OPT = '--version'
-
-    @classmethod
-    def available(cls, path=None):
-        return super().available(path or 'http')
+    EXE_NAME = 'http'

     def _make_cmd(self, tmpfilename, info_dict):
         cmd = ['http', '--download', '--output', tmpfilename, info_dict['url']]
@@ -507,11 +514,13 @@ class AVconvFD(FFmpegFD):
     pass


-_BY_NAME = dict(
-    (klass.get_basename(), klass)
+_BY_NAME = {
+    klass.get_basename(): klass
     for name, klass in globals().items()
     if name.endswith('FD') and name not in ('ExternalFD', 'FragmentFD')
-)
+}
+
+_BY_EXE = {klass.EXE_NAME: klass for klass in _BY_NAME.values()}


 def list_external_downloaders():
@@ -523,4 +532,4 @@ def get_external_downloader(external_downloader):
         downloader . """
     # Drop .exe extension on Windows
     bn = os.path.splitext(os.path.basename(external_downloader))[0]
-    return _BY_NAME.get(bn)
+    return _BY_NAME.get(bn, _BY_EXE.get(bn))
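With `EXE_NAME` and the new `_BY_EXE` table, an external downloader can be looked up by either its yt-dlp name or its executable name; httpie is the one bundled case where the two differ. A minimal sketch:

```python
from yt_dlp.downloader.external import get_external_downloader

# 'httpie' is the class basename; 'http' is its EXE_NAME - both resolve now.
print(get_external_downloader('httpie'))  # HttpieFD
print(get_external_downloader('http'))    # HttpieFD, found via _BY_EXE
```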
""" # Drop .exe extension on Windows bn = os.path.splitext(os.path.basename(external_downloader))[0] - return _BY_NAME.get(bn) + return _BY_NAME.get(bn, _BY_EXE.get(bn)) diff --git a/yt_dlp/extractor/adobepass.py b/yt_dlp/extractor/adobepass.py index f0eba8844..5d98301b8 100644 --- a/yt_dlp/extractor/adobepass.py +++ b/yt_dlp/extractor/adobepass.py @@ -1650,21 +1650,27 @@ class AdobePassIE(InfoExtractor): hidden_data = self._hidden_inputs(first_bookend_page) hidden_data['history_val'] = 1 - provider_login_redirect_page = self._download_webpage( + provider_login_redirect_page_res = self._download_webpage_handle( urlh.geturl(), video_id, 'Sending First Bookend', query=hidden_data) - provider_tryauth_url = self._html_search_regex( - r'url:\s*[\'"]([^\'"]+)', provider_login_redirect_page, 'ajaxurl') + provider_login_redirect_page, urlh = provider_login_redirect_page_res - provider_tryauth_page = self._download_webpage( - provider_tryauth_url, video_id, 'Submitting TryAuth', - query=hidden_data) + # Some website partners seem to not have the extra ajaxurl redirect step, so we check if we already + # have the login prompt or not + if 'id="password" type="password" name="password"' in provider_login_redirect_page: + provider_login_page_res = provider_login_redirect_page_res + else: + provider_tryauth_url = self._html_search_regex( + r'url:\s*[\'"]([^\'"]+)', provider_login_redirect_page, 'ajaxurl') + provider_tryauth_page = self._download_webpage( + provider_tryauth_url, video_id, 'Submitting TryAuth', + query=hidden_data) - provider_login_page_res = self._download_webpage_handle( - f'https://authorize.suddenlink.net/saml/module.php/authSynacor/login.php?AuthState={provider_tryauth_page}', - video_id, 'Getting Login Page', - query=hidden_data) + provider_login_page_res = self._download_webpage_handle( + f'https://authorize.suddenlink.net/saml/module.php/authSynacor/login.php?AuthState={provider_tryauth_page}', + video_id, 'Getting Login Page', + query=hidden_data) provider_association_redirect, urlh = post_form( provider_login_page_res, 'Logging in', { diff --git a/yt_dlp/extractor/banbye.py b/yt_dlp/extractor/banbye.py new file mode 100644 index 000000000..3d4d36ec3 --- /dev/null +++ b/yt_dlp/extractor/banbye.py @@ -0,0 +1,153 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import math + +from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse_urlparse, + compat_parse_qs, +) +from ..utils import ( + format_field, + InAdvancePagedList, + traverse_obj, + unified_timestamp, +) + + +class BanByeBaseIE(InfoExtractor): + _API_BASE = 'https://api.banbye.com' + _CDN_BASE = 'https://cdn.banbye.com' + _VIDEO_BASE = 'https://banbye.com/watch' + + @staticmethod + def _extract_playlist_id(url, param='playlist'): + return compat_parse_qs( + compat_urllib_parse_urlparse(url).query).get(param, [None])[0] + + def _extract_playlist(self, playlist_id): + data = self._download_json(f'{self._API_BASE}/playlists/{playlist_id}', playlist_id) + return self.playlist_result([ + self.url_result(f'{self._VIDEO_BASE}/{video_id}', BanByeIE) + for video_id in data['videoIds']], playlist_id, data.get('name')) + + +class BanByeIE(BanByeBaseIE): + _VALID_URL = r'https?://(?:www\.)?banbye.com/(?:en/)?watch/(?P<id>\w+)' + _TESTS = [{ + 'url': 'https://banbye.com/watch/v_ytfmvkVYLE8T', + 'md5': '2f4ea15c5ca259a73d909b2cfd558eb5', + 'info_dict': { + 'id': 'v_ytfmvkVYLE8T', + 'ext': 'mp4', + 'title': 'md5:5ec098f88a0d796f987648de6322ba0f', + 'description': 
diff --git a/yt_dlp/extractor/banbye.py b/yt_dlp/extractor/banbye.py
new file mode 100644
index 000000000..3d4d36ec3
--- /dev/null
+++ b/yt_dlp/extractor/banbye.py
@@ -0,0 +1,153 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import math
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_urllib_parse_urlparse,
+    compat_parse_qs,
+)
+from ..utils import (
+    format_field,
+    InAdvancePagedList,
+    traverse_obj,
+    unified_timestamp,
+)
+
+
+class BanByeBaseIE(InfoExtractor):
+    _API_BASE = 'https://api.banbye.com'
+    _CDN_BASE = 'https://cdn.banbye.com'
+    _VIDEO_BASE = 'https://banbye.com/watch'
+
+    @staticmethod
+    def _extract_playlist_id(url, param='playlist'):
+        return compat_parse_qs(
+            compat_urllib_parse_urlparse(url).query).get(param, [None])[0]
+
+    def _extract_playlist(self, playlist_id):
+        data = self._download_json(f'{self._API_BASE}/playlists/{playlist_id}', playlist_id)
+        return self.playlist_result([
+            self.url_result(f'{self._VIDEO_BASE}/{video_id}', BanByeIE)
+            for video_id in data['videoIds']], playlist_id, data.get('name'))
+
+
+class BanByeIE(BanByeBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?banbye.com/(?:en/)?watch/(?P<id>\w+)'
+    _TESTS = [{
+        'url': 'https://banbye.com/watch/v_ytfmvkVYLE8T',
+        'md5': '2f4ea15c5ca259a73d909b2cfd558eb5',
+        'info_dict': {
+            'id': 'v_ytfmvkVYLE8T',
+            'ext': 'mp4',
+            'title': 'md5:5ec098f88a0d796f987648de6322ba0f',
+            'description': 'md5:4d94836e73396bc18ef1fa0f43e5a63a',
+            'uploader': 'wRealu24',
+            'channel_id': 'ch_wrealu24',
+            'channel_url': 'https://banbye.com/channel/ch_wrealu24',
+            'timestamp': 1647604800,
+            'upload_date': '20220318',
+            'duration': 1931,
+            'thumbnail': r're:https?://.*\.webp',
+            'tags': 'count:5',
+            'like_count': int,
+            'dislike_count': int,
+            'view_count': int,
+            'comment_count': int,
+        },
+    }, {
+        'url': 'https://banbye.com/watch/v_2JjQtqjKUE_F?playlistId=p_Ld82N6gBw_OJ',
+        'info_dict': {
+            'title': 'Krzysztof Karoń',
+            'id': 'p_Ld82N6gBw_OJ',
+        },
+        'playlist_count': 9,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        playlist_id = self._extract_playlist_id(url, 'playlistId')
+
+        if self._yes_playlist(playlist_id, video_id):
+            return self._extract_playlist(playlist_id)
+
+        data = self._download_json(f'{self._API_BASE}/videos/{video_id}', video_id)
+        thumbnails = [{
+            'id': f'{quality}p',
+            'url': f'{self._CDN_BASE}/video/{video_id}/{quality}.webp',
+        } for quality in [48, 96, 144, 240, 512, 1080]]
+        formats = [{
+            'format_id': f'http-{quality}p',
+            'quality': quality,
+            'url': f'{self._CDN_BASE}/video/{video_id}/{quality}.mp4',
+        } for quality in data['quality']]
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': data.get('title'),
+            'description': data.get('desc'),
+            'uploader': traverse_obj(data, ('channel', 'name')),
+            'channel_id': data.get('channelId'),
+            'channel_url': format_field(data, 'channelId', 'https://banbye.com/channel/%s'),
+            'timestamp': unified_timestamp(data.get('publishedAt')),
+            'duration': data.get('duration'),
+            'tags': data.get('tags'),
+            'formats': formats,
+            'thumbnails': thumbnails,
+            'like_count': data.get('likes'),
+            'dislike_count': data.get('dislikes'),
+            'view_count': data.get('views'),
+            'comment_count': data.get('commentCount'),
+        }
+
+
+class BanByeChannelIE(BanByeBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?banbye.com/(?:en/)?channel/(?P<id>\w+)'
+    _TESTS = [{
+        'url': 'https://banbye.com/channel/ch_wrealu24',
+        'info_dict': {
+            'title': 'wRealu24',
+            'id': 'ch_wrealu24',
+            'description': 'md5:da54e48416b74dfdde20a04867c0c2f6',
+        },
+        'playlist_mincount': 791,
+    }, {
+        'url': 'https://banbye.com/channel/ch_wrealu24?playlist=p_Ld82N6gBw_OJ',
+        'info_dict': {
+            'title': 'Krzysztof Karoń',
+            'id': 'p_Ld82N6gBw_OJ',
+        },
+        'playlist_count': 9,
+    }]
+    _PAGE_SIZE = 100
+
+    def _real_extract(self, url):
+        channel_id = self._match_id(url)
+        playlist_id = self._extract_playlist_id(url)
+
+        if playlist_id:
+            return self._extract_playlist(playlist_id)
+
+        def page_func(page_num):
+            data = self._download_json(f'{self._API_BASE}/videos', channel_id, query={
+                'channelId': channel_id,
+                'sort': 'new',
+                'limit': self._PAGE_SIZE,
+                'offset': page_num * self._PAGE_SIZE,
+            }, note=f'Downloading page {page_num+1}')
+            return [
+                self.url_result(f"{self._VIDEO_BASE}/{video['_id']}", BanByeIE)
+                for video in data['items']
+            ]
+
+        channel_data = self._download_json(f'{self._API_BASE}/channels/{channel_id}', channel_id)
+        entries = InAdvancePagedList(
+            page_func,
+            math.ceil(channel_data['videoCount'] / self._PAGE_SIZE),
+            self._PAGE_SIZE)
+
+        return self.playlist_result(
+            entries, channel_id, channel_data.get('name'), channel_data.get('description'))
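A quick smoke test of the new BanBye extractor (a sketch; the URL comes from the extractor's own test data and may stop working if the video is removed):

```python
import yt_dlp

# Metadata-only run against the BanBye test video.
with yt_dlp.YoutubeDL({'quiet': True}) as ydl:
    info = ydl.extract_info('https://banbye.com/watch/v_ytfmvkVYLE8T', download=False)
    print(info['id'], info.get('uploader'))  # v_ytfmvkVYLE8T wRealu24
```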
diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py
index b4eb20642..dd1ff512e 100644
--- a/yt_dlp/extractor/bilibili.py
+++ b/yt_dlp/extractor/bilibili.py
@@ -15,6 +15,7 @@ from ..compat import (
 )
 from ..utils import (
     ExtractorError,
+    filter_dict,
     int_or_none,
     float_or_none,
     mimetype2ext,
@@ -755,15 +756,21 @@ class BiliIntlBaseIE(InfoExtractor):
             for i, line in enumerate(json['body']) if line.get('content'))
         return data

-    def _get_subtitles(self, ep_id):
-        sub_json = self._call_api(f'/web/v2/subtitle?episode_id={ep_id}&platform=web', ep_id)
+    def _get_subtitles(self, *, ep_id=None, aid=None):
+        sub_json = self._call_api(
+            '/web/v2/subtitle', ep_id or aid, note='Downloading subtitles list',
+            errnote='Unable to download subtitles list', query=filter_dict({
+                'platform': 'web',
+                'episode_id': ep_id,
+                'aid': aid,
+            }))
         subtitles = {}
         for sub in sub_json.get('subtitles') or []:
             sub_url = sub.get('url')
             if not sub_url:
                 continue
             sub_data = self._download_json(
-                sub_url, ep_id, errnote='Unable to download subtitles', fatal=False,
+                sub_url, ep_id or aid, errnote='Unable to download subtitles', fatal=False,
                 note='Downloading subtitles%s' % f' for {sub["lang"]}' if sub.get('lang') else '')
             if not sub_data:
                 continue
@@ -773,9 +780,14 @@ class BiliIntlBaseIE(InfoExtractor):
             })
         return subtitles

-    def _get_formats(self, ep_id):
-        video_json = self._call_api(f'/web/playurl?ep_id={ep_id}&platform=web', ep_id,
-                                    note='Downloading video formats', errnote='Unable to download video formats')
+    def _get_formats(self, *, ep_id=None, aid=None):
+        video_json = self._call_api(
+            '/web/playurl', ep_id or aid, note='Downloading video formats',
+            errnote='Unable to download video formats', query=filter_dict({
+                'platform': 'web',
+                'ep_id': ep_id,
+                'aid': aid,
+            }))
         video_json = video_json['playurl']
         formats = []
         for vid in video_json.get('video') or []:
@@ -809,15 +821,15 @@ class BiliIntlBaseIE(InfoExtractor):
         self._sort_formats(formats)
         return formats

-    def _extract_ep_info(self, episode_data, ep_id):
+    def _extract_video_info(self, video_data, *, ep_id=None, aid=None):
         return {
-            'id': ep_id,
-            'title': episode_data.get('title_display') or episode_data['title'],
-            'thumbnail': episode_data.get('cover'),
+            'id': ep_id or aid,
+            'title': video_data.get('title_display') or video_data.get('title'),
+            'thumbnail': video_data.get('cover'),
             'episode_number': int_or_none(self._search_regex(
-                r'^E(\d+)(?:$| - )', episode_data.get('title_display'), 'episode number', default=None)),
-            'formats': self._get_formats(ep_id),
-            'subtitles': self._get_subtitles(ep_id),
+                r'^E(\d+)(?:$| - )', video_data.get('title_display') or '', 'episode number', default=None)),
+            'formats': self._get_formats(ep_id=ep_id, aid=aid),
+            'subtitles': self._get_subtitles(ep_id=ep_id, aid=aid),
             'extractor_key': BiliIntlIE.ie_key(),
         }

@@ -854,7 +866,7 @@ class BiliIntlBaseIE(InfoExtractor):


 class BiliIntlIE(BiliIntlBaseIE):
-    _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-z]{2}/)?play/(?P<season_id>\d+)/(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-z]{2}/)?(play/(?P<season_id>\d+)/(?P<ep_id>\d+)|video/(?P<aid>\d+))'
     _TESTS = [{
         # Bstation page
         'url': 'https://www.bilibili.tv/en/play/34613/341736',
@@ -889,24 +901,35 @@ class BiliIntlIE(BiliIntlBaseIE):
     }, {
         'url': 'https://www.biliintl.com/en/play/34613/341736',
         'only_matching': True,
+    }, {
+        # User-generated content (as opposed to a series licensed from a studio)
+        'url': 'https://bilibili.tv/en/video/2019955076',
+        'only_matching': True,
+    }, {
+        # No language in URL
+        'url': 'https://www.bilibili.tv/video/2019955076',
+        'only_matching': True,
     }]

     def _real_extract(self, url):
-        season_id, video_id = self._match_valid_url(url).groups()
+        season_id, ep_id, aid = self._match_valid_url(url).group('season_id', 'ep_id', 'aid')
+        video_id = ep_id or aid
         webpage = self._download_webpage(url, video_id)
         # Bstation layout
         initial_data = self._parse_json(self._search_regex(
-            r'window\.__INITIAL_DATA__\s*=\s*({.+?});', webpage,
+            r'window\.__INITIAL_(?:DATA|STATE)__\s*=\s*({.+?});', webpage,
             'preload state', default='{}'), video_id, fatal=False) or {}
-        episode_data = traverse_obj(initial_data, ('OgvVideo', 'epDetail'), expected_type=dict)
+        video_data = (
+            traverse_obj(initial_data, ('OgvVideo', 'epDetail'), expected_type=dict)
+            or traverse_obj(initial_data, ('UgcVideo', 'videoData'), expected_type=dict) or {})

-        if not episode_data:
+        if season_id and not video_data:
             # Non-Bstation layout, read through episode list
             season_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={season_id}&platform=web', video_id)
-            episode_data = next(
+            video_data = next(
                 episode for episode in traverse_obj(season_json, ('sections', ..., 'episodes', ...), expected_type=dict)
-                if str(episode.get('episode_id')) == video_id)
-        return self._extract_ep_info(episode_data, video_id)
+                if str(episode.get('episode_id')) == ep_id)
+        return self._extract_video_info(video_data, ep_id=ep_id, aid=aid)


 class BiliIntlSeriesIE(BiliIntlBaseIE):
@@ -934,7 +957,7 @@ class BiliIntlSeriesIE(BiliIntlBaseIE):
         series_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={series_id}&platform=web', series_id)
         for episode in traverse_obj(series_json, ('sections', ..., 'episodes', ...), expected_type=dict, default=[]):
             episode_id = str(episode.get('episode_id'))
-            yield self._extract_ep_info(episode, episode_id)
+            yield self._extract_video_info(episode, ep_id=episode_id)

     def _real_extract(self, url):
         series_id = self._match_id(url)
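`filter_dict` is the small utility this commit adopts across `YoutubeDL.py`, `bilibili.py`, `common.py` and `rai.py`; with its default predicate it simply drops `None` values, replacing the hand-rolled `dict((k, v) ... if v is not None)` pattern. A sketch of both call styles seen in the diff:

```python
from yt_dlp.utils import filter_dict

# Default predicate: drop None values (used for the bilibili API query dicts).
print(filter_dict({'platform': 'web', 'episode_id': None, 'aid': 12345}))
# {'platform': 'web', 'aid': 12345}

# Explicit predicate, as in YoutubeDL._extract_info's ie_result merge.
print(filter_dict({'id': '1', 'title': None, 'uploader': 'x'},
                  lambda k, v: v is not None and k != 'id'))
# {'uploader': 'x'}
```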
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index f3ae3fd4c..d0e57da23 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -49,6 +49,7 @@ from ..utils import (
     error_to_compat_str,
     extract_attributes,
     ExtractorError,
+    filter_dict,
     fix_xml_ampersands,
     float_or_none,
     format_field,
@@ -248,14 +249,14 @@ class InfoExtractor(object):
     license:        License name the video is licensed under.
     creator:        The creator of the video.
     timestamp:      UNIX timestamp of the moment the video was uploaded
-    upload_date:    Video upload date (YYYYMMDD).
+    upload_date:    Video upload date in UTC (YYYYMMDD).
                     If not explicitly set, calculated from timestamp
     release_timestamp: UNIX timestamp of the moment the video was released.
                     If it is not clear whether to use timestamp or this, use the former
-    release_date:   The date (YYYYMMDD) when the video was released.
+    release_date:   The date (YYYYMMDD) when the video was released in UTC.
                     If not explicitly set, calculated from release_timestamp
     modified_timestamp: UNIX timestamp of the moment the video was last modified.
-    modified_date:  The date (YYYYMMDD) when the video was last modified.
+    modified_date:  The date (YYYYMMDD) when the video was last modified in UTC.
                     If not explicitly set, calculated from modified_timestamp
     uploader_id:    Nickname or id of the video uploader.
     uploader_url:   Full URL to a personal webpage of the video uploader.
@@ -469,6 +470,7 @@ class InfoExtractor(object):
     _GEO_IP_BLOCKS = None
     _WORKING = True
     _NETRC_MACHINE = None
+    IE_DESC = None

     _LOGIN_HINTS = {
         'any': 'Use --cookies, --cookies-from-browser, --username and --password, or --netrc to provide account credentials',
@@ -1033,7 +1035,7 @@ class InfoExtractor(object):
         if transform_source:
             json_string = transform_source(json_string)
         try:
-            return json.loads(json_string)
+            return json.loads(json_string, strict=False)
         except ValueError as ve:
             errmsg = '%s: Failed to parse JSON ' % video_id
             if fatal:
@@ -1587,7 +1589,7 @@ class InfoExtractor(object):
                 break
             traverse_json_ld(json_ld)

-        return dict((k, v) for k, v in info.items() if v is not None)
+        return filter_dict(info)

     def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
         return self._parse_json(
diff --git a/yt_dlp/extractor/ellentube.py b/yt_dlp/extractor/ellentube.py
index 544473274..d451bc048 100644
--- a/yt_dlp/extractor/ellentube.py
+++ b/yt_dlp/extractor/ellentube.py
@@ -26,7 +26,7 @@ class EllenTubeBaseIE(InfoExtractor):
         duration = None
         for entry in data.get('media'):
             if entry.get('id') == 'm3u8':
-                formats = self._extract_m3u8_formats(
+                formats, subtitles = self._extract_m3u8_formats_and_subtitles(
                     entry['url'], video_id, 'mp4',
                     entry_protocol='m3u8_native', m3u8_id='hls')
                 duration = int_or_none(entry.get('duration'))
@@ -48,6 +48,7 @@ class EllenTubeBaseIE(InfoExtractor):
             'view_count': get_insight('view'),
             'like_count': get_insight('like'),
             'formats': formats,
+            'subtitles': subtitles,
         }
diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py
index 4eda27cdc..e5ae12a7d 100644
--- a/yt_dlp/extractor/extractors.py
+++ b/yt_dlp/extractor/extractors.py
@@ -122,6 +122,10 @@ from .awaan import (
 )
 from .azmedien import AZMedienIE
 from .baidu import BaiduVideoIE
+from .banbye import (
+    BanByeIE,
+    BanByeChannelIE,
+)
 from .bandaichannel import BandaiChannelIE
 from .bandcamp import (
     BandcampIE,
@@ -674,6 +678,12 @@ from .iqiyi import (
     IqIE,
     IqAlbumIE
 )
+
+from .itprotv import (
+    ITProTVIE,
+    ITProTVCourseIE
+)
+
 from .itv import (
     ITVIE,
     ITVBTCCIE,
@@ -731,6 +741,11 @@ from .laola1tv import (
     EHFTVIE,
     ITTFIE,
 )
+from .lastfm import (
+    LastFMIE,
+    LastFMPlaylistIE,
+    LastFMUserIE,
+)
 from .lbry import (
     LBRYIE,
     LBRYChannelIE,
@@ -1962,6 +1977,11 @@ from .washingtonpost import (
     WashingtonPostIE,
     WashingtonPostArticleIE,
 )
+from .wasdtv import (
+    WASDTVStreamIE,
+    WASDTVRecordIE,
+    WASDTVClipIE,
+)
 from .wat import WatIE
 from .watchbox import WatchBoxIE
 from .watchindianporn import WatchIndianPornIE
diff --git a/yt_dlp/extractor/fc2.py b/yt_dlp/extractor/fc2.py
index 7fc6b0e3d..54a83aa16 100644
--- a/yt_dlp/extractor/fc2.py
+++ b/yt_dlp/extractor/fc2.py
@@ -212,7 +212,6 @@ class FC2LiveIE(InfoExtractor):
             'Accept': '*/*',
             'User-Agent': std_headers['User-Agent'],
         })
-        ws.__enter__()

         self.write_debug('[debug] Sending HLS server request')
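A sanity check that the four extractor families registered above are importable (a sketch against the registry module):

```python
import yt_dlp.extractor.extractors as extractors

# The new BanBye, ITProTV, Last.fm and WASD.TV extractors are now registered.
print(all(hasattr(extractors, name) for name in (
    'BanByeIE', 'ITProTVIE', 'LastFMIE', 'WASDTVStreamIE')))  # True
```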
diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py
index 97e34808f..4a2e30158 100644
--- a/yt_dlp/extractor/generic.py
+++ b/yt_dlp/extractor/generic.py
@@ -17,6 +17,7 @@ from ..compat import (
 )
 from ..utils import (
     determine_ext,
+    dict_get,
     ExtractorError,
     float_or_none,
     HEADRequest,
@@ -31,6 +32,7 @@ from ..utils import (
     parse_resolution,
     sanitized_Request,
     smuggle_url,
+    str_or_none,
     unescapeHTML,
     unified_timestamp,
     unsmuggle_url,
@@ -3778,11 +3780,12 @@ class GenericIE(InfoExtractor):

         # Video.js embed
         mobj = re.search(
-            r'(?s)\bvideojs\s*\(.+?\.src\s*\(\s*((?:\[.+?\]|{.+?}))\s*\)\s*;',
+            r'(?s)\bvideojs\s*\(.+?([a-zA-Z0-9_$]+)\.src\s*\(\s*((?:\[.+?\]|{.+?}))\s*\)\s*;',
             webpage)
         if mobj is not None:
+            varname = mobj.group(1)
             sources = self._parse_json(
-                mobj.group(1), video_id, transform_source=js_to_json,
+                mobj.group(2), video_id, transform_source=js_to_json,
                 fatal=False) or []
             if not isinstance(sources, list):
                 sources = [sources]
@@ -3819,6 +3822,21 @@ class GenericIE(InfoExtractor):
                             'Referer': full_response.geturl(),
                         },
                     })
+            # https://docs.videojs.com/player#addRemoteTextTrack
+            # https://html.spec.whatwg.org/multipage/media.html#htmltrackelement
+            for sub_match in re.finditer(rf'(?s){re.escape(varname)}' r'\.addRemoteTextTrack\(({.+?})\s*,\s*(?:true|false)\)', webpage):
+                sub = self._parse_json(
+                    sub_match.group(1), video_id, transform_source=js_to_json, fatal=False) or {}
+                src = str_or_none(sub.get('src'))
+                if not src:
+                    continue
+                subtitles.setdefault(dict_get(sub, ('language', 'srclang')) or 'und', []).append({
+                    'url': compat_urlparse.urljoin(url, src),
+                    'name': sub.get('label'),
+                    'http_headers': {
+                        'Referer': full_response.geturl(),
+                    },
+                })
             if formats or subtitles:
                 self.report_detected('video.js embed')
                 self._sort_formats(formats)
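To see what the new `addRemoteTextTrack` scan picks up, the same two patterns from the diff can be run stand-alone against a constructed page snippet (the markup below is hypothetical):

```python
import re

webpage = '''videojs("demo"); player.src({src: "v.m3u8"});
player.addRemoteTextTrack({"src": "/subs/en.vtt", "language": "en", "label": "English"}, false)'''

mobj = re.search(r'(?s)\bvideojs\s*\(.+?([a-zA-Z0-9_$]+)\.src\s*\(\s*((?:\[.+?\]|{.+?}))\s*\)\s*;', webpage)
varname = mobj.group(1)  # 'player'
for sub_match in re.finditer(
        rf'(?s){re.escape(varname)}' r'\.addRemoteTextTrack\(({.+?})\s*,\s*(?:true|false)\)', webpage):
    print(sub_match.group(1))  # the track's options object, fed to _parse_json
```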
diff --git a/yt_dlp/extractor/go.py b/yt_dlp/extractor/go.py
index 2ccc6df21..f92e16600 100644
--- a/yt_dlp/extractor/go.py
+++ b/yt_dlp/extractor/go.py
@@ -217,6 +217,7 @@ class GoIE(AdobePassIE):
         title = video_data['title']

         formats = []
+        subtitles = {}
         for asset in video_data.get('assets', {}).get('asset', []):
             asset_url = asset.get('value')
             if not asset_url:
@@ -256,8 +257,10 @@ class GoIE(AdobePassIE):
                         error_message = ', '.join([error['message'] for error in errors])
                     raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message), expected=True)
                 asset_url += '?' + entitlement['uplynkData']['sessionKey']
-                formats.extend(self._extract_m3u8_formats(
-                    asset_url, video_id, 'mp4', m3u8_id=format_id or 'hls', fatal=False))
+                fmts, subs = self._extract_m3u8_formats_and_subtitles(
+                    asset_url, video_id, 'mp4', m3u8_id=format_id or 'hls', fatal=False)
+                formats.extend(fmts)
+                self._merge_subtitles(subs, target=subtitles)
             else:
                 f = {
                     'format_id': format_id,
@@ -281,7 +284,6 @@ class GoIE(AdobePassIE):
                 formats.append(f)
         self._sort_formats(formats)

-        subtitles = {}
         for cc in video_data.get('closedcaption', {}).get('src', []):
             cc_url = cc.get('value')
             if not cc_url:
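The Go change folds HLS subtitles from every asset into one dict via `_merge_subtitles(..., target=...)`. A stand-alone sketch of that merge (the subtitle dicts here are made up):

```python
from yt_dlp.extractor.common import InfoExtractor

subtitles = {'en': [{'url': 'a.vtt'}]}
# Merge a per-asset subtitle dict into the shared target, as the GoIE loop now does.
InfoExtractor._merge_subtitles(
    {'en': [{'url': 'b.vtt'}], 'de': [{'url': 'c.vtt'}]}, target=subtitles)
print(subtitles)  # {'en': [{'url': 'a.vtt'}, {'url': 'b.vtt'}], 'de': [{'url': 'c.vtt'}]}
```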
diff --git a/yt_dlp/extractor/itprotv.py b/yt_dlp/extractor/itprotv.py
new file mode 100644
index 000000000..64cb4e69a
--- /dev/null
+++ b/yt_dlp/extractor/itprotv.py
@@ -0,0 +1,141 @@
+# coding: utf-8
+
+import re
+
+from .common import InfoExtractor
+
+from ..utils import (
+    int_or_none,
+    str_or_none,
+    traverse_obj,
+    urljoin
+)
+
+
+class ITProTVBaseIE(InfoExtractor):
+    _ENDPOINTS = {
+        'course': 'course?url={}&brand=00002560-0000-3fa9-0000-1d61000035f3',
+        'episode': 'brand/00002560-0000-3fa9-0000-1d61000035f3/episode?url={}'
+    }
+
+    def _call_api(self, ep, item_id, webpage):
+        return self._download_json(
+            f'https://api.itpro.tv/api/urza/v3/consumer-web/{self._ENDPOINTS[ep].format(item_id)}',
+            item_id, note=f'Fetching {ep} data API',
+            headers={'Authorization': f'Bearer {self._fetch_jwt(webpage)}'})[ep]
+
+    def _fetch_jwt(self, webpage):
+        return self._search_regex(r'{"passedToken":"([\w-]+\.[\w-]+\.[\w-]+)",', webpage, 'jwt')
+
+    def _check_if_logged_in(self, webpage):
+        if re.match(r'{\s*member\s*:\s*null', webpage):
+            self.raise_login_required()
+
+
+class ITProTVIE(ITProTVBaseIE):
+    _VALID_URL = r'https://app.itpro.tv/course/(?P<course>[\w-]+)/(?P<id>[\w-]+)'
+    _TESTS = [{
+        'url': 'https://app.itpro.tv/course/guided-tour/introductionitprotv',
+        'md5': 'bca4a28c2667fd1a63052e71a94bb88c',
+        'info_dict': {
+            'id': 'introductionitprotv',
+            'ext': 'mp4',
+            'title': 'An Introduction to ITProTV 101',
+            'thumbnail': 'https://itprotv-image-bucket.s3.amazonaws.com/getting-started/itprotv-101-introduction-PGM.11_39_56_02.Still001.png',
+            'description': 'md5:b175c2c3061ce35a4dd33865b2c1da4e',
+            'duration': 269,
+            'series': 'ITProTV 101',
+            'series_id': 'guided-tour',
+            'availability': 'needs_auth',
+            'chapter': 'ITProTV 101',
+            'chapter_number': 1,
+            'chapter_id': '5dbb3de426b46c0010b5d1b6'
+        },
+    }, {
+        'url': 'https://app.itpro.tv/course/beyond-tech/job-interview-tips',
+        'md5': '101a299b98c47ccf4c67f9f0951defa8',
+        'info_dict': {
+            'id': 'job-interview-tips',
+            'ext': 'mp4',
+            'title': 'Job Interview Tips',
+            'thumbnail': 'https://s3.amazonaws.com:443/production-itprotv-thumbnails/2f370bf5-294d-4bbe-ab80-c0b5781630ea.png',
+            'description': 'md5:30d8ba483febdf89ec85623aad3c3cb6',
+            'duration': 267,
+            'series': 'Beyond Tech',
+            'series_id': 'beyond-tech',
+            'availability': 'needs_auth',
+            'chapter': 'Job Development',
+            'chapter_number': 2,
+            'chapter_id': '5f7c78d424330c000edf04d9'
+        },
+    }]
+
+    def _real_extract(self, url):
+        episode_id, course_name = self._match_valid_url(url).group('id', 'course')
+        webpage = self._download_webpage(url, episode_id)
+        self._check_if_logged_in(webpage)
+        course = self._call_api('course', course_name, webpage)
+        episode = self._call_api('episode', episode_id, webpage)
+
+        chapter_number, chapter = next((
+            (i, topic) for i, topic in enumerate(course.get('topics') or [], 1)
+            if traverse_obj(topic, 'id') == episode.get('topic')), {})
+
+        return {
+            'id': episode_id,
+            'title': episode.get('title'),
+            'description': episode.get('description'),
+            'thumbnail': episode.get('thumbnail'),
+            'formats': [
+                {'url': episode[f'jwVideo{h}Embed'], 'height': h}
+                for h in (320, 480, 720, 1080) if episode.get(f'jwVideo{h}Embed')
+            ],
+            'duration': int_or_none(episode.get('length')),
+            'series': course.get('name'),
+            'series_id': course.get('url'),
+            'chapter': str_or_none(chapter.get('title')),
+            'chapter_number': chapter_number,
+            'chapter_id': str_or_none(chapter.get('id')),
+            'subtitles': {
+                'en': [{'ext': 'vtt', 'data': episode['enCaptionData']}]
+            } if episode.get('enCaptionData') else None,
+        }
+
+
+class ITProTVCourseIE(ITProTVBaseIE):
+    _VALID_URL = r'https?://app.itpro.tv/course/(?P<id>[\w-]+)/?(?:$|[#?])'
+    _TESTS = [
+        {
+            'url': 'https://app.itpro.tv/course/guided-tour',
+            'info_dict': {
+                'id': 'guided-tour',
+                'description': 'md5:b175c2c3061ce35a4dd33865b2c1da4e',
+                'title': 'ITProTV 101',
+            },
+            'playlist_count': 6
+        },
+        {
+            'url': 'https://app.itpro.tv/course/beyond-tech',
+            'info_dict': {
+                'id': 'beyond-tech',
+                'description': 'md5:44cd99855e7f81a15ce1269bd0621fed',
+                'title': 'Beyond Tech'
+            },
+            'playlist_count': 15
+        },
+    ]
+
+    def _real_extract(self, url):
+        course_id = self._match_id(url)
+        webpage = self._download_webpage(url, course_id)
+        self._check_if_logged_in(webpage)
+        course = self._call_api('course', course_id, webpage)
+
+        entries = [self.url_result(
+            urljoin(url, f'{course_id}/{episode["url"]}'), ITProTVIE,
+            episode['url'], episode.get('title'), url_transparent=True)
+            for episode in course['episodes']]
+
+        return self.playlist_result(
+            entries, course_id, course.get('name'), course.get('description'))
diff --git a/yt_dlp/extractor/lastfm.py b/yt_dlp/extractor/lastfm.py
new file mode 100644
index 000000000..5215717e8
--- /dev/null
+++ b/yt_dlp/extractor/lastfm.py
@@ -0,0 +1,129 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import int_or_none, format_field
+
+
+class LastFMPlaylistBaseIE(InfoExtractor):
+    def _entries(self, url, playlist_id):
+        webpage = self._download_webpage(url, playlist_id)
+        start_page_number = int_or_none(self._search_regex(
+            r'\bpage=(\d+)', url, 'page', default=None)) or 1
+        last_page_number = int_or_none(self._search_regex(
+            r'>(\d+)</a>[^<]*</li>[^<]*<li[^>]+class="pagination-next', webpage, 'last_page', default=None))
+
+        for page_number in range(start_page_number, (last_page_number or start_page_number) + 1):
+            webpage = self._download_webpage(
+                url, playlist_id,
+                note='Downloading page %d%s' % (page_number, format_field(last_page_number, template=' of %d')),
+                query={'page': page_number})
+            page_entries = [
+                self.url_result(player_url, 'Youtube')
+                for player_url in set(re.findall(r'data-youtube-url="([^"]+)"', webpage))
+            ]
+
+            for e in page_entries:
+                yield e
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+        return self.playlist_result(self._entries(url, playlist_id), playlist_id)
+
+
+class LastFMPlaylistIE(LastFMPlaylistBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?last\.fm/(music|tag)/(?P<id>[^/]+)(?:/[^/]+)?/?(?:[?#]|$)'
+    _TESTS = [{
+        'url': 'https://www.last.fm/music/Oasis/(What%27s+the+Story)+Morning+Glory%3F',
+        'info_dict': {
+            'id': 'Oasis',
+        },
+        'playlist_count': 11,
+    }, {
+        'url': 'https://www.last.fm/music/Oasis',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.last.fm/music/Oasis/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.last.fm/music/Oasis?top_tracks_date_preset=ALL#top-tracks',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.last.fm/music/Oasis/+tracks',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.last.fm/music/Oasis/+tracks?page=2',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.last.fm/music/Oasis/+tracks?date_preset=LAST_90_DAYS#top-tracks',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.last.fm/tag/rock',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.last.fm/tag/rock/tracks',
+        'only_matching': True,
+    }]
+
+
+class LastFMUserIE(LastFMPlaylistBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?last\.fm/user/[^/]+/playlists/(?P<id>[^/#?]+)'
+    _TESTS = [{
+        'url': 'https://www.last.fm/user/mehq/playlists/12319471',
+        'info_dict': {
+            'id': '12319471',
+        },
+        'playlist_count': 30,
+    }]
+
+
+class LastFMIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?last\.fm/music(?:/[^/]+){2}/(?P<id>[^/#?]+)'
+    _TESTS = [{
+        'url': 'https://www.last.fm/music/Oasis/_/Wonderwall',
+        'md5': '9c4a70c2e84c03d54fe24229b9e13b7b',
+        'info_dict': {
+            'id': '6hzrDeceEKc',
+            'ext': 'mp4',
+            'title': 'Oasis - Wonderwall (Official Video)',
+            'thumbnail': r're:^https?://i.ytimg.com/.*\.jpg$',
+            'description': 'md5:0848669853c10687cc28e88b5756738f',
+            'uploader': 'Oasis',
+            'uploader_id': 'oasisinetofficial',
+            'upload_date': '20080207',
+            'album': '(What\'s The Story) Morning Glory? (Remastered)',
+            'track': 'Wonderwall (Remastered)',
+            'channel_id': 'UCUDVBtnOQi4c7E8jebpjc9Q',
+            'view_count': int,
+            'live_status': 'not_live',
+            'channel_url': 'https://www.youtube.com/channel/UCUDVBtnOQi4c7E8jebpjc9Q',
+            'tags': 'count:39',
+            'creator': 'Oasis',
+            'uploader_url': 're:^https?://www.youtube.com/user/oasisinetofficial',
+            'duration': 279,
+            'alt_title': 'Wonderwall (Remastered)',
+            'age_limit': 0,
+            'channel': 'Oasis',
+            'channel_follower_count': int,
+            'categories': ['Music'],
+            'availability': 'public',
+            'like_count': int,
+            'playable_in_embed': True,
+            'artist': 'Oasis',
+        },
+        'add_ie': ['Youtube'],
+    }, {
+        'url': 'https://www.last.fm/music/Oasis/_/Don%27t+Look+Back+In+Anger+-+Remastered/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.last.fm/music/Guns+N%27+Roses/_/Sweet+Child+o%27+Mine',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        player_url = self._search_regex(r'(?s)class="header-new-playlink"\s+href="([^"]+)"', webpage, 'player_url')
+        return self.url_result(player_url, 'Youtube')
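Last.fm track pages only wrap a YouTube player, so the new LastFMIE resolves to a YouTube result. A sketch using the extractor's test URL (network access required; expected values per the test data):

```python
import yt_dlp

with yt_dlp.YoutubeDL({'quiet': True}) as ydl:
    info = ydl.extract_info('https://www.last.fm/music/Oasis/_/Wonderwall', download=False)
    print(info['id'], info['title'])  # 6hzrDeceEKc Oasis - Wonderwall (Official Video)
```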
diff --git a/yt_dlp/extractor/rai.py b/yt_dlp/extractor/rai.py
index 34f127285..9d243b2be 100644
--- a/yt_dlp/extractor/rai.py
+++ b/yt_dlp/extractor/rai.py
@@ -11,6 +11,7 @@ from ..compat import (
 from ..utils import (
     determine_ext,
     ExtractorError,
+    filter_dict,
     find_xpath_attr,
     fix_xml_ampersands,
     GeoRestrictedError,
@@ -110,11 +111,11 @@ class RaiBaseIE(InfoExtractor):
         if not audio_only:
             formats.extend(self._create_http_urls(relinker_url, formats))

-        return dict((k, v) for k, v in {
+        return filter_dict({
             'is_live': is_live,
             'duration': duration,
             'formats': formats,
-        }.items() if v is not None)
+        })

     def _create_http_urls(self, relinker_url, fmts):
         _RELINKER_REG = r'https?://(?P<host>[^/]+?)/(?:i/)?(?P<extra>[^/]+?)/(?P<path>.+?)/(?P<id>\d+)(?:_(?P<quality>[\d\,]+))?(?:\.mp4|/playlist\.m3u8).+?'
diff --git a/yt_dlp/extractor/tver.py b/yt_dlp/extractor/tver.py
index b8ac41483..9ff3136e2 100644
--- a/yt_dlp/extractor/tver.py
+++ b/yt_dlp/extractor/tver.py
@@ -14,7 +14,7 @@ from ..utils import (


 class TVerIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?tver\.jp/(?P<path>(?:corner|episode|feature)/(?P<id>f?\d+))'
+    _VALID_URL = r'https?://(?:www\.)?tver\.jp/(?P<path>corner|episode|feature|lp|tokyo2020/video)/(?P<id>[fc]?\d+)'
     # videos are only available for 7 days
     _TESTS = [{
         'url': 'https://tver.jp/corner/f0062178',
@@ -29,6 +29,15 @@ class TVerIE(InfoExtractor):
         # subtitle = ' '
         'url': 'https://tver.jp/corner/f0068870',
         'only_matching': True,
+    }, {
+        'url': 'https://tver.jp/lp/f0009694',
+        'only_matching': True,
+    }, {
+        'url': 'https://tver.jp/lp/c0000239',
+        'only_matching': True,
+    }, {
+        'url': 'https://tver.jp/tokyo2020/video/6264525510001',
+        'only_matching': True,
     }]
     _TOKEN = None
     BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s'
@@ -39,9 +48,11 @@ class TVerIE(InfoExtractor):

     def _real_extract(self, url):
         path, video_id = self._match_valid_url(url).groups()
-        api_response = self._download_json(
-            'https://api.tver.jp/v4/' + path, video_id,
-            query={'token': self._TOKEN})
+        if path == 'lp':
+            webpage = self._download_webpage(url, video_id)
+            redirect_path = self._search_regex(r'to_href="([^"]+)', webpage, 'redirect path')
+            path, video_id = self._match_valid_url(f'https://tver.jp{redirect_path}').groups()
+        api_response = self._download_json(f'https://api.tver.jp/v4/{path}/{video_id}', video_id, query={'token': self._TOKEN})
         p_id = traverse_obj(api_response, ('main', 'publisher_id'))
         if not p_id:
             error_msg, expected = traverse_obj(api_response, ('episode', 0, 'textbar', 0, ('text', 'longer')), get_all=False), True
diff --git a/yt_dlp/extractor/viki.py b/yt_dlp/extractor/viki.py
index 8234ba7df..8a930798d 100644
--- a/yt_dlp/extractor/viki.py
+++ b/yt_dlp/extractor/viki.py
@@ -261,7 +261,7 @@ class VikiIE(VikiBaseIE):
             mpd_content = self._download_webpage(mpd_url, video_id, note='Downloading initial MPD manifest')
             mpd_url = self._search_regex(
                 r'(?mi)<BaseURL>(http.+.mpd)', mpd_content, 'new manifest', default=mpd_url)
-            if 'mpdhd_high' not in mpd_url:
+            if 'mpdhd_high' not in mpd_url and 'sig=' not in mpd_url:
                 # Modify the URL to get 1080p
                 mpd_url = mpd_url.replace('mpdhd', 'mpdhd_high')
             formats = self._extract_mpd_formats(mpd_url, video_id)
'best[protocol=https]',
@@ -364,7 +369,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
        },
        {
            'url': 'http://player.vimeo.com/video/54469442',
-           'md5': '619b811a4417aa4abe78dc653becf511',
+           'md5': 'b3e7f4d2cbb53bd7dc3bb6ff4ed5cfbd',
            'note': 'Videos that embed the url in the player page',
            'info_dict': {
                'id': '54469442',
@@ -375,6 +380,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
                'uploader_id': 'businessofsoftware',
                'duration': 3610,
                'description': None,
+               'thumbnail': 'https://i.vimeocdn.com/video/376682406-f34043e7b766af6bef2af81366eacd6724f3fc3173179a11a97a1e26587c9529-d_1280',
            },
            'params': {
                'format': 'best[protocol=https]',
@@ -395,6 +401,10 @@ class VimeoIE(VimeoBaseInfoExtractor):
                'uploader': 'Jaime Marquínez Ferrándiz',
                'duration': 10,
                'description': 'md5:dca3ea23adb29ee387127bc4ddfce63f',
+               'thumbnail': 'https://i.vimeocdn.com/video/440665496-b2c5aee2b61089442c794f64113a8e8f7d5763c3e6b3ebfaf696ae6413f8b1f4-d_960',
+               'view_count': int,
+               'comment_count': int,
+               'like_count': int,
            },
            'params': {
                'format': 'best[protocol=https]',
@@ -417,6 +427,10 @@ class VimeoIE(VimeoBaseInfoExtractor):
                'timestamp': 1380339469,
                'upload_date': '20130928',
                'duration': 187,
+               'thumbnail': 'https://i.vimeocdn.com/video/450239872-a05512d9b1e55d707a7c04365c10980f327b06d966351bc403a5d5d65c95e572-d_1280',
+               'view_count': int,
+               'comment_count': int,
+               'like_count': int,
            },
            'params': {'format': 'http-1080p'},
        },
@@ -425,7 +439,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
            'note': 'Video with subtitles',
            'info_dict': {
                'id': '76979871',
-               'ext': 'mp4',
+               'ext': 'mov',
                'title': 'The New Vimeo Player (You Know, For Videos)',
                'description': 'md5:2ec900bf97c3f389378a96aee11260ea',
                'timestamp': 1381846109,
@@ -454,6 +468,8 @@ class VimeoIE(VimeoBaseInfoExtractor):
                'uploader': 'Tulio Gonçalves',
                'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user28849593',
                'uploader_id': 'user28849593',
+               'duration': 118,
+               'thumbnail': 'https://i.vimeocdn.com/video/478636036-c18440305ef3df9decfb6bf207a61fe39d2d17fa462a96f6f2d93d30492b037d-d_1280',
            },
        },
        {
@@ -470,6 +486,11 @@ class VimeoIE(VimeoBaseInfoExtractor):
                'timestamp': 1324343742,
                'upload_date': '20111220',
                'description': 'md5:ae23671e82d05415868f7ad1aec21147',
+               'duration': 60,
+               'comment_count': int,
+               'view_count': int,
+               'thumbnail': 'https://i.vimeocdn.com/video/231174622-dd07f015e9221ff529d451e1cc31c982b5d87bfafa48c4189b1da72824ee289a-d_1280',
+               'like_count': int,
            },
        },
        {
@@ -485,6 +506,9 @@ class VimeoIE(VimeoBaseInfoExtractor):
                'uploader': 'Framework Studio',
                'description': 'md5:f2edc61af3ea7a5592681ddbb683db73',
                'upload_date': '20200225',
+               'duration': 176,
+               'thumbnail': 'https://i.vimeocdn.com/video/859377297-836494a4ef775e9d4edbace83937d9ad34dc846c688c0c419c0e87f7ab06c4b3-d_1280',
+               'uploader_url': 'https://vimeo.com/frameworkla',
            },
        },
        {
@@ -503,6 +527,11 @@ class VimeoIE(VimeoBaseInfoExtractor):
                'timestamp': 1250886430,
                'upload_date': '20090821',
                'description': 'md5:bdbf314014e58713e6e5b66eb252f4a6',
+               'duration': 321,
+               'comment_count': int,
+               'view_count': int,
+               'thumbnail': 'https://i.vimeocdn.com/video/22728298-bfc22146f930de7cf497821c7b0b9f168099201ecca39b00b6bd31fcedfca7a6-d_1280',
+               'like_count': int,
            },
            'params': {
                'skip_download': True,
@@ -535,10 +564,17 @@ class VimeoIE(VimeoBaseInfoExtractor):
                'id': '68375962',
                'ext': 'mp4',
                'title': 'youtube-dl password protected test video',
+               'timestamp': 1371200155,
+               'upload_date': '20130614',
                'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user18948128',
                'uploader_id': 'user18948128',
                'uploader': 'Jaime Marquínez Ferrándiz',
                'duration': 10,
+               'description': 'md5:dca3ea23adb29ee387127bc4ddfce63f',
+               'thumbnail': 'https://i.vimeocdn.com/video/440665496-b2c5aee2b61089442c794f64113a8e8f7d5763c3e6b3ebfaf696ae6413f8b1f4-d_960',
+               'view_count': int,
+               'comment_count': int,
+               'like_count': int,
            },
            'params': {
                'format': 'best[protocol=https]',
@@ -568,12 +604,18 @@ class VimeoIE(VimeoBaseInfoExtractor):
            'info_dict': {
                'id': '119195465',
                'ext': 'mp4',
-               'title': 'youtube-dl test video \'ä"BaW_jenozKc',
+               'title': "youtube-dl test video '' ä↭𝕐-BaW jenozKc",
                'uploader': 'Philipp Hagemeister',
                'uploader_id': 'user20132939',
                'description': 'md5:fa7b6c6d8db0bdc353893df2f111855b',
                'upload_date': '20150209',
                'timestamp': 1423518307,
+               'thumbnail': 'https://i.vimeocdn.com/video/default_1280',
+               'duration': 10,
+               'like_count': int,
+               'uploader_url': 'https://vimeo.com/user20132939',
+               'view_count': int,
+               'comment_count': int,
            },
            'params': {
                'format': 'best[protocol=https]',
@@ -596,6 +638,14 @@ class VimeoIE(VimeoBaseInfoExtractor):
                'title': 'Harrisville New Hampshire',
                'timestamp': 1459259666,
                'upload_date': '20160329',
+               'release_timestamp': 1459259666,
+               'license': 'by-nc',
+               'duration': 159,
+               'comment_count': int,
+               'thumbnail': 'https://i.vimeocdn.com/video/562802436-585eeb13b5020c6ac0f171a2234067938098f84737787df05ff0d767f6d54ee9-d_1280',
+               'like_count': int,
+               'uploader_url': 'https://vimeo.com/aliniamedia',
+               'release_date': '20160329',
            },
            'params': {'skip_download': True},
        },
@@ -627,6 +677,14 @@ class VimeoIE(VimeoBaseInfoExtractor):
                'title': 'The Shoes - Submarine Feat. Blaine Harrison',
                'uploader_id': 'karimhd',
                'description': 'md5:8e2eea76de4504c2e8020a9bcfa1e843',
+               'channel_id': 'staffpicks',
+               'duration': 336,
+               'comment_count': int,
+               'view_count': int,
+               'thumbnail': 'https://i.vimeocdn.com/video/541243181-b593db36a16db2f0096f655da3f5a4dc46b8766d77b0f440df937ecb0c418347-d_1280',
+               'like_count': int,
+               'uploader_url': 'https://vimeo.com/karimhd',
+               'channel_url': 'https://vimeo.com/channels/staffpicks',
            },
            'params': {'skip_download': 'm3u8'},
        },
@@ -641,13 +699,19 @@ class VimeoIE(VimeoBaseInfoExtractor):
            'url': 'https://vimeo.com/581039021/9603038895',
            'info_dict': {
                'id': '581039021',
-               # these have to be provided but we don't care
                'ext': 'mp4',
                'timestamp': 1627621014,
-               'title': 're:.+',
-               'uploader_id': 're:.+',
-               'uploader': 're:.+',
-               'upload_date': r're:\d+',
+               'release_timestamp': 1627621014,
+               'duration': 976,
+               'comment_count': int,
+               'thumbnail': 'https://i.vimeocdn.com/video/1202249320-4ddb2c30398c0dc0ee059172d1bd5ea481ad12f0e0e3ad01d2266f56c744b015-d_1280',
+               'like_count': int,
+               'uploader_url': 'https://vimeo.com/txwestcapital',
+               'release_date': '20210730',
+               'uploader': 'Christopher Inks',
+               'title': 'Thursday, July 29, 2021 BMA Evening Video Update',
+               'uploader_id': 'txwestcapital',
+               'upload_date': '20210730',
            },
            'params': {
                'skip_download': True,
@@ -961,9 +1025,15 @@ class VimeoOndemandIE(VimeoIE):
                'uploader': 'גם סרטים',
                'uploader_url': r're:https?://(?:www\.)?vimeo\.com/gumfilms',
                'uploader_id': 'gumfilms',
-               'description': 'md5:4c027c965e439de4baab621e48b60791',
+               'description': 'md5:aeeba3dbd4d04b0fa98a4fdc9c639998',
                'upload_date': '20140906',
                'timestamp': 1410032453,
+               'thumbnail': 'https://i.vimeocdn.com/video/488238335-d7bf151c364cff8d467f1b73784668fe60aae28a54573a35d53a1210ae283bd8-d_1280',
+               'comment_count': int,
+               'license': 'https://creativecommons.org/licenses/by-nc-nd/3.0/',
+               'duration': 53,
+               'view_count': int,
+               'like_count': int,
            },
            'params': {
                'format': 'best[protocol=https]',
@@ -982,6 +1052,11 @@ class VimeoOndemandIE(VimeoIE):
                'description': 'md5:c3c46a90529612c8279fb6af803fc0df',
                'upload_date': '20150502',
                'timestamp': 1430586422,
+               'duration': 121,
+               'comment_count': int,
+               'view_count': int,
+               'thumbnail': 'https://i.vimeocdn.com/video/517077723-7066ae1d9a79d3eb361334fb5d58ec13c8f04b52f8dd5eadfbd6fb0bcf11f613-d_1280',
+               'like_count': int,
            },
            'params': {
                'skip_download': True,
@@ -1011,7 +1086,7 @@ class VimeoChannelIE(VimeoBaseInfoExtractor):
            'id': 'tributes',
            'title': 'Vimeo Tributes',
        },
-       'playlist_mincount': 25,
+       'playlist_mincount': 22,
    }]

    _BASE_URL_TEMPL = 'https://vimeo.com/channels/%s'
@@ -1196,6 +1271,9 @@ class VimeoReviewIE(VimeoBaseInfoExtractor):
            'uploader': 'Richard Hardwick',
            'uploader_id': 'user21297594',
            'description': "Comedian Dick Hardwick's five minute demo filmed in front of a live theater audience.\nEdit by Doug Mattocks",
+           'duration': 304,
+           'thumbnail': 'https://i.vimeocdn.com/video/450115033-43303819d9ebe24c2630352e18b7056d25197d09b3ae901abdac4c4f1d68de71-d_1280',
+           'uploader_url': 'https://vimeo.com/user21297594',
        },
    }, {
        'note': 'video player needs Referer',
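A note on the test fields added above: in yt-dlp's test harness an `info_dict` value may be a type or a pattern rather than a literal, so `'view_count': int` only asserts the field's type, `'md5:...'` compares a checksum of the field, and `'re:...'` matches a regular expression. A rough sketch of that matching rule (modelled on the test helper, not a verbatim copy of it):

```python
import re

def expect_field(expected, got):
    """Check one info_dict test field the way yt-dlp's harness does (sketch)."""
    if isinstance(expected, type):  # e.g. 'view_count': int asserts type only
        assert isinstance(got, expected), f'expected {expected.__name__}, got {got!r}'
    elif isinstance(expected, str) and expected.startswith('re:'):
        assert re.match(expected[3:], str(got)), f'{got!r} does not match {expected[3:]!r}'
    else:
        assert got == expected, f'{got!r} != {expected!r}'

expect_field(int, 120312)  # passes: counts only need the right type
expect_field(r're:^https?://.+', 'https://i.vimeocdn.com/video/default_1280')
```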
diff --git a/yt_dlp/extractor/viu.py b/yt_dlp/extractor/viu.py
index b633df95d..b0a1fca68 100644
--- a/yt_dlp/extractor/viu.py
+++ b/yt_dlp/extractor/viu.py
@@ -1,55 +1,32 @@
 # coding: utf-8
from __future__ import unicode_literals

-import json
import re
+import json
+import uuid
+import random
+import urllib.parse

from .common import InfoExtractor
-from ..compat import (
-    compat_kwargs,
-    compat_str,
-    compat_urlparse,
-    compat_urllib_request,
-)
+from ..compat import compat_str
from ..utils import (
    ExtractorError,
    int_or_none,
+   strip_or_none,
    try_get,
    smuggle_url,
    unsmuggle_url,
+   url_or_none,
)


class ViuBaseIE(InfoExtractor):
-   def _real_initialize(self):
-       viu_auth_res = self._request_webpage(
-           'https://www.viu.com/api/apps/v2/authenticate', None,
-           'Requesting Viu auth', query={
-               'acct': 'test',
-               'appid': 'viu_desktop',
-               'fmt': 'json',
-               'iid': 'guest',
-               'languageid': 'default',
-               'platform': 'desktop',
-               'userid': 'guest',
-               'useridtype': 'guest',
-               'ver': '1.0'
-           }, headers=self.geo_verification_headers())
-       self._auth_token = viu_auth_res.info()['X-VIU-AUTH']
-
-   def _call_api(self, path, *args, **kwargs):
-       headers = self.geo_verification_headers()
-       headers.update({
-           'X-VIU-AUTH': self._auth_token
-       })
-       headers.update(kwargs.get('headers', {}))
-       kwargs['headers'] = headers
+   def _call_api(self, path, *args, headers={}, **kwargs):
        response = self._download_json(
-           'https://www.viu.com/api/' + path, *args,
-           **compat_kwargs(kwargs))['response']
+           f'https://www.viu.com/api/{path}', *args, **kwargs,
+           headers={**self.geo_verification_headers(), **headers})['response']
        if response.get('status') != 'success':
-           raise ExtractorError('%s said: %s' % (
-               self.IE_NAME, response['message']), expected=True)
+           raise ExtractorError(f'{self.IE_NAME} said: {response["message"]}', expected=True)
        return response
@@ -101,6 +78,7 @@ class ViuIE(ViuBaseIE):
        tdirforwhole = video_data.get('tdirforwhole')
        # #EXT-X-BYTERANGE is not supported by native hls downloader
        # and ffmpeg (#10955)
+       # FIXME: It is supported in yt-dlp
        # hls_file = video_data.get('hlsfile')
        hls_file = video_data.get('jwhlsfile')
        if url_path and tdirforwhole and hls_file:
@@ -110,10 +88,9 @@ class ViuIE(ViuBaseIE):
        #     r'(/hlsc_)[a-z]+(\d+\.m3u8)',
        #     r'\1whe\2', video_data['href'])
        m3u8_url = video_data['href']
-       formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4')
+       formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4')
        self._sort_formats(formats)

-       subtitles = {}
        for key, value in video_data.items():
            mobj = re.match(r'^subtitle_(?P<lang>[^_]+)_(?P<ext>(vtt|srt))', key)
            if not mobj:
@@ -227,42 +204,63 @@ class ViuOTTIE(InfoExtractor):
        'zh-cn': 2,
        'en-us': 3,
    }
-   _user_info = None
+
+   _user_token = None
+   _auth_codes = {}

    def _detect_error(self, response):
-       code = response.get('status', {}).get('code')
-       if code > 0:
+       code = try_get(response, lambda x: x['status']['code'])
+       if code and code > 0:
            message = try_get(response, lambda x: x['status']['message'])
-           raise ExtractorError('%s said: %s (%s)' % (
-               self.IE_NAME, message, code), expected=True)
-       return response['data']
-
-   def _raise_login_required(self):
-       raise ExtractorError(
-           'This video requires login. '
-           'Specify --username and --password or --netrc (machine: %s) '
-           'to provide account credentials.' % self._NETRC_MACHINE,
-           expected=True)
+           raise ExtractorError(f'{self.IE_NAME} said: {message} ({code})', expected=True)
+       return response.get('data') or {}

    def _login(self, country_code, video_id):
-       if not self._user_info:
+       if self._user_token is None:
            username, password = self._get_login_info()
-           if username is None or password is None:
+           if username is None:
                return

+           headers = {
+               'Authorization': f'Bearer {self._auth_codes[country_code]}',
+               'Content-Type': 'application/json'
+           }
+           data = self._download_json(
+               'https://api-gateway-global.viu.com/api/account/validate',
+               video_id, 'Validating email address', headers=headers,
+               data=json.dumps({
+                   'principal': username,
+                   'provider': 'email'
+               }).encode())
+           if not data.get('exists'):
+               raise ExtractorError('Invalid email address')
+
            data = self._download_json(
-               compat_urllib_request.Request(
-                   'https://www.viu.com/ott/%s/index.php' % country_code, method='POST'),
-               video_id, 'Logging in', errnote=False, fatal=False,
-               query={'r': 'user/login'},
+               'https://api-gateway-global.viu.com/api/auth/login',
+               video_id, 'Logging in', headers=headers,
                data=json.dumps({
-                   'username': username,
+                   'email': username,
                    'password': password,
-                   'platform_flag_label': 'web',
+                   'provider': 'email',
                }).encode())
-           self._user_info = self._detect_error(data)['user']
-
-       return self._user_info
+           self._detect_error(data)
+           self._user_token = data.get('identity')
+           # the anonymous auth code must be replaced with the logged-in user's
+           # token, otherwise subsequent API calls fail
+           self._auth_codes[country_code] = data.get('token')
+       return self._user_token
+
+   def _get_token(self, country_code, video_id):
+       rand = ''.join(random.choice('0123456789') for _ in range(10))
+       return self._download_json(
+           f'https://api-gateway-global.viu.com/api/auth/token?v={rand}000', video_id,
+           headers={'Content-Type': 'application/json'}, note='Getting bearer token',
+           data=json.dumps({
+               'countryCode': country_code.upper(),
+               'platform': 'browser',
+               'platformFlagLabel': 'web',
+               'language': 'en',
+               'uuid': str(uuid.uuid4()),
+               'carrierId': '0'
+           }).encode('utf-8'))['token']

    def _real_extract(self, url):
        url, idata = unsmuggle_url(url, {})
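The rewritten Viu OTT flow above is a two-step token dance: `_get_token` fetches an anonymous bearer token per country, and `_login` later swaps it for the authenticated user's token. A standalone sketch of the anonymous step, with the endpoint and payload fields taken from the diff (the response shape is an assumption based on the `['token']` access above):

```python
import json
import urllib.request
import uuid

def get_anonymous_token(country_code):
    # mirrors _get_token(): POST device info, receive a guest bearer token
    req = urllib.request.Request(
        'https://api-gateway-global.viu.com/api/auth/token',
        data=json.dumps({
            'countryCode': country_code.upper(),
            'platform': 'browser',
            'platformFlagLabel': 'web',
            'language': 'en',
            'uuid': str(uuid.uuid4()),
            'carrierId': '0',
        }).encode(),
        headers={'Content-Type': 'application/json'})
    with urllib.request.urlopen(req) as rsp:
        return json.load(rsp)['token']  # assumed response key, per the diff
```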
@@ -279,16 +277,16 @@ class ViuOTTIE(InfoExtractor):
            query['area_id'] = area_id

        product_data = self._download_json(
-           'http://www.viu.com/ott/%s/index.php' % country_code, video_id,
+           f'http://www.viu.com/ott/{country_code}/index.php', video_id,
            'Downloading video info', query=query)['data']

        video_data = product_data.get('current_product')
        if not video_data:
-           raise ExtractorError('This video is not available in your region.', expected=True)
+           self.raise_geo_restricted()

        series_id = video_data.get('series_id')
        if self._yes_playlist(series_id, video_id, idata):
-           series = product_data.get('series', {})
+           series = product_data.get('series') or {}
            product = series.get('product')
            if product:
                entries = []
@@ -296,14 +294,10 @@ class ViuOTTIE(InfoExtractor):
                    item_id = entry.get('product_id')
                    if not item_id:
                        continue
-                   item_id = compat_str(item_id)
                    entries.append(self.url_result(
-                       smuggle_url(
-                           'http://www.viu.com/ott/%s/%s/vod/%s/' % (country_code, lang_code, item_id),
-                           {'force_noplaylist': True}),  # prevent infinite recursion
-                       'ViuOTT',
-                       item_id,
-                       entry.get('synopsis', '').strip()))
+                       smuggle_url(f'http://www.viu.com/ott/{country_code}/{lang_code}/vod/{item_id}/',
+                                   {'force_noplaylist': True}),
+                       ViuOTTIE, str(item_id), entry.get('synopsis', '').strip()))

                return self.playlist_result(entries, series_id, series.get('name'), series.get('description'))
@@ -312,69 +306,65 @@ class ViuOTTIE(InfoExtractor):
            'ccs_product_id': video_data['ccs_product_id'],
            'language_flag_id': self._LANGUAGE_FLAG.get(lang_code.lower()) or '3',
        }
-       headers = {
-           'Referer': url,
-           'Origin': url,
-       }
-       try:
+
+       def download_playback():
            stream_data = self._download_json(
-               'https://d1k2us671qcoau.cloudfront.net/distribute_web_%s.php' % country_code,
-               video_id, 'Downloading stream info', query=query, headers=headers)
-           stream_data = self._detect_error(stream_data)['stream']
-       except (ExtractorError, KeyError):
-           stream_data = None
-           if video_data.get('user_level', 0) > 0:
-               user = self._login(country_code, video_id)
-               if user:
-                   query['identity'] = user['identity']
-                   stream_data = self._download_json(
-                       'https://d1k2us671qcoau.cloudfront.net/distribute_web_%s.php' % country_code,
-                       video_id, 'Downloading stream info', query=query, headers=headers)
-                   stream_data = self._detect_error(stream_data).get('stream')
-           else:
-               # preview is limited to 3min for non-members
-               # try to bypass the duration limit
-               duration_limit = True
-               query['duration'] = '180'
-               stream_data = self._download_json(
-                   'https://d1k2us671qcoau.cloudfront.net/distribute_web_%s.php' % country_code,
-                   video_id, 'Downloading stream info', query=query, headers=headers)
-               try:
-                   stream_data = self._detect_error(stream_data)['stream']
-               except (ExtractorError, KeyError):  # if still not working, give up
-                   self._raise_login_required()
+               'https://api-gateway-global.viu.com/api/playback/distribute',
+               video_id=video_id, query=query, fatal=False, note='Downloading stream info',
+               headers={
+                   'Authorization': f'Bearer {self._auth_codes[country_code]}',
+                   'Referer': url,
+                   'Origin': url
+               })
+           return self._detect_error(stream_data).get('stream')
+
+       if not self._auth_codes.get(country_code):
+           self._auth_codes[country_code] = self._get_token(country_code, video_id)
+       stream_data = None
+       try:
+           stream_data = download_playback()
+       except (ExtractorError, KeyError):
+           token = self._login(country_code, video_id)
+           if token is not None:
+               query['identity'] = token
+           else:
+               # preview is limited to 3min for non-members. But we can try to bypass it
+               duration_limit, query['duration'] = True, '180'
+           try:
+               stream_data = download_playback()
+           except (ExtractorError, KeyError):
+               if token is not None:
+                   raise
+               self.raise_login_required(method='password')

        if not stream_data:
            raise ExtractorError('Cannot get stream info', expected=True)

-       stream_sizes = stream_data.get('size', {})
        formats = []
-       for vid_format, stream_url in stream_data.get('url', {}).items():
-           height = int_or_none(self._search_regex(
-               r's(\d+)p', vid_format, 'height', default=None))
+       for vid_format, stream_url in (stream_data.get('url') or {}).items():
+           height = int(self._search_regex(r's(\d+)p', vid_format, 'height', default=None))

            # bypass preview duration limit
            if duration_limit:
-               stream_url = compat_urlparse.urlparse(stream_url)
-               query = dict(compat_urlparse.parse_qsl(stream_url.query, keep_blank_values=True))
-               time_duration = int_or_none(video_data.get('time_duration'))
+               stream_url = urllib.parse.urlparse(stream_url)
                query.update({
-                   'duration': time_duration if time_duration > 0 else '9999999',
+                   'duration': video_data.get('time_duration') or '9999999',
                    'duration_start': '0',
                })
-               stream_url = stream_url._replace(query=compat_urlparse.urlencode(query)).geturl()
+               stream_url = stream_url._replace(query=urllib.parse.urlencode(dict(
+                   urllib.parse.parse_qsl(stream_url.query, keep_blank_values=True)))).geturl()

            formats.append({
                'format_id': vid_format,
                'url': stream_url,
                'height': height,
                'ext': 'mp4',
-               'filesize': int_or_none(stream_sizes.get(vid_format))
+               'filesize': try_get(stream_data, lambda x: x['size'][vid_format], int)
            })
        self._sort_formats(formats)

        subtitles = {}
-       for sub in video_data.get('subtitle', []):
+       for sub in video_data.get('subtitle') or []:
            sub_url = sub.get('url')
            if not sub_url:
                continue
@@ -383,17 +373,16 @@ class ViuOTTIE(InfoExtractor):
                'ext': 'srt',
            })

-       title = video_data['synopsis'].strip()
-
+       title = strip_or_none(video_data.get('synopsis'))
        return {
            'id': video_id,
            'title': title,
            'description': video_data.get('description'),
-           'series': product_data.get('series', {}).get('name'),
+           'series': try_get(product_data, lambda x: x['series']['name']),
            'episode': title,
            'episode_number': int_or_none(video_data.get('number')),
            'duration': int_or_none(stream_data.get('duration')),
-           'thumbnail': video_data.get('cover_image_url'),
+           'thumbnail': url_or_none(video_data.get('cover_image_url')),
            'formats': formats,
            'subtitles': subtitles,
        }
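The preview bypass above keeps the format URL but rewrites its query string to request the full duration. Outside the extractor, the same URL surgery looks like this (parameter names are the ones used in the diff; the `9999999` fallback is the diff's own):

```python
import urllib.parse

def bypass_preview_limit(stream_url, full_duration):
    """Rewrite the 'duration' query parameter of a stream URL (sketch)."""
    parts = urllib.parse.urlparse(stream_url)
    query = dict(urllib.parse.parse_qsl(parts.query, keep_blank_values=True))
    query.update({'duration': str(full_duration or 9999999), 'duration_start': '0'})
    return parts._replace(query=urllib.parse.urlencode(query)).geturl()

print(bypass_preview_limit('https://example.com/video.m3u8?duration=180', 3600))
# https://example.com/video.m3u8?duration=3600&duration_start=0
```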
diff --git a/yt_dlp/extractor/wasdtv.py b/yt_dlp/extractor/wasdtv.py
new file mode 100644
index 000000000..38c10dc62
--- /dev/null
+++ b/yt_dlp/extractor/wasdtv.py
@@ -0,0 +1,161 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    parse_iso8601,
+    traverse_obj,
+    try_get,
+)
+
+
+class WASDTVBaseIE(InfoExtractor):
+
+    def _fetch(self, path, video_id, description, query={}):
+        response = self._download_json(
+            f'https://wasd.tv/api/{path}', video_id, query=query,
+            note=f'Downloading {description} metadata',
+            errnote=f'Unable to download {description} metadata')
+        error = response.get('error')
+        if error:
+            raise ExtractorError(f'{self.IE_NAME} returned error: {error}', expected=True)
+        return response.get('result')
+
+    def _extract_thumbnails(self, thumbnails_dict):
+        return [{
+            'url': url,
+            'preference': index,
+        } for index, url in enumerate(
+            traverse_obj(thumbnails_dict, (('small', 'medium', 'large'),))) if url]
+
+    def _real_extract(self, url):
+        container = self._get_container(url)
+        stream = traverse_obj(container, ('media_container_streams', 0))
+        media = try_get(stream, lambda x: x['stream_media'][0])
+        if not media:
+            raise ExtractorError('Cannot extract media data.', expected=True)
+        media_meta = media.get('media_meta')
+        media_url, is_live = self._get_media_url(media_meta)
+        video_id = media.get('media_id') or container.get('media_container_id')
+        formats, subtitles = self._extract_m3u8_formats_and_subtitles(media_url, video_id, 'mp4')
+        self._sort_formats(formats)
+        return {
+            'id': str(video_id),
+            'title': container.get('media_container_name') or self._og_search_title(self._download_webpage(url, video_id)),
+            'description': container.get('media_container_description'),
+            'thumbnails': self._extract_thumbnails(media_meta.get('media_preview_images')),
+            'timestamp': parse_iso8601(container.get('created_at')),
+            'view_count': int_or_none(stream.get('stream_current_viewers' if is_live else 'stream_total_viewers')),
+            'is_live': is_live,
+            'formats': formats,
+            'subtitles': subtitles,
+        }
+
+    def _get_container(self, url):
+        raise NotImplementedError('Subclasses must implement _get_container')
+
+    def _get_media_url(self, media_meta):
+        raise NotImplementedError('Subclasses must implement _get_media_url')
+
+
+class WASDTVStreamIE(WASDTVBaseIE):
+    IE_NAME = 'wasdtv:stream'
+    _VALID_URL = r'https?://wasd\.tv/(?P<id>[^/#?]+)$'
+    _TESTS = [{
+        'url': 'https://wasd.tv/24_7',
+        'info_dict': {
+            'id': '559738',
+            'ext': 'mp4',
+            'title': 'Live 24/7 Music',
+            'description': '24/7 Music',
+            'timestamp': int,
+            'upload_date': r're:^\d{8}$',
+            'is_live': True,
+            'view_count': int,
+        },
+    }]
+
+    def _get_container(self, url):
+        nickname = self._match_id(url)
+        channel = self._fetch(f'channels/nicknames/{nickname}', video_id=nickname, description='channel')
+        channel_id = channel.get('channel_id')
+        containers = self._fetch(
+            'v2/media-containers', channel_id, 'running media containers',
+            query={
+                'channel_id': channel_id,
+                'media_container_type': 'SINGLE',
+                'media_container_status': 'RUNNING',
+            })
+        if not containers:
+            raise ExtractorError(f'{nickname} is offline', expected=True)
+        return containers[0]
+
+    def _get_media_url(self, media_meta):
+        return media_meta['media_url'], True
+
+
+class WASDTVRecordIE(WASDTVBaseIE):
+    IE_NAME = 'wasdtv:record'
+    _VALID_URL = r'https?://wasd\.tv/[^/#?]+/videos\?record=(?P<id>\d+)$'
+    _TESTS = [{
+        'url': 'https://wasd.tv/spacemita/videos?record=907755',
+        'md5': 'c9899dd85be4cc997816ff9f9ca516ce',
+        'info_dict': {
+            'id': '906825',
+            'ext': 'mp4',
+            'title': 'Музыкальный',
+            'description': 'md5:f510388d929ff60ae61d4c3cab3137cc',
+            'timestamp': 1645812079,
+            'upload_date': '20220225',
+            'thumbnail': r're:^https?://.+\.jpg',
+            'is_live': False,
+            'view_count': int,
+        },
+    }]
+
+    def _get_container(self, url):
+        container_id = self._match_id(url)
+        return self._fetch(
+            f'v2/media-containers/{container_id}', container_id, 'media container')
+
+    def _get_media_url(self, media_meta):
+        media_archive_url = media_meta.get('media_archive_url')
+        if media_archive_url:
+            return media_archive_url, False
+        return media_meta['media_url'], True
+
+
+class WASDTVClipIE(WASDTVBaseIE):
+    IE_NAME = 'wasdtv:clip'
+    _VALID_URL = r'https?://wasd\.tv/[^/#?]+/clips\?clip=(?P<id>\d+)$'
+    _TESTS = [{
+        'url': 'https://wasd.tv/spacemita/clips?clip=26804',
+        'md5': '818885e720143d7a4e776ff66fcff148',
+        'info_dict': {
+            'id': '26804',
+            'ext': 'mp4',
+            'title': 'Пуш флексит на голове стримера',
+            'timestamp': 1646682908,
+            'upload_date': '20220307',
+            'thumbnail': r're:^https?://.+\.jpg',
+            'view_count': int,
+        },
+    }]
+
+    def _real_extract(self, url):
+        clip_id = self._match_id(url)
+        clip = self._fetch(f'v2/clips/{clip_id}', video_id=clip_id, description='clip')
+        clip_data = clip.get('clip_data')
+        formats, subtitles = self._extract_m3u8_formats_and_subtitles(clip_data.get('url'), video_id=clip_id, ext='mp4')
+        self._sort_formats(formats)
+        return {
+            'id': clip_id,
+            'title': clip.get('clip_title') or self._og_search_title(self._download_webpage(url, clip_id, fatal=False)),
+            'thumbnails': self._extract_thumbnails(clip_data.get('preview')),
+            'timestamp': parse_iso8601(clip.get('created_at')),
+            'view_count': int_or_none(clip.get('clip_views_count')),
+            'formats': formats,
+            'subtitles': subtitles,
+        }
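Structurally, `WASDTVBaseIE` is a template method: `_real_extract` owns the whole pipeline, and each concrete class only decides how to find the media container and whether to play the live or the archived URL. Stripped of the extractor machinery, the shape is roughly:

```python
class Base:
    def extract(self, url):
        container = self._get_container(url)            # subclass hook 1
        media_url, is_live = self._get_media_url(container)  # subclass hook 2
        return {'url': media_url, 'is_live': is_live}

    def _get_container(self, url):
        raise NotImplementedError('subclasses must implement _get_container')

    def _get_media_url(self, container):
        raise NotImplementedError('subclasses must implement _get_media_url')

class Stream(Base):
    def _get_container(self, url):
        return {'media_url': url + '/live.m3u8'}

    def _get_media_url(self, container):
        return container['media_url'], True  # live streams have no archive

print(Stream().extract('https://example.invalid/chan'))
```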
diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py
index d74d5b0e9..19b4985f6 100644
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -217,15 +217,35 @@ INNERTUBE_CLIENTS = {
            }
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 2
-   }
+   },
+   # This client can access age restricted videos (unless the uploader has disabled the 'allow embedding' option)
+   # See: https://github.com/zerodytrash/YouTube-Internal-Clients
+   'tv_embedded': {
+       'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
+       'INNERTUBE_CONTEXT': {
+           'client': {
+               'clientName': 'TVHTML5_SIMPLY_EMBEDDED_PLAYER',
+               'clientVersion': '2.0',
+           },
+       },
+       'INNERTUBE_CONTEXT_CLIENT_NAME': 85
+   },
}


+def _split_innertube_client(client_name):
+    variant, *base = client_name.rsplit('.', 1)
+    if base:
+        return variant, base[0], variant
+    base, *variant = client_name.split('_', 1)
+    return client_name, base, variant[0] if variant else None
+
+
def build_innertube_clients():
    THIRD_PARTY = {
-       'embedUrl': 'https://google.com',  # Can be any valid URL
+       'embedUrl': 'https://www.youtube.com/',  # Can be any valid URL
    }
-   BASE_CLIENTS = ('android', 'web', 'ios', 'mweb')
+   BASE_CLIENTS = ('android', 'web', 'tv', 'ios', 'mweb')
    priority = qualities(BASE_CLIENTS[::-1])

    for client, ytcfg in tuple(INNERTUBE_CLIENTS.items()):
@@ -234,15 +254,15 @@ def build_innertube_clients():
        ytcfg.setdefault('REQUIRE_JS_PLAYER', True)
        ytcfg['INNERTUBE_CONTEXT']['client'].setdefault('hl', 'en')

-       base_client, *variant = client.split('_')
+       _, base_client, variant = _split_innertube_client(client)
        ytcfg['priority'] = 10 * priority(base_client)

        if not variant:
-           INNERTUBE_CLIENTS[f'{client}_agegate'] = agegate_ytcfg = copy.deepcopy(ytcfg)
-           agegate_ytcfg['INNERTUBE_CONTEXT']['client']['clientScreen'] = 'EMBED'
-           agegate_ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = THIRD_PARTY
-           agegate_ytcfg['priority'] -= 1
-       elif variant == ['embedded']:
+           INNERTUBE_CLIENTS[f'{client}_embedscreen'] = embedscreen = copy.deepcopy(ytcfg)
+           embedscreen['INNERTUBE_CONTEXT']['client']['clientScreen'] = 'EMBED'
+           embedscreen['INNERTUBE_CONTEXT']['thirdParty'] = THIRD_PARTY
+           embedscreen['priority'] -= 3
+       elif variant == 'embedded':
            ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = THIRD_PARTY
            ytcfg['priority'] -= 2
        else:
@@ -807,6 +827,12 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
        description = self._get_text(renderer, 'descriptionSnippet')
        duration = parse_duration(self._get_text(
            renderer, 'lengthText', ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'text')))
+       if duration is None:
+           duration = parse_duration(self._search_regex(
+               r'(?i)(ago)(?!.*\1)\s+(?P<duration>[a-z0-9 ,]+?)(?:\s+[\d,]+\s+views)?(?:\s+-\s+play\s+short)?$',
+               traverse_obj(renderer, ('title', 'accessibility', 'accessibilityData', 'label'), default='', expected_type=str),
+               video_id, default=None, group='duration'))
+
        view_count = self._get_count(renderer, 'viewCountText')

        uploader = self._get_text(renderer, 'ownerText', 'shortBylineText')
@@ -818,12 +844,17 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
            renderer, ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'style'), get_all=False, expected_type=str)
        badges = self._extract_badges(renderer)
        thumbnails = self._extract_thumbnails(renderer, 'thumbnail')
+       navigation_url = urljoin('https://www.youtube.com/', traverse_obj(
+           renderer, ('navigationEndpoint', 'commandMetadata', 'webCommandMetadata', 'url'), expected_type=str))
+       url = f'https://www.youtube.com/watch?v={video_id}'
+       if overlay_style == 'SHORTS' or (navigation_url and '/shorts/' in navigation_url):
+           url = f'https://www.youtube.com/shorts/{video_id}'

        return {
            '_type': 'url',
            'ie_key': YoutubeIE.ie_key(),
            'id': video_id,
-           'url': f'https://www.youtube.com/watch?v={video_id}',
+           'url': url,
            'title': title,
            'description': description,
            'duration': duration,
@@ -2940,13 +2971,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
            webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE,
            video_id, 'initial player response')

-       original_clients = clients
+       all_clients = set(clients)
        clients = clients[::-1]
        prs = []

-       def append_client(client_name):
-           if client_name in INNERTUBE_CLIENTS and client_name not in original_clients:
-               clients.append(client_name)
+       def append_client(*client_names):
+           """ Append the first client name that exists but not already used """
+           for client_name in client_names:
+               actual_client = _split_innertube_client(client_name)[0]
+               if actual_client in INNERTUBE_CLIENTS:
+                   if actual_client not in all_clients:
+                       clients.append(client_name)
+                       all_clients.add(actual_client)
+                       return

        # Android player_response does not have microFormats which are needed for
        # extraction of some data. So we return the initial_pr with formats
@@ -2961,7 +2998,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
        tried_iframe_fallback = False
        player_url = None
        while clients:
-           client = clients.pop()
+           client, base_client, variant = _split_innertube_client(clients.pop())
            player_ytcfg = master_ytcfg if client == 'web' else {}
            if 'configs' not in self._configuration_arg('player_skip'):
                player_ytcfg = self._extract_player_ytcfg(client, video_id) or player_ytcfg
@@ -2989,10 +3026,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
            prs.append(pr)

            # creator clients can bypass AGE_VERIFICATION_REQUIRED if logged in
-           if client.endswith('_agegate') and self._is_unplayable(pr) and self.is_authenticated:
-               append_client(client.replace('_agegate', '_creator'))
-           elif self._is_agegated(pr):
-               append_client(f'{client}_agegate')
+           if variant == 'embedded' and self._is_unplayable(pr) and self.is_authenticated:
+               append_client(f'{base_client}_creator')
+           elif self._is_agegated(pr):
+               if variant == 'tv_embedded':
+                   append_client(f'{base_client}_embedded')
+               elif not variant:
+                   append_client(f'tv_embedded.{base_client}', f'{base_client}_embedded')

        if last_error:
            if not len(prs):
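The new `_split_innertube_client` helper encodes a small naming scheme: `base_variant` names a concrete client, while the dotted form `client.base` (as in `append_client(f'tv_embedded.{base_client}', ...)`) looks up one client's config while remembering which base client it stands in for. Its return value is `(client to look up, base client, variant)`; copying the helper verbatim from the hunk above, the behaviour works out to:

```python
def _split_innertube_client(client_name):  # copied from the diff above
    variant, *base = client_name.rsplit('.', 1)
    if base:
        return variant, base[0], variant
    base, *variant = client_name.split('_', 1)
    return client_name, base, variant[0] if variant else None

print(_split_innertube_client('web'))                  # ('web', 'web', None)
print(_split_innertube_client('android_embedded'))     # ('android_embedded', 'android', 'embedded')
print(_split_innertube_client('tv_embedded.android'))  # ('tv_embedded', 'android', 'tv_embedded')
```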
@@ -3013,7 +3053,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
        streaming_formats = traverse_obj(streaming_data, (..., ('formats', 'adaptiveFormats'), ...), default=[])

        for fmt in streaming_formats:
-           if fmt.get('targetDurationSec') or fmt.get('drmFamilies'):
+           if fmt.get('targetDurationSec'):
                continue

            itag = str_or_none(fmt.get('itag'))
@@ -3095,6 +3135,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'fps': int_or_none(fmt.get('fps')) or None,
                'height': height,
                'quality': q(quality),
+               'has_drm': bool(fmt.get('drmFamilies')),
                'tbr': tbr,
                'url': fmt_url,
                'width': int_or_none(fmt.get('width')),
@@ -3468,6 +3509,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
        subtitles, automatic_captions = {}, {}
        for lang_code, caption_track in captions.items():
            base_url = caption_track.get('baseUrl')
+           orig_lang = parse_qs(base_url).get('lang', [None])[-1]
            if not base_url:
                continue
            lang_name = self._get_text(caption_track, 'name', max_runs=1)
@@ -3481,19 +3523,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                for trans_code, trans_name in translation_languages.items():
                    if not trans_code:
                        continue
+                   orig_trans_code = trans_code
                    if caption_track.get('kind') != 'asr':
+                       if 'translated_subs' in self._configuration_arg('skip'):
+                           continue
                        trans_code += f'-{lang_code}'
                        trans_name += format_field(lang_name, template=' from %s')
                    # Add an "-orig" label to the original language so that it can be distinguished.
                    # The subs are returned without "-orig" as well for compatibility
-                   if lang_code == f'a-{trans_code}':
+                   if lang_code == f'a-{orig_trans_code}':
                        process_language(
                            automatic_captions, base_url, f'{trans_code}-orig', f'{trans_name} (Original)', {})
                    # Setting tlang=lang returns damaged subtitles.
-                   # Not using lang_code == f'a-{trans_code}' here for future-proofing
-                   orig_lang = parse_qs(base_url).get('lang', [None])[-1]
                    process_language(automatic_captions, base_url, trans_code, trans_name,
-                                    {} if orig_lang == trans_code else {'tlang': trans_code})
+                                    {} if orig_lang == orig_trans_code else {'tlang': trans_code})
            info['automatic_captions'] = automatic_captions
            info['subtitles'] = subtitles
diff --git a/yt_dlp/options.py b/yt_dlp/options.py
index 9f6b45ec6..936cc8b6f 100644
--- a/yt_dlp/options.py
+++ b/yt_dlp/options.py
@@ -334,10 +334,10 @@ def create_parser():
        action='callback', callback=_set_from_options_callback,
        callback_kwargs={
            'allowed_values': {
-               'filename', 'format-sort', 'abort-on-error', 'format-spec', 'no-playlist-metafiles',
+               'filename', 'filename-sanitization', 'format-sort', 'abort-on-error', 'format-spec', 'no-playlist-metafiles',
                'multistreams', 'no-live-chat', 'playlist-index', 'list-formats', 'no-direct-merge',
                'no-youtube-channel-redirect', 'no-youtube-unavailable-videos', 'no-attach-info-json', 'embed-metadata',
-               'embed-thumbnail-atomicparsley', 'seperate-video-versions', 'no-clean-infojson', 'no-keep-subs',
+               'embed-thumbnail-atomicparsley', 'seperate-video-versions', 'no-clean-infojson', 'no-keep-subs', 'no-certifi',
            }, 'aliases': {
                'youtube-dl': ['-multistreams', 'all'],
                'youtube-dlc': ['-no-youtube-channel-redirect', '-no-live-chat', 'all'],
@@ -461,19 +461,18 @@ def create_parser():
        metavar='COUNT', dest='max_views', default=None, type=int,
        help=optparse.SUPPRESS_HELP)
    selection.add_option(
-       '--match-filter',
-       metavar='FILTER', dest='match_filter', default=None,
+       '--match-filters',
+       metavar='FILTER', dest='match_filter', action='append',
        help=(
            'Generic video filter. Any field (see "OUTPUT TEMPLATE") can be compared with a '
            'number or a string using the operators defined in "Filtering formats". '
-           'You can also simply specify a field to match if the field is present '
-           'and "!field" to check if the field is not present. In addition, '
-           'Python style regular expression matching can be done using "~=", '
-           'and multiple filters can be checked with "&". '
-           'Use a "\\" to escape "&" or quotes if needed. Eg: --match-filter '
-           '"!is_live & like_count>?100 & description~=\'(?i)\\bcats \\& dogs\\b\'" '
-           'matches only videos that are not live, has a like count more than 100 '
-           '(or the like field is not available), and also has a description '
+           'You can also simply specify a field to match if the field is present, '
+           'use "!field" to check if the field is not present, and "&" to check multiple conditions. '
+           'Use a "\\" to escape "&" or quotes if needed. If used multiple times, '
+           'the filter matches if at least one of the conditions is met. Eg: --match-filter '
+           '!is_live --match-filter "like_count>?100 & description~=\'(?i)\\bcats \\& dogs\\b\'" '
+           'matches only videos that are not live OR those that have a like count of more than 100 '
+           '(or the like field is not available) and also have a description '
            'that contains the phrase "cats & dogs" (ignoring case)'))
    selection.add_option(
        '--no-match-filter',
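Since the option is now `action='append'`, repeated `--match-filters` are OR-ed together while `&` inside a single filter stays AND. The same semantics are reachable from the Python API through `match_filter_func` (shown further down in the `utils.py` hunks); a small sketch, assuming a yt-dlp build that includes this commit:

```python
from yt_dlp.utils import match_filter_func

# two filters: OR between them, '&' (AND) within the second one
match_filter = match_filter_func([
    '!is_live',
    "like_count>?100 & description~='(?i)\\bcats \\& dogs\\b'",
])

info = {'id': 'x', 'title': 'clip', 'is_live': True,
        'like_count': 5, 'description': 'no pets here'}
# neither filter passes, so a human-readable skip reason is returned;
# None would mean "download it"
print(match_filter(info))
```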
@@ -1312,7 +1311,7 @@ def create_parser():
    postproc.add_option(
        '--audio-quality', metavar='QUALITY',
        dest='audioquality', default='5',
-       help='Specify ffmpeg audio quality, insert a value between 0 (best) and 10 (worst) for VBR or a specific bitrate like 128K (default %default)')
+       help='Specify ffmpeg audio quality to use when converting the audio with -x. Insert a value between 0 (best) and 10 (worst) for VBR or a specific bitrate like 128K (default %default)')
    postproc.add_option(
        '--remux-video',
        metavar='FORMAT', dest='remuxvideo', default=None,
diff --git a/yt_dlp/postprocessor/common.py b/yt_dlp/postprocessor/common.py
index d761c9303..8420ee864 100644
--- a/yt_dlp/postprocessor/common.py
+++ b/yt_dlp/postprocessor/common.py
@@ -1,13 +1,18 @@
from __future__ import unicode_literals

import functools
+import itertools
+import json
import os
+import time
+import urllib.error

-from ..compat import compat_str
from ..utils import (
    _configuration_args,
    encodeFilename,
+   network_exceptions,
    PostProcessingError,
+   sanitized_Request,
    write_string,
)

@@ -63,7 +68,7 @@ class PostProcessor(metaclass=PostProcessorMetaClass):
    @classmethod
    def pp_key(cls):
        name = cls.__name__[:-2]
-       return compat_str(name[6:]) if name[:6].lower() == 'ffmpeg' else name
+       return name[6:] if name[:6].lower() == 'ffmpeg' else name

    def to_screen(self, text, prefix=True, *args, **kwargs):
        tag = '[%s] ' % self.PP_NAME if prefix else ''
@@ -180,6 +185,28 @@ class PostProcessor(metaclass=PostProcessorMetaClass):
                progress_template.get('postprocess-title') or 'yt-dlp %(progress._default_template)s',
                progress_dict))

+   def _download_json(self, url, *, expected_http_errors=(404,)):
+       # While this is not an extractor, it behaves similar to one and
+       # so obey extractor_retries and sleep_interval_requests
+       max_retries = self.get_param('extractor_retries', 3)
+       sleep_interval = self.get_param('sleep_interval_requests') or 0
+
+       self.write_debug(f'{self.PP_NAME} query: {url}')
+       for retries in itertools.count():
+           try:
+               rsp = self._downloader.urlopen(sanitized_Request(url))
+               return json.loads(rsp.read().decode(rsp.info().get_param('charset') or 'utf-8'))
+           except network_exceptions as e:
+               if isinstance(e, urllib.error.HTTPError) and e.code in expected_http_errors:
+                   return None
+               if retries < max_retries:
+                   self.report_warning(f'{e}. Retrying...')
+                   if sleep_interval > 0:
+                       self.to_screen(f'Sleeping {sleep_interval} seconds ...')
+                       time.sleep(sleep_interval)
+                   continue
+               raise PostProcessingError(f'Unable to communicate with {self.PP_NAME} API: {e}')
+

class AudioConversionError(PostProcessingError):
    pass
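With the retry loop promoted from SponsorBlockPP into the base class, any postprocessor can fetch JSON with the same `--extractor-retries` / `--sleep-requests` behaviour. A hypothetical postprocessor using it (the class and URL are illustrative, not part of yt-dlp):

```python
from yt_dlp.postprocessor.common import PostProcessor

class ExampleApiPP(PostProcessor):  # hypothetical, for illustration only
    def run(self, info):
        # obeys --extractor-retries and --sleep-requests; returns None on HTTP 404
        data = self._download_json(f'https://api.example.com/videos/{info["id"]}')
        if data:
            info['example_rating'] = data.get('rating')
        return [], info  # (files to delete, updated info dict)
```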
diff --git a/yt_dlp/postprocessor/ffmpeg.py b/yt_dlp/postprocessor/ffmpeg.py
index 234ddeff0..0b18e8774 100644
--- a/yt_dlp/postprocessor/ffmpeg.py
+++ b/yt_dlp/postprocessor/ffmpeg.py
@@ -86,13 +86,18 @@ class FFmpegPostProcessor(PostProcessor):

    @staticmethod
    def get_versions(downloader=None):
-       return FFmpegPostProcessor.get_version_and_features(downloader)[0]
+       return FFmpegPostProcessor.get_versions_and_features(downloader)[0]
+
+   _version_cache, _features_cache = {}, {}

    def _determine_executables(self):
        programs = ['avprobe', 'avconv', 'ffmpeg', 'ffprobe']

        def get_ffmpeg_version(path, prog):
-           out = _get_exe_version_output(path, ['-bsfs'])
+           if path in self._version_cache:
+               self._versions[path], self._features = self._version_cache[path], self._features_cache.get(path, {})
+               return
+           out = _get_exe_version_output(path, ['-bsfs'], to_screen=self.write_debug)
            ver = detect_exe_version(out) if out else False
            if ver:
                regexs = [
@@ -104,13 +109,13 @@ class FFmpegPostProcessor(PostProcessor):
                    mobj = re.match(regex, ver)
                    if mobj:
                        ver = mobj.group(1)
-               self._versions[prog] = ver
+               self._versions[prog] = self._version_cache[path] = ver
                if prog != 'ffmpeg' or not out:
                    return

                mobj = re.search(r'(?m)^\s+libavformat\s+(?:[0-9. ]+)\s+/\s+(?P<runtime>[0-9. ]+)', out)
                lavf_runtime_version = mobj.group('runtime').replace(' ', '') if mobj else None
-               self._features = {
+               self._features = self._features_cache[path] = {
                    'fdk': '--enable-libfdk-aac' in out,
                    'setts': 'setts' in out.splitlines(),
                    'needs_adtstoasc': is_outdated_version(lavf_runtime_version, '57.56.100', False),
@@ -148,26 +153,15 @@ class FFmpegPostProcessor(PostProcessor):
                self._paths[basename] = location

        self._versions = {}
-       for p in programs:
-           get_ffmpeg_version(self._paths[p], p)
-
+       executables = {'basename': ('ffmpeg', 'avconv'), 'probe_basename': ('ffprobe', 'avprobe')}
        if prefer_ffmpeg is False:
-           prefs = ('avconv', 'ffmpeg')
-       else:
-           prefs = ('ffmpeg', 'avconv')
-       for p in prefs:
-           if self._versions[p]:
-               self.basename = p
-               break
-
-       if prefer_ffmpeg is False:
-           prefs = ('avprobe', 'ffprobe')
-       else:
-           prefs = ('ffprobe', 'avprobe')
-       for p in prefs:
-           if self._versions[p]:
-               self.probe_basename = p
-               break
+           executables = {k: v[::-1] for k, v in executables.items()}
+       for var, prefs in executables.items():
+           for p in prefs:
+               get_ffmpeg_version(self._paths[p], p)
+               if self._versions[p]:
+                   setattr(self, var, p)
+                   break

        if self.basename == 'avconv':
            self.deprecation_warning(
@@ -553,9 +547,9 @@ class FFmpegVideoConvertorPP(FFmpegPostProcessor):

    @staticmethod
    def _options(target_ext):
+       yield from FFmpegPostProcessor.stream_copy_opts(False)
        if target_ext == 'avi':
-           return ['-c:v', 'libxvid', '-vtag', 'XVID']
-           return []
+           yield from ('-c:v', 'libxvid', '-vtag', 'XVID')

    @PostProcessor._restrict_to(images=False)
    def run(self, info):
@@ -1129,6 +1123,8 @@ class FFmpegConcatPP(FFmpegPostProcessor):
        super().__init__(downloader)

    def concat_files(self, in_files, out_file):
+       if not self._downloader._ensure_dir_exists(out_file):
+           return
        if len(in_files) == 1:
            if os.path.realpath(in_files[0]) != os.path.realpath(out_file):
                self.to_screen(f'Moving "{in_files[0]}" to "{out_file}"')
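The new class-level `_version_cache` / `_features_cache` dicts memoize the (slow) `ffmpeg -bsfs` probe per executable path, so constructing several postprocessors no longer re-spawns ffmpeg each time. The same idea in isolation, as a hypothetical helper rather than the code above:

```python
import functools
import subprocess

@functools.lru_cache(maxsize=None)
def probe_version(path):
    # spawned at most once per distinct path; repeat calls hit the cache
    out = subprocess.run([path, '-version'], capture_output=True, text=True).stdout
    return out.split()[2] if out else None  # 'ffmpeg version N.N ...' -> 'N.N'

probe_version('ffmpeg')  # runs the executable
probe_version('ffmpeg')  # served from the cache
```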
diff --git a/yt_dlp/postprocessor/sponsorblock.py b/yt_dlp/postprocessor/sponsorblock.py
index e7e04e86e..7943014e2 100644
--- a/yt_dlp/postprocessor/sponsorblock.py
+++ b/yt_dlp/postprocessor/sponsorblock.py
@@ -1,12 +1,9 @@
from hashlib import sha256
-import itertools
import json
import re
-import time

from .ffmpeg import FFmpegPostProcessor
-from ..compat import compat_urllib_parse_urlencode, compat_HTTPError
-from ..utils import PostProcessingError, network_exceptions, sanitized_Request
+from ..compat import compat_urllib_parse_urlencode


class SponsorBlockPP(FFmpegPostProcessor):
@@ -94,28 +91,7 @@ class SponsorBlockPP(FFmpegPostProcessor):
            'categories': json.dumps(self._categories),
            'actionTypes': json.dumps(['skip', 'poi'])
        })
-       self.write_debug(f'SponsorBlock query: {url}')
-       for d in self._get_json(url):
+       for d in self._download_json(url) or []:
            if d['videoID'] == video_id:
                return d['segments']
        return []
-
-   def _get_json(self, url):
-       # While this is not an extractor, it behaves similar to one and
-       # so obey extractor_retries and sleep_interval_requests
-       max_retries = self.get_param('extractor_retries', 3)
-       sleep_interval = self.get_param('sleep_interval_requests') or 0
-       for retries in itertools.count():
-           try:
-               rsp = self._downloader.urlopen(sanitized_Request(url))
-               return json.loads(rsp.read().decode(rsp.info().get_param('charset') or 'utf-8'))
-           except network_exceptions as e:
-               if isinstance(e, compat_HTTPError) and e.code == 404:
-                   return []
-               if retries < max_retries:
-                   self.report_warning(f'{e}. Retrying...')
-                   if sleep_interval > 0:
-                       self.to_screen(f'Sleeping {sleep_interval} seconds ...')
-                       time.sleep(sleep_interval)
-                   continue
-               raise PostProcessingError(f'Unable to communicate with SponsorBlock API: {e}')
diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py
index da6f27801..62a1800d4 100644
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@@ -85,6 +85,12 @@ from .socks import (
    sockssocket,
)

+try:
+    import certifi
+    has_certifi = True
+except ImportError:
+    has_certifi = False
+

def register_socks_protocols():
    # "Register" SOCKS protocols
@@ -153,7 +159,6 @@ if compat_brotli:
std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-   'Accept-Encoding': ', '.join(SUPPORTED_ENCODINGS),
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}
@@ -700,36 +705,40 @@ def timeconvert(timestr):
    return timestamp


-def sanitize_filename(s, restricted=False, is_id=False):
+def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
-   If restricted is set, use a stricter subset of allowed characters.
-   Set is_id if this is not an arbitrary string, but an ID that should be kept
-   if possible.
+   @param restricted   Use a stricter subset of allowed characters
+   @param is_id        Whether this is an ID that should be kept unchanged if possible.
+                       If unset, yt-dlp's new sanitization rules are in effect
    """
+   if s == '':
+       return ''
+
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
-           return ' '
+           return '\0 '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
-           return '_-' if restricted else ' -'
+           return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
-           return '_'
-       if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
-           return '_'
-       if restricted and ord(char) > 127:
-           return '_'
+           return '\0_'
+       if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
+           return '\0_'
        return char

-   if s == '':
-       return ''
-   # Handle timestamps
-   s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
+   s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
+   if is_id is NO_DEFAULT:
+       result = re.sub('(\0.)(?:(?=\\1)..)+', r'\1', result)  # Remove repeated substitute chars
+       STRIP_RE = '(?:\0.|[ _-])*'
+       result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
+   result = result.replace('\0', '') or '_'
+
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
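The rewritten `sanitize_filename` tags every substituted character with a NUL sentinel so a second pass can collapse runs of identical substitutes and trim them from the ends before the sentinels are dropped. The post-processing pass in isolation, using the same regexes as the hunk above:

```python
import re

def collapse_substitutes(result):
    """Second pass over a string where each substituted char was emitted as
    '\\0' + replacement (the trick used by the new sanitize_filename)."""
    result = re.sub('(\0.)(?:(?=\\1)..)+', r'\1', result)  # collapse repeated substitutes
    strip_re = '(?:\0.|[ _-])*'
    result = re.sub(f'^\0.{strip_re}|{strip_re}\0.$', '', result)  # trim from the ends
    return result.replace('\0', '') or '_'

# '<', '>' and '*' each became '\0_' during replacement:
print(collapse_substitutes('my\0_\0_\0_file'))  # 'my<>*file' -> 'my_file', not 'my___file'
print(collapse_substitutes('name\0_'))          # a trailing substitute is dropped: 'name'
```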
@@ -1010,20 +1019,23 @@ def make_HTTPS_handler(params, **kwargs):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
-       try:
-           context.load_default_certs()
-           # Work around the issue in load_default_certs when there are bad certificates. See:
-           # https://github.com/yt-dlp/yt-dlp/issues/1060,
-           # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
-       except ssl.SSLError:
-           # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
-           if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
-               # Create a new context to discard any certificates that were already loaded
-               context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
-               context.check_hostname, context.verify_mode = True, ssl.CERT_REQUIRED
-               for storename in ('CA', 'ROOT'):
-                   _ssl_load_windows_store_certs(context, storename)
-           context.set_default_verify_paths()
+       if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
+           context.load_verify_locations(cafile=certifi.where())
+       else:
+           try:
+               context.load_default_certs()
+               # Work around the issue in load_default_certs when there are bad certificates. See:
+               # https://github.com/yt-dlp/yt-dlp/issues/1060,
+               # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
+           except ssl.SSLError:
+               # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
+               if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
+                   # Create a new context to discard any certificates that were already loaded
+                   context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
+                   context.check_hostname, context.verify_mode = True, ssl.CERT_REQUIRED
+                   for storename in ('CA', 'ROOT'):
+                       _ssl_load_windows_store_certs(context, storename)
+               context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
@@ -1392,6 +1404,9 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

+       if 'Accept-encoding' not in req.headers:
+           req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))
+
        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
@@ -2629,23 +2644,23 @@ def parse_duration(s):
    m = re.match(
        r'''(?ix)(?:P?
            (?:
-               [0-9]+\s*y(?:ears?)?\s*
+               [0-9]+\s*y(?:ears?)?,?\s*
            )?
            (?:
-               [0-9]+\s*m(?:onths?)?\s*
+               [0-9]+\s*m(?:onths?)?,?\s*
            )?
            (?:
-               [0-9]+\s*w(?:eeks?)?\s*
+               [0-9]+\s*w(?:eeks?)?,?\s*
            )?
            (?:
-               (?P<days>[0-9]+)\s*d(?:ays?)?\s*
+               (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
            )?
            T)?
            (?:
-               (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
+               (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
            )?
            (?:
-               (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
+               (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
            )?
            (?:
                (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
@@ -2698,7 +2713,9 @@ def check_executable(exe, args=[]):
    return exe


-def _get_exe_version_output(exe, args):
+def _get_exe_version_output(exe, args, *, to_screen=None):
+    if to_screen:
+        to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
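The only grammatical change to `parse_duration` is the optional `,?` after each unit, which lets spoken-style durations through; for instance (values matching the updated tests):

```python
from yt_dlp.utils import parse_duration

assert parse_duration('3 hours, 11 mins, 53 secs') == 11513  # now accepted
assert parse_duration('3h 11m 53s') == 11513                 # unchanged behaviour
```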
@@ -3090,16 +3107,16 @@ def try_get(src, getter, expected_type=None):
            return v


+def filter_dict(dct, cndn=lambda _, v: v is not None):
+    return {k: v for k, v in dct.items() if cndn(k, v)}
+
+
def merge_dicts(*dicts):
    merged = {}
    for a_dict in dicts:
        for k, v in a_dict.items():
-           if v is None:
-               continue
-           if (k not in merged
-                   or (isinstance(v, compat_str) and v
-                       and isinstance(merged[k], compat_str)
-                       and not merged[k])):
+           if (v is not None and k not in merged
+                   or isinstance(v, str) and merged[k] == ''):
                merged[k] = v
    return merged
@@ -3534,6 +3551,11 @@ def _match_one(filter_part, dct, incomplete):
        '=': operator.eq,
    }

+   if isinstance(incomplete, bool):
+       is_incomplete = lambda _: incomplete
+   else:
+       is_incomplete = lambda k: k in incomplete
+
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
@@ -3572,7 +3594,7 @@ def _match_one(filter_part, dct, incomplete):
        if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
            raise ValueError('Operator %s only supports string values!' % m['op'])
        if actual_value is None:
-           return incomplete or m['none_inclusive']
+           return is_incomplete(m['key']) or m['none_inclusive']
        return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)

    UNARY_OPERATORS = {
@@ -3587,7 +3609,7 @@ def _match_one(filter_part, dct, incomplete):
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
-       if incomplete and actual_value is None:
+       if is_incomplete(m.group('key')) and actual_value is None:
            return True
        return op(actual_value)

@@ -3595,24 +3617,29 @@ def _match_one(filter_part, dct, incomplete):


def match_str(filter_str, dct, incomplete=False):
-   """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false
-   When incomplete, all conditions passes on missing fields
+   """ Filter a dictionary with a simple string syntax.
+   @returns           Whether the filter passes
+   @param incomplete  Set of keys that are expected to be missing from dct.
+                      Can be True/False to indicate all/none of the keys may be missing.
+                      All conditions on incomplete keys pass if the key is missing
    """
    return all(
        _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
        for filter_part in re.split(r'(?<!\\)&', filter_str))


-def match_filter_func(filter_str):
-   if filter_str is None:
+def match_filter_func(filters):
+   if not filters:
        return None
+   filters = variadic(filters)

    def _match_func(info_dict, *args, **kwargs):
-       if match_str(filter_str, info_dict, *args, **kwargs):
+       if any(match_str(f, info_dict, *args, **kwargs) for f in filters):
            return None
        else:
-           video_title = info_dict.get('title', info_dict.get('id', 'video'))
-           return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
+           video_title = info_dict.get('title') or info_dict.get('id') or 'video'
+           filter_str = ') | ('.join(map(str.strip, filters))
+           return f'{video_title} does not pass filter ({filter_str}), skipping ..'
    return _match_func
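`filter_dict` and the simplified `merge_dicts` are easy to sanity-check in a REPL; per the code above, `merge_dicts` keeps the first non-None value but lets a later non-empty string replace an earlier empty one:

```python
from yt_dlp.utils import filter_dict, merge_dicts

print(filter_dict({'id': '1', 'title': None, 'age_limit': 0}))
# {'id': '1', 'age_limit': 0} -- None values dropped, falsy non-None values kept

print(merge_dicts({'uploader': ''}, {'uploader': 'someone'}))
# {'uploader': 'someone'} -- the non-empty string wins over the empty one
```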
@@ -5423,15 +5450,18 @@ class Config:

class WebSocketsWrapper():
    """Wraps websockets module to use in non-async scopes"""

-   def __init__(self, url, headers=None):
+   def __init__(self, url, headers=None, connect=True):
        self.loop = asyncio.events.new_event_loop()
        self.conn = compat_websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
+       if connect:
+           self.__enter__()
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
-       self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
+       if not self.pool:
+           self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
@@ -5491,3 +5521,11 @@ has_websockets = bool(compat_websockets)
def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
    return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
+
+
+class classproperty:
+    def __init__(self, f):
+        self.f = f
+
+    def __get__(self, _, cls):
+        return self.f(cls)
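The new `classproperty` is a minimal descriptor: the wrapped function receives the class, so computed constants can be read off the class itself without instantiation. Typical usage (the extractor class here is illustrative, not from the codebase):

```python
class classproperty:  # as defined in the hunk above
    def __init__(self, f):
        self.f = f

    def __get__(self, _, cls):
        return self.f(cls)

class Extractor:
    _VALID_URL_TMPL = r'https?://example\.com/%s'

    @classproperty
    def VALID_URL(cls):
        # computed from the class; no instance needed
        return cls._VALID_URL_TMPL % r'(?P<id>\d+)'

print(Extractor.VALID_URL)    # works on the class...
print(Extractor().VALID_URL)  # ...and on instances
```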