-rw-r--r--  .editorconfig                     |   8
-rw-r--r--  yt_dlp/YoutubeDL.py               |  83
-rw-r--r--  yt_dlp/__init__.py                |   2
-rw-r--r--  yt_dlp/cookies.py                 |  11
-rw-r--r--  yt_dlp/downloader/fragment.py     |  24
-rw-r--r--  yt_dlp/downloader/http.py         |  26
-rw-r--r--  yt_dlp/extractor/adobeconnect.py  |   2
-rw-r--r--  yt_dlp/extractor/allocine.py      |   6
-rw-r--r--  yt_dlp/extractor/archiveorg.py    |   9
-rw-r--r--  yt_dlp/extractor/asiancrush.py    |   3
-rw-r--r--  yt_dlp/extractor/azmedien.py      |   7
-rw-r--r--  yt_dlp/extractor/bbc.py           |   5
-rw-r--r--  yt_dlp/extractor/bilibili.py      |   6
-rw-r--r--  yt_dlp/extractor/br.py            |   5
-rw-r--r--  yt_dlp/extractor/breitbart.py     |   5
-rw-r--r--  yt_dlp/extractor/callin.py        |   2
-rw-r--r--  yt_dlp/extractor/canvas.py        |   8
-rw-r--r--  yt_dlp/extractor/cbc.py           |   6
-rw-r--r--  yt_dlp/extractor/closertotruth.py |   3
-rw-r--r--  yt_dlp/extractor/common.py        |  22
-rw-r--r--  yt_dlp/extractor/craftsy.py       |  71
-rw-r--r--  yt_dlp/extractor/crunchyroll.py   | 204
-rw-r--r--  yt_dlp/extractor/cspan.py         |   2
-rw-r--r--  yt_dlp/extractor/cybrary.py       | 146
-rw-r--r--  yt_dlp/extractor/dailymotion.py   |  23
-rw-r--r--  yt_dlp/extractor/dropout.py       |   4
-rw-r--r--  yt_dlp/extractor/elonet.py        |  85
-rw-r--r--  yt_dlp/extractor/extractors.py    |   5
-rw-r--r--  yt_dlp/extractor/facebook.py      |   6
-rw-r--r--  yt_dlp/extractor/fivetv.py        |   3
-rw-r--r--  yt_dlp/extractor/foxgay.py        |   3
-rw-r--r--  yt_dlp/extractor/funimation.py    |   2
-rw-r--r--  yt_dlp/extractor/generic.py       |  36
-rw-r--r--  yt_dlp/extractor/glide.py         |   4
-rw-r--r--  yt_dlp/extractor/hellporno.py     |   3
-rw-r--r--  yt_dlp/extractor/huya.py          |   3
-rw-r--r--  yt_dlp/extractor/imdb.py          |  64
-rw-r--r--  yt_dlp/extractor/infoq.py         |   2
-rw-r--r--  yt_dlp/extractor/iqiyi.py         | 377
-rw-r--r--  yt_dlp/extractor/iwara.py         |   3
-rw-r--r--  yt_dlp/extractor/limelight.py     |   2
-rw-r--r--  yt_dlp/extractor/linkedin.py      |   2
-rw-r--r--  yt_dlp/extractor/mediasite.py     |  11
-rw-r--r--  yt_dlp/extractor/miaopai.py       |   3
-rw-r--r--  yt_dlp/extractor/mojvideo.py      |   3
-rw-r--r--  yt_dlp/extractor/nebula.py        |   2
-rw-r--r--  yt_dlp/extractor/newgrounds.py    |   6
-rw-r--r--  yt_dlp/extractor/nhk.py           |   4
-rw-r--r--  yt_dlp/extractor/niconico.py      |  20
-rw-r--r--  yt_dlp/extractor/openrec.py       |  30
-rw-r--r--  yt_dlp/extractor/playvid.py       |   3
-rw-r--r--  yt_dlp/extractor/rai.py           |   2
-rw-r--r--  yt_dlp/extractor/rule34video.py   |   2
-rw-r--r--  yt_dlp/extractor/ruutu.py         |  15
-rw-r--r--  yt_dlp/extractor/senategov.py     |   2
-rw-r--r--  yt_dlp/extractor/sunporno.py      |   3
-rw-r--r--  yt_dlp/extractor/tenplay.py       |  42
-rw-r--r--  yt_dlp/extractor/thisav.py        |   4
-rw-r--r--  yt_dlp/extractor/tiktok.py        |  44
-rw-r--r--  yt_dlp/extractor/traileraddict.py |   3
-rw-r--r--  yt_dlp/extractor/varzesh3.py      |   3
-rw-r--r--  yt_dlp/extractor/viu.py           |   9
-rw-r--r--  yt_dlp/extractor/vrv.py           |  52
-rw-r--r--  yt_dlp/extractor/vshare.py        |   3
-rw-r--r--  yt_dlp/extractor/vupload.py       |   2
-rw-r--r--  yt_dlp/extractor/weibo.py         |   3
-rw-r--r--  yt_dlp/extractor/whowatch.py      |   9
-rw-r--r--  yt_dlp/extractor/xnxx.py          |   5
-rw-r--r--  yt_dlp/extractor/yahoo.py         |   2
-rw-r--r--  yt_dlp/extractor/yandexvideo.py   |   1
-rw-r--r--  yt_dlp/extractor/youjizz.py       |   3
-rw-r--r--  yt_dlp/extractor/youtube.py       |  42
-rw-r--r--  yt_dlp/options.py                 |  14
-rw-r--r--  yt_dlp/postprocessor/ffmpeg.py    |  15
-rw-r--r--  yt_dlp/utils.py                   |  48
75 files changed, 1277 insertions(+), 426 deletions(-)
diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 000000000..40c19fa66
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,8 @@
+root = true
+
+[**.py]
+charset = utf-8
+indent_size = 4
+indent_style = space
+trim_trailing_whitespace = true
+insert_final_newline = true
diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py
index e57716e00..6d8018690 100644
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@@ -516,7 +516,7 @@ class YoutubeDL(object):
_format_fields = {
# NB: Keep in sync with the docstring of extractor/common.py
- 'url', 'manifest_url', 'ext', 'format', 'format_id', 'format_note',
+ 'url', 'manifest_url', 'manifest_stream_number', 'ext', 'format', 'format_id', 'format_note',
'width', 'height', 'resolution', 'dynamic_range', 'tbr', 'abr', 'acodec', 'asr',
'vbr', 'fps', 'vcodec', 'container', 'filesize', 'filesize_approx',
'player_url', 'protocol', 'fragment_base_url', 'fragments', 'is_from_start',
@@ -937,7 +937,7 @@ class YoutubeDL(object):
def deprecation_warning(self, message):
if self.params.get('logger') is not None:
- self.params['logger'].warning('DeprecationWarning: {message}')
+ self.params['logger'].warning(f'DeprecationWarning: {message}')
else:
self.to_stderr(f'{self._format_err("DeprecationWarning:", self.Styles.ERROR)} {message}', True)
@@ -1239,18 +1239,21 @@ class YoutubeDL(object):
outtmpl, info_dict = self.prepare_outtmpl(outtmpl, info_dict, *args, **kwargs)
return self.escape_outtmpl(outtmpl) % info_dict
- def _prepare_filename(self, info_dict, tmpl_type='default'):
+ def _prepare_filename(self, info_dict, *, outtmpl=None, tmpl_type=None):
+ assert None in (outtmpl, tmpl_type), 'outtmpl and tmpl_type are mutually exclusive'
+ if outtmpl is None:
+ outtmpl = self.outtmpl_dict.get(tmpl_type or 'default', self.outtmpl_dict['default'])
try:
- outtmpl = self._outtmpl_expandpath(self.outtmpl_dict.get(tmpl_type, self.outtmpl_dict['default']))
+ outtmpl = self._outtmpl_expandpath(outtmpl)
filename = self.evaluate_outtmpl(outtmpl, info_dict, True)
if not filename:
return None
- if tmpl_type in ('default', 'temp'):
+ if tmpl_type in ('', 'temp'):
final_ext, ext = self.params.get('final_ext'), info_dict.get('ext')
if final_ext and ext and final_ext != ext and filename.endswith(f'.{final_ext}'):
filename = replace_extension(filename, ext, final_ext)
- else:
+ elif tmpl_type:
force_ext = OUTTMPL_TYPES[tmpl_type]
if force_ext:
filename = replace_extension(filename, force_ext, info_dict.get('ext'))
@@ -1266,10 +1269,12 @@ class YoutubeDL(object):
self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
return None
- def prepare_filename(self, info_dict, dir_type='', warn=False):
- """Generate the output filename."""
-
- filename = self._prepare_filename(info_dict, dir_type or 'default')
+ def prepare_filename(self, info_dict, dir_type='', *, outtmpl=None, warn=False):
+ """Generate the output filename"""
+ if outtmpl:
+ assert not dir_type, 'outtmpl and dir_type are mutually exclusive'
+ dir_type = None
+ filename = self._prepare_filename(info_dict, tmpl_type=dir_type, outtmpl=outtmpl)
if not filename and dir_type not in ('', 'temp'):
return ''
@@ -2182,7 +2187,7 @@ class YoutubeDL(object):
yield merged_format
else:
- format_fallback, format_reverse, format_idx = False, True, 1
+            format_fallback, separate_fallback, format_reverse, format_idx = False, None, True, 1
mobj = re.match(
r'(?P<bw>best|worst|b|w)(?P<type>video|audio|v|a)?(?P<mod>\*)?(?:\.(?P<n>[1-9]\d*))?$',
format_spec)
@@ -2209,6 +2214,7 @@ class YoutubeDL(object):
filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') != 'none'
elif format_spec in self._format_selection_exts['video']:
filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') != 'none' and f.get('vcodec') != 'none'
+                        separate_fallback = lambda f: f.get('ext') == format_spec and f.get('vcodec') != 'none'
elif format_spec in self._format_selection_exts['storyboards']:
filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') == 'none' and f.get('vcodec') == 'none'
else:
@@ -2217,11 +2223,15 @@ class YoutubeDL(object):
def selector_function(ctx):
formats = list(ctx['formats'])
matches = list(filter(filter_f, formats)) if filter_f is not None else formats
- if format_fallback and ctx['incomplete_formats'] and not matches:
- # for extractors with incomplete formats (audio only (soundcloud)
- # or video only (imgur)) best/worst will fallback to
- # best/worst {video,audio}-only format
- matches = formats
+            if not matches:
+                if format_fallback and ctx['incomplete_formats']:
+                    # for extractors with incomplete formats (audio only (soundcloud)
+                    # or video only (imgur)), best/worst will fall back to the
+                    # best/worst {video,audio}-only format
+                    matches = formats
+                elif separate_fallback and not ctx['has_merged_format']:
+                    # for compatibility with youtube-dl when there is no pre-merged format
+                    matches = list(filter(separate_fallback, formats))
matches = LazyList(_check_formats(matches[::-1 if format_reverse else 1]))
try:
yield matches[format_idx - 1]
@@ -2467,8 +2477,9 @@ class YoutubeDL(object):
if info_dict.get('is_live') and formats:
formats = [f for f in formats if bool(f.get('is_from_start')) == get_from_start]
if get_from_start and not formats:
- self.raise_no_formats(info_dict, msg='--live-from-start is passed, but there are no formats that can be downloaded from the start. '
- 'If you want to download from the current time, pass --no-live-from-start')
+ self.raise_no_formats(info_dict, msg=(
+ '--live-from-start is passed, but there are no formats that can be downloaded from the start. '
+ 'If you want to download from the current time, use --no-live-from-start'))
if not formats:
self.raise_no_formats(info_dict)
@@ -2598,33 +2609,15 @@ class YoutubeDL(object):
self.report_error(err, tb=False, is_error=False)
continue
- # While in format selection we may need to have an access to the original
- # format set in order to calculate some metrics or do some processing.
- # For now we need to be able to guess whether original formats provided
- # by extractor are incomplete or not (i.e. whether extractor provides only
- # video-only or audio-only formats) for proper formats selection for
- # extractors with such incomplete formats (see
- # https://github.com/ytdl-org/youtube-dl/pull/5556).
- # Since formats may be filtered during format selection and may not match
- # the original formats the results may be incorrect. Thus original formats
- # or pre-calculated metrics should be passed to format selection routines
- # as well.
- # We will pass a context object containing all necessary additional data
- # instead of just formats.
- # This fixes incorrect format selection issue (see
- # https://github.com/ytdl-org/youtube-dl/issues/10083).
- incomplete_formats = (
- # All formats are video-only or
- all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats)
- # all formats are audio-only
- or all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats))
-
- ctx = {
+ formats_to_download = list(format_selector({
'formats': formats,
- 'incomplete_formats': incomplete_formats,
- }
-
- formats_to_download = list(format_selector(ctx))
+ 'has_merged_format': any('none' not in (f.get('acodec'), f.get('vcodec')) for f in formats),
+ 'incomplete_formats': (
+ # All formats are video-only or
+ all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats)
+ # all formats are audio-only
+ or all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats)),
+ }))
if interactive_format_selection and not formats_to_download:
self.report_error('Requested format is not available', tb=False, is_error=False)
continue
@@ -2766,7 +2759,7 @@ class YoutubeDL(object):
self.to_stdout(self.evaluate_outtmpl(format_tmpl(tmpl), info_copy))
for tmpl, file_tmpl in self.params['print_to_file'].get(key, []):
- filename = self.evaluate_outtmpl(file_tmpl, info_dict)
+ filename = self.prepare_filename(info_dict, outtmpl=file_tmpl)
tmpl = format_tmpl(tmpl)
self.to_screen(f'[info] Writing {tmpl!r} to: {filename}')
if self._ensure_dir_exists(filename):
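The separate_fallback added above only fires for a bare extension selector like "-f mp4" when nothing pre-merged matches and the page exposes no pre-merged format at all. A minimal sketch of that decision with simplified format dicts (select_mp4 and the sample dicts are illustrative, not the selector's internals):

def select_mp4(formats):
    # "-f mp4": prefer a pre-merged mp4 (both codecs present)
    pre_merged = [f for f in formats
                  if f['ext'] == 'mp4' and 'none' not in (f['acodec'], f['vcodec'])]
    if pre_merged:
        return pre_merged[-1]
    # youtube-dl compatibility: no pre-merged format on the whole page,
    # so accept a video-only mp4 rather than failing
    if all('none' in (f['acodec'], f['vcodec']) for f in formats):
        video_only = [f for f in formats if f['ext'] == 'mp4' and f['vcodec'] != 'none']
        if video_only:
            return video_only[-1]

print(select_mp4([
    {'ext': 'mp4', 'acodec': 'none', 'vcodec': 'avc1'},  # video-only
    {'ext': 'm4a', 'acodec': 'mp4a', 'vcodec': 'none'},  # audio-only
]))  # picks the video-only mp4, as youtube-dl would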
diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py
index 6d5a64336..0599af92c 100644
--- a/yt_dlp/__init__.py
+++ b/yt_dlp/__init__.py
@@ -378,7 +378,7 @@ def validate_options(opts):
'To let yt-dlp download and merge the best available formats, simply do not pass any format selection',
'If you know what you are doing and want only the best pre-merged format, use "-f b" instead to suppress this warning')))
- # --(post-processor/downloader)-args without name
+ # --(postprocessor/downloader)-args without name
def report_args_compat(name, value, key1, key2=None):
if key1 in value and key2 not in value:
warnings.append(f'{name} arguments given without specifying name. The arguments will be given to all {name}s')
diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py
index 7265cad81..1f08a3664 100644
--- a/yt_dlp/cookies.py
+++ b/yt_dlp/cookies.py
@@ -21,6 +21,7 @@ from .compat import (
compat_cookiejar_Cookie,
)
from .utils import (
+ error_to_str,
expand_path,
Popen,
YoutubeDLCookieJar,
@@ -721,7 +722,7 @@ def _get_kwallet_network_wallet(logger):
network_wallet = stdout.decode('utf-8').strip()
logger.debug('NetworkWallet = "{}"'.format(network_wallet))
return network_wallet
- except BaseException as e:
+ except Exception as e:
logger.warning('exception while obtaining NetworkWallet: {}'.format(e))
return default_wallet
@@ -766,8 +767,8 @@ def _get_kwallet_password(browser_keyring_name, logger):
if stdout[-1:] == b'\n':
stdout = stdout[:-1]
return stdout
- except BaseException as e:
- logger.warning(f'exception running kwallet-query: {type(e).__name__}({e})')
+ except Exception as e:
+ logger.warning(f'exception running kwallet-query: {error_to_str(e)}')
return b''
@@ -823,8 +824,8 @@ def _get_mac_keyring_password(browser_keyring_name, logger):
if stdout[-1:] == b'\n':
stdout = stdout[:-1]
return stdout
- except BaseException as e:
- logger.warning(f'exception running find-generic-password: {type(e).__name__}({e})')
+ except Exception as e:
+ logger.warning(f'exception running find-generic-password: {error_to_str(e)}')
return None
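error_to_str gives these warnings a uniform "Name: message" rendering in place of the old f'{type(e).__name__}({e})'. A stand-in consistent with how it is used here (the actual utils helper may differ in detail):

def error_to_str(err):
    return f'{type(err).__name__}: {err}'

try:
    raise FileNotFoundError('kwallet-query is not installed')
except Exception as e:
    print(f'exception running kwallet-query: {error_to_str(e)}')
    # exception running kwallet-query: FileNotFoundError: kwallet-query is not installed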
diff --git a/yt_dlp/downloader/fragment.py b/yt_dlp/downloader/fragment.py
index 6b75dfc62..c45a8a476 100644
--- a/yt_dlp/downloader/fragment.py
+++ b/yt_dlp/downloader/fragment.py
@@ -403,7 +403,7 @@ class FragmentFD(FileDownloader):
pass
if compat_os_name == 'nt':
- def bindoj_result(future):
+ def future_result(future):
while True:
try:
return future.result(0.1)
@@ -412,7 +412,7 @@ class FragmentFD(FileDownloader):
except concurrent.futures.TimeoutError:
continue
else:
- def bindoj_result(future):
+ def future_result(future):
return future.result()
def interrupt_trigger_iter(fg):
@@ -430,7 +430,7 @@ class FragmentFD(FileDownloader):
result = True
for tpe, job in spins:
try:
- result = result and bindoj_result(job)
+ result = result and future_result(job)
except KeyboardInterrupt:
interrupt_trigger[0] = False
finally:
@@ -494,16 +494,14 @@ class FragmentFD(FileDownloader):
self.report_error('Giving up after %s fragment retries' % fragment_retries)
def append_fragment(frag_content, frag_index, ctx):
- if not frag_content:
- if not is_fatal(frag_index - 1):
- self.report_skip_fragment(frag_index, 'fragment not found')
- return True
- else:
- ctx['dest_stream'].close()
- self.report_error(
- 'fragment %s not found, unable to continue' % frag_index)
- return False
- self._append_fragment(ctx, pack_func(frag_content, frag_index))
+ if frag_content:
+ self._append_fragment(ctx, pack_func(frag_content, frag_index))
+ elif not is_fatal(frag_index - 1):
+ self.report_skip_fragment(frag_index, 'fragment not found')
+ else:
+ ctx['dest_stream'].close()
+ self.report_error(f'fragment {frag_index} not found, unable to continue')
+ return False
return True
decrypt_fragment = self.decrypter(info_dict)
diff --git a/yt_dlp/downloader/http.py b/yt_dlp/downloader/http.py
index 8e096b76b..591a9b08d 100644
--- a/yt_dlp/downloader/http.py
+++ b/yt_dlp/downloader/http.py
@@ -7,7 +7,6 @@ import random
from .common import FileDownloader
from ..compat import (
- compat_str,
compat_urllib_error,
compat_http_client
)
@@ -18,7 +17,7 @@ from ..utils import (
parse_http_range,
sanitized_Request,
ThrottledDownload,
- try_get,
+ try_call,
write_xattr,
XAttrMetadataError,
XAttrUnavailableError,
@@ -58,8 +57,6 @@ class HttpFD(FileDownloader):
ctx.resume_len = 0
ctx.block_size = self.params.get('buffersize', 1024)
ctx.start_time = time.time()
- ctx.chunk_size = None
- throttle_start = None
# parse given Range
req_start, req_end, _ = parse_http_range(headers.get('Range'))
@@ -85,12 +82,6 @@ class HttpFD(FileDownloader):
class NextFragment(Exception):
pass
- def set_range(req, start, end):
- range_header = 'bytes=%d-' % start
- if end:
- range_header += compat_str(end)
- req.add_header('Range', range_header)
-
def establish_connection():
ctx.chunk_size = (random.randint(int(chunk_size * 0.95), chunk_size)
if not is_test and chunk_size else chunk_size)
@@ -120,18 +111,18 @@ class HttpFD(FileDownloader):
else:
range_end = None
- if try_get(None, lambda _: range_start > range_end):
+ if try_call(lambda: range_start > range_end):
ctx.resume_len = 0
ctx.open_mode = 'wb'
raise RetryDownload(Exception(f'Conflicting range. (start={range_start} > end={range_end})'))
- if try_get(None, lambda _: range_end >= ctx.content_len):
+ if try_call(lambda: range_end >= ctx.content_len):
range_end = ctx.content_len - 1
request = sanitized_Request(url, request_data, headers)
has_range = range_start is not None
if has_range:
- set_range(request, range_start, range_end)
+ request.add_header('Range', f'bytes={int(range_start)}-{int_or_none(range_end) or ""}')
# Establish connection
try:
ctx.data = self.ydl.urlopen(request)
@@ -214,7 +205,6 @@ class HttpFD(FileDownloader):
raise RetryDownload(err)
def download():
- nonlocal throttle_start
data_len = ctx.data.info().get('Content-length', None)
# Range HTTP header may be ignored/unsupported by a webserver
@@ -329,14 +319,14 @@ class HttpFD(FileDownloader):
if speed and speed < (self.params.get('throttledratelimit') or 0):
# The speed must stay below the limit for 3 seconds
# This prevents raising error when the speed temporarily goes down
- if throttle_start is None:
- throttle_start = now
- elif now - throttle_start > 3:
+ if ctx.throttle_start is None:
+ ctx.throttle_start = now
+ elif now - ctx.throttle_start > 3:
if ctx.stream is not None and ctx.tmpfilename != '-':
ctx.stream.close()
raise ThrottledDownload()
elif speed:
- throttle_start = None
+ ctx.throttle_start = None
if not is_test and ctx.chunk_size and ctx.content_len is not None and byte_counter < ctx.content_len:
ctx.resume_len = byte_counter
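try_call(lambda: ...) replaces the old try_get(None, lambda _: ...) idiom: the callable runs once and the usual missing-data exceptions are swallowed, so comparing against a None bound yields a falsy None instead of raising. A reduced stand-in (the utils version also handles expected_type and call arguments):

def try_call(*funcs):
    for f in funcs:
        try:
            return f()
        except (AttributeError, KeyError, TypeError, IndexError):
            pass

range_start, range_end = None, 499
print(try_call(lambda: range_start > range_end))  # None: the comparison raised TypeError
range_start = 600
print(try_call(lambda: range_start > range_end))  # True -> conflicting range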
diff --git a/yt_dlp/extractor/adobeconnect.py b/yt_dlp/extractor/adobeconnect.py
index e688dddcb..e2e6f93f3 100644
--- a/yt_dlp/extractor/adobeconnect.py
+++ b/yt_dlp/extractor/adobeconnect.py
@@ -14,7 +14,7 @@ class AdobeConnectIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- title = self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title')
+ title = self._html_extract_title(webpage)
qs = compat_parse_qs(self._search_regex(r"swfUrl\s*=\s*'([^']+)'", webpage, 'swf url').split('?')[1])
is_live = qs.get('isLive', ['false'])[0] == 'true'
formats = []
diff --git a/yt_dlp/extractor/allocine.py b/yt_dlp/extractor/allocine.py
index cd533acfc..403a277e9 100644
--- a/yt_dlp/extractor/allocine.py
+++ b/yt_dlp/extractor/allocine.py
@@ -7,6 +7,7 @@ from ..utils import (
int_or_none,
qualities,
remove_end,
+ strip_or_none,
try_get,
unified_timestamp,
url_basename,
@@ -102,10 +103,7 @@ class AllocineIE(InfoExtractor):
video_id = display_id
media_data = self._download_json(
'http://www.allocine.fr/ws/AcVisiondataV5.ashx?media=%s' % video_id, display_id)
- title = remove_end(
- self._html_search_regex(
- r'(?s)<title>(.+?)</title>', webpage, 'title').strip(),
- ' - AlloCiné')
+        title = remove_end(strip_or_none(self._html_extract_title(webpage)), ' - AlloCiné')
for key, value in media_data['video'].items():
if not key.endswith('Path'):
continue
diff --git a/yt_dlp/extractor/archiveorg.py b/yt_dlp/extractor/archiveorg.py
index 2a25c0713..2ab3c1beb 100644
--- a/yt_dlp/extractor/archiveorg.py
+++ b/yt_dlp/extractor/archiveorg.py
@@ -457,7 +457,7 @@ class YoutubeWebArchiveIE(InfoExtractor):
_OLDEST_CAPTURE_DATE = 20050214000000
_NEWEST_CAPTURE_DATE = 20500101000000
- def _call_cdx_api(self, item_id, url, filters: list = None, collapse: list = None, query: dict = None, note='Downloading CDX API JSON'):
+ def _call_cdx_api(self, item_id, url, filters: list = None, collapse: list = None, query: dict = None, note=None, fatal=False):
# CDX docs: https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md
query = {
'url': url,
@@ -468,7 +468,9 @@ class YoutubeWebArchiveIE(InfoExtractor):
'collapse': collapse or [],
**(query or {})
}
- res = self._download_json('https://web.archive.org/cdx/search/cdx', item_id, note, query=query)
+ res = self._download_json(
+ 'https://web.archive.org/cdx/search/cdx', item_id,
+ note or 'Downloading CDX API JSON', query=query, fatal=fatal)
if isinstance(res, list) and len(res) >= 2:
# format response to make it easier to use
return list(dict(zip(res[0], v)) for v in res[1:])
@@ -481,8 +483,7 @@ class YoutubeWebArchiveIE(InfoExtractor):
regex), webpage, name, default='{}'), video_id, fatal=False)
def _extract_webpage_title(self, webpage):
- page_title = self._html_search_regex(
- r'<title>([^<]*)</title>', webpage, 'title', default='')
+ page_title = self._html_extract_title(webpage, default='')
# YouTube video pages appear to always have either 'YouTube -' as prefix or '- YouTube' as suffix.
return self._html_search_regex(
r'(?:YouTube\s*-\s*(.*)$)|(?:(.*)\s*-\s*YouTube$)',
diff --git a/yt_dlp/extractor/asiancrush.py b/yt_dlp/extractor/asiancrush.py
index 75a632958..7f1940fca 100644
--- a/yt_dlp/extractor/asiancrush.py
+++ b/yt_dlp/extractor/asiancrush.py
@@ -181,8 +181,7 @@ class AsianCrushPlaylistIE(AsianCrushBaseIE):
'title', default=None) or self._og_search_title(
webpage, default=None) or self._html_search_meta(
'twitter:title', webpage, 'title',
- default=None) or self._search_regex(
- r'<title>([^<]+)</title>', webpage, 'title', fatal=False)
+ default=None) or self._html_extract_title(webpage)
if title:
title = re.sub(r'\s*\|\s*.+?$', '', title)
diff --git a/yt_dlp/extractor/azmedien.py b/yt_dlp/extractor/azmedien.py
index b3cabbf94..0168340b9 100644
--- a/yt_dlp/extractor/azmedien.py
+++ b/yt_dlp/extractor/azmedien.py
@@ -11,7 +11,7 @@ class AZMedienIE(InfoExtractor):
IE_DESC = 'AZ Medien videos'
_VALID_URL = r'''(?x)
https?://
- (?:www\.)?
+ (?:www\.|tv\.)?
(?P<host>
telezueri\.ch|
telebaern\.tv|
@@ -31,7 +31,7 @@ class AZMedienIE(InfoExtractor):
'''
_TESTS = [{
- 'url': 'https://www.telezueri.ch/sonntalk/bundesrats-vakanzen-eu-rahmenabkommen-133214569',
+ 'url': 'https://tv.telezueri.ch/sonntalk/bundesrats-vakanzen-eu-rahmenabkommen-133214569',
'info_dict': {
'id': '1_anruz3wy',
'ext': 'mp4',
@@ -39,6 +39,9 @@ class AZMedienIE(InfoExtractor):
'uploader_id': 'TVOnline',
'upload_date': '20180930',
'timestamp': 1538328802,
+ 'view_count': int,
+ 'thumbnail': 'http://cfvod.kaltura.com/p/1719221/sp/171922100/thumbnail/entry_id/1_anruz3wy/version/100031',
+ 'duration': 1930
},
'params': {
'skip_download': True,
diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py
index 823155730..29ad7ded7 100644
--- a/yt_dlp/extractor/bbc.py
+++ b/yt_dlp/extractor/bbc.py
@@ -906,9 +906,8 @@ class BBCIE(BBCCoUkIE):
playlist_title = json_ld_info.get('title')
if not playlist_title:
- playlist_title = self._og_search_title(
- webpage, default=None) or self._html_search_regex(
- r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
+ playlist_title = (self._og_search_title(webpage, default=None)
+ or self._html_extract_title(webpage, 'playlist title', default=None))
if playlist_title:
playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()
diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py
index dd1ff512e..3212f3328 100644
--- a/yt_dlp/extractor/bilibili.py
+++ b/yt_dlp/extractor/bilibili.py
@@ -926,9 +926,9 @@ class BiliIntlIE(BiliIntlBaseIE):
if season_id and not video_data:
# Non-Bstation layout, read through episode list
season_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={season_id}&platform=web', video_id)
- video_data = next(
- episode for episode in traverse_obj(season_json, ('sections', ..., 'episodes', ...), expected_type=dict)
- if str(episode.get('episode_id')) == ep_id)
+ video_data = traverse_obj(season_json,
+ ('sections', ..., 'episodes', lambda _, v: str(v['episode_id']) == ep_id),
+ expected_type=dict, get_all=False)
return self._extract_video_info(video_data, ep_id=ep_id, aid=aid)
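A callable traverse_obj key does the filtering that the removed generator expression did, and get_all=False returns the first hit rather than a list. A plain-Python equivalent of the lookup, much simplified:

def first_matching_episode(season_json, ep_id):
    for section in season_json.get('sections') or []:
        for episode in section.get('episodes') or []:
            if str(episode.get('episode_id')) == ep_id:
                return episode  # get_all=False: stop at the first hit

season_json = {'sections': [{'episodes': [{'episode_id': 7, 'title': 'E7'}]}]}
print(first_matching_episode(season_json, '7'))  # {'episode_id': 7, 'title': 'E7'}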
diff --git a/yt_dlp/extractor/br.py b/yt_dlp/extractor/br.py
index 7169eceb6..0155827d8 100644
--- a/yt_dlp/extractor/br.py
+++ b/yt_dlp/extractor/br.py
@@ -175,7 +175,7 @@ class BRIE(InfoExtractor):
class BRMediathekIE(InfoExtractor):
IE_DESC = 'Bayerischer Rundfunk Mediathek'
- _VALID_URL = r'https?://(?:www\.)?br\.de/mediathek/video/[^/?&#]*?-(?P<id>av:[0-9a-f]{24})'
+ _VALID_URL = r'https?://(?:www\.)?br\.de/mediathek//?video/(?:[^/?&#]+?-)?(?P<id>av:[0-9a-f]{24})'
_TESTS = [{
'url': 'https://www.br.de/mediathek/video/gesundheit-die-sendung-vom-28112017-av:5a1e6a6e8fce6d001871cc8e',
@@ -188,6 +188,9 @@ class BRMediathekIE(InfoExtractor):
'timestamp': 1511942766,
'upload_date': '20171129',
}
+ }, {
+ 'url': 'https://www.br.de/mediathek//video/av:61b0db581aed360007558c12',
+ 'only_matching': True,
}]
def _real_extract(self, url):
diff --git a/yt_dlp/extractor/breitbart.py b/yt_dlp/extractor/breitbart.py
index f50f719dc..e029aa627 100644
--- a/yt_dlp/extractor/breitbart.py
+++ b/yt_dlp/extractor/breitbart.py
@@ -29,9 +29,8 @@ class BreitBartIE(InfoExtractor):
self._sort_formats(formats)
return {
'id': video_id,
- 'title': self._og_search_title(
- webpage, default=None) or self._html_search_regex(
- r'(?s)<title>(.*?)</title>', webpage, 'video title'),
+ 'title': (self._og_search_title(webpage, default=None)
+ or self._html_extract_title(webpage, 'video title')),
'description': self._og_search_description(webpage),
'thumbnail': self._og_search_thumbnail(webpage),
'age_limit': self._rta_search(webpage),
diff --git a/yt_dlp/extractor/callin.py b/yt_dlp/extractor/callin.py
index acf327ace..1f3b7cfff 100644
--- a/yt_dlp/extractor/callin.py
+++ b/yt_dlp/extractor/callin.py
@@ -54,7 +54,7 @@ class CallinIE(InfoExtractor):
id = episode['id']
title = (episode.get('title')
or self._og_search_title(webpage, fatal=False)
- or self._html_search_regex('<title>(.*?)</title>', webpage, 'title'))
+ or self._html_extract_title(webpage))
url = episode['m3u8']
formats = self._extract_m3u8_formats(url, display_id, ext='ts')
self._sort_formats(formats)
diff --git a/yt_dlp/extractor/canvas.py b/yt_dlp/extractor/canvas.py
index 31e7d7de6..8b9903774 100644
--- a/yt_dlp/extractor/canvas.py
+++ b/yt_dlp/extractor/canvas.py
@@ -245,10 +245,6 @@ class VrtNUIE(GigyaBaseIE):
'upload_date': '20200727',
},
'skip': 'This video is only available for registered users',
- 'params': {
- 'username': '<snip>',
- 'password': '<snip>',
- },
'expected_warnings': ['is not a supported codec'],
}, {
# Only available via new API endpoint
@@ -264,10 +260,6 @@ class VrtNUIE(GigyaBaseIE):
'episode_number': 5,
},
'skip': 'This video is only available for registered users',
- 'params': {
- 'username': '<snip>',
- 'password': '<snip>',
- },
'expected_warnings': ['Unable to download asset JSON', 'is not a supported codec', 'Unknown MIME type'],
}]
_NETRC_MACHINE = 'vrtnu'
diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py
index ac1272f7b..fba8bf965 100644
--- a/yt_dlp/extractor/cbc.py
+++ b/yt_dlp/extractor/cbc.py
@@ -127,9 +127,9 @@ class CBCIE(InfoExtractor):
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
- title = self._og_search_title(webpage, default=None) or self._html_search_meta(
- 'twitter:title', webpage, 'title', default=None) or self._html_search_regex(
- r'<title>([^<]+)</title>', webpage, 'title', fatal=False)
+ title = (self._og_search_title(webpage, default=None)
+ or self._html_search_meta('twitter:title', webpage, 'title', default=None)
+ or self._html_extract_title(webpage))
entries = [
self._extract_player_init(player_init, display_id)
for player_init in re.findall(r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)]
diff --git a/yt_dlp/extractor/closertotruth.py b/yt_dlp/extractor/closertotruth.py
index 26243d52d..517e121e0 100644
--- a/yt_dlp/extractor/closertotruth.py
+++ b/yt_dlp/extractor/closertotruth.py
@@ -54,8 +54,7 @@ class CloserToTruthIE(InfoExtractor):
r'<script[^>]+src=["\'].*?\b(?:partner_id|p)/(\d+)',
webpage, 'kaltura partner_id')
- title = self._search_regex(
- r'<title>(.+?)\s*\|\s*.+?</title>', webpage, 'video title')
+ title = self._html_extract_title(webpage, 'video title')
select = self._search_regex(
r'(?s)<select[^>]+id="select-version"[^>]*>(.+?)</select>',
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index d0e57da23..e2605c1f4 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -139,6 +139,8 @@ class InfoExtractor(object):
for HDS - URL of the F4M manifest,
for DASH - URL of the MPD manifest,
for MSS - URL of the ISM manifest.
+ * manifest_stream_number (For internal use only)
+ The index of the stream in the manifest file
* ext Will be calculated from URL if missing
* format A human-readable description of the format
("mp4 container with h264/opus").
@@ -215,7 +217,7 @@ class InfoExtractor(object):
(HTTP or RTMP) download. Boolean.
* has_drm The format has DRM and cannot be downloaded. Boolean
* downloader_options A dictionary of downloader options as
- described in FileDownloader
+ described in FileDownloader (For internal use only)
RTMP formats can also have the additional fields: page_url,
app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
rtmp_protocol, rtmp_real_time
@@ -1297,8 +1299,8 @@ class InfoExtractor(object):
@staticmethod
def _og_regexes(prop):
content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
- property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
- % {'prop': re.escape(prop)})
+ property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
+ % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
template = r'<meta[^>]+?%s[^>]+?%s'
return [
template % (property_re, content_re),
@@ -1329,9 +1331,8 @@ class InfoExtractor(object):
def _og_search_description(self, html, **kargs):
return self._og_search_property('description', html, fatal=False, **kargs)
- def _og_search_title(self, html, **kargs):
- kargs.setdefault('fatal', False)
- return self._og_search_property('title', html, **kargs)
+ def _og_search_title(self, html, *, fatal=False, **kargs):
+ return self._og_search_property('title', html, fatal=fatal, **kargs)
def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
regexes = self._og_regexes('video') + self._og_regexes('video:url')
@@ -1342,9 +1343,8 @@ class InfoExtractor(object):
def _og_search_url(self, html, **kargs):
return self._og_search_property('url', html, **kargs)
- def _html_extract_title(self, html, name, **kwargs):
- return self._html_search_regex(
- r'(?s)<title>(.*?)</title>', html, name, **kwargs)
+ def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
+ return self._html_search_regex(r'(?s)<title>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
name = variadic(name)
@@ -3686,9 +3686,9 @@ class InfoExtractor(object):
def _merge_subtitle_items(subtitle_list1, subtitle_list2):
""" Merge subtitle items for one language. Items with duplicated URLs/data
will be dropped. """
- list1_data = set([item.get('url') or item['data'] for item in subtitle_list1])
+ list1_data = set((item.get('url'), item.get('data')) for item in subtitle_list1)
ret = list(subtitle_list1)
- ret.extend([item for item in subtitle_list2 if (item.get('url') or item['data']) not in list1_data])
+ ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
return ret
@classmethod
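With the widened separator, the og regexes also match the HTML-escaped form og&#x3A;title that some sites emit (e.g. elonet.finna.fi, whose hand-rolled patterns are dropped further down). A cut-down check of the resulting pattern:

import re

sep = '(?:&#x3A;|[:-])'
og_title = rf'<meta[^>]+?(?:name|property)=["\']og{sep}title["\'][^>]+?content=["\']([^"\']+)'

html = '<meta property="og&#x3A;title" content="Valkoinen peura" />'
print(re.search(og_title, html).group(1))  # Valkoinen peura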
diff --git a/yt_dlp/extractor/craftsy.py b/yt_dlp/extractor/craftsy.py
new file mode 100644
index 000000000..ed2f4420e
--- /dev/null
+++ b/yt_dlp/extractor/craftsy.py
@@ -0,0 +1,71 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .brightcove import BrightcoveNewIE
+from .common import InfoExtractor
+
+from ..utils import (
+ dict_get,
+ get_element_by_id,
+ js_to_json,
+ traverse_obj,
+)
+
+
+class CraftsyIE(InfoExtractor):
+    _VALID_URL = r'https?://www\.craftsy\.com/class/(?P<id>[a-z0-9_-]+)/'
+ _TESTS = [{
+ 'url': 'https://www.craftsy.com/class/the-midnight-quilt-show-season-5/',
+ 'info_dict': {
+ 'id': 'the-midnight-quilt-show-season-5',
+ 'title': 'The Midnight Quilt Show Season 5',
+ 'description': 'md5:113eda818e985d1a566625fb2f833b7a',
+ },
+ 'playlist_count': 10,
+ }, {
+ 'url': 'https://www.craftsy.com/class/sew-your-own-designer-handbag/',
+ 'info_dict': {
+ 'id': 'sew-your-own-designer-handbag',
+ 'title': 'Sew Your Own Designer Handbag',
+ 'description': 'md5:8270d0ef5427d3c895a27351aeaac276',
+ },
+ 'playlist_mincount': 1,
+ }, {
+ 'url': 'https://www.craftsy.com/class/all-access-estes-park-wool-market/',
+ 'info_dict': {
+ 'id': 'all-access-estes-park-wool-market',
+ 'title': 'All Access: Estes Park Wool Market',
+ 'description': 'md5:aded1bd8d38ae2fae4dae936c0ae01e7',
+ },
+ 'playlist_count': 6,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ video_data = self._parse_json(self._search_regex(
+ r'class_video_player_vars\s*=\s*({.*})\s*;',
+ get_element_by_id('vidstore-classes_class-video-player-js-extra', webpage),
+ 'video data'), video_id, transform_source=js_to_json)
+
+ account_id = traverse_obj(video_data, ('video_player', 'bc_account_id'))
+
+ entries = []
+ class_preview = traverse_obj(video_data, ('video_player', 'class_preview'))
+ if class_preview:
+ v_id = class_preview.get('video_id')
+ entries.append(self.url_result(
+ f'http://players.brightcove.net/{account_id}/default_default/index.html?videoId={v_id}',
+ BrightcoveNewIE, v_id, class_preview.get('title')))
+
+ if dict_get(video_data, ('is_free', 'user_has_access')):
+ entries += [
+ self.url_result(
+ f'http://players.brightcove.net/{account_id}/default_default/index.html?videoId={lesson["video_id"]}',
+ BrightcoveNewIE, lesson['video_id'], lesson.get('title'))
+ for lesson in video_data['lessons']]
+
+ return self.playlist_result(
+ entries, video_id, video_data.get('class_title'),
+ self._html_search_meta(('og:description', 'description'), webpage, default=None))
diff --git a/yt_dlp/extractor/crunchyroll.py b/yt_dlp/extractor/crunchyroll.py
index bf1bf8c1c..7edb645f8 100644
--- a/yt_dlp/extractor/crunchyroll.py
+++ b/yt_dlp/extractor/crunchyroll.py
@@ -9,7 +9,7 @@ import zlib
from hashlib import sha1
from math import pow, sqrt, floor
from .common import InfoExtractor
-from .vrv import VRVIE
+from .vrv import VRVBaseIE
from ..compat import (
compat_b64decode,
compat_etree_Element,
@@ -86,6 +86,22 @@ class CrunchyrollBaseIE(InfoExtractor):
if not self._get_cookies(self._LOGIN_URL).get('etp_rt'):
raise ExtractorError('Login succeeded but did not set etp_rt cookie')
+ # Beta-specific, but needed for redirects
+ def _get_beta_embedded_json(self, webpage, display_id):
+ initial_state = self._parse_json(self._search_regex(
+ r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'initial state'), display_id)
+ app_config = self._parse_json(self._search_regex(
+ r'__APP_CONFIG__\s*=\s*({.+?})\s*;', webpage, 'app config'), display_id)
+ return initial_state, app_config
+
+ def _redirect_to_beta(self, webpage, iekey, video_id):
+ if not self._get_cookies(self._LOGIN_URL).get('etp_rt'):
+            raise ExtractorError('Received a beta page from a non-beta URL when not logged in.')
+ initial_state, app_config = self._get_beta_embedded_json(webpage, video_id)
+ url = app_config['baseSiteUrl'] + initial_state['router']['locations']['current']['pathname']
+ self.to_screen(f'{video_id}: Redirected to beta site - {url}')
+ return self.url_result(f'{url}', iekey, video_id)
+
@staticmethod
def _add_skip_wall(url):
parsed_url = compat_urlparse.urlparse(url)
@@ -100,7 +116,7 @@ class CrunchyrollBaseIE(InfoExtractor):
parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
-class CrunchyrollIE(CrunchyrollBaseIE, VRVIE):
+class CrunchyrollIE(CrunchyrollBaseIE, VRVBaseIE):
IE_NAME = 'crunchyroll'
_VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|(?:[^/]*/){1,2}[^/?&]*?)(?P<id>[0-9]+))(?:[/?&]|$)'
_TESTS = [{
@@ -406,6 +422,8 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
webpage = self._download_webpage(
self._add_skip_wall(webpage_url), video_id,
headers=self.geo_verification_headers())
+ if re.search(r'<div id="preload-data">', webpage):
+ return self._redirect_to_beta(webpage, CrunchyrollBetaIE.ie_key(), video_id)
note_m = self._html_search_regex(
r'<div class="showmedia-trailer-notice">(.+?)</div>',
webpage, 'trailer-notice', default='')
@@ -670,6 +688,8 @@ class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE):
# https:// gives a 403, but http:// does not
self._add_skip_wall(url).replace('https://', 'http://'), show_id,
headers=self.geo_verification_headers())
+ if re.search(r'<div id="preload-data">', webpage):
+ return self._redirect_to_beta(webpage, CrunchyrollBetaShowIE.ie_key(), show_id)
title = self._html_search_meta('name', webpage, default=None)
episode_re = r'<li id="showview_videos_media_(\d+)"[^>]+>.*?<a href="([^"]+)"'
@@ -692,9 +712,56 @@ class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE):
}
-class CrunchyrollBetaIE(CrunchyrollBaseIE):
+class CrunchyrollBetaBaseIE(CrunchyrollBaseIE):
+ params = None
+
+ def _get_params(self, lang):
+ if not CrunchyrollBetaBaseIE.params:
+ initial_state, app_config = self._get_beta_embedded_json(self._download_webpage(
+ f'https://beta.crunchyroll.com/{lang}', None, note='Retrieving main page'), None)
+ api_domain = app_config['cxApiParams']['apiDomain']
+ basic_token = str(base64.b64encode(('%s:' % app_config['cxApiParams']['accountAuthClientId']).encode('ascii')), 'ascii')
+ auth_response = self._download_json(
+ f'{api_domain}/auth/v1/token', None, note='Authenticating with cookie',
+ headers={
+ 'Authorization': 'Basic ' + basic_token
+ }, data='grant_type=etp_rt_cookie'.encode('ascii'))
+ policy_response = self._download_json(
+ f'{api_domain}/index/v2', None, note='Retrieving signed policy',
+ headers={
+ 'Authorization': auth_response['token_type'] + ' ' + auth_response['access_token']
+ })
+ bucket = policy_response['cms']['bucket']
+ params = {
+ 'Policy': policy_response['cms']['policy'],
+ 'Signature': policy_response['cms']['signature'],
+ 'Key-Pair-Id': policy_response['cms']['key_pair_id']
+ }
+ locale = traverse_obj(initial_state, ('localization', 'locale'))
+ if locale:
+ params['locale'] = locale
+ CrunchyrollBetaBaseIE.params = (api_domain, bucket, params)
+ return CrunchyrollBetaBaseIE.params
+
+ def _redirect_from_beta(self, url, lang, internal_id, display_id, is_episode, iekey):
+ initial_state, app_config = self._get_beta_embedded_json(self._download_webpage(url, display_id), display_id)
+ content_data = initial_state['content']['byId'][internal_id]
+ if is_episode:
+ video_id = content_data['external_id'].split('.')[1]
+ series_id = content_data['episode_metadata']['series_slug_title']
+ else:
+ series_id = content_data['slug_title']
+ series_id = re.sub(r'-{2,}', '-', series_id)
+ url = f'https://www.crunchyroll.com/{lang}{series_id}'
+ if is_episode:
+ url = url + f'/{display_id}-{video_id}'
+ self.to_screen(f'{display_id}: Not logged in. Redirecting to non-beta site - {url}')
+ return self.url_result(url, iekey, display_id)
+
+
+class CrunchyrollBetaIE(CrunchyrollBetaBaseIE):
IE_NAME = 'crunchyroll:beta'
- _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)watch/(?P<internal_id>\w+)/(?P<id>[\w\-]+)/?(?:\?|$)'
+ _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)watch/(?P<id>\w+)/(?P<display_id>[\w\-]*)/?(?:\?|$)'
_TESTS = [{
'url': 'https://beta.crunchyroll.com/watch/GY2P1Q98Y/to-the-future',
'info_dict': {
@@ -705,51 +772,49 @@ class CrunchyrollBetaIE(CrunchyrollBaseIE):
'uploader': 'Toei Animation',
'title': 'World Trigger Episode 73 – To the Future',
'upload_date': '20160402',
+ 'episode_number': 73,
+ 'series': 'World Trigger',
+ 'average_rating': 4.9,
+ 'episode': 'To the Future',
+ 'season': 'World Trigger',
+ 'thumbnail': 'https://img1.ak.crunchyroll.com/i/spire3-tmb/c870dedca1a83137c2d3d144984155ed1459527119_main.jpg',
+ 'season_number': 1,
+ },
+ 'params': {'skip_download': 'm3u8'},
+ 'expected_warnings': ['Unable to download XML']
+ }, {
+ 'url': 'https://beta.crunchyroll.com/watch/GYK53DMPR/wicked-lord-shingan-reborn',
+ 'info_dict': {
+ 'id': '648781',
+ 'ext': 'mp4',
+ 'episode_number': 1,
+ 'timestamp': 1389173400,
+ 'series': 'Love, Chunibyo & Other Delusions - Heart Throb -',
+ 'description': 'md5:5579d1a0355cc618558ba23d27067a62',
+ 'uploader': 'TBS',
+ 'episode': 'Wicked Lord Shingan... Reborn',
+ 'average_rating': 4.9,
+ 'season': 'Love, Chunibyo & Other Delusions - Heart Throb -',
+ 'thumbnail': 'https://img1.ak.crunchyroll.com/i/spire3-tmb/2ba0384e225a5370d5f0ee9496d91ea51389046521_main.jpg',
+ 'title': 'Love, Chunibyo & Other Delusions - Heart Throb - Episode 1 – Wicked Lord Shingan... Reborn',
+ 'season_number': 2,
+ 'upload_date': '20140108',
},
'params': {'skip_download': 'm3u8'},
'expected_warnings': ['Unable to download XML']
+ }, {
+ 'url': 'https://beta.crunchyroll.com/watch/GY2P1Q98Y/',
+ 'only_matching': True,
}]
def _real_extract(self, url):
- lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'internal_id', 'id')
- webpage = self._download_webpage(url, display_id)
- initial_state = self._parse_json(
- self._search_regex(r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'initial state'),
- display_id)
- episode_data = initial_state['content']['byId'][internal_id]
+ lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id')
+
if not self._get_cookies(url).get('etp_rt'):
- video_id = episode_data['external_id'].split('.')[1]
- series_id = episode_data['episode_metadata']['series_slug_title']
- return self.url_result(f'https://www.crunchyroll.com/{lang}{series_id}/{display_id}-{video_id}',
- CrunchyrollIE.ie_key(), video_id)
-
- app_config = self._parse_json(
- self._search_regex(r'__APP_CONFIG__\s*=\s*({.+?})\s*;', webpage, 'app config'),
- display_id)
- client_id = app_config['cxApiParams']['accountAuthClientId']
- api_domain = app_config['cxApiParams']['apiDomain']
- basic_token = str(base64.b64encode(('%s:' % client_id).encode('ascii')), 'ascii')
- auth_response = self._download_json(
- f'{api_domain}/auth/v1/token', display_id,
- note='Authenticating with cookie',
- headers={
- 'Authorization': 'Basic ' + basic_token
- }, data='grant_type=etp_rt_cookie'.encode('ascii'))
- policy_response = self._download_json(
- f'{api_domain}/index/v2', display_id,
- note='Retrieving signed policy',
- headers={
- 'Authorization': auth_response['token_type'] + ' ' + auth_response['access_token']
- })
- bucket = policy_response['cms']['bucket']
- params = {
- 'Policy': policy_response['cms']['policy'],
- 'Signature': policy_response['cms']['signature'],
- 'Key-Pair-Id': policy_response['cms']['key_pair_id']
- }
- locale = traverse_obj(initial_state, ('localization', 'locale'))
- if locale:
- params['locale'] = locale
+ return self._redirect_from_beta(url, lang, internal_id, display_id, True, CrunchyrollIE.ie_key())
+
+ api_domain, bucket, params = self._get_params(lang)
+
episode_response = self._download_json(
f'{api_domain}/cms/v2{bucket}/episodes/{internal_id}', display_id,
note='Retrieving episode metadata',
@@ -827,9 +892,9 @@ class CrunchyrollBetaIE(CrunchyrollBaseIE):
}
-class CrunchyrollBetaShowIE(CrunchyrollBaseIE):
+class CrunchyrollBetaShowIE(CrunchyrollBetaBaseIE):
IE_NAME = 'crunchyroll:playlist:beta'
- _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)series/\w+/(?P<id>[\w\-]+)/?(?:\?|$)'
+ _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)series/(?P<id>\w+)/(?P<display_id>[\w\-]*)/?(?:\?|$)'
_TESTS = [{
'url': 'https://beta.crunchyroll.com/series/GY19NQ2QR/Girl-Friend-BETA',
'info_dict': {
@@ -838,11 +903,56 @@ class CrunchyrollBetaShowIE(CrunchyrollBaseIE):
},
'playlist_mincount': 10,
}, {
+ 'url': 'https://beta.crunchyroll.com/series/GYJQV73V6/love-chunibyo--other-delusions---heart-throb--',
+ 'info_dict': {
+ 'id': 'love-chunibyo-other-delusions-heart-throb-',
+ 'title': 'Love, Chunibyo & Other Delusions - Heart Throb -',
+ },
+ 'playlist_mincount': 10,
+ }, {
'url': 'https://beta.crunchyroll.com/it/series/GY19NQ2QR/Girl-Friend-BETA',
'only_matching': True,
}]
def _real_extract(self, url):
- lang, series_id = self._match_valid_url(url).group('lang', 'id')
- return self.url_result(f'https://www.crunchyroll.com/{lang}{series_id.lower()}',
- CrunchyrollShowPlaylistIE.ie_key(), series_id)
+ lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id')
+
+ if not self._get_cookies(url).get('etp_rt'):
+ return self._redirect_from_beta(url, lang, internal_id, display_id, False, CrunchyrollShowPlaylistIE.ie_key())
+
+ api_domain, bucket, params = self._get_params(lang)
+
+ series_response = self._download_json(
+ f'{api_domain}/cms/v2{bucket}/series/{internal_id}', display_id,
+ note='Retrieving series metadata', query=params)
+
+ seasons_response = self._download_json(
+ f'{api_domain}/cms/v2{bucket}/seasons?series_id={internal_id}', display_id,
+ note='Retrieving season list', query=params)
+
+ def entries():
+ for season in seasons_response['items']:
+ episodes_response = self._download_json(
+ f'{api_domain}/cms/v2{bucket}/episodes?season_id={season["id"]}', display_id,
+ note=f'Retrieving episode list for {season.get("slug_title")}', query=params)
+ for episode in episodes_response['items']:
+ episode_id = episode['id']
+ episode_display_id = episode['slug_title']
+ yield {
+ '_type': 'url',
+ 'url': f'https://beta.crunchyroll.com/{lang}watch/{episode_id}/{episode_display_id}',
+ 'ie_key': CrunchyrollBetaIE.ie_key(),
+ 'id': episode_id,
+ 'title': '%s Episode %s – %s' % (episode.get('season_title'), episode.get('episode'), episode.get('title')),
+ 'description': try_get(episode, lambda x: x['description'].replace(r'\r\n', '\n')),
+ 'duration': float_or_none(episode.get('duration_ms'), 1000),
+ 'series': episode.get('series_title'),
+ 'series_id': episode.get('series_id'),
+ 'season': episode.get('season_title'),
+ 'season_id': episode.get('season_id'),
+ 'season_number': episode.get('season_number'),
+ 'episode': episode.get('title'),
+ 'episode_number': episode.get('sequence_number')
+ }
+
+ return self.playlist_result(entries(), internal_id, series_response.get('title'))
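The handshake that _get_params caches amounts to: exchange the etp_rt cookie for a bearer token, authenticating with the site's public client id as an HTTP Basic username, then fetch the signed CMS policy that every /cms/v2 request carries as query parameters. A sketch of the same flow outside the extractor, assuming a requests.Session that already holds the etp_rt cookie:

import base64
import requests

def get_cms_params(session, api_domain, client_id):
    basic = base64.b64encode(f'{client_id}:'.encode()).decode()
    token = session.post(
        f'{api_domain}/auth/v1/token',
        headers={'Authorization': f'Basic {basic}'},
        data={'grant_type': 'etp_rt_cookie'}).json()
    policy = session.get(
        f'{api_domain}/index/v2',
        headers={'Authorization': f'{token["token_type"]} {token["access_token"]}'}).json()
    cms = policy['cms']
    # the bucket prefixes the /cms/v2 path; the rest ride along as query params
    return cms['bucket'], {
        'Policy': cms['policy'],
        'Signature': cms['signature'],
        'Key-Pair-Id': cms['key_pair_id'],
    }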
diff --git a/yt_dlp/extractor/cspan.py b/yt_dlp/extractor/cspan.py
index d29b58ba6..f51159bbe 100644
--- a/yt_dlp/extractor/cspan.py
+++ b/yt_dlp/extractor/cspan.py
@@ -278,7 +278,7 @@ class CSpanCongressIE(InfoExtractor):
video_id, transform_source=js_to_json)
title = (self._og_search_title(webpage, default=None)
- or self._html_search_regex(r'(?s)<title>(.*?)</title>', webpage, 'video title'))
+ or self._html_extract_title(webpage, 'video title'))
description = (self._og_search_description(webpage, default=None)
or self._html_search_meta('description', webpage, 'description', default=None))
diff --git a/yt_dlp/extractor/cybrary.py b/yt_dlp/extractor/cybrary.py
new file mode 100644
index 000000000..c278f0fe0
--- /dev/null
+++ b/yt_dlp/extractor/cybrary.py
@@ -0,0 +1,146 @@
+# coding: utf-8
+from .common import InfoExtractor
+
+from ..utils import (
+ ExtractorError,
+ smuggle_url,
+ str_or_none,
+ traverse_obj,
+ urlencode_postdata
+)
+
+
+class CybraryBaseIE(InfoExtractor):
+ _API_KEY = 'AIzaSyCX9ru6j70PX2My1Eq6Q1zoMAhuTdXlzSw'
+ _ENDPOINTS = {
+ 'course': 'https://app.cybrary.it/courses/api/catalog/browse/course/{}',
+ 'course_enrollment': 'https://app.cybrary.it/courses/api/catalog/{}/enrollment',
+ 'enrollment': 'https://app.cybrary.it/courses/api/enrollment/{}',
+ 'launch': 'https://app.cybrary.it/courses/api/catalog/{}/launch',
+ 'vimeo_oembed': 'https://vimeo.com/api/oembed.json?url=https://vimeo.com/{}',
+ }
+ _NETRC_MACHINE = 'cybrary'
+ _TOKEN = None
+
+ def _perform_login(self, username, password):
+ CybraryBaseIE._TOKEN = self._download_json(
+ f'https://identitytoolkit.googleapis.com/v1/accounts:signInWithPassword?key={self._API_KEY}',
+ None, data=urlencode_postdata({'email': username, 'password': password, 'returnSecureToken': True}),
+ note='Logging in')['idToken']
+
+ def _real_initialize(self):
+ if not self._TOKEN:
+ self.raise_login_required(method='password')
+
+ def _call_api(self, endpoint, item_id):
+ return self._download_json(
+ self._ENDPOINTS[endpoint].format(item_id), item_id,
+ note=f'Downloading {endpoint} JSON metadata',
+ headers={'Authorization': f'Bearer {self._TOKEN}'})
+
+ def _get_vimeo_id(self, activity_id):
+ launch_api = self._call_api('launch', activity_id)
+
+ if launch_api.get('url'):
+ return self._search_regex(r'https?://player\.vimeo\.com/video/(?P<vimeo_id>[0-9]+)', launch_api['url'], 'vimeo_id')
+ return traverse_obj(launch_api, ('vendor_data', 'content', ..., 'videoId'), get_all=False)
+
+
+class CybraryIE(CybraryBaseIE):
+    _VALID_URL = r'https?://app\.cybrary\.it/immersive/(?P<enrollment>[0-9]+)/activity/(?P<id>[0-9]+)'
+ _TESTS = [{
+ 'url': 'https://app.cybrary.it/immersive/12487950/activity/63102',
+ 'md5': '9ae12d37e555cb2ed554223a71a701d0',
+ 'info_dict': {
+ 'id': '646609770',
+ 'ext': 'mp4',
+ 'title': 'Getting Started',
+ 'thumbnail': 'https://i.vimeocdn.com/video/1301817996-76a268f0c56cff18a5cecbbdc44131eb9dda0c80eb0b3a036_1280',
+ 'series_id': '63111',
+ 'uploader_url': 'https://vimeo.com/user30867300',
+ 'duration': 88,
+ 'uploader_id': 'user30867300',
+ 'series': 'Cybrary Orientation',
+ 'uploader': 'Cybrary',
+ 'chapter': 'Cybrary Orientation Series',
+ 'chapter_id': '63110'
+ },
+ 'expected_warnings': ['No authenticators for vimeo']
+ }, {
+ 'url': 'https://app.cybrary.it/immersive/12747143/activity/52686',
+ 'md5': '62f26547dccc59c44363e2a13d4ad08d',
+ 'info_dict': {
+ 'id': '445638073',
+ 'ext': 'mp4',
+ 'title': 'Azure Virtual Network IP Addressing',
+ 'thumbnail': 'https://i.vimeocdn.com/video/936667051-1647ace66c627d4a2382185e0dae8deb830309bfddd53f8b2367b2f91e92ed0e-d_1280',
+ 'series_id': '52733',
+ 'uploader_url': 'https://vimeo.com/user30867300',
+ 'duration': 426,
+ 'uploader_id': 'user30867300',
+ 'series': 'AZ-500: Microsoft Azure Security Technologies',
+ 'uploader': 'Cybrary',
+ 'chapter': 'Implement Network Security',
+ 'chapter_id': '52693'
+ },
+ 'expected_warnings': ['No authenticators for vimeo']
+ }]
+
+ def _real_extract(self, url):
+ activity_id, enrollment_id = self._match_valid_url(url).group('id', 'enrollment')
+ course = self._call_api('enrollment', enrollment_id)['content']
+ activity = traverse_obj(course, ('learning_modules', ..., 'activities', lambda _, v: int(activity_id) == v['id']), get_all=False)
+
+ if activity.get('type') not in ['Video Activity', 'Lesson Activity']:
+ raise ExtractorError('The activity is not a video', expected=True)
+
+ module = next((m for m in course.get('learning_modules') or []
+                       if int(activity_id) in (traverse_obj(m, ('activities', ..., 'id')) or [])), None)
+
+ vimeo_id = self._get_vimeo_id(activity_id)
+
+ return {
+ '_type': 'url_transparent',
+ 'series': traverse_obj(course, ('content_description', 'title')),
+ 'series_id': str_or_none(traverse_obj(course, ('content_description', 'id'))),
+ 'id': vimeo_id,
+ 'chapter': module.get('title'),
+ 'chapter_id': str_or_none(module.get('id')),
+ 'title': activity.get('title'),
+ 'url': smuggle_url(f'https://player.vimeo.com/video/{vimeo_id}', {'http_headers': {'Referer': 'https://api.cybrary.it'}})
+ }
+
+
+class CybraryCourseIE(CybraryBaseIE):
+    _VALID_URL = r'https?://app\.cybrary\.it/browse/course/(?P<id>[\w-]+)/?(?:$|[#?])'
+ _TESTS = [{
+ 'url': 'https://app.cybrary.it/browse/course/az-500-microsoft-azure-security-technologies',
+ 'info_dict': {
+ 'id': 898,
+ 'title': 'AZ-500: Microsoft Azure Security Technologies',
+ 'description': 'md5:69549d379c0fc1dec92926d4e8b6fbd4'
+ },
+ 'playlist_count': 59
+ }, {
+ 'url': 'https://app.cybrary.it/browse/course/cybrary-orientation',
+ 'info_dict': {
+ 'id': 1245,
+ 'title': 'Cybrary Orientation',
+ 'description': 'md5:9e69ff66b32fe78744e0ad4babe2e88e'
+ },
+ 'playlist_count': 4
+ }]
+
+ def _real_extract(self, url):
+ course_id = self._match_id(url)
+ course = self._call_api('course', course_id)
+ enrollment_info = self._call_api('course_enrollment', course['id'])
+
+ entries = [self.url_result(
+ f'https://app.cybrary.it/immersive/{enrollment_info["id"]}/activity/{activity["id"]}')
+ for activity in traverse_obj(course, ('content_item', 'learning_modules', ..., 'activities', ...))]
+
+ return self.playlist_result(
+ entries,
+ traverse_obj(course, ('content_item', 'id'), expected_type=str_or_none),
+ course.get('title'), course.get('short_description'))
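The login step is a stock Google Identity Toolkit password sign-in; the returned idToken is what _call_api presents as the bearer token. A standalone equivalent of _perform_login (error handling kept minimal):

import requests

def cybrary_login(email, password, api_key):
    resp = requests.post(
        'https://identitytoolkit.googleapis.com/v1/accounts:signInWithPassword',
        params={'key': api_key},
        data={'email': email, 'password': password, 'returnSecureToken': True})
    resp.raise_for_status()
    return resp.json()['idToken']  # sent as 'Authorization: Bearer <idToken>'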
diff --git a/yt_dlp/extractor/dailymotion.py b/yt_dlp/extractor/dailymotion.py
index 95589d53a..9cb56185b 100644
--- a/yt_dlp/extractor/dailymotion.py
+++ b/yt_dlp/extractor/dailymotion.py
@@ -94,10 +94,10 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
_VALID_URL = r'''(?ix)
https?://
(?:
- (?:(?:www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:embed|swf|\#)/)?video|swf)|
+ (?:(?:www|touch|geo)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:(?:embed|swf|\#)/)|player\.html\?)?video|swf)|
(?:www\.)?lequipe\.fr/video
)
- /(?P<id>[^/?_]+)(?:.+?\bplaylist=(?P<playlist_id>x[0-9a-z]+))?
+ [/=](?P<id>[^/?_&]+)(?:.+?\bplaylist=(?P<playlist_id>x[0-9a-z]+))?
'''
IE_NAME = 'dailymotion'
_TESTS = [{
@@ -116,6 +116,25 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
'age_limit': 0,
},
}, {
+ 'url': 'https://geo.dailymotion.com/player.html?video=x89eyek&mute=true',
+ 'md5': 'e2f9717c6604773f963f069ca53a07f8',
+ 'info_dict': {
+ 'id': 'x89eyek',
+ 'ext': 'mp4',
+ 'title': "En quête d'esprit du 27/03/2022",
+ 'description': 'md5:66542b9f4df2eb23f314fc097488e553',
+ 'duration': 2756,
+ 'timestamp': 1648383669,
+ 'upload_date': '20220327',
+ 'uploader': 'CNEWS',
+ 'uploader_id': 'x24vth',
+ 'age_limit': 0,
+ 'view_count': int,
+ 'like_count': int,
+ 'tags': ['en_quete_d_esprit'],
+ 'thumbnail': 'https://s2.dmcdn.net/v/Tncwi1YGKdvFbDuDY/x1080',
+ }
+ }, {
'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames',
'md5': '2137c41a8e78554bb09225b8eb322406',
'info_dict': {
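The widened _VALID_URL is what admits the geo. player URL exercised by the new test. A quick self-check of the pattern, with the host alternation trimmed to the dailymotion branch:

import re

_VALID_URL = r'''(?ix)
    https?://
    (?:(?:www|touch|geo)\.)?dailymotion\.[a-z]{2,3}
    /(?:(?:(?:(?:embed|swf|\#)/)|player\.html\?)?video|swf)
    [/=](?P<id>[^/?_&]+)'''

for url in ('https://geo.dailymotion.com/player.html?video=x89eyek&mute=true',
            'https://www.dailymotion.com/video/x2iuewm_steam-machine'):
    print(re.match(_VALID_URL, url).group('id'))  # x89eyek, then x2iuewm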
diff --git a/yt_dlp/extractor/dropout.py b/yt_dlp/extractor/dropout.py
index a7442d8f0..2fa61950c 100644
--- a/yt_dlp/extractor/dropout.py
+++ b/yt_dlp/extractor/dropout.py
@@ -123,7 +123,7 @@ class DropoutIE(InfoExtractor):
self._login(display_id)
webpage = self._download_webpage(url, display_id, note='Downloading video webpage')
finally:
- self._download_webpage('https://www.dropout.tv/logout', display_id, note='Logging out')
+ self._download_webpage('https://www.dropout.tv/logout', display_id, note='Logging out', fatal=False)
embed_url = self._search_regex(r'embed_url:\s*["\'](.+?)["\']', webpage, 'embed url')
thumbnail = self._og_search_thumbnail(webpage)
@@ -139,7 +139,7 @@ class DropoutIE(InfoExtractor):
'_type': 'url_transparent',
'ie_key': VHXEmbedIE.ie_key(),
'url': embed_url,
- 'id': self._search_regex(r'embed.vhx.tv/videos/(.+?)\?', embed_url, 'id'),
+ 'id': self._search_regex(r'embed\.vhx\.tv/videos/(.+?)\?', embed_url, 'id'),
'display_id': display_id,
'title': title,
'description': self._html_search_meta('description', webpage, fatal=False),
diff --git a/yt_dlp/extractor/elonet.py b/yt_dlp/extractor/elonet.py
index eefba4e24..9c6aea28e 100644
--- a/yt_dlp/extractor/elonet.py
+++ b/yt_dlp/extractor/elonet.py
@@ -1,30 +1,22 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from ..utils import (
- base_url,
- ExtractorError,
- try_get,
-)
-from ..compat import compat_str
+from ..utils import determine_ext
class ElonetIE(InfoExtractor):
_VALID_URL = r'https?://elonet\.finna\.fi/Record/kavi\.elonet_elokuva_(?P<id>[0-9]+)'
_TESTS = [{
- # m3u8 with subtitles
'url': 'https://elonet.finna.fi/Record/kavi.elonet_elokuva_107867',
- 'md5': '8efc954b96c543711707f87de757caea',
'info_dict': {
'id': '107867',
'ext': 'mp4',
'title': 'Valkoinen peura',
- 'description': 'Valkoinen peura (1952) on Erik Blombergin ohjaama ja yhdessä Mirjami Kuosmasen kanssa käsikirjoittama tarunomainen kertomus valkoisen peuran hahmossa lii...',
- 'thumbnail': 'https://elonet.finna.fi/Cover/Show?id=kavi.elonet_elokuva_107867&index=0&size=large',
+ 'thumbnail': r're:^https?://elonet\.finna\.fi/Cover/Show\?id=kavi\.elonet_elokuva_107867.+',
+ 'description': 'md5:bded4201c9677fab10854884fe8f7312',
},
+ 'params': {'skip_download': 'dash'},
}, {
# DASH with subtitles
'url': 'https://elonet.finna.fi/Record/kavi.elonet_elokuva_116539',
@@ -32,58 +24,45 @@ class ElonetIE(InfoExtractor):
'id': '116539',
'ext': 'mp4',
'title': 'Minulla on tiikeri',
- 'description': 'Pienellä pojalla, joka asuu kerrostalossa, on kotieläimenä tiikeri. Se on kuitenkin salaisuus. Kerrostalon räpätäti on Kotilaisen täti, joka on aina vali...',
- 'thumbnail': 'https://elonet.finna.fi/Cover/Show?id=kavi.elonet_elokuva_116539&index=0&size=large&source=Solr',
- }
+ 'thumbnail': r're:^https?://elonet\.finna\.fi/Cover/Show\?id=kavi\.elonet_elokuva_116539.+',
+ 'description': 'md5:5ab72b3fe76d3414e46cc8f277104419',
+ },
+ 'params': {'skip_download': 'dash'},
+ }, {
+ # Page with multiple videos, download the main one
+ 'url': 'https://elonet.finna.fi/Record/kavi.elonet_elokuva_117396',
+ 'info_dict': {
+ 'id': '117396',
+ 'ext': 'mp4',
+ 'title': 'Sampo',
+ 'thumbnail': r're:^https?://elonet\.finna\.fi/Cover/Show\?id=kavi\.elonet_elokuva_117396.+',
+ 'description': 'md5:ec69572a5b054d0ecafe8086b1fa96f7',
+ },
+ 'params': {'skip_download': 'dash'},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- title = self._html_search_regex(
- r'<meta .*property="og&#x3A;title" .*content="(.+?)"', webpage, 'title')
- description = self._html_search_regex(
- r'<meta .*property="og&#x3A;description" .*content="(.+?)"', webpage, 'description')
- thumbnail = self._html_search_regex(
- r'<meta .*property="og&#x3A;image" .*content="(.+?)"', webpage, 'thumbnail')
+ src = self._parse_json(self._html_search_regex(
+ r'id=\'video-data\'[^>]+data-video-sources="([^"]+)"', webpage, 'json'), video_id)[0]['src']
+ ext = determine_ext(src)
- json_s = self._html_search_regex(
- r'data-video-sources="(.+?)"', webpage, 'json')
- src = try_get(
- self._parse_json(json_s, video_id),
- lambda x: x[0]["src"], compat_str)
- formats = []
- subtitles = {}
- if re.search(r'\.m3u8\??', src):
- res = self._download_webpage_handle(
- # elonet servers have certificate problems
- src.replace('https:', 'http:'), video_id,
- note='Downloading m3u8 information',
- errnote='Failed to download m3u8 information')
- if res:
- doc, urlh = res
- url = urlh.geturl()
- formats, subtitles = self._parse_m3u8_formats_and_subtitles(doc, url)
- for f in formats:
- f['ext'] = 'mp4'
- elif re.search(r'\.mpd\??', src):
- res = self._download_xml_handle(
- src, video_id,
- note='Downloading MPD manifest',
- errnote='Failed to download MPD manifest')
- if res:
- doc, urlh = res
- url = base_url(urlh.geturl())
- formats, subtitles = self._parse_mpd_formats_and_subtitles(doc, mpd_base_url=url)
+ if ext == 'm3u8':
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(src, video_id, fatal=False)
+ elif ext == 'mpd':
+ formats, subtitles = self._extract_mpd_formats_and_subtitles(src, video_id, fatal=False)
else:
- raise ExtractorError("Unknown streaming format")
+ formats, subtitles = [], {}
+ self.raise_no_formats(f'Unknown streaming format {ext}')
+ self._sort_formats(formats)
return {
'id': video_id,
- 'title': title,
- 'description': description,
- 'thumbnail': thumbnail,
+ 'title': self._og_search_title(webpage),
+ 'description': self._og_search_description(webpage),
+ 'thumbnail': self._og_search_thumbnail(webpage),
'formats': formats,
'subtitles': subtitles,
}
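The rewrite dispatches on the extension of the source URL instead of regex-probing it. A minimal sketch of that dispatch, with invented URLs; determine_ext ignores the query string when guessing:

    from yt_dlp.utils import determine_ext

    for src in ('https://example.com/master.m3u8?token=x',
                'https://example.com/manifest.mpd',
                'https://example.com/stream.bin'):
        ext = determine_ext(src)
        if ext == 'm3u8':
            print(src, '-> extract HLS formats')
        elif ext == 'mpd':
            print(src, '-> extract DASH formats')
        else:
            print(src, f'-> unknown streaming format {ext}')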
diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py
index e5ae12a7d..457f4c2aa 100644
--- a/yt_dlp/extractor/extractors.py
+++ b/yt_dlp/extractor/extractors.py
@@ -322,6 +322,7 @@ from .cpac import (
from .cozytv import CozyTVIE
from .cracked import CrackedIE
from .crackle import CrackleIE
+from .craftsy import CraftsyIE
from .crooksandliars import CrooksAndLiarsIE
from .crowdbunker import (
CrowdBunkerIE,
@@ -344,6 +345,10 @@ from .curiositystream import (
CuriosityStreamSeriesIE,
)
from .cwtv import CWTVIE
+from .cybrary import (
+ CybraryIE,
+ CybraryCourseIE
+)
from .daftsex import DaftsexIE
from .dailymail import DailyMailIE
from .dailymotion import (
diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py
index 2deed585f..5e0e2facf 100644
--- a/yt_dlp/extractor/facebook.py
+++ b/yt_dlp/extractor/facebook.py
@@ -397,8 +397,10 @@ class FacebookIE(InfoExtractor):
r'handleWithCustomApplyEach\(\s*ScheduledApplyEach\s*,\s*(\{.+?\})\s*\);', webpage)]
post = traverse_obj(post_data, (
..., 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or []
- media = [m for m in traverse_obj(post, (..., 'attachments', ..., 'media'), expected_type=dict) or []
- if str(m.get('id')) == video_id and m.get('__typename') == 'Video']
+ media = traverse_obj(
+ post,
+ (..., 'attachments', ..., 'media', lambda _, m: str(m['id']) == video_id and m['__typename'] == 'Video'),
+ expected_type=dict)
title = get_first(media, ('title', 'text'))
description = get_first(media, ('creation_story', 'comet_sections', 'message', 'story', 'message', 'text'))
uploader_data = get_first(media, 'owner') or get_first(post, ('node', 'actors', ...)) or {}
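The facebook.py change moves the id/type filter into the traverse_obj path itself: a callable path element receives each (key, value) pair and keeps the values for which it returns true. A minimal sketch with invented post data, assuming the traverse_obj from this tree:

    from yt_dlp.utils import traverse_obj

    post = [{'attachments': [{'media': [
        {'id': '123', '__typename': 'Video'},
        {'id': '456', '__typename': 'Photo'},
    ]}]}]
    media = traverse_obj(
        post,
        (..., 'attachments', ..., 'media', lambda _, m: m['id'] == '123' and m['__typename'] == 'Video'),
        expected_type=dict)
    print(media)  # [{'id': '123', '__typename': 'Video'}]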
diff --git a/yt_dlp/extractor/fivetv.py b/yt_dlp/extractor/fivetv.py
index be81fccb8..d6bebd19b 100644
--- a/yt_dlp/extractor/fivetv.py
+++ b/yt_dlp/extractor/fivetv.py
@@ -75,8 +75,7 @@ class FiveTVIE(InfoExtractor):
r'<a[^>]+?href="([^"]+)"[^>]+?class="videoplayer"'],
webpage, 'video url')
- title = self._og_search_title(webpage, default=None) or self._search_regex(
- r'<title>([^<]+)</title>', webpage, 'title')
+ title = self._og_search_title(webpage, default=None) or self._html_extract_title(webpage)
duration = int_or_none(self._og_search_property(
'video:duration', webpage, 'duration', default=None))
diff --git a/yt_dlp/extractor/foxgay.py b/yt_dlp/extractor/foxgay.py
index 512a10645..1c53e0642 100644
--- a/yt_dlp/extractor/foxgay.py
+++ b/yt_dlp/extractor/foxgay.py
@@ -29,8 +29,7 @@ class FoxgayIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- title = remove_end(self._html_search_regex(
- r'<title>([^<]+)</title>', webpage, 'title'), ' - Foxgay.com')
+ title = remove_end(self._html_extract_title(webpage), ' - Foxgay.com')
description = get_element_by_id('inf_tit', webpage)
# The default user-agent with foxgay cookies leads to pages without videos
diff --git a/yt_dlp/extractor/funimation.py b/yt_dlp/extractor/funimation.py
index 36a9c4772..6aa9bc9ce 100644
--- a/yt_dlp/extractor/funimation.py
+++ b/yt_dlp/extractor/funimation.py
@@ -333,7 +333,7 @@ class FunimationShowIE(FunimationBaseIE):
'https://prod-api-funimationnow.dadcdigital.com/api/funimation/episodes/?limit=99999&title_id=%s'
% show_info.get('id'), display_id)
- vod_items = traverse_obj(items_info, ('items', ..., re.compile('(?i)mostRecent[AS]vod').match, 'item'))
+ vod_items = traverse_obj(items_info, ('items', ..., lambda k, _: re.match(r'(?i)mostRecent[AS]vod', k), 'item'))
return {
'_type': 'playlist',
diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py
index 4a2e30158..bd56ad289 100644
--- a/yt_dlp/extractor/generic.py
+++ b/yt_dlp/extractor/generic.py
@@ -149,6 +149,7 @@ from .blogger import BloggerIE
from .mainstreaming import MainStreamingIE
from .gfycat import GfycatIE
from .panopto import PanoptoBaseIE
+from .ruutu import RuutuIE
class GenericIE(InfoExtractor):
@@ -2511,7 +2512,24 @@ class GenericIE(InfoExtractor):
'id': 'insert-a-quiz-into-a-panopto-video'
},
'playlist_count': 1
- }
+ },
+ {
+ # Ruutu embed
+ 'url': 'https://www.nelonen.fi/ohjelmat/madventures-suomi/2160731-riku-ja-tunna-lahtevat-peurajahtiin-tv-sta-tutun-biologin-kanssa---metsastysreissu-huipentuu-kasvissyojan-painajaiseen',
+ 'md5': 'a2513a98d3496099e6eced40f7e6a14b',
+ 'info_dict': {
+ 'id': '4044426',
+ 'ext': 'mp4',
+ 'title': 'Riku ja Tunna lähtevät peurajahtiin tv:stä tutun biologin kanssa – metsästysreissu huipentuu kasvissyöjän painajaiseen!',
+ 'thumbnail': r're:^https?://.+\.jpg$',
+ 'duration': 108,
+ 'series': 'Madventures Suomi',
+ 'description': 'md5:aa55b44bd06a1e337a6f1d0b46507381',
+ 'categories': ['Matkailu', 'Elämäntyyli'],
+ 'age_limit': 0,
+ 'upload_date': '20220308',
+ },
+ },
]
def report_following_redirect(self, new_url):
@@ -2873,10 +2891,8 @@ class GenericIE(InfoExtractor):
# Site Name | Video Title
# Video Title - Tagline | Site Name
# and so on and so forth; it's just not practical
- video_title = self._og_search_title(
- webpage, default=None) or self._html_search_regex(
- r'(?s)<title>(.*?)</title>', webpage, 'video title',
- default='video')
+ video_title = (self._og_search_title(webpage, default=None)
+ or self._html_extract_title(webpage, 'video title', default='video'))
# Try to detect age limit automatically
age_limit = self._rta_search(webpage)
@@ -3739,6 +3755,12 @@ class GenericIE(InfoExtractor):
panopto_urls = PanoptoBaseIE._extract_urls(webpage)
if panopto_urls:
return self.playlist_from_matches(panopto_urls, video_id, video_title)
+
+ # Look for Ruutu embeds
+ ruutu_url = RuutuIE._extract_url(webpage)
+ if ruutu_url:
+ return self.url_result(ruutu_url, RuutuIE)
+
# Look for HTML5 media
entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
if entries:
@@ -3864,8 +3886,8 @@ class GenericIE(InfoExtractor):
if RtmpIE.suitable(vurl):
return True
vpath = compat_urlparse.urlparse(vurl).path
- vext = determine_ext(vpath)
- return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js', 'xml')
+ vext = determine_ext(vpath, None)
+ return vext not in (None, 'swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js', 'xml')
def filter_video(urls):
return list(filter(check_video, urls))
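Passing an explicit default of None to determine_ext lets check_video reject extension-less paths without the separate '.' in vpath test. A small sketch of the new predicate, with invented paths:

    from yt_dlp.utils import determine_ext

    blocklist = (None, 'swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js', 'xml')
    for vpath in ('/media/clip.mp4', '/media/clip', '/img/logo.png'):
        vext = determine_ext(vpath, None)  # None instead of the 'unknown_video' default
        print(vpath, '->', vext, 'accepted' if vext not in blocklist else 'rejected')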
diff --git a/yt_dlp/extractor/glide.py b/yt_dlp/extractor/glide.py
index d94dfbf09..12af859be 100644
--- a/yt_dlp/extractor/glide.py
+++ b/yt_dlp/extractor/glide.py
@@ -23,9 +23,7 @@ class GlideIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
- title = self._html_search_regex(
- r'<title>(.+?)</title>', webpage,
- 'title', default=None) or self._og_search_title(webpage)
+ title = self._html_extract_title(webpage, default=None) or self._og_search_title(webpage)
video_url = self._proto_relative_url(self._search_regex(
r'<source[^>]+src=(["\'])(?P<url>.+?)\1',
webpage, 'video URL', default=None,
diff --git a/yt_dlp/extractor/hellporno.py b/yt_dlp/extractor/hellporno.py
index fae425103..92d32cdcc 100644
--- a/yt_dlp/extractor/hellporno.py
+++ b/yt_dlp/extractor/hellporno.py
@@ -38,8 +38,7 @@ class HellPornoIE(InfoExtractor):
webpage = self._download_webpage(url, display_id)
- title = remove_end(self._html_search_regex(
- r'<title>([^<]+)</title>', webpage, 'title'), ' - Hell Porno')
+ title = remove_end(self._html_extract_title(webpage), ' - Hell Porno')
info = self._parse_html5_media_entries(url, webpage, display_id)[0]
self._sort_formats(info['formats'])
diff --git a/yt_dlp/extractor/huya.py b/yt_dlp/extractor/huya.py
index b81439682..4e96f22fa 100644
--- a/yt_dlp/extractor/huya.py
+++ b/yt_dlp/extractor/huya.py
@@ -66,8 +66,7 @@ class HuyaLiveIE(InfoExtractor):
room_info = try_get(stream_data, lambda x: x['data'][0]['gameLiveInfo'])
if not room_info:
raise ExtractorError('Can not extract the room info', expected=True)
- title = room_info.get('roomName') or room_info.get('introduction') or self._html_search_regex(
- r'<title>([^<]+)</title>', webpage, 'title')
+ title = room_info.get('roomName') or room_info.get('introduction') or self._html_extract_title(webpage)
screen_type = room_info.get('screenType')
live_source_type = room_info.get('liveSourceType')
stream_info_list = stream_data['data'][0]['gameStreamInfoList']
diff --git a/yt_dlp/extractor/imdb.py b/yt_dlp/extractor/imdb.py
index 24f1fde64..96cee2e2f 100644
--- a/yt_dlp/extractor/imdb.py
+++ b/yt_dlp/extractor/imdb.py
@@ -7,9 +7,10 @@ import re
from .common import InfoExtractor
from ..utils import (
determine_ext,
+ int_or_none,
mimetype2ext,
- parse_duration,
qualities,
+ traverse_obj,
try_get,
url_or_none,
)
@@ -28,6 +29,17 @@ class ImdbIE(InfoExtractor):
'title': 'No. 2',
'description': 'md5:87bd0bdc61e351f21f20d2d7441cb4e7',
'duration': 152,
+ 'thumbnail': r're:^https?://.+\.jpg',
+ }
+ }, {
+ 'url': 'https://www.imdb.com/video/vi3516832537',
+ 'info_dict': {
+ 'id': '3516832537',
+ 'ext': 'mp4',
+ 'title': 'Paul: U.S. Trailer #1',
+ 'description': 'md5:17fcc4fe11ec29b4399be9d4c5ef126c',
+ 'duration': 153,
+ 'thumbnail': r're:^https?://.+\.jpg',
}
}, {
'url': 'http://www.imdb.com/video/_/vi2524815897',
@@ -51,8 +63,13 @@ class ImdbIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
-
- data = self._download_json(
+ webpage = self._download_webpage(f'https://www.imdb.com/video/vi{video_id}', video_id)
+ info = self._search_nextjs_data(webpage, video_id)
+ video_info = traverse_obj(info, ('props', 'pageProps', 'videoPlaybackData', 'video'), default={})
+ title = (traverse_obj(video_info, ('name', 'value'), ('primaryTitle', 'titleText', 'text'))
+ or self._html_search_meta(('og:title', 'twitter:title'), webpage, default=None)
+ or self._html_extract_title(webpage))
+ data = video_info.get('playbackURLs') or try_get(self._download_json(
'https://www.imdb.com/ve/data/VIDEO_PLAYBACK_DATA', video_id,
query={
'key': base64.b64encode(json.dumps({
@@ -60,11 +77,10 @@ class ImdbIE(InfoExtractor):
'subType': 'FORCE_LEGACY',
'id': 'vi%s' % video_id,
}).encode()).decode(),
- })[0]
-
+ }), lambda x: x[0]['videoLegacyEncodings'])
quality = qualities(('SD', '480p', '720p', '1080p'))
- formats = []
- for encoding in data['videoLegacyEncodings']:
+ formats, subtitles = [], {}
+ for encoding in data:
if not encoding or not isinstance(encoding, dict):
continue
video_url = url_or_none(encoding.get('url'))
@@ -73,11 +89,13 @@ class ImdbIE(InfoExtractor):
ext = mimetype2ext(encoding.get(
'mimeType')) or determine_ext(video_url)
if ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(
video_url, video_id, 'mp4', entry_protocol='m3u8_native',
- preference=1, m3u8_id='hls', fatal=False))
+ preference=1, m3u8_id='hls', fatal=False)
+ subtitles = self._merge_subtitles(subtitles, subs)
+ formats.extend(fmts)
continue
- format_id = encoding.get('definition')
+ format_id = traverse_obj(encoding, ('displayName', 'value'), 'definition')
formats.append({
'format_id': format_id,
'url': video_url,
@@ -86,33 +104,15 @@ class ImdbIE(InfoExtractor):
})
self._sort_formats(formats)
- webpage = self._download_webpage(
- 'https://www.imdb.com/video/vi' + video_id, video_id)
- video_metadata = self._parse_json(self._search_regex(
- r'args\.push\(\s*({.+?})\s*\)\s*;', webpage,
- 'video metadata'), video_id)
-
- video_info = video_metadata.get('VIDEO_INFO')
- if video_info and isinstance(video_info, dict):
- info = try_get(
- video_info, lambda x: x[list(video_info.keys())[0]][0], dict)
- else:
- info = {}
-
- title = self._html_search_meta(
- ['og:title', 'twitter:title'], webpage) or self._html_search_regex(
- r'<title>(.+?)</title>', webpage, 'title',
- default=None) or info['videoTitle']
-
return {
'id': video_id,
'title': title,
'alt_title': info.get('videoSubTitle'),
'formats': formats,
- 'description': info.get('videoDescription'),
- 'thumbnail': url_or_none(try_get(
- info, lambda x: x['videoSlate']['source'])),
- 'duration': parse_duration(info.get('videoRuntime')),
+ 'description': try_get(video_info, lambda x: x['description']['value']),
+ 'thumbnail': url_or_none(try_get(video_info, lambda x: x['thumbnail']['url'])),
+ 'duration': int_or_none(try_get(video_info, lambda x: x['runtime']['value'])),
+ 'subtitles': subtitles,
}
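The rewritten imdb.py reads metadata from the page's Next.js payload instead of the legacy args.push() blob. In miniature, _search_nextjs_data amounts to parsing the __NEXT_DATA__ script tag; the page snippet below is invented:

    import json
    import re

    webpage = ('<script id="__NEXT_DATA__" type="application/json">'
               '{"props": {"pageProps": {"videoPlaybackData": '
               '{"video": {"name": {"value": "Trailer"}}}}}}</script>')
    next_data = json.loads(re.search(
        r'<script id="__NEXT_DATA__"[^>]*>(.+?)</script>', webpage).group(1))
    print(next_data['props']['pageProps']['videoPlaybackData']['video']['name']['value'])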
diff --git a/yt_dlp/extractor/infoq.py b/yt_dlp/extractor/infoq.py
index 0a70a1fb4..347cc5154 100644
--- a/yt_dlp/extractor/infoq.py
+++ b/yt_dlp/extractor/infoq.py
@@ -115,7 +115,7 @@ class InfoQIE(BokeCCBaseIE):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- video_title = self._html_search_regex(r'<title>(.*?)</title>', webpage, 'title')
+ video_title = self._html_extract_title(webpage)
video_description = self._html_search_meta('description', webpage, 'description')
if '/cn/' in url:
diff --git a/yt_dlp/extractor/iqiyi.py b/yt_dlp/extractor/iqiyi.py
index b13b9f4cf..d07b39d48 100644
--- a/yt_dlp/extractor/iqiyi.py
+++ b/yt_dlp/extractor/iqiyi.py
@@ -9,14 +9,28 @@ import time
from .common import InfoExtractor
from ..compat import (
compat_str,
+ compat_urllib_parse_unquote
)
+from .openload import PhantomJSwrapper
from ..utils import (
clean_html,
+ ExtractorError,
+ float_or_none,
+ format_field,
get_element_by_id,
get_element_by_attribute,
- ExtractorError,
+ int_or_none,
+ js_to_json,
ohdave_rsa_encrypt,
+ parse_age_limit,
+ parse_duration,
+ parse_iso8601,
+ parse_resolution,
+ qualities,
remove_start,
+ str_or_none,
+ traverse_obj,
+ urljoin,
)
@@ -96,9 +110,6 @@ class IqiyiIE(InfoExtractor):
'18': 7, # 1080p
}
- def _real_initialize(self):
- self._login()
-
@staticmethod
def _rsa_fun(data):
# public key extracted from http://static.iqiyi.com/js/qiyiV2/20160129180840/jobs/i18n/i18nIndex.js
@@ -107,7 +118,7 @@ class IqiyiIE(InfoExtractor):
return ohdave_rsa_encrypt(data, e, N)
- def _login(self):
+ def _perform_login(self):
raise ExtractorError("iQiyi's non-free authentication algorithm has made login impossible", expected=True)
def get_raw_data(self, tvid, video_id):
@@ -217,3 +228,359 @@ class IqiyiIE(InfoExtractor):
'title': title,
'formats': formats,
}
+
+
+class IqIE(InfoExtractor):
+ IE_NAME = 'iq.com'
+ IE_DESC = 'International version of iQiyi'
+ _VALID_URL = r'https?://(?:www\.)?iq\.com/play/(?:[\w%-]*-)?(?P<id>\w+)'
+ _TESTS = [{
+ 'url': 'https://www.iq.com/play/one-piece-episode-1000-1ma1i6ferf4',
+ 'md5': '2d7caf6eeca8a32b407094b33b757d39',
+ 'info_dict': {
+ 'ext': 'mp4',
+ 'id': '1ma1i6ferf4',
+ 'title': '航海王 第1000集',
+ 'description': 'Subtitle available on Sunday 4PM(GMT+8).',
+ 'duration': 1430,
+ 'timestamp': 1637488203,
+ 'upload_date': '20211121',
+ 'episode_number': 1000,
+ 'episode': 'Episode 1000',
+ 'series': 'One Piece',
+ 'age_limit': 13,
+ 'average_rating': float,
+ },
+ 'params': {
+ 'format': '500',
+ },
+ 'expected_warnings': ['format is restricted']
+ }, {
+ # VIP-restricted video
+ 'url': 'https://www.iq.com/play/mermaid-in-the-fog-2021-gbdpx13bs4',
+ 'only_matching': True
+ }]
+ _BID_TAGS = {
+ '100': '240P',
+ '200': '360P',
+ '300': '480P',
+ '500': '720P',
+ '600': '1080P',
+ '610': '1080P50',
+ '700': '2K',
+ '800': '4K',
+ }
+ _LID_TAGS = {
+ '1': 'zh_CN',
+ '2': 'zh_TW',
+ '3': 'en',
+ '18': 'th',
+ '21': 'my',
+ '23': 'vi',
+ '24': 'id',
+ '26': 'es',
+ '28': 'ar',
+ }
+
+ _DASH_JS = '''
+ console.log(page.evaluate(function() {
+ var tvid = "%(tvid)s"; var vid = "%(vid)s"; var src = "%(src)s";
+ var uid = "%(uid)s"; var dfp = "%(dfp)s"; var mode = "%(mode)s"; var lang = "%(lang)s";
+ var bid_list = %(bid_list)s; var ut_list = %(ut_list)s; var tm = new Date().getTime();
+ var cmd5x_func = %(cmd5x_func)s; var cmd5x_exporter = {}; cmd5x_func({}, cmd5x_exporter, {}); var cmd5x = cmd5x_exporter.cmd5x;
+ var authKey = cmd5x(cmd5x('') + tm + '' + tvid);
+ var k_uid = Array.apply(null, Array(32)).map(function() {return Math.floor(Math.random() * 15).toString(16)}).join('');
+ var dash_paths = {};
+ bid_list.forEach(function(bid) {
+ var query = {
+ 'tvid': tvid,
+ 'bid': bid,
+ 'ds': 1,
+ 'vid': vid,
+ 'src': src,
+ 'vt': 0,
+ 'rs': 1,
+ 'uid': uid,
+ 'ori': 'pcw',
+ 'ps': 1,
+ 'k_uid': k_uid,
+ 'pt': 0,
+ 'd': 0,
+ 's': '',
+ 'lid': '',
+ 'slid': 0,
+ 'cf': '',
+ 'ct': '',
+ 'authKey': authKey,
+ 'k_tag': 1,
+ 'ost': 0,
+ 'ppt': 0,
+ 'dfp': dfp,
+ 'prio': JSON.stringify({
+ 'ff': 'f4v',
+ 'code': 2
+ }),
+ 'k_err_retries': 0,
+ 'up': '',
+ 'su': 2,
+ 'applang': lang,
+ 'sver': 2,
+ 'X-USER-MODE': mode,
+ 'qd_v': 2,
+ 'tm': tm,
+ 'qdy': 'a',
+ 'qds': 0,
+ 'k_ft1': 141287244169348,
+ 'k_ft4': 34359746564,
+ 'k_ft5': 1,
+ 'bop': JSON.stringify({
+ 'version': '10.0',
+ 'dfp': dfp
+ }),
+ };
+ var enc_params = [];
+ for (var prop in query) {
+ enc_params.push(encodeURIComponent(prop) + '=' + encodeURIComponent(query[prop]));
+ }
+ ut_list.forEach(function(ut) {
+ enc_params.push('ut=' + ut);
+ })
+ var dash_path = '/dash?' + enc_params.join('&'); dash_path += '&vf=' + cmd5x(dash_path);
+ dash_paths[bid] = dash_path;
+ });
+ return JSON.stringify(dash_paths);
+ }));
+ saveAndExit();
+ '''
+
+ def _extract_vms_player_js(self, webpage, video_id):
+ player_js_cache = self._downloader.cache.load('iq', 'player_js')
+ if player_js_cache:
+ return player_js_cache
+ webpack_js_url = self._proto_relative_url(self._search_regex(
+ r'<script src="((?:https?)?//stc.iqiyipic.com/_next/static/chunks/webpack-\w+\.js)"', webpage, 'webpack URL'))
+ webpack_js = self._download_webpage(webpack_js_url, video_id, note='Downloading webpack JS', errnote='Unable to download webpack JS')
+ webpack_map1, webpack_map2 = [self._parse_json(js_map, video_id, transform_source=js_to_json) for js_map in self._search_regex(
+ r'\(({[^}]*})\[\w+\][^\)]*\)\s*\+\s*["\']\.["\']\s*\+\s*({[^}]*})\[\w+\]\+["\']\.js', webpack_js, 'JS locations', group=(1, 2))]
+ for module_index in reversed(list(webpack_map2.keys())):
+ module_js = self._download_webpage(
+ f'https://stc.iqiyipic.com/_next/static/chunks/{webpack_map1.get(module_index, module_index)}.{webpack_map2[module_index]}.js',
+ video_id, note=f'Downloading #{module_index} module JS', errnote='Unable to download module JS', fatal=False) or ''
+ if 'vms request' in module_js:
+ self._downloader.cache.store('iq', 'player_js', module_js)
+ return module_js
+ raise ExtractorError('Unable to extract player JS')
+
+ def _extract_cmd5x_function(self, webpage, video_id):
+ return self._search_regex(r',\s*(function\s*\([^\)]*\)\s*{\s*var _qda.+_qdc\(\)\s*})\s*,',
+ self._extract_vms_player_js(webpage, video_id), 'signature function')
+
+ def _update_bid_tags(self, webpage, video_id):
+ extracted_bid_tags = self._parse_json(
+ self._search_regex(
+ r'arguments\[1\][^,]*,\s*function\s*\([^\)]*\)\s*{\s*"use strict";?\s*var \w=({.+}})\s*,\s*\w\s*=\s*{\s*getNewVd',
+ self._extract_vms_player_js(webpage, video_id), 'video tags', default=''),
+ video_id, transform_source=js_to_json, fatal=False)
+ if not extracted_bid_tags:
+ return
+ self._BID_TAGS = {
+ bid: traverse_obj(extracted_bid_tags, (bid, 'value'), expected_type=str, default=self._BID_TAGS.get(bid))
+ for bid in extracted_bid_tags.keys()
+ }
+
+ def _get_cookie(self, name, default=None):
+ cookie = self._get_cookies('https://iq.com/').get(name)
+ return cookie.value if cookie else default
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ self._update_bid_tags(webpage, video_id)
+
+ next_props = self._search_nextjs_data(webpage, video_id)['props']
+ page_data = next_props['initialState']['play']
+ video_info = page_data['curVideoInfo']
+
+ uid = traverse_obj(
+ self._parse_json(
+ self._get_cookie('I00002', '{}'), video_id, transform_source=compat_urllib_parse_unquote, fatal=False),
+ ('data', 'uid'), default=0)
+
+ if uid:
+ vip_data = self._download_json(
+ 'https://pcw-api.iq.com/api/vtype', video_id, note='Downloading VIP data', errnote='Unable to download VIP data', query={
+ 'batch': 1,
+ 'platformId': 3,
+ 'modeCode': self._get_cookie('mod', 'intl'),
+ 'langCode': self._get_cookie('lang', 'en_us'),
+ 'deviceId': self._get_cookie('QC005', '')
+ }, fatal=False)
+ ut_list = traverse_obj(vip_data, ('data', 'all_vip', ..., 'vipType'), expected_type=str_or_none, default=[])
+ else:
+ ut_list = ['0']
+
+ # bid 0 as an initial format checker
+ dash_paths = self._parse_json(PhantomJSwrapper(self).get(
+ url, html='<!DOCTYPE html>', video_id=video_id, note2='Executing signature code', jscode=self._DASH_JS % {
+ 'tvid': video_info['tvId'],
+ 'vid': video_info['vid'],
+ 'src': traverse_obj(next_props, ('initialProps', 'pageProps', 'ptid'),
+ expected_type=str, default='04022001010011000000'),
+ 'uid': uid,
+ 'dfp': self._get_cookie('dfp', ''),
+ 'mode': self._get_cookie('mod', 'intl'),
+ 'lang': self._get_cookie('lang', 'en_us'),
+ 'bid_list': '[' + ','.join(['0', *self._BID_TAGS.keys()]) + ']',
+ 'ut_list': '[' + ','.join(ut_list) + ']',
+ 'cmd5x_func': self._extract_cmd5x_function(webpage, video_id),
+ })[1].strip(), video_id)
+
+ formats, subtitles = [], {}
+ initial_format_data = self._download_json(
+ urljoin('https://cache-video.iq.com', dash_paths['0']), video_id,
+ note='Downloading initial video format info', errnote='Unable to download initial video format info')['data']
+
+ preview_time = traverse_obj(
+ initial_format_data, ('boss_ts', (None, 'data'), ('previewTime', 'rtime')), expected_type=float_or_none, get_all=False)
+ if traverse_obj(initial_format_data, ('boss_ts', 'data', 'prv'), expected_type=int_or_none):
+ self.report_warning('This preview video is limited%s' % format_field(preview_time, template=' to %s seconds'))
+
+ # TODO: Extract audio-only formats
+ for bid in set(traverse_obj(initial_format_data, ('program', 'video', ..., 'bid'), expected_type=str_or_none, default=[])):
+ dash_path = dash_paths.get(bid)
+ if not dash_path:
+ self.report_warning(f'Unknown format id: {bid}. It is currently not being extracted')
+ continue
+ format_data = traverse_obj(self._download_json(
+ urljoin('https://cache-video.iq.com', dash_path), video_id,
+ note=f'Downloading format data for {self._BID_TAGS[bid]}', errnote='Unable to download format data',
+ fatal=False), 'data', expected_type=dict)
+
+ video_format = traverse_obj(format_data, ('program', 'video', lambda _, v: str(v['bid']) == bid),
+ expected_type=dict, default=[], get_all=False) or {}
+ extracted_formats = []
+ if video_format.get('m3u8Url'):
+ extracted_formats.extend(self._extract_m3u8_formats(
+ urljoin(format_data.get('dm3u8', 'https://cache-m.iq.com/dc/dt/'), video_format['m3u8Url']),
+ 'mp4', m3u8_id=bid, fatal=False))
+ if video_format.get('mpdUrl'):
+ # TODO: Properly extract mpd hostname
+ extracted_formats.extend(self._extract_mpd_formats(
+ urljoin(format_data.get('dm3u8', 'https://cache-m.iq.com/dc/dt/'), video_format['mpdUrl']),
+ mpd_id=bid, fatal=False))
+ if video_format.get('m3u8'):
+ ff = video_format.get('ff', 'ts')
+ if ff == 'ts':
+ m3u8_formats, _ = self._parse_m3u8_formats_and_subtitles(
+ video_format['m3u8'], ext='mp4', m3u8_id=bid, fatal=False)
+ extracted_formats.extend(m3u8_formats)
+ elif ff == 'm4s':
+ mpd_data = traverse_obj(
+ self._parse_json(video_format['m3u8'], video_id, fatal=False), ('payload', ..., 'data'), expected_type=str)
+ if not mpd_data:
+ continue
+ mpd_formats, _ = self._parse_mpd_formats_and_subtitles(
+ mpd_data, bid, format_data.get('dm3u8', 'https://cache-m.iq.com/dc/dt/'))
+ extracted_formats.extend(mpd_formats)
+ else:
+ self.report_warning(f'{ff} formats are currently not supported')
+
+ if not extracted_formats:
+ if video_format.get('s'):
+ self.report_warning(f'{self._BID_TAGS[bid]} format is restricted')
+ else:
+ self.report_warning(f'Unable to extract {self._BID_TAGS[bid]} format')
+ for f in extracted_formats:
+ f.update({
+ 'quality': qualities(list(self._BID_TAGS.keys()))(bid),
+ 'format_note': self._BID_TAGS[bid],
+ **parse_resolution(video_format.get('scrsz'))
+ })
+ formats.extend(extracted_formats)
+
+ self._sort_formats(formats)
+
+ for sub_format in traverse_obj(initial_format_data, ('program', 'stl', ...), expected_type=dict, default=[]):
+ lang = self._LID_TAGS.get(str_or_none(sub_format.get('lid')), sub_format.get('_name'))
+ subtitles.setdefault(lang, []).extend([{
+ 'ext': format_ext,
+ 'url': urljoin(initial_format_data.get('dstl', 'http://meta.video.iqiyi.com'), sub_format[format_key])
+ } for format_key, format_ext in [('srt', 'srt'), ('webvtt', 'vtt')] if sub_format.get(format_key)])
+
+ extra_metadata = page_data.get('albumInfo') if video_info.get('albumId') and page_data.get('albumInfo') else video_info
+ return {
+ 'id': video_id,
+ 'title': video_info['name'],
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'description': video_info.get('mergeDesc'),
+ 'duration': parse_duration(video_info.get('len')),
+ 'age_limit': parse_age_limit(video_info.get('rating')),
+ 'average_rating': traverse_obj(page_data, ('playScoreInfo', 'score'), expected_type=float_or_none),
+ 'timestamp': parse_iso8601(video_info.get('isoUploadDate')),
+ 'categories': traverse_obj(extra_metadata, ('videoTagMap', ..., ..., 'name'), expected_type=str),
+ 'cast': traverse_obj(extra_metadata, ('actorArr', ..., 'name'), expected_type=str),
+ 'episode_number': int_or_none(video_info.get('order')) or None,
+ 'series': video_info.get('albumName'),
+ }
+
+
+class IqAlbumIE(InfoExtractor):
+ IE_NAME = 'iq.com:album'
+ _VALID_URL = r'https?://(?:www\.)?iq\.com/album/(?:[\w%-]*-)?(?P<id>\w+)'
+ _TESTS = [{
+ 'url': 'https://www.iq.com/album/one-piece-1999-1bk9icvr331',
+ 'info_dict': {
+ 'id': '1bk9icvr331',
+ 'title': 'One Piece',
+ 'description': 'Subtitle available on Sunday 4PM(GMT+8).'
+ },
+ 'playlist_mincount': 238
+ }, {
+ # Movie/single video
+ 'url': 'https://www.iq.com/album/九龙城寨-2021-22yjnij099k',
+ 'info_dict': {
+ 'ext': 'mp4',
+ 'id': '22yjnij099k',
+ 'title': '九龙城寨',
+ 'description': 'md5:8a09f50b8ba0db4dc69bc7c844228044',
+ 'duration': 5000,
+ 'timestamp': 1641911371,
+ 'upload_date': '20220111',
+ 'series': '九龙城寨',
+ 'cast': ['Shi Yan Neng', 'Yu Lang', 'Peter lv', 'Sun Zi Jun', 'Yang Xiao Bo'],
+ 'age_limit': 13,
+ 'average_rating': float,
+ },
+ 'expected_warnings': ['format is restricted']
+ }]
+
+ def _entries(self, album_id_num, page_ranges, album_id=None, mode_code='intl', lang_code='en_us'):
+ for page_range in page_ranges:
+ page = self._download_json(
+ f'https://pcw-api.iq.com/api/episodeListSource/{album_id_num}', album_id,
+ note=f'Downloading video list episodes {page_range.get("msg", "")}',
+ errnote='Unable to download video list', query={
+ 'platformId': 3,
+ 'modeCode': mode_code,
+ 'langCode': lang_code,
+ 'endOrder': page_range['to'],
+ 'startOrder': page_range['from']
+ })
+ for video in page['data']['epg']:
+ yield self.url_result('https://www.iq.com/play/%s' % (video.get('playLocSuffix') or video['qipuIdStr']),
+ IqIE.ie_key(), video.get('qipuIdStr'), video.get('name'))
+
+ def _real_extract(self, url):
+ album_id = self._match_id(url)
+ webpage = self._download_webpage(url, album_id)
+ next_data = self._search_nextjs_data(webpage, album_id)
+ album_data = next_data['props']['initialState']['album']['videoAlbumInfo']
+
+ if album_data.get('videoType') == 'singleVideo':
+ return self.url_result('https://www.iq.com/play/%s' % album_id, IqIE.ie_key())
+ return self.playlist_result(
+ self._entries(album_data['albumId'], album_data['totalPageRange'], album_id,
+ traverse_obj(next_data, ('props', 'initialProps', 'pageProps', 'modeCode')),
+ traverse_obj(next_data, ('props', 'initialProps', 'pageProps', 'langCode'))),
+ album_id, album_data.get('name'), album_data.get('desc'))
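The new iq.com extractor ranks formats by their position in _BID_TAGS using the qualities() helper, which maps an id to its index in a preference-ordered list and to -1 when unknown. A minimal sketch:

    from yt_dlp.utils import qualities

    bid_tags = {'100': '240P', '200': '360P', '500': '720P', '600': '1080P'}
    q = qualities(list(bid_tags.keys()))  # later entries rank higher
    for bid in ('600', '200', 'bogus'):
        print(bid, bid_tags.get(bid), q(bid))  # 3, 1 and -1 respectively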
diff --git a/yt_dlp/extractor/iwara.py b/yt_dlp/extractor/iwara.py
index 254d98692..c0e01e352 100644
--- a/yt_dlp/extractor/iwara.py
+++ b/yt_dlp/extractor/iwara.py
@@ -76,8 +76,7 @@ class IwaraIE(InfoExtractor):
'age_limit': age_limit,
}
- title = remove_end(self._html_search_regex(
- r'<title>([^<]+)</title>', webpage, 'title'), ' | Iwara')
+ title = remove_end(self._html_extract_title(webpage), ' | Iwara')
thumbnail = self._html_search_regex(
r'poster=[\'"]([^\'"]+)', webpage, 'thumbnail', default=None)
diff --git a/yt_dlp/extractor/limelight.py b/yt_dlp/extractor/limelight.py
index 369141d67..b20681ad1 100644
--- a/yt_dlp/extractor/limelight.py
+++ b/yt_dlp/extractor/limelight.py
@@ -194,7 +194,7 @@ class LimelightBaseIE(InfoExtractor):
cc_url = cc.get('webvttFileUrl')
if not cc_url:
continue
- lang = cc.get('languageCode') or self._search_regex(r'/[a-z]{2}\.vtt', cc_url, 'lang', default='en')
+ lang = cc.get('languageCode') or self._search_regex(r'/([a-z]{2})\.vtt', cc_url, 'lang', default='en')
subtitles.setdefault(lang, []).append({
'url': cc_url,
})
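The limelight.py fix adds the capture group the old pattern lacked, so the language code can actually be extracted from the subtitle URL. The same match with plain re, URL invented:

    import re

    cc_url = 'https://captions.example/subs/en.vtt'
    match = re.search(r'/([a-z]{2})\.vtt', cc_url)
    print(match.group(1) if match else 'en')  # 'en' from the URL, else the default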
diff --git a/yt_dlp/extractor/linkedin.py b/yt_dlp/extractor/linkedin.py
index bf549e164..0f57bfa06 100644
--- a/yt_dlp/extractor/linkedin.py
+++ b/yt_dlp/extractor/linkedin.py
@@ -102,7 +102,7 @@ class LinkedInIE(LinkedInBaseIE):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- title = self._html_search_regex(r'<title>([^<]+)</title>', webpage, 'title')
+ title = self._html_extract_title(webpage)
description = clean_html(get_element_by_class('share-update-card__update-text', webpage))
like_count = int_or_none(get_element_by_class('social-counts-reactions__social-counts-numRections', webpage))
creator = strip_or_none(clean_html(get_element_by_class('comment__actor-name', webpage)))
diff --git a/yt_dlp/extractor/mediasite.py b/yt_dlp/extractor/mediasite.py
index ace86c2fd..fbf9223b2 100644
--- a/yt_dlp/extractor/mediasite.py
+++ b/yt_dlp/extractor/mediasite.py
@@ -14,6 +14,7 @@ from ..utils import (
float_or_none,
mimetype2ext,
str_or_none,
+ try_call,
try_get,
unescapeHTML,
unsmuggle_url,
@@ -145,11 +146,11 @@ class MediasiteIE(InfoExtractor):
'duration': slide['Time'] / 1000,
})
- next_time = try_get(None, [
- lambda _: Stream['Slides'][i + 1]['Time'],
- lambda _: duration,
- lambda _: slide['Time'],
- ], expected_type=(int, float))
+ next_time = try_call(
+ lambda: Stream['Slides'][i + 1]['Time'],
+ lambda: duration,
+ lambda: slide['Time'],
+ expected_type=(int, float))
fragments.append({
'path': fname_template.format(slide.get('Number', i + 1)),
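try_call replaces the old try_get(None, [...]) idiom: it returns the result of the first callable that neither raises a common lookup error nor returns a value failing the expected_type check. A minimal sketch of the slide-timing fallback, assuming a yt-dlp checkout that includes try_call:

    from yt_dlp.utils import try_call

    slides = [{'Time': 0}, {'Time': 5000}]
    i, duration, slide = 1, None, slides[1]
    next_time = try_call(
        lambda: slides[i + 1]['Time'],  # IndexError: there is no next slide
        lambda: duration,               # None fails the expected_type check
        lambda: slide['Time'],          # 5000 is returned
        expected_type=(int, float))
    print(next_time)  # 5000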
diff --git a/yt_dlp/extractor/miaopai.py b/yt_dlp/extractor/miaopai.py
index f9e35ac7f..cf0610bdf 100644
--- a/yt_dlp/extractor/miaopai.py
+++ b/yt_dlp/extractor/miaopai.py
@@ -24,8 +24,7 @@ class MiaoPaiIE(InfoExtractor):
webpage = self._download_webpage(
url, video_id, headers={'User-Agent': self._USER_AGENT_IPAD})
- title = self._html_search_regex(
- r'<title>([^<]+)</title>', webpage, 'title')
+ title = self._html_extract_title(webpage)
thumbnail = self._html_search_regex(
r'<div[^>]+class=(?P<q1>[\'"]).*\bvideo_img\b.*(?P=q1)[^>]+data-url=(?P<q2>[\'"])(?P<url>[^\'"]+)(?P=q2)',
webpage, 'thumbnail', fatal=False, group='url')
diff --git a/yt_dlp/extractor/mojvideo.py b/yt_dlp/extractor/mojvideo.py
index 0421f3f44..16d94052b 100644
--- a/yt_dlp/extractor/mojvideo.py
+++ b/yt_dlp/extractor/mojvideo.py
@@ -38,8 +38,7 @@ class MojvideoIE(InfoExtractor):
r'<errordesc>([^<]*)</errordesc>', playerapi, 'error description', fatal=False)
raise ExtractorError('%s said: %s' % (self.IE_NAME, error_desc), expected=True)
- title = self._html_search_regex(
- r'<title>([^<]+)</title>', playerapi, 'title')
+ title = self._html_extract_title(playerapi)
video_url = self._html_search_regex(
r'<file>([^<]+)</file>', playerapi, 'video URL')
thumbnail = self._html_search_regex(
diff --git a/yt_dlp/extractor/nebula.py b/yt_dlp/extractor/nebula.py
index b77ef5f28..77f253519 100644
--- a/yt_dlp/extractor/nebula.py
+++ b/yt_dlp/extractor/nebula.py
@@ -86,7 +86,7 @@ class NebulaBaseIE(InfoExtractor):
# if 401 or 403, attempt credential re-auth and retry
if exc.cause and isinstance(exc.cause, urllib.error.HTTPError) and exc.cause.code in (401, 403):
self.to_screen(f'Reauthenticating to Nebula and retrying, because last {auth_type} call resulted in error {exc.cause.code}')
- self._login()
+ self._perform_login()
return inner_call()
else:
raise
diff --git a/yt_dlp/extractor/newgrounds.py b/yt_dlp/extractor/newgrounds.py
index 1e1274ef0..6525a6d8a 100644
--- a/yt_dlp/extractor/newgrounds.py
+++ b/yt_dlp/extractor/newgrounds.py
@@ -106,8 +106,7 @@ class NewgroundsIE(InfoExtractor):
uploader = None
webpage = self._download_webpage(url, media_id)
- title = self._html_search_regex(
- r'<title>(.+?)</title>', webpage, 'title')
+ title = self._html_extract_title(webpage)
media_url_string = self._search_regex(
r'"url"\s*:\s*("[^"]+"),', webpage, 'media url', default=None)
@@ -219,8 +218,7 @@ class NewgroundsPlaylistIE(InfoExtractor):
webpage = self._download_webpage(url, playlist_id)
- title = self._search_regex(
- r'<title>([^>]+)</title>', webpage, 'title', default=None)
+ title = self._html_extract_title(webpage, default=None)
# cut left menu
webpage = self._search_regex(
diff --git a/yt_dlp/extractor/nhk.py b/yt_dlp/extractor/nhk.py
index 626c6379b..3b8efc3e6 100644
--- a/yt_dlp/extractor/nhk.py
+++ b/yt_dlp/extractor/nhk.py
@@ -309,7 +309,9 @@ class NhkForSchoolProgramListIE(InfoExtractor):
webpage = self._download_webpage(f'https://www.nhk.or.jp/school/{program_id}/', program_id)
- title = self._og_search_title(webpage, fatal=False) or self._html_extract_title(webpage, fatal=False) or self._html_search_regex(r'<h3>([^<]+?)とは?\s*</h3>', webpage, 'title', fatal=False)
+ title = (self._og_search_title(webpage)
+ or self._html_extract_title(webpage)
+ or self._html_search_regex(r'<h3>([^<]+?)とは?\s*</h3>', webpage, 'title', fatal=False))
title = re.sub(r'\s*\|\s*NHK\s+for\s+School\s*$', '', title) if title else None
description = self._html_search_regex(
r'(?s)<div\s+class="programDetail\s*">\s*<p>[^<]+</p>',
diff --git a/yt_dlp/extractor/niconico.py b/yt_dlp/extractor/niconico.py
index 74828f833..4eb6ed070 100644
--- a/yt_dlp/extractor/niconico.py
+++ b/yt_dlp/extractor/niconico.py
@@ -25,7 +25,10 @@ from ..utils import (
parse_duration,
parse_filesize,
parse_iso8601,
+ parse_resolution,
+ qualities,
remove_start,
+ str_or_none,
traverse_obj,
try_get,
unescapeHTML,
@@ -430,18 +433,25 @@ class NiconicoIE(InfoExtractor):
# find in json (logged in)
tags = traverse_obj(api_data, ('tag', 'items', ..., 'name'))
+ thumb_prefs = qualities(['url', 'middleUrl', 'largeUrl', 'player', 'ogp'])
+
return {
'id': video_id,
'_api_data': api_data,
'title': get_video_info(('originalTitle', 'title')) or self._og_search_title(webpage, default=None),
'formats': formats,
- 'thumbnail': get_video_info('thumbnail', 'url') or self._html_search_meta(
- ('image', 'og:image'), webpage, 'thumbnail', default=None),
+ 'thumbnails': [{
+ 'id': key,
+ 'url': url,
+ 'ext': 'jpg',
+ 'preference': thumb_prefs(key),
+ **parse_resolution(url, lenient=True),
+ } for key, url in (get_video_info('thumbnail') or {}).items() if url],
'description': clean_html(get_video_info('description')),
- 'uploader': traverse_obj(api_data, ('owner', 'nickname')),
+ 'uploader': traverse_obj(api_data, ('owner', 'nickname'), ('channel', 'name'), ('community', 'name')),
+ 'uploader_id': str_or_none(traverse_obj(api_data, ('owner', 'id'), ('channel', 'id'), ('community', 'id'))),
'timestamp': parse_iso8601(get_video_info('registeredAt')) or parse_iso8601(
self._html_search_meta('video:release_date', webpage, 'date published', default=None)),
- 'uploader_id': traverse_obj(api_data, ('owner', 'id')),
'channel': traverse_obj(api_data, ('channel', 'name'), ('community', 'name')),
'channel_id': traverse_obj(api_data, ('channel', 'id'), ('community', 'id')),
'view_count': int_or_none(get_video_info('count', 'view')),
@@ -459,7 +469,7 @@ class NiconicoIE(InfoExtractor):
comment_user_key = traverse_obj(api_data, ('comment', 'keys', 'userKey'))
user_id_str = session_api_data.get('serviceUserId')
- thread_ids = [x for x in traverse_obj(api_data, ('comment', 'threads')) or [] if x['isActive']]
+ thread_ids = traverse_obj(api_data, ('comment', 'threads', lambda _, v: v['isActive']))
raw_danmaku = self._extract_all_comments(video_id, thread_ids, user_id_str, comment_user_key)
if not raw_danmaku:
self.report_warning(f'Failed to get comments. {bug_reports_message()}')
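The niconico change expands the single thumbnail into a ranked list: each key of the API's thumbnail dict gets a preference from qualities(), and parse_resolution(..., lenient=True) recovers a WxH pair from the URL when one is present. A sketch with an invented thumbnail dict:

    from yt_dlp.utils import parse_resolution, qualities

    thumb_prefs = qualities(['url', 'middleUrl', 'largeUrl', 'player', 'ogp'])
    thumb_info = {
        'url': 'https://img.example/sm0.M',
        'largeUrl': 'https://img.example/sm0/1280x720.L',
        'player': None,  # missing entries are skipped
    }
    thumbnails = [{
        'id': key,
        'url': url,
        'ext': 'jpg',
        'preference': thumb_prefs(key),
        **parse_resolution(url, lenient=True),
    } for key, url in thumb_info.items() if url]
    print(thumbnails)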
diff --git a/yt_dlp/extractor/openrec.py b/yt_dlp/extractor/openrec.py
index b476c0986..5eb1cdbad 100644
--- a/yt_dlp/extractor/openrec.py
+++ b/yt_dlp/extractor/openrec.py
@@ -7,6 +7,7 @@ from ..utils import (
get_first,
int_or_none,
traverse_obj,
+ try_get,
unified_strdate,
unified_timestamp,
)
@@ -18,6 +19,13 @@ class OpenRecBaseIE(InfoExtractor):
return self._parse_json(
self._search_regex(r'(?m)window\.pageStore\s*=\s*(\{.+?\});$', webpage, 'window.pageStore'), video_id)
+ def _expand_media(self, video_id, media):
+ for name, m3u8_url in (media or {}).items():
+ if not m3u8_url:
+ continue
+ yield from self._extract_m3u8_formats(
+ m3u8_url, video_id, ext='mp4', m3u8_id=name)
+
def _extract_movie(self, webpage, video_id, name, is_live):
window_stores = self._extract_pagestore(webpage, video_id)
movie_stores = [
@@ -29,13 +37,21 @@ class OpenRecBaseIE(InfoExtractor):
if not any(movie_stores):
raise ExtractorError(f'Failed to extract {name} info')
- m3u8_playlists = get_first(movie_stores, 'media') or {}
- formats = []
- for name, m3u8_url in m3u8_playlists.items():
- if not m3u8_url:
- continue
- formats.extend(self._extract_m3u8_formats(
- m3u8_url, video_id, ext='mp4', live=is_live, m3u8_id=name))
+ formats = list(self._expand_media(video_id, get_first(movie_stores, 'media')))
+ if not formats and is_live:
+ # archived livestreams
+ cookies = self._get_cookies('https://www.openrec.tv/')
+ detail = self._download_json(
+ f'https://apiv5.openrec.tv/api/v5/movies/{video_id}/detail', video_id,
+ headers={
+ 'Origin': 'https://www.openrec.tv',
+ 'Referer': 'https://www.openrec.tv/',
+ 'access-token': try_get(cookies, lambda x: x.get('access_token').value),
+ 'uuid': try_get(cookies, lambda x: x.get('uuid').value),
+ })
+ new_media = traverse_obj(detail, ('data', 'items', ..., 'media'), get_all=False)
+ formats = list(self._expand_media(video_id, new_media))
+ is_live = False
self._sort_formats(formats)
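For archived livestreams the extractor retries the v5 API with the viewer's access token and uuid cookies sent as headers. _get_cookies() returns a SimpleCookie-style mapping, so the pattern looks roughly like this; the cookie values are invented:

    from http.cookies import SimpleCookie

    cookies = SimpleCookie('access_token=abc123; uuid=dead-beef')  # stand-in for self._get_cookies(...)
    headers = {
        'Origin': 'https://www.openrec.tv',
        'Referer': 'https://www.openrec.tv/',
        'access-token': cookies['access_token'].value if 'access_token' in cookies else None,
        'uuid': cookies['uuid'].value if 'uuid' in cookies else None,
    }
    print(headers)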
diff --git a/yt_dlp/extractor/playvid.py b/yt_dlp/extractor/playvid.py
index 4aef186ea..e1c406b6c 100644
--- a/yt_dlp/extractor/playvid.py
+++ b/yt_dlp/extractor/playvid.py
@@ -85,8 +85,7 @@ class PlayvidIE(InfoExtractor):
# Extract title - should be in the flashvars; if not, look elsewhere
if video_title is None:
- video_title = self._html_search_regex(
- r'<title>(.*?)</title', webpage, 'title')
+ video_title = self._html_extract_title(webpage)
return {
'id': video_id,
diff --git a/yt_dlp/extractor/rai.py b/yt_dlp/extractor/rai.py
index 9d243b2be..6864129c6 100644
--- a/yt_dlp/extractor/rai.py
+++ b/yt_dlp/extractor/rai.py
@@ -118,7 +118,7 @@ class RaiBaseIE(InfoExtractor):
})
def _create_http_urls(self, relinker_url, fmts):
- _RELINKER_REG = r'https?://(?P<host>[^/]+?)/(?:i/)?(?P<extra>[^/]+?)/(?P<path>.+?)/(?P<id>\d+)(?:_(?P<quality>[\d\,]+))?(?:\.mp4|/playlist\.m3u8).+?'
+ _RELINKER_REG = r'https?://(?P<host>[^/]+?)/(?:i/)?(?P<extra>[^/]+?)/(?P<path>.+?)/(?P<id>\w+)(?:_(?P<quality>[\d\,]+))?(?:\.mp4|/playlist\.m3u8).+?'
_MP4_TMPL = '%s&overrideUserAgentRule=mp4-%s'
_QUALITY = {
# tbr: w, h
diff --git a/yt_dlp/extractor/rule34video.py b/yt_dlp/extractor/rule34video.py
index 522d4ccd5..a602a9f33 100644
--- a/yt_dlp/extractor/rule34video.py
+++ b/yt_dlp/extractor/rule34video.py
@@ -49,7 +49,7 @@ class Rule34VideoIE(InfoExtractor):
'quality': quality,
})
- title = self._html_search_regex(r'<title>([^<]+)</title>', webpage, 'title')
+ title = self._html_extract_title(webpage)
thumbnail = self._html_search_regex(r'preview_url:\s+\'([^\']+)\'', webpage, 'thumbnail', default=None)
duration = self._html_search_regex(r'"icon-clock"></i>\s+<span>((?:\d+:?)+)', webpage, 'duration', default=None)
diff --git a/yt_dlp/extractor/ruutu.py b/yt_dlp/extractor/ruutu.py
index d9cf39d71..5a30e3360 100644
--- a/yt_dlp/extractor/ruutu.py
+++ b/yt_dlp/extractor/ruutu.py
@@ -1,6 +1,9 @@
# coding: utf-8
from __future__ import unicode_literals
+import json
+import re
+
from .common import InfoExtractor
from ..compat import compat_urllib_parse_urlparse
from ..utils import (
@@ -8,6 +11,8 @@ from ..utils import (
ExtractorError,
find_xpath_attr,
int_or_none,
+ traverse_obj,
+ try_call,
unified_strdate,
url_or_none,
xpath_attr,
@@ -123,6 +128,16 @@ class RuutuIE(InfoExtractor):
]
_API_BASE = 'https://gatling.nelonenmedia.fi'
+ @classmethod
+ def _extract_url(cls, webpage):
+ settings = try_call(
+ lambda: json.loads(re.search(
+ r'jQuery\.extend\(Drupal\.settings, ({.+?})\);', webpage).group(1), strict=False))
+ video_id = traverse_obj(settings, (
+ 'mediaCrossbowSettings', 'file', 'field_crossbow_video_id', 'und', 0, 'value'))
+ if video_id:
+ return f'http://www.ruutu.fi/video/{video_id}'
+
def _real_extract(self, url):
video_id = self._match_id(url)
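The new classmethod recognizes Ruutu embeds by a video id buried in a Drupal settings blob. The same parse as standalone code, with an invented page fragment:

    import json
    import re

    webpage = ('<script>jQuery.extend(Drupal.settings, {"mediaCrossbowSettings": {"file": '
               '{"field_crossbow_video_id": {"und": [{"value": "4044426"}]}}}});</script>')
    settings = json.loads(re.search(
        r'jQuery\.extend\(Drupal\.settings, ({.+?})\);', webpage).group(1), strict=False)
    video_id = settings['mediaCrossbowSettings']['file']['field_crossbow_video_id']['und'][0]['value']
    print(f'http://www.ruutu.fi/video/{video_id}')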
diff --git a/yt_dlp/extractor/senategov.py b/yt_dlp/extractor/senategov.py
index 6f4240422..b295184a1 100644
--- a/yt_dlp/extractor/senategov.py
+++ b/yt_dlp/extractor/senategov.py
@@ -112,7 +112,7 @@ class SenateISVPIE(InfoExtractor):
if smuggled_data.get('force_title'):
title = smuggled_data['force_title']
else:
- title = self._html_search_regex(r'<title>([^<]+)</title>', webpage, video_id)
+ title = self._html_extract_title(webpage)
poster = qs.get('poster')
thumbnail = poster[0] if poster else None
diff --git a/yt_dlp/extractor/sunporno.py b/yt_dlp/extractor/sunporno.py
index 68051169b..59b77bf92 100644
--- a/yt_dlp/extractor/sunporno.py
+++ b/yt_dlp/extractor/sunporno.py
@@ -36,8 +36,7 @@ class SunPornoIE(InfoExtractor):
webpage = self._download_webpage(
'http://www.sunporno.com/videos/%s' % video_id, video_id)
- title = self._html_search_regex(
- r'<title>([^<]+)</title>', webpage, 'title')
+ title = self._html_extract_title(webpage)
description = self._html_search_meta(
'description', webpage, 'description')
thumbnail = self._html_search_regex(
diff --git a/yt_dlp/extractor/tenplay.py b/yt_dlp/extractor/tenplay.py
index 5b3222ecf..5c7b54531 100644
--- a/yt_dlp/extractor/tenplay.py
+++ b/yt_dlp/extractor/tenplay.py
@@ -7,6 +7,7 @@ import base64
from .common import InfoExtractor
from ..utils import (
HEADRequest,
+ int_or_none,
urlencode_postdata,
)
@@ -15,6 +16,28 @@ class TenPlayIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?10play\.com\.au/(?:[^/]+/)+(?P<id>tpv\d{6}[a-z]{5})'
_NETRC_MACHINE = '10play'
_TESTS = [{
+ 'url': 'https://10play.com.au/neighbours/web-extras/season-39/nathan-borg-is-the-first-aussie-actor-with-a-cochlear-implant-to-join-neighbours/tpv210128qupwd',
+ 'info_dict': {
+ 'id': '6226844312001',
+ 'ext': 'mp4',
+ 'title': 'Nathan Borg Is The First Aussie Actor With A Cochlear Implant To Join Neighbours',
+ 'alt_title': 'Nathan Borg Is The First Aussie Actor With A Cochlear Implant To Join Neighbours',
+ 'description': 'md5:a02d0199c901c2dd4c796f1e7dd0de43',
+ 'duration': 186,
+ 'season': 39,
+ 'series': 'Neighbours',
+ 'thumbnail': r're:https://.*\.jpg',
+ 'uploader': 'Channel 10',
+ 'age_limit': 15,
+ 'timestamp': 1611810000,
+ 'upload_date': '20210128',
+ 'uploader_id': '2199827728001',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'skip': 'Only available in Australia',
+ }, {
'url': 'https://10play.com.au/todd-sampsons-body-hack/episodes/season-4/episode-7/tpv200921kvngh',
'info_dict': {
'id': '6192880312001',
@@ -62,12 +85,17 @@ class TenPlayIE(InfoExtractor):
def _real_extract(self, url):
content_id = self._match_id(url)
- _token = self._get_bearer_token(content_id)
data = self._download_json(
'https://10play.com.au/api/v1/videos/' + content_id, content_id)
+ headers = {}
+
+ if data.get('memberGated') is True:
+ _token = self._get_bearer_token(content_id)
+ headers = {'Authorization': _token}
+
_video_url = self._download_json(
data.get('playbackApiEndpoint'), content_id, 'Downloading video JSON',
- headers={'Authorization': _token}).get('source')
+ headers=headers).get('source')
m3u8_url = self._request_webpage(HEADRequest(
_video_url), content_id).geturl()
if '10play-not-in-oz' in m3u8_url:
@@ -77,12 +105,16 @@ class TenPlayIE(InfoExtractor):
return {
'formats': formats,
+ 'subtitles': {'en': [{'url': data.get('captionUrl')}]} if data.get('captionUrl') else None,
'id': data.get('altId') or content_id,
- 'title': data.get('title'),
+ 'duration': data.get('duration'),
+ 'title': data.get('subtitle'),
+ 'alt_title': data.get('title'),
'description': data.get('description'),
'age_limit': self._AUS_AGES.get(data.get('classification')),
- 'series': data.get('showName'),
- 'season': data.get('showContentSeason'),
+ 'series': data.get('tvShow'),
+ 'season': int_or_none(data.get('season')),
+ 'episode_number': int_or_none(data.get('episode')),
'timestamp': data.get('published'),
'thumbnail': data.get('imageUrl'),
'uploader': 'Channel 10',
diff --git a/yt_dlp/extractor/thisav.py b/yt_dlp/extractor/thisav.py
index 4af286e6d..6bb00b3ab 100644
--- a/yt_dlp/extractor/thisav.py
+++ b/yt_dlp/extractor/thisav.py
@@ -37,9 +37,7 @@ class ThisAVIE(InfoExtractor):
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
- title = remove_end(self._html_search_regex(
- r'<title>([^<]+)</title>', webpage, 'title'),
- ' - 視頻 - ThisAV.com-世界第一中文成人娛樂網站')
+ title = remove_end(self._html_extract_title(webpage), ' - 視頻 - ThisAV.com-世界第一中文成人娛樂網站')
video_url = self._html_search_regex(
r"addVariable\('file','([^']+)'\);", webpage, 'video url', default=None)
if video_url:
diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py
index 56cc2dcc6..c1d6c5477 100644
--- a/yt_dlp/extractor/tiktok.py
+++ b/yt_dlp/extractor/tiktok.py
@@ -263,8 +263,8 @@ class TikTokBaseIE(InfoExtractor):
return {
'id': aweme_id,
- 'title': aweme_detail['desc'],
- 'description': aweme_detail['desc'],
+ 'title': aweme_detail.get('desc'),
+ 'description': aweme_detail.get('desc'),
'view_count': int_or_none(stats_info.get('play_count')),
'like_count': int_or_none(stats_info.get('digg_count')),
'repost_count': int_or_none(stats_info.get('share_count')),
@@ -387,6 +387,9 @@ class TikTokIE(TikTokBaseIE):
'like_count': int,
'repost_count': int,
'comment_count': int,
+ 'artist': 'Ysrbeats',
+ 'album': 'Lehanga',
+ 'track': 'Lehanga',
}
}, {
'url': 'https://www.tiktok.com/@patroxofficial/video/6742501081818877190?langCountry=en',
@@ -410,6 +413,8 @@ class TikTokIE(TikTokBaseIE):
'like_count': int,
'repost_count': int,
'comment_count': int,
+ 'artist': 'Evan Todd, Jessica Keenan Wynn, Alice Lee, Barrett Wilbert Weed & Jon Eidson',
+ 'track': 'Big Fun',
}
}, {
# Banned audio, only available on the app
@@ -458,6 +463,30 @@ class TikTokIE(TikTokBaseIE):
},
'expected_warnings': ['Video not available']
}, {
+ # Video without title and description
+ 'url': 'https://www.tiktok.com/@pokemonlife22/video/7059698374567611694',
+ 'info_dict': {
+ 'id': '7059698374567611694',
+ 'ext': 'mp4',
+ 'title': 'tiktok video #7059698374567611694',
+ 'description': '',
+ 'uploader': 'pokemonlife22',
+ 'creator': 'Pokemon',
+ 'uploader_id': '6820838815978423302',
+ 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAA0tF1nBwQVVMyrGu3CqttkNgM68Do1OXUFuCY0CRQk8fEtSVDj89HqoqvbSTmUP2W',
+ 'track': 'original sound',
+ 'timestamp': 1643714123,
+ 'duration': 6,
+ 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
+ 'upload_date': '20220201',
+ 'artist': 'Pokemon',
+ 'view_count': int,
+ 'like_count': int,
+ 'repost_count': int,
+ 'comment_count': int,
+ },
+ 'expected_warnings': ['Video not available', 'Creating a generic title']
+ }, {
# Auto-captions available
'url': 'https://www.tiktok.com/@hankgreen1/video/7047596209028074758',
'only_matching': True
@@ -522,6 +551,15 @@ class TikTokUserIE(TikTokBaseIE):
},
'expected_warnings': ['Retrying']
}, {
+ 'url': 'https://www.tiktok.com/@6820838815978423302',
+ 'playlist_mincount': 5,
+ 'info_dict': {
+ 'id': '6820838815978423302',
+ 'title': '6820838815978423302',
+ 'thumbnail': r're:https://.+_1080x1080\.webp'
+ },
+ 'expected_warnings': ['Retrying']
+ }, {
'url': 'https://www.tiktok.com/@meme',
'playlist_mincount': 593,
'info_dict': {
@@ -593,7 +631,7 @@ class TikTokUserIE(TikTokBaseIE):
webpage = self._download_webpage(url, user_name, headers={
'User-Agent': 'facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)'
})
- user_id = self._html_search_regex(r'snssdk\d*://user/profile/(\d+)', webpage, 'user ID')
+ user_id = self._html_search_regex(r'snssdk\d*://user/profile/(\d+)', webpage, 'user ID', default=None) or user_name
videos = LazyList(self._video_entries_api(webpage, user_id, user_name))
thumbnail = traverse_obj(videos, (0, 'author', 'avatar_larger', 'url_list', 0))
diff --git a/yt_dlp/extractor/traileraddict.py b/yt_dlp/extractor/traileraddict.py
index 10100fbcf..514f4793e 100644
--- a/yt_dlp/extractor/traileraddict.py
+++ b/yt_dlp/extractor/traileraddict.py
@@ -24,8 +24,7 @@ class TrailerAddictIE(InfoExtractor):
name = mobj.group('movie') + '/' + mobj.group('trailer_name')
webpage = self._download_webpage(url, name)
- title = self._search_regex(r'<title>(.+?)</title>',
- webpage, 'video title').replace(' - Trailer Addict', '')
+ title = self._html_extract_title(webpage, 'video title').replace(' - Trailer Addict', '')
view_count_str = self._search_regex(
r'<span class="views_n">([0-9,.]+)</span>',
webpage, 'view count', fatal=False)
diff --git a/yt_dlp/extractor/varzesh3.py b/yt_dlp/extractor/varzesh3.py
index 81313dc9d..32655b96d 100644
--- a/yt_dlp/extractor/varzesh3.py
+++ b/yt_dlp/extractor/varzesh3.py
@@ -42,8 +42,7 @@ class Varzesh3IE(InfoExtractor):
video_url = self._search_regex(
r'<source[^>]+src="([^"]+)"', webpage, 'video url')
- title = remove_start(self._html_search_regex(
- r'<title>([^<]+)</title>', webpage, 'title'), 'ویدیو ورزش 3 | ')
+ title = remove_start(self._html_extract_title(webpage), 'ویدیو ورزش 3 | ')
description = self._html_search_regex(
r'(?s)<div class="matn">(.+?)</div>',
diff --git a/yt_dlp/extractor/viu.py b/yt_dlp/extractor/viu.py
index b0a1fca68..ba627ca5b 100644
--- a/yt_dlp/extractor/viu.py
+++ b/yt_dlp/extractor/viu.py
@@ -329,7 +329,8 @@ class ViuOTTIE(InfoExtractor):
if token is not None:
query['identity'] = token
else:
- # preview is limited to 3min for non-members. But we can try to bypass it
+                    # The content is either a preview or restricted to VIP members.
+                    # We can try to bypass the 3-minute duration limit
duration_limit, query['duration'] = True, '180'
try:
stream_data = download_playback()
@@ -346,13 +347,13 @@ class ViuOTTIE(InfoExtractor):
# bypass preview duration limit
if duration_limit:
- stream_url = urllib.parse.urlparse(stream_url)
+ old_stream_url = urllib.parse.urlparse(stream_url)
+ query = dict(urllib.parse.parse_qsl(old_stream_url.query, keep_blank_values=True))
query.update({
'duration': video_data.get('time_duration') or '9999999',
'duration_start': '0',
})
- stream_url = stream_url._replace(query=urllib.parse.urlencode(dict(
- urllib.parse.parse_qsl(stream_url.query, keep_blank_values=True)))).geturl()
+ stream_url = old_stream_url._replace(query=urllib.parse.urlencode(query)).geturl()
formats.append({
'format_id': vid_format,
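The corrected bypass re-reads the query from the stream URL itself before overriding the limit, rather than reusing the unrelated playback-API query dict. The URL rewrite in isolation, with an invented stream URL:

    import urllib.parse

    stream_url = 'https://d.example.com/stream.m3u8?duration=180&token=abc'
    old_stream_url = urllib.parse.urlparse(stream_url)
    query = dict(urllib.parse.parse_qsl(old_stream_url.query, keep_blank_values=True))
    query.update({'duration': '9999999', 'duration_start': '0'})
    print(old_stream_url._replace(query=urllib.parse.urlencode(query)).geturl())
    # https://d.example.com/stream.m3u8?duration=9999999&token=abc&duration_start=0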
diff --git a/yt_dlp/extractor/vrv.py b/yt_dlp/extractor/vrv.py
index 10e6be7ed..00e1006c4 100644
--- a/yt_dlp/extractor/vrv.py
+++ b/yt_dlp/extractor/vrv.py
@@ -85,7 +85,30 @@ class VRVBaseIE(InfoExtractor):
'resource_key': resource_key,
})['__links__']['cms_resource']['href']
- def _initialize_pre_login(self):
+ def _extract_vrv_formats(self, url, video_id, stream_format, audio_lang, hardsub_lang):
+ if not url or stream_format not in ('hls', 'dash', 'adaptive_hls'):
+ return []
+ format_id = join_nonempty(
+ stream_format,
+ audio_lang and 'audio-%s' % audio_lang,
+ hardsub_lang and 'hardsub-%s' % hardsub_lang)
+ if 'hls' in stream_format:
+ adaptive_formats = self._extract_m3u8_formats(
+ url, video_id, 'mp4', m3u8_id=format_id,
+ note='Downloading %s information' % format_id,
+ fatal=False)
+ elif stream_format == 'dash':
+ adaptive_formats = self._extract_mpd_formats(
+ url, video_id, mpd_id=format_id,
+ note='Downloading %s information' % format_id,
+ fatal=False)
+ if audio_lang:
+ for f in adaptive_formats:
+ if f.get('acodec') != 'none':
+ f['language'] = audio_lang
+ return adaptive_formats
+
+ def _set_api_params(self):
webpage = self._download_webpage(
'https://vrv.co/', None, headers=self.geo_verification_headers())
self._API_PARAMS = self._parse_json(self._search_regex(
@@ -133,28 +156,8 @@ class VRVIE(VRVBaseIE):
self._TOKEN = token_credentials['oauth_token']
self._TOKEN_SECRET = token_credentials['oauth_token_secret']
- def _extract_vrv_formats(self, url, video_id, stream_format, audio_lang, hardsub_lang):
- if not url or stream_format not in ('hls', 'dash', 'adaptive_hls'):
- return []
- format_id = join_nonempty(
- stream_format,
- audio_lang and 'audio-%s' % audio_lang,
- hardsub_lang and 'hardsub-%s' % hardsub_lang)
- if 'hls' in stream_format:
- adaptive_formats = self._extract_m3u8_formats(
- url, video_id, 'mp4', m3u8_id=format_id,
- note='Downloading %s information' % format_id,
- fatal=False)
- elif stream_format == 'dash':
- adaptive_formats = self._extract_mpd_formats(
- url, video_id, mpd_id=format_id,
- note='Downloading %s information' % format_id,
- fatal=False)
- if audio_lang:
- for f in adaptive_formats:
- if f.get('acodec') != 'none':
- f['language'] = audio_lang
- return adaptive_formats
+ def _initialize_pre_login(self):
+ return self._set_api_params()
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -249,6 +252,9 @@ class VRVSeriesIE(VRVBaseIE):
'playlist_mincount': 11,
}
+ def _initialize_pre_login(self):
+ return self._set_api_params()
+
def _real_extract(self, url):
series_id = self._match_id(url)
diff --git a/yt_dlp/extractor/vshare.py b/yt_dlp/extractor/vshare.py
index c631ac1fa..b4874ac39 100644
--- a/yt_dlp/extractor/vshare.py
+++ b/yt_dlp/extractor/vshare.py
@@ -50,8 +50,7 @@ class VShareIE(InfoExtractor):
'https://vshare.io/v/%s/width-650/height-430/1' % video_id,
video_id, headers={'Referer': url})
- title = self._html_search_regex(
- r'<title>([^<]+)</title>', webpage, 'title')
+ title = self._html_extract_title(webpage)
title = title.split(' - ')[0]
error = self._html_search_regex(
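Several files in this patch (vshare, vupload, weibo, yahoo, youjizz) replace a hand-rolled <title> regex with the shared _html_extract_title helper from extractor/common.py. A rough standalone equivalent of what the helper does (the real implementation's regex and defaults may differ):

    import re

    def html_extract_title(html, default=None):
        # Roughly: the first <title> element's text, if any
        mobj = re.search(r'(?is)<title\b[^>]*>([^<]+)</title>', html)
        return mobj.group(1).strip() if mobj else default

    print(html_extract_title('<html><title>Some clip - vShare</title></html>'))
    # 'Some clip - vShare'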
diff --git a/yt_dlp/extractor/vupload.py b/yt_dlp/extractor/vupload.py
index 2229a6591..b561f63f7 100644
--- a/yt_dlp/extractor/vupload.py
+++ b/yt_dlp/extractor/vupload.py
@@ -28,7 +28,7 @@ class VuploadIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- title = self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title')
+ title = self._html_extract_title(webpage)
video_json = self._parse_json(self._html_search_regex(r'sources:\s*(.+?]),', webpage, 'video'), video_id, transform_source=js_to_json)
formats = []
for source in video_json:
diff --git a/yt_dlp/extractor/weibo.py b/yt_dlp/extractor/weibo.py
index 621df5b54..dafa2af3b 100644
--- a/yt_dlp/extractor/weibo.py
+++ b/yt_dlp/extractor/weibo.py
@@ -73,8 +73,7 @@ class WeiboIE(InfoExtractor):
webpage = self._download_webpage(
url, video_id, note='Revisiting webpage')
- title = self._html_search_regex(
- r'<title>(.+?)</title>', webpage, 'title')
+ title = self._html_extract_title(webpage)
video_formats = compat_parse_qs(self._search_regex(
r'video-sources=\\\"(.+?)\"', webpage, 'video_sources'))
diff --git a/yt_dlp/extractor/whowatch.py b/yt_dlp/extractor/whowatch.py
index f8bc2e73a..e4b610d00 100644
--- a/yt_dlp/extractor/whowatch.py
+++ b/yt_dlp/extractor/whowatch.py
@@ -5,6 +5,7 @@ from .common import InfoExtractor
from ..utils import (
int_or_none,
qualities,
+ try_call,
try_get,
ExtractorError,
)
@@ -26,10 +27,10 @@ class WhoWatchIE(InfoExtractor):
metadata = self._download_json('https://api.whowatch.tv/lives/%s' % video_id, video_id)
live_data = self._download_json('https://api.whowatch.tv/lives/%s/play' % video_id, video_id)
- title = try_get(None, (
- lambda x: live_data['share_info']['live_title'][1:-1],
- lambda x: metadata['live']['title'],
- ), compat_str)
+ title = try_call(
+ lambda: live_data['share_info']['live_title'][1:-1],
+ lambda: metadata['live']['title'],
+ expected_type=str)
hls_url = live_data.get('hls_url')
if not hls_url:
diff --git a/yt_dlp/extractor/xnxx.py b/yt_dlp/extractor/xnxx.py
index dd4fb54d4..27f991627 100644
--- a/yt_dlp/extractor/xnxx.py
+++ b/yt_dlp/extractor/xnxx.py
@@ -13,7 +13,7 @@ from ..utils import (
class XNXXIE(InfoExtractor):
- _VALID_URL = r'https?://(?:video|www)\.xnxx\.com/video-?(?P<id>[0-9a-z]+)/'
+ _VALID_URL = r'https?://(?:video|www)\.xnxx3?\.com/video-?(?P<id>[0-9a-z]+)/'
_TESTS = [{
'url': 'http://www.xnxx.com/video-55awb78/skyrim_test_video',
'md5': '7583e96c15c0f21e9da3453d9920fbba',
@@ -32,6 +32,9 @@ class XNXXIE(InfoExtractor):
}, {
'url': 'http://www.xnxx.com/video-55awb78/',
'only_matching': True,
+ }, {
+ 'url': 'http://www.xnxx3.com/video-55awb78/',
+ 'only_matching': True,
}]
def _real_extract(self, url):
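The widened _VALID_URL now also matches the xnxx3.com mirror; verifying the pattern as written above:

    import re

    _VALID_URL = r'https?://(?:video|www)\.xnxx3?\.com/video-?(?P<id>[0-9a-z]+)/'
    for url in ('http://www.xnxx.com/video-55awb78/',
                'http://www.xnxx3.com/video-55awb78/'):
        assert re.match(_VALID_URL, url).group('id') == '55awb78'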
diff --git a/yt_dlp/extractor/yahoo.py b/yt_dlp/extractor/yahoo.py
index 6cf3b1de2..20504de2c 100644
--- a/yt_dlp/extractor/yahoo.py
+++ b/yt_dlp/extractor/yahoo.py
@@ -533,7 +533,7 @@ class YahooJapanNewsIE(InfoExtractor):
title = self._html_search_meta(
['og:title', 'twitter:title'], webpage, 'title', default=None
- ) or self._html_search_regex('<title>([^<]+)</title>', webpage, 'title')
+ ) or self._html_extract_title(webpage)
if display_id == host:
# Headline page (w/ multiple BC playlists) ('news.yahoo.co.jp', 'headlines.yahoo.co.jp/videonews/', ...)
diff --git a/yt_dlp/extractor/yandexvideo.py b/yt_dlp/extractor/yandexvideo.py
index a101af67e..7d3966bf1 100644
--- a/yt_dlp/extractor/yandexvideo.py
+++ b/yt_dlp/extractor/yandexvideo.py
@@ -163,7 +163,6 @@ class YandexVideoPreviewIE(InfoExtractor):
'thumbnail': 'https://i.mycdn.me/videoPreview?id=544866765315&type=37&idx=13&tkn=TY5qjLYZHxpmcnK8U2LgzYkgmaU&fn=external_8',
'uploader_id': '481054701571',
'title': 'LOFT - summer, summer, summer HD',
- 'manifest_stream_number': 0,
'uploader': 'АРТЁМ КУДРОВ',
},
}, { # youtube
diff --git a/yt_dlp/extractor/youjizz.py b/yt_dlp/extractor/youjizz.py
index 5f5fbf21c..111623ffe 100644
--- a/yt_dlp/extractor/youjizz.py
+++ b/yt_dlp/extractor/youjizz.py
@@ -36,8 +36,7 @@ class YouJizzIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
- title = self._html_search_regex(
- r'<title>(.+?)</title>', webpage, 'title')
+ title = self._html_extract_title(webpage)
formats = []
diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py
index 19b4985f6..017554c88 100644
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -837,17 +837,20 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
uploader = self._get_text(renderer, 'ownerText', 'shortBylineText')
channel_id = traverse_obj(
- renderer, ('shortBylineText', 'runs', ..., 'navigationEndpoint', 'browseEndpoint', 'browseId'), expected_type=str, get_all=False)
+ renderer, ('shortBylineText', 'runs', ..., 'navigationEndpoint', 'browseEndpoint', 'browseId'),
+ expected_type=str, get_all=False)
timestamp, time_text = self._extract_time_text(renderer, 'publishedTimeText')
scheduled_timestamp = str_to_int(traverse_obj(renderer, ('upcomingEventData', 'startTime'), get_all=False))
overlay_style = traverse_obj(
- renderer, ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'style'), get_all=False, expected_type=str)
+ renderer, ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'style'),
+ get_all=False, expected_type=str)
badges = self._extract_badges(renderer)
thumbnails = self._extract_thumbnails(renderer, 'thumbnail')
navigation_url = urljoin('https://www.youtube.com/', traverse_obj(
- renderer, ('navigationEndpoint', 'commandMetadata', 'webCommandMetadata', 'url'), expected_type=str))
+ renderer, ('navigationEndpoint', 'commandMetadata', 'webCommandMetadata', 'url'),
+ expected_type=str)) or ''
url = f'https://www.youtube.com/watch?v={video_id}'
- if overlay_style == 'SHORTS' or (navigation_url and '/shorts/' in navigation_url):
+ if overlay_style == 'SHORTS' or '/shorts/' in navigation_url:
url = f'https://www.youtube.com/shorts/{video_id}'
return {
@@ -862,7 +865,9 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
'uploader': uploader,
'channel_id': channel_id,
'thumbnails': thumbnails,
- 'upload_date': strftime_or_none(timestamp, '%Y%m%d') if self._configuration_arg('approximate_date', ie_key='youtubetab') else None,
+ 'upload_date': (strftime_or_none(timestamp, '%Y%m%d')
+ if self._configuration_arg('approximate_date', ie_key='youtubetab')
+ else None),
'live_status': ('is_upcoming' if scheduled_timestamp is not None
else 'was_live' if 'streamed' in time_text.lower()
else 'is_live' if overlay_style is not None and overlay_style == 'LIVE' or 'live now' in badges
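A small point on the shorts change above: traverse_obj can return None, so the appended `or ''` lets the simplified membership test stand in for the previous None guard:

    for found in ('https://www.youtube.com/shorts/abc123', None):
        navigation_url = found or ''         # traverse_obj may return None
        print('/shorts/' in navigation_url)  # True, then False - no None guard needed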
@@ -3777,7 +3782,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
def _extract_basic_item_renderer(item):
# Modified from _extract_grid_item_renderer
known_basic_renderers = (
- 'playlistRenderer', 'videoRenderer', 'channelRenderer', 'showRenderer'
+ 'playlistRenderer', 'videoRenderer', 'channelRenderer', 'showRenderer', 'reelItemRenderer'
)
for key, renderer in item.items():
if not isinstance(renderer, dict):
@@ -3903,6 +3908,13 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
if video_id:
return self._extract_video(video_renderer)
+ def _hashtag_tile_entry(self, hashtag_tile_renderer):
+ url = urljoin('https://youtube.com', traverse_obj(
+ hashtag_tile_renderer, ('onTapCommand', 'commandMetadata', 'webCommandMetadata', 'url')))
+ if url:
+ return self.url_result(
+ url, ie=YoutubeTabIE.ie_key(), title=self._get_text(hashtag_tile_renderer, 'hashtag'))
+
def _post_thread_entries(self, post_thread_renderer):
post_renderer = try_get(
post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict)
@@ -3985,12 +3997,14 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
known_renderers = {
'playlistVideoListRenderer': self._playlist_entries,
'gridRenderer': self._grid_entries,
- 'shelfRenderer': lambda x: self._shelf_entries(x),
+ 'reelShelfRenderer': self._grid_entries,
+ 'shelfRenderer': self._shelf_entries,
'musicResponsiveListItemRenderer': lambda x: [self._music_reponsive_list_entry(x)],
'backstagePostThreadRenderer': self._post_thread_entries,
'videoRenderer': lambda x: [self._video_entry(x)],
'playlistRenderer': lambda x: self._grid_entries({'items': [{'playlistRenderer': x}]}),
'channelRenderer': lambda x: self._grid_entries({'items': [{'channelRenderer': x}]}),
+ 'hashtagTileRenderer': lambda x: [self._hashtag_tile_entry(x)]
}
for key, renderer in isr_content.items():
if key not in known_renderers:
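known_renderers is a dispatch table mapping a renderer key to a callable that yields entries, which is why bare method references (self._grid_entries) and small lambdas mix freely. A minimal sketch of the pattern with hypothetical handlers:

    def grid_entries(renderer):  # hypothetical handler
        return [f'video:{item}' for item in renderer.get('items', [])]

    known_renderers = {
        'gridRenderer': grid_entries,
        'reelShelfRenderer': grid_entries,  # Shorts shelves reuse the grid handler
    }

    content = {'reelShelfRenderer': {'items': ['abc123']}}
    for key, renderer in content.items():
        if key in known_renderers:
            print(known_renderers[key](renderer))  # ['video:abc123']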
@@ -4162,7 +4176,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
})
primary_thumbnails = self._extract_thumbnails(
- primary_sidebar_renderer, ('thumbnailRenderer', 'playlistVideoThumbnailRenderer', 'thumbnail'))
+ primary_sidebar_renderer, ('thumbnailRenderer', ('playlistVideoThumbnailRenderer', 'playlistCustomThumbnailRenderer'), 'thumbnail'))
if playlist_id is None:
playlist_id = item_id
@@ -5520,7 +5534,17 @@ class YoutubeSearchURLIE(YoutubeTabBaseInfoExtractor):
'id': 'python',
'title': 'python',
}
-
+ }, {
+ 'url': 'https://www.youtube.com/results?search_query=%23cats',
+ 'playlist_mincount': 1,
+ 'info_dict': {
+ 'id': '#cats',
+ 'title': '#cats',
+ 'entries': [{
+ 'url': r're:https://(www\.)?youtube\.com/hashtag/cats',
+ 'title': '#cats',
+ }],
+ },
}, {
'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
'only_matching': True,
diff --git a/yt_dlp/options.py b/yt_dlp/options.py
index 936cc8b6f..c23395671 100644
--- a/yt_dlp/options.py
+++ b/yt_dlp/options.py
@@ -163,6 +163,8 @@ def create_parser():
values = [process(value)] if delim is None else list(map(process, value.split(delim)[::-1]))
while values:
actual_val = val = values.pop()
+ if not val:
+ raise optparse.OptionValueError(f'Invalid {option.metavar} for {opt_str}: {value}')
if val == 'all':
current.update(allowed_values)
elif val == '-all':
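The new guard above rejects empty items (for example a trailing or doubled delimiter) before the 'all'/'-all' handling runs. A simplified sketch of the loop, with hypothetical metavar/opt_str values:

    import optparse

    def parse_values(value, metavar='FORMAT', opt_str='--remux-video', delim='/'):
        values = value.split(delim)[::-1]
        out = []
        while values:
            val = values.pop()
            if not val:  # e.g. 'aac>m4a//mkv' yields an empty item
                raise optparse.OptionValueError(
                    f'Invalid {metavar} for {opt_str}: {value}')
            out.append(val)
        return out

    print(parse_values('aac>m4a/mkv'))  # ['aac>m4a', 'mkv']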
@@ -1307,7 +1309,7 @@ def create_parser():
'--audio-format', metavar='FORMAT', dest='audioformat', default='best',
help=(
'Specify audio format to convert the audio to when -x is used. Currently supported formats are: '
- 'best (default) or one of %s' % '|'.join(FFmpegExtractAudioPP.SUPPORTED_EXTS)))
+ 'best (default) or one of %s' % ', '.join(FFmpegExtractAudioPP.SUPPORTED_EXTS)))
postproc.add_option(
'--audio-quality', metavar='QUALITY',
dest='audioquality', default='5',
@@ -1319,7 +1321,7 @@ def create_parser():
'Remux the video into another container if necessary (currently supported: %s). '
'If target container does not support the video/audio codec, remuxing will fail. '
'You can specify multiple rules; Eg. "aac>m4a/mov>mp4/mkv" will remux aac to m4a, mov to mp4 '
- 'and anything else to mkv.' % '|'.join(FFmpegVideoRemuxerPP.SUPPORTED_EXTS)))
+ 'and anything else to mkv.' % ', '.join(FFmpegVideoRemuxerPP.SUPPORTED_EXTS)))
postproc.add_option(
'--recode-video',
metavar='FORMAT', dest='recodevideo', default=None,
@@ -1434,7 +1436,7 @@ def create_parser():
'"multi_video" (default; only when the videos form a single show). '
'All the video files must have same codecs and number of streams to be concatable. '
'The "pl_video:" prefix can be used with "--paths" and "--output" to '
- 'set the output filename for the split files. See "OUTPUT TEMPLATE" for details'))
+ 'set the output filename for the concatenated files. See "OUTPUT TEMPLATE" for details'))
postproc.add_option(
'--fixup',
metavar='POLICY', dest='fixup', default=None,
@@ -1482,20 +1484,20 @@ def create_parser():
help=optparse.SUPPRESS_HELP)
postproc.add_option(
'--no-exec-before-download',
- action='store_const', dest='exec_before_dl_cmd', const=[],
+ action='store_const', dest='exec_before_dl_cmd', const=None,
help=optparse.SUPPRESS_HELP)
postproc.add_option(
'--convert-subs', '--convert-sub', '--convert-subtitles',
metavar='FORMAT', dest='convertsubtitles', default=None,
help=(
'Convert the subtitles to another format (currently supported: %s) '
- '(Alias: --convert-subtitles)' % '|'.join(FFmpegSubtitlesConvertorPP.SUPPORTED_EXTS)))
+ '(Alias: --convert-subtitles)' % ', '.join(FFmpegSubtitlesConvertorPP.SUPPORTED_EXTS)))
postproc.add_option(
'--convert-thumbnails',
metavar='FORMAT', dest='convertthumbnails', default=None,
help=(
'Convert the thumbnails to another format '
- '(currently supported: %s) ' % '|'.join(FFmpegThumbnailsConvertorPP.SUPPORTED_EXTS)))
+ '(currently supported: %s) ' % ', '.join(FFmpegThumbnailsConvertorPP.SUPPORTED_EXTS)))
postproc.add_option(
'--split-chapters', '--split-tracks',
dest='split_chapters', action='store_true', default=False,
diff --git a/yt_dlp/postprocessor/ffmpeg.py b/yt_dlp/postprocessor/ffmpeg.py
index 0b18e8774..643290286 100644
--- a/yt_dlp/postprocessor/ffmpeg.py
+++ b/yt_dlp/postprocessor/ffmpeg.py
@@ -95,7 +95,7 @@ class FFmpegPostProcessor(PostProcessor):
def get_ffmpeg_version(path, prog):
if path in self._version_cache:
- self._versions[path], self._features = self._version_cache[path], self._features_cache.get(path, {})
+ self._versions[prog], self._features = self._version_cache[path], self._features_cache.get(path, {})
return
out = _get_exe_version_output(path, ['-bsfs'], to_screen=self.write_debug)
ver = detect_exe_version(out) if out else False
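The one-character fix above matters because the cache is keyed by binary path while the versions table is keyed by program name ('ffmpeg'/'ffprobe'); storing a cache hit under the path left lookups by name empty. A simplified sketch of the corrected memoization (hypothetical probe function):

    _version_cache, _versions = {}, {}

    def probe_version(path):
        return '5.0'  # stand-in for running the binary and parsing its output

    def get_ffmpeg_version(path, prog):
        if path in _version_cache:
            # The fix: a cache hit must be stored under the program name,
            # exactly as the cold path below does
            _versions[prog] = _version_cache[path]
            return
        _versions[prog] = _version_cache[path] = probe_version(path)

    get_ffmpeg_version('/usr/bin/ffmpeg', 'ffmpeg')
    get_ffmpeg_version('/usr/bin/ffmpeg', 'ffmpeg')  # hit; still keyed by prog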
@@ -500,6 +500,9 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor):
temp_path = new_path = prefix + sep + extension
if new_path == path:
+ if acodec == 'copy':
+ self.to_screen(f'File is already in target format {self._preferredcodec}, skipping')
+ return [], information
orig_path = prepend_extension(path, 'orig')
temp_path = prepend_extension(path, 'temp')
if (self._nopostoverwrites and os.path.exists(encodeFilename(new_path))
@@ -1122,6 +1125,11 @@ class FFmpegConcatPP(FFmpegPostProcessor):
self._only_multi_video = only_multi_video
super().__init__(downloader)
+ def _get_codecs(self, file):
+ codecs = traverse_obj(self.get_metadata_object(file), ('streams', ..., 'codec_name'))
+ self.write_debug(f'Codecs = {", ".join(codecs)}')
+ return tuple(codecs)
+
def concat_files(self, in_files, out_file):
if not self._downloader._ensure_dir_exists(out_file):
return
@@ -1131,8 +1139,7 @@ class FFmpegConcatPP(FFmpegPostProcessor):
os.replace(in_files[0], out_file)
return []
- codecs = [traverse_obj(self.get_metadata_object(file), ('streams', ..., 'codec_name')) for file in in_files]
- if len(set(map(tuple, codecs))) > 1:
+ if len(set(map(self._get_codecs, in_files))) > 1:
raise PostProcessingError(
'The files have different streams/codecs and cannot be concatenated. '
'Either select different formats or --recode-video them to a common format')
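Reducing each input file to a tuple of codec names makes the streams hashable, so a single set comparison detects any mismatch. A sketch with hypothetical metadata:

    codecs_by_file = {  # hypothetical ffprobe results
        'a.mp4': ('h264', 'aac'),
        'b.mp4': ('h264', 'aac'),
        'c.mkv': ('vp9', 'opus'),
    }

    def can_concat(files):
        return len({codecs_by_file[f] for f in files}) <= 1

    print(can_concat(['a.mp4', 'b.mp4']))  # True
    print(can_concat(['a.mp4', 'c.mkv']))  # False -> PostProcessingError in the PP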
@@ -1146,7 +1153,7 @@ class FFmpegConcatPP(FFmpegPostProcessor):
entries = info.get('entries') or []
if not any(entries) or (self._only_multi_video and info['_type'] != 'multi_video'):
return [], info
- elif any(len(entry) > 1 for entry in traverse_obj(entries, (..., 'requested_downloads')) or []):
+ elif traverse_obj(entries, (..., 'requested_downloads', lambda _, v: len(v) > 1)):
raise PostProcessingError('Concatenation is not supported when downloading multiple separate formats')
in_files = traverse_obj(entries, (..., 'requested_downloads', 0, 'filepath')) or []
diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py
index 62a1800d4..6663583fc 100644
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@@ -1040,7 +1040,7 @@ def make_HTTPS_handler(params, **kwargs):
def bug_reports_message(before=';'):
- msg = ('please report this issue on https://github.com/yt-dlp/yt-dlp , '
+ msg = ('please report this issue on https://github.com/yt-dlp/yt-dlp/issues?q= , '
'filling out the appropriate issue template. '
'Confirm you are on the latest version using yt-dlp -U')
@@ -2418,11 +2418,14 @@ def parse_count(s):
return str_to_int(mobj.group(1))
-def parse_resolution(s):
+def parse_resolution(s, *, lenient=False):
if s is None:
return {}
- mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
+ if lenient:
+ mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
+ else:
+ mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
if mobj:
return {
'width': int(mobj.group('w')),
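With lenient=True the word-boundary lookarounds are dropped, so resolutions embedded inside identifiers still parse. For instance, with the patched utils:

    from yt_dlp.utils import parse_resolution

    print(parse_resolution('1920x1080'))       # {'width': 1920, 'height': 1080}
    print(parse_resolution('hls_1920x1080p'))  # {} - blocked by the lookarounds
    print(parse_resolution('hls_1920x1080p', lenient=True))
    # {'width': 1920, 'height': 1080}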
@@ -2880,6 +2883,7 @@ class PagedList:
class OnDemandPagedList(PagedList):
+ """Download pages until a page with less than maximum results"""
def _getslice(self, start, end):
for pagenum in itertools.count(start // self._pagesize):
firstid = pagenum * self._pagesize
@@ -2919,6 +2923,7 @@ class OnDemandPagedList(PagedList):
class InAdvancePagedList(PagedList):
+ """PagedList with total number of pages known in advance"""
def __init__(self, pagefunc, pagecount, pagesize):
PagedList.__init__(self, pagefunc, pagesize, True)
self._pagecount = pagecount
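For reference, both classes share the lazy PagedList interface; the on-demand variant keeps fetching pages until one comes back short. A sketch with a hypothetical page function:

    from yt_dlp.utils import OnDemandPagedList

    ITEMS = list(range(25))

    def fetch_page(pagenum):  # hypothetical backend returning one page of results
        return ITEMS[pagenum * 10:(pagenum + 1) * 10]

    pl = OnDemandPagedList(fetch_page, 10)
    print(pl.getslice(0, 12))  # [0..11]: fetches pages 0 and 1, then stops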
@@ -3087,24 +3092,25 @@ def multipart_encode(data, boundary=None):
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
- if isinstance(key_or_keys, (list, tuple)):
- for key in key_or_keys:
- if key not in d or d[key] is None or skip_false_values and not d[key]:
- continue
- return d[key]
- return default
- return d.get(key_or_keys, default)
+ for val in map(d.get, variadic(key_or_keys)):
+ if val is not None and (val or not skip_false_values):
+ return val
+ return default
-def try_get(src, getter, expected_type=None):
- for get in variadic(getter):
+def try_call(*funcs, expected_type=None, args=[], kwargs={}):
+ for f in funcs:
try:
- v = get(src)
- except (AttributeError, KeyError, TypeError, IndexError):
+ val = f(*args, **kwargs)
+ except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
pass
else:
- if expected_type is None or isinstance(v, expected_type):
- return v
+ if expected_type is None or isinstance(val, expected_type):
+ return val
+
+
+def try_get(src, getter, expected_type=None):
+ return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
def filter_dict(dct, cndn=lambda _, v: v is not None):
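Behaviour of the reworked helpers above, for reference: dict_get scans keys in order, skipping None (and other falsy values unless skip_false_values=False); try_call returns the first callable's result that neither raises one of the caught exceptions nor fails the expected_type check; and try_get is now a thin wrapper that passes src as the sole argument:

    from yt_dlp.utils import dict_get, try_call, try_get

    d = {'a': None, 'b': '', 'c': 'x'}
    print(dict_get(d, ('a', 'b', 'c')))                      # 'x'
    print(dict_get(d, ('a', 'b'), skip_false_values=False))  # ''

    data = {'live': {'title': 'Stream'}}
    print(try_call(lambda: data['share_info']['live_title'],  # raises KeyError
                   lambda: data['live']['title'],
                   expected_type=str))                        # 'Stream'
    print(try_get(data, lambda x: x['live']['title'], str))   # 'Stream'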
@@ -3317,6 +3323,10 @@ def error_to_compat_str(err):
return err_str
+def error_to_str(err):
+ return f'{type(err).__name__}: {err}'
+
+
def mimetype2ext(mt):
if mt is None:
return None
@@ -5148,8 +5158,8 @@ def traverse_obj(
@param path_list A list of paths which are checked one by one.
Each path is a list of keys where each key is a string,
a function, a tuple of strings/None or "...".
- When a fuction is given, it takes the key as argument and
- returns whether the key matches or not. When a tuple is given,
+ When a function is given, it takes the key and value as arguments
+ and returns whether the key matches or not. When a tuple is given,
all the keys given in the tuple are traversed, and
"..." traverses all the keys in the object
"None" returns the object without traversal
@@ -5194,7 +5204,7 @@ def traverse_obj(
obj = str(obj)
_current_depth += 1
depth = max(depth, _current_depth)
- return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if key(k)]
+ return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
elif isinstance(obj, dict) and not (is_user_input and key == ':'):
obj = (obj.get(key) if casesense or (key in obj)
else next((v for k, v in obj.items() if _lower(k) == key), None))
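With the two-argument form, a path step can filter branches by value, and (per the try_call change above) a key function that raises is simply treated as a non-match. For example:

    from yt_dlp.utils import traverse_obj

    formats = {'f1': {'height': 720}, 'f2': {'height': 1080}, 'f3': {}}
    # 'f3' raises KeyError inside the lambda and is silently skipped
    print(traverse_obj(formats, (lambda _, v: v['height'] >= 1080, 'height')))
    # [1080]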