about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Felix S <felix.von.s@posteo.de> 2022-02-11 19:03:33 +0000
committer: GitHub <noreply@github.com> 2022-02-11 11:03:33 -0800
commit: 3f047fc406dc2df4f2ca6a75b2ea07d9928b2a09 (patch)
tree: 6ad24b537ca94fb8d135676b3e3b37708b7992fa
parent: 82b517678362f8779450f686bb5a283057e486d4 (diff)
download: hypervideo-pre-3f047fc406dc2df4f2ca6a75b2ea07d9928b2a09.tar.lz
hypervideo-pre-3f047fc406dc2df4f2ca6a75b2ea07d9928b2a09.tar.xz
hypervideo-pre-3f047fc406dc2df4f2ca6a75b2ea07d9928b2a09.zip
[extractor] Extract subtitles from manifests for more sites (#2686)
vimeo, globo, kaltura, svt
Authored by: fstirlitz
-rw-r--r-- yt_dlp/extractor/globo.py 6
-rw-r--r-- yt_dlp/extractor/kaltura.py 8
-rw-r--r-- yt_dlp/extractor/svt.py 25
-rw-r--r-- yt_dlp/extractor/vimeo.py 18
4 files changed, 34 insertions, 23 deletions
diff --git a/yt_dlp/extractor/globo.py b/yt_dlp/extractor/globo.py
index 9addb7043..f6aaae1e9 100644
--- a/yt_dlp/extractor/globo.py
+++ b/yt_dlp/extractor/globo.py
@@ -139,11 +139,11 @@ class GloboIE(InfoExtractor):
resource_url = source['scheme'] + '://' + source['domain'] + source['path']
signed_url = '%s?h=%s&k=html5&a=%s' % (resource_url, signed_hash, 'F' if video.get('subscriber_only') else 'A')
- formats.extend(self._extract_m3u8_formats(
- signed_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
+ fmts, subtitles = self._extract_m3u8_formats_and_subtitles(
+ signed_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)
+ formats.extend(fmts)
self._sort_formats(formats)
- subtitles = {}
for resource in video['resources']:
if resource.get('type') == 'subtitle':
subtitles.setdefault(resource.get('language') or 'por', []).append({
diff --git a/yt_dlp/extractor/kaltura.py b/yt_dlp/extractor/kaltura.py
index c58216458..f6dfc9caa 100644
--- a/yt_dlp/extractor/kaltura.py
+++ b/yt_dlp/extractor/kaltura.py
@@ -301,6 +301,7 @@ class KalturaIE(InfoExtractor):
data_url = re.sub(r'/flvclipper/.*', '/serveFlavor', data_url)
formats = []
+ subtitles = {}
for f in flavor_assets:
# Continue if asset is not ready
if f.get('status') != 2:
@@ -344,13 +345,14 @@ class KalturaIE(InfoExtractor):
if '/playManifest/' in data_url:
m3u8_url = sign_url(data_url.replace(
'format/url', 'format/applehttp'))
- formats.extend(self._extract_m3u8_formats(
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(
m3u8_url, entry_id, 'mp4', 'm3u8_native',
- m3u8_id='hls', fatal=False))
+ m3u8_id='hls', fatal=False)
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=subtitles)
self._sort_formats(formats)
- subtitles = {}
if captions:
for caption in captions.get('objects', []):
# Continue if caption is not ready
diff --git a/yt_dlp/extractor/svt.py b/yt_dlp/extractor/svt.py
index 6ad01a912..8ca62e370 100644
--- a/yt_dlp/extractor/svt.py
+++ b/yt_dlp/extractor/svt.py
@@ -23,23 +23,27 @@ class SVTBaseIE(InfoExtractor):
is_live = dict_get(video_info, ('live', 'simulcast'), default=False)
m3u8_protocol = 'm3u8' if is_live else 'm3u8_native'
formats = []
+ subtitles = {}
for vr in video_info['videoReferences']:
player_type = vr.get('playerType') or vr.get('format')
vurl = vr['url']
ext = determine_ext(vurl)
if ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(
vurl, video_id,
ext='mp4', entry_protocol=m3u8_protocol,
- m3u8_id=player_type, fatal=False))
+ m3u8_id=player_type, fatal=False)
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=subtitles)
elif ext == 'f4m':
formats.extend(self._extract_f4m_formats(
vurl + '?hdcore=3.3.0', video_id,
f4m_id=player_type, fatal=False))
elif ext == 'mpd':
- if player_type == 'dashhbbtv':
- formats.extend(self._extract_mpd_formats(
- vurl, video_id, mpd_id=player_type, fatal=False))
+ fmts, subs = self._extract_mpd_formats_and_subtitles(
+ vurl, video_id, mpd_id=player_type, fatal=False)
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=subtitles)
else:
formats.append({
'format_id': player_type,
@@ -52,18 +56,19 @@ class SVTBaseIE(InfoExtractor):
countries=self._GEO_COUNTRIES, metadata_available=True)
self._sort_formats(formats)
- subtitles = {}
subtitle_references = dict_get(video_info, ('subtitles', 'subtitleReferences'))
if isinstance(subtitle_references, list):
for sr in subtitle_references:
subtitle_url = sr.get('url')
subtitle_lang = sr.get('language', 'sv')
if subtitle_url:
+ sub = {
+ 'url': subtitle_url,
+ }
if determine_ext(subtitle_url) == 'm3u8':
- # TODO(yan12125): handle WebVTT in m3u8 manifests
- continue
-
- subtitles.setdefault(subtitle_lang, []).append({'url': subtitle_url})
+ # XXX: no way of testing, is it ever hit?
+ sub['ext'] = 'vtt'
+ subtitles.setdefault(subtitle_lang, []).append(sub)
title = video_info.get('title')
diff --git a/yt_dlp/extractor/vimeo.py b/yt_dlp/extractor/vimeo.py
index 57391d766..c2dec244f 100644
--- a/yt_dlp/extractor/vimeo.py
+++ b/yt_dlp/extractor/vimeo.py
@@ -131,6 +131,8 @@ class VimeoBaseInfoExtractor(InfoExtractor):
request = config.get('request') or {}
formats = []
+ subtitles = {}
+
config_files = video_data.get('files') or request.get('files') or {}
for f in (config_files.get('progressive') or []):
video_url = f.get('url')
@@ -163,21 +165,24 @@ class VimeoBaseInfoExtractor(InfoExtractor):
sep_manifest_urls = [(format_id, manifest_url)]
for f_id, m_url in sep_manifest_urls:
if files_type == 'hls':
- formats.extend(self._extract_m3u8_formats(
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(
m_url, video_id, 'mp4',
'm3u8' if is_live else 'm3u8_native', m3u8_id=f_id,
note='Downloading %s m3u8 information' % cdn_name,
- fatal=False))
+ fatal=False)
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=subtitles)
elif files_type == 'dash':
if 'json=1' in m_url:
real_m_url = (self._download_json(m_url, video_id, fatal=False) or {}).get('url')
if real_m_url:
m_url = real_m_url
- mpd_formats = self._extract_mpd_formats(
+ fmts, subs = self._extract_mpd_formats_and_subtitles(
m_url.replace('/master.json', '/master.mpd'), video_id, f_id,
'Downloading %s MPD information' % cdn_name,
fatal=False)
- formats.extend(mpd_formats)
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=subtitles)
live_archive = live_event.get('archive') or {}
live_archive_source_url = live_archive.get('source_url')
@@ -188,12 +193,11 @@ class VimeoBaseInfoExtractor(InfoExtractor):
'quality': 10,
})
- subtitles = {}
for tt in (request.get('text_tracks') or []):
- subtitles[tt['lang']] = [{
+ subtitles.setdefault(tt['lang'], []).append({
'ext': 'vtt',
'url': urljoin('https://vimeo.com', tt['url']),
- }]
+ })
thumbnails = []
if not is_live: