Subtitle extraction from streaming media manifests #247

Authored by fstirlitz Modified from: https://github.com/ytdl-org/youtube-dl/pull/6144 Closes: #73 Fixes: https://github.com/ytdl-org/youtube-dl/issues/6106 https://github.com/ytdl-org/youtube-dl/issues/14977 https://github.com/ytdl-org/youtube-dl/issues/21438 https://github.com/ytdl-org/youtube-dl/issues/23609 https://github.com/ytdl-org/youtube-dl/issues/28132 Might also fix (untested): https://github.com/ytdl-org/youtube-dl/issues/15424 https://github.com/ytdl-org/youtube-dl/issues/18267 https://github.com/ytdl-org/youtube-dl/issues/23899 https://github.com/ytdl-org/youtube-dl/issues/24375 https://github.com/ytdl-org/youtube-dl/issues/24595 https://github.com/ytdl-org/youtube-dl/issues/27899 Related: https://github.com/ytdl-org/youtube-dl/issues/22379 https://github.com/ytdl-org/youtube-dl/pull/24517 https://github.com/ytdl-org/youtube-dl/pull/24886 https://github.com/ytdl-org/youtube-dl/pull/27215 Notes: * The functions `extractor.common._extract_..._formats` are still kept for compatibility * Only some extractors have currently been moved to using `_extract_..._formats_and_subtitles` * Direct subtitle manifests (without a master) are not supported and are wrongly identified as containing video formats * AES support is untested * The fragmented TTML subtitles extracted from DASH/ISM are valid, but are unsupported by `ffmpeg` and most video players * Their XML fragments can be dumped using `ffmpeg -i in.mp4 -f data -map 0 -c copy out.ttml`. Once the unnecessary headers are stripped out of this, it becomes a valid self-contained TTML file * The TTML subs downloaded from DASH manifests can also be directly opened with <https://github.com/SubtitleEdit> * Fragmented WebVTT files extracted from DASH/ISM are also unsupported by most tools * Unlike the TTML files, the XML fragments of these cannot be dumped using `ffmpeg` * The WebVTT subs extracted from DASH can be parsed by <https://github.com/gpac/gpac> * But the validity of those extracted from ISM is untested
author: pukkandan <pukkandan.ytdlp@gmail.com> 2021-04-28 19:02:43 +0530
committer: GitHub <noreply@github.com> 2021-04-28 19:02:43 +0530
commit: be6202f12b97858b9d716e608394b51065d0419f (patch)
tree: 71f920777c24d9d81c990f0bf57d66e9d5bbaff7 /yt_dlp/extractor/generic.py
parent: db9a564b6a5c31472f8298969584eead0b59fa1c (diff)
parent: e8f834cd8dfc07011d1080321e42bc130e7201bb (diff)
download: hypervideo-pre-be6202f12b97858b9d716e608394b51065d0419f.tar.lz
hypervideo-pre-be6202f12b97858b9d716e608394b51065d0419f.tar.xz
hypervideo-pre-be6202f12b97858b9d716e608394b51065d0419f.zip
1 files changed, 5 insertions, 3 deletions
diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py
index 4250d1093..32815476f 100644
--- a/yt_dlp/extractor/generic.py
+++ b/yt_dlp/extractor/generic.py
@@ -2444,8 +2444,9 @@ class GenericIE(InfoExtractor):
         m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type)
         if m:
             format_id = compat_str(m.group('format_id'))
+            subtitles = {}
             if format_id.endswith('mpegurl'):
-                formats = self._extract_m3u8_formats(url, video_id, 'mp4')
+                formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4')
             elif format_id == 'f4m':
                 formats = self._extract_f4m_formats(url, video_id)
             else:
@@ -2457,6 +2458,7 @@ class GenericIE(InfoExtractor):
                 info_dict['direct'] = True
             self._sort_formats(formats)
             info_dict['formats'] = formats
+            info_dict['subtitles'] = subtitles
             return info_dict
 
         if not self._downloader.params.get('test', False) and not is_intentional:
@@ -2510,7 +2512,7 @@ class GenericIE(InfoExtractor):
             if doc.tag == 'rss':
                 return self._extract_rss(url, video_id, doc)
             elif doc.tag == 'SmoothStreamingMedia':
-                info_dict['formats'] = self._parse_ism_formats(doc, url)
+                info_dict['formats'], info_dict['subtitles'] = self._parse_ism_formats_and_subtitles(doc, url)
                 self._sort_formats(info_dict['formats'])
                 return info_dict
             elif re.match(r'^(?:{[^}]+})?smil$', doc.tag):
@@ -2524,7 +2526,7 @@ class GenericIE(InfoExtractor):
                         xspf_base_url=full_response.geturl()),
                     video_id)
             elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag):
-                info_dict['formats'] = self._parse_mpd_formats(
+                info_dict['formats'], info_dict['subtitles'] = self._parse_mpd_formats_and_subtitles(
                     doc,
                     mpd_base_url=full_response.geturl().rpartition('/')[0],
                     mpd_url=url)
author	pukkandan <pukkandan.ytdlp@gmail.com>	2021-04-28 19:02:43 +0530
committer	GitHub <noreply@github.com>	2021-04-28 19:02:43 +0530
commit	be6202f12b97858b9d716e608394b51065d0419f (patch)
tree	71f920777c24d9d81c990f0bf57d66e9d5bbaff7 /yt_dlp/extractor/generic.py
parent	db9a564b6a5c31472f8298969584eead0b59fa1c (diff)
parent	e8f834cd8dfc07011d1080321e42bc130e7201bb (diff)
download	hypervideo-pre-be6202f12b97858b9d716e608394b51065d0419f.tar.lz hypervideo-pre-be6202f12b97858b9d716e608394b51065d0419f.tar.xz hypervideo-pre-be6202f12b97858b9d716e608394b51065d0419f.zip