diff options
author | pukkandan <pukkandan.ytdlp@gmail.com> | 2021-04-28 19:02:43 +0530 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-04-28 19:02:43 +0530 |
commit | be6202f12b97858b9d716e608394b51065d0419f (patch) | |
tree | 71f920777c24d9d81c990f0bf57d66e9d5bbaff7 /yt_dlp/extractor/generic.py | |
parent | db9a564b6a5c31472f8298969584eead0b59fa1c (diff) | |
parent | e8f834cd8dfc07011d1080321e42bc130e7201bb (diff) | |
download | hypervideo-pre-be6202f12b97858b9d716e608394b51065d0419f.tar.lz hypervideo-pre-be6202f12b97858b9d716e608394b51065d0419f.tar.xz hypervideo-pre-be6202f12b97858b9d716e608394b51065d0419f.zip |
Subtitle extraction from streaming media manifests #247
Authored by fstirlitz
Modified from: https://github.com/ytdl-org/youtube-dl/pull/6144
Closes: #73
Fixes:
https://github.com/ytdl-org/youtube-dl/issues/6106
https://github.com/ytdl-org/youtube-dl/issues/14977
https://github.com/ytdl-org/youtube-dl/issues/21438
https://github.com/ytdl-org/youtube-dl/issues/23609
https://github.com/ytdl-org/youtube-dl/issues/28132
Might also fix (untested):
https://github.com/ytdl-org/youtube-dl/issues/15424
https://github.com/ytdl-org/youtube-dl/issues/18267
https://github.com/ytdl-org/youtube-dl/issues/23899
https://github.com/ytdl-org/youtube-dl/issues/24375
https://github.com/ytdl-org/youtube-dl/issues/24595
https://github.com/ytdl-org/youtube-dl/issues/27899
Related:
https://github.com/ytdl-org/youtube-dl/issues/22379
https://github.com/ytdl-org/youtube-dl/pull/24517
https://github.com/ytdl-org/youtube-dl/pull/24886
https://github.com/ytdl-org/youtube-dl/pull/27215
Notes:
* The functions `extractor.common._extract_..._formats` are still kept for compatibility
* Only some extractors have currently been moved to using `_extract_..._formats_and_subtitles`
* Direct subtitle manifests (without a master) are not supported and are wrongly identified as containing video formats
* AES support is untested
* The fragmented TTML subtitles extracted from DASH/ISM are valid, but are unsupported by `ffmpeg` and most video players
* Their XML fragments can be dumped using `ffmpeg -i in.mp4 -f data -map 0 -c copy out.ttml`.
Once the unnecessary headers are stripped out of this, it becomes a valid self-contained ttml file
* The ttml subs downloaded from DASH manifests can also be directly opened with <https://github.com/SubtitleEdit>
* Fragmented WebVTT files extracted from DASH/ISM are also unsupported by most tools
* Unlike the ttml files, the XML fragments of these cannot be dumped using `ffmpeg`
* The webvtt subs extracted from DASH can be parsed by <https://github.com/gpac/gpac>
* But the validity of those extracted from ISM is untested
Diffstat (limited to 'yt_dlp/extractor/generic.py')
-rw-r--r-- | yt_dlp/extractor/generic.py | 8 |
1 files changed, 5 insertions, 3 deletions
diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 4250d1093..32815476f 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -2444,8 +2444,9 @@ class GenericIE(InfoExtractor): m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type) if m: format_id = compat_str(m.group('format_id')) + subtitles = {} if format_id.endswith('mpegurl'): - formats = self._extract_m3u8_formats(url, video_id, 'mp4') + formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4') elif format_id == 'f4m': formats = self._extract_f4m_formats(url, video_id) else: @@ -2457,6 +2458,7 @@ class GenericIE(InfoExtractor): info_dict['direct'] = True self._sort_formats(formats) info_dict['formats'] = formats + info_dict['subtitles'] = subtitles return info_dict if not self._downloader.params.get('test', False) and not is_intentional: @@ -2510,7 +2512,7 @@ class GenericIE(InfoExtractor): if doc.tag == 'rss': return self._extract_rss(url, video_id, doc) elif doc.tag == 'SmoothStreamingMedia': - info_dict['formats'] = self._parse_ism_formats(doc, url) + info_dict['formats'], info_dict['subtitles'] = self._parse_ism_formats_and_subtitles(doc, url) self._sort_formats(info_dict['formats']) return info_dict elif re.match(r'^(?:{[^}]+})?smil$', doc.tag): @@ -2524,7 +2526,7 @@ class GenericIE(InfoExtractor): xspf_base_url=full_response.geturl()), video_id) elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): - info_dict['formats'] = self._parse_mpd_formats( + info_dict['formats'], info_dict['subtitles'] = self._parse_mpd_formats_and_subtitles( doc, mpd_base_url=full_response.geturl().rpartition('/')[0], mpd_url=url) |