diff options
author | pukkandan <pukkandan.ytdlp@gmail.com> | 2021-04-28 19:02:43 +0530 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-04-28 19:02:43 +0530 |
commit | be6202f12b97858b9d716e608394b51065d0419f (patch) | |
tree | 71f920777c24d9d81c990f0bf57d66e9d5bbaff7 /yt_dlp/extractor/twitter.py | |
parent | db9a564b6a5c31472f8298969584eead0b59fa1c (diff) | |
parent | e8f834cd8dfc07011d1080321e42bc130e7201bb (diff) | |
download | hypervideo-pre-be6202f12b97858b9d716e608394b51065d0419f.tar.lz hypervideo-pre-be6202f12b97858b9d716e608394b51065d0419f.tar.xz hypervideo-pre-be6202f12b97858b9d716e608394b51065d0419f.zip |
Subtitle extraction from streaming media manifests #247
Authored by fstirlitz
Modified from: https://github.com/ytdl-org/youtube-dl/pull/6144
Closes: #73
Fixes:
https://github.com/ytdl-org/youtube-dl/issues/6106
https://github.com/ytdl-org/youtube-dl/issues/14977
https://github.com/ytdl-org/youtube-dl/issues/21438
https://github.com/ytdl-org/youtube-dl/issues/23609
https://github.com/ytdl-org/youtube-dl/issues/28132
Might also fix (untested):
https://github.com/ytdl-org/youtube-dl/issues/15424
https://github.com/ytdl-org/youtube-dl/issues/18267
https://github.com/ytdl-org/youtube-dl/issues/23899
https://github.com/ytdl-org/youtube-dl/issues/24375
https://github.com/ytdl-org/youtube-dl/issues/24595
https://github.com/ytdl-org/youtube-dl/issues/27899
Related:
https://github.com/ytdl-org/youtube-dl/issues/22379
https://github.com/ytdl-org/youtube-dl/pull/24517
https://github.com/ytdl-org/youtube-dl/pull/24886
https://github.com/ytdl-org/youtube-dl/pull/27215
Notes:
* The functions `extractor.common._extract_..._formats` are still kept for compatibility
* Only some extractors have currently been moved to using `_extract_..._formats_and_subtitles`
* Direct subtitle manifests (without a master) are not supported and are wrongly identified as containing video formats
* AES support is untested
* The fragmented TTML subtitles extracted from DASH/ISM are valid, but are unsupported by `ffmpeg` and most video players
* Their XML fragments can be dumped using `ffmpeg -i in.mp4 -f data -map 0 -c copy out.ttml`.
Once the unnecessary headers are stripped out of this, it becomes a valid self-contained ttml file
* The ttml subs downloaded from DASH manifests can also be directly opened with <https://github.com/SubtitleEdit>
* Fragmented WebVTT files extracted from DASH/ISM are also unsupported by most tools
* Unlike the ttml files, the XML fragments of these cannot be dumped using `ffmpeg`
* The WebVTT subs extracted from DASH can be parsed by <https://github.com/gpac/gpac>
* But the validity of those extracted from ISM is untested
Diffstat (limited to 'yt_dlp/extractor/twitter.py')
-rw-r--r-- | yt_dlp/extractor/twitter.py | 28 |
1 files changed, 19 insertions, 9 deletions
diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py index 8a2a77b71..63c11bd47 100644 --- a/yt_dlp/extractor/twitter.py +++ b/yt_dlp/extractor/twitter.py @@ -36,9 +36,9 @@ class TwitterBaseIE(InfoExtractor): def _extract_variant_formats(self, variant, video_id): variant_url = variant.get('url') if not variant_url: - return [] + return [], {} elif '.m3u8' in variant_url: - return self._extract_m3u8_formats( + return self._extract_m3u8_formats_and_subtitles( variant_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) else: @@ -49,22 +49,27 @@ class TwitterBaseIE(InfoExtractor): 'tbr': tbr, } self._search_dimensions_in_video_url(f, variant_url) - return [f] + return [f], {} def _extract_formats_from_vmap_url(self, vmap_url, video_id): vmap_data = self._download_xml(vmap_url, video_id) formats = [] + subtitles = {} urls = [] for video_variant in vmap_data.findall('.//{http://twitter.com/schema/videoVMapV2.xsd}videoVariant'): video_variant.attrib['url'] = compat_urllib_parse_unquote( video_variant.attrib['url']) urls.append(video_variant.attrib['url']) - formats.extend(self._extract_variant_formats( - video_variant.attrib, video_id)) + fmts, subs = self._extract_variant_formats( + video_variant.attrib, video_id) + formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) video_url = strip_or_none(xpath_text(vmap_data, './/MediaFile')) if video_url not in urls: - formats.extend(self._extract_variant_formats({'url': video_url}, video_id)) - return formats + fmts, subs = self._extract_variant_formats({'url': video_url}, video_id) + formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) + return formats, subtitles @staticmethod def _search_dimensions_in_video_url(a_format, video_url): @@ -471,8 +476,11 @@ class TwitterIE(TwitterBaseIE): video_info = media.get('video_info') or {} formats = [] + subtitles = {} for variant in video_info.get('variants', []): - 
formats.extend(self._extract_variant_formats(variant, twid)) + fmts, subs = self._extract_variant_formats(variant, twid) + subtitles = self._merge_subtitles(subtitles, subs) + formats.extend(fmts) self._sort_formats(formats) thumbnails = [] @@ -491,6 +499,7 @@ class TwitterIE(TwitterBaseIE): info.update({ 'formats': formats, + 'subtitles': subtitles, 'thumbnails': thumbnails, 'duration': float_or_none(video_info.get('duration_millis'), 1000), }) @@ -540,7 +549,7 @@ class TwitterIE(TwitterBaseIE): is_amplify = card_name == 'amplify' vmap_url = get_binding_value('amplify_url_vmap') if is_amplify else get_binding_value('player_stream_url') content_id = get_binding_value('%s_content_id' % (card_name if is_amplify else 'player')) - formats = self._extract_formats_from_vmap_url(vmap_url, content_id or twid) + formats, subtitles = self._extract_formats_from_vmap_url(vmap_url, content_id or twid) self._sort_formats(formats) thumbnails = [] @@ -558,6 +567,7 @@ class TwitterIE(TwitterBaseIE): info.update({ 'formats': formats, + 'subtitles': subtitles, 'thumbnails': thumbnails, 'duration': int_or_none(get_binding_value( 'content_duration_seconds')), |