diff options
author | pukkandan <pukkandan.ytdlp@gmail.com> | 2021-04-28 19:02:43 +0530 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-04-28 19:02:43 +0530 |
commit | be6202f12b97858b9d716e608394b51065d0419f (patch) | |
tree | 71f920777c24d9d81c990f0bf57d66e9d5bbaff7 /yt_dlp/extractor/common.py | |
parent | db9a564b6a5c31472f8298969584eead0b59fa1c (diff) | |
parent | e8f834cd8dfc07011d1080321e42bc130e7201bb (diff) | |
download | hypervideo-pre-be6202f12b97858b9d716e608394b51065d0419f.tar.lz hypervideo-pre-be6202f12b97858b9d716e608394b51065d0419f.tar.xz hypervideo-pre-be6202f12b97858b9d716e608394b51065d0419f.zip |
Subtitle extraction from streaming media manifests #247
Authored by fstirlitz
Modified from: https://github.com/ytdl-org/youtube-dl/pull/6144
Closes: #73
Fixes:
https://github.com/ytdl-org/youtube-dl/issues/6106
https://github.com/ytdl-org/youtube-dl/issues/14977
https://github.com/ytdl-org/youtube-dl/issues/21438
https://github.com/ytdl-org/youtube-dl/issues/23609
https://github.com/ytdl-org/youtube-dl/issues/28132
Might also fix (untested):
https://github.com/ytdl-org/youtube-dl/issues/15424
https://github.com/ytdl-org/youtube-dl/issues/18267
https://github.com/ytdl-org/youtube-dl/issues/23899
https://github.com/ytdl-org/youtube-dl/issues/24375
https://github.com/ytdl-org/youtube-dl/issues/24595
https://github.com/ytdl-org/youtube-dl/issues/27899
Related:
https://github.com/ytdl-org/youtube-dl/issues/22379
https://github.com/ytdl-org/youtube-dl/pull/24517
https://github.com/ytdl-org/youtube-dl/pull/24886
https://github.com/ytdl-org/youtube-dl/pull/27215
Notes:
* The functions `extractor.common._extract_..._formats` are still kept for compatibility
* Only some extractors have currently been moved to using `_extract_..._formats_and_subtitles`
* Direct subtitle manifests (without a master) are not supported and are wrongly identified as containing video formats
* AES support is untested
* The fragmented TTML subtitles extracted from DASH/ISM are valid, but are unsupported by `ffmpeg` and most video players
* Their XML fragments can be dumped using `ffmpeg -i in.mp4 -f data -map 0 -c copy out.ttml`.
Once the unnecessary headers are stripped out of this, it becomes a valid self-contained ttml file
* The ttml subs downloaded from DASH manifests can also be directly opened with <https://github.com/SubtitleEdit>
* Fragmented WebVTT files extracted from DASH/ISM are also unsupported by most tools
* Unlike the ttml files, the XML fragments of these cannot be dumped using `ffmpeg`
* The webvtt subs extracted from DASH can be parsed by <https://github.com/gpac/gpac>
* But the validity of those extracted from ISM is untested
Diffstat (limited to 'yt_dlp/extractor/common.py')
-rw-r--r-- | yt_dlp/extractor/common.py | 288 |
1 files changed, 199 insertions, 89 deletions
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 4487c5375..2ca25951b 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1879,11 +1879,21 @@ class InfoExtractor(object): 'format_note': 'Quality selection URL', } - def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, - entry_protocol='m3u8', preference=None, quality=None, - m3u8_id=None, note=None, errnote=None, - fatal=True, live=False, data=None, headers={}, - query={}): + def _extract_m3u8_formats(self, *args, **kwargs): + fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs) + if subs: + self.report_warning(bug_reports_message( + "Ignoring subtitle tracks found in the HLS manifest; " + "if any subtitle tracks are missing," + )) + return fmts + + def _extract_m3u8_formats_and_subtitles( + self, m3u8_url, video_id, ext=None, entry_protocol='m3u8', + preference=None, quality=None, m3u8_id=None, note=None, + errnote=None, fatal=True, live=False, data=None, headers={}, + query={}): + res = self._download_webpage_handle( m3u8_url, video_id, note=note or 'Downloading m3u8 information', @@ -1891,30 +1901,34 @@ class InfoExtractor(object): fatal=fatal, data=data, headers=headers, query=query) if res is False: - return [] + return [], {} m3u8_doc, urlh = res m3u8_url = urlh.geturl() - return self._parse_m3u8_formats( + return self._parse_m3u8_formats_and_subtitles( m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol, preference=preference, quality=quality, m3u8_id=m3u8_id, note=note, errnote=errnote, fatal=fatal, live=live, data=data, headers=headers, query=query, video_id=video_id) - def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None, - entry_protocol='m3u8', preference=None, quality=None, - m3u8_id=None, live=False, note=None, errnote=None, - fatal=True, data=None, headers={}, query={}, video_id=None): + def _parse_m3u8_formats_and_subtitles( + self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8', + preference=None, 
quality=None, m3u8_id=None, live=False, note=None, + errnote=None, fatal=True, data=None, headers={}, query={}, + video_id=None): + if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access - return [] + return [], {} if (not self._downloader.params.get('allow_unplayable_formats') and re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc)): # Apple FairPlay - return [] + return [], {} formats = [] + subtitles = {} + format_url = lambda u: ( u if re.match(r'^https?://', u) @@ -2001,7 +2015,7 @@ class InfoExtractor(object): } formats.append(f) - return formats + return formats, subtitles groups = {} last_stream_inf = {} @@ -2013,6 +2027,21 @@ class InfoExtractor(object): if not (media_type and group_id and name): return groups.setdefault(group_id, []).append(media) + # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1> + if media_type == 'SUBTITLES': + lang = media['LANGUAGE'] # XXX: normalise? + url = format_url(media['URI']) + sub_info = { + 'url': url, + 'ext': determine_ext(url), + } + if sub_info['ext'] == 'm3u8': + # Per RFC 8216 ยง3.1, the only possible subtitle format m3u8 + # files may contain is WebVTT: + # <https://tools.ietf.org/html/rfc8216#section-3.1> + sub_info['ext'] = 'vtt' + sub_info['protocol'] = 'm3u8_native' + subtitles.setdefault(lang, []).append(sub_info) if media_type not in ('VIDEO', 'AUDIO'): return media_url = media.get('URI') @@ -2160,7 +2189,7 @@ class InfoExtractor(object): formats.append(http_f) last_stream_inf = {} - return formats + return formats, subtitles @staticmethod def _xpath_ns(path, namespace=None): @@ -2403,23 +2432,44 @@ class InfoExtractor(object): }) return entries - def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}): + def _extract_mpd_formats(self, *args, **kwargs): + fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs) + if subs: + self.report_warning(bug_reports_message( + "Ignoring subtitle tracks found in the DASH 
manifest; " + "if any subtitle tracks are missing," + )) + return fmts + + def _extract_mpd_formats_and_subtitles( + self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, + fatal=True, data=None, headers={}, query={}): res = self._download_xml_handle( mpd_url, video_id, note=note or 'Downloading MPD manifest', errnote=errnote or 'Failed to download MPD manifest', fatal=fatal, data=data, headers=headers, query=query) if res is False: - return [] + return [], {} mpd_doc, urlh = res if mpd_doc is None: - return [] + return [], {} mpd_base_url = base_url(urlh.geturl()) - return self._parse_mpd_formats( + return self._parse_mpd_formats_and_subtitles( mpd_doc, mpd_id, mpd_base_url, mpd_url) - def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None): + def _parse_mpd_formats(self, *args, **kwargs): + fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs) + if subs: + self.report_warning(bug_reports_message( + "Ignoring subtitle tracks found in the DASH manifest; " + "if any subtitle tracks are missing," + )) + return fmts + + def _parse_mpd_formats_and_subtitles( + self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None): """ Parse formats from MPD manifest. 
References: @@ -2429,7 +2479,7 @@ class InfoExtractor(object): """ if not self._downloader.params.get('dynamic_mpd', True): if mpd_doc.get('type') == 'dynamic': - return [] + return [], {} namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None) @@ -2501,6 +2551,7 @@ class InfoExtractor(object): mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration')) formats = [] + subtitles = {} for period in mpd_doc.findall(_add_ns('Period')): period_duration = parse_duration(period.get('duration')) or mpd_duration period_ms_info = extract_multisegment_info(period, { @@ -2518,11 +2569,9 @@ class InfoExtractor(object): representation_attrib.update(representation.attrib) # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory mime_type = representation_attrib['mimeType'] - content_type = mime_type.split('/')[0] - if content_type == 'text': - # TODO implement WebVTT downloading - pass - elif content_type in ('video', 'audio'): + content_type = representation_attrib.get('contentType', mime_type.split('/')[0]) + + if content_type in ('video', 'audio', 'text'): base_url = '' for element in (representation, adaptation_set, period, mpd_doc): base_url_e = element.find(_add_ns('BaseURL')) @@ -2539,21 +2588,28 @@ class InfoExtractor(object): url_el = representation.find(_add_ns('BaseURL')) filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None) bandwidth = int_or_none(representation_attrib.get('bandwidth')) - f = { - 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id, - 'manifest_url': mpd_url, - 'ext': mimetype2ext(mime_type), - 'width': int_or_none(representation_attrib.get('width')), - 'height': int_or_none(representation_attrib.get('height')), - 'tbr': float_or_none(bandwidth, 1000), - 'asr': int_or_none(representation_attrib.get('audioSamplingRate')), - 'fps': int_or_none(representation_attrib.get('frameRate')), - 
'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None, - 'format_note': 'DASH %s' % content_type, - 'filesize': filesize, - 'container': mimetype2ext(mime_type) + '_dash', - } - f.update(parse_codecs(representation_attrib.get('codecs'))) + if content_type in ('video', 'audio'): + f = { + 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id, + 'manifest_url': mpd_url, + 'ext': mimetype2ext(mime_type), + 'width': int_or_none(representation_attrib.get('width')), + 'height': int_or_none(representation_attrib.get('height')), + 'tbr': float_or_none(bandwidth, 1000), + 'asr': int_or_none(representation_attrib.get('audioSamplingRate')), + 'fps': int_or_none(representation_attrib.get('frameRate')), + 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None, + 'format_note': 'DASH %s' % content_type, + 'filesize': filesize, + 'container': mimetype2ext(mime_type) + '_dash', + } + f.update(parse_codecs(representation_attrib.get('codecs'))) + elif content_type == 'text': + f = { + 'ext': mimetype2ext(mime_type), + 'manifest_url': mpd_url, + 'filesize': filesize, + } representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info) def prepare_template(template_name, identifiers): @@ -2700,26 +2756,38 @@ class InfoExtractor(object): else: # Assuming direct URL to unfragmented media. 
f['url'] = base_url - formats.append(f) + if content_type in ('video', 'audio'): + formats.append(f) + elif content_type == 'text': + subtitles.setdefault(lang or 'und', []).append(f) else: self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) - return formats - - def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}): + return formats, subtitles + + def _extract_ism_formats(self, *args, **kwargs): + fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs) + if subs: + self.report_warning(bug_reports_message( + "Ignoring subtitle tracks found in the ISM manifest; " + "if any subtitle tracks are missing," + )) + return fmts + + def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}): res = self._download_xml_handle( ism_url, video_id, note=note or 'Downloading ISM manifest', errnote=errnote or 'Failed to download ISM manifest', fatal=fatal, data=data, headers=headers, query=query) if res is False: - return [] + return [], {} ism_doc, urlh = res if ism_doc is None: - return [] + return [], {} - return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id) + return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id) - def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None): + def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None): """ Parse formats from ISM manifest. 
References: @@ -2727,26 +2795,28 @@ class InfoExtractor(object): https://msdn.microsoft.com/en-us/library/ff469518.aspx """ if ism_doc.get('IsLive') == 'TRUE': - return [] + return [], {} if (not self._downloader.params.get('allow_unplayable_formats') and ism_doc.find('Protection') is not None): - return [] + return [], {} duration = int(ism_doc.attrib['Duration']) timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000 formats = [] + subtitles = {} for stream in ism_doc.findall('StreamIndex'): stream_type = stream.get('Type') - if stream_type not in ('video', 'audio'): + if stream_type not in ('video', 'audio', 'text'): continue url_pattern = stream.attrib['Url'] stream_timescale = int_or_none(stream.get('TimeScale')) or timescale stream_name = stream.get('Name') + stream_language = stream.get('Language', 'und') for track in stream.findall('QualityLevel'): fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None) # TODO: add support for WVC1 and WMAP - if fourcc not in ('H264', 'AVC1', 'AACL'): + if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'): self.report_warning('%s is not a supported codec' % fourcc) continue tbr = int(track.attrib['Bitrate']) // 1000 @@ -2789,33 +2859,52 @@ class InfoExtractor(object): format_id.append(stream_name) format_id.append(compat_str(tbr)) - formats.append({ - 'format_id': '-'.join(format_id), - 'url': ism_url, - 'manifest_url': ism_url, - 'ext': 'ismv' if stream_type == 'video' else 'isma', - 'width': width, - 'height': height, - 'tbr': tbr, - 'asr': sampling_rate, - 'vcodec': 'none' if stream_type == 'audio' else fourcc, - 'acodec': 'none' if stream_type == 'video' else fourcc, - 'protocol': 'ism', - 'fragments': fragments, - '_download_params': { - 'duration': duration, - 'timescale': stream_timescale, - 'width': width or 0, - 'height': height or 0, - 'fourcc': fourcc, - 'codec_private_data': track.get('CodecPrivateData'), - 'sampling_rate': sampling_rate, - 'channels': 
int_or_none(track.get('Channels', 2)), - 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)), - 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)), - }, - }) - return formats + if stream_type == 'text': + subtitles.setdefault(stream_language, []).append({ + 'ext': 'ismt', + 'protocol': 'ism', + 'url': ism_url, + 'manifest_url': ism_url, + 'fragments': fragments, + '_download_params': { + 'stream_type': stream_type, + 'duration': duration, + 'timescale': stream_timescale, + 'fourcc': fourcc, + 'language': stream_language, + 'codec_private_data': track.get('CodecPrivateData'), + } + }) + elif stream_type in ('video', 'audio'): + formats.append({ + 'format_id': '-'.join(format_id), + 'url': ism_url, + 'manifest_url': ism_url, + 'ext': 'ismv' if stream_type == 'video' else 'isma', + 'width': width, + 'height': height, + 'tbr': tbr, + 'asr': sampling_rate, + 'vcodec': 'none' if stream_type == 'audio' else fourcc, + 'acodec': 'none' if stream_type == 'video' else fourcc, + 'protocol': 'ism', + 'fragments': fragments, + '_download_params': { + 'stream_type': stream_type, + 'duration': duration, + 'timescale': stream_timescale, + 'width': width or 0, + 'height': height or 0, + 'fourcc': fourcc, + 'language': stream_language, + 'codec_private_data': track.get('CodecPrivateData'), + 'sampling_rate': sampling_rate, + 'channels': int_or_none(track.get('Channels', 2)), + 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)), + 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)), + }, + }) + return formats, subtitles def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None): def absolute_url(item_url): @@ -2940,7 +3029,16 @@ class InfoExtractor(object): entries.append(media_info) return entries - def _extract_akamai_formats(self, manifest_url, video_id, hosts={}): + def _extract_akamai_formats(self, *args, **kwargs): + fmts, 
subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs) + if subs: + self.report_warning(bug_reports_message( + "Ignoring subtitle tracks found in the manifests; " + "if any subtitle tracks are missing," + )) + return fmts + + def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}): signed = 'hdnea=' in manifest_url if not signed: # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html @@ -2949,6 +3047,7 @@ class InfoExtractor(object): '', manifest_url).strip('?') formats = [] + subtitles = {} hdcore_sign = 'hdcore=3.7.0' f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m') @@ -2967,10 +3066,11 @@ class InfoExtractor(object): hls_host = hosts.get('hls') if hls_host: m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url) - m3u8_formats = self._extract_m3u8_formats( + m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles( m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) formats.extend(m3u8_formats) + subtitles = self._merge_subtitles(subtitles, m3u8_subtitles) http_host = hosts.get('http') if http_host and m3u8_formats and not signed: @@ -2994,7 +3094,7 @@ class InfoExtractor(object): formats.append(http_f) i += 1 - return formats + return formats, subtitles def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]): query = compat_urlparse.urlparse(url).query @@ -3319,12 +3419,22 @@ class InfoExtractor(object): return ret @classmethod - def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2): - """ Merge two subtitle dictionaries, language by language. 
""" - ret = dict(subtitle_dict1) - for lang in subtitle_dict2: - ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang]) - return ret + def _merge_subtitles(cls, *dicts, **kwargs): + """ Merge subtitle dictionaries, language by language. """ + + target = (lambda target=None: target)(**kwargs) + # The above lambda extracts the keyword argument 'target' from kwargs + # while ensuring there are no stray ones. When Python 2 support + # is dropped, remove it and change the function signature to: + # + # def _merge_subtitles(cls, *dicts, target=None): + + if target is None: + target = {} + for d in dicts: + for lang, subs in d.items(): + target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs) + return target def extract_automatic_captions(self, *args, **kwargs): if (self._downloader.params.get('writeautomaticsub', False) |