diff options
Diffstat (limited to 'youtube_dlc/extractor/common.py')
-rw-r--r-- | youtube_dlc/extractor/common.py | 262 |
1 files changed, 167 insertions, 95 deletions
diff --git a/youtube_dlc/extractor/common.py b/youtube_dlc/extractor/common.py index 0114d24da..edfa99975 100644 --- a/youtube_dlc/extractor/common.py +++ b/youtube_dlc/extractor/common.py @@ -1833,9 +1833,8 @@ class InfoExtractor(object): def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, entry_protocol='m3u8', preference=None, quality=None, - m3u8_id=None, note=None, errnote=None, - fatal=True, live=False, data=None, headers={}, - query={}): + m3u8_id=None, live=False, note=None, errnote=None, + fatal=True, data=None, headers={}, query={}): res = self._download_webpage_handle( m3u8_url, video_id, note=note or 'Downloading m3u8 information', @@ -1850,11 +1849,14 @@ class InfoExtractor(object): return self._parse_m3u8_formats( m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol, - preference=preference, quality=quality, m3u8_id=m3u8_id, live=live) + preference=preference, quality=quality, m3u8_id=m3u8_id, + note=note, errnote=errnote, fatal=fatal, live=live, data=data, + headers=headers, query=query, video_id=video_id) def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8', preference=None, quality=None, - m3u8_id=None, live=False): + m3u8_id=None, live=False, note=None, errnote=None, + fatal=True, data=None, headers={}, query={}, video_id=None): if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access return [] @@ -1868,6 +1870,8 @@ class InfoExtractor(object): if re.match(r'^https?://', u) else compat_urlparse.urljoin(m3u8_url, u)) + split_discontinuity = self._downloader.params.get('hls_split_discontinuity', False) + # References: # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21 # 2. https://github.com/ytdl-org/youtube-dl/issues/12211 @@ -1884,15 +1888,67 @@ class InfoExtractor(object): # media playlist and MUST NOT appear in master playlist thus we can # clearly detect media playlist with this criterion. + def _extract_m3u8_playlist_formats(format_url, m3u8_doc=None): + if not m3u8_doc: + res = self._download_webpage_handle( + format_url, video_id, + note=False, + errnote=errnote or 'Failed to download m3u8 playlist information', + fatal=fatal, data=data, headers=headers, query=query) + + if res is False: + return [] + + m3u8_doc, urlh = res + format_url = urlh.geturl() + + playlist_formats = [] + i = ( + 0 + if split_discontinuity + else None) + format_info = { + 'index': i, + 'key_data': None, + 'files': [], + } + for line in m3u8_doc.splitlines(): + if not line.startswith('#'): + format_info['files'].append(line) + elif split_discontinuity and line.startswith('#EXT-X-DISCONTINUITY'): + i += 1 + playlist_formats.append(format_info) + format_info = { + 'index': i, + 'url': format_url, + 'files': [], + } + playlist_formats.append(format_info) + return playlist_formats + if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is - return [{ - 'url': m3u8_url, - 'format_id': m3u8_id, - 'ext': ext, - 'protocol': entry_protocol, - 'preference': preference, - 'quality': quality, - }] + + playlist_formats = _extract_m3u8_playlist_formats(m3u8_doc, True) + + for format in playlist_formats: + format_id = [] + if m3u8_id: + format_id.append(m3u8_id) + format_index = format.get('index') + if format_index: + format_id.append(str(format_index)) + f = { + 'format_id': '-'.join(format_id), + 'format_index': format_index, + 'url': m3u8_url, + 'ext': ext, + 'protocol': entry_protocol, + 'preference': preference, + 'quality': quality, + } + formats.append(f) + + return formats groups = {} last_stream_inf = {} @@ -1908,23 +1964,31 @@ class InfoExtractor(object): return media_url = media.get('URI') if media_url: + manifest_url = format_url(media_url) format_id = [] - for v in (m3u8_id, group_id, name): - if v: - format_id.append(v) - f = { - 'format_id': '-'.join(format_id), - 'url': format_url(media_url), - 'manifest_url': m3u8_url, - 'language': media.get('LANGUAGE'), - 'ext': ext, - 'protocol': entry_protocol, - 'preference': preference, - 'quality': quality, - } - if media_type == 'AUDIO': - f['vcodec'] = 'none' - formats.append(f) + playlist_formats = _extract_m3u8_playlist_formats(manifest_url) + + for format in playlist_formats: + format_index = format.get('index') + for v in (m3u8_id, group_id, name): + if v: + format_id.append(v) + if format_index: + format_id.append(str(format_index)) + f = { + 'format_id': '-'.join(format_id), + 'format_index': format_index, + 'url': manifest_url, + 'manifest_url': m3u8_url, + 'language': media.get('LANGUAGE'), + 'ext': ext, + 'protocol': entry_protocol, + 'preference': preference, + 'quality': quality, + } + if media_type == 'AUDIO': + f['vcodec'] = 'none' + formats.append(f) def build_stream_name(): # Despite specification does not mention NAME attribute for @@ -1961,74 +2025,82 @@ class InfoExtractor(object): tbr = float_or_none( last_stream_inf.get('AVERAGE-BANDWIDTH') or last_stream_inf.get('BANDWIDTH'), scale=1000) - format_id = [] - if m3u8_id: - format_id.append(m3u8_id) - stream_name = build_stream_name() - # Bandwidth of live streams may differ over time thus making - # format_id unpredictable. So it's better to keep provided - # format_id intact. - if not live: - format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats))) manifest_url = format_url(line.strip()) - f = { - 'format_id': '-'.join(format_id), - 'url': manifest_url, - 'manifest_url': m3u8_url, - 'tbr': tbr, - 'ext': ext, - 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')), - 'protocol': entry_protocol, - 'preference': preference, - 'quality': quality, - } - resolution = last_stream_inf.get('RESOLUTION') - if resolution: - mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution) - if mobj: - f['width'] = int(mobj.group('width')) - f['height'] = int(mobj.group('height')) - # Unified Streaming Platform - mobj = re.search( - r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url']) - if mobj: - abr, vbr = mobj.groups() - abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000) - f.update({ - 'vbr': vbr, - 'abr': abr, - }) - codecs = parse_codecs(last_stream_inf.get('CODECS')) - f.update(codecs) - audio_group_id = last_stream_inf.get('AUDIO') - # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which - # references a rendition group MUST have a CODECS attribute. - # However, this is not always respected, for example, [2] - # contains EXT-X-STREAM-INF tag which references AUDIO - # rendition group but does not have CODECS and despite - # referencing an audio group it represents a complete - # (with audio and video) format. So, for such cases we will - # ignore references to rendition groups and treat them - # as complete formats. - if audio_group_id and codecs and f.get('vcodec') != 'none': - audio_group = groups.get(audio_group_id) - if audio_group and audio_group[0].get('URI'): - # TODO: update acodec for audio only formats with - # the same GROUP-ID - f['acodec'] = 'none' - formats.append(f) - # for DailyMotion - progressive_uri = last_stream_inf.get('PROGRESSIVE-URI') - if progressive_uri: - http_f = f.copy() - del http_f['manifest_url'] - http_f.update({ - 'format_id': f['format_id'].replace('hls-', 'http-'), - 'protocol': 'http', - 'url': progressive_uri, - }) - formats.append(http_f) + playlist_formats = _extract_m3u8_playlist_formats(manifest_url) + + for format in playlist_formats: + format_id = [] + if m3u8_id: + format_id.append(m3u8_id) + format_index = format.get('index') + stream_name = build_stream_name() + # Bandwidth of live streams may differ over time thus making + # format_id unpredictable. So it's better to keep provided + # format_id intact. + if not live: + format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats))) + if format_index: + format_id.append(str(format_index)) + f = { + 'format_id': '-'.join(format_id), + 'format_index': format_index, + 'url': manifest_url, + 'manifest_url': m3u8_url, + 'tbr': tbr, + 'ext': ext, + 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')), + 'protocol': entry_protocol, + 'preference': preference, + 'quality': quality, + } + resolution = last_stream_inf.get('RESOLUTION') + if resolution: + mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution) + if mobj: + f['width'] = int(mobj.group('width')) + f['height'] = int(mobj.group('height')) + # Unified Streaming Platform + mobj = re.search( + r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url']) + if mobj: + abr, vbr = mobj.groups() + abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000) + f.update({ + 'vbr': vbr, + 'abr': abr, + }) + codecs = parse_codecs(last_stream_inf.get('CODECS')) + f.update(codecs) + audio_group_id = last_stream_inf.get('AUDIO') + # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which + # references a rendition group MUST have a CODECS attribute. + # However, this is not always respected, for example, [2] + # contains EXT-X-STREAM-INF tag which references AUDIO + # rendition group but does not have CODECS and despite + # referencing an audio group it represents a complete + # (with audio and video) format. So, for such cases we will + # ignore references to rendition groups and treat them + # as complete formats. + if audio_group_id and codecs and f.get('vcodec') != 'none': + audio_group = groups.get(audio_group_id) + if audio_group and audio_group[0].get('URI'): + # TODO: update acodec for audio only formats with + # the same GROUP-ID + f['acodec'] = 'none' + formats.append(f) + + # for DailyMotion + progressive_uri = last_stream_inf.get('PROGRESSIVE-URI') + if progressive_uri: + http_f = f.copy() + del http_f['manifest_url'] + http_f.update({ + 'format_id': f['format_id'].replace('hls-', 'http-'), + 'protocol': 'http', + 'url': progressive_uri, + }) + formats.append(http_f) last_stream_inf = {} return formats |