# Patch excerpt, reconstructed from a whitespace-collapsed diff.
#
# Commit:  e364927f8374577c3ecaf7ccb365382aa525f913
# Author:  James Taylor
# Date:    Sat, 1 Feb 2020 14:23:50 -0800
# Subject: yt_data_extract: parse mimeType field for codecs
#
#     the youtube-dl formats table doesn't have all the necessary information
#
# File:    youtube/yt_data_extract/watch_extraction.py (27 insertions)
#
# First hunk (@@ -263,6 +263,31 @@): the definitions below are inserted after
# the trailing `return info` of `_extract_watch_info_desktop(top_level)` and
# before `_extract_formats(info, player_response)`.

# Matches strings such as 'video/mp4; codecs="avc1.64001F, mp4a.40.2"'.
# Groups: (1) 'audio' or 'video', (2) container/extension, (3) the raw
# comma-separated codec list. '-' is allowed in codec names so that 'ec-3'
# (which the function below explicitly recognizes) can actually match.
fmt_type_re = re.compile(
    r'(audio|video)/(\w+);\s*codecs="([\w.-]+(?:,\s*[\w.-]+)*)"')

# Codec names recognized verbatim (in addition to the prefix checks below).
_VIDEO_CODECS = frozenset(('vp9', 'vp8', 'vp8.0', 'h263', 'h264', 'mp4v'))
_AUDIO_CODECS = frozenset(('opus', 'mp3', 'aac', 'dtse', 'ec-3', 'vorbis'))


def update_format_with_type_info(fmt, yt_fmt):
    """Fill in ``fmt['ext']`` and codec fields from the format's mime type.

    The mime type is read from ``yt_fmt`` under ``'mimeType'`` or ``'type'``.
    When it parses, ``fmt['ext']`` is overwritten with the container name and
    each listed codec is passed to ``conservative_update`` for ``'vcodec'``
    or ``'acodec'``.

    NOTE(review): ``multi_get`` and ``conservative_update`` are defined
    outside this excerpt. Presumably ``multi_get`` returns the value of the
    first key present (else None) and ``conservative_update`` only fills a
    field that is not already set -- confirm against their definitions.

    Args:
        fmt: Format dict being built; modified in place.
        yt_fmt: Raw format entry from the player response (or from the
            invidious API, per the original comment).

    Returns:
        None. ``fmt`` is left untouched when the mime type is missing or
        cannot be parsed; problems are reported with printed warnings
        rather than exceptions, so one odd format cannot abort extraction.
    """
    # 'type' for invidious api format
    mime_type = multi_get(yt_fmt, 'mimeType', 'type')
    if mime_type is None:
        return

    match = fmt_type_re.fullmatch(mime_type)
    if match is None:
        # Previously this fell through to match.groups() and raised
        # AttributeError for any mime type the pattern did not cover.
        print('Warning: could not parse mime type: ' + mime_type)
        return

    media_type, fmt['ext'], codec_list = match.groups()
    codecs = [codec.strip() for codec in codec_list.split(',')]
    for codec in codecs:
        if codec.startswith('av') or codec in _VIDEO_CODECS:
            # Normalize the one known alias so both spellings compare equal.
            if codec == 'vp8.0':
                codec = 'vp8'
            conservative_update(fmt, 'vcodec', codec)
        elif codec.startswith('mp4a') or codec in _AUDIO_CODECS:
            conservative_update(fmt, 'acodec', codec)
        else:
            print('Warning: unrecognized codec: ' + codec)

    # An audio-only format is expected to list exactly one codec. This is
    # remote data, so report a violation instead of asserting (an assert
    # is stripped under -O and would otherwise abort the whole extraction).
    if media_type == 'audio' and len(codecs) != 1:
        print('Warning: audio format lists multiple codecs: ' + mime_type)


# Second hunk of the same commit, inside `_extract_formats`. That function is
# only partially visible in this excerpt, so the hunk is reproduced as diff
# text for reference (indentation reconstructed), not as live code. Just
# before it, the first hunk's trailing context shows how the function begins:
#
#    def _extract_formats(info, player_response):
#        streaming_data = player_response.get('streamingData', {})
#        yt_formats = streaming_data.get('formats', []) + streaming_data.get('adaptiveFormats', [])
#
#   @@ -275,11 +300,13 @@ def _extract_formats(info, player_response):
#            fmt['audio_bitrate'] = None
#            fmt['acodec'] = None
#            fmt['vcodec'] = None
#   +        fmt['itag'] = yt_fmt.get('itag')
#            fmt['width'] = yt_fmt.get('width')
#            fmt['height'] = yt_fmt.get('height')
#            fmt['file_size'] = yt_fmt.get('contentLength')
#            fmt['audio_sample_rate'] = yt_fmt.get('audioSampleRate')
#            fmt['fps'] = yt_fmt.get('fps')
#   +        update_format_with_type_info(fmt, yt_fmt)
#            cipher = dict(urllib.parse.parse_qsl(yt_fmt.get('cipher', '')))
#            if cipher:
#                fmt['url'] = cipher.get('url')