aboutsummaryrefslogtreecommitdiffstats
path: root/youtube
diff options
context:
space:
mode:
authorJames Taylor <user234683@users.noreply.github.com>2020-02-01 14:23:50 -0800
committerJames Taylor <user234683@users.noreply.github.com>2020-02-01 14:23:50 -0800
commite364927f8374577c3ecaf7ccb365382aa525f913 (patch)
tree95383bbe3447be778619e71409baf38063992e92 /youtube
parentf787e4e2027583476ca34bd01c8462f6459369bb (diff)
downloadyt-local-e364927f8374577c3ecaf7ccb365382aa525f913.tar.lz
yt-local-e364927f8374577c3ecaf7ccb365382aa525f913.tar.xz
yt-local-e364927f8374577c3ecaf7ccb365382aa525f913.zip
yt_data_extract: parse mimeType field for codecs
the youtube-dl formats table doesn't have all the necessary information
Diffstat (limited to 'youtube')
-rw-r--r--youtube/yt_data_extract/watch_extraction.py27
1 files changed, 27 insertions, 0 deletions
diff --git a/youtube/yt_data_extract/watch_extraction.py b/youtube/yt_data_extract/watch_extraction.py
index 1609f8d..6c0899b 100644
--- a/youtube/yt_data_extract/watch_extraction.py
+++ b/youtube/yt_data_extract/watch_extraction.py
@@ -263,6 +263,31 @@ def _extract_watch_info_desktop(top_level):
return info
# Matches a format's mime type string, for example
#     video/mp4; codecs="avc1.64001F, mp4a.40.2"
# Groups: (1) 'audio' or 'video', (2) the subtype after the slash,
# (3) the codec list exactly as written between the quotes (', ' separated).
# '-' is accepted inside codec names so that hyphenated codecs such as
# 'ec-3' match; without it those mime types fail to match at all.
fmt_type_re = re.compile(
    r'(audio|video)/(\w+); codecs="([\w.\-]+(?:, [\w.\-]+)*)"')
def update_format_with_type_info(fmt, yt_fmt):
    '''Fill in fmt['ext'], fmt['vcodec'] and fmt['acodec'] from the mime type.

    The mime type string is looked up in yt_fmt under 'mimeType' (or 'type',
    for the invidious api format) and is matched against fmt_type_re, e.g.
        video/mp4; codecs="avc1.64001F, mp4a.40.2"

    fmt is modified in place and nothing is returned. fmt['ext'] is set to
    the subtype after the slash; each recognized codec is passed to
    conservative_update for 'vcodec' or 'acodec'. If the mime type is
    missing, fmt is left untouched. If it does not have the expected form, a
    warning is printed and fmt is left untouched.
    '''
    # 'type' for invidious api format
    mime_type = multi_get(yt_fmt, 'mimeType', 'type')
    if mime_type is None:
        return

    match = fmt_type_re.fullmatch(mime_type)
    if match is None:
        # An unexpected mime type must not abort extraction of the other
        # formats, so warn and skip instead of failing on match.groups().
        print('Warning: unrecognized mime type: ' + mime_type)
        return

    # media_type is 'audio' or 'video' (named to avoid shadowing builtin type)
    media_type, fmt['ext'], codecs = match.groups()
    codecs = codecs.split(', ')
    for codec in codecs:
        if (codec.startswith('av')
                or codec in ('vp9', 'vp8', 'vp8.0', 'h263', 'h264', 'mp4v')):
            # Normalize the alternate spelling of vp8
            if codec == 'vp8.0':
                codec = 'vp8'
            conservative_update(fmt, 'vcodec', codec)
        elif (codec.startswith('mp4a')
                or codec in ('opus', 'mp3', 'aac', 'dtse', 'ec-3', 'vorbis')):
            conservative_update(fmt, 'acodec', codec)
        else:
            print('Warning: unrecognized codec: ' + codec)

    # An audio-only format is expected to list exactly one codec. This is
    # external data, so report a violation rather than assert on it.
    if media_type == 'audio' and len(codecs) != 1:
        print('Warning: expected one codec for audio format, got: '
              + ', '.join(codecs))
+
def _extract_formats(info, player_response):
streaming_data = player_response.get('streamingData', {})
yt_formats = streaming_data.get('formats', []) + streaming_data.get('adaptiveFormats', [])
@@ -275,11 +300,13 @@ def _extract_formats(info, player_response):
fmt['audio_bitrate'] = None
fmt['acodec'] = None
fmt['vcodec'] = None
+ fmt['itag'] = yt_fmt.get('itag')
fmt['width'] = yt_fmt.get('width')
fmt['height'] = yt_fmt.get('height')
fmt['file_size'] = yt_fmt.get('contentLength')
fmt['audio_sample_rate'] = yt_fmt.get('audioSampleRate')
fmt['fps'] = yt_fmt.get('fps')
+ update_format_with_type_info(fmt, yt_fmt)
cipher = dict(urllib.parse.parse_qsl(yt_fmt.get('cipher', '')))
if cipher:
fmt['url'] = cipher.get('url')