diff options
Diffstat (limited to 'youtube/yt_data_extract/watch_extraction.py')
| -rw-r--r-- | youtube/yt_data_extract/watch_extraction.py | 74 |
1 files changed, 69 insertions, 5 deletions
diff --git a/youtube/yt_data_extract/watch_extraction.py b/youtube/yt_data_extract/watch_extraction.py
index 85c8100..de87a6a 100644
--- a/youtube/yt_data_extract/watch_extraction.py
+++ b/youtube/yt_data_extract/watch_extraction.py
@@ -473,13 +473,22 @@ def _extract_formats(info, player_response):
         itag = yt_fmt.get('itag')
 
         # Translated audio track
-        # Example: https://www.youtube.com/watch?v=gF9kkB0UWYQ
-        # Only get the original language for now so a foreign
-        # translation will not be picked just because it comes first
-        if deep_get(yt_fmt, 'audioTrack', 'audioIsDefault') is False:
-            continue
+        # Keep non-default tracks for multi-audio support
+        # (they will be served via local proxy)
 
         fmt = {}
+
+        # Audio track info
+        audio_track = yt_fmt.get('audioTrack')
+        if audio_track:
+            fmt['audio_track_id'] = audio_track.get('id')
+            fmt['audio_track_name'] = audio_track.get('displayName')
+            fmt['audio_track_is_default'] = audio_track.get('audioIsDefault', True)
+        else:
+            fmt['audio_track_id'] = None
+            fmt['audio_track_name'] = None
+            fmt['audio_track_is_default'] = True
+
         fmt['itag'] = itag
         fmt['ext'] = None
         fmt['audio_bitrate'] = None
@@ -532,6 +541,61 @@ def _extract_formats(info, player_response):
     else:
         info['ip_address'] = None
 
+
+def parse_format(yt_fmt):
+    '''Parse a single YouTube format dict into our internal format dict.'''
+    itag = yt_fmt.get('itag')
+    fmt = {}
+
+    audio_track = yt_fmt.get('audioTrack')
+    if audio_track:
+        fmt['audio_track_id'] = audio_track.get('id')
+        fmt['audio_track_name'] = audio_track.get('displayName')
+        fmt['audio_track_is_default'] = audio_track.get('audioIsDefault', True)
+    else:
+        fmt['audio_track_id'] = None
+        fmt['audio_track_name'] = None
+        fmt['audio_track_is_default'] = True
+
+    fmt['itag'] = itag
+    fmt['ext'] = None
+    fmt['audio_bitrate'] = None
+    fmt['bitrate'] = yt_fmt.get('bitrate')
+    fmt['acodec'] = None
+    fmt['vcodec'] = None
+    fmt['width'] = yt_fmt.get('width')
+    fmt['height'] = yt_fmt.get('height')
+    fmt['file_size'] = extract_int(yt_fmt.get('contentLength'))
+    fmt['audio_sample_rate'] = extract_int(yt_fmt.get('audioSampleRate'))
+    fmt['duration_ms'] = yt_fmt.get('approxDurationMs')
+    fmt['fps'] = yt_fmt.get('fps')
+    fmt['init_range'] = yt_fmt.get('initRange')
+    fmt['index_range'] = yt_fmt.get('indexRange')
+    for key in ('init_range', 'index_range'):
+        if fmt[key]:
+            fmt[key]['start'] = int(fmt[key]['start'])
+            fmt[key]['end'] = int(fmt[key]['end'])
+    update_format_with_type_info(fmt, yt_fmt)
+    cipher = dict(urllib.parse.parse_qsl(multi_get(yt_fmt,
+        'cipher', 'signatureCipher', default='')))
+    if cipher:
+        fmt['url'] = cipher.get('url')
+    else:
+        fmt['url'] = yt_fmt.get('url')
+    fmt['s'] = cipher.get('s')
+    fmt['sp'] = cipher.get('sp')
+
+    hardcoded_itag_info = _formats.get(str(itag), {})
+    for key, value in hardcoded_itag_info.items():
+        conservative_update(fmt, key, value)
+    fmt['quality'] = hardcoded_itag_info.get('height')
+    conservative_update(fmt, 'quality',
+        extract_int(yt_fmt.get('quality'), whole_word=False))
+    conservative_update(fmt, 'quality',
+        extract_int(yt_fmt.get('qualityLabel'), whole_word=False))
+
+    return fmt
+
 hls_regex = re.compile(r'[\w_-]+=(?:"[^"]+"|[^",]+),')
 def extract_hls_formats(hls_manifest):
     '''returns hls_formats, err'''
