From 6e14a8547d05cf02ad72e8415f70072bdf599212 Mon Sep 17 00:00:00 2001 From: James Taylor Date: Sun, 28 Jun 2020 13:18:54 -0700 Subject: Handle case where embedded player response missing Change it so that the other data is extracted from the regular playerResponse. Extract formats from the embedded player response, but fall back to the regular one if that doesn't work. Sometimes there is no 'player' at top_level and the URLs are in the regular playerResponse. --- youtube/yt_data_extract/watch_extraction.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'youtube/yt_data_extract') diff --git a/youtube/yt_data_extract/watch_extraction.py b/youtube/yt_data_extract/watch_extraction.py index 0970125..9dbb252 100644 --- a/youtube/yt_data_extract/watch_extraction.py +++ b/youtube/yt_data_extract/watch_extraction.py @@ -415,8 +415,14 @@ def extract_watch_info(polymer_json): if error: info['playability_error'] = error + player_response = top_level.get('playerResponse', {}) + + # usually, only the embedded one has the urls player_args = deep_get(top_level, 'player', 'args', default={}) - player_response = json.loads(player_args['player_response']) if 'player_response' in player_args else {} + if 'player_response' in player_args: + embedded_player_response = json.loads(player_args['player_response']) + else: + embedded_player_response = {} # captions info['automatic_caption_languages'] = [] @@ -446,7 +452,9 @@ def extract_watch_info(polymer_json): print('WARNING: Found non-translatable caption language') # formats - _extract_formats(info, player_response) + _extract_formats(info, embedded_player_response) + if not info['formats']: + _extract_formats(info, player_response) # playability errors _extract_playability_error(info, player_response) -- cgit v1.2.3