diff options
Diffstat (limited to 'youtube/yt_data_extract.py')
-rw-r--r-- | youtube/yt_data_extract.py | 435 |
1 files changed, 420 insertions, 15 deletions
diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py index cccd679..81604fd 100644 --- a/youtube/yt_data_extract.py +++ b/youtube/yt_data_extract.py @@ -6,6 +6,7 @@ import re import urllib import collections from math import ceil +import traceback # videos (all of type str): @@ -36,8 +37,112 @@ from math import ceil # size # first_video_id - - +# from https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/youtube.py +_formats = { + '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, + '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, + '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'}, + '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'}, + '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'}, + '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, + '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well + '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'}, + '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, + '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, + '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, + '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, + '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, + '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, + '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + + + # 3D videos + '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, + '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, + '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, + '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, + '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, + + # Apple HTTP Live Streaming + '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264'}, + '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264'}, + '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264'}, + '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264'}, + '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264'}, + '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264'}, + + # DASH mp4 video + '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559) + '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, + '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, + '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'}, + + # Dash mp4 audio + '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'}, + '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'}, + '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'}, + '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, + '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, + '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'}, + '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'}, + + # Dash webm + '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'}, + '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug) + '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + + # Dash webm audio + '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128}, + '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256}, + + # Dash webm audio with opus inside + '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50}, + '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70}, + '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160}, + + # RTMP (unnamed) + '_rtmp': {'protocol': 'rtmp'}, + + # av01 video only formats sometimes served with "unknown" codecs + '394': {'vcodec': 'av01.0.05M.08'}, + '395': {'vcodec': 'av01.0.05M.08'}, + '396': {'vcodec': 'av01.0.05M.08'}, + '397': {'vcodec': 'av01.0.05M.08'}, +} def get_plain_text(node): @@ -59,7 +164,7 @@ def format_text_runs(runs): result += html.escape(text_run["text"]) return result -def default_get(object, key, default, types=()): +def default_get(object, key, default=None, types=()): '''Like dict.get(), but returns default if the result doesn't match one of the types. Also works for indexing lists.''' try: @@ -74,7 +179,7 @@ def default_get(object, key, default, types=()): -def default_multi_get(object, *keys, default, types=()): +def default_multi_get(object, *keys, default=None, types=()): '''Like dict.get(), but for nested dictionaries/sequences, supporting keys or indices. Last argument is the default value to use in case of any IndexErrors or KeyErrors. If types is given and the result doesn't match one of those types, default is returned''' @@ -106,6 +211,11 @@ def multi_default_multi_get(object, *key_sequences, default=None, types=()): continue return default +def remove_redirect(url): + if re.fullmatch(r'(((https?:)?//)?(www.)?youtube.com)?/redirect\?.*', url) is not None: # youtube puts these on external links to do tracking + query_string = url[url.find('?')+1: ] + return urllib.parse.parse_qs(query_string)['q'][0] + return url def get_url(node): try: @@ -239,9 +349,9 @@ def renderer_info(renderer, additional_info={}): type = list(renderer.keys())[0] renderer = renderer[type] info = {} - if type == 'itemSectionRenderer': + if type in ('itemSectionRenderer', 'compactAutoplayRenderer'): return renderer_info(renderer['contents'][0], additional_info) - + if type in ('movieRenderer', 'clarificationRenderer'): info['type'] = 'unsupported' return info @@ -345,6 +455,7 @@ item_types = { 'videoRenderer', 'compactVideoRenderer', + 'compactAutoplayRenderer', 'gridVideoRenderer', 'playlistVideoRenderer', @@ -378,6 +489,11 @@ def traverse_browse_renderer(renderer): print('Could not find tab with content') return {} +def traverse_standard_list(renderer): + renderer_list = multi_default_multi_get(renderer, ['contents'], ['items'], default=(), types=(list, tuple)) + continuation = default_multi_get(renderer, 'continuations', 0, 'nextContinuationData', 'continuation') + return renderer_list, continuation + # these renderers contain one inside them nested_renderer_dispatch = { 'singleColumnBrowseResultsRenderer': traverse_browse_renderer, @@ -385,7 +501,16 @@ nested_renderer_dispatch = { 'twoColumnSearchResultsRenderer': lambda renderer: default_get(renderer, 'primaryContents', {}, types=dict), } -def extract_items(response): +# these renderers contain a list of renderers in side them +nested_renderer_list_dispatch = { + 'sectionListRenderer': traverse_standard_list, + 'itemSectionRenderer': traverse_standard_list, + 'gridRenderer': traverse_standard_list, + 'playlistVideoListRenderer': traverse_standard_list, + 'singleColumnWatchNextResults': lambda r: (default_multi_get(r, 'results', 'results', 'contents', default=[], types=(list, tuple)), None), +} + +def extract_items(response, item_types=item_types): '''return items, ctoken''' if 'continuationContents' in response: # always has just the one [something]Continuation key, but do this just in case they add some tracking key or something @@ -414,13 +539,11 @@ def extract_items(response): key, value = list(renderer.items())[0] # has a list in it, add it to the iter stack - if key in list_types: - renderer_list = multi_default_multi_get(value, ['contents'], ['items'], default=(), types=(list, tuple)) + if key in nested_renderer_list_dispatch: + renderer_list, continuation = nested_renderer_list_dispatch[key](value) if renderer_list: iter_stack.append(current_iter) current_iter = iter(renderer_list) - - continuation = default_multi_get(value, 'continuations', 0, 'nextContinuationData', 'continuation', default=None, types=str) if continuation: ctoken = continuation @@ -506,10 +629,7 @@ def extract_channel_info(polymer_json, tab): info['links'] = [] for link_json in channel_metadata.get('primaryLinks', ()): - url = link_json['navigationEndpoint']['urlEndpoint']['url'] - if url.startswith('/redirect'): # youtube puts these on external links to do tracking - query_string = url[url.find('?')+1: ] - url = urllib.parse.parse_qs(query_string)['q'][0] + url = remove_redirect(link_json['navigationEndpoint']['urlEndpoint']['url']) text = get_plain_text(link_json['title']) @@ -699,5 +819,290 @@ def parse_comments_polymer(polymer_json): 'sort': metadata['sort'], } +def check_missing_keys(object, *key_sequences): + for key_sequence in key_sequences: + _object = object + try: + for key in key_sequence: + _object = object[key] + except (KeyError, IndexError, TypeError): + return 'Could not find ' + key + + return None + +def extract_plain_text(node, default=None): + if isinstance(node, str): + return node + + try: + return node['simpleText'] + except (KeyError, TypeError): + pass + + try: + return ''.join(text_run['text'] for text_run in node['runs']) + except (KeyError, TypeError): + pass + + return default + +def extract_formatted_text(node): + try: + result = [] + runs = node['runs'] + for run in runs: + url = default_multi_get(run, 'navigationEndpoint', 'urlEndpoint', 'url') + if url is not None: + run['url'] = remove_redirect(url) + run['text'] = run['url'] # youtube truncates the url text, we don't want that nonsense + return runs + except (KeyError, TypeError): + traceback.print_exc() + pass + + try: + return [{'text': node['simpleText']}] + except (KeyError, TypeError): + pass + + return [] + +def extract_integer(string): + if not isinstance(string, str): + return None + match = re.search(r'(\d+)', string.replace(',', '')) + if match is None: + return None + try: + return int(match.group(1)) + except ValueError: + return None + +def extract_metadata_row_info(video_renderer_info): + # extract category and music list + info = { + 'category': None, + 'music_list': [], + } + + current_song = {} + for row in default_multi_get(video_renderer_info, 'metadataRowContainer', 'metadataRowContainerRenderer', 'rows', default=[]): + row_title = extract_plain_text(default_multi_get(row, 'metadataRowRenderer', 'title'), default='') + row_content = extract_plain_text(default_multi_get(row, 'metadataRowRenderer', 'contents', 0)) + if row_title == 'Category': + info['category'] = row_content + elif row_title in ('Song', 'Music'): + if current_song: + info['music_list'].append(current_song) + current_song = {'title': row_content} + elif row_title == 'Artist': + current_song['artist'] = row_content + elif row_title == 'Album': + current_song['album'] = row_content + elif row_title == 'Writers': + current_song['writers'] = row_content + elif row_title.startswith('Licensed'): + current_song['licensor'] = row_content + if current_song: + info['music_list'].append(current_song) + return info + +def extract_watch_info_mobile(top_level): + info = {} + microformat = default_multi_get(top_level, 'playerResponse', 'microformat', 'playerMicroformatRenderer', default={}) + + info['allowed_countries'] = microformat.get('availableCountries', []) + info['published_date'] = microformat.get('publishDate') + + response = top_level.get('response', {}) + + # video info from metadata renderers + items, _ = extract_items(response, item_types={'slimVideoMetadataRenderer'}) + if items: + video_info = items[0]['slimVideoMetadataRenderer'] + else: + print('Failed to extract video metadata') + video_info = {} + + info.update(extract_metadata_row_info(video_info)) + #info['description'] = extract_formatted_text(video_info.get('description')) + info['like_count'] = None + info['dislike_count'] = None + for button in video_info.get('buttons', ()): + button_renderer = button.get('slimMetadataToggleButtonRenderer', {}) + + # all the digits can be found in the accessibility data + count = extract_integer(default_multi_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText', 'accessibility', 'accessibilityData', 'label')) + + # this count doesn't have all the digits, it's like 53K for instance + dumb_count = extract_integer(extract_plain_text(default_multi_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText'))) + + # the accessibility text will be "No likes" or "No dislikes" or something like that, but dumb count will be 0 + if dumb_count == 0: + count = 0 + + if 'isLike' in button_renderer: + info['like_count'] = count + elif 'isDislike' in button_renderer: + info['dislike_count'] = count + + # comment section info + items, _ = extract_items(response, item_types={'commentSectionRenderer'}) + if items: + comment_info = items[0]['commentSectionRenderer'] + comment_count_text = extract_plain_text(default_multi_get(comment_info, 'header', 'commentSectionHeaderRenderer', 'countText')) + if comment_count_text == 'Comments': # just this with no number, means 0 comments + info['comment_count'] = 0 + else: + info['comment_count'] = extract_integer(comment_count_text) + info['comments_disabled'] = False + else: # no comment section present means comments are disabled + info['comment_count'] = 0 + info['comments_disabled'] = True + + # related videos + related, _ = extract_items(response) + info['related_videos'] = [renderer_info(renderer) for renderer in related] + + return info + +month_abbreviations = {'jan':'1', 'feb':'2', 'mar':'3', 'apr':'4', 'may':'5', 'jun':'6', 'jul':'7', 'aug':'8', 'sep':'9', 'oct':'10', 'nov':'11', 'dec':'12'} +def extract_watch_info_desktop(top_level): + info = { + 'comment_count': None, + 'comments_disabled': None, + 'allowed_countries': None, + } + + video_info = {} + for renderer in default_multi_get(top_level, 'response', 'contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', default=()): + if renderer and list(renderer.keys())[0] in ('videoPrimaryInfoRenderer', 'videoSecondaryInfoRenderer'): + video_info.update(list(renderer.values())[0]) + + info.update(extract_metadata_row_info(video_info)) + #info['description'] = extract_formatted_text(video_info.get('description', None)) + info['published_date'] = None + date_text = extract_plain_text(video_info.get('dateText', None)) + if date_text is not None: + date_text = util.left_remove(date_text.lower(), 'published on ').replace(',', '') + parts = date_text.split() + if len(parts) == 3: + month, day, year = date_text.split() + month = month_abbreviations.get(month[0:3]) # slicing in case they start writing out the full month name + if month and (re.fullmatch(r'\d\d?', day) is not None) and (re.fullmatch(r'\d{4}', year) is not None): + info['published_date'] = year + '-' + month + '-' + day + + likes_dislikes = default_multi_get(video_info, 'sentimentBar', 'sentimentBarRenderer', 'tooltip', default='').split('/') + if len(likes_dislikes) == 2: + info['like_count'] = extract_integer(likes_dislikes[0]) + info['dislike_count'] = extract_integer(likes_dislikes[1]) + else: + info['like_count'] = None + info['dislike_count'] = None + + #info['title'] = extract_plain_text(video_info.get('title', None)) + #info['author'] = extract_plain_text(default_multi_get(video_info, 'owner', 'videoOwnerRenderer', 'title')) + #info['author_id'] = default_multi_get(video_info, 'owner', 'videoOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId') + #info['view_count'] = extract_integer(extract_plain_text(default_multi_get(video_info, 'viewCount', 'videoViewCountRenderer', 'viewCount'))) + + related = default_multi_get(top_level, 'response', 'contents', 'twoColumnWatchNextResults', 'secondaryResults', 'secondaryResults', 'results', default=[]) + info['related_videos'] = [renderer_info(renderer) for renderer in related] + + return info + + +def extract_watch_info(polymer_json): + info = {'playability_error': None, 'error': None} + + if isinstance(polymer_json, dict): + top_level = polymer_json + elif isinstance(polymer_json, (list, tuple)): + top_level = {} + for page_part in polymer_json: + if not isinstance(page_part, dict): + return {'error': 'Invalid page part'} + top_level.update(page_part) + else: + return {'error': 'Invalid top level polymer data'} + + error = check_missing_keys(top_level, + ['playerResponse'], + ) + if error: + return {'error': error} + + error = check_missing_keys(top_level, + ['player', 'args'], + ['player', 'assets', 'js'], + ) + if error: + info['playability_error'] = error + + + player_args = default_multi_get(top_level, 'player', 'args', default={}) + parsed_formats = [] + + if 'url_encoded_fmt_stream_map' in player_args: + string_formats = player_args['url_encoded_fmt_stream_map'].split(',') + parsed_formats += [dict(urllib.parse.parse_qsl(fmt_string)) for fmt_string in string_formats if fmt_string] + + if 'adaptive_fmts' in player_args: + string_formats = player_args['adaptive_fmts'].split(',') + parsed_formats += [dict(urllib.parse.parse_qsl(fmt_string)) for fmt_string in string_formats if fmt_string] + + info['formats'] = [] + + for parsed_fmt in parsed_formats: + # start with defaults from the big table at the top + if 'itag' in parsed_fmt: + fmt = _formats.get(parsed_fmt['itag'], {}).copy() + else: + fmt = {} + + # then override them + fmt.update(parsed_fmt) + try: + fmt['width'], fmt['height'] = map(int, fmt['size'].split('x')) + except (KeyError, ValueError, TypeError): + pass + + fmt['file_size'] = None + if 'clen' in fmt: + fmt['file_size'] = int(fmt.get('clen')) + else: + match = re.search(r'&clen=(\d+)', fmt.get('url')) + if match: + fmt['file_size'] = int(match.group(1)) + info['formats'].append(fmt) + + info['base_js'] = default_multi_get(top_level, 'player', 'assets', 'js') + if info['base_js']: + info['base_js'] = normalize_url(info['base_js']) + + mobile = 'singleColumnWatchNextResults' in default_multi_get(top_level, 'response', 'contents', default={}) + if mobile: + info.update(extract_watch_info_mobile(top_level)) + else: + info.update(extract_watch_info_desktop(top_level)) + + # stuff from videoDetails + video_details = default_multi_get(top_level, 'playerResponse', 'videoDetails', default={}) + info['title'] = extract_plain_text(video_details.get('title')) + info['duration'] = extract_integer(video_details.get('lengthSeconds')) + info['view_count'] = extract_integer(video_details.get('viewCount')) + # videos with no description have a blank string + info['description'] = video_details.get('shortDescription') + info['id'] = video_details.get('videoId') + info['author'] = video_details.get('author') + info['author_id'] = video_details.get('channelId') + info['live'] = video_details.get('isLiveContent') + info['unlisted'] = not video_details.get('isCrawlable', True) + info['tags'] = video_details.get('keywords', []) + + # other stuff + info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None + info['subtitles'] = {} # TODO + + return info |