From 54b39f13034fdbcf427a21b3be8d56020516a764 Mon Sep 17 00:00:00 2001 From: James Taylor Date: Tue, 27 Jul 2021 21:35:11 -0700 Subject: Fix missing likes, dislikes, & music list due to Youtube changes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Also moves some microformat extraction from _extract_watch_info_mobile to extract_watch_info where it belongs. _extract_watch_info_mobile is really only for stuff visible on the page, and thus specialized for either mobile or desktop. Signed-off-by: Jesús --- youtube/yt_data_extract/common.py | 30 ++++-- youtube/yt_data_extract/watch_extraction.py | 151 ++++++++++++++++++---------- 2 files changed, 121 insertions(+), 60 deletions(-) diff --git a/youtube/yt_data_extract/common.py b/youtube/yt_data_extract/common.py index d03bd89..e87808b 100644 --- a/youtube/yt_data_extract/common.py +++ b/youtube/yt_data_extract/common.py @@ -403,6 +403,7 @@ nested_renderer_dispatch = { 'twoColumnBrowseResultsRenderer': _traverse_browse_renderer, 'twoColumnSearchResultsRenderer': lambda r: get(r, 'primaryContents', {}), 'richItemRenderer': lambda r: get(r, 'content', {}), + 'engagementPanelSectionListRenderer': lambda r: get(r, 'content', {}), } # these renderers contain a list of renderers inside them @@ -412,6 +413,8 @@ nested_renderer_list_dispatch = { 'gridRenderer': _traverse_standard_list, 'richGridRenderer': _traverse_standard_list, 'playlistVideoListRenderer': _traverse_standard_list, + 'structuredDescriptionContentRenderer': _traverse_standard_list, + 'slimVideoMetadataSectionRenderer': _traverse_standard_list, 'singleColumnWatchNextResults': lambda r: (deep_get(r, 'results', 'results', 'contents', default=[]), None), } def get_nested_renderer_list_function(key): @@ -475,8 +478,11 @@ def extract_items_from_renderer(renderer, item_types=_item_types): renderer = None -def extract_items(response, item_types=_item_types): +def extract_items(response, item_types=_item_types, + search_engagement_panels=False): '''return items, ctoken''' + items = [] + ctoken = None if 'continuationContents' in response: # sometimes there's another, empty, junk [something]Continuation key # find real one @@ -484,13 +490,23 @@ def extract_items(response, item_types=_item_types): 'continuationContents', {}).items(): # e.g. commentSectionContinuation, playlistVideoListContinuation if key.endswith('Continuation'): - items, cont = extract_items_from_renderer({key: renderer_cont}, + items, ctoken = extract_items_from_renderer( + {key: renderer_cont}, item_types=item_types) if items: - return items, cont - return [], None + break elif 'contents' in response: renderer = get(response, 'contents', {}) - return extract_items_from_renderer(renderer, item_types=item_types) - else: - return [], None + items, ctoken = extract_items_from_renderer( + renderer, + item_types=item_types) + + if search_engagement_panels and 'engagementPanels' in response: + for engagement_renderer in response['engagementPanels']: + additional_items, cont = extract_items_from_renderer( + engagement_renderer, + item_types=item_types) + items += additional_items + if cont and not ctoken: + ctoken = cont + return items, ctoken diff --git a/youtube/yt_data_extract/watch_extraction.py b/youtube/yt_data_extract/watch_extraction.py index daa1e89..9ad3d43 100644 --- a/youtube/yt_data_extract/watch_extraction.py +++ b/youtube/yt_data_extract/watch_extraction.py @@ -116,7 +116,72 @@ _formats = { '397': {'vcodec': 'av01.0.05M.08'}, } -def _extract_metadata_row_info(video_renderer_info): + +def _extract_from_video_information_renderer(renderer_content): + subtitle = extract_str(renderer_content.get('expandedSubtitle'), + default='') + info = { + 'title': extract_str(renderer_content.get('title')), + 'view_count': extract_int(subtitle), + 'unlisted': False, + 'live': 'watching' in subtitle, + } + for badge in renderer_content.get('badges', []): + if deep_get(badge, 'metadataBadgeRenderer', 'label') == 'Unlisted': + info['unlisted'] = True + return info + +def _extract_likes_dislikes(renderer_content): + info = { + 'like_count': None, + 'dislike_count': None, + } + for button in renderer_content.get('buttons', ()): + button_renderer = button.get('slimMetadataToggleButtonRenderer', {}) + + # all the digits can be found in the accessibility data + count = extract_int(deep_get( + button_renderer, + 'button', 'toggleButtonRenderer', 'defaultText', + 'accessibility', 'accessibilityData', 'label')) + + # this count doesn't have all the digits, it's like 53K for instance + dumb_count = extract_int(extract_str(deep_get( + button_renderer, 'button', 'toggleButtonRenderer', 'defaultText'))) + + # The accessibility text will be "No likes" or "No dislikes" or + # something like that, but dumb count will be 0 + if dumb_count == 0: + count = 0 + + if 'isLike' in button_renderer: + info['like_count'] = count + elif 'isDislike' in button_renderer: + info['dislike_count'] = count + return info + +def _extract_from_owner_renderer(renderer_content): + return { + 'author': extract_str(renderer_content.get('title')), + 'author_id': deep_get( + renderer_content, + 'navigationEndpoint', 'browseEndpoint', 'browseId'), + } + +def _extract_from_video_header_renderer(renderer_content): + return { + 'title': extract_str(renderer_content.get('title')), + 'time_published': extract_date(extract_str( + renderer_content.get('publishDate'))), + } + +def _extract_from_description_renderer(renderer_content): + return { + 'description': extract_str( + renderer_content.get('descriptionBodyText'), recover_urls=True), + } + +def _extract_metadata_row_info(renderer_content): # extract category and music list info = { 'category': None, @@ -124,7 +189,7 @@ def _extract_metadata_row_info(video_renderer_info): } current_song = {} - for row in deep_get(video_renderer_info, 'metadataRowContainer', 'metadataRowContainerRenderer', 'rows', default=[]): + for row in deep_get(renderer_content, 'rows', default=[]): row_title = extract_str(deep_get(row, 'metadataRowRenderer', 'title'), default='') row_content = extract_str(deep_get(row, 'metadataRowRenderer', 'contents', 0)) if row_title == 'Category': @@ -146,18 +211,18 @@ def _extract_metadata_row_info(video_renderer_info): return info +visible_extraction_dispatch = { + 'slimVideoInformationRenderer': _extract_from_video_information_renderer, + 'slimVideoActionBarRenderer': _extract_likes_dislikes, + 'slimOwnerRenderer': _extract_from_owner_renderer, + 'videoDescriptionHeaderRenderer': _extract_from_video_header_renderer, + 'expandableVideoDescriptionRenderer': _extract_from_description_renderer, + 'metadataRowContainerRenderer': _extract_metadata_row_info, +} + def _extract_watch_info_mobile(top_level): + '''Scrapes information from the visible page''' info = {} - microformat = deep_get(top_level, 'playerResponse', 'microformat', 'playerMicroformatRenderer', default={}) - - family_safe = microformat.get('isFamilySafe') - if family_safe is None: - info['age_restricted'] = None - else: - info['age_restricted'] = not family_safe - info['allowed_countries'] = microformat.get('availableCountries', []) - info['time_published'] = microformat.get('publishDate') - response = top_level.get('response', {}) # this renderer has the stuff visible on the page @@ -190,47 +255,22 @@ def _extract_watch_info_mobile(top_level): else: info['playlist'] = None - # Holds the visible video info. It is inside singleColumnWatchNextResults - # but use our convenience function instead - items, _ = extract_items(response, item_types={'slimVideoMetadataRenderer'}) - if items: - video_info = items[0]['slimVideoMetadataRenderer'] - else: - print('Failed to extract video metadata') - video_info = {} - - info.update(_extract_metadata_row_info(video_info)) - info['description'] = extract_str(video_info.get('description'), recover_urls=True) - info['view_count'] = extract_int(extract_str(video_info.get('expandedSubtitle'))) - info['author'] = extract_str(deep_get(video_info, 'owner', 'slimOwnerRenderer', 'title')) - info['author_id'] = deep_get(video_info, 'owner', 'slimOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId') - info['title'] = extract_str(video_info.get('title')) - info['live'] = 'watching' in extract_str(video_info.get('expandedSubtitle'), default='') - info['unlisted'] = False - for badge in video_info.get('badges', []): - if deep_get(badge, 'metadataBadgeRenderer', 'label') == 'Unlisted': - info['unlisted'] = True - info['like_count'] = None - info['dislike_count'] = None - if not info['time_published']: - info['time_published'] = extract_date(extract_str(video_info.get('dateText', None))) - for button in video_info.get('buttons', ()): - button_renderer = button.get('slimMetadataToggleButtonRenderer', {}) - - # all the digits can be found in the accessibility data - count = extract_int(deep_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText', 'accessibility', 'accessibilityData', 'label')) - - # this count doesn't have all the digits, it's like 53K for instance - dumb_count = extract_int(extract_str(deep_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText'))) - - # the accessibility text will be "No likes" or "No dislikes" or something like that, but dumb count will be 0 - if dumb_count == 0: - count = 0 + # use dispatch table to get information scattered in various renderers + items, _ = extract_items( + response, + item_types=visible_extraction_dispatch.keys(), + search_engagement_panels=True + ) + found = set() + for renderer in items: + name, renderer_content = list(renderer.items())[0] + found.add(name) + info.update(visible_extraction_dispatch[name](renderer_content)) + # Call the function on blank dict for any that weren't found + # so that the empty keys get added + for name in visible_extraction_dispatch.keys() - found: + info.update(visible_extraction_dispatch[name]({})) - if 'isLike' in button_renderer: - info['like_count'] = count - elif 'isDislike' in button_renderer: - info['dislike_count'] = count # comment section info items, _ = extract_items(response, item_types={ @@ -274,7 +314,6 @@ def _extract_watch_info_desktop(top_level): info = { 'comment_count': None, 'comments_disabled': None, - 'allowed_countries': [], 'limited_state': None, 'playlist': None, } @@ -564,6 +603,12 @@ def extract_watch_info(polymer_json): liberal_update(info, 'category', mf.get('category')) liberal_update(info, 'time_published', mf.get('publishDate')) liberal_update(info, 'time_uploaded', mf.get('uploadDate')) + family_safe = mf.get('isFamilySafe') + if family_safe is None: + conservative_update(info, 'age_restricted', None) + else: + conservative_update(info, 'age_restricted', not family_safe) + info['allowed_countries'] = mf.get('availableCountries', []) # other stuff info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None -- cgit v1.2.3