aboutsummaryrefslogtreecommitdiffstats
path: root/youtube/yt_data_extract/watch_extraction.py
diff options
context:
space:
mode:
Diffstat (limited to 'youtube/yt_data_extract/watch_extraction.py')
-rw-r--r--youtube/yt_data_extract/watch_extraction.py151
1 files changed, 98 insertions, 53 deletions
diff --git a/youtube/yt_data_extract/watch_extraction.py b/youtube/yt_data_extract/watch_extraction.py
index daa1e89..9ad3d43 100644
--- a/youtube/yt_data_extract/watch_extraction.py
+++ b/youtube/yt_data_extract/watch_extraction.py
@@ -116,7 +116,72 @@ _formats = {
'397': {'vcodec': 'av01.0.05M.08'},
}
-def _extract_metadata_row_info(video_renderer_info):
+
+def _extract_from_video_information_renderer(renderer_content):
+ subtitle = extract_str(renderer_content.get('expandedSubtitle'),
+ default='')
+ info = {
+ 'title': extract_str(renderer_content.get('title')),
+ 'view_count': extract_int(subtitle),
+ 'unlisted': False,
+ 'live': 'watching' in subtitle,
+ }
+ for badge in renderer_content.get('badges', []):
+ if deep_get(badge, 'metadataBadgeRenderer', 'label') == 'Unlisted':
+ info['unlisted'] = True
+ return info
+
+def _extract_likes_dislikes(renderer_content):
+ info = {
+ 'like_count': None,
+ 'dislike_count': None,
+ }
+ for button in renderer_content.get('buttons', ()):
+ button_renderer = button.get('slimMetadataToggleButtonRenderer', {})
+
+ # all the digits can be found in the accessibility data
+ count = extract_int(deep_get(
+ button_renderer,
+ 'button', 'toggleButtonRenderer', 'defaultText',
+ 'accessibility', 'accessibilityData', 'label'))
+
+ # this count doesn't have all the digits, it's like 53K for instance
+ dumb_count = extract_int(extract_str(deep_get(
+ button_renderer, 'button', 'toggleButtonRenderer', 'defaultText')))
+
+ # The accessibility text will be "No likes" or "No dislikes" or
+ # something like that, but dumb count will be 0
+ if dumb_count == 0:
+ count = 0
+
+ if 'isLike' in button_renderer:
+ info['like_count'] = count
+ elif 'isDislike' in button_renderer:
+ info['dislike_count'] = count
+ return info
+
+def _extract_from_owner_renderer(renderer_content):
+ return {
+ 'author': extract_str(renderer_content.get('title')),
+ 'author_id': deep_get(
+ renderer_content,
+ 'navigationEndpoint', 'browseEndpoint', 'browseId'),
+ }
+
+def _extract_from_video_header_renderer(renderer_content):
+ return {
+ 'title': extract_str(renderer_content.get('title')),
+ 'time_published': extract_date(extract_str(
+ renderer_content.get('publishDate'))),
+ }
+
+def _extract_from_description_renderer(renderer_content):
+ return {
+ 'description': extract_str(
+ renderer_content.get('descriptionBodyText'), recover_urls=True),
+ }
+
+def _extract_metadata_row_info(renderer_content):
# extract category and music list
info = {
'category': None,
@@ -124,7 +189,7 @@ def _extract_metadata_row_info(video_renderer_info):
}
current_song = {}
- for row in deep_get(video_renderer_info, 'metadataRowContainer', 'metadataRowContainerRenderer', 'rows', default=[]):
+ for row in deep_get(renderer_content, 'rows', default=[]):
row_title = extract_str(deep_get(row, 'metadataRowRenderer', 'title'), default='')
row_content = extract_str(deep_get(row, 'metadataRowRenderer', 'contents', 0))
if row_title == 'Category':
@@ -146,18 +211,18 @@ def _extract_metadata_row_info(video_renderer_info):
return info
+visible_extraction_dispatch = {
+ 'slimVideoInformationRenderer': _extract_from_video_information_renderer,
+ 'slimVideoActionBarRenderer': _extract_likes_dislikes,
+ 'slimOwnerRenderer': _extract_from_owner_renderer,
+ 'videoDescriptionHeaderRenderer': _extract_from_video_header_renderer,
+ 'expandableVideoDescriptionRenderer': _extract_from_description_renderer,
+ 'metadataRowContainerRenderer': _extract_metadata_row_info,
+}
+
def _extract_watch_info_mobile(top_level):
+ '''Scrapes information from the visible page'''
info = {}
- microformat = deep_get(top_level, 'playerResponse', 'microformat', 'playerMicroformatRenderer', default={})
-
- family_safe = microformat.get('isFamilySafe')
- if family_safe is None:
- info['age_restricted'] = None
- else:
- info['age_restricted'] = not family_safe
- info['allowed_countries'] = microformat.get('availableCountries', [])
- info['time_published'] = microformat.get('publishDate')
-
response = top_level.get('response', {})
# this renderer has the stuff visible on the page
@@ -190,47 +255,22 @@ def _extract_watch_info_mobile(top_level):
else:
info['playlist'] = None
- # Holds the visible video info. It is inside singleColumnWatchNextResults
- # but use our convenience function instead
- items, _ = extract_items(response, item_types={'slimVideoMetadataRenderer'})
- if items:
- video_info = items[0]['slimVideoMetadataRenderer']
- else:
- print('Failed to extract video metadata')
- video_info = {}
-
- info.update(_extract_metadata_row_info(video_info))
- info['description'] = extract_str(video_info.get('description'), recover_urls=True)
- info['view_count'] = extract_int(extract_str(video_info.get('expandedSubtitle')))
- info['author'] = extract_str(deep_get(video_info, 'owner', 'slimOwnerRenderer', 'title'))
- info['author_id'] = deep_get(video_info, 'owner', 'slimOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId')
- info['title'] = extract_str(video_info.get('title'))
- info['live'] = 'watching' in extract_str(video_info.get('expandedSubtitle'), default='')
- info['unlisted'] = False
- for badge in video_info.get('badges', []):
- if deep_get(badge, 'metadataBadgeRenderer', 'label') == 'Unlisted':
- info['unlisted'] = True
- info['like_count'] = None
- info['dislike_count'] = None
- if not info['time_published']:
- info['time_published'] = extract_date(extract_str(video_info.get('dateText', None)))
- for button in video_info.get('buttons', ()):
- button_renderer = button.get('slimMetadataToggleButtonRenderer', {})
-
- # all the digits can be found in the accessibility data
- count = extract_int(deep_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText', 'accessibility', 'accessibilityData', 'label'))
-
- # this count doesn't have all the digits, it's like 53K for instance
- dumb_count = extract_int(extract_str(deep_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText')))
-
- # the accessibility text will be "No likes" or "No dislikes" or something like that, but dumb count will be 0
- if dumb_count == 0:
- count = 0
+ # use dispatch table to get information scattered in various renderers
+ items, _ = extract_items(
+ response,
+ item_types=visible_extraction_dispatch.keys(),
+ search_engagement_panels=True
+ )
+ found = set()
+ for renderer in items:
+ name, renderer_content = list(renderer.items())[0]
+ found.add(name)
+ info.update(visible_extraction_dispatch[name](renderer_content))
+ # Call the function on blank dict for any that weren't found
+ # so that the empty keys get added
+ for name in visible_extraction_dispatch.keys() - found:
+ info.update(visible_extraction_dispatch[name]({}))
- if 'isLike' in button_renderer:
- info['like_count'] = count
- elif 'isDislike' in button_renderer:
- info['dislike_count'] = count
# comment section info
items, _ = extract_items(response, item_types={
@@ -274,7 +314,6 @@ def _extract_watch_info_desktop(top_level):
info = {
'comment_count': None,
'comments_disabled': None,
- 'allowed_countries': [],
'limited_state': None,
'playlist': None,
}
@@ -564,6 +603,12 @@ def extract_watch_info(polymer_json):
liberal_update(info, 'category', mf.get('category'))
liberal_update(info, 'time_published', mf.get('publishDate'))
liberal_update(info, 'time_uploaded', mf.get('uploadDate'))
+ family_safe = mf.get('isFamilySafe')
+ if family_safe is None:
+ conservative_update(info, 'age_restricted', None)
+ else:
+ conservative_update(info, 'age_restricted', not family_safe)
+ info['allowed_countries'] = mf.get('availableCountries', [])
# other stuff
info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None