author    James Taylor <user234683@users.noreply.github.com>    2019-12-19 21:33:54 -0800
committer James Taylor <user234683@users.noreply.github.com>    2019-12-19 21:33:54 -0800
commit    b4406df9cf33c53b6e942e6a5c72d955f57c4b5f (patch)
tree      4de0082ac9eb26a05188dd424835ea50b1483113 /youtube/yt_data_extract/everything_else.py
parent    b614fcdb8579ba29fccfa47eab1e2965cfb0beaa (diff)
parent    6b7a1212e30b713453aa7d2b3a7122e97689dad0 (diff)
Merge branch 'modular-data-extract'
Commits in this branch are prefixed with "Extraction:"

This branch refactors data extraction. All such functionality has been
moved to the yt_data_extract module. Responses from requests are given
to the module, and it parses them into a consistent, more useful format.

The dependency on youtube-dl has also been dropped, and this
functionality has been built from scratch, for these reasons:

(1) I've noticed youtube-dl breaks more often than invidious (which
    uses watch page extraction built from scratch) in response to
    changes from YouTube, so I'm hoping what I wrote will also be less
    brittle.

(2) Such breakage is inconvenient because I have to merge the fixes
    manually, since I had to change youtube-dl to do things such as
    extracting related videos.

(3) I have no control over error handling and request pooling with
    youtube-dl, since it makes all the requests itself (changing this
    would require intrusive modifications I don't want to maintain).

(4) I will now finally be able to display the number of comments, and
    whether comments are disabled, without making additional requests.
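For illustration, typical use from a request handler looks roughly like
the following (a minimal sketch: the request itself and
`polymer_response_text` are hypothetical stand-ins, and it assumes the
package re-exports the extraction functions defined below):

    import json
    from youtube import yt_data_extract

    # The caller makes the request; yt_data_extract only parses responses.
    polymer_json = json.loads(polymer_response_text)
    channel_info = yt_data_extract.extract_channel_info(polymer_json, 'videos')
    if channel_info['error'] is not None:
        # Every extraction function reports failure through the same
        # 'error' key, so handlers can check it uniformly.
        display_error(channel_info['error'])  # hypothetical handler helper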
Diffstat (limited to 'youtube/yt_data_extract/everything_else.py')
-rw-r--r--    youtube/yt_data_extract/everything_else.py    273
1 file changed, 273 insertions, 0 deletions
diff --git a/youtube/yt_data_extract/everything_else.py b/youtube/yt_data_extract/everything_else.py
new file mode 100644
index 0000000..6dc5248
--- /dev/null
+++ b/youtube/yt_data_extract/everything_else.py
@@ -0,0 +1,273 @@
+from .common import (get, multi_get, deep_get, multi_deep_get,
+    liberal_update, conservative_update, remove_redirect, normalize_url,
+    extract_str, extract_formatted_text, extract_int, extract_approx_int,
+    extract_date, check_missing_keys, extract_item_info, extract_items,
+    extract_response)
+from youtube import proto
+
+import re
+import urllib.parse  # the parse submodule must be imported explicitly
+from math import ceil
+
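+# Parses a channel page. polymer_json is the deserialized JSON response;
+# tab is one of 'videos', 'playlists', 'search', or 'about'. Returns a
+# dict whose 'error' key is None on success and a message on failure.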
+def extract_channel_info(polymer_json, tab):
+    response, err = extract_response(polymer_json)
+    if err:
+        return {'error': err}
+
+    try:
+        microformat = response['microformat']['microformatDataRenderer']
+
+    # channel doesn't exist or was terminated
+    # example terminated channel: https://www.youtube.com/channel/UCnKJeK_r90jDdIuzHXC0Org
+    except KeyError:
+        if 'alerts' in response and len(response['alerts']) > 0:
+            return {'error': ' '.join(alert['alertRenderer']['text']['simpleText'] for alert in response['alerts'])}
+        elif 'errors' in response.get('responseContext', {}):
+            for error in response['responseContext']['errors']['error']:
+                if error['code'] == 'INVALID_VALUE' and error['location'] == 'browse_id':
+                    return {'error': 'This channel does not exist'}
+        return {'error': 'Failure getting microformat'}
+
+    info = {'error': None}
+    info['current_tab'] = tab
+
+    # stuff from microformat (info given by youtube for every page on channel)
+    info['short_description'] = microformat['description']
+    info['channel_name'] = microformat['title']
+    info['avatar'] = microformat['thumbnail']['thumbnails'][0]['url']
+    channel_url = microformat['urlCanonical'].rstrip('/')
+    channel_id = channel_url[channel_url.rfind('/')+1:]
+    info['channel_id'] = channel_id
+    info['channel_url'] = 'https://www.youtube.com/channel/' + channel_id
+
+    info['items'] = []
+
+    # empty channel
+    if 'contents' not in response and 'continuationContents' not in response:
+        return info
+
+    items, _ = extract_items(response)
+    if tab in ('videos', 'playlists', 'search'):
+        additional_info = {
+            'author': info['channel_name'],
+            'author_url': 'https://www.youtube.com/channel/' + channel_id,
+        }
+        info['items'] = [extract_item_info(renderer, additional_info) for renderer in items]
+
+    elif tab == 'about':
+        for item in items:
+            try:
+                channel_metadata = item['channelAboutFullMetadataRenderer']
+                break
+            except KeyError:
+                pass
+        else:
+            info['error'] = 'Could not find channelAboutFullMetadataRenderer'
+            return info
+
+        info['links'] = []
+        for link_json in channel_metadata.get('primaryLinks', ()):
+            url = remove_redirect(link_json['navigationEndpoint']['urlEndpoint']['url'])
+            text = extract_str(link_json['title'])
+            info['links'].append((text, url))
+
+        info['stats'] = []
+        for stat_name in ('subscriberCountText', 'joinedDateText', 'viewCountText', 'country'):
+            try:
+                stat = channel_metadata[stat_name]
+            except KeyError:
+                continue
+            info['stats'].append(extract_str(stat))
+
+        if 'description' in channel_metadata:
+            info['description'] = extract_str(channel_metadata['description'])
+        else:
+            info['description'] = ''
+
+    else:
+        raise NotImplementedError('Unknown or unsupported channel tab: ' + tab)
+
+    return info
+
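+# Parses a search results page, including any "did you mean" /
+# "showing results for" corrections. The page count is estimated from
+# the result count, assuming 20 results per page.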
+def extract_search_info(polymer_json):
+    response, err = extract_response(polymer_json)
+    if err:
+        return {'error': err}
+    info = {'error': None}
+    info['estimated_results'] = int(response['estimatedResults'])
+    info['estimated_pages'] = ceil(info['estimated_results']/20)
+
+    results, _ = extract_items(response)
+
+    info['items'] = []
+    info['corrections'] = {'type': None}
+    for renderer in results:
+        renderer_type = next(iter(renderer))
+        if renderer_type == 'shelfRenderer':
+            continue
+        if renderer_type == 'didYouMeanRenderer':
+            renderer = renderer[renderer_type]
+
+            info['corrections'] = {
+                'type': 'did_you_mean',
+                'corrected_query': renderer['correctedQueryEndpoint']['searchEndpoint']['query'],
+                'corrected_query_text': renderer['correctedQuery']['runs'],
+            }
+            continue
+        if renderer_type == 'showingResultsForRenderer':
+            renderer = renderer[renderer_type]
+
+            info['corrections'] = {
+                'type': 'showing_results_for',
+                'corrected_query_text': renderer['correctedQuery']['runs'],
+                'original_query_text': renderer['originalQuery']['simpleText'],
+            }
+            continue
+
+        i_info = extract_item_info(renderer)
+        if i_info.get('type') != 'unsupported':
+            info['items'].append(i_info)
+
+    return info
+
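+# Extracts playlist metadata (title, author, counts, thumbnail) from the
+# playlistHeaderRenderer, which is only present on the first page of a
+# playlist response.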
+def extract_playlist_metadata(polymer_json):
+    response, err = extract_response(polymer_json)
+    if err:
+        return {'error': err}
+
+    metadata = {'error': None}
+    header = deep_get(response, 'header', 'playlistHeaderRenderer', default={})
+    metadata['title'] = extract_str(header.get('title'))
+
+    metadata['first_video_id'] = deep_get(header, 'playEndpoint', 'watchEndpoint', 'videoId')
+    # video ids are 11 characters drawn from [a-zA-Z0-9_-]
+    first_id = re.search(r'([a-zA-Z0-9_\-]{11})', deep_get(header,
+        'thumbnail', 'thumbnails', 0, 'url', default=''))
+    if first_id:
+        conservative_update(metadata, 'first_video_id', first_id.group(1))
+    if metadata['first_video_id'] is None:
+        metadata['thumbnail'] = None
+    else:
+        metadata['thumbnail'] = 'https://i.ytimg.com/vi/' + metadata['first_video_id'] + '/mqdefault.jpg'
+
+    metadata['video_count'] = extract_int(header.get('numVideosText'))
+    metadata['description'] = extract_str(header.get('descriptionText'), default='')
+    metadata['author'] = extract_str(header.get('ownerText'))
+    metadata['author_id'] = multi_deep_get(header,
+        ['ownerText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'],
+        ['ownerEndpoint', 'browseEndpoint', 'browseId'])
+    if metadata['author_id']:
+        metadata['author_url'] = 'https://www.youtube.com/channel/' + metadata['author_id']
+    else:
+        metadata['author_url'] = None
+    metadata['view_count'] = extract_int(header.get('viewCountText'))
+    metadata['like_count'] = extract_int(header.get('likesCountWithoutLikeText'))
+    for stat in header.get('stats', ()):
+        text = extract_str(stat, default='')
+        if 'videos' in text:
+            conservative_update(metadata, 'video_count', extract_int(text))
+        elif 'views' in text:
+            conservative_update(metadata, 'view_count', extract_int(text))
+        elif 'updated' in text:
+            metadata['time_published'] = extract_date(text)
+
+    return metadata
+
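+# Parses one page of a playlist. The metadata is only extracted on the
+# first page, since continuation responses lack the header it comes from.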
+def extract_playlist_info(polymer_json):
+    response, err = extract_response(polymer_json)
+    if err:
+        return {'error': err}
+    info = {'error': None}
+    first_page = 'continuationContents' not in response
+    video_list, _ = extract_items(response)
+
+    info['items'] = [extract_item_info(renderer) for renderer in video_list]
+
+    if first_page:
+        info['metadata'] = extract_playlist_metadata(polymer_json)
+
+    return info
+
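+# Decodes a comments continuation token: a base64-encoded protobuf
+# message carrying the video id, offset, whether these are replies,
+# and the sort order.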
+def _ctoken_metadata(ctoken):
+    result = {}
+    params = proto.parse(proto.b64_to_bytes(ctoken))
+    result['video_id'] = proto.parse(params[2])[2].decode('ascii')
+
+    offset_information = proto.parse(params[6])
+    result['offset'] = offset_information.get(5, 0)
+
+    result['is_replies'] = False
+    if (3 in offset_information) and (2 in proto.parse(offset_information[3])):
+        result['is_replies'] = True
+        result['sort'] = None
+    else:
+        try:
+            result['sort'] = proto.parse(offset_information[4])[6]
+        except KeyError:
+            result['sort'] = 0
+    return result
+
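+# Parses a comments ajax response. video_id, offset, is_replies, and
+# sort are taken from the ctoken in the response's url when present,
+# since the comment listing itself does not reliably include them.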
+def extract_comments_info(polymer_json):
+    response, err = extract_response(polymer_json)
+    if err:
+        return {'error': err}
+    info = {'error': None}
+
+    url = multi_deep_get(polymer_json, [1, 'url'], ['url'])
+    if url:
+        ctoken = urllib.parse.parse_qs(url[url.find('?')+1:])['ctoken'][0]
+        metadata = _ctoken_metadata(ctoken)
+    else:
+        metadata = {}
+    info['video_id'] = metadata.get('video_id')
+    info['offset'] = metadata.get('offset')
+    info['is_replies'] = metadata.get('is_replies')
+    info['sort'] = metadata.get('sort')
+    info['video_title'] = None
+
+    comments, ctoken = extract_items(response)
+    info['comments'] = []
+    info['ctoken'] = ctoken
+    for comment in comments:
+        comment_info = {}
+
+        if 'commentThreadRenderer' in comment:  # top level comments
+            conservative_update(info, 'is_replies', False)
+            comment_thread = comment['commentThreadRenderer']
+            info['video_title'] = extract_str(comment_thread.get('commentTargetTitle'))
+            if 'replies' not in comment_thread:
+                comment_info['reply_count'] = 0
+            else:
+                comment_info['reply_count'] = extract_int(deep_get(comment_thread,
+                    'replies', 'commentRepliesRenderer', 'moreText'
+                ), default=1)  # with one reply, the text reads "View reply" with no number
+            comment_renderer = deep_get(comment_thread, 'comment', 'commentRenderer', default={})
+        elif 'commentRenderer' in comment:  # replies
+            # replyCount (extracted below) is absent for replies, even
+            # when a reply has further replies of its own
+            comment_info['reply_count'] = 0
+            conservative_update(info, 'is_replies', True)
+            comment_renderer = comment['commentRenderer']
+        else:
+            comment_renderer = {}
+
+        # These 3 are sometimes absent, likely because the channel was deleted
+        comment_info['author'] = extract_str(comment_renderer.get('authorText'))
+        comment_info['author_url'] = deep_get(comment_renderer,
+            'authorEndpoint', 'commandMetadata', 'webCommandMetadata', 'url')
+        comment_info['author_id'] = deep_get(comment_renderer,
+            'authorEndpoint', 'browseEndpoint', 'browseId')
+
+        comment_info['author_avatar'] = deep_get(comment_renderer,
+            'authorThumbnail', 'thumbnails', 0, 'url')
+        comment_info['id'] = comment_renderer.get('commentId')
+        comment_info['text'] = extract_formatted_text(comment_renderer.get('contentText'))
+        comment_info['time_published'] = extract_str(comment_renderer.get('publishedTimeText'))
+        comment_info['like_count'] = comment_renderer.get('likeCount')
+        liberal_update(comment_info, 'reply_count', comment_renderer.get('replyCount'))
+
+        info['comments'].append(comment_info)
+
+    return info