From 61c50e0b540fa7ebabadb870c6aeb38b87d4912c Mon Sep 17 00:00:00 2001
From: James Taylor
Date: Thu, 19 Sep 2019 11:41:16 -0700
Subject: Extraction: Move comment extraction to yt_data_extract

---
 youtube/comments.py        |  99 +++------------------------------
 youtube/util.py            |   9 ----
 youtube/yt_data_extract.py | 113 +++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 114 insertions(+), 107 deletions(-)

diff --git a/youtube/comments.py b/youtube/comments.py
index 3b1ef86..250a95f 100644
--- a/youtube/comments.py
+++ b/youtube/comments.py
@@ -48,24 +48,6 @@ def comment_replies_ctoken(video_id, comment_id, max_results=500):
     result = proto.nested(2, proto.string(2, video_id)) + proto.uint(3,6) + proto.nested(6, params)
     return base64.urlsafe_b64encode(result).decode('ascii')
 
-def ctoken_metadata(ctoken):
-    result = dict()
-    params = proto.parse(proto.b64_to_bytes(ctoken))
-    result['video_id'] = proto.parse(params[2])[2].decode('ascii')
-
-    offset_information = proto.parse(params[6])
-    result['offset'] = offset_information.get(5, 0)
-
-    result['is_replies'] = False
-    if (3 in offset_information) and (2 in proto.parse(offset_information[3])):
-        result['is_replies'] = True
-        result['sort'] = None
-    else:
-        try:
-            result['sort'] = proto.parse(offset_information[4])[6]
-        except KeyError:
-            result['sort'] = 0
-    return result
 
 
 mobile_headers = {
@@ -91,7 +73,9 @@ def request_comments(ctoken, replies=False):
             print("got <!DOCTYPE>, retrying")
             continue
         break
-    return content
+
+    polymer_json = json.loads(util.uppercase_escape(content.decode('utf-8')))
+    return polymer_json
 
 
 def single_comment_ctoken(video_id, comment_id):
@@ -102,77 +86,6 @@ def single_comment_ctoken(video_id, comment_id):
 
 
 
-def parse_comments_polymer(content):
-    try:
-        video_title = ''
-        content = json.loads(util.uppercase_escape(content.decode('utf-8')))
-        url = content[1]['url']
-        ctoken = urllib.parse.parse_qs(url[url.find('?')+1:])['ctoken'][0]
-        metadata = ctoken_metadata(ctoken)
-
-        try:
-            comments_raw = content[1]['response']['continuationContents']['commentSectionContinuation']['items']
-        except KeyError:
-            comments_raw = content[1]['response']['continuationContents']['commentRepliesContinuation']['contents']
-
-        ctoken = util.default_multi_get(content, 1, 'response', 'continuationContents', 'commentSectionContinuation', 'continuations', 0, 'nextContinuationData', 'continuation', default='')
-
-        comments = []
-        for comment_json in comments_raw:
-            number_of_replies = 0
-            try:
-                comment_thread = comment_json['commentThreadRenderer']
-            except KeyError:
-                comment_renderer = comment_json['commentRenderer']
-            else:
-                if 'commentTargetTitle' in comment_thread:
-                    video_title = comment_thread['commentTargetTitle']['runs'][0]['text']
-
-                if 'replies' in comment_thread:
-                    view_replies_text = yt_data_extract.get_plain_text(comment_thread['replies']['commentRepliesRenderer']['moreText'])
-                    view_replies_text = view_replies_text.replace(',', '')
-                    match = re.search(r'(\d+)', view_replies_text)
-                    if match is None:
-                        number_of_replies = 1
-                    else:
-                        number_of_replies = int(match.group(1))
-                comment_renderer = comment_thread['comment']['commentRenderer']
-
-            comment = {
-                'author_id': comment_renderer.get('authorId', ''),
-                'author_avatar': comment_renderer['authorThumbnail']['thumbnails'][0]['url'],
-                'likes': comment_renderer['likeCount'],
-                'published': yt_data_extract.get_plain_text(comment_renderer['publishedTimeText']),
-                'text': comment_renderer['contentText'].get('runs', ''),
-                'number_of_replies': number_of_replies,
-                'comment_id': comment_renderer['commentId'],
-            }
-
-            if 'authorText' in comment_renderer: # deleted channels have no name or channel link
-                comment['author'] = yt_data_extract.get_plain_text(comment_renderer['authorText'])
-                comment['author_url'] = comment_renderer['authorEndpoint']['commandMetadata']['webCommandMetadata']['url']
-                comment['author_channel_id'] = comment_renderer['authorEndpoint']['browseEndpoint']['browseId']
-            else:
-                comment['author'] = ''
-                comment['author_url'] = ''
-                comment['author_channel_id'] = ''
-
-            comments.append(comment)
-    except Exception as e:
-        print('Error parsing comments: ' + str(e))
-        comments = ()
-        ctoken = ''
-
-    return {
-        'ctoken': ctoken,
-        'comments': comments,
-        'video_title': video_title,
-        'video_id': metadata['video_id'],
-        'offset': metadata['offset'],
-        'is_replies': metadata['is_replies'],
-        'sort': metadata['sort'],
-    }
-
 def post_process_comments_info(comments_info):
     for comment in comments_info['comments']:
         comment['author_url'] = util.URL_ORIGIN + comment['author_url']
@@ -207,7 +120,7 @@ def post_process_comments_info(comments_info):
         comment['likes_text'] = str(comment['likes']) + ' likes'
 
     comments_info['include_avatars'] = settings.enable_comment_avatars
-    if comments_info['ctoken'] != '':
+    if comments_info['ctoken']:
         comments_info['more_comments_url'] = util.URL_ORIGIN + '/comments?ctoken=' + comments_info['ctoken']
 
     comments_info['page_number'] = page_number = str(int(comments_info['offset']/20) + 1)
@@ -222,7 +135,7 @@ def post_process_comments_info(comments_info):
 
 def video_comments(video_id, sort=0, offset=0, lc='', secret_key=''):
     if settings.comments_mode:
-        comments_info = parse_comments_polymer(request_comments(make_comment_ctoken(video_id, sort, offset, lc, secret_key)))
+        comments_info = yt_data_extract.parse_comments_polymer(request_comments(make_comment_ctoken(video_id, sort, offset, lc, secret_key)))
         post_process_comments_info(comments_info)
 
         post_comment_url = util.URL_ORIGIN + "/post_comment?video_id=" + video_id
@@ -247,7 +160,7 @@ def get_comments_page():
         ctoken = comment_replies_ctoken(video_id, parent_id)
         replies = True
 
-    comments_info = parse_comments_polymer(request_comments(ctoken, replies))
+    comments_info = yt_data_extract.parse_comments_polymer(request_comments(ctoken, replies))
     post_process_comments_info(comments_info)
 
     if not replies:
diff --git a/youtube/util.py b/youtube/util.py
index a81ae83..5b63e2a 100644
--- a/youtube/util.py
+++ b/youtube/util.py
@@ -277,15 +277,6 @@ def video_id(url):
     url_parts = urllib.parse.urlparse(url)
     return urllib.parse.parse_qs(url_parts.query)['v'][0]
 
-def default_multi_get(object, *keys, default):
-    ''' Like dict.get(), but for nested dictionaries/sequences, supporting keys or indices. Last argument is the default value to use in case of any IndexErrors or KeyErrors '''
-    try:
-        for key in keys:
-            object = object[key]
-        return object
-    except (IndexError, KeyError):
-        return default
-
 
 # default, sddefault, mqdefault, hqdefault, hq720
 def get_thumbnail_url(video_id):
diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py
index 440cc0d..551b663 100644
--- a/youtube/yt_data_extract.py
+++ b/youtube/yt_data_extract.py
@@ -1,4 +1,4 @@
-from youtube import util
+from youtube import util, proto
 
 import html
 import json
@@ -59,10 +59,14 @@ def format_text_runs(runs):
     return result
 
 
-
-
-
-
+def default_multi_get(object, *keys, default):
+    ''' Like dict.get(), but for nested dictionaries/sequences, supporting keys or indices. Last argument is the default value to use in case of any IndexErrors or KeyErrors '''
+    try:
+        for key in keys:
+            object = object[key]
+        return object
+    except (IndexError, KeyError):
+        return default
 
 
 def get_url(node):
@@ -501,3 +505,102 @@ def extract_playlist_info(polymer_json):
 
     return info
 
+def ctoken_metadata(ctoken):
+    result = dict()
+    params = proto.parse(proto.b64_to_bytes(ctoken))
+    result['video_id'] = proto.parse(params[2])[2].decode('ascii')
+
+    offset_information = proto.parse(params[6])
+    result['offset'] = offset_information.get(5, 0)
+
+    result['is_replies'] = False
+    if (3 in offset_information) and (2 in proto.parse(offset_information[3])):
+        result['is_replies'] = True
+        result['sort'] = None
+    else:
+        try:
+            result['sort'] = proto.parse(offset_information[4])[6]
+        except KeyError:
+            result['sort'] = 0
+    return result
+
+def parse_comments_polymer(polymer_json):
+    try:
+        video_title = ''
+        response, err = get_response(polymer_json)
+        if err:
+            raise Exception(err)
+
+        try:
+            url = polymer_json[1]['url']
+        except (TypeError, IndexError, KeyError):
+            url = polymer_json['url']
+
+        ctoken = urllib.parse.parse_qs(url[url.find('?')+1:])['ctoken'][0]
+        metadata = ctoken_metadata(ctoken)
+
+        try:
+            comments_raw = response['continuationContents']['commentSectionContinuation']['items']
+        except KeyError:
+            comments_raw = response['continuationContents']['commentRepliesContinuation']['contents']
+
+        ctoken = default_multi_get(response, 'continuationContents', 'commentSectionContinuation', 'continuations', 0, 'nextContinuationData', 'continuation', default='')
+
+        comments = []
+        for comment_json in comments_raw:
+            number_of_replies = 0
+            try:
+                comment_thread = comment_json['commentThreadRenderer']
+            except KeyError:
+                comment_renderer = comment_json['commentRenderer']
+            else:
+                if 'commentTargetTitle' in comment_thread:
+                    video_title = comment_thread['commentTargetTitle']['runs'][0]['text']
+
+                if 'replies' in comment_thread:
+                    view_replies_text = get_plain_text(comment_thread['replies']['commentRepliesRenderer']['moreText'])
+                    view_replies_text = view_replies_text.replace(',', '')
+                    match = re.search(r'(\d+)', view_replies_text)
+                    if match is None:
+                        number_of_replies = 1
+                    else:
+                        number_of_replies = int(match.group(1))
+                comment_renderer = comment_thread['comment']['commentRenderer']
+
+            comment = {
+                'author_id': comment_renderer.get('authorId', ''),
+                'author_avatar': comment_renderer['authorThumbnail']['thumbnails'][0]['url'],
+                'likes': comment_renderer['likeCount'],
+                'published': get_plain_text(comment_renderer['publishedTimeText']),
+                'text': comment_renderer['contentText'].get('runs', ''),
+                'number_of_replies': number_of_replies,
+                'comment_id': comment_renderer['commentId'],
+            }
+
+            if 'authorText' in comment_renderer: # deleted channels have no name or channel link
+                comment['author'] = get_plain_text(comment_renderer['authorText'])
+                comment['author_url'] = comment_renderer['authorEndpoint']['commandMetadata']['webCommandMetadata']['url']
+                comment['author_channel_id'] = comment_renderer['authorEndpoint']['browseEndpoint']['browseId']
+            else:
+                comment['author'] = ''
+                comment['author_url'] = ''
+                comment['author_channel_id'] = ''
+
+            comments.append(comment)
+    except Exception as e:
+        print('Error parsing comments: ' + str(e))
+        comments = ()
+        ctoken = ''
+
+    return {
+        'ctoken': ctoken,
+        'comments': comments,
+        'video_title': video_title,
+        'video_id': metadata['video_id'],
+        'offset': metadata['offset'],
+        'is_replies': metadata['is_replies'],
+        'sort': metadata['sort'],
+    }
+
+
+
-- 
cgit v1.2.3
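
Usage note (not part of the patch): below is a minimal, self-contained sketch of the default_multi_get helper this commit moves from util.py into yt_data_extract.py, together with the kind of lookup parse_comments_polymer performs with it. The response dict is a hypothetical, heavily trimmed stand-in for YouTube's polymer JSON; only the helper itself comes from the diff above.

    def default_multi_get(object, *keys, default):
        ''' Like dict.get(), but for nested dictionaries/sequences, supporting keys or indices. Last argument is the default value to use in case of any IndexErrors or KeyErrors '''
        try:
            for key in keys:
                object = object[key]
            return object
        except (IndexError, KeyError):
            return default

    # Hypothetical, heavily trimmed polymer response for illustration only.
    response = {
        'continuationContents': {
            'commentSectionContinuation': {
                'continuations': [
                    {'nextContinuationData': {'continuation': 'EiYSC2FiY...'}},
                ],
            },
        },
    }

    # The same lookup parse_comments_polymer uses for the next-page ctoken:
    # one call instead of a chain of try/except KeyError/IndexError blocks,
    # and '' instead of an exception when any level of the path is missing.
    ctoken = default_multi_get(
        response, 'continuationContents', 'commentSectionContinuation',
        'continuations', 0, 'nextContinuationData', 'continuation', default='')

    assert ctoken == 'EiYSC2FiY...'
    assert default_multi_get(response, 'continuationContents', 'missing', default='') == ''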