From beb0976b5bc09a053d027a6e7020bb3a83f4aca1 Mon Sep 17 00:00:00 2001 From: James Taylor Date: Thu, 19 Dec 2019 15:50:19 -0800 Subject: Extraction: Rewrite comment extraction, remove author_id and rename author_channel_id to that, fix bug in extract_items author_id (an internal sql-like integer previously required for deleting and editing comments) has been removed by Youtube and is no longer required. Remove it for simplicity. Rename author_channel_id to author_id for consistency with other extraction attributes. extract_items returned None for items instead of [] for empty continuation responses. Fixes that. --- youtube/comments.py | 9 ++-- youtube/post_comment.py | 8 +-- youtube/yt_data_extract.py | 130 ++++++++++++++++++++------------------------- 3 files changed, 67 insertions(+), 80 deletions(-) diff --git a/youtube/comments.py b/youtube/comments.py index e237f0f..4e79d8b 100644 --- a/youtube/comments.py +++ b/youtube/comments.py @@ -93,11 +93,10 @@ def post_process_comments_info(comments_info): comment['permalink'] = util.URL_ORIGIN + '/watch?v=' + comments_info['video_id'] + '&lc=' + comment['id'] - if comment['author_channel_id'] in accounts.accounts: + if comment['author_id'] in accounts.accounts: comment['delete_url'] = (util.URL_ORIGIN + '/delete_comment?video_id=' + comments_info['video_id'] - + '&channel_id='+ comment['author_channel_id'] - + '&author_id=' + comment['author_id'] + + '&channel_id='+ comment['author_id'] + '&comment_id=' + comment['id']) reply_count = comment['reply_count'] @@ -135,7 +134,7 @@ def post_process_comments_info(comments_info): def video_comments(video_id, sort=0, offset=0, lc='', secret_key=''): if settings.comments_mode: - comments_info = yt_data_extract.parse_comments_polymer(request_comments(make_comment_ctoken(video_id, sort, offset, lc, secret_key))) + comments_info = yt_data_extract.extract_comments_info(request_comments(make_comment_ctoken(video_id, sort, offset, lc, secret_key))) post_process_comments_info(comments_info) post_comment_url = util.URL_ORIGIN + "/post_comment?video_id=" + video_id @@ -160,7 +159,7 @@ def get_comments_page(): ctoken = comment_replies_ctoken(video_id, parent_id) replies = True - comments_info = yt_data_extract.parse_comments_polymer(request_comments(ctoken, replies)) + comments_info = yt_data_extract.extract_comments_info(request_comments(ctoken, replies)) post_process_comments_info(comments_info) if not replies: diff --git a/youtube/post_comment.py b/youtube/post_comment.py index 25d0e3a..78f080f 100644 --- a/youtube/post_comment.py +++ b/youtube/post_comment.py @@ -70,7 +70,7 @@ def _post_comment_reply(text, video_id, parent_comment_id, session_token, cookie print("Comment posting code: " + code) return code -def _delete_comment(video_id, comment_id, author_id, session_token, cookiejar): +def _delete_comment(video_id, comment_id, session_token, cookiejar): headers = { 'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1', 'Accept': '*/*', @@ -79,7 +79,7 @@ def _delete_comment(video_id, comment_id, author_id, session_token, cookiejar): 'X-YouTube-Client-Version': '2.20180823', 'Content-Type': 'application/x-www-form-urlencoded', } - action = proto.uint(1,6) + proto.string(3, comment_id) + proto.string(5, video_id) + proto.string(9, author_id) + action = proto.uint(1,6) + proto.string(3, comment_id) + proto.string(5, video_id) action = proto.percent_b64encode(action).decode('ascii') sej = json.dumps({"clickTrackingParams":"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=","commandMetadata":{"webCommandMetadata":{"url":"/service_ajax","sendPost":True}},"performCommentActionEndpoint":{"action":action}}) @@ -115,7 +115,7 @@ def delete_comment(): cookiejar = accounts.account_cookiejar(request.values['channel_id']) token = get_session_token(video_id, cookiejar) - code = _delete_comment(video_id, request.values['comment_id'], request.values['author_id'], token, cookiejar) + code = _delete_comment(video_id, request.values['comment_id'], token, cookiejar) if code == "SUCCESS": return flask.redirect(util.URL_ORIGIN + '/comment_delete_success', 303) @@ -147,7 +147,7 @@ def post_comment(): @yt_app.route('/delete_comment', methods=['GET']) def get_delete_comment_page(): - parameters = [(parameter_name, request.args[parameter_name]) for parameter_name in ('video_id', 'channel_id', 'author_id', 'comment_id')] + parameters = [(parameter_name, request.args[parameter_name]) for parameter_name in ('video_id', 'channel_id', 'comment_id')] return flask.render_template('delete_comment.html', parameters = parameters) diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py index ac5b78b..68550cf 100644 --- a/youtube/yt_data_extract.py +++ b/youtube/yt_data_extract.py @@ -259,20 +259,20 @@ def extract_formatted_text(node): return [{'text': node['simpleText']}] return [] -def extract_int(string): +def extract_int(string, default=None): if isinstance(string, int): return string if not isinstance(string, str): string = extract_str(string) if not string: - return None + return default match = re.search(r'(\d+)', string.replace(',', '')) if match is None: - return None + return default try: return int(match.group(1)) except ValueError: - return None + return default def extract_approx_int(string): '''e.g. "15M" from "15M subscribers"''' @@ -514,7 +514,7 @@ def extract_items(response, item_types=item_types): # always has just the one [something]Continuation key, but do this just in case they add some tracking key or something for key, renderer_continuation in get(response, 'continuationContents', {}, types=dict).items(): if key.endswith('Continuation'): # e.g. commentSectionContinuation, playlistVideoListContinuation - items = multi_deep_get(renderer_continuation, ['contents'], ['items'], default=None, types=(list, tuple)) + items = multi_deep_get(renderer_continuation, ['contents'], ['items'], default=[], types=(list, tuple)) ctoken = deep_get(renderer_continuation, 'continuations', 0, 'nextContinuationData', 'continuation', default=None, types=str) return items, ctoken return [], None @@ -772,78 +772,66 @@ def ctoken_metadata(ctoken): result['sort'] = 0 return result -def parse_comments_polymer(polymer_json): - try: - video_title = '' - response, err = extract_response(polymer_json) - if err: - raise Exception(err) - - try: - url = polymer_json[1]['url'] - except (TypeError, IndexError, KeyError): - url = polymer_json['url'] +def extract_comments_info(polymer_json): + response, err = extract_response(polymer_json) + if err: + return {'error': err} + info = {'error': None} + url = multi_deep_get(polymer_json, [1, 'url'], ['url']) + if url: ctoken = urllib.parse.parse_qs(url[url.find('?')+1:])['ctoken'][0] metadata = ctoken_metadata(ctoken) + else: + metadata = {} + info['video_id'] = metadata.get('video_id') + info['offset'] = metadata.get('offset') + info['is_replies'] = metadata.get('is_replies') + info['sort'] = metadata.get('sort') + info['video_title'] = None + + comments, ctoken = extract_items(response) + info['comments'] = [] + info['ctoken'] = ctoken + for comment in comments: + comment_info = {} + + if 'commentThreadRenderer' in comment: # top level comments + conservative_update(info, 'is_replies', False) + comment_thread = comment['commentThreadRenderer'] + info['video_title'] = extract_str(comment_thread.get('commentTargetTitle')) + if 'replies' not in comment_thread: + comment_info['reply_count'] = 0 + else: + comment_info['reply_count'] = extract_int(deep_get(comment_thread, + 'replies', 'commentRepliesRenderer', 'moreText' + ), default=1) # With 1 reply, the text reads "View reply" + comment_renderer = deep_get(comment_thread, 'comment', 'commentRenderer', default={}) + elif 'commentRenderer' in comment: # replies + comment_info['reply_count'] = 0 # replyCount, below, not present for replies even if the reply has further replies to it + conservative_update(info, 'is_replies', True) + comment_renderer = comment['commentRenderer'] + else: + comment_renderer = {} - comments_raw, ctoken = extract_items(response) + # These 3 are sometimes absent, likely because the channel was deleted + comment_info['author'] = extract_str(comment_renderer.get('authorText')) + comment_info['author_url'] = deep_get(comment_renderer, + 'authorEndpoint', 'commandMetadata', 'webCommandMetadata', 'url') + comment_info['author_id'] = deep_get(comment_renderer, + 'authorEndpoint', 'browseEndpoint', 'browseId') - comments = [] - for comment_json in comments_raw: - number_of_replies = 0 - try: - comment_thread = comment_json['commentThreadRenderer'] - except KeyError: - comment_renderer = comment_json['commentRenderer'] - else: - if 'commentTargetTitle' in comment_thread: - video_title = comment_thread['commentTargetTitle']['runs'][0]['text'] - - if 'replies' in comment_thread: - view_replies_text = extract_str(comment_thread['replies']['commentRepliesRenderer']['moreText']) - view_replies_text = view_replies_text.replace(',', '') - match = re.search(r'(\d+)', view_replies_text) - if match is None: - number_of_replies = 1 - else: - number_of_replies = int(match.group(1)) - comment_renderer = comment_thread['comment']['commentRenderer'] - - comment = { - 'author_id': comment_renderer.get('authorId', ''), - 'author_avatar': comment_renderer['authorThumbnail']['thumbnails'][0]['url'], - 'like_count': comment_renderer['likeCount'], - 'time_published': extract_str(comment_renderer['publishedTimeText']), - 'text': comment_renderer['contentText'].get('runs', ''), - 'reply_count': number_of_replies, - 'id': comment_renderer['commentId'], - } + comment_info['author_avatar'] = deep_get(comment_renderer, + 'authorThumbnail', 'thumbnails', 0, 'url') + comment_info['id'] = comment_renderer.get('commentId') + comment_info['text'] = extract_formatted_text(comment_renderer.get('contentText')) + comment_info['time_published'] = extract_str(comment_renderer.get('publishedTimeText')) + comment_info['like_count'] = comment_renderer.get('likeCount') + liberal_update(comment_info, 'reply_count', comment_renderer.get('replyCount')) - if 'authorText' in comment_renderer: # deleted channels have no name or channel link - comment['author'] = extract_str(comment_renderer['authorText']) - comment['author_url'] = comment_renderer['authorEndpoint']['commandMetadata']['webCommandMetadata']['url'] - comment['author_channel_id'] = comment_renderer['authorEndpoint']['browseEndpoint']['browseId'] - else: - comment['author'] = '' - comment['author_url'] = '' - comment['author_channel_id'] = '' - - comments.append(comment) - except Exception as e: - print('Error parsing comments: ' + str(e)) - comments = () - ctoken = '' - - return { - 'ctoken': ctoken, - 'comments': comments, - 'video_title': video_title, - 'video_id': metadata['video_id'], - 'offset': metadata['offset'], - 'is_replies': metadata['is_replies'], - 'sort': metadata['sort'], - } + info['comments'].append(comment_info) + + return info def check_missing_keys(object, *key_sequences): for key_sequence in key_sequences: -- cgit v1.2.3