author    James Taylor <user234683@users.noreply.github.com>  2019-12-19 15:50:19 -0800
committer James Taylor <user234683@users.noreply.github.com>  2019-12-19 15:50:19 -0800
commit    beb0976b5bc09a053d027a6e7020bb3a83f4aca1 (patch)
tree      3208aaa71a26457ee17cdb580de464aa108377f9 /youtube/yt_data_extract.py
parent    02848a1a3213bb4ad872865768a7b97f663a24ed (diff)
Extraction: Rewrite comment extraction; remove author_id and rename author_channel_id to author_id; fix bug in extract_items
author_id (an internal sql-like integer previously required for deleting and editing comments) has been removed by YouTube and is no longer needed, so remove it for simplicity. Rename author_channel_id to author_id for consistency with the other extraction attributes. extract_items returned None for items instead of [] for empty continuation responses; fix that.
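The caller-visible change is the shape of each returned comment dictionary. A hedged sketch of the rename (the channel ID values below are placeholders, not real data):

    # Before this commit: two ids per comment, with 'author_id' holding the
    # now-defunct sql-like integer that YouTube no longer provides.
    comment_before = {'author_channel_id': 'UCxxxxxxxxxxxxxxxxxxxxxx', 'author_id': ''}
    # After: a single 'author_id' holding the channel's browseId.
    comment_after = {'author_id': 'UCxxxxxxxxxxxxxxxxxxxxxx'}
    assert comment_after['author_id'] == comment_before['author_channel_id']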
Diffstat (limited to 'youtube/yt_data_extract.py')
-rw-r--r--  youtube/yt_data_extract.py  130
1 file changed, 59 insertions(+), 71 deletions(-)
diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py
index ac5b78b..68550cf 100644
--- a/youtube/yt_data_extract.py
+++ b/youtube/yt_data_extract.py
@@ -259,20 +259,20 @@ def extract_formatted_text(node):
         return [{'text': node['simpleText']}]
     return []
 
-def extract_int(string):
+def extract_int(string, default=None):
     if isinstance(string, int):
         return string
     if not isinstance(string, str):
         string = extract_str(string)
     if not string:
-        return None
+        return default
     match = re.search(r'(\d+)', string.replace(',', ''))
     if match is None:
-        return None
+        return default
     try:
         return int(match.group(1))
     except ValueError:
-        return None
+        return default
 
 def extract_approx_int(string):
     '''e.g. "15M" from "15M subscribers"'''
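A few illustrative calls for the new default parameter (the input strings are made up, but mirror YouTube's "View N replies" labels):

    assert extract_int('1,024 likes') == 1024        # commas stripped before matching
    assert extract_int(512) == 512                   # ints pass through unchanged
    assert extract_int('View reply') is None         # no digits: falls back to default
    assert extract_int('View reply', default=1) == 1 # caller-supplied fallback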
@@ -514,7 +514,7 @@ def extract_items(response, item_types=item_types):
     # always has just the one [something]Continuation key, but do this just in case they add some tracking key or something
     for key, renderer_continuation in get(response, 'continuationContents', {}, types=dict).items():
         if key.endswith('Continuation'): # e.g. commentSectionContinuation, playlistVideoListContinuation
-            items = multi_deep_get(renderer_continuation, ['contents'], ['items'], default=None, types=(list, tuple))
+            items = multi_deep_get(renderer_continuation, ['contents'], ['items'], default=[], types=(list, tuple))
             ctoken = deep_get(renderer_continuation, 'continuations', 0, 'nextContinuationData', 'continuation', default=None, types=str)
             return items, ctoken
     return [], None
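For context, a rough sketch of the helper semantics this hunk relies on; deep_get and multi_deep_get are defined earlier in this file and may differ in detail, so treat these as approximations:

    def deep_get_sketch(obj, *keys, default=None, types=()):
        # Walk nested dicts/lists; on any missing key/index, return default.
        for key in keys:
            try:
                obj = obj[key]
            except (KeyError, IndexError, TypeError):
                return default
        if types and not isinstance(obj, types):
            return default
        return obj

    def multi_deep_get_sketch(obj, *key_sequences, default=None, types=()):
        # Try each key sequence in turn; the first successful lookup wins.
        for keys in key_sequences:
            result = deep_get_sketch(obj, *keys, default=None, types=types)
            if result is not None:
                return result
        return default

    # With default=[] (this commit's fix), an empty continuation such as
    # {'commentSectionContinuation': {}} now makes extract_items return
    # ([], None) rather than (None, None), so callers can iterate safely.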
@@ -772,78 +772,66 @@ def ctoken_metadata(ctoken):
         result['sort'] = 0
     return result
 
-def parse_comments_polymer(polymer_json):
-    try:
-        video_title = ''
-        response, err = extract_response(polymer_json)
-        if err:
-            raise Exception(err)
-
-        try:
-            url = polymer_json[1]['url']
-        except (TypeError, IndexError, KeyError):
-            url = polymer_json['url']
+def extract_comments_info(polymer_json):
+    response, err = extract_response(polymer_json)
+    if err:
+        return {'error': err}
+    info = {'error': None}
+    url = multi_deep_get(polymer_json, [1, 'url'], ['url'])
+    if url:
         ctoken = urllib.parse.parse_qs(url[url.find('?')+1:])['ctoken'][0]
         metadata = ctoken_metadata(ctoken)
+    else:
+        metadata = {}
+    info['video_id'] = metadata.get('video_id')
+    info['offset'] = metadata.get('offset')
+    info['is_replies'] = metadata.get('is_replies')
+    info['sort'] = metadata.get('sort')
+    info['video_title'] = None
+
+    comments, ctoken = extract_items(response)
+    info['comments'] = []
+    info['ctoken'] = ctoken
+    for comment in comments:
+        comment_info = {}
+
+        if 'commentThreadRenderer' in comment: # top level comments
+            conservative_update(info, 'is_replies', False)
+            comment_thread = comment['commentThreadRenderer']
+            info['video_title'] = extract_str(comment_thread.get('commentTargetTitle'))
+            if 'replies' not in comment_thread:
+                comment_info['reply_count'] = 0
+            else:
+                comment_info['reply_count'] = extract_int(deep_get(comment_thread,
+                    'replies', 'commentRepliesRenderer', 'moreText'
+                ), default=1) # With 1 reply, the text reads "View reply"
+            comment_renderer = deep_get(comment_thread, 'comment', 'commentRenderer', default={})
+        elif 'commentRenderer' in comment: # replies
+            comment_info['reply_count'] = 0 # replyCount, below, not present for replies even if the reply has further replies to it
+            conservative_update(info, 'is_replies', True)
+            comment_renderer = comment['commentRenderer']
+        else:
+            comment_renderer = {}
-        comments_raw, ctoken = extract_items(response)
+        # These 3 are sometimes absent, likely because the channel was deleted
+        comment_info['author'] = extract_str(comment_renderer.get('authorText'))
+        comment_info['author_url'] = deep_get(comment_renderer,
+            'authorEndpoint', 'commandMetadata', 'webCommandMetadata', 'url')
+        comment_info['author_id'] = deep_get(comment_renderer,
+            'authorEndpoint', 'browseEndpoint', 'browseId')
-        comments = []
-        for comment_json in comments_raw:
-            number_of_replies = 0
-            try:
-                comment_thread = comment_json['commentThreadRenderer']
-            except KeyError:
-                comment_renderer = comment_json['commentRenderer']
-            else:
-                if 'commentTargetTitle' in comment_thread:
-                    video_title = comment_thread['commentTargetTitle']['runs'][0]['text']
-
-                if 'replies' in comment_thread:
-                    view_replies_text = extract_str(comment_thread['replies']['commentRepliesRenderer']['moreText'])
-                    view_replies_text = view_replies_text.replace(',', '')
-                    match = re.search(r'(\d+)', view_replies_text)
-                    if match is None:
-                        number_of_replies = 1
-                    else:
-                        number_of_replies = int(match.group(1))
-                comment_renderer = comment_thread['comment']['commentRenderer']
-
-            comment = {
-                'author_id': comment_renderer.get('authorId', ''),
-                'author_avatar': comment_renderer['authorThumbnail']['thumbnails'][0]['url'],
-                'like_count': comment_renderer['likeCount'],
-                'time_published': extract_str(comment_renderer['publishedTimeText']),
-                'text': comment_renderer['contentText'].get('runs', ''),
-                'reply_count': number_of_replies,
-                'id': comment_renderer['commentId'],
-            }
+        comment_info['author_avatar'] = deep_get(comment_renderer,
+            'authorThumbnail', 'thumbnails', 0, 'url')
+        comment_info['id'] = comment_renderer.get('commentId')
+        comment_info['text'] = extract_formatted_text(comment_renderer.get('contentText'))
+        comment_info['time_published'] = extract_str(comment_renderer.get('publishedTimeText'))
+        comment_info['like_count'] = comment_renderer.get('likeCount')
+        liberal_update(comment_info, 'reply_count', comment_renderer.get('replyCount'))
-            if 'authorText' in comment_renderer: # deleted channels have no name or channel link
-                comment['author'] = extract_str(comment_renderer['authorText'])
-                comment['author_url'] = comment_renderer['authorEndpoint']['commandMetadata']['webCommandMetadata']['url']
-                comment['author_channel_id'] = comment_renderer['authorEndpoint']['browseEndpoint']['browseId']
-            else:
-                comment['author'] = ''
-                comment['author_url'] = ''
-                comment['author_channel_id'] = ''
-
-            comments.append(comment)
-    except Exception as e:
-        print('Error parsing comments: ' + str(e))
-        comments = ()
-        ctoken = ''
-
-    return {
-        'ctoken': ctoken,
-        'comments': comments,
-        'video_title': video_title,
-        'video_id': metadata['video_id'],
-        'offset': metadata['offset'],
-        'is_replies': metadata['is_replies'],
-        'sort': metadata['sort'],
-    }
+        info['comments'].append(comment_info)
+
+    return info
 
 def check_missing_keys(object, *key_sequences):
     for key_sequence in key_sequences:
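The rewrite leans on two small update helpers defined earlier in this file; a minimal sketch of their assumed semantics, followed by an illustrative use of the new return shape (polymer_json is hypothetical here):

    def conservative_update(obj, key, value):
        # Fill the key only if it is missing or currently None.
        if obj.get(key) is None:
            obj[key] = value

    def liberal_update(obj, key, value):
        # Overwrite with any non-None value; ensure the key at least exists.
        if (value is not None) or (key not in obj):
            obj[key] = value

    # info = extract_comments_info(polymer_json)
    # if info['error'] is None:
    #     for comment in info['comments']:
    #         print(comment['author_id'], comment['reply_count'])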