From beb0976b5bc09a053d027a6e7020bb3a83f4aca1 Mon Sep 17 00:00:00 2001
From: James Taylor <user234683@users.noreply.github.com>
Date: Thu, 19 Dec 2019 15:50:19 -0800
Subject: Extraction: Rewrite comment extraction, remove author_id and rename
 author_channel_id to that, fix bug in extract_items

author_id (an internal SQL-like integer previously required for deleting and editing comments) has been removed by YouTube and is no longer required.
Remove it for simplicity.
Rename author_channel_id to author_id for consistency with other extraction attributes.
extract_items returned None instead of [] as the items value for empty continuation responses; this commit fixes that.
---
 youtube/yt_data_extract.py | 130 ++++++++++++++++++++-------------------------
 1 file changed, 59 insertions(+), 71 deletions(-)

(limited to 'youtube/yt_data_extract.py')

diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py
index ac5b78b..68550cf 100644
--- a/youtube/yt_data_extract.py
+++ b/youtube/yt_data_extract.py
@@ -259,20 +259,20 @@ def extract_formatted_text(node):
         return [{'text': node['simpleText']}]
     return []
 
-def extract_int(string):
+def extract_int(string, default=None):
     if isinstance(string, int):
         return string
     if not isinstance(string, str):
         string = extract_str(string)
     if not string:
-        return None
+        return default
     match = re.search(r'(\d+)', string.replace(',', ''))
     if match is None:
-        return None
+        return default
     try:
         return int(match.group(1))
     except ValueError:
-        return None
+        return default
 
 def extract_approx_int(string):
     '''e.g. "15M" from "15M subscribers"'''
@@ -514,7 +514,7 @@ def extract_items(response, item_types=item_types):
         # always has just the one [something]Continuation key, but do this just in case they add some tracking key or something
         for key, renderer_continuation in get(response, 'continuationContents', {}, types=dict).items():
             if key.endswith('Continuation'):    # e.g. commentSectionContinuation, playlistVideoListContinuation
-                items = multi_deep_get(renderer_continuation, ['contents'], ['items'], default=None, types=(list, tuple))
+                items = multi_deep_get(renderer_continuation, ['contents'], ['items'], default=[], types=(list, tuple))
                 ctoken = deep_get(renderer_continuation, 'continuations', 0, 'nextContinuationData', 'continuation', default=None, types=str)
                 return items, ctoken
         return [], None
@@ -772,78 +772,66 @@ def ctoken_metadata(ctoken):
             result['sort'] = 0
     return result
 
-def parse_comments_polymer(polymer_json):
-    try:
-        video_title = ''
-        response, err = extract_response(polymer_json)
-        if err:
-            raise Exception(err)
-
-        try:
-            url = polymer_json[1]['url']
-        except (TypeError, IndexError, KeyError):
-            url = polymer_json['url']
+def extract_comments_info(polymer_json):
+    response, err = extract_response(polymer_json)
+    if err:
+        return {'error': err}
+    info = {'error': None}
 
+    url = multi_deep_get(polymer_json, [1, 'url'], ['url'])
+    if url:
         ctoken = urllib.parse.parse_qs(url[url.find('?')+1:])['ctoken'][0]
         metadata = ctoken_metadata(ctoken)
+    else:
+        metadata = {}
+    info['video_id'] = metadata.get('video_id')
+    info['offset'] = metadata.get('offset')
+    info['is_replies'] = metadata.get('is_replies')
+    info['sort'] = metadata.get('sort')
+    info['video_title'] = None
+
+    comments, ctoken = extract_items(response)
+    info['comments'] = []
+    info['ctoken'] = ctoken
+    for comment in comments:
+        comment_info = {}
+
+        if 'commentThreadRenderer' in comment:  # top level comments
+            conservative_update(info, 'is_replies', False)
+            comment_thread  = comment['commentThreadRenderer']
+            info['video_title'] = extract_str(comment_thread.get('commentTargetTitle'))
+            if 'replies' not in comment_thread:
+                comment_info['reply_count'] = 0
+            else:
+                comment_info['reply_count'] = extract_int(deep_get(comment_thread,
+                    'replies', 'commentRepliesRenderer', 'moreText'
+                ), default=1)   # With 1 reply, the text reads "View reply"
+            comment_renderer = deep_get(comment_thread, 'comment', 'commentRenderer', default={})
+        elif 'commentRenderer' in comment:  # replies
+            comment_info['reply_count'] = 0     # replyCount, below, not present for replies even if the reply has further replies to it
+            conservative_update(info, 'is_replies', True)
+            comment_renderer = comment['commentRenderer']
+        else:
+            comment_renderer = {}
 
-        comments_raw, ctoken = extract_items(response)
+        # These 3 are sometimes absent, likely because the channel was deleted
+        comment_info['author'] = extract_str(comment_renderer.get('authorText'))
+        comment_info['author_url'] = deep_get(comment_renderer,
+            'authorEndpoint', 'commandMetadata', 'webCommandMetadata', 'url')
+        comment_info['author_id'] = deep_get(comment_renderer,
+            'authorEndpoint', 'browseEndpoint', 'browseId')
 
-        comments = []
-        for comment_json in comments_raw:
-            number_of_replies = 0
-            try:
-                comment_thread = comment_json['commentThreadRenderer']
-            except KeyError:
-                comment_renderer = comment_json['commentRenderer']
-            else:
-                if 'commentTargetTitle' in comment_thread:
-                    video_title = comment_thread['commentTargetTitle']['runs'][0]['text']
-
-                if 'replies' in comment_thread:
-                    view_replies_text = extract_str(comment_thread['replies']['commentRepliesRenderer']['moreText'])
-                    view_replies_text = view_replies_text.replace(',', '')
-                    match = re.search(r'(\d+)', view_replies_text)
-                    if match is None:
-                        number_of_replies = 1
-                    else:
-                        number_of_replies = int(match.group(1))
-                comment_renderer = comment_thread['comment']['commentRenderer']
-
-            comment = {
-                'author_id': comment_renderer.get('authorId', ''),
-                'author_avatar': comment_renderer['authorThumbnail']['thumbnails'][0]['url'],
-                'like_count': comment_renderer['likeCount'],
-                'time_published': extract_str(comment_renderer['publishedTimeText']),
-                'text': comment_renderer['contentText'].get('runs', ''),
-                'reply_count': number_of_replies,
-                'id': comment_renderer['commentId'],
-            }
+        comment_info['author_avatar'] = deep_get(comment_renderer,
+            'authorThumbnail', 'thumbnails', 0, 'url')
+        comment_info['id'] = comment_renderer.get('commentId')
+        comment_info['text'] = extract_formatted_text(comment_renderer.get('contentText'))
+        comment_info['time_published'] = extract_str(comment_renderer.get('publishedTimeText'))
+        comment_info['like_count'] = comment_renderer.get('likeCount')
+        liberal_update(comment_info, 'reply_count', comment_renderer.get('replyCount'))
 
-            if 'authorText' in comment_renderer:     # deleted channels have no name or channel link
-                comment['author'] = extract_str(comment_renderer['authorText'])
-                comment['author_url'] = comment_renderer['authorEndpoint']['commandMetadata']['webCommandMetadata']['url']
-                comment['author_channel_id'] = comment_renderer['authorEndpoint']['browseEndpoint']['browseId']
-            else:
-                comment['author'] = ''
-                comment['author_url'] = ''
-                comment['author_channel_id'] = ''
-
-            comments.append(comment)
-    except Exception as e:
-        print('Error parsing comments: ' + str(e))
-        comments = ()
-        ctoken = ''
-
-    return {
-        'ctoken': ctoken,
-        'comments': comments,
-        'video_title': video_title,
-        'video_id': metadata['video_id'],
-        'offset': metadata['offset'],
-        'is_replies': metadata['is_replies'],
-        'sort': metadata['sort'],
-    }
+        info['comments'].append(comment_info)
+
+    return info
 
 def check_missing_keys(object, *key_sequences):
     for key_sequence in key_sequences:
-- 
cgit v1.2.3