extract_items: Handle case where continuation has multiple

[something]Continuation renderers, all of which are junk except one. Check the items in each one until the one containing the items being sought is found. The call in extract_comments_info had to be changed to specify the item types being sought. They were previously left unspecified, which is strictly incorrect, since extract_items by default looks for video/playlist/channel thumbnail items. It only worked because the continuation case was special-cased and did not filter by item type. That special case is gone now, so leaving the item types unspecified would no longer work.
author: James Taylor <user234683@users.noreply.github.com> 2020-08-11 19:59:25 -0700
committer: James Taylor <user234683@users.noreply.github.com> 2020-08-11 19:59:25 -0700
commit: fa61874f97a72c796dd9bcc2db910d51c0244339 (patch)
tree: 74c7652bd95d9114dc51182462e60cb21af66a68 /youtube/yt_data_extract
parent: 81ff5ab99ca05c4559c604ba7517d17f4bc79ea8 (diff)
download: yt-local-fa61874f97a72c796dd9bcc2db910d51c0244339.tar.lz
yt-local-fa61874f97a72c796dd9bcc2db910d51c0244339.tar.xz
yt-local-fa61874f97a72c796dd9bcc2db910d51c0244339.zip
2 files changed, 23 insertions, 11 deletions
diff --git a/youtube/yt_data_extract/common.py b/youtube/yt_data_extract/common.py
index 3b2ebb5..dd02f2e 100644
--- a/youtube/yt_data_extract/common.py
+++ b/youtube/yt_data_extract/common.py
@@ -392,6 +392,13 @@ nested_renderer_list_dispatch = {
     'playlistVideoListRenderer': _traverse_standard_list,
     'singleColumnWatchNextResults': lambda r: (deep_get(r, 'results', 'results', 'contents', default=[]), None),
 }
+def get_nested_renderer_list_function(key):
+    if key in nested_renderer_list_dispatch:
+        return nested_renderer_list_dispatch[key]
+    elif key.endswith('Continuation'):
+        return _traverse_standard_list
+    return None
+
 def extract_items_from_renderer(renderer, item_types=_item_types):
     ctoken = None
     items = []
@@ -423,13 +430,13 @@ def extract_items_from_renderer(renderer, item_types=_item_types):
             items.append(renderer)
 
         # has a list in it, add it to the iter stack
-        elif key in nested_renderer_list_dispatch:
-            renderer_list, continuation = nested_renderer_list_dispatch[key](value)
+        elif get_nested_renderer_list_function(key):
+            renderer_list, cont = get_nested_renderer_list_function(key)(value)
             if renderer_list:
                 iter_stack.append(current_iter)
                 current_iter = iter(renderer_list)
-                if continuation:
-                    ctoken = continuation
+                if cont:
+                    ctoken = cont
 
         # new renderer nested inside this one
         elif key in nested_renderer_dispatch:
@@ -441,12 +448,16 @@ def extract_items_from_renderer(renderer, item_types=_item_types):
 def extract_items(response, item_types=_item_types):
     '''return items, ctoken'''
     if 'continuationContents' in response:
-        # always has just the one [something]Continuation key, but do this just in case they add some tracking key or something
-        for key, renderer_continuation in get(response, 'continuationContents', {}).items():
-            if key.endswith('Continuation'):    # e.g. commentSectionContinuation, playlistVideoListContinuation
-                items = multi_get(renderer_continuation, 'contents', 'items', default=[])
-                ctoken = deep_get(renderer_continuation, 'continuations', 0, 'nextContinuationData', 'continuation')
-                return items, ctoken
+        # sometimes there's another, empty, junk [something]Continuation key
+        # find real one
+        for key, renderer_cont in get(response,
+                'continuationContents', {}).items():
+            # e.g. commentSectionContinuation, playlistVideoListContinuation
+            if key.endswith('Continuation'):
+                items, cont = extract_items_from_renderer({key: renderer_cont},
+                    item_types=item_types)
+                if items:
+                    return items, cont
         return [], None
     elif 'contents' in response:
         renderer = get(response, 'contents', {})
diff --git a/youtube/yt_data_extract/everything_else.py b/youtube/yt_data_extract/everything_else.py
index d1389c6..20e0f30 100644
--- a/youtube/yt_data_extract/everything_else.py
+++ b/youtube/yt_data_extract/everything_else.py
@@ -227,7 +227,8 @@ def extract_comments_info(polymer_json):
     info['sort'] = metadata.get('sort')
     info['video_title'] = None
 
-    comments, ctoken = extract_items(response)
+    comments, ctoken = extract_items(response,
+        item_types={'commentThreadRenderer', 'commentRenderer'})
     info['comments'] = []
     info['ctoken'] = ctoken
     for comment in comments:
author	James Taylor <user234683@users.noreply.github.com>	2020-08-11 19:59:25 -0700
committer	James Taylor <user234683@users.noreply.github.com>	2020-08-11 19:59:25 -0700
commit	fa61874f97a72c796dd9bcc2db910d51c0244339 (patch)
tree	74c7652bd95d9114dc51182462e60cb21af66a68 /youtube/yt_data_extract
parent	81ff5ab99ca05c4559c604ba7517d17f4bc79ea8 (diff)
download	yt-local-fa61874f97a72c796dd9bcc2db910d51c0244339.tar.lz yt-local-fa61874f97a72c796dd9bcc2db910d51c0244339.tar.xz yt-local-fa61874f97a72c796dd9bcc2db910d51c0244339.zip