diff options
author | James Taylor <user234683@users.noreply.github.com> | 2020-08-11 19:59:25 -0700 |
---|---|---|
committer | James Taylor <user234683@users.noreply.github.com> | 2020-08-11 19:59:25 -0700 |
commit | fa61874f97a72c796dd9bcc2db910d51c0244339 (patch) | |
tree | 74c7652bd95d9114dc51182462e60cb21af66a68 | |
parent | 81ff5ab99ca05c4559c604ba7517d17f4bc79ea8 (diff) | |
download | yt-local-fa61874f97a72c796dd9bcc2db910d51c0244339.tar.lz yt-local-fa61874f97a72c796dd9bcc2db910d51c0244339.tar.xz yt-local-fa61874f97a72c796dd9bcc2db910d51c0244339.zip |
extract_items: Handle case where continuation has multiple
[something]Continuation renderers, all of which are junk
except one. Check the items in each one until the one which
contains the items being sought is found.
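The call to extract_items in extract_comments_info had to be changed
to specify the item types being sought. They were previously left
unspecified, which is strictly incorrect since extract_items by default
looks for video/playlist/channel thumbnail items. The old call only
worked because the continuation special case returned whatever was in
'contents'/'items' without checking item_types. Now that continuations
are filtered through extract_items_from_renderer, that no longer works.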
-rw-r--r-- | youtube/yt_data_extract/common.py | 31 | ||||
-rw-r--r-- | youtube/yt_data_extract/everything_else.py | 3 |
2 files changed, 23 insertions, 11 deletions
diff --git a/youtube/yt_data_extract/common.py b/youtube/yt_data_extract/common.py index 3b2ebb5..dd02f2e 100644 --- a/youtube/yt_data_extract/common.py +++ b/youtube/yt_data_extract/common.py @@ -392,6 +392,13 @@ nested_renderer_list_dispatch = { 'playlistVideoListRenderer': _traverse_standard_list, 'singleColumnWatchNextResults': lambda r: (deep_get(r, 'results', 'results', 'contents', default=[]), None), } +def get_nested_renderer_list_function(key): + if key in nested_renderer_list_dispatch: + return nested_renderer_list_dispatch[key] + elif key.endswith('Continuation'): + return _traverse_standard_list + return None + def extract_items_from_renderer(renderer, item_types=_item_types): ctoken = None items = [] @@ -423,13 +430,13 @@ def extract_items_from_renderer(renderer, item_types=_item_types): items.append(renderer) # has a list in it, add it to the iter stack - elif key in nested_renderer_list_dispatch: - renderer_list, continuation = nested_renderer_list_dispatch[key](value) + elif get_nested_renderer_list_function(key): + renderer_list, cont = get_nested_renderer_list_function(key)(value) if renderer_list: iter_stack.append(current_iter) current_iter = iter(renderer_list) - if continuation: - ctoken = continuation + if cont: + ctoken = cont # new renderer nested inside this one elif key in nested_renderer_dispatch: @@ -441,12 +448,16 @@ def extract_items_from_renderer(renderer, item_types=_item_types): def extract_items(response, item_types=_item_types): '''return items, ctoken''' if 'continuationContents' in response: - # always has just the one [something]Continuation key, but do this just in case they add some tracking key or something - for key, renderer_continuation in get(response, 'continuationContents', {}).items(): - if key.endswith('Continuation'): # e.g. 
commentSectionContinuation, playlistVideoListContinuation - items = multi_get(renderer_continuation, 'contents', 'items', default=[]) - ctoken = deep_get(renderer_continuation, 'continuations', 0, 'nextContinuationData', 'continuation') - return items, ctoken + # sometimes there's another, empty, junk [something]Continuation key + # find real one + for key, renderer_cont in get(response, + 'continuationContents', {}).items(): + # e.g. commentSectionContinuation, playlistVideoListContinuation + if key.endswith('Continuation'): + items, cont = extract_items_from_renderer({key: renderer_cont}, + item_types=item_types) + if items: + return items, cont return [], None elif 'contents' in response: renderer = get(response, 'contents', {}) diff --git a/youtube/yt_data_extract/everything_else.py b/youtube/yt_data_extract/everything_else.py index d1389c6..20e0f30 100644 --- a/youtube/yt_data_extract/everything_else.py +++ b/youtube/yt_data_extract/everything_else.py @@ -227,7 +227,8 @@ def extract_comments_info(polymer_json): info['sort'] = metadata.get('sort') info['video_title'] = None - comments, ctoken = extract_items(response) + comments, ctoken = extract_items(response, + item_types={'commentThreadRenderer', 'commentRenderer'}) info['comments'] = [] info['ctoken'] = ctoken for comment in comments: |