From 5594d017e28a5524a97da8e9f9b113737c431415 Mon Sep 17 00:00:00 2001 From: Jesus Date: Mon, 11 Sep 2023 04:13:48 +0800 Subject: Fix related vids, like_count, playlist sometimes missing Cause is that some pages have the onResponseReceivedEndpoints key at the top level with useless stuff in it, and the extract_items function was searching in that instead of the 'contents' key. Change to use if blocks instead of elif blocks in the extract_items function. --- youtube/yt_data_extract/common.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'youtube/yt_data_extract/common.py') diff --git a/youtube/yt_data_extract/common.py b/youtube/yt_data_extract/common.py index b029a15..e7b31b7 100644 --- a/youtube/yt_data_extract/common.py +++ b/youtube/yt_data_extract/common.py @@ -109,7 +109,7 @@ def concat_or_none(*strings): def remove_redirect(url): if url is None: return None - if re.fullmatch(r'(((https?:)?//)?(www.)?youtube.com)?/redirect\?.*', url) is not None: # youtube puts these on external links to do tracking + if re.fullmatch(r'(((https?:)?//)?(www.)?youtube.com)?/redirect\?.*', url) is not None: # YouTube puts these on external links to do tracking query_string = url[url.find('?')+1: ] return urllib.parse.parse_qs(query_string)['q'][0] return url @@ -133,11 +133,11 @@ def _recover_urls(runs): for run in runs: url = deep_get(run, 'navigationEndpoint', 'urlEndpoint', 'url') text = run.get('text', '') - # second condition is necessary because youtube makes other things into urls, such as hashtags, which we want to keep as text + # second condition is necessary because YouTube makes other things into urls, such as hashtags, which we want to keep as text if url is not None and (text.startswith('http://') or text.startswith('https://')): url = remove_redirect(url) run['url'] = url - run['text'] = url # youtube truncates the url text, use actual url instead + run['text'] = url # YouTube truncates the url text, use actual url instead def extract_str(node, default=None, recover_urls=False): '''default is the value returned if the extraction fails. If recover_urls is true, will attempt to fix YouTube's truncation of url text (most prominently seen in descriptions)''' @@ -569,13 +569,13 @@ def extract_items(response, item_types=_item_types, item_types=item_types) if items: break - elif ('onResponseReceivedEndpoints' in response + if ('onResponseReceivedEndpoints' in response or 'onResponseReceivedActions' in response): for endpoint in multi_get(response, 'onResponseReceivedEndpoints', 'onResponseReceivedActions', []): - items, ctoken = extract_items_from_renderer_list( + new_items, new_ctoken = extract_items_from_renderer_list( multi_deep_get( endpoint, ['reloadContinuationItemsCommand', 'continuationItems'], @@ -584,13 +584,17 @@ def extract_items(response, item_types=_item_types, ), item_types=item_types, ) - if items: - break - elif 'contents' in response: + items += new_items + if (not ctoken) or (new_ctoken and new_items): + ctoken = new_ctoken + if 'contents' in response: renderer = get(response, 'contents', {}) - items, ctoken = extract_items_from_renderer( + new_items, new_ctoken = extract_items_from_renderer( renderer, item_types=item_types) + items += new_items + if (not ctoken) or (new_ctoken and new_items): + ctoken = new_ctoken if search_engagement_panels and 'engagementPanels' in response: new_items, new_ctoken = extract_items_from_renderer_list( -- cgit v1.2.3