diff options
author | Jesus <heckyel@riseup.net> | 2023-09-11 04:13:48 +0800 |
---|---|---|
committer | Jesus <heckyel@riseup.net> | 2023-09-11 04:13:56 +0800 |
commit | 5594d017e28a5524a97da8e9f9b113737c431415 (patch) | |
tree | a52dfd2be552f03af6b5b007a16092068096d16c | |
parent | 8f9c5eeb4824fed924c5550e7fc77270995b18ba (diff) | |
download | yt-local-5594d017e28a5524a97da8e9f9b113737c431415.tar.lz yt-local-5594d017e28a5524a97da8e9f9b113737c431415.tar.xz yt-local-5594d017e28a5524a97da8e9f9b113737c431415.zip |
Fix related vids, like_count, playlist sometimes missing
The cause is that some pages have the onResponseReceivedEndpoints key
at the top level with nothing useful in it, and the extract_items
function was searching in that key instead of the 'contents' key.
The fix is to use independent if blocks instead of elif blocks in the
extract_items function, so that each of these keys is searched.
-rw-r--r-- | youtube/yt_data_extract/common.py | 22 |
1 file changed, 13 insertions, 9 deletions
diff --git a/youtube/yt_data_extract/common.py b/youtube/yt_data_extract/common.py index b029a15..e7b31b7 100644 --- a/youtube/yt_data_extract/common.py +++ b/youtube/yt_data_extract/common.py @@ -109,7 +109,7 @@ def concat_or_none(*strings): def remove_redirect(url): if url is None: return None - if re.fullmatch(r'(((https?:)?//)?(www.)?youtube.com)?/redirect\?.*', url) is not None: # youtube puts these on external links to do tracking + if re.fullmatch(r'(((https?:)?//)?(www.)?youtube.com)?/redirect\?.*', url) is not None: # YouTube puts these on external links to do tracking query_string = url[url.find('?')+1: ] return urllib.parse.parse_qs(query_string)['q'][0] return url @@ -133,11 +133,11 @@ def _recover_urls(runs): for run in runs: url = deep_get(run, 'navigationEndpoint', 'urlEndpoint', 'url') text = run.get('text', '') - # second condition is necessary because youtube makes other things into urls, such as hashtags, which we want to keep as text + # second condition is necessary because YouTube makes other things into urls, such as hashtags, which we want to keep as text if url is not None and (text.startswith('http://') or text.startswith('https://')): url = remove_redirect(url) run['url'] = url - run['text'] = url # youtube truncates the url text, use actual url instead + run['text'] = url # YouTube truncates the url text, use actual url instead def extract_str(node, default=None, recover_urls=False): '''default is the value returned if the extraction fails. 
If recover_urls is true, will attempt to fix YouTube's truncation of url text (most prominently seen in descriptions)''' @@ -569,13 +569,13 @@ def extract_items(response, item_types=_item_types, item_types=item_types) if items: break - elif ('onResponseReceivedEndpoints' in response + if ('onResponseReceivedEndpoints' in response or 'onResponseReceivedActions' in response): for endpoint in multi_get(response, 'onResponseReceivedEndpoints', 'onResponseReceivedActions', []): - items, ctoken = extract_items_from_renderer_list( + new_items, new_ctoken = extract_items_from_renderer_list( multi_deep_get( endpoint, ['reloadContinuationItemsCommand', 'continuationItems'], @@ -584,13 +584,17 @@ def extract_items(response, item_types=_item_types, ), item_types=item_types, ) - if items: - break - elif 'contents' in response: + items += new_items + if (not ctoken) or (new_ctoken and new_items): + ctoken = new_ctoken + if 'contents' in response: renderer = get(response, 'contents', {}) - items, ctoken = extract_items_from_renderer( + new_items, new_ctoken = extract_items_from_renderer( renderer, item_types=item_types) + items += new_items + if (not ctoken) or (new_ctoken and new_items): + ctoken = new_ctoken if search_engagement_panels and 'engagementPanels' in response: new_items, new_ctoken = extract_items_from_renderer_list( |