Fix related videos, like_count, and playlist sometimes missing

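The cause is that some pages have the onResponseReceivedEndpoints key at the top level with nothing useful in it, and the extract_items function was searching in that key instead of in the 'contents' key. Change the extract_items function to use independent if blocks instead of an elif chain, so that every matching key is searched and the items found under each one are accumulated rather than stopping at the first key present.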
author: Jesus <heckyel@riseup.net> 2023-09-11 04:13:48 +0800
committer: Jesus <heckyel@riseup.net> 2023-09-11 04:13:56 +0800
commit: 5594d017e28a5524a97da8e9f9b113737c431415 (patch)
tree: a52dfd2be552f03af6b5b007a16092068096d16c
parent: 8f9c5eeb4824fed924c5550e7fc77270995b18ba (diff)
download: yt-local-5594d017e28a5524a97da8e9f9b113737c431415.tar.lz
yt-local-5594d017e28a5524a97da8e9f9b113737c431415.tar.xz
yt-local-5594d017e28a5524a97da8e9f9b113737c431415.zip
1 files changed, 13 insertions, 9 deletions
diff --git a/youtube/yt_data_extract/common.py b/youtube/yt_data_extract/common.py
index b029a15..e7b31b7 100644
--- a/youtube/yt_data_extract/common.py
+++ b/youtube/yt_data_extract/common.py
@@ -109,7 +109,7 @@ def concat_or_none(*strings):
 def remove_redirect(url):
     if url is None:
         return None
-    if re.fullmatch(r'(((https?:)?//)?(www.)?youtube.com)?/redirect\?.*', url) is not None: # youtube puts these on external links to do tracking
+    if re.fullmatch(r'(((https?:)?//)?(www.)?youtube.com)?/redirect\?.*', url) is not None: # YouTube puts these on external links to do tracking
         query_string = url[url.find('?')+1: ]
         return urllib.parse.parse_qs(query_string)['q'][0]
     return url
@@ -133,11 +133,11 @@ def _recover_urls(runs):
     for run in runs:
         url = deep_get(run, 'navigationEndpoint', 'urlEndpoint', 'url')
         text = run.get('text', '')
-        # second condition is necessary because youtube makes other things into urls, such as hashtags, which we want to keep as text
+        # second condition is necessary because YouTube makes other things into urls, such as hashtags, which we want to keep as text
         if url is not None and (text.startswith('http://') or text.startswith('https://')):
             url = remove_redirect(url)
             run['url'] = url
-            run['text'] = url # youtube truncates the url text, use actual url instead
+            run['text'] = url # YouTube truncates the url text, use actual url instead
 
 def extract_str(node, default=None, recover_urls=False):
     '''default is the value returned if the extraction fails. If recover_urls is true, will attempt to fix YouTube's truncation of url text (most prominently seen in descriptions)'''
@@ -569,13 +569,13 @@ def extract_items(response, item_types=_item_types,
                     item_types=item_types)
                 if items:
                     break
-    elif ('onResponseReceivedEndpoints' in response
+    if ('onResponseReceivedEndpoints' in response
           or 'onResponseReceivedActions' in response):
         for endpoint in multi_get(response,
                                   'onResponseReceivedEndpoints',
                                   'onResponseReceivedActions',
                                   []):
-            items, ctoken = extract_items_from_renderer_list(
+            new_items, new_ctoken = extract_items_from_renderer_list(
                 multi_deep_get(
                     endpoint,
                     ['reloadContinuationItemsCommand', 'continuationItems'],
@@ -584,13 +584,17 @@ def extract_items(response, item_types=_item_types,
                 ),
                 item_types=item_types,
             )
-            if items:
-                break
-    elif 'contents' in response:
+            items += new_items
+            if (not ctoken) or (new_ctoken and new_items):
+                ctoken = new_ctoken
+    if 'contents' in response:
         renderer = get(response, 'contents', {})
-        items, ctoken = extract_items_from_renderer(
+        new_items, new_ctoken = extract_items_from_renderer(
             renderer,
             item_types=item_types)
+        items += new_items
+        if (not ctoken) or (new_ctoken and new_items):
+            ctoken = new_ctoken
 
     if search_engagement_panels and 'engagementPanels' in response:
         new_items, new_ctoken = extract_items_from_renderer_list(
author	Jesus <heckyel@riseup.net>	2023-09-11 04:13:48 +0800
committer	Jesus <heckyel@riseup.net>	2023-09-11 04:13:56 +0800
commit	5594d017e28a5524a97da8e9f9b113737c431415 (patch)
tree	a52dfd2be552f03af6b5b007a16092068096d16c
parent	8f9c5eeb4824fed924c5550e7fc77270995b18ba (diff)
download	yt-local-5594d017e28a5524a97da8e9f9b113737c431415.tar.lz yt-local-5594d017e28a5524a97da8e9f9b113737c431415.tar.xz yt-local-5594d017e28a5524a97da8e9f9b113737c431415.zip