Add functional but preliminary channel tab support

Add channel tabs to the channel template and script. Update the continuation token to request different tabs. Add support for the 'reelItemRenderer' format, which is required to extract shorts.
author: Jesus E <heckyel@riseup.net> 2023-06-17 16:05:40 -0400
committer: Jesus E <heckyel@riseup.net> 2023-06-17 16:05:40 -0400
commit: f322035d4ac6aa17386ac9dd05f9c7a8d6720e99 (patch)
tree: 865c9ad567525b1d0d5d912a7e2e92b33ee90378
parent: 74907a81835435f881424b41729cc71cb9d50056 (diff)
download: yt-local-f322035d4ac6aa17386ac9dd05f9c7a8d6720e99.tar.lz
yt-local-f322035d4ac6aa17386ac9dd05f9c7a8d6720e99.tar.xz
yt-local-f322035d4ac6aa17386ac9dd05f9c7a8d6720e99.zip
4 files changed, 75 insertions, 21 deletions
diff --git a/youtube/channel.py b/youtube/channel.py
index 4cf6cdf..5c757d3 100644
--- a/youtube/channel.py
+++ b/youtube/channel.py
@@ -32,16 +32,23 @@ real_cookie = (('Cookie', 'VISITOR_INFO1_LIVE=8XihrAcN1l4'),)
 generic_cookie = (('Cookie', 'VISITOR_INFO1_LIVE=ST1Ti53r4fU'),)
 
 # added an extra nesting under the 2nd base64 compared to v4
+# added tab support
 def channel_ctoken_v5(channel_id, page, sort, tab, view=1):
     new_sort = (2 if int(sort) == 1 else 1)
     offset = str(30*(int(page) - 1))
+    if tab == 'videos':
+        tab = 15
+    elif tab == 'shorts':
+        tab = 10
+    elif tab == 'streams':
+        tab = 14
     pointless_nest = proto.string(80226972,
         proto.string(2, channel_id)
         + proto.string(3,
             proto.percent_b64encode(
                 proto.string(110,
                     proto.string(3,
-                        proto.string(15,
+                        proto.string(tab,
                             proto.string(1,
                                 proto.string(1,
                                     proto.unpadded_b64encode(
@@ -167,7 +174,7 @@ def channel_ctoken_v2(channel_id, page, sort, tab, view=1):
 
     tab = proto.string(2, tab)
     sort = proto.uint(3, int(sort))
-    # page = proto.string(15, str(page) )
+    #page = proto.string(15, str(page))
 
     shelf_view = proto.uint(4, 0)
     view = proto.uint(6, int(view))
@@ -202,7 +209,7 @@ def get_channel_tab(channel_id, page="1", sort=3, tab='videos', view=1,
     message = 'Got channel tab' if print_status else None
 
     if not ctoken:
-        if tab == 'videos':
+        if tab in ('videos', 'shorts', 'streams'):
             ctoken = channel_ctoken_v5(channel_id, page, sort, tab, view)
         else:
             ctoken = channel_ctoken_v3(channel_id, page, sort, tab, view)
@@ -349,11 +356,11 @@ def post_process_channel_info(info):
                 info['links'][i] = (text, util.prefix_url(url))
 
 
-def get_channel_first_page(base_url=None, channel_id=None):
+def get_channel_first_page(base_url=None, channel_id=None, tab='videos'):
     if channel_id:
         base_url = 'https://www.youtube.com/channel/' + channel_id
-    return util.fetch_url(base_url + '/videos?pbj=1&view=0', headers_desktop,
-                          debug_name='gen_channel_videos')
+    return util.fetch_url(base_url + '/' + tab + '?pbj=1&view=0',
+                          headers_desktop, debug_name='gen_channel_' + tab)
 
 
 playlist_sort_codes = {'2': "da", '3': "dd", '4': "lad"}
@@ -374,24 +381,25 @@ def get_channel_page_general_url(base_url, tab, request, channel_id=None):
     default_params = (page_number == 1 and sort == '3' and view == '1')
     continuation = bool(ctoken) # whether or not we're using a continuation
 
-    if tab == 'videos' and channel_id and not default_params:
+    if (tab in ('videos', 'shorts', 'streams') and channel_id and
+        not default_params):
         tasks = (
             gevent.spawn(get_number_of_videos_channel, channel_id),
             gevent.spawn(get_channel_tab, channel_id, page_number, sort,
-                         'videos', view, ctoken)
+                         tab, view, ctoken)
         )
         gevent.joinall(tasks)
         util.check_gevent_exceptions(*tasks)
         number_of_videos, polymer_json = tasks[0].value, tasks[1].value
         continuation = True
-    elif tab == 'videos':
+    elif tab in ('videos', 'shorts', 'streams'):
         if channel_id:
             num_videos_call = (get_number_of_videos_channel, channel_id)
         else:
             num_videos_call = (get_number_of_videos_general, base_url)
         tasks = (
             gevent.spawn(*num_videos_call),
-            gevent.spawn(get_channel_first_page, base_url=base_url),
+            gevent.spawn(get_channel_first_page, base_url=base_url, tab=tab),
         )
         gevent.joinall(tasks)
         util.check_gevent_exceptions(*tasks)
@@ -440,13 +448,13 @@ def get_channel_page_general_url(base_url, tab, request, channel_id=None):
             item.update(additional_info)
 
     if info['error'] is not None:
-        return flask.render_template('error.html', error_message=info['error'])
+        return flask.render_template('error.html', error_message = info['error'])
 
-    if tab == 'videos':
+    if tab in ('videos', 'shorts', 'streams'):
         info['number_of_videos'] = number_of_videos
         info['number_of_pages'] = math.ceil(number_of_videos/30)
         info['header_playlist_names'] = local_playlist.get_playlist_names()
-    if tab in ('videos', 'playlists'):
+    if tab in ('videos', 'shorts', 'streams', 'playlists'):
         info['current_sort'] = sort
     elif tab == 'search':
         info['search_box_value'] = query
@@ -457,9 +465,8 @@ def get_channel_page_general_url(base_url, tab, request, channel_id=None):
 
     post_process_channel_info(info)
 
-    return flask.render_template(
-        'channel.html',
-        parameters_dictionary=request.args,
+    return flask.render_template('channel.html',
+        parameters_dictionary = request.args,
         **info
     )
 
diff --git a/youtube/templates/channel.html b/youtube/templates/channel.html
index 6266aab..b86cd54 100644
--- a/youtube/templates/channel.html
+++ b/youtube/templates/channel.html
@@ -33,7 +33,7 @@
     <hr/>
 
     <nav class="channel-tabs">
-        {% for tab_name in ('Videos', 'Playlists', 'About') %}
+        {% for tab_name in ('Videos', 'Shorts', 'Streams', 'Playlists', 'About') %}
             {% if tab_name.lower() == current_tab %}
                 <a class="tab page-button">{{ tab_name }}</a>
             {% else %}
@@ -73,7 +73,7 @@
 
         <!-- new-->
         <div id="links-metadata">
-                {% if current_tab == 'videos' %}
+                {% if current_tab in ('videos', 'shorts', 'streams') %}
                     {% set sorts = [('1', 'views'), ('2', 'oldest'), ('3', 'newest')] %}
                     <div id="number-of-results">{{ number_of_videos }} videos</div>
                 {% elif current_tab == 'playlists' %}
@@ -110,11 +110,11 @@
         <hr/>
 
         <footer class="pagination-container">
-            {% if current_tab == 'videos' and current_sort.__str__() == '2' %}
+            {% if (current_tab in ('videos', 'shorts', 'streams')) and current_sort.__str__() == '2' %}
                 <nav class="next-previous-button-row">
                     {{ common_elements.next_previous_ctoken_buttons(None, ctoken, channel_url + '/' + current_tab, parameters_dictionary) }}
                 </nav>
-            {% elif current_tab == 'videos' %}
+            {% elif current_tab in ('videos', 'shorts', 'streams') %}
                 <nav class="pagination-list">
                     {{ common_elements.page_buttons(number_of_pages, channel_url + '/' + current_tab, parameters_dictionary, include_ends=(current_sort.__str__() == '3')) }}
                 </nav>
diff --git a/youtube/yt_data_extract/common.py b/youtube/yt_data_extract/common.py
index fcefbf7..5680b16 100644
--- a/youtube/yt_data_extract/common.py
+++ b/youtube/yt_data_extract/common.py
@@ -249,6 +249,9 @@ def extract_item_info(item, additional_info={}):
     primary_type = type_parts[-2]
     if primary_type == 'video':
         info['type'] = 'video'
+    elif type_parts[0] == 'reel': # shorts
+        info['type'] = 'video'
+        primary_type = 'short'
     elif primary_type in ('playlist', 'radio', 'show'):
         info['type'] = 'playlist'
         info['playlist_type'] = primary_type
@@ -343,6 +346,48 @@ def extract_item_info(item, additional_info={}):
         else:
             info['index'] = None
 
+    elif primary_type == 'short':
+        info['id'] = item.get('videoId')
+        if not info['id']:
+            info['id'] = deep_get(item,'navigationEndpoint',
+                                  'reelWatchEndpoint', 'videoId')
+        info['approx_view_count'] = extract_approx_int(item.get('viewCountText'))
+
+        # handle case where it is "No views"
+        if not info['approx_view_count']:
+            if ('No views' in item.get('shortViewCountText', '')
+                    or 'no views' in accessibility_label.lower()):
+                info['view_count'] = 0
+                info['approx_view_count'] = '0'
+
+        # dig into accessibility data to get duration for shorts
+        accessibility_label = multi_deep_get(item,
+            ['accessibility', 'accessibilityData', 'label'],
+            default='')
+
+        duration = re.search(r'(\d+) (second|seconds|minute) - play video',
+                             accessibility_label)
+        if duration.group(2) == 'minute':
+            info['duration'] = "1:00"
+        else:
+            info['duration'] = "0:" + duration.group(1).zfill(2)
+
+        # if it's an item in a playlist, get its index
+        if 'index' in item: # url has wrong index on playlist page
+            info['index'] = extract_int(item.get('index'))
+        elif 'indexText' in item:
+            # Current item in playlist has ▶ instead of the actual index, must
+            # dig into url
+            match = re.search(r'index=(\d+)', deep_get(item,
+                'navigationEndpoint', 'commandMetadata', 'webCommandMetadata',
+                'url', default=''))
+            if match is None:   # worth a try then
+                info['index'] = extract_int(item.get('indexText'))
+            else:
+                info['index'] = int(match.group(1))
+        else:
+            info['index'] = None
+
     elif primary_type in ('playlist', 'radio'):
         info['id'] = item.get('playlistId')
         info['video_count'] = extract_int(item.get('videoCount'))
@@ -398,6 +443,8 @@ _item_types = {
     'gridVideoRenderer',
     'playlistVideoRenderer',
 
+    'reelItemRenderer',
+
     'playlistRenderer',
     'compactPlaylistRenderer',
     'gridPlaylistRenderer',
diff --git a/youtube/yt_data_extract/everything_else.py b/youtube/yt_data_extract/everything_else.py
index 9a6e31a..745d08f 100644
--- a/youtube/yt_data_extract/everything_else.py
+++ b/youtube/yt_data_extract/everything_else.py
@@ -73,7 +73,7 @@ def extract_channel_info(polymer_json, tab, continuation=False):
     #if 'contents' not in response and 'continuationContents' not in response:
     #    return info
 
-    if tab in ('videos', 'playlists', 'search'):
+    if tab in ('videos', 'shorts', 'streams', 'playlists', 'search'):
         items, ctoken = extract_items(response)
         additional_info = {
             'author': info['channel_name'],
author	Jesus E <heckyel@riseup.net>	2023-06-17 16:05:40 -0400
committer	Jesus E <heckyel@riseup.net>	2023-06-17 16:05:40 -0400
commit	f322035d4ac6aa17386ac9dd05f9c7a8d6720e99 (patch)
tree	865c9ad567525b1d0d5d912a7e2e92b33ee90378
parent	74907a81835435f881424b41729cc71cb9d50056 (diff)
download	yt-local-f322035d4ac6aa17386ac9dd05f9c7a8d6720e99.tar.lz yt-local-f322035d4ac6aa17386ac9dd05f9c7a8d6720e99.tar.xz yt-local-f322035d4ac6aa17386ac9dd05f9c7a8d6720e99.zip