From a374f90f6e6d3544d759d206a154a51d213c0574 Mon Sep 17 00:00:00 2001 From: Astounds Date: Wed, 1 Apr 2026 11:43:46 -0500 Subject: fix: add support for YouTube Shorts tab on channel pages - Rewrite channel_ctoken_v5 with correct protobuf field numbers per tab (videos=15, shorts=10, streams=14) based on Invidious source - Replace broken pbj=1 endpoint with youtubei browse API for shorts/streams - Add shortsLockupViewModel parser to extract video data from new YT format - Fix channel metadata not loading (get_metadata now uses browse API) - Fix metadata caching: skip caching when channel_name is absent - Show actual item count instead of UU playlist count for shorts/streams - Format view counts with spaced suffixes (7.1 K, 1.2 M, 3 B) --- youtube/channel.py | 156 ++++++++++++++++++-------------------- youtube/yt_data_extract/common.py | 83 ++++++++++++++++++++ 2 files changed, 155 insertions(+), 84 deletions(-) diff --git a/youtube/channel.py b/youtube/channel.py index 55c1124..3352ca2 100644 --- a/youtube/channel.py +++ b/youtube/channel.py @@ -36,64 +36,41 @@ generic_cookie = (('Cookie', 'VISITOR_INFO1_LIVE=ST1Ti53r4fU'),) # FIXED 2026: YouTube changed continuation token structure (from Invidious commit a9f8127) # Sort values for YouTube API (from Invidious): 2=popular, 4=newest, 5=oldest def channel_ctoken_v5(channel_id, page, sort, tab, view=1): - # Map sort values to YouTube API values (Invidious values) - # Input: sort=3 (newest), sort=4 (newest no shorts) - # YouTube expects: 4=newest - sort_mapping = {'1': 2, '2': 5, '3': 4, '4': 4} # 4 is newest without shorts - new_sort = sort_mapping.get(sort, 4) - - offset = 30*(int(page) - 1) - - # Build continuation token using Invidious structure - # The structure is: base64(protobuf({ - # 80226972: { - # 2: channel_id, - # 3: base64(protobuf({ - # 110: { - # 3: { - # tab: { - # 1: { - # 1: base64(protobuf({ - # 1: base64(protobuf({ - # 2: "ST:" + base64(offset_varint) - # })) - # })) - # }, - # 2: base64(protobuf({1: UUID})) - # 4: sort_value - # 8: base64(protobuf({ - # 1: UUID - # 3: sort_value - # })) - # } - # } - # } - # })) - # } - # })) - - # UUID placeholder - uuid_proto = proto.string(1, "00000000-0000-0000-0000-000000000000") - - # Offset encoding - offset_varint = proto.uint(1, offset) - offset_encoded = proto.string(2, proto.unpadded_b64encode(offset_varint)) - offset_wrapper = proto.string(1, proto.unpadded_b64encode(offset_encoded)) - offset_base = proto.string(1, proto.unpadded_b64encode(offset_wrapper)) - - # Sort value varint - sort_varint = proto.uint(4, new_sort) - - # Embedded message with UUID and sort - embedded_inner = uuid_proto + proto.uint(3, new_sort) - embedded_encoded = proto.string(8, proto.unpadded_b64encode(embedded_inner)) - - # Combine: uuid_wrapper + sort_varint + embedded - tab_inner_content = offset_base + uuid_proto + sort_varint + embedded_encoded - - tab_inner = proto.string(1, proto.unpadded_b64encode(tab_inner_content)) - tab_wrapper = proto.string(tab, tab_inner) + # Tab-specific protobuf field numbers (from Invidious source) + # Each tab uses different field numbers in the protobuf structure: + # videos: 110 -> 3 -> 15 -> { 2:{1:UUID}, 4:sort, 8:{1:UUID, 3:sort} } + # shorts: 110 -> 3 -> 10 -> { 2:{1:UUID}, 4:sort, 7:{1:UUID, 3:sort} } + # streams: 110 -> 3 -> 14 -> { 2:{1:UUID}, 5:sort, 8:{1:UUID, 3:sort} } + tab_config = { + 'videos': {'tab_field': 15, 'sort_field': 4, 'embedded_field': 8}, + 'shorts': {'tab_field': 10, 'sort_field': 4, 'embedded_field': 7}, + 'streams': {'tab_field': 14, 'sort_field': 5, 'embedded_field': 8}, + } + config = tab_config.get(tab, tab_config['videos']) + tab_field = config['tab_field'] + sort_field = config['sort_field'] + embedded_field = config['embedded_field'] + + # Map sort values to YouTube API values + if tab == 'streams': + sort_mapping = {'1': 14, '2': 13, '3': 12, '4': 12} + else: + sort_mapping = {'1': 2, '2': 5, '3': 4, '4': 4} + new_sort = sort_mapping.get(sort, sort_mapping['3']) + + # UUID placeholder (field 1) + uuid_str = "00000000-0000-0000-0000-000000000000" + + # Build the tab-level object matching Invidious structure exactly: + # { 2: embedded{1: UUID}, sort_field: sort_val, embedded_field: embedded{1: UUID, 3: sort_val} } + tab_content = ( + proto.string(2, proto.string(1, uuid_str)) + + proto.uint(sort_field, new_sort) + + proto.string(embedded_field, + proto.string(1, uuid_str) + proto.uint(3, new_sort)) + ) + tab_wrapper = proto.string(tab_field, tab_content) inner_container = proto.string(3, tab_wrapper) outer_container = proto.string(110, inner_container) @@ -346,11 +323,10 @@ def get_channel_id(base_url): metadata_cache = cachetools.LRUCache(128) @cachetools.cached(metadata_cache) def get_metadata(channel_id): - base_url = 'https://www.youtube.com/channel/' + channel_id - polymer_json = util.fetch_url(base_url + '/about?pbj=1', - headers_desktop, - debug_name='gen_channel_about', - report_text='Retrieved channel metadata') + # Use youtubei browse API to get channel metadata + polymer_json = util.call_youtube_api('web', 'browse', { + 'browseId': channel_id, + }) info = yt_data_extract.extract_channel_info(json.loads(polymer_json), 'about', continuation=False) @@ -508,28 +484,35 @@ def get_channel_page_general_url(base_url, tab, request, channel_id=None): # Use the regular channel API if tab in ('shorts', 'streams') or (tab=='videos' and try_channel_api): - if channel_id: - num_videos_call = (get_number_of_videos_channel, channel_id) - else: - num_videos_call = (get_number_of_videos_general, base_url) + if not channel_id: + channel_id = get_channel_id(base_url) - # For page 1, use the first-page method which won't break - # Pass sort parameter directly (2=oldest, 3=newest, etc.) - if page_number == 1: - # Always use first-page method for page 1 with sort parameter - page_call = (get_channel_first_page, base_url, tab, None, sort) - else: - # For page 2+, we can't paginate without continuation tokens - # This is a YouTube limitation, not our bug - flask.abort(404, 'Pagination not available for this sort option. YouTube removed this feature.') + # Use youtubei browse API with continuation token for all pages + page_call = (get_channel_tab, channel_id, str(page_number), sort, + tab, int(view)) + continuation = True - tasks = ( - gevent.spawn(*num_videos_call), - gevent.spawn(*page_call), - ) - gevent.joinall(tasks) - util.check_gevent_exceptions(*tasks) - number_of_videos, polymer_json = tasks[0].value, tasks[1].value + if tab == 'videos': + # Only need video count for the videos tab + if channel_id: + num_videos_call = (get_number_of_videos_channel, channel_id) + else: + num_videos_call = (get_number_of_videos_general, base_url) + tasks = ( + gevent.spawn(*num_videos_call), + gevent.spawn(*page_call), + ) + gevent.joinall(tasks) + util.check_gevent_exceptions(*tasks) + number_of_videos, polymer_json = tasks[0].value, tasks[1].value + else: + # For shorts/streams, item count is used instead + polymer_json = gevent.spawn(*page_call) + polymer_json.join() + if polymer_json.exception: + raise polymer_json.exception + polymer_json = polymer_json.value + number_of_videos = 0 # will be replaced by actual item count later elif tab == 'about': # polymer_json = util.fetch_url(base_url + '/about?pbj=1', headers_desktop, debug_name='gen_channel_about') @@ -577,7 +560,8 @@ def get_channel_page_general_url(base_url, tab, request, channel_id=None): channel_id = info['channel_id'] # Will have microformat present, cache metadata while we have it - if channel_id and default_params and tab not in ('videos', 'about'): + if (channel_id and default_params and tab not in ('videos', 'about') + and info.get('channel_name') is not None): metadata = extract_metadata_for_caching(info) set_cached_metadata(channel_id, metadata) # Otherwise, populate with our (hopefully cached) metadata @@ -595,8 +579,12 @@ def get_channel_page_general_url(base_url, tab, request, channel_id=None): item.update(additional_info) if tab in ('videos', 'shorts', 'streams'): + if tab in ('shorts', 'streams'): + # For shorts/streams, use the actual item count since + # get_number_of_videos_channel counts regular uploads only + number_of_videos = len(info.get('items', [])) info['number_of_videos'] = number_of_videos - info['number_of_pages'] = math.ceil(number_of_videos/page_size) + info['number_of_pages'] = math.ceil(number_of_videos/page_size) if number_of_videos else 1 info['header_playlist_names'] = local_playlist.get_playlist_names() if tab in ('videos', 'shorts', 'streams', 'playlists'): info['current_sort'] = sort diff --git a/youtube/yt_data_extract/common.py b/youtube/yt_data_extract/common.py index 9a940ea..dce1d30 100644 --- a/youtube/yt_data_extract/common.py +++ b/youtube/yt_data_extract/common.py @@ -332,6 +332,84 @@ def extract_lockup_view_model_info(item, additional_info={}): return info +def extract_shorts_lockup_view_model_info(item, additional_info={}): + """Extract info from shortsLockupViewModel format (YouTube Shorts)""" + info = {'error': None, 'type': 'video'} + + # Video ID from reelWatchEndpoint or entityId + info['id'] = deep_get(item, + 'onTap', 'innertubeCommand', 'reelWatchEndpoint', 'videoId') + if not info['id']: + entity_id = item.get('entityId', '') + if entity_id.startswith('shorts-shelf-item-'): + info['id'] = entity_id[len('shorts-shelf-item-'):] + + # Thumbnail + info['thumbnail'] = normalize_url(deep_get(item, + 'onTap', 'innertubeCommand', 'reelWatchEndpoint', + 'thumbnail', 'thumbnails', 0, 'url')) + + # Parse title and views from accessibilityText + # Format: "Title, N views - play Short" + acc_text = item.get('accessibilityText', '') + info['title'] = '' + info['view_count'] = None + info['approx_view_count'] = None + if acc_text: + # Remove trailing " - play Short" + cleaned = re.sub(r'\s*-\s*play Short$', '', acc_text) + # Split on last comma+views pattern to separate title from view count + match = re.match(r'^(.*?),\s*([\d,.]+\s*(?:thousand|million|billion|)\s*views?)$', + cleaned, re.IGNORECASE) + if match: + info['title'] = match.group(1).strip() + view_text = match.group(2) + info['view_count'] = extract_int(view_text) + # Convert "7.1 thousand" -> "7.1 K" for display + suffix_map = {'thousand': 'K', 'million': 'M', 'billion': 'B'} + suffix_match = re.search(r'([\d,.]+)\s*(thousand|million|billion)?', view_text, re.IGNORECASE) + if suffix_match: + num = suffix_match.group(1) + word = suffix_match.group(2) + if word: + info['approx_view_count'] = num + ' ' + suffix_map[word.lower()] + else: + info['approx_view_count'] = '{:,}'.format(int(num.replace(',', ''))) if num.isdigit() or num.replace(',','').isdigit() else num + else: + info['approx_view_count'] = extract_approx_int(view_text) + else: + # Fallback: try "N views" at end + match2 = re.match(r'^(.*?),\s*(.+views?)$', cleaned, re.IGNORECASE) + if match2: + info['title'] = match2.group(1).strip() + info['approx_view_count'] = extract_approx_int(match2.group(2)) + else: + info['title'] = cleaned + + # Overlay text (usually has the title too) + overlay_metadata = deep_get(item, 'overlayMetadata', + 'secondaryText', 'content') + if overlay_metadata and not info['approx_view_count']: + info['approx_view_count'] = extract_approx_int(overlay_metadata) + + primary_text = deep_get(item, 'overlayMetadata', + 'primaryText', 'content') + if primary_text and not info['title']: + info['title'] = primary_text + + info['duration'] = '' + info['time_published'] = None + info['description'] = None + info['badges'] = [] + info['author'] = None + info['author_id'] = None + info['author_url'] = None + info['index'] = None + + info.update(additional_info) + return info + + def extract_item_info(item, additional_info={}): if not item: return {'error': 'No item given'} @@ -353,6 +431,10 @@ def extract_item_info(item, additional_info={}): if type == 'lockupViewModel': return extract_lockup_view_model_info(item, additional_info) + # Handle shortsLockupViewModel format (YouTube Shorts) + if type == 'shortsLockupViewModel': + return extract_shorts_lockup_view_model_info(item, additional_info) + # type looks like e.g. 'compactVideoRenderer' or 'gridVideoRenderer' # camelCase split, https://stackoverflow.com/a/37697078 type_parts = [s.lower() for s in re.sub(r'([A-Z][a-z]+)', r' \1', type).split()] @@ -561,6 +643,7 @@ _item_types = { # New viewModel format (YouTube 2024+) 'lockupViewModel', + 'shortsLockupViewModel', } def _traverse_browse_renderer(renderer): -- cgit v1.2.3