diff options
| author | Astounds <kirito@disroot.org> | 2026-04-01 11:43:46 -0500 |
|---|---|---|
| committer | Astounds <kirito@disroot.org> | 2026-04-01 11:43:46 -0500 |
| commit | a374f90f6e6d3544d759d206a154a51d213c0574 (patch) | |
| tree | 9f0a89b801681fb5a96f8d9c1f1cfebddaac468f /youtube/yt_data_extract | |
| parent | bed14713adce4781af245cef1b9d9c6fbc413823 (diff) | |
| download | yt-local-a374f90f6e6d3544d759d206a154a51d213c0574.tar.lz yt-local-a374f90f6e6d3544d759d206a154a51d213c0574.tar.xz yt-local-a374f90f6e6d3544d759d206a154a51d213c0574.zip | |
fix: add support for YouTube Shorts tab on channel pages
- Rewrite channel_ctoken_v5 with correct protobuf field numbers per tab
(videos=15, shorts=10, streams=14) based on Invidious source
- Replace broken pbj=1 endpoint with youtubei browse API for shorts/streams
- Add shortsLockupViewModel parser to extract video data from new YT format
- Fix channel metadata not loading (get_metadata now uses browse API)
- Fix metadata caching: skip caching when channel_name is absent
- Show actual item count instead of UU playlist count for shorts/streams
- Format view counts with spaced suffixes (7.1 K, 1.2 M, 3 B)
Diffstat (limited to 'youtube/yt_data_extract')
| -rw-r--r-- | youtube/yt_data_extract/common.py | 83 |
1 files changed, 83 insertions, 0 deletions
# Added to youtube/yt_data_extract/common.py by this commit, directly after
# extract_lockup_view_model_info().  The same commit also makes
# extract_item_info() dispatch items whose type is 'shortsLockupViewModel' to
# this function and adds 'shortsLockupViewModel' to the _item_types set.

# Magnitude word found in the accessibility text -> (display suffix, multiplier)
_SHORTS_MAGNITUDES = {
    'thousand': ('K', 10**3),
    'million': ('M', 10**6),
    'billion': ('B', 10**9),
}

# Matches "Title, 1,234 views" / "Title, 7.1 thousand views".  The lazy title
# group combined with the `$` anchor means a comma inside the title is only
# used as the split point when everything after it is a well-formed view count.
_SHORTS_TITLE_VIEWS_RE = re.compile(
    r'^(?P<title>.*?),\s*'
    r'(?P<views>(?P<num>[\d,.]+)\s*(?P<word>thousand|million|billion)?'
    r'\s*views?)$',
    re.IGNORECASE)

# Looser fallback: anything after a comma that ends in "view"/"views".
_SHORTS_TITLE_ANY_VIEWS_RE = re.compile(r'^(.*?),\s*(.+views?)$',
                                        re.IGNORECASE)


def _parse_shorts_view_count(views_text, number_text, magnitude_word):
    """Return (view_count, approx_view_count) for one parsed view-count phrase.

    views_text is the whole phrase (e.g. "7.1 thousand views"), number_text
    its numeric part ("7.1") and magnitude_word the optional word after the
    number ("thousand") or None.
    """
    if magnitude_word:
        suffix, multiplier = _SHORTS_MAGNITUDES[magnitude_word.lower()]
        # An abbreviated count can only yield an approximate integer, but it
        # must include the magnitude.
        # NOTE(review): extract_int() is defined elsewhere; it is presumably a
        # plain integer extractor and would not apply "thousand"/"million",
        # so the scaling is done here instead -- confirm.
        try:
            view_count = round(float(number_text.replace(',', '')) * multiplier)
        except ValueError:
            # e.g. "1.2.3" or a lone "." satisfies [\d,.]+ but is not a number
            view_count = None
        # Display form with a spaced suffix: "7.1 K", "1.2 M", "3 B"
        return view_count, number_text + ' ' + suffix

    digits = number_text.replace(',', '')
    if digits.isdigit():
        approx = '{:,}'.format(int(digits))
    else:
        approx = number_text
    return extract_int(views_text), approx


def extract_shorts_lockup_view_model_info(item, additional_info={}):
    """Extract info from shortsLockupViewModel format (YouTube Shorts).

    item is the shortsLockupViewModel dict.  additional_info is merged into
    the result last, so its keys override anything extracted here; it is only
    read, never mutated.

    Returns a dict with 'type' set to 'video' and 'error' set to None, plus:
      - 'id': from onTap.innertubeCommand.reelWatchEndpoint.videoId, else
        from an entityId of the form 'shorts-shelf-item-<id>', else None.
      - 'thumbnail': first reelWatchEndpoint thumbnail url, passed through
        normalize_url().
      - 'title', 'view_count', 'approx_view_count': parsed from
        accessibilityText ("Title, N views - play Short"), falling back to
        overlayMetadata primaryText / secondaryText.  For abbreviated counts
        ("7.1 thousand views") view_count is the scaled, approximate integer.
      - 'duration' as '', 'badges' as [], and 'time_published',
        'description', 'author', 'author_id', 'author_url', 'index' as None.
    """
    info = {'error': None, 'type': 'video'}

    # Video ID from reelWatchEndpoint or entityId
    info['id'] = deep_get(item,
        'onTap', 'innertubeCommand', 'reelWatchEndpoint', 'videoId')
    if not info['id']:
        entity_id = item.get('entityId')
        # entityId may be absent or null; only strings can carry the prefix
        if (isinstance(entity_id, str)
                and entity_id.startswith('shorts-shelf-item-')):
            info['id'] = entity_id[len('shorts-shelf-item-'):]

    # Thumbnail
    info['thumbnail'] = normalize_url(deep_get(item,
        'onTap', 'innertubeCommand', 'reelWatchEndpoint',
        'thumbnail', 'thumbnails', 0, 'url'))

    # Parse title and views from accessibilityText
    # Format: "Title, N views - play Short"
    info['title'] = ''
    info['view_count'] = None
    info['approx_view_count'] = None
    unparsed_text = ''  # accessibility text we could not split; last resort
    acc_text = item.get('accessibilityText')
    if isinstance(acc_text, str) and acc_text:
        # Remove trailing " - play Short"
        cleaned = re.sub(r'\s*-\s*play Short$', '', acc_text)
        match = _SHORTS_TITLE_VIEWS_RE.match(cleaned)
        if match:
            info['title'] = match.group('title').strip()
            info['view_count'], info['approx_view_count'] = (
                _parse_shorts_view_count(match.group('views'),
                                         match.group('num'),
                                         match.group('word')))
        else:
            # Fallback: try "N views" at end
            loose_match = _SHORTS_TITLE_ANY_VIEWS_RE.match(cleaned)
            if loose_match:
                info['title'] = loose_match.group(1).strip()
                info['approx_view_count'] = extract_approx_int(
                    loose_match.group(2))
            else:
                unparsed_text = cleaned

    # Overlay text (usually has the title too)
    secondary_text = deep_get(item, 'overlayMetadata',
        'secondaryText', 'content')
    if secondary_text and not info['approx_view_count']:
        info['approx_view_count'] = extract_approx_int(secondary_text)

    primary_text = deep_get(item, 'overlayMetadata',
        'primaryText', 'content')
    if not info['title']:
        # Prefer the overlay title over an accessibility string that could
        # not be split, since the latter may still contain the view-count
        # phrase; use the raw string only when nothing else is available.
        info['title'] = primary_text or unparsed_text

    info['duration'] = ''
    info['time_published'] = None
    info['description'] = None
    info['badges'] = []
    info['author'] = None
    info['author_id'] = None
    info['author_url'] = None
    info['index'] = None

    info.update(additional_info)
    return info
