diff options
Diffstat (limited to 'youtube/yt_data_extract/common.py')
| -rw-r--r-- | youtube/yt_data_extract/common.py | 83 |
1 files changed, 83 insertions, 0 deletions
# Magnitude words as they appear in the accessibility text, mapped to the
# compact suffix used for display ("7.1 thousand" -> "7.1 K").
_SHORTS_MAGNITUDE_SUFFIXES = {'thousand': 'K', 'million': 'M', 'billion': 'B'}

# Trailing call-to-action appended to the accessibility text.
_SHORTS_PLAY_SUFFIX_RE = re.compile(r'\s*-\s*play Short$')

# "<title>, <number> [thousand|million|billion] view[s]". The title group is
# non-greedy and the pattern is anchored at the end, so commas inside the
# title are kept and only the final ", N views" part is split off.
_SHORTS_TITLE_VIEWS_RE = re.compile(
    r'^(?P<title>.*?),\s*'
    r'(?P<views>(?P<num>[\d,.]+)\s*(?P<word>thousand|million|billion)?'
    r'\s*views?)$',
    re.IGNORECASE)

# Looser fallback: "<title>, <anything ending in view[s]>".
_SHORTS_TITLE_ANY_VIEWS_RE = re.compile(r'^(.*?),\s*(.+views?)$',
                                        re.IGNORECASE)


def _parse_shorts_accessibility_text(acc_text):
    """Split accessibility text into (title, view_count, approx_view_count).

    Expected format: "Title, N views - play Short". Text that does not fit
    is returned whole as the title with both counts set to None.
    """
    cleaned = _SHORTS_PLAY_SUFFIX_RE.sub('', acc_text)

    match = _SHORTS_TITLE_VIEWS_RE.match(cleaned)
    if match:
        title = match.group('title').strip()
        num = match.group('num')
        word = match.group('word')
        if word:
            # "7.1 thousand" is already rounded, so there is no exact count
            # to report; only the approximate display string is filled in
            # (same policy as the fallback branch below).
            approx = num + ' ' + _SHORTS_MAGNITUDE_SUFFIXES[word.lower()]
            return title, None, approx
        digits = num.replace(',', '')
        # Re-insert thousands separators for plain integers; anything else
        # (e.g. a value containing '.') is shown as written.
        approx = '{:,}'.format(int(digits)) if digits.isdigit() else num
        return title, extract_int(match.group('views')), approx

    fallback = _SHORTS_TITLE_ANY_VIEWS_RE.match(cleaned)
    if fallback:
        return (fallback.group(1).strip(), None,
                extract_approx_int(fallback.group(2)))

    return cleaned, None, None


def extract_shorts_lockup_view_model_info(item, additional_info={}):
    """Extract info from shortsLockupViewModel format (YouTube Shorts).

    extract_item_info dispatches here for items whose type is
    'shortsLockupViewModel'.

    item: the shortsLockupViewModel dict.
    additional_info: extra key/value pairs merged over the result last, so
        they override anything extracted here. It is only read, never
        mutated.

    Returns a dict with the keys 'error', 'type', 'id', 'thumbnail',
    'title', 'view_count', 'approx_view_count', 'duration',
    'time_published', 'description', 'badges', 'author', 'author_id',
    'author_url' and 'index', plus whatever additional_info supplies.
    'view_count' is only set when the text carries a plain number; for
    abbreviated counts ("7.1 thousand views") only 'approx_view_count'
    is set.
    """
    info = {'error': None, 'type': 'video'}

    # Video ID from reelWatchEndpoint or entityId
    info['id'] = deep_get(item,
        'onTap', 'innertubeCommand', 'reelWatchEndpoint', 'videoId')
    if not info['id']:
        entity_id = item.get('entityId')
        prefix = 'shorts-shelf-item-'
        # entityId may be present but null, hence the type check
        if isinstance(entity_id, str) and entity_id.startswith(prefix):
            info['id'] = entity_id[len(prefix):]

    # Thumbnail
    info['thumbnail'] = normalize_url(deep_get(item,
        'onTap', 'innertubeCommand', 'reelWatchEndpoint',
        'thumbnail', 'thumbnails', 0, 'url'))

    # Parse title and views from accessibilityText
    # Format: "Title, N views - play Short"
    info['title'] = ''
    info['view_count'] = None
    info['approx_view_count'] = None
    acc_text = item.get('accessibilityText')
    if acc_text and isinstance(acc_text, str):
        (info['title'],
         info['view_count'],
         info['approx_view_count']) = _parse_shorts_accessibility_text(acc_text)

    # Overlay text is only used to fill in what the accessibility text
    # did not provide.
    overlay_metadata = deep_get(item, 'overlayMetadata',
                                'secondaryText', 'content')
    if overlay_metadata and not info['approx_view_count']:
        info['approx_view_count'] = extract_approx_int(overlay_metadata)

    primary_text = deep_get(item, 'overlayMetadata',
                            'primaryText', 'content')
    if primary_text and not info['title']:
        info['title'] = primary_text

    # Fields this function does not extract from the item; set to empty
    # values so the keys are always present.
    info['duration'] = ''
    info['time_published'] = None
    info['description'] = None
    info['badges'] = []
    info['author'] = None
    info['author_id'] = None
    info['author_url'] = None
    info['index'] = None

    info.update(additional_info)
    return info
