1 files changed, 83 insertions, 0 deletions
diff --git a/youtube/yt_data_extract/common.py b/youtube/yt_data_extract/common.py
index 9a940ea..dce1d30 100644
--- a/youtube/yt_data_extract/common.py
+++ b/youtube/yt_data_extract/common.py
@@ -332,6 +332,84 @@ def extract_lockup_view_model_info(item, additional_info={}):
     return info
 
 
+def extract_shorts_lockup_view_model_info(item, additional_info={}):
+    """Extract video info from a shortsLockupViewModel item (YouTube Shorts).
+
+    Returns the info dict; entries in additional_info override parsed ones.
+    """
+    info = {'error': None, 'type': 'video'}
+
+    # Video ID from reelWatchEndpoint, else from "shorts-shelf-item-<id>"
+    info['id'] = deep_get(item,
+        'onTap', 'innertubeCommand', 'reelWatchEndpoint', 'videoId')
+    if not info['id']:
+        entity_id = item.get('entityId') or ''  # tolerate an explicit null
+        if entity_id.startswith('shorts-shelf-item-'):
+            info['id'] = entity_id[len('shorts-shelf-item-'):]
+
+    # Thumbnail
+    info['thumbnail'] = normalize_url(deep_get(item,
+        'onTap', 'innertubeCommand', 'reelWatchEndpoint',
+        'thumbnail', 'thumbnails', 0, 'url'))
+
+    # Parse title and views from accessibilityText
+    # Format: "Title, N views - play Short"
+    acc_text = item.get('accessibilityText', '')
+    info['title'] = ''
+    info['view_count'] = None
+    info['approx_view_count'] = None
+    if acc_text:
+        # Remove trailing " - play Short"
+        cleaned = re.sub(r'\s*-\s*play Short$', '', acc_text)
+        # Lazy title + anchored tail: split at the first comma followed
+        # only by a view count, so commas inside the title survive.
+        # Groups: title, whole view text, number, optional magnitude word.
+        match = re.match(
+            r'^(.*?),\s*(([\d,.]+)\s*(thousand|million|billion|)\s*views?)$',
+            cleaned, re.IGNORECASE)
+        if match:
+            title, view_text, num, word = match.groups()
+            info['title'] = title.strip()
+            digits = num.replace(',', '')
+            units = {'thousand': ('K', 10**3), 'million': ('M', 10**6),
+                     'billion': ('B', 10**9)}
+            if word:
+                # "7.1 thousand" -> "7.1 K" for display and ~7100 as the count
+                suffix, scale = units[word.lower()]
+                info['approx_view_count'] = num + ' ' + suffix
+                try:
+                    info['view_count'] = round(float(digits) * scale)
+                except ValueError:  # malformed number such as "1.2.3"
+                    pass
+            else:
+                info['view_count'] = extract_int(view_text)
+                # Regroup plain integers with commas; keep anything else as is
+                info['approx_view_count'] = (
+                    '{:,}'.format(int(digits)) if digits.isdigit() else num)
+        else:
+            # Fallback: try "N views" at end
+            match2 = re.match(r'^(.*?),\s*(.+views?)$', cleaned, re.IGNORECASE)
+            if match2:
+                info['title'] = match2.group(1).strip()
+                info['approx_view_count'] = extract_approx_int(match2.group(2))
+            else:
+                info['title'] = cleaned
+
+    # Overlay text fills in whatever accessibilityText did not provide
+    overlay_views = deep_get(item, 'overlayMetadata', 'secondaryText', 'content')
+    if overlay_views and not info['approx_view_count']:
+        info['approx_view_count'] = extract_approx_int(overlay_views)
+    overlay_title = deep_get(item, 'overlayMetadata', 'primaryText', 'content')
+    if overlay_title and not info['title']:
+        info['title'] = overlay_title
+
+    # Remaining fields are not extracted here; set them to empty defaults
+    info.update(duration='', time_published=None, description=None, badges=[],
+                author=None, author_id=None, author_url=None, index=None)
+    info.update(additional_info)
+    return info
+
+
 def extract_item_info(item, additional_info={}):
     if not item:
         return {'error': 'No item given'}
@@ -353,6 +431,10 @@ def extract_item_info(item, additional_info={}):
     if type == 'lockupViewModel':
         return extract_lockup_view_model_info(item, additional_info)
 
+    # Handle shortsLockupViewModel format (YouTube Shorts)
+    if type == 'shortsLockupViewModel':
+        return extract_shorts_lockup_view_model_info(item, additional_info)
+
     # type looks like e.g. 'compactVideoRenderer' or 'gridVideoRenderer'
     # camelCase split, https://stackoverflow.com/a/37697078
     type_parts = [s.lower() for s in re.sub(r'([A-Z][a-z]+)', r' \1', type).split()]
@@ -561,6 +643,7 @@ _item_types = {
 
     # New viewModel format (YouTube 2024+)
     'lockupViewModel',
+    'shortsLockupViewModel',
 }
 
 def _traverse_browse_renderer(renderer):