aboutsummaryrefslogtreecommitdiffstats
path: root/youtube/yt_data_extract/common.py
diff options
context:
space:
mode:
Diffstat (limited to 'youtube/yt_data_extract/common.py')
-rw-r--r-- youtube/yt_data_extract/common.py | 83
1 files changed, 83 insertions, 0 deletions
diff --git a/youtube/yt_data_extract/common.py b/youtube/yt_data_extract/common.py
index 9a940ea..dce1d30 100644
--- a/youtube/yt_data_extract/common.py
+++ b/youtube/yt_data_extract/common.py
@@ -332,6 +332,84 @@ def extract_lockup_view_model_info(item, additional_info={}):
return info
# Magnitude words that appear in Shorts accessibility text, mapped to the
# suffix used for display and the multiplier the word stands for.
_SHORTS_MAGNITUDES = {
    'thousand': ('K', 10**3),
    'million': ('M', 10**6),
    'billion': ('B', 10**9),
}


def _parse_shorts_accessibility_text(acc_text):
    """Split Shorts accessibility text into a title and view counts.

    The expected format is "Title, N views - play Short", where N is either
    a plain number ("1,234") or a number followed by a magnitude word
    ("7.1 thousand").

    Returns a (title, view_count, approx_view_count) tuple. view_count is an
    int or None; approx_view_count is a display string or None. When the
    text carries no recognisable view phrase, the whole text (minus the
    " - play Short" suffix) is returned as the title.
    """
    if not acc_text:
        return '', None, None

    # Remove trailing " - play Short"
    cleaned = re.sub(r'\s*-\s*play Short$', '', acc_text)

    # The title group is lazy but the pattern is anchored at both ends, so
    # the split lands on the comma directly before the view phrase; commas
    # inside the title or inside the number ("1,234") are left alone.
    # The number must start with a digit so that stray punctuation such as
    # "..." is never taken for a count.
    match = re.match(
        r'^(.*?),\s*((\d[\d,.]*)\s*(thousand|million|billion)?\s*views?)$',
        cleaned, re.IGNORECASE)
    if match:
        title = match.group(1).strip()
        view_text, number, word = match.group(2, 3, 4)
        plain_number = number.replace(',', '')

        if word:
            # "7.1 thousand" -> "7.1 K" for display, 7100 as the count.
            # The count is scaled here because the digits alone do not
            # carry the magnitude. round() rather than int() so that float
            # error (e.g. 2.3 * 10**6 == 2299999.9999999995) does not
            # shave one off the result.
            suffix, multiplier = _SHORTS_MAGNITUDES[word.lower()]
            approx_view_count = number + ' ' + suffix
            try:
                view_count = round(float(plain_number) * multiplier)
            except ValueError:
                # e.g. "1.2.3 thousand": not a number we can scale
                view_count = None
        else:
            view_count = extract_int(view_text)
            if plain_number.isdigit():
                approx_view_count = '{:,}'.format(int(plain_number))
            else:
                approx_view_count = number
        return title, view_count, approx_view_count

    # Fallback: some other "... views" phrase at the end. The view phrase
    # may not contain a comma, which forces the split onto the LAST comma
    # so that titles containing commas are not truncated.
    fallback = re.match(r'^(.*),\s*([^,]+views?)$', cleaned, re.IGNORECASE)
    if fallback:
        return (fallback.group(1).strip(), None,
                extract_approx_int(fallback.group(2)))

    return cleaned, None, None


def extract_shorts_lockup_view_model_info(item, additional_info={}):
    """Extract info from shortsLockupViewModel format (YouTube Shorts)

    item is the shortsLockupViewModel dict. additional_info is merged into
    the result last, so its keys override anything extracted here; it is
    only read, never modified.

    Returns a dict with the keys 'error', 'type', 'id', 'thumbnail',
    'title', 'view_count', 'approx_view_count', 'duration',
    'time_published', 'description', 'badges', 'author', 'author_id',
    'author_url' and 'index', plus whatever additional_info supplies.
    Fields that this format does not provide are set to None (or an empty
    string/list).
    """
    info = {'error': None, 'type': 'video'}

    # Video ID from reelWatchEndpoint or entityId
    info['id'] = deep_get(item,
        'onTap', 'innertubeCommand', 'reelWatchEndpoint', 'videoId')
    if not info['id']:
        # entityId may be absent or explicitly null; both mean "no id here"
        entity_id = item.get('entityId') or ''
        prefix = 'shorts-shelf-item-'
        if entity_id.startswith(prefix):
            info['id'] = entity_id[len(prefix):]

    # Thumbnail
    info['thumbnail'] = normalize_url(deep_get(item,
        'onTap', 'innertubeCommand', 'reelWatchEndpoint',
        'thumbnail', 'thumbnails', 0, 'url'))

    # Parse title and views from accessibilityText
    # Format: "Title, N views - play Short"
    title, view_count, approx_view_count = _parse_shorts_accessibility_text(
        item.get('accessibilityText', ''))
    info['title'] = title
    info['view_count'] = view_count
    info['approx_view_count'] = approx_view_count

    # Overlay text is only consulted for values the accessibility text
    # did not yield.
    overlay_metadata = deep_get(item, 'overlayMetadata',
        'secondaryText', 'content')
    if overlay_metadata and not info['approx_view_count']:
        info['approx_view_count'] = extract_approx_int(overlay_metadata)

    primary_text = deep_get(item, 'overlayMetadata',
        'primaryText', 'content')
    if primary_text and not info['title']:
        info['title'] = primary_text

    # Not available in this format
    info['duration'] = ''
    info['time_published'] = None
    info['description'] = None
    info['badges'] = []
    info['author'] = None
    info['author_id'] = None
    info['author_url'] = None
    info['index'] = None

    info.update(additional_info)
    return info
+
+
def extract_item_info(item, additional_info={}):
if not item:
return {'error': 'No item given'}
@@ -353,6 +431,10 @@ def extract_item_info(item, additional_info={}):
if type == 'lockupViewModel':
return extract_lockup_view_model_info(item, additional_info)
+ # Handle shortsLockupViewModel format (YouTube Shorts)
+ if type == 'shortsLockupViewModel':
+ return extract_shorts_lockup_view_model_info(item, additional_info)
+
# type looks like e.g. 'compactVideoRenderer' or 'gridVideoRenderer'
# camelCase split, https://stackoverflow.com/a/37697078
type_parts = [s.lower() for s in re.sub(r'([A-Z][a-z]+)', r' \1', type).split()]
@@ -561,6 +643,7 @@ _item_types = {
# New viewModel format (YouTube 2024+)
'lockupViewModel',
+ 'shortsLockupViewModel',
}
def _traverse_browse_renderer(renderer):