diff options
Diffstat (limited to 'youtube/yt_data_extract')
-rw-r--r-- | youtube/yt_data_extract/common.py | 47 | ||||
-rw-r--r-- | youtube/yt_data_extract/everything_else.py | 2 |
2 files changed, 48 insertions, 1 deletions
diff --git a/youtube/yt_data_extract/common.py b/youtube/yt_data_extract/common.py index fcefbf7..5680b16 100644 --- a/youtube/yt_data_extract/common.py +++ b/youtube/yt_data_extract/common.py @@ -249,6 +249,9 @@ def extract_item_info(item, additional_info={}): primary_type = type_parts[-2] if primary_type == 'video': info['type'] = 'video' + elif type_parts[0] == 'reel': # shorts + info['type'] = 'video' + primary_type = 'short' elif primary_type in ('playlist', 'radio', 'show'): info['type'] = 'playlist' info['playlist_type'] = primary_type @@ -343,6 +346,48 @@ def extract_item_info(item, additional_info={}): else: info['index'] = None + elif primary_type == 'short': + info['id'] = item.get('videoId') + if not info['id']: + info['id'] = deep_get(item,'navigationEndpoint', + 'reelWatchEndpoint', 'videoId') + info['approx_view_count'] = extract_approx_int(item.get('viewCountText')) + + # handle case where it is "No views" + if not info['approx_view_count']: + if ('No views' in item.get('shortViewCountText', '') + or 'no views' in accessibility_label.lower()): + info['view_count'] = 0 + info['approx_view_count'] = '0' + + # dig into accessibility data to get duration for shorts + accessibility_label = multi_deep_get(item, + ['accessibility', 'accessibilityData', 'label'], + default='') + + duration = re.search(r'(\d+) (second|seconds|minute) - play video', + accessibility_label) + if duration.group(2) == 'minute': + info['duration'] = "1:00" + else: + info['duration'] = "0:" + duration.group(1).zfill(2) + + # if it's an item in a playlist, get its index + if 'index' in item: # url has wrong index on playlist page + info['index'] = extract_int(item.get('index')) + elif 'indexText' in item: + # Current item in playlist has ▶ instead of the actual index, must + # dig into url + match = re.search(r'index=(\d+)', deep_get(item, + 'navigationEndpoint', 'commandMetadata', 'webCommandMetadata', + 'url', default='')) + if match is None: # worth a try then + info['index'] = extract_int(item.get('indexText')) + else: + info['index'] = int(match.group(1)) + else: + info['index'] = None + elif primary_type in ('playlist', 'radio'): info['id'] = item.get('playlistId') info['video_count'] = extract_int(item.get('videoCount')) @@ -398,6 +443,8 @@ _item_types = { 'gridVideoRenderer', 'playlistVideoRenderer', + 'reelItemRenderer', + 'playlistRenderer', 'compactPlaylistRenderer', 'gridPlaylistRenderer', diff --git a/youtube/yt_data_extract/everything_else.py b/youtube/yt_data_extract/everything_else.py index 9a6e31a..745d08f 100644 --- a/youtube/yt_data_extract/everything_else.py +++ b/youtube/yt_data_extract/everything_else.py @@ -73,7 +73,7 @@ def extract_channel_info(polymer_json, tab, continuation=False): #if 'contents' not in response and 'continuationContents' not in response: # return info - if tab in ('videos', 'playlists', 'search'): + if tab in ('videos', 'shorts', 'streams', 'playlists', 'search'): items, ctoken = extract_items(response) additional_info = { 'author': info['channel_name'], |