diff options
Diffstat (limited to 'youtube/yt_data_extract')
-rw-r--r-- | youtube/yt_data_extract/common.py | 26 | ||||
-rw-r--r-- | youtube/yt_data_extract/watch_extraction.py | 34 |
2 files changed, 58 insertions, 2 deletions
diff --git a/youtube/yt_data_extract/common.py b/youtube/yt_data_extract/common.py
index 877444e..974d981 100644
--- a/youtube/yt_data_extract/common.py
+++ b/youtube/yt_data_extract/common.py
@@ -73,6 +73,15 @@ def conservative_update(obj, key, value):
     if obj.get(key) is None:
         obj[key] = value
 
+def concat_or_none(*strings):
+    '''Concatenates strings. Returns None if any of the arguments are None'''
+    result = ''
+    for string in strings:
+        if string is None:
+            return None
+        result += string
+    return result
+
 def remove_redirect(url):
     if url is None:
         return None
@@ -268,6 +277,23 @@ def extract_item_info(item, additional_info={}):
                 info['approx_view_count'] = '0'
 
         info['duration'] = extract_str(item.get('lengthText'))
+
+        # if it's an item in a playlist, get its index
+        if 'index' in item: # url has wrong index on playlist page
+            info['index'] = extract_int(item.get('index'))
+        elif 'indexText' in item:
+            # Current item in playlist has ▶ instead of the actual index, must
+            # dig into url
+            match = re.search(r'index=(\d+)', deep_get(item,
+                'navigationEndpoint', 'commandMetadata', 'webCommandMetadata',
+                'url', default=''))
+            if match is None: # worth a try then
+                info['index'] = extract_int(item.get('indexText'))
+            else:
+                info['index'] = int(match.group(1))
+        else:
+            info['index'] = None
+
     elif primary_type in ('playlist', 'radio'):
         info['id'] = item.get('playlistId')
         info['video_count'] = extract_int(item.get('videoCount'))
diff --git a/youtube/yt_data_extract/watch_extraction.py b/youtube/yt_data_extract/watch_extraction.py
index bc02313..0b30c91 100644
--- a/youtube/yt_data_extract/watch_extraction.py
+++ b/youtube/yt_data_extract/watch_extraction.py
@@ -2,7 +2,7 @@ from .common import (get, multi_get, deep_get, multi_deep_get,
     liberal_update, conservative_update, remove_redirect, normalize_url,
     extract_str, extract_formatted_text, extract_int, extract_approx_int,
     extract_date, check_missing_keys, extract_item_info, extract_items,
-    extract_response)
+    extract_response, concat_or_none)
 
 import json
 import urllib.parse
@@ -160,7 +160,37 @@ def _extract_watch_info_mobile(top_level):
 
     response = top_level.get('response', {})
 
-    # video info from metadata renderers
+    # this renderer has the stuff visible on the page
+    # check for playlist
+    items, _ = extract_items(response,
+        item_types={'singleColumnWatchNextResults'})
+    if items:
+        watch_next_results = items[0]['singleColumnWatchNextResults']
+        playlist = deep_get(watch_next_results, 'playlist', 'playlist')
+        if playlist is None:
+            info['playlist'] = None
+        else:
+            info['playlist'] = {}
+            info['playlist']['title'] = playlist.get('title')
+            info['playlist']['author'] = extract_str(multi_get(playlist,
+                'ownerName', 'longBylineText', 'shortBylineText', 'ownerText'))
+            author_id = deep_get(playlist, 'longBylineText', 'runs', 0,
+                'navigationEndpoint', 'browseEndpoint', 'browseId')
+            info['playlist']['author_id'] = author_id
+            if author_id:
+                info['playlist']['author_url'] = concat_or_none(
+                    'https://www.youtube.com/channel/', author_id)
+            info['playlist']['id'] = playlist.get('playlistId')
+            info['playlist']['url'] = concat_or_none(
+                'https://www.youtube.com/playlist?list=',
+                info['playlist']['id'])
+            info['playlist']['video_count'] = playlist.get('totalVideos')
+            info['playlist']['current_index'] = playlist.get('currentIndex')
+            info['playlist']['items'] = [
+                extract_item_info(i) for i in playlist.get('contents', ())]
+
+    # Holds the visible video info. It is inside singleColumnWatchNextResults
+    # but use our convenience function instead
     items, _ = extract_items(response, item_types={'slimVideoMetadataRenderer'})
     if items:
         video_info = items[0]['slimVideoMetadataRenderer']