Diffstat (limited to 'youtube/yt_data_extract')
-rw-r--r--   youtube/yt_data_extract/common.py            | 19
-rw-r--r--   youtube/yt_data_extract/everything_else.py   | 14
-rw-r--r--   youtube/yt_data_extract/watch_extraction.py  | 12
3 files changed, 28 insertions, 17 deletions
diff --git a/youtube/yt_data_extract/common.py b/youtube/yt_data_extract/common.py
index 2d3b637..683b1c6 100644
--- a/youtube/yt_data_extract/common.py
+++ b/youtube/yt_data_extract/common.py
@@ -90,15 +90,20 @@ def remove_redirect(url):
         return urllib.parse.parse_qs(query_string)['q'][0]
     return url
 
-youtube_url_re = re.compile(r'^(?:(?:(?:https?:)?//)?(?:www\.)?youtube\.com)?(/.*)$')
+norm_url_re = re.compile(r'^(?:(?:https?:)?//)?((?:[\w-]+\.)+[\w-]+)?(/.*)$')
 def normalize_url(url):
+    '''Insert https, resolve relative paths for youtube.com, and put www. infront of youtube.com'''
     if url is None:
         return None
-    match = youtube_url_re.fullmatch(url)
+    match = norm_url_re.fullmatch(url)
     if match is None:
-        raise Exception()
+        raise Exception(url)
 
-    return 'https://www.youtube.com' + match.group(1)
+    domain = match.group(1) or 'www.youtube.com'
+    if domain == 'youtube.com':
+        domain = 'www.youtube.com'
+
+    return 'https://' + domain + match.group(2)
 
 def _recover_urls(runs):
     for run in runs:
@@ -240,11 +245,11 @@ def extract_item_info(item, additional_info={}):
         ))
     info['author_url'] = ('https://www.youtube.com/channel/' + info['author_id']) if info['author_id'] else None
     info['description'] = extract_formatted_text(multi_get(item, 'descriptionSnippet', 'descriptionText'))
-    info['thumbnail'] = multi_deep_get(item,
+    info['thumbnail'] = normalize_url(multi_deep_get(item,
         ['thumbnail', 'thumbnails', 0, 'url'],      # videos
         ['thumbnails', 0, 'thumbnails', 0, 'url'],  # playlists
         ['thumbnailRenderer', 'showCustomThumbnailRenderer', 'thumbnail', 'thumbnails', 0, 'url'], # shows
-    )
+    ))
 
     info['badges'] = []
     for badge_node in multi_get(item, 'badges', 'ownerBadges', default=()):
@@ -290,7 +295,7 @@ def extract_item_info(item, additional_info={}):
             info['duration'] = extract_str(item.get('lengthText'))
 
         # if it's an item in a playlist, get its index
-        if 'index' in item: # url has wrong index on playlist page
+        if 'index' in item:  # url has wrong index on playlist page
             info['index'] = extract_int(item.get('index'))
         elif 'indexText' in item:
             # Current item in playlist has ▶ instead of the actual index, must
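
The core of this change is the generalized normalize_url above: the old regex only understood youtube.com URLs and relative paths, while the new one accepts any domain, so protocol-relative thumbnail and avatar URLs on hosts like ytimg.com or ggpht.com can be normalized too. A minimal, self-contained sketch of the patched behavior; the test inputs are illustrative assumptions, not values taken from the commit:

import re

# Regex as added in common.py: optional scheme, optional domain, required path.
norm_url_re = re.compile(r'^(?:(?:https?:)?//)?((?:[\w-]+\.)+[\w-]+)?(/.*)$')

def normalize_url(url):
    # Mirrors the patched function: add https, default bare paths to
    # www.youtube.com, and rewrite youtube.com to www.youtube.com.
    if url is None:
        return None
    match = norm_url_re.fullmatch(url)
    if match is None:
        raise Exception(url)

    domain = match.group(1) or 'www.youtube.com'
    if domain == 'youtube.com':
        domain = 'www.youtube.com'

    return 'https://' + domain + match.group(2)

# Illustrative inputs (assumed):
print(normalize_url('/watch?v=abc123'))
# https://www.youtube.com/watch?v=abc123
print(normalize_url('//i.ytimg.com/vi/abc123/hqdefault.jpg'))
# https://i.ytimg.com/vi/abc123/hqdefault.jpg
# (the old youtube-only regex would have mis-prefixed this with www.youtube.com)
print(normalize_url('https://youtube.com/feed/subscriptions'))
# https://www.youtube.com/feed/subscriptions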
diff --git a/youtube/yt_data_extract/everything_else.py b/youtube/yt_data_extract/everything_else.py
index 5bb8709..b4b612d 100644
--- a/youtube/yt_data_extract/everything_else.py
+++ b/youtube/yt_data_extract/everything_else.py
@@ -49,10 +49,10 @@ def extract_channel_info(polymer_json, tab):
     if info['short_description'] and len(info['short_description']) > 730:
         info['short_description'] = info['short_description'][0:730] + '...'
     info['channel_name'] = metadata.get('title')
-    info['avatar'] = multi_deep_get(metadata,
+    info['avatar'] = normalize_url(multi_deep_get(metadata,
         ['avatar', 'thumbnails', 0, 'url'],
         ['thumbnail', 'thumbnails', 0, 'url'],
-    )
+    ))
     channel_url = multi_get(metadata, 'urlCanonical', 'channelUrl')
     if channel_url:
         channel_id = get(channel_url.rstrip('/').split('/'), -1)
@@ -164,7 +164,7 @@ def extract_playlist_metadata(polymer_json):
     metadata['video_count'] = extract_int(header.get('numVideosText'))
     metadata['description'] = extract_str(header.get('descriptionText'), default='')
     metadata['author'] = extract_str(header.get('ownerText'))
-    metadata['author_id'] = multi_deep_get(header, 
+    metadata['author_id'] = multi_deep_get(header,
         ['ownerText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'],
         ['ownerEndpoint', 'browseEndpoint', 'browseId'])
     if metadata['author_id']:
@@ -263,13 +263,13 @@ def extract_comments_info(polymer_json):
 
             # These 3 are sometimes absent, likely because the channel was deleted
             comment_info['author'] = extract_str(comment_renderer.get('authorText'))
-            comment_info['author_url'] = deep_get(comment_renderer,
-                'authorEndpoint', 'commandMetadata', 'webCommandMetadata', 'url')
+            comment_info['author_url'] = normalize_url(deep_get(comment_renderer,
+                'authorEndpoint', 'commandMetadata', 'webCommandMetadata', 'url'))
             comment_info['author_id'] = deep_get(comment_renderer,
                 'authorEndpoint', 'browseEndpoint', 'browseId')
-            comment_info['author_avatar'] = deep_get(comment_renderer,
-                'authorThumbnail', 'thumbnails', 0, 'url')
+            comment_info['author_avatar'] = normalize_url(deep_get(
+                comment_renderer, 'authorThumbnail', 'thumbnails', 0, 'url'))
             comment_info['id'] = comment_renderer.get('commentId')
             comment_info['text'] = extract_formatted_text(comment_renderer.get('contentText'))
             comment_info['time_published'] = extract_str(comment_renderer.get('publishedTimeText'))
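
In everything_else.py the same helper is now applied to channel avatars and to comment author links and avatars, which the innertube JSON tends to deliver as protocol-relative or site-relative URLs. A rough sketch of the effect, using a hand-rolled stand-in for the project's deep_get helper (its behavior is only inferred from its usage in the diff) and a made-up commentRenderer fragment; normalize_url here is the function sketched after the common.py diff above:

# Stand-in for the project's deep_get, inferred from usage above: walk nested
# dicts/lists by key or index, returning None when any step is missing.
def deep_get(obj, *keys, default=None):
    for key in keys:
        try:
            obj = obj[key]
        except (KeyError, IndexError, TypeError):
            return default
    return obj

# Hypothetical commentRenderer fragment; the key paths follow those used in the
# diff, the values are invented for illustration.
comment_renderer = {
    'authorThumbnail': {'thumbnails': [{'url': '//yt3.ggpht.com/ytc/example=s48'}]},
    'authorEndpoint': {
        'commandMetadata': {'webCommandMetadata': {'url': '/channel/UCexample'}},
        'browseEndpoint': {'browseId': 'UCexample'},
    },
}

raw_avatar = deep_get(comment_renderer, 'authorThumbnail', 'thumbnails', 0, 'url')
raw_author_url = deep_get(comment_renderer,
    'authorEndpoint', 'commandMetadata', 'webCommandMetadata', 'url')

# Before this commit the raw values were stored directly; now they pass through
# normalize_url first:
print(normalize_url(raw_avatar))       # https://yt3.ggpht.com/ytc/example=s48
print(normalize_url(raw_author_url))   # https://www.youtube.com/channel/UCexample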
diff --git a/youtube/yt_data_extract/watch_extraction.py b/youtube/yt_data_extract/watch_extraction.py
index 340a367..5e57c15 100644
--- a/youtube/yt_data_extract/watch_extraction.py
+++ b/youtube/yt_data_extract/watch_extraction.py
@@ -172,7 +172,7 @@ def _extract_watch_info_mobile(top_level):
     else:
         info['playlist'] = {}
         info['playlist']['title'] = playlist.get('title')
-        info['playlist']['author'] = extract_str(multi_get(playlist, 
+        info['playlist']['author'] = extract_str(multi_get(playlist,
             'ownerName', 'longBylineText', 'shortBylineText', 'ownerText'))
         author_id = deep_get(playlist, 'longBylineText', 'runs', 0,
             'navigationEndpoint', 'browseEndpoint', 'browseId')
@@ -447,7 +447,8 @@ def _extract_playability_error(info, player_response, error_prefix=''):
 SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt')
 
 def extract_watch_info(polymer_json):
-    info = {'playability_error': None, 'error': None}
+    info = {'playability_error': None, 'error': None,
+            'player_response_missing': None}
 
     if isinstance(polymer_json, dict):
         top_level = polymer_json
@@ -477,6 +478,10 @@ def extract_watch_info(polymer_json):
     else:
         embedded_player_response = {}
 
+    # see https://github.com/user234683/youtube-local/issues/22#issuecomment-706395160
+    info['player_response_missing'] = not (
+        player_response or embedded_player_response)
+
     # captions
     info['automatic_caption_languages'] = []
     info['manual_caption_languages'] = []
@@ -580,7 +585,8 @@ def get_caption_url(info, language, format, automatic=False, translation_languag
     return url
 
 def update_with_age_restricted_info(info, video_info_page):
-    ERROR_PREFIX = 'Error bypassing age-restriction: '
+    '''Inserts urls from 'player_response' in get_video_info page'''
+    ERROR_PREFIX = 'Error getting missing player or bypassing age-restriction: '
 
     video_info = urllib.parse.parse_qs(video_info_page)
     player_response = deep_get(video_info, 'player_response', 0)
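
The new player_response_missing flag lets callers detect when the watch-page JSON contained neither a top-level nor an embedded player response (see the linked issue comment) and fetch the data from the get_video_info page instead, which update_with_age_restricted_info, per its new docstring, then merges into info. A hypothetical caller-side sketch; the import path, request URL, and query parameters are assumptions for illustration, since the actual fallback lives in youtube-local's watch-page code rather than in this diff:

import urllib.parse
import urllib.request

# Assumed import path; the patched functions live in
# youtube/yt_data_extract/watch_extraction.py.
from youtube.yt_data_extract import watch_extraction

def get_watch_info(polymer_json, video_id):
    info = watch_extraction.extract_watch_info(polymer_json)

    # New in this commit: True when neither the top-level nor the embedded
    # player response was present in the watch-page JSON.
    if info['player_response_missing']:
        # Illustrative request; the exact query parameters are an assumption.
        url = ('https://www.youtube.com/get_video_info?'
               + urllib.parse.urlencode({'video_id': video_id}))
        with urllib.request.urlopen(url) as response:
            video_info_page = response.read().decode('utf-8')
        # Parses the urlencoded page, extracts 'player_response', and fills in
        # the missing urls; errors are reported under the new
        # 'Error getting missing player or bypassing age-restriction: ' prefix.
        watch_extraction.update_with_age_restricted_info(info, video_info_page)

    return info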