about summary refs log tree commit diff stats
path: root/youtube/yt_data_extract
diff options
context:
space:
mode:
author James Taylor <28744867+user234683@users.noreply.github.com> 2020-10-21 18:53:12 -0700
committer GitHub <noreply@github.com> 2020-10-21 18:53:12 -0700
commit aa52c7a42e9573105dfadb07981c7f5f1447ca9d (patch)
tree 6bff6ae507db03e435bb04e3969ef08093f3f8d8 /youtube/yt_data_extract
parent 3b5df36b0310b751fc25f8c0b7167c659c8259de (diff)
parent f01ef36a37c9112eca3f85d49622c41d68000a69 (diff)
download yt-local-aa52c7a42e9573105dfadb07981c7f5f1447ca9d.tar.lz
yt-local-aa52c7a42e9573105dfadb07981c7f5f1447ca9d.tar.xz
yt-local-aa52c7a42e9573105dfadb07981c7f5f1447ca9d.zip
Merge branch 'master' into add_sponsorblock
Diffstat (limited to 'youtube/yt_data_extract')
-rw-r--r-- youtube/yt_data_extract/common.py 19
-rw-r--r-- youtube/yt_data_extract/everything_else.py 14
-rw-r--r-- youtube/yt_data_extract/watch_extraction.py 12
3 files changed, 28 insertions, 17 deletions
diff --git a/youtube/yt_data_extract/common.py b/youtube/yt_data_extract/common.py
index 2d3b637..683b1c6 100644
--- a/youtube/yt_data_extract/common.py
+++ b/youtube/yt_data_extract/common.py
@@ -90,15 +90,20 @@ def remove_redirect(url):
return urllib.parse.parse_qs(query_string)['q'][0]
return url
-youtube_url_re = re.compile(r'^(?:(?:(?:https?:)?//)?(?:www\.)?youtube\.com)?(/.*)$')
+norm_url_re = re.compile(r'^(?:(?:https?:)?//)?((?:[\w-]+\.)+[\w-]+)?(/.*)$')
def normalize_url(url):
+ '''Insert https, resolve relative paths for youtube.com, and put www. in front of youtube.com'''
if url is None:
return None
- match = youtube_url_re.fullmatch(url)
+ match = norm_url_re.fullmatch(url)
if match is None:
- raise Exception()
+ raise Exception(url)
- return 'https://www.youtube.com' + match.group(1)
+ domain = match.group(1) or 'www.youtube.com'
+ if domain == 'youtube.com':
+ domain = 'www.youtube.com'
+
+ return 'https://' + domain + match.group(2)
def _recover_urls(runs):
for run in runs:
@@ -240,11 +245,11 @@ def extract_item_info(item, additional_info={}):
))
info['author_url'] = ('https://www.youtube.com/channel/' + info['author_id']) if info['author_id'] else None
info['description'] = extract_formatted_text(multi_get(item, 'descriptionSnippet', 'descriptionText'))
- info['thumbnail'] = multi_deep_get(item,
+ info['thumbnail'] = normalize_url(multi_deep_get(item,
['thumbnail', 'thumbnails', 0, 'url'], # videos
['thumbnails', 0, 'thumbnails', 0, 'url'], # playlists
['thumbnailRenderer', 'showCustomThumbnailRenderer', 'thumbnail', 'thumbnails', 0, 'url'], # shows
- )
+ ))
info['badges'] = []
for badge_node in multi_get(item, 'badges', 'ownerBadges', default=()):
@@ -290,7 +295,7 @@ def extract_item_info(item, additional_info={}):
info['duration'] = extract_str(item.get('lengthText'))
# if it's an item in a playlist, get its index
- if 'index' in item: # url has wrong index on playlist page
+ if 'index' in item: # url has wrong index on playlist page
info['index'] = extract_int(item.get('index'))
elif 'indexText' in item:
# Current item in playlist has ▶ instead of the actual index, must
diff --git a/youtube/yt_data_extract/everything_else.py b/youtube/yt_data_extract/everything_else.py
index 5bb8709..b4b612d 100644
--- a/youtube/yt_data_extract/everything_else.py
+++ b/youtube/yt_data_extract/everything_else.py
@@ -49,10 +49,10 @@ def extract_channel_info(polymer_json, tab):
if info['short_description'] and len(info['short_description']) > 730:
info['short_description'] = info['short_description'][0:730] + '...'
info['channel_name'] = metadata.get('title')
- info['avatar'] = multi_deep_get(metadata,
+ info['avatar'] = normalize_url(multi_deep_get(metadata,
['avatar', 'thumbnails', 0, 'url'],
['thumbnail', 'thumbnails', 0, 'url'],
- )
+ ))
channel_url = multi_get(metadata, 'urlCanonical', 'channelUrl')
if channel_url:
channel_id = get(channel_url.rstrip('/').split('/'), -1)
@@ -164,7 +164,7 @@ def extract_playlist_metadata(polymer_json):
metadata['video_count'] = extract_int(header.get('numVideosText'))
metadata['description'] = extract_str(header.get('descriptionText'), default='')
metadata['author'] = extract_str(header.get('ownerText'))
- metadata['author_id'] = multi_deep_get(header,
+ metadata['author_id'] = multi_deep_get(header,
['ownerText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'],
['ownerEndpoint', 'browseEndpoint', 'browseId'])
if metadata['author_id']:
@@ -263,13 +263,13 @@ def extract_comments_info(polymer_json):
# These 3 are sometimes absent, likely because the channel was deleted
comment_info['author'] = extract_str(comment_renderer.get('authorText'))
- comment_info['author_url'] = deep_get(comment_renderer,
- 'authorEndpoint', 'commandMetadata', 'webCommandMetadata', 'url')
+ comment_info['author_url'] = normalize_url(deep_get(comment_renderer,
+ 'authorEndpoint', 'commandMetadata', 'webCommandMetadata', 'url'))
comment_info['author_id'] = deep_get(comment_renderer,
'authorEndpoint', 'browseEndpoint', 'browseId')
- comment_info['author_avatar'] = deep_get(comment_renderer,
- 'authorThumbnail', 'thumbnails', 0, 'url')
+ comment_info['author_avatar'] = normalize_url(deep_get(
+ comment_renderer, 'authorThumbnail', 'thumbnails', 0, 'url'))
comment_info['id'] = comment_renderer.get('commentId')
comment_info['text'] = extract_formatted_text(comment_renderer.get('contentText'))
comment_info['time_published'] = extract_str(comment_renderer.get('publishedTimeText'))
diff --git a/youtube/yt_data_extract/watch_extraction.py b/youtube/yt_data_extract/watch_extraction.py
index 340a367..5e57c15 100644
--- a/youtube/yt_data_extract/watch_extraction.py
+++ b/youtube/yt_data_extract/watch_extraction.py
@@ -172,7 +172,7 @@ def _extract_watch_info_mobile(top_level):
else:
info['playlist'] = {}
info['playlist']['title'] = playlist.get('title')
- info['playlist']['author'] = extract_str(multi_get(playlist,
+ info['playlist']['author'] = extract_str(multi_get(playlist,
'ownerName', 'longBylineText', 'shortBylineText', 'ownerText'))
author_id = deep_get(playlist, 'longBylineText', 'runs', 0,
'navigationEndpoint', 'browseEndpoint', 'browseId')
@@ -447,7 +447,8 @@ def _extract_playability_error(info, player_response, error_prefix=''):
SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt')
def extract_watch_info(polymer_json):
- info = {'playability_error': None, 'error': None}
+ info = {'playability_error': None, 'error': None,
+ 'player_response_missing': None}
if isinstance(polymer_json, dict):
top_level = polymer_json
@@ -477,6 +478,10 @@ def extract_watch_info(polymer_json):
else:
embedded_player_response = {}
+ # see https://github.com/user234683/youtube-local/issues/22#issuecomment-706395160
+ info['player_response_missing'] = not (
+ player_response or embedded_player_response)
+
# captions
info['automatic_caption_languages'] = []
info['manual_caption_languages'] = []
@@ -580,7 +585,8 @@ def get_caption_url(info, language, format, automatic=False, translation_languag
return url
def update_with_age_restricted_info(info, video_info_page):
- ERROR_PREFIX = 'Error bypassing age-restriction: '
+ '''Inserts urls from 'player_response' in get_video_info page'''
+ ERROR_PREFIX = 'Error getting missing player or bypassing age-restriction: '
video_info = urllib.parse.parse_qs(video_info_page)
player_response = deep_get(video_info, 'player_response', 0)