diff options
author | James Taylor <28744867+user234683@users.noreply.github.com> | 2020-10-21 18:53:12 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-10-21 18:53:12 -0700 |
commit | aa52c7a42e9573105dfadb07981c7f5f1447ca9d (patch) | |
tree | 6bff6ae507db03e435bb04e3969ef08093f3f8d8 /youtube/yt_data_extract/common.py | |
parent | 3b5df36b0310b751fc25f8c0b7167c659c8259de (diff) | |
parent | f01ef36a37c9112eca3f85d49622c41d68000a69 (diff) | |
download | yt-local-aa52c7a42e9573105dfadb07981c7f5f1447ca9d.tar.lz yt-local-aa52c7a42e9573105dfadb07981c7f5f1447ca9d.tar.xz yt-local-aa52c7a42e9573105dfadb07981c7f5f1447ca9d.zip |
Merge branch 'master' into add_sponsorblock
Diffstat (limited to 'youtube/yt_data_extract/common.py')
-rw-r--r-- | youtube/yt_data_extract/common.py | 19 |
1 files changed, 12 insertions, 7 deletions
diff --git a/youtube/yt_data_extract/common.py b/youtube/yt_data_extract/common.py
index 2d3b637..683b1c6 100644
--- a/youtube/yt_data_extract/common.py
+++ b/youtube/yt_data_extract/common.py
@@ -90,15 +90,20 @@ def remove_redirect(url):
         return urllib.parse.parse_qs(query_string)['q'][0]
     return url
 
-youtube_url_re = re.compile(r'^(?:(?:(?:https?:)?//)?(?:www\.)?youtube\.com)?(/.*)$')
+norm_url_re = re.compile(r'^(?:(?:https?:)?//)?((?:[\w-]+\.)+[\w-]+)?(/.*)$')
 def normalize_url(url):
+    '''Insert https, resolve relative paths for youtube.com, and put www. infront of youtube.com'''
     if url is None:
         return None
-    match = youtube_url_re.fullmatch(url)
+    match = norm_url_re.fullmatch(url)
     if match is None:
-        raise Exception()
+        raise Exception(url)
 
-    return 'https://www.youtube.com' + match.group(1)
+    domain = match.group(1) or 'www.youtube.com'
+    if domain == 'youtube.com':
+        domain = 'www.youtube.com'
+
+    return 'https://' + domain + match.group(2)
 
 def _recover_urls(runs):
     for run in runs:
@@ -240,11 +245,11 @@ def extract_item_info(item, additional_info={}):
     ))
     info['author_url'] = ('https://www.youtube.com/channel/' + info['author_id']) if info['author_id'] else None
     info['description'] = extract_formatted_text(multi_get(item, 'descriptionSnippet', 'descriptionText'))
-    info['thumbnail'] = multi_deep_get(item,
+    info['thumbnail'] = normalize_url(multi_deep_get(item,
         ['thumbnail', 'thumbnails', 0, 'url'], # videos
         ['thumbnails', 0, 'thumbnails', 0, 'url'], # playlists
         ['thumbnailRenderer', 'showCustomThumbnailRenderer', 'thumbnail', 'thumbnails', 0, 'url'], # shows
-    )
+    ))
 
     info['badges'] = []
     for badge_node in multi_get(item, 'badges', 'ownerBadges', default=()):
@@ -290,7 +295,7 @@ def extract_item_info(item, additional_info={}):
         info['duration'] = extract_str(item.get('lengthText'))
 
         # if it's an item in a playlist, get its index
-        if 'index' in item: # url has wrong index on playlist page
+        if 'index' in item: # url has wrong index on playlist page
             info['index'] = extract_int(item.get('index'))
         elif 'indexText' in item:
             # Current item in playlist has ▶ instead of the actual index, must |