yt_data_extract: normalize thumbnail and author urls

For instance, URLs that start with // become https://. An adjustment was required in comments.py because the URL was left as a relative URL in yt_data_extract by mistake, and comments.py was prepending URL_ORIGIN as a workaround. See #31.
author: James Taylor <user234683@users.noreply.github.com> 2020-10-19 12:55:03 -0700
committer: James Taylor <user234683@users.noreply.github.com> 2020-10-19 12:55:03 -0700
commit: 75e8930958ea305a26a4652ef88639f7ad5db356 (patch)
tree: f92747b085110a05c11dafba66e012066d21659a /youtube/yt_data_extract/common.py
parent: e3c311e10a54ae1b7d9114cc58de315157538e0f (diff)
download: yt-local-75e8930958ea305a26a4652ef88639f7ad5db356.tar.lz
yt-local-75e8930958ea305a26a4652ef88639f7ad5db356.tar.xz
yt-local-75e8930958ea305a26a4652ef88639f7ad5db356.zip
1 files changed, 11 insertions, 6 deletions
diff --git a/youtube/yt_data_extract/common.py b/youtube/yt_data_extract/common.py
index 2d3b637..9610479 100644
--- a/youtube/yt_data_extract/common.py
+++ b/youtube/yt_data_extract/common.py
@@ -90,15 +90,20 @@ def remove_redirect(url):
         return urllib.parse.parse_qs(query_string)['q'][0]
     return url
 
-youtube_url_re = re.compile(r'^(?:(?:(?:https?:)?//)?(?:www\.)?youtube\.com)?(/.*)$')
+norm_url_re = re.compile(r'^(?:(?:https?:)?//)?((?:[\w-]+\.)+[\w-]+)?(/.*)$')
 def normalize_url(url):
+    '''Insert https, resolve relative paths for youtube.com, and put www. infront of youtube.com'''
     if url is None:
         return None
-    match = youtube_url_re.fullmatch(url)
+    match = norm_url_re.fullmatch(url)
     if match is None:
-        raise Exception()
+        raise Exception(url)
 
-    return 'https://www.youtube.com' + match.group(1)
+    domain = match.group(1) or 'www.youtube.com'
+    if domain == 'youtube.com':
+        domain = 'www.youtube.com'
+
+    return 'https://' + domain + match.group(2)
 
 def _recover_urls(runs):
     for run in runs:
@@ -240,11 +245,11 @@ def extract_item_info(item, additional_info={}):
         ))
         info['author_url'] = ('https://www.youtube.com/channel/' + info['author_id']) if info['author_id'] else None
     info['description'] = extract_formatted_text(multi_get(item, 'descriptionSnippet', 'descriptionText'))
-    info['thumbnail'] = multi_deep_get(item,
+    info['thumbnail'] = normalize_url(multi_deep_get(item,
         ['thumbnail', 'thumbnails', 0, 'url'],      # videos
         ['thumbnails', 0, 'thumbnails', 0, 'url'],  # playlists
         ['thumbnailRenderer', 'showCustomThumbnailRenderer', 'thumbnail', 'thumbnails', 0, 'url'], # shows
-    )
+    ))
 
     info['badges'] = []
     for badge_node in multi_get(item, 'badges', 'ownerBadges', default=()):
author	James Taylor <user234683@users.noreply.github.com>	2020-10-19 12:55:03 -0700
committer	James Taylor <user234683@users.noreply.github.com>	2020-10-19 12:55:03 -0700
commit	75e8930958ea305a26a4652ef88639f7ad5db356 (patch)
tree	f92747b085110a05c11dafba66e012066d21659a /youtube/yt_data_extract/common.py
parent	e3c311e10a54ae1b7d9114cc58de315157538e0f (diff)
download	yt-local-75e8930958ea305a26a4652ef88639f7ad5db356.tar.lz yt-local-75e8930958ea305a26a4652ef88639f7ad5db356.tar.xz yt-local-75e8930958ea305a26a4652ef88639f7ad5db356.zip