aboutsummaryrefslogtreecommitdiffstats
path: root/youtube/yt_data_extract.py
diff options
context:
space:
mode:
author: James Taylor <user234683@users.noreply.github.com> 2019-09-08 17:20:02 -0700
committer: James Taylor <user234683@users.noreply.github.com> 2019-09-08 17:20:02 -0700
commit: fb1a3531c59f5d9cee406295bbe006730695c249 (patch)
tree: 90423096184d7632b6271dfe3ca6375198bf8d8c /youtube/yt_data_extract.py
parent: 1b6fb4e100a2aed8c9c391e100fbbe60ac74d352 (diff)
download: yt-local-fb1a3531c59f5d9cee406295bbe006730695c249.tar.lz
yt-local-fb1a3531c59f5d9cee406295bbe006730695c249.tar.xz
yt-local-fb1a3531c59f5d9cee406295bbe006730695c249.zip
Extraction: Fix url prefixing
Diffstat (limited to 'youtube/yt_data_extract.py')
-rw-r--r-- youtube/yt_data_extract.py 16
1 files changed, 12 insertions, 4 deletions
diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py
index 5419084..663edc4 100644
--- a/youtube/yt_data_extract.py
+++ b/youtube/yt_data_extract.py
@@ -2,6 +2,7 @@ from youtube import util
import html
import json
+import re
# videos (all of type str):
@@ -152,15 +153,22 @@ def ajax_info(item_json):
raise
+youtube_url_re = re.compile(r'^(?:(?:(?:https?:)?//)?(?:www\.)?youtube\.com)?(/.*)$')
+def normalize_url(url):
+ match = youtube_url_re.fullmatch(url)
+ if match is None:
+ raise Exception()
+
+ return 'https://www.youtube.com' + match.group(1)
def prefix_urls(item):
try:
- item['thumbnail'] = '/' + item['thumbnail'].lstrip('/')
+ item['thumbnail'] = util.prefix_url(item['thumbnail'])
except KeyError:
pass
try:
- item['author_url'] = util.URL_ORIGIN + item['author_url']
+ item['author_url'] = util.prefix_url(item['author_url'])
except KeyError:
pass
@@ -219,7 +227,7 @@ def renderer_info(renderer, additional_info={}):
if 'ownerText' in renderer:
info['author'] = renderer['ownerText']['runs'][0]['text']
- info['author_url'] = renderer['ownerText']['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url']
+ info['author_url'] = normalize_url(renderer['ownerText']['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'])
try:
overlays = renderer['thumbnailOverlays']
except KeyError:
@@ -241,7 +249,7 @@ def renderer_info(renderer, additional_info={}):
if key in ('longBylineText', 'shortBylineText'):
info['author'] = get_text(node)
try:
- info['author_url'] = get_url(node)
+ info['author_url'] = normalize_url(get_url(node))
except KeyError:
pass