diff options
author | James Taylor <user234683@users.noreply.github.com> | 2019-09-08 17:20:02 -0700 |
---|---|---|
committer | James Taylor <user234683@users.noreply.github.com> | 2019-09-08 17:20:02 -0700 |
commit | fb1a3531c59f5d9cee406295bbe006730695c249 (patch) | |
tree | 90423096184d7632b6271dfe3ca6375198bf8d8c /youtube/yt_data_extract.py | |
parent | 1b6fb4e100a2aed8c9c391e100fbbe60ac74d352 (diff) | |
download | yt-local-fb1a3531c59f5d9cee406295bbe006730695c249.tar.lz yt-local-fb1a3531c59f5d9cee406295bbe006730695c249.tar.xz yt-local-fb1a3531c59f5d9cee406295bbe006730695c249.zip |
Extraction: Fix url prefixing
Diffstat (limited to 'youtube/yt_data_extract.py')
-rw-r--r-- | youtube/yt_data_extract.py | 16 |
1 file changed, 12 insertions, 4 deletions
```diff
diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py
index 5419084..663edc4 100644
--- a/youtube/yt_data_extract.py
+++ b/youtube/yt_data_extract.py
@@ -2,6 +2,7 @@ from youtube import util
 
 import html
 import json
+import re
 
 # videos (all of type str):
 
@@ -152,15 +153,22 @@ def ajax_info(item_json):
         raise
 
 
+youtube_url_re = re.compile(r'^(?:(?:(?:https?:)?//)?(?:www\.)?youtube\.com)?(/.*)$')
+def normalize_url(url):
+    match = youtube_url_re.fullmatch(url)
+    if match is None:
+        raise Exception()
+
+    return 'https://www.youtube.com' + match.group(1)
 
 def prefix_urls(item):
     try:
-        item['thumbnail'] = '/' + item['thumbnail'].lstrip('/')
+        item['thumbnail'] = util.prefix_url(item['thumbnail'])
     except KeyError:
         pass
 
     try:
-        item['author_url'] = util.URL_ORIGIN + item['author_url']
+        item['author_url'] = util.prefix_url(item['author_url'])
     except KeyError:
         pass
 
@@ -219,7 +227,7 @@ def renderer_info(renderer, additional_info={}):
 
         if 'ownerText' in renderer:
             info['author'] = renderer['ownerText']['runs'][0]['text']
-            info['author_url'] = renderer['ownerText']['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url']
+            info['author_url'] = normalize_url(renderer['ownerText']['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'])
         try:
             overlays = renderer['thumbnailOverlays']
         except KeyError:
@@ -241,7 +249,7 @@ def renderer_info(renderer, additional_info={}):
             if key in ('longBylineText', 'shortBylineText'):
                 info['author'] = get_text(node)
                 try:
-                    info['author_url'] = get_url(node)
+                    info['author_url'] = normalize_url(get_url(node))
                 except KeyError:
                     pass
 
```