From fb1a3531c59f5d9cee406295bbe006730695c249 Mon Sep 17 00:00:00 2001 From: James Taylor Date: Sun, 8 Sep 2019 17:20:02 -0700 Subject: Extraction: Fix url prefixing --- youtube/channel.py | 7 +++---- youtube/util.py | 4 ++++ youtube/yt_data_extract.py | 16 ++++++++++++---- 3 files changed, 19 insertions(+), 8 deletions(-) (limited to 'youtube') diff --git a/youtube/channel.py b/youtube/channel.py index de75eaa..79b7c9b 100644 --- a/youtube/channel.py +++ b/youtube/channel.py @@ -219,8 +219,7 @@ def extract_info(polymer_json, tab): else: items = contents # for search - # TODO: Fix this URL prefixing shit - additional_info = {'author': info['channel_name'], 'author_url': '/channel/' + channel_id} + additional_info = {'author': info['channel_name'], 'author_url': 'https://www.youtube.com/channel/' + channel_id} info['items'] = [yt_data_extract.renderer_info(renderer, additional_info) for renderer in items] elif tab == 'about': @@ -258,8 +257,8 @@ def extract_info(polymer_json, tab): return info def post_process_channel_info(info): - info['avatar'] = '/' + info['avatar'] - info['channel_url'] = '/' + info['channel_url'] + info['avatar'] = util.prefix_url(info['avatar']) + info['channel_url'] = util.prefix_url(info['channel_url']) for item in info['items']: yt_data_extract.prefix_urls(item) yt_data_extract.add_extra_html_info(item) diff --git a/youtube/util.py b/youtube/util.py index 2205645..a81ae83 100644 --- a/youtube/util.py +++ b/youtube/util.py @@ -317,3 +317,7 @@ def uppercase_escape(s): return re.sub( r'\\U([0-9a-fA-F]{8})', lambda m: chr(int(m.group(1), base=16)), s) + +def prefix_url(url): + url = url.lstrip('/') # some urls have // before them, which has a special meaning + return '/' + url diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py index 5419084..663edc4 100644 --- a/youtube/yt_data_extract.py +++ b/youtube/yt_data_extract.py @@ -2,6 +2,7 @@ from youtube import util import html import json +import re # videos (all of type str): @@ -152,15 +153,22 @@ def ajax_info(item_json): raise +youtube_url_re = re.compile(r'^(?:(?:(?:https?:)?//)?(?:www\.)?youtube\.com)?(/.*)$') +def normalize_url(url): + match = youtube_url_re.fullmatch(url) + if match is None: + raise Exception() + + return 'https://www.youtube.com' + match.group(1) def prefix_urls(item): try: - item['thumbnail'] = '/' + item['thumbnail'].lstrip('/') + item['thumbnail'] = util.prefix_url(item['thumbnail']) except KeyError: pass try: - item['author_url'] = util.URL_ORIGIN + item['author_url'] + item['author_url'] = util.prefix_url(item['author_url']) except KeyError: pass @@ -219,7 +227,7 @@ def renderer_info(renderer, additional_info={}): if 'ownerText' in renderer: info['author'] = renderer['ownerText']['runs'][0]['text'] - info['author_url'] = renderer['ownerText']['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'] + info['author_url'] = normalize_url(renderer['ownerText']['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url']) try: overlays = renderer['thumbnailOverlays'] except KeyError: @@ -241,7 +249,7 @@ def renderer_info(renderer, additional_info={}): if key in ('longBylineText', 'shortBylineText'): info['author'] = get_text(node) try: - info['author_url'] = get_url(node) + info['author_url'] = normalize_url(get_url(node)) except KeyError: pass -- cgit v1.2.3