From fb1a3531c59f5d9cee406295bbe006730695c249 Mon Sep 17 00:00:00 2001 From: James Taylor Date: Sun, 8 Sep 2019 17:20:02 -0700 Subject: Extraction: Fix url prefixing --- youtube/channel.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'youtube/channel.py') diff --git a/youtube/channel.py b/youtube/channel.py index de75eaa..79b7c9b 100644 --- a/youtube/channel.py +++ b/youtube/channel.py @@ -219,8 +219,7 @@ def extract_info(polymer_json, tab): else: items = contents # for search - # TODO: Fix this URL prefixing shit - additional_info = {'author': info['channel_name'], 'author_url': '/channel/' + channel_id} + additional_info = {'author': info['channel_name'], 'author_url': 'https://www.youtube.com/channel/' + channel_id} info['items'] = [yt_data_extract.renderer_info(renderer, additional_info) for renderer in items] elif tab == 'about': @@ -258,8 +257,8 @@ def extract_info(polymer_json, tab): return info def post_process_channel_info(info): - info['avatar'] = '/' + info['avatar'] - info['channel_url'] = '/' + info['channel_url'] + info['avatar'] = util.prefix_url(info['avatar']) + info['channel_url'] = util.prefix_url(info['channel_url']) for item in info['items']: yt_data_extract.prefix_urls(item) yt_data_extract.add_extra_html_info(item) -- cgit v1.2.3 From bd343ed71f628e0f1dd1eb3f45fb4e04887f223f Mon Sep 17 00:00:00 2001 From: James Taylor Date: Sun, 8 Sep 2019 17:28:11 -0700 Subject: Extraction: Move channel extraction to yt_data_extract --- youtube/channel.py | 122 +---------------------------------------------------- 1 file changed, 2 insertions(+), 120 deletions(-) (limited to 'youtube/channel.py') diff --git a/youtube/channel.py b/youtube/channel.py index 79b7c9b..16d0a3f 100644 --- a/youtube/channel.py +++ b/youtube/channel.py @@ -137,124 +137,6 @@ def get_channel_search_json(channel_id, query, page): return polymer_json -def extract_info(polymer_json, tab): - response = polymer_json[1]['response'] - try: - microformat = 
response['microformat']['microformatDataRenderer'] - - # channel doesn't exist or was terminated - # example terminated channel: https://www.youtube.com/channel/UCnKJeK_r90jDdIuzHXC0Org - except KeyError: - if 'alerts' in response and len(response['alerts']) > 0: - result = '' - for alert in response['alerts']: - result += alert['alertRenderer']['text']['simpleText'] + '\n' - flask.abort(200, result) - elif 'errors' in response['responseContext']: - for error in response['responseContext']['errors']['error']: - if error['code'] == 'INVALID_VALUE' and error['location'] == 'browse_id': - flask.abort(404, 'This channel does not exist') - raise - - - info = {} - info['current_tab'] = tab - - - # stuff from microformat (info given by youtube for every page on channel) - info['short_description'] = microformat['description'] - info['channel_name'] = microformat['title'] - info['avatar'] = microformat['thumbnail']['thumbnails'][0]['url'] - channel_url = microformat['urlCanonical'].rstrip('/') - channel_id = channel_url[channel_url.rfind('/')+1:] - info['channel_id'] = channel_id - info['channel_url'] = 'https://www.youtube.com/channel/' + channel_id - - info['items'] = [] - - # empty channel - if 'contents' not in response and 'continuationContents' not in response: - return info - - - # find the tab with content - # example channel where tabs do not have definite index: https://www.youtube.com/channel/UC4gQ8i3FD7YbhOgqUkeQEJg - # TODO: maybe use the 'selected' attribute for this? 
- if 'continuationContents' not in response: - tab_renderer = None - tab_content = None - for tab_json in response['contents']['twoColumnBrowseResultsRenderer']['tabs']: - try: - tab_renderer = tab_json['tabRenderer'] - except KeyError: - tab_renderer = tab_json['expandableTabRenderer'] - try: - tab_content = tab_renderer['content'] - break - except KeyError: - pass - else: # didn't break - raise Exception("No tabs found with content") - assert tab == tab_renderer['title'].lower() - - - # extract tab-specific info - if tab in ('videos', 'playlists', 'search'): # find the list of items - if 'continuationContents' in response: - try: - items = response['continuationContents']['gridContinuation']['items'] - except KeyError: - items = response['continuationContents']['sectionListContinuation']['contents'] # for search - else: - contents = tab_content['sectionListRenderer']['contents'] - if 'itemSectionRenderer' in contents[0]: - item_section = contents[0]['itemSectionRenderer']['contents'][0] - try: - items = item_section['gridRenderer']['items'] - except KeyError: - if "messageRenderer" in item_section: - items = [] - else: - raise Exception('gridRenderer missing but messageRenderer not found') - else: - items = contents # for search - - additional_info = {'author': info['channel_name'], 'author_url': 'https://www.youtube.com/channel/' + channel_id} - info['items'] = [yt_data_extract.renderer_info(renderer, additional_info) for renderer in items] - - elif tab == 'about': - channel_metadata = tab_content['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer'] - - - info['links'] = [] - for link_json in channel_metadata.get('primaryLinks', ()): - url = link_json['navigationEndpoint']['urlEndpoint']['url'] - if url.startswith('/redirect'): # youtube puts these on external links to do tracking - query_string = url[url.find('?')+1: ] - url = urllib.parse.parse_qs(query_string)['q'][0] - - text = 
yt_data_extract.get_plain_text(link_json['title']) - - info['links'].append( (text, url) ) - - - info['stats'] = [] - for stat_name in ('subscriberCountText', 'joinedDateText', 'viewCountText', 'country'): - try: - stat = channel_metadata[stat_name] - except KeyError: - continue - info['stats'].append(yt_data_extract.get_plain_text(stat)) - - if 'description' in channel_metadata: - info['description'] = yt_data_extract.get_text(channel_metadata['description']) - else: - info['description'] = '' - - else: - raise NotImplementedError('Unknown or unsupported channel tab: ' + tab) - - return info def post_process_channel_info(info): info['avatar'] = util.prefix_url(info['avatar']) @@ -303,7 +185,7 @@ def get_channel_page(channel_id, tab='videos'): flask.abort(404, 'Unknown channel tab: ' + tab) - info = extract_info(json.loads(polymer_json), tab) + info = yt_data_extract.extract_channel_info(json.loads(polymer_json), tab) post_process_channel_info(info) if tab in ('videos', 'search'): info['number_of_videos'] = number_of_videos @@ -343,7 +225,7 @@ def get_channel_page_general_url(base_url, tab, request): flask.abort(404, 'Unknown channel tab: ' + tab) - info = extract_info(json.loads(polymer_json), tab) + info = yt_data_extract.extract_channel_info(json.loads(polymer_json), tab) post_process_channel_info(info) if tab in ('videos', 'search'): info['number_of_videos'] = 1000 -- cgit v1.2.3 From 216231f9a6ca9ed48389e797a0c30d7d3b01e379 Mon Sep 17 00:00:00 2001 From: James Taylor Date: Sun, 8 Sep 2019 17:48:02 -0700 Subject: Extraction: Proper error handling for terminated or non-existent channels --- youtube/channel.py | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'youtube/channel.py') diff --git a/youtube/channel.py b/youtube/channel.py index 16d0a3f..3a2a0b3 100644 --- a/youtube/channel.py +++ b/youtube/channel.py @@ -186,6 +186,8 @@ def get_channel_page(channel_id, tab='videos'): info = yt_data_extract.extract_channel_info(json.loads(polymer_json), tab) + if 
info['errors']: + return flask.render_template('error.html', error_message = '\n'.join(info['errors'])) post_process_channel_info(info) if tab in ('videos', 'search'): info['number_of_videos'] = number_of_videos @@ -226,6 +228,9 @@ def get_channel_page_general_url(base_url, tab, request): info = yt_data_extract.extract_channel_info(json.loads(polymer_json), tab) + if info['errors']: + return flask.render_template('error.html', error_message = '\n'.join(info['errors'])) + post_process_channel_info(info) if tab in ('videos', 'search'): info['number_of_videos'] = 1000 -- cgit v1.2.3 From dc6c370152d063ad4198c747fc12eb06fc1ec0e4 Mon Sep 17 00:00:00 2001 From: James Taylor Date: Wed, 18 Sep 2019 21:39:53 -0700 Subject: Extraction: refactor response extraction to work with both mobile & desktop responses, also improve errors --- youtube/channel.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'youtube/channel.py') diff --git a/youtube/channel.py b/youtube/channel.py index 3a2a0b3..67a79ad 100644 --- a/youtube/channel.py +++ b/youtube/channel.py @@ -186,8 +186,8 @@ def get_channel_page(channel_id, tab='videos'): info = yt_data_extract.extract_channel_info(json.loads(polymer_json), tab) - if info['errors']: - return flask.render_template('error.html', error_message = '\n'.join(info['errors'])) + if info['error']: + return flask.render_template('error.html', error_message = info['error']) post_process_channel_info(info) if tab in ('videos', 'search'): info['number_of_videos'] = number_of_videos @@ -228,8 +228,8 @@ def get_channel_page_general_url(base_url, tab, request): info = yt_data_extract.extract_channel_info(json.loads(polymer_json), tab) - if info['errors']: - return flask.render_template('error.html', error_message = '\n'.join(info['errors'])) + if info['error']: + return flask.render_template('error.html', error_message = info['error']) post_process_channel_info(info) if tab in ('videos', 'search'): -- cgit v1.2.3 From 
d1d908d5b1aadb0dc75b25df1a47789c021f89e2 Mon Sep 17 00:00:00 2001 From: James Taylor Date: Thu, 19 Dec 2019 19:48:53 -0800 Subject: Extraction: Move html post processing stuff from yt_data_extract to util --- youtube/channel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'youtube/channel.py') diff --git a/youtube/channel.py b/youtube/channel.py index 67a79ad..ad06e3f 100644 --- a/youtube/channel.py +++ b/youtube/channel.py @@ -142,8 +142,8 @@ def post_process_channel_info(info): info['avatar'] = util.prefix_url(info['avatar']) info['channel_url'] = util.prefix_url(info['channel_url']) for item in info['items']: - yt_data_extract.prefix_urls(item) - yt_data_extract.add_extra_html_info(item) + util.prefix_urls(item) + util.add_extra_html_info(item) -- cgit v1.2.3