diff options
Diffstat (limited to 'youtube/yt_data_extract.py')
-rw-r--r-- | youtube/yt_data_extract.py | 121 |
1 files changed, 121 insertions, 0 deletions
def _find_tab_with_content(response):
    """Return ``(tab_renderer, tab_content)`` for the first tab that has content.

    Raises ``Exception`` when none of the tabs carries a 'content' entry.
    """
    # The tab holding the content has no fixed index, so scan all of them.
    # example channel where tabs do not have definite index: https://www.youtube.com/channel/UC4gQ8i3FD7YbhOgqUkeQEJg
    # TODO: maybe use the 'selected' attribute for this?
    for tab_json in response['contents']['twoColumnBrowseResultsRenderer']['tabs']:
        try:
            tab_renderer = tab_json['tabRenderer']
        except KeyError:
            tab_renderer = tab_json['expandableTabRenderer']
        if 'content' in tab_renderer:
            return tab_renderer, tab_renderer['content']
    raise Exception("No tabs found with content")


def _find_item_renderers(response, tab_content):
    """Return the list of item renderers for the videos/playlists/search tabs.

    ``tab_content`` is only consulted when ``response`` is not a continuation
    response (i.e. has no 'continuationContents' key).
    """
    if 'continuationContents' in response:
        continuation = response['continuationContents']
        try:
            return continuation['gridContinuation']['items']
        except KeyError:
            return continuation['sectionListContinuation']['contents']  # for search

    contents = tab_content['sectionListRenderer']['contents']
    if 'itemSectionRenderer' not in contents[0]:
        return contents  # for search

    item_section = contents[0]['itemSectionRenderer']['contents'][0]
    try:
        return item_section['gridRenderer']['items']
    except KeyError:
        # A messageRenderer in place of the grid is treated as "no items".
        if 'messageRenderer' in item_section:
            return []
        raise Exception('gridRenderer missing but messageRenderer not found')


def _extract_about_info(channel_metadata):
    """Return the 'links', 'stats' and 'description' entries for the about tab."""
    # `import urllib` alone does not load the `parse` submodule, so import it
    # explicitly instead of relying on some other module having done so.
    import urllib.parse

    links = []
    for link_json in channel_metadata.get('primaryLinks', ()):
        url = link_json['navigationEndpoint']['urlEndpoint']['url']
        if url.startswith('/redirect'):  # youtube puts these on external links to do tracking
            query_string = url[url.find('?') + 1:]
            url = urllib.parse.parse_qs(query_string)['q'][0]
        links.append((get_plain_text(link_json['title']), url))

    # Stats are optional; keep only those present, in this fixed order.
    stats = [
        get_plain_text(channel_metadata[stat_name])
        for stat_name in ('subscriberCountText', 'joinedDateText', 'viewCountText', 'country')
        if stat_name in channel_metadata
    ]

    if 'description' in channel_metadata:
        description = get_text(channel_metadata['description'])
    else:
        description = ''

    return {'links': links, 'stats': stats, 'description': description}


def extract_channel_info(polymer_json, tab):
    """Extract channel information for one channel tab from a polymer response.

    Args:
        polymer_json: indexable object whose element ``[1]['response']`` is
            the response mapping to parse.
        tab: lowercase tab name. 'videos', 'playlists', 'search' and 'about'
            are supported.

    Returns:
        A dict with the keys 'current_tab', 'short_description',
        'channel_name', 'avatar', 'channel_id', 'channel_url' and 'items'.
        For the about tab it additionally has 'links' (list of
        ``(text, url)`` tuples), 'stats' and 'description'. If the response
        has neither 'contents' nor 'continuationContents', the dict is
        returned with an empty 'items' list and no tab-specific data.

    Raises:
        KeyError: the response has no microformat data and none of the
            alert/error handling below aborted the request.
        AssertionError: ``tab`` does not match the title of the tab that
            carries content.
        NotImplementedError: ``tab`` is not one of the supported tabs.
        Exception: no tab has content, the item list could not be located,
            or the about tab was requested from a continuation response.
    """
    response = polymer_json[1]['response']
    try:
        microformat = response['microformat']['microformatDataRenderer']
    except KeyError:
        # channel doesn't exist or was terminated
        # example terminated channel: https://www.youtube.com/channel/UCnKJeK_r90jDdIuzHXC0Org
        # NOTE(review): `flask` is not among the imports visible in this
        # diff, and werkzeug's abort() is meant for error status codes --
        # confirm that flask is imported and that abort(200, ...) behaves
        # as intended.
        if response.get('alerts'):
            message = ''.join(
                alert['alertRenderer']['text']['simpleText'] + '\n'
                for alert in response['alerts']
            )
            flask.abort(200, message)
        elif 'errors' in response['responseContext']:
            for error in response['responseContext']['errors']['error']:
                if error['code'] == 'INVALID_VALUE' and error['location'] == 'browse_id':
                    flask.abort(404, 'This channel does not exist')
        # Nothing above recognised the failure: re-raise the KeyError.
        raise

    info = {}
    info['current_tab'] = tab

    # stuff from microformat (info given by youtube for every page on channel)
    info['short_description'] = microformat['description']
    info['channel_name'] = microformat['title']
    info['avatar'] = microformat['thumbnail']['thumbnails'][0]['url']
    channel_url = microformat['urlCanonical'].rstrip('/')
    channel_id = channel_url[channel_url.rfind('/') + 1:]
    info['channel_id'] = channel_id
    info['channel_url'] = 'https://www.youtube.com/channel/' + channel_id

    info['items'] = []

    # empty channel
    if 'contents' not in response and 'continuationContents' not in response:
        return info

    # Continuation responses carry no tab list, so there is nothing to locate.
    tab_content = None
    if 'continuationContents' not in response:
        tab_renderer, tab_content = _find_tab_with_content(response)
        tab_title = tab_renderer['title'].lower()
        if tab != tab_title:
            # Raised explicitly (rather than via `assert`) so the check is
            # not stripped when Python runs with -O.
            raise AssertionError(
                'Requested tab %r but the tab with content is %r' % (tab, tab_title)
            )

    # extract tab-specific info
    if tab in ('videos', 'playlists', 'search'):
        items = _find_item_renderers(response, tab_content)
        additional_info = {
            'author': info['channel_name'],
            'author_url': info['channel_url'],
        }
        info['items'] = [renderer_info(renderer, additional_info) for renderer in items]

    elif tab == 'about':
        if tab_content is None:
            raise Exception('The about tab cannot be extracted from a continuation response')
        section = tab_content['sectionListRenderer']['contents'][0]
        channel_metadata = section['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer']
        info.update(_extract_about_info(channel_metadata))

    else:
        raise NotImplementedError('Unknown or unsupported channel tab: ' + tab)

    return info