From fb1a3531c59f5d9cee406295bbe006730695c249 Mon Sep 17 00:00:00 2001 From: James Taylor Date: Sun, 8 Sep 2019 17:20:02 -0700 Subject: Extraction: Fix url prefixing --- youtube/channel.py | 7 +++---- youtube/util.py | 4 ++++ youtube/yt_data_extract.py | 16 ++++++++++++---- 3 files changed, 19 insertions(+), 8 deletions(-) (limited to 'youtube') diff --git a/youtube/channel.py b/youtube/channel.py index de75eaa..79b7c9b 100644 --- a/youtube/channel.py +++ b/youtube/channel.py @@ -219,8 +219,7 @@ def extract_info(polymer_json, tab): else: items = contents # for search - # TODO: Fix this URL prefixing shit - additional_info = {'author': info['channel_name'], 'author_url': '/channel/' + channel_id} + additional_info = {'author': info['channel_name'], 'author_url': 'https://www.youtube.com/channel/' + channel_id} info['items'] = [yt_data_extract.renderer_info(renderer, additional_info) for renderer in items] elif tab == 'about': @@ -258,8 +257,8 @@ def extract_info(polymer_json, tab): return info def post_process_channel_info(info): - info['avatar'] = '/' + info['avatar'] - info['channel_url'] = '/' + info['channel_url'] + info['avatar'] = util.prefix_url(info['avatar']) + info['channel_url'] = util.prefix_url(info['channel_url']) for item in info['items']: yt_data_extract.prefix_urls(item) yt_data_extract.add_extra_html_info(item) diff --git a/youtube/util.py b/youtube/util.py index 2205645..a81ae83 100644 --- a/youtube/util.py +++ b/youtube/util.py @@ -317,3 +317,7 @@ def uppercase_escape(s): return re.sub( r'\\U([0-9a-fA-F]{8})', lambda m: chr(int(m.group(1), base=16)), s) + +def prefix_url(url): + url = url.lstrip('/') # some urls have // before them, which has a special meaning + return '/' + url diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py index 5419084..663edc4 100644 --- a/youtube/yt_data_extract.py +++ b/youtube/yt_data_extract.py @@ -2,6 +2,7 @@ from youtube import util import html import json +import re # videos (all of type str): @@ -152,15 +153,22 @@ def ajax_info(item_json): raise +youtube_url_re = re.compile(r'^(?:(?:(?:https?:)?//)?(?:www\.)?youtube\.com)?(/.*)$') +def normalize_url(url): + match = youtube_url_re.fullmatch(url) + if match is None: + raise Exception() + + return 'https://www.youtube.com' + match.group(1) def prefix_urls(item): try: - item['thumbnail'] = '/' + item['thumbnail'].lstrip('/') + item['thumbnail'] = util.prefix_url(item['thumbnail']) except KeyError: pass try: - item['author_url'] = util.URL_ORIGIN + item['author_url'] + item['author_url'] = util.prefix_url(item['author_url']) except KeyError: pass @@ -219,7 +227,7 @@ def renderer_info(renderer, additional_info={}): if 'ownerText' in renderer: info['author'] = renderer['ownerText']['runs'][0]['text'] - info['author_url'] = renderer['ownerText']['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'] + info['author_url'] = normalize_url(renderer['ownerText']['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url']) try: overlays = renderer['thumbnailOverlays'] except KeyError: @@ -241,7 +249,7 @@ def renderer_info(renderer, additional_info={}): if key in ('longBylineText', 'shortBylineText'): info['author'] = get_text(node) try: - info['author_url'] = get_url(node) + info['author_url'] = normalize_url(get_url(node)) except KeyError: pass -- cgit v1.2.3 From bd343ed71f628e0f1dd1eb3f45fb4e04887f223f Mon Sep 17 00:00:00 2001 From: James Taylor Date: Sun, 8 Sep 2019 17:28:11 -0700 Subject: Extraction: Move channel extraction to 
yt_data_extract --- youtube/channel.py | 122 +-------------------------------------------- youtube/subscriptions.py | 2 +- youtube/yt_data_extract.py | 121 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 124 insertions(+), 121 deletions(-) (limited to 'youtube') diff --git a/youtube/channel.py b/youtube/channel.py index 79b7c9b..16d0a3f 100644 --- a/youtube/channel.py +++ b/youtube/channel.py @@ -137,124 +137,6 @@ def get_channel_search_json(channel_id, query, page): return polymer_json -def extract_info(polymer_json, tab): - response = polymer_json[1]['response'] - try: - microformat = response['microformat']['microformatDataRenderer'] - - # channel doesn't exist or was terminated - # example terminated channel: https://www.youtube.com/channel/UCnKJeK_r90jDdIuzHXC0Org - except KeyError: - if 'alerts' in response and len(response['alerts']) > 0: - result = '' - for alert in response['alerts']: - result += alert['alertRenderer']['text']['simpleText'] + '\n' - flask.abort(200, result) - elif 'errors' in response['responseContext']: - for error in response['responseContext']['errors']['error']: - if error['code'] == 'INVALID_VALUE' and error['location'] == 'browse_id': - flask.abort(404, 'This channel does not exist') - raise - - - info = {} - info['current_tab'] = tab - - - # stuff from microformat (info given by youtube for every page on channel) - info['short_description'] = microformat['description'] - info['channel_name'] = microformat['title'] - info['avatar'] = microformat['thumbnail']['thumbnails'][0]['url'] - channel_url = microformat['urlCanonical'].rstrip('/') - channel_id = channel_url[channel_url.rfind('/')+1:] - info['channel_id'] = channel_id - info['channel_url'] = 'https://www.youtube.com/channel/' + channel_id - - info['items'] = [] - - # empty channel - if 'contents' not in response and 'continuationContents' not in response: - return info - - - # find the tab with content - # example channel where tabs do not have definite index: https://www.youtube.com/channel/UC4gQ8i3FD7YbhOgqUkeQEJg - # TODO: maybe use the 'selected' attribute for this? 
- if 'continuationContents' not in response: - tab_renderer = None - tab_content = None - for tab_json in response['contents']['twoColumnBrowseResultsRenderer']['tabs']: - try: - tab_renderer = tab_json['tabRenderer'] - except KeyError: - tab_renderer = tab_json['expandableTabRenderer'] - try: - tab_content = tab_renderer['content'] - break - except KeyError: - pass - else: # didn't break - raise Exception("No tabs found with content") - assert tab == tab_renderer['title'].lower() - - - # extract tab-specific info - if tab in ('videos', 'playlists', 'search'): # find the list of items - if 'continuationContents' in response: - try: - items = response['continuationContents']['gridContinuation']['items'] - except KeyError: - items = response['continuationContents']['sectionListContinuation']['contents'] # for search - else: - contents = tab_content['sectionListRenderer']['contents'] - if 'itemSectionRenderer' in contents[0]: - item_section = contents[0]['itemSectionRenderer']['contents'][0] - try: - items = item_section['gridRenderer']['items'] - except KeyError: - if "messageRenderer" in item_section: - items = [] - else: - raise Exception('gridRenderer missing but messageRenderer not found') - else: - items = contents # for search - - additional_info = {'author': info['channel_name'], 'author_url': 'https://www.youtube.com/channel/' + channel_id} - info['items'] = [yt_data_extract.renderer_info(renderer, additional_info) for renderer in items] - - elif tab == 'about': - channel_metadata = tab_content['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer'] - - - info['links'] = [] - for link_json in channel_metadata.get('primaryLinks', ()): - url = link_json['navigationEndpoint']['urlEndpoint']['url'] - if url.startswith('/redirect'): # youtube puts these on external links to do tracking - query_string = url[url.find('?')+1: ] - url = urllib.parse.parse_qs(query_string)['q'][0] - - text = yt_data_extract.get_plain_text(link_json['title']) - - info['links'].append( (text, url) ) - - - info['stats'] = [] - for stat_name in ('subscriberCountText', 'joinedDateText', 'viewCountText', 'country'): - try: - stat = channel_metadata[stat_name] - except KeyError: - continue - info['stats'].append(yt_data_extract.get_plain_text(stat)) - - if 'description' in channel_metadata: - info['description'] = yt_data_extract.get_text(channel_metadata['description']) - else: - info['description'] = '' - - else: - raise NotImplementedError('Unknown or unsupported channel tab: ' + tab) - - return info def post_process_channel_info(info): info['avatar'] = util.prefix_url(info['avatar']) @@ -303,7 +185,7 @@ def get_channel_page(channel_id, tab='videos'): flask.abort(404, 'Unknown channel tab: ' + tab) - info = extract_info(json.loads(polymer_json), tab) + info = yt_data_extract.extract_channel_info(json.loads(polymer_json), tab) post_process_channel_info(info) if tab in ('videos', 'search'): info['number_of_videos'] = number_of_videos @@ -343,7 +225,7 @@ def get_channel_page_general_url(base_url, tab, request): flask.abort(404, 'Unknown channel tab: ' + tab) - info = extract_info(json.loads(polymer_json), tab) + info = yt_data_extract.extract_channel_info(json.loads(polymer_json), tab) post_process_channel_info(info) if tab in ('videos', 'search'): info['number_of_videos'] = 1000 diff --git a/youtube/subscriptions.py b/youtube/subscriptions.py index 56bdf93..175622f 100644 --- a/youtube/subscriptions.py +++ b/youtube/subscriptions.py @@ -455,7 +455,7 @@ def 
_get_upstream_videos(channel_id): print('Failed to read atoma feed for ' + channel_status_name) traceback.print_exc() - videos = channel.extract_info(json.loads(channel_tab), 'videos')['items'] + videos = yt_data_extract.extract_channel_info(json.loads(channel_tab), 'videos')['items'] for i, video_item in enumerate(videos): if 'description' not in video_item: video_item['description'] = '' diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py index 663edc4..c666ede 100644 --- a/youtube/yt_data_extract.py +++ b/youtube/yt_data_extract.py @@ -3,6 +3,7 @@ from youtube import util import html import json import re +import urllib # videos (all of type str): @@ -279,3 +280,123 @@ def parse_info_prepare_for_html(renderer, additional_info={}): return item +def extract_channel_info(polymer_json, tab): + response = polymer_json[1]['response'] + try: + microformat = response['microformat']['microformatDataRenderer'] + + # channel doesn't exist or was terminated + # example terminated channel: https://www.youtube.com/channel/UCnKJeK_r90jDdIuzHXC0Org + except KeyError: + if 'alerts' in response and len(response['alerts']) > 0: + result = '' + for alert in response['alerts']: + result += alert['alertRenderer']['text']['simpleText'] + '\n' + flask.abort(200, result) + elif 'errors' in response['responseContext']: + for error in response['responseContext']['errors']['error']: + if error['code'] == 'INVALID_VALUE' and error['location'] == 'browse_id': + flask.abort(404, 'This channel does not exist') + raise + + + info = {} + info['current_tab'] = tab + + + # stuff from microformat (info given by youtube for every page on channel) + info['short_description'] = microformat['description'] + info['channel_name'] = microformat['title'] + info['avatar'] = microformat['thumbnail']['thumbnails'][0]['url'] + channel_url = microformat['urlCanonical'].rstrip('/') + channel_id = channel_url[channel_url.rfind('/')+1:] + info['channel_id'] = channel_id + info['channel_url'] = 'https://www.youtube.com/channel/' + channel_id + + info['items'] = [] + + # empty channel + if 'contents' not in response and 'continuationContents' not in response: + return info + + + # find the tab with content + # example channel where tabs do not have definite index: https://www.youtube.com/channel/UC4gQ8i3FD7YbhOgqUkeQEJg + # TODO: maybe use the 'selected' attribute for this? 
+ if 'continuationContents' not in response: + tab_renderer = None + tab_content = None + for tab_json in response['contents']['twoColumnBrowseResultsRenderer']['tabs']: + try: + tab_renderer = tab_json['tabRenderer'] + except KeyError: + tab_renderer = tab_json['expandableTabRenderer'] + try: + tab_content = tab_renderer['content'] + break + except KeyError: + pass + else: # didn't break + raise Exception("No tabs found with content") + assert tab == tab_renderer['title'].lower() + + + # extract tab-specific info + if tab in ('videos', 'playlists', 'search'): # find the list of items + if 'continuationContents' in response: + try: + items = response['continuationContents']['gridContinuation']['items'] + except KeyError: + items = response['continuationContents']['sectionListContinuation']['contents'] # for search + else: + contents = tab_content['sectionListRenderer']['contents'] + if 'itemSectionRenderer' in contents[0]: + item_section = contents[0]['itemSectionRenderer']['contents'][0] + try: + items = item_section['gridRenderer']['items'] + except KeyError: + if "messageRenderer" in item_section: + items = [] + else: + raise Exception('gridRenderer missing but messageRenderer not found') + else: + items = contents # for search + + additional_info = {'author': info['channel_name'], 'author_url': 'https://www.youtube.com/channel/' + channel_id} + info['items'] = [renderer_info(renderer, additional_info) for renderer in items] + + elif tab == 'about': + channel_metadata = tab_content['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer'] + + + info['links'] = [] + for link_json in channel_metadata.get('primaryLinks', ()): + url = link_json['navigationEndpoint']['urlEndpoint']['url'] + if url.startswith('/redirect'): # youtube puts these on external links to do tracking + query_string = url[url.find('?')+1: ] + url = urllib.parse.parse_qs(query_string)['q'][0] + + text = get_plain_text(link_json['title']) + + info['links'].append( (text, url) ) + + + info['stats'] = [] + for stat_name in ('subscriberCountText', 'joinedDateText', 'viewCountText', 'country'): + try: + stat = channel_metadata[stat_name] + except KeyError: + continue + info['stats'].append(get_plain_text(stat)) + + if 'description' in channel_metadata: + info['description'] = get_text(channel_metadata['description']) + else: + info['description'] = '' + + else: + raise NotImplementedError('Unknown or unsupported channel tab: ' + tab) + + return info + + -- cgit v1.2.3 From 216231f9a6ca9ed48389e797a0c30d7d3b01e379 Mon Sep 17 00:00:00 2001 From: James Taylor Date: Sun, 8 Sep 2019 17:48:02 -0700 Subject: Extraction: Proper error handling for terminated or non-existant channels --- youtube/channel.py | 5 +++++ youtube/subscriptions.py | 7 ++++++- youtube/yt_data_extract.py | 13 +++++++------ 3 files changed, 18 insertions(+), 7 deletions(-) (limited to 'youtube') diff --git a/youtube/channel.py b/youtube/channel.py index 16d0a3f..3a2a0b3 100644 --- a/youtube/channel.py +++ b/youtube/channel.py @@ -186,6 +186,8 @@ def get_channel_page(channel_id, tab='videos'): info = yt_data_extract.extract_channel_info(json.loads(polymer_json), tab) + if info['errors']: + return flask.render_template('error.html', error_message = '\n'.join(info['errors'])) post_process_channel_info(info) if tab in ('videos', 'search'): info['number_of_videos'] = number_of_videos @@ -226,6 +228,9 @@ def get_channel_page_general_url(base_url, tab, request): info = 
yt_data_extract.extract_channel_info(json.loads(polymer_json), tab) + if info['errors']: + return flask.render_template('error.html', error_message = '\n'.join(info['errors'])) + post_process_channel_info(info) if tab in ('videos', 'search'): info['number_of_videos'] = 1000 diff --git a/youtube/subscriptions.py b/youtube/subscriptions.py index 175622f..87e1659 100644 --- a/youtube/subscriptions.py +++ b/youtube/subscriptions.py @@ -455,7 +455,12 @@ def _get_upstream_videos(channel_id): print('Failed to read atoma feed for ' + channel_status_name) traceback.print_exc() - videos = yt_data_extract.extract_channel_info(json.loads(channel_tab), 'videos')['items'] + channel_info = yt_data_extract.extract_channel_info(json.loads(channel_tab), 'videos') + if channel_info['errors']: + print('Error checking channel ' + channel_status_name + ': ' + ', '.join(channel_info['errors'])) + return + + videos = channel_info['items'] for i, video_item in enumerate(videos): if 'description' not in video_item: video_item['description'] = '' diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py index c666ede..f0c89cb 100644 --- a/youtube/yt_data_extract.py +++ b/youtube/yt_data_extract.py @@ -281,6 +281,7 @@ def parse_info_prepare_for_html(renderer, additional_info={}): def extract_channel_info(polymer_json, tab): + info = {'errors': []} response = polymer_json[1]['response'] try: microformat = response['microformat']['microformatDataRenderer'] @@ -289,18 +290,18 @@ def extract_channel_info(polymer_json, tab): # example terminated channel: https://www.youtube.com/channel/UCnKJeK_r90jDdIuzHXC0Org except KeyError: if 'alerts' in response and len(response['alerts']) > 0: - result = '' for alert in response['alerts']: - result += alert['alertRenderer']['text']['simpleText'] + '\n' - flask.abort(200, result) + info['errors'].append(alert['alertRenderer']['text']['simpleText']) + return info elif 'errors' in response['responseContext']: for error in response['responseContext']['errors']['error']: if error['code'] == 'INVALID_VALUE' and error['location'] == 'browse_id': - flask.abort(404, 'This channel does not exist') - raise + info['errors'].append('This channel does not exist') + return info + info['errors'].append('Failure getting microformat') + return info - info = {} info['current_tab'] = tab -- cgit v1.2.3 From c362a5e834d88524c154cb010be9dc909dcbe25d Mon Sep 17 00:00:00 2001 From: James Taylor Date: Sun, 8 Sep 2019 18:06:30 -0700 Subject: Extraction: Move search extraction to yt_data_extract --- youtube/search.py | 58 +++++++--------------------------------------- youtube/yt_data_extract.py | 48 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 49 deletions(-) (limited to 'youtube') diff --git a/youtube/search.py b/youtube/search.py index e167279..81a69f2 100644 --- a/youtube/search.py +++ b/youtube/search.py @@ -5,7 +5,6 @@ import settings import json import urllib import base64 -from math import ceil import mimetypes from flask import request import flask @@ -74,59 +73,20 @@ def get_search_page(): filters['time'] = int(request.args.get("time", "0")) filters['type'] = int(request.args.get("type", "0")) filters['duration'] = int(request.args.get("duration", "0")) - info = get_search_json(query, page, autocorrect, sort, filters) - - estimated_results = int(info[1]['response']['estimatedResults']) - estimated_pages = ceil(estimated_results/20) + polymer_json = get_search_json(query, page, autocorrect, sort, filters) - # almost always is the first "section", but if there's 
an advertisement for a google product like Stadia or Home in the search results, then that becomes the first "section" and the search results are in the second. So just join all of them for resiliency - results = [] - for section in info[1]['response']['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents']: - results += section['itemSectionRenderer']['contents'] - - parsed_results = [] - corrections = {'type': None} - for renderer in results: - type = list(renderer.keys())[0] - if type == 'shelfRenderer': - continue - if type == 'didYouMeanRenderer': - renderer = renderer[type] - corrected_query_string = request.args.to_dict(flat=False) - corrected_query_string['query'] = [renderer['correctedQueryEndpoint']['searchEndpoint']['query']] - corrected_query_url = util.URL_ORIGIN + '/search?' + urllib.parse.urlencode(corrected_query_string, doseq=True) - - corrections = { - 'type': 'did_you_mean', - 'corrected_query': yt_data_extract.format_text_runs(renderer['correctedQuery']['runs']), - 'corrected_query_url': corrected_query_url, - } - continue - if type == 'showingResultsForRenderer': - renderer = renderer[type] - no_autocorrect_query_string = request.args.to_dict(flat=False) - no_autocorrect_query_string['autocorrect'] = ['0'] - no_autocorrect_query_url = util.URL_ORIGIN + '/search?' + urllib.parse.urlencode(no_autocorrect_query_string, doseq=True) - - corrections = { - 'type': 'showing_results_for', - 'corrected_query': yt_data_extract.format_text_runs(renderer['correctedQuery']['runs']), - 'original_query_url': no_autocorrect_query_url, - 'original_query': renderer['originalQuery']['simpleText'], - } - continue - - info = yt_data_extract.parse_info_prepare_for_html(renderer) - if info['type'] != 'unsupported': - parsed_results.append(info) + search_info = yt_data_extract.extract_search_info(polymer_json) + for item_info in search_info['items']: + yt_data_extract.prefix_urls(item_info) + yt_data_extract.add_extra_html_info(item_info) return flask.render_template('search.html', header_playlist_names = local_playlist.get_playlist_names(), query = query, - estimated_results = estimated_results, - estimated_pages = estimated_pages, - corrections = corrections, - results = parsed_results, + estimated_results = search_info['estimated_results'], + estimated_pages = search_info['estimated_pages'], + corrections = search_info['corrections'], + results = search_info['items'], parameters_dictionary = request.args, ) diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py index f0c89cb..95c68bc 100644 --- a/youtube/yt_data_extract.py +++ b/youtube/yt_data_extract.py @@ -4,6 +4,7 @@ import html import json import re import urllib +from math import ceil # videos (all of type str): @@ -400,4 +401,51 @@ def extract_channel_info(polymer_json, tab): return info +def extract_search_info(polymer_json): + info = {} + info['estimated_results'] = int(polymer_json[1]['response']['estimatedResults']) + info['estimated_pages'] = ceil(info['estimated_results']/20) + + # almost always is the first "section", but if there's an advertisement for a google product like Stadia or Home in the search results, then that becomes the first "section" and the search results are in the second. 
So just join all of them for resiliency + results = [] + for section in polymer_json[1]['response']['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents']: + results += section['itemSectionRenderer']['contents'] + + info['items'] = [] + info['corrections'] = {'type': None} + for renderer in results: + type = list(renderer.keys())[0] + if type == 'shelfRenderer': + continue + if type == 'didYouMeanRenderer': + renderer = renderer[type] + corrected_query_string = request.args.to_dict(flat=False) + corrected_query_string['query'] = [renderer['correctedQueryEndpoint']['searchEndpoint']['query']] + corrected_query_url = util.URL_ORIGIN + '/search?' + urllib.parse.urlencode(corrected_query_string, doseq=True) + + info['corrections'] = { + 'type': 'did_you_mean', + 'corrected_query': yt_data_extract.format_text_runs(renderer['correctedQuery']['runs']), + 'corrected_query_url': corrected_query_url, + } + continue + if type == 'showingResultsForRenderer': + renderer = renderer[type] + no_autocorrect_query_string = request.args.to_dict(flat=False) + no_autocorrect_query_string['autocorrect'] = ['0'] + no_autocorrect_query_url = util.URL_ORIGIN + '/search?' + urllib.parse.urlencode(no_autocorrect_query_string, doseq=True) + + info['corrections'] = { + 'type': 'showing_results_for', + 'corrected_query': yt_data_extract.format_text_runs(renderer['correctedQuery']['runs']), + 'original_query_url': no_autocorrect_query_url, + 'original_query': renderer['originalQuery']['simpleText'], + } + continue + + item_info = renderer_info(renderer) + if item_info['type'] != 'unsupported': + info['items'].append(item_info) + + return info -- cgit v1.2.3 From 89e5761f8d9ae4221c4a97eca3c0fce3405a5bc4 Mon Sep 17 00:00:00 2001 From: James Taylor Date: Sun, 8 Sep 2019 18:42:08 -0700 Subject: Extraction: Move playlist extraction to yt_data_extract --- youtube/playlist.py | 28 ++++++++++------------------ youtube/templates/playlist.html | 2 +- youtube/yt_data_extract.py | 27 +++++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 19 deletions(-) (limited to 'youtube') diff --git a/youtube/playlist.py b/youtube/playlist.py index 3e5b0d2..2f7abdc 100644 --- a/youtube/playlist.py +++ b/youtube/playlist.py @@ -89,28 +89,20 @@ def get_playlist_page(): ) gevent.joinall(tasks) first_page_json, this_page_json = tasks[0].value, tasks[1].value - - try: # first page - video_list = this_page_json['response']['contents']['singleColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['playlistVideoListRenderer']['contents'] - except KeyError: # other pages - video_list = this_page_json['response']['continuationContents']['playlistVideoListContinuation']['contents'] - - parsed_video_list = [yt_data_extract.parse_info_prepare_for_html(video_json) for video_json in video_list] - - - metadata = yt_data_extract.renderer_info(first_page_json['response']['header']) - yt_data_extract.prefix_urls(metadata) - if 'description' not in metadata: - metadata['description'] = '' + info = yt_data_extract.extract_playlist_info(this_page_json) + if page != '1': + info['metadata'] = yt_data_extract.extract_playlist_metadata(first_page_json) - video_count = int(metadata['size'].replace(',', '')) - metadata['size'] += ' videos' + yt_data_extract.prefix_urls(info['metadata']) + for item in info['items']: + yt_data_extract.prefix_urls(item) + yt_data_extract.add_extra_html_info(item) return 
flask.render_template('playlist.html', - video_list = parsed_video_list, - num_pages = math.ceil(video_count/20), + video_list = info['items'], + num_pages = math.ceil(info['metadata']['size']/20), parameters_dictionary = request.args, - **metadata + **info['metadata'] ).encode('utf-8') diff --git a/youtube/templates/playlist.html b/youtube/templates/playlist.html index ab2640f..52c468e 100644 --- a/youtube/templates/playlist.html +++ b/youtube/templates/playlist.html @@ -55,7 +55,7 @@ {{ author }}
{{ views }}
- {{ size }}
+ {{ size }} videos
{{ common_elements.text_runs(description) }}
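The playlist extraction functions moved into yt_data_extract.py below are consumed roughly as in this sketch, which mirrors the playlist.py hunk above. Here fetch_playlist_json and playlist_id are hypothetical stand-ins for the app's real request code:

    import json
    import math
    from youtube import yt_data_extract

    polymer_json = json.loads(fetch_playlist_json(playlist_id, page=1))  # hypothetical fetch helper
    info = yt_data_extract.extract_playlist_info(polymer_json)
    metadata = info['metadata']  # only present for the first page of a playlist
    videos = info['items']       # one renderer_info dict per video
    num_pages = math.ceil(metadata['size']/20)  # 'size' is an int after this patch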
diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py index 95c68bc..e7a2f1e 100644 --- a/youtube/yt_data_extract.py +++ b/youtube/yt_data_extract.py @@ -449,3 +449,30 @@ def extract_search_info(polymer_json): return info + +def extract_playlist_metadata(polymer_json): + metadata = renderer_info(polymer_json['response']['header']) + + if 'description' not in metadata: + metadata['description'] = '' + + metadata['size'] = int(metadata['size'].replace(',', '')) + + return metadata + +def extract_playlist_info(polymer_json): + info = {} + try: # first page + video_list = polymer_json['response']['contents']['singleColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['playlistVideoListRenderer']['contents'] + first_page = True + except KeyError: # other pages + video_list = polymer_json['response']['continuationContents']['playlistVideoListContinuation']['contents'] + first_page = False + + info['items'] = [renderer_info(renderer) for renderer in video_list] + + if first_page: + info['metadata'] = extract_playlist_metadata(polymer_json) + + return info + -- cgit v1.2.3 From dc6c370152d063ad4198c747fc12eb06fc1ec0e4 Mon Sep 17 00:00:00 2001 From: James Taylor Date: Wed, 18 Sep 2019 21:39:53 -0700 Subject: Extraction: refactor response extraction to work with both mobile & desktop respones, also improve errors --- youtube/channel.py | 8 +++---- youtube/playlist.py | 3 +++ youtube/search.py | 3 +++ youtube/subscriptions.py | 4 ++-- youtube/yt_data_extract.py | 59 +++++++++++++++++++++++++++++++++------------- 5 files changed, 54 insertions(+), 23 deletions(-) (limited to 'youtube') diff --git a/youtube/channel.py b/youtube/channel.py index 3a2a0b3..67a79ad 100644 --- a/youtube/channel.py +++ b/youtube/channel.py @@ -186,8 +186,8 @@ def get_channel_page(channel_id, tab='videos'): info = yt_data_extract.extract_channel_info(json.loads(polymer_json), tab) - if info['errors']: - return flask.render_template('error.html', error_message = '\n'.join(info['errors'])) + if info['error']: + return flask.render_template('error.html', error_message = info['error']) post_process_channel_info(info) if tab in ('videos', 'search'): info['number_of_videos'] = number_of_videos @@ -228,8 +228,8 @@ def get_channel_page_general_url(base_url, tab, request): info = yt_data_extract.extract_channel_info(json.loads(polymer_json), tab) - if info['errors']: - return flask.render_template('error.html', error_message = '\n'.join(info['errors'])) + if info['error']: + return flask.render_template('error.html', error_message = info['error']) post_process_channel_info(info) if tab in ('videos', 'search'): diff --git a/youtube/playlist.py b/youtube/playlist.py index 2f7abdc..bc2c417 100644 --- a/youtube/playlist.py +++ b/youtube/playlist.py @@ -91,6 +91,9 @@ def get_playlist_page(): first_page_json, this_page_json = tasks[0].value, tasks[1].value info = yt_data_extract.extract_playlist_info(this_page_json) + if info['error']: + return flask.render_template('error.html', error_message = info['error']) + if page != '1': info['metadata'] = yt_data_extract.extract_playlist_metadata(first_page_json) diff --git a/youtube/search.py b/youtube/search.py index 81a69f2..ba40f0b 100644 --- a/youtube/search.py +++ b/youtube/search.py @@ -76,6 +76,9 @@ def get_search_page(): polymer_json = get_search_json(query, page, autocorrect, sort, filters) search_info = yt_data_extract.extract_search_info(polymer_json) + if search_info['error']: + 
return flask.render_template('error.html', error_message = search_info['error']) + for item_info in search_info['items']: yt_data_extract.prefix_urls(item_info) yt_data_extract.add_extra_html_info(item_info) diff --git a/youtube/subscriptions.py b/youtube/subscriptions.py index 87e1659..e0c71f5 100644 --- a/youtube/subscriptions.py +++ b/youtube/subscriptions.py @@ -456,8 +456,8 @@ def _get_upstream_videos(channel_id): traceback.print_exc() channel_info = yt_data_extract.extract_channel_info(json.loads(channel_tab), 'videos') - if channel_info['errors']: - print('Error checking channel ' + channel_status_name + ': ' + ', '.join(channel_info['errors'])) + if channel_info['error']: + print('Error checking channel ' + channel_status_name + ': ' + channel_info['error']) return videos = channel_info['items'] diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py index e7a2f1e..440cc0d 100644 --- a/youtube/yt_data_extract.py +++ b/youtube/yt_data_extract.py @@ -280,10 +280,29 @@ def parse_info_prepare_for_html(renderer, additional_info={}): return item +def get_response(polymer_json): + '''return response, error''' + + # responses returned for desktop version + try: + return polymer_json[1]['response'], None + except (TypeError, KeyError, IndexError): + pass + + # responses returned for mobile version + try: + return polymer_json['response'], None + except (TypeError, KeyError): + pass + + return None, 'Failed to extract response' + def extract_channel_info(polymer_json, tab): - info = {'errors': []} - response = polymer_json[1]['response'] + response, err = get_response(polymer_json) + if err: + return {'error': err} + try: microformat = response['microformat']['microformatDataRenderer'] @@ -291,18 +310,14 @@ def extract_channel_info(polymer_json, tab): # example terminated channel: https://www.youtube.com/channel/UCnKJeK_r90jDdIuzHXC0Org except KeyError: if 'alerts' in response and len(response['alerts']) > 0: - for alert in response['alerts']: - info['errors'].append(alert['alertRenderer']['text']['simpleText']) - return info + return {'error': ' '.join(alert['alertRenderer']['text']['simpleText'] for alert in response['alerts']) } elif 'errors' in response['responseContext']: for error in response['responseContext']['errors']['error']: if error['code'] == 'INVALID_VALUE' and error['location'] == 'browse_id': - info['errors'].append('This channel does not exist') - return info - info['errors'].append('Failure getting microformat') - return info - + return {'error': 'This channel does not exist'} + return {'error': 'Failure getting microformat'} + info = {'error': None} info['current_tab'] = tab @@ -402,13 +417,16 @@ def extract_channel_info(polymer_json, tab): return info def extract_search_info(polymer_json): - info = {} - info['estimated_results'] = int(polymer_json[1]['response']['estimatedResults']) + response, err = get_response(polymer_json) + if err: + return {'error': err} + info = {'error': None} + info['estimated_results'] = int(response['estimatedResults']) info['estimated_pages'] = ceil(info['estimated_results']/20) # almost always is the first "section", but if there's an advertisement for a google product like Stadia or Home in the search results, then that becomes the first "section" and the search results are in the second. 
So just join all of them for resiliency results = [] - for section in polymer_json[1]['response']['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents']: + for section in response['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents']: results += section['itemSectionRenderer']['contents'] info['items'] = [] @@ -451,7 +469,11 @@ def extract_search_info(polymer_json): return info def extract_playlist_metadata(polymer_json): - metadata = renderer_info(polymer_json['response']['header']) + response, err = get_response(polymer_json) + if err: + return {'error': err} + metadata = renderer_info(response['header']) + metadata['error'] = None if 'description' not in metadata: metadata['description'] = '' @@ -461,12 +483,15 @@ def extract_playlist_metadata(polymer_json): return metadata def extract_playlist_info(polymer_json): - info = {} + response, err = get_response(polymer_json) + if err: + return {'error': err} + info = {'error': None} try: # first page - video_list = polymer_json['response']['contents']['singleColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['playlistVideoListRenderer']['contents'] + video_list = response['contents']['singleColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['playlistVideoListRenderer']['contents'] first_page = True except KeyError: # other pages - video_list = polymer_json['response']['continuationContents']['playlistVideoListContinuation']['contents'] + video_list = response['continuationContents']['playlistVideoListContinuation']['contents'] first_page = False info['items'] = [renderer_info(renderer) for renderer in video_list] -- cgit v1.2.3 From 61c50e0b540fa7ebabadb870c6aeb38b87d4912c Mon Sep 17 00:00:00 2001 From: James Taylor Date: Thu, 19 Sep 2019 11:41:16 -0700 Subject: Extraction: Move comment extraction to yt_data_extract --- youtube/comments.py | 99 +++------------------------------------ youtube/util.py | 9 ---- youtube/yt_data_extract.py | 113 +++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 114 insertions(+), 107 deletions(-) (limited to 'youtube') diff --git a/youtube/comments.py b/youtube/comments.py index 3b1ef86..250a95f 100644 --- a/youtube/comments.py +++ b/youtube/comments.py @@ -48,24 +48,6 @@ def comment_replies_ctoken(video_id, comment_id, max_results=500): result = proto.nested(2, proto.string(2, video_id)) + proto.uint(3,6) + proto.nested(6, params) return base64.urlsafe_b64encode(result).decode('ascii') -def ctoken_metadata(ctoken): - result = dict() - params = proto.parse(proto.b64_to_bytes(ctoken)) - result['video_id'] = proto.parse(params[2])[2].decode('ascii') - - offset_information = proto.parse(params[6]) - result['offset'] = offset_information.get(5, 0) - - result['is_replies'] = False - if (3 in offset_information) and (2 in proto.parse(offset_information[3])): - result['is_replies'] = True - result['sort'] = None - else: - try: - result['sort'] = proto.parse(offset_information[4])[6] - except KeyError: - result['sort'] = 0 - return result mobile_headers = { @@ -91,7 +73,9 @@ def request_comments(ctoken, replies=False): print("got , retrying") continue break - return content + + polymer_json = json.loads(util.uppercase_escape(content.decode('utf-8'))) + return polymer_json def single_comment_ctoken(video_id, comment_id): @@ -102,77 +86,6 @@ 
def single_comment_ctoken(video_id, comment_id): -def parse_comments_polymer(content): - try: - video_title = '' - content = json.loads(util.uppercase_escape(content.decode('utf-8'))) - url = content[1]['url'] - ctoken = urllib.parse.parse_qs(url[url.find('?')+1:])['ctoken'][0] - metadata = ctoken_metadata(ctoken) - - try: - comments_raw = content[1]['response']['continuationContents']['commentSectionContinuation']['items'] - except KeyError: - comments_raw = content[1]['response']['continuationContents']['commentRepliesContinuation']['contents'] - - ctoken = util.default_multi_get(content, 1, 'response', 'continuationContents', 'commentSectionContinuation', 'continuations', 0, 'nextContinuationData', 'continuation', default='') - - comments = [] - for comment_json in comments_raw: - number_of_replies = 0 - try: - comment_thread = comment_json['commentThreadRenderer'] - except KeyError: - comment_renderer = comment_json['commentRenderer'] - else: - if 'commentTargetTitle' in comment_thread: - video_title = comment_thread['commentTargetTitle']['runs'][0]['text'] - - if 'replies' in comment_thread: - view_replies_text = yt_data_extract.get_plain_text(comment_thread['replies']['commentRepliesRenderer']['moreText']) - view_replies_text = view_replies_text.replace(',', '') - match = re.search(r'(\d+)', view_replies_text) - if match is None: - number_of_replies = 1 - else: - number_of_replies = int(match.group(1)) - comment_renderer = comment_thread['comment']['commentRenderer'] - - comment = { - 'author_id': comment_renderer.get('authorId', ''), - 'author_avatar': comment_renderer['authorThumbnail']['thumbnails'][0]['url'], - 'likes': comment_renderer['likeCount'], - 'published': yt_data_extract.get_plain_text(comment_renderer['publishedTimeText']), - 'text': comment_renderer['contentText'].get('runs', ''), - 'number_of_replies': number_of_replies, - 'comment_id': comment_renderer['commentId'], - } - - if 'authorText' in comment_renderer: # deleted channels have no name or channel link - comment['author'] = yt_data_extract.get_plain_text(comment_renderer['authorText']) - comment['author_url'] = comment_renderer['authorEndpoint']['commandMetadata']['webCommandMetadata']['url'] - comment['author_channel_id'] = comment_renderer['authorEndpoint']['browseEndpoint']['browseId'] - else: - comment['author'] = '' - comment['author_url'] = '' - comment['author_channel_id'] = '' - - comments.append(comment) - except Exception as e: - print('Error parsing comments: ' + str(e)) - comments = () - ctoken = '' - - return { - 'ctoken': ctoken, - 'comments': comments, - 'video_title': video_title, - 'video_id': metadata['video_id'], - 'offset': metadata['offset'], - 'is_replies': metadata['is_replies'], - 'sort': metadata['sort'], - } - def post_process_comments_info(comments_info): for comment in comments_info['comments']: comment['author_url'] = util.URL_ORIGIN + comment['author_url'] @@ -207,7 +120,7 @@ def post_process_comments_info(comments_info): comment['likes_text'] = str(comment['likes']) + ' likes' comments_info['include_avatars'] = settings.enable_comment_avatars - if comments_info['ctoken'] != '': + if comments_info['ctoken']: comments_info['more_comments_url'] = util.URL_ORIGIN + '/comments?ctoken=' + comments_info['ctoken'] comments_info['page_number'] = page_number = str(int(comments_info['offset']/20) + 1) @@ -222,7 +135,7 @@ def post_process_comments_info(comments_info): def video_comments(video_id, sort=0, offset=0, lc='', secret_key=''): if settings.comments_mode: - comments_info = 
parse_comments_polymer(request_comments(make_comment_ctoken(video_id, sort, offset, lc, secret_key))) + comments_info = yt_data_extract.parse_comments_polymer(request_comments(make_comment_ctoken(video_id, sort, offset, lc, secret_key))) post_process_comments_info(comments_info) post_comment_url = util.URL_ORIGIN + "/post_comment?video_id=" + video_id @@ -247,7 +160,7 @@ def get_comments_page(): ctoken = comment_replies_ctoken(video_id, parent_id) replies = True - comments_info = parse_comments_polymer(request_comments(ctoken, replies)) + comments_info = yt_data_extract.parse_comments_polymer(request_comments(ctoken, replies)) post_process_comments_info(comments_info) if not replies: diff --git a/youtube/util.py b/youtube/util.py index a81ae83..5b63e2a 100644 --- a/youtube/util.py +++ b/youtube/util.py @@ -277,15 +277,6 @@ def video_id(url): url_parts = urllib.parse.urlparse(url) return urllib.parse.parse_qs(url_parts.query)['v'][0] -def default_multi_get(object, *keys, default): - ''' Like dict.get(), but for nested dictionaries/sequences, supporting keys or indices. Last argument is the default value to use in case of any IndexErrors or KeyErrors ''' - try: - for key in keys: - object = object[key] - return object - except (IndexError, KeyError): - return default - # default, sddefault, mqdefault, hqdefault, hq720 def get_thumbnail_url(video_id): diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py index 440cc0d..551b663 100644 --- a/youtube/yt_data_extract.py +++ b/youtube/yt_data_extract.py @@ -1,4 +1,4 @@ -from youtube import util +from youtube import util, proto import html import json @@ -59,10 +59,14 @@ def format_text_runs(runs): return result - - - - +def default_multi_get(object, *keys, default): + ''' Like dict.get(), but for nested dictionaries/sequences, supporting keys or indices. 
Last argument is the default value to use in case of any IndexErrors or KeyErrors ''' + try: + for key in keys: + object = object[key] + return object + except (IndexError, KeyError): + return default def get_url(node): @@ -501,3 +505,102 @@ def extract_playlist_info(polymer_json): return info +def ctoken_metadata(ctoken): + result = dict() + params = proto.parse(proto.b64_to_bytes(ctoken)) + result['video_id'] = proto.parse(params[2])[2].decode('ascii') + + offset_information = proto.parse(params[6]) + result['offset'] = offset_information.get(5, 0) + + result['is_replies'] = False + if (3 in offset_information) and (2 in proto.parse(offset_information[3])): + result['is_replies'] = True + result['sort'] = None + else: + try: + result['sort'] = proto.parse(offset_information[4])[6] + except KeyError: + result['sort'] = 0 + return result + +def parse_comments_polymer(polymer_json): + try: + video_title = '' + response, err = get_response(polymer_json) + if err: + raise Exception(err) + + try: + url = polymer_json[1]['url'] + except (TypeError, IndexError, KeyError): + url = polymer_json['url'] + + ctoken = urllib.parse.parse_qs(url[url.find('?')+1:])['ctoken'][0] + metadata = ctoken_metadata(ctoken) + + try: + comments_raw = response['continuationContents']['commentSectionContinuation']['items'] + except KeyError: + comments_raw = response['continuationContents']['commentRepliesContinuation']['contents'] + + ctoken = default_multi_get(response, 'continuationContents', 'commentSectionContinuation', 'continuations', 0, 'nextContinuationData', 'continuation', default='') + + comments = [] + for comment_json in comments_raw: + number_of_replies = 0 + try: + comment_thread = comment_json['commentThreadRenderer'] + except KeyError: + comment_renderer = comment_json['commentRenderer'] + else: + if 'commentTargetTitle' in comment_thread: + video_title = comment_thread['commentTargetTitle']['runs'][0]['text'] + + if 'replies' in comment_thread: + view_replies_text = get_plain_text(comment_thread['replies']['commentRepliesRenderer']['moreText']) + view_replies_text = view_replies_text.replace(',', '') + match = re.search(r'(\d+)', view_replies_text) + if match is None: + number_of_replies = 1 + else: + number_of_replies = int(match.group(1)) + comment_renderer = comment_thread['comment']['commentRenderer'] + + comment = { + 'author_id': comment_renderer.get('authorId', ''), + 'author_avatar': comment_renderer['authorThumbnail']['thumbnails'][0]['url'], + 'likes': comment_renderer['likeCount'], + 'published': get_plain_text(comment_renderer['publishedTimeText']), + 'text': comment_renderer['contentText'].get('runs', ''), + 'number_of_replies': number_of_replies, + 'comment_id': comment_renderer['commentId'], + } + + if 'authorText' in comment_renderer: # deleted channels have no name or channel link + comment['author'] = get_plain_text(comment_renderer['authorText']) + comment['author_url'] = comment_renderer['authorEndpoint']['commandMetadata']['webCommandMetadata']['url'] + comment['author_channel_id'] = comment_renderer['authorEndpoint']['browseEndpoint']['browseId'] + else: + comment['author'] = '' + comment['author_url'] = '' + comment['author_channel_id'] = '' + + comments.append(comment) + except Exception as e: + print('Error parsing comments: ' + str(e)) + comments = () + ctoken = '' + + return { + 'ctoken': ctoken, + 'comments': comments, + 'video_title': video_title, + 'video_id': metadata['video_id'], + 'offset': metadata['offset'], + 'is_replies': metadata['is_replies'], + 'sort': 
metadata['sort'], + } + + + -- cgit v1.2.3 From ce8a658a0e56a9dfd3d0145e53f85711c4cbfb11 Mon Sep 17 00:00:00 2001 From: James Taylor Date: Fri, 27 Sep 2019 18:03:19 -0700 Subject: Extraction: Move item extraction into a generic, robust function --- youtube/yt_data_extract.py | 239 +++++++++++++++++++++++++++++++++------------ 1 file changed, 176 insertions(+), 63 deletions(-) (limited to 'youtube') diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py index 551b663..892e73e 100644 --- a/youtube/yt_data_extract.py +++ b/youtube/yt_data_extract.py @@ -4,6 +4,7 @@ import html import json import re import urllib +import collections from math import ceil # videos (all of type str): @@ -58,15 +59,52 @@ def format_text_runs(runs): result += html.escape(text_run["text"]) return result +def default_get(object, key, default, types=()): + '''Like dict.get(), but returns default if the result doesn't match one of the types. + Also works for indexing lists.''' + try: + result = object[key] + except (TypeError, IndexError, KeyError): + return default + + if not types or isinstance(result, types): + return result + else: + return default + -def default_multi_get(object, *keys, default): - ''' Like dict.get(), but for nested dictionaries/sequences, supporting keys or indices. Last argument is the default value to use in case of any IndexErrors or KeyErrors ''' + +def default_multi_get(object, *keys, default, types=()): + '''Like dict.get(), but for nested dictionaries/sequences, supporting keys or indices. + Last argument is the default value to use in case of any IndexErrors or KeyErrors. + If types is given and the result doesn't match one of those types, default is returned''' try: for key in keys: object = object[key] - return object - except (IndexError, KeyError): + except (TypeError, IndexError, KeyError): return default + else: + if not types or isinstance(object, types): + return object + else: + return default + +def multi_default_multi_get(object, *key_sequences, default=None, types=()): + '''Like default_multi_get, but can try different key sequences in case one fails. + Return default if all of them fail. 
key_sequences is a list of lists''' + for key_sequence in key_sequences: + _object = object + try: + for key in key_sequence: + _object = _object[key] + except (TypeError, IndexError, KeyError): + pass + else: + if not types or isinstance(_object, types): + return _object + else: + continue + return default def get_url(node): @@ -284,6 +322,7 @@ def parse_info_prepare_for_html(renderer, additional_info={}): return item +# TODO: Type checking def get_response(polymer_json): '''return response, error''' @@ -301,6 +340,123 @@ def get_response(polymer_json): return None, 'Failed to extract response' +list_types = { + 'sectionListRenderer', + 'itemSectionRenderer', + 'gridRenderer', + 'playlistVideoListRenderer', +} + +item_types = { + 'movieRenderer', + 'didYouMeanRenderer', + 'showingResultsForRenderer', + + 'videoRenderer', + 'compactVideoRenderer', + 'gridVideoRenderer', + 'playlistVideoRenderer', + + 'playlistRenderer', + 'compactPlaylistRenderer', + 'gridPlaylistRenderer', + + 'radioRenderer', + 'compactRadioRenderer', + 'gridRadioRenderer', + + 'showRenderer', + 'compactShowRenderer', + 'gridShowRenderer', + + + 'channelRenderer', + 'compactChannelRenderer', + 'gridChannelRenderer', + + 'channelAboutFullMetadataRenderer', +} + +def traverse_browse_renderer(renderer): + for tab in default_get(renderer, 'tabs', (), types=(list, tuple)): + tab_renderer = multi_default_multi_get(tab, ['tabRenderer'], ['expandableTabRenderer'], default=None, types=dict) + if tab_renderer is None: + continue + if tab_renderer.get('selected', False): + return default_get(tab_renderer, 'content', {}, types=(dict)) + print('Could not find tab with content') + return {} + +# these renderers contain one inside them +nested_renderer_dispatch = { + 'singleColumnBrowseResultsRenderer': traverse_browse_renderer, + 'twoColumnBrowseResultsRenderer': traverse_browse_renderer, + 'twoColumnSearchResultsRenderer': lambda renderer: default_get(renderer, 'primaryContents', {}, types=dict), +} + +def extract_items(response): + '''return items, ctoken''' + if 'continuationContents' in response: + # always has just the one [something]Continuation key, but do this just in case they add some tracking key or something + for key, renderer_continuation in default_get(response, 'continuationContents', {}, types=dict).items(): + if key.endswith('Continuation'): # e.g. 
commentSectionContinuation, playlistVideoListContinuation + items = multi_default_multi_get(renderer_continuation, ['contents'], ['items'], default=None, types=(list, tuple)) + ctoken = default_multi_get(renderer_continuation, 'continuations', 0, 'nextContinuationData', 'continuation', default=None, types=str) + return items, ctoken + return [], None + elif 'contents' in response: + ctoken = None + items = [] + + iter_stack = collections.deque() + current_iter = iter(()) + + renderer = default_get(response, 'contents', {}, types=dict) + + while True: + # mode 1: dig into the current renderer + # Will stay in mode 1 (via continue) if a new renderer is found inside this one + # Otherwise, after finding that it is an item renderer, + # contains a list, or contains nothing, + # falls through into mode 2 to get a new renderer + if len(renderer) != 0: + key, value = list(renderer.items())[0] + + # has a list in it, add it to the iter stack + if key in list_types: + renderer_list = multi_default_multi_get(value, ['contents'], ['items'], default=(), types=(list, tuple)) + if renderer_list: + iter_stack.append(current_iter) + current_iter = iter(renderer_list) + + continuation = default_multi_get(value, 'continuations', 0, 'nextContinuationData', 'continuation', default=None, types=str) + if continuation: + ctoken = continuation + + # new renderer nested inside this one + elif key in nested_renderer_dispatch: + renderer = nested_renderer_dispatch[key](value) + continue # back to mode 1 + + # the renderer is an item + elif key in item_types: + items.append(renderer) + + + # mode 2: get a new renderer by iterating. + # goes up the stack for an iterator if one has been exhausted + while current_iter is not None: + try: + renderer = current_iter.__next__() + break + except StopIteration: + try: + current_iter = iter_stack.pop() # go back up the stack + except IndexError: + return items, ctoken + + else: + return [], None def extract_channel_info(polymer_json, tab): response, err = get_response(polymer_json) @@ -341,54 +497,21 @@ def extract_channel_info(polymer_json, tab): return info - # find the tab with content - # example channel where tabs do not have definite index: https://www.youtube.com/channel/UC4gQ8i3FD7YbhOgqUkeQEJg - # TODO: maybe use the 'selected' attribute for this? 
- if 'continuationContents' not in response: - tab_renderer = None - tab_content = None - for tab_json in response['contents']['twoColumnBrowseResultsRenderer']['tabs']: - try: - tab_renderer = tab_json['tabRenderer'] - except KeyError: - tab_renderer = tab_json['expandableTabRenderer'] + items, _ = extract_items(response) + if tab in ('videos', 'playlists', 'search'): + additional_info = {'author': info['channel_name'], 'author_url': 'https://www.youtube.com/channel/' + channel_id} + info['items'] = [renderer_info(renderer, additional_info) for renderer in items] + + elif tab == 'about': + for item in items: try: - tab_content = tab_renderer['content'] + channel_metadata = item['channelAboutFullMetadataRenderer'] break except KeyError: pass - else: # didn't break - raise Exception("No tabs found with content") - assert tab == tab_renderer['title'].lower() - - - # extract tab-specific info - if tab in ('videos', 'playlists', 'search'): # find the list of items - if 'continuationContents' in response: - try: - items = response['continuationContents']['gridContinuation']['items'] - except KeyError: - items = response['continuationContents']['sectionListContinuation']['contents'] # for search else: - contents = tab_content['sectionListRenderer']['contents'] - if 'itemSectionRenderer' in contents[0]: - item_section = contents[0]['itemSectionRenderer']['contents'][0] - try: - items = item_section['gridRenderer']['items'] - except KeyError: - if "messageRenderer" in item_section: - items = [] - else: - raise Exception('gridRenderer missing but messageRenderer not found') - else: - items = contents # for search - - additional_info = {'author': info['channel_name'], 'author_url': 'https://www.youtube.com/channel/' + channel_id} - info['items'] = [renderer_info(renderer, additional_info) for renderer in items] - - elif tab == 'about': - channel_metadata = tab_content['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer'] - + info['error'] = 'Could not find channelAboutFullMetadataRenderer' + return info info['links'] = [] for link_json in channel_metadata.get('primaryLinks', ()): @@ -428,10 +551,9 @@ def extract_search_info(polymer_json): info['estimated_results'] = int(response['estimatedResults']) info['estimated_pages'] = ceil(info['estimated_results']/20) - # almost always is the first "section", but if there's an advertisement for a google product like Stadia or Home in the search results, then that becomes the first "section" and the search results are in the second. 
So just join all of them for resiliency - results = [] - for section in response['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents']: - results += section['itemSectionRenderer']['contents'] + + results, _ = extract_items(response) + info['items'] = [] info['corrections'] = {'type': None} @@ -491,12 +613,8 @@ def extract_playlist_info(polymer_json): if err: return {'error': err} info = {'error': None} - try: # first page - video_list = response['contents']['singleColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['playlistVideoListRenderer']['contents'] - first_page = True - except KeyError: # other pages - video_list = response['continuationContents']['playlistVideoListContinuation']['contents'] - first_page = False + first_page = 'continuationContents' not in response + video_list, _ = extract_items(response) info['items'] = [renderer_info(renderer) for renderer in video_list] @@ -539,12 +657,7 @@ def parse_comments_polymer(polymer_json): ctoken = urllib.parse.parse_qs(url[url.find('?')+1:])['ctoken'][0] metadata = ctoken_metadata(ctoken) - try: - comments_raw = response['continuationContents']['commentSectionContinuation']['items'] - except KeyError: - comments_raw = response['continuationContents']['commentRepliesContinuation']['contents'] - - ctoken = default_multi_get(response, 'continuationContents', 'commentSectionContinuation', 'continuations', 0, 'nextContinuationData', 'continuation', default='') + comments_raw, ctoken = extract_items(response) comments = [] for comment_json in comments_raw: -- cgit v1.2.3 From e68ac26b4e2c216dad41e22da91067e2ddc80d00 Mon Sep 17 00:00:00 2001 From: James Taylor Date: Fri, 27 Sep 2019 18:07:46 -0700 Subject: Extraction: Rename get_response to extract_response and check that returned type is dict --- youtube/yt_data_extract.py | 31 +++++++++++-------------------- 1 file changed, 11 insertions(+), 20 deletions(-) (limited to 'youtube') diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py index 892e73e..13d6ede 100644 --- a/youtube/yt_data_extract.py +++ b/youtube/yt_data_extract.py @@ -322,23 +322,14 @@ def parse_info_prepare_for_html(renderer, additional_info={}): return item -# TODO: Type checking -def get_response(polymer_json): +def extract_response(polymer_json): '''return response, error''' + response = multi_default_multi_get(polymer_json, [1, 'response'], ['response'], default=None, types=dict) + if response is None: + return None, 'Failed to extract response' + else: + return response, None - # responses returned for desktop version - try: - return polymer_json[1]['response'], None - except (TypeError, KeyError, IndexError): - pass - - # responses returned for mobile version - try: - return polymer_json['response'], None - except (TypeError, KeyError): - pass - - return None, 'Failed to extract response' list_types = { 'sectionListRenderer', @@ -459,7 +450,7 @@ def extract_items(response): return [], None def extract_channel_info(polymer_json, tab): - response, err = get_response(polymer_json) + response, err = extract_response(polymer_json) if err: return {'error': err} @@ -544,7 +535,7 @@ def extract_channel_info(polymer_json, tab): return info def extract_search_info(polymer_json): - response, err = get_response(polymer_json) + response, err = extract_response(polymer_json) if err: return {'error': err} info = {'error': None} @@ -595,7 +586,7 @@ def extract_search_info(polymer_json): 
return info def extract_playlist_metadata(polymer_json): - response, err = get_response(polymer_json) + response, err = extract_response(polymer_json) if err: return {'error': err} metadata = renderer_info(response['header']) @@ -609,7 +600,7 @@ def extract_playlist_metadata(polymer_json): return metadata def extract_playlist_info(polymer_json): - response, err = get_response(polymer_json) + response, err = extract_response(polymer_json) if err: return {'error': err} info = {'error': None} @@ -645,7 +636,7 @@ def ctoken_metadata(ctoken): def parse_comments_polymer(polymer_json): try: video_title = '' - response, err = get_response(polymer_json) + response, err = extract_response(polymer_json) if err: raise Exception(err) -- cgit v1.2.3 From 9abb83fdbc05294f186daeefff8c85cfda06b7d2 Mon Sep 17 00:00:00 2001 From: James Taylor Date: Fri, 27 Sep 2019 19:27:19 -0700 Subject: Extraction: Fix did_you_mean and showing_results_for --- youtube/search.py | 11 +++++++++++ youtube/templates/search.html | 6 +++--- youtube/yt_data_extract.py | 15 ++++----------- 3 files changed, 18 insertions(+), 14 deletions(-) (limited to 'youtube') diff --git a/youtube/search.py b/youtube/search.py index ba40f0b..cb66744 100644 --- a/youtube/search.py +++ b/youtube/search.py @@ -83,6 +83,17 @@ def get_search_page(): yt_data_extract.prefix_urls(item_info) yt_data_extract.add_extra_html_info(item_info) + corrections = search_info['corrections'] + if corrections['type'] == 'did_you_mean': + corrected_query_string = request.args.to_dict(flat=False) + corrected_query_string['query'] = [corrections['corrected_query']] + corrections['corrected_query_url'] = util.URL_ORIGIN + '/search?' + urllib.parse.urlencode(corrected_query_string, doseq=True) + elif corrections['type'] == 'showing_results_for': + no_autocorrect_query_string = request.args.to_dict(flat=False) + no_autocorrect_query_string['autocorrect'] = ['0'] + no_autocorrect_query_url = util.URL_ORIGIN + '/search?' + urllib.parse.urlencode(no_autocorrect_query_string, doseq=True) + corrections['original_query_url'] = no_autocorrect_query_url + return flask.render_template('search.html', header_playlist_names = local_playlist.get_playlist_names(), query = query, diff --git a/youtube/templates/search.html b/youtube/templates/search.html index aef914a..8b803e7 100644 --- a/youtube/templates/search.html +++ b/youtube/templates/search.html @@ -29,10 +29,10 @@
Approximately {{ '{:,}'.format(estimated_results) }} results ({{ '{:,}'.format(estimated_pages) }} pages)
     {% if corrections['type'] == 'showing_results_for' %}
-        <div>Showing results for <a>{{ corrections['corrected_query']|safe }}</a></div>
-        <div>Search instead for <a href="{{ corrections['original_query_url'] }}">{{ corrections['original_query'] }}</a></div>
+        <div>Showing results for <a>{{ common_elements.text_runs(corrections['corrected_query_text']) }}</a></div>
+        <div>Search instead for <a href="{{ corrections['original_query_url'] }}">{{ corrections['original_query_text'] }}</a></div>
     {% elif corrections['type'] == 'did_you_mean' %}
-        <div>Did you mean <a href="{{ corrections['corrected_query_url'] }}">{{ corrections['corrected_query']|safe }}</a></div>
+        <div>Did you mean <a href="{{ corrections['corrected_query_url'] }}">{{ common_elements.text_runs(corrections['corrected_query_text']) }}</a></div>
     {% endif %}
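Taken together, this commit moves URL construction out of the extractor: yt_data_extract.py (below) now returns only the raw corrected query ('corrected_query') and its text runs ('corrected_query_text'), and search.py rebuilds the links from the live request parameters. A minimal sketch of that URL-building step, with made-up parameter values (request.args.to_dict(flat=False) yields lists because query parameters can repeat):

    import urllib.parse

    params = {'query': ['talos principle'], 'autocorrect': ['0']}
    # doseq=True expands each list into repeated key=value pairs instead of
    # encoding the Python list literal
    url = '/search?' + urllib.parse.urlencode(params, doseq=True)
    # url == '/search?query=talos+principle&autocorrect=0'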
diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py index 13d6ede..cccd679 100644 --- a/youtube/yt_data_extract.py +++ b/youtube/yt_data_extract.py @@ -554,27 +554,20 @@ def extract_search_info(polymer_json): continue if type == 'didYouMeanRenderer': renderer = renderer[type] - corrected_query_string = request.args.to_dict(flat=False) - corrected_query_string['query'] = [renderer['correctedQueryEndpoint']['searchEndpoint']['query']] - corrected_query_url = util.URL_ORIGIN + '/search?' + urllib.parse.urlencode(corrected_query_string, doseq=True) info['corrections'] = { 'type': 'did_you_mean', - 'corrected_query': yt_data_extract.format_text_runs(renderer['correctedQuery']['runs']), - 'corrected_query_url': corrected_query_url, + 'corrected_query': renderer['correctedQueryEndpoint']['searchEndpoint']['query'], + 'corrected_query_text': renderer['correctedQuery']['runs'], } continue if type == 'showingResultsForRenderer': renderer = renderer[type] - no_autocorrect_query_string = request.args.to_dict(flat=False) - no_autocorrect_query_string['autocorrect'] = ['0'] - no_autocorrect_query_url = util.URL_ORIGIN + '/search?' + urllib.parse.urlencode(no_autocorrect_query_string, doseq=True) info['corrections'] = { 'type': 'showing_results_for', - 'corrected_query': yt_data_extract.format_text_runs(renderer['correctedQuery']['runs']), - 'original_query_url': no_autocorrect_query_url, - 'original_query': renderer['originalQuery']['simpleText'], + 'corrected_query_text': renderer['correctedQuery']['runs'], + 'original_query_text': renderer['originalQuery']['simpleText'], } continue -- cgit v1.2.3 From 4c07546e7a5e5882abdda896009b744e947df1c4 Mon Sep 17 00:00:00 2001 From: James Taylor Date: Thu, 17 Oct 2019 19:58:13 -0700 Subject: Extraction: Replace youtube-dl with custom-built watch page extraction --- youtube/templates/watch.html | 21 ++- youtube/util.py | 9 +- youtube/watch.py | 154 +++++++-------- youtube/yt_data_extract.py | 435 +++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 523 insertions(+), 96 deletions(-) (limited to 'youtube') diff --git a/youtube/templates/watch.html b/youtube/templates/watch.html index 14e953b..e97b638 100644 --- a/youtube/templates/watch.html +++ b/youtube/templates/watch.html @@ -187,8 +187,17 @@ .format-ext{ width: 60px; } - .format-res{ - width:90px; + .format-video-quality{ + width: 140px; + } + .format-audio-quality{ + width: 120px; + } + .format-file-size{ + width: 80px; + } + .format-codecs{ + width: 120px; } {% endblock style %} @@ -227,8 +236,10 @@
                         <li class="format-ext">{{ format['ext'] }}</li>
-                        <li class="format-res">{{ format['resolution'] }}</li>
-                        <li class="format-note">{{ format['note'] }}</li>
+                        <li class="format-video-quality">{{ format['video_quality'] }}</li>
+                        <li class="format-audio-quality">{{ format['audio_quality'] }}</li>
+                        <li class="format-file-size">{{ format['file_size'] }}</li>
+                        <li class="format-codecs">{{ format['codecs'] }}</li>
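The four new columns are computed in watch.py later in this patch (video_quality_string, audio_quality_string, format_bytes, and the codecs string). As an illustrative sketch only, with invented values, one entry of download_formats would look roughly like:

    format_row = {
        'url': 'https://example.com/videoplayback',  # hypothetical url
        'ext': 'mp4',
        'video_quality': '1280x720 30fps',  # from video_quality_string()
        'audio_quality': '192k 44100 Hz',   # from audio_quality_string()
        'file_size': '4.05MiB',             # from format_bytes()
        'codecs': 'aac, h264',
    }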
@@ -238,7 +249,7 @@
-            <span class="description">{{ description }}</span>
+            <span class="description">{{ common_elements.text_runs(description) }}</span>
{% if music_list.__len__() != 0 %}
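The description cell now goes through the common_elements.text_runs macro rather than being printed directly, so descriptions supplied as Youtube's 'runs' lists render with working links; extract_formatted_text in yt_data_extract.py (below) shows how such runs are normalized, and since this commit still passes the description as a plain string, the macro presumably passes strings through unchanged. A sketch of the runs structure, with invented content:

    description = [
        {'text': 'Full interview: '},
        # runs that were links carry 'url'; 'text' is replaced with the full
        # url because youtube truncates the displayed link text
        {'text': 'https://example.com/interview', 'url': 'https://example.com/interview'},
    ]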
diff --git a/youtube/util.py b/youtube/util.py index 5b63e2a..474e7b5 100644 --- a/youtube/util.py +++ b/youtube/util.py @@ -176,7 +176,7 @@ def fetch_url(url, headers=(), timeout=15, report_text=None, data=None, cookieja return content, response return content -mobile_user_agent = 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1' +mobile_user_agent = 'Mozilla/5.0 (Linux; Android 7.0; Redmi Note 4 Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Mobile Safari/537.36' mobile_ua = (('User-Agent', mobile_user_agent),) desktop_user_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:52.0) Gecko/20100101 Firefox/52.0' desktop_ua = (('User-Agent', desktop_user_agent),) @@ -312,3 +312,10 @@ def uppercase_escape(s): def prefix_url(url): url = url.lstrip('/') # some urls have // before them, which has a special meaning return '/' + url + +def left_remove(string, substring): + '''removes substring from the start of string, if present''' + if string.startswith(substring): + return string[len(substring):] + return string + diff --git a/youtube/watch.py b/youtube/watch.py index 41c90e4..a5e0759 100644 --- a/youtube/watch.py +++ b/youtube/watch.py @@ -5,49 +5,15 @@ import settings from flask import request import flask -from youtube_dl.YoutubeDL import YoutubeDL -from youtube_dl.extractor.youtube import YoutubeError import json import html import gevent import os +import math +import traceback + -def get_related_items(info): - results = [] - for item in info['related_vids']: - if 'list' in item: # playlist: - result = watch_page_related_playlist_info(item) - else: - result = watch_page_related_video_info(item) - yt_data_extract.prefix_urls(result) - yt_data_extract.add_extra_html_info(result) - results.append(result) - return results - - -# json of related items retrieved directly from the watch page has different names for everything -# converts these to standard names -def watch_page_related_video_info(item): - result = {key: item[key] for key in ('id', 'title', 'author')} - result['duration'] = util.seconds_to_timestamp(item['length_seconds']) - try: - result['views'] = item['short_view_count_text'] - except KeyError: - result['views'] = '' - result['thumbnail'] = util.get_thumbnail_url(item['id']) - result['type'] = 'video' - return result - -def watch_page_related_playlist_info(item): - return { - 'size': item['playlist_length'] if item['playlist_length'] != "0" else "50+", - 'title': item['playlist_title'], - 'id': item['list'], - 'first_video_id': item['video_id'], - 'thumbnail': util.get_thumbnail_url(item['video_id']), - 'type': 'playlist', - } def get_video_sources(info): video_sources = [] @@ -55,9 +21,10 @@ def get_video_sources(info): max_resolution = 360 else: max_resolution = settings.default_resolution - for format in info['formats']: - if format['acodec'] != 'none' and format['vcodec'] != 'none' and format['height'] <= max_resolution: + if not all(attr in format for attr in ('height', 'width', 'ext', 'url')): + continue + if 'acodec' in format and 'vcodec' in format and format['height'] <= max_resolution: video_sources.append({ 'src': format['url'], 'type': 'video/' + format['ext'], @@ -134,14 +101,57 @@ def get_ordered_music_list_attributes(music_list): return ordered_attributes +headers = ( + ('Accept', '*/*'), + ('Accept-Language', 'en-US,en;q=0.5'), + ('X-YouTube-Client-Name', '2'), + ('X-YouTube-Client-Version', '2.20180830'), +) + util.mobile_ua -def extract_info(downloader, 
*args, **kwargs): +def extract_info(video_id): + polymer_json = util.fetch_url('https://m.youtube.com/watch?v=' + video_id + '&pbj=1', headers=headers, debug_name='watch') try: - return downloader.extract_info(*args, **kwargs) - except YoutubeError as e: - return str(e) - - + polymer_json = json.loads(polymer_json) + except json.decoder.JSONDecodeError: + traceback.print_exc() + return {'error': 'Failed to parse json response'} + return yt_data_extract.extract_watch_info(polymer_json) + +def video_quality_string(format): + if 'vcodec' in format: + result =str(format.get('width', '?')) + 'x' + str(format.get('height', '?')) + if 'fps' in format: + result += ' ' + format['fps'] + 'fps' + return result + elif 'acodec' in format: + return 'audio only' + + return '?' + +def audio_quality_string(format): + if 'acodec' in format: + result = str(format.get('abr', '?')) + 'k' + if 'audio_sample_rate' in format: + result += ' ' + str(format['audio_sample_rate']) + ' Hz' + return result + elif 'vcodec' in format: + return 'video only' + + return '?' + +# from https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/utils.py +def format_bytes(bytes): + if bytes is None: + return 'N/A' + if type(bytes) is str: + bytes = float(bytes) + if bytes == 0.0: + exponent = 0 + else: + exponent = int(math.log(bytes, 1024.0)) + suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent] + converted = float(bytes) / float(1024 ** exponent) + return '%.2f%s' % (converted, suffix) @yt_app.route('/watch') @@ -152,38 +162,26 @@ def get_watch_page(): flask.abort(flask.Response('Incomplete video id (too short): ' + video_id)) lc = request.args.get('lc', '') - if settings.route_tor: - proxy = 'socks5://127.0.0.1:9150/' - else: - proxy = '' - yt_dl_downloader = YoutubeDL(params={'youtube_include_dash_manifest':False, 'proxy':proxy}) tasks = ( gevent.spawn(comments.video_comments, video_id, int(settings.default_comment_sorting), lc=lc ), - gevent.spawn(extract_info, yt_dl_downloader, "https://www.youtube.com/watch?v=" + video_id, download=False) + gevent.spawn(extract_info, video_id) ) gevent.joinall(tasks) comments_info, info = tasks[0].value, tasks[1].value - if isinstance(info, str): # youtube error - return flask.render_template('error.html', error_message = info) + if info['error']: + return flask.render_template('error.html', error_message = info['error']) video_info = { - "duration": util.seconds_to_timestamp(info["duration"]), + "duration": util.seconds_to_timestamp(info["duration"] or 0), "id": info['id'], "title": info['title'], - "author": info['uploader'], + "author": info['author'], } - upload_year = info["upload_date"][0:4] - upload_month = info["upload_date"][4:6] - upload_day = info["upload_date"][6:8] - upload_date = upload_month + "/" + upload_day + "/" + upload_year - - if settings.related_videos_mode: - related_videos = get_related_items(info) - else: - related_videos = [] - + for item in info['related_videos']: + yt_data_extract.prefix_urls(item) + yt_data_extract.add_extra_html_info(item) if settings.gather_googlevideo_domains: with open(os.path.join(settings.data_dir, 'googlevideo-domains.txt'), 'a+', encoding='utf-8') as f: @@ -195,23 +193,29 @@ def get_watch_page(): download_formats = [] for format in info['formats']: + if 'acodec' in format and 'vcodec' in format: + codecs_string = format['acodec'] + ', ' + format['vcodec'] + else: + codecs_string = format.get('acodec') or format.get('vcodec') or '?' 
download_formats.append({ 'url': format['url'], - 'ext': format['ext'], - 'resolution': yt_dl_downloader.format_resolution(format), - 'note': yt_dl_downloader._format_note(format), + 'ext': format.get('ext', '?'), + 'audio_quality': audio_quality_string(format), + 'video_quality': video_quality_string(format), + 'file_size': format_bytes(format['file_size']), + 'codecs': codecs_string, }) video_sources = get_video_sources(info) - video_height = video_sources[0]['height'] - + video_height = yt_data_extract.default_multi_get(video_sources, 0, 'height', default=360) + video_width = yt_data_extract.default_multi_get(video_sources, 0, 'width', default=640) # 1 second per pixel, or the actual video width - theater_video_target_width = max(640, info['duration'], video_sources[0]['width']) + theater_video_target_width = max(640, info['duration'] or 0, video_width) return flask.render_template('watch.html', header_playlist_names = local_playlist.get_playlist_names(), - uploader_channel_url = '/' + info['uploader_url'], - upload_date = upload_date, + uploader_channel_url = ('/' + info['author_url']) if info['author_url'] else '', + upload_date = info['published_date'], views = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("view_count", None)), likes = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("like_count", None)), dislikes = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("dislike_count", None)), @@ -219,7 +223,7 @@ def get_watch_page(): video_info = json.dumps(video_info), video_sources = video_sources, subtitle_sources = get_subtitle_sources(info), - related = related_videos, + related = info['related_videos'], music_list = info['music_list'], music_attributes = get_ordered_music_list_attributes(info['music_list']), comments_info = comments_info, @@ -232,7 +236,7 @@ def get_watch_page(): theater_video_target_width = theater_video_target_width, title = info['title'], - uploader = info['uploader'], + uploader = info['author'], description = info['description'], unlisted = info['unlisted'], ) diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py index cccd679..81604fd 100644 --- a/youtube/yt_data_extract.py +++ b/youtube/yt_data_extract.py @@ -6,6 +6,7 @@ import re import urllib import collections from math import ceil +import traceback # videos (all of type str): @@ -36,8 +37,112 @@ from math import ceil # size # first_video_id - - +# from https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/youtube.py +_formats = { + '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, + '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, + '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'}, + '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'}, + '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'}, + '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, + '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well + '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'}, + '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 
'h264'}, + '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, + '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, + '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, + '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, + '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, + '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + + + # 3D videos + '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, + '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, + '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, + '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, + '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, + + # Apple HTTP Live Streaming + '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264'}, + '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264'}, + '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264'}, + '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264'}, + '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264'}, + '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264'}, + + # DASH mp4 video + '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559) + '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, + '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, + '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'}, + + # 
Dash mp4 audio + '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'}, + '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'}, + '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'}, + '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, + '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, + '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'}, + '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'}, + + # Dash webm + '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'}, + '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + # itag 272 videos are either 3840x2160 (e.g. 
RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug) + '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + + # Dash webm audio + '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128}, + '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256}, + + # Dash webm audio with opus inside + '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50}, + '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70}, + '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160}, + + # RTMP (unnamed) + '_rtmp': {'protocol': 'rtmp'}, + + # av01 video only formats sometimes served with "unknown" codecs + '394': {'vcodec': 'av01.0.05M.08'}, + '395': {'vcodec': 'av01.0.05M.08'}, + '396': {'vcodec': 'av01.0.05M.08'}, + '397': {'vcodec': 'av01.0.05M.08'}, +} def get_plain_text(node): @@ -59,7 +164,7 @@ def format_text_runs(runs): result += html.escape(text_run["text"]) return result -def default_get(object, key, default, types=()): +def default_get(object, key, default=None, types=()): '''Like dict.get(), but returns default if the result doesn't match one of the types. Also works for indexing lists.''' try: @@ -74,7 +179,7 @@ def default_get(object, key, default, types=()): -def default_multi_get(object, *keys, default, types=()): +def default_multi_get(object, *keys, default=None, types=()): '''Like dict.get(), but for nested dictionaries/sequences, supporting keys or indices. Last argument is the default value to use in case of any IndexErrors or KeyErrors. 
If types is given and the result doesn't match one of those types, default is returned''' @@ -106,6 +211,11 @@ def multi_default_multi_get(object, *key_sequences, default=None, types=()): continue return default +def remove_redirect(url): + if re.fullmatch(r'(((https?:)?//)?(www.)?youtube.com)?/redirect\?.*', url) is not None: # youtube puts these on external links to do tracking + query_string = url[url.find('?')+1: ] + return urllib.parse.parse_qs(query_string)['q'][0] + return url def get_url(node): try: @@ -239,9 +349,9 @@ def renderer_info(renderer, additional_info={}): type = list(renderer.keys())[0] renderer = renderer[type] info = {} - if type == 'itemSectionRenderer': + if type in ('itemSectionRenderer', 'compactAutoplayRenderer'): return renderer_info(renderer['contents'][0], additional_info) - + if type in ('movieRenderer', 'clarificationRenderer'): info['type'] = 'unsupported' return info @@ -345,6 +455,7 @@ item_types = { 'videoRenderer', 'compactVideoRenderer', + 'compactAutoplayRenderer', 'gridVideoRenderer', 'playlistVideoRenderer', @@ -378,6 +489,11 @@ def traverse_browse_renderer(renderer): print('Could not find tab with content') return {} +def traverse_standard_list(renderer): + renderer_list = multi_default_multi_get(renderer, ['contents'], ['items'], default=(), types=(list, tuple)) + continuation = default_multi_get(renderer, 'continuations', 0, 'nextContinuationData', 'continuation') + return renderer_list, continuation + # these renderers contain one inside them nested_renderer_dispatch = { 'singleColumnBrowseResultsRenderer': traverse_browse_renderer, @@ -385,7 +501,16 @@ nested_renderer_dispatch = { 'twoColumnSearchResultsRenderer': lambda renderer: default_get(renderer, 'primaryContents', {}, types=dict), } -def extract_items(response): +# these renderers contain a list of renderers in side them +nested_renderer_list_dispatch = { + 'sectionListRenderer': traverse_standard_list, + 'itemSectionRenderer': traverse_standard_list, + 'gridRenderer': traverse_standard_list, + 'playlistVideoListRenderer': traverse_standard_list, + 'singleColumnWatchNextResults': lambda r: (default_multi_get(r, 'results', 'results', 'contents', default=[], types=(list, tuple)), None), +} + +def extract_items(response, item_types=item_types): '''return items, ctoken''' if 'continuationContents' in response: # always has just the one [something]Continuation key, but do this just in case they add some tracking key or something @@ -414,13 +539,11 @@ def extract_items(response): key, value = list(renderer.items())[0] # has a list in it, add it to the iter stack - if key in list_types: - renderer_list = multi_default_multi_get(value, ['contents'], ['items'], default=(), types=(list, tuple)) + if key in nested_renderer_list_dispatch: + renderer_list, continuation = nested_renderer_list_dispatch[key](value) if renderer_list: iter_stack.append(current_iter) current_iter = iter(renderer_list) - - continuation = default_multi_get(value, 'continuations', 0, 'nextContinuationData', 'continuation', default=None, types=str) if continuation: ctoken = continuation @@ -506,10 +629,7 @@ def extract_channel_info(polymer_json, tab): info['links'] = [] for link_json in channel_metadata.get('primaryLinks', ()): - url = link_json['navigationEndpoint']['urlEndpoint']['url'] - if url.startswith('/redirect'): # youtube puts these on external links to do tracking - query_string = url[url.find('?')+1: ] - url = urllib.parse.parse_qs(query_string)['q'][0] + url = 
remove_redirect(link_json['navigationEndpoint']['urlEndpoint']['url']) text = get_plain_text(link_json['title']) @@ -699,5 +819,290 @@ def parse_comments_polymer(polymer_json): 'sort': metadata['sort'], } +def check_missing_keys(object, *key_sequences): + for key_sequence in key_sequences: + _object = object + try: + for key in key_sequence: + _object = object[key] + except (KeyError, IndexError, TypeError): + return 'Could not find ' + key + + return None + +def extract_plain_text(node, default=None): + if isinstance(node, str): + return node + + try: + return node['simpleText'] + except (KeyError, TypeError): + pass + + try: + return ''.join(text_run['text'] for text_run in node['runs']) + except (KeyError, TypeError): + pass + + return default + +def extract_formatted_text(node): + try: + result = [] + runs = node['runs'] + for run in runs: + url = default_multi_get(run, 'navigationEndpoint', 'urlEndpoint', 'url') + if url is not None: + run['url'] = remove_redirect(url) + run['text'] = run['url'] # youtube truncates the url text, we don't want that nonsense + return runs + except (KeyError, TypeError): + traceback.print_exc() + pass + + try: + return [{'text': node['simpleText']}] + except (KeyError, TypeError): + pass + + return [] + +def extract_integer(string): + if not isinstance(string, str): + return None + match = re.search(r'(\d+)', string.replace(',', '')) + if match is None: + return None + try: + return int(match.group(1)) + except ValueError: + return None + +def extract_metadata_row_info(video_renderer_info): + # extract category and music list + info = { + 'category': None, + 'music_list': [], + } + + current_song = {} + for row in default_multi_get(video_renderer_info, 'metadataRowContainer', 'metadataRowContainerRenderer', 'rows', default=[]): + row_title = extract_plain_text(default_multi_get(row, 'metadataRowRenderer', 'title'), default='') + row_content = extract_plain_text(default_multi_get(row, 'metadataRowRenderer', 'contents', 0)) + if row_title == 'Category': + info['category'] = row_content + elif row_title in ('Song', 'Music'): + if current_song: + info['music_list'].append(current_song) + current_song = {'title': row_content} + elif row_title == 'Artist': + current_song['artist'] = row_content + elif row_title == 'Album': + current_song['album'] = row_content + elif row_title == 'Writers': + current_song['writers'] = row_content + elif row_title.startswith('Licensed'): + current_song['licensor'] = row_content + if current_song: + info['music_list'].append(current_song) + return info + +def extract_watch_info_mobile(top_level): + info = {} + microformat = default_multi_get(top_level, 'playerResponse', 'microformat', 'playerMicroformatRenderer', default={}) + + info['allowed_countries'] = microformat.get('availableCountries', []) + info['published_date'] = microformat.get('publishDate') + + response = top_level.get('response', {}) + + # video info from metadata renderers + items, _ = extract_items(response, item_types={'slimVideoMetadataRenderer'}) + if items: + video_info = items[0]['slimVideoMetadataRenderer'] + else: + print('Failed to extract video metadata') + video_info = {} + + info.update(extract_metadata_row_info(video_info)) + #info['description'] = extract_formatted_text(video_info.get('description')) + info['like_count'] = None + info['dislike_count'] = None + for button in video_info.get('buttons', ()): + button_renderer = button.get('slimMetadataToggleButtonRenderer', {}) + + # all the digits can be found in the accessibility data + count = 
extract_integer(default_multi_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText', 'accessibility', 'accessibilityData', 'label')) + + # this count doesn't have all the digits, it's like 53K for instance + dumb_count = extract_integer(extract_plain_text(default_multi_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText'))) + + # the accessibility text will be "No likes" or "No dislikes" or something like that, but dumb count will be 0 + if dumb_count == 0: + count = 0 + + if 'isLike' in button_renderer: + info['like_count'] = count + elif 'isDislike' in button_renderer: + info['dislike_count'] = count + + # comment section info + items, _ = extract_items(response, item_types={'commentSectionRenderer'}) + if items: + comment_info = items[0]['commentSectionRenderer'] + comment_count_text = extract_plain_text(default_multi_get(comment_info, 'header', 'commentSectionHeaderRenderer', 'countText')) + if comment_count_text == 'Comments': # just this with no number, means 0 comments + info['comment_count'] = 0 + else: + info['comment_count'] = extract_integer(comment_count_text) + info['comments_disabled'] = False + else: # no comment section present means comments are disabled + info['comment_count'] = 0 + info['comments_disabled'] = True + + # related videos + related, _ = extract_items(response) + info['related_videos'] = [renderer_info(renderer) for renderer in related] + + return info + +month_abbreviations = {'jan':'1', 'feb':'2', 'mar':'3', 'apr':'4', 'may':'5', 'jun':'6', 'jul':'7', 'aug':'8', 'sep':'9', 'oct':'10', 'nov':'11', 'dec':'12'} +def extract_watch_info_desktop(top_level): + info = { + 'comment_count': None, + 'comments_disabled': None, + 'allowed_countries': None, + } + + video_info = {} + for renderer in default_multi_get(top_level, 'response', 'contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', default=()): + if renderer and list(renderer.keys())[0] in ('videoPrimaryInfoRenderer', 'videoSecondaryInfoRenderer'): + video_info.update(list(renderer.values())[0]) + + info.update(extract_metadata_row_info(video_info)) + #info['description'] = extract_formatted_text(video_info.get('description', None)) + info['published_date'] = None + date_text = extract_plain_text(video_info.get('dateText', None)) + if date_text is not None: + date_text = util.left_remove(date_text.lower(), 'published on ').replace(',', '') + parts = date_text.split() + if len(parts) == 3: + month, day, year = date_text.split() + month = month_abbreviations.get(month[0:3]) # slicing in case they start writing out the full month name + if month and (re.fullmatch(r'\d\d?', day) is not None) and (re.fullmatch(r'\d{4}', year) is not None): + info['published_date'] = year + '-' + month + '-' + day + + likes_dislikes = default_multi_get(video_info, 'sentimentBar', 'sentimentBarRenderer', 'tooltip', default='').split('/') + if len(likes_dislikes) == 2: + info['like_count'] = extract_integer(likes_dislikes[0]) + info['dislike_count'] = extract_integer(likes_dislikes[1]) + else: + info['like_count'] = None + info['dislike_count'] = None + + #info['title'] = extract_plain_text(video_info.get('title', None)) + #info['author'] = extract_plain_text(default_multi_get(video_info, 'owner', 'videoOwnerRenderer', 'title')) + #info['author_id'] = default_multi_get(video_info, 'owner', 'videoOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId') + #info['view_count'] = extract_integer(extract_plain_text(default_multi_get(video_info, 'viewCount', 
'videoViewCountRenderer', 'viewCount'))) + + related = default_multi_get(top_level, 'response', 'contents', 'twoColumnWatchNextResults', 'secondaryResults', 'secondaryResults', 'results', default=[]) + info['related_videos'] = [renderer_info(renderer) for renderer in related] + + return info + + +def extract_watch_info(polymer_json): + info = {'playability_error': None, 'error': None} + + if isinstance(polymer_json, dict): + top_level = polymer_json + elif isinstance(polymer_json, (list, tuple)): + top_level = {} + for page_part in polymer_json: + if not isinstance(page_part, dict): + return {'error': 'Invalid page part'} + top_level.update(page_part) + else: + return {'error': 'Invalid top level polymer data'} + + error = check_missing_keys(top_level, + ['playerResponse'], + ) + if error: + return {'error': error} + + error = check_missing_keys(top_level, + ['player', 'args'], + ['player', 'assets', 'js'], + ) + if error: + info['playability_error'] = error + + + player_args = default_multi_get(top_level, 'player', 'args', default={}) + parsed_formats = [] + + if 'url_encoded_fmt_stream_map' in player_args: + string_formats = player_args['url_encoded_fmt_stream_map'].split(',') + parsed_formats += [dict(urllib.parse.parse_qsl(fmt_string)) for fmt_string in string_formats if fmt_string] + + if 'adaptive_fmts' in player_args: + string_formats = player_args['adaptive_fmts'].split(',') + parsed_formats += [dict(urllib.parse.parse_qsl(fmt_string)) for fmt_string in string_formats if fmt_string] + + info['formats'] = [] + + for parsed_fmt in parsed_formats: + # start with defaults from the big table at the top + if 'itag' in parsed_fmt: + fmt = _formats.get(parsed_fmt['itag'], {}).copy() + else: + fmt = {} + + # then override them + fmt.update(parsed_fmt) + try: + fmt['width'], fmt['height'] = map(int, fmt['size'].split('x')) + except (KeyError, ValueError, TypeError): + pass + + fmt['file_size'] = None + if 'clen' in fmt: + fmt['file_size'] = int(fmt.get('clen')) + else: + match = re.search(r'&clen=(\d+)', fmt.get('url')) + if match: + fmt['file_size'] = int(match.group(1)) + info['formats'].append(fmt) + + info['base_js'] = default_multi_get(top_level, 'player', 'assets', 'js') + if info['base_js']: + info['base_js'] = normalize_url(info['base_js']) + + mobile = 'singleColumnWatchNextResults' in default_multi_get(top_level, 'response', 'contents', default={}) + if mobile: + info.update(extract_watch_info_mobile(top_level)) + else: + info.update(extract_watch_info_desktop(top_level)) + + # stuff from videoDetails + video_details = default_multi_get(top_level, 'playerResponse', 'videoDetails', default={}) + info['title'] = extract_plain_text(video_details.get('title')) + info['duration'] = extract_integer(video_details.get('lengthSeconds')) + info['view_count'] = extract_integer(video_details.get('viewCount')) + # videos with no description have a blank string + info['description'] = video_details.get('shortDescription') + info['id'] = video_details.get('videoId') + info['author'] = video_details.get('author') + info['author_id'] = video_details.get('channelId') + info['live'] = video_details.get('isLiveContent') + info['unlisted'] = not video_details.get('isCrawlable', True) + info['tags'] = video_details.get('keywords', []) + + # other stuff + info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None + info['subtitles'] = {} # TODO + + return info -- cgit v1.2.3 From 70b56d6eef4fd9d6c46c8fbf48dfec3ae7a2937e Mon Sep 17 00:00:00 2001 From: James 
Taylor Date: Fri, 18 Oct 2019 14:02:28 -0700 Subject: Extraction: Add signature decryption --- youtube/watch.py | 149 ++++++++++++++++++++++++++++++++++++++++----- youtube/yt_data_extract.py | 128 ++++++++++++++++++-------------------- 2 files changed, 193 insertions(+), 84 deletions(-) (limited to 'youtube') diff --git a/youtube/watch.py b/youtube/watch.py index a5e0759..959dca2 100644 --- a/youtube/watch.py +++ b/youtube/watch.py @@ -11,8 +11,14 @@ import gevent import os import math import traceback +import re +import urllib - +try: + with open(os.path.join(settings.data_dir, 'decrypt_function_cache.json'), 'r') as f: + decrypt_cache = json.loads(f.read())['decrypt_cache'] +except FileNotFoundError: + decrypt_cache = {} def get_video_sources(info): @@ -22,9 +28,9 @@ def get_video_sources(info): else: max_resolution = settings.default_resolution for format in info['formats']: - if not all(attr in format for attr in ('height', 'width', 'ext', 'url')): + if not all(format[attr] for attr in ('height', 'width', 'ext', 'url')): continue - if 'acodec' in format and 'vcodec' in format and format['height'] <= max_resolution: + if format['acodec'] and format['vcodec'] and format['height'] <= max_resolution: video_sources.append({ 'src': format['url'], 'type': 'video/' + format['ext'], @@ -101,6 +107,112 @@ def get_ordered_music_list_attributes(music_list): return ordered_attributes +def save_decrypt_cache(): + try: + f = open(os.path.join(settings.data_dir, 'decrypt_function_cache.json'), 'w') + except FileNotFoundError: + os.makedirs(settings.data_dir) + f = open(os.path.join(settings.data_dir, 'decrypt_function_cache.json'), 'w') + + f.write(json.dumps({'version': 1, 'decrypt_cache':decrypt_cache}, indent=4, sort_keys=True)) + f.close() + +# adapted from youtube-dl and invidious: +# https://github.com/omarroth/invidious/blob/master/src/invidious/helpers/signatures.cr +decrypt_function_re = re.compile(r'function\(a\)\{(a=a\.split\(""\)[^\}]+)\}') +op_with_arg_re = re.compile(r'[^\.]+\.([^\(]+)\(a,(\d+)\)') +def decrypt_signatures(info): + '''return error string, or False if no errors''' + if not info['formats'] or not info['formats'][0]['s']: + return False # No decryption needed + if not info['base_js']: + return 'Failed to find base.js' + player_name = yt_data_extract.default_get(info['base_js'].split('/'), -2) + if not player_name: + return 'Could not find player name' + + if player_name in decrypt_cache: + print('Using cached decryption function for: ' + player_name) + decryption_function = decrypt_cache[player_name] + else: + base_js = util.fetch_url(info['base_js'], debug_name='base.js', report_text='Fetched player ' + player_name) + base_js = base_js.decode('utf-8') + + decrypt_function_match = decrypt_function_re.search(base_js) + if decrypt_function_match is None: + return 'Could not find decryption function in base.js' + + function_body = decrypt_function_match.group(1).split(';')[1:-1] + if not function_body: + return 'Empty decryption function body' + + var_name = yt_data_extract.default_get(function_body[0].split('.'), 0) + if var_name is None: + return 'Could not find var_name' + + var_body_match = re.search(r'var ' + re.escape(var_name) + r'=\{(.*?)\};', base_js, flags=re.DOTALL) + if var_body_match is None: + return 'Could not find var_body' + + operations = var_body_match.group(1).replace('\n', '').split('},') + if not operations: + return 'Did not find any definitions in var_body' + operations[-1] = operations[-1][:-1] # remove the trailing '}' since we split by '},' on the 
others + operation_definitions = {} + for op in operations: + colon_index = op.find(':') + opening_brace_index = op.find('{') + + if colon_index == -1 or opening_brace_index == -1: + return 'Could not parse operation' + op_name = op[:colon_index] + op_body = op[opening_brace_index+1:] + if op_body == 'a.reverse()': + operation_definitions[op_name] = 0 + elif op_body == 'a.splice(0,b)': + operation_definitions[op_name] = 1 + elif op_body.startswith('var c=a[0]'): + operation_definitions[op_name] = 2 + else: + return 'Unknown op_body: ' + op_body + + decryption_function = [] + for op_with_arg in function_body: + match = op_with_arg_re.fullmatch(op_with_arg) + if match is None: + return 'Could not parse operation with arg' + op_name = match.group(1) + if op_name not in operation_definitions: + return 'Unknown op_name: ' + op_name + op_argument = match.group(2) + decryption_function.append([operation_definitions[op_name], int(op_argument)]) + + decrypt_cache[player_name] = decryption_function + save_decrypt_cache() + + for format in info['formats']: + if not format['s'] or not format['sp'] or not format['url']: + print('Warning: s, sp, or url not in format') + continue + + a = list(format['s']) + for op, argument in decryption_function: + if op == 0: + a.reverse() + elif op == 1: + a = a[argument:] + else: + operation_2(a, argument) + + signature = ''.join(a) + format['url'] += '&' + format['sp'] + '=' + signature + return False + +def operation_2(a, b): + c = a[0] + a[0] = a[b % len(a)] + a[b % len(a)] = c + headers = ( ('Accept', '*/*'), ('Accept-Language', 'en-US,en;q=0.5'), @@ -115,26 +227,31 @@ def extract_info(video_id): except json.decoder.JSONDecodeError: traceback.print_exc() return {'error': 'Failed to parse json response'} - return yt_data_extract.extract_watch_info(polymer_json) + info = yt_data_extract.extract_watch_info(polymer_json) + error = decrypt_signatures(info) + if error: + print('Error decrypting url signatures: ' + error) + info['playability_error'] = error + return info def video_quality_string(format): - if 'vcodec' in format: - result =str(format.get('width', '?')) + 'x' + str(format.get('height', '?')) - if 'fps' in format: - result += ' ' + format['fps'] + 'fps' + if format['vcodec']: + result =str(format['width'] or '?') + 'x' + str(format['height'] or '?') + if format['fps']: + result += ' ' + str(format['fps']) + 'fps' return result - elif 'acodec' in format: + elif format['acodec']: return 'audio only' return '?' def audio_quality_string(format): - if 'acodec' in format: - result = str(format.get('abr', '?')) + 'k' - if 'audio_sample_rate' in format: + if format['acodec']: + result = str(format['audio_bitrate'] or '?') + 'k' + if format['audio_sample_rate']: result += ' ' + str(format['audio_sample_rate']) + ' Hz' return result - elif 'vcodec' in format: + elif format['vcodec']: return 'video only' return '?' @@ -193,13 +310,13 @@ def get_watch_page(): download_formats = [] for format in info['formats']: - if 'acodec' in format and 'vcodec' in format: + if format['acodec'] and format['vcodec']: codecs_string = format['acodec'] + ', ' + format['vcodec'] else: - codecs_string = format.get('acodec') or format.get('vcodec') or '?' + codecs_string = format['acodec'] or format['vcodec'] or '?' 
download_formats.append({ 'url': format['url'], - 'ext': format.get('ext', '?'), + 'ext': format['ext'] or '?', 'audio_quality': audio_quality_string(format), 'video_quality': video_quality_string(format), 'file_size': format_bytes(format['file_size']), diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py index 81604fd..6bfec59 100644 --- a/youtube/yt_data_extract.py +++ b/youtube/yt_data_extract.py @@ -39,44 +39,44 @@ import traceback # from https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/youtube.py _formats = { - '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, - '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, + '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'audio_bitrate': 64, 'vcodec': 'h263'}, + '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'audio_bitrate': 64, 'vcodec': 'h263'}, '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'}, - '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'}, - '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'}, - '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, - '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well + '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'audio_bitrate': 24, 'vcodec': 'mp4v'}, + '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'audio_bitrate': 96, 'vcodec': 'h264'}, + '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'}, + '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'}, + '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'}, + # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), audio_bitrate varies as well '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'}, - '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, - '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, - '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, - '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, - '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, - '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, - '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'}, + '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'}, + '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'audio_bitrate': 128, 'vcodec': 'vp8'}, + '44': {'ext': 'webm', 'width': 854, 'height': 
480, 'acodec': 'vorbis', 'audio_bitrate': 128, 'vcodec': 'vp8'}, + '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'audio_bitrate': 192, 'vcodec': 'vp8'}, + '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'audio_bitrate': 192, 'vcodec': 'vp8'}, + '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'}, + '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'}, # 3D videos - '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, - '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, - '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, - '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, - '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, + '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'}, + '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'}, + '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'}, + '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'}, + '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'audio_bitrate': 128, 'vcodec': 'vp8'}, + '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'audio_bitrate': 192, 'vcodec': 'vp8'}, + '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'audio_bitrate': 192, 'vcodec': 'vp8'}, # Apple HTTP Live Streaming - '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264'}, - '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264'}, - '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264'}, - '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264'}, - '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264'}, - '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264'}, + '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 48, 'vcodec': 'h264'}, + '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 48, 'vcodec': 'h264'}, + '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'}, + '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'}, + '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 
'aac', 'audio_bitrate': 256, 'vcodec': 'h264'}, + '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 256, 'vcodec': 'h264'}, + '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 48, 'vcodec': 'h264'}, + '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 24, 'vcodec': 'h264'}, # DASH mp4 video '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'}, @@ -93,9 +93,9 @@ _formats = { '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'}, # Dash mp4 audio - '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'}, - '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'}, - '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'}, + '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'audio_bitrate': 48, 'container': 'm4a_dash'}, + '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'audio_bitrate': 128, 'container': 'm4a_dash'}, + '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'audio_bitrate': 256, 'container': 'm4a_dash'}, '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'}, @@ -126,13 +126,13 @@ _formats = { '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, # Dash webm audio - '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128}, - '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256}, + '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'audio_bitrate': 128}, + '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'audio_bitrate': 256}, # Dash webm audio with opus inside - '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50}, - '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70}, - '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160}, + '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'audio_bitrate': 50}, + '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'audio_bitrate': 70}, + '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'audio_bitrate': 160}, # RTMP (unnamed) '_rtmp': {'protocol': 'rtmp'}, @@ -1042,39 +1042,32 @@ def extract_watch_info(polymer_json): player_args = default_multi_get(top_level, 'player', 'args', default={}) - parsed_formats = [] - - if 'url_encoded_fmt_stream_map' in player_args: - string_formats = player_args['url_encoded_fmt_stream_map'].split(',') - parsed_formats += [dict(urllib.parse.parse_qsl(fmt_string)) for fmt_string in string_formats if fmt_string] - - if 'adaptive_fmts' in player_args: - string_formats = player_args['adaptive_fmts'].split(',') - parsed_formats += [dict(urllib.parse.parse_qsl(fmt_string)) for fmt_string in string_formats if fmt_string] + player_response = json.loads(player_args['player_response']) if 'player_response' in player_args else {} + streaming_data = player_response.get('streamingData', {}) + yt_formats = streaming_data.get('formats', []) + 
streaming_data.get('adaptiveFormats', []) info['formats'] = [] - for parsed_fmt in parsed_formats: - # start with defaults from the big table at the top - if 'itag' in parsed_fmt: - fmt = _formats.get(parsed_fmt['itag'], {}).copy() + for yt_fmt in yt_formats: + fmt = {} + fmt['ext'] = None + fmt['audio_bitrate'] = None + fmt['acodec'] = None + fmt['vcodec'] = None + fmt['width'] = yt_fmt.get('width') + fmt['height'] = yt_fmt.get('height') + fmt['file_size'] = yt_fmt.get('contentLength') + fmt['audio_sample_rate'] = yt_fmt.get('audioSampleRate') + fmt['fps'] = yt_fmt.get('fps') + cipher = dict(urllib.parse.parse_qsl(yt_fmt.get('cipher', ''))) + if cipher: + fmt['url'] = cipher.get('url') else: - fmt = {} + fmt['url'] = yt_fmt.get('url') + fmt['s'] = cipher.get('s') + fmt['sp'] = cipher.get('sp') + fmt.update(_formats.get(str(yt_fmt.get('itag')), {})) - # then override them - fmt.update(parsed_fmt) - try: - fmt['width'], fmt['height'] = map(int, fmt['size'].split('x')) - except (KeyError, ValueError, TypeError): - pass - - fmt['file_size'] = None - if 'clen' in fmt: - fmt['file_size'] = int(fmt.get('clen')) - else: - match = re.search(r'&clen=(\d+)', fmt.get('url')) - if match: - fmt['file_size'] = int(match.group(1)) info['formats'].append(fmt) info['base_js'] = default_multi_get(top_level, 'player', 'assets', 'js') @@ -1104,5 +1097,4 @@ def extract_watch_info(polymer_json): # other stuff info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None info['subtitles'] = {} # TODO - return info -- cgit v1.2.3 From 79d9a18f815a03498e21dd5769a2e70c7ae7afa5 Mon Sep 17 00:00:00 2001 From: James Taylor Date: Fri, 22 Nov 2019 14:56:53 -0800 Subject: Extraction: return and display any errors preventing video playback --- youtube/templates/watch.html | 41 +++++++++++++++++++++++++++++------------ youtube/watch.py | 3 ++- youtube/yt_data_extract.py | 17 +++++++++-------- 3 files changed, 40 insertions(+), 21 deletions(-) (limited to 'youtube') diff --git a/youtube/templates/watch.html b/youtube/templates/watch.html index e97b638..da3b336 100644 --- a/youtube/templates/watch.html +++ b/youtube/templates/watch.html @@ -14,6 +14,19 @@ text-decoration: underline; } + .playability-error{ + height: 360px; + width: 640px; + grid-column: 2; + background-color: var(--video-background-color); + text-align:center; + } + .playability-error span{ + position: relative; + top: 50%; + transform: translate(-50%, -50%); + } + {% if theater_mode %} video{ grid-column: 1 / span 5; @@ -202,20 +215,24 @@ {% endblock style %} {% block main %} - + {% endif %}

{{ title }}

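For reference, the format-extraction rewrite above reduces each entry of playerResponse's streamingData (formats plus adaptiveFormats) to one flat dict. A minimal self-contained sketch of that pattern follows; it is an editor's illustration, not code from the patches: itag_table stands in for the _formats table, parse_yt_format is an invented name, and the sample entry is made up.

import urllib.parse

# Illustrative stand-in for a slice of the _formats itag table above
# (assumption: the real table has many more entries and fields).
itag_table = {
    '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus',
            'audio_bitrate': 160},
}

def parse_yt_format(yt_fmt):
    # base fields taken directly from the streamingData entry
    fmt = {'ext': None, 'audio_bitrate': None, 'acodec': None, 'vcodec': None}
    fmt['width'] = yt_fmt.get('width')
    fmt['height'] = yt_fmt.get('height')
    fmt['file_size'] = yt_fmt.get('contentLength')
    fmt['audio_sample_rate'] = yt_fmt.get('audioSampleRate')
    fmt['fps'] = yt_fmt.get('fps')
    # 'cipher' is a url-encoded triple: url, s (the scrambled signature)
    # and sp (the query parameter the decrypted signature is sent as)
    cipher = dict(urllib.parse.parse_qsl(yt_fmt.get('cipher', '')))
    fmt['url'] = cipher.get('url') if cipher else yt_fmt.get('url')
    fmt['s'] = cipher.get('s')
    fmt['sp'] = cipher.get('sp')
    # static per-itag defaults (ext, codecs, bitrate) are applied last
    fmt.update(itag_table.get(str(yt_fmt.get('itag')), {}))
    return fmt

sample = {'itag': 251, 'contentLength': '3400000',
          'cipher': 'url=https%3A%2F%2Fexample.invalid%2Fv&s=AbC&sp=sig'}
assert parse_yt_format(sample)['ext'] == 'webm'

Under this scheme an entry with a known itag picks up its static codec data even when YouTube omits those fields, while unknown itags still yield a usable url/width/height dict.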
diff --git a/youtube/watch.py b/youtube/watch.py index 959dca2..8a396a7 100644 --- a/youtube/watch.py +++ b/youtube/watch.py @@ -123,7 +123,7 @@ decrypt_function_re = re.compile(r'function\(a\)\{(a=a\.split\(""\)[^\}]+)\}') op_with_arg_re = re.compile(r'[^\.]+\.([^\(]+)\(a,(\d+)\)') def decrypt_signatures(info): '''return error string, or False if no errors''' - if not info['formats'] or not info['formats'][0]['s']: + if ('formats' not in info) or (not info['formats']) or (not info['formats'][0]['s']): return False # No decryption needed if not info['base_js']: return 'Failed to find base.js' @@ -356,6 +356,7 @@ def get_watch_page(): uploader = info['author'], description = info['description'], unlisted = info['unlisted'], + playability_error = info['playability_error'], ) diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py index 6bfec59..1a5f21c 100644 --- a/youtube/yt_data_extract.py +++ b/youtube/yt_data_extract.py @@ -824,7 +824,7 @@ def check_missing_keys(object, *key_sequences): _object = object try: for key in key_sequence: - _object = object[key] + _object = _object[key] except (KeyError, IndexError, TypeError): return 'Could not find ' + key @@ -1027,22 +1027,21 @@ def extract_watch_info(polymer_json): else: return {'error': 'Invalid top level polymer data'} - error = check_missing_keys(top_level, - ['playerResponse'], - ) - if error: - return {'error': error} - error = check_missing_keys(top_level, ['player', 'args'], ['player', 'assets', 'js'], + ['playerResponse'], ) if error: info['playability_error'] = error - player_args = default_multi_get(top_level, 'player', 'args', default={}) player_response = json.loads(player_args['player_response']) if 'player_response' in player_args else {} + playability_status = default_multi_get(player_response, 'playabilityStatus', 'status', default=None) + playability_reason = default_multi_get(player_response, 'playabilityStatus', 'reason', default='Unknown error') + if playability_status not in (None, 'OK'): + info['playability_error'] = playability_reason + streaming_data = player_response.get('streamingData', {}) yt_formats = streaming_data.get('formats', []) + streaming_data.get('adaptiveFormats', []) @@ -1069,6 +1068,8 @@ def extract_watch_info(polymer_json): fmt.update(_formats.get(str(yt_fmt.get('itag')), {})) info['formats'].append(fmt) + if info['formats']: + info['playability_error'] = None # in case they lie info['base_js'] = default_multi_get(top_level, 'player', 'assets', 'js') if info['base_js']: -- cgit v1.2.3 From 782a82639753aa40cfd2e23ab23c87af4bee7a73 Mon Sep 17 00:00:00 2001 From: James Taylor Date: Fri, 22 Nov 2019 16:36:33 -0800 Subject: Extraction: extract fields from visible webpage if missing from playerResposne --- youtube/yt_data_extract.py | 92 ++++++++++++++++++++++++++++++---------------- 1 file changed, 61 insertions(+), 31 deletions(-) (limited to 'youtube') diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py index 1a5f21c..a347480 100644 --- a/youtube/yt_data_extract.py +++ b/youtube/yt_data_extract.py @@ -830,7 +830,8 @@ def check_missing_keys(object, *key_sequences): return None -def extract_plain_text(node, default=None): +def extract_plain_text(node, default=None, recover_urls=False): + '''default is the value returned if the extraction fails. 
If recover_urls is true, will attempt to fix Youtube's truncation of url text (most prominently seen in descriptions)''' if isinstance(node, str): return node @@ -839,10 +840,21 @@ def extract_plain_text(node, default=None): except (KeyError, TypeError): pass - try: - return ''.join(text_run['text'] for text_run in node['runs']) - except (KeyError, TypeError): - pass + if isinstance(node, dict) and 'runs' in node: + if recover_urls: + result = '' + for run in node['runs']: + url = default_multi_get(run, 'navigationEndpoint', 'urlEndpoint', 'url') + text = run.get('text', '') + # second condition is necessary because youtube makes other things into urls, such as hashtags, which we want to keep as text + if url is not None and (text.startswith('http://') or text.startswith('https://')): + url = remove_redirect(url) + result += url # youtube truncates the url text, use actual url instead + else: + result += text + return result + else: + return ''.join(text_run.get('text', '') for text_run in node['runs']) return default @@ -878,6 +890,11 @@ def extract_integer(string): except ValueError: return None +def update_if_not_none(dictionary, key, value): + '''Update dictionary[key] with value if value is not none''' + if key not in dictionary or value is not None: + dictionary[key] = value + def extract_metadata_row_info(video_renderer_info): # extract category and music list info = { @@ -908,6 +925,17 @@ def extract_metadata_row_info(video_renderer_info): return info +def extract_date(date_text): + if date_text is None: + return None + + date_text = date_text.replace(',', '').lower() + parts = date_text.split() + if len(parts) >= 3: + month, day, year = parts[-3:] + month = month_abbreviations.get(month[0:3]) # slicing in case they start writing out the full month name + if month and (re.fullmatch(r'\d\d?', day) is not None) and (re.fullmatch(r'\d{4}', year) is not None): + return year + '-' + month + '-' + day def extract_watch_info_mobile(top_level): info = {} @@ -927,9 +955,20 @@ def extract_watch_info_mobile(top_level): video_info = {} info.update(extract_metadata_row_info(video_info)) - #info['description'] = extract_formatted_text(video_info.get('description')) + info['description'] = extract_plain_text(video_info.get('description'), recover_urls=True) + info['view_count'] = extract_integer(extract_plain_text(video_info.get('expandedSubtitle'))) + info['author'] = extract_plain_text(default_multi_get(video_info, 'owner', 'slimOwnerRenderer', 'title')) + info['author_id'] = default_multi_get(video_info, 'owner', 'slimOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId') + info['title'] = extract_plain_text(video_info.get('title')) + info['live'] = 'watching' in extract_plain_text(video_info.get('expandedSubtitle')) + info['unlisted'] = False + for badge in video_info.get('badges', []): + if default_multi_get(badge, 'metadataBadgeRenderer', 'label') == 'Unlisted': + info['unlisted'] = True info['like_count'] = None info['dislike_count'] = None + if not info['published_date']: + info['published_date'] = extract_date(extract_plain_text(video_info.get('dateText', None))) for button in video_info.get('buttons', ()): button_renderer = button.get('slimMetadataToggleButtonRenderer', {}) @@ -982,17 +1021,8 @@ def extract_watch_info_desktop(top_level): video_info.update(list(renderer.values())[0]) info.update(extract_metadata_row_info(video_info)) - #info['description'] = extract_formatted_text(video_info.get('description', None)) - info['published_date'] = None - date_text = 
extract_plain_text(video_info.get('dateText', None)) - if date_text is not None: - date_text = util.left_remove(date_text.lower(), 'published on ').replace(',', '') - parts = date_text.split() - if len(parts) == 3: - month, day, year = date_text.split() - month = month_abbreviations.get(month[0:3]) # slicing in case they start writing out the full month name - if month and (re.fullmatch(r'\d\d?', day) is not None) and (re.fullmatch(r'\d{4}', year) is not None): - info['published_date'] = year + '-' + month + '-' + day + info['description'] = extract_plain_text(video_info.get('description', None), recover_urls=True) + info['published_date'] = extract_date(extract_plain_text(video_info.get('dateText', None))) likes_dislikes = default_multi_get(video_info, 'sentimentBar', 'sentimentBarRenderer', 'tooltip', default='').split('/') if len(likes_dislikes) == 2: @@ -1002,10 +1032,10 @@ def extract_watch_info_desktop(top_level): info['like_count'] = None info['dislike_count'] = None - #info['title'] = extract_plain_text(video_info.get('title', None)) - #info['author'] = extract_plain_text(default_multi_get(video_info, 'owner', 'videoOwnerRenderer', 'title')) - #info['author_id'] = default_multi_get(video_info, 'owner', 'videoOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId') - #info['view_count'] = extract_integer(extract_plain_text(default_multi_get(video_info, 'viewCount', 'videoViewCountRenderer', 'viewCount'))) + info['title'] = extract_plain_text(video_info.get('title', None)) + info['author'] = extract_plain_text(default_multi_get(video_info, 'owner', 'videoOwnerRenderer', 'title')) + info['author_id'] = default_multi_get(video_info, 'owner', 'videoOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId') + info['view_count'] = extract_integer(extract_plain_text(default_multi_get(video_info, 'viewCount', 'videoViewCountRenderer', 'viewCount'))) related = default_multi_get(top_level, 'response', 'contents', 'twoColumnWatchNextResults', 'secondaryResults', 'secondaryResults', 'results', default=[]) info['related_videos'] = [renderer_info(renderer) for renderer in related] @@ -1083,17 +1113,17 @@ def extract_watch_info(polymer_json): # stuff from videoDetails video_details = default_multi_get(top_level, 'playerResponse', 'videoDetails', default={}) - info['title'] = extract_plain_text(video_details.get('title')) - info['duration'] = extract_integer(video_details.get('lengthSeconds')) - info['view_count'] = extract_integer(video_details.get('viewCount')) + update_if_not_none(info, 'title', extract_plain_text(video_details.get('title'))) + update_if_not_none(info, 'duration', extract_integer(video_details.get('lengthSeconds'))) + update_if_not_none(info, 'view_count', extract_integer(video_details.get('viewCount'))) # videos with no description have a blank string - info['description'] = video_details.get('shortDescription') - info['id'] = video_details.get('videoId') - info['author'] = video_details.get('author') - info['author_id'] = video_details.get('channelId') - info['live'] = video_details.get('isLiveContent') - info['unlisted'] = not video_details.get('isCrawlable', True) - info['tags'] = video_details.get('keywords', []) + update_if_not_none(info, 'description', video_details.get('shortDescription')) + update_if_not_none(info, 'id', video_details.get('videoId')) + update_if_not_none(info, 'author', video_details.get('author')) + update_if_not_none(info, 'author_id', video_details.get('channelId')) + update_if_not_none(info, 'live', 
video_details.get('isLiveContent')) + update_if_not_none(info, 'unlisted', not video_details.get('isCrawlable', True)) + update_if_not_none(info, 'tags', video_details.get('keywords', [])) # other stuff info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None -- cgit v1.2.3 From 95da24a2060fe2575a4edd10140a3426424978d4 Mon Sep 17 00:00:00 2001 From: James Taylor Date: Thu, 28 Nov 2019 22:24:17 -0800 Subject: Extraction: extract automatic captions --- youtube/yt_data_extract.py | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) (limited to 'youtube') diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py index a347480..15ab706 100644 --- a/youtube/yt_data_extract.py +++ b/youtube/yt_data_extract.py @@ -3,7 +3,7 @@ from youtube import util, proto import html import json import re -import urllib +import urllib.parse import collections from math import ceil import traceback @@ -1042,7 +1042,7 @@ def extract_watch_info_desktop(top_level): return info - +_SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt') def extract_watch_info(polymer_json): info = {'playability_error': None, 'error': None} @@ -1072,6 +1072,36 @@ def extract_watch_info(polymer_json): if playability_status not in (None, 'OK'): info['playability_error'] = playability_reason + # automatic captions + + # adapted from youtube_dl: + # https://github.com/ytdl-org/youtube-dl/blob/76e510b92c4a1c4b0001f892504ba2cbb4b8d486/youtube_dl/extractor/youtube.py#L1490-#L1523 + info['automatic_captions'] = {} + + renderer = default_multi_get(player_response, 'captions', 'playerCaptionsTracklistRenderer', default={}) + base_url = default_multi_get(renderer, 'captionTracks', 0, 'baseUrl') + + if base_url and '?' in base_url: + base_url = normalize_url(base_url) + base_url_path, base_url_query_string = base_url.split('?') + url_info = urllib.parse.parse_qs(base_url_query_string) + + for lang in renderer.get('translationLanguages', []): + lang_code = lang.get('languageCode') + if not lang_code: + continue + formats_for_this_lang = [] + for ext in _SUBTITLE_FORMATS: + url_info['tlang'] = [lang_code] + url_info['fmt'] = [ext] + url = base_url_path + '?' 
+ urllib.parse.urlencode(url_info, doseq=True) + formats_for_this_lang.append({ + 'url': url, + 'ext': ext, + }) + info['automatic_captions'][lang_code] = formats_for_this_lang + + # formats streaming_data = player_response.get('streamingData', {}) yt_formats = streaming_data.get('formats', []) + streaming_data.get('adaptiveFormats', []) -- cgit v1.2.3 From 205ad29cb0763dd263a5940cdcb3059d189bbfe7 Mon Sep 17 00:00:00 2001 From: James Taylor Date: Fri, 29 Nov 2019 18:36:27 -0800 Subject: Extraction: Add general subtitle extraction and translation --- youtube/watch.py | 132 +++++++++++++++++++++++++++++++-------------- youtube/yt_data_extract.py | 63 +++++++++++----------- 2 files changed, 126 insertions(+), 69 deletions(-) (limited to 'youtube') diff --git a/youtube/watch.py b/youtube/watch.py index 8a396a7..fa697ba 100644 --- a/youtube/watch.py +++ b/youtube/watch.py @@ -44,50 +44,104 @@ def get_video_sources(info): return video_sources +def make_caption_src(info, lang, auto=False, trans_lang=None): + label = lang + if auto: + label += ' (Automatic)' + if trans_lang: + label += ' -> ' + trans_lang + return { + 'url': '/' + yt_data_extract.get_caption_url(info, lang, 'vtt', auto, trans_lang), + 'label': label, + 'srclang': trans_lang[0:2] if trans_lang else lang[0:2], + 'on': False, + } + +def lang_in(lang, sequence): + '''Tests if the language is in sequence, with e.g. en and en-US considered the same''' + lang = lang[0:2] + return lang in (l[0:2] for l in sequence) + +def lang_eq(lang1, lang2): + '''Tests if two iso 639-1 codes are equal, with en and en-US considered the same. + Just because the codes are equal does not mean the dialects are mutually intelligible, but this will have to do for now without a complex language model''' + return lang1[0:2] == lang2[0:2] + +def equiv_lang_in(lang, sequence): + '''Extracts a language in sequence which is equivalent to lang. + e.g. if lang is en, extracts en-GB from sequence. + Necessary because if only a specific variant like en-GB is available, can't ask Youtube for simply en. 
Need to get the available variant.''' + lang = lang[0:2] + for l in sequence: + if l[0:2] == lang: + return l + return None + def get_subtitle_sources(info): + '''Returns these sources, ordered from least to most intelligible: + native_video_lang (Automatic) + foreign_langs (Manual) + native_video_lang (Automatic) -> pref_lang + foreign_langs (Manual) -> pref_lang + native_video_lang (Manual) -> pref_lang + pref_lang (Automatic) + pref_lang (Manual)''' sources = [] - default_found = False - default = None - for language, formats in info['subtitles'].items(): - for format in formats: - if format['ext'] == 'vtt': - source = { - 'url': '/' + format['url'], - 'label': language, - 'srclang': language, - - # set as on by default if this is the preferred language and a default-on subtitles mode is in settings - 'on': language == settings.subtitles_language and settings.subtitles_mode > 0, - } - - if language == settings.subtitles_language: - default_found = True - default = source - else: - sources.append(source) - break - - # Put it at the end to avoid browser bug when there are too many languages - # (in firefox, it is impossible to select a language near the top of the list because it is cut off) - if default_found: - sources.append(default) + pref_lang = settings.subtitles_language + native_video_lang = None + if info['automatic_caption_languages']: + native_video_lang = info['automatic_caption_languages'][0] - try: - formats = info['automatic_captions'][settings.subtitles_language] - except KeyError: - pass - else: - for format in formats: - if format['ext'] == 'vtt': - sources.append({ - 'url': '/' + format['url'], - 'label': settings.subtitles_language + ' - Automatic', - 'srclang': settings.subtitles_language, + highest_fidelity_is_manual = False - # set as on by default if this is the preferred language and a default-on subtitles mode is in settings - 'on': settings.subtitles_mode == 2 and not default_found, + # Sources are added in very specific order outlined above + # More intelligible sources are put further down to avoid browser bug when there are too many languages + # (in firefox, it is impossible to select a language near the top of the list because it is cut off) - }) + # native_video_lang (Automatic) + if native_video_lang and not lang_eq(native_video_lang, pref_lang): + sources.append(make_caption_src(info, native_video_lang, auto=True)) + + # foreign_langs (Manual) + for lang in info['manual_caption_languages']: + if not lang_eq(lang, pref_lang): + sources.append(make_caption_src(info, lang)) + + if (lang_in(pref_lang, info['translation_languages']) + and not lang_in(pref_lang, info['automatic_caption_languages']) + and not lang_in(pref_lang, info['manual_caption_languages'])): + # native_video_lang (Automatic) -> pref_lang + if native_video_lang and not lang_eq(pref_lang, native_video_lang): + sources.append(make_caption_src(info, native_video_lang, auto=True, trans_lang=pref_lang)) + + # foreign_langs (Manual) -> pref_lang + for lang in info['manual_caption_languages']: + if not lang_eq(lang, native_video_lang): + sources.append(make_caption_src(info, lang, trans_lang=pref_lang)) + + # native_video_lang (Manual) -> pref_lang + if lang_in(native_video_lang, info['manual_caption_languages']): + sources.append(make_caption_src(info, native_video_lang, trans_lang=pref_lang)) + + # pref_lang (Automatic) + if lang_in(pref_lang, info['automatic_caption_languages']): + sources.append(make_caption_src(info, equiv_lang_in(pref_lang, info['automatic_caption_languages']), auto=True)) + 
+ # pref_lang (Manual) + if lang_in(pref_lang, info['manual_caption_languages']): + sources.append(make_caption_src(info, equiv_lang_in(pref_lang, info['manual_caption_languages']))) + highest_fidelity_is_manual = True + + if sources and sources[-1]['srclang'] == pref_lang: + # set as on by default since it's manual a default-on subtitles mode is in settings + if highest_fidelity_is_manual and settings.subtitles_mode > 0: + sources[-1]['on'] = True + # set as on by default since settings indicate to set it as such even if it's not manual + elif settings.subtitles_mode == 2: + sources[-1]['on'] = True + + if len(sources) == 0: + assert len(info['automatic_caption_languages']) == 0 and len(info['manual_caption_languages']) == 0 return sources diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py index 15ab706..7c2b717 100644 --- a/youtube/yt_data_extract.py +++ b/youtube/yt_data_extract.py @@ -309,6 +309,8 @@ def ajax_info(item_json): youtube_url_re = re.compile(r'^(?:(?:(?:https?:)?//)?(?:www\.)?youtube\.com)?(/.*)$') def normalize_url(url): + if url is None: + return None match = youtube_url_re.fullmatch(url) if match is None: raise Exception() @@ -1042,7 +1044,18 @@ def extract_watch_info_desktop(top_level): return info -_SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt') +def get_caption_url(info, language, format, automatic=False, translation_language=None): + '''Gets the url for captions with the given language and format. If automatic is True, get the automatic captions for that language. If translation_language is given, translate the captions from `language` to `translation_language`. If automatic is true and translation_language is given, the automatic captions will be translated.''' + url = info['_captions_base_url'] + url += '&lang=' + language + url += '&fmt=' + format + if automatic: + url += '&kind=asr' + if translation_language: + url += '&tlang=' + translation_language + return url + +SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt') def extract_watch_info(polymer_json): info = {'playability_error': None, 'error': None} @@ -1072,34 +1085,25 @@ def extract_watch_info(polymer_json): if playability_status not in (None, 'OK'): info['playability_error'] = playability_reason - # automatic captions - - # adapted from youtube_dl: - # https://github.com/ytdl-org/youtube-dl/blob/76e510b92c4a1c4b0001f892504ba2cbb4b8d486/youtube_dl/extractor/youtube.py#L1490-#L1523 - info['automatic_captions'] = {} - - renderer = default_multi_get(player_response, 'captions', 'playerCaptionsTracklistRenderer', default={}) - base_url = default_multi_get(renderer, 'captionTracks', 0, 'baseUrl') - - if base_url and '?' in base_url: - base_url = normalize_url(base_url) - base_url_path, base_url_query_string = base_url.split('?') - url_info = urllib.parse.parse_qs(base_url_query_string) - - for lang in renderer.get('translationLanguages', []): - lang_code = lang.get('languageCode') - if not lang_code: - continue - formats_for_this_lang = [] - for ext in _SUBTITLE_FORMATS: - url_info['tlang'] = [lang_code] - url_info['fmt'] = [ext] - url = base_url_path + '?' 
+ urllib.parse.urlencode(url_info, doseq=True) - formats_for_this_lang.append({ - 'url': url, - 'ext': ext, - }) - info['automatic_captions'][lang_code] = formats_for_this_lang + # captions + info['automatic_caption_languages'] = [] + info['manual_caption_languages'] = [] + info['translation_languages'] = [] + captions_info = player_response.get('captions', {}) + info['_captions_base_url'] = normalize_url(default_multi_get(captions_info, 'playerCaptionsRenderer', 'baseUrl')) + for caption_track in default_multi_get(captions_info, 'playerCaptionsTracklistRenderer', 'captionTracks', default=()): + lang_code = caption_track.get('languageCode') + if lang_code: + if caption_track.get('kind') == 'asr': + info['automatic_caption_languages'].append(lang_code) + else: + info['manual_caption_languages'].append(lang_code) + for translation_lang_info in default_multi_get(captions_info, 'playerCaptionsTracklistRenderer', 'translationLanguages', default=()): + lang_code = translation_lang_info.get('languageCode') + if lang_code: + info['translation_languages'].append(lang_code) + if translation_lang_info.get('isTranslatable') == False: + print('WARNING: Found non-translatable caption language') # formats streaming_data = player_response.get('streamingData', {}) @@ -1157,5 +1161,4 @@ def extract_watch_info(polymer_json): # other stuff info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None - info['subtitles'] = {} # TODO return info -- cgit v1.2.3 From 26f37521babbb2fc4b86ad59354e8c69da1f3897 Mon Sep 17 00:00:00 2001 From: James Taylor Date: Thu, 12 Dec 2019 22:13:17 -0800 Subject: Extraction: Bypass age-restriction --- youtube/watch.py | 26 +++++++++--- youtube/yt_data_extract.py | 99 ++++++++++++++++++++++++++++++++-------------- 2 files changed, 90 insertions(+), 35 deletions(-) (limited to 'youtube') diff --git a/youtube/watch.py b/youtube/watch.py index fa697ba..4575c1e 100644 --- a/youtube/watch.py +++ b/youtube/watch.py @@ -275,17 +275,32 @@ headers = ( ) + util.mobile_ua def extract_info(video_id): - polymer_json = util.fetch_url('https://m.youtube.com/watch?v=' + video_id + '&pbj=1', headers=headers, debug_name='watch') + polymer_json = util.fetch_url('https://m.youtube.com/watch?v=' + video_id + '&pbj=1', headers=headers, debug_name='watch').decode('utf-8') + # TODO: Decide whether this should be done in yt_data_extract.extract_watch_info try: polymer_json = json.loads(polymer_json) except json.decoder.JSONDecodeError: traceback.print_exc() return {'error': 'Failed to parse json response'} info = yt_data_extract.extract_watch_info(polymer_json) - error = decrypt_signatures(info) - if error: - print('Error decrypting url signatures: ' + error) - info['playability_error'] = error + + # age restriction bypass + if info['age_restricted']: + print('Fetching age restriction bypass page') + data = { + 'video_id': video_id, + 'eurl': 'https://youtube.googleapis.com/v/' + video_id, + } + url = 'https://www.youtube.com/get_video_info?' 
+ urllib.parse.urlencode(data) + video_info_page = util.fetch_url(url, debug_name='get_video_info', report_text='Fetched age restriction bypass page').decode('utf-8') + yt_data_extract.update_with_age_restricted_info(info, video_info_page) + + # signature decryption + decryption_error = decrypt_signatures(info) + if decryption_error: + decryption_error = 'Error decrypting url signatures: ' + decryption_error + info['playability_error'] = decryption_error + return info def video_quality_string(format): @@ -410,6 +425,7 @@ def get_watch_page(): uploader = info['author'], description = info['description'], unlisted = info['unlisted'], + age_restricted = info['age_restricted'], playability_error = info['playability_error'], ) diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py index 7c2b717..8c5c63d 100644 --- a/youtube/yt_data_extract.py +++ b/youtube/yt_data_extract.py @@ -943,6 +943,11 @@ def extract_watch_info_mobile(top_level): info = {} microformat = default_multi_get(top_level, 'playerResponse', 'microformat', 'playerMicroformatRenderer', default={}) + family_safe = microformat.get('isFamilySafe') + if family_safe is None: + info['age_restricted'] = None + else: + info['age_restricted'] = not family_safe info['allowed_countries'] = microformat.get('availableCountries', []) info['published_date'] = microformat.get('publishDate') @@ -1055,6 +1060,34 @@ def get_caption_url(info, language, format, automatic=False, translation_languag url += '&tlang=' + translation_language return url +def extract_formats(info, player_response): + streaming_data = player_response.get('streamingData', {}) + yt_formats = streaming_data.get('formats', []) + streaming_data.get('adaptiveFormats', []) + + info['formats'] = [] + + for yt_fmt in yt_formats: + fmt = {} + fmt['ext'] = None + fmt['audio_bitrate'] = None + fmt['acodec'] = None + fmt['vcodec'] = None + fmt['width'] = yt_fmt.get('width') + fmt['height'] = yt_fmt.get('height') + fmt['file_size'] = yt_fmt.get('contentLength') + fmt['audio_sample_rate'] = yt_fmt.get('audioSampleRate') + fmt['fps'] = yt_fmt.get('fps') + cipher = dict(urllib.parse.parse_qsl(yt_fmt.get('cipher', ''))) + if cipher: + fmt['url'] = cipher.get('url') + else: + fmt['url'] = yt_fmt.get('url') + fmt['s'] = cipher.get('s') + fmt['sp'] = cipher.get('sp') + fmt.update(_formats.get(str(yt_fmt.get('itag')), {})) + + info['formats'].append(fmt) + SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt') def extract_watch_info(polymer_json): info = {'playability_error': None, 'error': None} @@ -1080,10 +1113,6 @@ def extract_watch_info(polymer_json): player_args = default_multi_get(top_level, 'player', 'args', default={}) player_response = json.loads(player_args['player_response']) if 'player_response' in player_args else {} - playability_status = default_multi_get(player_response, 'playabilityStatus', 'status', default=None) - playability_reason = default_multi_get(player_response, 'playabilityStatus', 'reason', default='Unknown error') - if playability_status not in (None, 'OK'): - info['playability_error'] = playability_reason # captions info['automatic_caption_languages'] = [] @@ -1106,35 +1135,19 @@ def extract_watch_info(polymer_json): print('WARNING: Found non-translatable caption language') # formats - streaming_data = player_response.get('streamingData', {}) - yt_formats = streaming_data.get('formats', []) + streaming_data.get('adaptiveFormats', []) - - info['formats'] = [] - - for yt_fmt in yt_formats: - fmt = {} - fmt['ext'] = None - fmt['audio_bitrate'] = None - 
fmt['acodec'] = None - fmt['vcodec'] = None - fmt['width'] = yt_fmt.get('width') - fmt['height'] = yt_fmt.get('height') - fmt['file_size'] = yt_fmt.get('contentLength') - fmt['audio_sample_rate'] = yt_fmt.get('audioSampleRate') - fmt['fps'] = yt_fmt.get('fps') - cipher = dict(urllib.parse.parse_qsl(yt_fmt.get('cipher', ''))) - if cipher: - fmt['url'] = cipher.get('url') + extract_formats(info, player_response) + playability_status = default_multi_get(player_response, 'playabilityStatus', 'status', default=None) + playability_reason = default_multi_get(player_response, 'playabilityStatus', 'reason', default='Could not find playability error') + if not info['formats']: + if playability_status not in (None, 'OK'): + info['playability_error'] = playability_reason else: - fmt['url'] = yt_fmt.get('url') - fmt['s'] = cipher.get('s') - fmt['sp'] = cipher.get('sp') - fmt.update(_formats.get(str(yt_fmt.get('itag')), {})) + info['playability_error'] = 'Unknown playability error' - info['formats'].append(fmt) - if info['formats']: - info['playability_error'] = None # in case they lie + # check age-restriction + info['age_restricted'] = (playability_status == 'LOGIN_REQUIRED' and playability_reason and ' age' in playability_reason) + # base_js (for decryption of signatures) info['base_js'] = default_multi_get(top_level, 'player', 'assets', 'js') if info['base_js']: info['base_js'] = normalize_url(info['base_js']) @@ -1162,3 +1175,29 @@ def extract_watch_info(polymer_json): # other stuff info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None return info + +def update_with_age_restricted_info(info, video_info_page): + ERROR_PREFIX = 'Error bypassing age-restriction: ' + + video_info = urllib.parse.parse_qs(video_info_page) + player_response = default_multi_get(video_info, 'player_response', 0) + if player_response is None: + info['playability_error'] = ERROR_PREFIX + 'Could not find player_response in video_info_page' + return + try: + player_response = json.loads(player_response) + except json.decoder.JSONDecodeError: + traceback.print_exc() + info['playability_error'] = ERROR_PREFIX + 'Failed to parse json response' + return + + extract_formats(info, player_response) + if info['formats']: + info['playability_error'] = None + else: + playability_status = default_multi_get(player_response, 'playabilityStatus', 'status', default=None) + playability_reason = default_multi_get(player_response, 'playabilityStatus', 'reason', default=ERROR_PREFIX + 'Could not find playability error') + if playability_status not in (None, 'OK'): + info['playability_error'] = ERROR_PREFIX + playability_reason + else: + info['playability_error'] = ERROR_PREFIX + 'Unknown playability error' -- cgit v1.2.3 From ecc1ce42b84362cf73096f2e5e5a9ade1d4aa524 Mon Sep 17 00:00:00 2001 From: James Taylor Date: Thu, 12 Dec 2019 22:13:37 -0800 Subject: Extraction: Display that video is age-restricted --- youtube/templates/watch.html | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'youtube') diff --git a/youtube/templates/watch.html b/youtube/templates/watch.html index da3b336..27150d4 100644 --- a/youtube/templates/watch.html +++ b/youtube/templates/watch.html @@ -74,12 +74,21 @@ grid-column: 1 / span 2; min-width: 0; } - .video-info > .is-unlisted{ - background-color: var(--interface-color); + .video-info > .labels{ justify-self:start; - padding-left:2px; - padding-right:2px; + list-style: none; + padding: 0px; + margin: 5px 0px; } + 
.video-info > .labels:empty{ + margin: 0px; + } + .labels > li{ + display: inline; + margin-right:5px; + background-color: var(--interface-color); + padding: 2px 5px + } .video-info > address{ grid-column: 1; grid-row: 3; @@ -236,9 +245,14 @@

{{ title }}

-    {% if unlisted %}
-        <span class="is-unlisted">Unlisted</span>
-    {% endif %}
+    <ul class="labels">
+        {%- if unlisted -%}
+            <li>Unlisted</li>
+        {%- endif -%}
+        {%- if age_restricted -%}
+            <li>Age-restricted</li>
+        {%- endif -%}
+    </ul>
Uploaded by {{ uploader }}
{{ views }} views -- cgit v1.2.3 From 8c16062ea823532a191f06284a6b850c5d20e810 Mon Sep 17 00:00:00 2001 From: James Taylor Date: Sat, 14 Dec 2019 14:37:03 -0800 Subject: Extraction: Fix subtitles not working on certain videos which require more parameters in the captions url --- youtube/yt_data_extract.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'youtube') diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py index 8c5c63d..42e10db 100644 --- a/youtube/yt_data_extract.py +++ b/youtube/yt_data_extract.py @@ -1056,6 +1056,9 @@ def get_caption_url(info, language, format, automatic=False, translation_languag url += '&fmt=' + format if automatic: url += '&kind=asr' + elif language in info['_manual_caption_language_names']: + url += '&name=' + urllib.parse.quote(info['_manual_caption_language_names'][language], safe='') + if translation_language: url += '&tlang=' + translation_language return url @@ -1117,16 +1120,23 @@ def extract_watch_info(polymer_json): # captions info['automatic_caption_languages'] = [] info['manual_caption_languages'] = [] + info['_manual_caption_language_names'] = {} # language name written in that language, needed in some cases to create the url info['translation_languages'] = [] captions_info = player_response.get('captions', {}) info['_captions_base_url'] = normalize_url(default_multi_get(captions_info, 'playerCaptionsRenderer', 'baseUrl')) for caption_track in default_multi_get(captions_info, 'playerCaptionsTracklistRenderer', 'captionTracks', default=()): lang_code = caption_track.get('languageCode') - if lang_code: - if caption_track.get('kind') == 'asr': - info['automatic_caption_languages'].append(lang_code) - else: - info['manual_caption_languages'].append(lang_code) + if not lang_code: + continue + if caption_track.get('kind') == 'asr': + info['automatic_caption_languages'].append(lang_code) + else: + info['manual_caption_languages'].append(lang_code) + base_url = caption_track.get('baseUrl', '') + lang_name = default_multi_get(urllib.parse.parse_qs(urllib.parse.urlparse(base_url).query), 'name', 0) + if lang_name: + info['_manual_caption_language_names'][lang_code] = lang_name + for translation_lang_info in default_multi_get(captions_info, 'playerCaptionsTracklistRenderer', 'translationLanguages', default=()): lang_code = translation_lang_info.get('languageCode') if lang_code: -- cgit v1.2.3 From a04aa63efee5813c6083dcdb3defcbcf32ce88f4 Mon Sep 17 00:00:00 2001 From: James Taylor Date: Sat, 14 Dec 2019 14:42:39 -0800 Subject: Extraction: Fix subtitles error when video has no automatic captions but has foreign language captions --- youtube/watch.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'youtube') diff --git a/youtube/watch.py b/youtube/watch.py index 4575c1e..77a4b45 100644 --- a/youtube/watch.py +++ b/youtube/watch.py @@ -59,12 +59,16 @@ def make_caption_src(info, lang, auto=False, trans_lang=None): def lang_in(lang, sequence): '''Tests if the language is in sequence, with e.g. en and en-US considered the same''' + if lang is None: + return False lang = lang[0:2] return lang in (l[0:2] for l in sequence) def lang_eq(lang1, lang2): '''Tests if two iso 639-1 codes are equal, with en and en-US considered the same. 
Just because the codes are equal does not mean the dialects are mutually intelligible, but this will have to do for now without a complex language model''' + if lang1 is None or lang2 is None: + return False return lang1[0:2] == lang2[0:2] def equiv_lang_in(lang, sequence): @@ -116,7 +120,7 @@ def get_subtitle_sources(info): # foreign_langs (Manual) -> pref_lang for lang in info['manual_caption_languages']: - if not lang_eq(lang, native_video_lang): + if not lang_eq(lang, native_video_lang) and not lang_eq(lang, pref_lang): sources.append(make_caption_src(info, lang, trans_lang=pref_lang)) # native_video_lang (Manual) -> pref_lang -- cgit v1.2.3 From e870eea05737e694206c85383f3328cfaa778e90 Mon Sep 17 00:00:00 2001 From: James Taylor Date: Sat, 14 Dec 2019 22:08:36 -0800 Subject: Extraction: Add fallback playability error extraction from renderers --- youtube/yt_data_extract.py | 41 ++++++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 17 deletions(-) (limited to 'youtube') diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py index 42e10db..ec9d683 100644 --- a/youtube/yt_data_extract.py +++ b/youtube/yt_data_extract.py @@ -1091,6 +1091,25 @@ def extract_formats(info, player_response): info['formats'].append(fmt) +def extract_playability_error(info, player_response, error_prefix=''): + if info['formats']: + info['playability_error'] = None + return + + playability_status = default_multi_get(player_response, 'playabilityStatus', 'status', default=None) + info['playability_status'] = playability_status + + playability_reason = extract_plain_text(multi_default_multi_get(player_response, + ['playabilityStatus', 'reason'], + ['playabilityStatus', 'errorScreen', 'playerErrorMessageRenderer', 'reason'], + default='Could not find playability error') + ) + + if playability_status not in (None, 'OK'): + info['playability_error'] = error_prefix + playability_reason + else: + info['playability_error'] = error_prefix + 'Unknown playability error' + SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt') def extract_watch_info(polymer_json): info = {'playability_error': None, 'error': None} @@ -1146,16 +1165,12 @@ def extract_watch_info(polymer_json): # formats extract_formats(info, player_response) - playability_status = default_multi_get(player_response, 'playabilityStatus', 'status', default=None) - playability_reason = default_multi_get(player_response, 'playabilityStatus', 'reason', default='Could not find playability error') - if not info['formats']: - if playability_status not in (None, 'OK'): - info['playability_error'] = playability_reason - else: - info['playability_error'] = 'Unknown playability error' + + # playability errors + extract_playability_error(info, player_response) # check age-restriction - info['age_restricted'] = (playability_status == 'LOGIN_REQUIRED' and playability_reason and ' age' in playability_reason) + info['age_restricted'] = (info['playability_status'] == 'LOGIN_REQUIRED' and info['playability_reason'] and ' age' in info['playability_reason']) # base_js (for decryption of signatures) info['base_js'] = default_multi_get(top_level, 'player', 'assets', 'js') @@ -1202,12 +1217,4 @@ def update_with_age_restricted_info(info, video_info_page): return extract_formats(info, player_response) - if info['formats']: - info['playability_error'] = None - else: - playability_status = default_multi_get(player_response, 'playabilityStatus', 'status', default=None) - playability_reason = default_multi_get(player_response, 'playabilityStatus', 
'reason', default=ERROR_PREFIX + 'Could not find playability error') - if playability_status not in (None, 'OK'): - info['playability_error'] = ERROR_PREFIX + playability_reason - else: - info['playability_error'] = ERROR_PREFIX + 'Unknown playability error' + extract_playability_error(info, player_response, error_prefix=ERROR_PREFIX) -- cgit v1.2.3 From 40de1b74ed9d485cac21f3051b581b3b1ff5244a Mon Sep 17 00:00:00 2001 From: James Taylor Date: Tue, 17 Dec 2019 16:02:23 -0800 Subject: Extraction: Extract info from microformat to get views for limited state videos, and as a fallback. Shorten some function names --- youtube/yt_data_extract.py | 99 ++++++++++++++++++++++++++++------------------ 1 file changed, 60 insertions(+), 39 deletions(-) (limited to 'youtube') diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py index ec9d683..96021f1 100644 --- a/youtube/yt_data_extract.py +++ b/youtube/yt_data_extract.py @@ -832,7 +832,7 @@ def check_missing_keys(object, *key_sequences): return None -def extract_plain_text(node, default=None, recover_urls=False): +def extract_str(node, default=None, recover_urls=False): '''default is the value returned if the extraction fails. If recover_urls is true, will attempt to fix Youtube's truncation of url text (most prominently seen in descriptions)''' if isinstance(node, str): return node @@ -881,7 +881,7 @@ def extract_formatted_text(node): return [] -def extract_integer(string): +def extract_int(string): if not isinstance(string, str): return None match = re.search(r'(\d+)', string.replace(',', '')) @@ -892,11 +892,6 @@ def extract_integer(string): except ValueError: return None -def update_if_not_none(dictionary, key, value): - '''Update dictionary[key] with value if value is not none''' - if key not in dictionary or value is not None: - dictionary[key] = value - def extract_metadata_row_info(video_renderer_info): # extract category and music list info = { @@ -906,8 +901,8 @@ def extract_metadata_row_info(video_renderer_info): current_song = {} for row in default_multi_get(video_renderer_info, 'metadataRowContainer', 'metadataRowContainerRenderer', 'rows', default=[]): - row_title = extract_plain_text(default_multi_get(row, 'metadataRowRenderer', 'title'), default='') - row_content = extract_plain_text(default_multi_get(row, 'metadataRowRenderer', 'contents', 0)) + row_title = extract_str(default_multi_get(row, 'metadataRowRenderer', 'title'), default='') + row_content = extract_str(default_multi_get(row, 'metadataRowRenderer', 'contents', 0)) if row_title == 'Category': info['category'] = row_content elif row_title in ('Song', 'Music'): @@ -962,12 +957,12 @@ def extract_watch_info_mobile(top_level): video_info = {} info.update(extract_metadata_row_info(video_info)) - info['description'] = extract_plain_text(video_info.get('description'), recover_urls=True) - info['view_count'] = extract_integer(extract_plain_text(video_info.get('expandedSubtitle'))) - info['author'] = extract_plain_text(default_multi_get(video_info, 'owner', 'slimOwnerRenderer', 'title')) + info['description'] = extract_str(video_info.get('description'), recover_urls=True) + info['view_count'] = extract_int(extract_str(video_info.get('expandedSubtitle'))) + info['author'] = extract_str(default_multi_get(video_info, 'owner', 'slimOwnerRenderer', 'title')) info['author_id'] = default_multi_get(video_info, 'owner', 'slimOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId') - info['title'] = extract_plain_text(video_info.get('title')) - info['live'] = 'watching' 
in extract_plain_text(video_info.get('expandedSubtitle')) + info['title'] = extract_str(video_info.get('title')) + info['live'] = 'watching' in extract_str(video_info.get('expandedSubtitle')) info['unlisted'] = False for badge in video_info.get('badges', []): if default_multi_get(badge, 'metadataBadgeRenderer', 'label') == 'Unlisted': @@ -975,15 +970,15 @@ def extract_watch_info_mobile(top_level): info['like_count'] = None info['dislike_count'] = None if not info['published_date']: - info['published_date'] = extract_date(extract_plain_text(video_info.get('dateText', None))) + info['published_date'] = extract_date(extract_str(video_info.get('dateText', None))) for button in video_info.get('buttons', ()): button_renderer = button.get('slimMetadataToggleButtonRenderer', {}) # all the digits can be found in the accessibility data - count = extract_integer(default_multi_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText', 'accessibility', 'accessibilityData', 'label')) + count = extract_int(default_multi_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText', 'accessibility', 'accessibilityData', 'label')) # this count doesn't have all the digits, it's like 53K for instance - dumb_count = extract_integer(extract_plain_text(default_multi_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText'))) + dumb_count = extract_int(extract_str(default_multi_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText'))) # the accessibility text will be "No likes" or "No dislikes" or something like that, but dumb count will be 0 if dumb_count == 0: @@ -998,11 +993,11 @@ def extract_watch_info_mobile(top_level): items, _ = extract_items(response, item_types={'commentSectionRenderer'}) if items: comment_info = items[0]['commentSectionRenderer'] - comment_count_text = extract_plain_text(default_multi_get(comment_info, 'header', 'commentSectionHeaderRenderer', 'countText')) + comment_count_text = extract_str(default_multi_get(comment_info, 'header', 'commentSectionHeaderRenderer', 'countText')) if comment_count_text == 'Comments': # just this with no number, means 0 comments info['comment_count'] = 0 else: - info['comment_count'] = extract_integer(comment_count_text) + info['comment_count'] = extract_int(comment_count_text) info['comments_disabled'] = False else: # no comment section present means comments are disabled info['comment_count'] = 0 @@ -1028,21 +1023,21 @@ def extract_watch_info_desktop(top_level): video_info.update(list(renderer.values())[0]) info.update(extract_metadata_row_info(video_info)) - info['description'] = extract_plain_text(video_info.get('description', None), recover_urls=True) - info['published_date'] = extract_date(extract_plain_text(video_info.get('dateText', None))) + info['description'] = extract_str(video_info.get('description', None), recover_urls=True) + info['published_date'] = extract_date(extract_str(video_info.get('dateText', None))) likes_dislikes = default_multi_get(video_info, 'sentimentBar', 'sentimentBarRenderer', 'tooltip', default='').split('/') if len(likes_dislikes) == 2: - info['like_count'] = extract_integer(likes_dislikes[0]) - info['dislike_count'] = extract_integer(likes_dislikes[1]) + info['like_count'] = extract_int(likes_dislikes[0]) + info['dislike_count'] = extract_int(likes_dislikes[1]) else: info['like_count'] = None info['dislike_count'] = None - info['title'] = extract_plain_text(video_info.get('title', None)) - info['author'] = extract_plain_text(default_multi_get(video_info, 'owner', 
'videoOwnerRenderer', 'title')) + info['title'] = extract_str(video_info.get('title', None)) + info['author'] = extract_str(default_multi_get(video_info, 'owner', 'videoOwnerRenderer', 'title')) info['author_id'] = default_multi_get(video_info, 'owner', 'videoOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId') - info['view_count'] = extract_integer(extract_plain_text(default_multi_get(video_info, 'viewCount', 'videoViewCountRenderer', 'viewCount'))) + info['view_count'] = extract_int(extract_str(default_multi_get(video_info, 'viewCount', 'videoViewCountRenderer', 'viewCount'))) related = default_multi_get(top_level, 'response', 'contents', 'twoColumnWatchNextResults', 'secondaryResults', 'secondaryResults', 'results', default=[]) info['related_videos'] = [renderer_info(renderer) for renderer in related] @@ -1093,13 +1088,14 @@ def extract_formats(info, player_response): def extract_playability_error(info, player_response, error_prefix=''): if info['formats']: + info['playability_status'] = None info['playability_error'] = None return playability_status = default_multi_get(player_response, 'playabilityStatus', 'status', default=None) info['playability_status'] = playability_status - playability_reason = extract_plain_text(multi_default_multi_get(player_response, + playability_reason = extract_str(multi_default_multi_get(player_response, ['playabilityStatus', 'reason'], ['playabilityStatus', 'errorScreen', 'playerErrorMessageRenderer', 'reason'], default='Could not find playability error') @@ -1110,6 +1106,17 @@ def extract_playability_error(info, player_response, error_prefix=''): else: info['playability_error'] = error_prefix + 'Unknown playability error' +def liberal_update(obj, key, value): + '''Updates obj[key] with value as long as value is not None. + Ensures obj[key] will at least get a value of None, however''' + if (value is not None) or (key not in obj): + obj[key] = value + +def conservative_update(obj, key, value): + '''Only updates obj if it doesn't have key or obj[key] is None''' + if obj.get(key) is None: + obj[key] = value + SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt') def extract_watch_info(polymer_json): info = {'playability_error': None, 'error': None} @@ -1183,19 +1190,33 @@ def extract_watch_info(polymer_json): else: info.update(extract_watch_info_desktop(top_level)) - # stuff from videoDetails - video_details = default_multi_get(top_level, 'playerResponse', 'videoDetails', default={}) - update_if_not_none(info, 'title', extract_plain_text(video_details.get('title'))) - update_if_not_none(info, 'duration', extract_integer(video_details.get('lengthSeconds'))) - update_if_not_none(info, 'view_count', extract_integer(video_details.get('viewCount'))) + # stuff from videoDetails. 
Use liberal_update to prioritize info from videoDetails over existing info + vd = default_multi_get(top_level, 'playerResponse', 'videoDetails', default={}) + liberal_update(info, 'title', extract_str(vd.get('title'))) + liberal_update(info, 'duration', extract_int(vd.get('lengthSeconds'))) + liberal_update(info, 'view_count', extract_int(vd.get('viewCount'))) # videos with no description have a blank string - update_if_not_none(info, 'description', video_details.get('shortDescription')) - update_if_not_none(info, 'id', video_details.get('videoId')) - update_if_not_none(info, 'author', video_details.get('author')) - update_if_not_none(info, 'author_id', video_details.get('channelId')) - update_if_not_none(info, 'live', video_details.get('isLiveContent')) - update_if_not_none(info, 'unlisted', not video_details.get('isCrawlable', True)) - update_if_not_none(info, 'tags', video_details.get('keywords', [])) + liberal_update(info, 'description', vd.get('shortDescription')) + liberal_update(info, 'id', vd.get('videoId')) + liberal_update(info, 'author', vd.get('author')) + liberal_update(info, 'author_id', vd.get('channelId')) + liberal_update(info, 'live', vd.get('isLiveContent')) + liberal_update(info, 'unlisted', not vd.get('isCrawlable', True)) + liberal_update(info, 'tags', vd.get('keywords', [])) + + # fallback stuff from microformat + mf = default_multi_get(top_level, 'playerResponse', 'microformat', 'playerMicroformatRenderer', default={}) + conservative_update(info, 'title', extract_str(mf.get('title'))) + conservative_update(info, 'duration', extract_int(mf.get('lengthSeconds'))) + # this gives the view count for limited state videos + conservative_update(info, 'view_count', extract_int(mf.get('viewCount'))) + conservative_update(info, 'description', extract_str(mf.get('description'), recover_urls=True)) + conservative_update(info, 'author', mf.get('ownerChannelName')) + conservative_update(info, 'author_id', mf.get('externalChannelId')) + conservative_update(info, 'unlisted', mf.get('isUnlisted')) + liberal_update(info, 'category', mf.get('category')) + liberal_update(info, 'published_date', mf.get('publishDate')) + liberal_update(info, 'uploaded_date', mf.get('uploadDate')) # other stuff info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None -- cgit v1.2.3 From 81c7ecf161b528ba293678e0bdbf42952cc87386 Mon Sep 17 00:00:00 2001 From: James Taylor Date: Tue, 17 Dec 2019 20:39:20 -0800 Subject: Extraction: Make limited state videos work --- youtube/watch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube') diff --git a/youtube/watch.py b/youtube/watch.py index 77a4b45..092885d 100644 --- a/youtube/watch.py +++ b/youtube/watch.py @@ -279,7 +279,7 @@ headers = ( ) + util.mobile_ua def extract_info(video_id): - polymer_json = util.fetch_url('https://m.youtube.com/watch?v=' + video_id + '&pbj=1', headers=headers, debug_name='watch').decode('utf-8') + polymer_json = util.fetch_url('https://m.youtube.com/watch?v=' + video_id + '&pbj=1&bpctr=9999999999', headers=headers, debug_name='watch').decode('utf-8') # TODO: Decide whether this should be done in yt_data_extract.extract_watch_info try: polymer_json = json.loads(polymer_json) -- cgit v1.2.3 From 45a4ab5acedd2fd7531604d3e817e0742a036c4a Mon Sep 17 00:00:00 2001 From: James Taylor Date: Tue, 17 Dec 2019 20:58:15 -0800 Subject: Extraction: Detect limited state and fix false detection as unlisted --- youtube/templates/watch.html | 3 +++ youtube/watch.py | 1 + 
youtube/yt_data_extract.py | 12 ++++++++++-- 3 files changed, 14 insertions(+), 2 deletions(-) (limited to 'youtube') diff --git a/youtube/templates/watch.html b/youtube/templates/watch.html index 27150d4..eaa3786 100644 --- a/youtube/templates/watch.html +++ b/youtube/templates/watch.html @@ -252,6 +252,9 @@ {%- if age_restricted -%}
            <li>Age-restricted</li>
        {%- endif -%}
+        {%- if limited_state -%}
+            <li>Limited state</li>
+        {%- endif -%}
    Uploaded by {{ uploader }}
    {{ views }} views diff --git a/youtube/watch.py b/youtube/watch.py index 092885d..fca794e 100644 --- a/youtube/watch.py +++ b/youtube/watch.py @@ -429,6 +429,7 @@ def get_watch_page(): uploader = info['author'], description = info['description'], unlisted = info['unlisted'], + limited_state = info['limited_state'], age_restricted = info['age_restricted'], playability_error = info['playability_error'], ) diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py index 96021f1..c7a6604 100644 --- a/youtube/yt_data_extract.py +++ b/youtube/yt_data_extract.py @@ -1003,6 +1003,13 @@ def extract_watch_info_mobile(top_level): info['comment_count'] = 0 info['comments_disabled'] = True + # check for limited state + items, _ = extract_items(response, item_types={'limitedStateMessageRenderer'}) + if items: + info['limited_state'] = True + else: + info['limited_state'] = False + # related videos related, _ = extract_items(response) info['related_videos'] = [renderer_info(renderer) for renderer in related] @@ -1015,6 +1022,7 @@ def extract_watch_info_desktop(top_level): 'comment_count': None, 'comments_disabled': None, 'allowed_countries': None, + 'limited_state': None, } video_info = {} @@ -1201,7 +1209,7 @@ def extract_watch_info(polymer_json): liberal_update(info, 'author', vd.get('author')) liberal_update(info, 'author_id', vd.get('channelId')) liberal_update(info, 'live', vd.get('isLiveContent')) - liberal_update(info, 'unlisted', not vd.get('isCrawlable', True)) + conservative_update(info, 'unlisted', not vd.get('isCrawlable', True)) #isCrawlable is false on limited state videos even if they aren't unlisted liberal_update(info, 'tags', vd.get('keywords', [])) # fallback stuff from microformat @@ -1213,7 +1221,7 @@ def extract_watch_info(polymer_json): conservative_update(info, 'description', extract_str(mf.get('description'), recover_urls=True)) conservative_update(info, 'author', mf.get('ownerChannelName')) conservative_update(info, 'author_id', mf.get('externalChannelId')) - conservative_update(info, 'unlisted', mf.get('isUnlisted')) + liberal_update(info, 'unlisted', mf.get('isUnlisted')) liberal_update(info, 'category', mf.get('category')) liberal_update(info, 'published_date', mf.get('publishDate')) liberal_update(info, 'uploaded_date', mf.get('uploadDate')) -- cgit v1.2.3 From e98a1965d2cfca725d8d30a4807e816036b60c0b Mon Sep 17 00:00:00 2001 From: James Taylor Date: Tue, 17 Dec 2019 21:06:06 -0800 Subject: Extraction: Fix mistake with age-restriction detection --- youtube/yt_data_extract.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube') diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py index c7a6604..653a79f 100644 --- a/youtube/yt_data_extract.py +++ b/youtube/yt_data_extract.py @@ -1185,7 +1185,7 @@ def extract_watch_info(polymer_json): extract_playability_error(info, player_response) # check age-restriction - info['age_restricted'] = (info['playability_status'] == 'LOGIN_REQUIRED' and info['playability_reason'] and ' age' in info['playability_reason']) + info['age_restricted'] = (info['playability_status'] == 'LOGIN_REQUIRED' and info['playability_error'] and ' age' in info['playability_error']) # base_js (for decryption of signatures) info['base_js'] = default_multi_get(top_level, 'player', 'assets', 'js') -- cgit v1.2.3 From ee0a118a6c7ed0e371fed18dcdace1f18a3cabf6 Mon Sep 17 00:00:00 2001 From: James Taylor Date: Tue, 17 Dec 2019 21:52:31 -0800 Subject: Extraction: Fix thumbnail and remove badges on related videos --- 
youtube/templates/common_elements.html | 6 ++++-- youtube/templates/watch.html | 8 ++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'youtube') diff --git a/youtube/templates/common_elements.html b/youtube/templates/common_elements.html index 67655b3..1a417ae 100644 --- a/youtube/templates/common_elements.html +++ b/youtube/templates/common_elements.html @@ -14,7 +14,7 @@ {%- endif -%} {% endmacro %} -{% macro item(info, description=false, horizontal=true, include_author=true) %} +{% macro item(info, description=false, horizontal=true, include_author=true, include_badges=true) %}
    {% if info['type'] == 'video' %} diff --git a/youtube/templates/watch.html b/youtube/templates/watch.html index eaa3786..0ffa358 100644 --- a/youtube/templates/watch.html +++ b/youtube/templates/watch.html @@ -165,9 +165,13 @@ .related-videos-inner{ padding-top: 10px; display: grid; - grid-auto-rows: 94px; + grid-auto-rows: 90px; grid-row-gap: 10px; } + .thumbnail-box{ /* overides rule in shared.css */ + height: 90px !important; + width: 120px !important; + } /* Put related vids below videos when window is too small */ /* 1100px instead of 1080 because W3C is full of idiots who include scrollbar width */ @@ -311,7 +315,7 @@ Related Videos -- cgit v1.2.3 From 98777ee82561ae205f156a7f8497728aecfa080c Mon Sep 17 00:00:00 2001 From: James Taylor Date: Wed, 18 Dec 2019 19:39:16 -0800 Subject: Extraction: Rewrite item_extraction for better error handling and readability, rename extracted names for more consistency --- youtube/__init__.py | 7 + youtube/comments.py | 22 +- youtube/playlist.py | 12 +- youtube/search.py | 6 +- youtube/subscriptions.py | 8 +- youtube/templates/comments.html | 4 +- youtube/templates/common_elements.html | 78 +++--- youtube/templates/playlist.html | 5 +- youtube/templates/watch.html | 6 +- youtube/util.py | 2 + youtube/watch.py | 8 +- youtube/yt_data_extract.py | 487 +++++++++++++++------------------ 12 files changed, 305 insertions(+), 340 deletions(-) (limited to 'youtube') diff --git a/youtube/__init__.py b/youtube/__init__.py index 0137e86..534b9f8 100644 --- a/youtube/__init__.py +++ b/youtube/__init__.py @@ -23,3 +23,10 @@ def inject_theme_preference(): 'theme_path': '/youtube.com/static/' + theme_names[settings.theme] + '.css', } +@yt_app.template_filter('commatize') +def commatize(num): + if num is None: + return '' + if isinstance(num, str): + num = int(num) + return '{:,}'.format(num) diff --git a/youtube/comments.py b/youtube/comments.py index 250a95f..e237f0f 100644 --- a/youtube/comments.py +++ b/youtube/comments.py @@ -91,33 +91,33 @@ def post_process_comments_info(comments_info): comment['author_url'] = util.URL_ORIGIN + comment['author_url'] comment['author_avatar'] = '/' + comment['author_avatar'] - comment['permalink'] = util.URL_ORIGIN + '/watch?v=' + comments_info['video_id'] + '&lc=' + comment['comment_id'] + comment['permalink'] = util.URL_ORIGIN + '/watch?v=' + comments_info['video_id'] + '&lc=' + comment['id'] if comment['author_channel_id'] in accounts.accounts: comment['delete_url'] = (util.URL_ORIGIN + '/delete_comment?video_id=' + comments_info['video_id'] + '&channel_id='+ comment['author_channel_id'] + '&author_id=' + comment['author_id'] - + '&comment_id=' + comment['comment_id']) + + '&comment_id=' + comment['id']) - num_replies = comment['number_of_replies'] - if num_replies == 0: - comment['replies_url'] = util.URL_ORIGIN + '/post_comment?parent_id=' + comment['comment_id'] + "&video_id=" + comments_info['video_id'] + reply_count = comment['reply_count'] + if reply_count == 0: + comment['replies_url'] = util.URL_ORIGIN + '/post_comment?parent_id=' + comment['id'] + "&video_id=" + comments_info['video_id'] else: - comment['replies_url'] = util.URL_ORIGIN + '/comments?parent_id=' + comment['comment_id'] + "&video_id=" + comments_info['video_id'] + comment['replies_url'] = util.URL_ORIGIN + '/comments?parent_id=' + comment['id'] + "&video_id=" + comments_info['video_id'] - if num_replies == 0: + if reply_count == 0: comment['view_replies_text'] = 'Reply' - elif num_replies == 1: + elif reply_count == 1: comment['view_replies_text'] = 
'1 reply' else: - comment['view_replies_text'] = str(num_replies) + ' replies' + comment['view_replies_text'] = str(reply_count) + ' replies' - if comment['likes'] == 1: + if comment['like_count'] == 1: comment['likes_text'] = '1 like' else: - comment['likes_text'] = str(comment['likes']) + ' likes' + comment['likes_text'] = str(comment['like_count']) + ' likes' comments_info['include_avatars'] = settings.enable_comment_avatars if comments_info['ctoken']: diff --git a/youtube/playlist.py b/youtube/playlist.py index bc2c417..ced0644 100644 --- a/youtube/playlist.py +++ b/youtube/playlist.py @@ -98,13 +98,19 @@ def get_playlist_page(): info['metadata'] = yt_data_extract.extract_playlist_metadata(first_page_json) yt_data_extract.prefix_urls(info['metadata']) - for item in info['items']: + for item in info.get('items', ()): yt_data_extract.prefix_urls(item) yt_data_extract.add_extra_html_info(item) + if 'id' in item: + item['thumbnail'] = '/https://i.ytimg.com/vi/' + item['id'] + '/default.jpg' + + video_count = yt_data_extract.default_multi_get(info, 'metadata', 'video_count') + if video_count is None: + video_count = 40 return flask.render_template('playlist.html', - video_list = info['items'], - num_pages = math.ceil(info['metadata']['size']/20), + video_list = info.get('items', []), + num_pages = math.ceil(video_count/20), parameters_dictionary = request.args, **info['metadata'] diff --git a/youtube/search.py b/youtube/search.py index cb66744..a881557 100644 --- a/youtube/search.py +++ b/youtube/search.py @@ -79,9 +79,9 @@ def get_search_page(): if search_info['error']: return flask.render_template('error.html', error_message = search_info['error']) - for item_info in search_info['items']: - yt_data_extract.prefix_urls(item_info) - yt_data_extract.add_extra_html_info(item_info) + for extract_item_info in search_info['items']: + yt_data_extract.prefix_urls(extract_item_info) + yt_data_extract.add_extra_html_info(extract_item_info) corrections = search_info['corrections'] if corrections['type'] == 'did_you_mean': diff --git a/youtube/subscriptions.py b/youtube/subscriptions.py index e0c71f5..9709467 100644 --- a/youtube/subscriptions.py +++ b/youtube/subscriptions.py @@ -172,7 +172,7 @@ def _get_videos(cursor, number_per_page, offset, tag = None): 'id': db_video[0], 'title': db_video[1], 'duration': db_video[2], - 'published': exact_timestamp(db_video[3]) if db_video[4] else posix_to_dumbed_down(db_video[3]), + 'time_published': exact_timestamp(db_video[3]) if db_video[4] else posix_to_dumbed_down(db_video[3]), 'author': db_video[5], }) @@ -462,8 +462,10 @@ def _get_upstream_videos(channel_id): videos = channel_info['items'] for i, video_item in enumerate(videos): - if 'description' not in video_item: + if not video_item.get('description'): video_item['description'] = '' + else: + video_item['description'] = ''.join(run.get('text', '') for run in video_item['description']) if video_item['id'] in times_published: video_item['time_published'] = times_published[video_item['id']] @@ -471,7 +473,7 @@ def _get_upstream_videos(channel_id): else: video_item['is_time_published_exact'] = False try: - video_item['time_published'] = youtube_timestamp_to_posix(video_item['published']) - i # subtract a few seconds off the videos so they will be in the right order + video_item['time_published'] = youtube_timestamp_to_posix(video_item['time_published']) - i # subtract a few seconds off the videos so they will be in the right order except KeyError: print(video_item) diff --git 
a/youtube/templates/comments.html b/youtube/templates/comments.html index 20cde4e..396852a 100644 --- a/youtube/templates/comments.html +++ b/youtube/templates/comments.html @@ -12,11 +12,11 @@ {{ comment['author'] }} {{ common_elements.text_runs(comment['text']) }} - +
    {{ comment['view_replies_text'] }} {% if 'delete_url' is in comment %} diff --git a/youtube/templates/common_elements.html b/youtube/templates/common_elements.html index 1a417ae..4c776b6 100644 --- a/youtube/templates/common_elements.html +++ b/youtube/templates/common_elements.html @@ -9,55 +9,59 @@ {{ text_run["text"] }} {%- endif -%} {%- endfor -%} - {%- else -%} + {%- elif runs -%} {{ runs }} {%- endif -%} {% endmacro %} {% macro item(info, description=false, horizontal=true, include_author=true, include_badges=true) %}
    -
    - - - {% if info['type'] != 'channel' %} -
    - {{ info['size'] if info['type'] == 'playlist' else info['duration'] }} -
    - {% endif %} -
    + {% if info['error'] %} + {{ info['error'] }} + {% else %} +
    + + + {% if info['type'] != 'channel' %} +
    + {{ (info['video_count']|string + ' videos') if info['type'] == 'playlist' else info['duration'] }} +
    + {% endif %} +
    - + -
      - {% if info['type'] == 'channel' %} -
    • {{ info['subscriber_count'] }} subscribers
    • -
    • {{ info['size'] }} videos
    • - {% else %} - {% if include_author %} - {% if 'author_url' is in(info) %} -
    • By {{ info['author'] }}
    • - {% else %} -
    • {{ info['author'] }}
    • +
        + {% if info['type'] == 'channel' %} +
      • {{ info['approx_subscriber_count'] }} subscribers
      • +
      • {{ info['video_count'] }} videos
      • + {% else %} + {% if include_author %} + {% if info.get('author_url') %} +
      • By {{ info['author'] }}
      • + {% else %} +
      • {{ info['author'] }}
      • + {% endif %} + {% endif %} + {% if info.get('approx_view_count') %} +
      • {{ info['approx_view_count'] }} views
      • + {% endif %} + {% if info.get('time_published') %} +
      • {% endif %} {% endif %} - {% if 'views' is in(info) %} -
      • {{ info['views'] }}
      • - {% endif %} - {% if 'published' is in(info) %} -
      • - {% endif %} - {% endif %} -
      +
    - {% if description %} - {{ text_runs(info.get('description', '')) }} - {% endif %} - {% if include_badges %} - {{ info['badges']|join(' | ') }} + {% if description %} + {{ text_runs(info.get('description', '')) }} + {% endif %} + {% if include_badges %} + {{ info['badges']|join(' | ') }} + {% endif %} +
    + {% if info['type'] == 'video' %} + {% endif %} -
    - {% if info['type'] == 'video' %} - {% endif %}
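For reference, the rewritten item macro above consumes the normalized dict that the new extract_item_info (introduced in yt_data_extract.py later in this same patch) builds from a raw renderer. A minimal sketch of that flow, assuming the youtube package from this repository is importable; the renderer below is an invented, pared-down videoRenderer, not a captured YouTube response:

    from youtube import yt_data_extract

    # Hypothetical, minimal videoRenderer in the shape YouTube returns
    renderer = {'videoRenderer': {
        'videoId': 'dQw4w9WgXcQ',
        'title': {'runs': [{'text': 'Example video'}]},
        'ownerText': {'runs': [{
            'text': 'Example channel',
            'navigationEndpoint': {'browseEndpoint': {'browseId': 'UCxxxxxxxxxxxxxxxxxxxxxx'}},
        }]},
        'viewCountText': {'simpleText': '1,234 views'},
        'lengthText': {'simpleText': '4:13'},
    }}

    info = yt_data_extract.extract_item_info(renderer)
    # The macro then reads the normalized keys:
    # info['error'] is None, info['type'] == 'video',
    # info['title'] == 'Example video', info['author'] == 'Example channel',
    # info['approx_view_count'] == '1,234', info['duration'] == '4:13'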
    diff --git a/youtube/templates/playlist.html b/youtube/templates/playlist.html index 52c468e..ebd152b 100644 --- a/youtube/templates/playlist.html +++ b/youtube/templates/playlist.html @@ -54,8 +54,9 @@

    {{ title }}

    {{ author }}
    -
    {{ views }}
    -
    {{ size }} videos
    +
    {{ video_count|commatize }} videos
    +
    {{ view_count|commatize }} views
    +
    Last updated {{ time_published }}
    {{ common_elements.text_runs(description) }}
    diff --git a/youtube/templates/watch.html b/youtube/templates/watch.html index 0ffa358..5bd2a25 100644 --- a/youtube/templates/watch.html +++ b/youtube/templates/watch.html @@ -261,11 +261,11 @@ {%- endif -%}
    Uploaded by {{ uploader }}
    - {{ views }} views + {{ view_count }} views - - + +
    Download
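The playlist header above leans on the commatize filter registered in youtube/__init__.py at the start of this commit, so templates can write {{ video_count|commatize }} without guarding against missing values. Its behavior, sketched standalone (outside Flask) with made-up inputs:

    def commatize(num):
        # same logic as the template filter in youtube/__init__.py
        if num is None:
            return ''
        if isinstance(num, str):
            num = int(num)
        return '{:,}'.format(num)

    assert commatize(1234567) == '1,234,567'
    assert commatize('1234567') == '1,234,567'
    assert commatize(None) == ''  # missing counts render as empty, not as 0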
      diff --git a/youtube/util.py b/youtube/util.py index 474e7b5..9023b98 100644 --- a/youtube/util.py +++ b/youtube/util.py @@ -310,6 +310,8 @@ def uppercase_escape(s): lambda m: chr(int(m.group(1), base=16)), s) def prefix_url(url): + if url is None: + return None url = url.lstrip('/') # some urls have // before them, which has a special meaning return '/' + url diff --git a/youtube/watch.py b/youtube/watch.py index fca794e..2118319 100644 --- a/youtube/watch.py +++ b/youtube/watch.py @@ -405,10 +405,10 @@ def get_watch_page(): return flask.render_template('watch.html', header_playlist_names = local_playlist.get_playlist_names(), uploader_channel_url = ('/' + info['author_url']) if info['author_url'] else '', - upload_date = info['published_date'], - views = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("view_count", None)), - likes = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("like_count", None)), - dislikes = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("dislike_count", None)), + time_published = info['time_published'], + view_count = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("view_count", None)), + like_count = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("like_count", None)), + dislike_count = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("dislike_count", None)), download_formats = download_formats, video_info = json.dumps(video_info), video_sources = video_sources, diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py index 653a79f..ea67383 100644 --- a/youtube/yt_data_extract.py +++ b/youtube/yt_data_extract.py @@ -8,7 +8,7 @@ import collections from math import ceil import traceback -# videos (all of type str): +# videos: # id # title @@ -17,11 +17,12 @@ import traceback # author_url # thumbnail # description -# published -# duration -# likes -# dislikes -# views +# time_published (str) +# duration (str) +# like_count (int) +# dislike_count (int) +# view_count (int) +# approx_view_count (str) # playlist_index # playlists: @@ -33,8 +34,8 @@ import traceback # author_url # thumbnail # description -# updated -# size +# time_published (str) +# video_count (int) # first_video_id # from https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/youtube.py @@ -144,26 +145,6 @@ _formats = { '397': {'vcodec': 'av01.0.05M.08'}, } - -def get_plain_text(node): - try: - return node['simpleText'] - except KeyError: - return ''.join(text_run['text'] for text_run in node['runs']) - -def format_text_runs(runs): - if isinstance(runs, str): - return runs - result = '' - for text_run in runs: - if text_run.get("bold", False): - result += "" + html.escape(text_run["text"]) + "" - elif text_run.get('italics', False): - result += "" + html.escape(text_run["text"]) + "" - else: - result += html.escape(text_run["text"]) - return result - def default_get(object, key, default=None, types=()): '''Like dict.get(), but returns default if the result doesn't match one of the types. 
Also works for indexing lists.''' @@ -177,6 +158,19 @@ def default_get(object, key, default=None, types=()): else: return default +def multi_default_get(object, *keys, default=None, types=()): + '''Like default_get, but try other keys if the first fails''' + for key in keys: + try: + result = object[key] + except (TypeError, IndexError, KeyError): + pass + else: + if not types or isinstance(result, types): + return result + else: + continue + return default def default_multi_get(object, *keys, default=None, types=()): @@ -211,101 +205,85 @@ def multi_default_multi_get(object, *key_sequences, default=None, types=()): continue return default +def liberal_update(obj, key, value): + '''Updates obj[key] with value as long as value is not None. + Ensures obj[key] will at least get a value of None, however''' + if (value is not None) or (key not in obj): + obj[key] = value + +def conservative_update(obj, key, value): + '''Only updates obj if it doesn't have key or obj[key] is None''' + if obj.get(key) is None: + obj[key] = value + def remove_redirect(url): if re.fullmatch(r'(((https?:)?//)?(www.)?youtube.com)?/redirect\?.*', url) is not None: # youtube puts these on external links to do tracking query_string = url[url.find('?')+1: ] return urllib.parse.parse_qs(query_string)['q'][0] return url -def get_url(node): - try: - return node['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'] - except KeyError: - return node['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'] +def _recover_urls(runs): + for run in runs: + url = default_multi_get(run, 'navigationEndpoint', 'urlEndpoint', 'url') + text = run.get('text', '') + # second condition is necessary because youtube makes other things into urls, such as hashtags, which we want to keep as text + if url is not None and (text.startswith('http://') or text.startswith('https://')): + url = remove_redirect(url) + run['url'] = url + run['text'] = url # youtube truncates the url text, use actual url instead +def extract_str(node, default=None, recover_urls=False): + '''default is the value returned if the extraction fails. 
If recover_urls is true, will attempt to fix Youtube's truncation of url text (most prominently seen in descriptions)''' + if isinstance(node, str): + return node -def get_text(node): - if node == {}: - return '' try: return node['simpleText'] - except KeyError: + except (KeyError, TypeError): pass - try: - return node['runs'][0]['text'] - except IndexError: # empty text runs - return '' - except KeyError: - print(node) - raise -def get_formatted_text(node): - try: - return node['runs'] - except KeyError: - return node['simpleText'] - -def get_badges(node): - badges = [] - for badge_node in node: - badge = badge_node['metadataBadgeRenderer']['label'] - badges.append(badge) - return badges + if isinstance(node, dict) and 'runs' in node: + if recover_urls: + _recover_urls(node['runs']) + return ''.join(text_run.get('text', '') for text_run in node['runs']) -def get_thumbnail(node): - try: - return node['thumbnails'][0]['url'] # polymer format - except KeyError: - return node['url'] # ajax format - -dispatch = { - -# polymer format - 'title': ('title', get_text), - 'publishedTimeText': ('published', get_text), - 'videoId': ('id', lambda node: node), - 'descriptionSnippet': ('description', get_formatted_text), - 'lengthText': ('duration', get_text), - 'thumbnail': ('thumbnail', get_thumbnail), - 'thumbnails': ('thumbnail', lambda node: node[0]['thumbnails'][0]['url']), - - 'viewCountText': ('views', get_text), - 'numVideosText': ('size', lambda node: get_text(node).split(' ')[0]), # the format is "324 videos" - 'videoCountText': ('size', get_text), - 'playlistId': ('id', lambda node: node), - 'descriptionText': ('description', get_formatted_text), - - 'subscriberCountText': ('subscriber_count', get_text), - 'channelId': ('id', lambda node: node), - 'badges': ('badges', get_badges), - -# ajax format - 'view_count_text': ('views', get_text), - 'num_videos_text': ('size', lambda node: get_text(node).split(' ')[0]), - 'owner_text': ('author', get_text), - 'owner_endpoint': ('author_url', lambda node: node['url']), - 'description': ('description', get_formatted_text), - 'index': ('playlist_index', get_text), - 'short_byline': ('author', get_text), - 'length': ('duration', get_text), - 'video_id': ('id', lambda node: node), + return default -} +def extract_formatted_text(node): + if not node: + return [] + if 'runs' in node: + _recover_urls(node['runs']) + return node['runs'] + elif 'simpleText' in node: + return [{'text': node['simpleText']}] + return [] -def ajax_info(item_json): +def extract_int(string): + if isinstance(string, int): + return string + if not isinstance(string, str): + string = extract_str(string) + if not string: + return None + match = re.search(r'(\d+)', string.replace(',', '')) + if match is None: + return None try: - info = {} - for key, node in item_json.items(): - try: - simple_key, function = dispatch[key] - except KeyError: - continue - info[simple_key] = function(node) - return info - except KeyError: - print(item_json) - raise + return int(match.group(1)) + except ValueError: + return None +def extract_approx_int(string): + '''e.g. 
"15M" from "15M subscribers"''' + if not isinstance(string, str): + string = extract_str(string) + if not string: + return None + match = re.search(r'(\d+[KMBTkmbt])', string.replace(',', '')) + if match is None: + return None + return match.group(1) youtube_url_re = re.compile(r'^(?:(?:(?:https?:)?//)?(?:www\.)?youtube\.com)?(/.*)$') def normalize_url(url): @@ -330,7 +308,7 @@ def prefix_urls(item): def add_extra_html_info(item): if item['type'] == 'video': - item['url'] = util.URL_ORIGIN + '/watch?v=' + item['id'] + item['url'] = (util.URL_ORIGIN + '/watch?v=' + item['id']) if item.get('id') else None video_info = {} for key in ('id', 'title', 'author', 'duration'): @@ -342,17 +320,22 @@ def add_extra_html_info(item): item['video_info'] = json.dumps(video_info) elif item['type'] == 'playlist': - item['url'] = util.URL_ORIGIN + '/playlist?list=' + item['id'] + item['url'] = (util.URL_ORIGIN + '/playlist?list=' + item['id']) if item.get('id') else None elif item['type'] == 'channel': - item['url'] = util.URL_ORIGIN + "/channel/" + item['id'] + item['url'] = (util.URL_ORIGIN + "/channel/" + item['id']) if item.get('id') else None +def extract_item_info(item, additional_info={}): + if not item: + return {'error': 'No item given'} -def renderer_info(renderer, additional_info={}): - type = list(renderer.keys())[0] - renderer = renderer[type] - info = {} + type = default_get(list(item.keys()), 0) + if not type: + return {'error': 'Could not find type'} + item = item[type] + + info = {'error': None} if type in ('itemSectionRenderer', 'compactAutoplayRenderer'): - return renderer_info(renderer['contents'][0], additional_info) + return extract_item_info(default_multi_get(item, 'contents', 0), additional_info) if type in ('movieRenderer', 'clarificationRenderer'): info['type'] = 'unsupported' @@ -360,75 +343,78 @@ def renderer_info(renderer, additional_info={}): info.update(additional_info) - - if type in ('compactVideoRenderer', 'videoRenderer', 'playlistVideoRenderer', 'gridVideoRenderer'): + # type looks like e.g. 
'compactVideoRenderer' or 'gridVideoRenderer' + # camelCase split, https://stackoverflow.com/a/37697078 + type_parts = [s.lower() for s in re.sub(r'([A-Z][a-z]+)', r' \1', type).split()] + if len(type_parts) < 2: + info['type'] = 'unsupported' + return + primary_type = type_parts[-2] + if primary_type == 'video': info['type'] = 'video' - elif type in ('playlistRenderer', 'compactPlaylistRenderer', 'gridPlaylistRenderer', - 'radioRenderer', 'compactRadioRenderer', 'gridRadioRenderer', - 'showRenderer', 'compactShowRenderer', 'gridShowRenderer'): + elif primary_type in ('playlist', 'radio', 'show'): info['type'] = 'playlist' - elif type == 'channelRenderer': + elif primary_type == 'channel': info['type'] = 'channel' - elif type == 'playlistHeaderRenderer': - info['type'] = 'playlist_metadata' else: info['type'] = 'unsupported' - return info - try: - if 'viewCountText' in renderer: # prefer this one as it contains all the digits - info['views'] = get_text(renderer['viewCountText']) - elif 'shortViewCountText' in renderer: - info['views'] = get_text(renderer['shortViewCountText']) - - if 'ownerText' in renderer: - info['author'] = renderer['ownerText']['runs'][0]['text'] - info['author_url'] = normalize_url(renderer['ownerText']['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url']) - try: - overlays = renderer['thumbnailOverlays'] - except KeyError: - pass - else: - for overlay in overlays: - if 'thumbnailOverlayTimeStatusRenderer' in overlay: - info['duration'] = get_text(overlay['thumbnailOverlayTimeStatusRenderer']['text']) - # show renderers don't have videoCountText - elif 'thumbnailOverlayBottomPanelRenderer' in overlay: - info['size'] = get_text(overlay['thumbnailOverlayBottomPanelRenderer']['text']) - - # show renderers don't have playlistId, have to dig into the url to get it - try: - info['id'] = renderer['navigationEndpoint']['watchEndpoint']['playlistId'] - except KeyError: - pass - for key, node in renderer.items(): - if key in ('longBylineText', 'shortBylineText'): - info['author'] = get_text(node) - try: - info['author_url'] = normalize_url(get_url(node)) - except KeyError: - pass + info['title'] = extract_str(item.get('title')) + info['author'] = extract_str(multi_default_get(item, 'longBylineText', 'shortBylineText', 'ownerText')) + info['author_id'] = extract_str(multi_default_multi_get(item, + ['longBylineText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'], + ['shortBylineText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'], + ['ownerText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'] + )) + info['author_url'] = ('https://www.youtube.com/channel/' + info['author_id']) if info['author_id'] else None + info['description'] = extract_formatted_text(multi_default_get(item, 'descriptionSnippet', 'descriptionText')) + info['thumbnail'] = multi_default_multi_get(item, + ['thumbnail', 'thumbnails', 0, 'url'], # videos + ['thumbnails', 0, 'thumbnails', 0, 'url'], # playlists + ['thumbnailRenderer', 'showCustomThumbnailRenderer', 'thumbnail', 'thumbnails', 0, 'url'], # shows + ) - # show renderers don't have thumbnail key at top level, dig into thumbnailRenderer - elif key == 'thumbnailRenderer' and 'showCustomThumbnailRenderer' in node: - info['thumbnail'] = node['showCustomThumbnailRenderer']['thumbnail']['thumbnails'][0]['url'] - else: - try: - simple_key, function = dispatch[key] - except KeyError: - continue - info[simple_key] = function(node) - if info['type'] == 'video' and 'duration' not in info: - 
info['duration'] = 'Live' + info['badges'] = [] + for badge_node in multi_default_get(item, 'badges', 'ownerBadges', default=()): + badge = default_multi_get(badge_node, 'metadataBadgeRenderer', 'label') + if badge: + info['badges'].append(badge) - return info - except KeyError: - print(renderer) - raise + if primary_type in ('video', 'playlist'): + info['time_published'] = extract_str(item.get('publishedTimeText')) + if primary_type == 'video': + info['id'] = item.get('videoId') + info['view_count'] = extract_int(item.get('viewCountText')) + if info['view_count']: + info['approx_view_count'] = '{:,}'.format(info['view_count']) + else: + info['approx_view_count'] = extract_approx_int(multi_default_get(item, 'shortViewCountText')) + info['duration'] = extract_str(item.get('lengthText')) + elif primary_type == 'playlist': + info['id'] = item.get('playlistId') + info['video_count'] = extract_int(item.get('videoCount')) + elif primary_type == 'channel': + info['id'] = item.get('channelId') + info['approx_subscriber_count'] = extract_approx_int(item.get('subscriberCountText')) + elif primary_type == 'show': + info['id'] = default_multi_get(item, 'navigationEndpoint', 'watchEndpoint', 'playlistId') + + if primary_type in ('playlist', 'channel'): + conservative_update(info, 'video_count', extract_int(item.get('videoCountText'))) + + for overlay in item.get('thumbnailOverlays', []): + conservative_update(info, 'duration', extract_str(default_multi_get( + overlay, 'thumbnailOverlayTimeStatusRenderer', 'text' + ))) + # show renderers don't have videoCountText + conservative_update(info, 'video_count', extract_int(default_multi_get( + overlay, 'thumbnailOverlayBottomPanelRenderer', 'text' + ))) + return info def parse_info_prepare_for_html(renderer, additional_info={}): - item = renderer_info(renderer, additional_info) + item = extract_item_info(renderer, additional_info) prefix_urls(item) add_extra_html_info(item) @@ -616,7 +602,7 @@ def extract_channel_info(polymer_json, tab): items, _ = extract_items(response) if tab in ('videos', 'playlists', 'search'): additional_info = {'author': info['channel_name'], 'author_url': 'https://www.youtube.com/channel/' + channel_id} - info['items'] = [renderer_info(renderer, additional_info) for renderer in items] + info['items'] = [extract_item_info(renderer, additional_info) for renderer in items] elif tab == 'about': for item in items: @@ -633,7 +619,7 @@ def extract_channel_info(polymer_json, tab): for link_json in channel_metadata.get('primaryLinks', ()): url = remove_redirect(link_json['navigationEndpoint']['urlEndpoint']['url']) - text = get_plain_text(link_json['title']) + text = extract_str(link_json['title']) info['links'].append( (text, url) ) @@ -644,10 +630,10 @@ def extract_channel_info(polymer_json, tab): stat = channel_metadata[stat_name] except KeyError: continue - info['stats'].append(get_plain_text(stat)) + info['stats'].append(extract_str(stat)) if 'description' in channel_metadata: - info['description'] = get_text(channel_metadata['description']) + info['description'] = extract_str(channel_metadata['description']) else: info['description'] = '' @@ -693,9 +679,9 @@ def extract_search_info(polymer_json): } continue - item_info = renderer_info(renderer) - if item_info['type'] != 'unsupported': - info['items'].append(item_info) + i_info = extract_item_info(renderer) + if i_info.get('type') != 'unsupported': + info['items'].append(i_info) return info @@ -704,13 +690,41 @@ def extract_playlist_metadata(polymer_json): response, err = 
extract_response(polymer_json) if err: return {'error': err} - metadata = renderer_info(response['header']) - metadata['error'] = None - if 'description' not in metadata: - metadata['description'] = '' - - metadata['size'] = int(metadata['size'].replace(',', '')) + metadata = {'error': None} + header = default_multi_get(response, 'header', 'playlistHeaderRenderer', default={}) + metadata['title'] = extract_str(header.get('title')) + + metadata['first_video_id'] = default_multi_get(header, 'playEndpoint', 'watchEndpoint', 'videoId') + first_id = re.search(r'([a-z_\-]{11})', default_multi_get(header, + 'thumbnail', 'thumbnails', 0, 'url', default='')) + if first_id: + conservative_update(metadata, 'first_video_id', first_id.group(1)) + if metadata['first_video_id'] is None: + metadata['thumbnail'] = None + else: + metadata['thumbnail'] = 'https://i.ytimg.com/vi/' + metadata['first_video_id'] + '/mqdefault.jpg' + + metadata['video_count'] = extract_int(header.get('numVideosText')) + metadata['description'] = extract_str(header.get('descriptionText'), default='') + metadata['author'] = extract_str(header.get('ownerText')) + metadata['author_id'] = multi_default_multi_get(header, + ['ownerText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'], + ['ownerEndpoint', 'browseEndpoint', 'browseId']) + if metadata['author_id']: + metadata['author_url'] = 'https://www.youtube.com/channel/' + metadata['author_id'] + else: + metadata['author_url'] = None + metadata['view_count'] = extract_int(header.get('viewCountText')) + metadata['like_count'] = extract_int(header.get('likesCountWithoutLikeText')) + for stat in header.get('stats', ()): + text = extract_str(stat) + if 'videos' in text: + conservative_update(metadata, 'video_count', extract_int(text)) + elif 'views' in text: + conservative_update(metadata, 'view_count', extract_int(text)) + elif 'updated' in text: + metadata['time_published'] = extract_date(text) return metadata @@ -722,7 +736,7 @@ def extract_playlist_info(polymer_json): first_page = 'continuationContents' not in response video_list, _ = extract_items(response) - info['items'] = [renderer_info(renderer) for renderer in video_list] + info['items'] = [extract_item_info(renderer) for renderer in video_list] if first_page: info['metadata'] = extract_playlist_metadata(polymer_json) @@ -777,7 +791,7 @@ def parse_comments_polymer(polymer_json): video_title = comment_thread['commentTargetTitle']['runs'][0]['text'] if 'replies' in comment_thread: - view_replies_text = get_plain_text(comment_thread['replies']['commentRepliesRenderer']['moreText']) + view_replies_text = extract_str(comment_thread['replies']['commentRepliesRenderer']['moreText']) view_replies_text = view_replies_text.replace(',', '') match = re.search(r'(\d+)', view_replies_text) if match is None: @@ -789,15 +803,15 @@ def parse_comments_polymer(polymer_json): comment = { 'author_id': comment_renderer.get('authorId', ''), 'author_avatar': comment_renderer['authorThumbnail']['thumbnails'][0]['url'], - 'likes': comment_renderer['likeCount'], - 'published': get_plain_text(comment_renderer['publishedTimeText']), + 'like_count': comment_renderer['likeCount'], + 'time_published': extract_str(comment_renderer['publishedTimeText']), 'text': comment_renderer['contentText'].get('runs', ''), - 'number_of_replies': number_of_replies, - 'comment_id': comment_renderer['commentId'], + 'reply_count': number_of_replies, + 'id': comment_renderer['commentId'], } if 'authorText' in comment_renderer: # deleted channels have no name or 
channel link - comment['author'] = get_plain_text(comment_renderer['authorText']) + comment['author'] = extract_str(comment_renderer['authorText']) comment['author_url'] = comment_renderer['authorEndpoint']['commandMetadata']['webCommandMetadata']['url'] comment['author_channel_id'] = comment_renderer['authorEndpoint']['browseEndpoint']['browseId'] else: @@ -832,66 +846,6 @@ def check_missing_keys(object, *key_sequences): return None -def extract_str(node, default=None, recover_urls=False): - '''default is the value returned if the extraction fails. If recover_urls is true, will attempt to fix Youtube's truncation of url text (most prominently seen in descriptions)''' - if isinstance(node, str): - return node - - try: - return node['simpleText'] - except (KeyError, TypeError): - pass - - if isinstance(node, dict) and 'runs' in node: - if recover_urls: - result = '' - for run in node['runs']: - url = default_multi_get(run, 'navigationEndpoint', 'urlEndpoint', 'url') - text = run.get('text', '') - # second condition is necessary because youtube makes other things into urls, such as hashtags, which we want to keep as text - if url is not None and (text.startswith('http://') or text.startswith('https://')): - url = remove_redirect(url) - result += url # youtube truncates the url text, use actual url instead - else: - result += text - return result - else: - return ''.join(text_run.get('text', '') for text_run in node['runs']) - - return default - -def extract_formatted_text(node): - try: - result = [] - runs = node['runs'] - for run in runs: - url = default_multi_get(run, 'navigationEndpoint', 'urlEndpoint', 'url') - if url is not None: - run['url'] = remove_redirect(url) - run['text'] = run['url'] # youtube truncates the url text, we don't want that nonsense - return runs - except (KeyError, TypeError): - traceback.print_exc() - pass - - try: - return [{'text': node['simpleText']}] - except (KeyError, TypeError): - pass - - return [] - -def extract_int(string): - if not isinstance(string, str): - return None - match = re.search(r'(\d+)', string.replace(',', '')) - if match is None: - return None - try: - return int(match.group(1)) - except ValueError: - return None - def extract_metadata_row_info(video_renderer_info): # extract category and music list info = { @@ -944,7 +898,7 @@ def extract_watch_info_mobile(top_level): else: info['age_restricted'] = not family_safe info['allowed_countries'] = microformat.get('availableCountries', []) - info['published_date'] = microformat.get('publishDate') + info['time_published'] = microformat.get('publishDate') response = top_level.get('response', {}) @@ -962,15 +916,15 @@ def extract_watch_info_mobile(top_level): info['author'] = extract_str(default_multi_get(video_info, 'owner', 'slimOwnerRenderer', 'title')) info['author_id'] = default_multi_get(video_info, 'owner', 'slimOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId') info['title'] = extract_str(video_info.get('title')) - info['live'] = 'watching' in extract_str(video_info.get('expandedSubtitle')) + info['live'] = 'watching' in extract_str(video_info.get('expandedSubtitle'), default='') info['unlisted'] = False for badge in video_info.get('badges', []): if default_multi_get(badge, 'metadataBadgeRenderer', 'label') == 'Unlisted': info['unlisted'] = True info['like_count'] = None info['dislike_count'] = None - if not info['published_date']: - info['published_date'] = extract_date(extract_str(video_info.get('dateText', None))) + if not info['time_published']: + info['time_published'] 
= extract_date(extract_str(video_info.get('dateText', None))) for button in video_info.get('buttons', ()): button_renderer = button.get('slimMetadataToggleButtonRenderer', {}) @@ -1012,7 +966,7 @@ def extract_watch_info_mobile(top_level): # related videos related, _ = extract_items(response) - info['related_videos'] = [renderer_info(renderer) for renderer in related] + info['related_videos'] = [extract_item_info(renderer) for renderer in related] return info @@ -1032,7 +986,7 @@ def extract_watch_info_desktop(top_level): info.update(extract_metadata_row_info(video_info)) info['description'] = extract_str(video_info.get('description', None), recover_urls=True) - info['published_date'] = extract_date(extract_str(video_info.get('dateText', None))) + info['time_published'] = extract_date(extract_str(video_info.get('dateText', None))) likes_dislikes = default_multi_get(video_info, 'sentimentBar', 'sentimentBarRenderer', 'tooltip', default='').split('/') if len(likes_dislikes) == 2: @@ -1048,7 +1002,7 @@ def extract_watch_info_desktop(top_level): info['view_count'] = extract_int(extract_str(default_multi_get(video_info, 'viewCount', 'videoViewCountRenderer', 'viewCount'))) related = default_multi_get(top_level, 'response', 'contents', 'twoColumnWatchNextResults', 'secondaryResults', 'secondaryResults', 'results', default=[]) - info['related_videos'] = [renderer_info(renderer) for renderer in related] + info['related_videos'] = [extract_item_info(renderer) for renderer in related] return info @@ -1114,17 +1068,6 @@ def extract_playability_error(info, player_response, error_prefix=''): else: info['playability_error'] = error_prefix + 'Unknown playability error' -def liberal_update(obj, key, value): - '''Updates obj[key] with value as long as value is not None. 
- Ensures obj[key] will at least get a value of None, however''' - if (value is not None) or (key not in obj): - obj[key] = value - -def conservative_update(obj, key, value): - '''Only updates obj if it doesn't have key or obj[key] is None''' - if obj.get(key) is None: - obj[key] = value - SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt') def extract_watch_info(polymer_json): info = {'playability_error': None, 'error': None} @@ -1223,8 +1166,8 @@ def extract_watch_info(polymer_json): conservative_update(info, 'author_id', mf.get('externalChannelId')) liberal_update(info, 'unlisted', mf.get('isUnlisted')) liberal_update(info, 'category', mf.get('category')) - liberal_update(info, 'published_date', mf.get('publishDate')) - liberal_update(info, 'uploaded_date', mf.get('uploadDate')) + liberal_update(info, 'time_published', mf.get('publishDate')) + liberal_update(info, 'time_uploaded', mf.get('uploadDate')) # other stuff info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None -- cgit v1.2.3 From f6bf5213a579b16e17e8d72b51b090ffe4bc9bdb Mon Sep 17 00:00:00 2001 From: James Taylor Date: Wed, 18 Dec 2019 19:43:55 -0800 Subject: Extraction: rename multi_get functions to more descriptive names --- youtube/playlist.py | 2 +- youtube/watch.py | 8 +-- youtube/yt_data_extract.py | 126 ++++++++++++++++++++++----------------------- 3 files changed, 68 insertions(+), 68 deletions(-) (limited to 'youtube') diff --git a/youtube/playlist.py b/youtube/playlist.py index ced0644..5dc8ab7 100644 --- a/youtube/playlist.py +++ b/youtube/playlist.py @@ -104,7 +104,7 @@ def get_playlist_page(): if 'id' in item: item['thumbnail'] = '/https://i.ytimg.com/vi/' + item['id'] + '/default.jpg' - video_count = yt_data_extract.default_multi_get(info, 'metadata', 'video_count') + video_count = yt_data_extract.deep_get(info, 'metadata', 'video_count') if video_count is None: video_count = 40 diff --git a/youtube/watch.py b/youtube/watch.py index 2118319..69ab87b 100644 --- a/youtube/watch.py +++ b/youtube/watch.py @@ -185,7 +185,7 @@ def decrypt_signatures(info): return False # No decryption needed if not info['base_js']: return 'Failed to find base.js' - player_name = yt_data_extract.default_get(info['base_js'].split('/'), -2) + player_name = yt_data_extract.get(info['base_js'].split('/'), -2) if not player_name: return 'Could not find player name' @@ -204,7 +204,7 @@ def decrypt_signatures(info): if not function_body: return 'Empty decryption function body' - var_name = yt_data_extract.default_get(function_body[0].split('.'), 0) + var_name = yt_data_extract.get(function_body[0].split('.'), 0) if var_name is None: return 'Could not find var_name' @@ -397,8 +397,8 @@ def get_watch_page(): }) video_sources = get_video_sources(info) - video_height = yt_data_extract.default_multi_get(video_sources, 0, 'height', default=360) - video_width = yt_data_extract.default_multi_get(video_sources, 0, 'width', default=640) + video_height = yt_data_extract.deep_get(video_sources, 0, 'height', default=360) + video_width = yt_data_extract.deep_get(video_sources, 0, 'width', default=640) # 1 second per pixel, or the actual video width theater_video_target_width = max(640, info['duration'] or 0, video_width) diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py index ea67383..6a5e4bb 100644 --- a/youtube/yt_data_extract.py +++ b/youtube/yt_data_extract.py @@ -145,7 +145,7 @@ _formats = { '397': {'vcodec': 'av01.0.05M.08'}, } -def default_get(object, key, default=None, 
types=()): +def get(object, key, default=None, types=()): '''Like dict.get(), but returns default if the result doesn't match one of the types. Also works for indexing lists.''' try: @@ -158,8 +158,8 @@ def default_get(object, key, default=None, types=()): else: return default -def multi_default_get(object, *keys, default=None, types=()): - '''Like default_get, but try other keys if the first fails''' +def multi_get(object, *keys, default=None, types=()): + '''Like get, but try other keys if the first fails''' for key in keys: try: result = object[key] @@ -173,7 +173,7 @@ def multi_default_get(object, *keys, default=None, types=()): return default -def default_multi_get(object, *keys, default=None, types=()): +def deep_get(object, *keys, default=None, types=()): '''Like dict.get(), but for nested dictionaries/sequences, supporting keys or indices. Last argument is the default value to use in case of any IndexErrors or KeyErrors. If types is given and the result doesn't match one of those types, default is returned''' @@ -188,8 +188,8 @@ def default_multi_get(object, *keys, default=None, types=()): else: return default -def multi_default_multi_get(object, *key_sequences, default=None, types=()): - '''Like default_multi_get, but can try different key sequences in case one fails. +def multi_deep_get(object, *key_sequences, default=None, types=()): + '''Like deep_get, but can try different key sequences in case one fails. Return default if all of them fail. key_sequences is a list of lists''' for key_sequence in key_sequences: _object = object @@ -224,7 +224,7 @@ def remove_redirect(url): def _recover_urls(runs): for run in runs: - url = default_multi_get(run, 'navigationEndpoint', 'urlEndpoint', 'url') + url = deep_get(run, 'navigationEndpoint', 'urlEndpoint', 'url') text = run.get('text', '') # second condition is necessary because youtube makes other things into urls, such as hashtags, which we want to keep as text if url is not None and (text.startswith('http://') or text.startswith('https://')): @@ -328,14 +328,14 @@ def extract_item_info(item, additional_info={}): if not item: return {'error': 'No item given'} - type = default_get(list(item.keys()), 0) + type = get(list(item.keys()), 0) if not type: return {'error': 'Could not find type'} item = item[type] info = {'error': None} if type in ('itemSectionRenderer', 'compactAutoplayRenderer'): - return extract_item_info(default_multi_get(item, 'contents', 0), additional_info) + return extract_item_info(deep_get(item, 'contents', 0), additional_info) if type in ('movieRenderer', 'clarificationRenderer'): info['type'] = 'unsupported' @@ -360,23 +360,23 @@ def extract_item_info(item, additional_info={}): info['type'] = 'unsupported' info['title'] = extract_str(item.get('title')) - info['author'] = extract_str(multi_default_get(item, 'longBylineText', 'shortBylineText', 'ownerText')) - info['author_id'] = extract_str(multi_default_multi_get(item, + info['author'] = extract_str(multi_get(item, 'longBylineText', 'shortBylineText', 'ownerText')) + info['author_id'] = extract_str(multi_deep_get(item, ['longBylineText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'], ['shortBylineText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'], ['ownerText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'] )) info['author_url'] = ('https://www.youtube.com/channel/' + info['author_id']) if info['author_id'] else None - info['description'] = extract_formatted_text(multi_default_get(item, 'descriptionSnippet', 
'descriptionText')) - info['thumbnail'] = multi_default_multi_get(item, + info['description'] = extract_formatted_text(multi_get(item, 'descriptionSnippet', 'descriptionText')) + info['thumbnail'] = multi_deep_get(item, ['thumbnail', 'thumbnails', 0, 'url'], # videos ['thumbnails', 0, 'thumbnails', 0, 'url'], # playlists ['thumbnailRenderer', 'showCustomThumbnailRenderer', 'thumbnail', 'thumbnails', 0, 'url'], # shows ) info['badges'] = [] - for badge_node in multi_default_get(item, 'badges', 'ownerBadges', default=()): - badge = default_multi_get(badge_node, 'metadataBadgeRenderer', 'label') + for badge_node in multi_get(item, 'badges', 'ownerBadges', default=()): + badge = deep_get(badge_node, 'metadataBadgeRenderer', 'label') if badge: info['badges'].append(badge) @@ -389,7 +389,7 @@ def extract_item_info(item, additional_info={}): if info['view_count']: info['approx_view_count'] = '{:,}'.format(info['view_count']) else: - info['approx_view_count'] = extract_approx_int(multi_default_get(item, 'shortViewCountText')) + info['approx_view_count'] = extract_approx_int(multi_get(item, 'shortViewCountText')) info['duration'] = extract_str(item.get('lengthText')) elif primary_type == 'playlist': info['id'] = item.get('playlistId') @@ -398,17 +398,17 @@ def extract_item_info(item, additional_info={}): info['id'] = item.get('channelId') info['approx_subscriber_count'] = extract_approx_int(item.get('subscriberCountText')) elif primary_type == 'show': - info['id'] = default_multi_get(item, 'navigationEndpoint', 'watchEndpoint', 'playlistId') + info['id'] = deep_get(item, 'navigationEndpoint', 'watchEndpoint', 'playlistId') if primary_type in ('playlist', 'channel'): conservative_update(info, 'video_count', extract_int(item.get('videoCountText'))) for overlay in item.get('thumbnailOverlays', []): - conservative_update(info, 'duration', extract_str(default_multi_get( + conservative_update(info, 'duration', extract_str(deep_get( overlay, 'thumbnailOverlayTimeStatusRenderer', 'text' ))) # show renderers don't have videoCountText - conservative_update(info, 'video_count', extract_int(default_multi_get( + conservative_update(info, 'video_count', extract_int(deep_get( overlay, 'thumbnailOverlayBottomPanelRenderer', 'text' ))) return info @@ -422,7 +422,7 @@ def parse_info_prepare_for_html(renderer, additional_info={}): def extract_response(polymer_json): '''return response, error''' - response = multi_default_multi_get(polymer_json, [1, 'response'], ['response'], default=None, types=dict) + response = multi_deep_get(polymer_json, [1, 'response'], ['response'], default=None, types=dict) if response is None: return None, 'Failed to extract response' else: @@ -468,25 +468,25 @@ item_types = { } def traverse_browse_renderer(renderer): - for tab in default_get(renderer, 'tabs', (), types=(list, tuple)): - tab_renderer = multi_default_multi_get(tab, ['tabRenderer'], ['expandableTabRenderer'], default=None, types=dict) + for tab in get(renderer, 'tabs', (), types=(list, tuple)): + tab_renderer = multi_deep_get(tab, ['tabRenderer'], ['expandableTabRenderer'], default=None, types=dict) if tab_renderer is None: continue if tab_renderer.get('selected', False): - return default_get(tab_renderer, 'content', {}, types=(dict)) + return get(tab_renderer, 'content', {}, types=(dict)) print('Could not find tab with content') return {} def traverse_standard_list(renderer): - renderer_list = multi_default_multi_get(renderer, ['contents'], ['items'], default=(), types=(list, tuple)) - continuation = 
default_multi_get(renderer, 'continuations', 0, 'nextContinuationData', 'continuation') + renderer_list = multi_deep_get(renderer, ['contents'], ['items'], default=(), types=(list, tuple)) + continuation = deep_get(renderer, 'continuations', 0, 'nextContinuationData', 'continuation') return renderer_list, continuation # these renderers contain one inside them nested_renderer_dispatch = { 'singleColumnBrowseResultsRenderer': traverse_browse_renderer, 'twoColumnBrowseResultsRenderer': traverse_browse_renderer, - 'twoColumnSearchResultsRenderer': lambda renderer: default_get(renderer, 'primaryContents', {}, types=dict), + 'twoColumnSearchResultsRenderer': lambda renderer: get(renderer, 'primaryContents', {}, types=dict), } # these renderers contain a list of renderers in side them @@ -495,17 +495,17 @@ nested_renderer_list_dispatch = { 'itemSectionRenderer': traverse_standard_list, 'gridRenderer': traverse_standard_list, 'playlistVideoListRenderer': traverse_standard_list, - 'singleColumnWatchNextResults': lambda r: (default_multi_get(r, 'results', 'results', 'contents', default=[], types=(list, tuple)), None), + 'singleColumnWatchNextResults': lambda r: (deep_get(r, 'results', 'results', 'contents', default=[], types=(list, tuple)), None), } def extract_items(response, item_types=item_types): '''return items, ctoken''' if 'continuationContents' in response: # always has just the one [something]Continuation key, but do this just in case they add some tracking key or something - for key, renderer_continuation in default_get(response, 'continuationContents', {}, types=dict).items(): + for key, renderer_continuation in get(response, 'continuationContents', {}, types=dict).items(): if key.endswith('Continuation'): # e.g. commentSectionContinuation, playlistVideoListContinuation - items = multi_default_multi_get(renderer_continuation, ['contents'], ['items'], default=None, types=(list, tuple)) - ctoken = default_multi_get(renderer_continuation, 'continuations', 0, 'nextContinuationData', 'continuation', default=None, types=str) + items = multi_deep_get(renderer_continuation, ['contents'], ['items'], default=None, types=(list, tuple)) + ctoken = deep_get(renderer_continuation, 'continuations', 0, 'nextContinuationData', 'continuation', default=None, types=str) return items, ctoken return [], None elif 'contents' in response: @@ -515,7 +515,7 @@ def extract_items(response, item_types=item_types): iter_stack = collections.deque() current_iter = iter(()) - renderer = default_get(response, 'contents', {}, types=dict) + renderer = get(response, 'contents', {}, types=dict) while True: # mode 1: dig into the current renderer @@ -692,11 +692,11 @@ def extract_playlist_metadata(polymer_json): return {'error': err} metadata = {'error': None} - header = default_multi_get(response, 'header', 'playlistHeaderRenderer', default={}) + header = deep_get(response, 'header', 'playlistHeaderRenderer', default={}) metadata['title'] = extract_str(header.get('title')) - metadata['first_video_id'] = default_multi_get(header, 'playEndpoint', 'watchEndpoint', 'videoId') - first_id = re.search(r'([a-z_\-]{11})', default_multi_get(header, + metadata['first_video_id'] = deep_get(header, 'playEndpoint', 'watchEndpoint', 'videoId') + first_id = re.search(r'([a-z_\-]{11})', deep_get(header, 'thumbnail', 'thumbnails', 0, 'url', default='')) if first_id: conservative_update(metadata, 'first_video_id', first_id.group(1)) @@ -708,7 +708,7 @@ def extract_playlist_metadata(polymer_json): metadata['video_count'] = 
extract_int(header.get('numVideosText')) metadata['description'] = extract_str(header.get('descriptionText'), default='') metadata['author'] = extract_str(header.get('ownerText')) - metadata['author_id'] = multi_default_multi_get(header, + metadata['author_id'] = multi_deep_get(header, ['ownerText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'], ['ownerEndpoint', 'browseEndpoint', 'browseId']) if metadata['author_id']: @@ -854,9 +854,9 @@ def extract_metadata_row_info(video_renderer_info): } current_song = {} - for row in default_multi_get(video_renderer_info, 'metadataRowContainer', 'metadataRowContainerRenderer', 'rows', default=[]): - row_title = extract_str(default_multi_get(row, 'metadataRowRenderer', 'title'), default='') - row_content = extract_str(default_multi_get(row, 'metadataRowRenderer', 'contents', 0)) + for row in deep_get(video_renderer_info, 'metadataRowContainer', 'metadataRowContainerRenderer', 'rows', default=[]): + row_title = extract_str(deep_get(row, 'metadataRowRenderer', 'title'), default='') + row_content = extract_str(deep_get(row, 'metadataRowRenderer', 'contents', 0)) if row_title == 'Category': info['category'] = row_content elif row_title in ('Song', 'Music'): @@ -890,7 +890,7 @@ def extract_date(date_text): def extract_watch_info_mobile(top_level): info = {} - microformat = default_multi_get(top_level, 'playerResponse', 'microformat', 'playerMicroformatRenderer', default={}) + microformat = deep_get(top_level, 'playerResponse', 'microformat', 'playerMicroformatRenderer', default={}) family_safe = microformat.get('isFamilySafe') if family_safe is None: @@ -913,13 +913,13 @@ def extract_watch_info_mobile(top_level): info.update(extract_metadata_row_info(video_info)) info['description'] = extract_str(video_info.get('description'), recover_urls=True) info['view_count'] = extract_int(extract_str(video_info.get('expandedSubtitle'))) - info['author'] = extract_str(default_multi_get(video_info, 'owner', 'slimOwnerRenderer', 'title')) - info['author_id'] = default_multi_get(video_info, 'owner', 'slimOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId') + info['author'] = extract_str(deep_get(video_info, 'owner', 'slimOwnerRenderer', 'title')) + info['author_id'] = deep_get(video_info, 'owner', 'slimOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId') info['title'] = extract_str(video_info.get('title')) info['live'] = 'watching' in extract_str(video_info.get('expandedSubtitle'), default='') info['unlisted'] = False for badge in video_info.get('badges', []): - if default_multi_get(badge, 'metadataBadgeRenderer', 'label') == 'Unlisted': + if deep_get(badge, 'metadataBadgeRenderer', 'label') == 'Unlisted': info['unlisted'] = True info['like_count'] = None info['dislike_count'] = None @@ -929,10 +929,10 @@ def extract_watch_info_mobile(top_level): button_renderer = button.get('slimMetadataToggleButtonRenderer', {}) # all the digits can be found in the accessibility data - count = extract_int(default_multi_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText', 'accessibility', 'accessibilityData', 'label')) + count = extract_int(deep_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText', 'accessibility', 'accessibilityData', 'label')) # this count doesn't have all the digits, it's like 53K for instance - dumb_count = extract_int(extract_str(default_multi_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText'))) + dumb_count = extract_int(extract_str(deep_get(button_renderer, 'button', 
'toggleButtonRenderer', 'defaultText'))) # the accessibility text will be "No likes" or "No dislikes" or something like that, but dumb count will be 0 if dumb_count == 0: @@ -947,7 +947,7 @@ def extract_watch_info_mobile(top_level): items, _ = extract_items(response, item_types={'commentSectionRenderer'}) if items: comment_info = items[0]['commentSectionRenderer'] - comment_count_text = extract_str(default_multi_get(comment_info, 'header', 'commentSectionHeaderRenderer', 'countText')) + comment_count_text = extract_str(deep_get(comment_info, 'header', 'commentSectionHeaderRenderer', 'countText')) if comment_count_text == 'Comments': # just this with no number, means 0 comments info['comment_count'] = 0 else: @@ -980,7 +980,7 @@ def extract_watch_info_desktop(top_level): } video_info = {} - for renderer in default_multi_get(top_level, 'response', 'contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', default=()): + for renderer in deep_get(top_level, 'response', 'contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', default=()): if renderer and list(renderer.keys())[0] in ('videoPrimaryInfoRenderer', 'videoSecondaryInfoRenderer'): video_info.update(list(renderer.values())[0]) @@ -988,7 +988,7 @@ def extract_watch_info_desktop(top_level): info['description'] = extract_str(video_info.get('description', None), recover_urls=True) info['time_published'] = extract_date(extract_str(video_info.get('dateText', None))) - likes_dislikes = default_multi_get(video_info, 'sentimentBar', 'sentimentBarRenderer', 'tooltip', default='').split('/') + likes_dislikes = deep_get(video_info, 'sentimentBar', 'sentimentBarRenderer', 'tooltip', default='').split('/') if len(likes_dislikes) == 2: info['like_count'] = extract_int(likes_dislikes[0]) info['dislike_count'] = extract_int(likes_dislikes[1]) @@ -997,11 +997,11 @@ def extract_watch_info_desktop(top_level): info['dislike_count'] = None info['title'] = extract_str(video_info.get('title', None)) - info['author'] = extract_str(default_multi_get(video_info, 'owner', 'videoOwnerRenderer', 'title')) - info['author_id'] = default_multi_get(video_info, 'owner', 'videoOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId') - info['view_count'] = extract_int(extract_str(default_multi_get(video_info, 'viewCount', 'videoViewCountRenderer', 'viewCount'))) + info['author'] = extract_str(deep_get(video_info, 'owner', 'videoOwnerRenderer', 'title')) + info['author_id'] = deep_get(video_info, 'owner', 'videoOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId') + info['view_count'] = extract_int(extract_str(deep_get(video_info, 'viewCount', 'videoViewCountRenderer', 'viewCount'))) - related = default_multi_get(top_level, 'response', 'contents', 'twoColumnWatchNextResults', 'secondaryResults', 'secondaryResults', 'results', default=[]) + related = deep_get(top_level, 'response', 'contents', 'twoColumnWatchNextResults', 'secondaryResults', 'secondaryResults', 'results', default=[]) info['related_videos'] = [extract_item_info(renderer) for renderer in related] return info @@ -1054,10 +1054,10 @@ def extract_playability_error(info, player_response, error_prefix=''): info['playability_error'] = None return - playability_status = default_multi_get(player_response, 'playabilityStatus', 'status', default=None) + playability_status = deep_get(player_response, 'playabilityStatus', 'status', default=None) info['playability_status'] = playability_status - playability_reason = 
extract_str(multi_default_multi_get(player_response, + playability_reason = extract_str(multi_deep_get(player_response, ['playabilityStatus', 'reason'], ['playabilityStatus', 'errorScreen', 'playerErrorMessageRenderer', 'reason'], default='Could not find playability error') @@ -1091,7 +1091,7 @@ def extract_watch_info(polymer_json): if error: info['playability_error'] = error - player_args = default_multi_get(top_level, 'player', 'args', default={}) + player_args = deep_get(top_level, 'player', 'args', default={}) player_response = json.loads(player_args['player_response']) if 'player_response' in player_args else {} # captions @@ -1100,8 +1100,8 @@ def extract_watch_info(polymer_json): info['_manual_caption_language_names'] = {} # language name written in that language, needed in some cases to create the url info['translation_languages'] = [] captions_info = player_response.get('captions', {}) - info['_captions_base_url'] = normalize_url(default_multi_get(captions_info, 'playerCaptionsRenderer', 'baseUrl')) - for caption_track in default_multi_get(captions_info, 'playerCaptionsTracklistRenderer', 'captionTracks', default=()): + info['_captions_base_url'] = normalize_url(deep_get(captions_info, 'playerCaptionsRenderer', 'baseUrl')) + for caption_track in deep_get(captions_info, 'playerCaptionsTracklistRenderer', 'captionTracks', default=()): lang_code = caption_track.get('languageCode') if not lang_code: continue @@ -1110,11 +1110,11 @@ def extract_watch_info(polymer_json): else: info['manual_caption_languages'].append(lang_code) base_url = caption_track.get('baseUrl', '') - lang_name = default_multi_get(urllib.parse.parse_qs(urllib.parse.urlparse(base_url).query), 'name', 0) + lang_name = deep_get(urllib.parse.parse_qs(urllib.parse.urlparse(base_url).query), 'name', 0) if lang_name: info['_manual_caption_language_names'][lang_code] = lang_name - for translation_lang_info in default_multi_get(captions_info, 'playerCaptionsTracklistRenderer', 'translationLanguages', default=()): + for translation_lang_info in deep_get(captions_info, 'playerCaptionsTracklistRenderer', 'translationLanguages', default=()): lang_code = translation_lang_info.get('languageCode') if lang_code: info['translation_languages'].append(lang_code) @@ -1131,18 +1131,18 @@ def extract_watch_info(polymer_json): info['age_restricted'] = (info['playability_status'] == 'LOGIN_REQUIRED' and info['playability_error'] and ' age' in info['playability_error']) # base_js (for decryption of signatures) - info['base_js'] = default_multi_get(top_level, 'player', 'assets', 'js') + info['base_js'] = deep_get(top_level, 'player', 'assets', 'js') if info['base_js']: info['base_js'] = normalize_url(info['base_js']) - mobile = 'singleColumnWatchNextResults' in default_multi_get(top_level, 'response', 'contents', default={}) + mobile = 'singleColumnWatchNextResults' in deep_get(top_level, 'response', 'contents', default={}) if mobile: info.update(extract_watch_info_mobile(top_level)) else: info.update(extract_watch_info_desktop(top_level)) # stuff from videoDetails. 
From 004e14a53800a5235d850517db8a3b421e804b30 Mon Sep 17 00:00:00 2001
From: James Taylor
Date: Wed, 18 Dec 2019 20:53:11 -0800
Subject: Extraction: Use accessibility data to get timestamp and to get views
 for recommended videos

---
 youtube/yt_data_extract.py | 10 ++++++++++
 1 file changed, 10 insertions(+)
(limited to 'youtube')

diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py
index 6a5e4bb..ac5b78b 100644
--- a/youtube/yt_data_extract.py
+++ b/youtube/yt_data_extract.py
@@ -386,6 +386,16 @@ def extract_item_info(item, additional_info={}):
     if primary_type == 'video':
         info['id'] = item.get('videoId')
         info['view_count'] = extract_int(item.get('viewCountText'))
+
+        # dig into accessibility data to get view_count for videos marked as recommended, and to get time_published
+        accessibility_label = deep_get(item, 'title', 'accessibility', 'accessibilityData', 'label', default='')
+        timestamp = re.search(r'(\d+ \w+ ago)', accessibility_label)
+        if timestamp:
+            conservative_update(info, 'time_published', timestamp.group(1))
+        view_count = re.search(r'(\d+) views', accessibility_label.replace(',', ''))
+        if view_count:
+            conservative_update(info, 'view_count', int(view_count.group(1)))
+
         if info['view_count']:
             info['approx_view_count'] = '{:,}'.format(info['view_count'])
         else:
-- cgit v1.2.3
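As a rough sketch of the parsing the patch above adds (the accessibility label string is invented for illustration; conservative_update mirrors the helper of the same name in yt_data_extract):

    import re

    def conservative_update(obj, key, value):
        # only fills the key if it is missing or None, like the real helper
        if obj.get(key) is None:
            obj[key] = value

    info = {'view_count': None, 'time_published': None}
    label = 'Example Video by Example Channel 2 years ago 11 minutes 1,502,377 views'

    timestamp = re.search(r'(\d+ \w+ ago)', label)
    if timestamp:
        conservative_update(info, 'time_published', timestamp.group(1))
    view_count = re.search(r'(\d+) views', label.replace(',', ''))
    if view_count:
        conservative_update(info, 'view_count', int(view_count.group(1)))

    print(info)  # {'view_count': 1502377, 'time_published': '2 years ago'}

Because conservative_update only fills in missing values, the accessibility data never overrides a view count that was already extracted from viewCountText.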
From 02848a1a3213bb4ad872865768a7b97f663a24ed Mon Sep 17 00:00:00 2001
From: James Taylor
Date: Thu, 19 Dec 2019 15:46:16 -0800
Subject: Extraction: Adjust related videos box to fit the new time_published
 information

time_published is placed to the right of view_count in related videos.

The author is now always above the other stats. This makes no difference
in the big search result boxes, since the description snippet there is
always very short, but it matters in related video boxes: they are
narrow and the author name can be very long, so the author must not sit
inline with the other stats.
---
 youtube/templates/common_elements.html | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)
(limited to 'youtube')

diff --git a/youtube/templates/common_elements.html b/youtube/templates/common_elements.html
index 4c776b6..7914c08 100644
--- a/youtube/templates/common_elements.html
+++ b/youtube/templates/common_elements.html
@@ -31,18 +31,18 @@
+        {% if include_author %}
+            {% if info.get('author_url') %}
+                By {{ info['author'] }}
+            {% else %}
+                {{ info['author'] }}
+            {% endif %}
+        {% endif %}
         {% if info['type'] == 'channel' %}
             • {{ info['approx_subscriber_count'] }} subscribers
             • {{ info['video_count'] }} videos
         {% else %}
-            {% if include_author %}
-                {% if info.get('author_url') %}
-                    • By {{ info['author'] }}
-                {% else %}
-                    • {{ info['author'] }}
-                {% endif %}
-            {% endif %}
             {% if info.get('approx_view_count') %}
                 • {{ info['approx_view_count'] }} views
             {% endif %}
-- cgit v1.2.3
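To make the layout concrete, here is a standalone jinja2 sketch of the arrangement this patch establishes; the template string is a simplified stand-in for the real macro in common_elements.html, not a copy of it:

    from jinja2 import Template

    # author on its own line above the bullet-separated stats
    template = Template(
        "{% if include_author %}By {{ info['author'] }}\n{% endif %}"
        "{% if info.get('approx_view_count') %}• {{ info['approx_view_count'] }} views {% endif %}"
        "{% if info.get('time_published') %}• {{ info['time_published'] }}{% endif %}"
    )

    print(template.render(include_author=True, info={
        'author': 'Some Channel With A Very Long Name',
        'approx_view_count': '1,502,377',
        'time_published': '2 years ago',
    }))
    # By Some Channel With A Very Long Name
    # • 1,502,377 views • 2 years ago

Keeping the long author name off the stats line is what stops the narrow related-videos boxes from overflowing.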
From beb0976b5bc09a053d027a6e7020bb3a83f4aca1 Mon Sep 17 00:00:00 2001
From: James Taylor
Date: Thu, 19 Dec 2019 15:50:19 -0800
Subject: Extraction: Rewrite comment extraction; remove author_id and rename
 author_channel_id to author_id; fix a bug in extract_items

author_id (an internal SQL-like integer previously required for deleting
and editing comments) has been removed by YouTube and is no longer
required. Remove it for simplicity.

Rename author_channel_id to author_id for consistency with the other
extraction attributes.

extract_items returned None for items instead of [] for empty
continuation responses. Fixes that.
---
 youtube/comments.py        |   9 ++--
 youtube/post_comment.py    |   8 +--
 youtube/yt_data_extract.py | 130 ++++++++++++++++++++-------------------
 3 files changed, 67 insertions(+), 80 deletions(-)
(limited to 'youtube')
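For orientation, this is the shape each entry of comments_info['comments'] takes after the rewrite. A hand-written sample with invented values, not captured output; any field may be None when the data is absent (e.g. for deleted channels):

    sample_comment = {
        'author': 'Example User',
        'author_url': '/channel/UC0000000000000000000000',      # invented id
        'author_id': 'UC0000000000000000000000',                # was author_channel_id before this patch
        'author_avatar': 'https://example.invalid/avatar.jpg',  # invented url
        'id': 'UgzExampleCommentId',                            # invented comment id
        'text': [{'text': 'Nice video!'}],                      # formatted-text runs
        'time_published': '2 days ago',
        'like_count': 5,
        'reply_count': 0,
    }

The top-level dict additionally carries video_id, offset, is_replies, sort, video_title, ctoken, and error, as the yt_data_extract.py diff below shows.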
proto.percent_b64encode(action).decode('ascii')
 
     sej = json.dumps({"clickTrackingParams":"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=","commandMetadata":{"webCommandMetadata":{"url":"/service_ajax","sendPost":True}},"performCommentActionEndpoint":{"action":action}})
@@ -115,7 +115,7 @@ def delete_comment():
     cookiejar = accounts.account_cookiejar(request.values['channel_id'])
     token = get_session_token(video_id, cookiejar)
 
-    code = _delete_comment(video_id, request.values['comment_id'], request.values['author_id'], token, cookiejar)
+    code = _delete_comment(video_id, request.values['comment_id'], token, cookiejar)
 
     if code == "SUCCESS":
         return flask.redirect(util.URL_ORIGIN + '/comment_delete_success', 303)
@@ -147,7 +147,7 @@ def post_comment():
 
 @yt_app.route('/delete_comment', methods=['GET'])
 def get_delete_comment_page():
-    parameters = [(parameter_name, request.args[parameter_name]) for parameter_name in ('video_id', 'channel_id', 'author_id', 'comment_id')]
+    parameters = [(parameter_name, request.args[parameter_name]) for parameter_name in ('video_id', 'channel_id', 'comment_id')]
     return flask.render_template('delete_comment.html', parameters = parameters)
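The reply-count handling in the yt_data_extract.py diff below relies on extract_int gaining a default parameter. A condensed sketch of that behavior (simplified reimplementation; the example strings are modeled on the UI text the comment code deals with):

    import re

    def extract_int(string, default=None):
        # condensed version of the helper: first integer in the string, else default
        if isinstance(string, int):
            return string
        if not isinstance(string, str):
            return default
        match = re.search(r'(\d+)', string.replace(',', ''))
        return int(match.group(1)) if match else default

    # "View 15 replies" carries a count; with exactly one reply the text is
    # just "View reply", so defaulting to 1 recovers the right number
    print(extract_int('View 15 replies', default=1))  # 15
    print(extract_int('View reply', default=1))       # 1

The same patch also makes extract_items return [] instead of None for empty continuation responses, so callers can iterate over the result unconditionally.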
diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py
index ac5b78b..68550cf 100644
--- a/youtube/yt_data_extract.py
+++ b/youtube/yt_data_extract.py
@@ -259,20 +259,20 @@ def extract_formatted_text(node):
         return [{'text': node['simpleText']}]
     return []
 
-def extract_int(string):
+def extract_int(string, default=None):
     if isinstance(string, int):
         return string
     if not isinstance(string, str):
         string = extract_str(string)
     if not string:
-        return None
+        return default
     match = re.search(r'(\d+)', string.replace(',', ''))
     if match is None:
-        return None
+        return default
     try:
         return int(match.group(1))
     except ValueError:
-        return None
+        return default
 
 def extract_approx_int(string):
     '''e.g. "15M" from "15M subscribers"'''
@@ -514,7 +514,7 @@ def extract_items(response, item_types=item_types):
         # always has just the one [something]Continuation key, but do this just in case they add some tracking key or something
         for key, renderer_continuation in get(response, 'continuationContents', {}, types=dict).items():
             if key.endswith('Continuation'): # e.g. commentSectionContinuation, playlistVideoListContinuation
-                items = multi_deep_get(renderer_continuation, ['contents'], ['items'], default=None, types=(list, tuple))
+                items = multi_deep_get(renderer_continuation, ['contents'], ['items'], default=[], types=(list, tuple))
                 ctoken = deep_get(renderer_continuation, 'continuations', 0, 'nextContinuationData', 'continuation', default=None, types=str)
                 return items, ctoken
         return [], None
@@ -772,78 +772,66 @@ def ctoken_metadata(ctoken):
         result['sort'] = 0
     return result
 
-def parse_comments_polymer(polymer_json):
-    try:
-        video_title = ''
-        response, err = extract_response(polymer_json)
-        if err:
-            raise Exception(err)
-
-        try:
-            url = polymer_json[1]['url']
-        except (TypeError, IndexError, KeyError):
-            url = polymer_json['url']
+def extract_comments_info(polymer_json):
+    response, err = extract_response(polymer_json)
+    if err:
+        return {'error': err}
+    info = {'error': None}
 
+    url = multi_deep_get(polymer_json, [1, 'url'], ['url'])
+    if url:
         ctoken = urllib.parse.parse_qs(url[url.find('?')+1:])['ctoken'][0]
         metadata = ctoken_metadata(ctoken)
+    else:
+        metadata = {}
+    info['video_id'] = metadata.get('video_id')
+    info['offset'] = metadata.get('offset')
+    info['is_replies'] = metadata.get('is_replies')
+    info['sort'] = metadata.get('sort')
+    info['video_title'] = None
+
+    comments, ctoken = extract_items(response)
+    info['comments'] = []
+    info['ctoken'] = ctoken
+    for comment in comments:
+        comment_info = {}
+
+        if 'commentThreadRenderer' in comment:  # top level comments
+            conservative_update(info, 'is_replies', False)
+            comment_thread = comment['commentThreadRenderer']
+            info['video_title'] = extract_str(comment_thread.get('commentTargetTitle'))
+            if 'replies' not in comment_thread:
+                comment_info['reply_count'] = 0
+            else:
+                comment_info['reply_count'] = extract_int(deep_get(comment_thread,
+                    'replies', 'commentRepliesRenderer', 'moreText'
+                ), default=1)   # With 1 reply, the text reads "View reply"
+            comment_renderer = deep_get(comment_thread, 'comment', 'commentRenderer', default={})
+        elif 'commentRenderer' in comment:  # replies
+            comment_info['reply_count'] = 0 # replyCount, below, not present for replies even if the reply has further replies to it
+            conservative_update(info, 'is_replies', True)
+            comment_renderer = comment['commentRenderer']
+        else:
+            comment_renderer = {}
 
-        comments_raw, ctoken = extract_items(response)
+        # These 3 are sometimes absent, likely because the channel was deleted
+        comment_info['author'] = extract_str(comment_renderer.get('authorText'))
+        comment_info['author_url'] = deep_get(comment_renderer,
+            'authorEndpoint', 'commandMetadata', 'webCommandMetadata', 'url')
+        comment_info['author_id'] = deep_get(comment_renderer,
+            'authorEndpoint', 'browseEndpoint', 'browseId')
 
-        comments = []
-        for comment_json in comments_raw:
-            number_of_replies = 0
-            try:
-                comment_thread = comment_json['commentThreadRenderer']
-            except KeyError:
-                comment_renderer = comment_json['commentRenderer']
-            else:
-                if 'commentTargetTitle' in comment_thread:
-                    video_title = comment_thread['commentTargetTitle']['runs'][0]['text']
-
-                if 'replies' in comment_thread:
-                    view_replies_text = extract_str(comment_thread['replies']['commentRepliesRenderer']['moreText'])
-                    view_replies_text = view_replies_text.replace(',', '')
-                    match = re.search(r'(\d+)', view_replies_text)
-                    if match is None:
-                        number_of_replies = 1
-                    else:
-                        number_of_replies = int(match.group(1))
-                comment_renderer = comment_thread['comment']['commentRenderer']
-
-            comment = {
-
'author_id': comment_renderer.get('authorId', ''), - 'author_avatar': comment_renderer['authorThumbnail']['thumbnails'][0]['url'], - 'like_count': comment_renderer['likeCount'], - 'time_published': extract_str(comment_renderer['publishedTimeText']), - 'text': comment_renderer['contentText'].get('runs', ''), - 'reply_count': number_of_replies, - 'id': comment_renderer['commentId'], - } + comment_info['author_avatar'] = deep_get(comment_renderer, + 'authorThumbnail', 'thumbnails', 0, 'url') + comment_info['id'] = comment_renderer.get('commentId') + comment_info['text'] = extract_formatted_text(comment_renderer.get('contentText')) + comment_info['time_published'] = extract_str(comment_renderer.get('publishedTimeText')) + comment_info['like_count'] = comment_renderer.get('likeCount') + liberal_update(comment_info, 'reply_count', comment_renderer.get('replyCount')) - if 'authorText' in comment_renderer: # deleted channels have no name or channel link - comment['author'] = extract_str(comment_renderer['authorText']) - comment['author_url'] = comment_renderer['authorEndpoint']['commandMetadata']['webCommandMetadata']['url'] - comment['author_channel_id'] = comment_renderer['authorEndpoint']['browseEndpoint']['browseId'] - else: - comment['author'] = '' - comment['author_url'] = '' - comment['author_channel_id'] = '' - - comments.append(comment) - except Exception as e: - print('Error parsing comments: ' + str(e)) - comments = () - ctoken = '' - - return { - 'ctoken': ctoken, - 'comments': comments, - 'video_title': video_title, - 'video_id': metadata['video_id'], - 'offset': metadata['offset'], - 'is_replies': metadata['is_replies'], - 'sort': metadata['sort'], - } + info['comments'].append(comment_info) + + return info def check_missing_keys(object, *key_sequences): for key_sequence in key_sequences: -- cgit v1.2.3 From 76376b29a0adf6bd6d7a0202d904f923bdc8aa57 Mon Sep 17 00:00:00 2001 From: James Taylor Date: Thu, 19 Dec 2019 19:28:58 -0800 Subject: Extraction: Split yt_data_extract.py into multiple files --- youtube/yt_data_extract.py | 1190 --------------------------- youtube/yt_data_extract/__init__.py | 11 + youtube/yt_data_extract/common.py | 455 ++++++++++ youtube/yt_data_extract/everything_else.py | 273 ++++++ youtube/yt_data_extract/watch_extraction.py | 449 ++++++++++ 5 files changed, 1188 insertions(+), 1190 deletions(-) delete mode 100644 youtube/yt_data_extract.py create mode 100644 youtube/yt_data_extract/__init__.py create mode 100644 youtube/yt_data_extract/common.py create mode 100644 youtube/yt_data_extract/everything_else.py create mode 100644 youtube/yt_data_extract/watch_extraction.py (limited to 'youtube') diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py deleted file mode 100644 index 68550cf..0000000 --- a/youtube/yt_data_extract.py +++ /dev/null @@ -1,1190 +0,0 @@ -from youtube import util, proto - -import html -import json -import re -import urllib.parse -import collections -from math import ceil -import traceback - -# videos: - -# id -# title -# url -# author -# author_url -# thumbnail -# description -# time_published (str) -# duration (str) -# like_count (int) -# dislike_count (int) -# view_count (int) -# approx_view_count (str) -# playlist_index - -# playlists: - -# id -# title -# url -# author -# author_url -# thumbnail -# description -# time_published (str) -# video_count (int) -# first_video_id - -# from https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/youtube.py -_formats = { - '5': {'ext': 'flv', 'width': 400, 'height': 
240, 'acodec': 'mp3', 'audio_bitrate': 64, 'vcodec': 'h263'}, - '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'audio_bitrate': 64, 'vcodec': 'h263'}, - '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'}, - '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'audio_bitrate': 24, 'vcodec': 'mp4v'}, - '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'audio_bitrate': 96, 'vcodec': 'h264'}, - '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'}, - '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'}, - '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'}, - # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), audio_bitrate varies as well - '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'}, - '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'}, - '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'}, - '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'audio_bitrate': 128, 'vcodec': 'vp8'}, - '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'audio_bitrate': 128, 'vcodec': 'vp8'}, - '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'audio_bitrate': 192, 'vcodec': 'vp8'}, - '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'audio_bitrate': 192, 'vcodec': 'vp8'}, - '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'}, - '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'}, - - - # 3D videos - '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'}, - '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'}, - '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'}, - '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'}, - '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'audio_bitrate': 128, 'vcodec': 'vp8'}, - '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'audio_bitrate': 192, 'vcodec': 'vp8'}, - '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'audio_bitrate': 192, 'vcodec': 'vp8'}, - - # Apple HTTP Live Streaming - '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 48, 'vcodec': 'h264'}, - '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 48, 'vcodec': 'h264'}, - '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'}, - '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'}, - '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 256, 'vcodec': 'h264'}, - '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 256, 'vcodec': 'h264'}, - '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 48, 'vcodec': 
'h264'}, - '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 24, 'vcodec': 'h264'}, - - # DASH mp4 video - '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559) - '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, - '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, - '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'}, - - # Dash mp4 audio - '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'audio_bitrate': 48, 'container': 'm4a_dash'}, - '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'audio_bitrate': 128, 'container': 'm4a_dash'}, - '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'audio_bitrate': 256, 'container': 'm4a_dash'}, - '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, - '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, - '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'}, - '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'}, - - # Dash webm - '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'}, - '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '271': {'ext': 'webm', 'height': 1440, 
'format_note': 'DASH video', 'vcodec': 'vp9'}, - # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug) - '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, - '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, - '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, - '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, - - # Dash webm audio - '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'audio_bitrate': 128}, - '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'audio_bitrate': 256}, - - # Dash webm audio with opus inside - '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'audio_bitrate': 50}, - '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'audio_bitrate': 70}, - '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'audio_bitrate': 160}, - - # RTMP (unnamed) - '_rtmp': {'protocol': 'rtmp'}, - - # av01 video only formats sometimes served with "unknown" codecs - '394': {'vcodec': 'av01.0.05M.08'}, - '395': {'vcodec': 'av01.0.05M.08'}, - '396': {'vcodec': 'av01.0.05M.08'}, - '397': {'vcodec': 'av01.0.05M.08'}, -} - -def get(object, key, default=None, types=()): - '''Like dict.get(), but returns default if the result doesn't match one of the types. - Also works for indexing lists.''' - try: - result = object[key] - except (TypeError, IndexError, KeyError): - return default - - if not types or isinstance(result, types): - return result - else: - return default - -def multi_get(object, *keys, default=None, types=()): - '''Like get, but try other keys if the first fails''' - for key in keys: - try: - result = object[key] - except (TypeError, IndexError, KeyError): - pass - else: - if not types or isinstance(result, types): - return result - else: - continue - return default - - -def deep_get(object, *keys, default=None, types=()): - '''Like dict.get(), but for nested dictionaries/sequences, supporting keys or indices. - Last argument is the default value to use in case of any IndexErrors or KeyErrors. - If types is given and the result doesn't match one of those types, default is returned''' - try: - for key in keys: - object = object[key] - except (TypeError, IndexError, KeyError): - return default - else: - if not types or isinstance(object, types): - return object - else: - return default - -def multi_deep_get(object, *key_sequences, default=None, types=()): - '''Like deep_get, but can try different key sequences in case one fails. - Return default if all of them fail. key_sequences is a list of lists''' - for key_sequence in key_sequences: - _object = object - try: - for key in key_sequence: - _object = _object[key] - except (TypeError, IndexError, KeyError): - pass - else: - if not types or isinstance(_object, types): - return _object - else: - continue - return default - -def liberal_update(obj, key, value): - '''Updates obj[key] with value as long as value is not None. 
- Ensures obj[key] will at least get a value of None, however''' - if (value is not None) or (key not in obj): - obj[key] = value - -def conservative_update(obj, key, value): - '''Only updates obj if it doesn't have key or obj[key] is None''' - if obj.get(key) is None: - obj[key] = value - -def remove_redirect(url): - if re.fullmatch(r'(((https?:)?//)?(www.)?youtube.com)?/redirect\?.*', url) is not None: # youtube puts these on external links to do tracking - query_string = url[url.find('?')+1: ] - return urllib.parse.parse_qs(query_string)['q'][0] - return url - -def _recover_urls(runs): - for run in runs: - url = deep_get(run, 'navigationEndpoint', 'urlEndpoint', 'url') - text = run.get('text', '') - # second condition is necessary because youtube makes other things into urls, such as hashtags, which we want to keep as text - if url is not None and (text.startswith('http://') or text.startswith('https://')): - url = remove_redirect(url) - run['url'] = url - run['text'] = url # youtube truncates the url text, use actual url instead - -def extract_str(node, default=None, recover_urls=False): - '''default is the value returned if the extraction fails. If recover_urls is true, will attempt to fix Youtube's truncation of url text (most prominently seen in descriptions)''' - if isinstance(node, str): - return node - - try: - return node['simpleText'] - except (KeyError, TypeError): - pass - - if isinstance(node, dict) and 'runs' in node: - if recover_urls: - _recover_urls(node['runs']) - return ''.join(text_run.get('text', '') for text_run in node['runs']) - - return default - -def extract_formatted_text(node): - if not node: - return [] - if 'runs' in node: - _recover_urls(node['runs']) - return node['runs'] - elif 'simpleText' in node: - return [{'text': node['simpleText']}] - return [] - -def extract_int(string, default=None): - if isinstance(string, int): - return string - if not isinstance(string, str): - string = extract_str(string) - if not string: - return default - match = re.search(r'(\d+)', string.replace(',', '')) - if match is None: - return default - try: - return int(match.group(1)) - except ValueError: - return default - -def extract_approx_int(string): - '''e.g. 
"15M" from "15M subscribers"''' - if not isinstance(string, str): - string = extract_str(string) - if not string: - return None - match = re.search(r'(\d+[KMBTkmbt])', string.replace(',', '')) - if match is None: - return None - return match.group(1) - -youtube_url_re = re.compile(r'^(?:(?:(?:https?:)?//)?(?:www\.)?youtube\.com)?(/.*)$') -def normalize_url(url): - if url is None: - return None - match = youtube_url_re.fullmatch(url) - if match is None: - raise Exception() - - return 'https://www.youtube.com' + match.group(1) - -def prefix_urls(item): - try: - item['thumbnail'] = util.prefix_url(item['thumbnail']) - except KeyError: - pass - - try: - item['author_url'] = util.prefix_url(item['author_url']) - except KeyError: - pass - -def add_extra_html_info(item): - if item['type'] == 'video': - item['url'] = (util.URL_ORIGIN + '/watch?v=' + item['id']) if item.get('id') else None - - video_info = {} - for key in ('id', 'title', 'author', 'duration'): - try: - video_info[key] = item[key] - except KeyError: - video_info[key] = '' - - item['video_info'] = json.dumps(video_info) - - elif item['type'] == 'playlist': - item['url'] = (util.URL_ORIGIN + '/playlist?list=' + item['id']) if item.get('id') else None - elif item['type'] == 'channel': - item['url'] = (util.URL_ORIGIN + "/channel/" + item['id']) if item.get('id') else None - -def extract_item_info(item, additional_info={}): - if not item: - return {'error': 'No item given'} - - type = get(list(item.keys()), 0) - if not type: - return {'error': 'Could not find type'} - item = item[type] - - info = {'error': None} - if type in ('itemSectionRenderer', 'compactAutoplayRenderer'): - return extract_item_info(deep_get(item, 'contents', 0), additional_info) - - if type in ('movieRenderer', 'clarificationRenderer'): - info['type'] = 'unsupported' - return info - - info.update(additional_info) - - # type looks like e.g. 
'compactVideoRenderer' or 'gridVideoRenderer' - # camelCase split, https://stackoverflow.com/a/37697078 - type_parts = [s.lower() for s in re.sub(r'([A-Z][a-z]+)', r' \1', type).split()] - if len(type_parts) < 2: - info['type'] = 'unsupported' - return - primary_type = type_parts[-2] - if primary_type == 'video': - info['type'] = 'video' - elif primary_type in ('playlist', 'radio', 'show'): - info['type'] = 'playlist' - elif primary_type == 'channel': - info['type'] = 'channel' - else: - info['type'] = 'unsupported' - - info['title'] = extract_str(item.get('title')) - info['author'] = extract_str(multi_get(item, 'longBylineText', 'shortBylineText', 'ownerText')) - info['author_id'] = extract_str(multi_deep_get(item, - ['longBylineText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'], - ['shortBylineText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'], - ['ownerText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'] - )) - info['author_url'] = ('https://www.youtube.com/channel/' + info['author_id']) if info['author_id'] else None - info['description'] = extract_formatted_text(multi_get(item, 'descriptionSnippet', 'descriptionText')) - info['thumbnail'] = multi_deep_get(item, - ['thumbnail', 'thumbnails', 0, 'url'], # videos - ['thumbnails', 0, 'thumbnails', 0, 'url'], # playlists - ['thumbnailRenderer', 'showCustomThumbnailRenderer', 'thumbnail', 'thumbnails', 0, 'url'], # shows - ) - - info['badges'] = [] - for badge_node in multi_get(item, 'badges', 'ownerBadges', default=()): - badge = deep_get(badge_node, 'metadataBadgeRenderer', 'label') - if badge: - info['badges'].append(badge) - - if primary_type in ('video', 'playlist'): - info['time_published'] = extract_str(item.get('publishedTimeText')) - - if primary_type == 'video': - info['id'] = item.get('videoId') - info['view_count'] = extract_int(item.get('viewCountText')) - - # dig into accessibility data to get view_count for videos marked as recommended, and to get time_published - accessibility_label = deep_get(item, 'title', 'accessibility', 'accessibilityData', 'label', default='') - timestamp = re.search(r'(\d+ \w+ ago)', accessibility_label) - if timestamp: - conservative_update(info, 'time_published', timestamp.group(1)) - view_count = re.search(r'(\d+) views', accessibility_label.replace(',', '')) - if view_count: - conservative_update(info, 'view_count', int(view_count.group(1))) - - if info['view_count']: - info['approx_view_count'] = '{:,}'.format(info['view_count']) - else: - info['approx_view_count'] = extract_approx_int(multi_get(item, 'shortViewCountText')) - info['duration'] = extract_str(item.get('lengthText')) - elif primary_type == 'playlist': - info['id'] = item.get('playlistId') - info['video_count'] = extract_int(item.get('videoCount')) - elif primary_type == 'channel': - info['id'] = item.get('channelId') - info['approx_subscriber_count'] = extract_approx_int(item.get('subscriberCountText')) - elif primary_type == 'show': - info['id'] = deep_get(item, 'navigationEndpoint', 'watchEndpoint', 'playlistId') - - if primary_type in ('playlist', 'channel'): - conservative_update(info, 'video_count', extract_int(item.get('videoCountText'))) - - for overlay in item.get('thumbnailOverlays', []): - conservative_update(info, 'duration', extract_str(deep_get( - overlay, 'thumbnailOverlayTimeStatusRenderer', 'text' - ))) - # show renderers don't have videoCountText - conservative_update(info, 'video_count', extract_int(deep_get( - overlay, 'thumbnailOverlayBottomPanelRenderer', 
'text' - ))) - return info - -def parse_info_prepare_for_html(renderer, additional_info={}): - item = extract_item_info(renderer, additional_info) - prefix_urls(item) - add_extra_html_info(item) - - return item - -def extract_response(polymer_json): - '''return response, error''' - response = multi_deep_get(polymer_json, [1, 'response'], ['response'], default=None, types=dict) - if response is None: - return None, 'Failed to extract response' - else: - return response, None - - -list_types = { - 'sectionListRenderer', - 'itemSectionRenderer', - 'gridRenderer', - 'playlistVideoListRenderer', -} - -item_types = { - 'movieRenderer', - 'didYouMeanRenderer', - 'showingResultsForRenderer', - - 'videoRenderer', - 'compactVideoRenderer', - 'compactAutoplayRenderer', - 'gridVideoRenderer', - 'playlistVideoRenderer', - - 'playlistRenderer', - 'compactPlaylistRenderer', - 'gridPlaylistRenderer', - - 'radioRenderer', - 'compactRadioRenderer', - 'gridRadioRenderer', - - 'showRenderer', - 'compactShowRenderer', - 'gridShowRenderer', - - - 'channelRenderer', - 'compactChannelRenderer', - 'gridChannelRenderer', - - 'channelAboutFullMetadataRenderer', -} - -def traverse_browse_renderer(renderer): - for tab in get(renderer, 'tabs', (), types=(list, tuple)): - tab_renderer = multi_deep_get(tab, ['tabRenderer'], ['expandableTabRenderer'], default=None, types=dict) - if tab_renderer is None: - continue - if tab_renderer.get('selected', False): - return get(tab_renderer, 'content', {}, types=(dict)) - print('Could not find tab with content') - return {} - -def traverse_standard_list(renderer): - renderer_list = multi_deep_get(renderer, ['contents'], ['items'], default=(), types=(list, tuple)) - continuation = deep_get(renderer, 'continuations', 0, 'nextContinuationData', 'continuation') - return renderer_list, continuation - -# these renderers contain one inside them -nested_renderer_dispatch = { - 'singleColumnBrowseResultsRenderer': traverse_browse_renderer, - 'twoColumnBrowseResultsRenderer': traverse_browse_renderer, - 'twoColumnSearchResultsRenderer': lambda renderer: get(renderer, 'primaryContents', {}, types=dict), -} - -# these renderers contain a list of renderers in side them -nested_renderer_list_dispatch = { - 'sectionListRenderer': traverse_standard_list, - 'itemSectionRenderer': traverse_standard_list, - 'gridRenderer': traverse_standard_list, - 'playlistVideoListRenderer': traverse_standard_list, - 'singleColumnWatchNextResults': lambda r: (deep_get(r, 'results', 'results', 'contents', default=[], types=(list, tuple)), None), -} - -def extract_items(response, item_types=item_types): - '''return items, ctoken''' - if 'continuationContents' in response: - # always has just the one [something]Continuation key, but do this just in case they add some tracking key or something - for key, renderer_continuation in get(response, 'continuationContents', {}, types=dict).items(): - if key.endswith('Continuation'): # e.g. 
commentSectionContinuation, playlistVideoListContinuation - items = multi_deep_get(renderer_continuation, ['contents'], ['items'], default=[], types=(list, tuple)) - ctoken = deep_get(renderer_continuation, 'continuations', 0, 'nextContinuationData', 'continuation', default=None, types=str) - return items, ctoken - return [], None - elif 'contents' in response: - ctoken = None - items = [] - - iter_stack = collections.deque() - current_iter = iter(()) - - renderer = get(response, 'contents', {}, types=dict) - - while True: - # mode 1: dig into the current renderer - # Will stay in mode 1 (via continue) if a new renderer is found inside this one - # Otherwise, after finding that it is an item renderer, - # contains a list, or contains nothing, - # falls through into mode 2 to get a new renderer - if len(renderer) != 0: - key, value = list(renderer.items())[0] - - # has a list in it, add it to the iter stack - if key in nested_renderer_list_dispatch: - renderer_list, continuation = nested_renderer_list_dispatch[key](value) - if renderer_list: - iter_stack.append(current_iter) - current_iter = iter(renderer_list) - if continuation: - ctoken = continuation - - # new renderer nested inside this one - elif key in nested_renderer_dispatch: - renderer = nested_renderer_dispatch[key](value) - continue # back to mode 1 - - # the renderer is an item - elif key in item_types: - items.append(renderer) - - - # mode 2: get a new renderer by iterating. - # goes up the stack for an iterator if one has been exhausted - while current_iter is not None: - try: - renderer = current_iter.__next__() - break - except StopIteration: - try: - current_iter = iter_stack.pop() # go back up the stack - except IndexError: - return items, ctoken - - else: - return [], None - -def extract_channel_info(polymer_json, tab): - response, err = extract_response(polymer_json) - if err: - return {'error': err} - - try: - microformat = response['microformat']['microformatDataRenderer'] - - # channel doesn't exist or was terminated - # example terminated channel: https://www.youtube.com/channel/UCnKJeK_r90jDdIuzHXC0Org - except KeyError: - if 'alerts' in response and len(response['alerts']) > 0: - return {'error': ' '.join(alert['alertRenderer']['text']['simpleText'] for alert in response['alerts']) } - elif 'errors' in response['responseContext']: - for error in response['responseContext']['errors']['error']: - if error['code'] == 'INVALID_VALUE' and error['location'] == 'browse_id': - return {'error': 'This channel does not exist'} - return {'error': 'Failure getting microformat'} - - info = {'error': None} - info['current_tab'] = tab - - - # stuff from microformat (info given by youtube for every page on channel) - info['short_description'] = microformat['description'] - info['channel_name'] = microformat['title'] - info['avatar'] = microformat['thumbnail']['thumbnails'][0]['url'] - channel_url = microformat['urlCanonical'].rstrip('/') - channel_id = channel_url[channel_url.rfind('/')+1:] - info['channel_id'] = channel_id - info['channel_url'] = 'https://www.youtube.com/channel/' + channel_id - - info['items'] = [] - - # empty channel - if 'contents' not in response and 'continuationContents' not in response: - return info - - - items, _ = extract_items(response) - if tab in ('videos', 'playlists', 'search'): - additional_info = {'author': info['channel_name'], 'author_url': 'https://www.youtube.com/channel/' + channel_id} - info['items'] = [extract_item_info(renderer, additional_info) for renderer in items] - - elif tab == 
'about': - for item in items: - try: - channel_metadata = item['channelAboutFullMetadataRenderer'] - break - except KeyError: - pass - else: - info['error'] = 'Could not find channelAboutFullMetadataRenderer' - return info - - info['links'] = [] - for link_json in channel_metadata.get('primaryLinks', ()): - url = remove_redirect(link_json['navigationEndpoint']['urlEndpoint']['url']) - - text = extract_str(link_json['title']) - - info['links'].append( (text, url) ) - - - info['stats'] = [] - for stat_name in ('subscriberCountText', 'joinedDateText', 'viewCountText', 'country'): - try: - stat = channel_metadata[stat_name] - except KeyError: - continue - info['stats'].append(extract_str(stat)) - - if 'description' in channel_metadata: - info['description'] = extract_str(channel_metadata['description']) - else: - info['description'] = '' - - else: - raise NotImplementedError('Unknown or unsupported channel tab: ' + tab) - - return info - -def extract_search_info(polymer_json): - response, err = extract_response(polymer_json) - if err: - return {'error': err} - info = {'error': None} - info['estimated_results'] = int(response['estimatedResults']) - info['estimated_pages'] = ceil(info['estimated_results']/20) - - - results, _ = extract_items(response) - - - info['items'] = [] - info['corrections'] = {'type': None} - for renderer in results: - type = list(renderer.keys())[0] - if type == 'shelfRenderer': - continue - if type == 'didYouMeanRenderer': - renderer = renderer[type] - - info['corrections'] = { - 'type': 'did_you_mean', - 'corrected_query': renderer['correctedQueryEndpoint']['searchEndpoint']['query'], - 'corrected_query_text': renderer['correctedQuery']['runs'], - } - continue - if type == 'showingResultsForRenderer': - renderer = renderer[type] - - info['corrections'] = { - 'type': 'showing_results_for', - 'corrected_query_text': renderer['correctedQuery']['runs'], - 'original_query_text': renderer['originalQuery']['simpleText'], - } - continue - - i_info = extract_item_info(renderer) - if i_info.get('type') != 'unsupported': - info['items'].append(i_info) - - - return info - -def extract_playlist_metadata(polymer_json): - response, err = extract_response(polymer_json) - if err: - return {'error': err} - - metadata = {'error': None} - header = deep_get(response, 'header', 'playlistHeaderRenderer', default={}) - metadata['title'] = extract_str(header.get('title')) - - metadata['first_video_id'] = deep_get(header, 'playEndpoint', 'watchEndpoint', 'videoId') - first_id = re.search(r'([a-z_\-]{11})', deep_get(header, - 'thumbnail', 'thumbnails', 0, 'url', default='')) - if first_id: - conservative_update(metadata, 'first_video_id', first_id.group(1)) - if metadata['first_video_id'] is None: - metadata['thumbnail'] = None - else: - metadata['thumbnail'] = 'https://i.ytimg.com/vi/' + metadata['first_video_id'] + '/mqdefault.jpg' - - metadata['video_count'] = extract_int(header.get('numVideosText')) - metadata['description'] = extract_str(header.get('descriptionText'), default='') - metadata['author'] = extract_str(header.get('ownerText')) - metadata['author_id'] = multi_deep_get(header, - ['ownerText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'], - ['ownerEndpoint', 'browseEndpoint', 'browseId']) - if metadata['author_id']: - metadata['author_url'] = 'https://www.youtube.com/channel/' + metadata['author_id'] - else: - metadata['author_url'] = None - metadata['view_count'] = extract_int(header.get('viewCountText')) - metadata['like_count'] = 
extract_int(header.get('likesCountWithoutLikeText')) - for stat in header.get('stats', ()): - text = extract_str(stat) - if 'videos' in text: - conservative_update(metadata, 'video_count', extract_int(text)) - elif 'views' in text: - conservative_update(metadata, 'view_count', extract_int(text)) - elif 'updated' in text: - metadata['time_published'] = extract_date(text) - - return metadata - -def extract_playlist_info(polymer_json): - response, err = extract_response(polymer_json) - if err: - return {'error': err} - info = {'error': None} - first_page = 'continuationContents' not in response - video_list, _ = extract_items(response) - - info['items'] = [extract_item_info(renderer) for renderer in video_list] - - if first_page: - info['metadata'] = extract_playlist_metadata(polymer_json) - - return info - -def ctoken_metadata(ctoken): - result = dict() - params = proto.parse(proto.b64_to_bytes(ctoken)) - result['video_id'] = proto.parse(params[2])[2].decode('ascii') - - offset_information = proto.parse(params[6]) - result['offset'] = offset_information.get(5, 0) - - result['is_replies'] = False - if (3 in offset_information) and (2 in proto.parse(offset_information[3])): - result['is_replies'] = True - result['sort'] = None - else: - try: - result['sort'] = proto.parse(offset_information[4])[6] - except KeyError: - result['sort'] = 0 - return result - -def extract_comments_info(polymer_json): - response, err = extract_response(polymer_json) - if err: - return {'error': err} - info = {'error': None} - - url = multi_deep_get(polymer_json, [1, 'url'], ['url']) - if url: - ctoken = urllib.parse.parse_qs(url[url.find('?')+1:])['ctoken'][0] - metadata = ctoken_metadata(ctoken) - else: - metadata = {} - info['video_id'] = metadata.get('video_id') - info['offset'] = metadata.get('offset') - info['is_replies'] = metadata.get('is_replies') - info['sort'] = metadata.get('sort') - info['video_title'] = None - - comments, ctoken = extract_items(response) - info['comments'] = [] - info['ctoken'] = ctoken - for comment in comments: - comment_info = {} - - if 'commentThreadRenderer' in comment: # top level comments - conservative_update(info, 'is_replies', False) - comment_thread = comment['commentThreadRenderer'] - info['video_title'] = extract_str(comment_thread.get('commentTargetTitle')) - if 'replies' not in comment_thread: - comment_info['reply_count'] = 0 - else: - comment_info['reply_count'] = extract_int(deep_get(comment_thread, - 'replies', 'commentRepliesRenderer', 'moreText' - ), default=1) # With 1 reply, the text reads "View reply" - comment_renderer = deep_get(comment_thread, 'comment', 'commentRenderer', default={}) - elif 'commentRenderer' in comment: # replies - comment_info['reply_count'] = 0 # replyCount, below, not present for replies even if the reply has further replies to it - conservative_update(info, 'is_replies', True) - comment_renderer = comment['commentRenderer'] - else: - comment_renderer = {} - - # These 3 are sometimes absent, likely because the channel was deleted - comment_info['author'] = extract_str(comment_renderer.get('authorText')) - comment_info['author_url'] = deep_get(comment_renderer, - 'authorEndpoint', 'commandMetadata', 'webCommandMetadata', 'url') - comment_info['author_id'] = deep_get(comment_renderer, - 'authorEndpoint', 'browseEndpoint', 'browseId') - - comment_info['author_avatar'] = deep_get(comment_renderer, - 'authorThumbnail', 'thumbnails', 0, 'url') - comment_info['id'] = comment_renderer.get('commentId') - comment_info['text'] = 
extract_formatted_text(comment_renderer.get('contentText')) - comment_info['time_published'] = extract_str(comment_renderer.get('publishedTimeText')) - comment_info['like_count'] = comment_renderer.get('likeCount') - liberal_update(comment_info, 'reply_count', comment_renderer.get('replyCount')) - - info['comments'].append(comment_info) - - return info - -def check_missing_keys(object, *key_sequences): - for key_sequence in key_sequences: - _object = object - try: - for key in key_sequence: - _object = _object[key] - except (KeyError, IndexError, TypeError): - return 'Could not find ' + key - - return None - -def extract_metadata_row_info(video_renderer_info): - # extract category and music list - info = { - 'category': None, - 'music_list': [], - } - - current_song = {} - for row in deep_get(video_renderer_info, 'metadataRowContainer', 'metadataRowContainerRenderer', 'rows', default=[]): - row_title = extract_str(deep_get(row, 'metadataRowRenderer', 'title'), default='') - row_content = extract_str(deep_get(row, 'metadataRowRenderer', 'contents', 0)) - if row_title == 'Category': - info['category'] = row_content - elif row_title in ('Song', 'Music'): - if current_song: - info['music_list'].append(current_song) - current_song = {'title': row_content} - elif row_title == 'Artist': - current_song['artist'] = row_content - elif row_title == 'Album': - current_song['album'] = row_content - elif row_title == 'Writers': - current_song['writers'] = row_content - elif row_title.startswith('Licensed'): - current_song['licensor'] = row_content - if current_song: - info['music_list'].append(current_song) - - return info - -def extract_date(date_text): - if date_text is None: - return None - - date_text = date_text.replace(',', '').lower() - parts = date_text.split() - if len(parts) >= 3: - month, day, year = parts[-3:] - month = month_abbreviations.get(month[0:3]) # slicing in case they start writing out the full month name - if month and (re.fullmatch(r'\d\d?', day) is not None) and (re.fullmatch(r'\d{4}', year) is not None): - return year + '-' + month + '-' + day - -def extract_watch_info_mobile(top_level): - info = {} - microformat = deep_get(top_level, 'playerResponse', 'microformat', 'playerMicroformatRenderer', default={}) - - family_safe = microformat.get('isFamilySafe') - if family_safe is None: - info['age_restricted'] = None - else: - info['age_restricted'] = not family_safe - info['allowed_countries'] = microformat.get('availableCountries', []) - info['time_published'] = microformat.get('publishDate') - - response = top_level.get('response', {}) - - # video info from metadata renderers - items, _ = extract_items(response, item_types={'slimVideoMetadataRenderer'}) - if items: - video_info = items[0]['slimVideoMetadataRenderer'] - else: - print('Failed to extract video metadata') - video_info = {} - - info.update(extract_metadata_row_info(video_info)) - info['description'] = extract_str(video_info.get('description'), recover_urls=True) - info['view_count'] = extract_int(extract_str(video_info.get('expandedSubtitle'))) - info['author'] = extract_str(deep_get(video_info, 'owner', 'slimOwnerRenderer', 'title')) - info['author_id'] = deep_get(video_info, 'owner', 'slimOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId') - info['title'] = extract_str(video_info.get('title')) - info['live'] = 'watching' in extract_str(video_info.get('expandedSubtitle'), default='') - info['unlisted'] = False - for badge in video_info.get('badges', []): - if deep_get(badge, 'metadataBadgeRenderer', 
'label') == 'Unlisted': - info['unlisted'] = True - info['like_count'] = None - info['dislike_count'] = None - if not info['time_published']: - info['time_published'] = extract_date(extract_str(video_info.get('dateText', None))) - for button in video_info.get('buttons', ()): - button_renderer = button.get('slimMetadataToggleButtonRenderer', {}) - - # all the digits can be found in the accessibility data - count = extract_int(deep_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText', 'accessibility', 'accessibilityData', 'label')) - - # this count doesn't have all the digits, it's like 53K for instance - dumb_count = extract_int(extract_str(deep_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText'))) - - # the accessibility text will be "No likes" or "No dislikes" or something like that, but dumb count will be 0 - if dumb_count == 0: - count = 0 - - if 'isLike' in button_renderer: - info['like_count'] = count - elif 'isDislike' in button_renderer: - info['dislike_count'] = count - - # comment section info - items, _ = extract_items(response, item_types={'commentSectionRenderer'}) - if items: - comment_info = items[0]['commentSectionRenderer'] - comment_count_text = extract_str(deep_get(comment_info, 'header', 'commentSectionHeaderRenderer', 'countText')) - if comment_count_text == 'Comments': # just this with no number, means 0 comments - info['comment_count'] = 0 - else: - info['comment_count'] = extract_int(comment_count_text) - info['comments_disabled'] = False - else: # no comment section present means comments are disabled - info['comment_count'] = 0 - info['comments_disabled'] = True - - # check for limited state - items, _ = extract_items(response, item_types={'limitedStateMessageRenderer'}) - if items: - info['limited_state'] = True - else: - info['limited_state'] = False - - # related videos - related, _ = extract_items(response) - info['related_videos'] = [extract_item_info(renderer) for renderer in related] - - return info - -month_abbreviations = {'jan':'1', 'feb':'2', 'mar':'3', 'apr':'4', 'may':'5', 'jun':'6', 'jul':'7', 'aug':'8', 'sep':'9', 'oct':'10', 'nov':'11', 'dec':'12'} -def extract_watch_info_desktop(top_level): - info = { - 'comment_count': None, - 'comments_disabled': None, - 'allowed_countries': None, - 'limited_state': None, - } - - video_info = {} - for renderer in deep_get(top_level, 'response', 'contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', default=()): - if renderer and list(renderer.keys())[0] in ('videoPrimaryInfoRenderer', 'videoSecondaryInfoRenderer'): - video_info.update(list(renderer.values())[0]) - - info.update(extract_metadata_row_info(video_info)) - info['description'] = extract_str(video_info.get('description', None), recover_urls=True) - info['time_published'] = extract_date(extract_str(video_info.get('dateText', None))) - - likes_dislikes = deep_get(video_info, 'sentimentBar', 'sentimentBarRenderer', 'tooltip', default='').split('/') - if len(likes_dislikes) == 2: - info['like_count'] = extract_int(likes_dislikes[0]) - info['dislike_count'] = extract_int(likes_dislikes[1]) - else: - info['like_count'] = None - info['dislike_count'] = None - - info['title'] = extract_str(video_info.get('title', None)) - info['author'] = extract_str(deep_get(video_info, 'owner', 'videoOwnerRenderer', 'title')) - info['author_id'] = deep_get(video_info, 'owner', 'videoOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId') - info['view_count'] = extract_int(extract_str(deep_get(video_info, 
'viewCount', 'videoViewCountRenderer', 'viewCount'))) - - related = deep_get(top_level, 'response', 'contents', 'twoColumnWatchNextResults', 'secondaryResults', 'secondaryResults', 'results', default=[]) - info['related_videos'] = [extract_item_info(renderer) for renderer in related] - - return info - -def get_caption_url(info, language, format, automatic=False, translation_language=None): - '''Gets the url for captions with the given language and format. If automatic is True, get the automatic captions for that language. If translation_language is given, translate the captions from `language` to `translation_language`. If automatic is true and translation_language is given, the automatic captions will be translated.''' - url = info['_captions_base_url'] - url += '&lang=' + language - url += '&fmt=' + format - if automatic: - url += '&kind=asr' - elif language in info['_manual_caption_language_names']: - url += '&name=' + urllib.parse.quote(info['_manual_caption_language_names'][language], safe='') - - if translation_language: - url += '&tlang=' + translation_language - return url - -def extract_formats(info, player_response): - streaming_data = player_response.get('streamingData', {}) - yt_formats = streaming_data.get('formats', []) + streaming_data.get('adaptiveFormats', []) - - info['formats'] = [] - - for yt_fmt in yt_formats: - fmt = {} - fmt['ext'] = None - fmt['audio_bitrate'] = None - fmt['acodec'] = None - fmt['vcodec'] = None - fmt['width'] = yt_fmt.get('width') - fmt['height'] = yt_fmt.get('height') - fmt['file_size'] = yt_fmt.get('contentLength') - fmt['audio_sample_rate'] = yt_fmt.get('audioSampleRate') - fmt['fps'] = yt_fmt.get('fps') - cipher = dict(urllib.parse.parse_qsl(yt_fmt.get('cipher', ''))) - if cipher: - fmt['url'] = cipher.get('url') - else: - fmt['url'] = yt_fmt.get('url') - fmt['s'] = cipher.get('s') - fmt['sp'] = cipher.get('sp') - fmt.update(_formats.get(str(yt_fmt.get('itag')), {})) - - info['formats'].append(fmt) - -def extract_playability_error(info, player_response, error_prefix=''): - if info['formats']: - info['playability_status'] = None - info['playability_error'] = None - return - - playability_status = deep_get(player_response, 'playabilityStatus', 'status', default=None) - info['playability_status'] = playability_status - - playability_reason = extract_str(multi_deep_get(player_response, - ['playabilityStatus', 'reason'], - ['playabilityStatus', 'errorScreen', 'playerErrorMessageRenderer', 'reason'], - default='Could not find playability error') - ) - - if playability_status not in (None, 'OK'): - info['playability_error'] = error_prefix + playability_reason - else: - info['playability_error'] = error_prefix + 'Unknown playability error' - -SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt') -def extract_watch_info(polymer_json): - info = {'playability_error': None, 'error': None} - - if isinstance(polymer_json, dict): - top_level = polymer_json - elif isinstance(polymer_json, (list, tuple)): - top_level = {} - for page_part in polymer_json: - if not isinstance(page_part, dict): - return {'error': 'Invalid page part'} - top_level.update(page_part) - else: - return {'error': 'Invalid top level polymer data'} - - error = check_missing_keys(top_level, - ['player', 'args'], - ['player', 'assets', 'js'], - ['playerResponse'], - ) - if error: - info['playability_error'] = error - - player_args = deep_get(top_level, 'player', 'args', default={}) - player_response = json.loads(player_args['player_response']) if 'player_response' in player_args else {} 
- - # captions - info['automatic_caption_languages'] = [] - info['manual_caption_languages'] = [] - info['_manual_caption_language_names'] = {} # language name written in that language, needed in some cases to create the url - info['translation_languages'] = [] - captions_info = player_response.get('captions', {}) - info['_captions_base_url'] = normalize_url(deep_get(captions_info, 'playerCaptionsRenderer', 'baseUrl')) - for caption_track in deep_get(captions_info, 'playerCaptionsTracklistRenderer', 'captionTracks', default=()): - lang_code = caption_track.get('languageCode') - if not lang_code: - continue - if caption_track.get('kind') == 'asr': - info['automatic_caption_languages'].append(lang_code) - else: - info['manual_caption_languages'].append(lang_code) - base_url = caption_track.get('baseUrl', '') - lang_name = deep_get(urllib.parse.parse_qs(urllib.parse.urlparse(base_url).query), 'name', 0) - if lang_name: - info['_manual_caption_language_names'][lang_code] = lang_name - - for translation_lang_info in deep_get(captions_info, 'playerCaptionsTracklistRenderer', 'translationLanguages', default=()): - lang_code = translation_lang_info.get('languageCode') - if lang_code: - info['translation_languages'].append(lang_code) - if translation_lang_info.get('isTranslatable') == False: - print('WARNING: Found non-translatable caption language') - - # formats - extract_formats(info, player_response) - - # playability errors - extract_playability_error(info, player_response) - - # check age-restriction - info['age_restricted'] = (info['playability_status'] == 'LOGIN_REQUIRED' and info['playability_error'] and ' age' in info['playability_error']) - - # base_js (for decryption of signatures) - info['base_js'] = deep_get(top_level, 'player', 'assets', 'js') - if info['base_js']: - info['base_js'] = normalize_url(info['base_js']) - - mobile = 'singleColumnWatchNextResults' in deep_get(top_level, 'response', 'contents', default={}) - if mobile: - info.update(extract_watch_info_mobile(top_level)) - else: - info.update(extract_watch_info_desktop(top_level)) - - # stuff from videoDetails. 
Use liberal_update to prioritize info from videoDetails over existing info - vd = deep_get(top_level, 'playerResponse', 'videoDetails', default={}) - liberal_update(info, 'title', extract_str(vd.get('title'))) - liberal_update(info, 'duration', extract_int(vd.get('lengthSeconds'))) - liberal_update(info, 'view_count', extract_int(vd.get('viewCount'))) - # videos with no description have a blank string - liberal_update(info, 'description', vd.get('shortDescription')) - liberal_update(info, 'id', vd.get('videoId')) - liberal_update(info, 'author', vd.get('author')) - liberal_update(info, 'author_id', vd.get('channelId')) - liberal_update(info, 'live', vd.get('isLiveContent')) - conservative_update(info, 'unlisted', not vd.get('isCrawlable', True)) #isCrawlable is false on limited state videos even if they aren't unlisted - liberal_update(info, 'tags', vd.get('keywords', [])) - - # fallback stuff from microformat - mf = deep_get(top_level, 'playerResponse', 'microformat', 'playerMicroformatRenderer', default={}) - conservative_update(info, 'title', extract_str(mf.get('title'))) - conservative_update(info, 'duration', extract_int(mf.get('lengthSeconds'))) - # this gives the view count for limited state videos - conservative_update(info, 'view_count', extract_int(mf.get('viewCount'))) - conservative_update(info, 'description', extract_str(mf.get('description'), recover_urls=True)) - conservative_update(info, 'author', mf.get('ownerChannelName')) - conservative_update(info, 'author_id', mf.get('externalChannelId')) - liberal_update(info, 'unlisted', mf.get('isUnlisted')) - liberal_update(info, 'category', mf.get('category')) - liberal_update(info, 'time_published', mf.get('publishDate')) - liberal_update(info, 'time_uploaded', mf.get('uploadDate')) - - # other stuff - info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None - return info - -def update_with_age_restricted_info(info, video_info_page): - ERROR_PREFIX = 'Error bypassing age-restriction: ' - - video_info = urllib.parse.parse_qs(video_info_page) - player_response = deep_get(video_info, 'player_response', 0) - if player_response is None: - info['playability_error'] = ERROR_PREFIX + 'Could not find player_response in video_info_page' - return - try: - player_response = json.loads(player_response) - except json.decoder.JSONDecodeError: - traceback.print_exc() - info['playability_error'] = ERROR_PREFIX + 'Failed to parse json response' - return - - extract_formats(info, player_response) - extract_playability_error(info, player_response, error_prefix=ERROR_PREFIX) diff --git a/youtube/yt_data_extract/__init__.py b/youtube/yt_data_extract/__init__.py new file mode 100644 index 0000000..f2a93a9 --- /dev/null +++ b/youtube/yt_data_extract/__init__.py @@ -0,0 +1,11 @@ +from .common import (get, multi_get, deep_get, multi_deep_get, + liberal_update, conservative_update, remove_redirect, normalize_url, + extract_str, extract_formatted_text, extract_int, extract_approx_int, + extract_date, extract_item_info, extract_items, extract_response, + prefix_urls, add_extra_html_info, parse_info_prepare_for_html) + +from .everything_else import (extract_channel_info, extract_search_info, + extract_playlist_metadata, extract_playlist_info, extract_comments_info) + +from .watch_extraction import (extract_watch_info, get_caption_url, + update_with_age_restricted_info) diff --git a/youtube/yt_data_extract/common.py b/youtube/yt_data_extract/common.py new file mode 100644 index 0000000..5fa67bc --- /dev/null 
+++ b/youtube/yt_data_extract/common.py
@@ -0,0 +1,455 @@
+from youtube import util
+
+import json
+import re
+import urllib.parse
+import collections
+
+def get(object, key, default=None, types=()):
+    '''Like dict.get(), but returns default if the result doesn't match one of the types.
+    Also works for indexing lists.'''
+    try:
+        result = object[key]
+    except (TypeError, IndexError, KeyError):
+        return default
+
+    if not types or isinstance(result, types):
+        return result
+    else:
+        return default
+
+def multi_get(object, *keys, default=None, types=()):
+    '''Like get, but try other keys if the first fails'''
+    for key in keys:
+        try:
+            result = object[key]
+        except (TypeError, IndexError, KeyError):
+            pass
+        else:
+            if not types or isinstance(result, types):
+                return result
+            else:
+                continue
+    return default
+
+
+def deep_get(object, *keys, default=None, types=()):
+    '''Like dict.get(), but for nested dictionaries/sequences, supporting keys or indices.
+    Last argument is the default value to use in case of any IndexErrors or KeyErrors.
+    If types is given and the result doesn't match one of those types, default is returned'''
+    try:
+        for key in keys:
+            object = object[key]
+    except (TypeError, IndexError, KeyError):
+        return default
+    else:
+        if not types or isinstance(object, types):
+            return object
+        else:
+            return default
+
+def multi_deep_get(object, *key_sequences, default=None, types=()):
+    '''Like deep_get, but can try different key sequences in case one fails.
+    Return default if all of them fail. key_sequences is a list of lists'''
+    for key_sequence in key_sequences:
+        _object = object
+        try:
+            for key in key_sequence:
+                _object = _object[key]
+        except (TypeError, IndexError, KeyError):
+            pass
+        else:
+            if not types or isinstance(_object, types):
+                return _object
+            else:
+                continue
+    return default
+
+def liberal_update(obj, key, value):
+    '''Updates obj[key] with value as long as value is not None.
+    Ensures obj[key] will at least get a value of None, however'''
+    if (value is not None) or (key not in obj):
+        obj[key] = value
+
+def conservative_update(obj, key, value):
+    '''Only updates obj if it doesn't have key or obj[key] is None'''
+    if obj.get(key) is None:
+        obj[key] = value
+
+def remove_redirect(url):
+    if re.fullmatch(r'(((https?:)?//)?(www\.)?youtube\.com)?/redirect\?.*', url) is not None: # youtube puts these on external links to do tracking
+        query_string = url[url.find('?')+1: ]
+        return urllib.parse.parse_qs(query_string)['q'][0]
+    return url
+
+youtube_url_re = re.compile(r'^(?:(?:(?:https?:)?//)?(?:www\.)?youtube\.com)?(/.*)$')
+def normalize_url(url):
+    if url is None:
+        return None
+    match = youtube_url_re.fullmatch(url)
+    if match is None:
+        raise Exception('Invalid url: ' + url)
+
+    return 'https://www.youtube.com' + match.group(1)
+
+def _recover_urls(runs):
+    for run in runs:
+        url = deep_get(run, 'navigationEndpoint', 'urlEndpoint', 'url')
+        text = run.get('text', '')
+        # second condition is necessary because youtube makes other things into urls, such as hashtags, which we want to keep as text
+        if url is not None and (text.startswith('http://') or text.startswith('https://')):
+            url = remove_redirect(url)
+            run['url'] = url
+            run['text'] = url # youtube truncates the url text, use actual url instead
+
+def extract_str(node, default=None, recover_urls=False):
+    '''default is the value returned if the extraction fails.
If recover_urls is true, will attempt to fix Youtube's truncation of url text (most prominently seen in descriptions)''' + if isinstance(node, str): + return node + + try: + return node['simpleText'] + except (KeyError, TypeError): + pass + + if isinstance(node, dict) and 'runs' in node: + if recover_urls: + _recover_urls(node['runs']) + return ''.join(text_run.get('text', '') for text_run in node['runs']) + + return default + +def extract_formatted_text(node): + if not node: + return [] + if 'runs' in node: + _recover_urls(node['runs']) + return node['runs'] + elif 'simpleText' in node: + return [{'text': node['simpleText']}] + return [] + +def extract_int(string, default=None): + if isinstance(string, int): + return string + if not isinstance(string, str): + string = extract_str(string) + if not string: + return default + match = re.search(r'(\d+)', string.replace(',', '')) + if match is None: + return default + try: + return int(match.group(1)) + except ValueError: + return default + +def extract_approx_int(string): + '''e.g. "15M" from "15M subscribers"''' + if not isinstance(string, str): + string = extract_str(string) + if not string: + return None + match = re.search(r'(\d+[KMBTkmbt])', string.replace(',', '')) + if match is None: + return None + return match.group(1) + +def extract_date(date_text): + '''Input: "Mar 9, 2019". Output: "2019-3-9"''' + if date_text is None: + return None + + date_text = date_text.replace(',', '').lower() + parts = date_text.split() + if len(parts) >= 3: + month, day, year = parts[-3:] + month = month_abbreviations.get(month[0:3]) # slicing in case they start writing out the full month name + if month and (re.fullmatch(r'\d\d?', day) is not None) and (re.fullmatch(r'\d{4}', year) is not None): + return year + '-' + month + '-' + day + +def check_missing_keys(object, *key_sequences): + for key_sequence in key_sequences: + _object = object + try: + for key in key_sequence: + _object = _object[key] + except (KeyError, IndexError, TypeError): + return 'Could not find ' + key + + return None + +def prefix_urls(item): + try: + item['thumbnail'] = util.prefix_url(item['thumbnail']) + except KeyError: + pass + + try: + item['author_url'] = util.prefix_url(item['author_url']) + except KeyError: + pass + +def add_extra_html_info(item): + if item['type'] == 'video': + item['url'] = (util.URL_ORIGIN + '/watch?v=' + item['id']) if item.get('id') else None + + video_info = {} + for key in ('id', 'title', 'author', 'duration'): + try: + video_info[key] = item[key] + except KeyError: + video_info[key] = '' + + item['video_info'] = json.dumps(video_info) + + elif item['type'] == 'playlist': + item['url'] = (util.URL_ORIGIN + '/playlist?list=' + item['id']) if item.get('id') else None + elif item['type'] == 'channel': + item['url'] = (util.URL_ORIGIN + "/channel/" + item['id']) if item.get('id') else None + +def extract_item_info(item, additional_info={}): + if not item: + return {'error': 'No item given'} + + type = get(list(item.keys()), 0) + if not type: + return {'error': 'Could not find type'} + item = item[type] + + info = {'error': None} + if type in ('itemSectionRenderer', 'compactAutoplayRenderer'): + return extract_item_info(deep_get(item, 'contents', 0), additional_info) + + if type in ('movieRenderer', 'clarificationRenderer'): + info['type'] = 'unsupported' + return info + + info.update(additional_info) + + # type looks like e.g. 
'compactVideoRenderer' or 'gridVideoRenderer'
+    # camelCase split, https://stackoverflow.com/a/37697078
+    type_parts = [s.lower() for s in re.sub(r'([A-Z][a-z]+)', r' \1', type).split()]
+    if len(type_parts) < 2:
+        info['type'] = 'unsupported'
+        return info
+    primary_type = type_parts[-2]
+    if primary_type == 'video':
+        info['type'] = 'video'
+    elif primary_type in ('playlist', 'radio', 'show'):
+        info['type'] = 'playlist'
+    elif primary_type == 'channel':
+        info['type'] = 'channel'
+    else:
+        info['type'] = 'unsupported'
+
+    info['title'] = extract_str(item.get('title'))
+    info['author'] = extract_str(multi_get(item, 'longBylineText', 'shortBylineText', 'ownerText'))
+    info['author_id'] = extract_str(multi_deep_get(item,
+        ['longBylineText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'],
+        ['shortBylineText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'],
+        ['ownerText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId']
+    ))
+    info['author_url'] = ('https://www.youtube.com/channel/' + info['author_id']) if info['author_id'] else None
+    info['description'] = extract_formatted_text(multi_get(item, 'descriptionSnippet', 'descriptionText'))
+    info['thumbnail'] = multi_deep_get(item,
+        ['thumbnail', 'thumbnails', 0, 'url'],      # videos
+        ['thumbnails', 0, 'thumbnails', 0, 'url'],  # playlists
+        ['thumbnailRenderer', 'showCustomThumbnailRenderer', 'thumbnail', 'thumbnails', 0, 'url'], # shows
+    )
+
+    info['badges'] = []
+    for badge_node in multi_get(item, 'badges', 'ownerBadges', default=()):
+        badge = deep_get(badge_node, 'metadataBadgeRenderer', 'label')
+        if badge:
+            info['badges'].append(badge)
+
+    if primary_type in ('video', 'playlist'):
+        info['time_published'] = extract_str(item.get('publishedTimeText'))
+
+    if primary_type == 'video':
+        info['id'] = item.get('videoId')
+        info['view_count'] = extract_int(item.get('viewCountText'))
+
+        # dig into accessibility data to get view_count for videos marked as recommended, and to get time_published
+        accessibility_label = deep_get(item, 'title', 'accessibility', 'accessibilityData', 'label', default='')
+        timestamp = re.search(r'(\d+ \w+ ago)', accessibility_label)
+        if timestamp:
+            conservative_update(info, 'time_published', timestamp.group(1))
+        view_count = re.search(r'(\d+) views', accessibility_label.replace(',', ''))
+        if view_count:
+            conservative_update(info, 'view_count', int(view_count.group(1)))
+
+        if info['view_count']:
+            info['approx_view_count'] = '{:,}'.format(info['view_count'])
+        else:
+            info['approx_view_count'] = extract_approx_int(multi_get(item, 'shortViewCountText'))
+        info['duration'] = extract_str(item.get('lengthText'))
+    elif primary_type == 'playlist':
+        info['id'] = item.get('playlistId')
+        info['video_count'] = extract_int(item.get('videoCount'))
+    elif primary_type == 'channel':
+        info['id'] = item.get('channelId')
+        info['approx_subscriber_count'] = extract_approx_int(item.get('subscriberCountText'))
+    elif primary_type == 'show':
+        info['id'] = deep_get(item, 'navigationEndpoint', 'watchEndpoint', 'playlistId')
+
+    if primary_type in ('playlist', 'channel'):
+        conservative_update(info, 'video_count', extract_int(item.get('videoCountText')))
+
+    for overlay in item.get('thumbnailOverlays', []):
+        conservative_update(info, 'duration', extract_str(deep_get(
+            overlay, 'thumbnailOverlayTimeStatusRenderer', 'text'
+        )))
+        # show renderers don't have videoCountText
+        conservative_update(info, 'video_count', extract_int(deep_get(
+            overlay, 'thumbnailOverlayBottomPanelRenderer',
'text' + ))) + return info + +def parse_info_prepare_for_html(renderer, additional_info={}): + item = extract_item_info(renderer, additional_info) + prefix_urls(item) + add_extra_html_info(item) + + return item + +def extract_response(polymer_json): + '''return response, error''' + response = multi_deep_get(polymer_json, [1, 'response'], ['response'], default=None, types=dict) + if response is None: + return None, 'Failed to extract response' + else: + return response, None + + +list_types = { + 'sectionListRenderer', + 'itemSectionRenderer', + 'gridRenderer', + 'playlistVideoListRenderer', +} + +item_types = { + 'movieRenderer', + 'didYouMeanRenderer', + 'showingResultsForRenderer', + + 'videoRenderer', + 'compactVideoRenderer', + 'compactAutoplayRenderer', + 'gridVideoRenderer', + 'playlistVideoRenderer', + + 'playlistRenderer', + 'compactPlaylistRenderer', + 'gridPlaylistRenderer', + + 'radioRenderer', + 'compactRadioRenderer', + 'gridRadioRenderer', + + 'showRenderer', + 'compactShowRenderer', + 'gridShowRenderer', + + + 'channelRenderer', + 'compactChannelRenderer', + 'gridChannelRenderer', + + 'channelAboutFullMetadataRenderer', +} + +def traverse_browse_renderer(renderer): + for tab in get(renderer, 'tabs', (), types=(list, tuple)): + tab_renderer = multi_deep_get(tab, ['tabRenderer'], ['expandableTabRenderer'], default=None, types=dict) + if tab_renderer is None: + continue + if tab_renderer.get('selected', False): + return get(tab_renderer, 'content', {}, types=(dict)) + print('Could not find tab with content') + return {} + +def traverse_standard_list(renderer): + renderer_list = multi_deep_get(renderer, ['contents'], ['items'], default=(), types=(list, tuple)) + continuation = deep_get(renderer, 'continuations', 0, 'nextContinuationData', 'continuation') + return renderer_list, continuation + +# these renderers contain one inside them +nested_renderer_dispatch = { + 'singleColumnBrowseResultsRenderer': traverse_browse_renderer, + 'twoColumnBrowseResultsRenderer': traverse_browse_renderer, + 'twoColumnSearchResultsRenderer': lambda renderer: get(renderer, 'primaryContents', {}, types=dict), +} + +# these renderers contain a list of renderers inside them +nested_renderer_list_dispatch = { + 'sectionListRenderer': traverse_standard_list, + 'itemSectionRenderer': traverse_standard_list, + 'gridRenderer': traverse_standard_list, + 'playlistVideoListRenderer': traverse_standard_list, + 'singleColumnWatchNextResults': lambda r: (deep_get(r, 'results', 'results', 'contents', default=[], types=(list, tuple)), None), +} + +def extract_items(response, item_types=item_types): + '''return items, ctoken''' + if 'continuationContents' in response: + # always has just the one [something]Continuation key, but do this just in case they add some tracking key or something + for key, renderer_continuation in get(response, 'continuationContents', {}, types=dict).items(): + if key.endswith('Continuation'): # e.g. 
commentSectionContinuation, playlistVideoListContinuation + items = multi_deep_get(renderer_continuation, ['contents'], ['items'], default=[], types=(list, tuple)) + ctoken = deep_get(renderer_continuation, 'continuations', 0, 'nextContinuationData', 'continuation', default=None, types=str) + return items, ctoken + return [], None + elif 'contents' in response: + ctoken = None + items = [] + + iter_stack = collections.deque() + current_iter = iter(()) + + renderer = get(response, 'contents', {}, types=dict) + + while True: + # mode 1: dig into the current renderer + # Will stay in mode 1 (via continue) if a new renderer is found inside this one + # Otherwise, after finding that it is an item renderer, + # contains a list, or contains nothing, + # falls through into mode 2 to get a new renderer + if len(renderer) != 0: + key, value = list(renderer.items())[0] + + # has a list in it, add it to the iter stack + if key in nested_renderer_list_dispatch: + renderer_list, continuation = nested_renderer_list_dispatch[key](value) + if renderer_list: + iter_stack.append(current_iter) + current_iter = iter(renderer_list) + if continuation: + ctoken = continuation + + # new renderer nested inside this one + elif key in nested_renderer_dispatch: + renderer = nested_renderer_dispatch[key](value) + continue # back to mode 1 + + # the renderer is an item + elif key in item_types: + items.append(renderer) + + + # mode 2: get a new renderer by iterating. + # goes up the stack for an iterator if one has been exhausted + while current_iter is not None: + try: + renderer = current_iter.__next__() + break + except StopIteration: + try: + current_iter = iter_stack.pop() # go back up the stack + except IndexError: + return items, ctoken + + else: + return [], None diff --git a/youtube/yt_data_extract/everything_else.py b/youtube/yt_data_extract/everything_else.py new file mode 100644 index 0000000..6277c8d --- /dev/null +++ b/youtube/yt_data_extract/everything_else.py @@ -0,0 +1,273 @@ +from .common import (get, multi_get, deep_get, multi_deep_get, + liberal_update, conservative_update, remove_redirect, normalize_url, + extract_str, extract_formatted_text, extract_int, extract_approx_int, + extract_date, check_missing_keys, extract_item_info, extract_items, + extract_response) +from youtube import proto + +import re +import urllib +from math import ceil + +def extract_channel_info(polymer_json, tab): + response, err = extract_response(polymer_json) + if err: + return {'error': err} + + try: + microformat = response['microformat']['microformatDataRenderer'] + + # channel doesn't exist or was terminated + # example terminated channel: https://www.youtube.com/channel/UCnKJeK_r90jDdIuzHXC0Org + except KeyError: + if 'alerts' in response and len(response['alerts']) > 0: + return {'error': ' '.join(alert['alertRenderer']['text']['simpleText'] for alert in response['alerts']) } + elif 'errors' in response['responseContext']: + for error in response['responseContext']['errors']['error']: + if error['code'] == 'INVALID_VALUE' and error['location'] == 'browse_id': + return {'error': 'This channel does not exist'} + return {'error': 'Failure getting microformat'} + + info = {'error': None} + info['current_tab'] = tab + + + # stuff from microformat (info given by youtube for every page on channel) + info['short_description'] = microformat['description'] + info['channel_name'] = microformat['title'] + info['avatar'] = microformat['thumbnail']['thumbnails'][0]['url'] + channel_url = microformat['urlCanonical'].rstrip('/') + 
channel_id = channel_url[channel_url.rfind('/')+1:]
+    info['channel_id'] = channel_id
+    info['channel_url'] = 'https://www.youtube.com/channel/' + channel_id
+
+    info['items'] = []
+
+    # empty channel
+    if 'contents' not in response and 'continuationContents' not in response:
+        return info
+
+
+    items, _ = extract_items(response)
+    if tab in ('videos', 'playlists', 'search'):
+        additional_info = {'author': info['channel_name'], 'author_url': 'https://www.youtube.com/channel/' + channel_id}
+        info['items'] = [extract_item_info(renderer, additional_info) for renderer in items]
+
+    elif tab == 'about':
+        for item in items:
+            try:
+                channel_metadata = item['channelAboutFullMetadataRenderer']
+                break
+            except KeyError:
+                pass
+        else:
+            info['error'] = 'Could not find channelAboutFullMetadataRenderer'
+            return info
+
+        info['links'] = []
+        for link_json in channel_metadata.get('primaryLinks', ()):
+            url = remove_redirect(link_json['navigationEndpoint']['urlEndpoint']['url'])
+
+            text = extract_str(link_json['title'])
+
+            info['links'].append( (text, url) )
+
+
+        info['stats'] = []
+        for stat_name in ('subscriberCountText', 'joinedDateText', 'viewCountText', 'country'):
+            try:
+                stat = channel_metadata[stat_name]
+            except KeyError:
+                continue
+            info['stats'].append(extract_str(stat))
+
+        if 'description' in channel_metadata:
+            info['description'] = extract_str(channel_metadata['description'])
+        else:
+            info['description'] = ''
+
+    else:
+        raise NotImplementedError('Unknown or unsupported channel tab: ' + tab)
+
+    return info
+
+def extract_search_info(polymer_json):
+    response, err = extract_response(polymer_json)
+    if err:
+        return {'error': err}
+    info = {'error': None}
+    info['estimated_results'] = int(response['estimatedResults'])
+    info['estimated_pages'] = ceil(info['estimated_results']/20)
+
+
+    results, _ = extract_items(response)
+
+
+    info['items'] = []
+    info['corrections'] = {'type': None}
+    for renderer in results:
+        type = list(renderer.keys())[0]
+        if type == 'shelfRenderer':
+            continue
+        if type == 'didYouMeanRenderer':
+            renderer = renderer[type]
+
+            info['corrections'] = {
+                'type': 'did_you_mean',
+                'corrected_query': renderer['correctedQueryEndpoint']['searchEndpoint']['query'],
+                'corrected_query_text': renderer['correctedQuery']['runs'],
+            }
+            continue
+        if type == 'showingResultsForRenderer':
+            renderer = renderer[type]
+
+            info['corrections'] = {
+                'type': 'showing_results_for',
+                'corrected_query_text': renderer['correctedQuery']['runs'],
+                'original_query_text': renderer['originalQuery']['simpleText'],
+            }
+            continue
+
+        i_info = extract_item_info(renderer)
+        if i_info.get('type') != 'unsupported':
+            info['items'].append(i_info)
+
+
+    return info
+
+def extract_playlist_metadata(polymer_json):
+    response, err = extract_response(polymer_json)
+    if err:
+        return {'error': err}
+
+    metadata = {'error': None}
+    header = deep_get(response, 'header', 'playlistHeaderRenderer', default={})
+    metadata['title'] = extract_str(header.get('title'))
+
+    metadata['first_video_id'] = deep_get(header, 'playEndpoint', 'watchEndpoint', 'videoId')
+    first_id = re.search(r'([a-zA-Z0-9_\-]{11})', deep_get(header,
+        'thumbnail', 'thumbnails', 0, 'url', default=''))
+    if first_id:
+        conservative_update(metadata, 'first_video_id', first_id.group(1))
+    if metadata['first_video_id'] is None:
+        metadata['thumbnail'] = None
+    else:
+        metadata['thumbnail'] = 'https://i.ytimg.com/vi/' + metadata['first_video_id'] + '/mqdefault.jpg'
+
+    metadata['video_count'] = extract_int(header.get('numVideosText'))
+
metadata['description'] = extract_str(header.get('descriptionText'), default='') + metadata['author'] = extract_str(header.get('ownerText')) + metadata['author_id'] = multi_deep_get(header, + ['ownerText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'], + ['ownerEndpoint', 'browseEndpoint', 'browseId']) + if metadata['author_id']: + metadata['author_url'] = 'https://www.youtube.com/channel/' + metadata['author_id'] + else: + metadata['author_url'] = None + metadata['view_count'] = extract_int(header.get('viewCountText')) + metadata['like_count'] = extract_int(header.get('likesCountWithoutLikeText')) + for stat in header.get('stats', ()): + text = extract_str(stat) + if 'videos' in text: + conservative_update(metadata, 'video_count', extract_int(text)) + elif 'views' in text: + conservative_update(metadata, 'view_count', extract_int(text)) + elif 'updated' in text: + metadata['time_published'] = extract_date(text) + + return metadata + +def extract_playlist_info(polymer_json): + response, err = extract_response(polymer_json) + if err: + return {'error': err} + info = {'error': None} + first_page = 'continuationContents' not in response + video_list, _ = extract_items(response) + + info['items'] = [extract_item_info(renderer) for renderer in video_list] + + if first_page: + info['metadata'] = extract_playlist_metadata(polymer_json) + + return info + +def ctoken_metadata(ctoken): + result = dict() + params = proto.parse(proto.b64_to_bytes(ctoken)) + result['video_id'] = proto.parse(params[2])[2].decode('ascii') + + offset_information = proto.parse(params[6]) + result['offset'] = offset_information.get(5, 0) + + result['is_replies'] = False + if (3 in offset_information) and (2 in proto.parse(offset_information[3])): + result['is_replies'] = True + result['sort'] = None + else: + try: + result['sort'] = proto.parse(offset_information[4])[6] + except KeyError: + result['sort'] = 0 + return result + +def extract_comments_info(polymer_json): + response, err = extract_response(polymer_json) + if err: + return {'error': err} + info = {'error': None} + + url = multi_deep_get(polymer_json, [1, 'url'], ['url']) + if url: + ctoken = urllib.parse.parse_qs(url[url.find('?')+1:])['ctoken'][0] + metadata = ctoken_metadata(ctoken) + else: + metadata = {} + info['video_id'] = metadata.get('video_id') + info['offset'] = metadata.get('offset') + info['is_replies'] = metadata.get('is_replies') + info['sort'] = metadata.get('sort') + info['video_title'] = None + + comments, ctoken = extract_items(response) + info['comments'] = [] + info['ctoken'] = ctoken + for comment in comments: + comment_info = {} + + if 'commentThreadRenderer' in comment: # top level comments + conservative_update(info, 'is_replies', False) + comment_thread = comment['commentThreadRenderer'] + info['video_title'] = extract_str(comment_thread.get('commentTargetTitle')) + if 'replies' not in comment_thread: + comment_info['reply_count'] = 0 + else: + comment_info['reply_count'] = extract_int(deep_get(comment_thread, + 'replies', 'commentRepliesRenderer', 'moreText' + ), default=1) # With 1 reply, the text reads "View reply" + comment_renderer = deep_get(comment_thread, 'comment', 'commentRenderer', default={}) + elif 'commentRenderer' in comment: # replies + comment_info['reply_count'] = 0 # replyCount, below, not present for replies even if the reply has further replies to it + conservative_update(info, 'is_replies', True) + comment_renderer = comment['commentRenderer'] + else: + comment_renderer = {} + + # These 3 are 
sometimes absent, likely because the channel was deleted + comment_info['author'] = extract_str(comment_renderer.get('authorText')) + comment_info['author_url'] = deep_get(comment_renderer, + 'authorEndpoint', 'commandMetadata', 'webCommandMetadata', 'url') + comment_info['author_id'] = deep_get(comment_renderer, + 'authorEndpoint', 'browseEndpoint', 'browseId') + + comment_info['author_avatar'] = deep_get(comment_renderer, + 'authorThumbnail', 'thumbnails', 0, 'url') + comment_info['id'] = comment_renderer.get('commentId') + comment_info['text'] = extract_formatted_text(comment_renderer.get('contentText')) + comment_info['time_published'] = extract_str(comment_renderer.get('publishedTimeText')) + comment_info['like_count'] = comment_renderer.get('likeCount') + liberal_update(comment_info, 'reply_count', comment_renderer.get('replyCount')) + + info['comments'].append(comment_info) + + return info diff --git a/youtube/yt_data_extract/watch_extraction.py b/youtube/yt_data_extract/watch_extraction.py new file mode 100644 index 0000000..1166344 --- /dev/null +++ b/youtube/yt_data_extract/watch_extraction.py @@ -0,0 +1,449 @@ +from .common import (get, multi_get, deep_get, multi_deep_get, + liberal_update, conservative_update, remove_redirect, normalize_url, + extract_str, extract_formatted_text, extract_int, extract_approx_int, + extract_date, check_missing_keys, extract_item_info, extract_items, + extract_response) + +import json +import urllib.parse +import traceback + +# from https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/youtube.py +_formats = { + '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'audio_bitrate': 64, 'vcodec': 'h263'}, + '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'audio_bitrate': 64, 'vcodec': 'h263'}, + '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'}, + '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'audio_bitrate': 24, 'vcodec': 'mp4v'}, + '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'audio_bitrate': 96, 'vcodec': 'h264'}, + '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'}, + '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'}, + '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'}, + # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), audio_bitrate varies as well + '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'}, + '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'}, + '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'}, + '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'audio_bitrate': 128, 'vcodec': 'vp8'}, + '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'audio_bitrate': 128, 'vcodec': 'vp8'}, + '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'audio_bitrate': 192, 'vcodec': 'vp8'}, + '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'audio_bitrate': 192, 'vcodec': 'vp8'}, + '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'}, + '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'}, + + + # 3D videos + '82': {'ext': 'mp4', 'height': 
360, 'format_note': '3D', 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'}, + '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'}, + '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'}, + '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'}, + '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'audio_bitrate': 128, 'vcodec': 'vp8'}, + '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'audio_bitrate': 192, 'vcodec': 'vp8'}, + '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'audio_bitrate': 192, 'vcodec': 'vp8'}, + + # Apple HTTP Live Streaming + '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 48, 'vcodec': 'h264'}, + '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 48, 'vcodec': 'h264'}, + '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'}, + '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'}, + '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 256, 'vcodec': 'h264'}, + '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 256, 'vcodec': 'h264'}, + '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 48, 'vcodec': 'h264'}, + '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 24, 'vcodec': 'h264'}, + + # DASH mp4 video + '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559) + '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, + '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, + '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'}, + + # Dash mp4 audio + '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'audio_bitrate': 48, 'container': 'm4a_dash'}, + '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'audio_bitrate': 128, 'container': 'm4a_dash'}, + '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'audio_bitrate': 256, 'container': 'm4a_dash'}, + '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, + '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, + '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 
'dtse', 'container': 'm4a_dash'}, + '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'}, + + # Dash webm + '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'}, + '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug) + '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + + # Dash webm audio + '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'audio_bitrate': 128}, + '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'audio_bitrate': 256}, + + # Dash webm audio with opus inside + '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'audio_bitrate': 50}, + '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'audio_bitrate': 70}, + '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'audio_bitrate': 160}, + + # RTMP (unnamed) + '_rtmp': {'protocol': 'rtmp'}, + + # av01 video only formats sometimes served with "unknown" codecs + '394': {'vcodec': 'av01.0.05M.08'}, + '395': {'vcodec': 'av01.0.05M.08'}, + '396': {'vcodec': 'av01.0.05M.08'}, + '397': {'vcodec': 'av01.0.05M.08'}, +} + +def extract_metadata_row_info(video_renderer_info): + # extract category and music list + info = { + 'category': None, + 'music_list': [], + } + + current_song = {} + for row in deep_get(video_renderer_info, 'metadataRowContainer', 'metadataRowContainerRenderer', 'rows', default=[]): + row_title = extract_str(deep_get(row, 'metadataRowRenderer', 'title'), default='') + row_content = 
extract_str(deep_get(row, 'metadataRowRenderer', 'contents', 0)) + if row_title == 'Category': + info['category'] = row_content + elif row_title in ('Song', 'Music'): + if current_song: + info['music_list'].append(current_song) + current_song = {'title': row_content} + elif row_title == 'Artist': + current_song['artist'] = row_content + elif row_title == 'Album': + current_song['album'] = row_content + elif row_title == 'Writers': + current_song['writers'] = row_content + elif row_title.startswith('Licensed'): + current_song['licensor'] = row_content + if current_song: + info['music_list'].append(current_song) + + return info + +def extract_watch_info_mobile(top_level): + info = {} + microformat = deep_get(top_level, 'playerResponse', 'microformat', 'playerMicroformatRenderer', default={}) + + family_safe = microformat.get('isFamilySafe') + if family_safe is None: + info['age_restricted'] = None + else: + info['age_restricted'] = not family_safe + info['allowed_countries'] = microformat.get('availableCountries', []) + info['time_published'] = microformat.get('publishDate') + + response = top_level.get('response', {}) + + # video info from metadata renderers + items, _ = extract_items(response, item_types={'slimVideoMetadataRenderer'}) + if items: + video_info = items[0]['slimVideoMetadataRenderer'] + else: + print('Failed to extract video metadata') + video_info = {} + + info.update(extract_metadata_row_info(video_info)) + info['description'] = extract_str(video_info.get('description'), recover_urls=True) + info['view_count'] = extract_int(extract_str(video_info.get('expandedSubtitle'))) + info['author'] = extract_str(deep_get(video_info, 'owner', 'slimOwnerRenderer', 'title')) + info['author_id'] = deep_get(video_info, 'owner', 'slimOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId') + info['title'] = extract_str(video_info.get('title')) + info['live'] = 'watching' in extract_str(video_info.get('expandedSubtitle'), default='') + info['unlisted'] = False + for badge in video_info.get('badges', []): + if deep_get(badge, 'metadataBadgeRenderer', 'label') == 'Unlisted': + info['unlisted'] = True + info['like_count'] = None + info['dislike_count'] = None + if not info['time_published']: + info['time_published'] = extract_date(extract_str(video_info.get('dateText', None))) + for button in video_info.get('buttons', ()): + button_renderer = button.get('slimMetadataToggleButtonRenderer', {}) + + # all the digits can be found in the accessibility data + count = extract_int(deep_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText', 'accessibility', 'accessibilityData', 'label')) + + # this count doesn't have all the digits, it's like 53K for instance + dumb_count = extract_int(extract_str(deep_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText'))) + + # the accessibility text will be "No likes" or "No dislikes" or something like that, but dumb count will be 0 + if dumb_count == 0: + count = 0 + + if 'isLike' in button_renderer: + info['like_count'] = count + elif 'isDislike' in button_renderer: + info['dislike_count'] = count + + # comment section info + items, _ = extract_items(response, item_types={'commentSectionRenderer'}) + if items: + comment_info = items[0]['commentSectionRenderer'] + comment_count_text = extract_str(deep_get(comment_info, 'header', 'commentSectionHeaderRenderer', 'countText')) + if comment_count_text == 'Comments': # just this with no number, means 0 comments + info['comment_count'] = 0 + else: + info['comment_count'] = 
extract_int(comment_count_text) + info['comments_disabled'] = False + else: # no comment section present means comments are disabled + info['comment_count'] = 0 + info['comments_disabled'] = True + + # check for limited state + items, _ = extract_items(response, item_types={'limitedStateMessageRenderer'}) + if items: + info['limited_state'] = True + else: + info['limited_state'] = False + + # related videos + related, _ = extract_items(response) + info['related_videos'] = [extract_item_info(renderer) for renderer in related] + + return info + +month_abbreviations = {'jan':'1', 'feb':'2', 'mar':'3', 'apr':'4', 'may':'5', 'jun':'6', 'jul':'7', 'aug':'8', 'sep':'9', 'oct':'10', 'nov':'11', 'dec':'12'} +def extract_watch_info_desktop(top_level): + info = { + 'comment_count': None, + 'comments_disabled': None, + 'allowed_countries': None, + 'limited_state': None, + } + + video_info = {} + for renderer in deep_get(top_level, 'response', 'contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', default=()): + if renderer and list(renderer.keys())[0] in ('videoPrimaryInfoRenderer', 'videoSecondaryInfoRenderer'): + video_info.update(list(renderer.values())[0]) + + info.update(extract_metadata_row_info(video_info)) + info['description'] = extract_str(video_info.get('description', None), recover_urls=True) + info['time_published'] = extract_date(extract_str(video_info.get('dateText', None))) + + likes_dislikes = deep_get(video_info, 'sentimentBar', 'sentimentBarRenderer', 'tooltip', default='').split('/') + if len(likes_dislikes) == 2: + info['like_count'] = extract_int(likes_dislikes[0]) + info['dislike_count'] = extract_int(likes_dislikes[1]) + else: + info['like_count'] = None + info['dislike_count'] = None + + info['title'] = extract_str(video_info.get('title', None)) + info['author'] = extract_str(deep_get(video_info, 'owner', 'videoOwnerRenderer', 'title')) + info['author_id'] = deep_get(video_info, 'owner', 'videoOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId') + info['view_count'] = extract_int(extract_str(deep_get(video_info, 'viewCount', 'videoViewCountRenderer', 'viewCount'))) + + related = deep_get(top_level, 'response', 'contents', 'twoColumnWatchNextResults', 'secondaryResults', 'secondaryResults', 'results', default=[]) + info['related_videos'] = [extract_item_info(renderer) for renderer in related] + + return info + +def get_caption_url(info, language, format, automatic=False, translation_language=None): + '''Gets the url for captions with the given language and format. If automatic is True, get the automatic captions for that language. If translation_language is given, translate the captions from `language` to `translation_language`. 
If automatic is true and translation_language is given, the automatic captions will be translated.''' + url = info['_captions_base_url'] + url += '&lang=' + language + url += '&fmt=' + format + if automatic: + url += '&kind=asr' + elif language in info['_manual_caption_language_names']: + url += '&name=' + urllib.parse.quote(info['_manual_caption_language_names'][language], safe='') + + if translation_language: + url += '&tlang=' + translation_language + return url + +def extract_formats(info, player_response): + streaming_data = player_response.get('streamingData', {}) + yt_formats = streaming_data.get('formats', []) + streaming_data.get('adaptiveFormats', []) + + info['formats'] = [] + + for yt_fmt in yt_formats: + fmt = {} + fmt['ext'] = None + fmt['audio_bitrate'] = None + fmt['acodec'] = None + fmt['vcodec'] = None + fmt['width'] = yt_fmt.get('width') + fmt['height'] = yt_fmt.get('height') + fmt['file_size'] = yt_fmt.get('contentLength') + fmt['audio_sample_rate'] = yt_fmt.get('audioSampleRate') + fmt['fps'] = yt_fmt.get('fps') + cipher = dict(urllib.parse.parse_qsl(yt_fmt.get('cipher', ''))) + if cipher: + fmt['url'] = cipher.get('url') + else: + fmt['url'] = yt_fmt.get('url') + fmt['s'] = cipher.get('s') + fmt['sp'] = cipher.get('sp') + fmt.update(_formats.get(str(yt_fmt.get('itag')), {})) + + info['formats'].append(fmt) + +def extract_playability_error(info, player_response, error_prefix=''): + if info['formats']: + info['playability_status'] = None + info['playability_error'] = None + return + + playability_status = deep_get(player_response, 'playabilityStatus', 'status', default=None) + info['playability_status'] = playability_status + + playability_reason = extract_str(multi_deep_get(player_response, + ['playabilityStatus', 'reason'], + ['playabilityStatus', 'errorScreen', 'playerErrorMessageRenderer', 'reason'], + default='Could not find playability error') + ) + + if playability_status not in (None, 'OK'): + info['playability_error'] = error_prefix + playability_reason + else: + info['playability_error'] = error_prefix + 'Unknown playability error' + +SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt') +def extract_watch_info(polymer_json): + info = {'playability_error': None, 'error': None} + + if isinstance(polymer_json, dict): + top_level = polymer_json + elif isinstance(polymer_json, (list, tuple)): + top_level = {} + for page_part in polymer_json: + if not isinstance(page_part, dict): + return {'error': 'Invalid page part'} + top_level.update(page_part) + else: + return {'error': 'Invalid top level polymer data'} + + error = check_missing_keys(top_level, + ['player', 'args'], + ['player', 'assets', 'js'], + ['playerResponse'], + ) + if error: + info['playability_error'] = error + + player_args = deep_get(top_level, 'player', 'args', default={}) + player_response = json.loads(player_args['player_response']) if 'player_response' in player_args else {} + + # captions + info['automatic_caption_languages'] = [] + info['manual_caption_languages'] = [] + info['_manual_caption_language_names'] = {} # language name written in that language, needed in some cases to create the url + info['translation_languages'] = [] + captions_info = player_response.get('captions', {}) + info['_captions_base_url'] = normalize_url(deep_get(captions_info, 'playerCaptionsRenderer', 'baseUrl')) + for caption_track in deep_get(captions_info, 'playerCaptionsTracklistRenderer', 'captionTracks', default=()): + lang_code = caption_track.get('languageCode') + if not lang_code: + continue + if 
caption_track.get('kind') == 'asr': + info['automatic_caption_languages'].append(lang_code) + else: + info['manual_caption_languages'].append(lang_code) + base_url = caption_track.get('baseUrl', '') + lang_name = deep_get(urllib.parse.parse_qs(urllib.parse.urlparse(base_url).query), 'name', 0) + if lang_name: + info['_manual_caption_language_names'][lang_code] = lang_name + + for translation_lang_info in deep_get(captions_info, 'playerCaptionsTracklistRenderer', 'translationLanguages', default=()): + lang_code = translation_lang_info.get('languageCode') + if lang_code: + info['translation_languages'].append(lang_code) + if translation_lang_info.get('isTranslatable') == False: + print('WARNING: Found non-translatable caption language') + + # formats + extract_formats(info, player_response) + + # playability errors + extract_playability_error(info, player_response) + + # check age-restriction + info['age_restricted'] = (info['playability_status'] == 'LOGIN_REQUIRED' and info['playability_error'] and ' age' in info['playability_error']) + + # base_js (for decryption of signatures) + info['base_js'] = deep_get(top_level, 'player', 'assets', 'js') + if info['base_js']: + info['base_js'] = normalize_url(info['base_js']) + + mobile = 'singleColumnWatchNextResults' in deep_get(top_level, 'response', 'contents', default={}) + if mobile: + info.update(extract_watch_info_mobile(top_level)) + else: + info.update(extract_watch_info_desktop(top_level)) + + # stuff from videoDetails. Use liberal_update to prioritize info from videoDetails over existing info + vd = deep_get(top_level, 'playerResponse', 'videoDetails', default={}) + liberal_update(info, 'title', extract_str(vd.get('title'))) + liberal_update(info, 'duration', extract_int(vd.get('lengthSeconds'))) + liberal_update(info, 'view_count', extract_int(vd.get('viewCount'))) + # videos with no description have a blank string + liberal_update(info, 'description', vd.get('shortDescription')) + liberal_update(info, 'id', vd.get('videoId')) + liberal_update(info, 'author', vd.get('author')) + liberal_update(info, 'author_id', vd.get('channelId')) + liberal_update(info, 'live', vd.get('isLiveContent')) + conservative_update(info, 'unlisted', not vd.get('isCrawlable', True)) #isCrawlable is false on limited state videos even if they aren't unlisted + liberal_update(info, 'tags', vd.get('keywords', [])) + + # fallback stuff from microformat + mf = deep_get(top_level, 'playerResponse', 'microformat', 'playerMicroformatRenderer', default={}) + conservative_update(info, 'title', extract_str(mf.get('title'))) + conservative_update(info, 'duration', extract_int(mf.get('lengthSeconds'))) + # this gives the view count for limited state videos + conservative_update(info, 'view_count', extract_int(mf.get('viewCount'))) + conservative_update(info, 'description', extract_str(mf.get('description'), recover_urls=True)) + conservative_update(info, 'author', mf.get('ownerChannelName')) + conservative_update(info, 'author_id', mf.get('externalChannelId')) + liberal_update(info, 'unlisted', mf.get('isUnlisted')) + liberal_update(info, 'category', mf.get('category')) + liberal_update(info, 'time_published', mf.get('publishDate')) + liberal_update(info, 'time_uploaded', mf.get('uploadDate')) + + # other stuff + info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None + return info + +def update_with_age_restricted_info(info, video_info_page): + ERROR_PREFIX = 'Error bypassing age-restriction: ' + + video_info = 
urllib.parse.parse_qs(video_info_page) + player_response = deep_get(video_info, 'player_response', 0) + if player_response is None: + info['playability_error'] = ERROR_PREFIX + 'Could not find player_response in video_info_page' + return + try: + player_response = json.loads(player_response) + except json.decoder.JSONDecodeError: + traceback.print_exc() + info['playability_error'] = ERROR_PREFIX + 'Failed to parse json response' + return + + extract_formats(info, player_response) + extract_playability_error(info, player_response, error_prefix=ERROR_PREFIX) -- cgit v1.2.3 From d1d908d5b1aadb0dc75b25df1a47789c021f89e2 Mon Sep 17 00:00:00 2001 From: James Taylor Date: Thu, 19 Dec 2019 19:48:53 -0800 Subject: Extraction: Move html post processing stuff from yt_data_extract to util --- youtube/channel.py | 4 ++-- youtube/local_playlist.py | 2 +- youtube/playlist.py | 6 +++--- youtube/search.py | 4 ++-- youtube/subscriptions.py | 2 +- youtube/util.py | 38 ++++++++++++++++++++++++++++++++++++ youtube/watch.py | 4 ++-- youtube/yt_data_extract/__init__.py | 3 +-- youtube/yt_data_extract/common.py | 39 ------------------------------------- 9 files changed, 50 insertions(+), 52 deletions(-) (limited to 'youtube') diff --git a/youtube/channel.py b/youtube/channel.py index 67a79ad..ad06e3f 100644 --- a/youtube/channel.py +++ b/youtube/channel.py @@ -142,8 +142,8 @@ def post_process_channel_info(info): info['avatar'] = util.prefix_url(info['avatar']) info['channel_url'] = util.prefix_url(info['channel_url']) for item in info['items']: - yt_data_extract.prefix_urls(item) - yt_data_extract.add_extra_html_info(item) + util.prefix_urls(item) + util.add_extra_html_info(item) diff --git a/youtube/local_playlist.py b/youtube/local_playlist.py index 2375ba2..0b47c72 100644 --- a/youtube/local_playlist.py +++ b/youtube/local_playlist.py @@ -57,7 +57,7 @@ def get_local_playlist_videos(name, offset=0, amount=50): info['thumbnail'] = util.get_thumbnail_url(info['id']) missing_thumbnails.append(info['id']) info['type'] = 'video' - yt_data_extract.add_extra_html_info(info) + util.add_extra_html_info(info) videos.append(info) except json.decoder.JSONDecodeError: if not video_json.strip() == '': diff --git a/youtube/playlist.py b/youtube/playlist.py index 5dc8ab7..3ca235a 100644 --- a/youtube/playlist.py +++ b/youtube/playlist.py @@ -97,10 +97,10 @@ def get_playlist_page(): if page != '1': info['metadata'] = yt_data_extract.extract_playlist_metadata(first_page_json) - yt_data_extract.prefix_urls(info['metadata']) + util.prefix_urls(info['metadata']) for item in info.get('items', ()): - yt_data_extract.prefix_urls(item) - yt_data_extract.add_extra_html_info(item) + util.prefix_urls(item) + util.add_extra_html_info(item) if 'id' in item: item['thumbnail'] = '/https://i.ytimg.com/vi/' + item['id'] + '/default.jpg' diff --git a/youtube/search.py b/youtube/search.py index a881557..0f6bbc4 100644 --- a/youtube/search.py +++ b/youtube/search.py @@ -80,8 +80,8 @@ def get_search_page(): return flask.render_template('error.html', error_message = search_info['error']) for extract_item_info in search_info['items']: - yt_data_extract.prefix_urls(extract_item_info) - yt_data_extract.add_extra_html_info(extract_item_info) + util.prefix_urls(extract_item_info) + util.add_extra_html_info(extract_item_info) corrections = search_info['corrections'] if corrections['type'] == 'did_you_mean': diff --git a/youtube/subscriptions.py b/youtube/subscriptions.py index 9709467..dd058b3 100644 --- a/youtube/subscriptions.py +++ 
b/youtube/subscriptions.py @@ -766,7 +766,7 @@ def get_subscriptions_page(): video['thumbnail'] = util.URL_ORIGIN + '/data/subscription_thumbnails/' + video['id'] + '.jpg' video['type'] = 'video' video['item_size'] = 'small' - yt_data_extract.add_extra_html_info(video) + util.add_extra_html_info(video) tags = _get_all_tags(cursor) diff --git a/youtube/util.py b/youtube/util.py index 9023b98..feeec8c 100644 --- a/youtube/util.py +++ b/youtube/util.py @@ -1,4 +1,5 @@ import settings +from youtube import yt_data_extract import socks, sockshandler import gzip import brotli @@ -6,6 +7,7 @@ import urllib.parse import re import time import os +import json import gevent import gevent.queue import gevent.lock @@ -321,3 +323,39 @@ def left_remove(string, substring): return string[len(substring):] return string + +def prefix_urls(item): + try: + item['thumbnail'] = prefix_url(item['thumbnail']) + except KeyError: + pass + + try: + item['author_url'] = prefix_url(item['author_url']) + except KeyError: + pass + +def add_extra_html_info(item): + if item['type'] == 'video': + item['url'] = (URL_ORIGIN + '/watch?v=' + item['id']) if item.get('id') else None + + video_info = {} + for key in ('id', 'title', 'author', 'duration'): + try: + video_info[key] = item[key] + except KeyError: + video_info[key] = '' + + item['video_info'] = json.dumps(video_info) + + elif item['type'] == 'playlist': + item['url'] = (URL_ORIGIN + '/playlist?list=' + item['id']) if item.get('id') else None + elif item['type'] == 'channel': + item['url'] = (URL_ORIGIN + "/channel/" + item['id']) if item.get('id') else None + +def parse_info_prepare_for_html(renderer, additional_info={}): + item = yt_data_extract.extract_item_info(renderer, additional_info) + prefix_urls(item) + add_extra_html_info(item) + + return item diff --git a/youtube/watch.py b/youtube/watch.py index 69ab87b..45d658f 100644 --- a/youtube/watch.py +++ b/youtube/watch.py @@ -370,8 +370,8 @@ def get_watch_page(): } for item in info['related_videos']: - yt_data_extract.prefix_urls(item) - yt_data_extract.add_extra_html_info(item) + util.prefix_urls(item) + util.add_extra_html_info(item) if settings.gather_googlevideo_domains: with open(os.path.join(settings.data_dir, 'googlevideo-domains.txt'), 'a+', encoding='utf-8') as f: diff --git a/youtube/yt_data_extract/__init__.py b/youtube/yt_data_extract/__init__.py index f2a93a9..f2f07c0 100644 --- a/youtube/yt_data_extract/__init__.py +++ b/youtube/yt_data_extract/__init__.py @@ -1,8 +1,7 @@ from .common import (get, multi_get, deep_get, multi_deep_get, liberal_update, conservative_update, remove_redirect, normalize_url, extract_str, extract_formatted_text, extract_int, extract_approx_int, - extract_date, extract_item_info, extract_items, extract_response, - prefix_urls, add_extra_html_info, parse_info_prepare_for_html) + extract_date, extract_item_info, extract_items, extract_response) from .everything_else import (extract_channel_info, extract_search_info, extract_playlist_metadata, extract_playlist_info, extract_comments_info) diff --git a/youtube/yt_data_extract/common.py b/youtube/yt_data_extract/common.py index 5fa67bc..459b5e9 100644 --- a/youtube/yt_data_extract/common.py +++ b/youtube/yt_data_extract/common.py @@ -1,6 +1,3 @@ -from youtube import util - -import json import re import urllib.parse import collections @@ -179,35 +176,6 @@ def check_missing_keys(object, *key_sequences): return None -def prefix_urls(item): - try: - item['thumbnail'] = util.prefix_url(item['thumbnail']) - except KeyError: - pass - - 
try: - item['author_url'] = util.prefix_url(item['author_url']) - except KeyError: - pass - -def add_extra_html_info(item): - if item['type'] == 'video': - item['url'] = (util.URL_ORIGIN + '/watch?v=' + item['id']) if item.get('id') else None - - video_info = {} - for key in ('id', 'title', 'author', 'duration'): - try: - video_info[key] = item[key] - except KeyError: - video_info[key] = '' - - item['video_info'] = json.dumps(video_info) - - elif item['type'] == 'playlist': - item['url'] = (util.URL_ORIGIN + '/playlist?list=' + item['id']) if item.get('id') else None - elif item['type'] == 'channel': - item['url'] = (util.URL_ORIGIN + "/channel/" + item['id']) if item.get('id') else None - def extract_item_info(item, additional_info={}): if not item: return {'error': 'No item given'} @@ -307,13 +275,6 @@ def extract_item_info(item, additional_info={}): ))) return info -def parse_info_prepare_for_html(renderer, additional_info={}): - item = extract_item_info(renderer, additional_info) - prefix_urls(item) - add_extra_html_info(item) - - return item - def extract_response(polymer_json): '''return response, error''' response = multi_deep_get(polymer_json, [1, 'response'], ['response'], default=None, types=dict) -- cgit v1.2.3 From 4a3529df9577b660a2f493ab63ef08f10320b38e Mon Sep 17 00:00:00 2001 From: James Taylor Date: Thu, 19 Dec 2019 20:12:37 -0800 Subject: Extraction: Move stuff around in files and put underscores in front of internal helper function names Move get_captions_url in watch_extraction to bottom next to other exported, public functions --- youtube/yt_data_extract/common.py | 17 +++++---- youtube/yt_data_extract/everything_else.py | 4 +-- youtube/yt_data_extract/watch_extraction.py | 54 ++++++++++++++--------------- 3 files changed, 37 insertions(+), 38 deletions(-) (limited to 'youtube') diff --git a/youtube/yt_data_extract/common.py b/youtube/yt_data_extract/common.py index 459b5e9..4681a86 100644 --- a/youtube/yt_data_extract/common.py +++ b/youtube/yt_data_extract/common.py @@ -322,7 +322,7 @@ item_types = { 'channelAboutFullMetadataRenderer', } -def traverse_browse_renderer(renderer): +def _traverse_browse_renderer(renderer): for tab in get(renderer, 'tabs', (), types=(list, tuple)): tab_renderer = multi_deep_get(tab, ['tabRenderer'], ['expandableTabRenderer'], default=None, types=dict) if tab_renderer is None: @@ -332,24 +332,24 @@ def traverse_browse_renderer(renderer): print('Could not find tab with content') return {} -def traverse_standard_list(renderer): +def _traverse_standard_list(renderer): renderer_list = multi_deep_get(renderer, ['contents'], ['items'], default=(), types=(list, tuple)) continuation = deep_get(renderer, 'continuations', 0, 'nextContinuationData', 'continuation') return renderer_list, continuation # these renderers contain one inside them nested_renderer_dispatch = { - 'singleColumnBrowseResultsRenderer': traverse_browse_renderer, - 'twoColumnBrowseResultsRenderer': traverse_browse_renderer, + 'singleColumnBrowseResultsRenderer': _traverse_browse_renderer, + 'twoColumnBrowseResultsRenderer': _traverse_browse_renderer, 'twoColumnSearchResultsRenderer': lambda renderer: get(renderer, 'primaryContents', {}, types=dict), } # these renderers contain a list of renderers inside them nested_renderer_list_dispatch = { - 'sectionListRenderer': traverse_standard_list, - 'itemSectionRenderer': traverse_standard_list, - 'gridRenderer': traverse_standard_list, - 'playlistVideoListRenderer': traverse_standard_list, + 'sectionListRenderer': 
_traverse_standard_list, + 'itemSectionRenderer': _traverse_standard_list, + 'gridRenderer': _traverse_standard_list, + 'playlistVideoListRenderer': _traverse_standard_list, 'singleColumnWatchNextResults': lambda r: (deep_get(r, 'results', 'results', 'contents', default=[], types=(list, tuple)), None), } @@ -411,6 +411,5 @@ def extract_items(response, item_types=item_types): current_iter = iter_stack.pop() # go back up the stack except IndexError: return items, ctoken - else: return [], None diff --git a/youtube/yt_data_extract/everything_else.py b/youtube/yt_data_extract/everything_else.py index 6277c8d..6dc5248 100644 --- a/youtube/yt_data_extract/everything_else.py +++ b/youtube/yt_data_extract/everything_else.py @@ -192,7 +192,7 @@ def extract_playlist_info(polymer_json): return info -def ctoken_metadata(ctoken): +def _ctoken_metadata(ctoken): result = dict() params = proto.parse(proto.b64_to_bytes(ctoken)) result['video_id'] = proto.parse(params[2])[2].decode('ascii') @@ -220,7 +220,7 @@ def extract_comments_info(polymer_json): url = multi_deep_get(polymer_json, [1, 'url'], ['url']) if url: ctoken = urllib.parse.parse_qs(url[url.find('?')+1:])['ctoken'][0] - metadata = ctoken_metadata(ctoken) + metadata = _ctoken_metadata(ctoken) else: metadata = {} info['video_id'] = metadata.get('video_id') diff --git a/youtube/yt_data_extract/watch_extraction.py b/youtube/yt_data_extract/watch_extraction.py index 1166344..ff39f62 100644 --- a/youtube/yt_data_extract/watch_extraction.py +++ b/youtube/yt_data_extract/watch_extraction.py @@ -115,7 +115,7 @@ _formats = { '397': {'vcodec': 'av01.0.05M.08'}, } -def extract_metadata_row_info(video_renderer_info): +def _extract_metadata_row_info(video_renderer_info): # extract category and music list info = { 'category': None, @@ -145,7 +145,7 @@ def extract_metadata_row_info(video_renderer_info): return info -def extract_watch_info_mobile(top_level): +def _extract_watch_info_mobile(top_level): info = {} microformat = deep_get(top_level, 'playerResponse', 'microformat', 'playerMicroformatRenderer', default={}) @@ -167,7 +167,7 @@ def extract_watch_info_mobile(top_level): print('Failed to extract video metadata') video_info = {} - info.update(extract_metadata_row_info(video_info)) + info.update(_extract_metadata_row_info(video_info)) info['description'] = extract_str(video_info.get('description'), recover_urls=True) info['view_count'] = extract_int(extract_str(video_info.get('expandedSubtitle'))) info['author'] = extract_str(deep_get(video_info, 'owner', 'slimOwnerRenderer', 'title')) @@ -228,7 +228,7 @@ def extract_watch_info_mobile(top_level): return info month_abbreviations = {'jan':'1', 'feb':'2', 'mar':'3', 'apr':'4', 'may':'5', 'jun':'6', 'jul':'7', 'aug':'8', 'sep':'9', 'oct':'10', 'nov':'11', 'dec':'12'} -def extract_watch_info_desktop(top_level): +def _extract_watch_info_desktop(top_level): info = { 'comment_count': None, 'comments_disabled': None, @@ -241,7 +241,7 @@ def extract_watch_info_desktop(top_level): if renderer and list(renderer.keys())[0] in ('videoPrimaryInfoRenderer', 'videoSecondaryInfoRenderer'): video_info.update(list(renderer.values())[0]) - info.update(extract_metadata_row_info(video_info)) + info.update(_extract_metadata_row_info(video_info)) info['description'] = extract_str(video_info.get('description', None), recover_urls=True) info['time_published'] = extract_date(extract_str(video_info.get('dateText', None))) @@ -263,21 +263,7 @@ def extract_watch_info_desktop(top_level): return info -def get_caption_url(info, language, 
format, automatic=False, translation_language=None): - '''Gets the url for captions with the given language and format. If automatic is True, get the automatic captions for that language. If translation_language is given, translate the captions from `language` to `translation_language`. If automatic is true and translation_language is given, the automatic captions will be translated.''' - url = info['_captions_base_url'] - url += '&lang=' + language - url += '&fmt=' + format - if automatic: - url += '&kind=asr' - elif language in info['_manual_caption_language_names']: - url += '&name=' + urllib.parse.quote(info['_manual_caption_language_names'][language], safe='') - - if translation_language: - url += '&tlang=' + translation_language - return url - -def extract_formats(info, player_response): +def _extract_formats(info, player_response): streaming_data = player_response.get('streamingData', {}) yt_formats = streaming_data.get('formats', []) + streaming_data.get('adaptiveFormats', []) @@ -305,7 +291,7 @@ def extract_formats(info, player_response): info['formats'].append(fmt) -def extract_playability_error(info, player_response, error_prefix=''): +def _extract_playability_error(info, player_response, error_prefix=''): if info['formats']: info['playability_status'] = None info['playability_error'] = None @@ -379,10 +365,10 @@ def extract_watch_info(polymer_json): print('WARNING: Found non-translatable caption language') # formats - extract_formats(info, player_response) + _extract_formats(info, player_response) # playability errors - extract_playability_error(info, player_response) + _extract_playability_error(info, player_response) # check age-restriction info['age_restricted'] = (info['playability_status'] == 'LOGIN_REQUIRED' and info['playability_error'] and ' age' in info['playability_error']) @@ -394,9 +380,9 @@ def extract_watch_info(polymer_json): mobile = 'singleColumnWatchNextResults' in deep_get(top_level, 'response', 'contents', default={}) if mobile: - info.update(extract_watch_info_mobile(top_level)) + info.update(_extract_watch_info_mobile(top_level)) else: - info.update(extract_watch_info_desktop(top_level)) + info.update(_extract_watch_info_desktop(top_level)) # stuff from videoDetails. Use liberal_update to prioritize info from videoDetails over existing info vd = deep_get(top_level, 'playerResponse', 'videoDetails', default={}) @@ -430,6 +416,20 @@ def extract_watch_info(polymer_json): info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None return info +def get_caption_url(info, language, format, automatic=False, translation_language=None): + '''Gets the url for captions with the given language and format. If automatic is True, get the automatic captions for that language. If translation_language is given, translate the captions from `language` to `translation_language`. 
If automatic is true and translation_language is given, the automatic captions will be translated.''' + url = info['_captions_base_url'] + url += '&lang=' + language + url += '&fmt=' + format + if automatic: + url += '&kind=asr' + elif language in info['_manual_caption_language_names']: + url += '&name=' + urllib.parse.quote(info['_manual_caption_language_names'][language], safe='') + + if translation_language: + url += '&tlang=' + translation_language + return url + def update_with_age_restricted_info(info, video_info_page): ERROR_PREFIX = 'Error bypassing age-restriction: ' @@ -445,5 +445,5 @@ def update_with_age_restricted_info(info, video_info_page): info['playability_error'] = ERROR_PREFIX + 'Failed to parse json response' return - extract_formats(info, player_response) - extract_playability_error(info, player_response, error_prefix=ERROR_PREFIX) + _extract_formats(info, player_response) + _extract_playability_error(info, player_response, error_prefix=ERROR_PREFIX) -- cgit v1.2.3 From 6b7a1212e30b713453aa7d2b3a7122e97689dad0 Mon Sep 17 00:00:00 2001 From: James Taylor Date: Thu, 19 Dec 2019 21:28:21 -0800 Subject: Extraction: Move non-stateful signature decryption functionality into yt_data_extract --- youtube/watch.py | 97 ++++------------------------- youtube/yt_data_extract/__init__.py | 3 +- youtube/yt_data_extract/watch_extraction.py | 96 ++++++++++++++++++++++++++++ 3 files changed, 110 insertions(+), 86 deletions(-) (limited to 'youtube') diff --git a/youtube/watch.py b/youtube/watch.py index 45d658f..429f272 100644 --- a/youtube/watch.py +++ b/youtube/watch.py @@ -11,7 +11,6 @@ import gevent import os import math import traceback -import re import urllib try: @@ -175,101 +174,29 @@ def save_decrypt_cache(): f.write(json.dumps({'version': 1, 'decrypt_cache':decrypt_cache}, indent=4, sort_keys=True)) f.close() -# adapted from youtube-dl and invidious: -# https://github.com/omarroth/invidious/blob/master/src/invidious/helpers/signatures.cr -decrypt_function_re = re.compile(r'function\(a\)\{(a=a\.split\(""\)[^\}]+)\}') -op_with_arg_re = re.compile(r'[^\.]+\.([^\(]+)\(a,(\d+)\)') def decrypt_signatures(info): '''return error string, or False if no errors''' - if ('formats' not in info) or (not info['formats']) or (not info['formats'][0]['s']): - return False # No decryption needed + if not yt_data_extract.requires_decryption(info): + return False + if not info['player_name']: + return 'Could not find player name' if not info['base_js']: return 'Failed to find base.js' - player_name = yt_data_extract.get(info['base_js'].split('/'), -2) - if not player_name: - return 'Could not find player name' + player_name = info['player_name'] if player_name in decrypt_cache: print('Using cached decryption function for: ' + player_name) - decryption_function = decrypt_cache[player_name] + info['decryption_function'] = decrypt_cache[player_name] else: base_js = util.fetch_url(info['base_js'], debug_name='base.js', report_text='Fetched player ' + player_name) base_js = base_js.decode('utf-8') - - decrypt_function_match = decrypt_function_re.search(base_js) - if decrypt_function_match is None: - return 'Could not find decryption function in base.js' - - function_body = decrypt_function_match.group(1).split(';')[1:-1] - if not function_body: - return 'Empty decryption function body' - - var_name = yt_data_extract.get(function_body[0].split('.'), 0) - if var_name is None: - return 'Could not find var_name' - - var_body_match = re.search(r'var ' + re.escape(var_name) + r'=\{(.*?)\};', base_js, 
flags=re.DOTALL) - if var_body_match is None: - return 'Could not find var_body' - - operations = var_body_match.group(1).replace('\n', '').split('},') - if not operations: - return 'Did not find any definitions in var_body' - operations[-1] = operations[-1][:-1] # remove the trailing '}' since we split by '},' on the others - operation_definitions = {} - for op in operations: - colon_index = op.find(':') - opening_brace_index = op.find('{') - - if colon_index == -1 or opening_brace_index == -1: - return 'Could not parse operation' - op_name = op[:colon_index] - op_body = op[opening_brace_index+1:] - if op_body == 'a.reverse()': - operation_definitions[op_name] = 0 - elif op_body == 'a.splice(0,b)': - operation_definitions[op_name] = 1 - elif op_body.startswith('var c=a[0]'): - operation_definitions[op_name] = 2 - else: - return 'Unknown op_body: ' + op_body - - decryption_function = [] - for op_with_arg in function_body: - match = op_with_arg_re.fullmatch(op_with_arg) - if match is None: - return 'Could not parse operation with arg' - op_name = match.group(1) - if op_name not in operation_definitions: - return 'Unknown op_name: ' + op_name - op_argument = match.group(2) - decryption_function.append([operation_definitions[op_name], int(op_argument)]) - - decrypt_cache[player_name] = decryption_function + err = yt_data_extract.extract_decryption_function(info, base_js) + if err: + return err + decrypt_cache[player_name] = info['decryption_function'] save_decrypt_cache() - - for format in info['formats']: - if not format['s'] or not format['sp'] or not format['url']: - print('Warning: s, sp, or url not in format') - continue - - a = list(format['s']) - for op, argument in decryption_function: - if op == 0: - a.reverse() - elif op == 1: - a = a[argument:] - else: - operation_2(a, argument) - - signature = ''.join(a) - format['url'] += '&' + format['sp'] + '=' + signature - return False - -def operation_2(a, b): - c = a[0] - a[0] = a[b % len(a)] - a[b % len(a)] = c + err = yt_data_extract.decrypt_signatures(info) + return err headers = ( ('Accept', '*/*'), diff --git a/youtube/yt_data_extract/__init__.py b/youtube/yt_data_extract/__init__.py index f2f07c0..898141e 100644 --- a/youtube/yt_data_extract/__init__.py +++ b/youtube/yt_data_extract/__init__.py @@ -7,4 +7,5 @@ from .everything_else import (extract_channel_info, extract_search_info, extract_playlist_metadata, extract_playlist_info, extract_comments_info) from .watch_extraction import (extract_watch_info, get_caption_url, - update_with_age_restricted_info) + update_with_age_restricted_info, requires_decryption, + extract_decryption_function, decrypt_signatures) diff --git a/youtube/yt_data_extract/watch_extraction.py b/youtube/yt_data_extract/watch_extraction.py index ff39f62..09abbe3 100644 --- a/youtube/yt_data_extract/watch_extraction.py +++ b/youtube/yt_data_extract/watch_extraction.py @@ -7,6 +7,7 @@ from .common import (get, multi_get, deep_get, multi_deep_get, import json import urllib.parse import traceback +import re # from https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/youtube.py _formats = { @@ -377,7 +378,11 @@ def extract_watch_info(polymer_json): info['base_js'] = deep_get(top_level, 'player', 'assets', 'js') if info['base_js']: info['base_js'] = normalize_url(info['base_js']) + info['player_name'] = get(info['base_js'].split('/'), -2) + else: + info['player_name'] = None + # extract stuff from visible parts of page mobile = 'singleColumnWatchNextResults' in deep_get(top_level, 'response', 
'contents', default={}) if mobile: info.update(_extract_watch_info_mobile(top_level)) @@ -447,3 +452,94 @@ def update_with_age_restricted_info(info, video_info_page): _extract_formats(info, player_response) _extract_playability_error(info, player_response, error_prefix=ERROR_PREFIX) + +def requires_decryption(info): + return ('formats' in info) and info['formats'] and info['formats'][0]['s'] + +# adapted from youtube-dl and invidious: +# https://github.com/omarroth/invidious/blob/master/src/invidious/helpers/signatures.cr +decrypt_function_re = re.compile(r'function\(a\)\{(a=a\.split\(""\)[^\}]+)\}') +op_with_arg_re = re.compile(r'[^\.]+\.([^\(]+)\(a,(\d+)\)') +def extract_decryption_function(info, base_js): + '''Insert decryption function into info. Return error string if not successful. + Decryption function is a list of list[2] of numbers. + It is advisable to cache the decryption function (uniquely identified by info['player_name']) so base.js (1 MB) doesn't need to be redownloaded each time''' + info['decryption_function'] = None + decrypt_function_match = decrypt_function_re.search(base_js) + if decrypt_function_match is None: + return 'Could not find decryption function in base.js' + + function_body = decrypt_function_match.group(1).split(';')[1:-1] + if not function_body: + return 'Empty decryption function body' + + var_name = get(function_body[0].split('.'), 0) + if var_name is None: + return 'Could not find var_name' + + var_body_match = re.search(r'var ' + re.escape(var_name) + r'=\{(.*?)\};', base_js, flags=re.DOTALL) + if var_body_match is None: + return 'Could not find var_body' + + operations = var_body_match.group(1).replace('\n', '').split('},') + if not operations: + return 'Did not find any definitions in var_body' + operations[-1] = operations[-1][:-1] # remove the trailing '}' since we split by '},' on the others + operation_definitions = {} + for op in operations: + colon_index = op.find(':') + opening_brace_index = op.find('{') + + if colon_index == -1 or opening_brace_index == -1: + return 'Could not parse operation' + op_name = op[:colon_index] + op_body = op[opening_brace_index+1:] + if op_body == 'a.reverse()': + operation_definitions[op_name] = 0 + elif op_body == 'a.splice(0,b)': + operation_definitions[op_name] = 1 + elif op_body.startswith('var c=a[0]'): + operation_definitions[op_name] = 2 + else: + return 'Unknown op_body: ' + op_body + + decryption_function = [] + for op_with_arg in function_body: + match = op_with_arg_re.fullmatch(op_with_arg) + if match is None: + return 'Could not parse operation with arg' + op_name = match.group(1) + if op_name not in operation_definitions: + return 'Unknown op_name: ' + op_name + op_argument = match.group(2) + decryption_function.append([operation_definitions[op_name], int(op_argument)]) + + info['decryption_function'] = decryption_function + return False + +def _operation_2(a, b): + c = a[0] + a[0] = a[b % len(a)] + a[b % len(a)] = c + +def decrypt_signatures(info): + '''Applies info['decryption_function'] to decrypt all the signatures. 
Return err.''' + if not info.get('decryption_function'): + return 'decryption_function not in info' + for format in info['formats']: + if not format['s'] or not format['sp'] or not format['url']: + print('Warning: s, sp, or url not in format') + continue + + a = list(format['s']) + for op, argument in info['decryption_function']: + if op == 0: + a.reverse() + elif op == 1: + a = a[argument:] + else: + _operation_2(a, argument) + + signature = ''.join(a) + format['url'] += '&' + format['sp'] + '=' + signature + return False -- cgit v1.2.3
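
A few of the extraction helpers introduced in this series are worth illustrating in isolation.

One subtlety in the caption handling of extract_watch_info: the display name of a manually-created track is recovered from the name= query parameter of the track's baseUrl rather than from a dedicated field. A quick sketch of that parsing step (the url below is a stand-in, not a real timedtext endpoint):

    import urllib.parse

    base_url = 'https://www.youtube.com/api/timedtext?v=xxxxxxxxxxx&lang=en&name=English%20(United%20Kingdom)'
    query = urllib.parse.parse_qs(urllib.parse.urlparse(base_url).query)
    # equivalent to deep_get(query, 'name', 0) in extract_watch_info:
    lang_name = query.get('name', [None])[0]
    print(lang_name)  # English (United Kingdom)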
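
The merge order in extract_watch_info (videoDetails applied with liberal_update, then microformat as a fallback with conservative_update) rests on two small helpers from yt_data_extract.common whose definitions fall outside these patches. Judging purely from how they are used here, they behave roughly like this (a sketch, not the actual definitions):

    def liberal_update(obj, key, value):
        # Later source wins: overwrite whenever the new value is known.
        if value is not None:
            obj[key] = value

    def conservative_update(obj, key, value):
        # Earlier source wins: only fill the key if it is still unknown.
        if obj.get(key) is None:
            obj[key] = value

    info = {}
    liberal_update(info, 'title', 'from videoDetails')
    conservative_update(info, 'title', 'from microformat')  # ignored; already set
    print(info['title'])  # from videoDetails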
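
update_with_age_restricted_info expects video_info_page in the urlencoded get_video_info format, with the player response nested as a json string under the player_response key; parse_qs wraps every value in a list, which is why the code reads deep_get(video_info, 'player_response', 0). A minimal sketch of that shape (the payload and the status field are made up):

    import json
    import urllib.parse

    player_response = {'streamingData': {'formats': [], 'adaptiveFormats': []}}
    video_info_page = ('status=ok&player_response='
                       + urllib.parse.quote(json.dumps(player_response)))

    parsed = urllib.parse.parse_qs(video_info_page)
    print(json.loads(parsed['player_response'][0]))  # round-trips the dict above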
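
The html post-processing helpers consolidated into util are likewise easy to exercise standalone. add_extra_html_info derives the in-app url from the item type and serializes the fields the templates need; URL_ORIGIN depends on configuration, so it is left symbolic here, and the item fields are invented:

    from youtube import util

    item = {
        'type': 'video',
        'id': 'dQw4w9WgXcQ',        # example id
        'title': 'Example video',
        'author': 'Example channel',
        'duration': '3:32',
    }
    util.add_extra_html_info(item)

    # item['url'] == util.URL_ORIGIN + '/watch?v=dQw4w9WgXcQ'
    # item['video_info'] is a json string of id/title/author/duration for the
    # templates; playlist and channel items get '/playlist?list=' and
    # '/channel/' urls instead.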
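
get_caption_url does plain string assembly on top of the _captions_base_url and _manual_caption_language_names fields populated during extraction, so its behavior can be shown directly (the base url is a placeholder):

    info = {
        '_captions_base_url': 'https://www.youtube.com/api/timedtext?v=xxxxxxxxxxx',
        '_manual_caption_language_names': {'en': 'English'},
    }

    # Manual English captions in vtt format; &name= is appended because a
    # track name is stored for 'en'.
    get_caption_url(info, 'en', 'vtt')
    # -> ...timedtext?v=xxxxxxxxxxx&lang=en&fmt=vtt&name=English

    # Automatic (asr) captions, machine-translated from English to Spanish.
    get_caption_url(info, 'en', 'vtt', automatic=True, translation_language='es')
    # -> ...timedtext?v=xxxxxxxxxxx&lang=en&fmt=vtt&kind=asr&tlang=es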
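
Finally, the signature decryption machinery moved into yt_data_extract is self-contained enough to test without a real base.js. The sketch below runs extract_decryption_function and decrypt_signatures against a made-up player script that uses the three recognized operation bodies (a.reverse(), a.splice(0,b), and the swap); the names Xy/AB/CD/EF, the signature, and the url are all invented:

    from youtube import yt_data_extract

    toy_base_js = (
        'var Xy={AB:function(a){a.reverse()},'
        'CD:function(a,b){a.splice(0,b)},'
        'EF:function(a,b){var c=a[0];a[0]=a[b%a.length];a[b%a.length]=c}};'
        'function(a){a=a.split("");Xy.AB(a,1);Xy.CD(a,2);Xy.EF(a,3);return a.join("")}'
    )

    info = {'formats': [{
        's': 'abcdefg',                        # scrambled signature
        'sp': 'sig',                           # url parameter to append it under
        'url': 'https://example.com/videoplayback?id=123',
    }]}
    assert yt_data_extract.requires_decryption(info)

    err = yt_data_extract.extract_decryption_function(info, toy_base_js)
    assert err is False
    print(info['decryption_function'])         # [[0, 1], [1, 2], [2, 3]]

    err = yt_data_extract.decrypt_signatures(info)
    assert err is False
    print(info['formats'][0]['url'])           # ...videoplayback?id=123&sig=bdcea

In a real run the extracted decryption function would be cached under info['player_name'], exactly as the decrypt_signatures wrapper in watch.py does above, so base.js is only refetched when the player changes.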