diff options
Diffstat (limited to 'youtube')
-rw-r--r-- | youtube/channel.py | 52 | ||||
-rw-r--r-- | youtube/proto.py | 11 | ||||
-rw-r--r-- | youtube/templates/channel.html | 11 | ||||
-rw-r--r-- | youtube/util.py | 79 | ||||
-rw-r--r-- | youtube/watch.py | 66 | ||||
-rw-r--r-- | youtube/yt_data_extract/common.py | 7 | ||||
-rw-r--r-- | youtube/yt_data_extract/everything_else.py | 95 |
7 files changed, 231 insertions, 90 deletions
diff --git a/youtube/channel.py b/youtube/channel.py index 75b0a15..e177c38 100644 --- a/youtube/channel.py +++ b/youtube/channel.py @@ -84,6 +84,40 @@ def channel_ctoken_v5(channel_id, page, sort, tab, view=1): return base64.urlsafe_b64encode(pointless_nest).decode('ascii') + +def channel_about_ctoken(channel_id): + return proto.make_protobuf( + ('base64p', + [ + [2, 80226972, + [ + [2, 2, channel_id], + [2, 3, + ('base64p', + [ + [2, 110, + [ + [2, 3, + [ + [2, 19, + [ + [2, 1, b'66b0e9e9-0000-2820-9589-582429a83980'], + ] + ], + ] + ], + ] + ], + ] + ) + ], + ] + ], + ] + ) + ) + + # https://github.com/user234683/youtube-local/issues/151 def channel_ctoken_v4(channel_id, page, sort, tab, view=1): new_sort = (2 if int(sort) == 1 else 1) @@ -359,7 +393,7 @@ def post_process_channel_info(info): util.add_extra_html_info(item) if info['current_tab'] == 'about': for i, (text, url) in enumerate(info['links']): - if util.YOUTUBE_URL_RE.fullmatch(url): + if isinstance(url, str) and util.YOUTUBE_URL_RE.fullmatch(url): info['links'][i] = (text, util.prefix_url(url)) @@ -469,7 +503,13 @@ def get_channel_page_general_url(base_url, tab, request, channel_id=None): number_of_videos, polymer_json = tasks[0].value, tasks[1].value elif tab == 'about': - polymer_json = util.fetch_url(base_url + '/about?pbj=1', headers_desktop, debug_name='gen_channel_about') + # polymer_json = util.fetch_url(base_url + '/about?pbj=1', headers_desktop, debug_name='gen_channel_about') + channel_id = get_channel_id(base_url) + ctoken = channel_about_ctoken(channel_id) + polymer_json = util.call_youtube_api('web', 'browse', { + 'continuation': ctoken, + }) + continuation=True elif tab == 'playlists' and page_number == 1: polymer_json = util.fetch_url(base_url+ '/playlists?pbj=1&view=1&sort=' + playlist_sort_codes[sort], headers_desktop, debug_name='gen_channel_playlists') elif tab == 'playlists': @@ -491,6 +531,9 @@ def get_channel_page_general_url(base_url, tab, request, channel_id=None): json.loads(polymer_json), tab, continuation=continuation ) + if info['error'] is not None: + return flask.render_template('error.html', error_message=info['error']) + if channel_id: info['channel_url'] = 'https://www.youtube.com/channel/' + channel_id info['channel_id'] = channel_id @@ -498,7 +541,7 @@ def get_channel_page_general_url(base_url, tab, request, channel_id=None): channel_id = info['channel_id'] # Will have microformat present, cache metadata while we have it - if channel_id and default_params and tab != 'videos': + if channel_id and default_params and tab not in ('videos', 'about'): metadata = extract_metadata_for_caching(info) set_cached_metadata(channel_id, metadata) # Otherwise, populate with our (hopefully cached) metadata @@ -515,9 +558,6 @@ def get_channel_page_general_url(base_url, tab, request, channel_id=None): for item in info['items']: item.update(additional_info) - if info['error'] is not None: - return flask.render_template('error.html', error_message = info['error']) - if tab in ('videos', 'shorts', 'streams'): info['number_of_videos'] = number_of_videos info['number_of_pages'] = math.ceil(number_of_videos/page_size) diff --git a/youtube/proto.py b/youtube/proto.py index d8b1fcd..924e983 100644 --- a/youtube/proto.py +++ b/youtube/proto.py @@ -141,6 +141,17 @@ base64_enc_funcs = { def _make_protobuf(data): + ''' + Input: Recursive list of protobuf objects or base-64 encodings + Output: Protobuf bytestring + Each protobuf object takes the form [wire_type, field_number, field_data] + If a string protobuf has a list/tuple of length 2, this has the form + (base64 type, data) + The base64 types are + - base64 means a base64 encode with equals sign paddings + - base64s means a base64 encode without padding + - base64p means a url base64 encode with equals signs replaced with %3D + ''' # must be dict mapping field_number to [wire_type, value] if isinstance(data, dict): new_data = [] diff --git a/youtube/templates/channel.html b/youtube/templates/channel.html index 8d2249c..c43f488 100644 --- a/youtube/templates/channel.html +++ b/youtube/templates/channel.html @@ -51,8 +51,11 @@ <ul> {% for (before_text, stat, after_text) in [ ('Joined ', date_joined, ''), - ('', view_count|commatize, ' views'), + ('', approx_view_count, ' views'), ('', approx_subscriber_count, ' subscribers'), + ('', approx_video_count, ' videos'), + ('Country: ', country, ''), + ('Canonical Url: ', canonical_url, ''), ] %} {% if stat %} <li>{{ before_text + stat|string + after_text }}</li> @@ -65,7 +68,11 @@ <hr> <ul> {% for text, url in links %} - <li><a href="{{ url }}">{{ text }}</a></li> + {% if url %} + <li><a href="{{ url }}">{{ text }}</a></li> + {% else %} + <li>{{ text }}</li> + {% endif %} {% endfor %} </ul> </div> diff --git a/youtube/util.py b/youtube/util.py index 5e4af55..dd38109 100644 --- a/youtube/util.py +++ b/youtube/util.py @@ -665,6 +665,85 @@ def to_valid_filename(name): return name +# https://github.com/yt-dlp/yt-dlp/blob/master/yt_dlp/extractor/youtube.py#L72 +INNERTUBE_CLIENTS = { + 'android': { + 'INNERTUBE_API_KEY': 'AIzaSyA8eiZmM1FaDVjRy-df2KTyQ_vz_yYM39w', + 'INNERTUBE_CONTEXT': { + 'client': { + 'hl': 'en', + 'gl': 'US', + 'clientName': 'ANDROID', + 'clientVersion': '17.31.35', + 'osName': 'Android', + 'osVersion': '12', + 'androidSdkVersion': 31, + 'userAgent': 'com.google.android.youtube/17.31.35 (Linux; U; Android 12) gzip' + }, + # https://github.com/yt-dlp/yt-dlp/pull/575#issuecomment-887739287 + #'thirdParty': { + # 'embedUrl': 'https://google.com', # Can be any valid URL + #} + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 3, + 'REQUIRE_JS_PLAYER': False, + }, + + # This client can access age restricted videos (unless the uploader has disabled the 'allow embedding' option) + # See: https://github.com/zerodytrash/YouTube-Internal-Clients + 'tv_embedded': { + 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + 'INNERTUBE_CONTEXT': { + 'client': { + 'hl': 'en', + 'gl': 'US', + 'clientName': 'TVHTML5_SIMPLY_EMBEDDED_PLAYER', + 'clientVersion': '2.0', + }, + # https://github.com/yt-dlp/yt-dlp/pull/575#issuecomment-887739287 + 'thirdParty': { + 'embedUrl': 'https://google.com', # Can be any valid URL + } + + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 85, + 'REQUIRE_JS_PLAYER': True, + }, + + 'web': { + 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'WEB', + 'clientVersion': '2.20220801.00.00', + 'userAgent': desktop_user_agent, + } + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 1 + }, +} + + +def call_youtube_api(client, api, data): + client_params = INNERTUBE_CLIENTS[client] + context = client_params['INNERTUBE_CONTEXT'] + key = client_params['INNERTUBE_API_KEY'] + host = client_params.get('INNERTUBE_HOST') or 'www.youtube.com' + user_agent = context['client'].get('userAgent') or mobile_user_agent + + url = 'https://' + host + '/youtubei/v1/' + api + '?key=' + key + data['context'] = context + + data = json.dumps(data) + headers = (('Content-Type', 'application/json'),('User-Agent', user_agent)) + response = fetch_url( + url, data=data, headers=headers, + debug_name='youtubei_' + api + '_' + client, + report_text='Fetched ' + client + ' youtubei ' + api + ).decode('utf-8') + return response + + def strip_non_ascii(string): ''' Returns the string without non ASCII characters''' stripped = (c for c in string if 0 < ord(c) < 127) diff --git a/youtube/watch.py b/youtube/watch.py index 04c52a1..0d03250 100644 --- a/youtube/watch.py +++ b/youtube/watch.py @@ -19,51 +19,6 @@ from urllib.parse import parse_qs, urlencode from types import SimpleNamespace from math import ceil -# https://github.com/yt-dlp/yt-dlp/blob/master/yt_dlp/extractor/youtube.py#L72 -INNERTUBE_CLIENTS = { - 'android': { - 'INNERTUBE_API_KEY': 'AIzaSyA8eiZmM1FaDVjRy-df2KTyQ_vz_yYM39w', - 'INNERTUBE_CONTEXT': { - 'client': { - 'hl': 'en', - 'gl': 'US', - 'clientName': 'ANDROID', - 'clientVersion': '17.31.35', - 'osName': 'Android', - 'osVersion': '12', - 'androidSdkVersion': 31, - 'userAgent': 'com.google.android.youtube/17.31.35 (Linux; U; Android 12) gzip' - }, - # https://github.com/yt-dlp/yt-dlp/pull/575#issuecomment-887739287 - #'thirdParty': { - # 'embedUrl': 'https://google.com', # Can be any valid URL - #} - }, - 'INNERTUBE_CONTEXT_CLIENT_NAME': 3, - 'REQUIRE_JS_PLAYER': False, - }, - - # This client can access age restricted videos (unless the uploader has disabled the 'allow embedding' option) - # See: https://github.com/zerodytrash/YouTube-Internal-Clients - 'tv_embedded': { - 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', - 'INNERTUBE_CONTEXT': { - 'client': { - 'hl': 'en', - 'gl': 'US', - 'clientName': 'TVHTML5_SIMPLY_EMBEDDED_PLAYER', - 'clientVersion': '2.0', - }, - # https://github.com/yt-dlp/yt-dlp/pull/575#issuecomment-887739287 - 'thirdParty': { - 'embedUrl': 'https://google.com', # Can be any valid URL - } - - }, - 'INNERTUBE_CONTEXT_CLIENT_NAME': 85, - 'REQUIRE_JS_PLAYER': True, - }, -} try: with open(os.path.join(settings.data_dir, 'decrypt_function_cache.json'), 'r') as f: @@ -386,26 +341,11 @@ def _add_to_error(info, key, additional_message): def fetch_player_response(client, video_id): - client_params = INNERTUBE_CLIENTS[client] - context = client_params['INNERTUBE_CONTEXT'] - key = client_params['INNERTUBE_API_KEY'] - host = client_params.get('INNERTUBE_HOST') or 'www.youtube.com' - user_agent = context['client'].get('userAgent') or util.mobile_user_agent - - url = 'https://' + host + '/youtubei/v1/player?key=' + key - data = { + return util.call_youtube_api(client, 'player', { 'videoId': video_id, - 'context': context, 'params': 'CgIQBg', - } - data = json.dumps(data) - headers = (('Content-Type', 'application/json'),('User-Agent', user_agent)) - player_response = util.fetch_url( - url, data=data, headers=headers, - debug_name='youtubei_player_' + client, - report_text='Fetched ' + client + ' youtubei player' - ).decode('utf-8') - return player_response + }) + def fetch_watch_page_info(video_id, playlist_id, index): # bpctr=9999999999 will bypass are-you-sure dialogs for controversial diff --git a/youtube/yt_data_extract/common.py b/youtube/yt_data_extract/common.py index e7b31b7..7903db5 100644 --- a/youtube/yt_data_extract/common.py +++ b/youtube/yt_data_extract/common.py @@ -185,7 +185,7 @@ def extract_int(string, default=None, whole_word=True): return default def extract_approx_int(string): - '''e.g. "15.1M" from "15.1M subscribers"''' + '''e.g. "15.1M" from "15.1M subscribers" or '4,353' from 4353''' if not isinstance(string, str): string = extract_str(string) if not string: @@ -193,7 +193,10 @@ def extract_approx_int(string): match = re.search(r'\b(\d+(?:\.\d+)?[KMBTkmbt]?)\b', string.replace(',', '')) if match is None: return None - return match.group(1) + result = match.group(1) + if re.fullmatch(r'\d+', result): + result = '{:,}'.format(int(result)) + return result MONTH_ABBREVIATIONS = {'jan':'1', 'feb':'2', 'mar':'3', 'apr':'4', 'may':'5', 'jun':'6', 'jul':'7', 'aug':'8', 'sep':'9', 'oct':'10', 'nov':'11', 'dec':'12'} def extract_date(date_text): diff --git a/youtube/yt_data_extract/everything_else.py b/youtube/yt_data_extract/everything_else.py index 7740c67..0f64649 100644 --- a/youtube/yt_data_extract/everything_else.py +++ b/youtube/yt_data_extract/everything_else.py @@ -85,23 +85,84 @@ def extract_channel_info(polymer_json, tab, continuation=False): if tab in ('search', 'playlists'): info['is_last_page'] = (ctoken is None) elif tab == 'about': - items, _ = extract_items(response, item_types={'channelAboutFullMetadataRenderer'}) - if not items: - info['error'] = 'Could not find channelAboutFullMetadataRenderer' - return info - channel_metadata = items[0]['channelAboutFullMetadataRenderer'] - - info['links'] = [] - for link_json in channel_metadata.get('primaryLinks', ()): - url = remove_redirect(deep_get(link_json, 'navigationEndpoint', 'urlEndpoint', 'url')) - if not (url.startswith('http://') or url.startswith('https://')): - url = 'http://' + url - text = extract_str(link_json.get('title')) - info['links'].append( (text, url) ) - - info['date_joined'] = extract_date(channel_metadata.get('joinedDateText')) - info['view_count'] = extract_int(channel_metadata.get('viewCountText')) - info['description'] = extract_str(channel_metadata.get('description'), default='') + # Latest type + items, _ = extract_items(response, item_types={'aboutChannelRenderer'}) + if items: + a_metadata = deep_get(items, 0, 'aboutChannelRenderer', + 'metadata', 'aboutChannelViewModel') + if not a_metadata: + info['error'] = 'Could not find aboutChannelViewModel' + return info + + info['links'] = [] + for link_outer in a_metadata.get('links', ()): + link = link_outer.get('channelExternalLinkViewModel') or {} + link_content = extract_str(deep_get(link, 'link', 'content')) + for run in deep_get(link, 'link', 'commandRuns') or (): + url = remove_redirect(deep_get(run, 'onTap', + 'innertubeCommand', 'urlEndpoint', 'url')) + if url and not (url.startswith('http://') + or url.startswith('https://')): + url = 'https://' + url + if link_content is None or (link_content in url): + break + else: # didn't break + url = link_content + if url and not (url.startswith('http://') + or url.startswith('https://')): + url = 'https://' + url + text = extract_str(deep_get(link, 'title', 'content')) + info['links'].append( (text, url) ) + + info['date_joined'] = extract_date( + a_metadata.get('joinedDateText') + ) + info['view_count'] = extract_int(a_metadata.get('viewCountText')) + info['approx_view_count'] = extract_approx_int( + a_metadata.get('viewCountText') + ) + info['description'] = extract_str( + a_metadata.get('description'), default='' + ) + info['approx_video_count'] = extract_approx_int( + a_metadata.get('videoCountText') + ) + info['approx_subscriber_count'] = extract_approx_int( + a_metadata.get('subscriberCountText') + ) + info['country'] = extract_str(a_metadata.get('country')) + info['canonical_url'] = extract_str( + a_metadata.get('canonicalChannelUrl') + ) + + # Old type + else: + items, _ = extract_items(response, + item_types={'channelAboutFullMetadataRenderer'}) + if not items: + info['error'] = 'Could not find aboutChannelRenderer or channelAboutFullMetadataRenderer' + return info + a_metadata = items[0]['channelAboutFullMetadataRenderer'] + + info['links'] = [] + for link_json in a_metadata.get('primaryLinks', ()): + url = remove_redirect(deep_get(link_json, 'navigationEndpoint', + 'urlEndpoint', 'url')) + if url and not (url.startswith('http://') + or url.startswith('https://')): + url = 'https://' + url + text = extract_str(link_json.get('title')) + info['links'].append( (text, url) ) + + info['date_joined'] = extract_date(a_metadata.get('joinedDateText')) + info['view_count'] = extract_int(a_metadata.get('viewCountText')) + info['description'] = extract_str(a_metadata.get( + 'description'), default='') + + info['approx_video_count'] = None + info['approx_subscriber_count'] = None + info['country'] = None + info['canonical_url'] = None else: raise NotImplementedError('Unknown or unsupported channel tab: ' + tab) |