diff options
Diffstat (limited to 'youtube/channel.py')
-rw-r--r-- | youtube/channel.py | 350 |
1 files changed, 306 insertions, 44 deletions
diff --git a/youtube/channel.py b/youtube/channel.py index 69092d3..81881eb 100644 --- a/youtube/channel.py +++ b/youtube/channel.py @@ -1,6 +1,8 @@ import base64 -from youtube import util, yt_data_extract, local_playlist, subscriptions +from youtube import (util, yt_data_extract, local_playlist, subscriptions, + playlist) from youtube import yt_app +import settings import urllib import json @@ -31,6 +33,132 @@ headers_mobile = ( real_cookie = (('Cookie', 'VISITOR_INFO1_LIVE=8XihrAcN1l4'),) generic_cookie = (('Cookie', 'VISITOR_INFO1_LIVE=ST1Ti53r4fU'),) +# added an extra nesting under the 2nd base64 compared to v4 +# added tab support +# changed offset field to uint id 1 +def channel_ctoken_v5(channel_id, page, sort, tab, view=1): + new_sort = (2 if int(sort) == 1 else 1) + offset = 30*(int(page) - 1) + if tab == 'videos': + tab = 15 + elif tab == 'shorts': + tab = 10 + elif tab == 'streams': + tab = 14 + pointless_nest = proto.string(80226972, + proto.string(2, channel_id) + + proto.string(3, + proto.percent_b64encode( + proto.string(110, + proto.string(3, + proto.string(tab, + proto.string(1, + proto.string(1, + proto.unpadded_b64encode( + proto.string(1, + proto.string(1, + proto.unpadded_b64encode( + proto.string(2, + b"ST:" + + proto.unpadded_b64encode( + proto.uint(1, offset) + ) + ) + ) + ) + ) + ) + ) + # targetId, just needs to be present but + # doesn't need to be correct + + proto.string(2, "63faaff0-0000-23fe-80f0-582429d11c38") + ) + # 1 - newest, 2 - popular + + proto.uint(3, new_sort) + ) + ) + ) + ) + ) + ) + + return base64.urlsafe_b64encode(pointless_nest).decode('ascii') + + +def channel_about_ctoken(channel_id): + return proto.make_protobuf( + ('base64p', + [ + [2, 80226972, + [ + [2, 2, channel_id], + [2, 3, + ('base64p', + [ + [2, 110, + [ + [2, 3, + [ + [2, 19, + [ + [2, 1, b'66b0e9e9-0000-2820-9589-582429a83980'], + ] + ], + ] + ], + ] + ], + ] + ) + ], + ] + ], + ] + ) + ) + + +# https://github.com/user234683/youtube-local/issues/151 +def channel_ctoken_v4(channel_id, page, sort, tab, view=1): + new_sort = (2 if int(sort) == 1 else 1) + offset = str(30*(int(page) - 1)) + pointless_nest = proto.string(80226972, + proto.string(2, channel_id) + + proto.string(3, + proto.percent_b64encode( + proto.string(110, + proto.string(3, + proto.string(15, + proto.string(1, + proto.string(1, + proto.unpadded_b64encode( + proto.string(1, + proto.unpadded_b64encode( + proto.string(2, + b"ST:" + + proto.unpadded_b64encode( + proto.string(2, offset) + ) + ) + ) + ) + ) + ) + # targetId, just needs to be present but + # doesn't need to be correct + + proto.string(2, "63faaff0-0000-23fe-80f0-582429d11c38") + ) + # 1 - newest, 2 - popular + + proto.uint(3, new_sort) + ) + ) + ) + ) + ) + ) + + return base64.urlsafe_b64encode(pointless_nest).decode('ascii') + # SORT: # videos: # Popular - 1 @@ -75,15 +203,15 @@ def channel_ctoken_v2(channel_id, page, sort, tab, view=1): 2: 17254859483345278706, 1: 16570086088270825023, }[int(sort)] - page_token = proto.string(61, proto.unpadded_b64encode( - proto.string(1, proto.uint(1, schema_number) + proto.string( - 2, - proto.string(1, proto.unpadded_b64encode(proto.uint(1, offset))) - )))) + page_token = proto.string(61, proto.unpadded_b64encode(proto.string(1, + proto.uint(1, schema_number) + proto.string(2, + proto.string(1, proto.unpadded_b64encode(proto.uint(1,offset))) + ) + ))) tab = proto.string(2, tab) sort = proto.uint(3, int(sort)) - # page = proto.string(15, str(page) ) + #page = proto.string(15, str(page)) shelf_view = proto.uint(4, 0) view = proto.uint(6, int(view)) @@ -118,8 +246,12 @@ def get_channel_tab(channel_id, page="1", sort=3, tab='videos', view=1, message = 'Got channel tab' if print_status else None if not ctoken: - ctoken = channel_ctoken_v3(channel_id, page, sort, tab, view) + if tab in ('videos', 'shorts', 'streams'): + ctoken = channel_ctoken_v5(channel_id, page, sort, tab, view) + else: + ctoken = channel_ctoken_v3(channel_id, page, sort, tab, view) ctoken = ctoken.replace('=', '%3D') + # Not sure what the purpose of the key is or whether it will change # For now it seems to be constant for the API endpoint, not dependent # on the browsing session or channel @@ -132,7 +264,7 @@ def get_channel_tab(channel_id, page="1", sort=3, tab='videos', view=1, 'hl': 'en', 'gl': 'US', 'clientName': 'WEB', - 'clientVersion': '2.20180830', + 'clientVersion': '2.20240327.00.00', }, }, 'continuation': ctoken, @@ -147,7 +279,8 @@ def get_channel_tab(channel_id, page="1", sort=3, tab='videos', view=1, # cache entries expire after 30 minutes -@cachetools.func.ttl_cache(maxsize=128, ttl=30*60) +number_of_videos_cache = cachetools.TTLCache(128, 30*60) +@cachetools.cached(number_of_videos_cache) def get_number_of_videos_channel(channel_id): if channel_id is None: return 1000 @@ -159,7 +292,7 @@ def get_number_of_videos_channel(channel_id): try: response = util.fetch_url(url, headers_mobile, debug_name='number_of_videos', report_text='Got number of videos') - except urllib.error.HTTPError as e: + except (urllib.error.HTTPError, util.FetchError) as e: traceback.print_exc() print("Couldn't retrieve number of videos") return 1000 @@ -172,18 +305,20 @@ def get_number_of_videos_channel(channel_id): return int(match.group(1).replace(',','')) else: return 0 +def set_cached_number_of_videos(channel_id, num_videos): + @cachetools.cached(number_of_videos_cache) + def dummy_func_using_same_cache(channel_id): + return num_videos + dummy_func_using_same_cache(channel_id) channel_id_re = re.compile(r'videos\.xml\?channel_id=([a-zA-Z0-9_-]{24})"') - - @cachetools.func.lru_cache(maxsize=128) def get_channel_id(base_url): # method that gives the smallest possible response at ~4 kb # needs to be as fast as possible base_url = base_url.replace('https://www', 'https://m') # avoid redirect - response = util.fetch_url( - base_url + '/about?pbj=1', headers_mobile, + response = util.fetch_url(base_url + '/about?pbj=1', headers_mobile, debug_name='get_channel_id', report_text='Got channel id').decode('utf-8') match = channel_id_re.search(response) if match: @@ -191,6 +326,31 @@ def get_channel_id(base_url): return None +metadata_cache = cachetools.LRUCache(128) +@cachetools.cached(metadata_cache) +def get_metadata(channel_id): + base_url = 'https://www.youtube.com/channel/' + channel_id + polymer_json = util.fetch_url(base_url + '/about?pbj=1', + headers_desktop, + debug_name='gen_channel_about', + report_text='Retrieved channel metadata') + info = yt_data_extract.extract_channel_info(json.loads(polymer_json), + 'about', + continuation=False) + return extract_metadata_for_caching(info) +def set_cached_metadata(channel_id, metadata): + @cachetools.cached(metadata_cache) + def dummy_func_using_same_cache(channel_id): + return metadata + dummy_func_using_same_cache(channel_id) +def extract_metadata_for_caching(channel_info): + metadata = {} + for key in ('approx_subscriber_count', 'short_description', 'channel_name', + 'avatar'): + metadata[key] = channel_info[key] + return metadata + + def get_number_of_videos_general(base_url): return get_number_of_videos_channel(get_channel_id(base_url)) @@ -211,7 +371,7 @@ def get_channel_search_json(channel_id, query, page): 'hl': 'en', 'gl': 'US', 'clientName': 'WEB', - 'clientVersion': '2.20180830', + 'clientVersion': '2.20240327.00.00', }, }, 'continuation': ctoken, @@ -229,15 +389,20 @@ def post_process_channel_info(info): info['avatar'] = util.prefix_url(info['avatar']) info['channel_url'] = util.prefix_url(info['channel_url']) for item in info['items']: + item['thumbnail'] = "https://i.ytimg.com/vi/{}/hqdefault.jpg".format(item['id']) util.prefix_urls(item) util.add_extra_html_info(item) + if info['current_tab'] == 'about': + for i, (text, url) in enumerate(info['links']): + if isinstance(url, str) and util.YOUTUBE_URL_RE.fullmatch(url): + info['links'][i] = (text, util.prefix_url(url)) -def get_channel_first_page(base_url=None, channel_id=None): +def get_channel_first_page(base_url=None, tab='videos', channel_id=None): if channel_id: base_url = 'https://www.youtube.com/channel/' + channel_id - return util.fetch_url(base_url + '/videos?pbj=1&view=0', headers_desktop, - debug_name='gen_channel_videos') + return util.fetch_url(base_url + '/' + tab + '?pbj=1&view=0', + headers_desktop, debug_name='gen_channel_' + tab) playlist_sort_codes = {'2': "da", '3': "dd", '4': "lad"} @@ -246,63 +411,159 @@ playlist_sort_codes = {'2': "da", '3': "dd", '4': "lad"} # youtube.com/user/[username]/[tab] # youtube.com/c/[custom]/[tab] # youtube.com/[custom]/[tab] - - def get_channel_page_general_url(base_url, tab, request, channel_id=None): page_number = int(request.args.get('page', 1)) - sort = request.args.get('sort', '3') + # sort 1: views + # sort 2: oldest + # sort 3: newest + # sort 4: newest - no shorts (Just a kludge on our end, not internal to yt) + default_sort = '3' if settings.include_shorts_in_channel else '4' + sort = request.args.get('sort', default_sort) view = request.args.get('view', '1') query = request.args.get('query', '') ctoken = request.args.get('ctoken', '') - default_params = (page_number == 1 and sort == '3' and view == '1') - - if tab == 'videos' and channel_id and not default_params: - tasks = ( - gevent.spawn(get_number_of_videos_channel, channel_id), - gevent.spawn(get_channel_tab, channel_id, page_number, sort, - 'videos', view, ctoken) - ) - gevent.joinall(tasks) - util.check_gevent_exceptions(*tasks) - number_of_videos, polymer_json = tasks[0].value, tasks[1].value - elif tab == 'videos': + include_shorts = (sort != '4') + default_params = (page_number == 1 and sort in ('3', '4') and view == '1') + continuation = bool(ctoken) # whether or not we're using a continuation + page_size = 30 + try_channel_api = True + polymer_json = None + + # Use the special UU playlist which contains all the channel's uploads + if tab == 'videos' and sort in ('3', '4'): + if not channel_id: + channel_id = get_channel_id(base_url) + if page_number == 1 and include_shorts: + tasks = ( + gevent.spawn(playlist.playlist_first_page, + 'UU' + channel_id[2:], + report_text='Retrieved channel videos'), + gevent.spawn(get_metadata, channel_id), + ) + gevent.joinall(tasks) + util.check_gevent_exceptions(*tasks) + + # Ignore the metadata for now, it is cached and will be + # recalled later + pl_json = tasks[0].value + pl_info = yt_data_extract.extract_playlist_info(pl_json) + number_of_videos = pl_info['metadata']['video_count'] + if number_of_videos is None: + number_of_videos = 1000 + else: + set_cached_number_of_videos(channel_id, number_of_videos) + else: + tasks = ( + gevent.spawn(playlist.get_videos, 'UU' + channel_id[2:], + page_number, include_shorts=include_shorts), + gevent.spawn(get_metadata, channel_id), + gevent.spawn(get_number_of_videos_channel, channel_id), + ) + gevent.joinall(tasks) + util.check_gevent_exceptions(*tasks) + + pl_json = tasks[0].value + pl_info = yt_data_extract.extract_playlist_info(pl_json) + number_of_videos = tasks[2].value + + info = pl_info + info['channel_id'] = channel_id + info['current_tab'] = 'videos' + if info['items']: # Success + page_size = 100 + try_channel_api = False + else: # Try the first-page method next + try_channel_api = True + + # Use the regular channel API + if tab in ('shorts', 'streams') or (tab=='videos' and try_channel_api): if channel_id: num_videos_call = (get_number_of_videos_channel, channel_id) else: num_videos_call = (get_number_of_videos_general, base_url) + + # Use ctoken method, which YouTube changes all the time + if channel_id and not default_params: + if sort == 4: + _sort = 3 + else: + _sort = sort + page_call = (get_channel_tab, channel_id, page_number, _sort, + tab, view, ctoken) + # Use the first-page method, which won't break + else: + page_call = (get_channel_first_page, base_url, tab) + tasks = ( gevent.spawn(*num_videos_call), - gevent.spawn(get_channel_first_page, base_url=base_url), + gevent.spawn(*page_call), ) gevent.joinall(tasks) util.check_gevent_exceptions(*tasks) number_of_videos, polymer_json = tasks[0].value, tasks[1].value + elif tab == 'about': - polymer_json = util.fetch_url(base_url + '/about?pbj=1', headers_desktop, debug_name='gen_channel_about') + # polymer_json = util.fetch_url(base_url + '/about?pbj=1', headers_desktop, debug_name='gen_channel_about') + channel_id = get_channel_id(base_url) + ctoken = channel_about_ctoken(channel_id) + polymer_json = util.call_youtube_api('web', 'browse', { + 'continuation': ctoken, + }) + continuation=True elif tab == 'playlists' and page_number == 1: polymer_json = util.fetch_url(base_url+ '/playlists?pbj=1&view=1&sort=' + playlist_sort_codes[sort], headers_desktop, debug_name='gen_channel_playlists') elif tab == 'playlists': polymer_json = get_channel_tab(channel_id, page_number, sort, 'playlists', view) + continuation = True elif tab == 'search' and channel_id: polymer_json = get_channel_search_json(channel_id, query, page_number) elif tab == 'search': url = base_url + '/search?pbj=1&query=' + urllib.parse.quote(query, safe='') polymer_json = util.fetch_url(url, headers_desktop, debug_name='gen_channel_search') + elif tab == 'videos': + pass else: flask.abort(404, 'Unknown channel tab: ' + tab) - info = yt_data_extract.extract_channel_info(json.loads(polymer_json), tab) + if polymer_json is not None: + info = yt_data_extract.extract_channel_info( + json.loads(polymer_json), tab, continuation=continuation + ) + if info['error'] is not None: return flask.render_template('error.html', error_message=info['error']) - post_process_channel_info(info) - if tab == 'videos': + if channel_id: + info['channel_url'] = 'https://www.youtube.com/channel/' + channel_id + info['channel_id'] = channel_id + else: + channel_id = info['channel_id'] + + # Will have microformat present, cache metadata while we have it + if channel_id and default_params and tab not in ('videos', 'about'): + metadata = extract_metadata_for_caching(info) + set_cached_metadata(channel_id, metadata) + # Otherwise, populate with our (hopefully cached) metadata + elif channel_id and info.get('channel_name') is None: + metadata = get_metadata(channel_id) + for key, value in metadata.items(): + yt_data_extract.conservative_update(info, key, value) + # need to add this metadata to the videos/playlists + additional_info = { + 'author': info['channel_name'], + 'author_id': info['channel_id'], + 'author_url': info['channel_url'], + } + for item in info['items']: + item.update(additional_info) + + if tab in ('videos', 'shorts', 'streams'): info['number_of_videos'] = number_of_videos - info['number_of_pages'] = math.ceil(number_of_videos/30) + info['number_of_pages'] = math.ceil(number_of_videos/page_size) info['header_playlist_names'] = local_playlist.get_playlist_names() - if tab in ('videos', 'playlists'): + if tab in ('videos', 'shorts', 'streams', 'playlists'): info['current_sort'] = sort elif tab == 'search': info['search_box_value'] = query @@ -311,9 +572,10 @@ def get_channel_page_general_url(base_url, tab, request, channel_id=None): info['page_number'] = page_number info['subscribed'] = subscriptions.is_subscribed(info['channel_id']) - return flask.render_template( - 'channel.html', - parameters_dictionary=request.args, + post_process_channel_info(info) + + return flask.render_template('channel.html', + parameters_dictionary = request.args, **info ) |