diff options
-rw-r--r-- | youtube/channel.py | 39 | ||||
-rw-r--r-- | youtube/templates/channel.html | 8 | ||||
-rw-r--r-- | youtube/yt_data_extract/common.py | 47 | ||||
-rw-r--r-- | youtube/yt_data_extract/everything_else.py | 2 |
4 files changed, 75 insertions, 21 deletions
diff --git a/youtube/channel.py b/youtube/channel.py index 4cf6cdf..5c757d3 100644 --- a/youtube/channel.py +++ b/youtube/channel.py @@ -32,16 +32,23 @@ real_cookie = (('Cookie', 'VISITOR_INFO1_LIVE=8XihrAcN1l4'),) generic_cookie = (('Cookie', 'VISITOR_INFO1_LIVE=ST1Ti53r4fU'),) # added an extra nesting under the 2nd base64 compared to v4 +# added tab support def channel_ctoken_v5(channel_id, page, sort, tab, view=1): new_sort = (2 if int(sort) == 1 else 1) offset = str(30*(int(page) - 1)) + if tab == 'videos': + tab = 15 + elif tab == 'shorts': + tab = 10 + elif tab == 'streams': + tab = 14 pointless_nest = proto.string(80226972, proto.string(2, channel_id) + proto.string(3, proto.percent_b64encode( proto.string(110, proto.string(3, - proto.string(15, + proto.string(tab, proto.string(1, proto.string(1, proto.unpadded_b64encode( @@ -167,7 +174,7 @@ def channel_ctoken_v2(channel_id, page, sort, tab, view=1): tab = proto.string(2, tab) sort = proto.uint(3, int(sort)) - # page = proto.string(15, str(page) ) + #page = proto.string(15, str(page)) shelf_view = proto.uint(4, 0) view = proto.uint(6, int(view)) @@ -202,7 +209,7 @@ def get_channel_tab(channel_id, page="1", sort=3, tab='videos', view=1, message = 'Got channel tab' if print_status else None if not ctoken: - if tab == 'videos': + if tab in ('videos', 'shorts', 'streams'): ctoken = channel_ctoken_v5(channel_id, page, sort, tab, view) else: ctoken = channel_ctoken_v3(channel_id, page, sort, tab, view) @@ -349,11 +356,11 @@ def post_process_channel_info(info): info['links'][i] = (text, util.prefix_url(url)) -def get_channel_first_page(base_url=None, channel_id=None): +def get_channel_first_page(base_url=None, channel_id=None, tab='videos'): if channel_id: base_url = 'https://www.youtube.com/channel/' + channel_id - return util.fetch_url(base_url + '/videos?pbj=1&view=0', headers_desktop, - debug_name='gen_channel_videos') + return util.fetch_url(base_url + '/' + tab + '?pbj=1&view=0', + headers_desktop, debug_name='gen_channel_' + tab) playlist_sort_codes = {'2': "da", '3': "dd", '4': "lad"} @@ -374,24 +381,25 @@ def get_channel_page_general_url(base_url, tab, request, channel_id=None): default_params = (page_number == 1 and sort == '3' and view == '1') continuation = bool(ctoken) # whether or not we're using a continuation - if tab == 'videos' and channel_id and not default_params: + if (tab in ('videos', 'shorts', 'streams') and channel_id and + not default_params): tasks = ( gevent.spawn(get_number_of_videos_channel, channel_id), gevent.spawn(get_channel_tab, channel_id, page_number, sort, - 'videos', view, ctoken) + tab, view, ctoken) ) gevent.joinall(tasks) util.check_gevent_exceptions(*tasks) number_of_videos, polymer_json = tasks[0].value, tasks[1].value continuation = True - elif tab == 'videos': + elif tab in ('videos', 'shorts', 'streams'): if channel_id: num_videos_call = (get_number_of_videos_channel, channel_id) else: num_videos_call = (get_number_of_videos_general, base_url) tasks = ( gevent.spawn(*num_videos_call), - gevent.spawn(get_channel_first_page, base_url=base_url), + gevent.spawn(get_channel_first_page, base_url=base_url, tab=tab), ) gevent.joinall(tasks) util.check_gevent_exceptions(*tasks) @@ -440,13 +448,13 @@ def get_channel_page_general_url(base_url, tab, request, channel_id=None): item.update(additional_info) if info['error'] is not None: - return flask.render_template('error.html', error_message=info['error']) + return flask.render_template('error.html', error_message = info['error']) - if tab == 'videos': + if tab in ('videos', 'shorts', 'streams'): info['number_of_videos'] = number_of_videos info['number_of_pages'] = math.ceil(number_of_videos/30) info['header_playlist_names'] = local_playlist.get_playlist_names() - if tab in ('videos', 'playlists'): + if tab in ('videos', 'shorts', 'streams', 'playlists'): info['current_sort'] = sort elif tab == 'search': info['search_box_value'] = query @@ -457,9 +465,8 @@ def get_channel_page_general_url(base_url, tab, request, channel_id=None): post_process_channel_info(info) - return flask.render_template( - 'channel.html', - parameters_dictionary=request.args, + return flask.render_template('channel.html', + parameters_dictionary = request.args, **info ) diff --git a/youtube/templates/channel.html b/youtube/templates/channel.html index 6266aab..b86cd54 100644 --- a/youtube/templates/channel.html +++ b/youtube/templates/channel.html @@ -33,7 +33,7 @@ <hr/> <nav class="channel-tabs"> - {% for tab_name in ('Videos', 'Playlists', 'About') %} + {% for tab_name in ('Videos', 'Shorts', 'Streams', 'Playlists', 'About') %} {% if tab_name.lower() == current_tab %} <a class="tab page-button">{{ tab_name }}</a> {% else %} @@ -73,7 +73,7 @@ <!-- new--> <div id="links-metadata"> - {% if current_tab == 'videos' %} + {% if current_tab in ('videos', 'shorts', 'streams') %} {% set sorts = [('1', 'views'), ('2', 'oldest'), ('3', 'newest')] %} <div id="number-of-results">{{ number_of_videos }} videos</div> {% elif current_tab == 'playlists' %} @@ -110,11 +110,11 @@ <hr/> <footer class="pagination-container"> - {% if current_tab == 'videos' and current_sort.__str__() == '2' %} + {% if (current_tab in ('videos', 'shorts', 'streams')) and current_sort.__str__() == '2' %} <nav class="next-previous-button-row"> {{ common_elements.next_previous_ctoken_buttons(None, ctoken, channel_url + '/' + current_tab, parameters_dictionary) }} </nav> - {% elif current_tab == 'videos' %} + {% elif current_tab in ('videos', 'shorts', 'streams') %} <nav class="pagination-list"> {{ common_elements.page_buttons(number_of_pages, channel_url + '/' + current_tab, parameters_dictionary, include_ends=(current_sort.__str__() == '3')) }} </nav> diff --git a/youtube/yt_data_extract/common.py b/youtube/yt_data_extract/common.py index fcefbf7..5680b16 100644 --- a/youtube/yt_data_extract/common.py +++ b/youtube/yt_data_extract/common.py @@ -249,6 +249,9 @@ def extract_item_info(item, additional_info={}): primary_type = type_parts[-2] if primary_type == 'video': info['type'] = 'video' + elif type_parts[0] == 'reel': # shorts + info['type'] = 'video' + primary_type = 'short' elif primary_type in ('playlist', 'radio', 'show'): info['type'] = 'playlist' info['playlist_type'] = primary_type @@ -343,6 +346,48 @@ def extract_item_info(item, additional_info={}): else: info['index'] = None + elif primary_type == 'short': + info['id'] = item.get('videoId') + if not info['id']: + info['id'] = deep_get(item,'navigationEndpoint', + 'reelWatchEndpoint', 'videoId') + info['approx_view_count'] = extract_approx_int(item.get('viewCountText')) + + # handle case where it is "No views" + if not info['approx_view_count']: + if ('No views' in item.get('shortViewCountText', '') + or 'no views' in accessibility_label.lower()): + info['view_count'] = 0 + info['approx_view_count'] = '0' + + # dig into accessibility data to get duration for shorts + accessibility_label = multi_deep_get(item, + ['accessibility', 'accessibilityData', 'label'], + default='') + + duration = re.search(r'(\d+) (second|seconds|minute) - play video', + accessibility_label) + if duration.group(2) == 'minute': + info['duration'] = "1:00" + else: + info['duration'] = "0:" + duration.group(1).zfill(2) + + # if it's an item in a playlist, get its index + if 'index' in item: # url has wrong index on playlist page + info['index'] = extract_int(item.get('index')) + elif 'indexText' in item: + # Current item in playlist has ▶ instead of the actual index, must + # dig into url + match = re.search(r'index=(\d+)', deep_get(item, + 'navigationEndpoint', 'commandMetadata', 'webCommandMetadata', + 'url', default='')) + if match is None: # worth a try then + info['index'] = extract_int(item.get('indexText')) + else: + info['index'] = int(match.group(1)) + else: + info['index'] = None + elif primary_type in ('playlist', 'radio'): info['id'] = item.get('playlistId') info['video_count'] = extract_int(item.get('videoCount')) @@ -398,6 +443,8 @@ _item_types = { 'gridVideoRenderer', 'playlistVideoRenderer', + 'reelItemRenderer', + 'playlistRenderer', 'compactPlaylistRenderer', 'gridPlaylistRenderer', diff --git a/youtube/yt_data_extract/everything_else.py b/youtube/yt_data_extract/everything_else.py index 9a6e31a..745d08f 100644 --- a/youtube/yt_data_extract/everything_else.py +++ b/youtube/yt_data_extract/everything_else.py @@ -73,7 +73,7 @@ def extract_channel_info(polymer_json, tab, continuation=False): #if 'contents' not in response and 'continuationContents' not in response: # return info - if tab in ('videos', 'playlists', 'search'): + if tab in ('videos', 'shorts', 'streams', 'playlists', 'search'): items, ctoken = extract_items(response) additional_info = { 'author': info['channel_name'], |