From a61ba6b8f45d94bf8e89a9f351c5c6cac2379387 Mon Sep 17 00:00:00 2001
From: James Taylor
Date: Sat, 16 Feb 2019 15:21:39 -0800
Subject: playlist: fix error when there's no description

---
 youtube/playlist.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/youtube/playlist.py b/youtube/playlist.py
index cc0da33..419ed00 100644
--- a/youtube/playlist.py
+++ b/youtube/playlist.py
@@ -2,13 +2,14 @@
 import base64
 import youtube.common as common
 import urllib
 import json
-from string import Template
+import string
+from youtube import template
 import youtube.proto as proto
 import gevent
 import math
 
 with open("yt_playlist_template.html", "r") as file:
-    yt_playlist_template = Template(file.read())
+    yt_playlist_template = template.Template(file.read())
 
@@ -76,7 +77,7 @@ def get_videos(playlist_id, page):
     return info
 
-playlist_stat_template = Template('''
+playlist_stat_template = string.Template('''
$stat
 ''')
 
 def get_playlist_page(env, start_response):
     start_response('200 OK', [('Content-type','text/html'),])
-- 
cgit v1.2.3


From b32330be4f15dd044e6212f526e52375f0a0f6c2 Mon Sep 17 00:00:00 2001
From: James Taylor
Date: Thu, 21 Feb 2019 21:32:31 -0800
Subject: refactor common.py into 3 files

---
 youtube/accounts.py        |  18 +-
 youtube/channel.py         |  64 ++--
 youtube/comments.py        |  59 ++--
 youtube/common.py          | 748 ---------------------------------------------
 youtube/html_common.py     | 429 ++++++++++++++++++++++++++
 youtube/local_playlist.py  |  23 +-
 youtube/playlist.py        |  25 +-
 youtube/post_comment.py    |  36 +--
 youtube/search.py          |  25 +-
 youtube/util.py            | 153 ++++++++++
 youtube/watch.py           |  26 +-
 youtube/youtube.py         |   4 +-
 youtube/yt_data_extract.py | 205 +++++++++++++
 13 files changed, 928 insertions(+), 887 deletions(-)
 delete mode 100644 youtube/common.py
 create mode 100644 youtube/html_common.py
 create mode 100644 youtube/util.py
 create mode 100644 youtube/yt_data_extract.py

diff --git a/youtube/accounts.py b/youtube/accounts.py
index bde9852..375bf2a 100644
--- a/youtube/accounts.py
+++ b/youtube/accounts.py
@@ -1,10 +1,10 @@
 # Contains functions having to do with logging in
+from youtube import util, html_common
+import settings
 
 import urllib
 import json
-from youtube import common
 import re
-import settings
 import http.cookiejar
 import io
 import os
@@ -106,7 +106,7 @@ def get_account_login_page(env, start_response):
     '''
 
     page = '''
-<form action="/youtube.com/login" method="POST">
+
@@ -130,10 +130,10 @@ Using Tor to log in should only be done if the account was created using a proxy
''' - return common.yt_basic_template.substitute( + return html_common.yt_basic_template.substitute( page_title = "Login", style = style, - header = common.get_header(), + header = html_common.get_header(), page = page, ).encode('utf-8') @@ -229,7 +229,7 @@ def _login(username, password, cookiejar, use_tor): Taken from youtube-dl """ - login_page = common.fetch_url(_LOGIN_URL, yt_dl_headers, report_text='Downloaded login page', cookiejar_receive=cookiejar, use_tor=use_tor).decode('utf-8') + login_page = util.fetch_url(_LOGIN_URL, yt_dl_headers, report_text='Downloaded login page', cookiejar_receive=cookiejar, use_tor=use_tor).decode('utf-8') '''with open('debug/login_page', 'w', encoding='utf-8') as f: f.write(login_page)''' #print(cookiejar.as_lwp_str()) @@ -255,7 +255,7 @@ def _login(username, password, cookiejar, use_tor): 'Google-Accounts-XSRF': 1, } headers.update(yt_dl_headers) - result = common.fetch_url(url, headers, report_text=note, data=data, cookiejar_send=cookiejar, cookiejar_receive=cookiejar, use_tor=use_tor).decode('utf-8') + result = util.fetch_url(url, headers, report_text=note, data=data, cookiejar_send=cookiejar, cookiejar_receive=cookiejar, use_tor=use_tor).decode('utf-8') #print(cookiejar.as_lwp_str()) '''with open('debug/' + note, 'w', encoding='utf-8') as f: f.write(result)''' @@ -387,7 +387,7 @@ def _login(username, password, cookiejar, use_tor): return False try: - check_cookie_results = common.fetch_url(check_cookie_url, headers=yt_dl_headers, report_text="Checked cookie", cookiejar_send=cookiejar, cookiejar_receive=cookiejar, use_tor=use_tor).decode('utf-8') + check_cookie_results = util.fetch_url(check_cookie_url, headers=yt_dl_headers, report_text="Checked cookie", cookiejar_send=cookiejar, cookiejar_receive=cookiejar, use_tor=use_tor).decode('utf-8') except (urllib.error.URLError, compat_http_client.HTTPException, socket.error) as err: return False @@ -398,7 +398,7 @@ def _login(username, password, cookiejar, use_tor): warn('Unable to log in') return False - select_site_page = common.fetch_url('https://m.youtube.com/select_site', headers=common.mobile_ua, report_text="Retrieved page for channel id", cookiejar_send=cookiejar, use_tor=use_tor).decode('utf-8') + select_site_page = util.fetch_url('https://m.youtube.com/select_site', headers=util.mobile_ua, report_text="Retrieved page for channel id", cookiejar_send=cookiejar, use_tor=use_tor).decode('utf-8') match = _CHANNEL_ID_RE.search(select_site_page) if match is None: warn('Failed to find channel id') diff --git a/youtube/channel.py b/youtube/channel.py index 9577525..e9f315b 100644 --- a/youtube/channel.py +++ b/youtube/channel.py @@ -1,6 +1,6 @@ import base64 -import youtube.common as common -from youtube.common import default_multi_get, URL_ORIGIN, get_thumbnail_url, video_id +from youtube import util, yt_data_extract, html_common + import http_errors import urllib import json @@ -91,7 +91,7 @@ def get_channel_tab(channel_id, page="1", sort=3, tab='videos', view=1): url = "https://www.youtube.com/browse_ajax?ctoken=" + ctoken print("Sending channel tab ajax request") - content = common.fetch_url(url, common.desktop_ua + headers_1) + content = util.fetch_url(url, util.desktop_ua + headers_1) print("Finished recieving channel tab response") '''with open('debug/channel_debug', 'wb') as f: @@ -110,7 +110,7 @@ def get_number_of_videos(channel_id): # Sometimes retrieving playlist info fails with 403 for no discernable reason try: - response = common.fetch_url(url, common.mobile_ua + headers_pbj) + response = 
util.fetch_url(url, util.mobile_ua + headers_pbj) except urllib.error.HTTPError as e: if e.code != 403: raise @@ -133,20 +133,20 @@ def get_channel_id(username): # method that gives the smallest possible response at ~10 kb # needs to be as fast as possible url = 'https://m.youtube.com/user/' + username + '/about?ajax=1&disable_polymer=true' - response = common.fetch_url(url, common.mobile_ua + headers_1).decode('utf-8') + response = util.fetch_url(url, util.mobile_ua + headers_1).decode('utf-8') return re.search(r'"channel_id":\s*"([a-zA-Z0-9_-]*)"', response).group(1) def grid_items_html(items, additional_info={}): result = ''' ''' return result def list_items_html(items, additional_info={}): result = ''' ''' return result @@ -168,11 +168,11 @@ def channel_tabs_html(channel_id, current_tab, search_box_value=''): ) else: result += channel_tab_template.substitute( - href_attribute = ' href="' + URL_ORIGIN + '/channel/' + channel_id + '/' + tab_name.lower() + '"', + href_attribute = ' href="' + util.URL_ORIGIN + '/channel/' + channel_id + '/' + tab_name.lower() + '"', tab_name = tab_name, ) result += channel_search_template.substitute( - action = URL_ORIGIN + "/channel/" + channel_id + "/search", + action = util.URL_ORIGIN + "/channel/" + channel_id + "/search", search_box_value = html.escape(search_box_value), ) return result @@ -192,7 +192,7 @@ def channel_sort_buttons_html(channel_id, tab, current_sort): ) else: result += channel_sort_button_template.substitute( - href_attribute=' href="' + URL_ORIGIN + '/channel/' + channel_id + '/' + tab + '?sort=' + sort_number + '"', + href_attribute=' href="' + util.URL_ORIGIN + '/channel/' + channel_id + '/' + tab + '?sort=' + sort_number + '"', text = 'Sort by ' + sort_name ) return result @@ -246,14 +246,14 @@ def channel_videos_html(polymer_json, current_page=1, current_sort=3, number_of_ items_html = grid_items_html(items, {'author': microformat['title']}) return yt_channel_items_template.substitute( - header = common.get_header(), + header = html_common.get_header(), channel_title = microformat['title'], channel_tabs = channel_tabs_html(channel_id, 'Videos'), sort_buttons = channel_sort_buttons_html(channel_id, 'videos', current_sort), avatar = '/' + microformat['thumbnail']['thumbnails'][0]['url'], page_title = microformat['title'] + ' - Channel', items = items_html, - page_buttons = common.page_buttons_html(current_page, math.ceil(number_of_videos/30), URL_ORIGIN + "/channel/" + channel_id + "/videos", current_query_string), + page_buttons = html_common.page_buttons_html(current_page, math.ceil(number_of_videos/30), util.URL_ORIGIN + "/channel/" + channel_id + "/videos", current_query_string), number_of_results = '{:,}'.format(number_of_videos) + " videos", ) @@ -267,7 +267,7 @@ def channel_playlists_html(polymer_json, current_sort=3): items_html = grid_items_html(items, {'author': microformat['title']}) return yt_channel_items_template.substitute( - header = common.get_header(), + header = html_common.get_header(), channel_title = microformat['title'], channel_tabs = channel_tabs_html(channel_id, 'Playlists'), sort_buttons = channel_sort_buttons_html(channel_id, 'playlists', current_sort), @@ -310,25 +310,25 @@ def channel_about_page(polymer_json): channel_links += channel_link_template.substitute( url = html.escape(url), - text = common.get_plain_text(link_json['title']), + text = yt_data_extract.get_plain_text(link_json['title']), ) stats = '' for stat_name in ('subscriberCountText', 'joinedDateText', 'viewCountText', 'country'): try: - 
stat_value = common.get_plain_text(channel_metadata[stat_name]) + stat_value = yt_data_extract.get_plain_text(channel_metadata[stat_name]) except KeyError: continue else: stats += stat_template.substitute(stat_value=stat_value) try: - description = common.format_text_runs(common.get_formatted_text(channel_metadata['description'])) + description = yt_data_extract.format_text_runs(yt_data_extract.get_formatted_text(channel_metadata['description'])) except KeyError: description = '' return yt_channel_about_template.substitute( - header = common.get_header(), - page_title = common.get_plain_text(channel_metadata['title']) + ' - About', - channel_title = common.get_plain_text(channel_metadata['title']), + header = html_common.get_header(), + page_title = yt_data_extract.get_plain_text(channel_metadata['title']) + ' - About', + channel_title = yt_data_extract.get_plain_text(channel_metadata['title']), avatar = html.escape(avatar), description = description, links = channel_links, @@ -351,13 +351,13 @@ def channel_search_page(polymer_json, query, current_page=1, number_of_videos = items_html = list_items_html(items) return yt_channel_items_template.substitute( - header = common.get_header(), + header = html_common.get_header(), channel_title = html.escape(microformat['title']), channel_tabs = channel_tabs_html(channel_id, '', query), avatar = '/' + microformat['thumbnail']['thumbnails'][0]['url'], page_title = html.escape(query + ' - Channel search'), items = items_html, - page_buttons = common.page_buttons_html(current_page, math.ceil(number_of_videos/29), URL_ORIGIN + "/channel/" + channel_id + "/search", current_query_string), + page_buttons = html_common.page_buttons_html(current_page, math.ceil(number_of_videos/29), util.URL_ORIGIN + "/channel/" + channel_id + "/search", current_query_string), number_of_results = '', sort_buttons = '', ) @@ -367,7 +367,7 @@ def get_channel_search_json(channel_id, query, page): ctoken = proto.string(2, channel_id) + proto.string(3, params) + proto.string(11, query) ctoken = base64.urlsafe_b64encode(proto.nested(80226972, ctoken)).decode('ascii') - polymer_json = common.fetch_url("https://www.youtube.com/browse_ajax?ctoken=" + ctoken, common.desktop_ua + headers_1) + polymer_json = util.fetch_url("https://www.youtube.com/browse_ajax?ctoken=" + ctoken, util.desktop_ua + headers_1) '''with open('debug/channel_search_debug', 'wb') as f: f.write(polymer_json)''' polymer_json = json.loads(polymer_json) @@ -384,10 +384,10 @@ def get_channel_page(env, start_response): tab = 'videos' parameters = env['parameters'] - page_number = int(common.default_multi_get(parameters, 'page', 0, default='1')) - sort = common.default_multi_get(parameters, 'sort', 0, default='3') - view = common.default_multi_get(parameters, 'view', 0, default='1') - query = common.default_multi_get(parameters, 'query', 0, default='') + page_number = int(util.default_multi_get(parameters, 'page', 0, default='1')) + sort = util.default_multi_get(parameters, 'sort', 0, default='3') + view = util.default_multi_get(parameters, 'view', 0, default='1') + query = util.default_multi_get(parameters, 'query', 0, default='') if tab == 'videos': tasks = ( @@ -399,11 +399,11 @@ def get_channel_page(env, start_response): result = channel_videos_html(polymer_json, page_number, sort, number_of_videos, env['QUERY_STRING']) elif tab == 'about': - polymer_json = common.fetch_url('https://www.youtube.com/channel/' + channel_id + '/about?pbj=1', common.desktop_ua + headers_1) + polymer_json = 
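In `get_channel_page`, the `tasks = (...)` tuple fetches the video count and the tab JSON concurrently. A hedged reconstruction of that pattern, using the helpers defined above (gevent is already imported by channel.py; the exact spawn arguments are an assumption):

```python
import gevent

# Hedged reconstruction of the concurrent-fetch pattern in
# get_channel_page; the real spawn arguments may differ.
# channel_id/page_number/sort are the surrounding function's locals,
# shown here with example values.
channel_id, page_number, sort = 'UC1234567890abcdefghij', 1, '3'

tasks = (
    gevent.spawn(get_number_of_videos, channel_id),
    gevent.spawn(get_channel_tab, channel_id, page_number, sort),
)
gevent.joinall(tasks)  # wait for both requests to finish
number_of_videos, polymer_json = tasks[0].value, tasks[1].value
```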
util.fetch_url('https://www.youtube.com/channel/' + channel_id + '/about?pbj=1', util.desktop_ua + headers_1) polymer_json = json.loads(polymer_json) result = channel_about_page(polymer_json) elif tab == 'playlists': - polymer_json = common.fetch_url('https://www.youtube.com/channel/' + channel_id + '/playlists?pbj=1&view=1&sort=' + playlist_sort_codes[sort], common.desktop_ua + headers_1) + polymer_json = util.fetch_url('https://www.youtube.com/channel/' + channel_id + '/playlists?pbj=1&view=1&sort=' + playlist_sort_codes[sort], util.desktop_ua + headers_1) '''with open('debug/channel_playlists_debug', 'wb') as f: f.write(polymer_json)''' polymer_json = json.loads(polymer_json) @@ -443,22 +443,22 @@ def get_channel_page_general_url(env, start_response): return b'Invalid channel url' if page == 'videos': - polymer_json = common.fetch_url(base_url + '/videos?pbj=1&view=0', common.desktop_ua + headers_1) + polymer_json = util.fetch_url(base_url + '/videos?pbj=1&view=0', util.desktop_ua + headers_1) '''with open('debug/user_page_videos', 'wb') as f: f.write(polymer_json)''' polymer_json = json.loads(polymer_json) result = channel_videos_html(polymer_json) elif page == 'about': - polymer_json = common.fetch_url(base_url + '/about?pbj=1', common.desktop_ua + headers_1) + polymer_json = util.fetch_url(base_url + '/about?pbj=1', util.desktop_ua + headers_1) polymer_json = json.loads(polymer_json) result = channel_about_page(polymer_json) elif page == 'playlists': - polymer_json = common.fetch_url(base_url+ '/playlists?pbj=1&view=1', common.desktop_ua + headers_1) + polymer_json = util.fetch_url(base_url+ '/playlists?pbj=1&view=1', util.desktop_ua + headers_1) polymer_json = json.loads(polymer_json) result = channel_playlists_html(polymer_json) elif page == 'search': raise NotImplementedError() - '''polymer_json = common.fetch_url('https://www.youtube.com/user' + username + '/search?pbj=1&' + query_string, common.desktop_ua + headers_1) + '''polymer_json = util.fetch_url('https://www.youtube.com/user' + username + '/search?pbj=1&' + query_string, util.desktop_ua + headers_1) polymer_json = json.loads(polymer_json) return channel_search_page(''' else: diff --git a/youtube/comments.py b/youtube/comments.py index 10209e7..4087b47 100644 --- a/youtube/comments.py +++ b/youtube/comments.py @@ -1,13 +1,14 @@ +from youtube import proto, util, html_common, yt_data_extract, accounts +import settings + import json -from youtube import proto, common, accounts import base64 -from youtube.common import uppercase_escape, default_multi_get, format_text_runs, URL_ORIGIN, fetch_url from string import Template import urllib.request import urllib import html -import settings import re + comment_area_template = Template('''
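`get_channel_tab` and `get_channel_search_json` above assemble their ctoken from `proto.string` and `proto.nested` (youtube/proto.py is not part of this diff). As a hedged sketch of what such helpers plausibly encode — standard length-delimited protobuf fields — with a made-up channel id and query:

```python
import base64

# Hedged sketch of the wire format behind proto.string()/proto.nested():
# a length-delimited protobuf field is a header varint ((field << 3) | 2),
# then a length varint, then the payload. The real youtube/proto.py may
# differ in details.
def varint(n):
    out = b''
    while True:
        byte = n & 0x7f
        n >>= 7
        if n:
            out += bytes((byte | 0x80,))
        else:
            return out + bytes((byte,))

def string(field_number, data):
    data = data.encode('utf-8') if isinstance(data, str) else data
    return varint((field_number << 3) | 2) + varint(len(data)) + data

nested = string  # a nested message is wire-encoded the same way

ctoken = string(2, 'UC1234567890abcdefghij') + string(11, 'search query')
print(base64.urlsafe_b64encode(nested(80226972, ctoken)).decode('ascii'))
```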
$video-metadata @@ -130,7 +131,7 @@ def request_comments(ctoken, replies=False): url = base_url + ctoken.replace("=", "%3D") + "&pbj=1" for i in range(0,8): # don't retry more than 8 times - content = fetch_url(url, headers=mobile_headers, report_text="Retrieved comments") + content = util.fetch_url(url, headers=mobile_headers, report_text="Retrieved comments") if content[0:4] == b")]}'": # random closing characters included at beginning of response for some reason content = content[4:] elif content[0:10] == b'\nPost comment''' - other_sort_url = common.URL_ORIGIN + '/comments?ctoken=' + make_comment_ctoken(video_id, sort=1 - sort, lc=lc) + other_sort_url = util.URL_ORIGIN + '/comments?ctoken=' + make_comment_ctoken(video_id, sort=1 - sort, lc=lc) other_sort_name = 'newest' if sort == 0 else 'top' other_sort_link = '''Sort by ''' + other_sort_name + '''''' @@ -314,7 +315,7 @@ def video_comments(video_id, sort=0, offset=0, lc='', secret_key=''): if ctoken == '': more_comments_button = '' else: - more_comments_button = more_comments_template.substitute(url = common.URL_ORIGIN + '/comments?ctoken=' + ctoken) + more_comments_button = more_comments_template.substitute(url = util.URL_ORIGIN + '/comments?ctoken=' + ctoken) result = '''
\n''' result += comment_links + '\n' @@ -350,7 +351,7 @@ comment_box_template = Template(''' - Add account + Add account $video_id_input @@ -359,7 +360,7 @@ $options def get_comments_page(env, start_response): start_response('200 OK', [('Content-type','text/html'),] ) parameters = env['parameters'] - ctoken = default_multi_get(parameters, 'ctoken', 0, default='') + ctoken = util.default_multi_get(parameters, 'ctoken', 0, default='') replies = False if not ctoken: video_id = parameters['video_id'][0] @@ -384,17 +385,17 @@ def get_comments_page(env, start_response): page_number = page_number, sort = 'top' if metadata['sort'] == 0 else 'newest', title = html.escape(comment_info['video_title']), - url = common.URL_ORIGIN + '/watch?v=' + metadata['video_id'], + url = util.URL_ORIGIN + '/watch?v=' + metadata['video_id'], thumbnail = '/i.ytimg.com/vi/'+ metadata['video_id'] + '/mqdefault.jpg', ) comment_box = comment_box_template.substitute( - form_action= common.URL_ORIGIN + '/post_comment', + form_action= util.URL_ORIGIN + '/post_comment', video_id_input='''''', post_text='Post comment', options=comment_box_account_options(), ) - other_sort_url = common.URL_ORIGIN + '/comments?ctoken=' + make_comment_ctoken(metadata['video_id'], sort=1 - metadata['sort']) + other_sort_url = util.URL_ORIGIN + '/comments?ctoken=' + make_comment_ctoken(metadata['video_id'], sort=1 - metadata['sort']) other_sort_name = 'newest' if metadata['sort'] == 0 else 'top' other_sort_link = '''Sort by ''' + other_sort_name + '''''' @@ -408,7 +409,7 @@ def get_comments_page(env, start_response): if ctoken == '': more_comments_button = '' else: - more_comments_button = more_comments_template.substitute(url = URL_ORIGIN + '/comments?ctoken=' + ctoken) + more_comments_button = more_comments_template.substitute(url = util.URL_ORIGIN + '/comments?ctoken=' + ctoken) comments_area = '
\n' comments_area += video_metadata + comment_box + comment_links + '\n' comments_area += '
\n' @@ -417,7 +418,7 @@ def get_comments_page(env, start_response): comments_area += more_comments_button + '\n' comments_area += '
\n' return yt_comments_template.substitute( - header = common.get_header(), + header = html_common.get_header(), comments_area = comments_area, page_title = page_title, ).encode('utf-8') diff --git a/youtube/common.py b/youtube/common.py deleted file mode 100644 index cb963ce..0000000 --- a/youtube/common.py +++ /dev/null @@ -1,748 +0,0 @@ -from youtube.template import Template -from youtube import local_playlist -import settings -import html -import json -import re -import urllib.parse -import gzip -import brotli -import time -import socks, sockshandler - -URL_ORIGIN = "/https://www.youtube.com" - - -# videos (all of type str): - -# id -# title -# url -# author -# author_url -# thumbnail -# description -# published -# duration -# likes -# dislikes -# views -# playlist_index - -# playlists: - -# id -# title -# url -# author -# author_url -# thumbnail -# description -# updated -# size -# first_video_id - - -with open('yt_basic_template.html', 'r', encoding='utf-8') as file: - yt_basic_template = Template(file.read()) - - - - -page_button_template = Template('''$page''') -current_page_button_template = Template('''
$page''') - -medium_playlist_item_template = Template(''' -
-
- - -
- $size -
-
- - $title - -
$stats
-
-
-''') -medium_video_item_template = Template(''' -
-
- - - $duration - - - $title - -
$stats
- - $description - $badges -
- -
-''') - -small_video_item_template = Template(''' -
-
- - - $duration - - $title - -
$author
- $views - -
- -
-''') - -small_playlist_item_template = Template(''' -
-
- - -
- $size -
-
- $title - -
$author
-
-
-''') - -medium_channel_item_template = Template(''' -
-
- - - $duration - - - $title - - $subscriber_count - $size - - $description -
-
-''') - - -class HTTPAsymmetricCookieProcessor(urllib.request.BaseHandler): - '''Separate cookiejars for receiving and sending''' - def __init__(self, cookiejar_send=None, cookiejar_receive=None): - import http.cookiejar - self.cookiejar_send = cookiejar_send - self.cookiejar_receive = cookiejar_receive - - def http_request(self, request): - if self.cookiejar_send is not None: - self.cookiejar_send.add_cookie_header(request) - return request - - def http_response(self, request, response): - if self.cookiejar_receive is not None: - self.cookiejar_receive.extract_cookies(response, request) - return response - - https_request = http_request - https_response = http_response - - -def decode_content(content, encoding_header): - encodings = encoding_header.replace(' ', '').split(',') - for encoding in reversed(encodings): - if encoding == 'identity': - continue - if encoding == 'br': - content = brotli.decompress(content) - elif encoding == 'gzip': - content = gzip.decompress(content) - return content - -def fetch_url(url, headers=(), timeout=15, report_text=None, data=None, cookiejar_send=None, cookiejar_receive=None, use_tor=True): - ''' - When cookiejar_send is set to a CookieJar object, - those cookies will be sent in the request (but cookies in response will not be merged into it) - When cookiejar_receive is set to a CookieJar object, - cookies received in the response will be merged into the object (nothing will be sent from it) - When both are set to the same object, cookies will be sent from the object, - and response cookies will be merged into it. - ''' - headers = dict(headers) # Note: Calling dict() on a dict will make a copy - headers['Accept-Encoding'] = 'gzip, br' - - # prevent python version being leaked by urllib if User-Agent isn't provided - # (urllib will use ex. 
Python-urllib/3.6 otherwise) - if 'User-Agent' not in headers and 'user-agent' not in headers and 'User-agent' not in headers: - headers['User-Agent'] = 'Python-urllib' - - if data is not None: - if isinstance(data, str): - data = data.encode('ascii') - elif not isinstance(data, bytes): - data = urllib.parse.urlencode(data).encode('ascii') - - start_time = time.time() - - - req = urllib.request.Request(url, data=data, headers=headers) - - cookie_processor = HTTPAsymmetricCookieProcessor(cookiejar_send=cookiejar_send, cookiejar_receive=cookiejar_receive) - - if use_tor and settings.route_tor: - opener = urllib.request.build_opener(sockshandler.SocksiPyHandler(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9150), cookie_processor) - else: - opener = urllib.request.build_opener(cookie_processor) - - response = opener.open(req, timeout=timeout) - response_time = time.time() - - - content = response.read() - read_finish = time.time() - if report_text: - print(report_text, ' Latency:', round(response_time - start_time,3), ' Read time:', round(read_finish - response_time,3)) - content = decode_content(content, response.getheader('Content-Encoding', default='identity')) - return content - -mobile_user_agent = 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1' -mobile_ua = (('User-Agent', mobile_user_agent),) -desktop_user_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:52.0) Gecko/20100101 Firefox/52.0' -desktop_ua = (('User-Agent', desktop_user_agent),) - -def dict_add(*dicts): - for dictionary in dicts[1:]: - dicts[0].update(dictionary) - return dicts[0] - -def video_id(url): - url_parts = urllib.parse.urlparse(url) - return urllib.parse.parse_qs(url_parts.query)['v'][0] - -def uppercase_escape(s): - return re.sub( - r'\\U([0-9a-fA-F]{8})', - lambda m: chr(int(m.group(1), base=16)), s) - -def default_multi_get(object, *keys, default): - ''' Like dict.get(), but for nested dictionaries/sequences, supporting keys or indices. 
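The `HTTPAsymmetricCookieProcessor` above, together with `fetch_url`'s docstring, defines a send/receive split for cookies. A minimal usage sketch of the processor on its own (the URL is a placeholder):

```python
import urllib.request
import http.cookiejar

# Receive-only jar: cookies set by responses are recorded, but nothing
# stored in the jar is ever attached to outgoing requests. Passing the
# same jar as both send and receive gives normal session behavior, which
# is what the login flow in accounts.py does.
receive_jar = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(
    HTTPAsymmetricCookieProcessor(cookiejar_receive=receive_jar))

response = opener.open('https://example.com/')  # placeholder URL
print([cookie.name for cookie in receive_jar])  # captured, none were sent
```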
Last argument is the default value to use in case of any IndexErrors or KeyErrors ''' - try: - for key in keys: - object = object[key] - return object - except (IndexError, KeyError): - return default - -def get_plain_text(node): - try: - return html.escape(node['simpleText']) - except KeyError: - return unformmated_text_runs(node['runs']) - -def unformmated_text_runs(runs): - result = '' - for text_run in runs: - result += html.escape(text_run["text"]) - return result - -def format_text_runs(runs): - if isinstance(runs, str): - return runs - result = '' - for text_run in runs: - if text_run.get("bold", False): - result += "" + html.escape(text_run["text"]) + "" - elif text_run.get('italics', False): - result += "" + html.escape(text_run["text"]) + "" - else: - result += html.escape(text_run["text"]) - return result - -# default, sddefault, mqdefault, hqdefault, hq720 -def get_thumbnail_url(video_id): - return "/i.ytimg.com/vi/" + video_id + "/mqdefault.jpg" - -def seconds_to_timestamp(seconds): - seconds = int(seconds) - hours, seconds = divmod(seconds,3600) - minutes, seconds = divmod(seconds,60) - if hours != 0: - timestamp = str(hours) + ":" - timestamp += str(minutes).zfill(2) # zfill pads with zeros - else: - timestamp = str(minutes) - - timestamp += ":" + str(seconds).zfill(2) - return timestamp - - -# ----- -# HTML -# ----- - -def small_video_item_html(item): - video_info = json.dumps({key: item[key] for key in ('id', 'title', 'author', 'duration')}) - return small_video_item_template.substitute( - title = html.escape(item["title"]), - views = item["views"], - author = html.escape(item["author"]), - duration = item["duration"], - url = URL_ORIGIN + "/watch?v=" + item["id"], - thumbnail = get_thumbnail_url(item['id']), - video_info = html.escape(video_info), - ) - -def small_playlist_item_html(item): - return small_playlist_item_template.substitute( - title=html.escape(item["title"]), - size = item['size'], - author="", - url = URL_ORIGIN + "/playlist?list=" + item["id"], - thumbnail= get_thumbnail_url(item['first_video_id']), - ) - -def medium_playlist_item_html(item): - return medium_playlist_item_template.substitute( - title=html.escape(item["title"]), - size = item['size'], - author=item['author'], - author_url= URL_ORIGIN + item['author_url'], - url = URL_ORIGIN + "/playlist?list=" + item["id"], - thumbnail= item['thumbnail'], - ) - -def medium_video_item_html(medium_video_info): - info = medium_video_info - - return medium_video_item_template.substitute( - title=html.escape(info["title"]), - views=info["views"], - published = info["published"], - description = format_text_runs(info["description"]), - author=html.escape(info["author"]), - author_url=info["author_url"], - duration=info["duration"], - url = URL_ORIGIN + "/watch?v=" + info["id"], - thumbnail=info['thumbnail'], - datetime='', # TODO - ) - - -header_template = Template(''' -
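A usage sketch for `default_multi_get` as defined above, on a made-up response fragment:

```python
# default_multi_get walks nested dicts/lists with a mixed key/index path
# and returns the default on any KeyError or IndexError along the way.
response = {'contents': [{'videoRenderer': {'videoId': 'abc123'}}]}

default_multi_get(response, 'contents', 0, 'videoRenderer', 'videoId',
                  default=None)                                # -> 'abc123'
default_multi_get(response, 'contents', 5, 'title', default='')  # -> ''
```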
- - - - - - - -
-
- - -$playlists - - - -
- Local playlists -
-
-''') -playlist_option_template = Template('''''') -def get_header(search_box_value=""): - playlists = '' - for name in local_playlist.get_playlist_names(): - playlists += playlist_option_template.substitute(name = name) - return header_template.substitute(playlists = playlists, search_box_value = html.escape(search_box_value)) - - - -def get_url(node): - try: - return node['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'] - except KeyError: - return node['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'] - - -def get_text(node): - try: - return node['simpleText'] - except KeyError: - pass - try: - return node['runs'][0]['text'] - except IndexError: # empty text runs - return '' - -def get_formatted_text(node): - try: - return node['runs'] - except KeyError: - return node['simpleText'] - -def get_badges(node): - badges = [] - for badge_node in node: - badge = badge_node['metadataBadgeRenderer']['label'] - if badge.lower() != 'new': - badges.append(badge) - return badges - -def get_thumbnail(node): - try: - return node['thumbnails'][0]['url'] # polymer format - except KeyError: - return node['url'] # ajax format - -dispatch = { - -# polymer format - 'title': ('title', get_text), - 'publishedTimeText': ('published', get_text), - 'videoId': ('id', lambda node: node), - 'descriptionSnippet': ('description', get_formatted_text), - 'lengthText': ('duration', get_text), - 'thumbnail': ('thumbnail', get_thumbnail), - 'thumbnails': ('thumbnail', lambda node: node[0]['thumbnails'][0]['url']), - - 'viewCountText': ('views', get_text), - 'numVideosText': ('size', lambda node: get_text(node).split(' ')[0]), # the format is "324 videos" - 'videoCountText': ('size', get_text), - 'playlistId': ('id', lambda node: node), - 'descriptionText': ('description', get_formatted_text), - - 'subscriberCountText': ('subscriber_count', get_text), - 'channelId': ('id', lambda node: node), - 'badges': ('badges', get_badges), - -# ajax format - 'view_count_text': ('views', get_text), - 'num_videos_text': ('size', lambda node: get_text(node).split(' ')[0]), - 'owner_text': ('author', get_text), - 'owner_endpoint': ('author_url', lambda node: node['url']), - 'description': ('description', get_formatted_text), - 'index': ('playlist_index', get_text), - 'short_byline': ('author', get_text), - 'length': ('duration', get_text), - 'video_id': ('id', lambda node: node), - -} - -def renderer_info(renderer): - try: - info = {} - if 'viewCountText' in renderer: # prefer this one as it contains all the digits - info['views'] = get_text(renderer['viewCountText']) - elif 'shortViewCountText' in renderer: - info['views'] = get_text(renderer['shortViewCountText']) - - if 'ownerText' in renderer: - info['author'] = renderer['ownerText']['runs'][0]['text'] - info['author_url'] = renderer['ownerText']['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'] - try: - overlays = renderer['thumbnailOverlays'] - except KeyError: - pass - else: - for overlay in overlays: - if 'thumbnailOverlayTimeStatusRenderer' in overlay: - info['duration'] = get_text(overlay['thumbnailOverlayTimeStatusRenderer']['text']) - # show renderers don't have videoCountText - elif 'thumbnailOverlayBottomPanelRenderer' in overlay: - info['size'] = get_text(overlay['thumbnailOverlayBottomPanelRenderer']['text']) - - # show renderers don't have playlistId, have to dig into the url to get it - try: - info['id'] = renderer['navigationEndpoint']['watchEndpoint']['playlistId'] - except KeyError: - 
pass - for key, node in renderer.items(): - if key in ('longBylineText', 'shortBylineText'): - info['author'] = get_text(node) - try: - info['author_url'] = get_url(node) - except KeyError: - pass - - # show renderers don't have thumbnail key at top level, dig into thumbnailRenderer - elif key == 'thumbnailRenderer' and 'showCustomThumbnailRenderer' in node: - info['thumbnail'] = node['showCustomThumbnailRenderer']['thumbnail']['thumbnails'][0]['url'] - else: - try: - simple_key, function = dispatch[key] - except KeyError: - continue - info[simple_key] = function(node) - return info - except KeyError: - print(renderer) - raise - -def ajax_info(item_json): - try: - info = {} - for key, node in item_json.items(): - try: - simple_key, function = dispatch[key] - except KeyError: - continue - info[simple_key] = function(node) - return info - except KeyError: - print(item_json) - raise - -def badges_html(badges): - return ' | '.join(map(html.escape, badges)) - - - - - -html_transform_dispatch = { - 'title': html.escape, - 'published': html.escape, - 'id': html.escape, - 'description': format_text_runs, - 'duration': html.escape, - 'thumbnail': lambda url: html.escape('/' + url.lstrip('/')), - 'size': html.escape, - 'author': html.escape, - 'author_url': lambda url: html.escape(URL_ORIGIN + url), - 'views': html.escape, - 'subscriber_count': html.escape, - 'badges': badges_html, - 'playlist_index': html.escape, -} - -def get_html_ready(item): - html_ready = {} - for key, value in item.items(): - try: - function = html_transform_dispatch[key] - except KeyError: - continue - html_ready[key] = function(value) - return html_ready - - -author_template_url = Template('''
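`renderer_info` is driven by the `dispatch` table above: each renderer key maps to a `(simple_key, transform)` pair, and keys absent from the table are skipped. A small illustration with a made-up node:

```python
# 'lengthText' maps to ('duration', get_text) in the dispatch table, so
# a renderer's length node becomes a plain 'duration' string.
node = {'simpleText': '1:23'}
simple_key, function = dispatch['lengthText']
info = {simple_key: function(node)}   # {'duration': '1:23'}
```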
By $author
''') -author_template = Template('''
$author
''') -stat_templates = ( - Template('''$views'''), - Template(''''''), -) -def get_stats(html_ready): - stats = [] - if 'author' in html_ready: - if 'author_url' in html_ready: - stats.append(author_template_url.substitute(html_ready)) - else: - stats.append(author_template.substitute(html_ready)) - for stat in stat_templates: - try: - stats.append(stat.strict_substitute(html_ready)) - except KeyError: - pass - return ' | '.join(stats) - -def video_item_html(item, template, html_exclude=set()): - - video_info = {} - for key in ('id', 'title', 'author'): - try: - video_info[key] = item[key] - except KeyError: - video_info[key] = '' - try: - video_info['duration'] = item['duration'] - except KeyError: - video_info['duration'] = 'Live' # livestreams don't have a duration - - html_ready = get_html_ready(item) - - html_ready['video_info'] = html.escape(json.dumps(video_info) ) - html_ready['url'] = URL_ORIGIN + "/watch?v=" + html_ready['id'] - html_ready['datetime'] = '' #TODO - - for key in html_exclude: - del html_ready[key] - html_ready['stats'] = get_stats(html_ready) - - return template.substitute(html_ready) - - -def playlist_item_html(item, template, html_exclude=set()): - html_ready = get_html_ready(item) - - html_ready['url'] = URL_ORIGIN + "/playlist?list=" + html_ready['id'] - html_ready['datetime'] = '' #TODO - - for key in html_exclude: - del html_ready[key] - html_ready['stats'] = get_stats(html_ready) - - return template.substitute(html_ready) - - - - - - -def update_query_string(query_string, items): - parameters = urllib.parse.parse_qs(query_string) - parameters.update(items) - return urllib.parse.urlencode(parameters, doseq=True) - -page_button_template = Template('''$page''') -current_page_button_template = Template('''
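`get_stats` relies on `strict_substitute`, a method of the project's own Template class (youtube/template.py, not shown in this diff) that is assumed here to raise `KeyError` on any missing placeholder rather than substituting partially. A sketch with made-up values:

```python
# With no author_url present, author_template is chosen; the $views stat
# template succeeds; any stat template whose placeholders are missing in
# html_ready is skipped through the KeyError branch.
html_ready = {'author': 'Example Channel', 'views': '1,234 views'}
stats = get_stats(html_ready)  # ' | '-joined author and views markup
```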
$page
''') - -def page_buttons_html(current_page, estimated_pages, url, current_query_string): - if current_page <= 5: - page_start = 1 - page_end = min(9, estimated_pages) - else: - page_start = current_page - 4 - page_end = min(current_page + 4, estimated_pages) - - result = "" - for page in range(page_start, page_end+1): - if page == current_page: - template = current_page_button_template - else: - template = page_button_template - result += template.substitute(page=page, href = url + "?" + update_query_string(current_query_string, {'page': [str(page)]}) ) - return result - - - - - - - -showing_results_for = Template(''' -
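`page_buttons_html` renders a window of up to nine page links around the current page. A usage sketch with made-up arguments:

```python
# For current_page=7, pages 3..11 are rendered; page 7 itself uses
# current_page_button_template, the others become links whose ?page=N
# query comes from update_query_string.
buttons = page_buttons_html(
    current_page=7,
    estimated_pages=40,
    url=URL_ORIGIN + '/search',
    current_query_string='query=test&page=7',
)
```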
-
Showing results for $corrected_query
-
Search instead for $original_query
-
-''') - -did_you_mean = Template(''' -
-
Did you mean $corrected_query
-
-''') - -def renderer_html(renderer, additional_info={}, current_query_string=''): - type = list(renderer.keys())[0] - renderer = renderer[type] - if type == 'itemSectionRenderer': - return renderer_html(renderer['contents'][0], additional_info, current_query_string) - - if type == 'channelRenderer': - info = renderer_info(renderer) - html_ready = get_html_ready(info) - html_ready['url'] = URL_ORIGIN + "/channel/" + html_ready['id'] - return medium_channel_item_template.substitute(html_ready) - - if type in ('movieRenderer', 'clarificationRenderer'): - return '' - - info = renderer_info(renderer) - info.update(additional_info) - html_exclude = set(additional_info.keys()) - if type == 'compactVideoRenderer': - return video_item_html(info, small_video_item_template, html_exclude=html_exclude) - if type in ('compactPlaylistRenderer', 'compactRadioRenderer', 'compactShowRenderer'): - return playlist_item_html(info, small_playlist_item_template, html_exclude=html_exclude) - if type in ('videoRenderer', 'gridVideoRenderer'): - return video_item_html(info, medium_video_item_template, html_exclude=html_exclude) - if type in ('playlistRenderer', 'gridPlaylistRenderer', 'radioRenderer', 'gridRadioRenderer', 'gridShowRenderer', 'showRenderer'): - return playlist_item_html(info, medium_playlist_item_template, html_exclude=html_exclude) - - #print(renderer) - #raise NotImplementedError('Unknown renderer type: ' + type) - return '' diff --git a/youtube/html_common.py b/youtube/html_common.py new file mode 100644 index 0000000..27a582b --- /dev/null +++ b/youtube/html_common.py @@ -0,0 +1,429 @@ +from youtube.template import Template +from youtube import local_playlist, yt_data_extract, util + +import json +import html + + +with open('yt_basic_template.html', 'r', encoding='utf-8') as file: + yt_basic_template = Template(file.read()) + + + + +page_button_template = Template('''$page''') +current_page_button_template = Template('''
$page''') + +medium_playlist_item_template = Template(''' +
+
+ + +
+ $size +
+
+ + $title + +
$stats
+
+
+''') +medium_video_item_template = Template(''' +
+
+ + + $duration + + + $title + +
$stats
+ + $description + $badges +
+ +
+''') + +small_video_item_template = Template(''' +
+
+ + + $duration + + $title + +
$author
+ $views + +
+ +
+''') + +small_playlist_item_template = Template(''' +
+
+ + +
+ $size +
+
+ $title + +
$author
+
+
+''') + +medium_channel_item_template = Template(''' +
+
+ + + $duration + + + $title + + $subscriber_count + $size + + $description +
+
+''') + + + + +# ----- +# HTML +# ----- + +def small_video_item_html(item): + video_info = json.dumps({key: item[key] for key in ('id', 'title', 'author', 'duration')}) + return small_video_item_template.substitute( + title = html.escape(item["title"]), + views = item["views"], + author = html.escape(item["author"]), + duration = item["duration"], + url = util.URL_ORIGIN + "/watch?v=" + item["id"], + thumbnail = util.get_thumbnail_url(item['id']), + video_info = html.escape(video_info), + ) + +def small_playlist_item_html(item): + return small_playlist_item_template.substitute( + title=html.escape(item["title"]), + size = item['size'], + author="", + url = util.URL_ORIGIN + "/playlist?list=" + item["id"], + thumbnail= util.get_thumbnail_url(item['first_video_id']), + ) + +def medium_playlist_item_html(item): + return medium_playlist_item_template.substitute( + title=html.escape(item["title"]), + size = item['size'], + author=item['author'], + author_url= util.URL_ORIGIN + item['author_url'], + url = util.URL_ORIGIN + "/playlist?list=" + item["id"], + thumbnail= item['thumbnail'], + ) + +def medium_video_item_html(medium_video_info): + info = medium_video_info + + return medium_video_item_template.substitute( + title=html.escape(info["title"]), + views=info["views"], + published = info["published"], + description = yt_data_extract.format_text_runs(info["description"]), + author=html.escape(info["author"]), + author_url=info["author_url"], + duration=info["duration"], + url = util.URL_ORIGIN + "/watch?v=" + info["id"], + thumbnail=info['thumbnail'], + datetime='', # TODO + ) + + +header_template = Template(''' +
+ + + +
+
+ + +$playlists + + + +
+ Local playlists +
+
+''') +playlist_option_template = Template('''''') +def get_header(search_box_value=""): + playlists = '' + for name in local_playlist.get_playlist_names(): + playlists += playlist_option_template.substitute(name = name) + return header_template.substitute(playlists = playlists, search_box_value = html.escape(search_box_value)) + + + + + + + + + + + +def badges_html(badges): + return ' | '.join(map(html.escape, badges)) + + +html_transform_dispatch = { + 'title': html.escape, + 'published': html.escape, + 'id': html.escape, + 'description': yt_data_extract.format_text_runs, + 'duration': html.escape, + 'thumbnail': lambda url: html.escape('/' + url.lstrip('/')), + 'size': html.escape, + 'author': html.escape, + 'author_url': lambda url: html.escape(util.URL_ORIGIN + url), + 'views': html.escape, + 'subscriber_count': html.escape, + 'badges': badges_html, + 'playlist_index': html.escape, +} + +def get_html_ready(item): + html_ready = {} + for key, value in item.items(): + try: + function = html_transform_dispatch[key] + except KeyError: + continue + html_ready[key] = function(value) + return html_ready + + +author_template_url = Template('''
By $author
''') +author_template = Template('''
$author
''') +stat_templates = ( + Template('''$views'''), + Template(''''''), +) +def get_stats(html_ready): + stats = [] + if 'author' in html_ready: + if 'author_url' in html_ready: + stats.append(author_template_url.substitute(html_ready)) + else: + stats.append(author_template.substitute(html_ready)) + for stat in stat_templates: + try: + stats.append(stat.strict_substitute(html_ready)) + except KeyError: + pass + return ' | '.join(stats) + +def video_item_html(item, template, html_exclude=set()): + + video_info = {} + for key in ('id', 'title', 'author'): + try: + video_info[key] = item[key] + except KeyError: + video_info[key] = '' + try: + video_info['duration'] = item['duration'] + except KeyError: + video_info['duration'] = 'Live' # livestreams don't have a duration + + html_ready = get_html_ready(item) + + html_ready['video_info'] = html.escape(json.dumps(video_info) ) + html_ready['url'] = util.URL_ORIGIN + "/watch?v=" + html_ready['id'] + html_ready['datetime'] = '' #TODO + + for key in html_exclude: + del html_ready[key] + html_ready['stats'] = get_stats(html_ready) + + return template.substitute(html_ready) + + +def playlist_item_html(item, template, html_exclude=set()): + html_ready = get_html_ready(item) + + html_ready['url'] = util.URL_ORIGIN + "/playlist?list=" + html_ready['id'] + html_ready['datetime'] = '' #TODO + + for key in html_exclude: + del html_ready[key] + html_ready['stats'] = get_stats(html_ready) + + return template.substitute(html_ready) + + + + + + + +page_button_template = Template('''$page''') +current_page_button_template = Template('''
$page
''') + +def page_buttons_html(current_page, estimated_pages, url, current_query_string): + if current_page <= 5: + page_start = 1 + page_end = min(9, estimated_pages) + else: + page_start = current_page - 4 + page_end = min(current_page + 4, estimated_pages) + + result = "" + for page in range(page_start, page_end+1): + if page == current_page: + template = current_page_button_template + else: + template = page_button_template + result += template.substitute(page=page, href = url + "?" + util.update_query_string(current_query_string, {'page': [str(page)]}) ) + return result + + + + + + + +showing_results_for = Template(''' +
+
Showing results for $corrected_query
+
Search instead for $original_query
+
+''') + +did_you_mean = Template(''' +
+
Did you mean $corrected_query
+
+''') + +def renderer_html(renderer, additional_info={}, current_query_string=''): + type = list(renderer.keys())[0] + renderer = renderer[type] + if type == 'itemSectionRenderer': + return renderer_html(renderer['contents'][0], additional_info, current_query_string) + + if type == 'channelRenderer': + info = yt_data_extract.renderer_info(renderer) + html_ready = get_html_ready(info) + html_ready['url'] = util.URL_ORIGIN + "/channel/" + html_ready['id'] + return medium_channel_item_template.substitute(html_ready) + + if type in ('movieRenderer', 'clarificationRenderer'): + return '' + + info = yt_data_extract.renderer_info(renderer) + info.update(additional_info) + html_exclude = set(additional_info.keys()) + if type == 'compactVideoRenderer': + return video_item_html(info, small_video_item_template, html_exclude=html_exclude) + if type in ('compactPlaylistRenderer', 'compactRadioRenderer', 'compactShowRenderer'): + return playlist_item_html(info, small_playlist_item_template, html_exclude=html_exclude) + if type in ('videoRenderer', 'gridVideoRenderer'): + return video_item_html(info, medium_video_item_template, html_exclude=html_exclude) + if type in ('playlistRenderer', 'gridPlaylistRenderer', 'radioRenderer', 'gridRadioRenderer', 'gridShowRenderer', 'showRenderer'): + return playlist_item_html(info, medium_playlist_item_template, html_exclude=html_exclude) + + #print(renderer) + #raise NotImplementedError('Unknown renderer type: ' + type) + return '' \ No newline at end of file diff --git a/youtube/local_playlist.py b/youtube/local_playlist.py index 0375040..e354013 100644 --- a/youtube/local_playlist.py +++ b/youtube/local_playlist.py @@ -1,11 +1,12 @@ +from youtube.template import Template +from youtube import util, html_common +import settings + import os import json -from youtube.template import Template -from youtube import common import html import gevent import urllib -import settings playlists_directory = os.path.join(settings.data_dir, "playlists") thumbnails_directory = os.path.join(settings.data_dir, "playlist_thumbnails") @@ -38,7 +39,7 @@ def download_thumbnail(playlist_name, video_id): url = "https://i.ytimg.com/vi/" + video_id + "/mqdefault.jpg" save_location = os.path.join(thumbnails_directory, playlist_name, video_id + ".jpg") try: - thumbnail = common.fetch_url(url, report_text="Saved local playlist thumbnail: " + video_id) + thumbnail = util.fetch_url(url, report_text="Saved local playlist thumbnail: " + video_id) except urllib.error.HTTPError as e: print("Failed to download thumbnail for " + video_id + ": " + str(e)) return @@ -78,15 +79,15 @@ def get_local_playlist_page(name): if info['id'] + ".jpg" in thumbnails: info['thumbnail'] = "/youtube.com/data/playlist_thumbnails/" + name + "/" + info['id'] + ".jpg" else: - info['thumbnail'] = common.get_thumbnail_url(info['id']) + info['thumbnail'] = util.get_thumbnail_url(info['id']) missing_thumbnails.append(info['id']) - videos_html += common.video_item_html(info, common.small_video_item_template) + videos_html += html_common.video_item_html(info, html_common.small_video_item_template) except json.decoder.JSONDecodeError: pass gevent.spawn(download_thumbnails, name, missing_thumbnails) return local_playlist_template.substitute( page_title = name + ' - Local playlist', - header = common.get_header(), + header = html_common.get_header(), videos = videos_html, title = name, page_buttons = '' @@ -127,11 +128,11 @@ def get_playlists_list_page(): page = '''
    \n''' list_item_template = Template('''
  • $name
  • \n''') for name in get_playlist_names(): - page += list_item_template.substitute(url = html.escape(common.URL_ORIGIN + '/playlists/' + name), name = html.escape(name)) + page += list_item_template.substitute(url = html.escape(util.URL_ORIGIN + '/playlists/' + name), name = html.escape(name)) page += '''
\n''' - return common.yt_basic_template.substitute( + return html_common.yt_basic_template.substitute( page_title = "Local playlists", - header = common.get_header(), + header = html_common.get_header(), style = '', page = page, ) @@ -151,7 +152,7 @@ def path_edit_playlist(env, start_response): if parameters['action'][0] == 'remove': playlist_name = env['path_parts'][1] remove_from_playlist(playlist_name, parameters['video_info_list']) - start_response('303 See Other', [('Location', common.URL_ORIGIN + env['PATH_INFO']),] ) + start_response('303 See Other', [('Location', util.URL_ORIGIN + env['PATH_INFO']),] ) return b'' else: diff --git a/youtube/playlist.py b/youtube/playlist.py index 419ed00..fbe6448 100644 --- a/youtube/playlist.py +++ b/youtube/playlist.py @@ -1,10 +1,9 @@ +from youtube import util, yt_data_extract, html_common, template, proto + import base64 -import youtube.common as common import urllib import json import string -from youtube import template -import youtube.proto as proto import gevent import math @@ -49,10 +48,10 @@ headers_1 = ( def playlist_first_page(playlist_id, report_text = "Retrieved playlist"): url = 'https://m.youtube.com/playlist?list=' + playlist_id + '&pbj=1' - content = common.fetch_url(url, common.mobile_ua + headers_1, report_text=report_text) + content = util.fetch_url(url, util.mobile_ua + headers_1, report_text=report_text) '''with open('debug/playlist_debug', 'wb') as f: f.write(content)''' - content = json.loads(common.uppercase_escape(content.decode('utf-8'))) + content = json.loads(util.uppercase_escape(content.decode('utf-8'))) return content @@ -69,11 +68,11 @@ def get_videos(playlist_id, page): 'X-YouTube-Client-Version': '2.20180508', } - content = common.fetch_url(url, headers, report_text="Retrieved playlist") + content = util.fetch_url(url, headers, report_text="Retrieved playlist") '''with open('debug/playlist_debug', 'wb') as f: f.write(content)''' - info = json.loads(common.uppercase_escape(content.decode('utf-8'))) + info = json.loads(util.uppercase_escape(content.decode('utf-8'))) return info @@ -101,22 +100,22 @@ def get_playlist_page(env, start_response): video_list = this_page_json['response']['continuationContents']['playlistVideoListContinuation']['contents'] videos_html = '' for video_json in video_list: - info = common.renderer_info(video_json['playlistVideoRenderer']) - videos_html += common.video_item_html(info, common.small_video_item_template) + info = yt_data_extract.renderer_info(video_json['playlistVideoRenderer']) + videos_html += html_common.video_item_html(info, html_common.small_video_item_template) - metadata = common.renderer_info(first_page_json['response']['header']['playlistHeaderRenderer']) + metadata = yt_data_extract.renderer_info(first_page_json['response']['header']['playlistHeaderRenderer']) video_count = int(metadata['size'].replace(',', '')) - page_buttons = common.page_buttons_html(int(page), math.ceil(video_count/20), common.URL_ORIGIN + "/playlist", env['QUERY_STRING']) + page_buttons = html_common.page_buttons_html(int(page), math.ceil(video_count/20), util.URL_ORIGIN + "/playlist", env['QUERY_STRING']) - html_ready = common.get_html_ready(metadata) + html_ready = html_common.get_html_ready(metadata) html_ready['page_title'] = html_ready['title'] + ' - Page ' + str(page) stats = '' stats += playlist_stat_template.substitute(stat=html_ready['size'] + ' videos') stats += playlist_stat_template.substitute(stat=html_ready['views']) return yt_playlist_template.substitute( - header = 
common.get_header(), + header = html_common.get_header(), videos = videos_html, page_buttons = page_buttons, stats = stats, diff --git a/youtube/post_comment.py b/youtube/post_comment.py index 92c45e1..876a1c0 100644 --- a/youtube/post_comment.py +++ b/youtube/post_comment.py @@ -1,11 +1,11 @@ # Contains functions having to do with posting/editing/deleting comments +from youtube import util, html_common, proto, comments, accounts +import settings import urllib import json -from youtube import common, proto, comments, accounts import re import traceback -import settings import os def _post_comment(text, video_id, session_token, cookiejar): @@ -31,7 +31,7 @@ def _post_comment(text, video_id, session_token, cookiejar): data = urllib.parse.urlencode(data_dict).encode() - content = common.fetch_url("https://m.youtube.com/service_ajax?name=createCommentEndpoint", headers=headers, data=data, cookiejar_send=cookiejar) + content = util.fetch_url("https://m.youtube.com/service_ajax?name=createCommentEndpoint", headers=headers, data=data, cookiejar_send=cookiejar) code = json.loads(content)['code'] print("Comment posting code: " + code) @@ -62,7 +62,7 @@ def _post_comment_reply(text, video_id, parent_comment_id, session_token, cookie } data = urllib.parse.urlencode(data_dict).encode() - content = common.fetch_url("https://m.youtube.com/service_ajax?name=createCommentReplyEndpoint", headers=headers, data=data, cookiejar_send=cookiejar) + content = util.fetch_url("https://m.youtube.com/service_ajax?name=createCommentReplyEndpoint", headers=headers, data=data, cookiejar_send=cookiejar) code = json.loads(content)['code'] print("Comment posting code: " + code) @@ -90,7 +90,7 @@ def _delete_comment(video_id, comment_id, author_id, session_token, cookiejar): } data = urllib.parse.urlencode(data_dict).encode() - content = common.fetch_url("https://m.youtube.com/service_ajax?name=performCommentActionEndpoint", headers=headers, data=data, cookiejar_send=cookiejar) + content = util.fetch_url("https://m.youtube.com/service_ajax?name=performCommentActionEndpoint", headers=headers, data=data, cookiejar_send=cookiejar) code = json.loads(content)['code'] print("Comment deletion code: " + code) return code @@ -101,8 +101,8 @@ def get_session_token(video_id, cookiejar): # youtube-dl uses disable_polymer=1 which uses a different request format which has an obfuscated javascript algorithm to generate a parameter called "bgr" # Tokens retrieved from disable_polymer pages only work with that format. Tokens retrieved on mobile only work using mobile requests # Additionally, tokens retrieved without sending the same cookie won't work. So this is necessary even if the bgr and stuff was reverse engineered. 
- headers = {'User-Agent': common.mobile_user_agent} - mobile_page = common.fetch_url('https://m.youtube.com/watch?v=' + video_id, headers, report_text="Retrieved session token for comment", cookiejar_send=cookiejar, cookiejar_receive=cookiejar).decode() + headers = {'User-Agent': util.mobile_user_agent} + mobile_page = util.fetch_url('https://m.youtube.com/watch?v=' + video_id, headers, report_text="Retrieved session token for comment", cookiejar_send=cookiejar, cookiejar_receive=cookiejar).decode() match = xsrf_token_regex.search(mobile_page) if match: return match.group(1).replace("%3D", "=") @@ -118,9 +118,9 @@ def delete_comment(env, start_response): code = _delete_comment(video_id, parameters['comment_id'][0], parameters['author_id'][0], token, cookiejar) if code == "SUCCESS": - start_response('303 See Other', [('Location', common.URL_ORIGIN + '/comment_delete_success'),] ) + start_response('303 See Other', [('Location', util.URL_ORIGIN + '/comment_delete_success'),] ) else: - start_response('303 See Other', [('Location', common.URL_ORIGIN + '/comment_delete_fail'),] ) + start_response('303 See Other', [('Location', util.URL_ORIGIN + '/comment_delete_fail'),] ) def post_comment(env, start_response): parameters = env['parameters'] @@ -131,11 +131,11 @@ def post_comment(env, start_response): if 'parent_id' in parameters: code = _post_comment_reply(parameters['comment_text'][0], parameters['video_id'][0], parameters['parent_id'][0], token, cookiejar) - start_response('303 See Other', (('Location', common.URL_ORIGIN + '/comments?' + env['QUERY_STRING']),) ) + start_response('303 See Other', (('Location', util.URL_ORIGIN + '/comments?' + env['QUERY_STRING']),) ) else: code = _post_comment(parameters['comment_text'][0], parameters['video_id'][0], token, cookiejar) - start_response('303 See Other', (('Location', common.URL_ORIGIN + '/comments?ctoken=' + comments.make_comment_ctoken(video_id, sort=1)),) ) + start_response('303 See Other', (('Location', util.URL_ORIGIN + '/comments?ctoken=' + comments.make_comment_ctoken(video_id, sort=1)),) ) return b'' @@ -163,10 +163,10 @@ def get_delete_comment_page(env, start_response): page += ''' ''' - return common.yt_basic_template.substitute( + return html_common.yt_basic_template.substitute( page_title = "Delete comment?", style = style, - header = common.get_header(), + header = html_common.get_header(), page = page, ).encode('utf-8') @@ -174,7 +174,7 @@ def get_post_comment_page(env, start_response): start_response('200 OK', [('Content-type','text/html'),]) parameters = env['parameters'] video_id = parameters['video_id'][0] - parent_id = common.default_multi_get(parameters, 'parent_id', 0, default='') + parent_id = util.default_multi_get(parameters, 'parent_id', 0, default='') style = ''' main{ display: grid; @@ -194,23 +194,23 @@ textarea{ }''' if parent_id: # comment reply comment_box = comments.comment_box_template.substitute( - form_action = common.URL_ORIGIN + '/comments?parent_id=' + parent_id + "&video_id=" + video_id, + form_action = util.URL_ORIGIN + '/comments?parent_id=' + parent_id + "&video_id=" + video_id, video_id_input = '', post_text = "Post reply", options=comments.comment_box_account_options(), ) else: comment_box = comments.comment_box_template.substitute( - form_action = common.URL_ORIGIN + '/post_comment', + form_action = util.URL_ORIGIN + '/post_comment', video_id_input = '''''', post_text = "Post comment", options=comments.comment_box_account_options(), ) page = '''
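Putting the pieces of post_comment.py together — a hedged sketch of the posting flow (a real logged-in cookiejar is required; the video id is a made-up example):

```python
import http.cookiejar

# get_session_token fetches the mobile watch page with the account's
# cookies and regex-extracts the XSRF token; _post_comment then calls
# the createCommentEndpoint service_ajax endpoint and returns its
# 'code' field, which is 'SUCCESS' when the comment was accepted.
cookiejar = http.cookiejar.LWPCookieJar()   # assumed loaded with account cookies
video_id = 'dQw4w9WgXcQ'                    # made-up example id

token = get_session_token(video_id, cookiejar)
code = _post_comment('Nice video', video_id, token, cookiejar)
print(code == 'SUCCESS')
```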
\n''' + comment_box + '''
\n''' - return common.yt_basic_template.substitute( + return html_common.yt_basic_template.substitute( page_title = "Post comment reply" if parent_id else "Post a comment", style = style, - header = common.get_header(), + header = html_common.get_header(), page = page, ).encode('utf-8') diff --git a/youtube/search.py b/youtube/search.py index db65eaa..0cef0f3 100644 --- a/youtube/search.py +++ b/youtube/search.py @@ -1,11 +1,12 @@ +from youtube import util, html_common, yt_data_extract, proto + import json import urllib import html from string import Template import base64 from math import ceil -from youtube.common import default_multi_get, get_thumbnail_url, URL_ORIGIN -from youtube import common, proto + with open("yt_search_results_template.html", "r") as file: yt_search_results_template = file.read() @@ -54,7 +55,7 @@ def get_search_json(query, page, autocorrect, sort, filters): 'X-YouTube-Client-Version': '2.20180418', } url += "&pbj=1&sp=" + page_number_to_sp_parameter(page, autocorrect, sort, filters).replace("=", "%3D") - content = common.fetch_url(url, headers=headers, report_text="Got search results") + content = util.fetch_url(url, headers=headers, report_text="Got search results") info = json.loads(content) return info @@ -70,9 +71,9 @@ def get_search_page(env, start_response): start_response('200 OK', [('Content-type','text/html'),]) parameters = env['parameters'] if len(parameters) == 0: - return common.yt_basic_template.substitute( + return html_common.yt_basic_template.substitute( page_title = "Search", - header = common.get_header(), + header = html_common.get_header(), style = '', page = '', ).encode('utf-8') @@ -100,24 +101,24 @@ def get_search_page(env, start_response): renderer = renderer[type] corrected_query_string = parameters.copy() corrected_query_string['query'] = [renderer['correctedQueryEndpoint']['searchEndpoint']['query']] - corrected_query_url = URL_ORIGIN + '/search?' + urllib.parse.urlencode(corrected_query_string, doseq=True) + corrected_query_url = util.URL_ORIGIN + '/search?' + urllib.parse.urlencode(corrected_query_string, doseq=True) corrections = did_you_mean.substitute( corrected_query_url = corrected_query_url, - corrected_query = common.format_text_runs(renderer['correctedQuery']['runs']), + corrected_query = yt_data_extract.format_text_runs(renderer['correctedQuery']['runs']), ) continue if type == 'showingResultsForRenderer': renderer = renderer[type] no_autocorrect_query_string = parameters.copy() no_autocorrect_query_string['autocorrect'] = ['0'] - no_autocorrect_query_url = URL_ORIGIN + '/search?' + urllib.parse.urlencode(no_autocorrect_query_string, doseq=True) + no_autocorrect_query_url = util.URL_ORIGIN + '/search?' 
+ urllib.parse.urlencode(no_autocorrect_query_string, doseq=True) corrections = showing_results_for.substitute( - corrected_query = common.format_text_runs(renderer['correctedQuery']['runs']), + corrected_query = yt_data_extract.format_text_runs(renderer['correctedQuery']['runs']), original_query_url = no_autocorrect_query_url, original_query = html.escape(renderer['originalQuery']['simpleText']), ) continue - result_list_html += common.renderer_html(renderer, current_query_string=env['QUERY_STRING']) + result_list_html += html_common.renderer_html(renderer, current_query_string=env['QUERY_STRING']) page = int(page) if page <= 5: @@ -129,13 +130,13 @@ def get_search_page(env, start_response): result = Template(yt_search_results_template).substitute( - header = common.get_header(query), + header = html_common.get_header(query), results = result_list_html, page_title = query + " - Search", search_box_value = html.escape(query), number_of_results = '{:,}'.format(estimated_results), number_of_pages = '{:,}'.format(estimated_pages), - page_buttons = common.page_buttons_html(page, estimated_pages, URL_ORIGIN + "/search", env['QUERY_STRING']), + page_buttons = html_common.page_buttons_html(page, estimated_pages, util.URL_ORIGIN + "/search", env['QUERY_STRING']), corrections = corrections ) return result.encode('utf-8') diff --git a/youtube/util.py b/youtube/util.py new file mode 100644 index 0000000..2ebd8bc --- /dev/null +++ b/youtube/util.py @@ -0,0 +1,153 @@ +import socks, sockshandler +import gzip +import brotli +import urllib.parse +import re +import time +import settings + + +URL_ORIGIN = "/https://www.youtube.com" + + +class HTTPAsymmetricCookieProcessor(urllib.request.BaseHandler): + '''Separate cookiejars for receiving and sending''' + def __init__(self, cookiejar_send=None, cookiejar_receive=None): + import http.cookiejar + self.cookiejar_send = cookiejar_send + self.cookiejar_receive = cookiejar_receive + + def http_request(self, request): + if self.cookiejar_send is not None: + self.cookiejar_send.add_cookie_header(request) + return request + + def http_response(self, request, response): + if self.cookiejar_receive is not None: + self.cookiejar_receive.extract_cookies(response, request) + return response + + https_request = http_request + https_response = http_response + + +def decode_content(content, encoding_header): + encodings = encoding_header.replace(' ', '').split(',') + for encoding in reversed(encodings): + if encoding == 'identity': + continue + if encoding == 'br': + content = brotli.decompress(content) + elif encoding == 'gzip': + content = gzip.decompress(content) + return content + +def fetch_url(url, headers=(), timeout=15, report_text=None, data=None, cookiejar_send=None, cookiejar_receive=None, use_tor=True): + ''' + When cookiejar_send is set to a CookieJar object, + those cookies will be sent in the request (but cookies in response will not be merged into it) + When cookiejar_receive is set to a CookieJar object, + cookies received in the response will be merged into the object (nothing will be sent from it) + When both are set to the same object, cookies will be sent from the object, + and response cookies will be merged into it. + ''' + headers = dict(headers) # Note: Calling dict() on a dict will make a copy + headers['Accept-Encoding'] = 'gzip, br' + + # prevent python version being leaked by urllib if User-Agent isn't provided + # (urllib will use ex. 
Python-urllib/3.6 otherwise) + if 'User-Agent' not in headers and 'user-agent' not in headers and 'User-agent' not in headers: + headers['User-Agent'] = 'Python-urllib' + + if data is not None: + if isinstance(data, str): + data = data.encode('ascii') + elif not isinstance(data, bytes): + data = urllib.parse.urlencode(data).encode('ascii') + + start_time = time.time() + + + req = urllib.request.Request(url, data=data, headers=headers) + + cookie_processor = HTTPAsymmetricCookieProcessor(cookiejar_send=cookiejar_send, cookiejar_receive=cookiejar_receive) + + if use_tor and settings.route_tor: + opener = urllib.request.build_opener(sockshandler.SocksiPyHandler(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9150), cookie_processor) + else: + opener = urllib.request.build_opener(cookie_processor) + + response = opener.open(req, timeout=timeout) + response_time = time.time() + + + content = response.read() + read_finish = time.time() + if report_text: + print(report_text, ' Latency:', round(response_time - start_time,3), ' Read time:', round(read_finish - response_time,3)) + content = decode_content(content, response.getheader('Content-Encoding', default='identity')) + return content + +mobile_user_agent = 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1' +mobile_ua = (('User-Agent', mobile_user_agent),) +desktop_user_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:52.0) Gecko/20100101 Firefox/52.0' +desktop_ua = (('User-Agent', desktop_user_agent),) + + + + + + + + + + +def dict_add(*dicts): + for dictionary in dicts[1:]: + dicts[0].update(dictionary) + return dicts[0] + +def video_id(url): + url_parts = urllib.parse.urlparse(url) + return urllib.parse.parse_qs(url_parts.query)['v'][0] + +def default_multi_get(object, *keys, default): + ''' Like dict.get(), but for nested dictionaries/sequences, supporting keys or indices. 
Last argument is the default value to use in case of any IndexErrors or KeyErrors ''' + try: + for key in keys: + object = object[key] + return object + except (IndexError, KeyError): + return default + + +# default, sddefault, mqdefault, hqdefault, hq720 +def get_thumbnail_url(video_id): + return "/i.ytimg.com/vi/" + video_id + "/mqdefault.jpg" + +def seconds_to_timestamp(seconds): + seconds = int(seconds) + hours, seconds = divmod(seconds,3600) + minutes, seconds = divmod(seconds,60) + if hours != 0: + timestamp = str(hours) + ":" + timestamp += str(minutes).zfill(2) # zfill pads with zeros + else: + timestamp = str(minutes) + + timestamp += ":" + str(seconds).zfill(2) + return timestamp + + + +def update_query_string(query_string, items): + parameters = urllib.parse.parse_qs(query_string) + parameters.update(items) + return urllib.parse.urlencode(parameters, doseq=True) + + + +def uppercase_escape(s): + return re.sub( + r'\\U([0-9a-fA-F]{8})', + lambda m: chr(int(m.group(1), base=16)), s) \ No newline at end of file diff --git a/youtube/watch.py b/youtube/watch.py index 04a5b5d..e8c10a2 100644 --- a/youtube/watch.py +++ b/youtube/watch.py @@ -1,12 +1,12 @@ +from youtube import util, html_common, comments + from youtube_dl.YoutubeDL import YoutubeDL from youtube_dl.extractor.youtube import YoutubeError import json import urllib from string import Template import html -import youtube.common as common -from youtube.common import default_multi_get, get_thumbnail_url, video_id, URL_ORIGIN -import youtube.comments as comments + import gevent import settings import os @@ -127,9 +127,9 @@ def get_related_items_html(info): result = "" for item in info['related_vids']: if 'list' in item: # playlist: - result += common.small_playlist_item_html(watch_page_related_playlist_info(item)) + result += html_common.small_playlist_item_html(watch_page_related_playlist_info(item)) else: - result += common.small_video_item_html(watch_page_related_video_info(item)) + result += html_common.small_video_item_html(watch_page_related_video_info(item)) return result @@ -137,7 +137,7 @@ def get_related_items_html(info): # converts these to standard names def watch_page_related_video_info(item): result = {key: item[key] for key in ('id', 'title', 'author')} - result['duration'] = common.seconds_to_timestamp(item['length_seconds']) + result['duration'] = util.seconds_to_timestamp(item['length_seconds']) try: result['views'] = item['short_view_count_text'] except KeyError: @@ -155,9 +155,9 @@ def watch_page_related_playlist_info(item): def sort_formats(info): sorted_formats = info['formats'].copy() - sorted_formats.sort(key=lambda x: default_multi_get(_formats, x['format_id'], 'height', default=0)) + sorted_formats.sort(key=lambda x: util.default_multi_get(_formats, x['format_id'], 'height', default=0)) for index, format in enumerate(sorted_formats): - if default_multi_get(_formats, format['format_id'], 'height', default=0) >= 360: + if util.default_multi_get(_formats, format['format_id'], 'height', default=0) >= 360: break sorted_formats = sorted_formats[index:] + sorted_formats[0:index] sorted_formats = [format for format in info['formats'] if format['acodec'] != 'none' and format['vcodec'] != 'none'] @@ -236,7 +236,7 @@ def get_watch_page(env, start_response): start_response('200 OK', [('Content-type','text/html'),]) - lc = common.default_multi_get(env['parameters'], 'lc', 0, default='') + lc = util.default_multi_get(env['parameters'], 'lc', 0, default='') if settings.route_tor: proxy = 'socks5://127.0.0.1:9150/' 
else: @@ -256,17 +256,17 @@ def get_watch_page(env, start_response): #chosen_format = choose_format(info) if isinstance(info, str): # youtube error - return common.yt_basic_template.substitute( + return html_common.yt_basic_template.substitute( page_title = "Error", style = "", - header = common.get_header(), + header = html_common.get_header(), page = html.escape(info), ).encode('utf-8') sorted_formats = sort_formats(info) video_info = { - "duration": common.seconds_to_timestamp(info["duration"]), + "duration": util.seconds_to_timestamp(info["duration"]), "id": info['id'], "title": info['title'], "author": info['uploader'], @@ -338,7 +338,7 @@ def get_watch_page(env, start_response): page = yt_watch_template.substitute( video_title = html.escape(info["title"]), page_title = html.escape(info["title"]), - header = common.get_header(), + header = html_common.get_header(), uploader = html.escape(info["uploader"]), uploader_channel_url = '/' + info["uploader_url"], upload_date = upload_date, diff --git a/youtube/youtube.py b/youtube/youtube.py index b6b12fb..a6a216e 100644 --- a/youtube/youtube.py +++ b/youtube/youtube.py @@ -1,7 +1,7 @@ import mimetypes import urllib.parse import os -from youtube import local_playlist, watch, search, playlist, channel, comments, common, post_comment, accounts +from youtube import local_playlist, watch, search, playlist, channel, comments, post_comment, accounts, util import settings YOUTUBE_FILES = ( "/shared.css", @@ -64,7 +64,7 @@ def youtube(env, start_response): elif path.startswith("/api/"): start_response('200 OK', [('Content-type', 'text/vtt'),] ) - result = common.fetch_url('https://www.youtube.com' + path + ('?' + query_string if query_string else '')) + result = util.fetch_url('https://www.youtube.com' + path + ('?' 
+ query_string if query_string else '')) result = result.replace(b"align:start position:0%", b"") return result diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py new file mode 100644 index 0000000..5483911 --- /dev/null +++ b/youtube/yt_data_extract.py @@ -0,0 +1,205 @@ +import html + +# videos (all of type str): + +# id +# title +# url +# author +# author_url +# thumbnail +# description +# published +# duration +# likes +# dislikes +# views +# playlist_index + +# playlists: + +# id +# title +# url +# author +# author_url +# thumbnail +# description +# updated +# size +# first_video_id + + + + + + + +def get_plain_text(node): + try: + return html.escape(node['simpleText']) + except KeyError: + return unformmated_text_runs(node['runs']) + +def unformmated_text_runs(runs): + result = '' + for text_run in runs: + result += html.escape(text_run["text"]) + return result + +def format_text_runs(runs): + if isinstance(runs, str): + return runs + result = '' + for text_run in runs: + if text_run.get("bold", False): + result += "" + html.escape(text_run["text"]) + "" + elif text_run.get('italics', False): + result += "" + html.escape(text_run["text"]) + "" + else: + result += html.escape(text_run["text"]) + return result + + + + + + + + +def get_url(node): + try: + return node['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'] + except KeyError: + return node['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'] + + +def get_text(node): + try: + return node['simpleText'] + except KeyError: + pass + try: + return node['runs'][0]['text'] + except IndexError: # empty text runs + return '' + +def get_formatted_text(node): + try: + return node['runs'] + except KeyError: + return node['simpleText'] + +def get_badges(node): + badges = [] + for badge_node in node: + badge = badge_node['metadataBadgeRenderer']['label'] + if badge.lower() != 'new': + badges.append(badge) + return badges + +def get_thumbnail(node): + try: + return node['thumbnails'][0]['url'] # polymer format + except KeyError: + return node['url'] # ajax format + +dispatch = { + +# polymer format + 'title': ('title', get_text), + 'publishedTimeText': ('published', get_text), + 'videoId': ('id', lambda node: node), + 'descriptionSnippet': ('description', get_formatted_text), + 'lengthText': ('duration', get_text), + 'thumbnail': ('thumbnail', get_thumbnail), + 'thumbnails': ('thumbnail', lambda node: node[0]['thumbnails'][0]['url']), + + 'viewCountText': ('views', get_text), + 'numVideosText': ('size', lambda node: get_text(node).split(' ')[0]), # the format is "324 videos" + 'videoCountText': ('size', get_text), + 'playlistId': ('id', lambda node: node), + 'descriptionText': ('description', get_formatted_text), + + 'subscriberCountText': ('subscriber_count', get_text), + 'channelId': ('id', lambda node: node), + 'badges': ('badges', get_badges), + +# ajax format + 'view_count_text': ('views', get_text), + 'num_videos_text': ('size', lambda node: get_text(node).split(' ')[0]), + 'owner_text': ('author', get_text), + 'owner_endpoint': ('author_url', lambda node: node['url']), + 'description': ('description', get_formatted_text), + 'index': ('playlist_index', get_text), + 'short_byline': ('author', get_text), + 'length': ('duration', get_text), + 'video_id': ('id', lambda node: node), + +} + +def renderer_info(renderer): + try: + info = {} + if 'viewCountText' in renderer: # prefer this one as it contains all the digits + info['views'] = get_text(renderer['viewCountText']) + elif 
'shortViewCountText' in renderer: + info['views'] = get_text(renderer['shortViewCountText']) + + if 'ownerText' in renderer: + info['author'] = renderer['ownerText']['runs'][0]['text'] + info['author_url'] = renderer['ownerText']['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'] + try: + overlays = renderer['thumbnailOverlays'] + except KeyError: + pass + else: + for overlay in overlays: + if 'thumbnailOverlayTimeStatusRenderer' in overlay: + info['duration'] = get_text(overlay['thumbnailOverlayTimeStatusRenderer']['text']) + # show renderers don't have videoCountText + elif 'thumbnailOverlayBottomPanelRenderer' in overlay: + info['size'] = get_text(overlay['thumbnailOverlayBottomPanelRenderer']['text']) + + # show renderers don't have playlistId, have to dig into the url to get it + try: + info['id'] = renderer['navigationEndpoint']['watchEndpoint']['playlistId'] + except KeyError: + pass + for key, node in renderer.items(): + if key in ('longBylineText', 'shortBylineText'): + info['author'] = get_text(node) + try: + info['author_url'] = get_url(node) + except KeyError: + pass + + # show renderers don't have thumbnail key at top level, dig into thumbnailRenderer + elif key == 'thumbnailRenderer' and 'showCustomThumbnailRenderer' in node: + info['thumbnail'] = node['showCustomThumbnailRenderer']['thumbnail']['thumbnails'][0]['url'] + else: + try: + simple_key, function = dispatch[key] + except KeyError: + continue + info[simple_key] = function(node) + return info + except KeyError: + print(renderer) + raise + +def ajax_info(item_json): + try: + info = {} + for key, node in item_json.items(): + try: + simple_key, function = dispatch[key] + except KeyError: + continue + info[simple_key] = function(node) + return info + except KeyError: + print(item_json) + raise + + -- cgit v1.2.3 From 1e1f55c9e5d98ba076bc67e7abe9e4d77d84c65b Mon Sep 17 00:00:00 2001 From: James Taylor Date: Mon, 25 Feb 2019 20:47:49 -0800 Subject: Use persistent connections --- youtube/util.py | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 87 insertions(+), 11 deletions(-) (limited to 'youtube') diff --git a/youtube/util.py b/youtube/util.py index 2ebd8bc..e0f4a2c 100644 --- a/youtube/util.py +++ b/youtube/util.py @@ -1,14 +1,75 @@ +import settings import socks, sockshandler import gzip import brotli import urllib.parse import re import time -import settings +# The trouble with the requests library: It ships its own certificate bundle via certifi +# instead of using the system certificate store, meaning self-signed certificates +# configured by the user will not work. Some draconian networks block TLS unless a corporate +# certificate is installed on the system. Additionally, some users install a self signed cert +# in order to use programs to modify or monitor requests made by programs on the system. + +# Finally, certificates expire and need to be updated, or are sometimes revoked. Sometimes +# certificate authorites go rogue and need to be untrusted. Since we are going through Tor exit nodes, +# this becomes all the more important. A rogue CA could issue a fake certificate for accounts.google.com, and a +# malicious exit node could use this to decrypt traffic when logging in and retrieve passwords. 
Examples: +# https://www.engadget.com/2015/10/29/google-warns-symantec-over-certificates/ +# https://nakedsecurity.sophos.com/2013/12/09/serious-security-google-finds-fake-but-trusted-ssl-certificates-for-its-domains-made-in-france/ + +# In the requests documentation it says: +# "Before version 2.16, Requests bundled a set of root CAs that it trusted, sourced from the Mozilla trust store. +# The certificates were only updated once for each Requests version. When certifi was not installed, +# this led to extremely out-of-date certificate bundles when using significantly older versions of Requests. +# For the sake of security we recommend upgrading certifi frequently!" +# (http://docs.python-requests.org/en/master/user/advanced/#ca-certificates) + +# Expecting users to remember to manually update certifi on Linux isn't reasonable in my view. +# On windows, this is even worse since I am distributing all dependencies. This program is not +# updated frequently, and using requests would lead to outdated certificates. Certificates +# should be updated with OS updates, instead of thousands of developers of different programs +# being expected to do this correctly 100% of the time. + +# There is hope that this might be fixed eventually: +# https://github.com/kennethreitz/requests/issues/2966 + +# Until then, I will use a mix of urllib3 and urllib. +import urllib3 +import urllib3.contrib.socks URL_ORIGIN = "/https://www.youtube.com" +connection_pool = urllib3.PoolManager(cert_reqs = 'CERT_REQUIRED') + +old_tor_connection_pool = None +tor_connection_pool = urllib3.contrib.socks.SOCKSProxyManager('socks5://127.0.0.1:9150/', cert_reqs = 'CERT_REQUIRED') + +tor_pool_refresh_time = time.monotonic() # prevent problems due to clock changes + +def get_pool(use_tor): + global old_tor_connection_pool + global tor_connection_pool + global tor_pool_refresh_time + + if not use_tor: + return connection_pool + + # Tor changes circuits after 10 minutes: https://tor.stackexchange.com/questions/262/for-how-long-does-a-circuit-stay-alive + current_time = time.monotonic() + if current_time - tor_pool_refresh_time > 300: # close pool after 5 minutes + tor_connection_pool.clear() + + # Keep a reference for 5 min to avoid it getting garbage collected while sockets still in use + old_tor_connection_pool = tor_connection_pool + + tor_connection_pool = urllib3.contrib.socks.SOCKSProxyManager('socks5://127.0.0.1:9150/', cert_reqs = 'CERT_REQUIRED') + tor_pool_refresh_time = current_time + + return tor_connection_pool + + class HTTPAsymmetricCookieProcessor(urllib.request.BaseHandler): '''Separate cookiejars for receiving and sending''' @@ -42,7 +103,7 @@ def decode_content(content, encoding_header): content = gzip.decompress(content) return content -def fetch_url(url, headers=(), timeout=15, report_text=None, data=None, cookiejar_send=None, cookiejar_receive=None, use_tor=True): +def fetch_url(url, headers=(), timeout=15, report_text=None, data=None, cookiejar_send=None, cookiejar_receive=None, use_tor=True, return_response=False): ''' When cookiejar_send is set to a CookieJar object, those cookies will be sent in the request (but cookies in response will not be merged into it) @@ -59,7 +120,9 @@ def fetch_url(url, headers=(), timeout=15, report_text=None, data=None, cookieja if 'User-Agent' not in headers and 'user-agent' not in headers and 'User-agent' not in headers: headers['User-Agent'] = 'Python-urllib' + method = "GET" if data is not None: + method = "POST" if isinstance(data, str): data = data.encode('ascii') elif not 
isinstance(data, bytes): @@ -67,25 +130,38 @@ def fetch_url(url, headers=(), timeout=15, report_text=None, data=None, cookieja start_time = time.time() + if cookiejar_send is not None or cookiejar_receive is not None: # Use urllib + req = urllib.request.Request(url, data=data, headers=headers) - req = urllib.request.Request(url, data=data, headers=headers) + cookie_processor = HTTPAsymmetricCookieProcessor(cookiejar_send=cookiejar_send, cookiejar_receive=cookiejar_receive) - cookie_processor = HTTPAsymmetricCookieProcessor(cookiejar_send=cookiejar_send, cookiejar_receive=cookiejar_receive) + if use_tor and settings.route_tor: + opener = urllib.request.build_opener(sockshandler.SocksiPyHandler(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9150), cookie_processor) + else: + opener = urllib.request.build_opener(cookie_processor) - if use_tor and settings.route_tor: - opener = urllib.request.build_opener(sockshandler.SocksiPyHandler(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9150), cookie_processor) - else: - opener = urllib.request.build_opener(cookie_processor) + response = opener.open(req, timeout=timeout) + response_time = time.time() - response = opener.open(req, timeout=timeout) - response_time = time.time() + content = response.read() + + else: # Use a urllib3 pool. Cookies can't be used since urllib3 doesn't have easy support for them. + pool = get_pool(use_tor and settings.route_tor) + + response = pool.request(method, url, headers=headers, timeout=timeout, preload_content=False) + response_time = time.time() + + content = response.read() + response.release_conn() - content = response.read() read_finish = time.time() if report_text: print(report_text, ' Latency:', round(response_time - start_time,3), ' Read time:', round(read_finish - response_time,3)) content = decode_content(content, response.getheader('Content-Encoding', default='identity')) + + if return_response: + return content, response return content mobile_user_agent = 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1' -- cgit v1.2.3 From 89daa27be257c6170e7d730529bee9ce1294f73d Mon Sep 17 00:00:00 2001 From: James Taylor Date: Tue, 26 Feb 2019 17:15:51 -0800 Subject: prevent urllib3 from decoding content --- youtube/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube') diff --git a/youtube/util.py b/youtube/util.py index e0f4a2c..9950815 100644 --- a/youtube/util.py +++ b/youtube/util.py @@ -149,7 +149,7 @@ def fetch_url(url, headers=(), timeout=15, report_text=None, data=None, cookieja else: # Use a urllib3 pool. Cookies can't be used since urllib3 doesn't have easy support for them. 
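# Illustration: a minimal standalone sketch of the urllib3 branch below,
# assuming a stock urllib3 installation. preload_content=False defers the body
# read so latency and read time can be measured separately, and release_conn()
# returns the socket to the pool, which is what makes the connection persistent.
import urllib3
sketch_pool = urllib3.PoolManager(cert_reqs='CERT_REQUIRED')
sketch_response = sketch_pool.request('GET', 'https://www.youtube.com/', preload_content=False)
sketch_content = sketch_response.read()
sketch_response.release_conn()  # socket goes back to the pool for reuse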
pool = get_pool(use_tor and settings.route_tor) - response = pool.request(method, url, headers=headers, timeout=timeout, preload_content=False) + response = pool.request(method, url, headers=headers, timeout=timeout, preload_content=False, decode_content=False) response_time = time.time() content = response.read() -- cgit v1.2.3 From 8ceaa34a53ebe4c1fb78342bbf70763641cbd186 Mon Sep 17 00:00:00 2001 From: James Taylor Date: Tue, 26 Feb 2019 17:27:53 -0800 Subject: html_common: remove x_item_html functions --- youtube/html_common.py | 50 -------------------------------------------------- youtube/watch.py | 8 ++++++-- 2 files changed, 6 insertions(+), 52 deletions(-) (limited to 'youtube') diff --git a/youtube/html_common.py b/youtube/html_common.py index 27a582b..8e65a1f 100644 --- a/youtube/html_common.py +++ b/youtube/html_common.py @@ -103,56 +103,6 @@ medium_channel_item_template = Template(''' -# ----- -# HTML -# ----- - -def small_video_item_html(item): - video_info = json.dumps({key: item[key] for key in ('id', 'title', 'author', 'duration')}) - return small_video_item_template.substitute( - title = html.escape(item["title"]), - views = item["views"], - author = html.escape(item["author"]), - duration = item["duration"], - url = util.URL_ORIGIN + "/watch?v=" + item["id"], - thumbnail = util.get_thumbnail_url(item['id']), - video_info = html.escape(video_info), - ) - -def small_playlist_item_html(item): - return small_playlist_item_template.substitute( - title=html.escape(item["title"]), - size = item['size'], - author="", - url = util.URL_ORIGIN + "/playlist?list=" + item["id"], - thumbnail= util.get_thumbnail_url(item['first_video_id']), - ) - -def medium_playlist_item_html(item): - return medium_playlist_item_template.substitute( - title=html.escape(item["title"]), - size = item['size'], - author=item['author'], - author_url= util.URL_ORIGIN + item['author_url'], - url = util.URL_ORIGIN + "/playlist?list=" + item["id"], - thumbnail= item['thumbnail'], - ) - -def medium_video_item_html(medium_video_info): - info = medium_video_info - - return medium_video_item_template.substitute( - title=html.escape(info["title"]), - views=info["views"], - published = info["published"], - description = yt_data_extract.format_text_runs(info["description"]), - author=html.escape(info["author"]), - author_url=info["author_url"], - duration=info["duration"], - url = util.URL_ORIGIN + "/watch?v=" + info["id"], - thumbnail=info['thumbnail'], - datetime='', # TODO - ) header_template = Template(''' diff --git a/youtube/watch.py b/youtube/watch.py index e8c10a2..06b525a 100644 --- a/youtube/watch.py +++ b/youtube/watch.py @@ -127,9 +127,11 @@ def get_related_items_html(info): result = "" for item in info['related_vids']: if 'list' in item: # playlist: - result += html_common.small_playlist_item_html(watch_page_related_playlist_info(item)) + item = watch_page_related_playlist_info(item) + result += html_common.playlist_item_html(item, html_common.small_playlist_item_template) else: - result += html_common.small_video_item_html(watch_page_related_video_info(item)) + item = watch_page_related_video_info(item) + result += html_common.video_item_html(item, html_common.small_video_item_template) return result @@ -142,6 +144,7 @@ def watch_page_related_video_info(item): result['views'] = item['short_view_count_text'] except KeyError: result['views'] = '' + result['thumbnail'] = util.get_thumbnail_url(item['id']) return result def watch_page_related_playlist_info(item): @@ -150,6 +153,7 @@ def 
watch_page_related_playlist_info(item): 'title': item['playlist_title'], 'id': item['list'], 'first_video_id': item['video_id'], + 'thumbnail': util.get_thumbnail_url(item['video_id']), } -- cgit v1.2.3 From 0ecf3d08be9206b1067fa37301fb187256dc17ab Mon Sep 17 00:00:00 2001 From: James Taylor Date: Mon, 25 Mar 2019 13:15:54 -0700 Subject: Fix error with channel searches with non-ascii characters --- youtube/proto.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube') diff --git a/youtube/proto.py b/youtube/proto.py index 004375a..d966455 100644 --- a/youtube/proto.py +++ b/youtube/proto.py @@ -60,7 +60,7 @@ def unpadded_b64encode(data): def as_bytes(value): if isinstance(value, str): - return value.encode('ascii') + return value.encode('utf-8') return value -- cgit v1.2.3 From 9f93b9429c77e631972186049fbc7518e2cf5d4b Mon Sep 17 00:00:00 2001 From: James Taylor Date: Thu, 9 May 2019 23:07:43 -0700 Subject: Fix comment parsing error due to comments from deleted channels --- youtube/comments.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'youtube') diff --git a/youtube/comments.py b/youtube/comments.py index 4087b47..94b086e 100644 --- a/youtube/comments.py +++ b/youtube/comments.py @@ -234,10 +234,7 @@ def parse_comments_polymer(content, replies=False): comment_raw = comment_raw['commentRenderer'] comment = { - 'author': yt_data_extract.get_plain_text(comment_raw['authorText']), - 'author_url': comment_raw['authorEndpoint']['commandMetadata']['webCommandMetadata']['url'], - 'author_channel_id': comment_raw['authorEndpoint']['browseEndpoint']['browseId'], - 'author_id': comment_raw['authorId'], + 'author_id': comment_raw.get('authorId', ''), 'author_avatar': comment_raw['authorThumbnail']['thumbnails'][0]['url'], 'likes': comment_raw['likeCount'], 'published': yt_data_extract.get_plain_text(comment_raw['publishedTimeText']), @@ -247,6 +244,16 @@ def parse_comments_polymer(content, replies=False): 'video_id': video_id, 'comment_id': comment_raw['commentId'], } + + if 'authorText' in comment_raw: # deleted channels have no name or channel link + comment['author'] = yt_data_extract.get_plain_text(comment_raw['authorText']) + comment['author_url'] = comment_raw['authorEndpoint']['commandMetadata']['webCommandMetadata']['url'] + comment['author_channel_id'] = comment_raw['authorEndpoint']['browseEndpoint']['browseId'] + else: + comment['author'] = '' + comment['author_url'] = '' + comment['author_channel_id'] = '' + comments.append(comment) except Exception as e: print('Error parsing comments: ' + str(e)) -- cgit v1.2.3
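The dispatch table introduced in yt_data_extract.py above keeps renderer parsing declarative: each raw key maps to a (simple_name, extractor) pair, so supporting a new field means adding one table entry instead of another if/elif branch. A minimal sketch of renderer_info on a hand-built renderer (the field values here are invented for illustration):

from youtube import yt_data_extract

renderer = {
    'videoId': 'dQw4w9WgXcQ',
    'title': {'runs': [{'text': 'Some video'}]},
    'lengthText': {'simpleText': '3:33'},
}
print(yt_data_extract.renderer_info(renderer))
# {'id': 'dQw4w9WgXcQ', 'title': 'Some video', 'duration': '3:33'}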
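The one-line proto.py change above is small but load-bearing: continuation tokens are protobuf messages built from user input, and value.encode('ascii') raised UnicodeEncodeError the moment a channel search contained a non-ASCII character. A quick sketch of the fixed as_bytes (inputs invented for illustration):

from youtube import proto

proto.as_bytes('naïve')      # b'na\xc3\xafve': UTF-8 round-trips any query text
proto.as_bytes(b'\x12\x04')  # bytes pass through unchanged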
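The comments.py fix follows the same defensive pattern used throughout the extraction code: any key that can be absent from scraped JSON (here, everything identifying the author, since deleted channels have none) is read with a default rather than indexed directly. A minimal sketch of the pattern, with an invented response fragment:

comment_raw = {'commentId': 'UgzK1x', 'likeCount': 3}  # deleted channel: no author keys

comment = {'author_id': comment_raw.get('authorId', '')}  # '' instead of a KeyError
if 'authorText' in comment_raw:
    comment['author'] = comment_raw['authorText']['simpleText']
else:
    comment['author'] = ''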