From 98157bf1bf1223ffa7556d2d21cfac6f07675f9d Mon Sep 17 00:00:00 2001 From: James Taylor Date: Sat, 30 Jun 2018 23:34:46 -0700 Subject: initial commit --- .gitignore | 9 + server.py | 141 +++++++++ youtube/channel.py | 252 ++++++++++++++++ youtube/comments.css | 59 ++++ youtube/comments.py | 166 +++++++++++ youtube/common.py | 639 ++++++++++++++++++++++++++++++++++++++++ youtube/opensearch.xml | 11 + youtube/playlist.py | 243 +++++++++++++++ youtube/proto.py | 65 ++++ youtube/search.py | 231 +++++++++++++++ youtube/shared.css | 271 +++++++++++++++++ youtube/subscriptions.py | 18 ++ youtube/template.py | 132 +++++++++ youtube/watch.py | 294 ++++++++++++++++++ youtube/watch_later.py | 11 + youtube/youtube.py | 60 ++++ yt_channel_about_template.html | 128 ++++++++ yt_channel_items_template.html | 134 +++++++++ yt_comments_template.html | 62 ++++ yt_playlist_template.html | 132 +++++++++ yt_search_results_template.html | 105 +++++++ yt_search_template.html | 108 +++++++ yt_watch_template.html | 148 ++++++++++ 23 files changed, 3419 insertions(+) create mode 100644 .gitignore create mode 100644 server.py create mode 100644 youtube/channel.py create mode 100644 youtube/comments.css create mode 100644 youtube/comments.py create mode 100644 youtube/common.py create mode 100644 youtube/opensearch.xml create mode 100644 youtube/playlist.py create mode 100644 youtube/proto.py create mode 100644 youtube/search.py create mode 100644 youtube/shared.css create mode 100644 youtube/subscriptions.py create mode 100644 youtube/template.py create mode 100644 youtube/watch.py create mode 100644 youtube/watch_later.py create mode 100644 youtube/youtube.py create mode 100644 yt_channel_about_template.html create mode 100644 yt_channel_items_template.html create mode 100644 yt_comments_template.html create mode 100644 yt_playlist_template.html create mode 100644 yt_search_results_template.html create mode 100644 yt_search_template.html create mode 100644 yt_watch_template.html diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b0d993e --- /dev/null +++ b/.gitignore @@ -0,0 +1,9 @@ +__pycache__/ +*.py[cod] +*$py.class +youtube_dl/ +banned_addresses.txt +youtube/common_old.py +youtube/common_older.py +youtube/watch_old.py +youtube/watch_later.txt \ No newline at end of file diff --git a/server.py b/server.py new file mode 100644 index 0000000..cc88d05 --- /dev/null +++ b/server.py @@ -0,0 +1,141 @@ +from gevent import monkey +monkey.patch_all() +import gevent.socket + +from gevent.pywsgi import WSGIServer +from youtube.youtube import youtube +import urllib +import socket +import socks +import subprocess +import re + +ROUTE_TOR = True +TOR_PATH = ***REMOVED*** +PORT_NUMBER=80 +ALLOW_FOREIGN_ADDRESSES=True + +BAN_FILE = "banned_addresses.txt" +with open(BAN_FILE, 'r') as f: + banned_addresses = f.read().splitlines() + +def ban_address(address): + banned_addresses.append(address) + with open(BAN_FILE, 'a') as f: + f.write(address + "\n") + + +def youtu_be(env, start_response): + id = env['PATH_INFO'][1:] + env['PATH_INFO'] = '/watch' + env['QUERY_STRING'] = 'v=' + id + return youtube(env, start_response) + +def proxy_site(env, start_response): + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)', + 'Accept': '*/*', + } + url = "https://" + env['SERVER_NAME'] + env['PATH_INFO'] + if env['QUERY_STRING']: + url += '?' 
+ env['QUERY_STRING'] + req = urllib.request.Request(url, headers=headers) + response = urllib.request.urlopen(req, timeout = 10) + start_response('200 OK', () ) + return response.read() + +site_handlers = { + 'youtube.com':youtube, + 'youtu.be':youtu_be, + 'ytimg.com': proxy_site, + 'yt3.ggpht.com': proxy_site, + 'lh3.googleusercontent.com': proxy_site, + +} + +def split_url(url): + ''' Split https://sub.example.com/foo/bar.html into ('sub.example.com', '/foo/bar.html')''' + # XXX: Is this regex safe from REDOS? + # python STILL doesn't have a proper regular expression engine like grep uses built in... + match = re.match(r'(?:https?://)?([\w-]+(?:\.[\w-]+)+?)(/.*|$)', url) + if match is None: + raise ValueError('Invalid or unsupported url: ' + url) + + return match.group(1), match.group(2) + + + +def error_code(code, start_response): + start_response(code, ()) + return code.encode() + +def site_dispatch(env, start_response): + client_address = env['REMOTE_ADDR'] + try: + method = env['REQUEST_METHOD'] + path = env['PATH_INFO'] + if client_address in banned_addresses: + yield error_code('403 Fuck Off', start_response) + return + if method=="POST" and client_address not in ('127.0.0.1', '::1'): + yield error_code('403 Forbidden', start_response) + return + if "phpmyadmin" in path or (path == "/" and method == "HEAD"): + ban_address(client_address) + start_response('403 Fuck Off', ()) + yield b'403 Fuck Off' + return + + '''if env['QUERY_STRING']: + path += '?' + env['QUERY_STRING']''' + #path_parts = urllib.parse.urlparse(path) + try: + env['SERVER_NAME'], env['PATH_INFO'] = split_url(path[1:]) + except ValueError: + yield error_code('404 Not Found', start_response) + return + + base_name = '' + for domain in reversed(env['SERVER_NAME'].split('.')): + if base_name == '': + base_name = domain + else: + base_name = domain + '.' 
+ base_name + + try: + handler = site_handlers[base_name] + except KeyError: + continue + else: + yield handler(env, start_response) + break + else: # did not break + yield error_code('404 Not Found', start_response) + return + + + except (socket.error, ConnectionAbortedError) as e: + start_response('500 Internal Server Error', ()) + print(str(e)) + yield b'500 Internal Server Error' + + except Exception: + start_response('500 Internal Server Error', ()) + raise + return + + + + +if ROUTE_TOR: + #subprocess.Popen(TOR_PATH) + socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, '127.0.0.1', 9150) + socket.socket = socks.socksocket + gevent.socket.socket = socks.socksocket + +if ALLOW_FOREIGN_ADDRESSES: + server = WSGIServer(('0.0.0.0', PORT_NUMBER), site_dispatch) +else: + server = WSGIServer(('127.0.0.1', PORT_NUMBER), site_dispatch) +print('Started httpserver on port ' , PORT_NUMBER) +server.serve_forever() diff --git a/youtube/channel.py b/youtube/channel.py new file mode 100644 index 0000000..d993d3b --- /dev/null +++ b/youtube/channel.py @@ -0,0 +1,252 @@ +import base64 +import youtube.common as common +from youtube.common import default_multi_get, URL_ORIGIN, get_thumbnail_url, video_id +import urllib +import json +from string import Template +import youtube.proto as proto +import html +import math +import gevent +import re +import functools + +with open("yt_channel_items_template.html", "r") as file: + yt_channel_items_template = Template(file.read()) + +with open("yt_channel_about_template.html", "r") as file: + yt_channel_about_template = Template(file.read()) + +'''continuation = Proto( + Field('optional', 'continuation', 80226972, Proto( + Field('optional', 'browse_id', 2, String), + Field('optional', 'params', 3, Base64(Proto( + Field('optional', 'channel_tab', 2, String), + Field('optional', 'sort', 3, ENUM + Field('optional', 'page', 15, String), + ))) + )) +)''' + + +'''channel_continuation = Proto( + Field('optional', 'pointless_nest', 80226972, Proto( + Field('optional', 'channel_id', 2, String), + Field('optional', 'continuation_info', 3, Base64(Proto( + Field('optional', 'channel_tab', 2, String), + Field('optional', 'sort', 3, ENUM + Field('optional', 'page', 15, String), + ))) + )) +)''' + +headers_1 = ( + ('Accept', '*/*'), + ('Accept-Language', 'en-US,en;q=0.5'), + ('X-YouTube-Client-Name', '1'), + ('X-YouTube-Client-Version', '2.20180614'), +) +# https://www.youtube.com/browse_ajax?action_continuation=1&direct_render=1&continuation=4qmFsgJAEhhVQzdVY3M0MkZaeTN1WXpqcnF6T0lIc3caJEVnWjJhV1JsYjNNZ0FEZ0JZQUZxQUhvQk1yZ0JBQSUzRCUzRA%3D%3D +# https://www.youtube.com/browse_ajax?ctoken=4qmFsgJAEhhVQzdVY3M0MkZaeTN1WXpqcnF6T0lIc3caJEVnWjJhV1JsYjNNZ0FEZ0JZQUZxQUhvQk1yZ0JBQSUzRCUzRA%3D%3D&continuation=4qmFsgJAEhhVQzdVY3M0MkZaeTN1WXpqcnF6T0lIc3caJEVnWjJhV1JsYjNNZ0FEZ0JZQUZxQUhvQk1yZ0JBQSUzRCUzRA%3D%3D&itct=CDsQybcCIhMIhZi1krTc2wIVjMicCh2HXQnhKJsc + +# grid view: 4qmFsgJAEhhVQzdVY3M0MkZaeTN1WXpqcnF6T0lIc3caJEVnWjJhV1JsYjNNZ0FEZ0JZQUZxQUhvQk1yZ0JBQSUzRCUzRA +# list view: 4qmFsgJCEhhVQzdVY3M0MkZaeTN1WXpqcnF6T0lIc3caJkVnWjJhV1JsYjNNWUF5QUFNQUk0QVdBQmFnQjZBVEs0QVFBJTNE +# SORT: +# Popular - 1 +# Oldest - 2 +# Newest - 3 + +# view: +# grid: 0 or 1 +# list: 2 +def channel_ctoken(channel_id, page, sort, tab, view=1): + + tab = proto.string(2, tab ) + sort = proto.uint(3, int(sort)) + page = proto.string(15, str(page) ) + view = proto.uint(6, int(view)) + continuation_info = proto.string( 3, proto.percent_b64encode(tab + view + sort + page) ) + + channel_id = proto.string(2, channel_id ) + 
pointless_nest = proto.string(80226972, channel_id + continuation_info)
+
+ return base64.urlsafe_b64encode(pointless_nest).decode('ascii')
+
+def get_channel_tab(channel_id, page="1", sort=3, tab='videos', view=1):
+ ctoken = channel_ctoken(channel_id, page, sort, tab, view).replace('=', '%3D')
+ url = "https://www.youtube.com/browse_ajax?ctoken=" + ctoken
+
+ print("Sending channel tab ajax request")
+ content = common.fetch_url(url, headers_1)
+ print("Finished receiving channel tab response")
+
+ info = json.loads(content)
+ return info
+
+
+grid_video_item_template = Template('''
+
+ + + $duration + + $title + + $views + + +
+ +
+''') + +def grid_video_item_info(grid_video_renderer, author): + renderer = grid_video_renderer + return { + "title": renderer['title']['simpleText'], + "id": renderer['videoId'], + "views": renderer['viewCountText'].get('simpleText', None) or renderer['viewCountText']['runs'][0]['text'], + "author": author, + "duration": default_multi_get(renderer, 'lengthText', 'simpleText', default=''), # livestreams dont have a length + "published": default_multi_get(renderer, 'publishedTimeText', 'simpleText', default=''), + } + +def grid_video_item_html(item): + video_info = json.dumps({key: item[key] for key in ('id', 'title', 'author', 'duration')}) + return grid_video_item_template.substitute( + title = html.escape(item["title"]), + views = item["views"], + duration = item["duration"], + url = URL_ORIGIN + "/watch?v=" + item["id"], + thumbnail = get_thumbnail_url(item['id']), + video_info = html.escape(json.dumps(video_info)), + published = item["published"], + datetime = '', # TODO + ) + +def get_number_of_videos(channel_id): + # Uploads playlist + playlist_id = 'UU' + channel_id[2:] + url = 'https://m.youtube.com/playlist?list=' + playlist_id + '&ajax=1&disable_polymer=true' + print("Getting number of videos") + response = common.fetch_url(url, common.mobile_ua + headers_1) + with open('playlist_debug_metadata', 'wb') as f: + f.write(response) + response = response.decode('utf-8') + print("Got response for number of videos") + return int(re.search(r'"num_videos_text":\s*{(?:"item_type":\s*"formatted_string",)?\s*"runs":\s*\[{"text":\s*"([\d,]*) videos"', response).group(1).replace(',','')) + +@functools.lru_cache(maxsize=128) +def get_channel_id(username): + # method that gives the smallest possible response at ~10 kb + # needs to be as fast as possible + url = 'https://m.youtube.com/user/' + username + '/about?ajax=1&disable_polymer=true' + response = common.fetch_url(url, common.mobile_ua + headers_1).decode('utf-8') + return re.search(r'"channel_id":\s*"([a-zA-Z0-9_-]*)"', response).group(1) + + +def channel_videos_html(polymer_json, current_page=1, number_of_videos = 1000, current_query_string=''): + microformat = polymer_json[1]['response']['microformat']['microformatDataRenderer'] + channel_url = microformat['urlCanonical'].rstrip('/') + channel_id = channel_url[channel_url.rfind('/')+1:] + try: + items = polymer_json[1]['response']['continuationContents']['gridContinuation']['items'] + except KeyError: + items = polymer_json[1]['response']['contents']['twoColumnBrowseResultsRenderer']['tabs'][1]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['gridRenderer']['items'] + items_html = '' + for video in items: + items_html += grid_video_item_html(grid_video_item_info(video['gridVideoRenderer'], microformat['title'])) + + return yt_channel_items_template.substitute( + channel_title = microformat['title'], + channel_about_url = URL_ORIGIN + "/channel/" + channel_id + "/about", + avatar = '/' + microformat['thumbnail']['thumbnails'][0]['url'], + page_title = microformat['title'] + ' - Channel', + items = items_html, + page_buttons = common.page_buttons_html(current_page, math.ceil(number_of_videos/30), URL_ORIGIN + "/channel/" + channel_id + "/videos", current_query_string) + ) + +channel_link_template = Template(''' +$text''') +stat_template = Template(''' +
  • $stat_value
  • ''') +def channel_about_page(polymer_json): + avatar = '/' + polymer_json[1]['response']['microformat']['microformatDataRenderer']['thumbnail']['thumbnails'][0]['url'] + # my goodness... + channel_metadata = polymer_json[1]['response']['contents']['twoColumnBrowseResultsRenderer']['tabs'][5]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer'] + channel_links = '' + for link_json in channel_metadata['primaryLinks']: + channel_links += channel_link_template.substitute( + url = html.escape(link_json['navigationEndpoint']['urlEndpoint']['url']), + text = common.get_plain_text(link_json['title']), + ) + + stats = '' + for stat_name in ('subscriberCountText', 'joinedDateText', 'viewCountText', 'country'): + try: + stat_value = common.get_plain_text(channel_metadata[stat_name]) + except KeyError: + continue + else: + stats += stat_template.substitute(stat_value=stat_value) + try: + description = common.format_text_runs(common.get_formatted_text(channel_metadata['description'])) + except KeyError: + description = '' + return yt_channel_about_template.substitute( + page_title = common.get_plain_text(channel_metadata['title']) + ' - About', + channel_title = common.get_plain_text(channel_metadata['title']), + avatar = html.escape(avatar), + description = description, + links = channel_links, + stats = stats, + channel_videos_url = common.URL_ORIGIN + '/channel/' + channel_metadata['channelId'] + '/videos', + ) + +def get_channel_page(url, query_string=''): + path_components = url.rstrip('/').lstrip('/').split('/') + channel_id = path_components[0] + try: + tab = path_components[1] + except IndexError: + tab = 'videos' + + parameters = urllib.parse.parse_qs(query_string) + page_number = int(common.default_multi_get(parameters, 'page', 0, default='1')) + sort = common.default_multi_get(parameters, 'sort', 0, default='3') + view = common.default_multi_get(parameters, 'view', 0, default='1') + + if tab == 'videos': + tasks = ( + gevent.spawn(get_number_of_videos, channel_id ), + gevent.spawn(get_channel_tab, channel_id, page_number, sort, 'videos', view) + ) + gevent.joinall(tasks) + number_of_videos, polymer_json = tasks[0].value, tasks[1].value + + return channel_videos_html(polymer_json, page_number, number_of_videos, query_string) + elif tab == 'about': + polymer_json = common.fetch_url('https://www.youtube.com/channel/' + channel_id + '/about?pbj=1', headers_1) + polymer_json = json.loads(polymer_json) + return channel_about_page(polymer_json) + else: + raise ValueError('Unknown channel tab: ' + tab) + +def get_user_page(url, query_string=''): + path_components = url.rstrip('/').lstrip('/').split('/') + username = path_components[0] + try: + page = path_components[1] + except IndexError: + page = 'videos' + if page == 'videos': + polymer_json = common.fetch_url('https://www.youtube.com/user/' + username + '/videos?pbj=1', headers_1) + polymer_json = json.loads(polymer_json) + return channel_videos_html(polymer_json) + elif page == 'about': + polymer_json = common.fetch_url('https://www.youtube.com/user/' + username + '/about?pbj=1', headers_1) + polymer_json = json.loads(polymer_json) + return channel_about_page(polymer_json) + else: + raise ValueError('Unknown channel page: ' + page) \ No newline at end of file diff --git a/youtube/comments.css b/youtube/comments.css new file mode 100644 index 0000000..93a6495 --- /dev/null +++ b/youtube/comments.css @@ -0,0 +1,59 @@ +.comments{ + grid-row-gap: 10px; + 
display: grid; + align-content:start; +} + +.comment{ + display:grid; + grid-template-columns: 0fr 0fr 1fr; + grid-template-rows: 0fr 0fr 0fr 0fr; + background-color: #dadada; +} + +.comment .author-avatar{ + grid-column: 1; + grid-row: 1 / span 3; + align-self: start; + margin-right: 5px; +} + +.comment address{ + grid-column: 2; + grid-row: 1; + margin-right:15px; + white-space: nowrap; +} + +.comment .text{ + grid-column: 2 / span 2; + grid-row: 2; + white-space: pre-line; + min-width: 0; +} + +.comment time{ + grid-column: 3; + grid-row: 1; + white-space: nowrap; + +} + + +.comment .likes{ + grid-column:2; + grid-row:3; + font-weight:bold; + white-space: nowrap; +} + +.comment .replies{ + grid-column:2 / span 2; + grid-row:4; + justify-self:start; +} + +.more-comments{ + justify-self:center; + +} \ No newline at end of file diff --git a/youtube/comments.py b/youtube/comments.py new file mode 100644 index 0000000..4b30a48 --- /dev/null +++ b/youtube/comments.py @@ -0,0 +1,166 @@ +import json +import youtube.proto as proto +import base64 +from youtube.common import uppercase_escape, default_multi_get, format_text_runs, URL_ORIGIN, fetch_url +from string import Template +import urllib.request +import urllib +import html +comment_template = Template(''' +
    +
    + + + +
    + $author +
    + $text + + +$replies +
    + +
    +''') +reply_link_template = Template(''' + View replies +''') +with open("yt_comments_template.html", "r") as file: + yt_comments_template = Template(file.read()) + + +# $replies_link_text + + +# Here's what I know about the secret key (starting with ASJN_i) +# *The secret key definitely contains the following information (or perhaps the information is stored at youtube's servers): +# -Video id +# -Offset +# -Sort +# *If the video id or sort in the ctoken contradicts the ASJN, the response is an error. The offset encoded outside the ASJN is ignored entirely. +# *The ASJN is base64 encoded data, indicated by the fact that the character after "ASJN_i" is one of ("0", "1", "2", "3") +# *The encoded data is not valid protobuf +# *The encoded data (after the 5 or so bytes that are always the same) is indistinguishable from random data according to a battery of randomness tests +# *The ASJN in the ctoken provided by a response changes in regular intervals of about a second or two. +# *Old ASJN's continue to work, and start at the same comment even if new comments have been posted since +# *The ASJN has no relation with any of the data in the response it came from + +def make_comment_ctoken(video_id, sort=0, offset=0, secret_key=''): + video_id = proto.as_bytes(video_id) + secret_key = proto.as_bytes(secret_key) + + + page_info = proto.string(4,video_id) + proto.uint(6, sort) + offset_information = proto.nested(4, page_info) + proto.uint(5, offset) + if secret_key: + offset_information = proto.string(1, secret_key) + offset_information + + result = proto.nested(2, proto.string(2, video_id)) + proto.uint(3,6) + proto.nested(6, offset_information) + return base64.urlsafe_b64encode(result).decode('ascii') + +mobile_headers = { + 'Host': 'm.youtube.com', + 'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1', + 'Accept': '*/*', + 'Accept-Language': 'en-US,en;q=0.5', + 'X-YouTube-Client-Name': '2', + 'X-YouTube-Client-Version': '1.20180613', +} +def request_comments(ctoken, replies=False): + if replies: # let's make it use different urls for no reason despite all the data being encoded + base_url = "https://m.youtube.com/watch_comment?action_get_comment_replies=1&ctoken=" + else: + base_url = "https://m.youtube.com/watch_comment?action_get_comments=1&ctoken=" + url = base_url + ctoken.replace("=", "%3D") + "&pbj=1" + print("Sending comments ajax request") + for i in range(0,8): # don't retry more than 8 times + content = fetch_url(url, headers=mobile_headers) + if content[0:4] == b")]}'": # random closing characters included at beginning of response for some reason + content = content[4:] + elif content[0:10] == b'\n, retrying") + continue + break + '''with open('comments_debug', 'wb') as f: + f.write(content)''' + return content + +def parse_comments(content, replies=False): + try: + content = json.loads(uppercase_escape(content.decode('utf-8'))) + #print(content) + comments_raw = content['content']['continuation_contents']['contents'] + ctoken = default_multi_get(content, 'content', 'continuation_contents', 'continuations', 0, 'continuation', default='') + + comments = [] + for comment_raw in comments_raw: + replies_url = '' + if not replies: + if comment_raw['replies'] is not None: + ctoken = comment_raw['replies']['continuations'][0]['continuation'] + replies_url = URL_ORIGIN + '/comments?ctoken=' + ctoken + "&replies=1" + comment_raw = comment_raw['comment'] + comment = { + 'author': 
comment_raw['author']['runs'][0]['text'], + 'author_url': comment_raw['author_endpoint']['url'], + 'author_avatar': comment_raw['author_thumbnail']['url'], + 'likes': comment_raw['like_count'], + 'published': comment_raw['published_time']['runs'][0]['text'], + 'text': comment_raw['content']['runs'], + 'reply_count': '', + 'replies_url': replies_url, + } + comments.append(comment) + except Exception as e: + print('Error parsing comments: ' + str(e)) + comments = () + ctoken = '' + else: + print("Finished getting and parsing comments") + return {'ctoken': ctoken, 'comments': comments} + +def get_comments_html(result): + html_result = '' + for comment in result['comments']: + replies = '' + if comment['replies_url']: + replies = reply_link_template.substitute(url=comment['replies_url']) + html_result += comment_template.substitute( + author=html.escape(comment['author']), + author_url = URL_ORIGIN + comment['author_url'], + author_avatar = '/' + comment['author_avatar'], + likes = str(comment['likes']) + ' likes' if str(comment['likes']) != '0' else '', + published = comment['published'], + text = format_text_runs(comment['text']), + datetime = '', #TODO + replies=replies, + #replies='', + ) + return html_result, result['ctoken'] + +def video_comments(video_id, sort=0, offset=0, secret_key=''): + result = parse_comments(request_comments(make_comment_ctoken(video_id, sort, offset, secret_key))) + return get_comments_html(result) + +more_comments_template = Template('''More comments''') + +def get_comments_page(query_string): + parameters = urllib.parse.parse_qs(query_string) + ctoken = parameters['ctoken'][0] + replies = default_multi_get(parameters, 'replies', 0, default="0") == "1" + + result = parse_comments(request_comments(ctoken, replies), replies) + comments_html, ctoken = get_comments_html(result) + if ctoken == '': + more_comments_button = '' + else: + more_comments_button = more_comments_template.substitute(url = URL_ORIGIN + '/comments?ctoken=' + ctoken) + + return yt_comments_template.substitute( + comments = comments_html, + page_title = 'Comments', + more_comments_button=more_comments_button, + ) + diff --git a/youtube/common.py b/youtube/common.py new file mode 100644 index 0000000..67bd81f --- /dev/null +++ b/youtube/common.py @@ -0,0 +1,639 @@ +from youtube.template import Template +import html +import json +import re +import urllib.parse +import gzip +import brotli +import time + + +URL_ORIGIN = "/https://www.youtube.com" + + +# videos (all of type str): + +# id +# title +# url +# author +# author_url +# thumbnail +# description +# published +# duration +# likes +# dislikes +# views +# playlist_index + +# playlists: + +# id +# title +# url +# author +# author_url +# thumbnail +# description +# updated +# size +# first_video_id + + + + + + + +page_button_template = Template('''$page''') +current_page_button_template = Template('''
    $page''') + +medium_playlist_item_template = Template(''' + +''') +medium_video_item_template = Template(''' +
    + + + $duration + + + $title + +
    $stats
    + + + $description + $badges +
    +''') + +small_video_item_template = Template(''' +
    +
    + + + $duration + + $title + +
    $author
    + $views + +
    + +
    +''') + +small_playlist_item_template = Template(''' +
    +
    + + +
    + $size +
    +
    + $title + +
    $author
    +
    +
    +''') + +medium_channel_item_template = Template(''' +
    + + + $duration + + + $title + + $subscriber_count + $size + + $description +
    +''') + + +def fetch_url(url, headers=(), timeout=5, report_text=None): + if isinstance(headers, list): + headers += [('Accept-Encoding', 'gzip, br')] + headers = dict(headers) + elif isinstance(headers, tuple): + headers += (('Accept-Encoding', 'gzip, br'),) + headers = dict(headers) + else: + headers = headers.copy() + headers['Accept-Encoding'] = 'gzip, br' + + start_time = time.time() + + req = urllib.request.Request(url, headers=headers) + response = urllib.request.urlopen(req, timeout=timeout) + response_time = time.time() + + content = response.read() + read_finish = time.time() + if report_text: + print(report_text, 'Latency:', response_time - start_time, ' Read time:', read_finish - response_time) + encodings = response.getheader('Content-Encoding', default='identity').replace(' ', '').split(',') + for encoding in reversed(encodings): + if encoding == 'identity': + continue + if encoding == 'br': + content = brotli.decompress(content) + elif encoding == 'gzip': + content = gzip.decompress(content) + return content + +mobile_ua = (('User-Agent', 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1'),) + +def dict_add(*dicts): + for dictionary in dicts[1:]: + dicts[0].update(dictionary) + return dicts[0] + +def video_id(url): + url_parts = urllib.parse.urlparse(url) + return urllib.parse.parse_qs(url_parts.query)['v'][0] + +def uppercase_escape(s): + return re.sub( + r'\\U([0-9a-fA-F]{8})', + lambda m: chr(int(m.group(1), base=16)), s) + +def default_multi_get(object, *keys, default): + ''' Like dict.get(), but for nested dictionaries/sequences, supporting keys or indices. Last argument is the default value to use in case of any IndexErrors or KeyErrors ''' + try: + for key in keys: + object = object[key] + return object + except (IndexError, KeyError): + return default + +def get_plain_text(node): + try: + return html.escape(node['simpleText']) + except KeyError: + return unformmated_text_runs(node['runs']) + +def unformmated_text_runs(runs): + result = '' + for text_run in runs: + result += html.escape(text_run["text"]) + return result + +def format_text_runs(runs): + if isinstance(runs, str): + return runs + result = '' + for text_run in runs: + if text_run.get("bold", False): + result += "" + html.escape(text_run["text"]) + "" + elif text_run.get('italics', False): + result += "" + html.escape(text_run["text"]) + "" + else: + result += html.escape(text_run["text"]) + return result + +# default, sddefault, mqdefault, hqdefault, hq720 +def get_thumbnail_url(video_id): + return "/i.ytimg.com/vi/" + video_id + "/mqdefault.jpg" + +def seconds_to_timestamp(seconds): + seconds = int(seconds) + hours, seconds = divmod(seconds,3600) + minutes, seconds = divmod(seconds,60) + if hours != 0: + timestamp = str(hours) + ":" + timestamp += str(minutes).zfill(2) # zfill pads with zeros + else: + timestamp = str(minutes) + + timestamp += ":" + str(seconds).zfill(2) + return timestamp + +# playlists: + +# id +# title +# url +# author +# author_url +# thumbnail +# description +# updated +# size +# first_video_id +def medium_playlist_item_info(playlist_renderer): + renderer = playlist_renderer + try: + author_url = URL_ORIGIN + renderer['longBylineText']['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'] + except KeyError: # radioRenderer + author_url = '' + try: + thumbnail = renderer['thumbnails'][0]['thumbnails'][0]['url'] + except KeyError: + thumbnail = 
renderer['thumbnail']['thumbnails'][0]['url'] + return { + "title": renderer["title"]["simpleText"], + 'id': renderer["playlistId"], + 'size': renderer.get('videoCount', '50+'), + "author": default_multi_get(renderer,'longBylineText','runs',0,'text', default='Youtube'), + "author_url": author_url, + 'thumbnail': thumbnail, + } + +def medium_video_item_info(video_renderer): + renderer = video_renderer + try: + return { + "title": renderer["title"]["simpleText"], + "id": renderer["videoId"], + "description": renderer.get("descriptionSnippet",dict()).get('runs',[]), # a list of text runs (formmated), rather than plain text + "thumbnail": get_thumbnail_url(renderer["videoId"]), + "views": renderer['viewCountText'].get('simpleText', None) or renderer['viewCountText']['runs'][0]['text'], + "duration": default_multi_get(renderer, 'lengthText', 'simpleText', default=''), # livestreams dont have a length + "author": renderer['longBylineText']['runs'][0]['text'], + "author_url": URL_ORIGIN + renderer['longBylineText']['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'], + "published": default_multi_get(renderer, 'publishedTimeText', 'simpleText', default=''), + } + except KeyError: + print(renderer) + raise + +def small_video_item_info(compact_video_renderer): + renderer = compact_video_renderer + return { + "title": renderer['title']['simpleText'], + "id": renderer['videoId'], + "views": renderer['viewCountText'].get('simpleText', None) or renderer['viewCountText']['runs'][0]['text'], + "duration": default_multi_get(renderer, 'lengthText', 'simpleText', default=''), # livestreams dont have a length + "author": renderer['longBylineText']['runs'][0]['text'], + "author_url": renderer['longBylineText']['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'], + } + + +# ----- +# HTML +# ----- + +def small_video_item_html(item): + video_info = json.dumps({key: item[key] for key in ('id', 'title', 'author', 'duration')}) + return small_video_item_template.substitute( + title = html.escape(item["title"]), + views = item["views"], + author = html.escape(item["author"]), + duration = item["duration"], + url = URL_ORIGIN + "/watch?v=" + item["id"], + thumbnail = get_thumbnail_url(item['id']), + video_info = html.escape(json.dumps(video_info)), + ) + +def small_playlist_item_html(item): + return small_playlist_item_template.substitute( + title=html.escape(item["title"]), + size = item['size'], + author="", + url = URL_ORIGIN + "/playlist?list=" + item["id"], + thumbnail= get_thumbnail_url(item['first_video_id']), + ) + +def medium_playlist_item_html(item): + return medium_playlist_item_template.substitute( + title=html.escape(item["title"]), + size = item['size'], + author=item['author'], + author_url= URL_ORIGIN + item['author_url'], + url = URL_ORIGIN + "/playlist?list=" + item["id"], + thumbnail= item['thumbnail'], + ) + +def medium_video_item_html(medium_video_info): + info = medium_video_info + + return medium_video_item_template.substitute( + title=html.escape(info["title"]), + views=info["views"], + published = info["published"], + description = format_text_runs(info["description"]), + author=html.escape(info["author"]), + author_url=info["author_url"], + duration=info["duration"], + url = URL_ORIGIN + "/watch?v=" + info["id"], + thumbnail=info['thumbnail'], + datetime='', # TODO + ) + +html_functions = { + 'compactVideoRenderer': lambda x: small_video_item_html(small_video_item_info(x)), + 'videoRenderer': lambda x: 
medium_video_item_html(medium_video_item_info(x)), + 'compactPlaylistRenderer': lambda x: small_playlist_item_html(small_playlist_item_info(x)), + 'playlistRenderer': lambda x: medium_playlist_item_html(medium_playlist_item_info(x)), + 'channelRenderer': lambda x: '', + 'radioRenderer': lambda x: medium_playlist_item_html(medium_playlist_item_info(x)), + 'compactRadioRenderer': lambda x: small_playlist_item_html(small_playlist_item_info(x)), + 'didYouMeanRenderer': lambda x: '', +} + + + + + + + +def get_url(node): + try: + return node['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'] + except KeyError: + return node['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'] + + +def get_text(node): + try: + return node['simpleText'] + except KeyError: + return node['runs'][0]['text'] + +def get_formatted_text(node): + try: + return node['runs'] + except KeyError: + return node['simpleText'] + +def get_badges(node): + badges = [] + for badge_node in node: + badge = badge_node['metadataBadgeRenderer']['label'] + if badge.lower() != 'new': + badges.append(badge) + return badges + +def get_thumbnail(node): + try: + return node['thumbnails'][0]['url'] # polymer format + except KeyError: + return node['url'] # ajax format + +dispatch = { + +# polymer format + 'title': ('title', get_text), + 'publishedTimeText': ('published', get_text), + 'videoId': ('id', lambda node: node), + 'descriptionSnippet': ('description', get_formatted_text), + 'lengthText': ('duration', get_text), + 'thumbnail': ('thumbnail', get_thumbnail), + 'thumbnails': ('thumbnail', lambda node: node[0]['thumbnails'][0]['url']), + + 'videoCountText': ('size', get_text), + 'playlistId': ('id', lambda node: node), + + 'subscriberCountText': ('subscriber_count', get_text), + 'channelId': ('id', lambda node: node), + 'badges': ('badges', get_badges), + +# ajax format + 'view_count_text': ('views', get_text), + 'num_videos_text': ('size', lambda node: get_text(node).split(' ')[0]), + 'owner_text': ('author', get_text), + 'owner_endpoint': ('author_url', lambda node: node['url']), + 'description': ('description', get_formatted_text), + 'index': ('playlist_index', get_text), + 'short_byline': ('author', get_text), + 'length': ('duration', get_text), + 'video_id': ('id', lambda node: node), + +} + +def renderer_info(renderer): + try: + info = {} + if 'viewCountText' in renderer: # prefer this one as it contains all the digits + info['views'] = get_text(renderer['viewCountText']) + elif 'shortViewCountText' in renderer: + info['views'] = get_text(renderer['shortViewCountText']) + + for key, node in renderer.items(): + if key in ('longBylineText', 'shortBylineText'): + info['author'] = get_text(node) + try: + info['author_url'] = get_url(node) + except KeyError: + pass + + continue + + try: + simple_key, function = dispatch[key] + except KeyError: + continue + info[simple_key] = function(node) + return info + except KeyError: + print(renderer) + raise + +def ajax_info(item_json): + try: + info = {} + for key, node in item_json.items(): + try: + simple_key, function = dispatch[key] + except KeyError: + continue + info[simple_key] = function(node) + return info + except KeyError: + print(item_json) + raise + +def badges_html(badges): + return ' | '.join(map(html.escape, badges)) + + + + + +html_transform_dispatch = { + 'title': html.escape, + 'published': html.escape, + 'id': html.escape, + 'description': format_text_runs, + 'duration': html.escape, + 'thumbnail': lambda url: html.escape('/' + 
url.lstrip('/')), + 'size': html.escape, + 'author': html.escape, + 'author_url': lambda url: html.escape(URL_ORIGIN + url), + 'views': html.escape, + 'subscriber_count': html.escape, + 'badges': badges_html, + 'playlist_index': html.escape, +} + +def get_html_ready(item): + html_ready = {} + for key, value in item.items(): + try: + function = html_transform_dispatch[key] + except KeyError: + continue + html_ready[key] = function(value) + return html_ready + + +author_template_url = Template('''
    By $author
    ''') +author_template = Template('''
    By $author
    ''') +stat_templates = ( + Template('''$views'''), + Template(''''''), +) +def get_video_stats(html_ready): + stats = [] + if 'author' in html_ready: + if 'author_url' in html_ready: + stats.append(author_template_url.substitute(html_ready)) + else: + stats.append(author_template.substitute(html_ready)) + for stat in stat_templates: + try: + stats.append(stat.strict_substitute(html_ready)) + except KeyError: + pass + return ' | '.join(stats) + +def video_item_html(item, template): + html_ready = get_html_ready(item) + video_info = {} + for key in ('id', 'title', 'author'): + try: + video_info[key] = html_ready[key] + except KeyError: + video_info[key] = '' + try: + video_info['duration'] = html_ready['duration'] + except KeyError: + video_info['duration'] = 'Live' # livestreams don't have a duration + + html_ready['video_info'] = html.escape(json.dumps(video_info) ) + html_ready['url'] = URL_ORIGIN + "/watch?v=" + html_ready['id'] + html_ready['datetime'] = '' #TODO + + html_ready['stats'] = get_video_stats(html_ready) + + return template.substitute(html_ready) + + +def playlist_item_html(item, template): + html_ready = get_html_ready(item) + + html_ready['url'] = URL_ORIGIN + "/playlist?list=" + html_ready['id'] + html_ready['datetime'] = '' #TODO + return template.substitute(html_ready) + + + + + + +def make_query_string(query_string): + return '&'.join(key + '=' + ','.join(values) for key,values in query_string.items()) + +def update_query_string(query_string, items): + parameters = urllib.parse.parse_qs(query_string) + parameters.update(items) + return make_query_string(parameters) + +page_button_template = Template('''$page''') +current_page_button_template = Template('''
    $page
    ''') + +def page_buttons_html(current_page, estimated_pages, url, current_query_string): + if current_page <= 5: + page_start = 1 + page_end = min(9, estimated_pages) + else: + page_start = current_page - 4 + page_end = min(current_page + 4, estimated_pages) + + result = "" + for page in range(page_start, page_end+1): + if page == current_page: + template = current_page_button_template + else: + template = page_button_template + result += template.substitute(page=page, href = url + "?" + update_query_string(current_query_string, {'page': [str(page)]}) ) + return result + + + + + + + +showing_results_for = Template(''' +
    +
    Showing results for $corrected_query
    +
    Search instead for $original_query
    +
    +''') + +did_you_mean = Template(''' +
    +
    Did you mean $corrected_query
    +
    +''') + +def renderer_html(renderer, additional_info={}, current_query_string=''): + type = list(renderer.keys())[0] + renderer = renderer[type] + if type in ('videoRenderer', 'playlistRenderer', 'radioRenderer', 'compactVideoRenderer', 'compactPlaylistRenderer', 'compactRadioRenderer', 'gridVideoRenderer', 'gridPlaylistRenderer', 'gridRadioRenderer'): + info = renderer_info(renderer) + info.update(additional_info) + if type == 'compactVideoRenderer': + return video_item_html(info, small_video_item_template) + if type in ('compactPlaylistRenderer', 'compactRadioRenderer'): + return playlist_item_html(info, small_playlist_item_template) + if type in ('videoRenderer', 'gridVideoRenderer'): + return video_item_html(info, medium_video_item_template) + if type in ('playlistRenderer', 'gridPlaylistRenderer', 'radioRenderer', 'gridRadioRenderer'): + return playlist_item_html(info, medium_playlist_item_template) + + if type == 'channelRenderer': + info = renderer_info(renderer) + html_ready = get_html_ready(info) + html_ready['url'] = URL_ORIGIN + "/channel/" + html_ready['id'] + return medium_channel_item_template.substitute(html_ready) + + if type == 'movieRenderer': + return '' + print(renderer) + raise NotImplementedError('Unknown renderer type: ' + type) + + +'videoRenderer' +'playlistRenderer' +'channelRenderer' +'radioRenderer' +'gridVideoRenderer' +'gridPlaylistRenderer' + +'didYouMeanRenderer' +'showingResultsForRenderer' diff --git a/youtube/opensearch.xml b/youtube/opensearch.xml new file mode 100644 index 0000000..1764138 --- /dev/null +++ b/youtube/opensearch.xml @@ -0,0 +1,11 @@ + +Youtube local +no CIA shit in the background +UTF-8 +data:image/x-icon;base64,R0lGODlhEAAQAJECAP8AAAAAAP///wAAACH5BAEAAAIALAAAAAAQABAAAAIplI+py+0NogQuyBDEnEd2kHkfFWUamEzmpZSfmaIHPHrRguUm/fT+UwAAOw== + + + + +http://localhost/youtube.com/search + \ No newline at end of file diff --git a/youtube/playlist.py b/youtube/playlist.py new file mode 100644 index 0000000..fc09191 --- /dev/null +++ b/youtube/playlist.py @@ -0,0 +1,243 @@ +import base64 +import youtube.common as common +import urllib +import json +from string import Template +import youtube.proto as proto +import gevent +import math + +with open("yt_playlist_template.html", "r") as file: + yt_playlist_template = Template(file.read()) + + + + + + +def youtube_obfuscated_endian(offset): + if offset < 128: + return bytes((offset,)) + first_byte = 255 & offset + second_byte = 255 & (offset >> 7) + second_byte = second_byte | 1 + + # The next 2 bytes encode the offset in little endian order, + # BUT, it's done in a strange way. The least significant bit (LSB) of the second byte is not part + # of the offset. Instead, to get the number which the two bytes encode, that LSB + # of the second byte is combined with the most significant bit (MSB) of the first byte + # in a logical AND. Replace the two bits with the result of the AND to get the two little endian + # bytes that represent the offset. + + return bytes((first_byte, second_byte)) + + + +# just some garbage that's required, don't know what it means, if it means anything. 
+ctoken_header = b'\xe2\xa9\x85\xb2\x02' # e2 a9 85 b2 02 -- the protobuf tag for field 80226972 with wire type 2 (length-delimited), i.e. the same "pointless nest" field that playlist_ctoken builds below with proto.string(80226972, ...)
+
+def byte(x):
+ return bytes((x,))
+
+# TL;DR: the offset is hidden inside 3 nested base 64 encodes with random junk data added on the side periodically
+def create_ctoken(playlist_id, offset):
+ obfuscated_offset = b'\x08' + youtube_obfuscated_endian(offset) # 0x08 is the protobuf tag for field 1 with wire type 0 (varint) -- cf. proto.uint(1, offset) in playlist_ctoken below
+ obfuscated_offset = b'PT:' + base64.urlsafe_b64encode(obfuscated_offset).replace(b'=', b'')
+ obfuscated_offset = b'z' + byte(len(obfuscated_offset)) + obfuscated_offset
+ obfuscated_offset = base64.urlsafe_b64encode(obfuscated_offset).replace(b'=', b'%3D')
+
+ playlist_bytes = b'VL' + bytes(playlist_id, 'ascii')
+ main_info = b'\x12' + byte(len(playlist_bytes)) + playlist_bytes + b'\x1a' + byte(len(obfuscated_offset)) + obfuscated_offset
+
+ ctoken = base64.urlsafe_b64encode(ctoken_header + byte(len(main_info)) + main_info)
+
+ return ctoken.decode('ascii')
+
+def playlist_ctoken(playlist_id, offset):
+
+ offset = proto.uint(1, offset)
+ # this is just obfuscation as far as I can tell. It doesn't even follow protobuf
+ offset = b'PT:' + proto.unpadded_b64encode(offset)
+ offset = proto.string(15, offset)
+
+ continuation_info = proto.string( 3, proto.percent_b64encode(offset) )
+
+ playlist_id = proto.string(2, 'VL' + playlist_id )
+ pointless_nest = proto.string(80226972, playlist_id + continuation_info)
+
+ return base64.urlsafe_b64encode(pointless_nest).decode('ascii')
+
+# initial request types:
+# polymer_json: https://m.youtube.com/playlist?list=PLv3TTBr1W_9tppikBxAE_G6qjWdBljBHJ&pbj=1&lact=0
+# ajax json: https://m.youtube.com/playlist?list=PLv3TTBr1W_9tppikBxAE_G6qjWdBljBHJ&pbj=1&lact=0 with header X-YouTube-Client-Version: 1.20180418
+
+
+# continuation request types:
+# polymer_json: https://m.youtube.com/playlist?&ctoken=[...]&pbj=1
+# ajax json: https://m.youtube.com/playlist?action_continuation=1&ajax=1&ctoken=[...]
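+#
+# For illustration (a sketch consistent with get_videos_ajax below, not a
+# guaranteed URL format): the ajax json request for page 2 of a playlist
+# (videos 21-40, i.e. offset 20) would be built like so:
+#
+#   ctoken = playlist_ctoken('PLv3TTBr1W_9tppikBxAE_G6qjWdBljBHJ', 20)
+#   url = ('https://m.youtube.com/playlist?action_continuation=1&ajax=1'
+#          '&ctoken=' + ctoken)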
+ + +headers_1 = ( + ('Accept', '*/*'), + ('Accept-Language', 'en-US,en;q=0.5'), + ('X-YouTube-Client-Name', '1'), + ('X-YouTube-Client-Version', '2.20180614'), +) + +def playlist_first_page(playlist_id): + url = 'https://m.youtube.com/playlist?list=' + playlist_id + '&ajax=1&disable_polymer=true' + content = common.fetch_url(url, common.mobile_ua + headers_1) + if content[0:4] == b")]}'": + content = content[4:] + content = json.loads(common.uppercase_escape(content.decode('utf-8'))) + return content + +ajax_info_dispatch = { + 'view_count_text': ('views', common.get_text), + 'num_videos_text': ('size', lambda node: common.get_text(node).split(' ')[0]), + 'thumbnail': ('thumbnail', lambda node: node.url), + 'title': ('title', common.get_text), + 'owner_text': ('author', common.get_text), + 'owner_endpoint': ('author_url', lambda node: node.url), + 'description': ('description', common.get_formatted_text), + +} +def metadata_info(ajax_json): + info = {} + try: + for key, node in ajax_json.items(): + try: + simple_key, function = dispatch[key] + except KeyError: + continue + info[simple_key] = function(node) + return info + except (KeyError,IndexError): + print(ajax_json) + raise + + + + +#https://m.youtube.com/playlist?itct=CBMQybcCIhMIptj9xJaJ2wIV2JKcCh3Idwu-&ctoken=4qmFsgI2EiRWTFBMT3kwajlBdmxWWlB0bzZJa2pLZnB1MFNjeC0tN1BHVEMaDmVnWlFWRHBEUWxFJTNE&pbj=1 +def get_videos_ajax(playlist_id, page): + + url = "https://m.youtube.com/playlist?action_continuation=1&ajax=1&ctoken=" + playlist_ctoken(playlist_id, (int(page)-1)*20) + headers = { + 'User-Agent': ' Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1', + 'Accept': '*/*', + 'Accept-Language': 'en-US,en;q=0.5', + 'X-YouTube-Client-Name': '2', + 'X-YouTube-Client-Version': '1.20180508', + } + print("Sending playlist ajax request") + content = common.fetch_url(url, headers) + with open('playlist_debug', 'wb') as f: + f.write(content) + content = content[4:] + print("Finished recieving playlist response") + + info = json.loads(common.uppercase_escape(content.decode('utf-8'))) + return info + +def get_playlist_videos(ajax_json): + videos = [] + #info = get_bloated_playlist_videos(playlist_id, page) + #print(info) + video_list = ajax_json['content']['continuation_contents']['contents'] + + + for video_json_crap in video_list: + try: + videos.append({ + "title": video_json_crap["title"]['runs'][0]['text'], + "id": video_json_crap["video_id"], + "views": "", + "duration": common.default_multi_get(video_json_crap, 'length', 'runs', 0, 'text', default=''), # livestreams dont have a length + "author": video_json_crap['short_byline']['runs'][0]['text'], + "author_url": '', + "published": '', + 'playlist_index': '', + + }) + except (KeyError, IndexError): + print(video_json_crap) + raise + return videos + +def get_playlist_videos_format2(playlist_id, page): + videos = [] + info = get_bloated_playlist_videos(playlist_id, page) + video_list = info['response']['continuationContents']['playlistVideoListContinuation']['contents'] + + for video_json_crap in video_list: + + video_json_crap = video_json_crap['videoRenderer'] + + try: + videos.append({ + "title": video_json_crap["title"]['runs'][0]['text'], + "video_id": video_json_crap["videoId"], + "views": "", + "duration": common.default_multi_get(video_json_crap, 'lengthText', 'runs', 0, 'text', default=''), # livestreams dont have a length + "uploader": video_json_crap['shortBylineText']['runs'][0]['text'], + 
"uploader_url": common.ORIGIN_URL + video_json_crap['shortBylineText']['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'], + "published": common.default_multi_get(video_json_crap, 'publishedTimeText', 'simpleText', default=''), + 'playlist_index': video_json_crap['index']['runs'][0]['text'], + + }) + except (KeyError, IndexError): + print(video_json_crap) + raise + return videos + + +def playlist_videos_html(ajax_json): + result = '' + for info in get_playlist_videos(ajax_json): + result += common.small_video_item_html(info) + return result + +playlist_stat_template = Template(''' +
    $stat
    ''') +def get_playlist_page(query_string): + parameters = urllib.parse.parse_qs(query_string) + playlist_id = parameters['list'][0] + page = parameters.get("page", "1")[0] + if page == "1": + first_page_json = playlist_first_page(playlist_id) + this_page_json = first_page_json + else: + tasks = ( + gevent.spawn(playlist_first_page, playlist_id ), + gevent.spawn(get_videos_ajax, playlist_id, page) + ) + gevent.joinall(tasks) + first_page_json, this_page_json = tasks[0].value, tasks[1].value + + try: + video_list = this_page_json['content']['section_list']['contents'][0]['contents'][0]['contents'] + except KeyError: + video_list = this_page_json['content']['continuation_contents']['contents'] + videos_html = '' + for video_json in video_list: + info = common.ajax_info(video_json) + videos_html += common.video_item_html(info, common.small_video_item_template) + + + metadata = common.ajax_info(first_page_json['content']['playlist_header']) + video_count = int(metadata['size'].replace(',', '')) + page_buttons = common.page_buttons_html(int(page), math.ceil(video_count/20), common.URL_ORIGIN + "/playlist", query_string) + + html_ready = common.get_html_ready(metadata) + html_ready['page_title'] = html_ready['title'] + ' - Page ' + str(page) + + stats = '' + stats += playlist_stat_template.substitute(stat=html_ready['size'] + ' videos') + stats += playlist_stat_template.substitute(stat=html_ready['views']) + return yt_playlist_template.substitute( + videos = videos_html, + page_buttons = page_buttons, + stats = stats, + **html_ready + ) \ No newline at end of file diff --git a/youtube/proto.py b/youtube/proto.py new file mode 100644 index 0000000..9f9dbcc --- /dev/null +++ b/youtube/proto.py @@ -0,0 +1,65 @@ +from math import ceil +import base64 + +def byte(n): + return bytes((n,)) + + +def varint_encode(offset): + '''In this encoding system, for each 8-bit byte, the first bit is 1 if there are more bytes, and 0 is this is the last one. + The next 7 bits are data. These 7-bit sections represent the data in Little endian order. For example, suppose the data is + aaaaaaabbbbbbbccccccc (each of these sections is 7 bits). It will be encoded as: + 1ccccccc 1bbbbbbb 0aaaaaaa + + This encoding is used in youtube parameters to encode offsets and to encode the length for length-prefixed data. + See https://developers.google.com/protocol-buffers/docs/encoding#varints for more info.''' + needed_bytes = ceil(offset.bit_length()/7) or 1 # (0).bit_length() returns 0, but we need 1 in that case. 
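+ # A worked example (sketch): varint_encode(300) == b'\xac\x02', since
+ # 300 = 0b10_0101100 splits into 7-bit groups, least significant first:
+ # 0101100 then 0000010, giving bytes 0b1_0101100 (0xac, continuation bit
+ # set) and 0b0_0000010 (0x02, last byte).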
+ encoded_bytes = bytearray(needed_bytes) + for i in range(0, needed_bytes - 1): + encoded_bytes[i] = (offset & 127) | 128 # 7 least significant bits + offset = offset >> 7 + encoded_bytes[-1] = offset & 127 # leave first bit as zero for last byte + + return bytes(encoded_bytes) + + +def varint_decode(encoded): + decoded = 0 + for i, byte in enumerate(encoded): + decoded |= (byte & 127) << 7*i + + if not (byte & 128): + break + return decoded + + +def string(field_number, data): + data = as_bytes(data) + return _proto_field(2, field_number, varint_encode(len(data)) + data) +nested = string + +def uint(field_number, value): + return _proto_field(0, field_number, varint_encode(value)) + + + + +def _proto_field(wire_type, field_number, data): + ''' See https://developers.google.com/protocol-buffers/docs/encoding#structure ''' + return varint_encode( (field_number << 3) | wire_type) + data + + + +def percent_b64encode(data): + return base64.urlsafe_b64encode(data).replace(b'=', b'%3D') + + +def unpadded_b64encode(data): + return base64.urlsafe_b64encode(data).replace(b'=', b'') + +def as_bytes(value): + if isinstance(value, str): + return value.encode('ascii') + return value + + \ No newline at end of file diff --git a/youtube/search.py b/youtube/search.py new file mode 100644 index 0000000..5268dbe --- /dev/null +++ b/youtube/search.py @@ -0,0 +1,231 @@ +import json +import urllib +import html +from string import Template +import base64 +from math import ceil +from youtube.common import default_multi_get, get_thumbnail_url, URL_ORIGIN +import youtube.common as common + +with open("yt_search_results_template.html", "r") as file: + yt_search_results_template = file.read() + +with open("yt_search_template.html", "r") as file: + yt_search_template = file.read() + +page_button_template = Template('''$page''') +current_page_button_template = Template('''
    $page
    ''') +video_result_template = ''' +
    + + + $length + + + $video_title + +
    Uploaded by $uploader
    + $views + + + + + $description +
    +''' + + + +# Sort: 1 + # Upload date: 2 + # View count: 3 + # Rating: 1 +# Offset: 9 +# Filters: 2 + # Upload date: 1 + # Type: 2 + # Duration: 3 + + +features = { + '4k': 14, + 'hd': 4, + 'hdr': 25, + 'subtitles': 5, + 'creative_commons': 6, + '3d': 7, + 'live': 8, + 'purchased': 9, + '360': 15, + 'location': 23, +} + +def page_number_to_sp_parameter(page): + offset = (int(page) - 1)*20 # 20 results per page + first_byte = 255 & offset + second_byte = 255 & (offset >> 7) + second_byte = second_byte | 1 + + # 0b01001000 is required, and is always the same. + # The next 2 bytes encode the offset in little endian order, + # BUT, it's done in a strange way. The least significant bit (LSB) of the second byte is not part + # of the offset. Instead, to get the number which the two bytes encode, that LSB + # of the second byte is combined with the most significant bit (MSB) of the first byte + # in a logical AND. Replace the two bits with the result of the AND to get the two little endian + # bytes that represent the offset. + # I figured this out by trial and error on the sp parameter. I don't know why it's done like this; + # perhaps it's just obfuscation. + param_bytes = bytes((0b01001000, first_byte, second_byte)) + param_encoded = urllib.parse.quote(base64.urlsafe_b64encode(param_bytes)) + return param_encoded + +def get_search_json(query, page): + url = "https://www.youtube.com/results?search_query=" + urllib.parse.quote_plus(query) + headers = { + 'Host': 'www.youtube.com', + 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)', + 'Accept': '*/*', + 'Accept-Language': 'en-US,en;q=0.5', + 'X-YouTube-Client-Name': '1', + 'X-YouTube-Client-Version': '2.20180418', + } + url += "&pbj=1&sp=" + page_number_to_sp_parameter(page) + content = common.fetch_url(url, headers=headers) + info = json.loads(content) + return info + +"""def get_search_info(query, page): + result_info = dict() + info = get_bloated_search_info(query, page) + + estimated_results = int(info[1]['response']['estimatedResults']) + estimated_pages = ceil(estimated_results/20) + result_info['estimated_results'] = estimated_results + result_info['estimated_pages'] = estimated_pages + + result_info['results'] = [] + # this is what you get when you hire H-1B's + video_list = info[1]['response']['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'] + + + for video_json_crap in video_list: + # they have a dictionary whose only content is another dictionary... 
+ try: + type = list(video_json_crap.keys())[0] + except KeyError: + continue #channelRenderer or playlistRenderer + '''description = "" + for text_run in video_json_crap["descriptionSnippet"]["runs"]: + if text_run.get("bold", False): + description += "" + html.escape''' + try: + result_info['results'].append({ + "title": video_json_crap["title"]["simpleText"], + "video_id": video_json_crap["videoId"], + "description": video_json_crap.get("descriptionSnippet",dict()).get('runs',[]), # a list of text runs (formmated), rather than plain text + "thumbnail": get_thumbnail_url(video_json_crap["videoId"]), + "views_text": video_json_crap['viewCountText'].get('simpleText', None) or video_json_crap['viewCountText']['runs'][0]['text'], + "length_text": default_multi_get(video_json_crap, 'lengthText', 'simpleText', default=''), # livestreams dont have a length + "uploader": video_json_crap['longBylineText']['runs'][0]['text'], + "uploader_url": URL_ORIGIN + video_json_crap['longBylineText']['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'], + "published_time_text": default_multi_get(video_json_crap, 'publishedTimeText', 'simpleText', default=''), + + }) + except KeyError: + print(video_json_crap) + raise + return result_info""" + + +def page_buttons_html(page_start, page_end, current_page, query): + result = "" + for page in range(page_start, page_end+1): + if page == current_page: + template = current_page_button_template + else: + template = page_button_template + result += template.substitute(page=page, href=URL_ORIGIN + "/search?query=" + urllib.parse.quote_plus(query) + "&page=" + str(page)) + return result + +showing_results_for = Template(''' +
    Showing results for $corrected_query
    +
    Search instead for $original_query
    +''') +did_you_mean = Template(''' +
    Did you mean $corrected_query
    +''') +def get_search_page(query_string, parameters=()): + qs_query = urllib.parse.parse_qs(query_string) + if len(qs_query) == 0: + return yt_search_template + query = qs_query["query"][0] + page = qs_query.get("page", "1")[0] + + info = get_search_json(query, page) + + estimated_results = int(info[1]['response']['estimatedResults']) + estimated_pages = ceil(estimated_results/20) + results = info[1]['response']['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'] + + corrections = '' + result_list_html = "" + for renderer in results: + type = list(renderer.keys())[0] + if type == 'shelfRenderer': + continue + if type == 'didYouMeanRenderer': + renderer = renderer[type] + corrected_query_string = urllib.parse.parse_qs(query_string) + corrected_query_string['query'] = [renderer['correctedQueryEndpoint']['searchEndpoint']['query']] + corrected_query_url = URL_ORIGIN + '/search?' + common.make_query_string(corrected_query_string) + corrections = did_you_mean.substitute( + corrected_query_url = corrected_query_url, + corrected_query = common.format_text_runs(renderer['correctedQuery']['runs']), + ) + continue + if type == 'showingResultsForRenderer': + renderer = renderer[type] + no_autocorrect_query_string = urllib.parse.parse_qs(query_string) + no_autocorrect_query_string['autocorrect'] = ['0'] + no_autocorrect_query_url = URL_ORIGIN + '/search?' + common.make_query_string(no_autocorrect_query_string) + corrections = showing_results_for.substitute( + corrected_query = common.format_text_runs(renderer['correctedQuery']['runs']), + original_query_url = no_autocorrect_query_url, + original_query = html.escape(renderer['originalQuery']['simpleText']), + ) + continue + result_list_html += common.renderer_html(renderer, current_query_string=query_string) + '''type = list(result.keys())[0] + result = result[type] + if type == "showingResultsForRenderer": + url = URL_ORIGIN + "/search" + if len(parameters) > 0: + url += ';' + ';'.join(parameters) + url += '?' + '&'.join(key + '=' + ','.join(values) for key,values in qs_query.items()) + + result_list_html += showing_results_for_template.substitute( + corrected_query=common.format_text_runs(result['correctedQuery']['runs']), + + ) + else: + result_list_html += common.html_functions[type](result)''' + + page = int(page) + if page <= 5: + page_start = 1 + page_end = min(9, estimated_pages) + else: + page_start = page - 4 + page_end = min(page + 4, estimated_pages) + + + result = Template(yt_search_results_template).substitute( + results = result_list_html, + page_title = query + " - Search", + search_box_value = html.escape(query), + number_of_results = '{:,}'.format(estimated_results), + number_of_pages = '{:,}'.format(estimated_pages), + page_buttons = page_buttons_html(page_start, page_end, page, query), + corrections = corrections + ) + return result \ No newline at end of file diff --git a/youtube/shared.css b/youtube/shared.css new file mode 100644 index 0000000..39e76f4 --- /dev/null +++ b/youtube/shared.css @@ -0,0 +1,271 @@ +h1, h2, h3, h4, h5, h6, div{ + margin:0; + padding:0; + +} + + +body{ + margin:0; + padding: 0; + color:#222; + + + background-color:#cccccc; + + min-height:100vh; + + display:grid; + grid-template-rows: 50px 1fr; +} + + header{ + background-color:#333333; + + grid-row: 1; + } + + main{ + grid-row: 2; + } + +button{ + padding:0; /* Fuck browser-specific styling. 
Fix your shit mozilla */ +} +address{ + font-style:normal; +} +#site-search{ + display: grid; + grid-template-columns: 1fr 0fr; + +} + + #site-search .search-box{ + align-self:center; + height:25px; + border:0; + + grid-column: 1; + } + #site-search .search-button{ + grid-column: 2; + align-self:center; + height:25px; + + border-style:solid; + border-width:1px; + } + + +.full-item{ + display: grid; + grid-template-rows: 0fr 0fr 0fr 0fr 0fr; + grid-template-columns: 1fr 1fr; + +} + .full-item video{ + grid-column: 1 / span 2; + grid-row: 1; + } + .full-item .title{ + grid-column: 1 / span 2; + grid-row:2; + min-width: 0; + } + .full-item address{ + grid-column: 1; + grid-row: 3; + justify-self: start; + } + .full-item .views{ + grid-column: 2; + grid-row: 3; + justify-self:end; + } + .full-item time{ + grid-column: 1; + grid-row: 4; + justify-self:start; + } + .full-item .likes-dislikes{ + grid-column: 2; + grid-row: 4; + justify-self:end; + } + .full-item .description{ + background-color:#d0d0d0; + margin-top:8px; + white-space: pre-line; + min-width: 0; + + grid-column: 1 / span 2; + grid-row: 5; + } + +.medium-item{ + background-color:#bcbcbc; + display: grid; + align-content: start; + grid-template-columns: 246px 1fr 0fr; + grid-template-rows: 0fr 0fr 0fr 0fr 0fr 1fr; +} + .medium-item .title{ + grid-column:2 / span 2; + grid-row:1; + min-width: 0; + } + .medium-item address{ + display:inline; + } + /*.medium-item .views{ + grid-column: 3; + grid-row: 2; + justify-self:end; + } + .medium-item time{ + grid-column: 2; + grid-row: 3; + justify-self:start; + }*/ + .medium-item .stats{ + grid-column: 2 / span 2; + grid-row: 2; + } + + .medium-item .description{ + grid-column: 2 / span 2; + grid-row: 4; + } + .medium-item .badges{ + grid-column: 2 / span 2; + grid-row: 5; + } + /* thumbnail size */ + .medium-item img{ + /*height:138px; + width:246px;*/ + height:100%; + justify-self:center; + } + +.small-item-box{ + color: #767676; + font-size: 12px; + + display:grid; + grid-template-columns: 1fr 0fr; + grid-template-rows: 94px; +} + +.small-item{ + background-color:#bcbcbc; + align-content: start; + text-decoration:none; + + display: grid; + grid-template-columns: 168px 1fr; + grid-column-gap: 5px; + grid-template-rows: 0fr 0fr 0fr 1fr; +} + .small-item .title{ + grid-column:2; + grid-row:1; + margin:0; + + color: #333; + font-size: 16px; + font-weight: 500; + text-decoration:initial; + min-width: 0; + } + .small-item address{ + grid-column: 2; + grid-row: 2; + justify-self: start; + } + + .small-item .views{ + grid-column: 2; + grid-row: 3; + justify-self:start; + } + /* thumbnail size */ + .small-item img{ + /*height:94px; + width:168px;*/ + height:100%; + justify-self:center; + } + +.item-checkbox{ + justify-self:start; + align-self:center; + height:30px; + width:30px; + + grid-column: 2; +} + +/* ---Thumbnails for videos---- */ +.video-thumbnail-box{ + grid-column:1; + grid-row:1 / span 6; + + display:grid; + grid-template-columns: 1fr 0fr; +} + .video-thumbnail-img{ + grid-column:1 / span 2; + grid-row:1; + } + .video-duration{ + grid-column: 2; + grid-row: 1; + align-self: end; + opacity: .8; + color: #ffffff; + font-size: 12px; + background-color: #000000; + } + +/* ---Thumbnails for playlists---- */ +.playlist-thumbnail-box{ + grid-column:1; + grid-row:1 / span 5; + + display:grid; + grid-template-columns: 3fr 2fr; +} + .playlist-thumbnail-img{ + grid-column:1 / span 2; + grid-row:1; + } + .playlist-thumbnail-info{ + grid-column:2; + grid-row:1; + + display: grid; + 
align-items:center;
+
+        text-align:center;
+        white-space: pre-line;
+        opacity: .8;
+        color: #cfcfcf;
+        background-color: #000000;
+    }
+
+.page-button-row{
+    justify-self:center;
+    display: grid;
+    grid-auto-columns: 40px;
+    grid-auto-flow: column;
+    height: 40px;
+}
+    .page-button{
+        background-color: #e9e9e9;
+        border-style: outset;
+        border-width: 2px;
+        font-weight: bold;
+        text-align: center;
+    }
\ No newline at end of file
diff --git a/youtube/subscriptions.py b/youtube/subscriptions.py
new file mode 100644
index 0000000..5edf6fc
--- /dev/null
+++ b/youtube/subscriptions.py
@@ -0,0 +1,18 @@
+import urllib
+
+with open("subscriptions.txt", 'r', encoding='utf-8') as file:
+    subscriptions = file.read()
+
+# Line format: "channel_id channel_name"
+# Example:
+# UCYO_jab_esuFRV4b17AJtAw 3Blue1Brown
+
+subscriptions = ((line[0:24], line[25:]) for line in subscriptions.splitlines())
+
+def get_new_videos():
+    # TODO: fetch the latest uploads from each subscribed channel
+    for channel_id, channel_name in subscriptions:
+        pass
+
+
+def get_subscriptions_page():
+    # TODO: render the subscriptions page
+    pass
diff --git a/youtube/template.py b/youtube/template.py
new file mode 100644
index 0000000..7f13415
--- /dev/null
+++ b/youtube/template.py
@@ -0,0 +1,132 @@
+
+import re as _re
+from collections import ChainMap as _ChainMap
+
+class _TemplateMetaclass(type):
+    pattern = r"""
+    %(delim)s(?:
+      (?P<escaped>%(delim)s) |   # Escape sequence of two delimiters
+      (?P<named>%(id)s)      |   # delimiter and a Python identifier
+      {(?P<braced>%(id)s)}   |   # delimiter and a braced identifier
+      (?P<invalid>)              # Other ill-formed delimiter exprs
+    )
+    """
+
+    def __init__(cls, name, bases, dct):
+        super(_TemplateMetaclass, cls).__init__(name, bases, dct)
+        if 'pattern' in dct:
+            pattern = cls.pattern
+        else:
+            pattern = _TemplateMetaclass.pattern % {
+                'delim' : _re.escape(cls.delimiter),
+                'id' : cls.idpattern,
+                }
+        cls.pattern = _re.compile(pattern, cls.flags | _re.VERBOSE)
+
+
+class Template(metaclass=_TemplateMetaclass):
+    """A string class for supporting $-substitutions."""
+
+    delimiter = '$'
+    idpattern = r'[_a-z][_a-z0-9]*'
+    flags = _re.IGNORECASE
+
+    def __init__(self, template):
+        self.template = template
+
+    # Search for $$, $identifier, ${identifier}, and any bare $'s
+
+    def _invalid(self, mo):
+        i = mo.start('invalid')
+        lines = self.template[:i].splitlines(keepends=True)
+        if not lines:
+            colno = 1
+            lineno = 1
+        else:
+            colno = i - len(''.join(lines[:-1]))
+            lineno = len(lines)
+        raise ValueError('Invalid placeholder in string: line %d, col %d' %
+                         (lineno, colno))
+
+    def substitute(*args, **kws):
+        if not args:
+            raise TypeError("descriptor 'substitute' of 'Template' object "
+                            "needs an argument")
+        self, *args = args  # allow the "self" keyword to be passed
+        if len(args) > 1:
+            raise TypeError('Too many positional arguments')
+        if not args:
+            mapping = kws
+        elif kws:
+            mapping = _ChainMap(kws, args[0])
+        else:
+            mapping = args[0]
+        # Helper function for .sub()
+        def convert(mo):
+            # Check the most common path first.
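+            # (modified from the stdlib string.Template: missing keys render as
+            # empty strings here instead of raising KeyError; strict_substitute
+            # below keeps the stdlib's raising behavior)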
+            named = mo.group('named') or mo.group('braced')
+            if named is not None:
+                return str(mapping.get(named, ''))
+            if mo.group('escaped') is not None:
+                return self.delimiter
+            if mo.group('invalid') is not None:
+                self._invalid(mo)
+            raise ValueError('Unrecognized named group in pattern',
+                             self.pattern)
+        return self.pattern.sub(convert, self.template)
+
+    def strict_substitute(*args, **kws):
+        if not args:
+            raise TypeError("descriptor 'strict_substitute' of 'Template' object "
+                            "needs an argument")
+        self, *args = args  # allow the "self" keyword to be passed
+        if len(args) > 1:
+            raise TypeError('Too many positional arguments')
+        if not args:
+            mapping = kws
+        elif kws:
+            mapping = _ChainMap(kws, args[0])
+        else:
+            mapping = args[0]
+        # Helper function for .sub()
+        def convert(mo):
+            # Check the most common path first.
+            named = mo.group('named') or mo.group('braced')
+            if named is not None:
+                return str(mapping[named])
+            if mo.group('escaped') is not None:
+                return self.delimiter
+            if mo.group('invalid') is not None:
+                self._invalid(mo)
+            raise ValueError('Unrecognized named group in pattern',
+                             self.pattern)
+        return self.pattern.sub(convert, self.template)
+
+    def safe_substitute(*args, **kws):
+        if not args:
+            raise TypeError("descriptor 'safe_substitute' of 'Template' object "
+                            "needs an argument")
+        self, *args = args  # allow the "self" keyword to be passed
+        if len(args) > 1:
+            raise TypeError('Too many positional arguments')
+        if not args:
+            mapping = kws
+        elif kws:
+            mapping = _ChainMap(kws, args[0])
+        else:
+            mapping = args[0]
+        # Helper function for .sub()
+        def convert(mo):
+            named = mo.group('named') or mo.group('braced')
+            if named is not None:
+                try:
+                    return str(mapping[named])
+                except KeyError:
+                    return mo.group()
+            if mo.group('escaped') is not None:
+                return self.delimiter
+            if mo.group('invalid') is not None:
+                return mo.group()
+            raise ValueError('Unrecognized named group in pattern',
+                             self.pattern)
+        return self.pattern.sub(convert, self.template)
\ No newline at end of file
diff --git a/youtube/watch.py b/youtube/watch.py
new file mode 100644
index 0000000..b8aa17d
--- /dev/null
+++ b/youtube/watch.py
@@ -0,0 +1,294 @@
+from youtube_dl.YoutubeDL import YoutubeDL
+import json
+import urllib.parse
+import urllib.request
+from string import Template
+import html
+import youtube.common as common
+from youtube.common import default_multi_get, get_thumbnail_url, video_id, URL_ORIGIN
+import youtube.comments as comments
+import gevent
+
+video_height_priority = (360, 480, 240, 720, 1080)
+
+
+_formats = {
+    '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
+    '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
+    '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
+    '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
+    '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
+    '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
+    '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
+    '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
+    # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
+    '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
+    '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
+    '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
+    '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
+    '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
+    '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
+    '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
+    '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
+    '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
+
+
+    # 3D videos
+    '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
+    '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
+    '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
+    '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
+    '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
+    '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
+    '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
+
+    # Apple HTTP Live Streaming
+    '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
+    '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
+    '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
+    '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
+    '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
+    '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
+    '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
+    '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
+
+    # DASH mp4 video
+    '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
+    '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
+    '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
+    '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
+    '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
+    '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'},  # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
+    '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
+    '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
+    '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
+    '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
+    '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
+    '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
+
+    # Dash mp4 audio
+    '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
+    '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
+    '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
+    '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
+    '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
+    '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
+    '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
+
+    # Dash webm
+    '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+    '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+    '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+    '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+    '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+    '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+    '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
+    '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+    '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+    '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+    '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+    '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+    '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+    '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+    '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+    # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
+    '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+    '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
+    '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
+    '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
+    '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+    '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
+
+    # Dash webm audio
+    '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
+    '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
+
+    # Dash webm audio with opus inside
+    '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
+    '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
+    '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
+
+    # RTMP (unnamed)
+    '_rtmp': {'protocol': 'rtmp'},
+}
+
+
+
+
+source_tag_template = Template('''<source src="$src" type="$type">''')
+
+with open("yt_watch_template.html", "r") as file:
+    yt_watch_template = Template(file.read())
+
+
+
+# example:
+#https://www.youtube.com/related_ajax?ctoken=CBQSJhILVGNxV29rOEF1YkXAAQDIAQDgAQGiAg0o____________AUAAGAAq0gEInJOqsOyB1tAaCNeMgaD4spLIKQioxdHSu8SF9JgBCLr27tnaioDpXwj1-L_R3s7r2wcIv8TnueeUo908CMXSganIrvHDJgiVuMirrqbgqYABCJDsu8PBzdGW8wEI_-WI2t-c-IlQCOK_m_KB_rP5wAEIl7S4serqnq5YCNSs55mMt8qLyQEImvutmp-x9LaCAQiVg96VpY_pqJMBCOPsgdTflsGRsQEI7ZfYleKIub0tCIrcsb7a_uu95gEIi9Gz6_bC76zEAQjo1c_W8JzlkhI%3D&continuation=CBQSJhILVGNxV29rOEF1YkXAAQDIAQDgAQGiAg0o____________AUAAGAAq0gEInJOqsOyB1tAaCNeMgaD4spLIKQioxdHSu8SF9JgBCLr27tnaioDpXwj1-L_R3s7r2wcIv8TnueeUo908CMXSganIrvHDJgiVuMirrqbgqYABCJDsu8PBzdGW8wEI_-WI2t-c-IlQCOK_m_KB_rP5wAEIl7S4serqnq5YCNSs55mMt8qLyQEImvutmp-x9LaCAQiVg96VpY_pqJMBCOPsgdTflsGRsQEI7ZfYleKIub0tCIrcsb7a_uu95gEIi9Gz6_bC76zEAQjo1c_W8JzlkhI%3D&itct=CCkQybcCIhMIg8PShInX2gIVgdvBCh15WA0ZKPgd
+def get_bloated_more_related_videos(video_url, related_videos_token, id_token):
+    related_videos_token = urllib.parse.quote(related_videos_token)
+    url = "https://www.youtube.com/related_ajax?ctoken=" + related_videos_token + "&continuation=" + related_videos_token
+    headers = {
+        'Host': 'www.youtube.com',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)',
+        'Accept': '*/*',
+        'Accept-Language': 'en-US,en;q=0.5',
+        'Referer': video_url,
+        'X-YouTube-Client-Name': '1',
+        'X-YouTube-Client-Version': '2.20180418',
+        'X-Youtube-Identity-Token': id_token,
+
+    }
+    #print(url)
+    req = urllib.request.Request(url, headers=headers)
+    response = urllib.request.urlopen(req, timeout = 5)
+    content = response.read()
+    info = json.loads(content)
+    return info
+
+def get_more_related_videos_info(video_url, related_videos_token, id_token):
+    results = []
+    info = get_bloated_more_related_videos(video_url, related_videos_token, id_token)
+    bloated_results = info[1]['response']['continuationContents']['watchNextSecondaryResultsContinuation']['results']
+    for bloated_result in bloated_results:
+        bloated_result = bloated_result['compactVideoRenderer']
+        results.append({
+            "title": bloated_result['title']['simpleText'],
+            "video_id": bloated_result['videoId'],
+            "views_text": bloated_result['viewCountText']['simpleText'],
+            "length_text": default_multi_get(bloated_result, 'lengthText', 'simpleText', default=''),  # livestreams don't have a length
+            "uploader_name": bloated_result['longBylineText']['runs'][0]['text'],
+            "uploader_url": bloated_result['longBylineText']['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'],
+        })
+    return results

+def more_related_videos_html(video_url, video_info):
+    related_videos = get_more_related_videos_info(video_url, video_info['related_videos_token'], video_info['id_token'])
+
+    related_videos_html = ""
+    for video in related_videos:
+        related_videos_html += Template(video_related_template).substitute(
+            video_title=html.escape(video["title"]),
+            views=video["views_text"],
+            uploader=html.escape(video["uploader_name"]),
+            uploader_channel_url=video["uploader_url"],
+            length=video["length_text"],
+            video_url = "/youtube.com/watch?v=" + video["video_id"],
+            thumbnail_url= get_thumbnail_url(video['video_id']),
+        )
+    return related_videos_html
+
+
+
+def get_related_items_html(info):
+    result = ""
+    for item in info['related_vids']:
+        if 'list' in item:  # playlist:
+            result += common.small_playlist_item_html(watch_page_related_playlist_info(item))
+        else:
+            result += common.small_video_item_html(watch_page_related_video_info(item))
+    return result
+
+
+# json of related items retrieved directly from the watch page has different names for everything
+# converts these to standard names
+def watch_page_related_video_info(item):
+    result = {key: item[key] for key in ('id', 'title', 'author')}
+    result['duration'] = common.seconds_to_timestamp(item['length_seconds'])
+    try:
+        result['views'] = item['short_view_count_text']
+    except KeyError:
+        result['views'] = ''
+    return result
+
+def watch_page_related_playlist_info(item):
+    return {
+        'size': item['playlist_length'] if item['playlist_length'] != "0" else "50+",
+        'title': item['playlist_title'],
+        'id': item['list'],
+        'first_video_id': item['video_id'],
+    }
+
+
+def sort_formats(info):
+    info['formats'].sort(key=lambda x: default_multi_get(_formats, x['format_id'], 'height', default=0))
+    # rotate the list so the first format with height >= 360 leads, then drop audio-only and video-only streams
+    for index, format in enumerate(info['formats']):
+        if default_multi_get(_formats, format['format_id'], 'height', default=0) >= 360:
+            break
+    info['formats'] = info['formats'][index:] + info['formats'][0:index]
+    info['formats'] = [format for format in info['formats'] if format['acodec'] != 'none' and format['vcodec'] != 'none']
+
+def formats_html(info):
+    result = ''
+    for format in info['formats']:
+        result += source_tag_template.substitute(
+            src=format['url'],
+            type='audio/' + format['ext'] if format['vcodec'] == "none" else 'video/' + format['ext'],
+        )
+    return result
+
+def choose_format(info):
+    suitable_formats = []
+    with open('teste.txt', 'w', encoding='utf-8') as f:
+        f.write(json.dumps(info['formats']))
+    for format in info['formats']:
+        if (format["ext"] in ("mp4", "webm")
+                and format["acodec"] != "none"
+                and format["vcodec"] != "none"
+                and format.get("height", "none") in video_height_priority):
+            suitable_formats.append(format)
+
+    current_best = (suitable_formats[0], video_height_priority.index(suitable_formats[0]["height"]))
+    for format in suitable_formats:
+        video_priority_index = video_height_priority.index(format["height"])
+        if video_priority_index < current_best[1]:
+            current_best = (format, video_priority_index)
+    return current_best[0]
+
+more_comments_template = Template('''<a href="$url">More comments</a>''')
+def get_watch_page(query_string):
+    id = urllib.parse.parse_qs(query_string)['v'][0]
+    tasks = (
+        gevent.spawn(comments.video_comments, id ),
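+        # the youtube-dl extraction below runs concurrently with the comment fetch above;
+        # joinall() blocks until both greenlets finish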
+        gevent.spawn(YoutubeDL(params={'youtube_include_dash_manifest':False}).extract_info, "https://www.youtube.com/watch?v=" + id, download=False)
+    )
+    gevent.joinall(tasks)
+    comments_info, info = tasks[0].value, tasks[1].value
+    comments_html, ctoken = comments_info
+
+    if ctoken == '':
+        more_comments_button = ''
+    else:
+        more_comments_button = more_comments_template.substitute(url = URL_ORIGIN + '/comments?ctoken=' + ctoken)
+    #comments_html = comments.comments_html(video_id(url))
+    #info = YoutubeDL().extract_info(url, download=False)
+
+    #chosen_format = choose_format(info)
+    sort_formats(info)
+
+
+
+    upload_year = info["upload_date"][0:4]
+    upload_month = info["upload_date"][4:6]
+    upload_day = info["upload_date"][6:8]
+    upload_date = upload_month + "/" + upload_day + "/" + upload_year
+
+    related_videos_html = get_related_items_html(info)
+
+    page = yt_watch_template.substitute(
+        video_title=html.escape(info["title"]),
+        page_title=html.escape(info["title"]),
+        uploader=html.escape(info["uploader"]),
+        uploader_channel_url='/' + info["uploader_url"],
+        #upload_date=datetime.datetime.fromtimestamp(info["timestamp"]).strftime("%d %b %Y %H:%M:%S"),
+        upload_date = upload_date,
+        views='{:,}'.format(info["view_count"]),
+        likes=(lambda x: '{:,}'.format(x) if x is not None else "")(info["like_count"]),
+        dislikes=(lambda x: '{:,}'.format(x) if x is not None else "")(info["dislike_count"]),
+        description=html.escape(info["description"]),
+        video_sources=formats_html(info),
+        related = related_videos_html,
+        comments=comments_html,
+        more_comments_button = more_comments_button,
+    )
+    return page
\ No newline at end of file
diff --git a/youtube/watch_later.py b/youtube/watch_later.py
new file mode 100644
index 0000000..126fb6e
--- /dev/null
+++ b/youtube/watch_later.py
@@ -0,0 +1,11 @@
+import os.path
+import json
+watch_later_file = os.path.normpath("youtube/watch_later.txt")
+def add_to_watch_later(video_info_list):
+    with open(watch_later_file, "a", encoding='utf-8') as file:
+        for info in video_info_list:
+            file.write(info + "\n")
+
+
+def get_watch_later_page():
+    pass
\ No newline at end of file
diff --git a/youtube/youtube.py b/youtube/youtube.py
new file mode 100644
index 0000000..7ec75c0
--- /dev/null
+++ b/youtube/youtube.py
@@ -0,0 +1,60 @@
+import mimetypes
+import urllib.parse
+from youtube import watch_later, watch, search, playlist, channel, comments
+YOUTUBE_FILES = (
+    "/shared.css",
+    "/opensearch.xml",
+    '/comments.css',
+)
+
+def youtube(env, start_response):
+    path, method, query_string = env['PATH_INFO'], env['REQUEST_METHOD'], env['QUERY_STRING']
+    if method == "GET":
+        if path in YOUTUBE_FILES:
+            with open("youtube" + path, 'rb') as f:
+                mime_type = mimetypes.guess_type(path)[0] or 'application/octet-stream'
+                start_response('200 OK', (('Content-type', mime_type),))
+                return f.read()
+
+        elif path == "/comments":
+            start_response('200 OK', (('Content-type','text/html'),))
+            return comments.get_comments_page(query_string).encode()
+
+        elif path == "/watch":
+            start_response('200 OK', (('Content-type','text/html'),))
+            return watch.get_watch_page(query_string).encode()
+
+        elif path == "/search":
+            start_response('200 OK', (('Content-type','text/html'),))
+            return search.get_search_page(query_string).encode()
+
+        elif path == "/playlist":
+            start_response('200 OK', (('Content-type','text/html'),))
+            return playlist.get_playlist_page(query_string).encode()
+
+        elif path.startswith("/channel/"):
+            start_response('200 OK', (('Content-type','text/html'),))
+            return channel.get_channel_page(path[9:], query_string=query_string).encode()
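+
+        # /user/ URLs take YouTube's legacy usernames rather than channel IDs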
+        elif path.startswith("/user/"):
+            start_response('200 OK', (('Content-type','text/html'),))
+            return channel.get_user_page(path[6:], query_string=query_string).encode()
+
+        else:
+            start_response('404 Not Found', ())
+            return b'404 Not Found'
+
+    elif method == "POST":
+        if path == "/edit_playlist":
+            fields = urllib.parse.parse_qs(env['wsgi.input'].read().decode())
+            if fields['action'][0] == 'add' and fields['playlist_name'][0] == 'watch_later':
+                watch_later.add_to_watch_later(fields['video_info_list'])
+
+            start_response('204 No Content', ())
+            return b''
+        else:
+            start_response('404 Not Found', ())
+            return b'404 Not Found'
+
+    else:
+        start_response('501 Not Implemented', ())
+        return b'501 Not Implemented'
\ No newline at end of file
diff --git a/yt_channel_about_template.html b/yt_channel_about_template.html
new file mode 100644
index 0000000..d45e7a0
--- /dev/null
+++ b/yt_channel_about_template.html
@@ -0,0 +1,128 @@
+ + + + + $page_title + + + + +
    +
    + +
    +
    +
    + + + + +
    +
    +
    +
    + +

    $channel_title

    + +
    +
      +$stats + +
    +
    +

    Description

    + $description +
    +$links +
    +
    + + + + + + + \ No newline at end of file diff --git a/yt_channel_items_template.html b/yt_channel_items_template.html new file mode 100644 index 0000000..58333e3 --- /dev/null +++ b/yt_channel_items_template.html @@ -0,0 +1,134 @@ + + + + + $page_title + + + + +
    +
    + +
    +
    +
    + + + + +
    +
    +
    +
    + +

    $channel_title

    + +
    $number_of_results
    + + +
    + + + + + + + \ No newline at end of file diff --git a/yt_comments_template.html b/yt_comments_template.html new file mode 100644 index 0000000..28630ec --- /dev/null +++ b/yt_comments_template.html @@ -0,0 +1,62 @@ + + + + + $page_title + + + + + +
    +
    + +
    +
    +
    +
    +
    +$comments +
    +$more_comments_button +
    +
    + + \ No newline at end of file diff --git a/yt_playlist_template.html b/yt_playlist_template.html new file mode 100644 index 0000000..1d664b4 --- /dev/null +++ b/yt_playlist_template.html @@ -0,0 +1,132 @@ + + + + + $page_title + + + + +
    +
    + +
    +
    +
    +
    + + +
    +$videos +
    + +
    + + + +
    + + + + + + \ No newline at end of file diff --git a/yt_search_results_template.html b/yt_search_results_template.html new file mode 100644 index 0000000..18a8dc9 --- /dev/null +++ b/yt_search_results_template.html @@ -0,0 +1,105 @@ + + + + + $page_title + + + + +
    +
    + +
    +
    +
    +
    +
    +
    Approximately $number_of_results results ($number_of_pages pages)
    +$corrections +
    +
    +$results +
    + +
    + + + +
    + + + + + + \ No newline at end of file diff --git a/yt_search_template.html b/yt_search_template.html new file mode 100644 index 0000000..4ebcb81 --- /dev/null +++ b/yt_search_template.html @@ -0,0 +1,108 @@ + + + + + Search + + + + + + +
    +
    + + + + + + + + + + \ No newline at end of file diff --git a/yt_watch_template.html b/yt_watch_template.html new file mode 100644 index 0000000..fe989a8 --- /dev/null +++ b/yt_watch_template.html @@ -0,0 +1,148 @@ + + + + + $page_title + + + + + +
    +
    + +
    +
    +
    + + + + +
    +
    +
    +
    +
    +
    + + + +

    $video_title

    + +
    Uploaded by $uploader
    + $views views + + + + + + $description + +
    +$comments +
    +$more_comments_button +
    +
    + + + +
    + + + + + + + \ No newline at end of file -- cgit v1.2.3