From 79937c1c823f998a1d6bb324901fd13b483b3607 Mon Sep 17 00:00:00 2001 From: James Taylor Date: Mon, 2 Jul 2018 17:45:25 -0700 Subject: fix line endings --- youtube/channel.py | 503 +++++++++--------- youtube/comments.css | 116 ++--- youtube/comments.py | 332 ++++++------ youtube/common.py | 1278 +++++++++++++++++++++++----------------------- youtube/opensearch.xml | 20 +- youtube/playlist.py | 484 +++++++++--------- youtube/proto.py | 128 ++--- youtube/search.py | 460 ++++++++--------- youtube/shared.css | 540 ++++++++++---------- youtube/subscriptions.py | 36 +- youtube/template.py | 262 +++++----- youtube/watch.py | 586 ++++++++++----------- youtube/watch_later.py | 20 +- youtube/youtube.py | 118 ++--- 14 files changed, 2442 insertions(+), 2441 deletions(-) (limited to 'youtube') diff --git a/youtube/channel.py b/youtube/channel.py index d993d3b..b7a4462 100644 --- a/youtube/channel.py +++ b/youtube/channel.py @@ -1,252 +1,253 @@ -import base64 -import youtube.common as common -from youtube.common import default_multi_get, URL_ORIGIN, get_thumbnail_url, video_id -import urllib -import json -from string import Template -import youtube.proto as proto -import html -import math -import gevent -import re -import functools - -with open("yt_channel_items_template.html", "r") as file: - yt_channel_items_template = Template(file.read()) - -with open("yt_channel_about_template.html", "r") as file: - yt_channel_about_template = Template(file.read()) - -'''continuation = Proto( - Field('optional', 'continuation', 80226972, Proto( - Field('optional', 'browse_id', 2, String), - Field('optional', 'params', 3, Base64(Proto( - Field('optional', 'channel_tab', 2, String), - Field('optional', 'sort', 3, ENUM - Field('optional', 'page', 15, String), - ))) - )) -)''' - - -'''channel_continuation = Proto( - Field('optional', 'pointless_nest', 80226972, Proto( - Field('optional', 'channel_id', 2, String), - Field('optional', 'continuation_info', 3, Base64(Proto( - Field('optional', 'channel_tab', 2, String), - Field('optional', 'sort', 3, ENUM - Field('optional', 'page', 15, String), - ))) - )) -)''' - -headers_1 = ( - ('Accept', '*/*'), - ('Accept-Language', 'en-US,en;q=0.5'), - ('X-YouTube-Client-Name', '1'), - ('X-YouTube-Client-Version', '2.20180614'), -) -# https://www.youtube.com/browse_ajax?action_continuation=1&direct_render=1&continuation=4qmFsgJAEhhVQzdVY3M0MkZaeTN1WXpqcnF6T0lIc3caJEVnWjJhV1JsYjNNZ0FEZ0JZQUZxQUhvQk1yZ0JBQSUzRCUzRA%3D%3D -# https://www.youtube.com/browse_ajax?ctoken=4qmFsgJAEhhVQzdVY3M0MkZaeTN1WXpqcnF6T0lIc3caJEVnWjJhV1JsYjNNZ0FEZ0JZQUZxQUhvQk1yZ0JBQSUzRCUzRA%3D%3D&continuation=4qmFsgJAEhhVQzdVY3M0MkZaeTN1WXpqcnF6T0lIc3caJEVnWjJhV1JsYjNNZ0FEZ0JZQUZxQUhvQk1yZ0JBQSUzRCUzRA%3D%3D&itct=CDsQybcCIhMIhZi1krTc2wIVjMicCh2HXQnhKJsc - -# grid view: 4qmFsgJAEhhVQzdVY3M0MkZaeTN1WXpqcnF6T0lIc3caJEVnWjJhV1JsYjNNZ0FEZ0JZQUZxQUhvQk1yZ0JBQSUzRCUzRA -# list view: 4qmFsgJCEhhVQzdVY3M0MkZaeTN1WXpqcnF6T0lIc3caJkVnWjJhV1JsYjNNWUF5QUFNQUk0QVdBQmFnQjZBVEs0QVFBJTNE -# SORT: -# Popular - 1 -# Oldest - 2 -# Newest - 3 - -# view: -# grid: 0 or 1 -# list: 2 -def channel_ctoken(channel_id, page, sort, tab, view=1): - - tab = proto.string(2, tab ) - sort = proto.uint(3, int(sort)) - page = proto.string(15, str(page) ) - view = proto.uint(6, int(view)) - continuation_info = proto.string( 3, proto.percent_b64encode(tab + view + sort + page) ) - - channel_id = proto.string(2, channel_id ) - pointless_nest = proto.string(80226972, channel_id + continuation_info) - - return base64.urlsafe_b64encode(pointless_nest).decode('ascii') - -def get_channel_tab(channel_id, page="1", sort=3, tab='videos', view=1): - ctoken = channel_ctoken(channel_id, page, sort, tab, view).replace('=', '%3D') - url = "https://www.youtube.com/browse_ajax?ctoken=" + ctoken - - print("Sending channel tab ajax request") - content = common.fetch_url(url, headers_1) - print("Finished recieving channel tab response") - - info = json.loads(content) - return info - - -grid_video_item_template = Template(''' -
-
- - - $duration - - $title - - $views - - -
- -
-''') - -def grid_video_item_info(grid_video_renderer, author): - renderer = grid_video_renderer - return { - "title": renderer['title']['simpleText'], - "id": renderer['videoId'], - "views": renderer['viewCountText'].get('simpleText', None) or renderer['viewCountText']['runs'][0]['text'], - "author": author, - "duration": default_multi_get(renderer, 'lengthText', 'simpleText', default=''), # livestreams dont have a length - "published": default_multi_get(renderer, 'publishedTimeText', 'simpleText', default=''), - } - -def grid_video_item_html(item): - video_info = json.dumps({key: item[key] for key in ('id', 'title', 'author', 'duration')}) - return grid_video_item_template.substitute( - title = html.escape(item["title"]), - views = item["views"], - duration = item["duration"], - url = URL_ORIGIN + "/watch?v=" + item["id"], - thumbnail = get_thumbnail_url(item['id']), - video_info = html.escape(json.dumps(video_info)), - published = item["published"], - datetime = '', # TODO - ) - -def get_number_of_videos(channel_id): - # Uploads playlist - playlist_id = 'UU' + channel_id[2:] - url = 'https://m.youtube.com/playlist?list=' + playlist_id + '&ajax=1&disable_polymer=true' - print("Getting number of videos") - response = common.fetch_url(url, common.mobile_ua + headers_1) - with open('playlist_debug_metadata', 'wb') as f: - f.write(response) - response = response.decode('utf-8') - print("Got response for number of videos") - return int(re.search(r'"num_videos_text":\s*{(?:"item_type":\s*"formatted_string",)?\s*"runs":\s*\[{"text":\s*"([\d,]*) videos"', response).group(1).replace(',','')) - -@functools.lru_cache(maxsize=128) -def get_channel_id(username): - # method that gives the smallest possible response at ~10 kb - # needs to be as fast as possible - url = 'https://m.youtube.com/user/' + username + '/about?ajax=1&disable_polymer=true' - response = common.fetch_url(url, common.mobile_ua + headers_1).decode('utf-8') - return re.search(r'"channel_id":\s*"([a-zA-Z0-9_-]*)"', response).group(1) - - -def channel_videos_html(polymer_json, current_page=1, number_of_videos = 1000, current_query_string=''): - microformat = polymer_json[1]['response']['microformat']['microformatDataRenderer'] - channel_url = microformat['urlCanonical'].rstrip('/') - channel_id = channel_url[channel_url.rfind('/')+1:] - try: - items = polymer_json[1]['response']['continuationContents']['gridContinuation']['items'] - except KeyError: - items = polymer_json[1]['response']['contents']['twoColumnBrowseResultsRenderer']['tabs'][1]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['gridRenderer']['items'] - items_html = '' - for video in items: - items_html += grid_video_item_html(grid_video_item_info(video['gridVideoRenderer'], microformat['title'])) - - return yt_channel_items_template.substitute( - channel_title = microformat['title'], - channel_about_url = URL_ORIGIN + "/channel/" + channel_id + "/about", - avatar = '/' + microformat['thumbnail']['thumbnails'][0]['url'], - page_title = microformat['title'] + ' - Channel', - items = items_html, - page_buttons = common.page_buttons_html(current_page, math.ceil(number_of_videos/30), URL_ORIGIN + "/channel/" + channel_id + "/videos", current_query_string) - ) - -channel_link_template = Template(''' -$text''') -stat_template = Template(''' -
  • $stat_value
  • ''') -def channel_about_page(polymer_json): - avatar = '/' + polymer_json[1]['response']['microformat']['microformatDataRenderer']['thumbnail']['thumbnails'][0]['url'] - # my goodness... - channel_metadata = polymer_json[1]['response']['contents']['twoColumnBrowseResultsRenderer']['tabs'][5]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer'] - channel_links = '' - for link_json in channel_metadata['primaryLinks']: - channel_links += channel_link_template.substitute( - url = html.escape(link_json['navigationEndpoint']['urlEndpoint']['url']), - text = common.get_plain_text(link_json['title']), - ) - - stats = '' - for stat_name in ('subscriberCountText', 'joinedDateText', 'viewCountText', 'country'): - try: - stat_value = common.get_plain_text(channel_metadata[stat_name]) - except KeyError: - continue - else: - stats += stat_template.substitute(stat_value=stat_value) - try: - description = common.format_text_runs(common.get_formatted_text(channel_metadata['description'])) - except KeyError: - description = '' - return yt_channel_about_template.substitute( - page_title = common.get_plain_text(channel_metadata['title']) + ' - About', - channel_title = common.get_plain_text(channel_metadata['title']), - avatar = html.escape(avatar), - description = description, - links = channel_links, - stats = stats, - channel_videos_url = common.URL_ORIGIN + '/channel/' + channel_metadata['channelId'] + '/videos', - ) - -def get_channel_page(url, query_string=''): - path_components = url.rstrip('/').lstrip('/').split('/') - channel_id = path_components[0] - try: - tab = path_components[1] - except IndexError: - tab = 'videos' - - parameters = urllib.parse.parse_qs(query_string) - page_number = int(common.default_multi_get(parameters, 'page', 0, default='1')) - sort = common.default_multi_get(parameters, 'sort', 0, default='3') - view = common.default_multi_get(parameters, 'view', 0, default='1') - - if tab == 'videos': - tasks = ( - gevent.spawn(get_number_of_videos, channel_id ), - gevent.spawn(get_channel_tab, channel_id, page_number, sort, 'videos', view) - ) - gevent.joinall(tasks) - number_of_videos, polymer_json = tasks[0].value, tasks[1].value - - return channel_videos_html(polymer_json, page_number, number_of_videos, query_string) - elif tab == 'about': - polymer_json = common.fetch_url('https://www.youtube.com/channel/' + channel_id + '/about?pbj=1', headers_1) - polymer_json = json.loads(polymer_json) - return channel_about_page(polymer_json) - else: - raise ValueError('Unknown channel tab: ' + tab) - -def get_user_page(url, query_string=''): - path_components = url.rstrip('/').lstrip('/').split('/') - username = path_components[0] - try: - page = path_components[1] - except IndexError: - page = 'videos' - if page == 'videos': - polymer_json = common.fetch_url('https://www.youtube.com/user/' + username + '/videos?pbj=1', headers_1) - polymer_json = json.loads(polymer_json) - return channel_videos_html(polymer_json) - elif page == 'about': - polymer_json = common.fetch_url('https://www.youtube.com/user/' + username + '/about?pbj=1', headers_1) - polymer_json = json.loads(polymer_json) - return channel_about_page(polymer_json) - else: +import base64 +import youtube.common as common +from youtube.common import default_multi_get, URL_ORIGIN, get_thumbnail_url, video_id +import urllib +import json +from string import Template +import youtube.proto as proto +import html +import math +import gevent +import re +import functools + +with open("yt_channel_items_template.html", "r") as file: + yt_channel_items_template = Template(file.read()) + +with open("yt_channel_about_template.html", "r") as file: + yt_channel_about_template = Template(file.read()) + +'''continuation = Proto( + Field('optional', 'continuation', 80226972, Proto( + Field('optional', 'browse_id', 2, String), + Field('optional', 'params', 3, Base64(Proto( + Field('optional', 'channel_tab', 2, String), + Field('optional', 'sort', 3, ENUM + Field('optional', 'page', 15, String), + ))) + )) +)''' + + +'''channel_continuation = Proto( + Field('optional', 'pointless_nest', 80226972, Proto( + Field('optional', 'channel_id', 2, String), + Field('optional', 'continuation_info', 3, Base64(Proto( + Field('optional', 'channel_tab', 2, String), + Field('optional', 'sort', 3, ENUM + Field('optional', 'page', 15, String), + ))) + )) +)''' + +headers_1 = ( + ('Accept', '*/*'), + ('Accept-Language', 'en-US,en;q=0.5'), + ('X-YouTube-Client-Name', '1'), + ('X-YouTube-Client-Version', '2.20180614'), +) +# https://www.youtube.com/browse_ajax?action_continuation=1&direct_render=1&continuation=4qmFsgJAEhhVQzdVY3M0MkZaeTN1WXpqcnF6T0lIc3caJEVnWjJhV1JsYjNNZ0FEZ0JZQUZxQUhvQk1yZ0JBQSUzRCUzRA%3D%3D +# https://www.youtube.com/browse_ajax?ctoken=4qmFsgJAEhhVQzdVY3M0MkZaeTN1WXpqcnF6T0lIc3caJEVnWjJhV1JsYjNNZ0FEZ0JZQUZxQUhvQk1yZ0JBQSUzRCUzRA%3D%3D&continuation=4qmFsgJAEhhVQzdVY3M0MkZaeTN1WXpqcnF6T0lIc3caJEVnWjJhV1JsYjNNZ0FEZ0JZQUZxQUhvQk1yZ0JBQSUzRCUzRA%3D%3D&itct=CDsQybcCIhMIhZi1krTc2wIVjMicCh2HXQnhKJsc + +# grid view: 4qmFsgJAEhhVQzdVY3M0MkZaeTN1WXpqcnF6T0lIc3caJEVnWjJhV1JsYjNNZ0FEZ0JZQUZxQUhvQk1yZ0JBQSUzRCUzRA +# list view: 4qmFsgJCEhhVQzdVY3M0MkZaeTN1WXpqcnF6T0lIc3caJkVnWjJhV1JsYjNNWUF5QUFNQUk0QVdBQmFnQjZBVEs0QVFBJTNE +# SORT: +# Popular - 1 +# Oldest - 2 +# Newest - 3 + +# view: +# grid: 0 or 1 +# list: 2 +def channel_ctoken(channel_id, page, sort, tab, view=1): + + tab = proto.string(2, tab ) + sort = proto.uint(3, int(sort)) + page = proto.string(15, str(page) ) + view = proto.uint(6, int(view)) + continuation_info = proto.string( 3, proto.percent_b64encode(tab + view + sort + page) ) + + channel_id = proto.string(2, channel_id ) + pointless_nest = proto.string(80226972, channel_id + continuation_info) + + return base64.urlsafe_b64encode(pointless_nest).decode('ascii') + +def get_channel_tab(channel_id, page="1", sort=3, tab='videos', view=1): + ctoken = channel_ctoken(channel_id, page, sort, tab, view).replace('=', '%3D') + url = "https://www.youtube.com/browse_ajax?ctoken=" + ctoken + + print("Sending channel tab ajax request") + content = common.fetch_url(url, headers_1) + print("Finished recieving channel tab response") + + info = json.loads(content) + return info + + +grid_video_item_template = Template(''' +
    +
    + + + $duration + + $title + + $views + + +
    + +
    +''') + +def grid_video_item_info(grid_video_renderer, author): + renderer = grid_video_renderer + return { + "title": renderer['title']['simpleText'], + "id": renderer['videoId'], + "views": renderer['viewCountText'].get('simpleText', None) or renderer['viewCountText']['runs'][0]['text'], + "author": author, + "duration": default_multi_get(renderer, 'lengthText', 'simpleText', default=''), # livestreams dont have a length + "published": default_multi_get(renderer, 'publishedTimeText', 'simpleText', default=''), + } + +def grid_video_item_html(item): + video_info = json.dumps({key: item[key] for key in ('id', 'title', 'author', 'duration')}) + return grid_video_item_template.substitute( + title = html.escape(item["title"]), + views = item["views"], + duration = item["duration"], + url = URL_ORIGIN + "/watch?v=" + item["id"], + thumbnail = get_thumbnail_url(item['id']), + video_info = html.escape(json.dumps(video_info)), + published = item["published"], + datetime = '', # TODO + ) + +def get_number_of_videos(channel_id): + # Uploads playlist + playlist_id = 'UU' + channel_id[2:] + url = 'https://m.youtube.com/playlist?list=' + playlist_id + '&ajax=1&disable_polymer=true' + print("Getting number of videos") + response = common.fetch_url(url, common.mobile_ua + headers_1) + with open('playlist_debug_metadata', 'wb') as f: + f.write(response) + response = response.decode('utf-8') + print("Got response for number of videos") + return int(re.search(r'"num_videos_text":\s*{(?:"item_type":\s*"formatted_string",)?\s*"runs":\s*\[{"text":\s*"([\d,]*) videos"', response).group(1).replace(',','')) + +@functools.lru_cache(maxsize=128) +def get_channel_id(username): + # method that gives the smallest possible response at ~10 kb + # needs to be as fast as possible + url = 'https://m.youtube.com/user/' + username + '/about?ajax=1&disable_polymer=true' + response = common.fetch_url(url, common.mobile_ua + headers_1).decode('utf-8') + return re.search(r'"channel_id":\s*"([a-zA-Z0-9_-]*)"', response).group(1) + + +def channel_videos_html(polymer_json, current_page=1, number_of_videos = 1000, current_query_string=''): + microformat = polymer_json[1]['response']['microformat']['microformatDataRenderer'] + channel_url = microformat['urlCanonical'].rstrip('/') + channel_id = channel_url[channel_url.rfind('/')+1:] + try: + items = polymer_json[1]['response']['continuationContents']['gridContinuation']['items'] + except KeyError: + items = polymer_json[1]['response']['contents']['twoColumnBrowseResultsRenderer']['tabs'][1]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['gridRenderer']['items'] + items_html = '' + for video in items: + items_html += grid_video_item_html(grid_video_item_info(video['gridVideoRenderer'], microformat['title'])) + + return yt_channel_items_template.substitute( + channel_title = microformat['title'], + channel_about_url = URL_ORIGIN + "/channel/" + channel_id + "/about", + avatar = '/' + microformat['thumbnail']['thumbnails'][0]['url'], + page_title = microformat['title'] + ' - Channel', + items = items_html, + page_buttons = common.page_buttons_html(current_page, math.ceil(number_of_videos/30), URL_ORIGIN + "/channel/" + channel_id + "/videos", current_query_string), + number_of_results = '{:,}'.format(number_of_videos) + " videos", + ) + +channel_link_template = Template(''' +$text''') +stat_template = Template(''' +
  • $stat_value
  • ''') +def channel_about_page(polymer_json): + avatar = '/' + polymer_json[1]['response']['microformat']['microformatDataRenderer']['thumbnail']['thumbnails'][0]['url'] + # my goodness... + channel_metadata = polymer_json[1]['response']['contents']['twoColumnBrowseResultsRenderer']['tabs'][5]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer'] + channel_links = '' + for link_json in channel_metadata['primaryLinks']: + channel_links += channel_link_template.substitute( + url = html.escape(link_json['navigationEndpoint']['urlEndpoint']['url']), + text = common.get_plain_text(link_json['title']), + ) + + stats = '' + for stat_name in ('subscriberCountText', 'joinedDateText', 'viewCountText', 'country'): + try: + stat_value = common.get_plain_text(channel_metadata[stat_name]) + except KeyError: + continue + else: + stats += stat_template.substitute(stat_value=stat_value) + try: + description = common.format_text_runs(common.get_formatted_text(channel_metadata['description'])) + except KeyError: + description = '' + return yt_channel_about_template.substitute( + page_title = common.get_plain_text(channel_metadata['title']) + ' - About', + channel_title = common.get_plain_text(channel_metadata['title']), + avatar = html.escape(avatar), + description = description, + links = channel_links, + stats = stats, + channel_videos_url = common.URL_ORIGIN + '/channel/' + channel_metadata['channelId'] + '/videos', + ) + +def get_channel_page(url, query_string=''): + path_components = url.rstrip('/').lstrip('/').split('/') + channel_id = path_components[0] + try: + tab = path_components[1] + except IndexError: + tab = 'videos' + + parameters = urllib.parse.parse_qs(query_string) + page_number = int(common.default_multi_get(parameters, 'page', 0, default='1')) + sort = common.default_multi_get(parameters, 'sort', 0, default='3') + view = common.default_multi_get(parameters, 'view', 0, default='1') + + if tab == 'videos': + tasks = ( + gevent.spawn(get_number_of_videos, channel_id ), + gevent.spawn(get_channel_tab, channel_id, page_number, sort, 'videos', view) + ) + gevent.joinall(tasks) + number_of_videos, polymer_json = tasks[0].value, tasks[1].value + + return channel_videos_html(polymer_json, page_number, number_of_videos, query_string) + elif tab == 'about': + polymer_json = common.fetch_url('https://www.youtube.com/channel/' + channel_id + '/about?pbj=1', headers_1) + polymer_json = json.loads(polymer_json) + return channel_about_page(polymer_json) + else: + raise ValueError('Unknown channel tab: ' + tab) + +def get_user_page(url, query_string=''): + path_components = url.rstrip('/').lstrip('/').split('/') + username = path_components[0] + try: + page = path_components[1] + except IndexError: + page = 'videos' + if page == 'videos': + polymer_json = common.fetch_url('https://www.youtube.com/user/' + username + '/videos?pbj=1', headers_1) + polymer_json = json.loads(polymer_json) + return channel_videos_html(polymer_json) + elif page == 'about': + polymer_json = common.fetch_url('https://www.youtube.com/user/' + username + '/about?pbj=1', headers_1) + polymer_json = json.loads(polymer_json) + return channel_about_page(polymer_json) + else: raise ValueError('Unknown channel page: ' + page) \ No newline at end of file diff --git a/youtube/comments.css b/youtube/comments.css index 93a6495..325a433 100644 --- a/youtube/comments.css +++ b/youtube/comments.css @@ -1,59 +1,59 @@ -.comments{ - grid-row-gap: 10px; - display: grid; - align-content:start; -} - -.comment{ - display:grid; - grid-template-columns: 0fr 0fr 1fr; - grid-template-rows: 0fr 0fr 0fr 0fr; - background-color: #dadada; -} - -.comment .author-avatar{ - grid-column: 1; - grid-row: 1 / span 3; - align-self: start; - margin-right: 5px; -} - -.comment address{ - grid-column: 2; - grid-row: 1; - margin-right:15px; - white-space: nowrap; -} - -.comment .text{ - grid-column: 2 / span 2; - grid-row: 2; - white-space: pre-line; - min-width: 0; -} - -.comment time{ - grid-column: 3; - grid-row: 1; - white-space: nowrap; - -} - - -.comment .likes{ - grid-column:2; - grid-row:3; - font-weight:bold; - white-space: nowrap; -} - -.comment .replies{ - grid-column:2 / span 2; - grid-row:4; - justify-self:start; -} - -.more-comments{ - justify-self:center; - +.comments{ + grid-row-gap: 10px; + display: grid; + align-content:start; +} + +.comment{ + display:grid; + grid-template-columns: 0fr 0fr 1fr; + grid-template-rows: 0fr 0fr 0fr 0fr; + background-color: #dadada; +} + +.comment .author-avatar{ + grid-column: 1; + grid-row: 1 / span 3; + align-self: start; + margin-right: 5px; +} + +.comment address{ + grid-column: 2; + grid-row: 1; + margin-right:15px; + white-space: nowrap; +} + +.comment .text{ + grid-column: 2 / span 2; + grid-row: 2; + white-space: pre-line; + min-width: 0; +} + +.comment time{ + grid-column: 3; + grid-row: 1; + white-space: nowrap; + +} + + +.comment .likes{ + grid-column:2; + grid-row:3; + font-weight:bold; + white-space: nowrap; +} + +.comment .replies{ + grid-column:2 / span 2; + grid-row:4; + justify-self:start; +} + +.more-comments{ + justify-self:center; + } \ No newline at end of file diff --git a/youtube/comments.py b/youtube/comments.py index 4b30a48..3f44758 100644 --- a/youtube/comments.py +++ b/youtube/comments.py @@ -1,166 +1,166 @@ -import json -import youtube.proto as proto -import base64 -from youtube.common import uppercase_escape, default_multi_get, format_text_runs, URL_ORIGIN, fetch_url -from string import Template -import urllib.request -import urllib -import html -comment_template = Template(''' -
    -
    - - - -
    - $author -
    - $text - - -$replies -
    - -
    -''') -reply_link_template = Template(''' - View replies -''') -with open("yt_comments_template.html", "r") as file: - yt_comments_template = Template(file.read()) - - -# $replies_link_text - - -# Here's what I know about the secret key (starting with ASJN_i) -# *The secret key definitely contains the following information (or perhaps the information is stored at youtube's servers): -# -Video id -# -Offset -# -Sort -# *If the video id or sort in the ctoken contradicts the ASJN, the response is an error. The offset encoded outside the ASJN is ignored entirely. -# *The ASJN is base64 encoded data, indicated by the fact that the character after "ASJN_i" is one of ("0", "1", "2", "3") -# *The encoded data is not valid protobuf -# *The encoded data (after the 5 or so bytes that are always the same) is indistinguishable from random data according to a battery of randomness tests -# *The ASJN in the ctoken provided by a response changes in regular intervals of about a second or two. -# *Old ASJN's continue to work, and start at the same comment even if new comments have been posted since -# *The ASJN has no relation with any of the data in the response it came from - -def make_comment_ctoken(video_id, sort=0, offset=0, secret_key=''): - video_id = proto.as_bytes(video_id) - secret_key = proto.as_bytes(secret_key) - - - page_info = proto.string(4,video_id) + proto.uint(6, sort) - offset_information = proto.nested(4, page_info) + proto.uint(5, offset) - if secret_key: - offset_information = proto.string(1, secret_key) + offset_information - - result = proto.nested(2, proto.string(2, video_id)) + proto.uint(3,6) + proto.nested(6, offset_information) - return base64.urlsafe_b64encode(result).decode('ascii') - -mobile_headers = { - 'Host': 'm.youtube.com', - 'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1', - 'Accept': '*/*', - 'Accept-Language': 'en-US,en;q=0.5', - 'X-YouTube-Client-Name': '2', - 'X-YouTube-Client-Version': '1.20180613', -} -def request_comments(ctoken, replies=False): - if replies: # let's make it use different urls for no reason despite all the data being encoded - base_url = "https://m.youtube.com/watch_comment?action_get_comment_replies=1&ctoken=" - else: - base_url = "https://m.youtube.com/watch_comment?action_get_comments=1&ctoken=" - url = base_url + ctoken.replace("=", "%3D") + "&pbj=1" - print("Sending comments ajax request") - for i in range(0,8): # don't retry more than 8 times - content = fetch_url(url, headers=mobile_headers) - if content[0:4] == b")]}'": # random closing characters included at beginning of response for some reason - content = content[4:] - elif content[0:10] == b'\n, retrying") - continue - break - '''with open('comments_debug', 'wb') as f: - f.write(content)''' - return content - -def parse_comments(content, replies=False): - try: - content = json.loads(uppercase_escape(content.decode('utf-8'))) - #print(content) - comments_raw = content['content']['continuation_contents']['contents'] - ctoken = default_multi_get(content, 'content', 'continuation_contents', 'continuations', 0, 'continuation', default='') - - comments = [] - for comment_raw in comments_raw: - replies_url = '' - if not replies: - if comment_raw['replies'] is not None: - ctoken = comment_raw['replies']['continuations'][0]['continuation'] - replies_url = URL_ORIGIN + '/comments?ctoken=' + ctoken + "&replies=1" - comment_raw = comment_raw['comment'] - comment = { - 'author': comment_raw['author']['runs'][0]['text'], - 'author_url': comment_raw['author_endpoint']['url'], - 'author_avatar': comment_raw['author_thumbnail']['url'], - 'likes': comment_raw['like_count'], - 'published': comment_raw['published_time']['runs'][0]['text'], - 'text': comment_raw['content']['runs'], - 'reply_count': '', - 'replies_url': replies_url, - } - comments.append(comment) - except Exception as e: - print('Error parsing comments: ' + str(e)) - comments = () - ctoken = '' - else: - print("Finished getting and parsing comments") - return {'ctoken': ctoken, 'comments': comments} - -def get_comments_html(result): - html_result = '' - for comment in result['comments']: - replies = '' - if comment['replies_url']: - replies = reply_link_template.substitute(url=comment['replies_url']) - html_result += comment_template.substitute( - author=html.escape(comment['author']), - author_url = URL_ORIGIN + comment['author_url'], - author_avatar = '/' + comment['author_avatar'], - likes = str(comment['likes']) + ' likes' if str(comment['likes']) != '0' else '', - published = comment['published'], - text = format_text_runs(comment['text']), - datetime = '', #TODO - replies=replies, - #replies='', - ) - return html_result, result['ctoken'] - -def video_comments(video_id, sort=0, offset=0, secret_key=''): - result = parse_comments(request_comments(make_comment_ctoken(video_id, sort, offset, secret_key))) - return get_comments_html(result) - -more_comments_template = Template('''More comments''') - -def get_comments_page(query_string): - parameters = urllib.parse.parse_qs(query_string) - ctoken = parameters['ctoken'][0] - replies = default_multi_get(parameters, 'replies', 0, default="0") == "1" - - result = parse_comments(request_comments(ctoken, replies), replies) - comments_html, ctoken = get_comments_html(result) - if ctoken == '': - more_comments_button = '' - else: - more_comments_button = more_comments_template.substitute(url = URL_ORIGIN + '/comments?ctoken=' + ctoken) - - return yt_comments_template.substitute( - comments = comments_html, - page_title = 'Comments', - more_comments_button=more_comments_button, - ) - +import json +import youtube.proto as proto +import base64 +from youtube.common import uppercase_escape, default_multi_get, format_text_runs, URL_ORIGIN, fetch_url +from string import Template +import urllib.request +import urllib +import html +comment_template = Template(''' +
    +
    + + + +
    + $author +
    + $text + + +$replies +
    + +
    +''') +reply_link_template = Template(''' + View replies +''') +with open("yt_comments_template.html", "r") as file: + yt_comments_template = Template(file.read()) + + +# $replies_link_text + + +# Here's what I know about the secret key (starting with ASJN_i) +# *The secret key definitely contains the following information (or perhaps the information is stored at youtube's servers): +# -Video id +# -Offset +# -Sort +# *If the video id or sort in the ctoken contradicts the ASJN, the response is an error. The offset encoded outside the ASJN is ignored entirely. +# *The ASJN is base64 encoded data, indicated by the fact that the character after "ASJN_i" is one of ("0", "1", "2", "3") +# *The encoded data is not valid protobuf +# *The encoded data (after the 5 or so bytes that are always the same) is indistinguishable from random data according to a battery of randomness tests +# *The ASJN in the ctoken provided by a response changes in regular intervals of about a second or two. +# *Old ASJN's continue to work, and start at the same comment even if new comments have been posted since +# *The ASJN has no relation with any of the data in the response it came from + +def make_comment_ctoken(video_id, sort=0, offset=0, secret_key=''): + video_id = proto.as_bytes(video_id) + secret_key = proto.as_bytes(secret_key) + + + page_info = proto.string(4,video_id) + proto.uint(6, sort) + offset_information = proto.nested(4, page_info) + proto.uint(5, offset) + if secret_key: + offset_information = proto.string(1, secret_key) + offset_information + + result = proto.nested(2, proto.string(2, video_id)) + proto.uint(3,6) + proto.nested(6, offset_information) + return base64.urlsafe_b64encode(result).decode('ascii') + +mobile_headers = { + 'Host': 'm.youtube.com', + 'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1', + 'Accept': '*/*', + 'Accept-Language': 'en-US,en;q=0.5', + 'X-YouTube-Client-Name': '2', + 'X-YouTube-Client-Version': '1.20180613', +} +def request_comments(ctoken, replies=False): + if replies: # let's make it use different urls for no reason despite all the data being encoded + base_url = "https://m.youtube.com/watch_comment?action_get_comment_replies=1&ctoken=" + else: + base_url = "https://m.youtube.com/watch_comment?action_get_comments=1&ctoken=" + url = base_url + ctoken.replace("=", "%3D") + "&pbj=1" + print("Sending comments ajax request") + for i in range(0,8): # don't retry more than 8 times + content = fetch_url(url, headers=mobile_headers) + if content[0:4] == b")]}'": # random closing characters included at beginning of response for some reason + content = content[4:] + elif content[0:10] == b'\n, retrying") + continue + break + '''with open('comments_debug', 'wb') as f: + f.write(content)''' + return content + +def parse_comments(content, replies=False): + try: + content = json.loads(uppercase_escape(content.decode('utf-8'))) + #print(content) + comments_raw = content['content']['continuation_contents']['contents'] + ctoken = default_multi_get(content, 'content', 'continuation_contents', 'continuations', 0, 'continuation', default='') + + comments = [] + for comment_raw in comments_raw: + replies_url = '' + if not replies: + if comment_raw['replies'] is not None: + ctoken = comment_raw['replies']['continuations'][0]['continuation'] + replies_url = URL_ORIGIN + '/comments?ctoken=' + ctoken + "&replies=1" + comment_raw = comment_raw['comment'] + comment = { + 'author': comment_raw['author']['runs'][0]['text'], + 'author_url': comment_raw['author_endpoint']['url'], + 'author_avatar': comment_raw['author_thumbnail']['url'], + 'likes': comment_raw['like_count'], + 'published': comment_raw['published_time']['runs'][0]['text'], + 'text': comment_raw['content']['runs'], + 'reply_count': '', + 'replies_url': replies_url, + } + comments.append(comment) + except Exception as e: + print('Error parsing comments: ' + str(e)) + comments = () + ctoken = '' + else: + print("Finished getting and parsing comments") + return {'ctoken': ctoken, 'comments': comments} + +def get_comments_html(result): + html_result = '' + for comment in result['comments']: + replies = '' + if comment['replies_url']: + replies = reply_link_template.substitute(url=comment['replies_url']) + html_result += comment_template.substitute( + author=html.escape(comment['author']), + author_url = URL_ORIGIN + comment['author_url'], + author_avatar = '/' + comment['author_avatar'], + likes = str(comment['likes']) + ' likes' if str(comment['likes']) != '0' else '', + published = comment['published'], + text = format_text_runs(comment['text']), + datetime = '', #TODO + replies=replies, + #replies='', + ) + return html_result, result['ctoken'] + +def video_comments(video_id, sort=0, offset=0, secret_key=''): + result = parse_comments(request_comments(make_comment_ctoken(video_id, sort, offset, secret_key))) + return get_comments_html(result) + +more_comments_template = Template('''More comments''') + +def get_comments_page(query_string): + parameters = urllib.parse.parse_qs(query_string) + ctoken = parameters['ctoken'][0] + replies = default_multi_get(parameters, 'replies', 0, default="0") == "1" + + result = parse_comments(request_comments(ctoken, replies), replies) + comments_html, ctoken = get_comments_html(result) + if ctoken == '': + more_comments_button = '' + else: + more_comments_button = more_comments_template.substitute(url = URL_ORIGIN + '/comments?ctoken=' + ctoken) + + return yt_comments_template.substitute( + comments = comments_html, + page_title = 'Comments', + more_comments_button=more_comments_button, + ) + diff --git a/youtube/common.py b/youtube/common.py index 67bd81f..3133fed 100644 --- a/youtube/common.py +++ b/youtube/common.py @@ -1,639 +1,639 @@ -from youtube.template import Template -import html -import json -import re -import urllib.parse -import gzip -import brotli -import time - - -URL_ORIGIN = "/https://www.youtube.com" - - -# videos (all of type str): - -# id -# title -# url -# author -# author_url -# thumbnail -# description -# published -# duration -# likes -# dislikes -# views -# playlist_index - -# playlists: - -# id -# title -# url -# author -# author_url -# thumbnail -# description -# updated -# size -# first_video_id - - - - - - - -page_button_template = Template('''$page''') -current_page_button_template = Template('''
    $page''') - -medium_playlist_item_template = Template(''' - -''') -medium_video_item_template = Template(''' -
    - - - $duration - - - $title - -
    $stats
    - - - $description - $badges -
    -''') - -small_video_item_template = Template(''' -
    -
    - - - $duration - - $title - -
    $author
    - $views - -
    - -
    -''') - -small_playlist_item_template = Template(''' -
    -
    - - -
    - $size -
    -
    - $title - -
    $author
    -
    -
    -''') - -medium_channel_item_template = Template(''' -
    - - - $duration - - - $title - - $subscriber_count - $size - - $description -
    -''') - - -def fetch_url(url, headers=(), timeout=5, report_text=None): - if isinstance(headers, list): - headers += [('Accept-Encoding', 'gzip, br')] - headers = dict(headers) - elif isinstance(headers, tuple): - headers += (('Accept-Encoding', 'gzip, br'),) - headers = dict(headers) - else: - headers = headers.copy() - headers['Accept-Encoding'] = 'gzip, br' - - start_time = time.time() - - req = urllib.request.Request(url, headers=headers) - response = urllib.request.urlopen(req, timeout=timeout) - response_time = time.time() - - content = response.read() - read_finish = time.time() - if report_text: - print(report_text, 'Latency:', response_time - start_time, ' Read time:', read_finish - response_time) - encodings = response.getheader('Content-Encoding', default='identity').replace(' ', '').split(',') - for encoding in reversed(encodings): - if encoding == 'identity': - continue - if encoding == 'br': - content = brotli.decompress(content) - elif encoding == 'gzip': - content = gzip.decompress(content) - return content - -mobile_ua = (('User-Agent', 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1'),) - -def dict_add(*dicts): - for dictionary in dicts[1:]: - dicts[0].update(dictionary) - return dicts[0] - -def video_id(url): - url_parts = urllib.parse.urlparse(url) - return urllib.parse.parse_qs(url_parts.query)['v'][0] - -def uppercase_escape(s): - return re.sub( - r'\\U([0-9a-fA-F]{8})', - lambda m: chr(int(m.group(1), base=16)), s) - -def default_multi_get(object, *keys, default): - ''' Like dict.get(), but for nested dictionaries/sequences, supporting keys or indices. Last argument is the default value to use in case of any IndexErrors or KeyErrors ''' - try: - for key in keys: - object = object[key] - return object - except (IndexError, KeyError): - return default - -def get_plain_text(node): - try: - return html.escape(node['simpleText']) - except KeyError: - return unformmated_text_runs(node['runs']) - -def unformmated_text_runs(runs): - result = '' - for text_run in runs: - result += html.escape(text_run["text"]) - return result - -def format_text_runs(runs): - if isinstance(runs, str): - return runs - result = '' - for text_run in runs: - if text_run.get("bold", False): - result += "" + html.escape(text_run["text"]) + "" - elif text_run.get('italics', False): - result += "" + html.escape(text_run["text"]) + "" - else: - result += html.escape(text_run["text"]) - return result - -# default, sddefault, mqdefault, hqdefault, hq720 -def get_thumbnail_url(video_id): - return "/i.ytimg.com/vi/" + video_id + "/mqdefault.jpg" - -def seconds_to_timestamp(seconds): - seconds = int(seconds) - hours, seconds = divmod(seconds,3600) - minutes, seconds = divmod(seconds,60) - if hours != 0: - timestamp = str(hours) + ":" - timestamp += str(minutes).zfill(2) # zfill pads with zeros - else: - timestamp = str(minutes) - - timestamp += ":" + str(seconds).zfill(2) - return timestamp - -# playlists: - -# id -# title -# url -# author -# author_url -# thumbnail -# description -# updated -# size -# first_video_id -def medium_playlist_item_info(playlist_renderer): - renderer = playlist_renderer - try: - author_url = URL_ORIGIN + renderer['longBylineText']['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'] - except KeyError: # radioRenderer - author_url = '' - try: - thumbnail = renderer['thumbnails'][0]['thumbnails'][0]['url'] - except KeyError: - thumbnail = renderer['thumbnail']['thumbnails'][0]['url'] - return { - "title": renderer["title"]["simpleText"], - 'id': renderer["playlistId"], - 'size': renderer.get('videoCount', '50+'), - "author": default_multi_get(renderer,'longBylineText','runs',0,'text', default='Youtube'), - "author_url": author_url, - 'thumbnail': thumbnail, - } - -def medium_video_item_info(video_renderer): - renderer = video_renderer - try: - return { - "title": renderer["title"]["simpleText"], - "id": renderer["videoId"], - "description": renderer.get("descriptionSnippet",dict()).get('runs',[]), # a list of text runs (formmated), rather than plain text - "thumbnail": get_thumbnail_url(renderer["videoId"]), - "views": renderer['viewCountText'].get('simpleText', None) or renderer['viewCountText']['runs'][0]['text'], - "duration": default_multi_get(renderer, 'lengthText', 'simpleText', default=''), # livestreams dont have a length - "author": renderer['longBylineText']['runs'][0]['text'], - "author_url": URL_ORIGIN + renderer['longBylineText']['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'], - "published": default_multi_get(renderer, 'publishedTimeText', 'simpleText', default=''), - } - except KeyError: - print(renderer) - raise - -def small_video_item_info(compact_video_renderer): - renderer = compact_video_renderer - return { - "title": renderer['title']['simpleText'], - "id": renderer['videoId'], - "views": renderer['viewCountText'].get('simpleText', None) or renderer['viewCountText']['runs'][0]['text'], - "duration": default_multi_get(renderer, 'lengthText', 'simpleText', default=''), # livestreams dont have a length - "author": renderer['longBylineText']['runs'][0]['text'], - "author_url": renderer['longBylineText']['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'], - } - - -# ----- -# HTML -# ----- - -def small_video_item_html(item): - video_info = json.dumps({key: item[key] for key in ('id', 'title', 'author', 'duration')}) - return small_video_item_template.substitute( - title = html.escape(item["title"]), - views = item["views"], - author = html.escape(item["author"]), - duration = item["duration"], - url = URL_ORIGIN + "/watch?v=" + item["id"], - thumbnail = get_thumbnail_url(item['id']), - video_info = html.escape(json.dumps(video_info)), - ) - -def small_playlist_item_html(item): - return small_playlist_item_template.substitute( - title=html.escape(item["title"]), - size = item['size'], - author="", - url = URL_ORIGIN + "/playlist?list=" + item["id"], - thumbnail= get_thumbnail_url(item['first_video_id']), - ) - -def medium_playlist_item_html(item): - return medium_playlist_item_template.substitute( - title=html.escape(item["title"]), - size = item['size'], - author=item['author'], - author_url= URL_ORIGIN + item['author_url'], - url = URL_ORIGIN + "/playlist?list=" + item["id"], - thumbnail= item['thumbnail'], - ) - -def medium_video_item_html(medium_video_info): - info = medium_video_info - - return medium_video_item_template.substitute( - title=html.escape(info["title"]), - views=info["views"], - published = info["published"], - description = format_text_runs(info["description"]), - author=html.escape(info["author"]), - author_url=info["author_url"], - duration=info["duration"], - url = URL_ORIGIN + "/watch?v=" + info["id"], - thumbnail=info['thumbnail'], - datetime='', # TODO - ) - -html_functions = { - 'compactVideoRenderer': lambda x: small_video_item_html(small_video_item_info(x)), - 'videoRenderer': lambda x: medium_video_item_html(medium_video_item_info(x)), - 'compactPlaylistRenderer': lambda x: small_playlist_item_html(small_playlist_item_info(x)), - 'playlistRenderer': lambda x: medium_playlist_item_html(medium_playlist_item_info(x)), - 'channelRenderer': lambda x: '', - 'radioRenderer': lambda x: medium_playlist_item_html(medium_playlist_item_info(x)), - 'compactRadioRenderer': lambda x: small_playlist_item_html(small_playlist_item_info(x)), - 'didYouMeanRenderer': lambda x: '', -} - - - - - - - -def get_url(node): - try: - return node['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'] - except KeyError: - return node['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'] - - -def get_text(node): - try: - return node['simpleText'] - except KeyError: - return node['runs'][0]['text'] - -def get_formatted_text(node): - try: - return node['runs'] - except KeyError: - return node['simpleText'] - -def get_badges(node): - badges = [] - for badge_node in node: - badge = badge_node['metadataBadgeRenderer']['label'] - if badge.lower() != 'new': - badges.append(badge) - return badges - -def get_thumbnail(node): - try: - return node['thumbnails'][0]['url'] # polymer format - except KeyError: - return node['url'] # ajax format - -dispatch = { - -# polymer format - 'title': ('title', get_text), - 'publishedTimeText': ('published', get_text), - 'videoId': ('id', lambda node: node), - 'descriptionSnippet': ('description', get_formatted_text), - 'lengthText': ('duration', get_text), - 'thumbnail': ('thumbnail', get_thumbnail), - 'thumbnails': ('thumbnail', lambda node: node[0]['thumbnails'][0]['url']), - - 'videoCountText': ('size', get_text), - 'playlistId': ('id', lambda node: node), - - 'subscriberCountText': ('subscriber_count', get_text), - 'channelId': ('id', lambda node: node), - 'badges': ('badges', get_badges), - -# ajax format - 'view_count_text': ('views', get_text), - 'num_videos_text': ('size', lambda node: get_text(node).split(' ')[0]), - 'owner_text': ('author', get_text), - 'owner_endpoint': ('author_url', lambda node: node['url']), - 'description': ('description', get_formatted_text), - 'index': ('playlist_index', get_text), - 'short_byline': ('author', get_text), - 'length': ('duration', get_text), - 'video_id': ('id', lambda node: node), - -} - -def renderer_info(renderer): - try: - info = {} - if 'viewCountText' in renderer: # prefer this one as it contains all the digits - info['views'] = get_text(renderer['viewCountText']) - elif 'shortViewCountText' in renderer: - info['views'] = get_text(renderer['shortViewCountText']) - - for key, node in renderer.items(): - if key in ('longBylineText', 'shortBylineText'): - info['author'] = get_text(node) - try: - info['author_url'] = get_url(node) - except KeyError: - pass - - continue - - try: - simple_key, function = dispatch[key] - except KeyError: - continue - info[simple_key] = function(node) - return info - except KeyError: - print(renderer) - raise - -def ajax_info(item_json): - try: - info = {} - for key, node in item_json.items(): - try: - simple_key, function = dispatch[key] - except KeyError: - continue - info[simple_key] = function(node) - return info - except KeyError: - print(item_json) - raise - -def badges_html(badges): - return ' | '.join(map(html.escape, badges)) - - - - - -html_transform_dispatch = { - 'title': html.escape, - 'published': html.escape, - 'id': html.escape, - 'description': format_text_runs, - 'duration': html.escape, - 'thumbnail': lambda url: html.escape('/' + url.lstrip('/')), - 'size': html.escape, - 'author': html.escape, - 'author_url': lambda url: html.escape(URL_ORIGIN + url), - 'views': html.escape, - 'subscriber_count': html.escape, - 'badges': badges_html, - 'playlist_index': html.escape, -} - -def get_html_ready(item): - html_ready = {} - for key, value in item.items(): - try: - function = html_transform_dispatch[key] - except KeyError: - continue - html_ready[key] = function(value) - return html_ready - - -author_template_url = Template('''
    By $author
    ''') -author_template = Template('''
    By $author
    ''') -stat_templates = ( - Template('''$views'''), - Template(''''''), -) -def get_video_stats(html_ready): - stats = [] - if 'author' in html_ready: - if 'author_url' in html_ready: - stats.append(author_template_url.substitute(html_ready)) - else: - stats.append(author_template.substitute(html_ready)) - for stat in stat_templates: - try: - stats.append(stat.strict_substitute(html_ready)) - except KeyError: - pass - return ' | '.join(stats) - -def video_item_html(item, template): - html_ready = get_html_ready(item) - video_info = {} - for key in ('id', 'title', 'author'): - try: - video_info[key] = html_ready[key] - except KeyError: - video_info[key] = '' - try: - video_info['duration'] = html_ready['duration'] - except KeyError: - video_info['duration'] = 'Live' # livestreams don't have a duration - - html_ready['video_info'] = html.escape(json.dumps(video_info) ) - html_ready['url'] = URL_ORIGIN + "/watch?v=" + html_ready['id'] - html_ready['datetime'] = '' #TODO - - html_ready['stats'] = get_video_stats(html_ready) - - return template.substitute(html_ready) - - -def playlist_item_html(item, template): - html_ready = get_html_ready(item) - - html_ready['url'] = URL_ORIGIN + "/playlist?list=" + html_ready['id'] - html_ready['datetime'] = '' #TODO - return template.substitute(html_ready) - - - - - - -def make_query_string(query_string): - return '&'.join(key + '=' + ','.join(values) for key,values in query_string.items()) - -def update_query_string(query_string, items): - parameters = urllib.parse.parse_qs(query_string) - parameters.update(items) - return make_query_string(parameters) - -page_button_template = Template('''$page''') -current_page_button_template = Template('''
    $page
    ''') - -def page_buttons_html(current_page, estimated_pages, url, current_query_string): - if current_page <= 5: - page_start = 1 - page_end = min(9, estimated_pages) - else: - page_start = current_page - 4 - page_end = min(current_page + 4, estimated_pages) - - result = "" - for page in range(page_start, page_end+1): - if page == current_page: - template = current_page_button_template - else: - template = page_button_template - result += template.substitute(page=page, href = url + "?" + update_query_string(current_query_string, {'page': [str(page)]}) ) - return result - - - - - - - -showing_results_for = Template(''' -
    -
    Showing results for $corrected_query
    -
    Search instead for $original_query
    -
    -''') - -did_you_mean = Template(''' -
    -
    Did you mean $corrected_query
    -
    -''') - -def renderer_html(renderer, additional_info={}, current_query_string=''): - type = list(renderer.keys())[0] - renderer = renderer[type] - if type in ('videoRenderer', 'playlistRenderer', 'radioRenderer', 'compactVideoRenderer', 'compactPlaylistRenderer', 'compactRadioRenderer', 'gridVideoRenderer', 'gridPlaylistRenderer', 'gridRadioRenderer'): - info = renderer_info(renderer) - info.update(additional_info) - if type == 'compactVideoRenderer': - return video_item_html(info, small_video_item_template) - if type in ('compactPlaylistRenderer', 'compactRadioRenderer'): - return playlist_item_html(info, small_playlist_item_template) - if type in ('videoRenderer', 'gridVideoRenderer'): - return video_item_html(info, medium_video_item_template) - if type in ('playlistRenderer', 'gridPlaylistRenderer', 'radioRenderer', 'gridRadioRenderer'): - return playlist_item_html(info, medium_playlist_item_template) - - if type == 'channelRenderer': - info = renderer_info(renderer) - html_ready = get_html_ready(info) - html_ready['url'] = URL_ORIGIN + "/channel/" + html_ready['id'] - return medium_channel_item_template.substitute(html_ready) - - if type == 'movieRenderer': - return '' - print(renderer) - raise NotImplementedError('Unknown renderer type: ' + type) - - -'videoRenderer' -'playlistRenderer' -'channelRenderer' -'radioRenderer' -'gridVideoRenderer' -'gridPlaylistRenderer' - -'didYouMeanRenderer' -'showingResultsForRenderer' +from youtube.template import Template +import html +import json +import re +import urllib.parse +import gzip +import brotli +import time + + +URL_ORIGIN = "/https://www.youtube.com" + + +# videos (all of type str): + +# id +# title +# url +# author +# author_url +# thumbnail +# description +# published +# duration +# likes +# dislikes +# views +# playlist_index + +# playlists: + +# id +# title +# url +# author +# author_url +# thumbnail +# description +# updated +# size +# first_video_id + + + + + + + +page_button_template = Template('''$page''') +current_page_button_template = Template('''
    $page''') + +medium_playlist_item_template = Template(''' + +''') +medium_video_item_template = Template(''' +
    + + + $duration + + + $title + +
    $stats
    + + + $description + $badges +
    +''') + +small_video_item_template = Template(''' +
    +
    + + + $duration + + $title + +
    $author
    + $views + +
    + +
    +''') + +small_playlist_item_template = Template(''' +
    +
    + + +
    + $size +
    +
    + $title + +
    $author
    +
    +
    +''') + +medium_channel_item_template = Template(''' +
    + + + $duration + + + $title + + $subscriber_count + $size + + $description +
    +''') + + +def fetch_url(url, headers=(), timeout=5, report_text=None): + if isinstance(headers, list): + headers += [('Accept-Encoding', 'gzip, br')] + headers = dict(headers) + elif isinstance(headers, tuple): + headers += (('Accept-Encoding', 'gzip, br'),) + headers = dict(headers) + else: + headers = headers.copy() + headers['Accept-Encoding'] = 'gzip, br' + + start_time = time.time() + + req = urllib.request.Request(url, headers=headers) + response = urllib.request.urlopen(req, timeout=timeout) + response_time = time.time() + + content = response.read() + read_finish = time.time() + if report_text: + print(report_text, 'Latency:', response_time - start_time, ' Read time:', read_finish - response_time) + encodings = response.getheader('Content-Encoding', default='identity').replace(' ', '').split(',') + for encoding in reversed(encodings): + if encoding == 'identity': + continue + if encoding == 'br': + content = brotli.decompress(content) + elif encoding == 'gzip': + content = gzip.decompress(content) + return content + +mobile_ua = (('User-Agent', 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1'),) + +def dict_add(*dicts): + for dictionary in dicts[1:]: + dicts[0].update(dictionary) + return dicts[0] + +def video_id(url): + url_parts = urllib.parse.urlparse(url) + return urllib.parse.parse_qs(url_parts.query)['v'][0] + +def uppercase_escape(s): + return re.sub( + r'\\U([0-9a-fA-F]{8})', + lambda m: chr(int(m.group(1), base=16)), s) + +def default_multi_get(object, *keys, default): + ''' Like dict.get(), but for nested dictionaries/sequences, supporting keys or indices. Last argument is the default value to use in case of any IndexErrors or KeyErrors ''' + try: + for key in keys: + object = object[key] + return object + except (IndexError, KeyError): + return default + +def get_plain_text(node): + try: + return html.escape(node['simpleText']) + except KeyError: + return unformmated_text_runs(node['runs']) + +def unformmated_text_runs(runs): + result = '' + for text_run in runs: + result += html.escape(text_run["text"]) + return result + +def format_text_runs(runs): + if isinstance(runs, str): + return runs + result = '' + for text_run in runs: + if text_run.get("bold", False): + result += "" + html.escape(text_run["text"]) + "" + elif text_run.get('italics', False): + result += "" + html.escape(text_run["text"]) + "" + else: + result += html.escape(text_run["text"]) + return result + +# default, sddefault, mqdefault, hqdefault, hq720 +def get_thumbnail_url(video_id): + return "/i.ytimg.com/vi/" + video_id + "/mqdefault.jpg" + +def seconds_to_timestamp(seconds): + seconds = int(seconds) + hours, seconds = divmod(seconds,3600) + minutes, seconds = divmod(seconds,60) + if hours != 0: + timestamp = str(hours) + ":" + timestamp += str(minutes).zfill(2) # zfill pads with zeros + else: + timestamp = str(minutes) + + timestamp += ":" + str(seconds).zfill(2) + return timestamp + +# playlists: + +# id +# title +# url +# author +# author_url +# thumbnail +# description +# updated +# size +# first_video_id +def medium_playlist_item_info(playlist_renderer): + renderer = playlist_renderer + try: + author_url = URL_ORIGIN + renderer['longBylineText']['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'] + except KeyError: # radioRenderer + author_url = '' + try: + thumbnail = renderer['thumbnails'][0]['thumbnails'][0]['url'] + except KeyError: + thumbnail = renderer['thumbnail']['thumbnails'][0]['url'] + return { + "title": renderer["title"]["simpleText"], + 'id': renderer["playlistId"], + 'size': renderer.get('videoCount', '50+'), + "author": default_multi_get(renderer,'longBylineText','runs',0,'text', default='Youtube'), + "author_url": author_url, + 'thumbnail': thumbnail, + } + +def medium_video_item_info(video_renderer): + renderer = video_renderer + try: + return { + "title": renderer["title"]["simpleText"], + "id": renderer["videoId"], + "description": renderer.get("descriptionSnippet",dict()).get('runs',[]), # a list of text runs (formmated), rather than plain text + "thumbnail": get_thumbnail_url(renderer["videoId"]), + "views": renderer['viewCountText'].get('simpleText', None) or renderer['viewCountText']['runs'][0]['text'], + "duration": default_multi_get(renderer, 'lengthText', 'simpleText', default=''), # livestreams dont have a length + "author": renderer['longBylineText']['runs'][0]['text'], + "author_url": URL_ORIGIN + renderer['longBylineText']['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'], + "published": default_multi_get(renderer, 'publishedTimeText', 'simpleText', default=''), + } + except KeyError: + print(renderer) + raise + +def small_video_item_info(compact_video_renderer): + renderer = compact_video_renderer + return { + "title": renderer['title']['simpleText'], + "id": renderer['videoId'], + "views": renderer['viewCountText'].get('simpleText', None) or renderer['viewCountText']['runs'][0]['text'], + "duration": default_multi_get(renderer, 'lengthText', 'simpleText', default=''), # livestreams dont have a length + "author": renderer['longBylineText']['runs'][0]['text'], + "author_url": renderer['longBylineText']['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'], + } + + +# ----- +# HTML +# ----- + +def small_video_item_html(item): + video_info = json.dumps({key: item[key] for key in ('id', 'title', 'author', 'duration')}) + return small_video_item_template.substitute( + title = html.escape(item["title"]), + views = item["views"], + author = html.escape(item["author"]), + duration = item["duration"], + url = URL_ORIGIN + "/watch?v=" + item["id"], + thumbnail = get_thumbnail_url(item['id']), + video_info = html.escape(json.dumps(video_info)), + ) + +def small_playlist_item_html(item): + return small_playlist_item_template.substitute( + title=html.escape(item["title"]), + size = item['size'], + author="", + url = URL_ORIGIN + "/playlist?list=" + item["id"], + thumbnail= get_thumbnail_url(item['first_video_id']), + ) + +def medium_playlist_item_html(item): + return medium_playlist_item_template.substitute( + title=html.escape(item["title"]), + size = item['size'], + author=item['author'], + author_url= URL_ORIGIN + item['author_url'], + url = URL_ORIGIN + "/playlist?list=" + item["id"], + thumbnail= item['thumbnail'], + ) + +def medium_video_item_html(medium_video_info): + info = medium_video_info + + return medium_video_item_template.substitute( + title=html.escape(info["title"]), + views=info["views"], + published = info["published"], + description = format_text_runs(info["description"]), + author=html.escape(info["author"]), + author_url=info["author_url"], + duration=info["duration"], + url = URL_ORIGIN + "/watch?v=" + info["id"], + thumbnail=info['thumbnail'], + datetime='', # TODO + ) + +html_functions = { + 'compactVideoRenderer': lambda x: small_video_item_html(small_video_item_info(x)), + 'videoRenderer': lambda x: medium_video_item_html(medium_video_item_info(x)), + 'compactPlaylistRenderer': lambda x: small_playlist_item_html(small_playlist_item_info(x)), + 'playlistRenderer': lambda x: medium_playlist_item_html(medium_playlist_item_info(x)), + 'channelRenderer': lambda x: '', + 'radioRenderer': lambda x: medium_playlist_item_html(medium_playlist_item_info(x)), + 'compactRadioRenderer': lambda x: small_playlist_item_html(small_playlist_item_info(x)), + 'didYouMeanRenderer': lambda x: '', +} + + + + + + + +def get_url(node): + try: + return node['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'] + except KeyError: + return node['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'] + + +def get_text(node): + try: + return node['simpleText'] + except KeyError: + return node['runs'][0]['text'] + +def get_formatted_text(node): + try: + return node['runs'] + except KeyError: + return node['simpleText'] + +def get_badges(node): + badges = [] + for badge_node in node: + badge = badge_node['metadataBadgeRenderer']['label'] + if badge.lower() != 'new': + badges.append(badge) + return badges + +def get_thumbnail(node): + try: + return node['thumbnails'][0]['url'] # polymer format + except KeyError: + return node['url'] # ajax format + +dispatch = { + +# polymer format + 'title': ('title', get_text), + 'publishedTimeText': ('published', get_text), + 'videoId': ('id', lambda node: node), + 'descriptionSnippet': ('description', get_formatted_text), + 'lengthText': ('duration', get_text), + 'thumbnail': ('thumbnail', get_thumbnail), + 'thumbnails': ('thumbnail', lambda node: node[0]['thumbnails'][0]['url']), + + 'videoCountText': ('size', get_text), + 'playlistId': ('id', lambda node: node), + + 'subscriberCountText': ('subscriber_count', get_text), + 'channelId': ('id', lambda node: node), + 'badges': ('badges', get_badges), + +# ajax format + 'view_count_text': ('views', get_text), + 'num_videos_text': ('size', lambda node: get_text(node).split(' ')[0]), + 'owner_text': ('author', get_text), + 'owner_endpoint': ('author_url', lambda node: node['url']), + 'description': ('description', get_formatted_text), + 'index': ('playlist_index', get_text), + 'short_byline': ('author', get_text), + 'length': ('duration', get_text), + 'video_id': ('id', lambda node: node), + +} + +def renderer_info(renderer): + try: + info = {} + if 'viewCountText' in renderer: # prefer this one as it contains all the digits + info['views'] = get_text(renderer['viewCountText']) + elif 'shortViewCountText' in renderer: + info['views'] = get_text(renderer['shortViewCountText']) + + for key, node in renderer.items(): + if key in ('longBylineText', 'shortBylineText'): + info['author'] = get_text(node) + try: + info['author_url'] = get_url(node) + except KeyError: + pass + + continue + + try: + simple_key, function = dispatch[key] + except KeyError: + continue + info[simple_key] = function(node) + return info + except KeyError: + print(renderer) + raise + +def ajax_info(item_json): + try: + info = {} + for key, node in item_json.items(): + try: + simple_key, function = dispatch[key] + except KeyError: + continue + info[simple_key] = function(node) + return info + except KeyError: + print(item_json) + raise + +def badges_html(badges): + return ' | '.join(map(html.escape, badges)) + + + + + +html_transform_dispatch = { + 'title': html.escape, + 'published': html.escape, + 'id': html.escape, + 'description': format_text_runs, + 'duration': html.escape, + 'thumbnail': lambda url: html.escape('/' + url.lstrip('/')), + 'size': html.escape, + 'author': html.escape, + 'author_url': lambda url: html.escape(URL_ORIGIN + url), + 'views': html.escape, + 'subscriber_count': html.escape, + 'badges': badges_html, + 'playlist_index': html.escape, +} + +def get_html_ready(item): + html_ready = {} + for key, value in item.items(): + try: + function = html_transform_dispatch[key] + except KeyError: + continue + html_ready[key] = function(value) + return html_ready + + +author_template_url = Template('''
    By $author
    ''') +author_template = Template('''
    By $author
    ''') +stat_templates = ( + Template('''$views'''), + Template(''''''), +) +def get_video_stats(html_ready): + stats = [] + if 'author' in html_ready: + if 'author_url' in html_ready: + stats.append(author_template_url.substitute(html_ready)) + else: + stats.append(author_template.substitute(html_ready)) + for stat in stat_templates: + try: + stats.append(stat.strict_substitute(html_ready)) + except KeyError: + pass + return ' | '.join(stats) + +def video_item_html(item, template): + html_ready = get_html_ready(item) + video_info = {} + for key in ('id', 'title', 'author'): + try: + video_info[key] = html_ready[key] + except KeyError: + video_info[key] = '' + try: + video_info['duration'] = html_ready['duration'] + except KeyError: + video_info['duration'] = 'Live' # livestreams don't have a duration + + html_ready['video_info'] = html.escape(json.dumps(video_info) ) + html_ready['url'] = URL_ORIGIN + "/watch?v=" + html_ready['id'] + html_ready['datetime'] = '' #TODO + + html_ready['stats'] = get_video_stats(html_ready) + + return template.substitute(html_ready) + + +def playlist_item_html(item, template): + html_ready = get_html_ready(item) + + html_ready['url'] = URL_ORIGIN + "/playlist?list=" + html_ready['id'] + html_ready['datetime'] = '' #TODO + return template.substitute(html_ready) + + + + + + +def make_query_string(query_string): + return '&'.join(key + '=' + ','.join(values) for key,values in query_string.items()) + +def update_query_string(query_string, items): + parameters = urllib.parse.parse_qs(query_string) + parameters.update(items) + return make_query_string(parameters) + +page_button_template = Template('''$page''') +current_page_button_template = Template('''
    $page
    ''') + +def page_buttons_html(current_page, estimated_pages, url, current_query_string): + if current_page <= 5: + page_start = 1 + page_end = min(9, estimated_pages) + else: + page_start = current_page - 4 + page_end = min(current_page + 4, estimated_pages) + + result = "" + for page in range(page_start, page_end+1): + if page == current_page: + template = current_page_button_template + else: + template = page_button_template + result += template.substitute(page=page, href = url + "?" + update_query_string(current_query_string, {'page': [str(page)]}) ) + return result + + + + + + + +showing_results_for = Template(''' +
    +
    Showing results for $corrected_query
    +
    Search instead for $original_query
    +
    +''') + +did_you_mean = Template(''' +
    +
    Did you mean $corrected_query
    +
    +''') + +def renderer_html(renderer, additional_info={}, current_query_string=''): + type = list(renderer.keys())[0] + renderer = renderer[type] + if type in ('videoRenderer', 'playlistRenderer', 'radioRenderer', 'compactVideoRenderer', 'compactPlaylistRenderer', 'compactRadioRenderer', 'gridVideoRenderer', 'gridPlaylistRenderer', 'gridRadioRenderer'): + info = renderer_info(renderer) + info.update(additional_info) + if type == 'compactVideoRenderer': + return video_item_html(info, small_video_item_template) + if type in ('compactPlaylistRenderer', 'compactRadioRenderer'): + return playlist_item_html(info, small_playlist_item_template) + if type in ('videoRenderer', 'gridVideoRenderer'): + return video_item_html(info, medium_video_item_template) + if type in ('playlistRenderer', 'gridPlaylistRenderer', 'radioRenderer', 'gridRadioRenderer'): + return playlist_item_html(info, medium_playlist_item_template) + + if type == 'channelRenderer': + info = renderer_info(renderer) + html_ready = get_html_ready(info) + html_ready['url'] = URL_ORIGIN + "/channel/" + html_ready['id'] + return medium_channel_item_template.substitute(html_ready) + + if type == 'movieRenderer': + return '' + print(renderer) + raise NotImplementedError('Unknown renderer type: ' + type) + + +'videoRenderer' +'playlistRenderer' +'channelRenderer' +'radioRenderer' +'gridVideoRenderer' +'gridPlaylistRenderer' + +'didYouMeanRenderer' +'showingResultsForRenderer' diff --git a/youtube/opensearch.xml b/youtube/opensearch.xml index 1764138..c9de40c 100644 --- a/youtube/opensearch.xml +++ b/youtube/opensearch.xml @@ -1,11 +1,11 @@ - -Youtube local -no CIA shit in the background -UTF-8 - - - - - -http://localhost/youtube.com/search + +Youtube local +no CIA shit in the background +UTF-8 + + + + + +http://localhost/youtube.com/search \ No newline at end of file diff --git a/youtube/playlist.py b/youtube/playlist.py index fc09191..592d1b4 100644 --- a/youtube/playlist.py +++ b/youtube/playlist.py @@ -1,243 +1,243 @@ -import base64 -import youtube.common as common -import urllib -import json -from string import Template -import youtube.proto as proto -import gevent -import math - -with open("yt_playlist_template.html", "r") as file: - yt_playlist_template = Template(file.read()) - - - - - - -def youtube_obfuscated_endian(offset): - if offset < 128: - return bytes((offset,)) - first_byte = 255 & offset - second_byte = 255 & (offset >> 7) - second_byte = second_byte | 1 - - # The next 2 bytes encode the offset in little endian order, - # BUT, it's done in a strange way. The least significant bit (LSB) of the second byte is not part - # of the offset. Instead, to get the number which the two bytes encode, that LSB - # of the second byte is combined with the most significant bit (MSB) of the first byte - # in a logical AND. Replace the two bits with the result of the AND to get the two little endian - # bytes that represent the offset. - - return bytes((first_byte, second_byte)) - - - -# just some garbage that's required, don't know what it means, if it means anything. -ctoken_header = b'\xe2\xa9\x85\xb2\x02' # e2 a9 85 b2 02 - -def byte(x): - return bytes((x,)) - -# TL;DR: the offset is hidden inside 3 nested base 64 encodes with random junk data added on the side periodically -def create_ctoken(playlist_id, offset): - obfuscated_offset = b'\x08' + youtube_obfuscated_endian(offset) # 0x08 slapped on for no apparent reason - obfuscated_offset = b'PT:' + base64.urlsafe_b64encode(obfuscated_offset).replace(b'=', b'') - obfuscated_offset = b'z' + byte(len(obfuscated_offset)) + obfuscated_offset - obfuscated_offset = base64.urlsafe_b64encode(obfuscated_offset).replace(b'=', b'%3D') - - playlist_bytes = b'VL' + bytes(playlist_id, 'ascii') - main_info = b'\x12' + byte(len(playlist_bytes)) + playlist_bytes + b'\x1a' + byte(len(obfuscated_offset)) + obfuscated_offset - - ctoken = base64.urlsafe_b64encode(ctoken_header + byte(len(main_info)) + main_info) - - return ctoken.decode('ascii') - -def playlist_ctoken(playlist_id, offset): - - offset = proto.uint(1, offset) - # this is just obfuscation as far as I can tell. It doesn't even follow protobuf - offset = b'PT:' + proto.unpadded_b64encode(offset) - offset = proto.string(15, offset) - - continuation_info = proto.string( 3, proto.percent_b64encode(offset) ) - - playlist_id = proto.string(2, 'VL' + playlist_id ) - pointless_nest = proto.string(80226972, playlist_id + continuation_info) - - return base64.urlsafe_b64encode(pointless_nest).decode('ascii') - -# initial request types: -# polymer_json: https://m.youtube.com/playlist?list=PLv3TTBr1W_9tppikBxAE_G6qjWdBljBHJ&pbj=1&lact=0 -# ajax json: https://m.youtube.com/playlist?list=PLv3TTBr1W_9tppikBxAE_G6qjWdBljBHJ&pbj=1&lact=0 with header X-YouTube-Client-Version: 1.20180418 - - -# continuation request types: -# polymer_json: https://m.youtube.com/playlist?&ctoken=[...]&pbj=1 -# ajax json: https://m.youtube.com/playlist?action_continuation=1&ajax=1&ctoken=[...] - - -headers_1 = ( - ('Accept', '*/*'), - ('Accept-Language', 'en-US,en;q=0.5'), - ('X-YouTube-Client-Name', '1'), - ('X-YouTube-Client-Version', '2.20180614'), -) - -def playlist_first_page(playlist_id): - url = 'https://m.youtube.com/playlist?list=' + playlist_id + '&ajax=1&disable_polymer=true' - content = common.fetch_url(url, common.mobile_ua + headers_1) - if content[0:4] == b")]}'": - content = content[4:] - content = json.loads(common.uppercase_escape(content.decode('utf-8'))) - return content - -ajax_info_dispatch = { - 'view_count_text': ('views', common.get_text), - 'num_videos_text': ('size', lambda node: common.get_text(node).split(' ')[0]), - 'thumbnail': ('thumbnail', lambda node: node.url), - 'title': ('title', common.get_text), - 'owner_text': ('author', common.get_text), - 'owner_endpoint': ('author_url', lambda node: node.url), - 'description': ('description', common.get_formatted_text), - -} -def metadata_info(ajax_json): - info = {} - try: - for key, node in ajax_json.items(): - try: - simple_key, function = dispatch[key] - except KeyError: - continue - info[simple_key] = function(node) - return info - except (KeyError,IndexError): - print(ajax_json) - raise - - - - -#https://m.youtube.com/playlist?itct=CBMQybcCIhMIptj9xJaJ2wIV2JKcCh3Idwu-&ctoken=4qmFsgI2EiRWTFBMT3kwajlBdmxWWlB0bzZJa2pLZnB1MFNjeC0tN1BHVEMaDmVnWlFWRHBEUWxFJTNE&pbj=1 -def get_videos_ajax(playlist_id, page): - - url = "https://m.youtube.com/playlist?action_continuation=1&ajax=1&ctoken=" + playlist_ctoken(playlist_id, (int(page)-1)*20) - headers = { - 'User-Agent': ' Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1', - 'Accept': '*/*', - 'Accept-Language': 'en-US,en;q=0.5', - 'X-YouTube-Client-Name': '2', - 'X-YouTube-Client-Version': '1.20180508', - } - print("Sending playlist ajax request") - content = common.fetch_url(url, headers) - with open('playlist_debug', 'wb') as f: - f.write(content) - content = content[4:] - print("Finished recieving playlist response") - - info = json.loads(common.uppercase_escape(content.decode('utf-8'))) - return info - -def get_playlist_videos(ajax_json): - videos = [] - #info = get_bloated_playlist_videos(playlist_id, page) - #print(info) - video_list = ajax_json['content']['continuation_contents']['contents'] - - - for video_json_crap in video_list: - try: - videos.append({ - "title": video_json_crap["title"]['runs'][0]['text'], - "id": video_json_crap["video_id"], - "views": "", - "duration": common.default_multi_get(video_json_crap, 'length', 'runs', 0, 'text', default=''), # livestreams dont have a length - "author": video_json_crap['short_byline']['runs'][0]['text'], - "author_url": '', - "published": '', - 'playlist_index': '', - - }) - except (KeyError, IndexError): - print(video_json_crap) - raise - return videos - -def get_playlist_videos_format2(playlist_id, page): - videos = [] - info = get_bloated_playlist_videos(playlist_id, page) - video_list = info['response']['continuationContents']['playlistVideoListContinuation']['contents'] - - for video_json_crap in video_list: - - video_json_crap = video_json_crap['videoRenderer'] - - try: - videos.append({ - "title": video_json_crap["title"]['runs'][0]['text'], - "video_id": video_json_crap["videoId"], - "views": "", - "duration": common.default_multi_get(video_json_crap, 'lengthText', 'runs', 0, 'text', default=''), # livestreams dont have a length - "uploader": video_json_crap['shortBylineText']['runs'][0]['text'], - "uploader_url": common.ORIGIN_URL + video_json_crap['shortBylineText']['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'], - "published": common.default_multi_get(video_json_crap, 'publishedTimeText', 'simpleText', default=''), - 'playlist_index': video_json_crap['index']['runs'][0]['text'], - - }) - except (KeyError, IndexError): - print(video_json_crap) - raise - return videos - - -def playlist_videos_html(ajax_json): - result = '' - for info in get_playlist_videos(ajax_json): - result += common.small_video_item_html(info) - return result - -playlist_stat_template = Template(''' -
    $stat
    ''') -def get_playlist_page(query_string): - parameters = urllib.parse.parse_qs(query_string) - playlist_id = parameters['list'][0] - page = parameters.get("page", "1")[0] - if page == "1": - first_page_json = playlist_first_page(playlist_id) - this_page_json = first_page_json - else: - tasks = ( - gevent.spawn(playlist_first_page, playlist_id ), - gevent.spawn(get_videos_ajax, playlist_id, page) - ) - gevent.joinall(tasks) - first_page_json, this_page_json = tasks[0].value, tasks[1].value - - try: - video_list = this_page_json['content']['section_list']['contents'][0]['contents'][0]['contents'] - except KeyError: - video_list = this_page_json['content']['continuation_contents']['contents'] - videos_html = '' - for video_json in video_list: - info = common.ajax_info(video_json) - videos_html += common.video_item_html(info, common.small_video_item_template) - - - metadata = common.ajax_info(first_page_json['content']['playlist_header']) - video_count = int(metadata['size'].replace(',', '')) - page_buttons = common.page_buttons_html(int(page), math.ceil(video_count/20), common.URL_ORIGIN + "/playlist", query_string) - - html_ready = common.get_html_ready(metadata) - html_ready['page_title'] = html_ready['title'] + ' - Page ' + str(page) - - stats = '' - stats += playlist_stat_template.substitute(stat=html_ready['size'] + ' videos') - stats += playlist_stat_template.substitute(stat=html_ready['views']) - return yt_playlist_template.substitute( - videos = videos_html, - page_buttons = page_buttons, - stats = stats, - **html_ready +import base64 +import youtube.common as common +import urllib +import json +from string import Template +import youtube.proto as proto +import gevent +import math + +with open("yt_playlist_template.html", "r") as file: + yt_playlist_template = Template(file.read()) + + + + + + +def youtube_obfuscated_endian(offset): + if offset < 128: + return bytes((offset,)) + first_byte = 255 & offset + second_byte = 255 & (offset >> 7) + second_byte = second_byte | 1 + + # The next 2 bytes encode the offset in little endian order, + # BUT, it's done in a strange way. The least significant bit (LSB) of the second byte is not part + # of the offset. Instead, to get the number which the two bytes encode, that LSB + # of the second byte is combined with the most significant bit (MSB) of the first byte + # in a logical AND. Replace the two bits with the result of the AND to get the two little endian + # bytes that represent the offset. + + return bytes((first_byte, second_byte)) + + + +# just some garbage that's required, don't know what it means, if it means anything. +ctoken_header = b'\xe2\xa9\x85\xb2\x02' # e2 a9 85 b2 02 + +def byte(x): + return bytes((x,)) + +# TL;DR: the offset is hidden inside 3 nested base 64 encodes with random junk data added on the side periodically +def create_ctoken(playlist_id, offset): + obfuscated_offset = b'\x08' + youtube_obfuscated_endian(offset) # 0x08 slapped on for no apparent reason + obfuscated_offset = b'PT:' + base64.urlsafe_b64encode(obfuscated_offset).replace(b'=', b'') + obfuscated_offset = b'z' + byte(len(obfuscated_offset)) + obfuscated_offset + obfuscated_offset = base64.urlsafe_b64encode(obfuscated_offset).replace(b'=', b'%3D') + + playlist_bytes = b'VL' + bytes(playlist_id, 'ascii') + main_info = b'\x12' + byte(len(playlist_bytes)) + playlist_bytes + b'\x1a' + byte(len(obfuscated_offset)) + obfuscated_offset + + ctoken = base64.urlsafe_b64encode(ctoken_header + byte(len(main_info)) + main_info) + + return ctoken.decode('ascii') + +def playlist_ctoken(playlist_id, offset): + + offset = proto.uint(1, offset) + # this is just obfuscation as far as I can tell. It doesn't even follow protobuf + offset = b'PT:' + proto.unpadded_b64encode(offset) + offset = proto.string(15, offset) + + continuation_info = proto.string( 3, proto.percent_b64encode(offset) ) + + playlist_id = proto.string(2, 'VL' + playlist_id ) + pointless_nest = proto.string(80226972, playlist_id + continuation_info) + + return base64.urlsafe_b64encode(pointless_nest).decode('ascii') + +# initial request types: +# polymer_json: https://m.youtube.com/playlist?list=PLv3TTBr1W_9tppikBxAE_G6qjWdBljBHJ&pbj=1&lact=0 +# ajax json: https://m.youtube.com/playlist?list=PLv3TTBr1W_9tppikBxAE_G6qjWdBljBHJ&pbj=1&lact=0 with header X-YouTube-Client-Version: 1.20180418 + + +# continuation request types: +# polymer_json: https://m.youtube.com/playlist?&ctoken=[...]&pbj=1 +# ajax json: https://m.youtube.com/playlist?action_continuation=1&ajax=1&ctoken=[...] + + +headers_1 = ( + ('Accept', '*/*'), + ('Accept-Language', 'en-US,en;q=0.5'), + ('X-YouTube-Client-Name', '1'), + ('X-YouTube-Client-Version', '2.20180614'), +) + +def playlist_first_page(playlist_id): + url = 'https://m.youtube.com/playlist?list=' + playlist_id + '&ajax=1&disable_polymer=true' + content = common.fetch_url(url, common.mobile_ua + headers_1) + if content[0:4] == b")]}'": + content = content[4:] + content = json.loads(common.uppercase_escape(content.decode('utf-8'))) + return content + +ajax_info_dispatch = { + 'view_count_text': ('views', common.get_text), + 'num_videos_text': ('size', lambda node: common.get_text(node).split(' ')[0]), + 'thumbnail': ('thumbnail', lambda node: node.url), + 'title': ('title', common.get_text), + 'owner_text': ('author', common.get_text), + 'owner_endpoint': ('author_url', lambda node: node.url), + 'description': ('description', common.get_formatted_text), + +} +def metadata_info(ajax_json): + info = {} + try: + for key, node in ajax_json.items(): + try: + simple_key, function = dispatch[key] + except KeyError: + continue + info[simple_key] = function(node) + return info + except (KeyError,IndexError): + print(ajax_json) + raise + + + + +#https://m.youtube.com/playlist?itct=CBMQybcCIhMIptj9xJaJ2wIV2JKcCh3Idwu-&ctoken=4qmFsgI2EiRWTFBMT3kwajlBdmxWWlB0bzZJa2pLZnB1MFNjeC0tN1BHVEMaDmVnWlFWRHBEUWxFJTNE&pbj=1 +def get_videos_ajax(playlist_id, page): + + url = "https://m.youtube.com/playlist?action_continuation=1&ajax=1&ctoken=" + playlist_ctoken(playlist_id, (int(page)-1)*20) + headers = { + 'User-Agent': ' Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1', + 'Accept': '*/*', + 'Accept-Language': 'en-US,en;q=0.5', + 'X-YouTube-Client-Name': '2', + 'X-YouTube-Client-Version': '1.20180508', + } + print("Sending playlist ajax request") + content = common.fetch_url(url, headers) + with open('playlist_debug', 'wb') as f: + f.write(content) + content = content[4:] + print("Finished recieving playlist response") + + info = json.loads(common.uppercase_escape(content.decode('utf-8'))) + return info + +def get_playlist_videos(ajax_json): + videos = [] + #info = get_bloated_playlist_videos(playlist_id, page) + #print(info) + video_list = ajax_json['content']['continuation_contents']['contents'] + + + for video_json_crap in video_list: + try: + videos.append({ + "title": video_json_crap["title"]['runs'][0]['text'], + "id": video_json_crap["video_id"], + "views": "", + "duration": common.default_multi_get(video_json_crap, 'length', 'runs', 0, 'text', default=''), # livestreams dont have a length + "author": video_json_crap['short_byline']['runs'][0]['text'], + "author_url": '', + "published": '', + 'playlist_index': '', + + }) + except (KeyError, IndexError): + print(video_json_crap) + raise + return videos + +def get_playlist_videos_format2(playlist_id, page): + videos = [] + info = get_bloated_playlist_videos(playlist_id, page) + video_list = info['response']['continuationContents']['playlistVideoListContinuation']['contents'] + + for video_json_crap in video_list: + + video_json_crap = video_json_crap['videoRenderer'] + + try: + videos.append({ + "title": video_json_crap["title"]['runs'][0]['text'], + "video_id": video_json_crap["videoId"], + "views": "", + "duration": common.default_multi_get(video_json_crap, 'lengthText', 'runs', 0, 'text', default=''), # livestreams dont have a length + "uploader": video_json_crap['shortBylineText']['runs'][0]['text'], + "uploader_url": common.ORIGIN_URL + video_json_crap['shortBylineText']['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'], + "published": common.default_multi_get(video_json_crap, 'publishedTimeText', 'simpleText', default=''), + 'playlist_index': video_json_crap['index']['runs'][0]['text'], + + }) + except (KeyError, IndexError): + print(video_json_crap) + raise + return videos + + +def playlist_videos_html(ajax_json): + result = '' + for info in get_playlist_videos(ajax_json): + result += common.small_video_item_html(info) + return result + +playlist_stat_template = Template(''' +
    $stat
    ''') +def get_playlist_page(query_string): + parameters = urllib.parse.parse_qs(query_string) + playlist_id = parameters['list'][0] + page = parameters.get("page", "1")[0] + if page == "1": + first_page_json = playlist_first_page(playlist_id) + this_page_json = first_page_json + else: + tasks = ( + gevent.spawn(playlist_first_page, playlist_id ), + gevent.spawn(get_videos_ajax, playlist_id, page) + ) + gevent.joinall(tasks) + first_page_json, this_page_json = tasks[0].value, tasks[1].value + + try: + video_list = this_page_json['content']['section_list']['contents'][0]['contents'][0]['contents'] + except KeyError: + video_list = this_page_json['content']['continuation_contents']['contents'] + videos_html = '' + for video_json in video_list: + info = common.ajax_info(video_json) + videos_html += common.video_item_html(info, common.small_video_item_template) + + + metadata = common.ajax_info(first_page_json['content']['playlist_header']) + video_count = int(metadata['size'].replace(',', '')) + page_buttons = common.page_buttons_html(int(page), math.ceil(video_count/20), common.URL_ORIGIN + "/playlist", query_string) + + html_ready = common.get_html_ready(metadata) + html_ready['page_title'] = html_ready['title'] + ' - Page ' + str(page) + + stats = '' + stats += playlist_stat_template.substitute(stat=html_ready['size'] + ' videos') + stats += playlist_stat_template.substitute(stat=html_ready['views']) + return yt_playlist_template.substitute( + videos = videos_html, + page_buttons = page_buttons, + stats = stats, + **html_ready ) \ No newline at end of file diff --git a/youtube/proto.py b/youtube/proto.py index 9f9dbcc..6230e51 100644 --- a/youtube/proto.py +++ b/youtube/proto.py @@ -1,65 +1,65 @@ -from math import ceil -import base64 - -def byte(n): - return bytes((n,)) - - -def varint_encode(offset): - '''In this encoding system, for each 8-bit byte, the first bit is 1 if there are more bytes, and 0 is this is the last one. - The next 7 bits are data. These 7-bit sections represent the data in Little endian order. For example, suppose the data is - aaaaaaabbbbbbbccccccc (each of these sections is 7 bits). It will be encoded as: - 1ccccccc 1bbbbbbb 0aaaaaaa - - This encoding is used in youtube parameters to encode offsets and to encode the length for length-prefixed data. - See https://developers.google.com/protocol-buffers/docs/encoding#varints for more info.''' - needed_bytes = ceil(offset.bit_length()/7) or 1 # (0).bit_length() returns 0, but we need 1 in that case. - encoded_bytes = bytearray(needed_bytes) - for i in range(0, needed_bytes - 1): - encoded_bytes[i] = (offset & 127) | 128 # 7 least significant bits - offset = offset >> 7 - encoded_bytes[-1] = offset & 127 # leave first bit as zero for last byte - - return bytes(encoded_bytes) - - -def varint_decode(encoded): - decoded = 0 - for i, byte in enumerate(encoded): - decoded |= (byte & 127) << 7*i - - if not (byte & 128): - break - return decoded - - -def string(field_number, data): - data = as_bytes(data) - return _proto_field(2, field_number, varint_encode(len(data)) + data) -nested = string - -def uint(field_number, value): - return _proto_field(0, field_number, varint_encode(value)) - - - - -def _proto_field(wire_type, field_number, data): - ''' See https://developers.google.com/protocol-buffers/docs/encoding#structure ''' - return varint_encode( (field_number << 3) | wire_type) + data - - - -def percent_b64encode(data): - return base64.urlsafe_b64encode(data).replace(b'=', b'%3D') - - -def unpadded_b64encode(data): - return base64.urlsafe_b64encode(data).replace(b'=', b'') - -def as_bytes(value): - if isinstance(value, str): - return value.encode('ascii') - return value - +from math import ceil +import base64 + +def byte(n): + return bytes((n,)) + + +def varint_encode(offset): + '''In this encoding system, for each 8-bit byte, the first bit is 1 if there are more bytes, and 0 is this is the last one. + The next 7 bits are data. These 7-bit sections represent the data in Little endian order. For example, suppose the data is + aaaaaaabbbbbbbccccccc (each of these sections is 7 bits). It will be encoded as: + 1ccccccc 1bbbbbbb 0aaaaaaa + + This encoding is used in youtube parameters to encode offsets and to encode the length for length-prefixed data. + See https://developers.google.com/protocol-buffers/docs/encoding#varints for more info.''' + needed_bytes = ceil(offset.bit_length()/7) or 1 # (0).bit_length() returns 0, but we need 1 in that case. + encoded_bytes = bytearray(needed_bytes) + for i in range(0, needed_bytes - 1): + encoded_bytes[i] = (offset & 127) | 128 # 7 least significant bits + offset = offset >> 7 + encoded_bytes[-1] = offset & 127 # leave first bit as zero for last byte + + return bytes(encoded_bytes) + + +def varint_decode(encoded): + decoded = 0 + for i, byte in enumerate(encoded): + decoded |= (byte & 127) << 7*i + + if not (byte & 128): + break + return decoded + + +def string(field_number, data): + data = as_bytes(data) + return _proto_field(2, field_number, varint_encode(len(data)) + data) +nested = string + +def uint(field_number, value): + return _proto_field(0, field_number, varint_encode(value)) + + + + +def _proto_field(wire_type, field_number, data): + ''' See https://developers.google.com/protocol-buffers/docs/encoding#structure ''' + return varint_encode( (field_number << 3) | wire_type) + data + + + +def percent_b64encode(data): + return base64.urlsafe_b64encode(data).replace(b'=', b'%3D') + + +def unpadded_b64encode(data): + return base64.urlsafe_b64encode(data).replace(b'=', b'') + +def as_bytes(value): + if isinstance(value, str): + return value.encode('ascii') + return value + \ No newline at end of file diff --git a/youtube/search.py b/youtube/search.py index 5268dbe..5982d9b 100644 --- a/youtube/search.py +++ b/youtube/search.py @@ -1,231 +1,231 @@ -import json -import urllib -import html -from string import Template -import base64 -from math import ceil -from youtube.common import default_multi_get, get_thumbnail_url, URL_ORIGIN -import youtube.common as common - -with open("yt_search_results_template.html", "r") as file: - yt_search_results_template = file.read() - -with open("yt_search_template.html", "r") as file: - yt_search_template = file.read() - -page_button_template = Template('''$page''') -current_page_button_template = Template('''
    $page
    ''') -video_result_template = ''' -
    - - - $length - - - $video_title - -
    Uploaded by $uploader
    - $views - - - - - $description -
    -''' - - - -# Sort: 1 - # Upload date: 2 - # View count: 3 - # Rating: 1 -# Offset: 9 -# Filters: 2 - # Upload date: 1 - # Type: 2 - # Duration: 3 - - -features = { - '4k': 14, - 'hd': 4, - 'hdr': 25, - 'subtitles': 5, - 'creative_commons': 6, - '3d': 7, - 'live': 8, - 'purchased': 9, - '360': 15, - 'location': 23, -} - -def page_number_to_sp_parameter(page): - offset = (int(page) - 1)*20 # 20 results per page - first_byte = 255 & offset - second_byte = 255 & (offset >> 7) - second_byte = second_byte | 1 - - # 0b01001000 is required, and is always the same. - # The next 2 bytes encode the offset in little endian order, - # BUT, it's done in a strange way. The least significant bit (LSB) of the second byte is not part - # of the offset. Instead, to get the number which the two bytes encode, that LSB - # of the second byte is combined with the most significant bit (MSB) of the first byte - # in a logical AND. Replace the two bits with the result of the AND to get the two little endian - # bytes that represent the offset. - # I figured this out by trial and error on the sp parameter. I don't know why it's done like this; - # perhaps it's just obfuscation. - param_bytes = bytes((0b01001000, first_byte, second_byte)) - param_encoded = urllib.parse.quote(base64.urlsafe_b64encode(param_bytes)) - return param_encoded - -def get_search_json(query, page): - url = "https://www.youtube.com/results?search_query=" + urllib.parse.quote_plus(query) - headers = { - 'Host': 'www.youtube.com', - 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)', - 'Accept': '*/*', - 'Accept-Language': 'en-US,en;q=0.5', - 'X-YouTube-Client-Name': '1', - 'X-YouTube-Client-Version': '2.20180418', - } - url += "&pbj=1&sp=" + page_number_to_sp_parameter(page) - content = common.fetch_url(url, headers=headers) - info = json.loads(content) - return info - -"""def get_search_info(query, page): - result_info = dict() - info = get_bloated_search_info(query, page) - - estimated_results = int(info[1]['response']['estimatedResults']) - estimated_pages = ceil(estimated_results/20) - result_info['estimated_results'] = estimated_results - result_info['estimated_pages'] = estimated_pages - - result_info['results'] = [] - # this is what you get when you hire H-1B's - video_list = info[1]['response']['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'] - - - for video_json_crap in video_list: - # they have a dictionary whose only content is another dictionary... - try: - type = list(video_json_crap.keys())[0] - except KeyError: - continue #channelRenderer or playlistRenderer - '''description = "" - for text_run in video_json_crap["descriptionSnippet"]["runs"]: - if text_run.get("bold", False): - description += "" + html.escape''' - try: - result_info['results'].append({ - "title": video_json_crap["title"]["simpleText"], - "video_id": video_json_crap["videoId"], - "description": video_json_crap.get("descriptionSnippet",dict()).get('runs',[]), # a list of text runs (formmated), rather than plain text - "thumbnail": get_thumbnail_url(video_json_crap["videoId"]), - "views_text": video_json_crap['viewCountText'].get('simpleText', None) or video_json_crap['viewCountText']['runs'][0]['text'], - "length_text": default_multi_get(video_json_crap, 'lengthText', 'simpleText', default=''), # livestreams dont have a length - "uploader": video_json_crap['longBylineText']['runs'][0]['text'], - "uploader_url": URL_ORIGIN + video_json_crap['longBylineText']['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'], - "published_time_text": default_multi_get(video_json_crap, 'publishedTimeText', 'simpleText', default=''), - - }) - except KeyError: - print(video_json_crap) - raise - return result_info""" - - -def page_buttons_html(page_start, page_end, current_page, query): - result = "" - for page in range(page_start, page_end+1): - if page == current_page: - template = current_page_button_template - else: - template = page_button_template - result += template.substitute(page=page, href=URL_ORIGIN + "/search?query=" + urllib.parse.quote_plus(query) + "&page=" + str(page)) - return result - -showing_results_for = Template(''' -
    Showing results for $corrected_query
    -
    Search instead for $original_query
    -''') -did_you_mean = Template(''' -
    Did you mean $corrected_query
    -''') -def get_search_page(query_string, parameters=()): - qs_query = urllib.parse.parse_qs(query_string) - if len(qs_query) == 0: - return yt_search_template - query = qs_query["query"][0] - page = qs_query.get("page", "1")[0] - - info = get_search_json(query, page) - - estimated_results = int(info[1]['response']['estimatedResults']) - estimated_pages = ceil(estimated_results/20) - results = info[1]['response']['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'] - - corrections = '' - result_list_html = "" - for renderer in results: - type = list(renderer.keys())[0] - if type == 'shelfRenderer': - continue - if type == 'didYouMeanRenderer': - renderer = renderer[type] - corrected_query_string = urllib.parse.parse_qs(query_string) - corrected_query_string['query'] = [renderer['correctedQueryEndpoint']['searchEndpoint']['query']] - corrected_query_url = URL_ORIGIN + '/search?' + common.make_query_string(corrected_query_string) - corrections = did_you_mean.substitute( - corrected_query_url = corrected_query_url, - corrected_query = common.format_text_runs(renderer['correctedQuery']['runs']), - ) - continue - if type == 'showingResultsForRenderer': - renderer = renderer[type] - no_autocorrect_query_string = urllib.parse.parse_qs(query_string) - no_autocorrect_query_string['autocorrect'] = ['0'] - no_autocorrect_query_url = URL_ORIGIN + '/search?' + common.make_query_string(no_autocorrect_query_string) - corrections = showing_results_for.substitute( - corrected_query = common.format_text_runs(renderer['correctedQuery']['runs']), - original_query_url = no_autocorrect_query_url, - original_query = html.escape(renderer['originalQuery']['simpleText']), - ) - continue - result_list_html += common.renderer_html(renderer, current_query_string=query_string) - '''type = list(result.keys())[0] - result = result[type] - if type == "showingResultsForRenderer": - url = URL_ORIGIN + "/search" - if len(parameters) > 0: - url += ';' + ';'.join(parameters) - url += '?' + '&'.join(key + '=' + ','.join(values) for key,values in qs_query.items()) - - result_list_html += showing_results_for_template.substitute( - corrected_query=common.format_text_runs(result['correctedQuery']['runs']), - - ) - else: - result_list_html += common.html_functions[type](result)''' - - page = int(page) - if page <= 5: - page_start = 1 - page_end = min(9, estimated_pages) - else: - page_start = page - 4 - page_end = min(page + 4, estimated_pages) - - - result = Template(yt_search_results_template).substitute( - results = result_list_html, - page_title = query + " - Search", - search_box_value = html.escape(query), - number_of_results = '{:,}'.format(estimated_results), - number_of_pages = '{:,}'.format(estimated_pages), - page_buttons = page_buttons_html(page_start, page_end, page, query), - corrections = corrections - ) +import json +import urllib +import html +from string import Template +import base64 +from math import ceil +from youtube.common import default_multi_get, get_thumbnail_url, URL_ORIGIN +import youtube.common as common + +with open("yt_search_results_template.html", "r") as file: + yt_search_results_template = file.read() + +with open("yt_search_template.html", "r") as file: + yt_search_template = file.read() + +page_button_template = Template('''$page''') +current_page_button_template = Template('''
    $page
    ''') +video_result_template = ''' +
    + + + $length + + + $video_title + +
    Uploaded by $uploader
    + $views + + + + + $description +
    +''' + + + +# Sort: 1 + # Upload date: 2 + # View count: 3 + # Rating: 1 +# Offset: 9 +# Filters: 2 + # Upload date: 1 + # Type: 2 + # Duration: 3 + + +features = { + '4k': 14, + 'hd': 4, + 'hdr': 25, + 'subtitles': 5, + 'creative_commons': 6, + '3d': 7, + 'live': 8, + 'purchased': 9, + '360': 15, + 'location': 23, +} + +def page_number_to_sp_parameter(page): + offset = (int(page) - 1)*20 # 20 results per page + first_byte = 255 & offset + second_byte = 255 & (offset >> 7) + second_byte = second_byte | 1 + + # 0b01001000 is required, and is always the same. + # The next 2 bytes encode the offset in little endian order, + # BUT, it's done in a strange way. The least significant bit (LSB) of the second byte is not part + # of the offset. Instead, to get the number which the two bytes encode, that LSB + # of the second byte is combined with the most significant bit (MSB) of the first byte + # in a logical AND. Replace the two bits with the result of the AND to get the two little endian + # bytes that represent the offset. + # I figured this out by trial and error on the sp parameter. I don't know why it's done like this; + # perhaps it's just obfuscation. + param_bytes = bytes((0b01001000, first_byte, second_byte)) + param_encoded = urllib.parse.quote(base64.urlsafe_b64encode(param_bytes)) + return param_encoded + +def get_search_json(query, page): + url = "https://www.youtube.com/results?search_query=" + urllib.parse.quote_plus(query) + headers = { + 'Host': 'www.youtube.com', + 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)', + 'Accept': '*/*', + 'Accept-Language': 'en-US,en;q=0.5', + 'X-YouTube-Client-Name': '1', + 'X-YouTube-Client-Version': '2.20180418', + } + url += "&pbj=1&sp=" + page_number_to_sp_parameter(page) + content = common.fetch_url(url, headers=headers) + info = json.loads(content) + return info + +"""def get_search_info(query, page): + result_info = dict() + info = get_bloated_search_info(query, page) + + estimated_results = int(info[1]['response']['estimatedResults']) + estimated_pages = ceil(estimated_results/20) + result_info['estimated_results'] = estimated_results + result_info['estimated_pages'] = estimated_pages + + result_info['results'] = [] + # this is what you get when you hire H-1B's + video_list = info[1]['response']['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'] + + + for video_json_crap in video_list: + # they have a dictionary whose only content is another dictionary... + try: + type = list(video_json_crap.keys())[0] + except KeyError: + continue #channelRenderer or playlistRenderer + '''description = "" + for text_run in video_json_crap["descriptionSnippet"]["runs"]: + if text_run.get("bold", False): + description += "" + html.escape''' + try: + result_info['results'].append({ + "title": video_json_crap["title"]["simpleText"], + "video_id": video_json_crap["videoId"], + "description": video_json_crap.get("descriptionSnippet",dict()).get('runs',[]), # a list of text runs (formmated), rather than plain text + "thumbnail": get_thumbnail_url(video_json_crap["videoId"]), + "views_text": video_json_crap['viewCountText'].get('simpleText', None) or video_json_crap['viewCountText']['runs'][0]['text'], + "length_text": default_multi_get(video_json_crap, 'lengthText', 'simpleText', default=''), # livestreams dont have a length + "uploader": video_json_crap['longBylineText']['runs'][0]['text'], + "uploader_url": URL_ORIGIN + video_json_crap['longBylineText']['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'], + "published_time_text": default_multi_get(video_json_crap, 'publishedTimeText', 'simpleText', default=''), + + }) + except KeyError: + print(video_json_crap) + raise + return result_info""" + + +def page_buttons_html(page_start, page_end, current_page, query): + result = "" + for page in range(page_start, page_end+1): + if page == current_page: + template = current_page_button_template + else: + template = page_button_template + result += template.substitute(page=page, href=URL_ORIGIN + "/search?query=" + urllib.parse.quote_plus(query) + "&page=" + str(page)) + return result + +showing_results_for = Template(''' +
    Showing results for $corrected_query
    +
    Search instead for $original_query
    +''') +did_you_mean = Template(''' +
    Did you mean $corrected_query
    +''') +def get_search_page(query_string, parameters=()): + qs_query = urllib.parse.parse_qs(query_string) + if len(qs_query) == 0: + return yt_search_template + query = qs_query["query"][0] + page = qs_query.get("page", "1")[0] + + info = get_search_json(query, page) + + estimated_results = int(info[1]['response']['estimatedResults']) + estimated_pages = ceil(estimated_results/20) + results = info[1]['response']['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'] + + corrections = '' + result_list_html = "" + for renderer in results: + type = list(renderer.keys())[0] + if type == 'shelfRenderer': + continue + if type == 'didYouMeanRenderer': + renderer = renderer[type] + corrected_query_string = urllib.parse.parse_qs(query_string) + corrected_query_string['query'] = [renderer['correctedQueryEndpoint']['searchEndpoint']['query']] + corrected_query_url = URL_ORIGIN + '/search?' + common.make_query_string(corrected_query_string) + corrections = did_you_mean.substitute( + corrected_query_url = corrected_query_url, + corrected_query = common.format_text_runs(renderer['correctedQuery']['runs']), + ) + continue + if type == 'showingResultsForRenderer': + renderer = renderer[type] + no_autocorrect_query_string = urllib.parse.parse_qs(query_string) + no_autocorrect_query_string['autocorrect'] = ['0'] + no_autocorrect_query_url = URL_ORIGIN + '/search?' + common.make_query_string(no_autocorrect_query_string) + corrections = showing_results_for.substitute( + corrected_query = common.format_text_runs(renderer['correctedQuery']['runs']), + original_query_url = no_autocorrect_query_url, + original_query = html.escape(renderer['originalQuery']['simpleText']), + ) + continue + result_list_html += common.renderer_html(renderer, current_query_string=query_string) + '''type = list(result.keys())[0] + result = result[type] + if type == "showingResultsForRenderer": + url = URL_ORIGIN + "/search" + if len(parameters) > 0: + url += ';' + ';'.join(parameters) + url += '?' + '&'.join(key + '=' + ','.join(values) for key,values in qs_query.items()) + + result_list_html += showing_results_for_template.substitute( + corrected_query=common.format_text_runs(result['correctedQuery']['runs']), + + ) + else: + result_list_html += common.html_functions[type](result)''' + + page = int(page) + if page <= 5: + page_start = 1 + page_end = min(9, estimated_pages) + else: + page_start = page - 4 + page_end = min(page + 4, estimated_pages) + + + result = Template(yt_search_results_template).substitute( + results = result_list_html, + page_title = query + " - Search", + search_box_value = html.escape(query), + number_of_results = '{:,}'.format(estimated_results), + number_of_pages = '{:,}'.format(estimated_pages), + page_buttons = page_buttons_html(page_start, page_end, page, query), + corrections = corrections + ) return result \ No newline at end of file diff --git a/youtube/shared.css b/youtube/shared.css index 39e76f4..2ea511a 100644 --- a/youtube/shared.css +++ b/youtube/shared.css @@ -1,271 +1,271 @@ -h1, h2, h3, h4, h5, h6, div{ - margin:0; - padding:0; - -} - - -body{ - margin:0; - padding: 0; - color:#222; - - - background-color:#cccccc; - - min-height:100vh; - - display:grid; - grid-template-rows: 50px 1fr; -} - - header{ - background-color:#333333; - - grid-row: 1; - } - - main{ - grid-row: 2; - } - -button{ - padding:0; /* Fuck browser-specific styling. Fix your shit mozilla */ -} -address{ - font-style:normal; -} -#site-search{ - display: grid; - grid-template-columns: 1fr 0fr; - -} - - #site-search .search-box{ - align-self:center; - height:25px; - border:0; - - grid-column: 1; - } - #site-search .search-button{ - grid-column: 2; - align-self:center; - height:25px; - - border-style:solid; - border-width:1px; - } - - -.full-item{ - display: grid; - grid-template-rows: 0fr 0fr 0fr 0fr 0fr; - grid-template-columns: 1fr 1fr; - -} - .full-item video{ - grid-column: 1 / span 2; - grid-row: 1; - } - .full-item .title{ - grid-column: 1 / span 2; - grid-row:2; - min-width: 0; - } - .full-item address{ - grid-column: 1; - grid-row: 3; - justify-self: start; - } - .full-item .views{ - grid-column: 2; - grid-row: 3; - justify-self:end; - } - .full-item time{ - grid-column: 1; - grid-row: 4; - justify-self:start; - } - .full-item .likes-dislikes{ - grid-column: 2; - grid-row: 4; - justify-self:end; - } - .full-item .description{ - background-color:#d0d0d0; - margin-top:8px; - white-space: pre-line; - min-width: 0; - - grid-column: 1 / span 2; - grid-row: 5; - } - -.medium-item{ - background-color:#bcbcbc; - display: grid; - align-content: start; - grid-template-columns: 246px 1fr 0fr; - grid-template-rows: 0fr 0fr 0fr 0fr 0fr 1fr; -} - .medium-item .title{ - grid-column:2 / span 2; - grid-row:1; - min-width: 0; - } - .medium-item address{ - display:inline; - } - /*.medium-item .views{ - grid-column: 3; - grid-row: 2; - justify-self:end; - } - .medium-item time{ - grid-column: 2; - grid-row: 3; - justify-self:start; - }*/ - .medium-item .stats{ - grid-column: 2 / span 2; - grid-row: 2; - } - - .medium-item .description{ - grid-column: 2 / span 2; - grid-row: 4; - } - .medium-item .badges{ - grid-column: 2 / span 2; - grid-row: 5; - } - /* thumbnail size */ - .medium-item img{ - /*height:138px; - width:246px;*/ - height:100%; - justify-self:center; - } - -.small-item-box{ - color: #767676; - font-size: 12px; - - display:grid; - grid-template-columns: 1fr 0fr; - grid-template-rows: 94px; -} - -.small-item{ - background-color:#bcbcbc; - align-content: start; - text-decoration:none; - - display: grid; - grid-template-columns: 168px 1fr; - grid-column-gap: 5px; - grid-template-rows: 0fr 0fr 0fr 1fr; -} - .small-item .title{ - grid-column:2; - grid-row:1; - margin:0; - - color: #333; - font-size: 16px; - font-weight: 500; - text-decoration:initial; - min-width: 0; - } - .small-item address{ - grid-column: 2; - grid-row: 2; - justify-self: start; - } - - .small-item .views{ - grid-column: 2; - grid-row: 3; - justify-self:start; - } - /* thumbnail size */ - .small-item img{ - /*height:94px; - width:168px;*/ - height:100%; - justify-self:center; - } - -.item-checkbox{ - justify-self:start; - align-self:center; - height:30px; - width:30px; - - grid-column: 2; -} - -/* ---Thumbnails for videos---- */ -.video-thumbnail-box{ - grid-column:1; - grid-row:1 / span 6; - - display:grid; - grid-template-columns: 1fr 0fr; -} - .video-thumbnail-img{ - grid-column:1 / span 2; - grid-row:1; - } - .video-duration{ - grid-column: 2; - grid-row: 1; - align-self: end; - opacity: .8; - color: #ffffff; - font-size: 12px; - background-color: #000000; - } - -/* ---Thumbnails for playlists---- */ -.playlist-thumbnail-box{ - grid-column:1; - grid-row:1 / span 5; - - display:grid; - grid-template-columns: 3fr 2fr; -} - .playlist-thumbnail-img{ - grid-column:1 / span 2; - grid-row:1; - } - .playlist-thumbnail-info{ - grid-column:2; - grid-row:1; - - display: grid; - align-items:center; - - text-align:center; - white-space: pre-line; - opacity: .8; - color: #cfcfcf; - background-color: #000000; - } - -.page-button-row{ - justify-self:center; - display: grid; - grid-auto-columns: 40px; - grid-auto-flow: column; - height: 40px; -} - .page-button{ - background-color: #e9e9e9; - border-style: outset; - border-width: 2px; - font-weight: bold; - text-align: center; +h1, h2, h3, h4, h5, h6, div{ + margin:0; + padding:0; + +} + + +body{ + margin:0; + padding: 0; + color:#222; + + + background-color:#cccccc; + + min-height:100vh; + + display:grid; + grid-template-rows: 50px 1fr; +} + + header{ + background-color:#333333; + + grid-row: 1; + } + + main{ + grid-row: 2; + } + +button{ + padding:0; /* Fuck browser-specific styling. Fix your shit mozilla */ +} +address{ + font-style:normal; +} +#site-search{ + display: grid; + grid-template-columns: 1fr 0fr; + +} + + #site-search .search-box{ + align-self:center; + height:25px; + border:0; + + grid-column: 1; + } + #site-search .search-button{ + grid-column: 2; + align-self:center; + height:25px; + + border-style:solid; + border-width:1px; + } + + +.full-item{ + display: grid; + grid-template-rows: 0fr 0fr 0fr 0fr 0fr; + grid-template-columns: 1fr 1fr; + +} + .full-item video{ + grid-column: 1 / span 2; + grid-row: 1; + } + .full-item .title{ + grid-column: 1 / span 2; + grid-row:2; + min-width: 0; + } + .full-item address{ + grid-column: 1; + grid-row: 3; + justify-self: start; + } + .full-item .views{ + grid-column: 2; + grid-row: 3; + justify-self:end; + } + .full-item time{ + grid-column: 1; + grid-row: 4; + justify-self:start; + } + .full-item .likes-dislikes{ + grid-column: 2; + grid-row: 4; + justify-self:end; + } + .full-item .description{ + background-color:#d0d0d0; + margin-top:8px; + white-space: pre-line; + min-width: 0; + + grid-column: 1 / span 2; + grid-row: 5; + } + +.medium-item{ + background-color:#bcbcbc; + display: grid; + align-content: start; + grid-template-columns: 246px 1fr 0fr; + grid-template-rows: 0fr 0fr 0fr 0fr 0fr 1fr; +} + .medium-item .title{ + grid-column:2 / span 2; + grid-row:1; + min-width: 0; + } + .medium-item address{ + display:inline; + } + /*.medium-item .views{ + grid-column: 3; + grid-row: 2; + justify-self:end; + } + .medium-item time{ + grid-column: 2; + grid-row: 3; + justify-self:start; + }*/ + .medium-item .stats{ + grid-column: 2 / span 2; + grid-row: 2; + } + + .medium-item .description{ + grid-column: 2 / span 2; + grid-row: 4; + } + .medium-item .badges{ + grid-column: 2 / span 2; + grid-row: 5; + } + /* thumbnail size */ + .medium-item img{ + /*height:138px; + width:246px;*/ + height:100%; + justify-self:center; + } + +.small-item-box{ + color: #767676; + font-size: 12px; + + display:grid; + grid-template-columns: 1fr 0fr; + grid-template-rows: 94px; +} + +.small-item{ + background-color:#bcbcbc; + align-content: start; + text-decoration:none; + + display: grid; + grid-template-columns: 168px 1fr; + grid-column-gap: 5px; + grid-template-rows: 0fr 0fr 0fr 1fr; +} + .small-item .title{ + grid-column:2; + grid-row:1; + margin:0; + + color: #333; + font-size: 16px; + font-weight: 500; + text-decoration:initial; + min-width: 0; + } + .small-item address{ + grid-column: 2; + grid-row: 2; + justify-self: start; + } + + .small-item .views{ + grid-column: 2; + grid-row: 3; + justify-self:start; + } + /* thumbnail size */ + .small-item img{ + /*height:94px; + width:168px;*/ + height:100%; + justify-self:center; + } + +.item-checkbox{ + justify-self:start; + align-self:center; + height:30px; + width:30px; + + grid-column: 2; +} + +/* ---Thumbnails for videos---- */ +.video-thumbnail-box{ + grid-column:1; + grid-row:1 / span 6; + + display:grid; + grid-template-columns: 1fr 0fr; +} + .video-thumbnail-img{ + grid-column:1 / span 2; + grid-row:1; + } + .video-duration{ + grid-column: 2; + grid-row: 1; + align-self: end; + opacity: .8; + color: #ffffff; + font-size: 12px; + background-color: #000000; + } + +/* ---Thumbnails for playlists---- */ +.playlist-thumbnail-box{ + grid-column:1; + grid-row:1 / span 5; + + display:grid; + grid-template-columns: 3fr 2fr; +} + .playlist-thumbnail-img{ + grid-column:1 / span 2; + grid-row:1; + } + .playlist-thumbnail-info{ + grid-column:2; + grid-row:1; + + display: grid; + align-items:center; + + text-align:center; + white-space: pre-line; + opacity: .8; + color: #cfcfcf; + background-color: #000000; + } + +.page-button-row{ + justify-self:center; + display: grid; + grid-auto-columns: 40px; + grid-auto-flow: column; + height: 40px; +} + .page-button{ + background-color: #e9e9e9; + border-style: outset; + border-width: 2px; + font-weight: bold; + text-align: center; } \ No newline at end of file diff --git a/youtube/subscriptions.py b/youtube/subscriptions.py index 5edf6fc..47f1ea3 100644 --- a/youtube/subscriptions.py +++ b/youtube/subscriptions.py @@ -1,18 +1,18 @@ -import urllib - -with open("subscriptions.txt", 'r', encoding='utf-8') as file: - subscriptions = file.read() - -# Line format: "channel_id channel_name" -# Example: -# UCYO_jab_esuFRV4b17AJtAw 3Blue1Brown - -subscriptions = ((line[0:24], line[25: ]) for line in subscriptions.splitlines()) - -def get_new_videos(): - for channel_id, channel_name in subscriptions: - - - - -def get_subscriptions_page(): +import urllib + +with open("subscriptions.txt", 'r', encoding='utf-8') as file: + subscriptions = file.read() + +# Line format: "channel_id channel_name" +# Example: +# UCYO_jab_esuFRV4b17AJtAw 3Blue1Brown + +subscriptions = ((line[0:24], line[25: ]) for line in subscriptions.splitlines()) + +def get_new_videos(): + for channel_id, channel_name in subscriptions: + + + + +def get_subscriptions_page(): diff --git a/youtube/template.py b/youtube/template.py index 7f13415..b6df1ef 100644 --- a/youtube/template.py +++ b/youtube/template.py @@ -1,132 +1,132 @@ - -import re as _re -from collections import ChainMap as _ChainMap - -class _TemplateMetaclass(type): - pattern = r""" - %(delim)s(?: - (?P%(delim)s) | # Escape sequence of two delimiters - (?P%(id)s) | # delimiter and a Python identifier - {(?P%(id)s)} | # delimiter and a braced identifier - (?P) # Other ill-formed delimiter exprs - ) - """ - - def __init__(cls, name, bases, dct): - super(_TemplateMetaclass, cls).__init__(name, bases, dct) - if 'pattern' in dct: - pattern = cls.pattern - else: - pattern = _TemplateMetaclass.pattern % { - 'delim' : _re.escape(cls.delimiter), - 'id' : cls.idpattern, - } - cls.pattern = _re.compile(pattern, cls.flags | _re.VERBOSE) - - -class Template(metaclass=_TemplateMetaclass): - """A string class for supporting $-substitutions.""" - - delimiter = '$' - idpattern = r'[_a-z][_a-z0-9]*' - flags = _re.IGNORECASE - - def __init__(self, template): - self.template = template - - # Search for $$, $identifier, ${identifier}, and any bare $'s - - def _invalid(self, mo): - i = mo.start('invalid') - lines = self.template[:i].splitlines(keepends=True) - if not lines: - colno = 1 - lineno = 1 - else: - colno = i - len(''.join(lines[:-1])) - lineno = len(lines) - raise ValueError('Invalid placeholder in string: line %d, col %d' % - (lineno, colno)) - - def substitute(*args, **kws): - if not args: - raise TypeError("descriptor 'substitute' of 'Template' object " - "needs an argument") - self, *args = args # allow the "self" keyword be passed - if len(args) > 1: - raise TypeError('Too many positional arguments') - if not args: - mapping = kws - elif kws: - mapping = _ChainMap(kws, args[0]) - else: - mapping = args[0] - # Helper function for .sub() - def convert(mo): - # Check the most common path first. - named = mo.group('named') or mo.group('braced') - if named is not None: - return str(mapping.get(named,'')) - if mo.group('escaped') is not None: - return self.delimiter - if mo.group('invalid') is not None: - self._invalid(mo) - raise ValueError('Unrecognized named group in pattern', - self.pattern) - return self.pattern.sub(convert, self.template) - - def strict_substitute(*args, **kws): - if not args: - raise TypeError("descriptor 'substitute' of 'Template' object " - "needs an argument") - self, *args = args # allow the "self" keyword be passed - if len(args) > 1: - raise TypeError('Too many positional arguments') - if not args: - mapping = kws - elif kws: - mapping = _ChainMap(kws, args[0]) - else: - mapping = args[0] - # Helper function for .sub() - def convert(mo): - # Check the most common path first. - named = mo.group('named') or mo.group('braced') - if named is not None: - return str(mapping[named]) - if mo.group('escaped') is not None: - return self.delimiter - if mo.group('invalid') is not None: - self._invalid(mo) - raise ValueError('Unrecognized named group in pattern', - self.pattern) - return self.pattern.sub(convert, self.template) - - def safe_substitute(*args, **kws): - if not args: - raise TypeError("descriptor 'safe_substitute' of 'Template' object " - "needs an argument") - self, *args = args # allow the "self" keyword be passed - if len(args) > 1: - raise TypeError('Too many positional arguments') - if not args: - mapping = kws - elif kws: - mapping = _ChainMap(kws, args[0]) - else: - mapping = args[0] - # Helper function for .sub() - def convert(mo): - named = mo.group('named') or mo.group('braced') - if named is not None: - try: - return str(mapping[named]) - except KeyError: - return mo.group() - if mo.group('escaped') is not None: - return self.delimiter - if mo.group('invalid') is not None: - return mo.group() - raise ValueError('Unrecognized named group in pattern', - self.pattern) + +import re as _re +from collections import ChainMap as _ChainMap + +class _TemplateMetaclass(type): + pattern = r""" + %(delim)s(?: + (?P%(delim)s) | # Escape sequence of two delimiters + (?P%(id)s) | # delimiter and a Python identifier + {(?P%(id)s)} | # delimiter and a braced identifier + (?P) # Other ill-formed delimiter exprs + ) + """ + + def __init__(cls, name, bases, dct): + super(_TemplateMetaclass, cls).__init__(name, bases, dct) + if 'pattern' in dct: + pattern = cls.pattern + else: + pattern = _TemplateMetaclass.pattern % { + 'delim' : _re.escape(cls.delimiter), + 'id' : cls.idpattern, + } + cls.pattern = _re.compile(pattern, cls.flags | _re.VERBOSE) + + +class Template(metaclass=_TemplateMetaclass): + """A string class for supporting $-substitutions.""" + + delimiter = '$' + idpattern = r'[_a-z][_a-z0-9]*' + flags = _re.IGNORECASE + + def __init__(self, template): + self.template = template + + # Search for $$, $identifier, ${identifier}, and any bare $'s + + def _invalid(self, mo): + i = mo.start('invalid') + lines = self.template[:i].splitlines(keepends=True) + if not lines: + colno = 1 + lineno = 1 + else: + colno = i - len(''.join(lines[:-1])) + lineno = len(lines) + raise ValueError('Invalid placeholder in string: line %d, col %d' % + (lineno, colno)) + + def substitute(*args, **kws): + if not args: + raise TypeError("descriptor 'substitute' of 'Template' object " + "needs an argument") + self, *args = args # allow the "self" keyword be passed + if len(args) > 1: + raise TypeError('Too many positional arguments') + if not args: + mapping = kws + elif kws: + mapping = _ChainMap(kws, args[0]) + else: + mapping = args[0] + # Helper function for .sub() + def convert(mo): + # Check the most common path first. + named = mo.group('named') or mo.group('braced') + if named is not None: + return str(mapping.get(named,'')) + if mo.group('escaped') is not None: + return self.delimiter + if mo.group('invalid') is not None: + self._invalid(mo) + raise ValueError('Unrecognized named group in pattern', + self.pattern) + return self.pattern.sub(convert, self.template) + + def strict_substitute(*args, **kws): + if not args: + raise TypeError("descriptor 'substitute' of 'Template' object " + "needs an argument") + self, *args = args # allow the "self" keyword be passed + if len(args) > 1: + raise TypeError('Too many positional arguments') + if not args: + mapping = kws + elif kws: + mapping = _ChainMap(kws, args[0]) + else: + mapping = args[0] + # Helper function for .sub() + def convert(mo): + # Check the most common path first. + named = mo.group('named') or mo.group('braced') + if named is not None: + return str(mapping[named]) + if mo.group('escaped') is not None: + return self.delimiter + if mo.group('invalid') is not None: + self._invalid(mo) + raise ValueError('Unrecognized named group in pattern', + self.pattern) + return self.pattern.sub(convert, self.template) + + def safe_substitute(*args, **kws): + if not args: + raise TypeError("descriptor 'safe_substitute' of 'Template' object " + "needs an argument") + self, *args = args # allow the "self" keyword be passed + if len(args) > 1: + raise TypeError('Too many positional arguments') + if not args: + mapping = kws + elif kws: + mapping = _ChainMap(kws, args[0]) + else: + mapping = args[0] + # Helper function for .sub() + def convert(mo): + named = mo.group('named') or mo.group('braced') + if named is not None: + try: + return str(mapping[named]) + except KeyError: + return mo.group() + if mo.group('escaped') is not None: + return self.delimiter + if mo.group('invalid') is not None: + return mo.group() + raise ValueError('Unrecognized named group in pattern', + self.pattern) return self.pattern.sub(convert, self.template) \ No newline at end of file diff --git a/youtube/watch.py b/youtube/watch.py index b8aa17d..6e1efbc 100644 --- a/youtube/watch.py +++ b/youtube/watch.py @@ -1,294 +1,294 @@ -from youtube_dl.YoutubeDL import YoutubeDL -import json -import urllib -from string import Template -import html -import youtube.common as common -from youtube.common import default_multi_get, get_thumbnail_url, video_id, URL_ORIGIN -import youtube.comments as comments -import gevent - -video_height_priority = (360, 480, 240, 720, 1080) - - -_formats = { - '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, - '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, - '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'}, - '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'}, - '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'}, - '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, - '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well - '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'}, - '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, - '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, - '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, - '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, - '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, - '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, - '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - - - # 3D videos - '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20}, - '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20}, - '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20}, - '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20}, - '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20}, - '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20}, - '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20}, - - # Apple HTTP Live Streaming - '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, - '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, - '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, - '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, - '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10}, - '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10}, - '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, - '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10}, - - # DASH mp4 video - '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559) - '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, - '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, - '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'}, - - # Dash mp4 audio - '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'}, - '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'}, - '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'}, - '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, - '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, - '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'}, - '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'}, - - # Dash webm - '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'}, - '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug) - '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, - '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, - '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, - '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, - - # Dash webm audio - '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128}, - '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256}, - - # Dash webm audio with opus inside - '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50}, - '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70}, - '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160}, - - # RTMP (unnamed) - '_rtmp': {'protocol': 'rtmp'}, -} - - - - -source_tag_template = Template(''' -''') - -with open("yt_watch_template.html", "r") as file: - yt_watch_template = Template(file.read()) - - - -# example: -#https://www.youtube.com/related_ajax?ctoken=CBQSJhILVGNxV29rOEF1YkXAAQDIAQDgAQGiAg0o____________AUAAGAAq0gEInJOqsOyB1tAaCNeMgaD4spLIKQioxdHSu8SF9JgBCLr27tnaioDpXwj1-L_R3s7r2wcIv8TnueeUo908CMXSganIrvHDJgiVuMirrqbgqYABCJDsu8PBzdGW8wEI_-WI2t-c-IlQCOK_m_KB_rP5wAEIl7S4serqnq5YCNSs55mMt8qLyQEImvutmp-x9LaCAQiVg96VpY_pqJMBCOPsgdTflsGRsQEI7ZfYleKIub0tCIrcsb7a_uu95gEIi9Gz6_bC76zEAQjo1c_W8JzlkhI%3D&continuation=CBQSJhILVGNxV29rOEF1YkXAAQDIAQDgAQGiAg0o____________AUAAGAAq0gEInJOqsOyB1tAaCNeMgaD4spLIKQioxdHSu8SF9JgBCLr27tnaioDpXwj1-L_R3s7r2wcIv8TnueeUo908CMXSganIrvHDJgiVuMirrqbgqYABCJDsu8PBzdGW8wEI_-WI2t-c-IlQCOK_m_KB_rP5wAEIl7S4serqnq5YCNSs55mMt8qLyQEImvutmp-x9LaCAQiVg96VpY_pqJMBCOPsgdTflsGRsQEI7ZfYleKIub0tCIrcsb7a_uu95gEIi9Gz6_bC76zEAQjo1c_W8JzlkhI%3D&itct=CCkQybcCIhMIg8PShInX2gIVgdvBCh15WA0ZKPgd -def get_bloated_more_related_videos(video_url, related_videos_token, id_token): - related_videos_token = urllib.parse.quote(related_videos_token) - url = "https://www.youtube.com/related_ajax?ctoken=" + related_videos_token + "&continuation=" + related_videos_token - headers = { - 'Host': 'www.youtube.com', - 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)', - 'Accept': '*/*', - 'Accept-Language': 'en-US,en;q=0.5', - 'Referer': video_url, - 'X-YouTube-Client-Name': '1', - 'X-YouTube-Client-Version': '2.20180418', - 'X-Youtube-Identity-Token': id_token, - - } - #print(url) - req = urllib.request.Request(url, headers=headers) - response = urllib.request.urlopen(req, timeout = 5) - content = response.read() - info = json.loads(content) - return info - -def get_more_related_videos_info(video_url, related_videos_token, id_token): - results = [] - info = get_bloated_more_related_videos(video_url, related_videos_token, id_token) - bloated_results = info[1]['response']['continuationContents']['watchNextSecondaryResultsContinuation']['results'] - for bloated_result in bloated_results: - bloated_result = bloated_result['compactVideoRenderer'] - results.append({ - "title": bloated_result['title']['simpleText'], - "video_id": bloated_result['videoId'], - "views_text": bloated_result['viewCountText']['simpleText'], - "length_text": default_multi_get(bloated_result, 'lengthText', 'simpleText', default=''), # livestreams dont have a length - "length_text": bloated_result['lengthText']['simpleText'], - "uploader_name": bloated_result['longBylineText']['runs'][0]['text'], - "uploader_url": bloated_result['longBylineText']['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'], - }) - return results - -def more_related_videos_html(video_info): - related_videos = get_related_videos(url, 1, video_info['related_videos_token'], video_info['id_token']) - - related_videos_html = "" - for video in related_videos: - related_videos_html += Template(video_related_template).substitute( - video_title=html.escape(video["title"]), - views=video["views_text"], - uploader=html.escape(video["uploader_name"]), - uploader_channel_url=video["uploader_url"], - length=video["length_text"], - video_url = "/youtube.com/watch?v=" + video["video_id"], - thumbnail_url= get_thumbnail_url(video['video_id']), - ) - return related_videos_html - - - -def get_related_items_html(info): - result = "" - for item in info['related_vids']: - if 'list' in item: # playlist: - result += common.small_playlist_item_html(watch_page_related_playlist_info(item)) - else: - result += common.small_video_item_html(watch_page_related_video_info(item)) - return result - - -# json of related items retrieved directly from the watch page has different names for everything -# converts these to standard names -def watch_page_related_video_info(item): - result = {key: item[key] for key in ('id', 'title', 'author')} - result['duration'] = common.seconds_to_timestamp(item['length_seconds']) - try: - result['views'] = item['short_view_count_text'] - except KeyError: - result['views'] = '' - return result - -def watch_page_related_playlist_info(item): - return { - 'size': item['playlist_length'] if item['playlist_length'] != "0" else "50+", - 'title': item['playlist_title'], - 'id': item['list'], - 'first_video_id': item['video_id'], - } - - -def sort_formats(info): - info['formats'].sort(key=lambda x: default_multi_get(_formats, x['format_id'], 'height', default=0)) - for index, format in enumerate(info['formats']): - if default_multi_get(_formats, format['format_id'], 'height', default=0) >= 360: - break - info['formats'] = info['formats'][index:] + info['formats'][0:index] - info['formats'] = [format for format in info['formats'] if format['acodec'] != 'none' and format['vcodec'] != 'none'] - -def formats_html(info): - result = '' - for format in info['formats']: - result += source_tag_template.substitute( - src=format['url'], - type='audio/' + format['ext'] if format['vcodec'] == "none" else 'video/' + format['ext'], - ) - return result - -def choose_format(info): - suitable_formats = [] - with open('teste.txt', 'w', encoding='utf-8') as f: - f.write(json.dumps(info['formats'])) - for format in info['formats']: - if (format["ext"] in ("mp4", "webm") - and format["acodec"] != "none" - and format["vcodec"] != "none" - and format.get("height","none") in video_height_priority): - suitable_formats.append(format) - - current_best = (suitable_formats[0],video_height_priority.index(suitable_formats[0]["height"])) - for format in suitable_formats: - video_priority_index = video_height_priority.index(format["height"]) - if video_priority_index < current_best[1]: - current_best = (format, video_priority_index) - return current_best[0] - -more_comments_template = Template('''More comments''') -def get_watch_page(query_string): - id = urllib.parse.parse_qs(query_string)['v'][0] - tasks = ( - gevent.spawn(comments.video_comments, id ), - gevent.spawn(YoutubeDL(params={'youtube_include_dash_manifest':False}).extract_info, "https://www.youtube.com/watch?v=" + id, download=False) - ) - gevent.joinall(tasks) - comments_info, info = tasks[0].value, tasks[1].value - comments_html, ctoken = comments_info - - if ctoken == '': - more_comments_button = '' - else: - more_comments_button = more_comments_template.substitute(url = URL_ORIGIN + '/comments?ctoken=' + ctoken) - #comments_html = comments.comments_html(video_id(url)) - #info = YoutubeDL().extract_info(url, download=False) - - #chosen_format = choose_format(info) - sort_formats(info) - - - - upload_year = info["upload_date"][0:4] - upload_month = info["upload_date"][4:6] - upload_day = info["upload_date"][6:8] - upload_date = upload_month + "/" + upload_day + "/" + upload_year - - related_videos_html = get_related_items_html(info) - - page = yt_watch_template.substitute( - video_title=html.escape(info["title"]), - page_title=html.escape(info["title"]), - uploader=html.escape(info["uploader"]), - uploader_channel_url='/' + info["uploader_url"], - #upload_date=datetime.datetime.fromtimestamp(info["timestamp"]).strftime("%d %b %Y %H:%M:%S"), - upload_date = upload_date, - views='{:,}'.format(info["view_count"]), - likes=(lambda x: '{:,}'.format(x) if x is not None else "")(info["like_count"]), - dislikes=(lambda x: '{:,}'.format(x) if x is not None else "")(info["dislike_count"]), - description=html.escape(info["description"]), - video_sources=formats_html(info), - related = related_videos_html, - comments=comments_html, - more_comments_button = more_comments_button, - ) +from youtube_dl.YoutubeDL import YoutubeDL +import json +import urllib +from string import Template +import html +import youtube.common as common +from youtube.common import default_multi_get, get_thumbnail_url, video_id, URL_ORIGIN +import youtube.comments as comments +import gevent + +video_height_priority = (360, 480, 240, 720, 1080) + + +_formats = { + '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, + '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, + '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'}, + '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'}, + '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'}, + '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, + '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well + '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'}, + '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, + '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, + '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, + '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, + '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, + '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, + '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + + + # 3D videos + '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20}, + '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20}, + '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20}, + '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20}, + '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20}, + '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20}, + '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20}, + + # Apple HTTP Live Streaming + '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, + '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, + '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, + '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, + '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10}, + '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10}, + '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, + '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10}, + + # DASH mp4 video + '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559) + '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, + '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, + '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'}, + + # Dash mp4 audio + '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'}, + '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'}, + '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'}, + '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, + '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, + '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'}, + '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'}, + + # Dash webm + '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'}, + '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug) + '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + + # Dash webm audio + '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128}, + '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256}, + + # Dash webm audio with opus inside + '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50}, + '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70}, + '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160}, + + # RTMP (unnamed) + '_rtmp': {'protocol': 'rtmp'}, +} + + + + +source_tag_template = Template(''' +''') + +with open("yt_watch_template.html", "r") as file: + yt_watch_template = Template(file.read()) + + + +# example: +#https://www.youtube.com/related_ajax?ctoken=CBQSJhILVGNxV29rOEF1YkXAAQDIAQDgAQGiAg0o____________AUAAGAAq0gEInJOqsOyB1tAaCNeMgaD4spLIKQioxdHSu8SF9JgBCLr27tnaioDpXwj1-L_R3s7r2wcIv8TnueeUo908CMXSganIrvHDJgiVuMirrqbgqYABCJDsu8PBzdGW8wEI_-WI2t-c-IlQCOK_m_KB_rP5wAEIl7S4serqnq5YCNSs55mMt8qLyQEImvutmp-x9LaCAQiVg96VpY_pqJMBCOPsgdTflsGRsQEI7ZfYleKIub0tCIrcsb7a_uu95gEIi9Gz6_bC76zEAQjo1c_W8JzlkhI%3D&continuation=CBQSJhILVGNxV29rOEF1YkXAAQDIAQDgAQGiAg0o____________AUAAGAAq0gEInJOqsOyB1tAaCNeMgaD4spLIKQioxdHSu8SF9JgBCLr27tnaioDpXwj1-L_R3s7r2wcIv8TnueeUo908CMXSganIrvHDJgiVuMirrqbgqYABCJDsu8PBzdGW8wEI_-WI2t-c-IlQCOK_m_KB_rP5wAEIl7S4serqnq5YCNSs55mMt8qLyQEImvutmp-x9LaCAQiVg96VpY_pqJMBCOPsgdTflsGRsQEI7ZfYleKIub0tCIrcsb7a_uu95gEIi9Gz6_bC76zEAQjo1c_W8JzlkhI%3D&itct=CCkQybcCIhMIg8PShInX2gIVgdvBCh15WA0ZKPgd +def get_bloated_more_related_videos(video_url, related_videos_token, id_token): + related_videos_token = urllib.parse.quote(related_videos_token) + url = "https://www.youtube.com/related_ajax?ctoken=" + related_videos_token + "&continuation=" + related_videos_token + headers = { + 'Host': 'www.youtube.com', + 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)', + 'Accept': '*/*', + 'Accept-Language': 'en-US,en;q=0.5', + 'Referer': video_url, + 'X-YouTube-Client-Name': '1', + 'X-YouTube-Client-Version': '2.20180418', + 'X-Youtube-Identity-Token': id_token, + + } + #print(url) + req = urllib.request.Request(url, headers=headers) + response = urllib.request.urlopen(req, timeout = 5) + content = response.read() + info = json.loads(content) + return info + +def get_more_related_videos_info(video_url, related_videos_token, id_token): + results = [] + info = get_bloated_more_related_videos(video_url, related_videos_token, id_token) + bloated_results = info[1]['response']['continuationContents']['watchNextSecondaryResultsContinuation']['results'] + for bloated_result in bloated_results: + bloated_result = bloated_result['compactVideoRenderer'] + results.append({ + "title": bloated_result['title']['simpleText'], + "video_id": bloated_result['videoId'], + "views_text": bloated_result['viewCountText']['simpleText'], + "length_text": default_multi_get(bloated_result, 'lengthText', 'simpleText', default=''), # livestreams dont have a length + "length_text": bloated_result['lengthText']['simpleText'], + "uploader_name": bloated_result['longBylineText']['runs'][0]['text'], + "uploader_url": bloated_result['longBylineText']['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'], + }) + return results + +def more_related_videos_html(video_info): + related_videos = get_related_videos(url, 1, video_info['related_videos_token'], video_info['id_token']) + + related_videos_html = "" + for video in related_videos: + related_videos_html += Template(video_related_template).substitute( + video_title=html.escape(video["title"]), + views=video["views_text"], + uploader=html.escape(video["uploader_name"]), + uploader_channel_url=video["uploader_url"], + length=video["length_text"], + video_url = "/youtube.com/watch?v=" + video["video_id"], + thumbnail_url= get_thumbnail_url(video['video_id']), + ) + return related_videos_html + + + +def get_related_items_html(info): + result = "" + for item in info['related_vids']: + if 'list' in item: # playlist: + result += common.small_playlist_item_html(watch_page_related_playlist_info(item)) + else: + result += common.small_video_item_html(watch_page_related_video_info(item)) + return result + + +# json of related items retrieved directly from the watch page has different names for everything +# converts these to standard names +def watch_page_related_video_info(item): + result = {key: item[key] for key in ('id', 'title', 'author')} + result['duration'] = common.seconds_to_timestamp(item['length_seconds']) + try: + result['views'] = item['short_view_count_text'] + except KeyError: + result['views'] = '' + return result + +def watch_page_related_playlist_info(item): + return { + 'size': item['playlist_length'] if item['playlist_length'] != "0" else "50+", + 'title': item['playlist_title'], + 'id': item['list'], + 'first_video_id': item['video_id'], + } + + +def sort_formats(info): + info['formats'].sort(key=lambda x: default_multi_get(_formats, x['format_id'], 'height', default=0)) + for index, format in enumerate(info['formats']): + if default_multi_get(_formats, format['format_id'], 'height', default=0) >= 360: + break + info['formats'] = info['formats'][index:] + info['formats'][0:index] + info['formats'] = [format for format in info['formats'] if format['acodec'] != 'none' and format['vcodec'] != 'none'] + +def formats_html(info): + result = '' + for format in info['formats']: + result += source_tag_template.substitute( + src=format['url'], + type='audio/' + format['ext'] if format['vcodec'] == "none" else 'video/' + format['ext'], + ) + return result + +def choose_format(info): + suitable_formats = [] + with open('teste.txt', 'w', encoding='utf-8') as f: + f.write(json.dumps(info['formats'])) + for format in info['formats']: + if (format["ext"] in ("mp4", "webm") + and format["acodec"] != "none" + and format["vcodec"] != "none" + and format.get("height","none") in video_height_priority): + suitable_formats.append(format) + + current_best = (suitable_formats[0],video_height_priority.index(suitable_formats[0]["height"])) + for format in suitable_formats: + video_priority_index = video_height_priority.index(format["height"]) + if video_priority_index < current_best[1]: + current_best = (format, video_priority_index) + return current_best[0] + +more_comments_template = Template('''More comments''') +def get_watch_page(query_string): + id = urllib.parse.parse_qs(query_string)['v'][0] + tasks = ( + gevent.spawn(comments.video_comments, id ), + gevent.spawn(YoutubeDL(params={'youtube_include_dash_manifest':False}).extract_info, "https://www.youtube.com/watch?v=" + id, download=False) + ) + gevent.joinall(tasks) + comments_info, info = tasks[0].value, tasks[1].value + comments_html, ctoken = comments_info + + if ctoken == '': + more_comments_button = '' + else: + more_comments_button = more_comments_template.substitute(url = URL_ORIGIN + '/comments?ctoken=' + ctoken) + #comments_html = comments.comments_html(video_id(url)) + #info = YoutubeDL().extract_info(url, download=False) + + #chosen_format = choose_format(info) + sort_formats(info) + + + + upload_year = info["upload_date"][0:4] + upload_month = info["upload_date"][4:6] + upload_day = info["upload_date"][6:8] + upload_date = upload_month + "/" + upload_day + "/" + upload_year + + related_videos_html = get_related_items_html(info) + + page = yt_watch_template.substitute( + video_title=html.escape(info["title"]), + page_title=html.escape(info["title"]), + uploader=html.escape(info["uploader"]), + uploader_channel_url='/' + info["uploader_url"], + #upload_date=datetime.datetime.fromtimestamp(info["timestamp"]).strftime("%d %b %Y %H:%M:%S"), + upload_date = upload_date, + views='{:,}'.format(info["view_count"]), + likes=(lambda x: '{:,}'.format(x) if x is not None else "")(info["like_count"]), + dislikes=(lambda x: '{:,}'.format(x) if x is not None else "")(info["dislike_count"]), + description=html.escape(info["description"]), + video_sources=formats_html(info), + related = related_videos_html, + comments=comments_html, + more_comments_button = more_comments_button, + ) return page \ No newline at end of file diff --git a/youtube/watch_later.py b/youtube/watch_later.py index 126fb6e..4bb421c 100644 --- a/youtube/watch_later.py +++ b/youtube/watch_later.py @@ -1,11 +1,11 @@ -import os.path -import json -watch_later_file = os.path.normpath("youtube/watch_later.txt") -def add_to_watch_later(video_info_list): - with open(watch_later_file, "a", encoding='utf-8') as file: - for info in video_info_list: - file.write(info + "\n") - - -def get_watch_later_page(): +import os.path +import json +watch_later_file = os.path.normpath("youtube/watch_later.txt") +def add_to_watch_later(video_info_list): + with open(watch_later_file, "a", encoding='utf-8') as file: + for info in video_info_list: + file.write(info + "\n") + + +def get_watch_later_page(): pass \ No newline at end of file diff --git a/youtube/youtube.py b/youtube/youtube.py index 7ec75c0..a7cc204 100644 --- a/youtube/youtube.py +++ b/youtube/youtube.py @@ -1,60 +1,60 @@ -import mimetypes -import urllib.parse -from youtube import watch_later, watch, search, playlist, channel, comments -YOUTUBE_FILES = ( - "/shared.css", - "/opensearch.xml", - '/comments.css', -) - -def youtube(env, start_response): - path, method, query_string = env['PATH_INFO'], env['REQUEST_METHOD'], env['QUERY_STRING'] - if method == "GET": - if path in YOUTUBE_FILES: - with open("youtube" + path, 'rb') as f: - mime_type = mimetypes.guess_type(path)[0] or 'application/octet-stream' - start_response('200 OK', (('Content-type',mime_type),) ) - return f.read() - - elif path == "/comments": - start_response('200 OK', (('Content-type','text/html'),) ) - return comments.get_comments_page(query_string).encode() - - elif path == "/watch": - start_response('200 OK', (('Content-type','text/html'),) ) - return watch.get_watch_page(query_string).encode() - - elif path == "/search": - start_response('200 OK', (('Content-type','text/html'),) ) - return search.get_search_page(query_string).encode() - - elif path == "/playlist": - start_response('200 OK', (('Content-type','text/html'),) ) - return playlist.get_playlist_page(query_string).encode() - - elif path.startswith("/channel/"): - start_response('200 OK', (('Content-type','text/html'),) ) - return channel.get_channel_page(path[9:], query_string=query_string).encode() - - elif path.startswith("/user/"): - start_response('200 OK', (('Content-type','text/html'),) ) - return channel.get_user_page(path[6:], query_string=query_string).encode() - - else: - start_response('404 Not Found', () ) - return b'404 Not Found' - - elif method == "POST": - if path == "/edit_playlist": - fields = urllib.parse.parse_qs(env['wsgi.input'].read().decode()) - if fields['action'][0] == 'add' and fields['playlist_name'][0] == 'watch_later': - watch_later.add_to_watch_later(fields['video_info_list']) - - start_response('204 No Content', ()) - else: - start_response('404 Not Found', ()) - return b'404 Not Found' - - else: - start_response('501 Not Implemented', ()) +import mimetypes +import urllib.parse +from youtube import watch_later, watch, search, playlist, channel, comments +YOUTUBE_FILES = ( + "/shared.css", + "/opensearch.xml", + '/comments.css', +) + +def youtube(env, start_response): + path, method, query_string = env['PATH_INFO'], env['REQUEST_METHOD'], env['QUERY_STRING'] + if method == "GET": + if path in YOUTUBE_FILES: + with open("youtube" + path, 'rb') as f: + mime_type = mimetypes.guess_type(path)[0] or 'application/octet-stream' + start_response('200 OK', (('Content-type',mime_type),) ) + return f.read() + + elif path == "/comments": + start_response('200 OK', (('Content-type','text/html'),) ) + return comments.get_comments_page(query_string).encode() + + elif path == "/watch": + start_response('200 OK', (('Content-type','text/html'),) ) + return watch.get_watch_page(query_string).encode() + + elif path == "/search": + start_response('200 OK', (('Content-type','text/html'),) ) + return search.get_search_page(query_string).encode() + + elif path == "/playlist": + start_response('200 OK', (('Content-type','text/html'),) ) + return playlist.get_playlist_page(query_string).encode() + + elif path.startswith("/channel/"): + start_response('200 OK', (('Content-type','text/html'),) ) + return channel.get_channel_page(path[9:], query_string=query_string).encode() + + elif path.startswith("/user/"): + start_response('200 OK', (('Content-type','text/html'),) ) + return channel.get_user_page(path[6:], query_string=query_string).encode() + + else: + start_response('404 Not Found', () ) + return b'404 Not Found' + + elif method == "POST": + if path == "/edit_playlist": + fields = urllib.parse.parse_qs(env['wsgi.input'].read().decode()) + if fields['action'][0] == 'add' and fields['playlist_name'][0] == 'watch_later': + watch_later.add_to_watch_later(fields['video_info_list']) + + start_response('204 No Content', ()) + else: + start_response('404 Not Found', ()) + return b'404 Not Found' + + else: + start_response('501 Not Implemented', ()) return b'501 Not Implemented' \ No newline at end of file -- cgit v1.2.3