"""Search-page handling for the YouTube front end.

Fetches YouTube's JSON search results (via the ``pbj=1`` endpoint), decodes
the pagination ``sp`` parameter format, and renders the results page from
local HTML templates.
"""
import base64
import html
import json
import urllib.parse  # explicit submodule import; bare `import urllib` does not guarantee urllib.parse is loaded
from math import ceil
from string import Template

import youtube.common as common
from youtube.common import default_multi_get, get_thumbnail_url, URL_ORIGIN

# Templates are loaded once at import time. A missing template file is a
# fatal configuration error, so no fallback is attempted.
with open("yt_search_results_template.html", "r", encoding="utf-8") as file:
    yt_search_results_template = file.read()
with open("yt_search_template.html", "r", encoding="utf-8") as file:
    yt_search_template = file.read()

page_button_template = Template('''$page''')
current_page_button_template = Template('''
$page
''')
video_result_template = '''
$length $video_title
Uploaded by $uploader
$views $description
'''

# Layout of the base64-encoded `sp` protobuf parameter (field numbers):
#   Sort: 1
#       Upload date: 2
#       View count: 3
#       Rating: 1
#   Offset: 9
#   Filters: 2
#       Upload date: 1
#       Type: 2
#       Duration: 3

# Filter feature name -> protobuf field number inside Filters (field 2).
features = {
    '4k': 14,
    'hd': 4,
    'hdr': 25,
    'subtitles': 5,
    'creative_commons': 6,
    '3d': 7,
    'live': 8,
    'purchased': 9,
    '360': 15,
    'location': 23,
}


def page_number_to_sp_parameter(page):
    """Encode a 1-based page number as YouTube's url-safe `sp` parameter.

    Returns the percent-quoted, url-safe base64 encoding of the 3-byte
    offset payload. 20 results per page.
    """
    offset = (int(page) - 1) * 20    # 20 results per page
    first_byte = 255 & offset
    second_byte = 255 & (offset >> 7)
    second_byte = second_byte | 1
    # 0b01001000 is required, and is always the same.
    # The next 2 bytes encode the offset in little endian order,
    # BUT, it's done in a strange way. The least significant bit (LSB) of the
    # second byte is not part of the offset. Instead, to get the number which
    # the two bytes encode, that LSB of the second byte is combined with the
    # most significant bit (MSB) of the first byte in a logical AND. Replace
    # the two bits with the result of the AND to get the two little endian
    # bytes that represent the offset.
    # I figured this out by trial and error on the sp parameter. I don't know
    # why it's done like this; perhaps it's just obfuscation.
    param_bytes = bytes((0b01001000, first_byte, second_byte))
    param_encoded = urllib.parse.quote(base64.urlsafe_b64encode(param_bytes))
    return param_encoded


def get_search_json(query, page):
    """Fetch and parse YouTube's JSON (`pbj=1`) search response.

    `query` is the raw search string; `page` is the 1-based page number
    (string or int). Returns the decoded JSON structure.
    """
    url = "https://www.youtube.com/results?search_query=" + urllib.parse.quote_plus(query)
    # Headers mimic a desktop browser; the X-YouTube-Client-* pair selects
    # the JSON (polymer) response format.
    headers = {
        'Host': 'www.youtube.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)',
        'Accept': '*/*',
        'Accept-Language': 'en-US,en;q=0.5',
        'X-YouTube-Client-Name': '1',
        'X-YouTube-Client-Version': '2.20180418',
    }
    url += "&pbj=1&sp=" + page_number_to_sp_parameter(page)
    content = common.fetch_url(url, headers=headers)
    info = json.loads(content)
    return info


def page_buttons_html(page_start, page_end, current_page, query):
    """Render the pagination buttons for pages page_start..page_end inclusive.

    The current page uses `current_page_button_template`; every other page
    uses `page_button_template` with a link back to our /search endpoint.
    """
    result = ""
    for page in range(page_start, page_end + 1):
        if page == current_page:
            template = current_page_button_template
        else:
            template = page_button_template
        result += template.substitute(
            page=page,
            href=URL_ORIGIN + "/search?query=" + urllib.parse.quote_plus(query)
                 + "&page=" + str(page),
        )
    return result


showing_results_for = Template('''
Showing results for $corrected_query
Search instead for $original_query
''')
did_you_mean = Template('''
Did you mean $corrected_query
''')


def get_search_page(query_string, parameters=()):
    """Render the full search results page for the given raw query string.

    `query_string` is the raw URL query string (e.g. "query=cats&page=2");
    `parameters` is accepted for interface compatibility but unused here.
    Returns the rendered HTML, or the empty search template when no query
    was supplied.
    """
    qs_query = urllib.parse.parse_qs(query_string)
    if len(qs_query) == 0:
        return yt_search_template
    query = qs_query["query"][0]
    # parse_qs values are lists, so the default must be a list as well.
    page = qs_query.get("page", ["1"])[0]

    info = get_search_json(query, page)

    estimated_results = int(info[1]['response']['estimatedResults'])
    estimated_pages = ceil(estimated_results / 20)
    results = info[1]['response']['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents']

    corrections = ''
    result_list_html = ""
    for renderer in results:
        # Each entry is a single-key dict; the key names the renderer type.
        renderer_type = list(renderer.keys())[0]
        if renderer_type == 'shelfRenderer':
            continue
        if renderer_type == 'didYouMeanRenderer':
            renderer = renderer[renderer_type]
            # Link that re-runs the search with the corrected query.
            corrected_query_string = urllib.parse.parse_qs(query_string)
            corrected_query_string['query'] = [renderer['correctedQueryEndpoint']['searchEndpoint']['query']]
            corrected_query_url = URL_ORIGIN + '/search?' + common.make_query_string(corrected_query_string)

            corrections = did_you_mean.substitute(
                corrected_query_url=corrected_query_url,
                corrected_query=common.format_text_runs(renderer['correctedQuery']['runs']),
            )
            continue
        if renderer_type == 'showingResultsForRenderer':
            renderer = renderer[renderer_type]
            # Link that re-runs the search with autocorrect disabled.
            no_autocorrect_query_string = urllib.parse.parse_qs(query_string)
            no_autocorrect_query_string['autocorrect'] = ['0']
            no_autocorrect_query_url = URL_ORIGIN + '/search?' + common.make_query_string(no_autocorrect_query_string)

            corrections = showing_results_for.substitute(
                corrected_query=common.format_text_runs(renderer['correctedQuery']['runs']),
                original_query_url=no_autocorrect_query_url,
                original_query=html.escape(renderer['originalQuery']['simpleText']),
            )
            continue
        result_list_html += common.renderer_html(renderer, current_query_string=query_string)

    page = int(page)
    # Show up to 9 page buttons, centred on the current page once past page 5.
    if page <= 5:
        page_start = 1
        page_end = min(9, estimated_pages)
    else:
        page_start = page - 4
        page_end = min(page + 4, estimated_pages)

    result = Template(yt_search_results_template).substitute(
        results=result_list_html,
        page_title=query + " - Search",
        search_box_value=html.escape(query),
        number_of_results='{:,}'.format(estimated_results),
        number_of_pages='{:,}'.format(estimated_pages),
        page_buttons=page_buttons_html(page_start, page_end, page, query),
        corrections=corrections,
    )
    return result