From 79937c1c823f998a1d6bb324901fd13b483b3607 Mon Sep 17 00:00:00 2001
From: James Taylor
Date: Mon, 2 Jul 2018 17:45:25 -0700
Subject: fix line endings

---
 youtube/search.py | 460 +++++++++++++++++++++++++++---------------------------
 1 file changed, 230 insertions(+), 230 deletions(-)

diff --git a/youtube/search.py b/youtube/search.py
index 5268dbe..5982d9b 100644
--- a/youtube/search.py
+++ b/youtube/search.py
@@ -1,231 +1,231 @@

The patch deletes and re-adds every line of youtube/search.py, converting CRLF line endings to LF; the text of the lines is otherwise identical, so the resulting file is shown once. (The HTML markup inside the template strings did not survive rendering; only the placeholder text remains.)

import json
import urllib.parse
import html
from string import Template
import base64
from math import ceil
from youtube.common import default_multi_get, get_thumbnail_url, URL_ORIGIN
import youtube.common as common

with open("yt_search_results_template.html", "r") as file:
    yt_search_results_template = file.read()

with open("yt_search_template.html", "r") as file:
    yt_search_template = file.read()

page_button_template = Template('''$page''')
current_page_button_template = Template('''$page''')
video_result_template = '''
    $length
    $video_title
    Uploaded by $uploader
    $views
    $description
'''


# Sort: 1
    # Upload date: 2
    # View count: 3
    # Rating: 1
# Offset: 9
# Filters: 2
    # Upload date: 1
    # Type: 2
    # Duration: 3


features = {
    '4k': 14,
    'hd': 4,
    'hdr': 25,
    'subtitles': 5,
    'creative_commons': 6,
    '3d': 7,
    'live': 8,
    'purchased': 9,
    '360': 15,
    'location': 23,
}

def page_number_to_sp_parameter(page):
    offset = (int(page) - 1)*20    # 20 results per page
    first_byte = 255 & offset
    second_byte = 255 & (offset >> 7)
    second_byte = second_byte | 1

    # 0b01001000 is required, and is always the same.
    # The next 2 bytes encode the offset in little endian order,
    # BUT it's done in a strange way. The least significant bit (LSB) of the
    # second byte is not part of the offset. Instead, to get the number which
    # the two bytes encode, that LSB of the second byte is combined with the
    # most significant bit (MSB) of the first byte in a logical AND. Replace
    # the two bits with the result of the AND to get the two little endian
    # bytes that represent the offset.
    # I figured this out by trial and error on the sp parameter. I don't know
    # why it's done like this; perhaps it's just obfuscation.
    param_bytes = bytes((0b01001000, first_byte, second_byte))
    param_encoded = urllib.parse.quote(base64.urlsafe_b64encode(param_bytes))
    return param_encoded
def get_search_json(query, page):
    url = "https://www.youtube.com/results?search_query=" + urllib.parse.quote_plus(query)
    headers = {
        'Host': 'www.youtube.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)',
        'Accept': '*/*',
        'Accept-Language': 'en-US,en;q=0.5',
        'X-YouTube-Client-Name': '1',
        'X-YouTube-Client-Version': '2.20180418',
    }
    url += "&pbj=1&sp=" + page_number_to_sp_parameter(page)
    content = common.fetch_url(url, headers=headers)
    info = json.loads(content)
    return info

"""def get_search_info(query, page):
    result_info = dict()
    info = get_bloated_search_info(query, page)

    estimated_results = int(info[1]['response']['estimatedResults'])
    estimated_pages = ceil(estimated_results/20)
    result_info['estimated_results'] = estimated_results
    result_info['estimated_pages'] = estimated_pages

    result_info['results'] = []
    # the video list is buried absurdly deep in the response
    video_list = info[1]['response']['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents']

    for video_json_crap in video_list:
        # each entry is a dictionary whose only content is another dictionary
        try:
            type = list(video_json_crap.keys())[0]
        except IndexError:
            continue    # channelRenderer or playlistRenderer
        '''description = ""
        for text_run in video_json_crap["descriptionSnippet"]["runs"]:
            if text_run.get("bold", False):
                description += "" + html.escape'''
        try:
            result_info['results'].append({
                "title": video_json_crap["title"]["simpleText"],
                "video_id": video_json_crap["videoId"],
                "description": video_json_crap.get("descriptionSnippet", dict()).get('runs', []),    # a list of formatted text runs, rather than plain text
                "thumbnail": get_thumbnail_url(video_json_crap["videoId"]),
                "views_text": video_json_crap['viewCountText'].get('simpleText', None) or video_json_crap['viewCountText']['runs'][0]['text'],
                "length_text": default_multi_get(video_json_crap, 'lengthText', 'simpleText', default=''),    # livestreams don't have a length
                "uploader": video_json_crap['longBylineText']['runs'][0]['text'],
                "uploader_url": URL_ORIGIN + video_json_crap['longBylineText']['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'],
                "published_time_text": default_multi_get(video_json_crap, 'publishedTimeText', 'simpleText', default=''),
            })
        except KeyError:
            print(video_json_crap)
            raise
    return result_info"""
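Both the commented-out draft above and the live code rely on common.default_multi_get for optional nested keys. The real helper lives in youtube/common.py; a compatible sketch, assuming only the call shape used in this file:

def default_multi_get(obj, *keys, default=None):
    # Walk nested dicts/lists; return `default` as soon as a key or index is missing.
    current = obj
    for key in keys:
        try:
            current = current[key]
        except (KeyError, IndexError, TypeError):
            return default
    return current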
def page_buttons_html(page_start, page_end, current_page, query):
    result = ""
    for page in range(page_start, page_end+1):
        if page == current_page:
            template = current_page_button_template
        else:
            template = page_button_template
        result += template.substitute(page=page, href=URL_ORIGIN + "/search?query=" + urllib.parse.quote_plus(query) + "&page=" + str(page))
    return result

showing_results_for = Template('''
Showing results for $corrected_query
Search instead for $original_query
''')
did_you_mean = Template('''
Did you mean $corrected_query
''')
def get_search_page(query_string, parameters=()):
    qs_query = urllib.parse.parse_qs(query_string)
    if len(qs_query) == 0:
        return yt_search_template
    query = qs_query["query"][0]
    page = qs_query.get("page", ["1"])[0]

    info = get_search_json(query, page)

    estimated_results = int(info[1]['response']['estimatedResults'])
    estimated_pages = ceil(estimated_results/20)
    results = info[1]['response']['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents']

    corrections = ''
    result_list_html = ""
    for renderer in results:
        type = list(renderer.keys())[0]
        if type == 'shelfRenderer':
            continue
        if type == 'didYouMeanRenderer':
            renderer = renderer[type]
            corrected_query_string = urllib.parse.parse_qs(query_string)
            corrected_query_string['query'] = [renderer['correctedQueryEndpoint']['searchEndpoint']['query']]
            corrected_query_url = URL_ORIGIN + '/search?' + common.make_query_string(corrected_query_string)
            corrections = did_you_mean.substitute(
                corrected_query_url = corrected_query_url,
                corrected_query = common.format_text_runs(renderer['correctedQuery']['runs']),
            )
            continue
        if type == 'showingResultsForRenderer':
            renderer = renderer[type]
            no_autocorrect_query_string = urllib.parse.parse_qs(query_string)
            no_autocorrect_query_string['autocorrect'] = ['0']
            no_autocorrect_query_url = URL_ORIGIN + '/search?' + common.make_query_string(no_autocorrect_query_string)
            corrections = showing_results_for.substitute(
                corrected_query = common.format_text_runs(renderer['correctedQuery']['runs']),
                original_query_url = no_autocorrect_query_url,
                original_query = html.escape(renderer['originalQuery']['simpleText']),
            )
            continue
        result_list_html += common.renderer_html(renderer, current_query_string=query_string)
    '''type = list(result.keys())[0]
    result = result[type]
    if type == "showingResultsForRenderer":
        url = URL_ORIGIN + "/search"
        if len(parameters) > 0:
            url += ';' + ';'.join(parameters)
        url += '?' + '&'.join(key + '=' + ','.join(values) for key,values in qs_query.items())

        result_list_html += showing_results_for_template.substitute(
            corrected_query=common.format_text_runs(result['correctedQuery']['runs']),
        )
    else:
        result_list_html += common.html_functions[type](result)'''
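    # Page-button window around the current page, e.g. (given enough
    # estimated pages): page 3 gets buttons 1-9, page 12 gets buttons 8-16.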
+''') +def get_search_page(query_string, parameters=()): + qs_query = urllib.parse.parse_qs(query_string) + if len(qs_query) == 0: + return yt_search_template + query = qs_query["query"][0] + page = qs_query.get("page", "1")[0] + + info = get_search_json(query, page) + + estimated_results = int(info[1]['response']['estimatedResults']) + estimated_pages = ceil(estimated_results/20) + results = info[1]['response']['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'] + + corrections = '' + result_list_html = "" + for renderer in results: + type = list(renderer.keys())[0] + if type == 'shelfRenderer': + continue + if type == 'didYouMeanRenderer': + renderer = renderer[type] + corrected_query_string = urllib.parse.parse_qs(query_string) + corrected_query_string['query'] = [renderer['correctedQueryEndpoint']['searchEndpoint']['query']] + corrected_query_url = URL_ORIGIN + '/search?' + common.make_query_string(corrected_query_string) + corrections = did_you_mean.substitute( + corrected_query_url = corrected_query_url, + corrected_query = common.format_text_runs(renderer['correctedQuery']['runs']), + ) + continue + if type == 'showingResultsForRenderer': + renderer = renderer[type] + no_autocorrect_query_string = urllib.parse.parse_qs(query_string) + no_autocorrect_query_string['autocorrect'] = ['0'] + no_autocorrect_query_url = URL_ORIGIN + '/search?' + common.make_query_string(no_autocorrect_query_string) + corrections = showing_results_for.substitute( + corrected_query = common.format_text_runs(renderer['correctedQuery']['runs']), + original_query_url = no_autocorrect_query_url, + original_query = html.escape(renderer['originalQuery']['simpleText']), + ) + continue + result_list_html += common.renderer_html(renderer, current_query_string=query_string) + '''type = list(result.keys())[0] + result = result[type] + if type == "showingResultsForRenderer": + url = URL_ORIGIN + "/search" + if len(parameters) > 0: + url += ';' + ';'.join(parameters) + url += '?' + '&'.join(key + '=' + ','.join(values) for key,values in qs_query.items()) + + result_list_html += showing_results_for_template.substitute( + corrected_query=common.format_text_runs(result['correctedQuery']['runs']), + + ) + else: + result_list_html += common.html_functions[type](result)''' + + page = int(page) + if page <= 5: + page_start = 1 + page_end = min(9, estimated_pages) + else: + page_start = page - 4 + page_end = min(page + 4, estimated_pages) + + + result = Template(yt_search_results_template).substitute( + results = result_list_html, + page_title = query + " - Search", + search_box_value = html.escape(query), + number_of_results = '{:,}'.format(estimated_results), + number_of_pages = '{:,}'.format(estimated_pages), + page_buttons = page_buttons_html(page_start, page_end, page, query), + corrections = corrections + ) return result \ No newline at end of file -- cgit v1.2.3