import json
import urllib.parse
import html
from string import Template
import base64
from math import ceil

from youtube.common import default_multi_get, get_thumbnail_url, URL_ORIGIN
import youtube.common as common

with open("yt_search_results_template.html", "r") as file:
    yt_search_results_template = file.read()

with open("yt_search_template.html", "r") as file:
    yt_search_template = file.read()

page_button_template = Template('''<a class="page-button" href="$href">$page</a>''')
current_page_button_template = Template('''<div class="page-button">$page</div>''')

video_result_template = '''
<div class="medium-item">
    <a class="video-thumbnail-box" href="$video_url" title="$video_title">
        <img class="video-thumbnail-img" src="$thumbnail_url">
        <span class="video-duration">$length</span>
    </a>
    <a class="title" href="$video_url">$video_title</a>

    <address>Uploaded by <a href="$uploader_channel_url">$uploader</a></address>
    <span class="views">$views</span>

    <time datetime="$datetime">Uploaded $upload_date</time>
    <span class="description">$description</span>
</div>
'''

# Field numbers apparently used inside the sp parameter (see page_number_to_sp_parameter below):
# Sort: 1
#     Upload date: 2
#     View count: 3
#     Rating: 1
# Offset: 9
# Filters: 2
#     Upload date: 1
#     Type: 2
#     Duration: 3
features = {
    '4k': 14,
    'hd': 4,
    'hdr': 25,
    'subtitles': 5,
    'creative_commons': 6,
    '3d': 7,
    'live': 8,
    'purchased': 9,
    '360': 15,
    'location': 23,
}
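# (The 'features' dict is not referenced anywhere else in this file yet. The values presumably
# correspond to field numbers for the individual filter toggles within the "Filters" part of the
# sp parameter, in the same spirit as the field list above, but that is an assumption rather than
# something verified.)
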
def page_number_to_sp_parameter(page):
    offset = (int(page) - 1)*20    # 20 results per page
    first_byte = 255 & offset
    second_byte = 255 & (offset >> 7)
    second_byte = second_byte | 1

    # 0b01001000 is required, and is always the same.
    # The next 2 bytes encode the offset in little endian order,
    # BUT, it's done in a strange way. The least significant bit (LSB) of the second byte is not
    # part of the offset. Instead, to get the number which the two bytes encode, that LSB of the
    # second byte is combined with the most significant bit (MSB) of the first byte in a logical
    # AND. Replace the two bits with the result of the AND to get the two little endian bytes that
    # represent the offset.
    # I figured this out by trial and error on the sp parameter. I don't know why it's done like
    # this; perhaps it's just obfuscation.
    param_bytes = bytes((0b01001000, first_byte, second_byte))
    param_encoded = urllib.parse.quote(base64.urlsafe_b64encode(param_bytes))
    return param_encoded

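# Worked example, computed from the function above for illustration:
#   page 3 -> offset 40 -> param_bytes (0x48, 0x28, 0x01) -> page_number_to_sp_parameter(3) == 'SCgB'
# Side note (an observation, not verified anywhere official): 0b01001000 == (9 << 3) | 0, which is
# the protobuf tag byte for field 9 ("Offset" in the field list above) with varint wire type, so
# the sp parameter looks like a base64'd protobuf message.
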
def get_search_json(query, page):
    url = "https://www.youtube.com/results?search_query=" + urllib.parse.quote_plus(query)
    headers = {
        'Host': 'www.youtube.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)',
        'Accept': '*/*',
        'Accept-Language': 'en-US,en;q=0.5',
        'X-YouTube-Client-Name': '1',
        'X-YouTube-Client-Version': '2.20180418',
    }
    url += "&pbj=1&sp=" + page_number_to_sp_parameter(page)
    content = common.fetch_url(url, headers=headers)
    info = json.loads(content)
    return info

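# Illustrative usage; the key path below is the one get_search_page uses further down, and the
# polymer JSON layout is whatever YouTube currently returns, so it may change without notice:
#   info = get_search_json('some query', 1)
#   videos = info[1]['response']['contents']['twoColumnSearchResultsRenderer'] \
#       ['primaryContents']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents']
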
"""def get_search_info(query, page):
result_info = dict()
info = get_bloated_search_info(query, page)
estimated_results = int(info[1]['response']['estimatedResults'])
estimated_pages = ceil(estimated_results/20)
result_info['estimated_results'] = estimated_results
result_info['estimated_pages'] = estimated_pages
result_info['results'] = []
# this is what you get when you hire H-1B's
video_list = info[1]['response']['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents']
for video_json_crap in video_list:
# they have a dictionary whose only content is another dictionary...
try:
type = list(video_json_crap.keys())[0]
except KeyError:
continue #channelRenderer or playlistRenderer
'''description = ""
for text_run in video_json_crap["descriptionSnippet"]["runs"]:
if text_run.get("bold", False):
description += "<b>" + html.escape'''
try:
result_info['results'].append({
"title": video_json_crap["title"]["simpleText"],
"video_id": video_json_crap["videoId"],
"description": video_json_crap.get("descriptionSnippet",dict()).get('runs',[]), # a list of text runs (formmated), rather than plain text
"thumbnail": get_thumbnail_url(video_json_crap["videoId"]),
"views_text": video_json_crap['viewCountText'].get('simpleText', None) or video_json_crap['viewCountText']['runs'][0]['text'],
"length_text": default_multi_get(video_json_crap, 'lengthText', 'simpleText', default=''), # livestreams dont have a length
"uploader": video_json_crap['longBylineText']['runs'][0]['text'],
"uploader_url": URL_ORIGIN + video_json_crap['longBylineText']['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'],
"published_time_text": default_multi_get(video_json_crap, 'publishedTimeText', 'simpleText', default=''),
})
except KeyError:
print(video_json_crap)
raise
return result_info"""
def page_buttons_html(page_start, page_end, current_page, query):
    result = ""
    for page in range(page_start, page_end+1):
        if page == current_page:
            template = current_page_button_template
        else:
            template = page_button_template
        result += template.substitute(page=page, href=URL_ORIGIN + "/search?query=" + urllib.parse.quote_plus(query) + "&page=" + str(page))
    return result

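# Illustrative output: page_buttons_html(1, 3, 2, 'cats') produces (URL_ORIGIN elided as ...):
#   <a class="page-button" href=".../search?query=cats&page=1">1</a>
#   <div class="page-button">2</div>
#   <a class="page-button" href=".../search?query=cats&page=3">3</a>
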
showing_results_for = Template('''
<div>Showing results for <a>$corrected_query</a></div>
<div>Search instead for <a href="$original_query_url">$original_query</a></div>
''')
did_you_mean = Template('''
<div>Did you mean <a href="$corrected_query_url">$corrected_query</a></div>
''')

def get_search_page(query_string, parameters=()):
    qs_query = urllib.parse.parse_qs(query_string)
    if len(qs_query) == 0:
        return yt_search_template
    query = qs_query["query"][0]
    page = qs_query.get("page", ["1"])[0]

    info = get_search_json(query, page)

    estimated_results = int(info[1]['response']['estimatedResults'])
    estimated_pages = ceil(estimated_results/20)
    results = info[1]['response']['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents']

    corrections = ''
    result_list_html = ""
    for renderer in results:
        type = list(renderer.keys())[0]
        if type == 'shelfRenderer':
            continue
        if type == 'didYouMeanRenderer':
            renderer = renderer[type]
            corrected_query_string = urllib.parse.parse_qs(query_string)
            corrected_query_string['query'] = [renderer['correctedQueryEndpoint']['searchEndpoint']['query']]
            corrected_query_url = URL_ORIGIN + '/search?' + common.make_query_string(corrected_query_string)

            corrections = did_you_mean.substitute(
                corrected_query_url = corrected_query_url,
                corrected_query = common.format_text_runs(renderer['correctedQuery']['runs']),
            )
            continue
        if type == 'showingResultsForRenderer':
            renderer = renderer[type]
            no_autocorrect_query_string = urllib.parse.parse_qs(query_string)
            no_autocorrect_query_string['autocorrect'] = ['0']
            no_autocorrect_query_url = URL_ORIGIN + '/search?' + common.make_query_string(no_autocorrect_query_string)

            corrections = showing_results_for.substitute(
                corrected_query = common.format_text_runs(renderer['correctedQuery']['runs']),
                original_query_url = no_autocorrect_query_url,
                original_query = html.escape(renderer['originalQuery']['simpleText']),
            )
            continue

        result_list_html += common.renderer_html(renderer, current_query_string=query_string)
        '''type = list(result.keys())[0]
        result = result[type]
        if type == "showingResultsForRenderer":
            url = URL_ORIGIN + "/search"
            if len(parameters) > 0:
                url += ';' + ';'.join(parameters)
            url += '?' + '&'.join(key + '=' + ','.join(values) for key,values in qs_query.items())
            result_list_html += showing_results_for_template.substitute(
                corrected_query=common.format_text_runs(result['correctedQuery']['runs']),
            )
        else:
            result_list_html += common.html_functions[type](result)'''

    page = int(page)
    if page <= 5:
        page_start = 1
        page_end = min(9, estimated_pages)
    else:
        page_start = page - 4
        page_end = min(page + 4, estimated_pages)

    result = Template(yt_search_results_template).substitute(
        results = result_list_html,
        page_title = query + " - Search",
        search_box_value = html.escape(query),
        number_of_results = '{:,}'.format(estimated_results),
        number_of_pages = '{:,}'.format(estimated_pages),
        page_buttons = page_buttons_html(page_start, page_end, page, query),
        corrections = corrections
    )
    return result

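# Illustrative usage (hypothetical caller; the real one is whatever routes /search requests):
#   page_html = get_search_page('query=cat+videos&page=2')
#   # page_html is the filled-in yt_search_results_template, ready to serve as the response body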