diff options
Diffstat (limited to 'youtube/util.py')
-rw-r--r-- | youtube/util.py | 267 |
1 files changed, 252 insertions, 15 deletions
diff --git a/youtube/util.py b/youtube/util.py index 1142c1d..c59fae8 100644 --- a/youtube/util.py +++ b/youtube/util.py @@ -71,6 +71,10 @@ class TorManager: 'socks5h://127.0.0.1:' + str(settings.tor_port) + '/', cert_reqs='CERT_REQUIRED') self.tor_pool_refresh_time = time.monotonic() + settings.add_setting_changed_hook( + 'tor_port', + lambda old_val, new_val: self.refresh_tor_connection_pool(), + ) self.new_identity_lock = gevent.lock.BoundedSemaphore(1) self.last_new_identity_time = time.monotonic() - 20 @@ -190,7 +194,11 @@ class HTTPAsymmetricCookieProcessor(urllib.request.BaseHandler): class FetchError(Exception): def __init__(self, code, reason='', ip=None, error_message=None): - Exception.__init__(self, 'HTTP error during request: ' + code + ' ' + reason) + if error_message: + string = code + ' ' + reason + ': ' + error_message + else: + string = 'HTTP error during request: ' + code + ' ' + reason + Exception.__init__(self, string) self.code = code self.reason = reason self.ip = ip @@ -240,6 +248,7 @@ def fetch_url_response(url, headers=(), timeout=15, data=None, elif not isinstance(data, bytes): data = urllib.parse.urlencode(data).encode('utf-8') + if cookiejar_send is not None or cookiejar_receive is not None: # Use urllib req = urllib.request.Request(url, data=data, headers=headers) @@ -259,13 +268,32 @@ def fetch_url_response(url, headers=(), timeout=15, data=None, # According to the documentation for urlopen, a redirect counts as a # retry. So there are 3 redirects max by default. if max_redirects: - retries = urllib3.Retry(3+max_redirects, redirect=max_redirects) + retries = urllib3.Retry(3+max_redirects, redirect=max_redirects, raise_on_redirect=False) else: - retries = urllib3.Retry(3) + retries = urllib3.Retry(3, raise_on_redirect=False) pool = get_pool(use_tor and settings.route_tor) - response = pool.request(method, url, headers=headers, body=data, - timeout=timeout, preload_content=False, - decode_content=False, retries=retries) + try: + response = pool.request(method, url, headers=headers, body=data, + timeout=timeout, preload_content=False, + decode_content=False, retries=retries) + response.retries = retries + except urllib3.exceptions.MaxRetryError as e: + exception_cause = e.__context__.__context__ + if (isinstance(exception_cause, socks.ProxyConnectionError) + and settings.route_tor): + msg = ('Failed to connect to Tor. Check that Tor is open and ' + 'that your internet connection is working.\n\n' + + str(e)) + raise FetchError('502', reason='Bad Gateway', + error_message=msg) + elif isinstance(e.__context__, + urllib3.exceptions.NewConnectionError): + msg = 'Failed to establish a connection.\n\n' + str(e) + raise FetchError( + '502', reason='Bad Gateway', + error_message=msg) + else: + raise cleanup_func = (lambda r: r.release_conn()) return response, cleanup_func @@ -290,10 +318,11 @@ def fetch_url(url, headers=(), timeout=15, report_text=None, data=None, cleanup_func(response) # release_connection for urllib3 content = decode_content( content, - response.getheader('Content-Encoding', default='identity')) + response.headers.get('Content-Encoding', default='identity')) if (settings.debugging_save_responses - and debug_name is not None and content): + and debug_name is not None + and content): save_dir = os.path.join(settings.data_dir, 'debug') if not os.path.exists(save_dir): os.makedirs(save_dir) @@ -301,17 +330,28 @@ def fetch_url(url, headers=(), timeout=15, report_text=None, data=None, with open(os.path.join(save_dir, debug_name), 'wb') as f: f.write(content) - if response.status == 429: + if response.status == 429 or ( + response.status == 302 and (response.getheader('Location') == url + or response.getheader('Location').startswith( + 'https://www.google.com/sorry/index' + ) + ) + ): + print(response.status, response.reason, response.headers) ip = re.search( br'IP address: ((?:[\da-f]*:)+[\da-f]+|(?:\d+\.)+\d+)', content) ip = ip.group(1).decode('ascii') if ip else None + if not ip: + ip = re.search(r'IP=((?:\d+\.)+\d+)', + response.getheader('Set-Cookie') or '') + ip = ip.group(1) if ip else None # don't get new identity if we're not using Tor if not use_tor: raise FetchError('429', reason=response.reason, ip=ip) - print('Error: Youtube blocked the request because the Tor exit node is overutilized. Exit node IP address: %s' % ip) + print('Error: YouTube blocked the request because the Tor exit node is overutilized. Exit node IP address: %s' % ip) # get new identity error = tor_manager.new_identity(start_time) @@ -355,11 +395,23 @@ def head(url, use_tor=False, report_text=None, max_redirects=10): round(time.monotonic() - start_time, 3)) return response - mobile_user_agent = 'Mozilla/5.0 (Linux; Android 7.0; Redmi Note 4 Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Mobile Safari/537.36' mobile_ua = (('User-Agent', mobile_user_agent),) desktop_user_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:52.0) Gecko/20100101 Firefox/52.0' desktop_ua = (('User-Agent', desktop_user_agent),) +json_header = (('Content-Type', 'application/json'),) +desktop_xhr_headers = ( + ('Accept', '*/*'), + ('Accept-Language', 'en-US,en;q=0.5'), + ('X-YouTube-Client-Name', '1'), + ('X-YouTube-Client-Version', '2.20240304.00.00'), +) + desktop_ua +mobile_xhr_headers = ( + ('Accept', '*/*'), + ('Accept-Language', 'en-US,en;q=0.5'), + ('X-YouTube-Client-Name', '2'), + ('X-YouTube-Client-Version', '2.20240304.08.00'), +) + mobile_ua class RateLimitedQueue(gevent.queue.Queue): @@ -410,7 +462,7 @@ class RateLimitedQueue(gevent.queue.Queue): def download_thumbnail(save_directory, video_id): - url = "https://i.ytimg.com/vi/" + video_id + "/mqdefault.jpg" + url = f"https://i.ytimg.com/vi/{video_id}/hqdefault.jpg" save_location = os.path.join(save_directory, video_id + ".jpg") try: thumbnail = fetch_url(url, report_text="Saved thumbnail: " + video_id) @@ -452,7 +504,7 @@ def video_id(url): # default, sddefault, mqdefault, hqdefault, hq720 def get_thumbnail_url(video_id): - return settings.img_prefix + "https://i.ytimg.com/vi/" + video_id + "/mqdefault.jpg" + return f"{settings.img_prefix}https://i.ytimg.com/vi/{video_id}/hqdefault.jpg" def seconds_to_timestamp(seconds): @@ -475,6 +527,13 @@ def update_query_string(query_string, items): return urllib.parse.urlencode(parameters, doseq=True) +YOUTUBE_DOMAINS = ('youtube.com', 'youtu.be', 'youtube-nocookie.com') +YOUTUBE_URL_RE_STR = r'https?://(?:[a-zA-Z0-9_-]*\.)?(?:' +YOUTUBE_URL_RE_STR += r'|'.join(map(re.escape, YOUTUBE_DOMAINS)) +YOUTUBE_URL_RE_STR += r')(?:/[^"]*)?' +YOUTUBE_URL_RE = re.compile(YOUTUBE_URL_RE_STR) + + def prefix_url(url): if url is None: return None @@ -517,11 +576,11 @@ def add_extra_html_info(item): item['url'] = (URL_ORIGIN + '/watch?v=' + item['id']) if item.get('id') else None video_info = {} - for key in ('id', 'title', 'author', 'duration'): + for key in ('id', 'title', 'author', 'duration', 'author_id'): try: video_info[key] = item[key] except KeyError: - video_info[key] = '' + video_info[key] = None item['video_info'] = json.dumps(video_info) @@ -536,6 +595,9 @@ def add_extra_html_info(item): elif item['type'] == 'channel': item['url'] = concat_or_none(URL_ORIGIN, "/channel/", item['id']) + if item.get('author_id') and 'author_url' not in item: + item['author_url'] = URL_ORIGIN + '/channel/' + item['author_id'] + def check_gevent_exceptions(*tasks): for task in tasks: @@ -603,8 +665,183 @@ def to_valid_filename(name): return name +# https://github.com/yt-dlp/yt-dlp/blob/master/yt_dlp/extractor/youtube.py#L72 +INNERTUBE_CLIENTS = { + 'android': { + 'INNERTUBE_API_KEY': 'AIzaSyA8eiZmM1FaDVjRy-df2KTyQ_vz_yYM39w', + 'INNERTUBE_CONTEXT': { + 'client': { + 'hl': 'en', + 'gl': 'US', + 'clientName': 'ANDROID', + 'clientVersion': '19.09.36', + 'osName': 'Android', + 'osVersion': '12', + 'androidSdkVersion': 31, + 'platform': 'MOBILE', + 'userAgent': 'com.google.android.youtube/19.09.36 (Linux; U; Android 12; US) gzip' + }, + # https://github.com/yt-dlp/yt-dlp/pull/575#issuecomment-887739287 + #'thirdParty': { + # 'embedUrl': 'https://google.com', # Can be any valid URL + #} + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 3, + 'REQUIRE_JS_PLAYER': False, + }, + + 'android-test-suite': { + 'INNERTUBE_API_KEY': 'AIzaSyA8eiZmM1FaDVjRy-df2KTyQ_vz_yYM39w', + 'INNERTUBE_CONTEXT': { + 'client': { + 'hl': 'en', + 'gl': 'US', + 'clientName': 'ANDROID_TESTSUITE', + 'clientVersion': '1.9', + 'osName': 'Android', + 'osVersion': '12', + 'androidSdkVersion': 31, + 'platform': 'MOBILE', + 'userAgent': 'com.google.android.youtube/1.9 (Linux; U; Android 12; US) gzip' + }, + # https://github.com/yt-dlp/yt-dlp/pull/575#issuecomment-887739287 + #'thirdParty': { + # 'embedUrl': 'https://google.com', # Can be any valid URL + #} + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 3, + 'REQUIRE_JS_PLAYER': False, + }, + + 'ios': { + 'INNERTUBE_API_KEY': 'AIzaSyB-63vPrdThhKuerbB2N_l7Kwwcxj6yUAc', + 'INNERTUBE_CONTEXT': { + 'client': { + 'hl': 'en', + 'gl': 'US', + 'clientName': 'IOS', + 'clientVersion': '19.09.3', + 'deviceModel': 'iPhone14,3', + 'userAgent': 'com.google.ios.youtube/19.09.3 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)' + } + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 5, + 'REQUIRE_JS_PLAYER': False + }, + + # This client can access age restricted videos (unless the uploader has disabled the 'allow embedding' option) + # See: https://github.com/zerodytrash/YouTube-Internal-Clients + 'tv_embedded': { + 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + 'INNERTUBE_CONTEXT': { + 'client': { + 'hl': 'en', + 'gl': 'US', + 'clientName': 'TVHTML5_SIMPLY_EMBEDDED_PLAYER', + 'clientVersion': '2.0', + 'clientScreen': 'EMBED', + }, + # https://github.com/yt-dlp/yt-dlp/pull/575#issuecomment-887739287 + 'thirdParty': { + 'embedUrl': 'https://google.com', # Can be any valid URL + } + + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 85, + 'REQUIRE_JS_PLAYER': True, + }, + + 'web': { + 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'WEB', + 'clientVersion': '2.20220801.00.00', + 'userAgent': desktop_user_agent, + } + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 1 + }, + 'android_vr': { + 'INNERTUBE_API_KEY': 'AIzaSyA8eiZmM1FaDVjRy-df2KTyQ_vz_yYM39w', + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'ANDROID_VR', + 'clientVersion': '1.60.19', + 'deviceMake': 'Oculus', + 'deviceModel': 'Quest 3', + 'androidSdkVersion': 32, + 'userAgent': 'com.google.android.apps.youtube.vr.oculus/1.60.19 (Linux; U; Android 12L; eureka-user Build/SQ3A.220605.009.A1) gzip', + 'osName': 'Android', + 'osVersion': '12L', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 28, + 'REQUIRE_JS_PLAYER': False, + }, +} + +def get_visitor_data(): + visitor_data = None + visitor_data_cache = os.path.join(settings.data_dir, 'visitorData.txt') + if not os.path.exists(settings.data_dir): + os.makedirs(settings.data_dir) + if os.path.isfile(visitor_data_cache): + with open(visitor_data_cache, 'r') as file: + print('Getting visitor_data from cache') + visitor_data = file.read() + max_age = 12*3600 + file_age = time.time() - os.path.getmtime(visitor_data_cache) + if file_age > max_age: + print('visitor_data cache is too old. Removing file...') + os.remove(visitor_data_cache) + return visitor_data + + print('Fetching youtube homepage to get visitor_data') + yt_homepage = 'https://www.youtube.com' + yt_resp = fetch_url(yt_homepage, headers={'User-Agent': mobile_user_agent}, report_text='Getting youtube homepage') + visitor_data_re = r'''"visitorData":\s*?"(.+?)"''' + visitor_data_match = re.search(visitor_data_re, yt_resp.decode()) + if visitor_data_match: + visitor_data = visitor_data_match.group(1) + print(f'Got visitor_data: {len(visitor_data)}') + with open(visitor_data_cache, 'w') as file: + print('Saving visitor_data cache...') + file.write(visitor_data) + return visitor_data + else: + print('Unable to get visitor_data value') + return visitor_data + +def call_youtube_api(client, api, data): + client_params = INNERTUBE_CLIENTS[client] + context = client_params['INNERTUBE_CONTEXT'] + key = client_params['INNERTUBE_API_KEY'] + host = client_params.get('INNERTUBE_HOST') or 'www.youtube.com' + user_agent = context['client'].get('userAgent') or mobile_user_agent + visitor_data = get_visitor_data() + + url = 'https://' + host + '/youtubei/v1/' + api + '?key=' + key + if visitor_data: + context['client'].update({'visitorData': visitor_data}) + data['context'] = context + + data = json.dumps(data) + headers = (('Content-Type', 'application/json'),('User-Agent', user_agent)) + if visitor_data: + headers = ( *headers, ('X-Goog-Visitor-Id', visitor_data )) + response = fetch_url( + url, data=data, headers=headers, + debug_name='youtubei_' + api + '_' + client, + report_text='Fetched ' + client + ' youtubei ' + api + ).decode('utf-8') + return response + + def strip_non_ascii(string): ''' Returns the string without non ASCII characters''' + if string is None: + return "" stripped = (c for c in string if 0 < ord(c) < 127) return ''.join(stripped) |