diff options
Diffstat (limited to 'youtube/util.py')
| -rw-r--r-- | youtube/util.py | 506 |
1 files changed, 431 insertions, 75 deletions
diff --git a/youtube/util.py b/youtube/util.py index 1142c1d..3a8fd01 100644 --- a/youtube/util.py +++ b/youtube/util.py @@ -1,4 +1,5 @@ from datetime import datetime +import logging import settings import socks import sockshandler @@ -18,6 +19,8 @@ import gevent.queue import gevent.lock import collections import stem + +logger = logging.getLogger(__name__) import stem.control import traceback @@ -71,6 +74,10 @@ class TorManager: 'socks5h://127.0.0.1:' + str(settings.tor_port) + '/', cert_reqs='CERT_REQUIRED') self.tor_pool_refresh_time = time.monotonic() + settings.add_setting_changed_hook( + 'tor_port', + lambda old_val, new_val: self.refresh_tor_connection_pool(), + ) self.new_identity_lock = gevent.lock.BoundedSemaphore(1) self.last_new_identity_time = time.monotonic() - 20 @@ -190,7 +197,11 @@ class HTTPAsymmetricCookieProcessor(urllib.request.BaseHandler): class FetchError(Exception): def __init__(self, code, reason='', ip=None, error_message=None): - Exception.__init__(self, 'HTTP error during request: ' + code + ' ' + reason) + if error_message: + string = code + ' ' + reason + ': ' + error_message + else: + string = 'HTTP error during request: ' + code + ' ' + reason + Exception.__init__(self, string) self.code = code self.reason = reason self.ip = ip @@ -240,6 +251,7 @@ def fetch_url_response(url, headers=(), timeout=15, data=None, elif not isinstance(data, bytes): data = urllib.parse.urlencode(data).encode('utf-8') + if cookiejar_send is not None or cookiejar_receive is not None: # Use urllib req = urllib.request.Request(url, data=data, headers=headers) @@ -259,13 +271,32 @@ def fetch_url_response(url, headers=(), timeout=15, data=None, # According to the documentation for urlopen, a redirect counts as a # retry. So there are 3 redirects max by default. if max_redirects: - retries = urllib3.Retry(3+max_redirects, redirect=max_redirects) + retries = urllib3.Retry(3+max_redirects, redirect=max_redirects, raise_on_redirect=False) else: - retries = urllib3.Retry(3) + retries = urllib3.Retry(3, raise_on_redirect=False) pool = get_pool(use_tor and settings.route_tor) - response = pool.request(method, url, headers=headers, body=data, - timeout=timeout, preload_content=False, - decode_content=False, retries=retries) + try: + response = pool.request(method, url, headers=headers, body=data, + timeout=timeout, preload_content=False, + decode_content=False, retries=retries) + response.retries = retries + except urllib3.exceptions.MaxRetryError as e: + exception_cause = e.__context__.__context__ + if (isinstance(exception_cause, socks.ProxyConnectionError) + and settings.route_tor): + msg = ('Failed to connect to Tor. Check that Tor is open and ' + 'that your internet connection is working.\n\n' + + str(e)) + raise FetchError('502', reason='Bad Gateway', + error_message=msg) + elif isinstance(e.__context__, + urllib3.exceptions.NewConnectionError): + msg = 'Failed to establish a connection.\n\n' + str(e) + raise FetchError( + '502', reason='Bad Gateway', + error_message=msg) + else: + raise cleanup_func = (lambda r: r.release_conn()) return response, cleanup_func @@ -274,61 +305,140 @@ def fetch_url_response(url, headers=(), timeout=15, data=None, def fetch_url(url, headers=(), timeout=15, report_text=None, data=None, cookiejar_send=None, cookiejar_receive=None, use_tor=True, debug_name=None): - while True: - start_time = time.monotonic() - - response, cleanup_func = fetch_url_response( - url, headers, timeout=timeout, data=data, - cookiejar_send=cookiejar_send, cookiejar_receive=cookiejar_receive, - use_tor=use_tor) - response_time = time.monotonic() - - content = response.read() - - read_finish = time.monotonic() + """ + Fetch URL with exponential backoff retry logic for rate limiting. - cleanup_func(response) # release_connection for urllib3 - content = decode_content( - content, - response.getheader('Content-Encoding', default='identity')) + Retries: + - 429 Too Many Requests: Exponential backoff (1s, 2s, 4s, 8s, 16s) + - 503 Service Unavailable: Exponential backoff + - 302 Redirect to Google Sorry: Treated as rate limit - if (settings.debugging_save_responses - and debug_name is not None and content): - save_dir = os.path.join(settings.data_dir, 'debug') - if not os.path.exists(save_dir): - os.makedirs(save_dir) + Max retries: 5 attempts with exponential backoff + """ + import random - with open(os.path.join(save_dir, debug_name), 'wb') as f: - f.write(content) + max_retries = 5 + base_delay = 1.0 # Base delay in seconds - if response.status == 429: - ip = re.search( - br'IP address: ((?:[\da-f]*:)+[\da-f]+|(?:\d+\.)+\d+)', - content) - ip = ip.group(1).decode('ascii') if ip else None - - # don't get new identity if we're not using Tor - if not use_tor: - raise FetchError('429', reason=response.reason, ip=ip) - - print('Error: Youtube blocked the request because the Tor exit node is overutilized. Exit node IP address: %s' % ip) - - # get new identity - error = tor_manager.new_identity(start_time) - if error: - raise FetchError( - '429', reason=response.reason, ip=ip, - error_message='Automatic circuit change: ' + error) - else: - continue # retry now that we have new identity - - elif response.status >= 400: - raise FetchError(str(response.status), reason=response.reason, - ip=None) - break + for attempt in range(max_retries): + try: + start_time = time.monotonic() + + response, cleanup_func = fetch_url_response( + url, headers, timeout=timeout, data=data, + cookiejar_send=cookiejar_send, cookiejar_receive=cookiejar_receive, + use_tor=use_tor) + response_time = time.monotonic() + + content = response.read() + + read_finish = time.monotonic() + + cleanup_func(response) # release_connection for urllib3 + content = decode_content( + content, + response.headers.get('Content-Encoding', default='identity')) + + if (settings.debugging_save_responses + and debug_name is not None + and content): + save_dir = os.path.join(settings.data_dir, 'debug') + os.makedirs(save_dir, exist_ok=True) + + with open(os.path.join(save_dir, debug_name), 'wb') as f: + f.write(content) + + # Check for rate limiting (429) or redirect to Google Sorry + if response.status == 429 or ( + response.status == 302 and (response.getheader('Location') == url + or response.getheader('Location').startswith( + 'https://www.google.com/sorry/index' + ) + ) + ): + logger.info(f'Rate limit response: {response.status} {response.reason}') + ip = re.search( + br'IP address: ((?:[\da-f]*:)+[\da-f]+|(?:\d+\.)+\d+)', + content) + ip = ip.group(1).decode('ascii') if ip else None + if not ip: + ip = re.search(r'IP=((?:\d+\.)+\d+)', + response.getheader('Set-Cookie') or '') + ip = ip.group(1) if ip else None + + # Without Tor, no point retrying with same IP + if not use_tor or not settings.route_tor: + logger.warning('Rate limited (429). Enable Tor routing to retry with new IP.') + raise FetchError('429', reason=response.reason, ip=ip) + + # Tor: exhausted retries + if attempt >= max_retries - 1: + logger.error(f'Rate limited after {max_retries} retries. Exit IP: {ip}') + raise FetchError('429', reason=response.reason, ip=ip, + error_message='Tor exit node overutilized after multiple retries') + + # Tor: get new identity and retry + logger.info(f'Rate limited. Getting new Tor identity... (IP: {ip})') + error = tor_manager.new_identity(start_time) + if error: + raise FetchError( + '429', reason=response.reason, ip=ip, + error_message='Automatic circuit change: ' + error) + continue # retry with new identity + + # Check for client errors (400, 404) - don't retry these + if response.status == 400: + logger.error(f'Bad Request (400) - Invalid parameters or URL: {url[:100]}') + raise FetchError('400', reason='Bad Request - Invalid parameters or URL format', ip=None) + + if response.status == 404: + logger.warning(f'Not Found (404): {url[:100]}') + raise FetchError('404', reason='Not Found', ip=None) + + # Check for other server errors (503, 502, 504) + if response.status in (502, 503, 504): + if attempt >= max_retries - 1: + logger.error(f'Server error {response.status} after {max_retries} retries') + raise FetchError(str(response.status), reason=response.reason, ip=None) + + # Exponential backoff for server errors + delay = (base_delay * (2 ** attempt)) + random.uniform(0, 1) + logger.warning(f'Server error ({response.status}). Waiting {delay:.1f}s before retry {attempt + 1}/{max_retries}...') + time.sleep(delay) + continue + + # Success - break out of retry loop + break + + except urllib3.exceptions.MaxRetryError as e: + # If this is the last attempt, raise the error + if attempt >= max_retries - 1: + exception_cause = e.__context__.__context__ + if (isinstance(exception_cause, socks.ProxyConnectionError) + and settings.route_tor): + msg = ('Failed to connect to Tor. Check that Tor is open and ' + 'that your internet connection is working.\n\n' + + str(e)) + logger.error(f'Tor connection failed: {msg}') + raise FetchError('502', reason='Bad Gateway', + error_message=msg) + elif isinstance(e.__context__, + urllib3.exceptions.NewConnectionError): + msg = 'Failed to establish a connection.\n\n' + str(e) + logger.error(f'Connection failed: {msg}') + raise FetchError( + '502', reason='Bad Gateway', + error_message=msg) + else: + raise + + # Wait and retry + delay = (base_delay * (2 ** attempt)) + random.uniform(0, 1) + logger.warning(f'Connection error. Waiting {delay:.1f}s before retry {attempt + 1}/{max_retries}...') + time.sleep(delay) if report_text: - print(report_text, ' Latency:', round(response_time - start_time, 3), ' Read time:', round(read_finish - response_time,3)) + logger.info(f'{report_text} - Latency: {round(response_time - start_time, 3)}s - Read time: {round(read_finish - response_time, 3)}s') return content @@ -355,11 +465,23 @@ def head(url, use_tor=False, report_text=None, max_redirects=10): round(time.monotonic() - start_time, 3)) return response - mobile_user_agent = 'Mozilla/5.0 (Linux; Android 7.0; Redmi Note 4 Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Mobile Safari/537.36' mobile_ua = (('User-Agent', mobile_user_agent),) desktop_user_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:52.0) Gecko/20100101 Firefox/52.0' desktop_ua = (('User-Agent', desktop_user_agent),) +json_header = (('Content-Type', 'application/json'),) +desktop_xhr_headers = ( + ('Accept', '*/*'), + ('Accept-Language', 'en-US,en;q=0.5'), + ('X-YouTube-Client-Name', '1'), + ('X-YouTube-Client-Version', '2.20240304.00.00'), +) + desktop_ua +mobile_xhr_headers = ( + ('Accept', '*/*'), + ('Accept-Language', 'en-US,en;q=0.5'), + ('X-YouTube-Client-Name', '2'), + ('X-YouTube-Client-Version', '2.20240304.08.00'), +) + mobile_ua class RateLimitedQueue(gevent.queue.Queue): @@ -410,21 +532,31 @@ class RateLimitedQueue(gevent.queue.Queue): def download_thumbnail(save_directory, video_id): - url = "https://i.ytimg.com/vi/" + video_id + "/mqdefault.jpg" save_location = os.path.join(save_directory, video_id + ".jpg") - try: - thumbnail = fetch_url(url, report_text="Saved thumbnail: " + video_id) - except urllib.error.HTTPError as e: - print("Failed to download thumbnail for " + video_id + ": " + str(e)) - return False - try: - f = open(save_location, 'wb') - except FileNotFoundError: - os.makedirs(save_directory, exist_ok=True) - f = open(save_location, 'wb') - f.write(thumbnail) - f.close() - return True + for quality in ('hq720.jpg', 'sddefault.jpg', 'hqdefault.jpg'): + url = f"https://i.ytimg.com/vi/{video_id}/{quality}" + try: + thumbnail = fetch_url(url, report_text="Saved thumbnail: " + video_id) + except FetchError as e: + if '404' in str(e): + continue + print("Failed to download thumbnail for " + video_id + ": " + str(e)) + return False + except urllib.error.HTTPError as e: + if e.code == 404: + continue + print("Failed to download thumbnail for " + video_id + ": " + str(e)) + return False + try: + f = open(save_location, 'wb') + except FileNotFoundError: + os.makedirs(save_directory, exist_ok=True) + f = open(save_location, 'wb') + f.write(thumbnail) + f.close() + return True + print("No thumbnail available for " + video_id) + return False def download_thumbnails(save_directory, ids): @@ -450,9 +582,40 @@ def video_id(url): return urllib.parse.parse_qs(url_parts.query)['v'][0] -# default, sddefault, mqdefault, hqdefault, hq720 -def get_thumbnail_url(video_id): - return settings.img_prefix + "https://i.ytimg.com/vi/" + video_id + "/mqdefault.jpg" +def get_thumbnail_url(video_id, quality='hq720'): + """Get thumbnail URL with fallback to lower quality if needed. + + Args: + video_id: YouTube video ID + quality: Preferred quality ('maxres', 'hq720', 'sd', 'hq', 'mq', 'default') + + Returns: + Tuple of (best_available_url, quality_used) + """ + # Quality priority order (highest to lowest) + quality_order = { + 'maxres': ['maxresdefault.jpg', 'sddefault.jpg', 'hqdefault.jpg'], + 'hq720': ['hq720.jpg', 'sddefault.jpg', 'hqdefault.jpg'], + 'sd': ['sddefault.jpg', 'hqdefault.jpg'], + 'hq': ['hqdefault.jpg', 'mqdefault.jpg'], + 'mq': ['mqdefault.jpg', 'default.jpg'], + 'default': ['default.jpg'], + } + + qualities = quality_order.get(quality, quality_order['hq720']) + base_url = f"{settings.img_prefix}https://i.ytimg.com/vi/{video_id}/" + + # For now, return the highest quality URL + # The browser will handle 404s gracefully with alt text + return base_url + qualities[0], qualities[0] + + +def get_best_thumbnail_url(video_id): + """Get the best available thumbnail URL for a video. + + Tries hq720 first (for HD videos), falls back to sddefault for SD videos. + """ + return get_thumbnail_url(video_id, quality='hq720')[0] def seconds_to_timestamp(seconds): @@ -475,10 +638,23 @@ def update_query_string(query_string, items): return urllib.parse.urlencode(parameters, doseq=True) +YOUTUBE_DOMAINS = ('youtube.com', 'youtu.be', 'youtube-nocookie.com') +YOUTUBE_URL_RE_STR = r'https?://(?:[a-zA-Z0-9_-]*\.)?(?:' +YOUTUBE_URL_RE_STR += r'|'.join(map(re.escape, YOUTUBE_DOMAINS)) +YOUTUBE_URL_RE_STR += r')(?:/[^"]*)?' +YOUTUBE_URL_RE = re.compile(YOUTUBE_URL_RE_STR) + + def prefix_url(url): if url is None: return None url = url.lstrip('/') # some urls have // before them, which has a special meaning + + # Increase resolution for YouTube channel avatars + if url and ('ggpht.com' in url or 'yt3.ggpht.com' in url): + # Replace size parameter with higher resolution (s240 instead of s88) + url = re.sub(r'=s\d+-c-k', '=s240-c-k-c0x00ffffff-no-rj', url) + return '/' + url @@ -517,11 +693,11 @@ def add_extra_html_info(item): item['url'] = (URL_ORIGIN + '/watch?v=' + item['id']) if item.get('id') else None video_info = {} - for key in ('id', 'title', 'author', 'duration'): + for key in ('id', 'title', 'author', 'duration', 'author_id'): try: video_info[key] = item[key] except KeyError: - video_info[key] = '' + video_info[key] = None item['video_info'] = json.dumps(video_info) @@ -536,6 +712,9 @@ def add_extra_html_info(item): elif item['type'] == 'channel': item['url'] = concat_or_none(URL_ORIGIN, "/channel/", item['id']) + if item.get('author_id') and 'author_url' not in item: + item['author_url'] = URL_ORIGIN + '/channel/' + item['author_id'] + def check_gevent_exceptions(*tasks): for task in tasks: @@ -603,8 +782,185 @@ def to_valid_filename(name): return name +# https://github.com/yt-dlp/yt-dlp/blob/master/yt_dlp/extractor/youtube.py#L72 +INNERTUBE_CLIENTS = { + 'android': { + 'INNERTUBE_API_KEY': 'AIzaSyA8eiZmM1FaDVjRy-df2KTyQ_vz_yYM39w', + 'INNERTUBE_CONTEXT': { + 'client': { + 'hl': 'en', + 'gl': 'US', + 'clientName': 'ANDROID', + 'clientVersion': '19.09.36', + 'osName': 'Android', + 'osVersion': '12', + 'androidSdkVersion': 31, + 'platform': 'MOBILE', + 'userAgent': 'com.google.android.youtube/19.09.36 (Linux; U; Android 12; US) gzip' + }, + # https://github.com/yt-dlp/yt-dlp/pull/575#issuecomment-887739287 + #'thirdParty': { + # 'embedUrl': 'https://google.com', # Can be any valid URL + #} + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 3, + 'REQUIRE_JS_PLAYER': False, + }, + + 'android-test-suite': { + 'INNERTUBE_API_KEY': 'AIzaSyA8eiZmM1FaDVjRy-df2KTyQ_vz_yYM39w', + 'INNERTUBE_CONTEXT': { + 'client': { + 'hl': 'en', + 'gl': 'US', + 'clientName': 'ANDROID_TESTSUITE', + 'clientVersion': '1.9', + 'osName': 'Android', + 'osVersion': '12', + 'androidSdkVersion': 31, + 'platform': 'MOBILE', + 'userAgent': 'com.google.android.youtube/1.9 (Linux; U; Android 12; US) gzip' + }, + # https://github.com/yt-dlp/yt-dlp/pull/575#issuecomment-887739287 + #'thirdParty': { + # 'embedUrl': 'https://google.com', # Can be any valid URL + #} + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 3, + 'REQUIRE_JS_PLAYER': False, + }, + + 'ios': { + 'INNERTUBE_API_KEY': 'AIzaSyB-63vPrdThhKuerbB2N_l7Kwwcxj6yUAc', + 'INNERTUBE_CONTEXT': { + 'client': { + 'hl': 'en', + 'gl': 'US', + 'clientName': 'IOS', + 'clientVersion': '21.03.2', + 'deviceMake': 'Apple', + 'deviceModel': 'iPhone16,2', + 'osName': 'iPhone', + 'osVersion': '18.7.2.22H124', + 'userAgent': 'com.google.ios.youtube/21.03.2 (iPhone16,2; U; CPU iOS 18_7_2 like Mac OS X)' + } + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 5, + 'REQUIRE_JS_PLAYER': False + }, + + # This client can access age restricted videos (unless the uploader has disabled the 'allow embedding' option) + # See: https://github.com/zerodytrash/YouTube-Internal-Clients + 'tv_embedded': { + 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + 'INNERTUBE_CONTEXT': { + 'client': { + 'hl': 'en', + 'gl': 'US', + 'clientName': 'TVHTML5_SIMPLY_EMBEDDED_PLAYER', + 'clientVersion': '2.0', + 'clientScreen': 'EMBED', + }, + # https://github.com/yt-dlp/yt-dlp/pull/575#issuecomment-887739287 + 'thirdParty': { + 'embedUrl': 'https://google.com', # Can be any valid URL + } + + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 85, + 'REQUIRE_JS_PLAYER': True, + }, + + 'web': { + 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'WEB', + 'clientVersion': '2.20220801.00.00', + 'userAgent': desktop_user_agent, + } + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 1 + }, + 'android_vr': { + 'INNERTUBE_API_KEY': 'AIzaSyA8eiZmM1FaDVjRy-df2KTyQ_vz_yYM39w', + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'ANDROID_VR', + 'clientVersion': '1.60.19', + 'deviceMake': 'Oculus', + 'deviceModel': 'Quest 3', + 'androidSdkVersion': 32, + 'userAgent': 'com.google.android.apps.youtube.vr.oculus/1.60.19 (Linux; U; Android 12L; eureka-user Build/SQ3A.220605.009.A1) gzip', + 'osName': 'Android', + 'osVersion': '12L', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 28, + 'REQUIRE_JS_PLAYER': False, + }, +} + +def get_visitor_data(): + visitor_data = None + visitor_data_cache = os.path.join(settings.data_dir, 'visitorData.txt') + os.makedirs(settings.data_dir, exist_ok=True) + if os.path.isfile(visitor_data_cache): + with open(visitor_data_cache, 'r') as file: + print('Getting visitor_data from cache') + visitor_data = file.read() + max_age = 12*3600 + file_age = time.time() - os.path.getmtime(visitor_data_cache) + if file_age > max_age: + print('visitor_data cache is too old. Removing file...') + os.remove(visitor_data_cache) + return visitor_data + + print('Fetching youtube homepage to get visitor_data') + yt_homepage = 'https://www.youtube.com' + yt_resp = fetch_url(yt_homepage, headers={'User-Agent': mobile_user_agent}, report_text='Getting youtube homepage') + visitor_data_re = r'''"visitorData":\s*?"(.+?)"''' + visitor_data_match = re.search(visitor_data_re, yt_resp.decode()) + if visitor_data_match: + visitor_data = visitor_data_match.group(1) + print(f'Got visitor_data: {len(visitor_data)}') + with open(visitor_data_cache, 'w') as file: + print('Saving visitor_data cache...') + file.write(visitor_data) + return visitor_data + else: + print('Unable to get visitor_data value') + return visitor_data + +def call_youtube_api(client, api, data): + client_params = INNERTUBE_CLIENTS[client] + context = client_params['INNERTUBE_CONTEXT'] + key = client_params['INNERTUBE_API_KEY'] + host = client_params.get('INNERTUBE_HOST') or 'www.youtube.com' + user_agent = context['client'].get('userAgent') or mobile_user_agent + visitor_data = get_visitor_data() + + url = 'https://' + host + '/youtubei/v1/' + api + '?key=' + key + if visitor_data: + context['client'].update({'visitorData': visitor_data}) + data['context'] = context + + data = json.dumps(data) + headers = (('Content-Type', 'application/json'),('User-Agent', user_agent)) + if visitor_data: + headers = ( *headers, ('X-Goog-Visitor-Id', visitor_data )) + response = fetch_url( + url, data=data, headers=headers, + debug_name='youtubei_' + api + '_' + client, + report_text='Fetched ' + client + ' youtubei ' + api + ).decode('utf-8') + return response + + def strip_non_ascii(string): ''' Returns the string without non ASCII characters''' + if string is None: + return "" stripped = (c for c in string if 0 < ord(c) < 127) return ''.join(stripped) |
