1 files changed, 252 insertions, 15 deletions
diff --git a/youtube/util.py b/youtube/util.py
index 1142c1d..c59fae8 100644
--- a/youtube/util.py
+++ b/youtube/util.py
@@ -71,6 +71,10 @@ class TorManager:
             'socks5h://127.0.0.1:' + str(settings.tor_port) + '/',
             cert_reqs='CERT_REQUIRED')
         self.tor_pool_refresh_time = time.monotonic()
+        settings.add_setting_changed_hook(
+            'tor_port',
+            lambda old_val, new_val: self.refresh_tor_connection_pool(),
+        )
 
         self.new_identity_lock = gevent.lock.BoundedSemaphore(1)
         self.last_new_identity_time = time.monotonic() - 20
@@ -190,7 +194,11 @@ class HTTPAsymmetricCookieProcessor(urllib.request.BaseHandler):
 
 class FetchError(Exception):
     def __init__(self, code, reason='', ip=None, error_message=None):
-        Exception.__init__(self, 'HTTP error during request: ' + code + ' ' + reason)
+        if error_message:
+            string = code + ' ' + reason + ': ' + error_message
+        else:
+            string = 'HTTP error during request: ' + code + ' ' + reason
+        Exception.__init__(self, string)
         self.code = code
         self.reason = reason
         self.ip = ip
@@ -240,6 +248,7 @@ def fetch_url_response(url, headers=(), timeout=15, data=None,
         elif not isinstance(data, bytes):
             data = urllib.parse.urlencode(data).encode('utf-8')
 
+
     if cookiejar_send is not None or cookiejar_receive is not None:     # Use urllib
         req = urllib.request.Request(url, data=data, headers=headers)
 
@@ -259,13 +268,32 @@ def fetch_url_response(url, headers=(), timeout=15, data=None,
         # According to the documentation for urlopen, a redirect counts as a
         # retry. So there are 3 redirects max by default.
         if max_redirects:
-            retries = urllib3.Retry(3+max_redirects, redirect=max_redirects)
+            retries = urllib3.Retry(3+max_redirects, redirect=max_redirects, raise_on_redirect=False)
         else:
-            retries = urllib3.Retry(3)
+            retries = urllib3.Retry(3, raise_on_redirect=False)
         pool = get_pool(use_tor and settings.route_tor)
-        response = pool.request(method, url, headers=headers, body=data,
-                                timeout=timeout, preload_content=False,
-                                decode_content=False, retries=retries)
+        try:
+            response = pool.request(method, url, headers=headers, body=data,
+                                    timeout=timeout, preload_content=False,
+                                    decode_content=False, retries=retries)
+            response.retries = retries
+        except urllib3.exceptions.MaxRetryError as e:
+            exception_cause = e.__context__.__context__
+            if (isinstance(exception_cause, socks.ProxyConnectionError)
+                    and settings.route_tor):
+                msg = ('Failed to connect to Tor. Check that Tor is open and '
+                       'that your internet connection is working.\n\n'
+                       + str(e))
+                raise FetchError('502', reason='Bad Gateway',
+                                 error_message=msg)
+            elif isinstance(e.__context__,
+                            urllib3.exceptions.NewConnectionError):
+                msg = 'Failed to establish a connection.\n\n' + str(e)
+                raise FetchError(
+                    '502', reason='Bad Gateway',
+                     error_message=msg)
+            else:
+                raise
         cleanup_func = (lambda r: r.release_conn())
 
     return response, cleanup_func
@@ -290,10 +318,11 @@ def fetch_url(url, headers=(), timeout=15, report_text=None, data=None,
         cleanup_func(response)  # release_connection for urllib3
         content = decode_content(
             content,
-            response.getheader('Content-Encoding', default='identity'))
+            response.headers.get('Content-Encoding', default='identity'))
 
         if (settings.debugging_save_responses
-                and debug_name is not None and content):
+                and debug_name is not None
+                and content):
             save_dir = os.path.join(settings.data_dir, 'debug')
             if not os.path.exists(save_dir):
                 os.makedirs(save_dir)
@@ -301,17 +330,28 @@ def fetch_url(url, headers=(), timeout=15, report_text=None, data=None,
             with open(os.path.join(save_dir, debug_name), 'wb') as f:
                 f.write(content)
 
-        if response.status == 429:
+        if response.status == 429 or (
+            response.status == 302 and (response.getheader('Location') == url
+                or response.getheader('Location').startswith(
+                       'https://www.google.com/sorry/index'
+                   )
+            )
+        ):
+            print(response.status, response.reason, response.headers)
             ip = re.search(
                 br'IP address: ((?:[\da-f]*:)+[\da-f]+|(?:\d+\.)+\d+)',
                 content)
             ip = ip.group(1).decode('ascii') if ip else None
+            if not ip:
+                ip = re.search(r'IP=((?:\d+\.)+\d+)',
+                               response.getheader('Set-Cookie') or '')
+                ip = ip.group(1) if ip else None
 
             # don't get new identity if we're not using Tor
             if not use_tor:
                 raise FetchError('429', reason=response.reason, ip=ip)
 
-            print('Error: Youtube blocked the request because the Tor exit node is overutilized. Exit node IP address: %s' % ip)
+            print('Error: YouTube blocked the request because the Tor exit node is overutilized. Exit node IP address: %s' % ip)
 
             # get new identity
             error = tor_manager.new_identity(start_time)
@@ -355,11 +395,23 @@ def head(url, use_tor=False, report_text=None, max_redirects=10):
             round(time.monotonic() - start_time, 3))
     return response
 
-
 mobile_user_agent = 'Mozilla/5.0 (Linux; Android 7.0; Redmi Note 4 Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Mobile Safari/537.36'
 mobile_ua = (('User-Agent', mobile_user_agent),)
 desktop_user_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:52.0) Gecko/20100101 Firefox/52.0'
 desktop_ua = (('User-Agent', desktop_user_agent),)
+json_header = (('Content-Type', 'application/json'),)  # single-pair tuple: Content-Type for JSON bodies
+desktop_xhr_headers = (
+    ('Accept', '*/*'),
+    ('Accept-Language', 'en-US,en;q=0.5'),
+    ('X-YouTube-Client-Name', '1'),  # same number as INNERTUBE_CLIENTS['web']['INNERTUBE_CONTEXT_CLIENT_NAME']
+    ('X-YouTube-Client-Version', '2.20240304.00.00'),
+) + desktop_ua  # the desktop User-Agent pair is appended last
+mobile_xhr_headers = (
+    ('Accept', '*/*'),
+    ('Accept-Language', 'en-US,en;q=0.5'),
+    ('X-YouTube-Client-Name', '2'),  # NOTE(review): presumably the mobile-web client id — confirm
+    ('X-YouTube-Client-Version', '2.20240304.08.00'),
+) + mobile_ua  # the mobile User-Agent pair is appended last
 
 
 class RateLimitedQueue(gevent.queue.Queue):
@@ -410,7 +462,7 @@ class RateLimitedQueue(gevent.queue.Queue):
 
 
 def download_thumbnail(save_directory, video_id):
-    url = "https://i.ytimg.com/vi/" + video_id + "/mqdefault.jpg"
+    url = f"https://i.ytimg.com/vi/{video_id}/hqdefault.jpg"
     save_location = os.path.join(save_directory, video_id + ".jpg")
     try:
         thumbnail = fetch_url(url, report_text="Saved thumbnail: " + video_id)
@@ -452,7 +504,7 @@ def video_id(url):
 
 # default, sddefault, mqdefault, hqdefault, hq720
 def get_thumbnail_url(video_id):
-    return settings.img_prefix + "https://i.ytimg.com/vi/" + video_id + "/mqdefault.jpg"
+    return f"{settings.img_prefix}https://i.ytimg.com/vi/{video_id}/hqdefault.jpg"  # img_prefix is prepended directly to the absolute URL
 
 
 def seconds_to_timestamp(seconds):
@@ -475,6 +527,13 @@ def update_query_string(query_string, items):
     return urllib.parse.urlencode(parameters, doseq=True)
 
 
+YOUTUBE_DOMAINS = ('youtube.com', 'youtu.be', 'youtube-nocookie.com')
+# http(s) URL on a YouTube domain (at most one subdomain label), with an optional path ending before any '"'
+YOUTUBE_URL_RE_STR = (r'https?://(?:[a-zA-Z0-9_-]*\.)?(?:'
+                      + '|'.join(re.escape(d) for d in YOUTUBE_DOMAINS) + r')(?:/[^"]*)?')
+YOUTUBE_URL_RE = re.compile(YOUTUBE_URL_RE_STR)
+
+
 def prefix_url(url):
     if url is None:
         return None
@@ -517,11 +576,11 @@ def add_extra_html_info(item):
         item['url'] = (URL_ORIGIN + '/watch?v=' + item['id']) if item.get('id') else None
 
         video_info = {}
-        for key in ('id', 'title', 'author', 'duration'):
+        for key in ('id', 'title', 'author', 'duration', 'author_id'):
             try:
                 video_info[key] = item[key]
             except KeyError:
-                video_info[key] = ''
+                video_info[key] = None
 
         item['video_info'] = json.dumps(video_info)
 
@@ -536,6 +595,9 @@ def add_extra_html_info(item):
     elif item['type'] == 'channel':
         item['url'] = concat_or_none(URL_ORIGIN, "/channel/", item['id'])
 
+    if item.get('author_id') and 'author_url' not in item:
+        item['author_url'] = URL_ORIGIN + '/channel/' + item['author_id']
+
 
 def check_gevent_exceptions(*tasks):
     for task in tasks:
@@ -603,8 +665,183 @@ def to_valid_filename(name):
     return name
 
 
+# Innertube client profiles, keyed by client name. Reference: https://github.com/yt-dlp/yt-dlp/blob/master/yt_dlp/extractor/youtube.py#L72
+INNERTUBE_CLIENTS = {
+    'android': {
+        'INNERTUBE_API_KEY': 'AIzaSyA8eiZmM1FaDVjRy-df2KTyQ_vz_yYM39w',
+        'INNERTUBE_CONTEXT': {
+            'client': {
+                'hl': 'en',
+                'gl': 'US',
+                'clientName': 'ANDROID',
+                'clientVersion': '19.09.36',
+                'osName': 'Android',
+                'osVersion': '12',
+                'androidSdkVersion': 31,
+                'platform': 'MOBILE',
+                'userAgent': 'com.google.android.youtube/19.09.36 (Linux; U; Android 12; US) gzip'
+            },
+            # https://github.com/yt-dlp/yt-dlp/pull/575#issuecomment-887739287
+            #'thirdParty': {
+            #    'embedUrl': 'https://google.com',  # Can be any valid URL
+            #}
+        },
+        'INNERTUBE_CONTEXT_CLIENT_NAME': 3,
+        'REQUIRE_JS_PLAYER': False,
+    },
+
+    'android-test-suite': {
+        'INNERTUBE_API_KEY': 'AIzaSyA8eiZmM1FaDVjRy-df2KTyQ_vz_yYM39w',
+        'INNERTUBE_CONTEXT': {
+            'client': {
+                'hl': 'en',
+                'gl': 'US',
+                'clientName': 'ANDROID_TESTSUITE',
+                'clientVersion': '1.9',
+                'osName': 'Android',
+                'osVersion': '12',
+                'androidSdkVersion': 31,
+                'platform': 'MOBILE',
+                'userAgent': 'com.google.android.youtube/1.9 (Linux; U; Android 12; US) gzip'
+            },
+            # https://github.com/yt-dlp/yt-dlp/pull/575#issuecomment-887739287
+            #'thirdParty': {
+            #    'embedUrl': 'https://google.com',  # Can be any valid URL
+            #}
+        },
+        'INNERTUBE_CONTEXT_CLIENT_NAME': 3,
+        'REQUIRE_JS_PLAYER': False,
+    },
+
+    'ios': {
+        'INNERTUBE_API_KEY': 'AIzaSyB-63vPrdThhKuerbB2N_l7Kwwcxj6yUAc',
+        'INNERTUBE_CONTEXT': {
+            'client': {
+                'hl': 'en',
+                'gl': 'US',
+                'clientName': 'IOS',
+                'clientVersion': '19.09.3',
+                'deviceModel': 'iPhone14,3',
+                'userAgent': 'com.google.ios.youtube/19.09.3 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)'
+            }
+        },
+        'INNERTUBE_CONTEXT_CLIENT_NAME': 5,
+        'REQUIRE_JS_PLAYER': False
+    },
+
+    # This client can access age restricted videos (unless the uploader has disabled the 'allow embedding' option)
+    # See: https://github.com/zerodytrash/YouTube-Internal-Clients
+    'tv_embedded': {
+        'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
+        'INNERTUBE_CONTEXT': {
+            'client': {
+                'hl': 'en',
+                'gl': 'US',
+                'clientName': 'TVHTML5_SIMPLY_EMBEDDED_PLAYER',
+                'clientVersion': '2.0',
+                'clientScreen': 'EMBED',
+            },
+            # https://github.com/yt-dlp/yt-dlp/pull/575#issuecomment-887739287
+            'thirdParty': {
+                'embedUrl': 'https://google.com',  # Can be any valid URL
+            }
+
+        },
+        'INNERTUBE_CONTEXT_CLIENT_NAME': 85,
+        'REQUIRE_JS_PLAYER': True,
+    },
+
+    'web': {
+        'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
+        'INNERTUBE_CONTEXT': {
+            'client': {
+                'clientName': 'WEB',
+                'clientVersion': '2.20220801.00.00',
+                'userAgent': desktop_user_agent,
+            }
+        },
+        'INNERTUBE_CONTEXT_CLIENT_NAME': 1  # NOTE(review): no 'REQUIRE_JS_PLAYER' key here, unlike the other entries — confirm readers use .get()
+    },
+    'android_vr': {
+        'INNERTUBE_API_KEY': 'AIzaSyA8eiZmM1FaDVjRy-df2KTyQ_vz_yYM39w',
+        'INNERTUBE_CONTEXT': {
+            'client': {
+                'clientName': 'ANDROID_VR',
+                'clientVersion': '1.60.19',
+                'deviceMake': 'Oculus',
+                'deviceModel': 'Quest 3',
+                'androidSdkVersion': 32,
+                'userAgent': 'com.google.android.apps.youtube.vr.oculus/1.60.19 (Linux; U; Android 12L; eureka-user Build/SQ3A.220605.009.A1) gzip',
+                'osName': 'Android',
+                'osVersion': '12L',
+            },
+        },
+        'INNERTUBE_CONTEXT_CLIENT_NAME': 28,
+        'REQUIRE_JS_PLAYER': False,
+    },
+}
+
+def get_visitor_data():
+    '''Return YouTube's visitorData token, or None if it cannot be obtained.
+
+    A cached value under 12 hours old is reused; an expired or empty cache
+    file is removed and the token is re-read from the YouTube homepage.'''
+    visitor_data_cache = os.path.join(settings.data_dir, 'visitorData.txt')
+    max_age = 12*3600
+    os.makedirs(settings.data_dir, exist_ok=True)  # avoids the exists()/makedirs() race
+    if os.path.isfile(visitor_data_cache):
+        file_age = time.time() - os.path.getmtime(visitor_data_cache)
+        if file_age <= max_age:
+            with open(visitor_data_cache, 'r') as file:
+                print('Getting visitor_data from cache')
+                visitor_data = file.read().strip()
+            if visitor_data:
+                return visitor_data
+        print('visitor_data cache is too old or empty. Removing file...')  # then fall through and refetch
+        os.remove(visitor_data_cache)
+
+    print('Fetching youtube homepage to get visitor_data')
+    yt_resp = fetch_url('https://www.youtube.com', headers={'User-Agent': mobile_user_agent}, report_text='Getting youtube homepage')
+    visitor_data_match = re.search(r'''"visitorData":\s*?"(.+?)"''', yt_resp.decode())
+    if not visitor_data_match:
+        print('Unable to get visitor_data value')
+        return None
+    visitor_data = visitor_data_match.group(1)
+    print(f'Got visitor_data: {len(visitor_data)}')
+    with open(visitor_data_cache, 'w') as file:
+        print('Saving visitor_data cache...')
+        file.write(visitor_data)
+    return visitor_data
+
+def call_youtube_api(client, api, data):
+    '''Send `data` as a JSON body to the youtubei/v1 `api` endpoint; return the UTF-8 decoded response.
+
+    `client` must be a key of INNERTUBE_CLIENTS. Neither that table nor the
+    caller's `data` is modified: the request context is built on copies.'''
+    client_params = INNERTUBE_CLIENTS[client]
+    template = client_params['INNERTUBE_CONTEXT']
+    # Copy the two levels written below so visitorData never leaks into the shared table
+    context = dict(template, client=dict(template['client']))
+
+    host = client_params.get('INNERTUBE_HOST') or 'www.youtube.com'
+    url = 'https://' + host + '/youtubei/v1/' + api + '?key=' + client_params['INNERTUBE_API_KEY']
+    headers = (('Content-Type', 'application/json'),
+               ('User-Agent', context['client'].get('userAgent') or mobile_user_agent))
+    visitor_data = get_visitor_data()
+    if visitor_data:
+        context['client']['visitorData'] = visitor_data
+        headers = (*headers, ('X-Goog-Visitor-Id', visitor_data))
+    return fetch_url(
+        url, data=json.dumps({**data, 'context': context}), headers=headers,
+        debug_name='youtubei_' + api + '_' + client,
+        report_text='Fetched ' + client + ' youtubei ' + api
+    ).decode('utf-8')
+
+
 def strip_non_ascii(string):
     ''' Returns the string without non ASCII characters'''
+    if string is None:  # a missing value is treated as empty text instead of failing on iteration
+        return ""
     stripped = (c for c in string if 0 < ord(c) < 127)
     return ''.join(stripped)