From 22c72aa842efa6d1dca3bb95eeb47122537ce12a Mon Sep 17 00:00:00 2001 From: Astounds Date: Fri, 27 Mar 2026 20:47:44 -0500 Subject: remove yt-dlp, fix captions PO Token issue, fix 429 retry logic - Remove yt-dlp entirely (modules, routes, settings, dependency) Was blocking page loads by running synchronously in gevent - Fix captions: use Android client caption URLs (no PO Token needed) instead of web timedtext URLs that YouTube now blocks - Fix 429 retry: fail immediately without Tor (same IP = pointless retry) Was causing ~27s delays with exponential backoff - Accept ytdlp_enabled as legacy setting to avoid warning on startup --- youtube/yt_data_extract/watch_extraction.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'youtube/yt_data_extract') diff --git a/youtube/yt_data_extract/watch_extraction.py b/youtube/yt_data_extract/watch_extraction.py index e09e2d3..85c8100 100644 --- a/youtube/yt_data_extract/watch_extraction.py +++ b/youtube/yt_data_extract/watch_extraction.py @@ -628,6 +628,7 @@ def extract_watch_info(polymer_json): info['manual_caption_languages'] = [] info['_manual_caption_language_names'] = {} # language name written in that language, needed in some cases to create the url info['translation_languages'] = [] + info['_caption_track_urls'] = {} # lang_code -> full baseUrl from player response captions_info = player_response.get('captions', {}) info['_captions_base_url'] = normalize_url(deep_get(captions_info, 'playerCaptionsRenderer', 'baseUrl')) # Sometimes the above playerCaptionsRender is randomly missing @@ -658,6 +659,10 @@ def extract_watch_info(polymer_json): else: info['manual_caption_languages'].append(lang_code) base_url = caption_track.get('baseUrl', '') + # Store the full URL from the player response (includes valid tokens) + if base_url: + normalized = normalize_url(base_url) if base_url.startswith('/') or not base_url.startswith('http') else base_url + info['_caption_track_urls'][lang_code + ('_asr' if caption_track.get('kind') == 'asr' else '')] = normalized lang_name = deep_get(urllib.parse.parse_qs(urllib.parse.urlparse(base_url).query), 'name', 0) if lang_name: info['_manual_caption_language_names'][lang_code] = lang_name @@ -825,6 +830,21 @@ def captions_available(info): def get_caption_url(info, language, format, automatic=False, translation_language=None): '''Gets the url for captions with the given language and format. If automatic is True, get the automatic captions for that language. If translation_language is given, translate the captions from `language` to `translation_language`. If automatic is true and translation_language is given, the automatic captions will be translated.''' + # Try to use the direct URL from the player response first (has valid tokens) + track_key = language + ('_asr' if automatic else '') + direct_url = info.get('_caption_track_urls', {}).get(track_key) + if direct_url: + url = direct_url + # Override format + if '&fmt=' in url: + url = re.sub(r'&fmt=[^&]*', '&fmt=' + format, url) + else: + url += '&fmt=' + format + if translation_language: + url += '&tlang=' + translation_language + return url + + # Fallback to base_url construction url = info['_captions_base_url'] if not url: return None -- cgit v1.2.3