aboutsummaryrefslogtreecommitdiffstats
path: root/youtube/yt_data_extract/watch_extraction.py
diff options
context:
space:
mode:
authorAstounds <kirito@disroot.org>2026-03-27 20:47:44 -0500
committerAstounds <kirito@disroot.org>2026-03-27 20:47:44 -0500
commit22c72aa842efa6d1dca3bb95eeb47122537ce12a (patch)
treea94cf15bd0d7748db0532f56ddefde1fda74a33d /youtube/yt_data_extract/watch_extraction.py
parent56ecd6cb1b461bd3622c669936050fa7e4d83542 (diff)
downloadyt-local-22c72aa842efa6d1dca3bb95eeb47122537ce12a.tar.lz
yt-local-22c72aa842efa6d1dca3bb95eeb47122537ce12a.tar.xz
yt-local-22c72aa842efa6d1dca3bb95eeb47122537ce12a.zip
remove yt-dlp, fix captions PO Token issue, fix 429 retry logic
- Remove yt-dlp entirely (modules, routes, settings, dependency); it was blocking page loads by running synchronously in gevent.
- Fix captions: use Android client caption URLs (no PO Token needed) instead of web timedtext URLs, which YouTube now blocks.
- Fix 429 retry: fail immediately without Tor (same IP = pointless retry); the exponential backoff was causing ~27s delays.
- Accept ytdlp_enabled as a legacy setting to avoid a warning on startup.
Diffstat (limited to 'youtube/yt_data_extract/watch_extraction.py')
-rw-r--r--youtube/yt_data_extract/watch_extraction.py20
1 files changed, 20 insertions, 0 deletions
diff --git a/youtube/yt_data_extract/watch_extraction.py b/youtube/yt_data_extract/watch_extraction.py
index e09e2d3..85c8100 100644
--- a/youtube/yt_data_extract/watch_extraction.py
+++ b/youtube/yt_data_extract/watch_extraction.py
@@ -628,6 +628,7 @@ def extract_watch_info(polymer_json):
info['manual_caption_languages'] = []
info['_manual_caption_language_names'] = {} # language name written in that language, needed in some cases to create the url
info['translation_languages'] = []
+ info['_caption_track_urls'] = {} # lang_code -> full baseUrl from player response
captions_info = player_response.get('captions', {})
info['_captions_base_url'] = normalize_url(deep_get(captions_info, 'playerCaptionsRenderer', 'baseUrl'))
# Sometimes the above playerCaptionsRender is randomly missing
@@ -658,6 +659,10 @@ def extract_watch_info(polymer_json):
else:
info['manual_caption_languages'].append(lang_code)
base_url = caption_track.get('baseUrl', '')
+ # Store the full URL from the player response (includes valid tokens)
+ if base_url:
+ normalized = normalize_url(base_url) if base_url.startswith('/') or not base_url.startswith('http') else base_url
+ info['_caption_track_urls'][lang_code + ('_asr' if caption_track.get('kind') == 'asr' else '')] = normalized
lang_name = deep_get(urllib.parse.parse_qs(urllib.parse.urlparse(base_url).query), 'name', 0)
if lang_name:
info['_manual_caption_language_names'][lang_code] = lang_name
@@ -825,6 +830,21 @@ def captions_available(info):
def get_caption_url(info, language, format, automatic=False, translation_language=None):
'''Gets the url for captions with the given language and format. If automatic is True, get the automatic captions for that language. If translation_language is given, translate the captions from `language` to `translation_language`. If automatic is true and translation_language is given, the automatic captions will be translated.'''
+ # Try to use the direct URL from the player response first (has valid tokens)
+ track_key = language + ('_asr' if automatic else '')
+ direct_url = info.get('_caption_track_urls', {}).get(track_key)
+ if direct_url:
+ url = direct_url
+ # Override format
+ if '&fmt=' in url:
+ url = re.sub(r'&fmt=[^&]*', '&fmt=' + format, url)
+ else:
+ url += '&fmt=' + format
+ if translation_language:
+ url += '&tlang=' + translation_language
+ return url
+
+ # Fallback to base_url construction
url = info['_captions_base_url']
if not url:
return None