diff options
Diffstat (limited to 'youtube')
| -rw-r--r-- | youtube/__init__.py | 24 | ||||
| -rw-r--r-- | youtube/i18n_strings.py | 112 | ||||
| -rw-r--r-- | youtube/templates/base.html | 30 | ||||
| -rw-r--r-- | youtube/templates/settings.html | 14 | ||||
| -rw-r--r-- | youtube/watch.py | 32 | ||||
| -rw-r--r-- | youtube/ytdlp_integration.py | 78 | ||||
| -rw-r--r-- | youtube/ytdlp_proxy.py | 99 | ||||
| -rw-r--r-- | youtube/ytdlp_service.py | 390 |
8 files changed, 761 insertions, 18 deletions
diff --git a/youtube/__init__.py b/youtube/__init__.py index 0072f74..d52ea98 100644 --- a/youtube/__init__.py +++ b/youtube/__init__.py @@ -7,12 +7,36 @@ import settings import traceback import re from sys import exc_info +from flask_babel import Babel + yt_app = flask.Flask(__name__) yt_app.config['TEMPLATES_AUTO_RELOAD'] = True yt_app.url_map.strict_slashes = False # yt_app.jinja_env.trim_blocks = True # yt_app.jinja_env.lstrip_blocks = True +# Configure Babel for i18n +import os +yt_app.config['BABEL_DEFAULT_LOCALE'] = 'en' +# Use absolute path for translations directory to avoid issues with package structure changes +_app_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +yt_app.config['BABEL_TRANSLATION_DIRECTORIES'] = os.path.join(_app_root, 'translations') + +def get_locale(): + """Determine the best locale based on user preference or browser settings""" + # Check if user has a language preference in settings + if hasattr(settings, 'language') and settings.language: + locale = settings.language + print(f'[i18n] Using user preference: {locale}') + return locale + # Otherwise, use browser's Accept-Language header + # Only match languages with available translations + locale = request.accept_languages.best_match(['en', 'es']) + print(f'[i18n] Using browser language: {locale}') + return locale or 'en' + +babel = Babel(yt_app, locale_selector=get_locale) + yt_app.add_url_rule('/settings', 'settings_page', settings.settings_page, methods=['POST', 'GET']) diff --git a/youtube/i18n_strings.py b/youtube/i18n_strings.py new file mode 100644 index 0000000..47a13a3 --- /dev/null +++ b/youtube/i18n_strings.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python3 +""" +Centralized i18n strings for yt-local + +This file contains static strings that need to be translated but are used +dynamically in templates or generated content. By importing this module, +these strings get extracted by babel for translation. +""" + +from flask_babel import lazy_gettext as _l + +# Settings categories +CATEGORY_NETWORK = _l('Network') +CATEGORY_PLAYBACK = _l('Playback') +CATEGORY_INTERFACE = _l('Interface') + +# Common setting labels +ROUTE_TOR = _l('Route Tor') +DEFAULT_SUBTITLES_MODE = _l('Default subtitles mode') +AV1_CODEC_RANKING = _l('AV1 Codec Ranking') +VP8_VP9_CODEC_RANKING = _l('VP8/VP9 Codec Ranking') +H264_CODEC_RANKING = _l('H.264 Codec Ranking') +USE_INTEGRATED_SOURCES = _l('Use integrated sources') +ROUTE_IMAGES = _l('Route images') +ENABLE_COMMENTS_JS = _l('Enable comments.js') +ENABLE_SPONSORBLOCK = _l('Enable SponsorBlock') +ENABLE_EMBED_PAGE = _l('Enable embed page') + +# Setting names (auto-generated from setting keys) +RELATED_VIDEOS_MODE = _l('Related videos mode') +COMMENTS_MODE = _l('Comments mode') +ENABLE_COMMENT_AVATARS = _l('Enable comment avatars') +DEFAULT_COMMENT_SORTING = _l('Default comment sorting') +THEATER_MODE = _l('Theater mode') +AUTOPLAY_VIDEOS = _l('Autoplay videos') +DEFAULT_RESOLUTION = _l('Default resolution') +USE_VIDEO_PLAYER = _l('Use video player') +USE_VIDEO_DOWNLOAD = _l('Use video download') +PROXY_IMAGES = _l('Proxy images') +THEME = _l('Theme') +FONT = _l('Font') +LANGUAGE = _l('Language') +EMBED_PAGE_MODE = _l('Embed page mode') + +# Common option values +OFF = _l('Off') +ON = _l('On') +DISABLED = _l('Disabled') +ENABLED = _l('Enabled') +ALWAYS_SHOWN = _l('Always shown') +SHOWN_BY_CLICKING_BUTTON = _l('Shown by clicking button') +NATIVE = _l('Native') +NATIVE_WITH_HOTKEYS = _l('Native with hotkeys') +PLYR = _l('Plyr') + +# Theme options +LIGHT = _l('Light') +GRAY = _l('Gray') +DARK = _l('Dark') + +# Font options +BROWSER_DEFAULT = _l('Browser default') +LIBERATION_SERIF = _l('Liberation Serif') +ARIAL = _l('Arial') +VERDANA = _l('Verdana') +TAHOMA = _l('Tahoma') + +# Search and filter options +SORT_BY = _l('Sort by') +RELEVANCE = _l('Relevance') +UPLOAD_DATE = _l('Upload date') +VIEW_COUNT = _l('View count') +RATING = _l('Rating') + +# Time filters +ANY = _l('Any') +LAST_HOUR = _l('Last hour') +TODAY = _l('Today') +THIS_WEEK = _l('This week') +THIS_MONTH = _l('This month') +THIS_YEAR = _l('This year') + +# Content types +TYPE = _l('Type') +VIDEO = _l('Video') +CHANNEL = _l('Channel') +PLAYLIST = _l('Playlist') +MOVIE = _l('Movie') +SHOW = _l('Show') + +# Duration filters +DURATION = _l('Duration') +SHORT_DURATION = _l('Short (< 4 minutes)') +LONG_DURATION = _l('Long (> 20 minutes)') + +# Actions +SEARCH = _l('Search') +DOWNLOAD = _l('Download') +SUBSCRIBE = _l('Subscribe') +UNSUBSCRIBE = _l('Unsubscribe') +IMPORT = _l('Import') +EXPORT = _l('Export') +SAVE = _l('Save') +CHECK = _l('Check') +MUTE = _l('Mute') +UNMUTE = _l('Unmute') + +# Common UI elements +OPTIONS = _l('Options') +SETTINGS = _l('Settings') +ERROR = _l('Error') +LOADING = _l('loading...') diff --git a/youtube/templates/base.html b/youtube/templates/base.html index 393cc52..95207fa 100644 --- a/youtube/templates/base.html +++ b/youtube/templates/base.html @@ -35,57 +35,57 @@ </nav> <form class="form" id="site-search" action="/youtube.com/results"> <input type="search" name="search_query" class="search-box" value="{{ search_box_value }}" - {{ "autofocus" if (request.path in ("/", "/results") or error_message) else "" }} required placeholder="Type to search..."> - <button type="submit" value="Search" class="search-button">Search</button> + {{ "autofocus" if (request.path in ("/", "/results") or error_message) else "" }} required placeholder="{{ _('Type to search...') }}"> + <button type="submit" value="Search" class="search-button">{{ _('Search') }}</button> <!-- options --> <div class="dropdown"> <!-- hidden box --> <input id="options-toggle-cbox" class="opt-box" type="checkbox"> <!-- end hidden box --> - <label class="dropdown-label" for="options-toggle-cbox">Options</label> + <label class="dropdown-label" for="options-toggle-cbox">{{ _('Options') }}</label> <div class="dropdown-content"> - <h3>Sort by</h3> + <h3>{{ _('Sort by') }}</h3> <div class="option"> <input type="radio" id="sort_relevance" name="sort" value="0"> - <label for="sort_relevance">Relevance</label> + <label for="sort_relevance">{{ _('Relevance') }}</label> </div> <div class="option"> <input type="radio" id="sort_upload_date" name="sort" value="2"> - <label for="sort_upload_date">Upload date</label> + <label for="sort_upload_date">{{ _('Upload date') }}</label> </div> <div class="option"> <input type="radio" id="sort_view_count" name="sort" value="3"> - <label for="sort_view_count">View count</label> + <label for="sort_view_count">{{ _('View count') }}</label> </div> <div class="option"> <input type="radio" id="sort_rating" name="sort" value="1"> - <label for="sort_rating">Rating</label> + <label for="sort_rating">{{ _('Rating') }}</label> </div> - <h3>Upload date</h3> + <h3>{{ _('Upload date') }}</h3> <div class="option"> <input type="radio" id="time_any" name="time" value="0"> - <label for="time_any">Any</label> + <label for="time_any">{{ _('Any') }}</label> </div> <div class="option"> <input type="radio" id="time_last_hour" name="time" value="1"> - <label for="time_last_hour">Last hour</label> + <label for="time_last_hour">{{ _('Last hour') }}</label> </div> <div class="option"> <input type="radio" id="time_today" name="time" value="2"> - <label for="time_today">Today</label> + <label for="time_today">{{ _('Today') }}</label> </div> <div class="option"> <input type="radio" id="time_this_week" name="time" value="3"> - <label for="time_this_week">This week</label> + <label for="time_this_week">{{ _('This week') }}</label> </div> <div class="option"> <input type="radio" id="time_this_month" name="time" value="4"> - <label for="time_this_month">This month</label> + <label for="time_this_month">{{ _('This month') }}</label> </div> <div class="option"> <input type="radio" id="time_this_year" name="time" value="5"> - <label for="time_this_year">This year</label> + <label for="time_this_year">{{ _('This year') }}</label> </div> <h3>Type</h3> diff --git a/youtube/templates/settings.html b/youtube/templates/settings.html index a4ebabf..a5bb1e4 100644 --- a/youtube/templates/settings.html +++ b/youtube/templates/settings.html @@ -31,11 +31,19 @@ <input type="number" id="{{ 'setting_' + setting_name }}" name="{{ setting_name }}" value="{{ value }}" step="1"> {% endif %} {% elif setting_info['type'].__name__ == 'float' %} - + <input type="number" id="{{ 'setting_' + setting_name }}" name="{{ setting_name }}" value="{{ value }}" step="0.01"> {% elif setting_info['type'].__name__ == 'str' %} - <input type="text" id="{{ 'setting_' + setting_name }}" name="{{ setting_name }}" value="{{ value }}"> + {% if 'options' is in(setting_info) %} + <select id="{{ 'setting_' + setting_name }}" name="{{ setting_name }}"> + {% for option in setting_info['options'] %} + <option value="{{ option[0] }}" {{ 'selected' if option[0] == value else '' }}>{{ option[1] }}</option> + {% endfor %} + </select> + {% else %} + <input type="text" id="{{ 'setting_' + setting_name }}" name="{{ setting_name }}" value="{{ value }}"> + {% endif %} {% else %} - <span>Error: Unknown setting type: setting_info['type'].__name__</span> + <span>Error: Unknown setting type: {{ setting_info['type'].__name__ }}</span> {% endif %} </li> {% endif %} diff --git a/youtube/watch.py b/youtube/watch.py index 0274cd0..aa286e2 100644 --- a/youtube/watch.py +++ b/youtube/watch.py @@ -6,6 +6,9 @@ import settings from flask import request import flask +import logging + +logger = logging.getLogger(__name__) import json import gevent @@ -685,6 +688,18 @@ def get_watch_page(video_id=None): pair_sources = source_info['pair_sources'] uni_idx, pair_idx = source_info['uni_idx'], source_info['pair_idx'] + # Extract audio tracks using yt-dlp for multi-language support + audio_tracks = [] + try: + from youtube import ytdlp_integration + ytdlp_info = ytdlp_integration.extract_video_info_ytdlp(video_id) + audio_tracks = ytdlp_info.get('audio_tracks', []) + if audio_tracks: + logger.info(f'Found {len(audio_tracks)} audio tracks for video {video_id}') + except Exception as e: + logger.warning(f'Failed to extract audio tracks: {e}') + audio_tracks = [] + pair_quality = yt_data_extract.deep_get(pair_sources, pair_idx, 'quality') uni_quality = yt_data_extract.deep_get(uni_sources, uni_idx, 'quality') @@ -808,7 +823,9 @@ def get_watch_page(video_id=None): 'playlist': info['playlist'], 'related': info['related_videos'], 'playability_error': info['playability_error'], + 'audio_tracks': audio_tracks, }, + audio_tracks = audio_tracks, font_family = youtube.font_choices[settings.font], # for embed page **source_info, using_pair_sources = using_pair_sources, @@ -884,3 +901,18 @@ def get_transcript(caption_path): return flask.Response(result.encode('utf-8'), mimetype='text/plain;charset=UTF-8') + + +# ============================================================================ +# yt-dlp Integration Routes +# ============================================================================ + +@yt_app.route('/ytl-api/video-with-audio/<video_id>') +def proxy_video_with_audio(video_id): + """ + Proxy para servir video con audio específico usando yt-dlp + """ + from youtube import ytdlp_proxy + audio_lang = request.args.get('lang', 'en') + max_quality = int(request.args.get('quality', 720)) + return ytdlp_proxy.stream_video_with_audio(video_id, audio_lang, max_quality) diff --git a/youtube/ytdlp_integration.py b/youtube/ytdlp_integration.py new file mode 100644 index 0000000..90a749d --- /dev/null +++ b/youtube/ytdlp_integration.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python3 +""" +yt-dlp integration wrapper for backward compatibility. + +This module now uses the centralized ytdlp_service for all operations. +""" +import logging +from youtube.ytdlp_service import ( + extract_video_info, + get_language_name, + clear_cache, + get_cache_info, +) + +logger = logging.getLogger(__name__) + + +def extract_video_info_ytdlp(video_id): + """ + Extract video information using yt-dlp (with caching). + + This is a wrapper around ytdlp_service.extract_video_info() + for backward compatibility. + + Args: + video_id: YouTube video ID + + Returns: + Dictionary with audio_tracks, formats, title, duration + """ + logger.debug(f'Extracting video info (legacy API): {video_id}') + + info = extract_video_info(video_id) + + # Convert to legacy format for backward compatibility + return { + 'audio_tracks': info.get('audio_tracks', []), + 'all_audio_formats': info.get('formats', []), + 'formats': info.get('formats', []), + 'title': info.get('title', ''), + 'duration': info.get('duration', 0), + 'error': info.get('error'), + } + + +def get_audio_formats_for_language(video_id, language='en'): + """ + Get available audio formats for a specific language. + + Args: + video_id: YouTube video ID + language: Language code (default: 'en') + + Returns: + List of audio format dicts + """ + info = extract_video_info_ytdlp(video_id) + + if 'error' in info: + logger.warning(f'Cannot get audio formats: {info["error"]}') + return [] + + audio_formats = [] + for track in info.get('audio_tracks', []): + if track['language'] == language: + audio_formats.append(track) + + logger.debug(f'Found {len(audio_formats)} {language} audio formats') + return audio_formats + + +__all__ = [ + 'extract_video_info_ytdlp', + 'get_audio_formats_for_language', + 'get_language_name', + 'clear_cache', + 'get_cache_info', +] diff --git a/youtube/ytdlp_proxy.py b/youtube/ytdlp_proxy.py new file mode 100644 index 0000000..4eb7a99 --- /dev/null +++ b/youtube/ytdlp_proxy.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python3 +""" +Proxy for serving videos with specific audio using yt-dlp. + +This module provides streaming functionality for unified formats +with specific audio languages. +""" +import logging +from flask import Response, request, stream_with_context +import urllib.request +import urllib.error +from youtube.ytdlp_service import find_best_unified_format + +logger = logging.getLogger(__name__) + + +def stream_video_with_audio(video_id: str, audio_language: str = 'en', max_quality: int = 720): + """ + Stream video with specific audio language. + + Args: + video_id: YouTube video ID + audio_language: Preferred audio language (default: 'en') + max_quality: Maximum video height (default: 720) + + Returns: + Flask Response with video stream, or 404 if not available + """ + logger.info(f'Stream request: {video_id} | audio={audio_language} | quality={max_quality}p') + + # Find best unified format + best_format = find_best_unified_format(video_id, audio_language, max_quality) + + if not best_format: + logger.info(f'No suitable unified format found, returning 404 to trigger fallback') + return Response('No suitable unified format available', status=404) + + url = best_format.get('url') + if not url: + logger.error('Format found but no URL available') + return Response('Format URL not available', status=500) + + logger.debug(f'Streaming from: {url[:80]}...') + + # Stream the video + try: + req = urllib.request.Request(url) + req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36') + req.add_header('Accept', '*/*') + + # Add Range header if client requests it + if 'Range' in request.headers: + req.add_header('Range', request.headers['Range']) + logger.debug(f'Range request: {request.headers["Range"]}') + + resp = urllib.request.urlopen(req, timeout=60) + + def generate(): + """Generator for streaming video chunks.""" + try: + while True: + chunk = resp.read(65536) # 64KB chunks + if not chunk: + break + yield chunk + except Exception as e: + logger.error(f'Stream error: {e}') + raise + + # Build response headers + response_headers = { + 'Content-Type': resp.headers.get('Content-Type', 'video/mp4'), + 'Access-Control-Allow-Origin': '*', + } + + # Copy important headers + for header in ['Content-Length', 'Content-Range', 'Accept-Ranges']: + if header in resp.headers: + response_headers[header] = resp.headers[header] + + status_code = resp.getcode() + logger.info(f'Streaming started: {status_code}') + + return Response( + stream_with_context(generate()), + status=status_code, + headers=response_headers, + direct_passthrough=True + ) + + except urllib.error.HTTPError as e: + logger.error(f'HTTP error streaming: {e.code} {e.reason}') + return Response(f'Error: {e.code} {e.reason}', status=e.code) + except urllib.error.URLError as e: + logger.error(f'URL error streaming: {e.reason}') + return Response(f'Network error: {e.reason}', status=502) + except Exception as e: + logger.error(f'Streaming error: {e}', exc_info=True) + return Response(f'Error: {e}', status=500) diff --git a/youtube/ytdlp_service.py b/youtube/ytdlp_service.py new file mode 100644 index 0000000..2520193 --- /dev/null +++ b/youtube/ytdlp_service.py @@ -0,0 +1,390 @@ +#!/usr/bin/env python3 +""" +Centralized yt-dlp integration with caching, logging, and error handling. + +This module provides a clean interface for yt-dlp functionality: +- Multi-language audio track extraction +- Subtitle extraction +- Age-restricted video support + +All yt-dlp usage should go through this module for consistency. +""" +import logging +from functools import lru_cache +from typing import Dict, List, Optional, Any +import yt_dlp +import settings + +logger = logging.getLogger(__name__) + +# Language name mapping +LANGUAGE_NAMES = { + 'en': 'English', + 'es': 'Español', + 'fr': 'Français', + 'de': 'Deutsch', + 'it': 'Italiano', + 'pt': 'Português', + 'ru': 'Русский', + 'ja': '日本語', + 'ko': '한국어', + 'zh': '中文', + 'ar': 'العربية', + 'hi': 'हिन्दी', + 'und': 'Unknown', + 'zxx': 'No linguistic content', +} + + +def get_language_name(lang_code: str) -> str: + """Convert ISO 639-1/2 language code to readable name.""" + if not lang_code: + return 'Unknown' + return LANGUAGE_NAMES.get(lang_code.lower(), lang_code.upper()) + + +def _get_ytdlp_config() -> Dict[str, Any]: + """Get yt-dlp configuration from settings.""" + config = { + 'quiet': True, + 'no_warnings': True, + 'extract_flat': False, + 'format': 'best', + 'skip_download': True, + 'socket_timeout': 30, + 'extractor_retries': 3, + 'http_chunk_size': 10485760, # 10MB + } + + # Configure Tor proxy if enabled + if settings.route_tor: + config['proxy'] = 'socks5://127.0.0.1:9150' + logger.debug('Tor proxy enabled for yt-dlp') + + # Use cookies if available + import os + cookies_file = 'youtube_cookies.txt' + if os.path.exists(cookies_file): + config['cookiefile'] = cookies_file + logger.debug('Using cookies file for yt-dlp') + + return config + + +@lru_cache(maxsize=128) +def extract_video_info(video_id: str) -> Dict[str, Any]: + """ + Extract video information using yt-dlp with caching. + + Args: + video_id: YouTube video ID + + Returns: + Dictionary with video information including audio tracks + + Caching: + Results are cached to avoid repeated requests to YouTube. + Cache size is limited to prevent memory issues. + """ + # Check if yt-dlp is enabled + if not getattr(settings, 'ytdlp_enabled', True): + logger.debug('yt-dlp integration is disabled') + return {'error': 'yt-dlp disabled', 'audio_tracks': []} + + url = f'https://www.youtube.com/watch?v={video_id}' + ydl_opts = _get_ytdlp_config() + + try: + logger.debug(f'Extracting video info: {video_id}') + + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + info = ydl.extract_info(url, download=False) + + if not info: + logger.warning(f'No info returned for video: {video_id}') + return {'error': 'No info returned', 'audio_tracks': []} + + logger.debug(f'Extracted {len(info.get("formats", []))} formats') + + # Extract audio tracks grouped by language + audio_tracks = _extract_audio_tracks(info) + + return { + 'video_id': video_id, + 'title': info.get('title', ''), + 'duration': info.get('duration', 0), + 'audio_tracks': audio_tracks, + 'formats': info.get('formats', []), + 'subtitles': info.get('subtitles', {}), + 'automatic_captions': info.get('automatic_captions', {}), + } + + except yt_dlp.utils.DownloadError as e: + logger.error(f'yt-dlp download error for {video_id}: {e}') + return {'error': str(e), 'audio_tracks': []} + except Exception as e: + logger.error(f'yt-dlp extraction error for {video_id}: {e}', exc_info=True) + return {'error': str(e), 'audio_tracks': []} + + +def _extract_audio_tracks(info: Dict[str, Any]) -> List[Dict[str, Any]]: + """ + Extract audio tracks from video info, grouped by language. + + Returns a list of unique audio tracks (one per language), + keeping the highest quality for each language. + """ + audio_by_language = {} + all_formats = info.get('formats', []) + + for fmt in all_formats: + # Only audio-only formats + has_audio = fmt.get('acodec') and fmt.get('acodec') != 'none' + has_video = fmt.get('vcodec') and fmt.get('vcodec') != 'none' + + if not has_audio or has_video: + continue + + # Extract language information + lang = ( + fmt.get('language') or + fmt.get('audio_language') or + fmt.get('lang') or + 'und' + ) + + # Get language name + lang_name = ( + fmt.get('language_name') or + fmt.get('lang_name') or + get_language_name(lang) + ) + + # Get bitrate + bitrate = fmt.get('abr') or fmt.get('tbr') or 0 + + # Create track info + track_info = { + 'language': lang, + 'language_name': lang_name, + 'format_id': str(fmt.get('format_id', '')), + 'itag': str(fmt.get('format_id', '')), + 'ext': fmt.get('ext'), + 'acodec': fmt.get('acodec'), + 'audio_bitrate': int(bitrate) if bitrate else 0, + 'audio_sample_rate': fmt.get('asr'), + 'url': fmt.get('url'), + 'filesize': fmt.get('filesize'), + } + + # Keep best quality per language + lang_key = lang.lower() + if lang_key not in audio_by_language: + audio_by_language[lang_key] = track_info + else: + current_bitrate = audio_by_language[lang_key].get('audio_bitrate', 0) + if bitrate > current_bitrate: + audio_by_language[lang_key] = track_info + logger.debug(f'Updated {lang} to higher bitrate: {bitrate}') + + # Convert to list and sort + audio_tracks = list(audio_by_language.values()) + + # Sort: English first, then by bitrate (descending) + audio_tracks.sort( + key=lambda x: ( + 0 if x['language'] == 'en' else 1, + -x.get('audio_bitrate', 0) + ) + ) + + logger.debug(f'Found {len(audio_tracks)} unique audio tracks') + for track in audio_tracks[:3]: # Log first 3 + logger.debug(f' - {track["language_name"]}: {track["audio_bitrate"]}k') + + return audio_tracks + + +def get_subtitle_url(video_id: str, lang: str = 'en') -> Optional[str]: + """ + Get subtitle URL for a specific language. + + Args: + video_id: YouTube video ID + lang: Language code (default: 'en') + + Returns: + URL to subtitle file, or None if not available + """ + info = extract_video_info(video_id) + + if 'error' in info: + logger.warning(f'Cannot get subtitles: {info["error"]}') + return None + + # Try manual subtitles first + subtitles = info.get('subtitles', {}) + if lang in subtitles: + for sub in subtitles[lang]: + if sub.get('ext') == 'vtt': + logger.debug(f'Found manual {lang} subtitle') + return sub.get('url') + + # Try automatic captions + auto_captions = info.get('automatic_captions', {}) + if lang in auto_captions: + for sub in auto_captions[lang]: + if sub.get('ext') == 'vtt': + logger.debug(f'Found automatic {lang} subtitle') + return sub.get('url') + + logger.debug(f'No {lang} subtitle found') + return None + + +def find_best_unified_format( + video_id: str, + audio_language: str = 'en', + max_quality: int = 720 +) -> Optional[Dict[str, Any]]: + """ + Find best unified (video+audio) format for specific language and quality. + + Args: + video_id: YouTube video ID + audio_language: Preferred audio language + max_quality: Maximum video height (e.g., 720, 1080) + + Returns: + Format dict if found, None otherwise + """ + info = extract_video_info(video_id) + + if 'error' in info or not info.get('formats'): + return None + + # Quality thresholds (minimum acceptable height as % of requested) + thresholds = { + 2160: 0.85, + 1440: 0.80, + 1080: 0.70, + 720: 0.70, + 480: 0.60, + 360: 0.50, + } + + # Get threshold for requested quality + threshold = 0.70 + for q, t in thresholds.items(): + if max_quality >= q: + threshold = t + break + + min_height = int(max_quality * threshold) + logger.debug(f'Quality threshold: {threshold:.0%} = min {min_height}p for {max_quality}p') + + candidates = [] + audio_lang_lower = audio_language.lower() + + for fmt in info['formats']: + # Must have both video and audio + has_video = fmt.get('vcodec') and fmt.get('vcodec') != 'none' + has_audio = fmt.get('acodec') and fmt.get('acodec') != 'none' + + if not (has_video and has_audio): + continue + + # Skip HLS/DASH formats + protocol = fmt.get('protocol', '') + format_id = str(fmt.get('format_id', '')) + + if any(x in protocol.lower() for x in ['m3u8', 'hls', 'dash']): + continue + if format_id.startswith('9'): # HLS formats + continue + + height = fmt.get('height', 0) + if height < min_height: + continue + + # Language matching + lang = ( + fmt.get('language') or + fmt.get('audio_language') or + 'en' + ).lower() + + lang_match = ( + lang == audio_lang_lower or + lang.startswith(audio_lang_lower[:2]) or + audio_lang_lower.startswith(lang[:2]) + ) + + if not lang_match: + continue + + # Calculate score + score = 0 + + # Language match bonus + if lang == audio_lang_lower: + score += 10000 + elif lang.startswith(audio_lang_lower[:2]): + score += 8000 + else: + score += 5000 + + # Quality score + quality_diff = abs(height - max_quality) + if height >= max_quality: + score += 3000 - quality_diff + else: + score += 2000 - quality_diff + + # Protocol preference + if protocol in ('https', 'http'): + score += 500 + + # Format preference + if fmt.get('ext') == 'mp4': + score += 100 + + candidates.append({ + 'format': fmt, + 'score': score, + 'height': height, + 'lang': lang, + }) + + if not candidates: + logger.debug(f'No unified format found for {max_quality}p + {audio_language}') + return None + + # Sort by score and return best + candidates.sort(key=lambda x: x['score'], reverse=True) + best = candidates[0] + + logger.info( + f'Selected unified format: {best["format"].get("format_id")} | ' + f'{best["lang"]} | {best["height"]}p | score={best["score"]}' + ) + + return best['format'] + + +def clear_cache(): + """Clear the video info cache.""" + extract_video_info.cache_clear() + logger.info('yt-dlp cache cleared') + + +def get_cache_info() -> Dict[str, Any]: + """Get cache statistics.""" + cache_info = extract_video_info.cache_info() + return { + 'hits': cache_info.hits, + 'misses': cache_info.misses, + 'size': cache_info.currsize, + 'maxsize': cache_info.maxsize, + 'hit_rate': cache_info.hits / (cache_info.hits + cache_info.misses) if (cache_info.hits + cache_info.misses) > 0 else 0, + } |
