author     Jesús <heckyel@hyperbola.info>  2021-10-31 11:36:52 -0500
committer  Jesús <heckyel@hyperbola.info>  2021-10-31 11:36:52 -0500
commit     5bb25093eb718346ab8a723d2c04f0066fc3958a
tree       8a7fa5611895a933eaf1ef1623f7b9e1a1c36157
parent     c7afb25e19a91493db6069d1db9f7d1bc8491dc1
parent     652fb0d446524af4b783276babd55f5fc6a3afeb
updated from upstream | 31/10/2021 at 11:36
49 files changed, 1812 insertions, 972 deletions
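Among the changes recorded in the changelog below is support for duration strings in `--match-filter`. As a minimal sketch of that feature through the Python API (the URL is a placeholder, and the filter value is an assumption based on the changelog entry, not taken from this diff):

```python
# Hedged sketch: skip videos 10 minutes or longer.
# 'duration < 10:00' relies on the new duration-string support in --match-filter.
from yt_dlp import YoutubeDL
from yt_dlp.utils import match_filter_func

ydl_opts = {'match_filter': match_filter_func('duration < 10:00')}
with YoutubeDL(ydl_opts) as ydl:
    ydl.download(['https://example.com/watch?v=xxxx'])  # placeholder URL
```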
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
index 048d98852..2bf96affe 100644
--- a/CONTRIBUTORS
+++ b/CONTRIBUTORS
@@ -125,3 +125,7 @@ jfogelman
 timethrow
 sarnoud
 Bojidarist
+18928172992817182/gustaf
+nixklai
+smplayer-dev
+Zirro
diff --git a/Changelog.md b/Changelog.md
index 90f9bdafb..b46199168 100644
--- a/Changelog.md
+++ b/Changelog.md
@@ -14,6 +14,84 @@
 -->
 
+### 2021.10.22
+
+* [build] Improvements
+    * Build standalone MacOS packages by [smplayer-dev](https://github.com/smplayer-dev)
+    * Release windows exe built with `py2exe`
+    * Enable lazy-extractors in releases.
+        * Set env var `YTDLP_NO_LAZY_EXTRACTORS` to forcefully disable this (experimental)
+    * Clean up error reporting in update
+    * Refactor `pyinst.py`, misc cleanup and improve docs
+* [docs] Migrate issues to use forms by [Ashish0804](https://github.com/Ashish0804)
+* [downloader] **Fix slow progress hooks**
+    * This was causing HLS/DASH downloads to be extremely slow in some situations
+* [downloader/ffmpeg] Improve simultaneous download and merge
+* [EmbedMetadata] Allow overwriting all default metadata with `meta_default` key
+* [ModifyChapters] Add ability for `--remove-chapters` to remove sections by timestamp
+* [utils] Allow duration strings in `--match-filter`
+* Add HDR information to formats
+* Add negative option `--no-batch-file` by [Zirro](https://github.com/Zirro)
+* Calculate more fields for merged formats
+* Do not verify thumbnail URLs unless `--check-formats` is specified
+* Don't create console for subprocesses on Windows
+* Fix `--restrict-filename` when used with default template
+* Fix `check_formats` output being written to stdout when `-qv`
+* Fix bug in storyboards
+* Fix conflict b/w id and ext in format selection
+* Fix verbose head not showing custom configs
+* Load archive only after printing verbose head
+* Make `duration_string` and `resolution` available in --match-filter
+* Re-implement deprecated option `--id`
+* Reduce default `--socket-timeout`
+* Write verbose header to logger
+* [outtmpl] Fix bug in expanding environment variables
+* [cookies] Local State should be opened as utf-8
+* [extractor,utils] Detect more codecs/mimetypes
+* [extractor] Detect `EXT-X-KEY` Apple FairPlay
+* [utils] Use `importlib` to load plugins by [sulyi](https://github.com/sulyi)
+* [http] Retry on socket timeout and show the last encountered error
+* [fragment] Print error message when skipping fragment
+* [aria2c] Fix `--skip-unavailable-fragment`
+* [SponsorBlock] Obey `extractor-retries` and `sleep-requests`
+* [Merger] Do not add `aac_adtstoasc` to non-hls audio
+* [ModifyChapters] Do not mutate original chapters by [nihil-admirari](https://github.com/nihil-admirari)
+* [devscripts/run_tests] Use markers to filter tests by [sulyi](https://github.com/sulyi)
+* [7plus] Add cookie based authentication by [nyuszika7h](https://github.com/nyuszika7h)
+* [AdobePass] Fix RCN MSO by [jfogelman](https://github.com/jfogelman)
+* [CBC] Fix Gem livestream by [makeworld-the-better-one](https://github.com/makeworld-the-better-one)
+* [CBC] Support CBC Gem member content by [makeworld-the-better-one](https://github.com/makeworld-the-better-one)
+* [crunchyroll] Add season to flat-playlist
+* [crunchyroll] Add support for `beta.crunchyroll` URLs and fix series URLs with language code
+* [EUScreen] Add Extractor by [Ashish0804](https://github.com/Ashish0804)
+* [Gronkh] Add extractor by [Ashish0804](https://github.com/Ashish0804)
+* [hidive] Fix typo
+* [Hotstar] Mention Dynamic Range in `format_id` by [Ashish0804](https://github.com/Ashish0804)
+* [Hotstar] Raise appropriate error for DRM
+* [instagram] Add login by [u-spec-png](https://github.com/u-spec-png)
+* [instagram] Show appropriate error when login is needed
+* [microsoftstream] Add extractor by [damianoamatruda](https://github.com/damianoamatruda), [nixklai](https://github.com/nixklai)
+* [on24] Add extractor by [damianoamatruda](https://github.com/damianoamatruda)
+* [patreon] Fix vimeo player regex by [zenerdi0de](https://github.com/zenerdi0de)
+* [SkyNewsAU] Add extractor by [Ashish0804](https://github.com/Ashish0804)
+* [tagesschau] Fix extractor by [u-spec-png](https://github.com/u-spec-png)
+* [tbs] Add tbs live streams by [llacb47](https://github.com/llacb47)
+* [tiktok] Fix typo and update tests
+* [trovo] Support channel clips and VODs by [Ashish0804](https://github.com/Ashish0804)
+* [Viafree] Add support for Finland by [18928172992817182](https://github.com/18928172992817182)
+* [vimeo] Fix embedded `player.vimeo`
+* [vlive:channel] Fix extraction by [kikuyan](https://github.com/kikuyan), [pukkandan](https://github.com/pukkandan)
+* [youtube] Add auto-translated subtitles
+* [youtube] Expose different formats with same itag
+* [youtube:comments] Fix for new layout by [coletdjnz](https://github.com/coletdjnz)
+* [cleanup] Cleanup bilibili code by [pukkandan](https://github.com/pukkandan), [u-spec-png](https://github.com/u-spec-png)
+* [cleanup] Remove broken youtube login code
+* [cleanup] Standardize timestamp formatting code
+* [cleanup] Generalize `getcomments` implementation for extractors
+* [cleanup] Simplify search extractors code
+* [cleanup] misc
+
+
 ### 2021.10.10
 
 * [downloader/ffmpeg] Fix bug in initializing `FFmpegPostProcessor`
diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py
index 427045b98..0411df76b 100644
--- a/devscripts/make_lazy_extractors.py
+++ b/devscripts/make_lazy_extractors.py
@@ -9,7 +9,7 @@ import sys
 
 sys.path.insert(0, dirn(dirn((os.path.abspath(__file__)))))
 
-lazy_extractors_filename = sys.argv[1]
+lazy_extractors_filename = sys.argv[1] if len(sys.argv) > 1 else 'yt_dlp/extractor/lazy_extractors.py'
 if os.path.exists(lazy_extractors_filename):
     os.remove(lazy_extractors_filename)
diff --git a/devscripts/make_supportedsites.py b/devscripts/make_supportedsites.py
index 17a34843f..4c11e25f2 100644
--- a/devscripts/make_supportedsites.py
+++ b/devscripts/make_supportedsites.py
@@ -29,6 +29,9 @@ def main():
                 continue
             if ie_desc is not None:
                 ie_md += ': {0}'.format(ie.IE_DESC)
+            search_key = getattr(ie, 'SEARCH_KEY', None)
+            if search_key is not None:
+                ie_md += f'; "{ie.SEARCH_KEY}:" prefix'
             if not ie.working():
                 ie_md += ' (Currently broken)'
             yield ie_md
diff --git a/pyinst.py b/pyinst.py
@@ -1,75 +1,84 @@
 #!/usr/bin/env python3
 # coding: utf-8
-
-from __future__ import unicode_literals
-import sys
+import os
 import platform
-
+import sys
 from PyInstaller.utils.hooks import collect_submodules
-from PyInstaller.utils.win32.versioninfo import (
-    VarStruct, VarFileInfo, StringStruct, StringTable,
-    StringFileInfo, FixedFileInfo, VSVersionInfo, SetVersion,
-)
-import PyInstaller.__main__
-
-arch = platform.architecture()[0][:2]
-assert arch in ('32', '64')
-_x86 = '_x86' if arch == '32' else ''
-
-# Compatability with older arguments
-opts = sys.argv[1:]
-if opts[0:1] in (['32'], ['64']):
-    if arch != opts[0]:
-        raise Exception(f'{opts[0]}bit executable cannot be built on a {arch}bit system')
-    opts = opts[1:]
-opts = opts or ['--onefile']
-
-print(f'Building {arch}bit version with options 
{opts}') - -FILE_DESCRIPTION = 'yt-dlp%s' % (' (32 Bit)' if _x86 else '') - -exec(compile(open('yt_dlp/version.py').read(), 'yt_dlp/version.py', 'exec')) -VERSION = locals()['__version__'] - -VERSION_LIST = VERSION.split('.') -VERSION_LIST = list(map(int, VERSION_LIST)) + [0] * (4 - len(VERSION_LIST)) - -print('Version: %s%s' % (VERSION, _x86)) -print('Remember to update the version using devscipts\\update-version.py') - -VERSION_FILE = VSVersionInfo( - ffi=FixedFileInfo( - filevers=VERSION_LIST, - prodvers=VERSION_LIST, - mask=0x3F, - flags=0x0, - OS=0x4, - fileType=0x1, - subtype=0x0, - date=(0, 0), - ), - kids=[ - StringFileInfo([ - StringTable( - '040904B0', [ - StringStruct('Comments', 'yt-dlp%s Command Line Interface.' % _x86), - StringStruct('CompanyName', 'https://github.com/yt-dlp'), - StringStruct('FileDescription', FILE_DESCRIPTION), - StringStruct('FileVersion', VERSION), - StringStruct('InternalName', 'yt-dlp%s' % _x86), - StringStruct( - 'LegalCopyright', - 'pukkandan.ytdlp@gmail.com | UNLICENSE', - ), - StringStruct('OriginalFilename', 'yt-dlp%s.exe' % _x86), - StringStruct('ProductName', 'yt-dlp%s' % _x86), - StringStruct( - 'ProductVersion', - '%s%s on Python %s' % (VERSION, _x86, platform.python_version())), - ])]), - VarFileInfo([VarStruct('Translation', [0, 1200])]) + + +OS_NAME = platform.system() +if OS_NAME == 'Windows': + from PyInstaller.utils.win32.versioninfo import ( + VarStruct, VarFileInfo, StringStruct, StringTable, + StringFileInfo, FixedFileInfo, VSVersionInfo, SetVersion, + ) +elif OS_NAME == 'Darwin': + pass +else: + raise Exception('{OS_NAME} is not supported') + +ARCH = platform.architecture()[0][:2] + + +def main(): + opts = parse_options() + version = read_version() + + suffix = '_macos' if OS_NAME == 'Darwin' else '_x86' if ARCH == '32' else '' + final_file = 'dist/%syt-dlp%s%s' % ( + 'yt-dlp/' if '--onedir' in opts else '', suffix, '.exe' if OS_NAME == 'Windows' else '') + + print(f'Building yt-dlp v{version} {ARCH}bit for {OS_NAME} with options {opts}') + print('Remember to update the version using "devscripts/update-version.py"') + if not os.path.isfile('yt_dlp/extractor/lazy_extractors.py'): + print('WARNING: Building without lazy_extractors. 
Run ' + '"devscripts/make_lazy_extractors.py" to build lazy extractors', file=sys.stderr) + print(f'Destination: {final_file}\n') + + opts = [ + f'--name=yt-dlp{suffix}', + '--icon=devscripts/logo.ico', + '--upx-exclude=vcruntime140.dll', + '--noconfirm', + *dependancy_options(), + *opts, + 'yt_dlp/__main__.py', ] -) + print(f'Running PyInstaller with {opts}') + + import PyInstaller.__main__ + + PyInstaller.__main__.run(opts) + + set_version_info(final_file, version) + + +def parse_options(): + # Compatability with older arguments + opts = sys.argv[1:] + if opts[0:1] in (['32'], ['64']): + if ARCH != opts[0]: + raise Exception(f'{opts[0]}bit executable cannot be built on a {ARCH}bit system') + opts = opts[1:] + return opts or ['--onefile'] + + +def read_version(): + exec(compile(open('yt_dlp/version.py').read(), 'yt_dlp/version.py', 'exec')) + return locals()['__version__'] + + +def version_to_list(version): + version_list = version.split('.') + return list(map(int, version_list)) + [0] * (4 - len(version_list)) + + +def dependancy_options(): + dependancies = [pycryptodome_module(), 'mutagen'] + collect_submodules('websockets') + excluded_modules = ['test', 'ytdlp_plugins', 'youtube-dl', 'youtube-dlc'] + + yield from (f'--hidden-import={module}' for module in dependancies) + yield from (f'--exclude-module={module}' for module in excluded_modules) def pycryptodome_module(): @@ -86,17 +95,41 @@ def pycryptodome_module(): return 'Cryptodome' -dependancies = [pycryptodome_module(), 'mutagen'] + collect_submodules('websockets') -excluded_modules = ['test', 'ytdlp_plugins', 'youtube-dl', 'youtube-dlc'] - -PyInstaller.__main__.run([ - '--name=yt-dlp%s' % _x86, - '--icon=devscripts/logo.ico', - *[f'--exclude-module={module}' for module in excluded_modules], - *[f'--hidden-import={module}' for module in dependancies], - '--upx-exclude=vcruntime140.dll', - '--noconfirm', - *opts, - 'yt_dlp/__main__.py', -]) -SetVersion('dist/%syt-dlp%s.exe' % ('yt-dlp/' if '--onedir' in opts else '', _x86), VERSION_FILE) +def set_version_info(exe, version): + if OS_NAME == 'Windows': + windows_set_version(exe, version) + + +def windows_set_version(exe, version): + version_list = version_to_list(version) + suffix = '_x86' if ARCH == '32' else '' + SetVersion(exe, VSVersionInfo( + ffi=FixedFileInfo( + filevers=version_list, + prodvers=version_list, + mask=0x3F, + flags=0x0, + OS=0x4, + fileType=0x1, + subtype=0x0, + date=(0, 0), + ), + kids=[ + StringFileInfo([StringTable('040904B0', [ + StringStruct('Comments', 'yt-dlp%s Command Line Interface.' % suffix), + StringStruct('CompanyName', 'https://github.com/yt-dlp'), + StringStruct('FileDescription', 'yt-dlp%s' % (' (32 Bit)' if ARCH == '32' else '')), + StringStruct('FileVersion', version), + StringStruct('InternalName', f'yt-dlp{suffix}'), + StringStruct('LegalCopyright', 'pukkandan.ytdlp@gmail.com | UNLICENSE'), + StringStruct('OriginalFilename', f'yt-dlp{suffix}.exe'), + StringStruct('ProductName', f'yt-dlp{suffix}'), + StringStruct( + 'ProductVersion', f'{version}{suffix} on Python {platform.python_version()}'), + ])]), VarFileInfo([VarStruct('Translation', [0, 1200])]) + ] + )) + + +if __name__ == '__main__': + main() @@ -16,7 +16,7 @@ from distutils.spawn import spawn exec(compile(open('yt_dlp/version.py').read(), 'yt_dlp/version.py', 'exec')) -DESCRIPTION = 'Command-line program to download videos from YouTube.com and many other other video platforms.' 
+DESCRIPTION = 'A youtube-dl fork with additional features and patches' LONG_DESCRIPTION = '\n\n'.join(( 'Official repository: <https://github.com/yt-dlp/yt-dlp>', @@ -29,7 +29,7 @@ REQUIREMENTS = ['mutagen', 'pycryptodome', 'websockets'] if sys.argv[1:2] == ['py2exe']: import py2exe warnings.warn( - 'Building with py2exe is not officially supported. ' + 'py2exe builds do not support pycryptodomex and needs VC++14 to run. ' 'The recommended way is to use "pyinst.py" to build using pyinstaller') params = { 'console': [{ diff --git a/supportedsites.md b/supportedsites.md index 02be6b918..01c3f43a9 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -127,7 +127,7 @@ - **BilibiliAudioAlbum** - **BilibiliChannel** - **BiliBiliPlayer** - - **BiliBiliSearch**: Bilibili video search, "bilisearch" keyword + - **BiliBiliSearch**: Bilibili video search; "bilisearch:" prefix - **BiliIntl** - **BiliIntlSeries** - **BioBioChileTV** @@ -226,7 +226,9 @@ - **Crackle** - **CrooksAndLiars** - **crunchyroll** + - **crunchyroll:beta** - **crunchyroll:playlist** + - **crunchyroll:playlist:beta** - **CSpan**: C-SPAN - **CtsNews**: 華視新聞 - **CTV** @@ -315,6 +317,7 @@ - **ESPNArticle** - **EsriVideo** - **Europa** + - **EUScreen** - **EWETV** - **ExpoTV** - **Expressen** @@ -394,6 +397,7 @@ - **Goshgay** - **GoToStage** - **GPUTechConf** + - **Gronkh** - **Groupon** - **hbo** - **HearThisAt** @@ -570,6 +574,7 @@ - **Mgoon** - **MGTV**: 芒果TV - **MiaoPai** + - **microsoftstream**: Microsoft Stream - **mildom**: Record ongoing live by specific user in Mildom - **mildom:user:vod**: Download all VODs from specific user in Mildom - **mildom:vod**: Download a VOD in Mildom @@ -686,8 +691,8 @@ - **niconico**: ニコニコ動画 - **NiconicoPlaylist** - **NiconicoUser** - - **nicovideo:search**: Nico video searches - - **nicovideo:search:date**: Nico video searches, newest first + - **nicovideo:search**: Nico video searches; "nicosearch:" prefix + - **nicovideo:search:date**: Nico video searches, newest first; "nicosearchdate:" prefix - **nicovideo:search_url**: Nico video search URLs - **Nintendo** - **Nitter** @@ -734,6 +739,7 @@ - **Odnoklassniki** - **OktoberfestTV** - **OlympicsReplay** + - **on24**: ON24 - **OnDemandKorea** - **onet.pl** - **onet.tv** @@ -930,7 +936,7 @@ - **SBS**: sbs.com.au - **schooltv** - **ScienceChannel** - - **screen.yahoo:search**: Yahoo screen search + - **screen.yahoo:search**: Yahoo screen search; "yvsearch:" prefix - **Screencast** - **ScreencastOMatic** - **ScrippsNetworks** @@ -961,6 +967,7 @@ - **SkylineWebcams** - **skynewsarabia:article** - **skynewsarabia:video** + - **SkyNewsAU** - **Slideshare** - **SlidesLive** - **Slutload** @@ -970,7 +977,7 @@ - **SonyLIVSeries** - **soundcloud** - **soundcloud:playlist** - - **soundcloud:search**: Soundcloud search + - **soundcloud:search**: Soundcloud search; "scsearch:" prefix - **soundcloud:set** - **soundcloud:trackstation** - **soundcloud:user** @@ -1029,7 +1036,6 @@ - **SztvHu** - **t-online.de** - **Tagesschau** - - **tagesschau:player** - **Tass** - **TBS** - **TDSLifeway** @@ -1089,6 +1095,8 @@ - **TrailerAddict** (Currently broken) - **Trilulilu** - **Trovo** + - **TrovoChannelClip**: All Clips of a trovo.live channel; "trovoclip:" prefix + - **TrovoChannelVod**: All VODs of a trovo.live channel; "trovovod:" prefix - **TrovoVod** - **TruNews** - **TruTV** @@ -1193,7 +1201,7 @@ - **Viddler** - **Videa** - **video.arnes.si**: Arnes Video - - **video.google:search**: Google Video search + - **video.google:search**: Google Video search; 
"gvsearch:" prefix (Currently broken) - **video.sky.it** - **video.sky.it:live** - **VideoDetective** @@ -1335,19 +1343,19 @@ - **YouPorn** - **YourPorn** - **YourUpload** - - **youtube**: YouTube.com - - **youtube:favorites**: YouTube.com liked videos, ":ytfav" for short (requires authentication) - - **youtube:history**: Youtube watch history, ":ythis" for short (requires authentication) - - **youtube:playlist**: YouTube.com playlists - - **youtube:recommended**: YouTube.com recommended videos, ":ytrec" for short (requires authentication) - - **youtube:search**: YouTube.com searches, "ytsearch" keyword - - **youtube:search:date**: YouTube.com searches, newest videos first, "ytsearchdate" keyword - - **youtube:search_url**: YouTube.com search URLs - - **youtube:subscriptions**: YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication) - - **youtube:tab**: YouTube.com tab - - **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" for short (requires authentication) + - **youtube**: YouTube + - **youtube:favorites**: YouTube liked videos; ":ytfav" keyword (requires cookies) + - **youtube:history**: Youtube watch history; ":ythis" keyword (requires cookies) + - **youtube:playlist**: YouTube playlists + - **youtube:recommended**: YouTube recommended videos; ":ytrec" keyword + - **youtube:search**: YouTube searches; "ytsearch:" prefix + - **youtube:search:date**: YouTube searches, newest videos first; "ytsearchdate:" prefix + - **youtube:search_url**: YouTube search URLs with sorting and filter support + - **youtube:subscriptions**: YouTube subscriptions feed; ":ytsubs" keyword (requires cookies) + - **youtube:tab**: YouTube Tabs + - **youtube:watchlater**: Youtube watch later list; ":ytwatchlater" keyword (requires cookies) - **YoutubeYtBe**: youtu.be - - **YoutubeYtUser**: YouTube.com user videos, URL or "ytuser" keyword + - **YoutubeYtUser**: YouTube user videos; "ytuser:" prefix - **Zapiks** - **Zattoo** - **ZattooLive** diff --git a/test/test_utils.py b/test/test_utils.py index d84c3d3ee..810ed3de4 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1163,12 +1163,15 @@ class TestUtil(unittest.TestCase): def test_parse_resolution(self): self.assertEqual(parse_resolution(None), {}) self.assertEqual(parse_resolution(''), {}) - self.assertEqual(parse_resolution('1920x1080'), {'width': 1920, 'height': 1080}) - self.assertEqual(parse_resolution('1920×1080'), {'width': 1920, 'height': 1080}) + self.assertEqual(parse_resolution(' 1920x1080'), {'width': 1920, 'height': 1080}) + self.assertEqual(parse_resolution('1920×1080 '), {'width': 1920, 'height': 1080}) self.assertEqual(parse_resolution('1920 x 1080'), {'width': 1920, 'height': 1080}) self.assertEqual(parse_resolution('720p'), {'height': 720}) self.assertEqual(parse_resolution('4k'), {'height': 2160}) self.assertEqual(parse_resolution('8K'), {'height': 4320}) + self.assertEqual(parse_resolution('pre_1920x1080_post'), {'width': 1920, 'height': 1080}) + self.assertEqual(parse_resolution('ep1x2'), {}) + self.assertEqual(parse_resolution('1920, 1080'), {'width': 1920, 'height': 1080}) def test_parse_bitrate(self): self.assertEqual(parse_bitrate(None), None) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index d1ab540d2..b10e56fa1 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -28,6 +28,7 @@ import traceback import random import unicodedata +from enum import Enum from string import ascii_letters from .compat import ( @@ -55,9 +56,7 @@ from .utils import ( DEFAULT_OUTTMPL, determine_ext, 
determine_protocol, - DOT_DESKTOP_LINK_TEMPLATE, - DOT_URL_LINK_TEMPLATE, - DOT_WEBLOC_LINK_TEMPLATE, + DownloadCancelled, DownloadError, encode_compat_str, encodeFilename, @@ -76,11 +75,13 @@ from .utils import ( iri_to_uri, ISO3166Utils, LazyList, + LINK_TEMPLATES, locked_file, make_dir, make_HTTPS_handler, MaxDownloadsReached, network_exceptions, + number_of_digits, orderedSet, OUTTMPL_TYPES, PagedList, @@ -107,7 +108,6 @@ from .utils import ( strftime_or_none, subtitles_filename, supports_terminal_sequences, - TERMINAL_SEQUENCES, ThrottledDownload, to_high_limit_path, traverse_obj, @@ -123,6 +123,7 @@ from .utils import ( YoutubeDLRedirectHandler, ) from .cache import Cache +from .minicurses import format_text from .extractor import ( gen_extractor_classes, get_info_extractor, @@ -221,7 +222,8 @@ class YoutubeDL(object): allow_multiple_audio_streams: Allow multiple audio streams to be merged into a single file check_formats Whether to test if the formats are downloadable. - Can be True (check all), False (check none) + Can be True (check all), False (check none), + 'selected' (check selected formats), or None (check only if requested by extractor) paths: Dictionary of output paths. The allowed keys are 'home' 'temp' and the keys of OUTTMPL_TYPES (in utils.py) @@ -306,7 +308,7 @@ class YoutubeDL(object): cookiefile: File name where cookies should be read from and dumped to cookiesfrombrowser: A tuple containing the name of the browser and the profile name/path from where cookies are loaded. - Eg: ('chrome', ) or (vivaldi, 'default') + Eg: ('chrome', ) or ('vivaldi', 'default') nocheckcertificate:Do not verify SSL certificates prefer_insecure: Use HTTP instead of HTTPS to retrieve information. At the moment, this is only supported by YouTube. @@ -502,7 +504,7 @@ class YoutubeDL(object): def __init__(self, params=None, auto_init=True): """Create a FileDownloader object with the given options. @param auto_init Whether to load the default extractors and print header (if verbose). - Set to 'no_verbose_header' to not ptint the header + Set to 'no_verbose_header' to not print the header """ if params is None: params = {} @@ -523,7 +525,10 @@ class YoutubeDL(object): windows_enable_vt_mode() # FIXME: This will break if we ever print color to stdout - self.params['no_color'] = self.params.get('no_color') or not supports_terminal_sequences(self._err_file) + self._allow_colors = { + 'screen': not self.params.get('no_color') and supports_terminal_sequences(self._screen_file), + 'err': not self.params.get('no_color') and supports_terminal_sequences(self._err_file), + } if sys.version_info < (3, 6): self.report_warning( @@ -531,10 +536,10 @@ class YoutubeDL(object): if self.params.get('allow_unplayable_formats'): self.report_warning( - f'You have asked for {self._color_text("unplayable formats", "blue")} to be listed/downloaded. ' + f'You have asked for {self._format_err("UNPLAYABLE", self.Styles.EMPHASIS)} formats to be listed/downloaded. ' 'This is a developer option intended for debugging. 
\n' ' If you experience any issues while using this option, ' - f'{self._color_text("DO NOT", "red")} open a bug report') + f'{self._format_err("DO NOT", self.Styles.ERROR)} open a bug report') def check_deprecated(param, option, suggestion): if self.params.get(param) is not None: @@ -550,9 +555,12 @@ class YoutubeDL(object): check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"') check_deprecated('useid', '--id', '-o "%(id)s.%(ext)s"') - for msg in self.params.get('warnings', []): + for msg in self.params.get('_warnings', []): self.report_warning(msg) + if 'list-formats' in self.params.get('compat_opts', []): + self.params['listformats_table'] = False + if 'overwrites' not in self.params and self.params.get('nooverwrites') is not None: # nooverwrites was unnecessarily changed to overwrites # in 0c3d0f51778b153f65c21906031c2e091fcfb641 @@ -583,7 +591,9 @@ class YoutubeDL(object): self._output_channel = os.fdopen(master, 'rb') except OSError as ose: if ose.errno == errno.ENOENT: - self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that fribidi is an executable file in one of the directories in your $PATH.') + self.report_warning( + 'Could not find fribidi executable, ignoring --bidi-workaround. ' + 'Make sure that fribidi is an executable file in one of the directories in your $PATH.') else: raise @@ -630,7 +640,7 @@ class YoutubeDL(object): """Preload the archive, if any is specified""" if fn is None: return False - self.write_debug('Loading archive file %r\n' % fn) + self.write_debug(f'Loading archive file {fn!r}') try: with locked_file(fn, 'r', encoding='utf-8') as archive_file: for line in archive_file: @@ -657,7 +667,7 @@ class YoutubeDL(object): ) self.report_warning( 'Long argument string detected. 
' - 'Use -- to separate parameters and URLs, like this:\n%s\n' % + 'Use -- to separate parameters and URLs, like this:\n%s' % args_to_str(correct_argv)) def add_info_extractor(self, ie): @@ -823,10 +833,32 @@ class YoutubeDL(object): self.to_stdout( message, skip_eol, quiet=self.params.get('quiet', False)) - def _color_text(self, text, color): - if self.params.get('no_color'): - return text - return f'{TERMINAL_SEQUENCES[color.upper()]}{text}{TERMINAL_SEQUENCES["RESET_STYLE"]}' + class Styles(Enum): + HEADERS = 'yellow' + EMPHASIS = 'blue' + ID = 'green' + DELIM = 'blue' + ERROR = 'red' + WARNING = 'yellow' + + def __format_text(self, out, text, f, fallback=None, *, test_encoding=False): + assert out in ('screen', 'err') + if test_encoding: + original_text = text + handle = self._screen_file if out == 'screen' else self._err_file + encoding = self.params.get('encoding') or getattr(handle, 'encoding', 'ascii') + text = text.encode(encoding, 'ignore').decode(encoding) + if fallback is not None and text != original_text: + text = fallback + if isinstance(f, self.Styles): + f = f._value_ + return format_text(text, f) if self._allow_colors[out] else text if fallback is None else fallback + + def _format_screen(self, *args, **kwargs): + return self.__format_text('screen', *args, **kwargs) + + def _format_err(self, *args, **kwargs): + return self.__format_text('err', *args, **kwargs) def report_warning(self, message, only_once=False): ''' @@ -838,14 +870,14 @@ class YoutubeDL(object): else: if self.params.get('no_warnings'): return - self.to_stderr(f'{self._color_text("WARNING:", "yellow")} {message}', only_once) + self.to_stderr(f'{self._format_err("WARNING:", self.Styles.WARNING)} {message}', only_once) def report_error(self, message, tb=None): ''' Do the same as trouble, but prefixes the message with 'ERROR:', colored in red if stderr is a tty file. ''' - self.trouble(f'{self._color_text("ERROR:", "red")} {message}', tb) + self.trouble(f'{self._format_err("ERROR:", self.Styles.ERROR)} {message}', tb) def write_debug(self, message, only_once=False): '''Log debug message or Print message to stderr''' @@ -974,8 +1006,8 @@ class YoutubeDL(object): # For fields playlist_index, playlist_autonumber and autonumber convert all occurrences # of %(field)s to %(field)0Nd for backward compatibility field_size_compat_map = { - 'playlist_index': len(str(info_dict.get('_last_playlist_index') or '')), - 'playlist_autonumber': len(str(info_dict.get('n_entries') or '')), + 'playlist_index': number_of_digits(info_dict.get('_last_playlist_index') or 0), + 'playlist_autonumber': number_of_digits(info_dict.get('n_entries') or 0), 'autonumber': self.params.get('autonumber_size') or 5, } @@ -1288,7 +1320,7 @@ class YoutubeDL(object): self.to_stderr('\r') self.report_warning('The download speed is below throttle limit. 
Re-extracting data') return wrapper(self, *args, **kwargs) - except (MaxDownloadsReached, ExistingVideoReached, RejectedVideoReached, LazyList.IndexError): + except (DownloadCancelled, LazyList.IndexError): raise except Exception as e: if self.params.get('ignoreerrors'): @@ -1549,7 +1581,7 @@ class YoutubeDL(object): playlistitems = list(range(playliststart, playliststart + n_entries)) ie_result['requested_entries'] = playlistitems - if self.params.get('allow_playlist_files', True): + if not self.params.get('simulate') and self.params.get('allow_playlist_files', True): ie_copy = { 'playlist': playlist, 'playlist_id': ie_result.get('id'), @@ -1557,6 +1589,7 @@ class YoutubeDL(object): 'playlist_uploader': ie_result.get('uploader'), 'playlist_uploader_id': ie_result.get('uploader_id'), 'playlist_index': 0, + 'n_entries': n_entries, } ie_copy.update(dict(ie_result)) @@ -1686,6 +1719,28 @@ class YoutubeDL(object): return op(actual_value, comparison_value) return _filter + def _check_formats(self, formats): + for f in formats: + self.to_screen('[info] Testing format %s' % f['format_id']) + temp_file = tempfile.NamedTemporaryFile( + suffix='.tmp', delete=False, + dir=self.get_output_path('temp') or None) + temp_file.close() + try: + success, _ = self.dl(temp_file.name, f, test=True) + except (DownloadError, IOError, OSError, ValueError) + network_exceptions: + success = False + finally: + if os.path.exists(temp_file.name): + try: + os.remove(temp_file.name) + except OSError: + self.report_warning('Unable to delete temporary file "%s"' % temp_file.name) + if success: + yield f + else: + self.to_screen('[info] Unable to download format %s. Skipping...' % f['format_id']) + def _default_format_spec(self, info_dict, download=True): def can_merge(): @@ -1725,7 +1780,7 @@ class YoutubeDL(object): allow_multiple_streams = {'audio': self.params.get('allow_multiple_audio_streams', False), 'video': self.params.get('allow_multiple_video_streams', False)} - check_formats = self.params.get('check_formats') + check_formats = self.params.get('check_formats') == 'selected' def _parse_filter(tokens): filter_parts = [] @@ -1882,6 +1937,7 @@ class YoutubeDL(object): 'height': the_only_video.get('height'), 'resolution': the_only_video.get('resolution') or self.format_resolution(the_only_video), 'fps': the_only_video.get('fps'), + 'dynamic_range': the_only_video.get('dynamic_range'), 'vcodec': the_only_video.get('vcodec'), 'vbr': the_only_video.get('vbr'), 'stretched_ratio': the_only_video.get('stretched_ratio'), @@ -1900,26 +1956,7 @@ class YoutubeDL(object): if not check_formats: yield from formats return - for f in formats: - self.to_screen('[info] Testing format %s' % f['format_id']) - temp_file = tempfile.NamedTemporaryFile( - suffix='.tmp', delete=False, - dir=self.get_output_path('temp') or None) - temp_file.close() - try: - success, _ = self.dl(temp_file.name, f, test=True) - except (DownloadError, IOError, OSError, ValueError) + network_exceptions: - success = False - finally: - if os.path.exists(temp_file.name): - try: - os.remove(temp_file.name) - except OSError: - self.report_warning('Unable to delete temporary file "%s"' % temp_file.name) - if success: - yield f - else: - self.to_screen('[info] Unable to download format %s. Skipping...' 
% f['format_id']) + yield from self._check_formats(formats) def _build_selector_function(selector): if isinstance(selector, list): # , @@ -2076,42 +2113,45 @@ class YoutubeDL(object): self.cookiejar.add_cookie_header(pr) return pr.get_header('Cookie') + def _sort_thumbnails(self, thumbnails): + thumbnails.sort(key=lambda t: ( + t.get('preference') if t.get('preference') is not None else -1, + t.get('width') if t.get('width') is not None else -1, + t.get('height') if t.get('height') is not None else -1, + t.get('id') if t.get('id') is not None else '', + t.get('url'))) + def _sanitize_thumbnails(self, info_dict): thumbnails = info_dict.get('thumbnails') if thumbnails is None: thumbnail = info_dict.get('thumbnail') if thumbnail: info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}] - if thumbnails: - thumbnails.sort(key=lambda t: ( - t.get('preference') if t.get('preference') is not None else -1, - t.get('width') if t.get('width') is not None else -1, - t.get('height') if t.get('height') is not None else -1, - t.get('id') if t.get('id') is not None else '', - t.get('url'))) - - def thumbnail_tester(): - def test_thumbnail(t): - self.to_screen(f'[info] Testing thumbnail {t["id"]}') - try: - self.urlopen(HEADRequest(t['url'])) - except network_exceptions as err: - self.to_screen(f'[info] Unable to connect to thumbnail {t["id"]} URL {t["url"]!r} - {err}. Skipping...') - return False - return True - return test_thumbnail - - for i, t in enumerate(thumbnails): - if t.get('id') is None: - t['id'] = '%d' % i - if t.get('width') and t.get('height'): - t['resolution'] = '%dx%d' % (t['width'], t['height']) - t['url'] = sanitize_url(t['url']) - - if self.params.get('check_formats'): - info_dict['thumbnails'] = LazyList(filter(thumbnail_tester(), thumbnails[::-1])).reverse() - else: - info_dict['thumbnails'] = thumbnails + if not thumbnails: + return + + def check_thumbnails(thumbnails): + for t in thumbnails: + self.to_screen(f'[info] Testing thumbnail {t["id"]}') + try: + self.urlopen(HEADRequest(t['url'])) + except network_exceptions as err: + self.to_screen(f'[info] Unable to connect to thumbnail {t["id"]} URL {t["url"]!r} - {err}. 
Skipping...') + continue + yield t + + self._sort_thumbnails(thumbnails) + for i, t in enumerate(thumbnails): + if t.get('id') is None: + t['id'] = '%d' % i + if t.get('width') and t.get('height'): + t['resolution'] = '%dx%d' % (t['width'], t['height']) + t['url'] = sanitize_url(t['url']) + + if self.params.get('check_formats') is True: + info_dict['thumbnails'] = LazyList(check_thumbnails(thumbnails[::-1])).reverse() + else: + info_dict['thumbnails'] = thumbnails def process_video_result(self, info_dict, download=True): assert info_dict.get('_type', 'video') == 'video' @@ -2217,7 +2257,6 @@ class YoutubeDL(object): info_dict['requested_subtitles'] = self.process_subtitles( info_dict['id'], subtitles, automatic_captions) - # We now pick which formats have to be downloaded if info_dict.get('formats') is None: # There's only one format available formats = [info_dict] @@ -2289,6 +2328,10 @@ class YoutubeDL(object): format['resolution'] = self.format_resolution(format, default=None) if format.get('dynamic_range') is None and format.get('vcodec') != 'none': format['dynamic_range'] = 'SDR' + if (info_dict.get('duration') and format.get('tbr') + and not format.get('filesize') and not format.get('filesize_approx')): + format['filesize_approx'] = info_dict['duration'] * format['tbr'] * (1024 / 8) + # Add HTTP headers, so that external programs can use them from the # json output full_format_info = info_dict.copy() @@ -2300,6 +2343,9 @@ class YoutubeDL(object): # TODO Central sorting goes here + if self.params.get('check_formats') is True: + formats = LazyList(self._check_formats(formats[::-1])).reverse() + if not formats or formats[0] is not info_dict: # only set the 'formats' fields if the original info_dict list them # otherwise we end up with a circular reference, the first (and unique) @@ -2380,7 +2426,7 @@ class YoutubeDL(object): new_info['__original_infodict'] = info_dict new_info.update(fmt) self.process_info(new_info) - # We update the info dict with the best quality format (backwards compatibility) + # We update the info dict with the selected best quality format (backwards compatibility) if formats_to_download: info_dict.update(formats_to_download[-1]) return info_dict @@ -2617,53 +2663,41 @@ class YoutubeDL(object): return # Write internet shortcut files - url_link = webloc_link = desktop_link = False - if self.params.get('writelink', False): - if sys.platform == "darwin": # macOS. 
- webloc_link = True - elif sys.platform.startswith("linux"): - desktop_link = True - else: # if sys.platform in ['win32', 'cygwin']: - url_link = True - if self.params.get('writeurllink', False): - url_link = True - if self.params.get('writewebloclink', False): - webloc_link = True - if self.params.get('writedesktoplink', False): - desktop_link = True - - if url_link or webloc_link or desktop_link: + def _write_link_file(link_type): if 'webpage_url' not in info_dict: self.report_error('Cannot write internet shortcut file because the "webpage_url" field is missing in the media information') - return - ascii_url = iri_to_uri(info_dict['webpage_url']) - - def _write_link_file(extension, template, newline, embed_filename): - linkfn = replace_extension(full_filename, extension, info_dict.get('ext')) + return False + linkfn = replace_extension(self.prepare_filename(info_dict, 'link'), link_type, info_dict.get('ext')) if self.params.get('overwrites', True) and os.path.exists(encodeFilename(linkfn)): - self.to_screen('[info] Internet shortcut is already present') - else: - try: - self.to_screen('[info] Writing internet shortcut to: ' + linkfn) - with io.open(encodeFilename(to_high_limit_path(linkfn)), 'w', encoding='utf-8', newline=newline) as linkfile: - template_vars = {'url': ascii_url} - if embed_filename: - template_vars['filename'] = linkfn[:-(len(extension) + 1)] - linkfile.write(template % template_vars) - except (OSError, IOError): - self.report_error('Cannot write internet shortcut ' + linkfn) - return False + self.to_screen(f'[info] Internet shortcut (.{link_type}) is already present') + return True + try: + self.to_screen(f'[info] Writing internet shortcut (.{link_type}) to: {linkfn}') + with io.open(encodeFilename(to_high_limit_path(linkfn)), 'w', encoding='utf-8', + newline='\r\n' if link_type == 'url' else '\n') as linkfile: + template_vars = {'url': iri_to_uri(info_dict['webpage_url'])} + if link_type == 'desktop': + template_vars['filename'] = linkfn[:-(len(link_type) + 1)] + linkfile.write(LINK_TEMPLATES[link_type] % template_vars) + except (OSError, IOError): + self.report_error(f'Cannot write internet shortcut {linkfn}') + return False return True - if url_link: - if not _write_link_file('url', DOT_URL_LINK_TEMPLATE, '\r\n', embed_filename=False): - return - if webloc_link: - if not _write_link_file('webloc', DOT_WEBLOC_LINK_TEMPLATE, '\n', embed_filename=False): - return - if desktop_link: - if not _write_link_file('desktop', DOT_DESKTOP_LINK_TEMPLATE, '\n', embed_filename=True): - return + write_links = { + 'url': self.params.get('writeurllink'), + 'webloc': self.params.get('writewebloclink'), + 'desktop': self.params.get('writedesktoplink'), + } + if self.params.get('writelink'): + link_type = ('webloc' if sys.platform == 'darwin' + else 'desktop' if sys.platform.startswith('linux') + else 'url') + write_links[link_type] = True + + if any(should_write and not _write_link_file(link_type) + for link_type, should_write in write_links.items()): + return try: info_dict, files_to_move = self.pre_process(info_dict, 'before_dl', files_to_move) @@ -2915,14 +2949,8 @@ class YoutubeDL(object): url, force_generic_extractor=self.params.get('force_generic_extractor', False)) except UnavailableVideoError: self.report_error('unable to download video') - except MaxDownloadsReached: - self.to_screen('[info] Maximum number of downloads reached') - raise - except ExistingVideoReached: - self.to_screen('[info] Encountered a video that is already in the archive, stopping due to 
--break-on-existing') - raise - except RejectedVideoReached: - self.to_screen('[info] Encountered a video that did not match filter, stopping due to --break-on-reject') + except DownloadCancelled as e: + self.to_screen(f'[info] {e.msg}') raise else: if self.params.get('dump_single_json', False): @@ -3162,38 +3190,46 @@ class YoutubeDL(object): res += '~' + format_bytes(fdict['filesize_approx']) return res + def _list_format_headers(self, *headers): + if self.params.get('listformats_table', True) is not False: + return [self._format_screen(header, self.Styles.HEADERS) for header in headers] + return headers + def list_formats(self, info_dict): formats = info_dict.get('formats', [info_dict]) - new_format = ( - 'list-formats' not in self.params.get('compat_opts', []) - and self.params.get('listformats_table', True) is not False) + new_format = self.params.get('listformats_table', True) is not False if new_format: + tbr_digits = number_of_digits(max(f.get('tbr') or 0 for f in formats)) + vbr_digits = number_of_digits(max(f.get('vbr') or 0 for f in formats)) + abr_digits = number_of_digits(max(f.get('abr') or 0 for f in formats)) + delim = self._format_screen('\u2502', self.Styles.DELIM, '|', test_encoding=True) table = [ [ - format_field(f, 'format_id'), + self._format_screen(format_field(f, 'format_id'), self.Styles.ID), format_field(f, 'ext'), self.format_resolution(f), format_field(f, 'fps', '%d'), format_field(f, 'dynamic_range', '%s', ignore=(None, 'SDR')).replace('HDR', ''), - '|', + delim, format_field(f, 'filesize', ' %s', func=format_bytes) + format_field(f, 'filesize_approx', '~%s', func=format_bytes), - format_field(f, 'tbr', '%4dk'), + format_field(f, 'tbr', f'%{tbr_digits}dk'), shorten_protocol_name(f.get('protocol', '').replace("native", "n")), - '|', + delim, format_field(f, 'vcodec', default='unknown').replace('none', ''), - format_field(f, 'vbr', '%4dk'), + format_field(f, 'vbr', f'%{vbr_digits}dk'), format_field(f, 'acodec', default='unknown').replace('none', ''), - format_field(f, 'abr', '%3dk'), + format_field(f, 'abr', f'%{abr_digits}dk'), format_field(f, 'asr', '%5dHz'), ', '.join(filter(None, ( - 'UNSUPPORTED' if f.get('ext') in ('f4f', 'f4m') else '', + self._format_screen('UNSUPPORTED', 'light red') if f.get('ext') in ('f4f', 'f4m') else '', format_field(f, 'language', '[%s]'), format_field(f, 'format_note'), format_field(f, 'container', ignore=(None, f.get('ext'))), ))), ] for f in formats if f.get('preference') is None or f['preference'] >= -1000] - header_line = ['ID', 'EXT', 'RESOLUTION', 'FPS', 'HDR', '|', ' FILESIZE', ' TBR', 'PROTO', - '|', 'VCODEC', ' VBR', 'ACODEC', ' ABR', ' ASR', 'MORE INFO'] + header_line = self._list_format_headers( + 'ID', 'EXT', 'RESOLUTION', 'FPS', 'HDR', delim, ' FILESIZE', ' TBR', 'PROTO', + delim, 'VCODEC', ' VBR', 'ACODEC', ' ABR', ' ASR', 'MORE INFO') else: table = [ [ @@ -3208,7 +3244,10 @@ class YoutubeDL(object): self.to_screen( '[info] Available formats for %s:' % info_dict['id']) self.to_stdout(render_table( - header_line, table, delim=new_format, extraGap=(0 if new_format else 1), hideEmpty=new_format)) + header_line, table, + extraGap=(0 if new_format else 1), + hideEmpty=new_format, + delim=new_format and self._format_screen('\u2500', self.Styles.DELIM, '-', test_encoding=True))) def list_thumbnails(self, info_dict): thumbnails = list(info_dict.get('thumbnails')) @@ -3219,7 +3258,7 @@ class YoutubeDL(object): self.to_screen( '[info] Thumbnails for %s:' % info_dict['id']) self.to_stdout(render_table( - ['ID', 'width', 
'height', 'URL'], + self._list_format_headers('ID', 'Width', 'Height', 'URL'), [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails])) def list_subtitles(self, video_id, subtitles, name='subtitles'): @@ -3236,7 +3275,7 @@ class YoutubeDL(object): return [lang, ', '.join(names), ', '.join(exts)] self.to_stdout(render_table( - ['Language', 'Name', 'Formats'], + self._list_format_headers('Language', 'Name', 'Formats'), [_row(lang, formats) for lang, formats in subtitles.items()], hideEmpty=True)) @@ -3249,31 +3288,40 @@ class YoutubeDL(object): def print_debug_header(self): if not self.params.get('verbose'): return - get_encoding = lambda stream: getattr(stream, 'encoding', 'missing (%s)' % type(stream).__name__) - encoding_str = ( - '[debug] Encodings: locale %s, fs %s, stdout %s, stderr %s, pref %s\n' % ( - locale.getpreferredencoding(), - sys.getfilesystemencoding(), - get_encoding(self._screen_file), get_encoding(self._err_file), - self.get_encoding())) + + def get_encoding(stream): + ret = getattr(stream, 'encoding', 'missing (%s)' % type(stream).__name__) + if not supports_terminal_sequences(stream): + ret += ' (No ANSI)' + return ret + + encoding_str = 'Encodings: locale %s, fs %s, out %s, err %s, pref %s' % ( + locale.getpreferredencoding(), + sys.getfilesystemencoding(), + get_encoding(self._screen_file), get_encoding(self._err_file), + self.get_encoding()) logger = self.params.get('logger') if logger: write_debug = lambda msg: logger.debug(f'[debug] {msg}') write_debug(encoding_str) else: - write_debug = lambda msg: self._write_string(f'[debug] {msg}') - write_string(encoding_str, encoding=None) - - write_debug('yt-dlp version %s%s\n' % (__version__, '' if source == 'unknown' else f' ({source})')) - if _LAZY_LOADER: - write_debug('Lazy loading extractors enabled\n') + write_string(f'[debug] {encoding_str}\n', encoding=None) + write_debug = lambda msg: self._write_string(f'[debug] {msg}\n') + + source = detect_variant() + write_debug('yt-dlp version %s%s' % (__version__, '' if source == 'unknown' else f' ({source})')) + if not _LAZY_LOADER: + if os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'): + write_debug('Lazy loading extractors is forcibly disabled') + else: + write_debug('Lazy loading extractors is disabled') if plugin_extractors or plugin_postprocessors: - write_debug('Plugins: %s\n' % [ + write_debug('Plugins: %s' % [ '%s%s' % (klass.__name__, '' if klass.__name__ == name else f' as {name}') for name, klass in itertools.chain(plugin_extractors.items(), plugin_postprocessors.items())]) if self.params.get('compat_opts'): - write_debug('Compatibility options: %s\n' % ', '.join(self.params.get('compat_opts'))) + write_debug('Compatibility options: %s' % ', '.join(self.params.get('compat_opts'))) try: sp = Popen( ['git', 'rev-parse', '--short', 'HEAD'], @@ -3282,7 +3330,7 @@ class YoutubeDL(object): out, err = sp.communicate_or_kill() out = out.decode().strip() if re.match('[0-9a-f]+', out): - write_debug('Git HEAD: %s\n' % out) + write_debug('Git HEAD: %s' % out) except Exception: try: sys.exc_clear() @@ -3295,7 +3343,7 @@ class YoutubeDL(object): return impl_name + ' version %d.%d.%d' % sys.pypy_version_info[:3] return impl_name - write_debug('Python version %s (%s %s) - %s\n' % ( + write_debug('Python version %s (%s %s) - %s' % ( platform.python_version(), python_implementation(), platform.architecture()[0], @@ -3307,7 +3355,7 @@ class YoutubeDL(object): exe_str = ', '.join( f'{exe} {v}' for exe, v in sorted(exe_versions.items()) if v ) or 
'none' - write_debug('exe versions: %s\n' % exe_str) + write_debug('exe versions: %s' % exe_str) from .downloader.websocket import has_websockets from .postprocessor.embedthumbnail import has_mutagen @@ -3320,21 +3368,18 @@ class YoutubeDL(object): SQLITE_AVAILABLE and 'sqlite', KEYRING_AVAILABLE and 'keyring', )))) or 'none' - write_debug('Optional libraries: %s\n' % lib_str) - write_debug('ANSI escape support: stdout = %s, stderr = %s\n' % ( - supports_terminal_sequences(self._screen_file), - supports_terminal_sequences(self._err_file))) + write_debug('Optional libraries: %s' % lib_str) proxy_map = {} for handler in self._opener.handlers: if hasattr(handler, 'proxies'): proxy_map.update(handler.proxies) - write_debug('Proxy map: ' + compat_str(proxy_map) + '\n') + write_debug(f'Proxy map: {proxy_map}') - if self.params.get('call_home', False): + # Not implemented + if False and self.params.get('call_home'): ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8') - write_debug('Public IP address: %s\n' % ipaddr) - return + write_debug('Public IP address: %s' % ipaddr) latest_version = self.urlopen( 'https://yt-dl.org/latest/version').read().decode('utf-8') if version_tuple(latest_version) > version_tuple(__version__): diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index d8db5754f..3a4b81efd 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -119,10 +119,10 @@ def _real_main(argv=None): desc = getattr(ie, 'IE_DESC', ie.IE_NAME) if desc is False: continue - if hasattr(ie, 'SEARCH_KEY'): + if getattr(ie, 'SEARCH_KEY', None) is not None: _SEARCHES = ('cute kittens', 'slithering pythons', 'falling cat', 'angry poodle', 'purple fish', 'running tortoise', 'sleeping bunny', 'burping cow') _COUNTS = ('', '5', '10', 'all') - desc += ' (Example: "%s%s:%s" )' % (ie.SEARCH_KEY, random.choice(_COUNTS), random.choice(_SEARCHES)) + desc += f'; "{ie.SEARCH_KEY}:" prefix (Example: "{ie.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(_SEARCHES)}")' write_string(desc + '\n', out=sys.stdout) sys.exit(0) if opts.ap_list_mso: @@ -256,6 +256,9 @@ def _real_main(argv=None): compat_opts = opts.compat_opts + def report_conflict(arg1, arg2): + warnings.append(f'{arg2} is ignored since {arg1} was given') + def _unused_compat_opt(name): if name not in compat_opts: return False @@ -287,10 +290,14 @@ def _real_main(argv=None): if _video_multistreams_set is False and _audio_multistreams_set is False: _unused_compat_opt('multistreams') outtmpl_default = opts.outtmpl.get('default') + if opts.useid: + if outtmpl_default is None: + outtmpl_default = opts.outtmpl['default'] = '%(id)s.%(ext)s' + else: + report_conflict('--output', '--id') if 'filename' in compat_opts: if outtmpl_default is None: - outtmpl_default = '%(title)s-%(id)s.%(ext)s' - opts.outtmpl.update({'default': outtmpl_default}) + outtmpl_default = opts.outtmpl['default'] = '%(title)s-%(id)s.%(ext)s' else: _unused_compat_opt('filename') @@ -363,9 +370,6 @@ def _real_main(argv=None): opts.addchapters = True opts.remove_chapters = opts.remove_chapters or [] - def report_conflict(arg1, arg2): - warnings.append('%s is ignored since %s was given' % (arg2, arg1)) - if (opts.remove_chapters or sponsorblock_query) and opts.sponskrub is not False: if opts.sponskrub: if opts.remove_chapters: @@ -738,7 +742,7 @@ def _real_main(argv=None): 'geo_bypass': opts.geo_bypass, 'geo_bypass_country': opts.geo_bypass_country, 'geo_bypass_ip_block': opts.geo_bypass_ip_block, - 'warnings': warnings, + '_warnings': warnings, 'compat_opts': compat_opts, } 
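The `yt_dlp/__init__.py` hunk above rewords extractor descriptions to advertise search prefixes such as `"ytsearch:"`. A minimal sketch of how such a prefix is consumed through the Python API (the query and result count are illustrative, not from this diff):

```python
# Hedged sketch: "ytsearch3:<query>" expands to the first 3 search results.
from yt_dlp import YoutubeDL

with YoutubeDL({'quiet': True, 'extract_flat': True}) as ydl:
    info = ydl.extract_info('ytsearch3:falling cat', download=False)
    for entry in info['entries']:
        print(entry.get('id'), entry.get('title'))
```

The same convention applies to the other prefixes documented throughout this commit (`bilisearch:`, `nicosearch:`, `scsearch:`, `gvsearch:`, and so on), each backed by an extractor's `SEARCH_KEY`.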
diff --git a/yt_dlp/compat.py b/yt_dlp/compat.py index b107b2114..8508f1465 100644 --- a/yt_dlp/compat.py +++ b/yt_dlp/compat.py @@ -19,6 +19,7 @@ import shlex import shutil import socket import struct +import subprocess import sys import tokenize import urllib @@ -162,7 +163,9 @@ except ImportError: def windows_enable_vt_mode(): # TODO: Do this the proper way https://bugs.python.org/issue30075 if compat_os_name != 'nt': return - os.system('') + startupinfo = subprocess.STARTUPINFO() + startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW + subprocess.Popen('', shell=True, startupinfo=startupinfo) # Deprecated diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index 5f7fdf584..c9ae9b6db 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -620,7 +620,7 @@ def _get_windows_v10_key(browser_root, logger): if path is None: logger.error('could not find local state file') return None - with open(path, 'r') as f: + with open(path, 'r', encoding='utf8') as f: data = json.load(f) try: base64_key = data['os_crypt']['encrypted_key'] diff --git a/yt_dlp/downloader/fragment.py b/yt_dlp/downloader/fragment.py index c345f3148..a9d1471f8 100644 --- a/yt_dlp/downloader/fragment.py +++ b/yt_dlp/downloader/fragment.py @@ -370,7 +370,8 @@ class FragmentFD(FileDownloader): if max_progress == 1: return self.download_and_append_fragments(*args[0], pack_func=pack_func, finish_func=finish_func) max_workers = self.params.get('concurrent_fragment_downloads', max_progress) - self._prepare_multiline_status(max_progress) + if max_progress > 1: + self._prepare_multiline_status(max_progress) def thread_func(idx, ctx, fragments, info_dict, tpe): ctx['max_progress'] = max_progress diff --git a/yt_dlp/extractor/__init__.py b/yt_dlp/extractor/__init__.py index 198c4ae17..b35484246 100644 --- a/yt_dlp/extractor/__init__.py +++ b/yt_dlp/extractor/__init__.py @@ -1,14 +1,15 @@ -from __future__ import unicode_literals +import os from ..utils import load_plugins -try: - from .lazy_extractors import * - from .lazy_extractors import _ALL_CLASSES - _LAZY_LOADER = True - _PLUGIN_CLASSES = {} -except ImportError: - _LAZY_LOADER = False +_LAZY_LOADER = False +if not os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'): + try: + from .lazy_extractors import * + from .lazy_extractors import _ALL_CLASSES + _LAZY_LOADER = True + except ImportError: + pass if not _LAZY_LOADER: from .extractors import * @@ -19,8 +20,8 @@ if not _LAZY_LOADER: ] _ALL_CLASSES.append(GenericIE) - _PLUGIN_CLASSES = load_plugins('extractor', 'IE', globals()) - _ALL_CLASSES = list(_PLUGIN_CLASSES.values()) + _ALL_CLASSES +_PLUGIN_CLASSES = load_plugins('extractor', 'IE', globals()) +_ALL_CLASSES = list(_PLUGIN_CLASSES.values()) + _ALL_CLASSES def gen_extractor_classes(): diff --git a/yt_dlp/extractor/adobepass.py b/yt_dlp/extractor/adobepass.py index 9378c33cd..bebcafa6b 100644 --- a/yt_dlp/extractor/adobepass.py +++ b/yt_dlp/extractor/adobepass.py @@ -39,8 +39,8 @@ MSO_INFO = { }, 'RCN': { 'name': 'RCN', - 'username_field': 'UserName', - 'password_field': 'UserPassword', + 'username_field': 'username', + 'password_field': 'password', }, 'Rogers': { 'name': 'Rogers', diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index d6c77e418..483f93d67 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -376,8 +376,10 @@ class BiliBiliIE(InfoExtractor): replies = traverse_obj( self._download_json( f'https://api.bilibili.com/x/v2/reply?pn={idx}&oid={video_id}&type=1&jsonp=jsonp&sort=2&_=1567227301685', - video_id, 
note=f'Extracting comments from page {idx}'), - ('data', 'replies')) or [] + video_id, note=f'Extracting comments from page {idx}', fatal=False), + ('data', 'replies')) + if not replies: + return for children in map(self._get_all_children, replies): yield from children @@ -566,7 +568,7 @@ class BilibiliCategoryIE(InfoExtractor): class BiliBiliSearchIE(SearchInfoExtractor): - IE_DESC = 'Bilibili video search, "bilisearch" keyword' + IE_DESC = 'Bilibili video search' _MAX_RESULTS = 100000 _SEARCH_KEY = 'bilisearch' diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py index 5e4526c53..4fcf2a9c1 100644 --- a/yt_dlp/extractor/cbc.py +++ b/yt_dlp/extractor/cbc.py @@ -2,6 +2,9 @@ from __future__ import unicode_literals import re +import json +import base64 +import time from .common import InfoExtractor from ..compat import ( @@ -244,37 +247,96 @@ class CBCGemIE(InfoExtractor): 'params': {'format': 'bv'}, 'skip': 'Geo-restricted to Canada', }] - _API_BASE = 'https://services.radio-canada.ca/ott/cbc-api/v2/assets/' + + _GEO_COUNTRIES = ['CA'] + _TOKEN_API_KEY = '3f4beddd-2061-49b0-ae80-6f1f2ed65b37' + _NETRC_MACHINE = 'cbcgem' + _claims_token = None + + def _new_claims_token(self, email, password): + data = json.dumps({ + 'email': email, + 'password': password, + }).encode() + headers = {'content-type': 'application/json'} + query = {'apikey': self._TOKEN_API_KEY} + resp = self._download_json('https://api.loginradius.com/identity/v2/auth/login', + None, data=data, headers=headers, query=query) + access_token = resp['access_token'] + + query = { + 'access_token': access_token, + 'apikey': self._TOKEN_API_KEY, + 'jwtapp': 'jwt', + } + resp = self._download_json('https://cloud-api.loginradius.com/sso/jwt/api/token', + None, headers=headers, query=query) + sig = resp['signature'] + + data = json.dumps({'jwt': sig}).encode() + headers = {'content-type': 'application/json', 'ott-device-type': 'web'} + resp = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/token', + None, data=data, headers=headers) + cbc_access_token = resp['accessToken'] + + headers = {'content-type': 'application/json', 'ott-device-type': 'web', 'ott-access-token': cbc_access_token} + resp = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/profile', + None, headers=headers) + return resp['claimsToken'] + + def _get_claims_token_expiry(self): + # Token is a JWT + # JWT is decoded here and 'exp' field is extracted + # It is a Unix timestamp for when the token expires + b64_data = self._claims_token.split('.')[1] + data = base64.urlsafe_b64decode(b64_data + "==") + return json.loads(data)['exp'] + + def claims_token_expired(self): + exp = self._get_claims_token_expiry() + if exp - time.time() < 10: + # It will expire in less than 10 seconds, or has already expired + return True + return False + + def claims_token_valid(self): + return self._claims_token is not None and not self.claims_token_expired() + + def _get_claims_token(self, email, password): + if not self.claims_token_valid(): + self._claims_token = self._new_claims_token(email, password) + self._downloader.cache.store(self._NETRC_MACHINE, 'claims_token', self._claims_token) + return self._claims_token + + def _real_initialize(self): + if self.claims_token_valid(): + return + self._claims_token = self._downloader.cache.load(self._NETRC_MACHINE, 'claims_token') def _real_extract(self, url): video_id = self._match_id(url) - video_info = self._download_json(self._API_BASE + video_id, video_id) - - last_error = None - attempt = -1 - 
retries = self.get_param('extractor_retries', 15) - while attempt < retries: - attempt += 1 - if last_error: - self.report_warning('%s. Retrying ...' % last_error) - m3u8_info = self._download_json( - video_info['playSession']['url'], video_id, - note='Downloading JSON metadata%s' % f' (attempt {attempt})') - m3u8_url = m3u8_info.get('url') - if m3u8_url: - break - elif m3u8_info.get('errorCode') == 1: - self.raise_geo_restricted(countries=['CA']) - else: - last_error = f'{self.IE_NAME} said: {m3u8_info.get("errorCode")} - {m3u8_info.get("message")}' - # 35 means media unavailable, but retries work - if m3u8_info.get('errorCode') != 35 or attempt >= retries: - raise ExtractorError(last_error) + video_info = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/assets/' + video_id, video_id) + + email, password = self._get_login_info() + if email and password: + claims_token = self._get_claims_token(email, password) + headers = {'x-claims-token': claims_token} + else: + headers = {} + m3u8_info = self._download_json(video_info['playSession']['url'], video_id, headers=headers) + m3u8_url = m3u8_info.get('url') + + if m3u8_info.get('errorCode') == 1: + self.raise_geo_restricted(countries=['CA']) + elif m3u8_info.get('errorCode') == 35: + self.raise_login_required(method='password') + elif m3u8_info.get('errorCode') != 0: + raise ExtractorError(f'{self.IE_NAME} said: {m3u8_info.get("errorCode")} - {m3u8_info.get("message")}') formats = self._extract_m3u8_formats(m3u8_url, video_id, m3u8_id='hls') self._remove_duplicate_formats(formats) - for i, format in enumerate(formats): + for format in formats: if format.get('vcodec') == 'none': if format.get('ext') is None: format['ext'] = 'm4a' @@ -377,7 +439,7 @@ class CBCGemPlaylistIE(InfoExtractor): class CBCGemLiveIE(InfoExtractor): IE_NAME = 'gem.cbc.ca:live' - _VALID_URL = r'https?://gem\.cbc\.ca/live/(?P<id>[0-9]{12})' + _VALID_URL = r'https?://gem\.cbc\.ca/live/(?P<id>\d+)' _TEST = { 'url': 'https://gem.cbc.ca/live/920604739687', 'info_dict': { @@ -396,21 +458,21 @@ class CBCGemLiveIE(InfoExtractor): # It's unclear where the chars at the end come from, but they appear to be # constant. Might need updating in the future. - _API = 'https://tpfeed.cbc.ca/f/ExhSPC/t_t3UKJR6MAT' + # There are two URLs, some livestreams are in one, and some + # in the other. The JSON schema is the same for both. 
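
_get_claims_token and _real_initialize above persist the claims token through the downloader cache, so a fresh login is only performed once the cached token has gone stale. The same load-check-store shape in isolation (the cache argument mirrors the cache.load/cache.store calls above but is a sketched stand-in, not yt-dlp's actual API):

    def get_claims_token(cache, fetch_new, still_valid):
        token = cache.load('cbcgem', 'claims_token')
        if token is not None and still_valid(token):
            return token  # reuse across runs, no network round-trip
        token = fetch_new()  # full email/password login flow
        cache.store('cbcgem', 'claims_token', token)
        return token
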
+ _API_URLS = ['https://tpfeed.cbc.ca/f/ExhSPC/t_t3UKJR6MAT', 'https://tpfeed.cbc.ca/f/ExhSPC/FNiv9xQx_BnT'] def _real_extract(self, url): video_id = self._match_id(url) - live_info = self._download_json(self._API, video_id)['entries'] - video_info = None - for stream in live_info: - if stream.get('guid') == video_id: - video_info = stream - - if video_info is None: - raise ExtractorError( - 'Couldn\'t find video metadata, maybe this livestream is now offline', - expected=True) + for api_url in self._API_URLS: + video_info = next(( + stream for stream in self._download_json(api_url, video_id)['entries'] + if stream.get('guid') == video_id), None) + if video_info: + break + else: + raise ExtractorError('Couldn\'t find video metadata, maybe this livestream is now offline', expected=True) return { '_type': 'url_transparent', diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index e00d8c42b..aa98c0cc9 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -74,6 +74,7 @@ from ..utils import ( strip_or_none, traverse_obj, unescapeHTML, + UnsupportedError, unified_strdate, unified_timestamp, update_Request, @@ -448,7 +449,9 @@ class InfoExtractor(object): } def __init__(self, downloader=None): - """Constructor. Receives an optional downloader.""" + """Constructor. Receives an optional downloader (a YoutubeDL instance). + If a downloader is not passed during initialization, + it must be set using "set_downloader()" before "extract()" is called""" self._ready = False self._x_forwarded_for_ip = None self._printed_messages = set() @@ -602,10 +605,19 @@ class InfoExtractor(object): if self.__maybe_fake_ip_and_retry(e.countries): continue raise + except UnsupportedError: + raise except ExtractorError as e: - video_id = e.video_id or self.get_temp_id(url) - raise ExtractorError( - e.msg, video_id=video_id, ie=self.IE_NAME, tb=e.traceback, expected=e.expected, cause=e.cause) + kwargs = { + 'video_id': e.video_id or self.get_temp_id(url), + 'ie': self.IE_NAME, + 'tb': e.traceback, + 'expected': e.expected, + 'cause': e.cause + } + if hasattr(e, 'countries'): + kwargs['countries'] = e.countries + raise type(e)(e.msg, **kwargs) except compat_http_client.IncompleteRead as e: raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url)) except (KeyError, StopIteration) as e: @@ -664,7 +676,7 @@ class InfoExtractor(object): See _download_webpage docstring for arguments specification. """ if not self._downloader._first_webpage_request: - sleep_interval = float_or_none(self.get_param('sleep_interval_requests')) or 0 + sleep_interval = self.get_param('sleep_interval_requests') or 0 if sleep_interval > 0: self.to_screen('Sleeping %s seconds ...' 
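
The rewritten CBCGemLiveIE._real_extract above leans on Python's for/else: the else arm runs only when the loop finishes without a break, i.e. when none of the feeds contained the requested guid. The same control flow in isolation (names are illustrative):

    def find_stream(feeds, guid):
        # feeds: endpoint -> list of entry dicts, standing in for the JSON
        # downloaded from each _API_URLS endpoint above
        for entries in feeds.values():
            stream = next((s for s in entries if s.get('guid') == guid), None)
            if stream:
                break
        else:  # no break fired: every feed was searched without a hit
            raise LookupError('stream not found in any feed')
        return stream
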
% sleep_interval) time.sleep(sleep_interval) @@ -1137,7 +1149,7 @@ class InfoExtractor(object): if mobj: break - _name = self._downloader._color_text(name, 'blue') + _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS) if mobj: if group is None: @@ -1537,8 +1549,8 @@ class InfoExtractor(object): 'ie_pref': {'priority': True, 'type': 'extractor'}, 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)}, 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)}, - 'lang': {'convert': 'ignore', 'field': 'language_preference'}, - 'quality': {'convert': 'float_none', 'default': -1}, + 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1}, + 'quality': {'convert': 'float', 'default': -1}, 'filesize': {'convert': 'bytes'}, 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'}, 'id': {'convert': 'string', 'field': 'format_id'}, @@ -1549,7 +1561,7 @@ class InfoExtractor(object): 'vbr': {'convert': 'float_none'}, 'abr': {'convert': 'float_none'}, 'asr': {'convert': 'float_none'}, - 'source': {'convert': 'ignore', 'field': 'source_preference'}, + 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1}, 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')}, 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True}, @@ -3618,9 +3630,11 @@ class SearchInfoExtractor(InfoExtractor): """ Base class for paged search queries extractors. They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query} - Instances should define _SEARCH_KEY and _MAX_RESULTS. + Instances should define _SEARCH_KEY and optionally _MAX_RESULTS """ + _MAX_RESULTS = float('inf') + @classmethod def _make_valid_url(cls): return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY diff --git a/yt_dlp/extractor/coub.py b/yt_dlp/extractor/coub.py index eba6b73ba..e90aa1954 100644 --- a/yt_dlp/extractor/coub.py +++ b/yt_dlp/extractor/coub.py @@ -57,7 +57,7 @@ class CoubIE(InfoExtractor): file_versions = coub['file_versions'] - QUALITIES = ('low', 'med', 'high') + QUALITIES = ('low', 'med', 'high', 'higher') MOBILE = 'mobile' IPHONE = 'iphone' @@ -86,6 +86,7 @@ class CoubIE(InfoExtractor): 'format_id': '%s-%s-%s' % (HTML5, kind, quality), 'filesize': int_or_none(item.get('size')), 'vcodec': 'none' if kind == 'audio' else None, + 'acodec': 'none' if kind == 'video' else None, 'quality': quality_key(quality), 'source_preference': preference_key(HTML5), }) diff --git a/yt_dlp/extractor/dplay.py b/yt_dlp/extractor/dplay.py index e0e446b87..d62480810 100644 --- a/yt_dlp/extractor/dplay.py +++ b/yt_dlp/extractor/dplay.py @@ -325,7 +325,7 @@ class HGTVDeIE(DPlayIE): class DiscoveryPlusIE(DPlayIE): - _VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/video' + DPlayIE._PATH_REGEX + _VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/(?:\w{2}/)?video' + DPlayIE._PATH_REGEX _TESTS = [{ 'url': 'https://www.discoveryplus.com/video/property-brothers-forever-home/food-and-family', 'info_dict': { @@ -343,6 +343,9 @@ class DiscoveryPlusIE(DPlayIE): 'episode_number': 1, }, 'skip': 'Available for Premium users', + }, { + 'url': 'https://discoveryplus.com/ca/video/bering-sea-gold-discovery-ca/goldslingers', + 'only_matching': True, }] _PRODUCT = 'dplus_us' diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py index f4f817fcb..9d963ee46 100644 --- a/yt_dlp/extractor/extractors.py +++ b/yt_dlp/extractor/extractors.py @@ -744,7 +744,10 @@ from .mdr import MDRIE from 
.medaltv import MedalTVIE from .mediaite import MediaiteIE from .mediaklikk import MediaKlikkIE -from .mediaset import MediasetIE +from .mediaset import ( + MediasetIE, + MediasetShowIE, +) from .mediasite import ( MediasiteIE, MediasiteCatalogIE, @@ -760,6 +763,7 @@ from .metacritic import MetacriticIE from .mgoon import MgoonIE from .mgtv import MGTVIE from .miaopai import MiaoPaiIE +from .microsoftstream import MicrosoftStreamIE from .microsoftvirtualacademy import ( MicrosoftVirtualAcademyIE, MicrosoftVirtualAcademyCourseIE, @@ -792,6 +796,7 @@ from .mlb import ( MLBIE, MLBVideoIE, ) +from .mlssoccer import MLSSoccerIE from .mnet import MnetIE from .moevideo import MoeVideoIE from .mofosex import ( @@ -1288,6 +1293,7 @@ from .skynewsarabia import ( from .skynewsau import SkyNewsAUIE from .sky import ( SkyNewsIE, + SkyNewsStoryIE, SkySportsIE, SkySportsNewsIE, ) @@ -1387,10 +1393,7 @@ from .svt import ( from .swrmediathek import SWRMediathekIE from .syfy import SyfyIE from .sztvhu import SztvHuIE -from .tagesschau import ( - TagesschauPlayerIE, - TagesschauIE, -) +from .tagesschau import TagesschauIE from .tass import TassIE from .tbs import TBSIE from .tdslifeway import TDSLifewayIE @@ -1444,6 +1447,10 @@ from .theweatherchannel import TheWeatherChannelIE from .thisamericanlife import ThisAmericanLifeIE from .thisav import ThisAVIE from .thisoldhouse import ThisOldHouseIE +from .threespeak import ( + ThreeSpeakIE, + ThreeSpeakUserIE, +) from .threeqsdn import ThreeQSDNIE from .tiktok import ( TikTokIE, diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 5918c8c56..0d279016b 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -1188,6 +1188,21 @@ class GenericIE(InfoExtractor): }, 'skip': 'Only has video a few mornings per month, see http://www.suffolk.edu/sjc/', }, + # jwplayer with only the json URL + { + 'url': 'https://www.hollywoodreporter.com/news/general-news/dunkirk-team-reveals-what-christopher-nolan-said-oscar-win-meet-your-oscar-winner-1092454', + 'info_dict': { + 'id': 'TljWkvWH', + 'ext': 'mp4', + 'upload_date': '20180306', + 'title': 'md5:91eb1862f6526415214f62c00b453936', + 'description': 'md5:73048ae50ae953da10549d1d2fe9b3aa', + 'timestamp': 1520367225, + }, + 'params': { + 'skip_download': True, + }, + }, # Complex jwplayer { 'url': 'http://www.indiedb.com/games/king-machine/videos', @@ -3503,6 +3518,13 @@ class GenericIE(InfoExtractor): jwplayer_data = self._find_jwplayer_data( webpage, video_id, transform_source=js_to_json) if jwplayer_data: + if isinstance(jwplayer_data.get('playlist'), str): + return { + **info_dict, + '_type': 'url', + 'ie_key': JWPlatformIE.ie_key(), + 'url': jwplayer_data['playlist'], + } try: info = self._parse_jwplayer_data( jwplayer_data, video_id, require_title=False, base_url=url) @@ -3561,8 +3583,7 @@ class GenericIE(InfoExtractor): return info_dict # Looking for http://schema.org/VideoObject - json_ld = self._search_json_ld( - webpage, video_id, default={}, expected_type='VideoObject') + json_ld = self._search_json_ld(webpage, video_id, default={}) if json_ld.get('url'): return merge_dicts(json_ld, info_dict) diff --git a/yt_dlp/extractor/instagram.py b/yt_dlp/extractor/instagram.py index 3801c7af9..ccfcddd5b 100644 --- a/yt_dlp/extractor/instagram.py +++ b/yt_dlp/extractor/instagram.py @@ -4,6 +4,7 @@ import itertools import hashlib import json import re +import time from .common import InfoExtractor from ..compat import ( @@ -20,11 +21,13 @@ from ..utils import ( try_get, url_or_none, 
variadic, + urlencode_postdata, ) class InstagramIE(InfoExtractor): _VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com/(?:p|tv|reel)/(?P<id>[^/?#&]+))' + _NETRC_MACHINE = 'instagram' _TESTS = [{ 'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc', 'md5': '0d2da106a9d2631273e192b372806516', @@ -140,6 +143,47 @@ class InstagramIE(InfoExtractor): if mobj: return mobj.group('link') + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + login_webpage = self._download_webpage( + 'https://www.instagram.com/accounts/login/', None, + note='Downloading login webpage', errnote='Failed to download login webpage') + + shared_data = self._parse_json( + self._search_regex( + r'window\._sharedData\s*=\s*({.+?});', + login_webpage, 'shared data', default='{}'), + None) + + login = self._download_json('https://www.instagram.com/accounts/login/ajax/', None, note='Logging in', headers={ + 'Accept': '*/*', + 'X-IG-App-ID': '936619743392459', + 'X-ASBD-ID': '198387', + 'X-IG-WWW-Claim': '0', + 'X-Requested-With': 'XMLHttpRequest', + 'X-CSRFToken': shared_data['config']['csrf_token'], + 'X-Instagram-AJAX': shared_data['rollout_hash'], + 'Referer': 'https://www.instagram.com/', + }, data=urlencode_postdata({ + 'enc_password': f'#PWD_INSTAGRAM_BROWSER:0:{int(time.time())}:{password}', + 'username': username, + 'queryParams': '{}', + 'optIntoOneTap': 'false', + 'stopDeletionNonce': '', + 'trustedDeviceRecords': '{}', + })) + + if not login.get('authenticated'): + if login.get('message'): + raise ExtractorError(f'Unable to login: {login["message"]}') + raise ExtractorError('Unable to login') + + def _real_initialize(self): + self._login() + def _real_extract(self, url): mobj = self._match_valid_url(url) video_id = mobj.group('id') @@ -147,7 +191,7 @@ class InstagramIE(InfoExtractor): webpage, urlh = self._download_webpage_handle(url, video_id) if 'www.instagram.com/accounts/login' in urlh.geturl().rstrip('/'): - self.raise_login_required('You need to log in to access this content', method='cookies') + self.raise_login_required('You need to log in to access this content') (media, video_url, description, thumbnail, timestamp, uploader, uploader_id, like_count, comment_count, comments, height, diff --git a/yt_dlp/extractor/itv.py b/yt_dlp/extractor/itv.py index d69782b78..6e6a3673c 100644 --- a/yt_dlp/extractor/itv.py +++ b/yt_dlp/extractor/itv.py @@ -220,16 +220,23 @@ class ITVIE(InfoExtractor): class ITVBTCCIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?itv\.com/btcc/(?:[^/]+/)*(?P<id>[^/?#&]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?itv\.com/(?:news|btcc)/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TESTS = [{ 'url': 'https://www.itv.com/btcc/articles/btcc-2019-brands-hatch-gp-race-action', 'info_dict': { 'id': 'btcc-2019-brands-hatch-gp-race-action', 'title': 'BTCC 2019: Brands Hatch GP race action', }, 'playlist_count': 12, - } - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1582188683001/HkiHLnNRx_default/index.html?videoId=%s' + }, { + 'url': 'https://www.itv.com/news/2021-10-27/i-have-to-protect-the-country-says-rishi-sunak-as-uk-faces-interest-rate-hike', + 'info_dict': { + 'id': 'i-have-to-protect-the-country-says-rishi-sunak-as-uk-faces-interest-rate-hike', + 'title': 'md5:6ef054dd9f069330db3dcc66cb772d32' + }, + 'playlist_count': 4 + }] + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' def _real_extract(self, url): playlist_id = self._match_id(url) @@ -240,15 +247,15 @@ class 
ITVBTCCIE(InfoExtractor): '(?s)<script[^>]+id=[\'"]__NEXT_DATA__[^>]*>([^<]+)</script>', webpage, 'json_map'), playlist_id), lambda x: x['props']['pageProps']['article']['body']['content']) or [] - # Discard empty objects - video_ids = [] + entries = [] for video in json_map: - if video['data'].get('id'): - video_ids.append(video['data']['id']) - - entries = [ - self.url_result( - smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, { + if not any(video['data'].get(attr) == 'Brightcove' for attr in ('name', 'type')): + continue + video_id = video['data']['id'] + account_id = video['data']['accountId'] + player_id = video['data']['playerId'] + entries.append(self.url_result( + smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id), { # ITV does not like some GB IP ranges, so here are some # IP blocks it accepts 'geo_ip_blocks': [ @@ -256,8 +263,7 @@ class ITVBTCCIE(InfoExtractor): ], 'referrer': url, }), - ie=BrightcoveNewIE.ie_key(), video_id=video_id) - for video_id in video_ids] + ie=BrightcoveNewIE.ie_key(), video_id=video_id)) title = self._og_search_title(webpage, fatal=False) diff --git a/yt_dlp/extractor/mediaset.py b/yt_dlp/extractor/mediaset.py index 26e7abc49..119b39997 100644 --- a/yt_dlp/extractor/mediaset.py +++ b/yt_dlp/extractor/mediaset.py @@ -1,13 +1,17 @@ # coding: utf-8 from __future__ import unicode_literals +import functools import re from .theplatform import ThePlatformBaseIE from ..utils import ( ExtractorError, int_or_none, + OnDemandPagedList, parse_qs, + try_get, + urljoin, update_url_query, ) @@ -212,3 +216,81 @@ class MediasetIE(ThePlatformBaseIE): 'subtitles': subtitles, }) return info + + +class MediasetShowIE(MediasetIE): + _VALID_URL = r'''(?x) + (?: + https?:// + (?:(?:www|static3)\.)?mediasetplay\.mediaset\.it/ + (?: + (?:fiction|programmi-tv|serie-tv)/(?:.+?/)? + (?:[a-z]+)_SE(?P<id>\d{12}) + (?:,ST(?P<st>\d{12}))? 
+ (?:,sb(?P<sb>\d{9}))?$ + ) + ) + ''' + _TESTS = [{ + # TV Show webpage (with a single playlist) + 'url': 'https://www.mediasetplay.mediaset.it/serie-tv/fireforce/episodi_SE000000001556', + 'info_dict': { + 'id': '000000001556', + 'title': 'Fire Force', + }, + 'playlist_count': 1, + }, { + # TV Show webpage (with multiple playlists) + 'url': 'https://www.mediasetplay.mediaset.it/programmi-tv/leiene/leiene_SE000000000061,ST000000002763', + 'info_dict': { + 'id': '000000002763', + 'title': 'Le Iene', + }, + 'playlist_count': 7, + }, { + # TV Show specific playlist (single page) + 'url': 'https://www.mediasetplay.mediaset.it/serie-tv/fireforce/episodi_SE000000001556,ST000000002738,sb100013107', + 'info_dict': { + 'id': '100013107', + 'title': 'Episodi', + }, + 'playlist_count': 4, + }, { + # TV Show specific playlist (with multiple pages) + 'url': 'https://www.mediasetplay.mediaset.it/programmi-tv/leiene/iservizi_SE000000000061,ST000000002763,sb100013375', + 'info_dict': { + 'id': '100013375', + 'title': 'I servizi', + }, + 'playlist_count': 53, + }] + + _BY_SUBBRAND = 'https://feed.entertainment.tv.theplatform.eu/f/PR1GhC/mediaset-prod-all-programs-v2?byCustomValue={subBrandId}{%s}&sort=:publishInfo_lastPublished|desc,tvSeasonEpisodeNumber|desc&range=%d-%d' + _PAGE_SIZE = 25 + + def _fetch_page(self, sb, page): + lower_limit = page * self._PAGE_SIZE + 1 + upper_limit = lower_limit + self._PAGE_SIZE - 1 + content = self._download_json( + self._BY_SUBBRAND % (sb, lower_limit, upper_limit), sb) + for entry in content.get('entries') or []: + yield self.url_result( + 'mediaset:' + entry['guid'], + playlist_title=entry['mediasetprogram$subBrandDescription']) + + def _real_extract(self, url): + playlist_id, st, sb = self._match_valid_url(url).group('id', 'st', 'sb') + if not sb: + page = self._download_webpage(url, playlist_id) + entries = [self.url_result(urljoin('https://www.mediasetplay.mediaset.it', url)) + for url in re.findall(r'href="([^<>=]+SE\d{12},ST\d{12},sb\d{9})">[^<]+<', page)] + title = (self._html_search_regex(r'(?s)<h1[^>]*>(.+?)</h1>', page, 'title', default=None) + or self._og_search_title(page)) + return self.playlist_result(entries, st or playlist_id, title) + + entries = OnDemandPagedList( + functools.partial(self._fetch_page, sb), + self._PAGE_SIZE) + title = try_get(entries, lambda x: x[0]['playlist_title']) + + return self.playlist_result(entries, sb, title) diff --git a/yt_dlp/extractor/microsoftstream.py b/yt_dlp/extractor/microsoftstream.py new file mode 100644 index 000000000..4d5a9df1f --- /dev/null +++ b/yt_dlp/extractor/microsoftstream.py @@ -0,0 +1,125 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from base64 import b64decode + +from .common import InfoExtractor +from ..utils import ( + merge_dicts, + parse_iso8601, + parse_duration, + parse_resolution, + try_get, + url_basename, +) + + +class MicrosoftStreamIE(InfoExtractor): + IE_NAME = 'microsoftstream' + IE_DESC = 'Microsoft Stream' + _VALID_URL = r'https?://(?:web|www|msit)\.microsoftstream\.com/video/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + + _TESTS = [{ + 'url': 'https://web.microsoftstream.com/video/6e51d928-4f46-4f1c-b141-369925e37b62?list=user&userId=f5491e02-e8fe-4e34-b67c-ec2e79a6ecc0', + 'only_matching': True, + }, { + 'url': 'https://msit.microsoftstream.com/video/b60f5987-aabd-4e1c-a42f-c559d138f2ca', + 'only_matching': True, + }] + + def _get_all_subtitles(self, api_url, video_id, headers): + subtitles = {} + automatic_captions = {} + text_tracks = 
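
MediasetShowIE._fetch_page above maps a zero-based page index onto the feed's one-based, inclusive range=lower-upper query parameter before the callable is handed to OnDemandPagedList. The index arithmetic, checked in isolation with the same page size of 25:

    PAGE_SIZE = 25

    def page_range(page):
        lower = page * PAGE_SIZE + 1
        return lower, lower + PAGE_SIZE - 1

    assert page_range(0) == (1, 25)   # first page
    assert page_range(1) == (26, 50)  # second page
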
self._download_json( + f'{api_url}/videos/{video_id}/texttracks', video_id, + note='Downloading subtitles JSON', fatal=False, headers=headers, + query={'api-version': '1.4-private'}).get('value') or [] + for track in text_tracks: + if not track.get('language') or not track.get('url'): + continue + sub_dict = automatic_captions if track.get('autoGenerated') else subtitles + sub_dict.setdefault(track['language'], []).append({ + 'ext': 'vtt', + 'url': track.get('url') + }) + return { + 'subtitles': subtitles, + 'automatic_captions': automatic_captions + } + + def extract_all_subtitles(self, *args, **kwargs): + if (self.get_param('writesubtitles', False) + or self.get_param('writeautomaticsub', False) + or self.get_param('listsubtitles')): + return self._get_all_subtitles(*args, **kwargs) + return {} + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + if '<title>Microsoft Stream</title>' not in webpage: + self.raise_login_required(method='cookies') + + access_token = self._html_search_regex(r'"AccessToken":"(.+?)"', webpage, 'access token') + api_url = self._html_search_regex(r'"ApiGatewayUri":"(.+?)"', webpage, 'api url') + + headers = {'Authorization': f'Bearer {access_token}'} + + video_data = self._download_json( + f'{api_url}/videos/{video_id}', video_id, + headers=headers, query={ + '$expand': 'creator,tokens,status,liveEvent,extensions', + 'api-version': '1.4-private' + }) + video_id = video_data.get('id') or video_id + language = video_data.get('language') + + thumbnails = [] + for thumbnail_id in ('extraSmall', 'small', 'medium', 'large'): + thumbnail_url = try_get(video_data, lambda x: x['posterImage'][thumbnail_id]['url'], str) + if not thumbnail_url: + continue + thumb = { + 'id': thumbnail_id, + 'url': thumbnail_url, + } + thumb_name = url_basename(thumbnail_url) + thumb_name = str(b64decode(thumb_name + '=' * (-len(thumb_name) % 4))) + thumb.update(parse_resolution(thumb_name)) + thumbnails.append(thumb) + + formats = [] + for playlist in video_data['playbackUrls']: + if playlist['mimeType'] == 'application/vnd.apple.mpegurl': + formats.extend(self._extract_m3u8_formats( + playlist['playbackUrl'], video_id, + ext='mp4', entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False, headers=headers)) + elif playlist['mimeType'] == 'application/dash+xml': + formats.extend(self._extract_mpd_formats( + playlist['playbackUrl'], video_id, mpd_id='dash', + fatal=False, headers=headers)) + elif playlist['mimeType'] == 'application/vnd.ms-sstr+xml': + formats.extend(self._extract_ism_formats( + playlist['playbackUrl'], video_id, ism_id='mss', + fatal=False, headers=headers)) + formats = [merge_dicts(f, {'language': language}) for f in formats] + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_data['name'], + 'description': video_data.get('description'), + 'uploader': try_get(video_data, lambda x: x['creator']['name'], str), + 'uploader_id': try_get(video_data, (lambda x: x['creator']['mail'], + lambda x: x['creator']['id']), str), + 'thumbnails': thumbnails, + **self.extract_all_subtitles(api_url, video_id, headers), + 'timestamp': parse_iso8601(video_data.get('created')), + 'duration': parse_duration(try_get(video_data, lambda x: x['media']['duration'])), + 'webpage_url': f'https://web.microsoftstream.com/video/{video_id}', + 'view_count': try_get(video_data, lambda x: x['metrics']['views'], int), + 'like_count': try_get(video_data, lambda x: x['metrics']['likes'], int), + 'comment_count': 
try_get(video_data, lambda x: x['metrics']['comments'], int), + 'formats': formats, + } diff --git a/yt_dlp/extractor/mlssoccer.py b/yt_dlp/extractor/mlssoccer.py new file mode 100644 index 000000000..2d65787e2 --- /dev/null +++ b/yt_dlp/extractor/mlssoccer.py @@ -0,0 +1,118 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class MLSSoccerIE(InfoExtractor): + _VALID_DOMAINS = r'(?:(?:cfmontreal|intermiamicf|lagalaxy|lafc|houstondynamofc|dcunited|atlutd|mlssoccer|fcdallas|columbuscrew|coloradorapids|fccincinnati|chicagofirefc|austinfc|nashvillesc|whitecapsfc|sportingkc|soundersfc|sjearthquakes|rsl|timbers|philadelphiaunion|orlandocitysc|newyorkredbulls|nycfc)\.com|(?:torontofc)\.ca|(?:revolutionsoccer)\.net)' + _VALID_URL = r'(?:https?://)(?:www\.)?%s/video/#?(?P<id>[^/&$#?]+)' % _VALID_DOMAINS + + _TESTS = [{ + 'url': 'https://www.mlssoccer.com/video/the-octagon-can-alphonso-davies-lead-canada-to-first-world-cup-since-1986#the-octagon-can-alphonso-davies-lead-canada-to-first-world-cup-since-1986', + 'info_dict': { + 'id': '6276033198001', + 'ext': 'mp4', + 'title': 'The Octagon | Can Alphonso Davies lead Canada to first World Cup since 1986?', + 'description': 'md5:f0a883ee33592a0221798f451a98be8f', + 'thumbnail': 'https://cf-images.us-east-1.prod.boltdns.net/v1/static/5530036772001/1bbc44f6-c63c-4981-82fa-46b0c1f891e0/5c1ca44a-a033-4e98-b531-ff24c4947608/160x90/match/image.jpg', + 'duration': 350.165, + 'timestamp': 1633627291, + 'uploader_id': '5530036772001', + 'tags': ['club/canada'], + 'is_live': False, + 'duration_string': '5:50', + 'upload_date': '20211007', + 'filesize_approx': 255193528.83200002 + }, + 'params': {'skip_download': True} + }, { + 'url': 'https://www.whitecapsfc.com/video/highlights-san-jose-earthquakes-vs-vancouver-whitecaps-fc-october-23-2021#highlights-san-jose-earthquakes-vs-vancouver-whitecaps-fc-october-23-2021', + 'only_matching': True + }, { + 'url': 'https://www.torontofc.ca/video/highlights-toronto-fc-vs-cf-montreal-october-23-2021-x6733#highlights-toronto-fc-vs-cf-montreal-october-23-2021-x6733', + 'only_matching': True + }, { + 'url': 'https://www.sportingkc.com/video/post-match-press-conference-john-pulskamp-oct-27-2021#post-match-press-conference-john-pulskamp-oct-27-2021', + 'only_matching': True + }, { + 'url': 'https://www.soundersfc.com/video/highlights-seattle-sounders-fc-vs-sporting-kansas-city-october-23-2021', + 'only_matching': True + }, { + 'url': 'https://www.sjearthquakes.com/video/#highlights-austin-fc-vs-san-jose-earthquakes-june-19-2021', + 'only_matching': True + }, { + 'url': 'https://www.rsl.com/video/2021-u-of-u-health-mic-d-up-vs-colorado-10-16-21#2021-u-of-u-health-mic-d-up-vs-colorado-10-16-21', + 'only_matching': True + }, { + 'url': 'https://www.timbers.com/video/highlights-d-chara-asprilla-with-goals-in-portland-timbers-2-0-win-over-san-jose#highlights-d-chara-asprilla-with-goals-in-portland-timbers-2-0-win-over-san-jose', + 'only_matching': True + }, { + 'url': 'https://www.philadelphiaunion.com/video/highlights-torvphi', + 'only_matching': True + }, { + 'url': 'https://www.orlandocitysc.com/video/highlight-columbus-crew-vs-orlando-city-sc', + 'only_matching': True + }, { + 'url': 'https://www.newyorkredbulls.com/video/all-access-matchday-double-derby-week#all-access-matchday-double-derby-week', + 'only_matching': True + }, { + 'url': 'https://www.nycfc.com/video/highlights-nycfc-1-0-chicago-fire-fc#highlights-nycfc-1-0-chicago-fire-fc', + 'only_matching': True + }, { + 
'url': 'https://www.revolutionsoccer.net/video/two-minute-highlights-revs-1-rapids-0-october-27-2021#two-minute-highlights-revs-1-rapids-0-october-27-2021', + 'only_matching': True + }, { + 'url': 'https://www.nashvillesc.com/video/goal-c-j-sapong-nashville-sc-92nd-minute', + 'only_matching': True + }, { + 'url': 'https://www.cfmontreal.com/video/faits-saillants-tor-v-mtl#faits-saillants-orl-v-mtl-x5645', + 'only_matching': True + }, { + 'url': 'https://www.intermiamicf.com/video/all-access-victory-vs-nashville-sc-by-ukg#all-access-victory-vs-nashville-sc-by-ukg', + 'only_matching': True + }, { + 'url': 'https://www.lagalaxy.com/video/#moment-of-the-month-presented-by-san-manuel-casino-rayan-raveloson-scores-his-se', + 'only_matching': True + }, { + 'url': 'https://www.lafc.com/video/breaking-down-lafc-s-final-6-matches-of-the-2021-mls-regular-season#breaking-down-lafc-s-final-6-matches-of-the-2021-mls-regular-season', + 'only_matching': True + }, { + 'url': 'https://www.houstondynamofc.com/video/postgame-press-conference-michael-nelson-presented-by-coushatta-casino-res-x9660#postgame-press-conference-michael-nelson-presented-by-coushatta-casino-res-x9660', + 'only_matching': True + }, { + 'url': 'https://www.dcunited.com/video/tony-alfaro-my-family-pushed-me-to-believe-everything-was-possible', + 'only_matching': True + }, { + 'url': 'https://www.fcdallas.com/video/highlights-fc-dallas-vs-minnesota-united-fc-october-02-2021#highlights-fc-dallas-vs-minnesota-united-fc-october-02-2021', + 'only_matching': True + }, { + 'url': 'https://www.columbuscrew.com/video/match-rewind-columbus-crew-vs-new-york-red-bulls-october-23-2021', + 'only_matching': True + }, { + 'url': 'https://www.coloradorapids.com/video/postgame-reaction-robin-fraser-october-27#postgame-reaction-robin-fraser-october-27', + 'only_matching': True + }, { + 'url': 'https://www.fccincinnati.com/video/#keeping-cincy-chill-presented-by-coors-lite', + 'only_matching': True + }, { + 'url': 'https://www.chicagofirefc.com/video/all-access-fire-score-dramatic-road-win-in-cincy#all-access-fire-score-dramatic-road-win-in-cincy', + 'only_matching': True + }, { + 'url': 'https://www.austinfc.com/video/highlights-colorado-rapids-vs-austin-fc-september-29-2021#highlights-colorado-rapids-vs-austin-fc-september-29-2021', + 'only_matching': True + }, { + 'url': 'https://www.atlutd.com/video/goal-josef-martinez-scores-in-the-73rd-minute#goal-josef-martinez-scores-in-the-73rd-minute', + 'only_matching': True + }] + + def _real_extract(self, url): + id = self._match_id(url) + webpage = self._download_webpage(url, id) + data_json = self._parse_json(self._html_search_regex(r'data-options\=\"([^\"]+)\"', webpage, 'json'), id)['videoList'][0] + return { + 'id': id, + '_type': 'url', + 'url': 'https://players.brightcove.net/%s/default_default/index.html?videoId=%s' % (data_json['accountId'], data_json['videoId']), + 'ie_key': 'BrightcoveNew', + } diff --git a/yt_dlp/extractor/mtv.py b/yt_dlp/extractor/mtv.py index e0608845d..141dd7deb 100644 --- a/yt_dlp/extractor/mtv.py +++ b/yt_dlp/extractor/mtv.py @@ -306,6 +306,14 @@ class MTVServicesInfoExtractor(InfoExtractor): mgid = self._extract_triforce_mgid(webpage) if not mgid: + mgid = self._search_regex( + r'"videoConfig":{"videoId":"(mgid:.*?)"', webpage, 'mgid', default=None) + + if not mgid: + mgid = self._search_regex( + r'"media":{"video":{"config":{"uri":"(mgid:.*?)"', webpage, 'mgid', default=None) + + if not mgid: data = self._parse_json(self._search_regex( r'__DATA__\s*=\s*({.+?});', webpage, 
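
MLSSoccerIE._real_extract above reads the player configuration out of an HTML data-options attribute and defers to the Brightcove extractor. The attribute-to-URL step can be reproduced with only the standard library (the markup below is a made-up miniature; the accountId and videoId values are taken from the test data above):

    import html
    import json
    import re

    markup = ('<div data-options="{&quot;videoList&quot;:[{&quot;accountId&quot;:'
              '&quot;5530036772001&quot;,&quot;videoId&quot;:&quot;6276033198001&quot;}]}">')
    raw = re.search(r'data-options="([^"]+)"', markup).group(1)
    video = json.loads(html.unescape(raw))['videoList'][0]
    url = ('https://players.brightcove.net/%s/default_default/index.html?videoId=%s'
           % (video['accountId'], video['videoId']))
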
'data'), None) main_container = self._extract_child_with_type(data, 'MainContainer') @@ -313,10 +321,6 @@ class MTVServicesInfoExtractor(InfoExtractor): video_player = self._extract_child_with_type(ab_testing or main_container, 'VideoPlayer') mgid = video_player['props']['media']['video']['config']['uri'] - if not mgid: - mgid = self._search_regex( - r'"media":{"video":{"config":{"uri":"(mgid:.*?)"', webpage, 'mgid', default=None) - return mgid def _real_extract(self, url): diff --git a/yt_dlp/extractor/naver.py b/yt_dlp/extractor/naver.py index acf53c1ff..a6821ba86 100644 --- a/yt_dlp/extractor/naver.py +++ b/yt_dlp/extractor/naver.py @@ -40,6 +40,7 @@ class NaverBaseIE(InfoExtractor): formats.append({ 'format_id': '%s_%s' % (stream.get('type') or stream_type, dict_get(encoding_option, ('name', 'id'))), 'url': stream_url, + 'ext': 'mp4', 'width': int_or_none(encoding_option.get('width')), 'height': int_or_none(encoding_option.get('height')), 'vbr': int_or_none(bitrate.get('video')), @@ -174,7 +175,7 @@ class NaverLiveIE(InfoExtractor): 'url': 'https://tv.naver.com/l/52010', 'info_dict': { 'id': '52010', - 'ext': 'm3u8', + 'ext': 'mp4', 'title': '[LIVE] 뉴스특보 : "수도권 거리두기, 2주간 2단계로 조정"', 'description': 'md5:df7f0c237a5ed5e786ce5c91efbeaab3', 'channel_id': 'NTV-ytnnews24-0', @@ -184,7 +185,7 @@ class NaverLiveIE(InfoExtractor): 'url': 'https://tv.naver.com/l/51549', 'info_dict': { 'id': '51549', - 'ext': 'm3u8', + 'ext': 'mp4', 'title': '연합뉴스TV - 코로나19 뉴스특보', 'description': 'md5:c655e82091bc21e413f549c0eaccc481', 'channel_id': 'NTV-yonhapnewstv-0', @@ -233,7 +234,7 @@ class NaverLiveIE(InfoExtractor): continue formats.extend(self._extract_m3u8_formats( - quality.get('url'), video_id, 'm3u8', + quality.get('url'), video_id, 'mp4', m3u8_id=quality.get('qualityId'), live=True )) self._sort_formats(formats) diff --git a/yt_dlp/extractor/niconico.py b/yt_dlp/extractor/niconico.py index 76f087057..4bcea33d5 100644 --- a/yt_dlp/extractor/niconico.py +++ b/yt_dlp/extractor/niconico.py @@ -704,7 +704,6 @@ class NicovideoSearchURLIE(InfoExtractor): class NicovideoSearchIE(SearchInfoExtractor, NicovideoSearchURLIE): IE_DESC = 'Nico video searches' - _MAX_RESULTS = float('inf') IE_NAME = NicovideoSearchIE_NAME _SEARCH_KEY = 'nicosearch' _TESTS = [] diff --git a/yt_dlp/extractor/nrk.py b/yt_dlp/extractor/nrk.py index b556bc6aa..49d58a685 100644 --- a/yt_dlp/extractor/nrk.py +++ b/yt_dlp/extractor/nrk.py @@ -147,7 +147,7 @@ class NRKIE(NRKBaseIE): def _real_extract(self, url): video_id = self._match_id(url).split('/')[-1] - path_templ = 'playback/%s/' + video_id + path_templ = 'playback/%s/program/' + video_id def call_playback_api(item, query=None): return self._call_api(path_templ % item, video_id, item, query=query) @@ -188,7 +188,7 @@ class NRKIE(NRKBaseIE): title = titles['title'] alt_title = titles.get('subtitle') - description = preplay.get('description') + description = try_get(preplay, lambda x: x['description'].replace('\r', '\n')) duration = parse_duration(playable.get('duration')) or parse_duration(data.get('duration')) thumbnails = [] diff --git a/yt_dlp/extractor/patreon.py b/yt_dlp/extractor/patreon.py index a189c0237..c7d316efc 100644 --- a/yt_dlp/extractor/patreon.py +++ b/yt_dlp/extractor/patreon.py @@ -161,7 +161,7 @@ class PatreonIE(InfoExtractor): if try_get(attributes, lambda x: x['embed']['provider']) == 'Vimeo': embed_html = try_get(attributes, lambda x: x['embed']['html']) v_url = url_or_none(compat_urllib_parse_unquote( - 
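
The mtv.py hunk above adds two more regex sources for the mgid and tries them in priority order, keeping the first hit before falling back to the heavier __DATA__ JSON parse. The "first match wins" cascade, factored out as a sketch (the helper name is illustrative):

    import re

    def first_match(patterns, text):
        for pattern in patterns:
            mobj = re.search(pattern, text)
            if mobj:
                return mobj.group(1)
        return None  # caller falls back to a heavier strategy
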
self._search_regex(r'src=(https%3A%2F%2Fplayer\.vimeo\.com.+)%3F', embed_html, 'vimeo url', fatal=False))) + self._search_regex(r'(https(?:%3A%2F%2F|://)player\.vimeo\.com.+app_id(?:=|%3D)+\d+)', embed_html, 'vimeo url', fatal=False))) if v_url: info.update({ '_type': 'url_transparent', diff --git a/yt_dlp/extractor/sky.py b/yt_dlp/extractor/sky.py index ff2c977a0..ad1e62d88 100644 --- a/yt_dlp/extractor/sky.py +++ b/yt_dlp/extractor/sky.py @@ -105,6 +105,34 @@ class SkyNewsIE(SkyBaseIE): } +class SkyNewsStoryIE(SkyBaseIE): + IE_NAME = 'sky:news:story' + _VALID_URL = r'https?://news\.sky\.com/story/[0-9a-z-]+-(?P<id>[0-9]+)' + _TEST = { + 'url': 'https://news.sky.com/story/budget-2021-chancellor-rishi-sunak-vows-address-will-deliver-strong-economy-fit-for-a-new-age-of-optimism-12445425', + 'info_dict': { + 'id': 'ref:0714acb9-123d-42c8-91b8-5c1bc6c73f20', + 'title': 'md5:e408dd7aad63f31a1817bbe40c7d276f', + 'description': 'md5:a881e12f49212f92be2befe4a09d288a', + 'ext': 'mp4', + 'upload_date': '20211027', + 'timestamp': 1635317494, + 'uploader_id': '6058004172001', + } + } + + def _real_extract(self, url): + article_id = self._match_id(url) + webpage = self._download_webpage(url, article_id) + + entries = [self._process_ooyala_element(webpage, sdc_el, url) + for sdc_el in re.findall(self._SDC_EL_REGEX, webpage)] + + return self.playlist_result( + entries, article_id, self._og_search_title(webpage), + self._html_search_meta(['og:description', 'description'], webpage)) + + class SkySportsNewsIE(SkyBaseIE): IE_NAME = 'sky:sports:news' _VALID_URL = r'https?://(?:www\.)?skysports\.com/([^/]+/)*news/\d+/(?P<id>\d+)' diff --git a/yt_dlp/extractor/soundcloud.py b/yt_dlp/extractor/soundcloud.py index e89383ff1..824528474 100644 --- a/yt_dlp/extractor/soundcloud.py +++ b/yt_dlp/extractor/soundcloud.py @@ -856,7 +856,7 @@ class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE): class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE): IE_NAME = 'soundcloud:search' IE_DESC = 'Soundcloud search' - _MAX_RESULTS = float('inf') + _SEARCH_KEY = 'scsearch' _TESTS = [{ 'url': 'scsearch15:post-avant jazzcore', 'info_dict': { @@ -865,7 +865,6 @@ class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE): 'playlist_count': 15, }] - _SEARCH_KEY = 'scsearch' _MAX_RESULTS_PER_PAGE = 200 _DEFAULT_RESULTS_PER_PAGE = 50 diff --git a/yt_dlp/extractor/tagesschau.py b/yt_dlp/extractor/tagesschau.py index 25c200455..6e03d0a7d 100644 --- a/yt_dlp/extractor/tagesschau.py +++ b/yt_dlp/extractor/tagesschau.py @@ -5,177 +5,63 @@ import re from .common import InfoExtractor from ..utils import ( - determine_ext, js_to_json, - parse_iso8601, - parse_filesize, + extract_attributes, + try_get, + int_or_none, ) -class TagesschauPlayerIE(InfoExtractor): - IE_NAME = 'tagesschau:player' - _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?P<kind>audio|video)/(?P=kind)-(?P<id>\d+)~player(?:_[^/?#&]+)?\.html' - - _TESTS = [{ - 'url': 'http://www.tagesschau.de/multimedia/video/video-179517~player.html', - 'md5': '8d09548d5c15debad38bee3a4d15ca21', - 'info_dict': { - 'id': '179517', - 'ext': 'mp4', - 'title': 'Marie Kristin Boese, ARD Berlin, über den zukünftigen Kurs der AfD', - 'thumbnail': r're:^https?:.*\.jpg$', - 'formats': 'mincount:6', - }, - }, { - 'url': 'https://www.tagesschau.de/multimedia/audio/audio-29417~player.html', - 'md5': '76e6eec6ebd40740671cf0a2c88617e5', - 'info_dict': { - 'id': '29417', - 'ext': 'mp3', - 'title': 'Trabi - Bye, bye Rennpappe', - 'thumbnail': r're:^https?:.*\.jpg$', - 'formats': 
'mincount:2', - }, - }, { - 'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417~player_autoplay-true.html', - 'only_matching': True, - }] - - _FORMATS = { - 'xs': {'quality': 0}, - 's': {'width': 320, 'height': 180, 'quality': 1}, - 'm': {'width': 512, 'height': 288, 'quality': 2}, - 'l': {'width': 960, 'height': 540, 'quality': 3}, - 'xl': {'width': 1280, 'height': 720, 'quality': 4}, - 'xxl': {'quality': 5}, - } - - def _extract_via_api(self, kind, video_id): - info = self._download_json( - 'https://www.tagesschau.de/api/multimedia/{0}/{0}-{1}.json'.format(kind, video_id), - video_id) - title = info['headline'] - formats = [] - for media in info['mediadata']: - for format_id, format_url in media.items(): - if determine_ext(format_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls')) - else: - formats.append({ - 'url': format_url, - 'format_id': format_id, - 'vcodec': 'none' if kind == 'audio' else None, - }) - self._sort_formats(formats) - timestamp = parse_iso8601(info.get('date')) - return { - 'id': video_id, - 'title': title, - 'timestamp': timestamp, - 'formats': formats, - } - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') - - # kind = mobj.group('kind').lower() - # if kind == 'video': - # return self._extract_via_api(kind, video_id) - - # JSON api does not provide some audio formats (e.g. ogg) thus - # extracting audio via webpage - - webpage = self._download_webpage(url, video_id) - - title = self._og_search_title(webpage).strip() - formats = [] - - for media_json in re.findall(r'({src\s*:\s*["\']http[^}]+type\s*:[^}]+})', webpage): - media = self._parse_json(js_to_json(media_json), video_id, fatal=False) - if not media: - continue - src = media.get('src') - if not src: - return - quality = media.get('quality') - kind = media.get('type', '').split('/')[0] - ext = determine_ext(src) - f = { - 'url': src, - 'format_id': '%s_%s' % (quality, ext) if quality else ext, - 'ext': ext, - 'vcodec': 'none' if kind == 'audio' else None, - } - f.update(self._FORMATS.get(quality, {})) - formats.append(f) - - self._sort_formats(formats) - - thumbnail = self._og_search_thumbnail(webpage) - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'formats': formats, - } - - class TagesschauIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/(?P<path>[^/]+/(?:[^/]+/)*?(?P<id>[^/#?]+?(?:-?[0-9]+)?))(?:~_?[^/#?]+?)?\.html' _TESTS = [{ 'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html', - 'md5': 'f7c27a0eff3bfe8c7727e65f8fe1b1e6', + 'md5': '7a7287612fa881a1ae1d087df45c2fd6', 'info_dict': { - 'id': 'video-102143', + 'id': 'video-102143-1', 'ext': 'mp4', 'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt', - 'description': '18.07.2015 20:10 Uhr', - 'thumbnail': r're:^https?:.*\.jpg$', }, }, { 'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html', 'md5': '3c54c1f6243d279b706bde660ceec633', 'info_dict': { - 'id': 'ts-5727', + 'id': 'ts-5727-1', 'ext': 'mp4', - 'title': 'Sendung: tagesschau \t04.12.2014 20:00 Uhr', - 'description': 'md5:695c01bfd98b7e313c501386327aea59', - 'thumbnail': r're:^https?:.*\.jpg$', + 'title': 'Ganze Sendung', }, }, { # exclusive audio 'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417.html', - 'md5': '76e6eec6ebd40740671cf0a2c88617e5', + 'md5': '4cf22023c285f35e99c24d290ba58cc9', 'info_dict': { - 'id': 'audio-29417', + 'id': 
'audio-29417-1', 'ext': 'mp3', - 'title': 'Trabi - Bye, bye Rennpappe', - 'description': 'md5:8687dda862cbbe2cfb2df09b56341317', - 'thumbnail': r're:^https?:.*\.jpg$', + 'title': 'Brasilianischer Präsident Bolsonaro unter Druck: Corona-Bericht wird vorgestellt', }, }, { - # audio in article 'url': 'http://www.tagesschau.de/inland/bnd-303.html', - 'md5': 'e0916c623e85fc1d2b26b78f299d3958', + 'md5': '12cfb212d9325b5ba0d52b625f1aa61c', 'info_dict': { - 'id': 'bnd-303', - 'ext': 'mp3', - 'title': 'Viele Baustellen für neuen BND-Chef', - 'description': 'md5:1e69a54be3e1255b2b07cdbce5bcd8b4', - 'thumbnail': r're:^https?:.*\.jpg$', + 'id': 'bnd-303-1', + 'ext': 'mp4', + 'title': 'SPD-Gruppenbild mit Bärbel Bas nach der Fraktionssitzung | dpa', }, }, { 'url': 'http://www.tagesschau.de/inland/afd-parteitag-135.html', 'info_dict': { 'id': 'afd-parteitag-135', - 'title': 'Möchtegern-Underdog mit Machtanspruch', + 'title': 'AfD', + }, + 'playlist_count': 20, + }, { + 'url': 'https://www.tagesschau.de/multimedia/audio/audio-29417~player.html', + 'info_dict': { + 'id': 'audio-29417-1', + 'ext': 'mp3', + 'title': 'Brasilianischer Präsident Bolsonaro unter Druck: Corona-Bericht wird vorgestellt', }, - 'playlist_count': 2, }, { 'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html', 'only_matching': True, @@ -206,62 +92,6 @@ class TagesschauIE(InfoExtractor): 'only_matching': True, }] - @classmethod - def suitable(cls, url): - return False if TagesschauPlayerIE.suitable(url) else super(TagesschauIE, cls).suitable(url) - - def _extract_formats(self, download_text, media_kind): - links = re.finditer( - r'<div class="button" title="(?P<title>[^"]*)"><a href="(?P<url>[^"]+)">(?P<name>.+?)</a></div>', - download_text) - formats = [] - for l in links: - link_url = l.group('url') - if not link_url: - continue - format_id = self._search_regex( - r'.*/[^/.]+\.([^/]+)\.[^/.]+$', link_url, 'format ID', - default=determine_ext(link_url)) - format = { - 'format_id': format_id, - 'url': l.group('url'), - 'format_name': l.group('name'), - } - title = l.group('title') - if title: - if media_kind.lower() == 'video': - m = re.match( - r'''(?x) - Video:\s*(?P<vcodec>[a-zA-Z0-9/._-]+)\s*&\#10; - (?P<width>[0-9]+)x(?P<height>[0-9]+)px&\#10; - (?P<vbr>[0-9]+)kbps&\#10; - Audio:\s*(?P<abr>[0-9]+)kbps,\s*(?P<audio_desc>[A-Za-z\.0-9]+)&\#10; - Größe:\s*(?P<filesize_approx>[0-9.,]+\s+[a-zA-Z]*B)''', - title) - if m: - format.update({ - 'format_note': m.group('audio_desc'), - 'vcodec': m.group('vcodec'), - 'width': int(m.group('width')), - 'height': int(m.group('height')), - 'abr': int(m.group('abr')), - 'vbr': int(m.group('vbr')), - 'filesize_approx': parse_filesize(m.group('filesize_approx')), - }) - else: - m = re.match( - r'(?P<format>.+?)-Format\s*:\s*(?P<abr>\d+)kbps\s*,\s*(?P<note>.+)', - title) - if m: - format.update({ - 'format_note': '%s, %s' % (m.group('format'), m.group('note')), - 'vcodec': 'none', - 'abr': int(m.group('abr')), - }) - formats.append(format) - self._sort_formats(formats) - return formats - def _real_extract(self, url): mobj = self._match_valid_url(url) video_id = mobj.group('id') or mobj.group('path') @@ -271,34 +101,46 @@ class TagesschauIE(InfoExtractor): title = self._html_search_regex( r'<span[^>]*class="headline"[^>]*>(.+?)</span>', - webpage, 'title', default=None) or self._og_search_title(webpage) - - DOWNLOAD_REGEX = r'(?s)<p>Wir bieten dieses (?P<kind>Video|Audio) in folgenden Formaten zum Download an:</p>\s*<div class="controls">(?P<links>.*?)</div>\s*<p>' - - webpage_type = 
self._og_search_property('type', webpage, default=None) - if webpage_type == 'website': # Article - entries = [] - for num, (entry_title, media_kind, download_text) in enumerate(re.findall( - r'(?s)<p[^>]+class="infotext"[^>]*>\s*(?:<a[^>]+>)?\s*<strong>(.+?)</strong>.*?</p>.*?%s' % DOWNLOAD_REGEX, - webpage), 1): + webpage, 'title', default=None) or self._og_search_title(webpage, fatal=False) + + entries = [] + videos = re.findall(r'<div[^>]+>', webpage) + num = 0 + for video in videos: + video = extract_attributes(video).get('data-config') + if not video: + continue + video = self._parse_json(video, video_id, transform_source=js_to_json, fatal=False) + video_formats = try_get(video, lambda x: x['mc']['_mediaArray'][0]['_mediaStreamArray']) + if not video_formats: + continue + num += 1 + for video_format in video_formats: + media_url = video_format.get('_stream') or '' + formats = [] + if media_url.endswith('master.m3u8'): + formats = self._extract_m3u8_formats(media_url, video_id, 'mp4', m3u8_id='hls') + elif media_url.endswith('.hi.mp3') and media_url.startswith('https://download'): + formats = [{ + 'url': media_url, + 'vcodec': 'none', + }] + if not formats: + continue entries.append({ 'id': '%s-%d' % (display_id, num), - 'title': '%s' % entry_title, - 'formats': self._extract_formats(download_text, media_kind), + 'title': try_get(video, lambda x: x['mc']['_title']), + 'duration': int_or_none(try_get(video, lambda x: x['mc']['_duration'])), + 'formats': formats }) - if len(entries) > 1: - return self.playlist_result(entries, display_id, title) - formats = entries[0]['formats'] - else: # Assume single video - download_text = self._search_regex( - DOWNLOAD_REGEX, webpage, 'download links', group='links') - media_kind = self._search_regex( - DOWNLOAD_REGEX, webpage, 'media kind', default='Video', group='kind') - formats = self._extract_formats(download_text, media_kind) - thumbnail = self._og_search_thumbnail(webpage) - description = self._html_search_regex( - r'(?s)<p class="teasertext">(.*?)</p>', - webpage, 'description', default=None) + if len(entries) > 1: + return self.playlist_result(entries, display_id, title) + formats = entries[0]['formats'] + video_info = self._search_json_ld(webpage, video_id) + description = video_info.get('description') + thumbnail = self._og_search_thumbnail(webpage) or video_info.get('thumbnail') + timestamp = video_info.get('timestamp') + title = title or video_info.get('description') self._sort_formats(formats) @@ -307,5 +149,6 @@ class TagesschauIE(InfoExtractor): 'title': title, 'thumbnail': thumbnail, 'formats': formats, + 'timestamp': timestamp, 'description': description, } diff --git a/yt_dlp/extractor/threespeak.py b/yt_dlp/extractor/threespeak.py new file mode 100644 index 000000000..60e84529d --- /dev/null +++ b/yt_dlp/extractor/threespeak.py @@ -0,0 +1,97 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + try_get, + unified_strdate, +) + + +class ThreeSpeakIE(InfoExtractor): + _VALID_URL = r'(?:https?://)(?:www\.)?3speak\.tv/watch\?v\=[^/]+/(?P<id>[^/$&#?]+)' + + _TESTS = [{ + 'url': 'https://3speak.tv/watch?v=dannyshine/wjgoxyfy', + 'info_dict': { + 'id': 'wjgoxyfy', + 'ext': 'mp4', + 'title': 'Can People who took the Vax think Critically', + 'uploader': 'dannyshine', + 'description': 'md5:181aa7ccb304afafa089b5af3bca7a10', + 'tags': ['sex', 'covid', 'antinatalism', 'comedy', 'vaccines'], + 'thumbnail': 
'https://img.3speakcontent.co/wjgoxyfy/thumbnails/default.png', + 'upload_date': '20211021', + 'duration': 2703.867833, + 'filesize': 1620054781, + }, + 'params': {'skip_download': True} + }] + + def _real_extract(self, url): + id = self._match_id(url) + webpage = self._download_webpage(url, id) + json_str = self._html_search_regex(r'JSON\.parse\(\'([^\']+)\'\)', webpage, 'json') + # The json string itself is escaped. Hence the double parsing + data_json = self._parse_json(self._parse_json(f'"{json_str}"', id), id) + video_json = self._parse_json(data_json['json_metadata'], id) + formats, subtitles = [], {} + og_m3u8 = self._html_search_regex(r'<meta\s?property=\"ogvideo\"\s?content=\"([^\"]+)\">', webpage, 'og m3u8', fatal=False) + if og_m3u8: + https_frmts, https_subs = self._extract_m3u8_formats_and_subtitles(og_m3u8, id, fatal=False, m3u8_id='https') + formats.extend(https_frmts) + subtitles = self._merge_subtitles(subtitles, https_subs) + ipfs_m3u8 = try_get(video_json, lambda x: x['video']['info']['ipfs']) + if ipfs_m3u8: + ipfs_frmts, ipfs_subs = self._extract_m3u8_formats_and_subtitles(f'https://ipfs.3speak.tv/ipfs/{ipfs_m3u8}', + id, fatal=False, m3u8_id='ipfs') + formats.extend(ipfs_frmts) + subtitles = self._merge_subtitles(subtitles, ipfs_subs) + mp4_file = try_get(video_json, lambda x: x['video']['info']['file']) + if mp4_file: + formats.append({ + 'url': f'https://threespeakvideo.b-cdn.net/{id}/{mp4_file}', + 'ext': 'mp4', + 'format_id': 'https-mp4', + 'duration': try_get(video_json, lambda x: x['video']['info']['duration']), + 'filesize': try_get(video_json, lambda x: x['video']['info']['filesize']), + 'quality': 11, + 'format_note': 'Original file', + }) + self._sort_formats(formats) + return { + 'id': id, + 'title': data_json.get('title') or data_json.get('root_title'), + 'uploader': data_json.get('author'), + 'description': try_get(video_json, lambda x: x['video']['content']['description']), + 'tags': try_get(video_json, lambda x: x['video']['content']['tags']), + 'thumbnail': try_get(video_json, lambda x: x['image'][0]), + 'upload_date': unified_strdate(data_json.get('created')), + 'formats': formats, + 'subtitles': subtitles, + } + + +class ThreeSpeakUserIE(InfoExtractor): + _VALID_URL = r'(?:https?://)(?:www\.)?3speak\.tv/user/(?P<id>[^/$&?#]+)' + + _TESTS = [{ + 'url': 'https://3speak.tv/user/theycallmedan', + 'info_dict': { + 'id': 'theycallmedan', + }, + 'playlist_mincount': 115, + }] + + def _real_extract(self, url): + id = self._match_id(url) + webpage = self._download_webpage(url, id) + entries = [ + self.url_result( + 'https://3speak.tv/watch?v=%s' % video, + ie=ThreeSpeakIE.ie_key()) + for video in re.findall(r'data-payout\s?\=\s?\"([^\"]+)\"', webpage) if video + ] + return self.playlist_result(entries, id) diff --git a/yt_dlp/extractor/trovo.py b/yt_dlp/extractor/trovo.py index ec55f41f2..a0f0cc31c 100644 --- a/yt_dlp/extractor/trovo.py +++ b/yt_dlp/extractor/trovo.py @@ -223,7 +223,7 @@ class TrovoChannelBaseIE(InfoExtractor): class TrovoChannelVodIE(TrovoChannelBaseIE): _VALID_URL = r'trovovod:(?P<id>[^\s]+)' - IE_DESC = 'All VODs of a trovo.live channel, "trovovod" keyword' + IE_DESC = 'All VODs of a trovo.live channel; "trovovod:" prefix' _TESTS = [{ 'url': 'trovovod:OneTappedYou', @@ -244,7 +244,7 @@ class TrovoChannelVodIE(TrovoChannelBaseIE): class TrovoChannelClipIE(TrovoChannelBaseIE): _VALID_URL = r'trovoclip:(?P<id>[^\s]+)' - IE_DESC = 'All Clips of a trovo.live channel, "trovoclip" keyword' + IE_DESC = 'All Clips of a trovo.live channel; 
"trovoclip:" prefix' _TESTS = [{ 'url': 'trovoclip:OneTappedYou', diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py index 485b781ca..0749263d9 100644 --- a/yt_dlp/extractor/twitter.py +++ b/yt_dlp/extractor/twitter.py @@ -485,7 +485,7 @@ class TwitterIE(TwitterBaseIE): fmts, subs = self._extract_variant_formats(variant, twid) subtitles = self._merge_subtitles(subtitles, subs) formats.extend(fmts) - self._sort_formats(formats) + self._sort_formats(formats, ('res', 'br', 'size', 'proto')) # The codec of http formats are unknown thumbnails = [] media_url = media.get('media_url_https') or media.get('media_url') diff --git a/yt_dlp/extractor/viewlift.py b/yt_dlp/extractor/viewlift.py index c3b2e863d..5b558d890 100644 --- a/yt_dlp/extractor/viewlift.py +++ b/yt_dlp/extractor/viewlift.py @@ -9,6 +9,7 @@ from ..utils import ( ExtractorError, int_or_none, parse_age_limit, + traverse_obj, ) @@ -32,26 +33,36 @@ class ViewLiftBaseIE(InfoExtractor): } _TOKENS = {} - def _call_api(self, site, path, video_id, query): - token = self._TOKENS.get(site) - if not token: - token_query = {'site': site} - email, password = self._get_login_info(netrc_machine=site) - if email: - resp = self._download_json( - self._API_BASE + 'identity/signin', video_id, - 'Logging in', query=token_query, data=json.dumps({ - 'email': email, - 'password': password, - }).encode()) - else: - resp = self._download_json( - self._API_BASE + 'identity/anonymous-token', video_id, - 'Downloading authorization token', query=token_query) - self._TOKENS[site] = token = resp['authorizationToken'] - return self._download_json( - self._API_BASE + path, video_id, - headers={'Authorization': token}, query=query) + def _fetch_token(self, site, url): + if self._TOKENS.get(site): + return + email, password = self._get_login_info(netrc_machine=site) + if email: + self.report_warning('Logging in using username and password is broken. 
%s' % self._LOGIN_HINTS['cookies']) + + cookies = self._get_cookies(url) + if cookies and cookies.get('token'): + self._TOKENS[site] = self._search_regex(r'22authorizationToken\%22:\%22([^\%]+)\%22', cookies['token'].value, 'token') + if not self._TOKENS.get(site): + self.raise_login_required('Cookies (not necessarily logged in) are needed to download from this website', method='cookies') + + def _call_api(self, site, path, video_id, url, query): + self._fetch_token(site, url) + try: + return self._download_json( + self._API_BASE + path, video_id, headers={'Authorization': self._TOKENS.get(site)}, query=query) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + webpage = e.cause.read().decode() + try: + error_message = traverse_obj(json.loads(webpage), 'errorMessage', 'message') + except json.JSONDecodeError: + raise ExtractorError(f'{site} said: {webpage}', cause=e.cause) + if error_message: + if 'has not purchased' in error_message: + self.raise_login_required(method='cookies') + raise ExtractorError(error_message, expected=True) + raise class ViewLiftEmbedIE(ViewLiftBaseIE): @@ -96,27 +107,24 @@ class ViewLiftEmbedIE(ViewLiftBaseIE): site = domain.split('.')[-2] if site in self._SITE_MAP: site = self._SITE_MAP[site] - try: - content_data = self._call_api( - site, 'entitlement/video/status', film_id, { - 'id': film_id - })['video'] - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - error_message = self._parse_json(e.cause.read().decode(), film_id).get('errorMessage') - if error_message == 'User does not have a valid subscription or has not purchased this content.': - self.raise_login_required() - raise ExtractorError(error_message, expected=True) - raise + + content_data = self._call_api( + site, 'entitlement/video/status', film_id, url, { + 'id': film_id + })['video'] gist = content_data['gist'] title = gist['title'] video_assets = content_data['streamingInfo']['videoAssets'] - formats = [] - mpeg_video_assets = video_assets.get('mpeg') or [] - for video_asset in mpeg_video_assets: + hls_url = video_assets.get('hls') + formats, subtitles = [], {} + if hls_url: + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + hls_url, film_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) + + for video_asset in video_assets.get('mpeg') or []: video_asset_url = video_asset.get('url') - if not video_asset: + if not video_asset_url: continue bitrate = int_or_none(video_asset.get('bitrate')) height = int_or_none(self._search_regex( @@ -130,13 +138,17 @@ class ViewLiftEmbedIE(ViewLiftBaseIE): 'vcodec': video_asset.get('codec'), }) - hls_url = video_assets.get('hls') - if hls_url: - formats.extend(self._extract_m3u8_formats( - hls_url, film_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - self._sort_formats(formats) + subs = {} + for sub in traverse_obj(content_data, ('contentDetails', 'closedCaptions')) or []: + sub_url = sub.get('url') + if not sub_url: + continue + subs.setdefault(sub.get('language', 'English'), []).append({ + 'url': sub_url, + }) - info = { + self._sort_formats(formats) + return { 'id': film_id, 'title': title, 'description': gist.get('description'), @@ -145,14 +157,15 @@ class ViewLiftEmbedIE(ViewLiftBaseIE): 'age_limit': parse_age_limit(content_data.get('parentalRating')), 'timestamp': int_or_none(gist.get('publishDate'), 1000), 'formats': formats, + 'subtitles': self._merge_subtitles(subs, subtitles), + 'categories': traverse_obj(content_data, ('categories', 
..., 'title')), + 'tags': traverse_obj(content_data, ('tags', ..., 'title')), } - for k in ('categories', 'tags'): - info[k] = [v['title'] for v in content_data.get(k, []) if v.get('title')] - return info class ViewLiftIE(ViewLiftBaseIE): IE_NAME = 'viewlift' + _API_BASE = 'https://prod-api-cached-2.viewlift.com/' _VALID_URL = r'https?://(?:www\.)?(?P<domain>%s)(?P<path>(?:/(?:films/title|show|(?:news/)?videos?|watch))?/(?P<id>[^?#]+))' % ViewLiftBaseIE._DOMAINS_REGEX _TESTS = [{ 'url': 'http://www.snagfilms.com/films/title/lost_for_life', @@ -222,24 +235,111 @@ class ViewLiftIE(ViewLiftBaseIE): }, { 'url': 'https://www.marquee.tv/watch/sadlerswells-sacredmonsters', 'only_matching': True, + }, { # Free film with langauge code + 'url': 'https://www.hoichoi.tv/bn/films/title/shuyopoka', + 'info_dict': { + 'id': '7a7a9d33-1f4c-4771-9173-ee4fb6dbf196', + 'ext': 'mp4', + 'title': 'Shuyopoka', + 'description': 'md5:e28f2fb8680096a69c944d37c1fa5ffc', + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20211006', + 'series': None + }, + 'params': {'skip_download': True}, + }, { # Free film + 'url': 'https://www.hoichoi.tv/films/title/dadu-no1', + 'info_dict': { + 'id': '0000015b-b009-d126-a1db-b81ff3780000', + 'ext': 'mp4', + 'title': 'Dadu No.1', + 'description': 'md5:605cba408e51a79dafcb824bdeded51e', + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20210827', + 'series': None + }, + 'params': {'skip_download': True}, + }, { # Free episode + 'url': 'https://www.hoichoi.tv/webseries/case-jaundice-s01-e01', + 'info_dict': { + 'id': 'f779e07c-30c8-459c-8612-5a834ab5e5ba', + 'ext': 'mp4', + 'title': 'Humans Vs. Corona', + 'description': 'md5:ca30a682b4528d02a3eb6d0427dd0f87', + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20210830', + 'series': 'Case Jaundice' + }, + 'params': {'skip_download': True}, + }, { # Free video + 'url': 'https://www.hoichoi.tv/videos/1549072415320-six-episode-02-hindi', + 'info_dict': { + 'id': 'b41fa1ce-aca6-47b6-b208-283ff0a2de30', + 'ext': 'mp4', + 'title': 'Woman in red - Hindi', + 'description': 'md5:9d21edc1827d32f8633eb67c2054fc31', + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20211006', + 'series': 'Six (Hindi)' + }, + 'params': {'skip_download': True}, + }, { # Free episode + 'url': 'https://www.hoichoi.tv/shows/watch-asian-paints-moner-thikana-online-season-1-episode-1', + 'info_dict': { + 'id': '1f45d185-8500-455c-b88d-13252307c3eb', + 'ext': 'mp4', + 'title': 'Jisshu Sengupta', + 'description': 'md5:ef6ffae01a3d83438597367400f824ed', + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20211004', + 'series': 'Asian Paints Moner Thikana' + }, + 'params': {'skip_download': True}, + }, { # Free series + 'url': 'https://www.hoichoi.tv/shows/watch-moner-thikana-bengali-web-series-online', + 'playlist_mincount': 5, + 'info_dict': { + 'id': 'watch-moner-thikana-bengali-web-series-online', + }, + }, { # Premium series + 'url': 'https://www.hoichoi.tv/shows/watch-byomkesh-bengali-web-series-online', + 'playlist_mincount': 14, + 'info_dict': { + 'id': 'watch-byomkesh-bengali-web-series-online', + }, + }, { # Premium movie + 'url': 'https://www.hoichoi.tv/movies/detective-2020', + 'only_matching': True }] @classmethod def suitable(cls, url): return False if ViewLiftEmbedIE.suitable(url) else super(ViewLiftIE, cls).suitable(url) + def _show_entries(self, domain, seasons): + for season in seasons: + for episode in season.get('episodes') or []: + path = traverse_obj(episode, ('gist', 'permalink')) + if path: + yield 
self.url_result(f'https://www.{domain}{path}', ie=self.ie_key()) + def _real_extract(self, url): domain, path, display_id = self._match_valid_url(url).groups() site = domain.split('.')[-2] if site in self._SITE_MAP: site = self._SITE_MAP[site] modules = self._call_api( - site, 'content/pages', display_id, { + site, 'content/pages', display_id, url, { 'includeContent': 'true', 'moduleOffset': 1, 'path': path, 'site': site, })['modules'] + + seasons = next((m['contentData'][0]['seasons'] for m in modules if m.get('moduleType') == 'ShowDetailModule'), None) + if seasons: + return self.playlist_result(self._show_entries(domain, seasons), display_id) + film_id = next(m['contentData'][0]['gist']['id'] for m in modules if m.get('moduleType') == 'VideoDetailModule') return { '_type': 'url_transparent', diff --git a/yt_dlp/extractor/vimeo.py b/yt_dlp/extractor/vimeo.py index 8b367a4e6..04c504934 100644 --- a/yt_dlp/extractor/vimeo.py +++ b/yt_dlp/extractor/vimeo.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals import base64 import functools -import json import re import itertools @@ -17,8 +16,8 @@ from ..compat import ( from ..utils import ( clean_html, determine_ext, - dict_get, ExtractorError, + get_element_by_class, js_to_json, int_or_none, merge_dicts, @@ -26,7 +25,6 @@ from ..utils import ( parse_filesize, parse_iso8601, parse_qs, - RegexNotFoundError, sanitized_Request, smuggle_url, std_headers, @@ -129,10 +127,11 @@ class VimeoBaseInfoExtractor(InfoExtractor): video_title = video_data['title'] live_event = video_data.get('live_event') or {} is_live = live_event.get('status') == 'started' + request = config.get('request') or {} formats = [] - config_files = video_data.get('files') or config['request'].get('files', {}) - for f in config_files.get('progressive', []): + config_files = video_data.get('files') or request.get('files') or {} + for f in (config_files.get('progressive') or []): video_url = f.get('url') if not video_url: continue @@ -148,7 +147,7 @@ class VimeoBaseInfoExtractor(InfoExtractor): # TODO: fix handling of 308 status code returned for live archive manifest requests sep_pattern = r'/sep/video/' for files_type in ('hls', 'dash'): - for cdn_name, cdn_data in config_files.get(files_type, {}).get('cdns', {}).items(): + for cdn_name, cdn_data in (try_get(config_files, lambda x: x[files_type]['cdns']) or {}).items(): manifest_url = cdn_data.get('url') if not manifest_url: continue @@ -188,17 +187,15 @@ class VimeoBaseInfoExtractor(InfoExtractor): }) subtitles = {} - text_tracks = config['request'].get('text_tracks') - if text_tracks: - for tt in text_tracks: - subtitles[tt['lang']] = [{ - 'ext': 'vtt', - 'url': urljoin('https://vimeo.com', tt['url']), - }] + for tt in (request.get('text_tracks') or []): + subtitles[tt['lang']] = [{ + 'ext': 'vtt', + 'url': urljoin('https://vimeo.com', tt['url']), + }] thumbnails = [] if not is_live: - for key, thumb in video_data.get('thumbs', {}).items(): + for key, thumb in (video_data.get('thumbs') or {}).items(): thumbnails.append({ 'id': key, 'width': int_or_none(key), @@ -342,6 +339,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'duration': 1595, 'upload_date': '20130610', 'timestamp': 1370893156, + 'license': 'by', }, 'params': { 'format': 'best[protocol=https]', @@ -420,6 +418,12 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader_id': 'staff', 'uploader': 'Vimeo Staff', 'duration': 62, + 'subtitles': { + 'de': [{'ext': 'vtt'}], + 'en': [{'ext': 'vtt'}], + 'es': [{'ext': 'vtt'}], + 'fr': [{'ext': 'vtt'}], + }, } }, { @@ -626,6 
+630,37 @@ class VimeoIE(VimeoBaseInfoExtractor): def _real_initialize(self): self._login() + def _extract_from_api(self, video_id, unlisted_hash=None): + token = self._download_json( + 'https://vimeo.com/_rv/jwt', video_id, headers={ + 'X-Requested-With': 'XMLHttpRequest' + })['token'] + api_url = 'https://api.vimeo.com/videos/' + video_id + if unlisted_hash: + api_url += ':' + unlisted_hash + video = self._download_json( + api_url, video_id, headers={ + 'Authorization': 'jwt ' + token, + }, query={ + 'fields': 'config_url,created_time,description,license,metadata.connections.comments.total,metadata.connections.likes.total,release_time,stats.plays', + }) + info = self._parse_config(self._download_json( + video['config_url'], video_id), video_id) + self._vimeo_sort_formats(info['formats']) + get_timestamp = lambda x: parse_iso8601(video.get(x + '_time')) + info.update({ + 'description': video.get('description'), + 'license': video.get('license'), + 'release_timestamp': get_timestamp('release'), + 'timestamp': get_timestamp('created'), + 'view_count': int_or_none(try_get(video, lambda x: x['stats']['plays'])), + }) + connections = try_get( + video, lambda x: x['metadata']['connections'], dict) or {} + for k in ('comment', 'like'): + info[k + '_count'] = int_or_none(try_get(connections, lambda x: x[k + 's']['total'])) + return info + def _try_album_password(self, url): album_id = self._search_regex( r'vimeo\.com/(?:album|showcase)/([^/]+)', url, 'album id', default=None) @@ -675,45 +710,16 @@ class VimeoIE(VimeoBaseInfoExtractor): # Extract ID from URL video_id, unlisted_hash = self._match_valid_url(url).groups() if unlisted_hash: - token = self._download_json( - 'https://vimeo.com/_rv/jwt', video_id, headers={ - 'X-Requested-With': 'XMLHttpRequest' - })['token'] - video = self._download_json( - 'https://api.vimeo.com/videos/%s:%s' % (video_id, unlisted_hash), - video_id, headers={ - 'Authorization': 'jwt ' + token, - }, query={ - 'fields': 'config_url,created_time,description,license,metadata.connections.comments.total,metadata.connections.likes.total,release_time,stats.plays', - }) - info = self._parse_config(self._download_json( - video['config_url'], video_id), video_id) - self._vimeo_sort_formats(info['formats']) - get_timestamp = lambda x: parse_iso8601(video.get(x + '_time')) - info.update({ - 'description': video.get('description'), - 'license': video.get('license'), - 'release_timestamp': get_timestamp('release'), - 'timestamp': get_timestamp('created'), - 'view_count': int_or_none(try_get(video, lambda x: x['stats']['plays'])), - }) - connections = try_get( - video, lambda x: x['metadata']['connections'], dict) or {} - for k in ('comment', 'like'): - info[k + '_count'] = int_or_none(try_get(connections, lambda x: x[k + 's']['total'])) - return info + return self._extract_from_api(video_id, unlisted_hash) orig_url = url is_pro = 'vimeopro.com/' in url - is_player = '://player.vimeo.com/video/' in url if is_pro: # some videos require portfolio_id to be present in player url # https://github.com/ytdl-org/youtube-dl/issues/20070 url = self._extract_url(url, self._download_webpage(url, video_id)) if not url: url = 'https://vimeo.com/' + video_id - elif is_player: - url = 'https://player.vimeo.com/video/' + video_id elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')): url = 'https://vimeo.com/' + video_id @@ -734,14 +740,25 @@ class VimeoIE(VimeoBaseInfoExtractor): expected=True) raise - # Now we begin extracting as much information as we can from what we - # retrieved. 
First we extract the information common to all extractors, - # and latter we extract those that are Vimeo specific. - self.report_extraction(video_id) + if '://player.vimeo.com/video/' in url: + config = self._parse_json(self._search_regex( + r'\bconfig\s*=\s*({.+?})\s*;', webpage, 'info section'), video_id) + if config.get('view') == 4: + config = self._verify_player_video_password( + redirect_url, video_id, headers) + info = self._parse_config(config, video_id) + self._vimeo_sort_formats(info['formats']) + return info + + if re.search(r'<form[^>]+?id="pw_form"', webpage): + video_password = self._get_video_password() + token, vuid = self._extract_xsrft_and_vuid(webpage) + webpage = self._verify_video_password( + redirect_url, video_id, video_password, token, vuid) vimeo_config = self._extract_vimeo_config(webpage, video_id, default=None) if vimeo_config: - seed_status = vimeo_config.get('seed_status', {}) + seed_status = vimeo_config.get('seed_status') or {} if seed_status.get('state') == 'failed': raise ExtractorError( '%s said: %s' % (self.IE_NAME, seed_status['title']), @@ -750,70 +767,40 @@ class VimeoIE(VimeoBaseInfoExtractor): cc_license = None timestamp = None video_description = None + info_dict = {} - # Extract the config JSON - try: - try: - config_url = self._html_search_regex( - r' data-config-url="(.+?)"', webpage, - 'config URL', default=None) - if not config_url: - # Sometimes new react-based page is served instead of old one that require - # different config URL extraction approach (see - # https://github.com/ytdl-org/youtube-dl/pull/7209) - page_config = self._parse_json(self._search_regex( - r'vimeo\.(?:clip|vod_title)_page_config\s*=\s*({.+?});', - webpage, 'page config'), video_id) - config_url = page_config['player']['config_url'] - cc_license = page_config.get('cc_license') - timestamp = try_get( - page_config, lambda x: x['clip']['uploaded_on'], - compat_str) - video_description = clean_html(dict_get( - page_config, ('description', 'description_html_escaped'))) - config = self._download_json(config_url, video_id) - except RegexNotFoundError: - # For pro videos or player.vimeo.com urls - # We try to find out to which variable is assigned the config dic - m_variable_name = re.search(r'(\w)\.video\.id', webpage) - if m_variable_name is not None: - config_re = [r'%s=({[^}].+?});' % re.escape(m_variable_name.group(1))] - else: - config_re = [r' = {config:({.+?}),assets:', r'(?:[abc])=({.+?});'] - config_re.append(r'\bvar\s+r\s*=\s*({.+?})\s*;') - config_re.append(r'\bconfig\s*=\s*({.+?})\s*;') - config = self._search_regex(config_re, webpage, 'info section', - flags=re.DOTALL) - config = json.loads(config) - except Exception as e: - if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage): - raise ExtractorError('The author has restricted the access to this video, try with the "--referer" option') - - if re.search(r'<form[^>]+?id="pw_form"', webpage) is not None: - if '_video_password_verified' in data: - raise ExtractorError('video password verification failed!') - video_password = self._get_video_password() - token, vuid = self._extract_xsrft_and_vuid(webpage) - self._verify_video_password( - redirect_url, video_id, video_password, token, vuid) - return self._real_extract( - smuggle_url(redirect_url, {'_video_password_verified': 'verified'})) - else: - raise ExtractorError('Unable to extract info section', - cause=e) + channel_id = self._search_regex( + r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None) 
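
The unlisted-hash branch above now funnels through the new _extract_from_api helper: first fetch a short-lived JWT from vimeo.com/_rv/jwt (the X-Requested-With header is required), then query api.vimeo.com with an 'Authorization: jwt <token>' header. A minimal stdlib-only sketch of that two-step pattern, assuming the endpoints respond as the extractor expects (a real session may additionally need cookies):

    import json
    from urllib.request import Request, urlopen

    def fetch_vimeo_video(video_id):
        # Step 1: the JWT endpoint refuses requests without this header.
        req = Request('https://vimeo.com/_rv/jwt',
                      headers={'X-Requested-With': 'XMLHttpRequest'})
        token = json.load(urlopen(req))['token']
        # Step 2: the API uses a 'jwt' authorization scheme, not 'Bearer'.
        api = Request('https://api.vimeo.com/videos/%s?fields=config_url,license' % video_id,
                      headers={'Authorization': 'jwt ' + token})
        return json.load(urlopen(api))
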
+ if channel_id: + config_url = self._html_search_regex( + r'\bdata-config-url="([^"]+)"', webpage, 'config URL') + video_description = clean_html(get_element_by_class('description', webpage)) + info_dict.update({ + 'channel_id': channel_id, + 'channel_url': 'https://vimeo.com/channels/' + channel_id, + }) else: - if config.get('view') == 4: - config = self._verify_player_video_password(redirect_url, video_id, headers) - + page_config = self._parse_json(self._search_regex( + r'vimeo\.(?:clip|vod_title)_page_config\s*=\s*({.+?});', + webpage, 'page config', default='{}'), video_id, fatal=False) + if not page_config: + return self._extract_from_api(video_id) + config_url = page_config['player']['config_url'] + cc_license = page_config.get('cc_license') + clip = page_config.get('clip') or {} + timestamp = clip.get('uploaded_on') + video_description = clean_html( + clip.get('description') or page_config.get('description_html_escaped')) + config = self._download_json(config_url, video_id) video = config.get('video') or {} vod = video.get('vod') or {} def is_rented(): if '>You rented this title.<' in webpage: return True - if config.get('user', {}).get('purchased'): + if try_get(config, lambda x: x['user']['purchased']): return True - for purchase_option in vod.get('purchase_options', []): + for purchase_option in (vod.get('purchase_options') or []): if purchase_option.get('purchased'): return True label = purchase_option.get('label_string') @@ -828,14 +815,14 @@ class VimeoIE(VimeoBaseInfoExtractor): 'https://player.vimeo.com/player/%s' % feature_id, {'force_feature_id': True}), 'Vimeo') - # Extract video description if not video_description: video_description = self._html_search_regex( r'(?s)<div\s+class="[^"]*description[^"]*"[^>]*>(.*?)</div>', webpage, 'description', default=None) if not video_description: video_description = self._html_search_meta( - 'description', webpage, default=None) + ['description', 'og:description', 'twitter:description'], + webpage, default=None) if not video_description and is_pro: orig_webpage = self._download_webpage( orig_url, video_id, @@ -844,24 +831,17 @@ class VimeoIE(VimeoBaseInfoExtractor): if orig_webpage: video_description = self._html_search_meta( 'description', orig_webpage, default=None) - if not video_description and not is_player: + if not video_description: self.report_warning('Cannot find video description') - # Extract upload date if not timestamp: timestamp = self._search_regex( r'<time[^>]+datetime="([^"]+)"', webpage, 'timestamp', default=None) - try: - view_count = int(self._search_regex(r'UserPlays:(\d+)', webpage, 'view count')) - like_count = int(self._search_regex(r'UserLikes:(\d+)', webpage, 'like count')) - comment_count = int(self._search_regex(r'UserComments:(\d+)', webpage, 'comment count')) - except RegexNotFoundError: - # This info is only available in vimeo.com/{id} urls - view_count = None - like_count = None - comment_count = None + view_count = int_or_none(self._search_regex(r'UserPlays:(\d+)', webpage, 'view count', default=None)) + like_count = int_or_none(self._search_regex(r'UserLikes:(\d+)', webpage, 'like count', default=None)) + comment_count = int_or_none(self._search_regex(r'UserComments:(\d+)', webpage, 'comment count', default=None)) formats = [] @@ -881,11 +861,7 @@ class VimeoIE(VimeoBaseInfoExtractor): r'<link[^>]+rel=["\']license["\'][^>]+href=(["\'])(?P<license>(?:(?!\1).)+)\1', webpage, 'license', default=None, group='license') - channel_id = self._search_regex( - r'vimeo\.com/channels/([^/]+)', url, 'channel 
id', default=None) - channel_url = 'https://vimeo.com/channels/%s' % channel_id if channel_id else None - - info_dict = { + info_dict.update({ 'formats': formats, 'timestamp': unified_timestamp(timestamp), 'description': video_description, @@ -894,18 +870,14 @@ class VimeoIE(VimeoBaseInfoExtractor): 'like_count': like_count, 'comment_count': comment_count, 'license': cc_license, - 'channel_id': channel_id, - 'channel_url': channel_url, - } - - info_dict = merge_dicts(info_dict, info_dict_config, json_ld) + }) - return info_dict + return merge_dicts(info_dict, info_dict_config, json_ld) class VimeoOndemandIE(VimeoIE): IE_NAME = 'vimeo:ondemand' - _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/([^/]+/)?(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/(?:[^/]+/)?(?P<id>[^/?#&]+)' _TESTS = [{ # ondemand video not available via https://vimeo.com/id 'url': 'https://vimeo.com/ondemand/20704', diff --git a/yt_dlp/extractor/vlive.py b/yt_dlp/extractor/vlive.py index 84f51a544..4340b1d4c 100644 --- a/yt_dlp/extractor/vlive.py +++ b/yt_dlp/extractor/vlive.py @@ -17,17 +17,65 @@ from ..utils import ( strip_or_none, try_get, urlencode_postdata, + url_or_none, ) class VLiveBaseIE(NaverBaseIE): - _APP_ID = '8c6cc7b45d2568fb668be6e05b6e5a3b' + _NETRC_MACHINE = 'vlive' + _logged_in = False + + def _real_initialize(self): + if not self._logged_in: + VLiveBaseIE._logged_in = self._login() + + def _login(self): + email, password = self._get_login_info() + if email is None: + return False + + LOGIN_URL = 'https://www.vlive.tv/auth/email/login' + self._request_webpage( + LOGIN_URL, None, note='Downloading login cookies') + + self._download_webpage( + LOGIN_URL, None, note='Logging in', + data=urlencode_postdata({'email': email, 'pwd': password}), + headers={ + 'Referer': LOGIN_URL, + 'Content-Type': 'application/x-www-form-urlencoded' + }) + + login_info = self._download_json( + 'https://www.vlive.tv/auth/loginInfo', None, + note='Checking login status', + headers={'Referer': 'https://www.vlive.tv/home'}) + + if not try_get(login_info, lambda x: x['message']['login'], bool): + raise ExtractorError('Unable to log in', expected=True) + return True + + def _call_api(self, path_template, video_id, fields=None, query_add={}, note=None): + if note is None: + note = 'Downloading %s JSON metadata' % path_template.split('/')[-1].split('-')[0] + query = {'appId': '8c6cc7b45d2568fb668be6e05b6e5a3b', 'gcc': 'KR', 'platformType': 'PC'} + if fields: + query['fields'] = fields + if query_add: + query.update(query_add) + try: + return self._download_json( + 'https://www.vlive.tv/globalv-web/vam-web/' + path_template % video_id, video_id, + note, headers={'Referer': 'https://www.vlive.tv/'}, query=query) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + self.raise_login_required(json.loads(e.cause.read().decode('utf-8'))['message']) + raise class VLiveIE(VLiveBaseIE): IE_NAME = 'vlive' _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/(?:video|embed)/(?P<id>[0-9]+)' - _NETRC_MACHINE = 'vlive' _TESTS = [{ 'url': 'http://www.vlive.tv/video/1326', 'md5': 'cc7314812855ce56de70a06a27314983', @@ -38,6 +86,12 @@ class VLiveIE(VLiveBaseIE): 'creator': "Girl's Day", 'view_count': int, 'uploader_id': 'muploader_a', + 'upload_date': '20150817', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', + 'timestamp': 1439816449, + }, + 'params': { + 'skip_download': True, }, }, { 'url': 'http://www.vlive.tv/video/16937', @@ -49,6 +103,9 @@ class VLiveIE(VLiveBaseIE): 
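
The shared VLiveBaseIE._call_api above folds the app id, 'gcc' and 'platformType' defaults into every request and turns a 403 into a login prompt built from the server's own message. A rough stdlib equivalent of that wrapper (call_vlive_api is a hypothetical name; the endpoint behaviour is assumed from the diff):

    import json
    from urllib.error import HTTPError
    from urllib.parse import urlencode
    from urllib.request import Request, urlopen

    APP_ID = '8c6cc7b45d2568fb668be6e05b6e5a3b'  # appId value from the diff above

    def call_vlive_api(path, fields=None, **extra):
        query = {'appId': APP_ID, 'gcc': 'KR', 'platformType': 'PC', **extra}
        if fields:
            query['fields'] = fields
        url = 'https://www.vlive.tv/globalv-web/vam-web/%s?%s' % (path, urlencode(query))
        try:
            return json.load(urlopen(Request(url, headers={'Referer': 'https://www.vlive.tv/'})))
        except HTTPError as e:
            if e.code == 403:  # gated content: the body is JSON with a 'message' field
                raise PermissionError(json.load(e)['message'])
            raise
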
'view_count': int, 'subtitles': 'mincount:12', 'uploader_id': 'muploader_j', + 'upload_date': '20161112', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', + 'timestamp': 1478923074, }, 'params': { 'skip_download': True, @@ -81,53 +138,6 @@ class VLiveIE(VLiveBaseIE): 'playlist_mincount': 120 }] - def _real_initialize(self): - self._login() - - def _login(self): - email, password = self._get_login_info() - if None in (email, password): - return - - def is_logged_in(): - login_info = self._download_json( - 'https://www.vlive.tv/auth/loginInfo', None, - note='Downloading login info', - headers={'Referer': 'https://www.vlive.tv/home'}) - return try_get( - login_info, lambda x: x['message']['login'], bool) or False - - LOGIN_URL = 'https://www.vlive.tv/auth/email/login' - self._request_webpage( - LOGIN_URL, None, note='Downloading login cookies') - - self._download_webpage( - LOGIN_URL, None, note='Logging in', - data=urlencode_postdata({'email': email, 'pwd': password}), - headers={ - 'Referer': LOGIN_URL, - 'Content-Type': 'application/x-www-form-urlencoded' - }) - - if not is_logged_in(): - raise ExtractorError('Unable to log in', expected=True) - - def _call_api(self, path_template, video_id, fields=None, limit=None): - query = {'appId': self._APP_ID, 'gcc': 'KR', 'platformType': 'PC'} - if fields: - query['fields'] = fields - if limit: - query['limit'] = limit - try: - return self._download_json( - 'https://www.vlive.tv/globalv-web/vam-web/' + path_template % video_id, video_id, - 'Downloading %s JSON metadata' % path_template.split('/')[-1].split('-')[0], - headers={'Referer': 'https://www.vlive.tv/'}, query=query) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - self.raise_login_required(json.loads(e.cause.read().decode('utf-8'))['message']) - raise - def _real_extract(self, url): video_id = self._match_id(url) @@ -150,7 +160,7 @@ class VLiveIE(VLiveBaseIE): playlist_count = str_or_none(playlist.get('totalCount')) playlist = self._call_api( - 'playlist/v1.0/playlist-%s/posts', playlist_id, 'data', limit=playlist_count) + 'playlist/v1.0/playlist-%s/posts', playlist_id, 'data', {'limit': playlist_count}) entries = [] for video_data in playlist['data']: @@ -172,6 +182,8 @@ class VLiveIE(VLiveBaseIE): 'view_count': int_or_none(video.get('playCount')), 'like_count': int_or_none(video.get('likeCount')), 'comment_count': int_or_none(video.get('commentCount')), + 'timestamp': int_or_none(video.get('createdAt'), scale=1000), + 'thumbnail': video.get('thumb'), } video_type = video.get('type') @@ -216,7 +228,7 @@ class VLiveIE(VLiveBaseIE): raise ExtractorError('Unknown status ' + status) -class VLivePostIE(VLiveIE): +class VLivePostIE(VLiveBaseIE): IE_NAME = 'vlive:post' _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/post/(?P<id>\d-\d+)' _TESTS = [{ @@ -238,8 +250,6 @@ class VLivePostIE(VLiveIE): 'playlist_count': 1, }] _FVIDEO_TMPL = 'fvideo/v1.0/fvideo-%%s/%s' - _SOS_TMPL = _FVIDEO_TMPL % 'sosPlayInfo' - _INKEY_TMPL = _FVIDEO_TMPL % 'inKey' def _real_extract(self, url): post_id = self._match_id(url) @@ -266,7 +276,7 @@ class VLivePostIE(VLiveIE): entry = None if upload_type == 'SOS': download = self._call_api( - self._SOS_TMPL, video_id)['videoUrl']['download'] + self._FVIDEO_TMPL % 'sosPlayInfo', video_id)['videoUrl']['download'] formats = [] for f_id, f_url in download.items(): formats.append({ @@ -284,7 +294,7 @@ class VLivePostIE(VLiveIE): vod_id = upload_info.get('videoId') if not vod_id: continue - inkey = 
self._call_api(self._INKEY_TMPL, video_id)['inKey'] + inkey = self._call_api(self._FVIDEO_TMPL % 'inKey', video_id)['inKey'] entry = self._extract_video_info(video_id, vod_id, inkey) if entry: entry['title'] = '%s_part%s' % (title, idx) @@ -295,7 +305,7 @@ class VLivePostIE(VLiveIE): class VLiveChannelIE(VLiveBaseIE): IE_NAME = 'vlive:channel' - _VALID_URL = r'https?://(?:channels\.vlive\.tv|(?:(?:www|m)\.)?vlive\.tv/channel)/(?P<id>[0-9A-Z]+)' + _VALID_URL = r'https?://(?:channels\.vlive\.tv|(?:(?:www|m)\.)?vlive\.tv/channel)/(?P<channel_id>[0-9A-Z]+)(?:/board/(?P<posts_id>\d+))?' _TESTS = [{ 'url': 'http://channels.vlive.tv/FCD4B', 'info_dict': { @@ -306,78 +316,58 @@ class VLiveChannelIE(VLiveBaseIE): }, { 'url': 'https://www.vlive.tv/channel/FCD4B', 'only_matching': True, + }, { + 'url': 'https://www.vlive.tv/channel/FCD4B/board/3546', + 'info_dict': { + 'id': 'FCD4B-3546', + 'title': 'MAMAMOO - Star Board', + }, + 'playlist_mincount': 880 }] - def _call_api(self, path, channel_key_suffix, channel_value, note, query): - q = { - 'app_id': self._APP_ID, - 'channel' + channel_key_suffix: channel_value, - } - q.update(query) - return self._download_json( - 'http://api.vfan.vlive.tv/vproxy/channelplus/' + path, - channel_value, note='Downloading ' + note, query=q)['result'] - - def _real_extract(self, url): - channel_code = self._match_id(url) - - channel_seq = self._call_api( - 'decodeChannelCode', 'Code', channel_code, - 'decode channel code', {})['channelSeq'] - - channel_name = None - entries = [] + def _entries(self, posts_id, board_name): + if board_name: + posts_path = 'post/v1.0/board-%s/posts' + query_add = {'limit': 100, 'sortType': 'LATEST'} + else: + posts_path = 'post/v1.0/channel-%s/starPosts' + query_add = {'limit': 100} for page_num in itertools.count(1): video_list = self._call_api( - 'getChannelVideoList', 'Seq', channel_seq, - 'channel list page #%d' % page_num, { - # Large values of maxNumOfRows (~300 or above) may cause - # empty responses (see [1]), e.g. this happens for [2] that - # has more than 300 videos. - # 1. https://github.com/ytdl-org/youtube-dl/issues/13830 - # 2. http://channels.vlive.tv/EDBF. 
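
The collapsed _FVIDEO_TMPL usage above relies on two-stage %-formatting: '%%' survives the first interpolation as a literal '%', so the endpoint name is filled in first and _call_api fills in the video id later. A short trace of the values involved:

    _FVIDEO_TMPL = 'fvideo/v1.0/fvideo-%%s/%s'

    path_template = _FVIDEO_TMPL % 'inKey'  # 'fvideo/v1.0/fvideo-%s/inKey'
    path = path_template % 12345            # 'fvideo/v1.0/fvideo-12345/inKey'
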
- 'maxNumOfRows': 100, - 'pageNo': page_num - } - ) - - if not channel_name: - channel_name = try_get( - video_list, - lambda x: x['channelInfo']['channelName'], - compat_str) + posts_path, posts_id, 'channel{channelName},contentType,postId,title,url', query_add, + note=f'Downloading playlist page {page_num}') + + for video in try_get(video_list, lambda x: x['data'], list) or []: + video_id = str(video.get('postId')) + video_title = str_or_none(video.get('title')) + video_url = url_or_none(video.get('url')) + if not all((video_id, video_title, video_url)) or video.get('contentType') != 'VIDEO': + continue + channel_name = try_get(video, lambda x: x['channel']['channelName'], compat_str) + yield self.url_result(video_url, VLivePostIE.ie_key(), video_id, video_title, channel=channel_name) - videos = try_get( - video_list, lambda x: x['videoList'], list) - if not videos: + after = try_get(video_list, lambda x: x['paging']['nextParams']['after'], compat_str) + if not after: break + query_add['after'] = after - for video in videos: - video_id = video.get('videoSeq') - video_type = video.get('videoType') + def _real_extract(self, url): + channel_id, posts_id = self._match_valid_url(url).groups() - if not video_id or not video_type: - continue - video_id = compat_str(video_id) - - if video_type in ('PLAYLIST'): - first_video_id = try_get( - video, - lambda x: x['videoPlaylist']['videoList'][0]['videoSeq'], int) - - if not first_video_id: - continue - - entries.append( - self.url_result( - 'http://www.vlive.tv/video/%s' % first_video_id, - ie=VLiveIE.ie_key(), video_id=first_video_id)) - else: - entries.append( - self.url_result( - 'http://www.vlive.tv/video/%s' % video_id, - ie=VLiveIE.ie_key(), video_id=video_id)) + board_name = None + if posts_id: + board = self._call_api( + 'board/v1.0/board-%s', posts_id, 'title,boardType') + board_name = board.get('title') or 'Unknown' + if board.get('boardType') not in ('STAR', 'VLIVE_PLUS'): + raise ExtractorError(f'Board {board_name!r} is not supported', expected=True) + + entries = self._entries(posts_id or channel_id, board_name) + first_video = next(entries) + channel_name = first_video['channel'] return self.playlist_result( - entries, channel_code, channel_name) + itertools.chain([first_video], entries), + f'{channel_id}-{posts_id}' if posts_id else channel_id, + f'{channel_name} - {board_name}' if channel_name and board_name else channel_name) diff --git a/yt_dlp/extractor/wakanim.py b/yt_dlp/extractor/wakanim.py index c956d616e..a61a630e2 100644 --- a/yt_dlp/extractor/wakanim.py +++ b/yt_dlp/extractor/wakanim.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +from urllib.parse import unquote + from .common import InfoExtractor from ..utils import ( merge_dicts, @@ -31,26 +33,37 @@ class WakanimIE(InfoExtractor): 'url': 'https://www.wakanim.tv/de/v2/catalogue/episode/7843/sword-art-online-alicization-omu-arc-2-folge-15-omu', 'only_matching': True, }] + _GEO_BYPASS = False def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - m3u8_url = urljoin(url, self._search_regex( - r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, 'm3u8 url', + if 'Geoblocking' in webpage: + if '/de/' in url: + self.raise_geo_restricted(countries=['DE', 'AT', 'CH']) + else: + self.raise_geo_restricted(countries=['RU']) + + manifest_url = urljoin(url, self._search_regex( + r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, 'manifest url', group='url')) if not 
self.get_param('allow_unplayable_formats'): # https://docs.microsoft.com/en-us/azure/media-services/previous/media-services-content-protection-overview#streaming-urls encryption = self._search_regex( r'encryption%3D(c(?:enc|bc(?:s-aapl)?))', - m3u8_url, 'encryption', default=None) + manifest_url, 'encryption', default=None) if encryption in ('cenc', 'cbcs-aapl'): self.report_drm(video_id) - formats = self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls') + if 'format=mpd-time-cmaf' in unquote(manifest_url): + formats = self._extract_mpd_formats( + manifest_url, video_id, mpd_id='dash') + else: + formats = self._extract_m3u8_formats( + manifest_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') info = self._search_json_ld(webpage, video_id, default={}) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index aa58a22bf..658b45fe1 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -695,7 +695,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): class YoutubeIE(YoutubeBaseInfoExtractor): - IE_DESC = 'YouTube.com' + IE_DESC = 'YouTube' _INVIDIOUS_SITES = ( # invidious-redirect websites r'(?:www\.)?redirect\.invidious\.io', @@ -2696,6 +2696,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): thumbnails.append({ 'url': thumbnail_url, }) + original_thumbnails = thumbnails.copy() + # The best resolution thumbnails sometimes does not appear in the webpage # See: https://github.com/ytdl-org/youtube-dl/issues/29049, https://github.com/yt-dlp/yt-dlp/issues/340 # List of possible thumbnails - Ref: <https://stackoverflow.com/a/20542029> @@ -2706,7 +2708,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'default', '1', '2', '3' ] n_thumbnail_names = len(thumbnail_names) - thumbnails.extend({ 'url': 'https://i.ytimg.com/vi{webp}/{video_id}/{name}{live}.{ext}'.format( video_id=video_id, name=name, ext=ext, @@ -2716,6 +2717,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): i = next((i for i, t in enumerate(thumbnail_names) if f'/{video_id}/{t}' in thumb['url']), n_thumbnail_names) thumb['preference'] = (0 if '.webp' in thumb['url'] else -1) - (2 * i) self._remove_duplicate_formats(thumbnails) + self._downloader._sort_thumbnails(original_thumbnails) category = get_first(microformats, 'category') or search_meta('genre') channel_id = str_or_none( @@ -2745,6 +2747,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': self._live_title(video_title) if is_live else video_title, 'formats': formats, 'thumbnails': thumbnails, + # The best thumbnail that we are sure exists. Prevents unnecessary + # URL checking if user don't care about getting the best possible thumbnail + 'thumbnail': traverse_obj(original_thumbnails, (-1, 'url')), 'description': video_description, 'upload_date': unified_strdate( get_first(microformats, 'uploadDate') @@ -3010,7 +3015,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): class YoutubeTabIE(YoutubeBaseInfoExtractor): - IE_DESC = 'YouTube.com tab' + IE_DESC = 'YouTube Tabs' _VALID_URL = r'''(?x) https?:// (?:\w+\.)? @@ -4238,7 +4243,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): class YoutubePlaylistIE(InfoExtractor): - IE_DESC = 'YouTube.com playlists' + IE_DESC = 'YouTube playlists' _VALID_URL = r'''(?x)(?: (?:https?://)? (?:\w+\.)? 
@@ -4304,9 +4309,7 @@ class YoutubePlaylistIE(InfoExtractor): def suitable(cls, url): if YoutubeTabIE.suitable(url): return False - # Hack for lazy extractors until more generic solution is implemented - # (see #28780) - from .youtube import parse_qs + from ..utils import parse_qs qs = parse_qs(url) if qs.get('v', [None])[0]: return False @@ -4364,7 +4367,7 @@ class YoutubeYtBeIE(InfoExtractor): class YoutubeYtUserIE(InfoExtractor): - IE_DESC = 'YouTube.com user videos, URL or "ytuser" keyword' + IE_DESC = 'YouTube user videos; "ytuser:" prefix' _VALID_URL = r'ytuser:(?P<id>.+)' _TESTS = [{ 'url': 'ytuser:phihag', @@ -4380,7 +4383,7 @@ class YoutubeYtUserIE(InfoExtractor): class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): IE_NAME = 'youtube:favorites' - IE_DESC = 'YouTube.com liked videos, ":ytfav" for short (requires authentication)' + IE_DESC = 'YouTube liked videos; ":ytfav" keyword (requires cookies)' _VALID_URL = r':ytfav(?:ou?rite)?s?' _LOGIN_REQUIRED = True _TESTS = [{ @@ -4398,10 +4401,7 @@ class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): class YoutubeSearchIE(SearchInfoExtractor, YoutubeTabIE): - IE_DESC = 'YouTube.com searches, "ytsearch" keyword' - # there doesn't appear to be a real limit, for example if you search for - # 'python' you get more than 8.000.000 results - _MAX_RESULTS = float('inf') + IE_DESC = 'YouTube searches' IE_NAME = 'youtube:search' _SEARCH_KEY = 'ytsearch' _SEARCH_PARAMS = None @@ -4461,13 +4461,14 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubeTabIE): class YoutubeSearchDateIE(YoutubeSearchIE): IE_NAME = YoutubeSearchIE.IE_NAME + ':date' _SEARCH_KEY = 'ytsearchdate' - IE_DESC = 'YouTube.com searches, newest videos first, "ytsearchdate" keyword' + IE_DESC = 'YouTube searches, newest videos first' _SEARCH_PARAMS = 'CAI%3D' class YoutubeSearchURLIE(YoutubeSearchIE): - IE_DESC = 'YouTube.com search URLs' + IE_DESC = 'YouTube search URLs with sorting and filter support' IE_NAME = YoutubeSearchIE.IE_NAME + '_url' + _SEARCH_KEY = None _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?:[^&]+)(?:[&]|$)' # _MAX_RESULTS = 100 _TESTS = [{ @@ -4513,7 +4514,7 @@ class YoutubeFeedsInfoExtractor(YoutubeTabIE): class YoutubeWatchLaterIE(InfoExtractor): IE_NAME = 'youtube:watchlater' - IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)' + IE_DESC = 'Youtube watch later list; ":ytwatchlater" keyword (requires cookies)' _VALID_URL = r':ytwatchlater' _TESTS = [{ 'url': ':ytwatchlater', @@ -4526,7 +4527,7 @@ class YoutubeWatchLaterIE(InfoExtractor): class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): - IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)' + IE_DESC = 'YouTube recommended videos; ":ytrec" keyword' _VALID_URL = r'https?://(?:www\.)?youtube\.com/?(?:[?#]|$)|:ytrec(?:ommended)?' _FEED_NAME = 'recommended' _LOGIN_REQUIRED = False @@ -4543,7 +4544,7 @@ class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): - IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)' + IE_DESC = 'YouTube subscriptions feed; ":ytsubs" keyword (requires cookies)' _VALID_URL = r':ytsub(?:scription)?s?' 
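
The YoutubePlaylistIE.suitable() fix above replaces the lazy-extractor hack with a plain import; the check itself just asks whether the URL carries a ?v= parameter, in which case the single-video extractor should win over the playlist one. The same decision using only the stdlib (note yt-dlp's parse_qs accepts a full URL, while the stdlib version wants only the query string):

    from urllib.parse import parse_qs, urlparse

    def prefers_single_video(url):
        # parse_qs returns lists, hence the [None] default and [0] index,
        # mirroring qs.get('v', [None])[0] in the diff above.
        qs = parse_qs(urlparse(url).query)
        return bool(qs.get('v', [None])[0])

    # prefers_single_video('https://www.youtube.com/watch?v=abc&list=PL1')  -> True
    # prefers_single_video('https://www.youtube.com/playlist?list=PL1')     -> False
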
_FEED_NAME = 'subscriptions' _TESTS = [{ @@ -4556,7 +4557,7 @@ class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): - IE_DESC = 'Youtube watch history, ":ythis" for short (requires authentication)' + IE_DESC = 'Youtube watch history; ":ythis" keyword (requires cookies)' _VALID_URL = r':ythis(?:tory)?' _FEED_NAME = 'history' _TESTS = [{ diff --git a/yt_dlp/minicurses.py b/yt_dlp/minicurses.py index a6e159a14..699b1158a 100644 --- a/yt_dlp/minicurses.py +++ b/yt_dlp/minicurses.py @@ -1,6 +1,77 @@ import functools from threading import Lock -from .utils import supports_terminal_sequences, TERMINAL_SEQUENCES, write_string +from .utils import supports_terminal_sequences, write_string + + +CONTROL_SEQUENCES = { + 'DOWN': '\n', + 'UP': '\033[A', + 'ERASE_LINE': '\033[K', + 'RESET': '\033[0m', +} + + +_COLORS = { + 'BLACK': '0', + 'RED': '1', + 'GREEN': '2', + 'YELLOW': '3', + 'BLUE': '4', + 'PURPLE': '5', + 'CYAN': '6', + 'WHITE': '7', +} + + +_TEXT_STYLES = { + 'NORMAL': '0', + 'BOLD': '1', + 'UNDERLINED': '4', +} + + +def format_text(text, f): + ''' + @param f String representation of formatting to apply in the form: + [style] [light] font_color [on [light] bg_color] + Eg: "red", "bold green on light blue" + ''' + f = f.upper() + tokens = f.strip().split() + + bg_color = '' + if 'ON' in tokens: + if tokens[-1] == 'ON': + raise SyntaxError(f'Empty background format specified in {f!r}') + if tokens[-1] not in _COLORS: + raise SyntaxError(f'{tokens[-1]} in {f!r} must be a color') + bg_color = f'4{_COLORS[tokens.pop()]}' + if tokens[-1] == 'LIGHT': + bg_color = f'0;10{bg_color[1:]}' + tokens.pop() + if tokens[-1] != 'ON': + raise SyntaxError(f'Invalid format {f.split(" ON ", 1)[1]!r} in {f!r}') + bg_color = f'\033[{bg_color}m' + tokens.pop() + + if not tokens: + fg_color = '' + elif tokens[-1] not in _COLORS: + raise SyntaxError(f'{tokens[-1]} in {f!r} must be a color') + else: + fg_color = f'3{_COLORS[tokens.pop()]}' + if tokens and tokens[-1] == 'LIGHT': + fg_color = f'9{fg_color[1:]}' + tokens.pop() + fg_style = tokens.pop() if tokens and tokens[-1] in _TEXT_STYLES else 'NORMAL' + fg_color = f'\033[{_TEXT_STYLES[fg_style]};{fg_color}m' + if tokens: + raise SyntaxError(f'Invalid format {" ".join(tokens)!r} in {f!r}') + + if fg_color or bg_color: + return f'{fg_color}{bg_color}{text}{CONTROL_SEQUENCES["RESET"]}' + else: + return text class MultilinePrinterBase: @@ -67,15 +138,15 @@ class MultilinePrinter(MultilinePrinterBase): yield '\r' distance = dest - current if distance < 0: - yield TERMINAL_SEQUENCES['UP'] * -distance + yield CONTROL_SEQUENCES['UP'] * -distance elif distance > 0: - yield TERMINAL_SEQUENCES['DOWN'] * distance + yield CONTROL_SEQUENCES['DOWN'] * distance self._lastline = dest @lock def print_at_line(self, text, pos): if self._HAVE_FULLCAP: - self.write(*self._move_cursor(pos), TERMINAL_SEQUENCES['ERASE_LINE'], text) + self.write(*self._move_cursor(pos), CONTROL_SEQUENCES['ERASE_LINE'], text) text = self._add_line_number(text, pos) textlen = len(text) @@ -103,7 +174,7 @@ class MultilinePrinter(MultilinePrinterBase): if self._HAVE_FULLCAP: self.write( - *text, TERMINAL_SEQUENCES['ERASE_LINE'], - f'{TERMINAL_SEQUENCES["UP"]}{TERMINAL_SEQUENCES["ERASE_LINE"]}' * self.maximum) + *text, CONTROL_SEQUENCES['ERASE_LINE'], + f'{CONTROL_SEQUENCES["UP"]}{CONTROL_SEQUENCES["ERASE_LINE"]}' * self.maximum) else: self.write(*text, ' ' * self._lastlength) diff --git a/yt_dlp/options.py b/yt_dlp/options.py index b45b79bc9..eb86f9e0c 100644 
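
format_text in the minicurses changes above compiles a small 'style [light] color [on [light] color]' language into ANSI SGR sequences. Hand-tracing the definitions gives, for example (illustrative only):

    from yt_dlp.minicurses import format_text

    format_text('error', 'bold red')           # '\033[1;31merror\033[0m'
    format_text('ok', 'green on light black')  # '\033[0;32m\033[0;100mok\033[0m'
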
--- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -558,12 +558,16 @@ def parseOpts(overrideArguments=None): help="Don't give any special preference to free containers (default)") video_format.add_option( '--check-formats', - action='store_true', dest='check_formats', default=None, - help='Check that the formats selected are actually downloadable') + action='store_const', const='selected', dest='check_formats', default=None, + help='Check that the selected formats are actually downloadable') + video_format.add_option( + '--check-all-formats', + action='store_true', dest='check_formats', + help='Check all formats for whether they are actually downloadable') video_format.add_option( '--no-check-formats', action='store_false', dest='check_formats', - help='Do not check that the formats selected are actually downloadable') + help='Do not check that the formats are actually downloadable') video_format.add_option( '-F', '--list-formats', action='store_true', dest='listformats', @@ -972,6 +976,9 @@ def parseOpts(overrideArguments=None): dest='batchfile', action='store_const', const=None, help='Do not read URLs from batch file (default)') filesystem.add_option( + '--id', default=False, + action='store_true', dest='useid', help=optparse.SUPPRESS_HELP) + filesystem.add_option( '-P', '--paths', metavar='[TYPES:]PATH', dest='paths', default={}, type='str', action='callback', callback=_dict_from_options_callback, diff --git a/yt_dlp/postprocessor/ffmpeg.py b/yt_dlp/postprocessor/ffmpeg.py index 4a0a96427..b7fcc569b 100644 --- a/yt_dlp/postprocessor/ffmpeg.py +++ b/yt_dlp/postprocessor/ffmpeg.py @@ -41,6 +41,7 @@ EXT_TO_OUT_FORMATS = { 'ts': 'mpegts', 'wma': 'asf', 'wmv': 'asf', + 'vtt': 'webvtt', } ACODECS = { 'mp3': 'libmp3lame', diff --git a/yt_dlp/postprocessor/sponsorblock.py b/yt_dlp/postprocessor/sponsorblock.py index 7265a9de7..70c5462d1 100644 --- a/yt_dlp/postprocessor/sponsorblock.py +++ b/yt_dlp/postprocessor/sponsorblock.py @@ -1,6 +1,8 @@ +from hashlib import sha256 +import itertools import json import re -from hashlib import sha256 +import time from .ffmpeg import FFmpegPostProcessor from ..compat import compat_urllib_parse_urlencode, compat_HTTPError @@ -33,6 +35,7 @@ class SponsorBlockPP(FFmpegPostProcessor): self.to_screen(f'SponsorBlock is not supported for {extractor}') return [], info + self.to_screen('Fetching SponsorBlock segments') info['sponsorblock_chapters'] = self._get_sponsor_chapters(info, info['duration']) return [], info @@ -79,18 +82,28 @@ class SponsorBlockPP(FFmpegPostProcessor): 'service': service, 'categories': json.dumps(self._categories), }) + self.write_debug(f'SponsorBlock query: {url}') for d in self._get_json(url): if d['videoID'] == video_id: return d['segments'] return [] def _get_json(self, url): - self.write_debug(f'SponsorBlock query: {url}') - try: - rsp = self._downloader.urlopen(sanitized_Request(url)) - except network_exceptions as e: - if isinstance(e, compat_HTTPError) and e.code == 404: - return [] - raise PostProcessingError(f'Unable to communicate with SponsorBlock API - {e}') - - return json.loads(rsp.read().decode(rsp.info().get_param('charset') or 'utf-8')) + # While this is not an extractor, it behaves similar to one and + # so obey extractor_retries and sleep_interval_requests + max_retries = self.get_param('extractor_retries', 3) + sleep_interval = self.get_param('sleep_interval_requests') or 0 + for retries in itertools.count(): + try: + rsp = self._downloader.urlopen(sanitized_Request(url)) + return 
json.loads(rsp.read().decode(rsp.info().get_param('charset') or 'utf-8')) + except network_exceptions as e: + if isinstance(e, compat_HTTPError) and e.code == 404: + return [] + if retries < max_retries: + self.report_warning(f'{e}. Retrying...') + if sleep_interval > 0: + self.to_screen(f'Sleeping {sleep_interval} seconds ...') + time.sleep(sleep_interval) + continue + raise PostProcessingError(f'Unable to communicate with SponsorBlock API: {e}') diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 88adbd3b9..e70c5f909 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -2492,9 +2492,9 @@ class GeoRestrictedError(ExtractorError): geographic location due to geographic restrictions imposed by a website. """ - def __init__(self, msg, countries=None): - super(GeoRestrictedError, self).__init__(msg, expected=True) - self.msg = msg + def __init__(self, msg, countries=None, **kwargs): + kwargs['expected'] = True + super(GeoRestrictedError, self).__init__(msg, **kwargs) self.countries = countries @@ -2542,23 +2542,33 @@ class PostProcessingError(YoutubeDLError): self.msg = msg -class ExistingVideoReached(YoutubeDLError): - """ --max-downloads limit has been reached. """ - pass +class DownloadCancelled(YoutubeDLError): + """ Exception raised when the download queue should be interrupted """ + msg = 'The download was cancelled' + def __init__(self, msg=None): + if msg is not None: + self.msg = msg + YoutubeDLError.__init__(self, self.msg) -class RejectedVideoReached(YoutubeDLError): - """ --max-downloads limit has been reached. """ - pass +class ExistingVideoReached(DownloadCancelled): + """ --break-on-existing triggered """ + msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing' -class ThrottledDownload(YoutubeDLError): - """ Download speed below --throttled-rate. """ - pass + +class RejectedVideoReached(DownloadCancelled): + """ --break-on-reject triggered """ + msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject' -class MaxDownloadsReached(YoutubeDLError): +class MaxDownloadsReached(DownloadCancelled): """ --max-downloads limit has been reached. """ + msg = 'Maximum number of downloads reached, stopping due to --max-downloads' + + +class ThrottledDownload(YoutubeDLError): + """ Download speed below --throttled-rate. 
""" pass @@ -3714,14 +3724,14 @@ def parse_resolution(s): if s is None: return {} - mobj = re.search(r'\b(?P<w>\d+)\s*[xX×]\s*(?P<h>\d+)\b', s) + mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s) if mobj: return { 'width': int(mobj.group('w')), 'height': int(mobj.group('h')), } - mobj = re.search(r'\b(\d+)[pPiI]\b', s) + mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s) if mobj: return {'height': int(mobj.group(1))} @@ -4050,6 +4060,8 @@ class LazyList(collections.abc.Sequence): def __exhaust(self): self.__cache.extend(self.__iterable) + # Discard the emptied iterable to make it pickle-able + self.__iterable = [] return self.__cache def exhaust(self): @@ -4501,6 +4513,7 @@ OUTTMPL_TYPES = { 'description': 'description', 'annotation': 'annotations.xml', 'infojson': 'info.json', + 'link': None, 'pl_thumbnail': None, 'pl_description': 'description', 'pl_infojson': 'info.json', @@ -4729,7 +4742,7 @@ def determine_protocol(info_dict): if protocol is not None: return protocol - url = info_dict['url'] + url = sanitize_url(info_dict['url']) if url.startswith('rtmp'): return 'rtmp' elif url.startswith('mms'): @@ -4748,9 +4761,11 @@ def determine_protocol(info_dict): def render_table(header_row, data, delim=False, extraGap=0, hideEmpty=False): """ Render a list of rows, each as a list of values """ + def width(string): + return len(remove_terminal_sequences(string)) def get_max_lens(table): - return [max(len(compat_str(v)) for v in col) for col in zip(*table)] + return [max(width(str(v)) for v in col) for col in zip(*table)] def filter_using_list(row, filterArray): return [col for (take, col) in zip(filterArray, row) if take] @@ -4762,10 +4777,15 @@ def render_table(header_row, data, delim=False, extraGap=0, hideEmpty=False): table = [header_row] + data max_lens = get_max_lens(table) + extraGap += 1 if delim: - table = [header_row] + [['-' * ml for ml in max_lens]] + data - format_str = ' '.join('%-' + compat_str(ml + extraGap) + 's' for ml in max_lens[:-1]) + ' %s' - return '\n'.join(format_str % tuple(row) for row in table) + table = [header_row] + [[delim * (ml + extraGap) for ml in max_lens]] + data + max_lens[-1] = 0 + for row in table: + for pos, text in enumerate(map(str, row)): + row[pos] = text + (' ' * (max_lens[pos] - width(text) + extraGap)) + ret = '\n'.join(''.join(row) for row in table) + return ret def _match_one(filter_part, dct, incomplete): @@ -6229,6 +6249,12 @@ URL=%(url)s Icon=text-html '''.lstrip() +LINK_TEMPLATES = { + 'url': DOT_URL_LINK_TEMPLATE, + 'desktop': DOT_DESKTOP_LINK_TEMPLATE, + 'webloc': DOT_WEBLOC_LINK_TEMPLATE, +} + def iri_to_uri(iri): """ @@ -6486,6 +6512,13 @@ def jwt_encode_hs256(payload_data, key, headers={}): return token +# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256 +def jwt_decode_hs256(jwt): + header_b64, payload_b64, signature_b64 = jwt.split('.') + payload_data = json.loads(base64.urlsafe_b64decode(payload_b64)) + return payload_data + + def supports_terminal_sequences(stream): if compat_os_name == 'nt': if get_windows_version() < (10, 0, 10586): @@ -6498,12 +6531,12 @@ def supports_terminal_sequences(stream): return False -TERMINAL_SEQUENCES = { - 'DOWN': '\n', - 'UP': '\x1b[A', - 'ERASE_LINE': '\x1b[K', - 'RED': '\033[0;31m', - 'YELLOW': '\033[0;33m', - 'BLUE': '\033[0;34m', - 'RESET_STYLE': '\033[0m', -} +_terminal_sequences_re = re.compile('\033\\[[^m]+m') + + +def remove_terminal_sequences(string): + return 
_terminal_sequences_re.sub('', string)
+
+
+def number_of_digits(number):
+    return len('%d' % number)
diff --git a/yt_dlp/version.py b/yt_dlp/version.py
index 83b6fea9f..e7203be6b 100644
--- a/yt_dlp/version.py
+++ b/yt_dlp/version.py
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals
-__version__ = '2021.10.10'
+__version__ = '2021.10.22'
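
The SponsorBlock change earlier in this diff reuses the extractor knobs extractor_retries and sleep_interval_requests for its own API calls; the rewritten _get_json boils down to a bounded retry-with-sleep loop. A generic sketch of the same shape, with a hypothetical fetch callable standing in for urlopen and network_exceptions:

    import itertools
    import time

    def fetch_with_retries(fetch, max_retries=3, sleep_interval=0):
        for retries in itertools.count():
            try:
                return fetch()
            except OSError as e:  # stand-in for yt-dlp's network_exceptions
                if retries >= max_retries:
                    raise
                print('%s. Retrying...' % e)
                if sleep_interval > 0:
                    time.sleep(sleep_interval)
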
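
Similarly, render_table now measures column widths on text with terminal sequences stripped, via the remove_terminal_sequences helper added in utils.py, so coloured cells still line up. The idea in isolation:

    import re

    _ansi_re = re.compile('\033\\[[^m]+m')  # same pattern as _terminal_sequences_re

    def visible_width(s):
        # Width of the text as the terminal will actually render it.
        return len(_ansi_re.sub('', s))

    assert visible_width('\033[1;31merror\033[0m') == len('error')
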