| field | value | date |
|---|---|---|
| author | Jesús <heckyel@hyperbola.info> | 2022-06-27 01:25:17 +0800 |
| committer | Jesús <heckyel@hyperbola.info> | 2022-06-27 01:25:17 +0800 |
| commit | 16e8548f6a720a78679e417a20a300db2036bf6c (patch) | |
| tree | b1247bca3417ce882e4a4d80213f41c20113c1a4 | |
| parent | 4bbf329feb5a820ac21269fa426c95ca14d7af25 (diff) | |
| parent | e08f72e6759fb6b1102521f0bdb9457038ef7c06 (diff) | |
| download | hypervideo-pre-16e8548f6a720a78679e417a20a300db2036bf6c.tar.lz hypervideo-pre-16e8548f6a720a78679e417a20a300db2036bf6c.tar.xz hypervideo-pre-16e8548f6a720a78679e417a20a300db2036bf6c.zip | |
updated from upstream 27/06/2022 at 01:25
213 files changed, 8374 insertions, 5705 deletions
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
index 9b29acb0c..17a1d192d 100644
--- a/CONTRIBUTORS
+++ b/CONTRIBUTORS
@@ -231,3 +231,39 @@ Fam0r
 bohwaz
 dodrian
 vvto33
+ca-za
+connercsbn
+diegorodriguezv
+ekangmonyet
+elyse0
+evansp
+GiedriusS
+HE7086
+JordanWeatherby
+m4tu4g
+MarwenDallel
+nevack
+putnam
+rand-net
+vertan
+Wikidepia
+Yipten
+moench-tegeder
+christoph-heinrich
+HobbyistDev
+LunarFang416
+sbor23
+aurelg
+adamanldo
+gamer191
+vkorablin
+Burve
+mnn
+ZhymabekRoman
+mozbugbox
+aejdl
+ping
+sqrtNOT
+bubbleguuum
+darkxex
+miseran
diff --git a/Changelog.md b/Changelog.md
index 243f3d244..a6b898bd8 100644
--- a/Changelog.md
+++ b/Changelog.md
@@ -11,6 +11,249 @@
 -->
+### 2022.06.22.1
+
+* [build] Fix updating homebrew formula
+
+### 2022.06.22
+
+* [**Deprecate support for Python 3.6**](https://github.com/yt-dlp/yt-dlp/issues/3764#issuecomment-1154051119)
+* **Add option `--download-sections` to download video partially**
+    * Chapter regex and time ranges are accepted (Eg: `--download-sections *1:10-2:20`)
+* Add option `--alias`
+* Add option `--lazy-playlist` to process entries as they are received
+* Add option `--retry-sleep`
+* Add slicing notation to `--playlist-items` (see the sketch after this section's list)
+    * Adds support for negative indices and step
+    * Add `-I` as alias for `--playlist-index`
+    * Makes `--playlist-start`, `--playlist-end`, `--playlist-reverse`, `--no-playlist-reverse` redundant
+* `--config-location -` to provide options interactively
+* [build] Add Linux standalone builds
+* [update] Self-restart after update
+* Merge youtube-dl: Upto [commit/8a158a9](https://github.com/ytdl-org/youtube-dl/commit/8a158a9)
+* Add `--no-update`
+* Allow extractors to specify section_start/end for clips
+* Do not print progress to `stderr` with `-q`
+* Ensure pre-processor errors do not block video download
+* Fix `--simulate --max-downloads`
+* Improve error handling of bad config files
+* Return an error code if update fails
+* Fix bug in [3a408f9](https://github.com/yt-dlp/yt-dlp/commit/3a408f9d199127ca2626359e21a866a09ab236b3)
+* [ExtractAudio] Allow conditional conversion
+* [ModifyChapters] Fix repeated removal of small segments
+* [ThumbnailsConvertor] Allow conditional conversion
+* [cookies] Detect profiles for cygwin/BSD by [moench-tegeder](https://github.com/moench-tegeder)
+* [dash] Show fragment count with `--live-from-start` by [flashdagger](https://github.com/flashdagger)
+* [extractor] Add `_search_json` by [coletdjnz](https://github.com/coletdjnz), [pukkandan](https://github.com/pukkandan)
+* [extractor] Add `default` parameter to `_search_json` by [coletdjnz](https://github.com/coletdjnz), [pukkandan](https://github.com/pukkandan)
+* [extractor] Add dev option `--load-pages`
+* [extractor] Handle `json_ld` with multiple `@type`s
+* [extractor] Import `_ALL_CLASSES` lazily
+* [extractor] Recognize `src` attribute from HTML5 media elements by [Lesmiscore](https://github.com/Lesmiscore)
+* [extractor/generic] Revert e6ae51c123897927eb3c9899923d8ffd31c7f85d
+* [f4m] Bugfix
+* [ffmpeg] Check version lazily
+* [jsinterp] Some optimizations and refactoring by [dirkf](https://github.com/dirkf), [pukkandan](https://github.com/pukkandan)
+* [utils] Improve performance using `functools.cache`
+* [utils] Send HTTP/1.1 ALPN extension by [coletdjnz](https://github.com/coletdjnz)
+* [utils] `ExtractorError`: Fix `exc_info`
+* [utils] `ISO3166Utils`: Add `EU` and `AP`
+* [utils] `Popen`: Refactor to use contextmanager
+* [utils] `locked_file`: Fix for PyPy on Windows
+* [update] Expose more functionality to API
+* [update] Use `.git` folder to distinguish `source`/`unknown`
+* [compat] Add `functools.cached_property`
+* [test] Fix `FakeYDL` signatures by [coletdjnz](https://github.com/coletdjnz)
+* [docs] Improvements
+* [cleanup, ExtractAudio] Refactor
+* [cleanup, downloader] Refactor `report_progress`
+* [cleanup, extractor] Refactor `_download_...` methods
+* [cleanup, extractor] Rename `extractors.py` to `_extractors.py`
+* [cleanup, utils] Don't use kwargs for `format_field`
+* [cleanup, build] Refactor
+* [cleanup, docs] Re-indent "Usage and Options" section
+* [cleanup] Deprecate `YoutubeDL.parse_outtmpl`
+* [cleanup] Misc fixes and cleanup by [Lesmiscore](https://github.com/Lesmiscore), [MrRawes](https://github.com/MrRawes), [christoph-heinrich](https://github.com/christoph-heinrich), [flashdagger](https://github.com/flashdagger), [gamer191](https://github.com/gamer191), [kwconder](https://github.com/kwconder), [pukkandan](https://github.com/pukkandan)
+* [extractor/DailyWire] Add extractors by [HobbyistDev](https://github.com/HobbyistDev), [pukkandan](https://github.com/pukkandan)
+* [extractor/fourzerostudio] Add extractors by [Lesmiscore](https://github.com/Lesmiscore)
+* [extractor/GoogleDrive] Add folder extractor by [evansp](https://github.com/evansp), [pukkandan](https://github.com/pukkandan)
+* [extractor/MirrorCoUK] Add extractor by [LunarFang416](https://github.com/LunarFang416), [pukkandan](https://github.com/pukkandan)
+* [extractor/atscaleconfevent] Add extractor by [Ashish0804](https://github.com/Ashish0804)
+* [extractor/freetv] Add extractor by [elyse0](https://github.com/elyse0)
+* [extractor/ixigua] Add Extractor by [HobbyistDev](https://github.com/HobbyistDev)
+* [extractor/kicker.de] Add extractor by [HobbyistDev](https://github.com/HobbyistDev)
+* [extractor/netverse] Add extractors by [HobbyistDev](https://github.com/HobbyistDev), [pukkandan](https://github.com/pukkandan)
+* [extractor/playsuisse] Add extractor by [pukkandan](https://github.com/pukkandan), [sbor23](https://github.com/sbor23)
+* [extractor/substack] Add extractor by [elyse0](https://github.com/elyse0)
+* [extractor/youtube] **Support downloading clips**
+* [extractor/youtube] Add `innertube_host` and `innertube_key` extractor args by [coletdjnz](https://github.com/coletdjnz)
+* [extractor/youtube] Add warning for PostLiveDvr
+* [extractor/youtube] Bring back `_extract_chapters_from_description`
+* [extractor/youtube] Extract `comment_count` from webpage
+* [extractor/youtube] Fix `:ytnotifications` extractor by [coletdjnz](https://github.com/coletdjnz)
+* [extractor/youtube] Fix initial player response extraction by [coletdjnz](https://github.com/coletdjnz), [pukkandan](https://github.com/pukkandan)
+* [extractor/youtube] Fix live chat for videos with content warning by [coletdjnz](https://github.com/coletdjnz)
+* [extractor/youtube] Make signature extraction non-fatal
+* [extractor/youtube:tab] Detect `videoRenderer` in `_post_thread_continuation_entries`
+* [extractor/BiliIntl] Fix metadata extraction
+* [extractor/BiliIntl] Fix subtitle extraction by [HobbyistDev](https://github.com/HobbyistDev)
+* [extractor/FranceCulture] Fix extractor by [aurelg](https://github.com/aurelg), [pukkandan](https://github.com/pukkandan)
+* [extractor/PokemonSoundLibrary] Remove extractor by [Lesmiscore](https://github.com/Lesmiscore)
+* [extractor/StreamCZ] Fix extractor by [adamanldo](https://github.com/adamanldo), [dirkf](https://github.com/dirkf)
+* [extractor/WatchESPN] Support free videos and BAM_DTC by [ischmidt20](https://github.com/ischmidt20)
+* [extractor/animelab] Remove extractor by [gamer191](https://github.com/gamer191)
+* [extractor/bloomberg] Change playback endpoint by [m4tu4g](https://github.com/m4tu4g)
+* [extractor/ccc] Extract view_count by [vkorablin](https://github.com/vkorablin)
+* [extractor/crunchyroll:beta] Fix extractor after API change by [Burve](https://github.com/Burve), [tejing1](https://github.com/tejing1)
+* [extractor/curiositystream] Get `auth_token` from cookie by [mnn](https://github.com/mnn)
+* [extractor/digitalconcerthall] Fix extractor by [ZhymabekRoman](https://github.com/ZhymabekRoman)
+* [extractor/dropbox] Extract the correct `mountComponent`
+* [extractor/dropout] Login is not mandatory
+* [extractor/duboku] Fix for hostname change by [mozbugbox](https://github.com/mozbugbox)
+* [extractor/espn] Add `WatchESPN` extractor by [ischmidt20](https://github.com/ischmidt20), [pukkandan](https://github.com/pukkandan)
+* [extractor/expressen] Fix extractor by [aejdl](https://github.com/aejdl)
+* [extractor/foxnews] Update embed extraction by [elyse0](https://github.com/elyse0)
+* [extractor/ina] Fix extractor by [elyse0](https://github.com/elyse0)
+* [extractor/iwara:user] Make paging better by [Lesmiscore](https://github.com/Lesmiscore)
+* [extractor/jwplatform] Look for `data-video-jw-id`
+* [extractor/lbry] Update livestream API by [flashdagger](https://github.com/flashdagger)
+* [extractor/mediaset] Improve `_VALID_URL`
+* [extractor/naver] Add `navernow` extractor by [ping](https://github.com/ping)
+* [extractor/niconico:series] Fix extractor by [sqrtNOT](https://github.com/sqrtNOT)
+* [extractor/npr] Use stream url from json-ld by [r5d](https://github.com/r5d)
+* [extractor/pornhub] Extract `uploader_id` field by [Lesmiscore](https://github.com/Lesmiscore)
+* [extractor/radiofrance] Add more radios by [bubbleguuum](https://github.com/bubbleguuum)
+* [extractor/rumble] Detect JS embed
+* [extractor/rumble] Extract subtitles by [fstirlitz](https://github.com/fstirlitz)
+* [extractor/southpark] Add `southpark.lat` extractor by [darkxex](https://github.com/darkxex)
+* [extractor/spotify:show] Fix extractor
+* [extractor/tiktok] Detect embeds
+* [extractor/tiktok] Extract `SIGI_STATE` by [dirkf](https://github.com/dirkf), [pukkandan](https://github.com/pukkandan), [sulyi](https://github.com/sulyi)
+* [extractor/tver] Fix extractor by [Lesmiscore](https://github.com/Lesmiscore)
+* [extractor/vevo] Fix extractor by [Lesmiscore](https://github.com/Lesmiscore)
+* [extractor/yahoo:gyao] Fix extractor
+* [extractor/zattoo] Fix live streams by [miseran](https://github.com/miseran)
+* [extractor/zdf] Improve format sorting by [elyse0](https://github.com/elyse0)
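A note on the slicing notation added in this section: the new `--playlist-items start:end:step` form maps onto ordinary Python slice semantics over 1-based playlist indices. A minimal sketch (not part of the diff; expectations are taken from the `test_playlist_items_selection` cases further down this patch):

```python
# Illustration only: 1-based playlist indices, as used in test_YoutubeDL.py below
INDICES = list(range(1, 11))

assert INDICES[::2] == [1, 3, 5, 7, 9]          # --playlist-items 1:inf:2
assert INDICES[:4:-2] == [10, 8, 6]             # --playlist-items -1:6:-2 (negative index and step)
assert INDICES[8:3:-2] == [9, 7, 5]             # --playlist-items 9:-6:-2
assert INDICES[::-1] == list(range(10, 0, -1))  # --playlist-items ::-1 replaces --playlist-reverse
```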
+
+
+### 2022.05.18
+
+* Add support for SSL client certificate authentication by [coletdjnz](https://github.com/coletdjnz), [dirkf](https://github.com/dirkf)
+    * Adds `--client-certificate`, `--client-certificate-key`, `--client-certificate-password`
+* Add `--match-filter -` to interactively ask for each video
+* `--max-downloads` should obey `--break-per-input`
+* Allow use of weaker ciphers with `--legacy-server-connect`
+* Don't imply `-s` for later stages of `-O`
+* Fix `--date today`
+* Fix `--skip-unavailable-fragments`
+* Fix color in `-q -F`
+* Fix redirect HTTP method handling by [coletdjnz](https://github.com/coletdjnz)
+* Improve `--clean-infojson`
+* Remove warning for videos with an empty title
+* Run `FFmpegFixupM3u8PP` for live-streams if needed
+* Show name of downloader in verbose log
+* [cookies] Allow `cookiefile` to be a text stream
+* [cookies] Report progress when importing cookies
+* [downloader/ffmpeg] Specify headers for each URL by [elyse0](https://github.com/elyse0)
+* [fragment] Do not change chunk-size when `--test`
+* [fragment] Make single thread download work for `--live-from-start` by [Lesmiscore](https://github.com/Lesmiscore)
+* [hls] Fix `byte_range` for `EXT-X-MAP` fragment by [fstirlitz](https://github.com/fstirlitz)
+* [http] Fix retrying on read timeout by [coletdjnz](https://github.com/coletdjnz)
+* [ffmpeg] Fix features detection
+* [EmbedSubtitle] Enable for more video extensions
+* [EmbedThumbnail] Disable thumbnail conversion for mkv by [evansp](https://github.com/evansp)
+* [EmbedThumbnail] Do not obey `-k`
+* [EmbedThumbnail] Do not remove id3v1 tags
+* [FFmpegMetadata] Remove `\0` from metadata
+* [FFmpegMetadata] Remove filename from attached info-json
+* [FixupM3u8] Obey `--hls-prefer-mpegts`
+* [Sponsorblock] Don't crash when duration is unknown
+* [XAttrMetadata] Refactor and document dependencies
+* [extractor] Document netrc machines
+* [extractor] Update `manifest_url`s after redirect by [elyse0](https://github.com/elyse0)
+* [extractor] Update dash `manifest_url` after redirects by [elyse0](https://github.com/elyse0)
+* [extractor] Use `classmethod`/`property` where possible
+* [generic] Refactor `_extract_rss`
+* [utils] `is_html`: Handle double BOM
+* [utils] `locked_file`: Ignore illegal seek on `truncate` by [jakeogh](https://github.com/jakeogh)
+* [utils] `sanitize_path`: Fix when path is empty string
+* [utils] `write_string`: Workaround newline issue in `conhost`
+* [utils] `certifi`: Make sure the pem file exists
+* [utils] Fix `WebSocketsWrapper`
+* [utils] `locked_file`: Do not give executable bits for newly created files by [Lesmiscore](https://github.com/Lesmiscore)
+* [utils] `YoutubeDLCookieJar`: Detect and reject JSON file by [Lesmiscore](https://github.com/Lesmiscore)
+* [test] Convert warnings into errors and fix some existing warnings by [fstirlitz](https://github.com/fstirlitz)
+* [dependencies] Create module with all dependency imports
+* [compat] Split into sub-modules by [fstirlitz](https://github.com/fstirlitz), [pukkandan](https://github.com/pukkandan)
+* [compat] Implement `compat.imghdr`
+* [build] Add `make uninstall` by [MrRawes](https://github.com/MrRawes)
+* [build] Avoid use of `install -D`
+* [build] Fix `Makefile` by [putnam](https://github.com/putnam)
+* [build] Fix `--onedir` on macOS
+* [build] Add more test-runners
+* [cleanup] Deprecate some compat vars by [fstirlitz](https://github.com/fstirlitz), [pukkandan](https://github.com/pukkandan)
+* [cleanup] Remove unused code paths, extractors, scripts and tests by [fstirlitz](https://github.com/fstirlitz)
+* [cleanup] Upgrade syntax (`pyupgrade`) and sort imports (`isort`)
+* [cleanup, docs, build] Misc fixes
+* [BilibiliLive] Add extractor by [HE7086](https://github.com/HE7086), [pukkandan](https://github.com/pukkandan)
+* [Fifa] Add Extractor by [Bricio](https://github.com/Bricio)
+* [goodgame] Add extractor by [nevack](https://github.com/nevack)
+* [gronkh] Add playlist extractors by [hatienl0i261299](https://github.com/hatienl0i261299)
+* [icareus] Add extractor by [tpikonen](https://github.com/tpikonen), [pukkandan](https://github.com/pukkandan)
+* [iwara] Add playlist extractors by [i6t](https://github.com/i6t)
+* [Likee] Add extractor by [hatienl0i261299](https://github.com/hatienl0i261299)
+* [masters] Add extractor by [m4tu4g](https://github.com/m4tu4g)
+* [nebula] Add support for subscriptions by [hheimbuerger](https://github.com/hheimbuerger)
+* [Podchaser] Add extractors by [connercsbn](https://github.com/connercsbn)
+* [rokfin:search] Add extractor by [P-reducible](https://github.com/P-reducible), [pukkandan](https://github.com/pukkandan)
+* [youtube] Add `:ytnotifications` extractor by [krichbanana](https://github.com/krichbanana)
+* [youtube] Add YoutubeStoriesIE (`ytstories:<channel UCID>`) by [coletdjnz](https://github.com/coletdjnz)
+* [ZingMp3] Add chart and user extractors by [hatienl0i261299](https://github.com/hatienl0i261299)
+* [adn] Update AES key by [elyse0](https://github.com/elyse0)
+* [adobepass] Allow cookies for authenticating MSO
+* [bandcamp] Exclude merch links by [Yipten](https://github.com/Yipten)
+* [chingari] Fix archiving and tests
+* [DRTV] Improve `_VALID_URL` by [vertan](https://github.com/vertan)
+* [facebook] Improve thumbnail extraction by [Wikidepia](https://github.com/Wikidepia)
+* [fc2] Stop heatbeating once FFmpeg finishes by [Lesmiscore](https://github.com/Lesmiscore)
+* [Gofile] Fix extraction and support password-protected links by [mehq](https://github.com/mehq)
+* [hotstar, cleanup] Refactor extractors
+* [InfoQ] Don't fail on missing audio format by [evansp](https://github.com/evansp)
+* [Jamendo] Extract more metadata by [evansp](https://github.com/evansp)
+* [kaltura] Update API calls by [flashdagger](https://github.com/flashdagger)
+* [KhanAcademy] Fix extractor by [rand-net](https://github.com/rand-net)
+* [LCI] Fix extractor by [MarwenDallel](https://github.com/MarwenDallel)
+* [lrt] Support livestreams by [GiedriusS](https://github.com/GiedriusS)
+* [niconico] Set `expected_protocol` to a public field
+* [Niconico] Support 2FA by [ekangmonyet](https://github.com/ekangmonyet)
+* [Olympics] Fix format extension
+* [openrec:movie] Enable fallback for /movie/ URLs
+* [PearVideo] Add fallback for formats by [hatienl0i261299](https://github.com/hatienl0i261299)
+* [radiko] Fix extractor by [Lesmiscore](https://github.com/Lesmiscore)
+* [rai] Add `release_year`
+* [reddit] Prevent infinite loop
+* [rokfin] Implement login by [P-reducible](https://github.com/P-reducible), [pukkandan](https://github.com/pukkandan)
+* [ruutu] Support hs.fi embeds by [tpikonen](https://github.com/tpikonen), [pukkandan](https://github.com/pukkandan)
+* [spotify] Detect iframe embeds by [fstirlitz](https://github.com/fstirlitz)
+* [telegram] Fix metadata extraction
+* [tmz, cleanup] Update tests by [diegorodriguezv](https://github.com/diegorodriguezv)
+* [toggo] Fix `_VALID_URL` by [ca-za](https://github.com/ca-za)
+* [trovo] Update to new API by [nyuszika7h](https://github.com/nyuszika7h)
+* [TVer] Improve extraction by [Lesmiscore](https://github.com/Lesmiscore)
+* [twitcasting] Pass headers for each formats by [Lesmiscore](https://github.com/Lesmiscore)
+* [VideocampusSachsen] Improve extractor by [FestplattenSchnitzel](https://github.com/FestplattenSchnitzel)
+* [vimeo] Fix extractors
+* [wat] Fix extraction of multi-language videos and subtitles by [elyse0](https://github.com/elyse0)
+* [wistia] Fix `_VALID_URL` by [dirkf](https://github.com/dirkf)
+* [youtube, cleanup] Minor refactoring by [coletdjnz](https://github.com/coletdjnz), [pukkandan](https://github.com/pukkandan)
+* [youtube] Added piped instance urls by [JordanWeatherby](https://github.com/JordanWeatherby)
+* [youtube] Deprioritize auto-generated thumbnails
+* [youtube] Deprioritize format 22 (often damaged)
+* [youtube] Fix episode metadata extraction
+* [zee5] Fix extractor by [Ashish0804](https://github.com/Ashish0804)
+* [zingmp3, cleanup] Refactor extractors
+
+
 ### 2022.04.08
 
 * Use certificates from `certifi` if installed by [coletdjnz](https://github.com/coletdjnz)
@@ -785,7 +1028,7 @@
 * [build] Improvements
     * Build standalone MacOS packages by [smplayer-dev](https://github.com/smplayer-dev)
     * Release windows exe built with `py2exe`
-    * Enable lazy-extractors in releases.
+    * Enable lazy-extractors in releases
         * Set env var `YTDLP_NO_LAZY_EXTRACTORS` to forcefully disable this (experimental)
 * Clean up error reporting in update
 * Refactor `pyinst.py`, misc cleanup and improve docs
@@ -1038,7 +1281,7 @@
 * [build] Automate more of the release process by [animelover1984](https://github.com/animelover1984), [pukkandan](https://github.com/pukkandan)
 * [build] Fix sha256 by [nihil-admirari](https://github.com/nihil-admirari)
 * [build] Bring back brew taps by [nao20010128nao](https://github.com/nao20010128nao)
-* [build] Provide `--onedir` zip for windows by [pukkandan](https://github.com/pukkandan)
+* [build] Provide `--onedir` zip for windows
 * [cleanup,docs] Add deprecation warning in docs for some counter intuitive behaviour
 * [cleanup] Fix line endings for `nebula.py` by [glenn-slayden](https://github.com/glenn-slayden)
 * [cleanup] Improve `make clean-test` by [sulyi](https://github.com/sulyi)
@@ -2031,7 +2274,7 @@
 * **Format Sort:** Added `--format-sort` (`-S`), `--format-sort-force` (`--S-force`) - See [Sorting Formats](README.md#sorting-formats) for details
 * **Format Selection:** See [Format Selection](README.md#format-selection) for details
 * New format selectors: `best*`, `worst*`, `bestvideo*`, `bestaudio*`, `worstvideo*`, `worstaudio*`
-    * Changed video format sorting to show video only files and video+audio files together.
+    * Changed video format sorting to show video only files and video+audio files together
 * Added `--video-multistreams`, `--no-video-multistreams`, `--audio-multistreams`, `--no-audio-multistreams`
 * Added `b`,`w`,`v`,`a` as alias for `best`, `worst`, `video` and `audio` respectively
 * Shortcut Options: Added `--write-link`, `--write-url-link`, `--write-webloc-link`, `--write-desktop-link` by [h-h-h-h](https://github.com/h-h-h-h) - See [Internet Shortcut Options](README.md#internet-shortcut-options) for details
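One small aside on the `[compat] Add functools.cached_property` entry in the changelog above: `functools.cached_property` is standard library only from Python 3.8, so a compat shim matters while 3.7 remains supported after the 3.6 deprecation. A minimal sketch of the behaviour being shimmed (illustration only, not from this patch):

```python
from functools import cached_property


class Formats:
    @cached_property
    def formats(self):
        print('expensive extraction runs once')
        return ['bestvideo', 'bestaudio']


f = Formats()
f.formats  # prints, then caches the result on the instance
f.formats  # served from the cache; no second extraction
```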
diff --git a/devscripts/bash-completion.py b/devscripts/bash-completion.py
index 268e8a2ae..9b4a9d4e2 100755
--- a/devscripts/bash-completion.py
+++ b/devscripts/bash-completion.py
@@ -1,9 +1,12 @@
 #!/usr/bin/env python3
+
+# Allow direct execution
 import os
 import sys
 
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
 import yt_dlp
 
 BASH_COMPLETION_FILE = "completions/bash/yt-dlp"
diff --git a/devscripts/check-porn.py b/devscripts/check-porn.py
index 08f663e4b..fc72c3051 100644
--- a/devscripts/check-porn.py
+++ b/devscripts/check-porn.py
@@ -13,9 +13,11 @@ import sys
 
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
-from test.helper import gettestcases
-from yt_dlp.utils import compat_urllib_parse_urlparse, compat_urllib_request
+import urllib.parse
+import urllib.request
+
+from test.helper import gettestcases
 
 if len(sys.argv) > 1:
     METHOD = 'LIST'
@@ -26,7 +28,7 @@ else:
 for test in gettestcases():
     if METHOD == 'EURISTIC':
         try:
-            webpage = compat_urllib_request.urlopen(test['url'], timeout=10).read()
+            webpage = urllib.request.urlopen(test['url'], timeout=10).read()
         except Exception:
             print('\nFail: {}'.format(test['name']))
             continue
@@ -36,7 +38,7 @@ for test in gettestcases():
         RESULT = 'porn' in webpage.lower()
 
     elif METHOD == 'LIST':
-        domain = compat_urllib_parse_urlparse(test['url']).netloc
+        domain = urllib.parse.urlparse(test['url']).netloc
         if not domain:
             print('\nFail: {}'.format(test['name']))
             continue
diff --git a/devscripts/fish-completion.py b/devscripts/fish-completion.py
index d9c0048e2..5d2f68a48 100755
--- a/devscripts/fish-completion.py
+++ b/devscripts/fish-completion.py
@@ -1,10 +1,14 @@
 #!/usr/bin/env python3
-import optparse
+
+# Allow direct execution
 import os
 import sys
 
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import optparse
+
 import yt_dlp
 from yt_dlp.utils import shell_quote
diff --git a/devscripts/generate_aes_testdata.py b/devscripts/generate_aes_testdata.py
index c7d83f1a7..7f3c88bcf 100644
--- a/devscripts/generate_aes_testdata.py
+++ b/devscripts/generate_aes_testdata.py
@@ -1,11 +1,15 @@
 #!/usr/bin/env python3
-import codecs
+
+# Allow direct execution
 import os
-import subprocess
 import sys
 
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import codecs
+import subprocess
+
 from yt_dlp.aes import aes_encrypt, key_expansion
 from yt_dlp.utils import intlist_to_bytes
diff --git a/devscripts/make_contributing.py b/devscripts/make_contributing.py
index 361e17d8c..d74462a3c 100755
--- a/devscripts/make_contributing.py
+++ b/devscripts/make_contributing.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+
 import optparse
 import re
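All of the devscripts above gain the same "# Allow direct execution" preamble: the repository root is pushed onto `sys.path` before `yt_dlp` is imported, so the scripts resolve the in-tree package when run straight from a checkout. The pattern in isolation (a self-contained sketch of the idiom shown in the diffs):

```python
#!/usr/bin/env python3

# Allow direct execution: make this script's parent directory importable
import os
import sys

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import yt_dlp  # resolved from the checkout, not from an installed copy

print(yt_dlp.__file__)
```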
diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py
index 8c481bc2d..785d66a6a 100644
--- a/devscripts/make_lazy_extractors.py
+++ b/devscripts/make_lazy_extractors.py
@@ -1,12 +1,15 @@
 #!/usr/bin/env python3
+
+# Allow direct execution
 import os
-import optparse
 import sys
-from inspect import getsource
 
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import optparse
+from inspect import getsource
+
 NO_ATTR = object()
 STATIC_CLASS_PROPERTIES = ['IE_NAME', 'IE_DESC', 'SEARCH_KEY', '_WORKING', '_NETRC_MACHINE', 'age_limit']
 CLASS_METHODS = [
@@ -53,7 +56,7 @@ def get_all_ies():
     if os.path.exists(PLUGINS_DIRNAME):
         os.rename(PLUGINS_DIRNAME, BLOCKED_DIRNAME)
     try:
-        from yt_dlp.extractor import _ALL_CLASSES
+        from yt_dlp.extractor.extractors import _ALL_CLASSES
     finally:
         if os.path.exists(BLOCKED_DIRNAME):
             os.rename(BLOCKED_DIRNAME, PLUGINS_DIRNAME)
diff --git a/devscripts/make_readme.py b/devscripts/make_readme.py
index fd234bf58..f2e08d7c6 100644
--- a/devscripts/make_readme.py
+++ b/devscripts/make_readme.py
@@ -1,7 +1,12 @@
 #!/usr/bin/env python3
-# yt-dlp --help | make_readme.py
-# This must be run in a console of correct width
+"""
+yt-dlp --help | make_readme.py
+This must be run in a console of correct width
+"""
+
+
+import functools
 import re
 import sys
 
@@ -10,21 +15,60 @@ README_FILE = 'README.md'
 OPTIONS_START = 'General Options:'
 OPTIONS_END = 'CONFIGURATION'
 EPILOG_START = 'See full documentation'
+ALLOWED_OVERSHOOT = 2
+
+DISABLE_PATCH = object()
+
+
+def take_section(text, start=None, end=None, *, shift=0):
+    return text[
+        text.index(start) + shift if start else None:
+        text.index(end) + shift if end else None
+    ]
 
-helptext = sys.stdin.read()
-if isinstance(helptext, bytes):
-    helptext = helptext.decode()
 
+def apply_patch(text, patch):
+    return text if patch[0] is DISABLE_PATCH else re.sub(*patch, text)
 
-start, end = helptext.index(f'\n  {OPTIONS_START}'), helptext.index(f'\n{EPILOG_START}')
-options = re.sub(r'(?m)^  (\w.+)$', r'## \1', helptext[start + 1: end + 1])
+
+options = take_section(sys.stdin.read(), f'\n  {OPTIONS_START}', f'\n{EPILOG_START}', shift=1)
+
+max_width = max(map(len, options.split('\n')))
+switch_col_width = len(re.search(r'(?m)^\s{5,}', options).group())
+delim = f'\n{" " * switch_col_width}'
+
+PATCHES = (
+    (  # Headings
+        r'(?m)^  (\w.+\n)(    (?=\w))?',
+        r'## \1'
+    ),
+    (  # Do not split URLs
+        rf'({delim[:-1]})? (?P<label>\[\S+\] )?(?P<url>https?({delim})?:({delim})?/({delim})?/(({delim})?\S+)+)\s',
+        lambda mobj: ''.join((delim, mobj.group('label') or '', re.sub(r'\s+', '', mobj.group('url')), '\n'))
+    ),
+    (  # Do not split "words"
+        rf'(?m)({delim}\S+)+$',
+        lambda mobj: ''.join((delim, mobj.group(0).replace(delim, '')))
+    ),
+    (  # Allow overshooting last line
+        rf'(?m)^(?P<prev>.+)${delim}(?P<current>.+)$(?!{delim})',
+        lambda mobj: (mobj.group().replace(delim, ' ')
+                      if len(mobj.group()) - len(delim) + 1 <= max_width + ALLOWED_OVERSHOOT
+                      else mobj.group())
+    ),
+    (  # Avoid newline when a space is available b/w switch and description
+        DISABLE_PATCH,  # This creates issues with prepare_manpage
+        r'(?m)^(\s{4}-.{%d})(%s)' % (switch_col_width - 6, delim),
+        r'\1 '
+    ),
+)
 
 with open(README_FILE, encoding='utf-8') as f:
     readme = f.read()
 
-header = readme[:readme.index(f'## {OPTIONS_START}')]
-footer = readme[readme.index(f'# {OPTIONS_END}'):]
-
 with open(README_FILE, 'w', encoding='utf-8') as f:
-    for part in (header, options, footer):
-        f.write(part)
+    f.write(''.join((
+        take_section(readme, end=f'## {OPTIONS_START}'),
+        functools.reduce(apply_patch, PATCHES, options),
+        take_section(readme, f'# {OPTIONS_END}'),
+    )))
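The rewritten `make_readme.py` above threads the options text through `PATCHES` with `functools.reduce(apply_patch, PATCHES, options)`: each `(pattern, repl)` pair is applied as an `re.sub` in order, and a patch whose first element is `DISABLE_PATCH` is skipped. A reduced sketch of that pipeline, with toy patterns rather than the real ones:

```python
import functools
import re

DISABLE_PATCH = object()


def apply_patch(text, patch):
    # As in the diff above: skip disabled patches, otherwise re.sub(pattern, repl, text)
    return text if patch[0] is DISABLE_PATCH else re.sub(*patch, text)


PATCHES = (
    (r'\s+', ' '),               # collapse runs of whitespace
    (r'^ | $', ''),              # trim one leading/trailing space
    (DISABLE_PATCH, r'o', '0'),  # present but skipped, like the last real patch
)

print(functools.reduce(apply_patch, PATCHES, '  hello   world  '))  # -> 'hello world'
```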
diff --git a/devscripts/make_supportedsites.py b/devscripts/make_supportedsites.py
index d8c53c5e1..e46f7af56 100644
--- a/devscripts/make_supportedsites.py
+++ b/devscripts/make_supportedsites.py
@@ -1,10 +1,14 @@
 #!/usr/bin/env python3
-import optparse
+
+# Allow direct execution
 import os
 import sys
 
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import optparse
+
 from yt_dlp.extractor import list_extractor_classes
diff --git a/devscripts/prepare_manpage.py b/devscripts/prepare_manpage.py
index df9abe5ae..d12ff4947 100644
--- a/devscripts/prepare_manpage.py
+++ b/devscripts/prepare_manpage.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+
 import optparse
 import os.path
 import re
@@ -23,7 +24,7 @@ yt\-dlp \- A youtube-dl fork with additional features and patches
 
 def main():
     parser = optparse.OptionParser(usage='%prog OUTFILE.md')
-    options, args = parser.parse_args()
+    _, args = parser.parse_args()
     if len(args) != 1:
         parser.error('Expected an output filename')
diff --git a/devscripts/run_tests.sh b/devscripts/run_tests.sh
index e9904ae35..d496a092b 100755
--- a/devscripts/run_tests.sh
+++ b/devscripts/run_tests.sh
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/usr/bin/env sh
 
 if [ -z $1 ]; then
     test_set='test'
diff --git a/devscripts/zsh-completion.py b/devscripts/zsh-completion.py
index 59faea06a..267af5f6e 100755
--- a/devscripts/zsh-completion.py
+++ b/devscripts/zsh-completion.py
@@ -1,9 +1,12 @@
 #!/usr/bin/env python3
+
+# Allow direct execution
 import os
 import sys
 
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
 import yt_dlp
 
 ZSH_COMPLETION_FILE = "completions/zsh/_yt-dlp"
diff --git a/pyinst.py b/pyinst.py
@@ -1,28 +1,12 @@
 #!/usr/bin/env python3
+
 import os
 import platform
 import sys
 
 from PyInstaller.__main__ import run as run_pyinstaller
 
-OS_NAME = platform.system()
-if OS_NAME == 'Windows':
-    from PyInstaller.utils.win32.versioninfo import (
-        FixedFileInfo,
-        SetVersion,
-        StringFileInfo,
-        StringStruct,
-        StringTable,
-        VarFileInfo,
-        VarStruct,
-        VSVersionInfo,
-    )
-elif OS_NAME == 'Darwin':
-    pass
-else:
-    raise Exception(f'{OS_NAME} is not supported')
-
-ARCH = platform.architecture()[0][:2]
+OS_NAME, ARCH = sys.platform, platform.architecture()[0][:2]
 
 def main():
@@ -33,10 +17,7 @@ def main():
     if not onedir and '-F' not in opts and '--onefile' not in opts:
         opts.append('--onefile')
 
-    name = 'yt-dlp%s' % ('_macos' if OS_NAME == 'Darwin' else '_x86' if ARCH == '32' else '')
-    final_file = ''.join((
-        'dist/', f'{name}/' if onedir else '', name, '.exe' if OS_NAME == 'Windows' else ''))
-
+    name, final_file = exe(onedir)
     print(f'Building yt-dlp v{version} {ARCH}bit for {OS_NAME} with options {opts}')
     print('Remember to update the version using "devscripts/update-version.py"')
     if not os.path.isfile('yt_dlp/extractor/lazy_extractors.py'):
@@ -79,6 +60,21 @@ def read_version(fname):
     return locals()['__version__']
 
+def exe(onedir):
+    """@returns (name, path)"""
+    name = '_'.join(filter(None, (
+        'yt-dlp',
+        {'win32': '', 'darwin': 'macos'}.get(OS_NAME, OS_NAME),
+        ARCH == '32' and 'x86'
+    )))
+    return name, ''.join(filter(None, (
+        'dist/',
+        onedir and f'{name}/',
+        name,
+        OS_NAME == 'win32' and '.exe'
+    )))
+
+
 def version_to_list(version):
     version_list = version.split('.')
     return list(map(int, version_list)) + [0] * (4 - len(version_list))
@@ -109,11 +105,22 @@ def pycryptodome_module():
 
 def set_version_info(exe, version):
-    if OS_NAME == 'Windows':
+    if OS_NAME == 'win32':
         windows_set_version(exe, version)
 
 def windows_set_version(exe, version):
+    from PyInstaller.utils.win32.versioninfo import (
+        FixedFileInfo,
+        SetVersion,
+        StringFileInfo,
+        StringStruct,
+        StringTable,
+        VarFileInfo,
+        VarStruct,
+        VSVersionInfo,
+    )
+
     version_list = version_to_list(version)
     suffix = '_x86' if ARCH == '32' else ''
     SetVersion(exe, VSVersionInfo(
diff --git a/pytest.ini b/pytest.ini
deleted file mode 100644
index 52feb4aba..000000000
--- a/pytest.ini
+++ /dev/null
@@ -1,4 +0,0 @@
-[pytest]
-addopts = -ra -v --strict-markers
-markers =
-    download
diff --git a/setup.cfg b/setup.cfg
@@ -1,6 +1,41 @@
 [wheel]
-universal = True
+universal = true
+
 [flake8]
-exclude = devscripts/lazy_load_template.py,devscripts/make_issue_template.py,setup.py,build,.git,venv
+exclude = build,venv,.tox,.git,.pytest_cache
 ignore = E402,E501,E731,E741,W503
+max_line_length = 120
+per_file_ignores =
+    devscripts/lazy_load_template.py: F401
+
+
+[tool:pytest]
+addopts = -ra -v --strict-markers
+markers =
+    download
+
+
+[tox:tox]
+skipsdist = true
+envlist = py{36,37,38,39,310},pypy{36,37,38,39}
+skip_missing_interpreters = true
+
+[testenv]  # tox
+deps =
+    pytest
+commands = pytest {posargs:"-m not download"}
+passenv = HOME  # For test_compat_expanduser
+setenv =
+    # PYTHONWARNINGS = error  # Catches PIP's warnings too
+
+
+[isort]
+py_version = 36
+multi_line_output = VERTICAL_HANGING_INDENT
+line_length = 80
+reverse_relative = true
+ensure_newline_before_comments = true
+include_trailing_comma = true
+known_first_party =
+    test
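Tracing the new `exe()` helper above by hand shows how both the artifact name and its path now derive from `sys.platform` and `ARCH`. These values are illustrative, worked out from the code rather than stated in the patch:

```python
# name, path = exe(onedir), following the logic of the function above:
#   win32,  64-bit, onedir=False -> ('yt-dlp',        'dist/yt-dlp.exe')
#   win32,  32-bit, onedir=False -> ('yt-dlp_x86',    'dist/yt-dlp_x86.exe')
#   darwin, 64-bit, onedir=True  -> ('yt-dlp_macos',  'dist/yt-dlp_macos/yt-dlp_macos')
#   linux,  64-bit, onedir=False -> ('yt-dlp_linux',  'dist/yt-dlp_linux')
# Falsy parts (empty string / False) are dropped by filter(None, ...), which is
# why plain 'yt-dlp' comes out for 64-bit Windows; the linux name matches the
# "[build] Add Linux standalone builds" changelog entry above.
```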
diff --git a/setup.py b/setup.py
@@ -27,7 +27,7 @@ REQUIREMENTS = ['mutagen', 'pycryptodome', 'websockets']
 
 if sys.argv[1:2] == ['py2exe']:
-    import py2exe
+    import py2exe  # noqa: F401
     warnings.warn(
         'py2exe builds do not support pycryptodomex and needs VC++14 to run. '
         'The recommended way is to use "pyinst.py" to build using pyinstaller')
@@ -124,6 +124,9 @@ setup(
         'Programming Language :: Python :: 3.6',
         'Programming Language :: Python :: 3.7',
         'Programming Language :: Python :: 3.8',
+        'Programming Language :: Python :: 3.9',
+        'Programming Language :: Python :: 3.10',
+        'Programming Language :: Python :: 3.11',
         'Programming Language :: Python :: Implementation',
         'Programming Language :: Python :: Implementation :: CPython',
         'Programming Language :: Python :: Implementation :: PyPy',
diff --git a/supportedsites.md b/supportedsites.md
index 7663c09d4..7a91358d5 100644
--- a/supportedsites.md
+++ b/supportedsites.md
@@ -1,4 +1,6 @@
 # Supported sites
+ - **0000studio:archive**
+ - **0000studio:clip**
  - **17live**
  - **17live:clip**
  - **1tv**: Первый канал
@@ -60,8 +62,6 @@
  - **AmHistoryChannel**
  - **anderetijden**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl
  - **AnimalPlanet**
- - **AnimeLab**: [<abbr title="netrc machine"><em>animelab</em></abbr>]
- - **AnimeLabShows**: [<abbr title="netrc machine"><em>animelab</em></abbr>]
  - **AnimeOnDemand**: [<abbr title="netrc machine"><em>animeondemand</em></abbr>]
  - **ant1newsgr:article**: ant1news.gr articles
  - **ant1newsgr:embed**: ant1news.gr embedded videos
@@ -89,6 +89,7 @@
  - **AsianCrush**
  - **AsianCrushPlaylist**
  - **AtresPlayer**: [<abbr title="netrc machine"><em>atresplayer</em></abbr>]
+ - **AtScaleConfEvent**
  - **ATTTechChannel**
  - **ATVAt**
  - **AudiMedia**
@@ -276,6 +277,8 @@
  - **dailymotion**: [<abbr title="netrc machine"><em>dailymotion</em></abbr>]
  - **dailymotion:playlist**: [<abbr title="netrc machine"><em>dailymotion</em></abbr>]
  - **dailymotion:user**: [<abbr title="netrc machine"><em>dailymotion</em></abbr>]
+ - **DailyWire**
+ - **DailyWirePodcast**
  - **damtomo:record**
  - **damtomo:video**
  - **daum.net**
@@ -322,8 +325,8 @@
  - **drtv**
  - **drtv:live**
  - **DTube**
- - **duboku**: www.duboku.co
- - **duboku:list**: www.duboku.co entire series
+ - **duboku**: www.duboku.io
+ - **duboku:list**: www.duboku.io entire series
  - **Dumpert**
  - **dvtv**: http://video.aktualne.cz/
  - **dw**
@@ -376,6 +379,7 @@
  - **fc2:embed**
  - **fc2:live**
  - **Fczenit**
+ - **Fifa**
  - **Filmmodu**
  - **filmon**
  - **filmon:channel**
@@ -402,6 +406,8 @@
  - **FranceTVSite**
  - **Freesound**
  - **freespeech.org**
+ - **freetv:series**
+ - **FreeTvMovies**
  - **FrontendMasters**: [<abbr title="netrc machine"><em>frontendmasters</em></abbr>]
  - **FrontendMastersCourse**: [<abbr title="netrc machine"><em>frontendmasters</em></abbr>]
  - **FrontendMastersLesson**: [<abbr title="netrc machine"><em>frontendmasters</em></abbr>]
@@ -447,14 +453,18 @@
  - **GodTube**
  - **Gofile**
  - **Golem**
+ - **goodgame:stream**
  - **google:podcasts**
  - **google:podcasts:feed**
  - **GoogleDrive**
+ - **GoogleDrive:Folder**
  - **GoPro**
  - **Goshgay**
  - **GoToStage**
  - **GPUTechConf**
  - **Gronkh**
+ - **gronkh:feed**
+ - **gronkh:vods**
  - **Groupon**
  - **hbo**
  - **HearThisAt**
@@ -492,6 +502,7 @@
  - **HungamaSong**
  - **huya:live**: huya.com
  - **Hypem**
+ - **Icareus**
  - **ign.com**
  - **IGNArticle**
  - **IGNVideo**
@@ -528,6 +539,9 @@
  - **ivi:compilation**: ivi.ru compilations
  - **ivideon**: Ivideon TV
  - **Iwara**
+ - **iwara:playlist**
+ - **iwara:user**
+ - **Ixigua**
  - **Izlesene**
  - **Jable**
  - **JablePlaylist**
@@ -547,12 +561,14 @@
  - **Ketnet**
  - **khanacademy**
  - **khanacademy:unit**
+ - **Kicker**
  - **KickStarter**
  - **KinjaEmbed**
  - **KinoPoisk**
  - **KonserthusetPlay**
  - **Koo**
  - **KrasView**: Красвью
+ - **KTH**
  - **Ku6**
  - **KUSI**
  - **kuwo:album**: 酷我音乐 - 专辑
@@ -587,6 +603,8 @@
  - **Libsyn**
  - **life**: Life.ru
  - **life:embed**
+ - **likee**
+ - **likee:user**
  - **limelight**
  - **limelight:channel**
  - **limelight:channel_list**
@@ -605,7 +623,8 @@
  - **loc**: Library of Congress
  - **LocalNews8**
  - **LoveHomePorn**
- - **lrt.lt**
+ - **LRTStream**
+ - **LRTVOD**
  - **lynda**: [<abbr title="netrc machine"><em>lynda</em></abbr>] lynda.com videos
  - **lynda:course**: [<abbr title="netrc machine"><em>lynda</em></abbr>] lynda.com online courses
  - **m6**
@@ -626,6 +645,7 @@
  - **Markiza**
  - **MarkizaPage**
  - **massengeschmack.tv**
+ - **Masters**
  - **MatchTV**
  - **MDR**: MDR.DE and KiKA
  - **MedalTV**
@@ -664,6 +684,7 @@
  - **miomio.tv**
  - **mirrativ**
  - **mirrativ:user**
+ - **MirrorCoUK**
  - **MiTele**: mitele.es
  - **mixch**
  - **mixch:archive**
@@ -729,6 +750,7 @@
  - **NationalGeographicTV**
  - **Naver**
  - **Naver:live**
+ - **navernow**
  - **NBA**
  - **nba:watch**
  - **nba:watch:collection**
@@ -747,7 +769,8 @@
  - **ndr:embed:base**
  - **NDTV**
  - **Nebula**: [<abbr title="netrc machine"><em>watchnebula</em></abbr>]
- - **nebula:collection**: [<abbr title="netrc machine"><em>watchnebula</em></abbr>]
+ - **nebula:channel**: [<abbr title="netrc machine"><em>watchnebula</em></abbr>]
+ - **nebula:subscriptions**: [<abbr title="netrc machine"><em>watchnebula</em></abbr>]
  - **NerdCubedFeed**
  - **netease:album**: 网易云音乐 - 专辑
  - **netease:djradio**: 网易云音乐 - 电台
@@ -757,6 +780,8 @@
  - **netease:singer**: 网易云音乐 - 歌手
  - **netease:song**: 网易云音乐
  - **NetPlus**: [<abbr title="netrc machine"><em>netplus</em></abbr>]
+ - **Netverse**
+ - **NetversePlaylist**
  - **Netzkino**
  - **Newgrounds**
  - **Newgrounds:playlist**
@@ -920,6 +945,7 @@
  - **PlayPlusTV**: [<abbr title="netrc machine"><em>playplustv</em></abbr>]
  - **PlayStuff**
  - **PlaysTV**
+ - **PlaySuisse**
  - **Playtvak**: Playtvak.cz, iDNES.cz and Lidovky.cz
  - **Playvid**
  - **PlayVids**
@@ -927,9 +953,9 @@
  - **pluralsight**: [<abbr title="netrc machine"><em>pluralsight</em></abbr>]
  - **pluralsight:course**
  - **PlutoTV**
+ - **Podchaser**
  - **podomatic**
  - **Pokemon**
- - **PokemonSoundLibrary**
  - **PokemonWatch**
  - **PokerGo**: [<abbr title="netrc machine"><em>pokergo</em></abbr>]
  - **PokerGoCollection**: [<abbr title="netrc machine"><em>pokergo</em></abbr>]
@@ -1026,9 +1052,10 @@
  - **RICE**
  - **RMCDecouverte**
  - **RockstarGames**
- - **Rokfin**
- - **rokfin:channel**
- - **rokfin:stack**
+ - **Rokfin**: [<abbr title="netrc machine"><em>rokfin</em></abbr>]
+ - **rokfin:channel**: Rokfin Channels
+ - **rokfin:search**: Rokfin Search; "rkfnsearch:" prefix
+ - **rokfin:stack**: Rokfin Stacks
  - **RoosterTeeth**: [<abbr title="netrc machine"><em>roosterteeth</em></abbr>]
  - **RoosterTeethSeries**: [<abbr title="netrc machine"><em>roosterteeth</em></abbr>]
  - **RottenTomatoes**
@@ -1136,6 +1163,7 @@
  - **southpark.cc.com**
  - **southpark.cc.com:español**
  - **southpark.de**
+ - **southpark.lat**
  - **southpark.nl**
  - **southparkstudios.dk**
  - **SovietsCloset**
@@ -1175,6 +1203,7 @@
  - **StretchInternet**
  - **Stripchat**
  - **stv:player**
+ - **Substack**
  - **SunPorno**
  - **sverigesradio:episode**
  - **sverigesradio:publication**
@@ -1370,8 +1399,6 @@
  - **video.google:search**: Google Video search; "gvsearch:" prefix
  - **video.sky.it**
  - **video.sky.it:live**
- - **VideocampusSachsen**
- - **VideocampusSachsenEmbed**
  - **VideoDetective**
  - **videofy.me**
  - **videomore**
@@ -1400,6 +1427,7 @@
  - **vimeo:watchlater**: [<abbr title="netrc machine"><em>vimeo</em></abbr>] Vimeo watch later list, ":vimeowatchlater" keyword (requires authentication)
  - **Vimm:recording**
  - **Vimm:stream**
+ - **Vimp**
  - **Vimple**: Vimple - one-click video hosting
  - **Vine**
  - **vine:user**
@@ -1450,6 +1478,7 @@
  - **washingtonpost:article**
  - **wat.tv**
  - **WatchBox**
+ - **WatchESPN**
  - **WatchIndianPorn**: Watch Indian Porn
  - **WDR**
  - **wdr:mobile**: (**Currently broken**)
@@ -1522,14 +1551,17 @@
  - **YourPorn**
  - **YourUpload**
  - **youtube**: YouTube
+ - **youtube:clip**
  - **youtube:favorites**: YouTube liked videos; ":ytfav" keyword (requires cookies)
  - **youtube:history**: Youtube watch history; ":ythis" keyword (requires cookies)
  - **youtube:music:search_url**: YouTube music search URLs with selectable sections (Eg: #songs)
+ - **youtube:notif**: YouTube notifications; ":ytnotif" keyword (requires cookies)
  - **youtube:playlist**: YouTube playlists
  - **youtube:recommended**: YouTube recommended videos; ":ytrec" keyword
  - **youtube:search**: YouTube search; "ytsearch:" prefix
  - **youtube:search:date**: YouTube search, newest videos first; "ytsearchdate:" prefix
  - **youtube:search_url**: YouTube search URLs with sorting and filter support
+ - **youtube:stories**: YouTube channel stories; "ytstories:" prefix
  - **youtube:subscriptions**: YouTube subscriptions feed; ":ytsubs" keyword (requires cookies)
  - **youtube:tab**: YouTube Tabs
  - **youtube:user**: YouTube user videos; "ytuser:" prefix
@@ -1550,6 +1582,10 @@
  - **Zhihu**
  - **zingmp3**: zingmp3.vn
  - **zingmp3:album**
+ - **zingmp3:chart-home**
+ - **zingmp3:chart-music-video**
+ - **zingmp3:user**
+ - **zingmp3:week-chart**
  - **zoom**
  - **Zype**
  - **generic**: Generic downloader that works on some sites
diff --git a/test/helper.py b/test/helper.py
index 2333ace98..f19e1a34f 100644
--- a/test/helper.py
+++ b/test/helper.py
@@ -9,7 +9,7 @@ import types
 
 import yt_dlp.extractor
 from yt_dlp import YoutubeDL
-from yt_dlp.compat import compat_os_name, compat_str
+from yt_dlp.compat import compat_os_name
 from yt_dlp.utils import preferredencoding, write_string
 
 if 'pytest' in sys.modules:
@@ -44,7 +44,7 @@ def try_rm(filename):
         raise
 
-def report_warning(message):
+def report_warning(message, *args, **kwargs):
     '''
     Print the message to stderr, it will be prefixed with 'WARNING:'
     If stderr is a tty file the 'WARNING:' will be colored
@@ -67,10 +67,10 @@ class FakeYDL(YoutubeDL):
         super().__init__(params, auto_init=False)
         self.result = []
 
-    def to_screen(self, s, skip_eol=None):
+    def to_screen(self, s, *args, **kwargs):
         print(s)
 
-    def trouble(self, s, tb=None):
+    def trouble(self, s, *args, **kwargs):
         raise Exception(s)
 
     def download(self, x):
@@ -80,10 +80,10 @@ class FakeYDL(YoutubeDL):
         # Silence an expected warning matching a regex
         old_report_warning = self.report_warning
 
-        def report_warning(self, message):
+        def report_warning(self, message, *args, **kwargs):
             if re.match(regex, message):
                 return
-            old_report_warning(message)
+            old_report_warning(message, *args, **kwargs)
         self.report_warning = types.MethodType(report_warning, self)
 
@@ -96,29 +96,29 @@ md5 = lambda s: hashlib.md5(s.encode()).hexdigest()
 
 def expect_value(self, got, expected, field):
-    if isinstance(expected, compat_str) and expected.startswith('re:'):
+    if isinstance(expected, str) and expected.startswith('re:'):
         match_str = expected[len('re:'):]
         match_rex = re.compile(match_str)
 
         self.assertTrue(
-            isinstance(got, compat_str),
-            f'Expected a {compat_str.__name__} object, but got {type(got).__name__} for field {field}')
+            isinstance(got, str),
+            f'Expected a {str.__name__} object, but got {type(got).__name__} for field {field}')
         self.assertTrue(
             match_rex.match(got),
             f'field {field} (value: {got!r}) should match {match_str!r}')
-    elif isinstance(expected, compat_str) and expected.startswith('startswith:'):
+    elif isinstance(expected, str) and expected.startswith('startswith:'):
         start_str = expected[len('startswith:'):]
         self.assertTrue(
-            isinstance(got, compat_str),
-            f'Expected a {compat_str.__name__} object, but got {type(got).__name__} for field {field}')
+            isinstance(got, str),
+            f'Expected a {str.__name__} object, but got {type(got).__name__} for field {field}')
         self.assertTrue(
             got.startswith(start_str),
             f'field {field} (value: {got!r}) should start with {start_str!r}')
-    elif isinstance(expected, compat_str) and expected.startswith('contains:'):
+    elif isinstance(expected, str) and expected.startswith('contains:'):
         contains_str = expected[len('contains:'):]
         self.assertTrue(
-            isinstance(got, compat_str),
-            f'Expected a {compat_str.__name__} object, but got {type(got).__name__} for field {field}')
+            isinstance(got, str),
+            f'Expected a {str.__name__} object, but got {type(got).__name__} for field {field}')
         self.assertTrue(
             contains_str in got,
             f'field {field} (value: {got!r}) should contain {contains_str!r}')
@@ -142,12 +142,12 @@ def expect_value(self, got, expected, field):
                 index, field, type_expected, type_got))
             expect_value(self, item_got, item_expected, field)
     else:
-        if isinstance(expected, compat_str) and expected.startswith('md5:'):
+        if isinstance(expected, str) and expected.startswith('md5:'):
             self.assertTrue(
-                isinstance(got, compat_str),
+                isinstance(got, str),
                 f'Expected field {field} to be a unicode object, but got value {got!r} of type {type(got)!r}')
             got = 'md5:' + md5(got)
-        elif isinstance(expected, compat_str) and re.match(r'^(?:min|max)?count:\d+', expected):
+        elif isinstance(expected, str) and re.match(r'^(?:min|max)?count:\d+', expected):
             self.assertTrue(
                 isinstance(got, (list, dict)),
                 f'Expected field {field} to be a list or a dict, but it is of type {type(got).__name__}')
@@ -236,7 +236,7 @@ def expect_info_dict(self, got_dict, expected_dict):
     missing_keys = set(test_info_dict.keys()) - set(expected_dict.keys())
     if missing_keys:
         def _repr(v):
-            if isinstance(v, compat_str):
+            if isinstance(v, str):
                 return "'%s'" % v.replace('\\', '\\\\').replace("'", "\\'").replace('\n', '\\n')
             elif isinstance(v, type):
                 return v.__name__
@@ -301,9 +301,9 @@ def assertEqual(self, got, expected, msg=None):
 def expect_warnings(ydl, warnings_re):
     real_warning = ydl.report_warning
 
-    def _report_warning(w):
+    def _report_warning(w, *args, **kwargs):
         if not any(re.search(w_re, w) for w_re in warnings_re):
-            real_warning(w)
+            real_warning(w, *args, **kwargs)
 
     ydl.report_warning = _report_warning
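As the `expect_value` changes above show, test `info_dict` expectations support string prefixes that change how a field is checked; only the `compat_str` to `str` migration is new in this patch. A quick reference, with hypothetical field values for illustration:

```python
# How expect_value() interprets string expectations, per the code above:
#   're:^Episode \\d+$'   -> value must match the regular expression
#   'startswith:http'     -> value must start with the given prefix
#   'contains:teapot'     -> value must contain the substring
#   'md5:<hexdigest>'     -> md5 of the value must equal the digest
#   'count:3'             -> list/dict value must have exactly 3 items
#   'mincount:2', 'maxcount:5' -> bounds instead of an exact count, as the names suggest
```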
diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py
index 257ea7dd3..f57a29ffc 100644
--- a/test/test_InfoExtractor.py
+++ b/test/test_InfoExtractor.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+
 # Allow direct execution
 import os
 import sys
@@ -6,10 +7,12 @@ import unittest
 
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
+
+import http.server
 import threading
 
-from test.helper import FakeYDL, expect_dict, expect_value, http_server_port
-from yt_dlp.compat import compat_etree_fromstring, compat_http_server
+from test.helper import FakeYDL, expect_dict, expect_value, http_server_port
+from yt_dlp.compat import compat_etree_fromstring
 from yt_dlp.extractor import YoutubeIE, get_info_extractor
 from yt_dlp.extractor.common import InfoExtractor
 from yt_dlp.utils import (
@@ -23,7 +26,7 @@ TEAPOT_RESPONSE_STATUS = 418
 TEAPOT_RESPONSE_BODY = "<h1>418 I'm a teapot</h1>"
 
-class InfoExtractorTestRequestHandler(compat_http_server.BaseHTTPRequestHandler):
+class InfoExtractorTestRequestHandler(http.server.BaseHTTPRequestHandler):
     def log_message(self, format, *args):
         pass
@@ -502,6 +505,24 @@ class TestInfoExtractor(unittest.TestCase):
             }],
         })
 
+        # from https://0000.studio/
+        # with type attribute but without extension in URL
+        expect_dict(
+            self,
+            self.ie._parse_html5_media_entries(
+                'https://0000.studio',
+                r'''
+                <video src="https://d1ggyt9m8pwf3g.cloudfront.net/protected/ap-northeast-1:1864af40-28d5-492b-b739-b32314b1a527/archive/clip/838db6a7-8973-4cd6-840d-8517e4093c92"
+                controls="controls" type="video/mp4" preload="metadata" autoplay="autoplay" playsinline class="object-contain">
+                </video>
+                ''', None)[0],
+            {
+                'formats': [{
+                    'url': 'https://d1ggyt9m8pwf3g.cloudfront.net/protected/ap-northeast-1:1864af40-28d5-492b-b739-b32314b1a527/archive/clip/838db6a7-8973-4cd6-840d-8517e4093c92',
+                    'ext': 'mp4',
+                }],
+            })
+
     def test_extract_jwplayer_data_realworld(self):
         # from http://www.suffolk.edu/sjc/
         expect_dict(
@@ -1637,7 +1658,7 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
         # or the underlying `_download_webpage_handle` returning no content
         # when a response matches `expected_status`.
 
-        httpd = compat_http_server.HTTPServer(
+        httpd = http.server.HTTPServer(
             ('127.0.0.1', 0), InfoExtractorTestRequestHandler)
         port = http_server_port(httpd)
         server_thread = threading.Thread(target=httpd.serve_forever)
diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py
index 1133f6165..1eb3abc17 100644
--- a/test/test_YoutubeDL.py
+++ b/test/test_YoutubeDL.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+
 # Allow direct execution
 import os
 import sys
@@ -6,23 +7,21 @@ import unittest
 
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
+
 import copy
 import json
 
-from test.helper import FakeYDL, assertRegexpMatches
+import urllib.error
 
+from test.helper import FakeYDL, assertRegexpMatches
 from yt_dlp import YoutubeDL
-from yt_dlp.compat import (
-    compat_os_name,
-    compat_setenv,
-    compat_str,
-    compat_urllib_error,
-)
+from yt_dlp.compat import compat_os_name
 from yt_dlp.extractor import YoutubeIE
 from yt_dlp.extractor.common import InfoExtractor
 from yt_dlp.postprocessor.common import PostProcessor
 from yt_dlp.utils import (
     ExtractorError,
     LazyList,
+    OnDemandPagedList,
     int_or_none,
     match_filter_func,
 )
@@ -39,7 +38,7 @@ class YDL(FakeYDL):
     def process_info(self, info_dict):
         self.downloaded_info_dicts.append(info_dict.copy())
 
-    def to_screen(self, msg):
+    def to_screen(self, msg, *args, **kwargs):
         self.msgs.append(msg)
 
     def dl(self, *args, **kwargs):
@@ -840,14 +839,14 @@ class TestYoutubeDL(unittest.TestCase):
         # test('%(foo|)s', ('', '_'))  # fixme
 
         # Environment variable expansion for prepare_filename
-        compat_setenv('__yt_dlp_var', 'expanded')
+        os.environ['__yt_dlp_var'] = 'expanded'
         envvar = '%__yt_dlp_var%' if compat_os_name == 'nt' else '$__yt_dlp_var'
         test(envvar, (envvar, 'expanded'))
         if compat_os_name == 'nt':
             test('%s%', ('%s%', '%s%'))
-            compat_setenv('s', 'expanded')
+            os.environ['s'] = 'expanded'
             test('%s%', ('%s%', 'expanded'))  # %s% should be expanded before escaping %s
-            compat_setenv('(test)s', 'expanded')
+            os.environ['(test)s'] = 'expanded'
             test('%(test)s%', ('NA%', 'expanded'))  # Environment should take priority over template
 
         # Path expansion and escaping
@@ -989,41 +988,79 @@ class TestYoutubeDL(unittest.TestCase):
         self.assertEqual(res, [])
 
     def test_playlist_items_selection(self):
-        entries = [{
-            'id': compat_str(i),
-            'title': compat_str(i),
-            'url': TEST_URL,
-        } for i in range(1, 5)]
-        playlist = {
-            '_type': 'playlist',
-            'id': 'test',
-            'entries': entries,
-            'extractor': 'test:playlist',
-            'extractor_key': 'test:playlist',
-            'webpage_url': 'http://example.com',
-        }
+        INDICES, PAGE_SIZE = list(range(1, 11)), 3
+
+        def entry(i, evaluated):
+            evaluated.append(i)
+            return {
+                'id': str(i),
+                'title': str(i),
+                'url': TEST_URL,
+            }
 
-        def get_downloaded_info_dicts(params):
+        def pagedlist_entries(evaluated):
+            def page_func(n):
+                start = PAGE_SIZE * n
+                for i in INDICES[start: start + PAGE_SIZE]:
+                    yield entry(i, evaluated)
+            return OnDemandPagedList(page_func, PAGE_SIZE)
+
+        def page_num(i):
+            return (i + PAGE_SIZE - 1) // PAGE_SIZE
+
+        def generator_entries(evaluated):
+            for i in INDICES:
+                yield entry(i, evaluated)
+
+        def list_entries(evaluated):
+            return list(generator_entries(evaluated))
+
+        def lazylist_entries(evaluated):
+            return LazyList(generator_entries(evaluated))
+
+        def get_downloaded_info_dicts(params, entries):
             ydl = YDL(params)
-            # make a deep copy because the dictionary and nested entries
-            # can be modified
-            ydl.process_ie_result(copy.deepcopy(playlist))
+            ydl.process_ie_result({
+                '_type': 'playlist',
+                'id': 'test',
+                'extractor': 'test:playlist',
+                'extractor_key': 'test:playlist',
+                'webpage_url': 'http://example.com',
+                'entries': entries,
+            })
             return ydl.downloaded_info_dicts
 
-        def test_selection(params, expected_ids):
-            results = [
-                (v['playlist_autonumber'] - 1, (int(v['id']), v['playlist_index']))
-                for v in get_downloaded_info_dicts(params)]
-            self.assertEqual(results, list(enumerate(zip(expected_ids, expected_ids))))
-
-        test_selection({}, [1, 2, 3, 4])
-        test_selection({'playlistend': 10}, [1, 2, 3, 4])
-        test_selection({'playlistend': 2}, [1, 2])
-        test_selection({'playliststart': 10}, [])
-        test_selection({'playliststart': 2}, [2, 3, 4])
-        test_selection({'playlist_items': '2-4'}, [2, 3, 4])
+        def test_selection(params, expected_ids, evaluate_all=False):
+            expected_ids = list(expected_ids)
+            if evaluate_all:
+                generator_eval = pagedlist_eval = INDICES
+            elif not expected_ids:
+                generator_eval = pagedlist_eval = []
+            else:
+                generator_eval = INDICES[0: max(expected_ids)]
+                pagedlist_eval = INDICES[PAGE_SIZE * page_num(min(expected_ids)) - PAGE_SIZE:
+                                         PAGE_SIZE * page_num(max(expected_ids))]
+
+            for name, func, expected_eval in (
+                ('list', list_entries, INDICES),
+                ('Generator', generator_entries, generator_eval),
+                # ('LazyList', lazylist_entries, generator_eval),  # Generator and LazyList follow the exact same code path
+                ('PagedList', pagedlist_entries, pagedlist_eval),
+            ):
+                evaluated = []
+                entries = func(evaluated)
+                results = [(v['playlist_autonumber'] - 1, (int(v['id']), v['playlist_index']))
+                           for v in get_downloaded_info_dicts(params, entries)]
+                self.assertEqual(results, list(enumerate(zip(expected_ids, expected_ids))), f'Entries of {name} for {params}')
+                self.assertEqual(sorted(evaluated), expected_eval, f'Evaluation of {name} for {params}')
+
+        test_selection({}, INDICES)
+        test_selection({'playlistend': 20}, INDICES, True)
+        test_selection({'playlistend': 2}, INDICES[:2])
+        test_selection({'playliststart': 11}, [], True)
+        test_selection({'playliststart': 2}, INDICES[1:])
+        test_selection({'playlist_items': '2-4'}, INDICES[1:4])
         test_selection({'playlist_items': '2,4'}, [2, 4])
-        test_selection({'playlist_items': '10'}, [])
+        test_selection({'playlist_items': '20'}, [], True)
         test_selection({'playlist_items': '0'}, [])
 
         # Tests for https://github.com/ytdl-org/youtube-dl/issues/10591
@@ -1032,15 +1069,37 @@ class TestYoutubeDL(unittest.TestCase):
 
         # Tests for https://github.com/yt-dlp/yt-dlp/issues/720
        # https://github.com/yt-dlp/yt-dlp/issues/302
-        test_selection({'playlistreverse': True}, [4, 3, 2, 1])
-        test_selection({'playliststart': 2, 'playlistreverse': True}, [4, 3, 2])
+        test_selection({'playlistreverse': True}, INDICES[::-1])
+        test_selection({'playliststart': 2, 'playlistreverse': True}, INDICES[:0:-1])
         test_selection({'playlist_items': '2,4', 'playlistreverse': True}, [4, 2])
         test_selection({'playlist_items': '4,2'}, [4, 2])
 
+        # Tests for --playlist-items start:end:step
+        test_selection({'playlist_items': ':'}, INDICES, True)
+        test_selection({'playlist_items': '::1'}, INDICES, True)
+        test_selection({'playlist_items': '::-1'}, INDICES[::-1], True)
+        test_selection({'playlist_items': ':6'}, INDICES[:6])
+        test_selection({'playlist_items': ':-6'}, INDICES[:-5], True)
+        test_selection({'playlist_items': '-1:6:-2'}, INDICES[:4:-2], True)
+        test_selection({'playlist_items': '9:-6:-2'}, INDICES[8:3:-2], True)
+
+        test_selection({'playlist_items': '1:inf:2'}, INDICES[::2], True)
+        test_selection({'playlist_items': '-2:inf'}, INDICES[-2:], True)
+        test_selection({'playlist_items': ':inf:-1'}, [], True)
+        test_selection({'playlist_items': '0-2:2'}, [2])
+        test_selection({'playlist_items': '1-:2'}, INDICES[::2], True)
+        test_selection({'playlist_items': '0--2:2'}, INDICES[1:-1:2], True)
+
+        test_selection({'playlist_items': '10::3'}, [10], True)
+        test_selection({'playlist_items': '-1::3'}, [10], True)
+        test_selection({'playlist_items': '11::3'}, [], True)
+        test_selection({'playlist_items': '-15::2'}, INDICES[1::2], True)
+        test_selection({'playlist_items': '-15::15'}, [], True)
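The point of the `evaluated` bookkeeping in the rewritten test above: with the new slicing, `YoutubeDL` only realizes the playlist entries it actually needs. A plain list is always fully evaluated; a generator only up to the largest requested index; an `OnDemandPagedList` only the pages spanning the requested indices, which is what `pagedlist_eval` computes. Constructing such a paged source in isolation, mirroring the test (a sketch, not part of the diff):

```python
from yt_dlp.utils import OnDemandPagedList

INDICES, PAGE_SIZE = list(range(1, 11)), 3
evaluated = []


def page_func(n):  # called only when page n is actually needed
    for i in INDICES[PAGE_SIZE * n: PAGE_SIZE * (n + 1)]:
        evaluated.append(i)
        yield {'id': str(i), 'title': str(i)}


entries = OnDemandPagedList(page_func, PAGE_SIZE)
# Feeding `entries` to YoutubeDL with {'playlist_items': '2-4'} should touch
# only pages 0 and 1, leaving evaluated == [1, 2, 3, 4, 5, 6], exactly the
# `pagedlist_eval` window asserted in test_selection above.
```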
     def test_urlopen_no_file_protocol(self):
         # see https://github.com/ytdl-org/youtube-dl/issues/8227
         ydl = YDL()
-        self.assertRaises(compat_urllib_error.URLError, ydl.urlopen, 'file:///etc/passwd')
+        self.assertRaises(urllib.error.URLError, ydl.urlopen, 'file:///etc/passwd')
 
     def test_do_not_override_ie_key_in_url_transparent(self):
         ydl = YDL()
@@ -1126,7 +1185,7 @@ class TestYoutubeDL(unittest.TestCase):
 
     def _entries(self):
         for n in range(3):
-            video_id = compat_str(n)
+            video_id = str(n)
             yield {
                 '_type': 'url_transparent',
                 'ie_key': VideoIE.ie_key(),
diff --git a/test/test_YoutubeDLCookieJar.py b/test/test_YoutubeDLCookieJar.py
index 6280e1f2c..0d4e7dc97 100644
--- a/test/test_YoutubeDLCookieJar.py
+++ b/test/test_YoutubeDLCookieJar.py
@@ -1,12 +1,16 @@
 #!/usr/bin/env python3
+
+# Allow direct execution
 import os
-import re
 import sys
-import tempfile
 import unittest
 
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
+
+import re
+import tempfile
+
 from yt_dlp.utils import YoutubeDLCookieJar
diff --git a/test/test_aes.py b/test/test_aes.py
index 2b7b7cf54..037246588 100644
--- a/test/test_aes.py
+++ b/test/test_aes.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+
 # Allow direct execution
 import os
 import sys
@@ -6,6 +7,7 @@ import unittest
 
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
+
 import base64
 
 from yt_dlp.aes import (
import os import sys @@ -6,8 +7,8 @@ import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import is_download_test, try_rm +from test.helper import is_download_test, try_rm from yt_dlp import YoutubeDL diff --git a/test/test_all_urls.py b/test/test_all_urls.py index b6019554e..848c96ff0 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 + # Allow direct execution -import collections import os import sys import unittest @@ -8,8 +8,9 @@ import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import gettestcases +import collections +from test.helper import gettestcases from yt_dlp.extractor import FacebookIE, YoutubeIE, gen_extractors diff --git a/test/test_cache.py b/test/test_cache.py index 14e54ba20..ce1624b68 100644 --- a/test/test_cache.py +++ b/test/test_cache.py @@ -1,15 +1,16 @@ #!/usr/bin/env python3 + # Allow direct execution import os -import shutil import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import FakeYDL +import shutil +from test.helper import FakeYDL from yt_dlp.cache import Cache diff --git a/test/test_compat.py b/test/test_compat.py index 224175c65..c6a8f4ecb 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 + # Allow direct execution import os import sys @@ -7,16 +8,14 @@ import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import struct +import urllib.parse + from yt_dlp import compat from yt_dlp.compat import ( compat_etree_fromstring, compat_expanduser, - compat_getenv, - compat_setenv, - compat_str, - compat_struct_unpack, compat_urllib_parse_unquote, - compat_urllib_parse_unquote_plus, compat_urllib_parse_urlencode, ) @@ -26,28 +25,19 @@ class TestCompat(unittest.TestCase): with self.assertWarns(DeprecationWarning): compat.compat_basestring - compat.asyncio.events # Must not raise error - - def test_compat_getenv(self): - test_str = 'тест' - compat_setenv('yt_dlp_COMPAT_GETENV', test_str) - self.assertEqual(compat_getenv('yt_dlp_COMPAT_GETENV'), test_str) + with self.assertWarns(DeprecationWarning): + compat.WINDOWS_VT_MODE - def test_compat_setenv(self): - test_var = 'yt_dlp_COMPAT_SETENV' - test_str = 'тест' - compat_setenv(test_var, test_str) - compat_getenv(test_var) - self.assertEqual(compat_getenv(test_var), test_str) + compat.asyncio.events # Must not raise error def test_compat_expanduser(self): old_home = os.environ.get('HOME') test_str = R'C:\Documents and Settings\тест\Application Data' try: - compat_setenv('HOME', test_str) + os.environ['HOME'] = test_str self.assertEqual(compat_expanduser('~'), test_str) finally: - compat_setenv('HOME', old_home or '') + os.environ['HOME'] = old_home or '' def test_compat_urllib_parse_unquote(self): self.assertEqual(compat_urllib_parse_unquote('abc%20def'), 'abc def') @@ -69,8 +59,8 @@ class TestCompat(unittest.TestCase): '''(^◣_◢^)っ︻デ═一 ⇀ ⇀ ⇀ ⇀ ⇀ ↶%I%Break%Things%''') def test_compat_urllib_parse_unquote_plus(self): - self.assertEqual(compat_urllib_parse_unquote_plus('abc%20def'), 'abc def') - self.assertEqual(compat_urllib_parse_unquote_plus('%7e/abc+def'), '~/abc def') + self.assertEqual(urllib.parse.unquote_plus('abc%20def'), 'abc def') + self.assertEqual(urllib.parse.unquote_plus('%7e/abc+def'), '~/abc def') def test_compat_urllib_parse_urlencode(self): 
self.assertEqual(compat_urllib_parse_urlencode({'abc': 'def'}), 'abc=def') @@ -91,11 +81,11 @@ class TestCompat(unittest.TestCase): </root> ''' doc = compat_etree_fromstring(xml.encode()) - self.assertTrue(isinstance(doc.attrib['foo'], compat_str)) - self.assertTrue(isinstance(doc.attrib['spam'], compat_str)) - self.assertTrue(isinstance(doc.find('normal').text, compat_str)) - self.assertTrue(isinstance(doc.find('chinese').text, compat_str)) - self.assertTrue(isinstance(doc.find('foo/bar').text, compat_str)) + self.assertTrue(isinstance(doc.attrib['foo'], str)) + self.assertTrue(isinstance(doc.attrib['spam'], str)) + self.assertTrue(isinstance(doc.find('normal').text, str)) + self.assertTrue(isinstance(doc.find('chinese').text, str)) + self.assertTrue(isinstance(doc.find('foo/bar').text, str)) def test_compat_etree_fromstring_doctype(self): xml = '''<?xml version="1.0"?> @@ -104,7 +94,7 @@ class TestCompat(unittest.TestCase): compat_etree_fromstring(xml) def test_struct_unpack(self): - self.assertEqual(compat_struct_unpack('!B', b'\x00'), (0,)) + self.assertEqual(struct.unpack('!B', b'\x00'), (0,)) if __name__ == '__main__': diff --git a/test/test_cookies.py b/test/test_cookies.py index 5bfaec367..cfeb11b55 100644 --- a/test/test_cookies.py +++ b/test/test_cookies.py @@ -14,16 +14,16 @@ from yt_dlp.cookies import ( class Logger: - def debug(self, message): + def debug(self, message, *args, **kwargs): print(f'[verbose] {message}') - def info(self, message): + def info(self, message, *args, **kwargs): print(message) - def warning(self, message, only_once=False): + def warning(self, message, *args, **kwargs): self.error(message) - def error(self, message): + def error(self, message, *args, **kwargs): raise Exception(message) diff --git a/test/test_download.py b/test/test_download.py index 9a83bee2f..b397b3ecf 100755 --- a/test/test_download.py +++ b/test/test_download.py @@ -1,14 +1,19 @@ #!/usr/bin/env python3 + # Allow direct execution -import hashlib -import json import os -import socket import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +import hashlib +import http.client +import json +import socket +import urllib.error + from test.helper import ( assertGreaterEqual, expect_info_dict, @@ -20,12 +25,7 @@ from test.helper import ( try_rm, ) -import yt_dlp.YoutubeDL -from yt_dlp.compat import ( - compat_http_client, - compat_HTTPError, - compat_urllib_error, -) +import yt_dlp.YoutubeDL # isort: split from yt_dlp.extractor import get_info_extractor from yt_dlp.utils import ( DownloadError, @@ -43,7 +43,7 @@ class YoutubeDL(yt_dlp.YoutubeDL): self.processed_info_dicts = [] super().__init__(*args, **kwargs) - def report_warning(self, message): + def report_warning(self, message, *args, **kwargs): # Don't accept warnings during tests raise ExtractorError(message) @@ -102,9 +102,10 @@ def generator(test_case, tname): def print_skipping(reason): print('Skipping %s: %s' % (test_case['name'], reason)) + self.skipTest(reason) + if not ie.working(): print_skipping('IE marked as not _WORKING') - return for tc in test_cases: info_dict = tc.get('info_dict', {}) @@ -118,11 +119,10 @@ def generator(test_case, tname): if 'skip' in test_case: print_skipping(test_case['skip']) - return + for other_ie in other_ies: if not other_ie.working(): print_skipping('test depends on %sIE, marked as not WORKING' % other_ie.ie_key()) - return params = get_params(test_case.get('params', {})) params['outtmpl'] = tname + '_' + params['outtmpl'] @@ -167,7 +167,7 @@ def 
generator(test_case, tname): force_generic_extractor=params.get('force_generic_extractor', False)) except (DownloadError, ExtractorError) as err: # Check if the exception is not a network related one - if not err.exc_info[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError, compat_http_client.BadStatusLine) or (err.exc_info[0] == compat_HTTPError and err.exc_info[1].code == 503): + if not err.exc_info[0] in (urllib.error.URLError, socket.timeout, UnavailableVideoError, http.client.BadStatusLine) or (err.exc_info[0] == urllib.error.HTTPError and err.exc_info[1].code == 503): raise if try_num == RETRIES: diff --git a/test/test_downloader_http.py b/test/test_downloader_http.py index c33308064..cce7c59e2 100644 --- a/test/test_downloader_http.py +++ b/test/test_downloader_http.py @@ -1,17 +1,19 @@ #!/usr/bin/env python3 + # Allow direct execution import os -import re import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +import http.server +import re import threading -from test.helper import http_server_port, try_rm +from test.helper import http_server_port, try_rm from yt_dlp import YoutubeDL -from yt_dlp.compat import compat_http_server from yt_dlp.downloader.http import HttpFD from yt_dlp.utils import encodeFilename @@ -21,7 +23,7 @@ TEST_DIR = os.path.dirname(os.path.abspath(__file__)) TEST_SIZE = 10 * 1024 -class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler): +class HTTPTestRequestHandler(http.server.BaseHTTPRequestHandler): def log_message(self, format, *args): pass @@ -78,7 +80,7 @@ class FakeLogger: class TestHttpFD(unittest.TestCase): def setUp(self): - self.httpd = compat_http_server.HTTPServer( + self.httpd = http.server.HTTPServer( ('127.0.0.1', 0), HTTPTestRequestHandler) self.port = http_server_port(self.httpd) self.server_thread = threading.Thread(target=self.httpd.serve_forever) diff --git a/test/test_execution.py b/test/test_execution.py index 6efd432e9..1d15fddab 100644 --- a/test/test_execution.py +++ b/test/test_execution.py @@ -1,12 +1,16 @@ #!/usr/bin/env python3 -import contextlib + +# Allow direct execution import os -import subprocess import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +import contextlib +import subprocess + from yt_dlp.utils import encodeArgument rootDir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) diff --git a/test/test_http.py b/test/test_http.py index 146df7500..b1aac7720 100644 --- a/test/test_http.py +++ b/test/test_http.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 + # Allow direct execution import os import sys @@ -6,17 +7,19 @@ import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +import http.server import ssl import threading -from test.helper import http_server_port +import urllib.request +from test.helper import http_server_port from yt_dlp import YoutubeDL -from yt_dlp.compat import compat_http_server, compat_urllib_request TEST_DIR = os.path.dirname(os.path.abspath(__file__)) -class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler): +class HTTPTestRequestHandler(http.server.BaseHTTPRequestHandler): def log_message(self, format, *args): pass @@ -53,7 +56,7 @@ class FakeLogger: class TestHTTP(unittest.TestCase): def setUp(self): - self.httpd = compat_http_server.HTTPServer( + self.httpd = http.server.HTTPServer( ('127.0.0.1', 0), HTTPTestRequestHandler) self.port = http_server_port(self.httpd) self.server_thread = 
threading.Thread(target=self.httpd.serve_forever) @@ -64,7 +67,7 @@ class TestHTTP(unittest.TestCase): class TestHTTPS(unittest.TestCase): def setUp(self): certfn = os.path.join(TEST_DIR, 'testcert.pem') - self.httpd = compat_http_server.HTTPServer( + self.httpd = http.server.HTTPServer( ('127.0.0.1', 0), HTTPTestRequestHandler) sslctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER) sslctx.load_cert_chain(certfn, None) @@ -90,7 +93,7 @@ class TestClientCert(unittest.TestCase): certfn = os.path.join(TEST_DIR, 'testcert.pem') self.certdir = os.path.join(TEST_DIR, 'testdata', 'certificate') cacertfn = os.path.join(self.certdir, 'ca.crt') - self.httpd = compat_http_server.HTTPServer(('127.0.0.1', 0), HTTPTestRequestHandler) + self.httpd = http.server.HTTPServer(('127.0.0.1', 0), HTTPTestRequestHandler) sslctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER) sslctx.verify_mode = ssl.CERT_REQUIRED sslctx.load_verify_locations(cafile=cacertfn) @@ -130,7 +133,7 @@ class TestClientCert(unittest.TestCase): def _build_proxy_handler(name): - class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler): + class HTTPTestRequestHandler(http.server.BaseHTTPRequestHandler): proxy_name = name def log_message(self, format, *args): @@ -146,14 +149,14 @@ def _build_proxy_handler(name): class TestProxy(unittest.TestCase): def setUp(self): - self.proxy = compat_http_server.HTTPServer( + self.proxy = http.server.HTTPServer( ('127.0.0.1', 0), _build_proxy_handler('normal')) self.port = http_server_port(self.proxy) self.proxy_thread = threading.Thread(target=self.proxy.serve_forever) self.proxy_thread.daemon = True self.proxy_thread.start() - self.geo_proxy = compat_http_server.HTTPServer( + self.geo_proxy = http.server.HTTPServer( ('127.0.0.1', 0), _build_proxy_handler('geo')) self.geo_port = http_server_port(self.geo_proxy) self.geo_proxy_thread = threading.Thread(target=self.geo_proxy.serve_forever) @@ -170,7 +173,7 @@ class TestProxy(unittest.TestCase): response = ydl.urlopen(url).read().decode() self.assertEqual(response, f'normal: {url}') - req = compat_urllib_request.Request(url) + req = urllib.request.Request(url) req.add_header('Ytdl-request-proxy', geo_proxy) response = ydl.urlopen(req).read().decode() self.assertEqual(response, f'geo: {url}') diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index 872c58c8f..4277cabe0 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 + # Allow direct execution import os import sys @@ -6,6 +7,7 @@ import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + from yt_dlp.jsinterp import JSInterpreter diff --git a/test/test_netrc.py b/test/test_netrc.py index f7a0b33d2..dc708d974 100644 --- a/test/test_netrc.py +++ b/test/test_netrc.py @@ -1,3 +1,6 @@ +#!/usr/bin/env python3 + +# Allow direct execution import os import sys import unittest diff --git a/test/test_overwrites.py b/test/test_overwrites.py index a6d5bae40..6954c07f9 100644 --- a/test/test_overwrites.py +++ b/test/test_overwrites.py @@ -1,11 +1,15 @@ #!/usr/bin/env python3 + +# Allow direct execution import os -import subprocess import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +import subprocess + from test.helper import is_download_test, try_rm root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) diff --git a/test/test_post_hooks.py b/test/test_post_hooks.py index e84a08f29..3778d1794 100644 --- a/test/test_post_hooks.py +++ 
b/test/test_post_hooks.py @@ -1,13 +1,15 @@ #!/usr/bin/env python3 + +# Allow direct execution import os import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import get_params, is_download_test, try_rm -import yt_dlp.YoutubeDL +from test.helper import get_params, is_download_test, try_rm +import yt_dlp.YoutubeDL # isort: split from yt_dlp.utils import DownloadError diff --git a/test/test_postprocessors.py b/test/test_postprocessors.py index 9d8a4dcc5..c49e3ede0 100644 --- a/test/test_postprocessors.py +++ b/test/test_postprocessors.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 + # Allow direct execution import os import sys @@ -6,6 +7,7 @@ import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + from yt_dlp import YoutubeDL from yt_dlp.compat import compat_shlex_quote from yt_dlp.postprocessor import ( diff --git a/test/test_socks.py b/test/test_socks.py index a8b068cdd..6651290d2 100644 --- a/test/test_socks.py +++ b/test/test_socks.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 + # Allow direct execution import os import sys @@ -6,11 +7,12 @@ import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + import random import subprocess -from test.helper import FakeYDL, get_params, is_download_test +import urllib.request -from yt_dlp.compat import compat_str, compat_urllib_request +from test.helper import FakeYDL, get_params, is_download_test @is_download_test @@ -51,7 +53,7 @@ class TestMultipleSocks(unittest.TestCase): if params is None: return ydl = FakeYDL() - req = compat_urllib_request.Request('http://yt-dl.org/ip') + req = urllib.request.Request('http://yt-dl.org/ip') req.add_header('Ytdl-request-proxy', params['secondary_proxy']) self.assertEqual( ydl.urlopen(req).read().decode(), @@ -62,7 +64,7 @@ class TestMultipleSocks(unittest.TestCase): if params is None: return ydl = FakeYDL() - req = compat_urllib_request.Request('https://yt-dl.org/ip') + req = urllib.request.Request('https://yt-dl.org/ip') req.add_header('Ytdl-request-proxy', params['secondary_proxy']) self.assertEqual( ydl.urlopen(req).read().decode(), @@ -99,13 +101,13 @@ class TestSocks(unittest.TestCase): return ydl.urlopen('http://yt-dl.org/ip').read().decode() def test_socks4(self): - self.assertTrue(isinstance(self._get_ip('socks4'), compat_str)) + self.assertTrue(isinstance(self._get_ip('socks4'), str)) def test_socks4a(self): - self.assertTrue(isinstance(self._get_ip('socks4a'), compat_str)) + self.assertTrue(isinstance(self._get_ip('socks4a'), str)) def test_socks5(self): - self.assertTrue(isinstance(self._get_ip('socks5'), compat_str)) + self.assertTrue(isinstance(self._get_ip('socks5'), str)) if __name__ == '__main__': diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 182bd7a4b..57362895f 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 + # Allow direct execution import os import sys @@ -6,8 +7,8 @@ import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import FakeYDL, is_download_test, md5 +from test.helper import FakeYDL, is_download_test, md5 from yt_dlp.extractor import ( NPOIE, NRKTVIE, @@ -38,6 +39,9 @@ class BaseTestSubtitles(unittest.TestCase): self.DL = FakeYDL() self.ie = self.IE() self.DL.add_info_extractor(self.ie) + if not self.IE.working(): + print('Skipping: %s marked as not _WORKING' % self.IE.ie_key()) + self.skipTest('IE marked 
as not _WORKING') def getInfoDict(self): info_dict = self.DL.extract_info(self.url, download=False) @@ -57,6 +61,21 @@ class BaseTestSubtitles(unittest.TestCase): @is_download_test class TestYoutubeSubtitles(BaseTestSubtitles): + # Available subtitles for QRS8MkLhQmM: + # Language formats + # ru vtt, ttml, srv3, srv2, srv1, json3 + # fr vtt, ttml, srv3, srv2, srv1, json3 + # en vtt, ttml, srv3, srv2, srv1, json3 + # nl vtt, ttml, srv3, srv2, srv1, json3 + # de vtt, ttml, srv3, srv2, srv1, json3 + # ko vtt, ttml, srv3, srv2, srv1, json3 + # it vtt, ttml, srv3, srv2, srv1, json3 + # zh-Hant vtt, ttml, srv3, srv2, srv1, json3 + # hi vtt, ttml, srv3, srv2, srv1, json3 + # pt-BR vtt, ttml, srv3, srv2, srv1, json3 + # es-MX vtt, ttml, srv3, srv2, srv1, json3 + # ja vtt, ttml, srv3, srv2, srv1, json3 + # pl vtt, ttml, srv3, srv2, srv1, json3 url = 'QRS8MkLhQmM' IE = YoutubeIE @@ -65,47 +84,60 @@ class TestYoutubeSubtitles(BaseTestSubtitles): self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(len(subtitles.keys()), 13) - self.assertEqual(md5(subtitles['en']), '688dd1ce0981683867e7fe6fde2a224b') - self.assertEqual(md5(subtitles['it']), '31324d30b8430b309f7f5979a504a769') + self.assertEqual(md5(subtitles['en']), 'ae1bd34126571a77aabd4d276b28044d') + self.assertEqual(md5(subtitles['it']), '0e0b667ba68411d88fd1c5f4f4eab2f9') for lang in ['fr', 'de']: self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang) - def test_youtube_subtitles_ttml_format(self): + def _test_subtitles_format(self, fmt, md5_hash, lang='en'): self.DL.params['writesubtitles'] = True - self.DL.params['subtitlesformat'] = 'ttml' + self.DL.params['subtitlesformat'] = fmt subtitles = self.getSubtitles() - self.assertEqual(md5(subtitles['en']), 'c97ddf1217390906fa9fbd34901f3da2') + self.assertEqual(md5(subtitles[lang]), md5_hash) + + def test_youtube_subtitles_ttml_format(self): + self._test_subtitles_format('ttml', 'c97ddf1217390906fa9fbd34901f3da2') def test_youtube_subtitles_vtt_format(self): - self.DL.params['writesubtitles'] = True - self.DL.params['subtitlesformat'] = 'vtt' - subtitles = self.getSubtitles() - self.assertEqual(md5(subtitles['en']), 'ae1bd34126571a77aabd4d276b28044d') + self._test_subtitles_format('vtt', 'ae1bd34126571a77aabd4d276b28044d') - def test_youtube_automatic_captions(self): - self.url = '8YoUxe5ncPo' - self.DL.params['writeautomaticsub'] = True - self.DL.params['subtitleslangs'] = ['it'] - subtitles = self.getSubtitles() - self.assertTrue(subtitles['it'] is not None) + def test_youtube_subtitles_json3_format(self): + self._test_subtitles_format('json3', '688dd1ce0981683867e7fe6fde2a224b') - def test_youtube_no_automatic_captions(self): - self.url = 'QRS8MkLhQmM' + def _test_automatic_captions(self, url, lang): + self.url = url self.DL.params['writeautomaticsub'] = True + self.DL.params['subtitleslangs'] = [lang] subtitles = self.getSubtitles() - self.assertTrue(not subtitles) + self.assertTrue(subtitles[lang] is not None) + def test_youtube_automatic_captions(self): + # Available automatic captions for 8YoUxe5ncPo: + # Language formats (all in vtt, ttml, srv3, srv2, srv1, json3) + # gu, zh-Hans, zh-Hant, gd, ga, gl, lb, la, lo, tt, tr, + # lv, lt, tk, th, tg, te, fil, haw, yi, ceb, yo, de, da, + # el, eo, en, eu, et, es, ru, rw, ro, bn, be, bg, uk, jv, + # bs, ja, or, xh, co, ca, cy, cs, ps, pt, pa, vi, pl, hy, + # hr, ht, hu, hmn, hi, ha, mg, uz, ml, mn, mi, mk, ur, + # mt, ms, mr, ug, ta, my, af, sw, is, am, + # *it*, iw, sv, ar, + 
# su, zu, az, id, ig, nl, no, ne, ny, fr, ku, fy, fa, fi, + # ka, kk, sr, sq, ko, kn, km, st, sk, si, so, sn, sm, sl, + # ky, sd + # ... + self._test_automatic_captions('8YoUxe5ncPo', 'it') + + @unittest.skip('Video unavailable') def test_youtube_translated_subtitles(self): - # This video has a subtitles track, which can be translated - self.url = 'i0ZabxXmH4Y' - self.DL.params['writeautomaticsub'] = True - self.DL.params['subtitleslangs'] = ['it'] - subtitles = self.getSubtitles() - self.assertTrue(subtitles['it'] is not None) + # This video has a subtitles track, which can be translated (#4555) + self._test_automatic_captions('Ky9eprVWzlI', 'it') def test_youtube_nosubtitles(self): self.DL.expect_warning('video doesn\'t have subtitles') - self.url = 'n5BB19UTcdA' + # Available automatic captions for 8YoUxe5ncPo: + # ... + # 8YoUxe5ncPo has no subtitles + self.url = '8YoUxe5ncPo' self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() @@ -137,6 +169,7 @@ class TestDailymotionSubtitles(BaseTestSubtitles): @is_download_test +@unittest.skip('IE broken') class TestTedSubtitles(BaseTestSubtitles): url = 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html' IE = TedTalkIE @@ -162,12 +195,12 @@ class TestVimeoSubtitles(BaseTestSubtitles): self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), {'de', 'en', 'es', 'fr'}) - self.assertEqual(md5(subtitles['en']), '8062383cf4dec168fc40a088aa6d5888') - self.assertEqual(md5(subtitles['fr']), 'b6191146a6c5d3a452244d853fde6dc8') + self.assertEqual(md5(subtitles['en']), '386cbc9320b94e25cb364b97935e5dd1') + self.assertEqual(md5(subtitles['fr']), 'c9b69eef35bc6641c0d4da8a04f9dfac') def test_nosubtitles(self): self.DL.expect_warning('video doesn\'t have subtitles') - self.url = 'http://vimeo.com/56015672' + self.url = 'http://vimeo.com/68093876' self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() @@ -175,6 +208,7 @@ class TestVimeoSubtitles(BaseTestSubtitles): @is_download_test +@unittest.skip('IE broken') class TestWallaSubtitles(BaseTestSubtitles): url = 'http://vod.walla.co.il/movie/2705958/the-yes-men' IE = WallaIE @@ -197,6 +231,7 @@ class TestWallaSubtitles(BaseTestSubtitles): @is_download_test +@unittest.skip('IE broken') class TestCeskaTelevizeSubtitles(BaseTestSubtitles): url = 'http://www.ceskatelevize.cz/ivysilani/10600540290-u6-uzasny-svet-techniky' IE = CeskaTelevizeIE @@ -219,6 +254,7 @@ class TestCeskaTelevizeSubtitles(BaseTestSubtitles): @is_download_test +@unittest.skip('IE broken') class TestLyndaSubtitles(BaseTestSubtitles): url = 'http://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html' IE = LyndaIE @@ -232,6 +268,7 @@ class TestLyndaSubtitles(BaseTestSubtitles): @is_download_test +@unittest.skip('IE broken') class TestNPOSubtitles(BaseTestSubtitles): url = 'http://www.npo.nl/nos-journaal/28-08-2014/POW_00722860' IE = NPOIE @@ -245,6 +282,7 @@ class TestNPOSubtitles(BaseTestSubtitles): @is_download_test +@unittest.skip('IE broken') class TestMTVSubtitles(BaseTestSubtitles): url = 'http://www.cc.com/video-clips/p63lk0/adam-devine-s-house-party-chasing-white-swans' IE = ComedyCentralIE @@ -269,8 +307,8 @@ class TestNRKSubtitles(BaseTestSubtitles): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() - self.assertEqual(set(subtitles.keys()), {'no'}) - 
self.assertEqual(md5(subtitles['no']), '544fa917d3197fcbee64634559221cc2') + self.assertEqual(set(subtitles.keys()), {'nb-ttv'}) + self.assertEqual(md5(subtitles['nb-ttv']), '67e06ff02d0deaf975e68f6cb8f6a149') @is_download_test @@ -295,6 +333,7 @@ class TestRaiPlaySubtitles(BaseTestSubtitles): @is_download_test +@unittest.skip('IE broken - DRM only') class TestVikiSubtitles(BaseTestSubtitles): url = 'http://www.viki.com/videos/1060846v-punch-episode-18' IE = VikiIE @@ -323,6 +362,7 @@ class TestThePlatformSubtitles(BaseTestSubtitles): @is_download_test +@unittest.skip('IE broken') class TestThePlatformFeedSubtitles(BaseTestSubtitles): url = 'http://feed.theplatform.com/f/7wvmTC/msnbc_video-p-test?form=json&pretty=true&range=-40&byGuid=n_hardball_5biden_140207' IE = ThePlatformFeedIE @@ -360,7 +400,7 @@ class TestDemocracynowSubtitles(BaseTestSubtitles): self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), {'en'}) - self.assertEqual(md5(subtitles['en']), 'acaca989e24a9e45a6719c9b3d60815c') + self.assertEqual(md5(subtitles['en']), 'a3cc4c0b5eadd74d9974f1c1f5101045') def test_subtitles_in_page(self): self.url = 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree' @@ -368,7 +408,7 @@ class TestDemocracynowSubtitles(BaseTestSubtitles): self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), {'en'}) - self.assertEqual(md5(subtitles['en']), 'acaca989e24a9e45a6719c9b3d60815c') + self.assertEqual(md5(subtitles['en']), 'a3cc4c0b5eadd74d9974f1c1f5101045') @is_download_test diff --git a/test/test_utils.py b/test/test_utils.py index 184c39cff..8024a8e7c 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 + # Allow direct execution -import contextlib import os import sys import unittest @@ -8,19 +8,16 @@ import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -# Various small unit tests +import contextlib import io import itertools import json import xml.etree.ElementTree from yt_dlp.compat import ( - compat_chr, compat_etree_fromstring, - compat_getenv, compat_HTMLParseError, compat_os_name, - compat_setenv, ) from yt_dlp.utils import ( Config, @@ -266,20 +263,20 @@ class TestUtil(unittest.TestCase): def env(var): return f'%{var}%' if sys.platform == 'win32' else f'${var}' - compat_setenv('yt_dlp_EXPATH_PATH', 'expanded') + os.environ['yt_dlp_EXPATH_PATH'] = 'expanded' self.assertEqual(expand_path(env('yt_dlp_EXPATH_PATH')), 'expanded') old_home = os.environ.get('HOME') test_str = R'C:\Documents and Settings\тест\Application Data' try: - compat_setenv('HOME', test_str) - self.assertEqual(expand_path(env('HOME')), compat_getenv('HOME')) - self.assertEqual(expand_path('~'), compat_getenv('HOME')) + os.environ['HOME'] = test_str + self.assertEqual(expand_path(env('HOME')), os.getenv('HOME')) + self.assertEqual(expand_path('~'), os.getenv('HOME')) self.assertEqual( expand_path('~/%s' % env('yt_dlp_EXPATH_PATH')), - '%s/expanded' % compat_getenv('HOME')) + '%s/expanded' % os.getenv('HOME')) finally: - compat_setenv('HOME', old_home or '') + os.environ['HOME'] = old_home or '' def test_prepend_extension(self): self.assertEqual(prepend_extension('abc.ext', 'temp'), 'abc.temp.ext') @@ -1128,7 +1125,7 @@ class TestUtil(unittest.TestCase): self.assertEqual(extract_attributes('<e x="décomposé">'), {'x': 'décompose\u0301'}) # "Narrow" Python builds don't support unicode code points outside BMP. 
try: - compat_chr(0x10000) + chr(0x10000) supports_outside_bmp = True except ValueError: supports_outside_bmp = False diff --git a/test/test_verbose_output.py b/test/test_verbose_output.py index 657994074..21ce10a1f 100644 --- a/test/test_verbose_output.py +++ b/test/test_verbose_output.py @@ -1,11 +1,15 @@ #!/usr/bin/env python3 + +# Allow direct execution import os -import subprocess import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +import subprocess + rootDir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index 66611e236..c2dd0ac30 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 + # Allow direct execution import os import sys @@ -6,8 +7,8 @@ import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import FakeYDL, is_download_test +from test.helper import FakeYDL, is_download_test from yt_dlp.extractor import YoutubeIE, YoutubeTabIE diff --git a/test/test_youtube_misc.py b/test/test_youtube_misc.py index 36f8be689..81be5d3c9 100644 --- a/test/test_youtube_misc.py +++ b/test/test_youtube_misc.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 + # Allow direct execution import os import sys diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 2c2013295..4fc2917e5 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -1,18 +1,19 @@ #!/usr/bin/env python3 + # Allow direct execution -import contextlib import os import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +import contextlib import re import string import urllib.request -from test.helper import FakeYDL, is_download_test -from yt_dlp.compat import compat_str +from test.helper import FakeYDL, is_download_test from yt_dlp.extractor import YoutubeIE from yt_dlp.jsinterp import JSInterpreter @@ -157,7 +158,7 @@ def t_factory(name, sig_func, url_pattern): def signature(jscode, sig_input): func = YoutubeIE(FakeYDL())._parse_sig_js(jscode) src_sig = ( - compat_str(string.printable[:sig_input]) + str(string.printable[:sig_input]) if isinstance(sig_input, int) else sig_input) return func(src_sig) diff --git a/tox.ini b/tox.ini deleted file mode 100644 index c8c14aafc..000000000 --- a/tox.ini +++ /dev/null @@ -1,15 +0,0 @@ -[tox] -envlist = py26,py27,py33,py34,py35 - -# Needed? 
-[testenv] -deps = - nose - coverage -# We need a valid $HOME for test_compat_expanduser -passenv = HOME -defaultargs = test --exclude test_download.py --exclude test_age_restriction.py - --exclude test_subtitles.py --exclude test_write_annotations.py - --exclude test_youtube_lists.py --exclude test_socks.py -commands = nosetests --verbose {posargs:{[testenv]defaultargs}} # --with-coverage --cover-package=yt_dlp --cover-html - # test.test_download:TestDownload.test_NowVideo @@ -1,2 +1,2 @@ -#!/bin/sh +#!/usr/bin/env sh exec "${PYTHON:-python3}" -bb -Werror -Xdev "$(dirname "$(realpath "$0")")/yt_dlp/__main__.py" "$@" diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 94f8dcaef..de8a8c4d2 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python3 import collections import contextlib import datetime @@ -26,20 +25,11 @@ import urllib.request from string import ascii_letters from .cache import Cache -from .compat import ( - compat_get_terminal_size, - compat_os_name, - compat_shlex_quote, - compat_str, - compat_urllib_error, - compat_urllib_request, - windows_enable_vt_mode, -) +from .compat import HAS_LEGACY as compat_has_legacy +from .compat import compat_os_name, compat_shlex_quote from .cookies import load_cookies from .downloader import FFmpegFD, get_suitable_downloader, shorten_protocol_name from .downloader.rtmp import rtmpdump_version -from .extractor import _LAZY_LOADER -from .extractor import _PLUGIN_CLASSES as plugin_extractors from .extractor import gen_extractor_classes, get_info_extractor from .extractor.openload import PhantomJSwrapper from .minicurses import format_text @@ -59,6 +49,7 @@ from .postprocessor import ( ) from .utils import ( DEFAULT_OUTTMPL, + IDENTITY, LINK_TEMPLATES, NO_DEFAULT, NUMBER_RE, @@ -75,13 +66,13 @@ from .utils import ( ExtractorError, GeoRestrictedError, HEADRequest, - InAdvancePagedList, ISO3166Utils, LazyList, MaxDownloadsReached, Namespace, PagedList, PerRequestProxyHandler, + PlaylistEntries, Popen, PostProcessingError, ReExtractInfo, @@ -141,6 +132,7 @@ from .utils import ( url_basename, variadic, version_tuple, + windows_enable_vt_mode, write_json_file, write_string, ) @@ -193,13 +185,6 @@ class YoutubeDL: For compatibility, a single list is also accepted print_to_file: A dict with keys WHEN (same as forceprint) mapped to a list of tuples with (template, filename) - forceurl: Force printing final URL. (Deprecated) - forcetitle: Force printing title. (Deprecated) - forceid: Force printing ID. (Deprecated) - forcethumbnail: Force printing thumbnail URL. (Deprecated) - forcedescription: Force printing description. (Deprecated) - forcefilename: Force printing final filename. (Deprecated) - forceduration: Force printing duration. (Deprecated) forcejson: Force printing info_dict as JSON. dump_single_json: Force printing the info_dict of the whole playlist (or video) as a single JSON line. @@ -249,11 +234,9 @@ class YoutubeDL: and don't overwrite any file if False For compatibility with youtube-dl, "nooverwrites" may also be used instead - playliststart: Playlist item to start at. - playlistend: Playlist item to end at. playlist_items: Specific indices of playlist to download. - playlistreverse: Download playlist items in reverse order. playlistrandom: Download playlist items in random order. + lazy_playlist: Process playlist entries as they are received. matchtitle: Download only matching titles. rejecttitle: Reject downloads for matching titles. logger: Log messages to a logging.Logger instance. 
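
[Editor's note] A minimal sketch of the `logger` option documented in the line above:
yt-dlp routes its debug/warning/error output to the given logging.Logger instead of
writing to the console. The video URL is only a placeholder:

    import logging

    import yt_dlp

    logging.basicConfig(level=logging.DEBUG)

    # Messages that would normally go to stdout/stderr are sent to this logger
    with yt_dlp.YoutubeDL({'logger': logging.getLogger('yt-dlp')}) as ydl:
        ydl.download(['https://www.youtube.com/watch?v=BaW_jenozKc'])  # placeholder URL
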
@@ -276,9 +259,6 @@ class YoutubeDL: writedesktoplink: Write a Linux internet shortcut file (.desktop) writesubtitles: Write the video subtitles to a file writeautomaticsub: Write the automatically generated subtitles to a file - allsubtitles: Deprecated - Use subtitleslangs = ['all'] - Downloads all the subtitles of the video - (requires writesubtitles or writeautomaticsub) listsubtitles: Lists all available subtitles for the video subtitlesformat: The format code for subtitles subtitleslangs: List of languages of the subtitles to download (can be regex). @@ -332,7 +312,6 @@ class YoutubeDL: bidi_workaround: Work around buggy terminals without bidirectional text support, using fridibi debug_printtraffic:Print out sent and received HTTP traffic - include_ads: Download ads as well (deprecated) default_search: Prepend this string if an input url is not valid. 'auto' for elaborate guessing encoding: Use this encoding instead of the system-specified. @@ -348,10 +327,6 @@ class YoutubeDL: * when: When to run the postprocessor. Allowed values are the entries of utils.POSTPROCESS_WHEN Assumed to be 'post_process' if not given - post_hooks: Deprecated - Register a custom postprocessor instead - A list of functions that get called as the final step - for each video file, after all postprocessors have been - called. The filename will be passed as the only argument. progress_hooks: A list of functions that get called on download progress, with a dictionary with the entries * status: One of "downloading", "error", or "finished". @@ -396,8 +371,6 @@ class YoutubeDL: - "detect_or_warn": check whether we can do anything about it, warn otherwise (default) source_address: Client-side IP address to bind to. - call_home: Boolean, true iff we are allowed to contact the - yt-dlp servers for debugging. (BROKEN) sleep_interval_requests: Number of seconds to sleep between requests during extraction sleep_interval: Number of seconds to sleep before each download when @@ -432,17 +405,10 @@ class YoutubeDL: geo_bypass_ip_block: IP range in CIDR notation that will be used similarly to geo_bypass_country - - The following options determine which downloader is picked: external_downloader: A dictionary of protocol keys and the executable of the external downloader to use for it. The allowed protocols are default|http|ftp|m3u8|dash|rtsp|rtmp|mms. Set the value to 'native' to use the native downloader - hls_prefer_native: Deprecated - Use external_downloader = {'m3u8': 'native'} - or {'m3u8': 'ffmpeg'} instead. - Use the native HLS downloader instead of ffmpeg/avconv - if True, otherwise use ffmpeg/avconv if False, otherwise - use downloader suggested by extractor if None. compat_opts: Compatibility options. See "Differences in default behavior". The following options do not work when used through the API: filename, abort-on-error, multistreams, no-live-chat, format-sort @@ -452,6 +418,16 @@ class YoutubeDL: Allowed keys are 'download', 'postprocess', 'download-title' (console title) and 'postprocess-title'. The template is mapped on a dictionary with keys 'progress' and 'info' + retry_sleep_functions: Dictionary of functions that takes the number of attempts + as argument and returns the time to sleep in seconds. + Allowed keys are 'http', 'fragment', 'file_access' + download_ranges: A function that gets called for every video with the signature + (info_dict, *, ydl) -> Iterable[Section]. + Only the returned sections will be downloaded. 
Each Section contains: + * start_time: Start time of the section in seconds + * end_time: End time of the section in seconds + * title: Section title (Optional) + * index: Section number (Optional) The following parameters are not used by YoutubeDL itself, they are used by the downloader (see yt_dlp/downloader/common.py): @@ -461,8 +437,6 @@ class YoutubeDL: external_downloader_args, concurrent_fragment_downloads. The following options are used by the post processors: - prefer_ffmpeg: If False, use avconv instead of ffmpeg if both are available, - otherwise prefer ffmpeg. (avconv support is deprecated) ffmpeg_location: Location of the ffmpeg/avconv binary; either the path to the binary or its containing directory. postprocessor_args: A dictionary of postprocessor/executable keys (in lower case) @@ -482,12 +456,54 @@ class YoutubeDL: See "EXTRACTOR ARGUMENTS" for details. Eg: {'youtube': {'skip': ['dash', 'hls']}} mark_watched: Mark videos watched (even with --simulate). Only for YouTube - youtube_include_dash_manifest: Deprecated - Use extractor_args instead. + + The following options are deprecated and may be removed in the future: + + playliststart: - Use playlist_items + Playlist item to start at. + playlistend: - Use playlist_items + Playlist item to end at. + playlistreverse: - Use playlist_items + Download playlist items in reverse order. + forceurl: - Use forceprint + Force printing final URL. + forcetitle: - Use forceprint + Force printing title. + forceid: - Use forceprint + Force printing ID. + forcethumbnail: - Use forceprint + Force printing thumbnail URL. + forcedescription: - Use forceprint + Force printing description. + forcefilename: - Use forceprint + Force printing final filename. + forceduration: - Use forceprint + Force printing duration. + allsubtitles: - Use subtitleslangs = ['all'] + Downloads all the subtitles of the video + (requires writesubtitles or writeautomaticsub) + include_ads: - Doesn't work + Download ads as well + call_home: - Not implemented + Boolean, true iff we are allowed to contact the + yt-dlp servers for debugging. + post_hooks: - Register a custom postprocessor + A list of functions that get called as the final step + for each video file, after all postprocessors have been + called. The filename will be passed as the only argument. + hls_prefer_native: - Use external_downloader = {'m3u8': 'native'} or {'m3u8': 'ffmpeg'}. + Use the native HLS downloader instead of ffmpeg/avconv + if True, otherwise use ffmpeg/avconv if False, otherwise + use downloader suggested by extractor if None. + prefer_ffmpeg: - avconv support is deprecated + If False, use avconv instead of ffmpeg if both are available, + otherwise prefer ffmpeg. + youtube_include_dash_manifest: - Use extractor_args If True (default), DASH manifests and related data will be downloaded and processed by extractor. You can reduce network I/O by disabling it if you don't care about DASH. (only for youtube) - youtube_include_hls_manifest: Deprecated - Use extractor_args instead. + youtube_include_hls_manifest: - Use extractor_args If True (default), HLS manifests and related data will be downloaded and processed by extractor. 
You can reduce network I/O by disabling it if you don't @@ -544,21 +560,27 @@ class YoutubeDL: self.cache = Cache(self) windows_enable_vt_mode() - self._out_files = { - 'error': sys.stderr, - 'print': sys.stderr if self.params.get('logtostderr') else sys.stdout, - 'console': None if compat_os_name == 'nt' else next( + stdout = sys.stderr if self.params.get('logtostderr') else sys.stdout + self._out_files = Namespace( + out=stdout, + error=sys.stderr, + screen=sys.stderr if self.params.get('quiet') else stdout, + console=None if compat_os_name == 'nt' else next( filter(supports_terminal_sequences, (sys.stderr, sys.stdout)), None) - } - self._out_files['screen'] = sys.stderr if self.params.get('quiet') else self._out_files['print'] - self._allow_colors = { - type_: not self.params.get('no_color') and supports_terminal_sequences(self._out_files[type_]) - for type_ in ('screen', 'error') - } - - if sys.version_info < (3, 6): - self.report_warning( - 'Python version %d.%d is not supported! Please update to Python 3.6 or above' % sys.version_info[:2]) + ) + self._allow_colors = Namespace(**{ + type_: not self.params.get('no_color') and supports_terminal_sequences(stream) + for type_, stream in self._out_files.items_ if type_ != 'console' + }) + + MIN_SUPPORTED, MIN_RECOMMENDED = (3, 6), (3, 7) + current_version = sys.version_info[:2] + if current_version < MIN_RECOMMENDED: + msg = 'Support for Python version %d.%d has been deprecated and will break in future versions of yt-dlp' + if current_version < MIN_SUPPORTED: + msg = 'Python version %d.%d is no longer supported' + self.deprecation_warning( + f'{msg}! Please update to Python %d.%d or above' % (*current_version, *MIN_RECOMMENDED)) if self.params.get('allow_unplayable_formats'): self.report_warning( @@ -586,7 +608,10 @@ class YoutubeDL: for msg in self.params.get('_deprecation_warnings', []): self.deprecation_warning(msg) - if 'list-formats' in self.params.get('compat_opts', []): + self.params['compat_opts'] = set(self.params.get('compat_opts', ())) + if not compat_has_legacy: + self.params['compat_opts'].add('no-compat-legacy') + if 'list-formats' in self.params['compat_opts']: self.params['listformats_table'] = False if 'overwrites' not in self.params and self.params.get('nooverwrites') is not None: @@ -610,15 +635,9 @@ class YoutubeDL: try: import pty master, slave = pty.openpty() - width = compat_get_terminal_size().columns - if width is None: - width_args = [] - else: - width_args = ['-w', str(width)] - sp_kwargs = dict( - stdin=subprocess.PIPE, - stdout=slave, - stderr=self._out_files['error']) + width = shutil.get_terminal_size().columns + width_args = [] if width is None else ['-w', str(width)] + sp_kwargs = {'stdin': subprocess.PIPE, 'stdout': slave, 'stderr': self._out_files.error} try: self._output_process = Popen(['bidiv'] + width_args, **sp_kwargs) except OSError: @@ -647,7 +666,7 @@ class YoutubeDL: 'Set the LC_ALL environment variable to fix this.') self.params['restrictfilenames'] = True - self.outtmpl_dict = self.parse_outtmpl() + self._parse_outtmpl() # Creating format selector here allows us to catch syntax errors before the extraction self.format_selector = ( @@ -747,6 +766,7 @@ class YoutubeDL: def add_post_processor(self, pp, when='post_process'): """Add a PostProcessor object to the end of the chain.""" + assert when in POSTPROCESS_WHEN, f'Invalid when={when}' self._pps[when].append(pp) pp.set_downloader(self) @@ -770,7 +790,7 @@ class YoutubeDL: return message assert hasattr(self, '_output_process') - assert 
isinstance(message, compat_str) + assert isinstance(message, str) line_count = message.count('\n') + 1 self._output_process.stdin.write((message + '\n').encode()) self._output_process.stdin.flush() @@ -789,9 +809,9 @@ class YoutubeDL: """Print message to stdout""" if quiet is not None: self.deprecation_warning('"YoutubeDL.to_stdout" no longer accepts the argument quiet. Use "YoutubeDL.to_screen" instead') - self._write_string( - '%s%s' % (self._bidi_workaround(message), ('' if skip_eol else '\n')), - self._out_files['print']) + if skip_eol is not False: + self.deprecation_warning('"YoutubeDL.to_stdout" no longer accepts the argument skip_eol. Use "YoutubeDL.to_screen" instead') + self._write_string(f'{self._bidi_workaround(message)}\n', self._out_files.out) def to_screen(self, message, skip_eol=False, quiet=None): """Print message to screen if not in quiet mode""" @@ -802,20 +822,20 @@ class YoutubeDL: return self._write_string( '%s%s' % (self._bidi_workaround(message), ('' if skip_eol else '\n')), - self._out_files['screen']) + self._out_files.screen) def to_stderr(self, message, only_once=False): """Print message to stderr""" - assert isinstance(message, compat_str) + assert isinstance(message, str) if self.params.get('logger'): self.params['logger'].error(message) else: - self._write_string('%s\n' % self._bidi_workaround(message), self._out_files['error'], only_once=only_once) + self._write_string(f'{self._bidi_workaround(message)}\n', self._out_files.error, only_once=only_once) def _send_console_code(self, code): - if compat_os_name == 'nt' or not self._out_files['console']: + if compat_os_name == 'nt' or not self._out_files.console: return - self._write_string(code, self._out_files['console']) + self._write_string(code, self._out_files.console) def to_console_title(self, message): if not self.params.get('consoletitle', False): @@ -905,13 +925,14 @@ class YoutubeDL: text = fallback return format_text(text, f) if allow_colors else text if fallback is None else fallback + def _format_out(self, *args, **kwargs): + return self._format_text(self._out_files.out, self._allow_colors.out, *args, **kwargs) + def _format_screen(self, *args, **kwargs): - return self._format_text( - self._out_files['screen'], self._allow_colors['screen'], *args, **kwargs) + return self._format_text(self._out_files.screen, self._allow_colors.screen, *args, **kwargs) def _format_err(self, *args, **kwargs): - return self._format_text( - self._out_files['error'], self._allow_colors['error'], *args, **kwargs) + return self._format_text(self._out_files.error, self._allow_colors.error, *args, **kwargs) def report_warning(self, message, only_once=False): ''' @@ -942,7 +963,7 @@ class YoutubeDL: '''Log debug message or Print message to stderr''' if not self.params.get('verbose', False): return - message = '[debug] %s' % message + message = f'[debug] {message}' if self.params.get('logger'): self.params['logger'].debug(message) else: @@ -973,21 +994,19 @@ class YoutubeDL: self.report_warning(msg) def parse_outtmpl(self): - outtmpl_dict = self.params.get('outtmpl', {}) - if not isinstance(outtmpl_dict, dict): - outtmpl_dict = {'default': outtmpl_dict} - # Remove spaces in the default template - if self.params.get('restrictfilenames'): + self.deprecation_warning('"YoutubeDL.parse_outtmpl" is deprecated and may be removed in a future version') + self._parse_outtmpl() + return self.params['outtmpl'] + + def _parse_outtmpl(self): + sanitize = IDENTITY + if self.params.get('restrictfilenames'): # Remove spaces in the default 
template sanitize = lambda x: x.replace(' - ', ' ').replace(' ', '-') - else: - sanitize = lambda x: x - outtmpl_dict.update({ - k: sanitize(v) for k, v in DEFAULT_OUTTMPL.items() - if outtmpl_dict.get(k) is None}) - for _, val in outtmpl_dict.items(): - if isinstance(val, bytes): - self.report_warning('Parameter outtmpl is bytes, but should be a unicode string') - return outtmpl_dict + + outtmpl = self.params.setdefault('outtmpl', {}) + if not isinstance(outtmpl, dict): + self.params['outtmpl'] = outtmpl = {'default': outtmpl} + outtmpl.update({k: sanitize(v) for k, v in DEFAULT_OUTTMPL.items() if outtmpl.get(k) is None}) def get_output_path(self, dir_type='', filename=None): paths = self.params.get('paths', {}) @@ -1038,6 +1057,7 @@ class YoutubeDL: def _copy_infodict(info_dict): info_dict = dict(info_dict) info_dict.pop('__postprocessors', None) + info_dict.pop('__pending_error', None) return info_dict def prepare_outtmpl(self, outtmpl, info_dict, sanitize=False): @@ -1135,7 +1155,7 @@ class YoutubeDL: def filename_sanitizer(key, value, restricted=self.params.get('restrictfilenames')): return sanitize_filename(str(value), restricted=restricted, is_id=( bool(re.search(r'(^|[_.])id(\.|$)', key)) - if 'filename-sanitization' in self.params.get('compat_opts', []) + if 'filename-sanitization' in self.params['compat_opts'] else NO_DEFAULT)) sanitizer = sanitize if callable(sanitize) else filename_sanitizer @@ -1224,7 +1244,7 @@ class YoutubeDL: def _prepare_filename(self, info_dict, *, outtmpl=None, tmpl_type=None): assert None in (outtmpl, tmpl_type), 'outtmpl and tmpl_type are mutually exclusive' if outtmpl is None: - outtmpl = self.outtmpl_dict.get(tmpl_type or 'default', self.outtmpl_dict['default']) + outtmpl = self.params['outtmpl'].get(tmpl_type or 'default', self.params['outtmpl']['default']) try: outtmpl = self._outtmpl_expandpath(outtmpl) filename = self.evaluate_outtmpl(outtmpl, info_dict, True) @@ -1390,7 +1410,7 @@ class YoutubeDL: else: self.report_error('no suitable InfoExtractor for URL %s' % url) - def __handle_extraction_exceptions(func): + def _handle_extraction_exceptions(func): @functools.wraps(func) def wrapper(self, *args, **kwargs): while True: @@ -1463,7 +1483,7 @@ class YoutubeDL: self.to_screen('') raise - @__handle_extraction_exceptions + @_handle_extraction_exceptions def __extract_info(self, url, ie, download, extra_info, process): ie_result = ie.extract(url) if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here) @@ -1529,6 +1549,7 @@ class YoutubeDL: self.add_extra_info(info_copy, extra_info) info_copy, _ = self.pre_process(info_copy) self.__forced_printings(info_copy, self.prepare_filename(info_copy), incomplete=True) + self._raise_pending_errors(info_copy) if self.params.get('force_write_download_archive', False): self.record_download_archive(info_copy) return ie_result @@ -1536,10 +1557,11 @@ class YoutubeDL: if result_type == 'video': self.add_extra_info(ie_result, extra_info) ie_result = self.process_video_result(ie_result, download=download) + self._raise_pending_errors(ie_result) additional_urls = (ie_result or {}).get('additional_urls') if additional_urls: # TODO: Improve MetadataParserPP to allow setting a list - if isinstance(additional_urls, compat_str): + if isinstance(additional_urls, str): additional_urls = [additional_urls] self.to_screen( '[info] %s: %d additional URL(s) requested' % (ie_result['id'], len(additional_urls))) @@ -1570,9 +1592,13 @@ class YoutubeDL: if not info: return info + 
exempted_fields = {'_type', 'url', 'ie_key'} + if not ie_result.get('section_end') and ie_result.get('section_start') is None: + # For video clips, the id etc of the clip extractor should be used + exempted_fields |= {'id', 'extractor', 'extractor_key'} + new_result = info.copy() - new_result.update(filter_dict(ie_result, lambda k, v: ( - v is not None and k not in {'_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'}))) + new_result.update(filter_dict(ie_result, lambda k, v: v is not None and k not in exempted_fields)) # Extracted info may not be a video result (i.e. # info.get('_type', 'video') != video) but rather an url or @@ -1644,112 +1670,31 @@ class YoutubeDL: } def __process_playlist(self, ie_result, download): - # We process each entry in the playlist - playlist = ie_result.get('title') or ie_result.get('id') - self.to_screen('[download] Downloading playlist: %s' % playlist) - - if 'entries' not in ie_result: - raise EntryNotInPlaylist('There are no entries') - - MissingEntry = object() - incomplete_entries = bool(ie_result.get('requested_entries')) - if incomplete_entries: - def fill_missing_entries(entries, indices): - ret = [MissingEntry] * max(indices) - for i, entry in zip(indices, entries): - ret[i - 1] = entry - return ret - ie_result['entries'] = fill_missing_entries(ie_result['entries'], ie_result['requested_entries']) - - playlist_results = [] - - playliststart = self.params.get('playliststart', 1) - playlistend = self.params.get('playlistend') - # For backwards compatibility, interpret -1 as whole list - if playlistend == -1: - playlistend = None - - playlistitems_str = self.params.get('playlist_items') - playlistitems = None - if playlistitems_str is not None: - def iter_playlistitems(format): - for string_segment in format.split(','): - if '-' in string_segment: - start, end = string_segment.split('-') - for item in range(int(start), int(end) + 1): - yield int(item) - else: - yield int(string_segment) - playlistitems = orderedSet(iter_playlistitems(playlistitems_str)) + """Process each entry in the playlist""" + title = ie_result.get('title') or ie_result.get('id') or '<Untitled>' + self.to_screen(f'[download] Downloading playlist: {title}') - ie_entries = ie_result['entries'] - if isinstance(ie_entries, list): - playlist_count = len(ie_entries) - msg = f'Collected {playlist_count} videos; downloading %d of them' - ie_result['playlist_count'] = ie_result.get('playlist_count') or playlist_count + all_entries = PlaylistEntries(self, ie_result) + entries = orderedSet(all_entries.get_requested_items(), lazy=True) - def get_entry(i): - return ie_entries[i - 1] + lazy = self.params.get('lazy_playlist') + if lazy: + resolved_entries, n_entries = [], 'N/A' + ie_result['requested_entries'], ie_result['entries'] = None, None else: - msg = 'Downloading %d videos' - if not isinstance(ie_entries, (PagedList, LazyList)): - ie_entries = LazyList(ie_entries) - elif isinstance(ie_entries, InAdvancePagedList): - if ie_entries._pagesize == 1: - playlist_count = ie_entries._pagecount - - def get_entry(i): - return YoutubeDL.__handle_extraction_exceptions( - lambda self, i: ie_entries[i - 1] - )(self, i) - - entries, broken = [], False - items = playlistitems if playlistitems is not None else itertools.count(playliststart) - for i in items: - if i == 0: - continue - if playlistitems is None and playlistend is not None and playlistend < i: - break - entry = None - try: - entry = get_entry(i) - if entry is MissingEntry: - raise EntryNotInPlaylist() - except (IndexError, 
EntryNotInPlaylist): - if incomplete_entries: - raise EntryNotInPlaylist(f'Entry {i} cannot be found') - elif not playlistitems: - break - entries.append(entry) - try: - if entry is not None: - # TODO: Add auto-generated fields - self._match_entry(entry, incomplete=True, silent=True) - except (ExistingVideoReached, RejectedVideoReached): - broken = True - break - ie_result['entries'] = entries - - # Save playlist_index before re-ordering - entries = [ - ((playlistitems[i - 1] if playlistitems else i + playliststart - 1), entry) - for i, entry in enumerate(entries, 1) - if entry is not None] - n_entries = len(entries) - - if not (ie_result.get('playlist_count') or broken or playlistitems or playlistend): - ie_result['playlist_count'] = n_entries - - if not playlistitems and (playliststart != 1 or playlistend): - playlistitems = list(range(playliststart, playliststart + n_entries)) - ie_result['requested_entries'] = playlistitems + entries = resolved_entries = list(entries) + n_entries = len(resolved_entries) + ie_result['requested_entries'], ie_result['entries'] = tuple(zip(*resolved_entries)) or ([], []) + if not ie_result.get('playlist_count'): + # Better to do this after potentially exhausting entries + ie_result['playlist_count'] = all_entries.get_full_count() _infojson_written = False write_playlist_files = self.params.get('allow_playlist_files', True) if write_playlist_files and self.params.get('list_thumbnails'): self.list_thumbnails(ie_result) if write_playlist_files and not self.params.get('simulate'): - ie_copy = self._playlist_infodict(ie_result, n_entries=n_entries) + ie_copy = self._playlist_infodict(ie_result, n_entries=int_or_none(n_entries)) _infojson_written = self._write_info_json( 'playlist', ie_result, self.prepare_filename(ie_copy, 'pl_infojson')) if _infojson_written is None: @@ -1760,33 +1705,41 @@ class YoutubeDL: # TODO: This should be passed to ThumbnailsConvertor if necessary self._write_thumbnails('playlist', ie_copy, self.prepare_filename(ie_copy, 'pl_thumbnail')) - if self.params.get('playlistreverse', False): - entries = entries[::-1] - if self.params.get('playlistrandom', False): + if lazy: + if self.params.get('playlistreverse') or self.params.get('playlistrandom'): + self.report_warning('playlistreverse and playlistrandom are not supported with lazy_playlist', only_once=True) + elif self.params.get('playlistreverse'): + entries.reverse() + elif self.params.get('playlistrandom'): random.shuffle(entries) - x_forwarded_for = ie_result.get('__x_forwarded_for_ip') + self.to_screen(f'[{ie_result["extractor"]}] Playlist {title}: Downloading {n_entries} videos' + f'{format_field(ie_result, "playlist_count", " of %s")}') - self.to_screen(f'[{ie_result["extractor"]}] playlist {playlist}: {msg % n_entries}') failures = 0 max_failures = self.params.get('skip_playlist_after_errors') or float('inf') - for i, entry_tuple in enumerate(entries, 1): - playlist_index, entry = entry_tuple - if 'playlist-index' in self.params.get('compat_opts', []): - playlist_index = playlistitems[i - 1] if playlistitems else i + playliststart - 1 + for i, (playlist_index, entry) in enumerate(entries): + if lazy: + resolved_entries.append((playlist_index, entry)) + + # TODO: Add auto-generated fields + if not entry or self._match_entry(entry, incomplete=True) is not None: + continue + self.to_screen('[download] Downloading video %s of %s' % ( - self._format_screen(i, self.Styles.ID), self._format_screen(n_entries, self.Styles.EMPHASIS))) - # This __x_forwarded_for_ip thing is a bit ugly but 
requires - # minimal changes - if x_forwarded_for: - entry['__x_forwarded_for_ip'] = x_forwarded_for - extra = { - 'n_entries': n_entries, - '__last_playlist_index': max(playlistitems) if playlistitems else (playlistend or n_entries), + self._format_screen(i + 1, self.Styles.ID), self._format_screen(n_entries, self.Styles.EMPHASIS))) + + entry['__x_forwarded_for_ip'] = ie_result.get('__x_forwarded_for_ip') + if not lazy and 'playlist-index' in self.params.get('compat_opts', []): + playlist_index = ie_result['requested_entries'][i] + + entry_result = self.__process_iterable_entry(entry, download, { + 'n_entries': int_or_none(n_entries), + '__last_playlist_index': max(ie_result['requested_entries'] or (0, 0)), 'playlist_count': ie_result.get('playlist_count'), 'playlist_index': playlist_index, - 'playlist_autonumber': i, - 'playlist': playlist, + 'playlist_autonumber': i + 1, + 'playlist': title, 'playlist_id': ie_result.get('id'), 'playlist_title': ie_result.get('title'), 'playlist_uploader': ie_result.get('uploader'), @@ -1796,20 +1749,17 @@ class YoutubeDL: 'webpage_url_basename': url_basename(ie_result['webpage_url']), 'webpage_url_domain': get_domain(ie_result['webpage_url']), 'extractor_key': ie_result['extractor_key'], - } - - if self._match_entry(entry, incomplete=True) is not None: - continue - - entry_result = self.__process_iterable_entry(entry, download, extra) + }) if not entry_result: failures += 1 if failures >= max_failures: self.report_error( - 'Skipping the remaining entries in playlist "%s" since %d items failed extraction' % (playlist, failures)) + f'Skipping the remaining entries in playlist "{title}" since {failures} items failed extraction') break - playlist_results.append(entry_result) - ie_result['entries'] = playlist_results + resolved_entries[i] = (playlist_index, entry_result) + + # Update with processed data + ie_result['requested_entries'], ie_result['entries'] = tuple(zip(*resolved_entries)) or ([], []) # Write the updated info to json if _infojson_written is True and self._write_info_json( @@ -1818,10 +1768,10 @@ class YoutubeDL: return ie_result = self.run_all_pps('playlist', ie_result) - self.to_screen(f'[download] Finished downloading playlist: {playlist}') + self.to_screen(f'[download] Finished downloading playlist: {title}') return ie_result - @__handle_extraction_exceptions + @_handle_extraction_exceptions def __process_iterable_entry(self, entry, download, extra_info): return self.process_ie_result( entry, download=download, extra_info=extra_info) @@ -1903,7 +1853,7 @@ class YoutubeDL: temp_file.close() try: success, _ = self.dl(temp_file.name, f, test=True) - except (DownloadError, IOError, OSError, ValueError) + network_exceptions: + except (DownloadError, OSError, ValueError) + network_exceptions: success = False finally: if os.path.exists(temp_file.name): @@ -1927,12 +1877,12 @@ class YoutubeDL: and download and ( not can_merge() - or info_dict.get('is_live', False) - or self.outtmpl_dict['default'] == '-')) + or info_dict.get('is_live') and not self.params.get('live_from_start') + or self.params['outtmpl']['default'] == '-')) compat = ( prefer_best or self.params.get('allow_multiple_audio_streams', False) - or 'format-spec' in self.params.get('compat_opts', [])) + or 'format-spec' in self.params['compat_opts']) return ( 'best/bestvideo+bestaudio' if prefer_best @@ -2273,7 +2223,7 @@ class YoutubeDL: def _calc_headers(self, info_dict): res = merge_headers(self.params['http_headers'], info_dict.get('http_headers') or {}) - cookies = 
self._calc_cookies(info_dict) + cookies = self._calc_cookies(info_dict['url']) if cookies: res['Cookie'] = cookies @@ -2284,8 +2234,8 @@ class YoutubeDL: return res - def _calc_cookies(self, info_dict): - pr = sanitized_Request(info_dict['url']) + def _calc_cookies(self, url): + pr = sanitized_Request(url) self.cookiejar.add_cookie_header(pr) return pr.get_header('Cookie') @@ -2383,6 +2333,11 @@ class YoutubeDL: if info_dict.get('%s_number' % field) is not None and not info_dict.get(field): info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field]) + def _raise_pending_errors(self, info): + err = info.pop('__pending_error', None) + if err: + self.report_error(err, tb=False) + def process_video_result(self, info_dict, download=True): assert info_dict.get('_type', 'video') == 'video' self._num_videos += 1 @@ -2399,10 +2354,10 @@ class YoutubeDL: def sanitize_string_field(info, string_field): field = info.get(string_field) - if field is None or isinstance(field, compat_str): + if field is None or isinstance(field, str): return report_force_conversion(string_field, 'a string', 'string') - info[string_field] = compat_str(field) + info[string_field] = str(field) def sanitize_numeric_fields(info): for numeric_field in self._NUMERIC_FIELDS: @@ -2414,9 +2369,20 @@ class YoutubeDL: sanitize_string_field(info_dict, 'id') sanitize_numeric_fields(info_dict) + if info_dict.get('section_end') and info_dict.get('section_start') is not None: + info_dict['duration'] = round(info_dict['section_end'] - info_dict['section_start'], 3) if (info_dict.get('duration') or 0) <= 0 and info_dict.pop('duration', None): self.report_warning('"duration" field is negative, there is an error in extractor') + chapters = info_dict.get('chapters') or [] + dummy_chapter = {'end_time': 0, 'start_time': info_dict.get('duration')} + for prev, current, next_ in zip( + (dummy_chapter, *chapters), chapters, (*chapters[1:], dummy_chapter)): + if current.get('start_time') is None: + current['start_time'] = prev.get('end_time') + if not current.get('end_time'): + current['end_time'] = next_.get('start_time') + if 'playlist' not in info_dict: # It isn't part of a playlist info_dict['playlist'] = None @@ -2503,7 +2469,7 @@ class YoutubeDL: sanitize_numeric_fields(format) format['url'] = sanitize_url(format['url']) if not format.get('format_id'): - format['format_id'] = compat_str(i) + format['format_id'] = str(i) else: # Sanitize format_id from characters used in format selector expression format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id']) @@ -2541,7 +2507,7 @@ class YoutubeDL: format['dynamic_range'] = 'SDR' if (info_dict.get('duration') and format.get('tbr') and not format.get('filesize') and not format.get('filesize_approx')): - format['filesize_approx'] = info_dict['duration'] * format['tbr'] * (1024 / 8) + format['filesize_approx'] = int(info_dict['duration'] * format['tbr'] * (1024 / 8)) # Add HTTP headers, so that external programs can use them from the # json output @@ -2588,7 +2554,7 @@ class YoutubeDL: if list_only: # Without this printing, -F --print-json will not work self.__forced_printings(info_dict, self.prepare_filename(info_dict), incomplete=True) - return + return info_dict format_selector = self.format_selector if format_selector is None: @@ -2629,20 +2595,40 @@ class YoutubeDL: # Process what we can, even without any available formats. 
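[Editor's note] The hunk that follows pairs every selected format with every requested download range via itertools.product. A minimal, self-contained sketch of that pairing, using hypothetical formats and time ranges in place of real yt-dlp info dicts:

    import itertools

    # Hypothetical inputs: two selected formats, two requested time ranges
    formats_to_download = [{'format_id': '137'}, {'format_id': '140'}]
    requested_ranges = [{'start_time': 70, 'end_time': 140},
                        {'start_time': 300, 'end_time': 360}]

    downloaded_formats = []
    for fmt, chapter in itertools.product(formats_to_download, requested_ranges or [{}]):
        new_info = dict(fmt)  # in yt-dlp this is a copy of the full info dict
        if chapter:
            # section_start/section_end are what drive the partial download
            new_info.update({'section_start': chapter['start_time'],
                             'section_end': chapter['end_time']})
        downloaded_formats.append(new_info)

    assert len(downloaded_formats) == 4  # one download per (format, range) pair

Note the `or [{}]` fallback: with no ranges requested, each format is still processed exactly once.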
formats_to_download = [{}] - best_format = formats_to_download[-1] + requested_ranges = self.params.get('download_ranges') + if requested_ranges: + requested_ranges = tuple(requested_ranges(info_dict, self)) + + best_format, downloaded_formats = formats_to_download[-1], [] if download: if best_format: - self.to_screen( - f'[info] {info_dict["id"]}: Downloading {len(formats_to_download)} format(s): ' - + ', '.join([f['format_id'] for f in formats_to_download])) + def to_screen(*msg): + self.to_screen(f'[info] {info_dict["id"]}: {" ".join(", ".join(variadic(m)) for m in msg)}') + + to_screen(f'Downloading {len(formats_to_download)} format(s):', + (f['format_id'] for f in formats_to_download)) + if requested_ranges: + to_screen(f'Downloading {len(requested_ranges)} time ranges:', + (f'{int(c["start_time"])}-{int(c["end_time"])}' for c in requested_ranges)) max_downloads_reached = False - for i, fmt in enumerate(formats_to_download): - formats_to_download[i] = new_info = self._copy_infodict(info_dict) + + for fmt, chapter in itertools.product(formats_to_download, requested_ranges or [{}]): + new_info = self._copy_infodict(info_dict) new_info.update(fmt) + offset, duration = info_dict.get('section_start') or 0, info_dict.get('duration') or float('inf') + if chapter or offset: + new_info.update({ + 'section_start': offset + chapter.get('start_time', 0), + 'section_end': offset + min(chapter.get('end_time', duration), duration), + 'section_title': chapter.get('title'), + 'section_number': chapter.get('index'), + }) + downloaded_formats.append(new_info) try: self.process_info(new_info) except MaxDownloadsReached: max_downloads_reached = True + self._raise_pending_errors(new_info) # Remove copied info for key, val in tuple(new_info.items()): if info_dict.get(key) == val: @@ -2650,12 +2636,12 @@ class YoutubeDL: if max_downloads_reached: break - write_archive = {f.get('__write_download_archive', False) for f in formats_to_download} + write_archive = {f.get('__write_download_archive', False) for f in downloaded_formats} assert write_archive.issubset({True, False, 'ignore'}) if True in write_archive and False not in write_archive: self.record_download_archive(info_dict) - info_dict['requested_downloads'] = formats_to_download + info_dict['requested_downloads'] = downloaded_formats info_dict = self.run_all_pps('after_video', info_dict) if max_downloads_reached: raise MaxDownloadsReached() @@ -2877,8 +2863,13 @@ class YoutubeDL: # Forced printings self.__forced_printings(info_dict, full_filename, incomplete=('format' not in info_dict)) + def check_max_downloads(): + if self._num_downloads >= float(self.params.get('max_downloads') or 'inf'): + raise MaxDownloadsReached() + if self.params.get('simulate'): info_dict['__write_download_archive'] = self.params.get('force_write_download_archive') + check_max_downloads() return if full_filename is None: @@ -2982,12 +2973,8 @@ class YoutubeDL: info_dict.clear() info_dict.update(new_info) - try: - new_info, files_to_move = self.pre_process(info_dict, 'before_dl', files_to_move) - replace_info_dict(new_info) - except PostProcessingError as err: - self.report_error('Preprocessing: %s' % str(err)) - return + new_info, files_to_move = self.pre_process(info_dict, 'before_dl', files_to_move) + replace_info_dict(new_info) if self.params.get('skip_download'): info_dict['filepath'] = temp_filename @@ -3009,7 +2996,16 @@ class YoutubeDL: info_dict['ext'] = os.path.splitext(file)[1][1:] return file - success = True + fd, success = None, True + if info_dict.get('protocol') 
or info_dict.get('url'): + fd = get_suitable_downloader(info_dict, self.params, to_stdout=temp_filename == '-') + if fd is not FFmpegFD and ( + info_dict.get('section_start') or info_dict.get('section_end')): + msg = ('This format cannot be partially downloaded' if FFmpegFD.available() + else 'You have requested downloading the video partially, but ffmpeg is not installed') + self.report_error(f'{msg}. Aborting') + return + if info_dict.get('requested_formats') is not None: def compatible_formats(formats): @@ -3042,7 +3038,7 @@ class YoutubeDL: and info_dict.get('thumbnails') # check with type instead of pp_key, __name__, or isinstance # since we dont want any custom PPs to trigger this - and any(type(pp) == EmbedThumbnailPP for pp in self._pps['post_process'])): + and any(type(pp) == EmbedThumbnailPP for pp in self._pps['post_process'])): # noqa: E721 info_dict['ext'] = 'mkv' self.report_warning( 'webm doesn\'t support embedding a thumbnail, mkv will be used') @@ -3064,10 +3060,8 @@ class YoutubeDL: dl_filename = existing_video_file(full_filename, temp_filename) info_dict['__real_download'] = False - downloaded = [] merger = FFmpegMergerPP(self) - - fd = get_suitable_downloader(info_dict, self.params, to_stdout=temp_filename == '-') + downloaded = [] if dl_filename is not None: self.report_file_already_downloaded(dl_filename) elif fd: @@ -3147,6 +3141,7 @@ class YoutubeDL: self.report_error(f'content too short (expected {err.expected} bytes and served {err.downloaded})') return + self._raise_pending_errors(info_dict) if success and full_filename != '-': def fixup(): @@ -3216,15 +3211,10 @@ class YoutubeDL: return info_dict['__write_download_archive'] = True + assert info_dict is original_infodict # Make sure the info_dict was modified in-place if self.params.get('force_write_download_archive'): info_dict['__write_download_archive'] = True - - # Make sure the info_dict was modified in-place - assert info_dict is original_infodict - - max_downloads = self.params.get('max_downloads') - if max_downloads is not None and self._num_downloads >= int(max_downloads): - raise MaxDownloadsReached() + check_max_downloads() def __download_wrapper(self, func): @functools.wraps(func) @@ -3246,7 +3236,7 @@ class YoutubeDL: def download(self, url_list): """Download a given list of URLs.""" url_list = variadic(url_list) # Passing a single URL is a common mistake - outtmpl = self.outtmpl_dict['default'] + outtmpl = self.params['outtmpl']['default'] if (len(url_list) > 1 and outtmpl != '-' and '%' not in outtmpl @@ -3367,7 +3357,12 @@ class YoutubeDL: def pre_process(self, ie_info, key='pre_process', files_to_move=None): info = dict(ie_info) info['__files_to_move'] = files_to_move or {} - info = self.run_all_pps(key, info) + try: + info = self.run_all_pps(key, info) + except PostProcessingError as err: + msg = f'Preprocessing: {err}' + info.setdefault('__pending_error', msg) + self.report_error(msg, is_error=False) return info, info.pop('__files_to_move', None) def post_process(self, filename, info, files_to_move=None): @@ -3437,7 +3432,7 @@ class YoutubeDL: def _list_format_headers(self, *headers): if self.params.get('listformats_table', True) is not False: - return [self._format_screen(header, self.Styles.HEADERS) for header in headers] + return [self._format_out(header, self.Styles.HEADERS) for header in headers] return headers def _format_note(self, fdict): @@ -3515,10 +3510,10 @@ class YoutubeDL: ] for f in formats if f.get('preference') is None or f['preference'] >= -1000] return render_table(['format 
code', 'extension', 'resolution', 'note'], table, extra_gap=1) - delim = self._format_screen('\u2502', self.Styles.DELIM, '|', test_encoding=True) + delim = self._format_out('\u2502', self.Styles.DELIM, '|', test_encoding=True) table = [ [ - self._format_screen(format_field(f, 'format_id'), self.Styles.ID), + self._format_out(format_field(f, 'format_id'), self.Styles.ID), format_field(f, 'ext'), format_field(f, func=self.format_resolution, ignore=('audio only', 'images')), format_field(f, 'fps', '\t%d'), @@ -3530,15 +3525,15 @@ class YoutubeDL: delim, format_field(f, 'vcodec', default='unknown').replace( 'none', 'images' if f.get('acodec') == 'none' - else self._format_screen('audio only', self.Styles.SUPPRESS)), + else self._format_out('audio only', self.Styles.SUPPRESS)), format_field(f, 'vbr', '\t%dk'), format_field(f, 'acodec', default='unknown').replace( 'none', '' if f.get('vcodec') == 'none' - else self._format_screen('video only', self.Styles.SUPPRESS)), + else self._format_out('video only', self.Styles.SUPPRESS)), format_field(f, 'abr', '\t%dk'), format_field(f, 'asr', '\t%dHz'), join_nonempty( - self._format_screen('UNSUPPORTED', 'light red') if f.get('ext') in ('f4f', 'f4m') else None, + self._format_out('UNSUPPORTED', 'light red') if f.get('ext') in ('f4f', 'f4m') else None, format_field(f, 'language', '[%s]'), join_nonempty(format_field(f, 'format_note'), format_field(f, 'container', ignore=(None, f.get('ext'))), @@ -3551,7 +3546,7 @@ class YoutubeDL: return render_table( header_line, table, hide_empty=True, - delim=self._format_screen('\u2500', self.Styles.DELIM, '-', test_encoding=True)) + delim=self._format_out('\u2500', self.Styles.DELIM, '-', test_encoding=True)) def render_thumbnails_table(self, info_dict): thumbnails = list(info_dict.get('thumbnails') or []) @@ -3602,18 +3597,25 @@ class YoutubeDL: if not self.params.get('verbose'): return + # These imports can be slow. 
So import them only as needed + from .extractor.extractors import _LAZY_LOADER + from .extractor.extractors import _PLUGIN_CLASSES as plugin_extractors + def get_encoding(stream): ret = str(getattr(stream, 'encoding', 'missing (%s)' % type(stream).__name__)) if not supports_terminal_sequences(stream): - from .compat import WINDOWS_VT_MODE # Must be imported locally + from .utils import WINDOWS_VT_MODE # Must be imported locally ret += ' (No VT)' if WINDOWS_VT_MODE is False else ' (No ANSI)' return ret - encoding_str = 'Encodings: locale %s, fs %s, out %s, err %s, pref %s' % ( + encoding_str = 'Encodings: locale %s, fs %s, pref %s, %s' % ( locale.getpreferredencoding(), sys.getfilesystemencoding(), - get_encoding(self._out_files['screen']), get_encoding(self._out_files['error']), - self.get_encoding()) + self.get_encoding(), + ', '.join( + f'{key} {get_encoding(stream)}' for key, stream in self._out_files.items_ + if stream is not None and key != 'console') + ) logger = self.params.get('logger') if logger: @@ -3638,19 +3640,17 @@ class YoutubeDL: write_debug('Plugins: %s' % [ '%s%s' % (klass.__name__, '' if klass.__name__ == name else f' as {name}') for name, klass in itertools.chain(plugin_extractors.items(), plugin_postprocessors.items())]) - if self.params.get('compat_opts'): - write_debug('Compatibility options: %s' % ', '.join(self.params.get('compat_opts'))) + if self.params['compat_opts']: + write_debug('Compatibility options: %s' % ', '.join(self.params['compat_opts'])) if source == 'source': try: - sp = Popen( + stdout, _, _ = Popen.run( ['git', 'rev-parse', '--short', 'HEAD'], - stdout=subprocess.PIPE, stderr=subprocess.PIPE, - cwd=os.path.dirname(os.path.abspath(__file__))) - out, err = sp.communicate_or_kill() - out = out.decode().strip() - if re.match('[0-9a-f]+', out): - write_debug('Git HEAD: %s' % out) + text=True, cwd=os.path.dirname(os.path.abspath(__file__)), + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + if re.fullmatch('[0-9a-f]+', stdout.strip()): + write_debug(f'Git HEAD: {stdout.strip()}') except Exception: with contextlib.suppress(Exception): sys.exc_clear() @@ -3724,7 +3724,7 @@ class YoutubeDL: else: proxies = {'http': opts_proxy, 'https': opts_proxy} else: - proxies = compat_urllib_request.getproxies() + proxies = urllib.request.getproxies() # Set HTTPS proxy to HTTP one if given (https://github.com/ytdl-org/youtube-dl/issues/805) if 'http' in proxies and 'https' not in proxies: proxies['https'] = proxies['http'] @@ -3740,13 +3740,13 @@ class YoutubeDL: # default FileHandler and allows us to disable the file protocol, which # can be used for malicious purposes (see # https://github.com/ytdl-org/youtube-dl/issues/8227) - file_handler = compat_urllib_request.FileHandler() + file_handler = urllib.request.FileHandler() def file_open(*args, **kwargs): - raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in yt-dlp for security reasons') + raise urllib.error.URLError('file:// scheme is explicitly disabled in yt-dlp for security reasons') file_handler.file_open = file_open - opener = compat_urllib_request.build_opener( + opener = urllib.request.build_opener( proxy_handler, https_handler, cookie_processor, ydlh, redirect_handler, data_handler, file_handler) # Delete the default user-agent header, which would otherwise apply in diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 386996e16..0c68f8571 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -3,15 +3,18 @@ f'You are using an unsupported version of Python. 
Only Python versions 3.6 and a __license__ = 'CC0-1.0' +import getpass import itertools +import optparse import os import re import sys -from .compat import compat_getpass, compat_os_name, compat_shlex_quote +from .compat import compat_shlex_quote from .cookies import SUPPORTED_BROWSERS, SUPPORTED_KEYRINGS from .downloader import FileDownloader -from .extractor import GenericIE, list_extractor_classes +from .downloader.external import get_external_downloader +from .extractor import list_extractor_classes from .extractor.adobepass import MSO_INFO from .extractor.common import InfoExtractor from .options import parseOpts @@ -24,7 +27,7 @@ from .postprocessor import ( MetadataFromFieldPP, MetadataParserPP, ) -from .update import run_update +from .update import Updater from .utils import ( NO_DEFAULT, POSTPROCESS_WHEN, @@ -32,42 +35,47 @@ from .utils import ( DownloadCancelled, DownloadError, GeoUtils, + PlaylistEntries, SameFileError, decodeOption, + download_range_func, expand_path, float_or_none, + format_field, int_or_none, match_filter_func, parse_duration, preferredencoding, read_batch_urls, + read_stdin, render_table, setproctitle, std_headers, traverse_obj, + variadic, write_string, ) from .YoutubeDL import YoutubeDL +def _exit(status=0, *args): + for msg in args: + sys.stderr.write(msg) + raise SystemExit(status) + + def get_urls(urls, batchfile, verbose): # Batch file verification batch_urls = [] if batchfile is not None: try: - if batchfile == '-': - write_string('Reading URLs from stdin - EOF (%s) to end:\n' % ( - 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D')) - batchfd = sys.stdin - else: - batchfd = io.open( - expand_path(batchfile), - 'r', encoding='utf-8', errors='ignore') - batch_urls = read_batch_urls(batchfd) + batch_urls = read_batch_urls( + read_stdin('URLs') if batchfile == '-' + else open(expand_path(batchfile), encoding='utf-8', errors='ignore')) if verbose: write_string('[debug] Batch file urls: ' + repr(batch_urls) + '\n') - except IOError: - sys.exit('ERROR: batch file %s could not be read' % batchfile) + except OSError: + _exit(f'ERROR: batch file {batchfile} could not be read') _enc = preferredencoding() return [ url.strip().decode(_enc, 'ignore') if isinstance(url, bytes) else url.strip() @@ -75,6 +83,11 @@ def get_urls(urls, batchfile, verbose): def print_extractor_information(opts, urls): + # Importing GenericIE is currently slow since it imports other extractors + # TODO: Move this back to module level after generalization of embed detection + from .extractor.generic import GenericIE + + out = '' if opts.list_extractors: for ie in list_extractors(opts.age_limit): write_string(ie.IE_NAME + (' (CURRENTLY BROKEN)' if not ie.working() else '') + '\n', out=sys.stdout) @@ -210,15 +223,11 @@ def validate_options(opts): validate_regex('format sorting', f, InfoExtractor.FormatSort.regex) # Postprocessor formats - validate_in('audio format', opts.audioformat, ['best'] + list(FFmpegExtractAudioPP.SUPPORTED_EXTS)) + validate_regex('audio format', opts.audioformat, FFmpegExtractAudioPP.FORMAT_RE) validate_in('subtitle format', opts.convertsubtitles, FFmpegSubtitlesConvertorPP.SUPPORTED_EXTS) - validate_in('thumbnail format', opts.convertthumbnails, FFmpegThumbnailsConvertorPP.SUPPORTED_EXTS) - if opts.recodevideo is not None: - opts.recodevideo = opts.recodevideo.replace(' ', '') - validate_regex('video recode format', opts.recodevideo, FFmpegVideoConvertorPP.FORMAT_RE) - if opts.remuxvideo is not None: - opts.remuxvideo = opts.remuxvideo.replace(' ', '') - 
validate_regex('video remux format', opts.remuxvideo, FFmpegVideoRemuxerPP.FORMAT_RE) + validate_regex('thumbnail format', opts.convertthumbnails, FFmpegThumbnailsConvertorPP.FORMAT_RE) + validate_regex('recode video format', opts.recodevideo, FFmpegVideoConvertorPP.FORMAT_RE) + validate_regex('remux video format', opts.remuxvideo, FFmpegVideoRemuxerPP.FORMAT_RE) if opts.audioquality: opts.audioquality = opts.audioquality.strip('k').strip('K') # int_or_none prevents inf, nan @@ -240,6 +249,28 @@ def validate_options(opts): opts.extractor_retries = parse_retries('extractor', opts.extractor_retries) opts.file_access_retries = parse_retries('file access', opts.file_access_retries) + # Retry sleep function + def parse_sleep_func(expr): + NUMBER_RE = r'\d+(?:\.\d+)?' + op, start, limit, step, *_ = tuple(re.fullmatch( + rf'(?:(linear|exp)=)?({NUMBER_RE})(?::({NUMBER_RE})?)?(?::({NUMBER_RE}))?', + expr.strip()).groups()) + (None, None) + + if op == 'exp': + return lambda n: min(float(start) * (float(step or 2) ** n), float(limit or 'inf')) + else: + default_step = start if op or limit else 0 + return lambda n: min(float(start) + float(step or default_step) * n, float(limit or 'inf')) + + for key, expr in opts.retry_sleep.items(): + if not expr: + del opts.retry_sleep[key] + continue + try: + opts.retry_sleep[key] = parse_sleep_func(expr) + except AttributeError: + raise ValueError(f'invalid {key} retry sleep expression {expr!r}') + # Bytes def parse_bytes(name, value): if value is None: @@ -284,20 +315,25 @@ def validate_options(opts): 'Cannot download a video and extract audio into the same file! ' f'Use "{outtmpl_default}.%(ext)s" instead of "{outtmpl_default}" as the output template') - # Remove chapters - remove_chapters_patterns, opts.remove_ranges = [], [] - for regex in opts.remove_chapters or []: - if regex.startswith('*'): - dur = list(map(parse_duration, regex[1:].split('-'))) - if len(dur) == 2 and all(t is not None for t in dur): - opts.remove_ranges.append(tuple(dur)) + def parse_chapters(name, value): + chapters, ranges = [], [] + for regex in value or []: + if regex.startswith('*'): + for range in regex[1:].split(','): + dur = tuple(map(parse_duration, range.strip().split('-'))) + if len(dur) == 2 and all(t is not None for t in dur): + ranges.append(dur) + else: + raise ValueError(f'invalid {name} time range "{regex}". Must be of the form *start-end') continue - raise ValueError(f'invalid --remove-chapters time range "{regex}". 
Must be of the form *start-end') - try: - remove_chapters_patterns.append(re.compile(regex)) - except re.error as err: - raise ValueError(f'invalid --remove-chapters regex "{regex}" - {err}') - opts.remove_chapters = remove_chapters_patterns + try: + chapters.append(re.compile(regex)) + except re.error as err: + raise ValueError(f'invalid {name} regex "{regex}" - {err}') + return chapters, ranges + + opts.remove_chapters, opts.remove_ranges = parse_chapters('--remove-chapters', opts.remove_chapters) + opts.download_ranges = download_range_func(*parse_chapters('--download-sections', opts.download_ranges)) # Cookies from browser if opts.cookiesfrombrowser: @@ -341,6 +377,12 @@ def validate_options(opts): opts.parse_metadata = list(itertools.chain(*map(metadataparser_actions, parse_metadata))) # Other options + if opts.playlist_items is not None: + try: + tuple(PlaylistEntries.parse_playlist_items(opts.playlist_items)) + except Exception as err: + raise ValueError(f'Invalid playlist-items {opts.playlist_items!r}: {err}') + geo_bypass_code = opts.geo_bypass_ip_block or opts.geo_bypass_country if geo_bypass_code is not None: try: @@ -361,6 +403,17 @@ def validate_options(opts): if opts.no_sponsorblock: opts.sponsorblock_mark = opts.sponsorblock_remove = set() + default_downloader = None + for proto, path in opts.external_downloader.items(): + if path == 'native': + continue + ed = get_external_downloader(path) + if ed is None: + raise ValueError( + f'No such {format_field(proto, None, "%s ", ignore="default")}external downloader "{path}"') + elif ed and proto == 'default': + default_downloader = ed.get_basename() + warnings, deprecation_warnings = [], [] # Common mistake: -f best @@ -371,13 +424,18 @@ def validate_options(opts): 'If you know what you are doing and want only the best pre-merged format, use "-f b" instead to suppress this warning'))) # --(postprocessor/downloader)-args without name - def report_args_compat(name, value, key1, key2=None): + def report_args_compat(name, value, key1, key2=None, where=None): if key1 in value and key2 not in value: - warnings.append(f'{name} arguments given without specifying name. The arguments will be given to all {name}s') + warnings.append(f'{name.title()} arguments given without specifying name. ' + f'The arguments will be given to {where or f"all {name}s"}') return True return False - report_args_compat('external downloader', opts.external_downloader_args, 'default') + if report_args_compat('external downloader', opts.external_downloader_args, + 'default', where=default_downloader) and default_downloader: + # Compat with youtube-dl's behavior. 
See https://github.com/ytdl-org/youtube-dl/commit/49c5293014bc11ec8c009856cd63cffa6296c1e1 + opts.external_downloader_args.setdefault(default_downloader, opts.external_downloader_args.pop('default')) + if report_args_compat('post-processor', opts.postprocessor_args, 'default-compat', 'default'): opts.postprocessor_args['default'] = opts.postprocessor_args.pop('default-compat') opts.postprocessor_args.setdefault('sponskrub', []) @@ -396,6 +454,9 @@ def validate_options(opts): setattr(opts, opt1, default) # Conflicting options + report_conflict('--playlist-reverse', 'playlist_reverse', '--playlist-random', 'playlist_random') + report_conflict('--playlist-reverse', 'playlist_reverse', '--lazy-playlist', 'lazy_playlist') + report_conflict('--playlist-random', 'playlist_random', '--lazy-playlist', 'lazy_playlist') report_conflict('--dateafter', 'dateafter', '--date', 'date', default=None) report_conflict('--datebefore', 'datebefore', '--date', 'date', default=None) report_conflict('--exec-before-download', 'exec_before_dl_cmd', '"--exec before_dl:"', 'exec_cmd', opts.exec_cmd.get('before_dl')) @@ -470,9 +531,9 @@ def validate_options(opts): # Ask for passwords if opts.username is not None and opts.password is None: - opts.password = compat_getpass('Type account password and press [Return]: ') + opts.password = getpass.getpass('Type account password and press [Return]: ') if opts.ap_username is not None and opts.ap_password is None: - opts.ap_password = compat_getpass('Type TV provider account password and press [Return]: ') + opts.ap_password = getpass.getpass('Type TV provider account password and press [Return]: ') return warnings, deprecation_warnings @@ -626,7 +687,7 @@ def parse_options(argv=None): final_ext = ( opts.recodevideo if opts.recodevideo in FFmpegVideoConvertorPP.SUPPORTED_EXTS else opts.remuxvideo if opts.remuxvideo in FFmpegVideoRemuxerPP.SUPPORTED_EXTS - else opts.audioformat if (opts.extractaudio and opts.audioformat != 'best') + else opts.audioformat if (opts.extractaudio and opts.audioformat in FFmpegExtractAudioPP.SUPPORTED_EXTS) else None) return parser, opts, urls, { @@ -682,6 +743,7 @@ def parse_options(argv=None): 'file_access_retries': opts.file_access_retries, 'fragment_retries': opts.fragment_retries, 'extractor_retries': opts.extractor_retries, + 'retry_sleep_functions': opts.retry_sleep, 'skip_unavailable_fragments': opts.skip_unavailable_fragments, 'keep_fragments': opts.keep_fragments, 'concurrent_fragment_downloads': opts.concurrent_fragment_downloads, @@ -696,6 +758,7 @@ def parse_options(argv=None): 'playlistend': opts.playlistend, 'playlistreverse': opts.playlist_reverse, 'playlistrandom': opts.playlist_random, + 'lazy_playlist': opts.lazy_playlist, 'noplaylist': opts.noplaylist, 'logtostderr': opts.outtmpl.get('default') == '-', 'consoletitle': opts.consoletitle, @@ -727,6 +790,7 @@ def parse_options(argv=None): 'verbose': opts.verbose, 'dump_intermediate_pages': opts.dump_intermediate_pages, 'write_pages': opts.write_pages, + 'load_pages': opts.load_pages, 'test': opts.test, 'keepvideo': opts.keepvideo, 'min_filesize': opts.min_filesize, @@ -775,6 +839,8 @@ def parse_options(argv=None): 'max_sleep_interval': opts.max_sleep_interval, 'sleep_interval_subtitles': opts.sleep_interval_subtitles, 'external_downloader': opts.external_downloader, + 'download_ranges': opts.download_ranges, + 'force_keyframes_at_cuts': opts.force_keyframes_at_cuts, 'list_thumbnails': opts.list_thumbnails, 'playlist_items': opts.playlist_items, 'xattr_set_filesize': 
opts.xattr_set_filesize, @@ -813,52 +879,66 @@ def _real_main(argv=None): if opts.dump_user_agent: ua = traverse_obj(opts.headers, 'User-Agent', casesense=False, default=std_headers['User-Agent']) write_string(f'{ua}\n', out=sys.stdout) - sys.exit(0) + return if print_extractor_information(opts, all_urls): - sys.exit(0) + return with YoutubeDL(ydl_opts) as ydl: + pre_process = opts.update_self or opts.rm_cachedir actual_use = all_urls or opts.load_info_filename - # Remove cache dir if opts.rm_cachedir: ydl.cache.remove() - # Maybe do nothing + updater = Updater(ydl) + if opts.update_self and updater.update() and actual_use: + if updater.cmd: + return updater.restart() + # This code is reachable only for zip variant in py < 3.10 + # It makes sense to exit here, but the old behavior is to continue + ydl.report_warning('Restart yt-dlp to use the updated version') + # return 100, 'ERROR: The program must exit for the update to complete' + if not actual_use: + if pre_process: + return ydl._download_retcode + ydl.warn_if_short_id(sys.argv[1:] if argv is None else argv) parser.error( 'You must provide at least one URL.\n' 'Type yt-dlp --help to see a list of all options.') + parser.destroy() try: if opts.load_info_filename is not None: - retcode = ydl.download_with_info_file(expand_path(opts.load_info_filename)) + return ydl.download_with_info_file(expand_path(opts.load_info_filename)) else: - retcode = ydl.download(all_urls) + return ydl.download(all_urls) except DownloadCancelled: ydl.to_screen('Aborting remaining downloads') - retcode = 101 - - sys.exit(retcode) + return 101 def main(argv=None): try: - _real_main(argv) + _exit(*variadic(_real_main(argv))) except DownloadError: - sys.exit(1) + _exit(1) except SameFileError as e: - sys.exit(f'ERROR: {e}') + _exit(f'ERROR: {e}') except KeyboardInterrupt: - sys.exit('\nERROR: Interrupted by user') + _exit('\nERROR: Interrupted by user') except BrokenPipeError as e: # https://docs.python.org/3/library/signal.html#note-on-sigpipe devnull = os.open(os.devnull, os.O_WRONLY) os.dup2(devnull, sys.stdout.fileno()) - sys.exit(f'\nERROR: {e}') + _exit(f'\nERROR: {e}') + except optparse.OptParseError as e: + _exit(2, f'\n{e}') + +from .extractor import gen_extractors, list_extractors __all__ = [ 'main', diff --git a/yt_dlp/__main__.py b/yt_dlp/__main__.py index c9d275b86..ff5d71d3c 100644 --- a/yt_dlp/__main__.py +++ b/yt_dlp/__main__.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 + # Execute with # $ python -m yt_dlp diff --git a/yt_dlp/aes.py b/yt_dlp/aes.py index d0e6d7549..b3f504977 100644 --- a/yt_dlp/aes.py +++ b/yt_dlp/aes.py @@ -1,6 +1,7 @@ +import base64 from math import ceil -from .compat import compat_b64decode, compat_ord +from .compat import compat_ord from .dependencies import Cryptodome_AES from .utils import bytes_to_intlist, intlist_to_bytes @@ -264,7 +265,7 @@ def aes_decrypt_text(data, password, key_size_bytes): """ NONCE_LENGTH_BYTES = 8 - data = bytes_to_intlist(compat_b64decode(data)) + data = bytes_to_intlist(base64.b64decode(data)) password = bytes_to_intlist(password.encode()) key = password[:key_size_bytes] + [0] * (key_size_bytes - len(password)) diff --git a/yt_dlp/cache.py b/yt_dlp/cache.py index e3f8a7dab..83351b797 100644 --- a/yt_dlp/cache.py +++ b/yt_dlp/cache.py @@ -6,7 +6,6 @@ import re import shutil import traceback -from .compat import compat_getenv from .utils import expand_path, write_json_file @@ -17,7 +16,7 @@ class Cache: def _get_root_dir(self): res = self._ydl.params.get('cachedir') if res is None: - cache_root = 
compat_getenv('XDG_CACHE_HOME', '~/.cache') + cache_root = os.getenv('XDG_CACHE_HOME', '~/.cache') res = os.path.join(cache_root, 'yt-dlp') return expand_path(res) diff --git a/yt_dlp/compat/__init__.py b/yt_dlp/compat/__init__.py index a0cd62110..9f8e8c3e5 100644 --- a/yt_dlp/compat/__init__.py +++ b/yt_dlp/compat/__init__.py @@ -1,6 +1,4 @@ -import contextlib import os -import subprocess import sys import warnings import xml.etree.ElementTree as etree @@ -9,10 +7,14 @@ from . import re from ._deprecated import * # noqa: F401, F403 from .compat_utils import passthrough_module - # XXX: Implement this the same way as other DeprecationWarnings without circular import -passthrough_module(__name__, '._legacy', callback=lambda attr: warnings.warn( - DeprecationWarning(f'{__name__}.{attr} is deprecated'), stacklevel=2)) +try: + passthrough_module(__name__, '._legacy', callback=lambda attr: warnings.warn( + DeprecationWarning(f'{__name__}.{attr} is deprecated'), stacklevel=2)) + HAS_LEGACY = True +except ModuleNotFoundError: + # Keep working even without _legacy module + HAS_LEGACY = False del passthrough_module @@ -52,7 +54,7 @@ if compat_os_name == 'nt' and sys.version_info < (3, 8): def compat_realpath(path): while os.path.islink(path): path = os.path.abspath(os.readlink(path)) - return path + return os.path.realpath(path) else: compat_realpath = os.path.realpath @@ -74,17 +76,3 @@ if compat_os_name in ('nt', 'ce'): return userhome + path[i:] else: compat_expanduser = os.path.expanduser - - -WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None - - -def windows_enable_vt_mode(): # TODO: Do this the proper way https://bugs.python.org/issue30075 - if compat_os_name != 'nt': - return - global WINDOWS_VT_MODE - startupinfo = subprocess.STARTUPINFO() - startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW - with contextlib.suppress(Exception): - subprocess.Popen('', shell=True, startupinfo=startupinfo).wait() - WINDOWS_VT_MODE = True diff --git a/yt_dlp/compat/_deprecated.py b/yt_dlp/compat/_deprecated.py index 390f76577..342f1f80d 100644 --- a/yt_dlp/compat/_deprecated.py +++ b/yt_dlp/compat/_deprecated.py @@ -1,52 +1,16 @@ """Deprecated - New code should avoid these""" import base64 -import getpass -import html -import html.parser -import http -import http.client -import http.cookiejar -import http.cookies -import http.server -import itertools -import os -import shutil -import struct -import tokenize -import urllib +import urllib.error +import urllib.parse + +compat_str = str compat_b64decode = base64.b64decode -compat_chr = chr -compat_cookiejar = http.cookiejar -compat_cookiejar_Cookie = http.cookiejar.Cookie -compat_cookies_SimpleCookie = http.cookies.SimpleCookie -compat_get_terminal_size = shutil.get_terminal_size -compat_getenv = os.getenv -compat_getpass = getpass.getpass -compat_html_entities = html.entities -compat_html_entities_html5 = html.entities.html5 -compat_HTMLParser = html.parser.HTMLParser -compat_http_client = http.client -compat_http_server = http.server + compat_HTTPError = urllib.error.HTTPError -compat_itertools_count = itertools.count +compat_urlparse = urllib.parse compat_parse_qs = urllib.parse.parse_qs -compat_str = str -compat_struct_pack = struct.pack -compat_struct_unpack = struct.unpack -compat_tokenize_tokenize = tokenize.tokenize -compat_urllib_error = urllib.error compat_urllib_parse_unquote = urllib.parse.unquote -compat_urllib_parse_unquote_plus = urllib.parse.unquote_plus compat_urllib_parse_urlencode = urllib.parse.urlencode 
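[Editor's note] The retained compat_* names above now route through passthrough_module, which warns on access. A toy sketch of the underlying PEP 562 module-level __getattr__ mechanism it builds on (hypothetical module and legacy table, not the actual helper; the commit keeps a fallback because Python 3.6 lacks module __getattr__):

    # demo_compat.py -- toy deprecation passthrough using PEP 562 (Python 3.7+)
    import base64
    import warnings

    _legacy_names = {'compat_b64decode': base64.b64decode}  # hypothetical table

    def __getattr__(attr):
        try:
            value = _legacy_names[attr]
        except KeyError:
            raise AttributeError(f'module {__name__} has no attribute {attr!r}') from None
        warnings.warn(f'{__name__}.{attr} is deprecated', DeprecationWarning, stacklevel=2)
        return value

Usage: `import demo_compat; demo_compat.compat_b64decode(b'aGk=')` returns b'hi' after emitting the DeprecationWarning, while new code imports base64 directly.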
compat_urllib_parse_urlparse = urllib.parse.urlparse -compat_urllib_request = urllib.request -compat_urlparse = compat_urllib_parse = urllib.parse - - -def compat_setenv(key, value, env=os.environ): - env[key] = value - - -__all__ = [x for x in globals() if x.startswith('compat_')] diff --git a/yt_dlp/compat/_legacy.py b/yt_dlp/compat/_legacy.py index ce24760e5..49bb13a3c 100644 --- a/yt_dlp/compat/_legacy.py +++ b/yt_dlp/compat/_legacy.py @@ -2,18 +2,27 @@ import collections import ctypes -import http +import getpass +import html.entities +import html.parser import http.client import http.cookiejar import http.cookies import http.server +import itertools +import os import shlex +import shutil import socket import struct -import urllib +import tokenize +import urllib.error +import urllib.parse +import urllib.request import xml.etree.ElementTree as etree from subprocess import DEVNULL +from .compat_utils import passthrough_module # isort: split from .asyncio import run as compat_asyncio_run # noqa: F401 from .re import Pattern as compat_Pattern # noqa: F401 from .re import match as compat_Match # noqa: F401 @@ -21,6 +30,8 @@ from ..dependencies import Cryptodome_AES as compat_pycrypto_AES # noqa: F401 from ..dependencies import brotli as compat_brotli # noqa: F401 from ..dependencies import websockets as compat_websockets # noqa: F401 +passthrough_module(__name__, '...utils', ('WINDOWS_VT_MODE', 'windows_enable_vt_mode')) + # compat_ctypes_WINFUNCTYPE = ctypes.WINFUNCTYPE # will not work since ctypes.WINFUNCTYPE does not exist in UNIX machines @@ -28,12 +39,17 @@ def compat_ctypes_WINFUNCTYPE(*args, **kwargs): return ctypes.WINFUNCTYPE(*args, **kwargs) +def compat_setenv(key, value, env=os.environ): + env[key] = value + + compat_basestring = str compat_collections_abc = collections.abc compat_cookies = http.cookies compat_etree_Element = etree.Element compat_etree_register_namespace = etree.register_namespace compat_filter = filter +compat_getenv = os.getenv compat_input = input compat_integer_types = (int, ) compat_kwargs = lambda kwargs: kwargs @@ -49,9 +65,28 @@ compat_urllib_parse_quote_plus = urllib.parse.quote_plus compat_urllib_parse_unquote_to_bytes = urllib.parse.unquote_to_bytes compat_urllib_parse_urlunparse = urllib.parse.urlunparse compat_urllib_request_DataHandler = urllib.request.DataHandler +compat_urllib_request = urllib.request compat_urllib_response = urllib.response compat_urlretrieve = urllib.request.urlretrieve compat_xml_parse_error = etree.ParseError compat_xpath = lambda xpath: xpath compat_zip = zip workaround_optparse_bug9161 = lambda: None +compat_getpass = getpass.getpass +compat_chr = chr +compat_urllib_parse = urllib.parse +compat_itertools_count = itertools.count +compat_cookiejar = http.cookiejar +compat_cookiejar_Cookie = http.cookiejar.Cookie +compat_cookies_SimpleCookie = http.cookies.SimpleCookie +compat_get_terminal_size = shutil.get_terminal_size +compat_html_entities = html.entities +compat_html_entities_html5 = html.entities.html5 +compat_tokenize_tokenize = tokenize.tokenize +compat_HTMLParser = html.parser.HTMLParser +compat_http_client = http.client +compat_http_server = http.server +compat_struct_pack = struct.pack +compat_struct_unpack = struct.unpack +compat_urllib_error = urllib.error +compat_urllib_parse_unquote_plus = urllib.parse.unquote_plus diff --git a/yt_dlp/compat/compat_utils.py b/yt_dlp/compat/compat_utils.py index b1d58f5b9..82e176281 100644 --- a/yt_dlp/compat/compat_utils.py +++ b/yt_dlp/compat/compat_utils.py @@ -4,7 +4,6 @@ 
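[Editor's note] The rework below defers importing the child module until an attribute is actually requested, memoizing it in the enclosing scope. A standalone sketch of that lazy-import pattern (the lazy_module helper here is illustrative only, not part of yt-dlp):

    import importlib

    def lazy_module(name):
        module = None

        def get_attr(attr):
            nonlocal module
            # First access triggers the real import; later calls reuse the cache
            module = module or importlib.import_module(name)
            return getattr(module, attr)
        return get_attr

    json_attr = lazy_module('json')
    print(json_attr('dumps')({'lazy': True}))  # json is imported only here

This is the same trick that lets `from .extractor.extractors import ...` stay cheap until verbose output actually needs it.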
import importlib import sys import types - _NO_ATTRIBUTE = object() _Package = collections.namedtuple('Package', ('name', 'version')) @@ -31,9 +30,9 @@ def _is_package(module): return True -def passthrough_module(parent, child, *, callback=lambda _: None): +def passthrough_module(parent, child, allowed_attributes=None, *, callback=lambda _: None): parent_module = importlib.import_module(parent) - child_module = importlib.import_module(child, parent) + child_module = None # Import child module only as needed class PassthroughModule(types.ModuleType): def __getattr__(self, attr): @@ -41,19 +40,30 @@ def passthrough_module(parent, child, *, callback=lambda _: None): with contextlib.suppress(ImportError): return importlib.import_module(f'.{attr}', parent) - ret = _NO_ATTRIBUTE + ret = self.__from_child(attr) + if ret is _NO_ATTRIBUTE: + raise AttributeError(f'module {parent} has no attribute {attr}') + callback(attr) + return ret + + def __from_child(self, attr): + if allowed_attributes is None: + if attr.startswith('__') and attr.endswith('__'): + return _NO_ATTRIBUTE + elif attr not in allowed_attributes: + return _NO_ATTRIBUTE + + nonlocal child_module + child_module = child_module or importlib.import_module(child, parent) + with contextlib.suppress(AttributeError): - ret = getattr(child_module, attr) + return getattr(child_module, attr) if _is_package(child_module): with contextlib.suppress(ImportError): - ret = importlib.import_module(f'.{attr}', child) - - if ret is _NO_ATTRIBUTE: - raise AttributeError(f'module {parent} has no attribute {attr}') + return importlib.import_module(f'.{attr}', child) - callback(attr) - return ret + return _NO_ATTRIBUTE # Python 3.6 does not have module level __getattr__ # https://peps.python.org/pep-0562/ diff --git a/yt_dlp/compat/functools.py b/yt_dlp/compat/functools.py new file mode 100644 index 000000000..ec003ea90 --- /dev/null +++ b/yt_dlp/compat/functools.py @@ -0,0 +1,26 @@ +# flake8: noqa: F405 +from functools import * # noqa: F403 + +from .compat_utils import passthrough_module + +passthrough_module(__name__, 'functools') +del passthrough_module + +try: + cache # >= 3.9 +except NameError: + cache = lru_cache(maxsize=None) + +try: + cached_property # >= 3.8 +except NameError: + class cached_property: + def __init__(self, func): + update_wrapper(self, func) + self.func = func + + def __get__(self, instance, _): + if instance is None: + return self + setattr(instance, self.func.__name__, self.func(instance)) + return getattr(instance, self.func.__name__) diff --git a/yt_dlp/compat/imghdr.py b/yt_dlp/compat/imghdr.py new file mode 100644 index 000000000..734b0d876 --- /dev/null +++ b/yt_dlp/compat/imghdr.py @@ -0,0 +1,14 @@ +tests = { + 'webp': lambda h: h[0:4] == b'RIFF' and h[8:] == b'WEBP', + 'png': lambda h: h[:8] == b'\211PNG\r\n\032\n', + 'jpeg': lambda h: h[6:10] in (b'JFIF', b'Exif'), +} + + +def what(path): + """Detect format of image (Currently supports jpeg, png, webp only) + Ref: https://github.com/python/cpython/blob/3.10/Lib/imghdr.py + """ + with open(path, 'rb') as f: + head = f.read(12) + return next((type_ for type_, test in tests.items() if test(head)), None) diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index 1598828f2..df8f97b44 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -1,5 +1,7 @@ +import base64 import contextlib import ctypes +import http.cookiejar import json import os import shutil @@ -17,7 +19,6 @@ from .aes import ( aes_gcm_decrypt_and_verify_bytes, unpad_pkcs7, ) -from .compat import compat_b64decode, 
compat_cookiejar_Cookie from .dependencies import ( _SECRETSTORAGE_UNAVAILABLE_REASON, secretstorage, @@ -63,7 +64,7 @@ class YDLLogger: # Do not print to files/pipes, loggers, or when --no-progress is used if not self._ydl or self._ydl.params.get('noprogress') or self._ydl.params.get('logger'): return - file = self._ydl._out_files['error'] + file = self._ydl._out_files.error try: if not file.isatty(): return @@ -142,7 +143,7 @@ def _extract_firefox_cookies(profile, logger): total_cookie_count = len(table) for i, (host, name, value, path, expiry, is_secure) in enumerate(table): progress_bar.print(f'Loading cookie {i: 6d}/{total_cookie_count: 6d}') - cookie = compat_cookiejar_Cookie( + cookie = http.cookiejar.Cookie( version=0, name=name, value=value, port=None, port_specified=False, domain=host, domain_specified=bool(host), domain_initial_dot=host.startswith('.'), path=path, path_specified=bool(path), secure=is_secure, expires=expiry, discard=False, @@ -156,30 +157,16 @@ def _extract_firefox_cookies(profile, logger): def _firefox_browser_dir(): - if sys.platform in ('linux', 'linux2'): - return os.path.expanduser('~/.mozilla/firefox') - elif sys.platform == 'win32': + if sys.platform in ('cygwin', 'win32'): return os.path.expandvars(R'%APPDATA%\Mozilla\Firefox\Profiles') elif sys.platform == 'darwin': return os.path.expanduser('~/Library/Application Support/Firefox') - else: - raise ValueError(f'unsupported platform: {sys.platform}') + return os.path.expanduser('~/.mozilla/firefox') def _get_chromium_based_browser_settings(browser_name): # https://chromium.googlesource.com/chromium/src/+/HEAD/docs/user_data_dir.md - if sys.platform in ('linux', 'linux2'): - config = _config_home() - browser_dir = { - 'brave': os.path.join(config, 'BraveSoftware/Brave-Browser'), - 'chrome': os.path.join(config, 'google-chrome'), - 'chromium': os.path.join(config, 'chromium'), - 'edge': os.path.join(config, 'microsoft-edge'), - 'opera': os.path.join(config, 'opera'), - 'vivaldi': os.path.join(config, 'vivaldi'), - }[browser_name] - - elif sys.platform == 'win32': + if sys.platform in ('cygwin', 'win32'): appdata_local = os.path.expandvars('%LOCALAPPDATA%') appdata_roaming = os.path.expandvars('%APPDATA%') browser_dir = { @@ -203,7 +190,15 @@ def _get_chromium_based_browser_settings(browser_name): }[browser_name] else: - raise ValueError(f'unsupported platform: {sys.platform}') + config = _config_home() + browser_dir = { + 'brave': os.path.join(config, 'BraveSoftware/Brave-Browser'), + 'chrome': os.path.join(config, 'google-chrome'), + 'chromium': os.path.join(config, 'chromium'), + 'edge': os.path.join(config, 'microsoft-edge'), + 'opera': os.path.join(config, 'opera'), + 'vivaldi': os.path.join(config, 'vivaldi'), + }[browser_name] # Linux keyring names can be determined by snooping on dbus while opening the browser in KDE: # dbus-monitor "interface='org.kde.KWallet'" "type=method_return" @@ -303,7 +298,7 @@ def _process_chrome_cookie(decryptor, host_key, name, value, encrypted_value, pa if value is None: return is_encrypted, None - return is_encrypted, compat_cookiejar_Cookie( + return is_encrypted, http.cookiejar.Cookie( version=0, name=name, value=value, port=None, port_specified=False, domain=host_key, domain_specified=bool(host_key), domain_initial_dot=host_key.startswith('.'), path=path, path_specified=bool(path), secure=is_secure, expires=expires_utc, discard=False, @@ -343,14 +338,11 @@ class ChromeCookieDecryptor: def get_cookie_decryptor(browser_root, browser_keyring_name, logger, *, 
keyring=None): - if sys.platform in ('linux', 'linux2'): - return LinuxChromeCookieDecryptor(browser_keyring_name, logger, keyring=keyring) - elif sys.platform == 'darwin': + if sys.platform == 'darwin': return MacChromeCookieDecryptor(browser_keyring_name, logger) - elif sys.platform == 'win32': + elif sys.platform in ('win32', 'cygwin'): return WindowsChromeCookieDecryptor(browser_root, logger) - else: - raise NotImplementedError(f'Chrome cookie decryption is not supported on this platform: {sys.platform}') + return LinuxChromeCookieDecryptor(browser_keyring_name, logger, keyring=keyring) class LinuxChromeCookieDecryptor(ChromeCookieDecryptor): @@ -598,7 +590,7 @@ def _parse_safari_cookies_record(data, jar, logger): p.skip_to(record_size, 'space at the end of the record') - cookie = compat_cookiejar_Cookie( + cookie = http.cookiejar.Cookie( version=0, name=name, value=value, port=None, port_specified=False, domain=domain, domain_specified=bool(domain), domain_initial_dot=domain.startswith('.'), path=path, path_specified=bool(path), secure=is_secure, expires=expiration_date, discard=False, @@ -718,21 +710,19 @@ def _get_kwallet_network_wallet(logger): """ default_wallet = 'kdewallet' try: - proc = Popen([ + stdout, _, returncode = Popen.run([ 'dbus-send', '--session', '--print-reply=literal', '--dest=org.kde.kwalletd5', '/modules/kwalletd5', 'org.kde.KWallet.networkWallet' - ], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) + ], text=True, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) - stdout, stderr = proc.communicate_or_kill() - if proc.returncode != 0: + if returncode: logger.warning('failed to read NetworkWallet') return default_wallet else: - network_wallet = stdout.decode().strip() - logger.debug(f'NetworkWallet = "{network_wallet}"') - return network_wallet + logger.debug(f'NetworkWallet = "{stdout.strip()}"') + return stdout.strip() except Exception as e: logger.warning(f'exception while obtaining NetworkWallet: {e}') return default_wallet @@ -750,17 +740,16 @@ def _get_kwallet_password(browser_keyring_name, logger): network_wallet = _get_kwallet_network_wallet(logger) try: - proc = Popen([ + stdout, _, returncode = Popen.run([ 'kwallet-query', '--read-password', f'{browser_keyring_name} Safe Storage', '--folder', f'{browser_keyring_name} Keys', network_wallet ], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) - stdout, stderr = proc.communicate_or_kill() - if proc.returncode != 0: - logger.error(f'kwallet-query failed with return code {proc.returncode}. Please consult ' - 'the kwallet-query man page for details') + if returncode: + logger.error(f'kwallet-query failed with return code {returncode}. 
' + 'Please consult the kwallet-query man page for details') return b'' else: if stdout.lower().startswith(b'failed to read'): @@ -775,9 +764,7 @@ def _get_kwallet_password(browser_keyring_name, logger): return b'' else: logger.debug('password found') - if stdout[-1:] == b'\n': - stdout = stdout[:-1] - return stdout + return stdout.rstrip(b'\n') except Exception as e: logger.warning(f'exception running kwallet-query: {error_to_str(e)}') return b'' @@ -824,17 +811,13 @@ def _get_linux_keyring_password(browser_keyring_name, keyring, logger): def _get_mac_keyring_password(browser_keyring_name, logger): logger.debug('using find-generic-password to obtain password from OSX keychain') try: - proc = Popen( + stdout, _, _ = Popen.run( ['security', 'find-generic-password', '-w', # write password to stdout '-a', browser_keyring_name, # match 'account' '-s', f'{browser_keyring_name} Safe Storage'], # match 'service' stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) - - stdout, stderr = proc.communicate_or_kill() - if stdout[-1:] == b'\n': - stdout = stdout[:-1] - return stdout + return stdout.rstrip(b'\n') except Exception as e: logger.warning(f'exception running find-generic-password: {error_to_str(e)}') return None @@ -853,7 +836,7 @@ def _get_windows_v10_key(browser_root, logger): except KeyError: logger.error('no encrypted key in Local State') return None - encrypted_key = compat_b64decode(base64_key) + encrypted_key = base64.b64decode(base64_key) prefix = b'DPAPI' if not encrypted_key.startswith(prefix): logger.error('invalid key') diff --git a/yt_dlp/downloader/__init__.py b/yt_dlp/downloader/__init__.py index 5aba303dd..a7dc6c9d0 100644 --- a/yt_dlp/downloader/__init__.py +++ b/yt_dlp/downloader/__init__.py @@ -1,4 +1,3 @@ -from ..compat import compat_str from ..utils import NO_DEFAULT, determine_protocol @@ -85,13 +84,13 @@ def _get_suitable_downloader(info_dict, protocol, params, default): if default is NO_DEFAULT: default = HttpFD - # if (info_dict.get('start_time') or info_dict.get('end_time')) and not info_dict.get('requested_formats') and FFmpegFD.can_download(info_dict): - # return FFmpegFD + if (info_dict.get('section_start') or info_dict.get('section_end')) and FFmpegFD.can_download(info_dict): + return FFmpegFD info_dict['protocol'] = protocol downloaders = params.get('external_downloader') external_downloader = ( - downloaders if isinstance(downloaders, compat_str) or downloaders is None + downloaders if isinstance(downloaders, str) or downloaders is None else downloaders.get(shorten_protocol_name(protocol, True), downloaders.get('default'))) if external_downloader is None: diff --git a/yt_dlp/downloader/common.py b/yt_dlp/downloader/common.py index 1f14ebb3a..3a0a014ef 100644 --- a/yt_dlp/downloader/common.py +++ b/yt_dlp/downloader/common.py @@ -15,14 +15,18 @@ from ..utils import ( NUMBER_RE, LockingUnsupportedError, Namespace, + classproperty, decodeArgument, encodeFilename, error_to_compat_str, + float_or_none, format_bytes, + join_nonempty, sanitize_open, shell_quote, timeconvert, timetuple_from_msec, + try_call, ) @@ -41,6 +45,7 @@ class FileDownloader: verbose: Print additional info to stdout. quiet: Do not print messages to stdout. ratelimit: Download speed limit, in bytes/sec. 
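[Editor's note] The retry_sleep_functions mapping documented just below is built from --retry-sleep expressions by the parse_sleep_func helper added earlier in this diff (yt_dlp/__init__.py); reproduced standalone here to show the backoff semantics:

    import re

    def parse_sleep_func(expr):
        NUMBER_RE = r'\d+(?:\.\d+)?'
        op, start, limit, step, *_ = tuple(re.fullmatch(
            rf'(?:(linear|exp)=)?({NUMBER_RE})(?::({NUMBER_RE})?)?(?::({NUMBER_RE}))?',
            expr.strip()).groups()) + (None, None)

        if op == 'exp':
            return lambda n: min(float(start) * (float(step or 2) ** n), float(limit or 'inf'))
        else:
            default_step = start if op or limit else 0
            return lambda n: min(float(start) + float(step or default_step) * n, float(limit or 'inf'))

    sleep = parse_sleep_func('exp=1:16')
    print([sleep(n) for n in range(6)])  # [1.0, 2.0, 4.0, 8.0, 16.0, 16.0]

So `exp=1:16` doubles the delay each retry up to a 16-second cap, and `linear=1::2` yields 1, 3, 5, ... seconds; FileDownloader.sleep_retry calls the selected function with n = attempt - 1.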
+ continuedl: Attempt to continue downloads if possible throttledratelimit: Assume the download is being throttled below this speed (bytes/sec) retries: Number of times to retry for HTTP error 5xx file_access_retries: Number of times to retry on file access error @@ -64,6 +69,7 @@ class FileDownloader: useful for bypassing bandwidth throttling imposed by a webserver (experimental) progress_template: See YoutubeDL.py + retry_sleep_functions: See YoutubeDL.py Subclasses of this one must re-define the real_download method. """ @@ -98,12 +104,16 @@ class FileDownloader: def to_screen(self, *args, **kargs): self.ydl.to_screen(*args, quiet=self.params.get('quiet'), **kargs) - @property - def FD_NAME(self): - return re.sub(r'(?<!^)(?=[A-Z])', '_', type(self).__name__[:-2]).lower() + __to_screen = to_screen + + @classproperty + def FD_NAME(cls): + return re.sub(r'(?<=[a-z])(?=[A-Z])', '_', cls.__name__[:-2]).lower() @staticmethod def format_seconds(seconds): + if seconds is None: + return ' Unknown' time = timetuple_from_msec(seconds * 1000) if time.hours > 99: return '--:--:--' @@ -111,6 +121,8 @@ class FileDownloader: return '%02d:%02d' % time[1:-1] return '%02d:%02d:%02d' % time[:-1] + format_eta = format_seconds + @staticmethod def calc_percent(byte_counter, data_len): if data_len is None: @@ -119,11 +131,7 @@ class FileDownloader: @staticmethod def format_percent(percent): - if percent is None: - return '---.-%' - elif percent == 100: - return '100%' - return '%6s' % ('%3.1f%%' % percent) + return ' N/A%' if percent is None else f'{percent:>5.1f}%' @staticmethod def calc_eta(start, now, total, current): @@ -138,12 +146,6 @@ class FileDownloader: return int((float(total) - float(current)) / rate) @staticmethod - def format_eta(eta): - if eta is None: - return '--:--' - return FileDownloader.format_seconds(eta) - - @staticmethod def calc_speed(start, now, bytes): dif = now - start if bytes == 0 or dif < 0.001: # One millisecond @@ -152,13 +154,11 @@ class FileDownloader: @staticmethod def format_speed(speed): - if speed is None: - return '%10s' % '---b/s' - return '%10s' % ('%s/s' % format_bytes(speed)) + return ' Unknown B/s' if speed is None else f'{format_bytes(speed):>10s}/s' @staticmethod def format_retries(retries): - return 'inf' if retries == float('inf') else '%.0f' % retries + return 'inf' if retries == float('inf') else int(retries) @staticmethod def best_block_size(elapsed_time, bytes): @@ -232,7 +232,8 @@ class FileDownloader: self.to_screen( f'[download] Unable to {action} file due to file access error. 
' f'Retrying (attempt {retry} of {self.format_retries(file_access_retries)}) ...') - time.sleep(0.01) + if not self.sleep_retry('file_access', retry): + time.sleep(0.01) return inner return outer @@ -282,9 +283,9 @@ class FileDownloader: elif self.ydl.params.get('logger'): self._multiline = MultilineLogger(self.ydl.params['logger'], lines) elif self.params.get('progress_with_newline'): - self._multiline = BreaklineStatusPrinter(self.ydl._out_files['screen'], lines) + self._multiline = BreaklineStatusPrinter(self.ydl._out_files.out, lines) else: - self._multiline = MultilinePrinter(self.ydl._out_files['screen'], lines, not self.params.get('quiet')) + self._multiline = MultilinePrinter(self.ydl._out_files.out, lines, not self.params.get('quiet')) self._multiline.allow_colors = self._multiline._HAVE_FULLCAP and not self.params.get('no_color') def _finish_multiline_status(self): @@ -301,7 +302,7 @@ class FileDownloader: ) def _report_progress_status(self, s, default_template): - for name, style in self.ProgressStyles._asdict().items(): + for name, style in self.ProgressStyles.items_: name = f'_{name}_str' if name not in s: continue @@ -325,63 +326,52 @@ class FileDownloader: self._multiline.stream, self._multiline.allow_colors, *args, **kwargs) def report_progress(self, s): + def with_fields(*tups, default=''): + for *fields, tmpl in tups: + if all(s.get(f) is not None for f in fields): + return tmpl + return default + if s['status'] == 'finished': if self.params.get('noprogress'): self.to_screen('[download] Download completed') - msg_template = '100%%' - if s.get('total_bytes') is not None: - s['_total_bytes_str'] = format_bytes(s['total_bytes']) - msg_template += ' of %(_total_bytes_str)s' - if s.get('elapsed') is not None: - s['_elapsed_str'] = self.format_seconds(s['elapsed']) - msg_template += ' in %(_elapsed_str)s' - s['_percent_str'] = self.format_percent(100) - self._report_progress_status(s, msg_template) - return + s.update({ + '_total_bytes_str': format_bytes(s.get('total_bytes')), + '_elapsed_str': self.format_seconds(s.get('elapsed')), + '_percent_str': self.format_percent(100), + }) + self._report_progress_status(s, join_nonempty( + '100%%', + with_fields(('total_bytes', 'of %(_total_bytes_str)s')), + with_fields(('elapsed', 'in %(_elapsed_str)s')), + delim=' ')) if s['status'] != 'downloading': return - if s.get('eta') is not None: - s['_eta_str'] = self.format_eta(s['eta']) - else: - s['_eta_str'] = 'Unknown' - - if s.get('total_bytes') and s.get('downloaded_bytes') is not None: - s['_percent_str'] = self.format_percent(100 * s['downloaded_bytes'] / s['total_bytes']) - elif s.get('total_bytes_estimate') and s.get('downloaded_bytes') is not None: - s['_percent_str'] = self.format_percent(100 * s['downloaded_bytes'] / s['total_bytes_estimate']) - else: - if s.get('downloaded_bytes') == 0: - s['_percent_str'] = self.format_percent(0) - else: - s['_percent_str'] = 'Unknown %' - - if s.get('speed') is not None: - s['_speed_str'] = self.format_speed(s['speed']) - else: - s['_speed_str'] = 'Unknown speed' - - if s.get('total_bytes') is not None: - s['_total_bytes_str'] = format_bytes(s['total_bytes']) - msg_template = '%(_percent_str)s of %(_total_bytes_str)s at %(_speed_str)s ETA %(_eta_str)s' - elif s.get('total_bytes_estimate') is not None: - s['_total_bytes_estimate_str'] = format_bytes(s['total_bytes_estimate']) - msg_template = '%(_percent_str)s of ~%(_total_bytes_estimate_str)s at %(_speed_str)s ETA %(_eta_str)s' - else: - if s.get('downloaded_bytes') is not None: s['_downloaded_bytes_str'] = format_bytes(s['downloaded_bytes']) - if s.get('elapsed'): - s['_elapsed_str'] = self.format_seconds(s['elapsed']) - msg_template = '%(_downloaded_bytes_str)s at %(_speed_str)s (%(_elapsed_str)s)' - else: - msg_template = '%(_downloaded_bytes_str)s at %(_speed_str)s' - else: - msg_template = '%(_percent_str)s at %(_speed_str)s ETA %(_eta_str)s' - if s.get('fragment_index') and s.get('fragment_count'): - msg_template += ' (frag %(fragment_index)s/%(fragment_count)s)' - elif s.get('fragment_index'): - msg_template += ' (frag %(fragment_index)s)' + s.update({ + '_eta_str': self.format_eta(s.get('eta')), + '_speed_str': self.format_speed(s.get('speed')), + '_percent_str': self.format_percent(try_call( + lambda: 100 * s['downloaded_bytes'] / s['total_bytes'], + lambda: 100 * s['downloaded_bytes'] / s['total_bytes_estimate'], + lambda: s['downloaded_bytes'] == 0 and 0)), + '_total_bytes_str': format_bytes(s.get('total_bytes')), + '_total_bytes_estimate_str': format_bytes(s.get('total_bytes_estimate')), + '_downloaded_bytes_str': format_bytes(s.get('downloaded_bytes')), + '_elapsed_str': self.format_seconds(s.get('elapsed')), + }) + + msg_template = with_fields( + ('total_bytes', '%(_percent_str)s of %(_total_bytes_str)s at %(_speed_str)s ETA %(_eta_str)s'), + ('total_bytes_estimate', '%(_percent_str)s of ~%(_total_bytes_estimate_str)s at %(_speed_str)s ETA %(_eta_str)s'), + ('downloaded_bytes', 'elapsed', '%(_downloaded_bytes_str)s at %(_speed_str)s (%(_elapsed_str)s)'), + ('downloaded_bytes', '%(_downloaded_bytes_str)s at %(_speed_str)s'), + default='%(_percent_str)s at %(_speed_str)s ETA %(_eta_str)s') + + msg_template += with_fields( + ('fragment_index', 'fragment_count', ' (frag %(fragment_index)s/%(fragment_count)s)'), + ('fragment_index', ' (frag %(fragment_index)s)')) self._report_progress_status(s, msg_template) def report_resuming_byte(self, resume_len): @@ -390,14 +380,23 @@ class FileDownloader: def report_retry(self, err, count, retries): """Report retry in case of HTTP error 5xx""" - self.to_screen( + self.__to_screen( '[download] Got server HTTP error: %s. Retrying (attempt %d of %s) ...' % (error_to_compat_str(err), count, self.format_retries(retries))) + self.sleep_retry('http', count) def report_unable_to_resume(self): """Report it was impossible to resume download.""" self.to_screen('[download] Unable to resume') + def sleep_retry(self, retry_type, count): + sleep_func = self.params.get('retry_sleep_functions', {}).get(retry_type) + delay = float_or_none(sleep_func(n=count - 1)) if sleep_func else None + if delay: + self.__to_screen(f'Sleeping {delay:.2f} seconds ...') + time.sleep(delay) + return sleep_func is not None + @staticmethod def supports_manifest(manifest): """ Whether the downloader can download the fragments from the manifest.
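A note on the new `sleep_retry` helper above: `retry_sleep_functions` maps a retry type ('http', 'fragment', 'file_access') to a callable that returns the delay for a given attempt. A minimal standalone sketch of the dispatch, assuming a hypothetical exponential-backoff entry (the real mapping is built from the new `--retry-sleep` option):

import time

params = {'retry_sleep_functions': {'http': lambda n: min(2 ** n, 30)}}  # hypothetical backoff entry

def sleep_retry(retry_type, count):
    # Mirrors FileDownloader.sleep_retry: retry attempt `count` sleeps func(n=count - 1)
    sleep_func = params.get('retry_sleep_functions', {}).get(retry_type)
    delay = sleep_func(n=count - 1) if sleep_func else None
    if delay:
        time.sleep(delay)
    return sleep_func is not None

assert sleep_retry('http', 2) is True          # slept 2**1 = 2 seconds
assert sleep_retry('file_access', 1) is False  # nothing configured, so the caller falls back to its default sleep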
diff --git a/yt_dlp/downloader/dash.py b/yt_dlp/downloader/dash.py index e6efae485..a6da26f09 100644 --- a/yt_dlp/downloader/dash.py +++ b/yt_dlp/downloader/dash.py @@ -1,7 +1,7 @@ import time +from . 
import get_suitable_downloader from .fragment import FragmentFD -from ..downloader import get_suitable_downloader from ..utils import urljoin @@ -73,6 +73,7 @@ class DashSegmentsFD(FragmentFD): yield { 'frag_index': frag_index, + 'fragment_count': fragment.get('fragment_count'), 'index': i, 'url': fragment_url, } diff --git a/yt_dlp/downloader/external.py b/yt_dlp/downloader/external.py index 85c6a6977..f84a17f23 100644 --- a/yt_dlp/downloader/external.py +++ b/yt_dlp/downloader/external.py @@ -1,3 +1,4 @@ +import enum import os.path import re import subprocess @@ -5,7 +6,7 @@ import sys import time from .fragment import FragmentFD -from ..compat import compat_setenv, compat_str +from ..compat import functools from ..postprocessor.ffmpeg import EXT_TO_OUT_FORMATS, FFmpegPostProcessor from ..utils import ( Popen, @@ -24,9 +25,15 @@ from ..utils import ( ) +class Features(enum.Enum): + TO_STDOUT = enum.auto() + MULTIPLE_FORMATS = enum.auto() + + class ExternalFD(FragmentFD): SUPPORTED_PROTOCOLS = ('http', 'https', 'ftp', 'ftps') - can_download_to_stdout = False + SUPPORTED_FEATURES = () + _CAPTURE_STDERR = True def real_download(self, filename, info_dict): self.report_destination(filename) @@ -74,7 +81,7 @@ class ExternalFD(FragmentFD): def EXE_NAME(cls): return cls.get_basename() - @property + @functools.cached_property def exe(self): return self.EXE_NAME @@ -90,9 +97,11 @@ class ExternalFD(FragmentFD): @classmethod def supports(cls, info_dict): - return ( - (cls.can_download_to_stdout or not info_dict.get('to_stdout')) - and info_dict['protocol'] in cls.SUPPORTED_PROTOCOLS) + return all(( + not info_dict.get('to_stdout') or Features.TO_STDOUT in cls.SUPPORTED_FEATURES, + '+' not in info_dict['protocol'] or Features.MULTIPLE_FORMATS in cls.SUPPORTED_FEATURES, + all(proto in cls.SUPPORTED_PROTOCOLS for proto in info_dict['protocol'].split('+')), + )) @classmethod def can_download(cls, info_dict, path=None): @@ -119,29 +128,31 @@ class ExternalFD(FragmentFD): self._debug_cmd(cmd) if 'fragments' not in info_dict: - p = Popen(cmd, stderr=subprocess.PIPE) - _, stderr = p.communicate_or_kill() - if p.returncode != 0: - self.to_stderr(stderr.decode('utf-8', 'replace')) - return p.returncode + _, stderr, returncode = Popen.run( + cmd, text=True, stderr=subprocess.PIPE if self._CAPTURE_STDERR else None) + if returncode and stderr: + self.to_stderr(stderr) + return returncode fragment_retries = self.params.get('fragment_retries', 0) skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True) count = 0 while count <= fragment_retries: - p = Popen(cmd, stderr=subprocess.PIPE) - _, stderr = p.communicate_or_kill() - if p.returncode == 0: + _, stderr, returncode = Popen.run(cmd, text=True, stderr=subprocess.PIPE) + if not returncode: break + # TODO: Decide whether to retry based on error code # https://aria2.github.io/manual/en/html/aria2c.html#exit-status - self.to_stderr(stderr.decode('utf-8', 'replace')) + if stderr: + self.to_stderr(stderr) count += 1 if count <= fragment_retries: self.to_screen( '[%s] Got error. Retrying fragments (attempt %d of %s)...' 
% (self.get_basename(), count, self.format_retries(fragment_retries))) + self.sleep_retry('fragment', count) if count > fragment_retries: if not skip_unavailable_fragments: self.report_error('Giving up after %s fragment retries' % fragment_retries) @@ -170,6 +181,7 @@ class ExternalFD(FragmentFD): class CurlFD(ExternalFD): AVAILABLE_OPT = '-V' + _CAPTURE_STDERR = False # curl writes the progress to stderr def _make_cmd(self, tmpfilename, info_dict): cmd = [self.exe, '--location', '-o', tmpfilename, '--compressed'] @@ -194,16 +206,6 @@ class CurlFD(ExternalFD): cmd += ['--', info_dict['url']] return cmd - def _call_downloader(self, tmpfilename, info_dict): - cmd = [encodeArgument(a) for a in self._make_cmd(tmpfilename, info_dict)] - - self._debug_cmd(cmd) - - # curl writes the progress to stderr so don't capture it. - p = Popen(cmd) - p.communicate_or_kill() - return p.returncode - class AxelFD(ExternalFD): AVAILABLE_OPT = '-V' @@ -322,7 +324,7 @@ class HttpieFD(ExternalFD): class FFmpegFD(ExternalFD): SUPPORTED_PROTOCOLS = ('http', 'https', 'ftp', 'ftps', 'm3u8', 'm3u8_native', 'rtsp', 'rtmp', 'rtmp_ffmpeg', 'mms', 'http_dash_segments') - can_download_to_stdout = True + SUPPORTED_FEATURES = (Features.TO_STDOUT, Features.MULTIPLE_FORMATS) @classmethod def available(cls, path=None): @@ -330,10 +332,6 @@ class FFmpegFD(ExternalFD): # Fixme: This may be wrong when --ffmpeg-location is used return FFmpegPostProcessor().available - @classmethod - def supports(cls, info_dict): - return all(proto in cls.SUPPORTED_PROTOCOLS for proto in info_dict['protocol'].split('+')) - def on_process_started(self, proc, stdin): """ Override this in subclasses """ pass @@ -378,13 +376,6 @@ class FFmpegFD(ExternalFD): # http://trac.ffmpeg.org/ticket/6125#comment:10 args += ['-seekable', '1' if seekable else '0'] - # start_time = info_dict.get('start_time') or 0 - # if start_time: - # args += ['-ss', compat_str(start_time)] - # end_time = info_dict.get('end_time') - # if end_time: - # args += ['-t', compat_str(end_time - start_time)] - http_headers = None if info_dict.get('http_headers'): youtubedl_headers = handle_youtubedl_headers(info_dict['http_headers']) @@ -411,8 +402,8 @@ class FFmpegFD(ExternalFD): # We could switch to the following code if we are able to detect version properly # args += ['-http_proxy', proxy] env = os.environ.copy() - compat_setenv('HTTP_PROXY', proxy, env=env) - compat_setenv('http_proxy', proxy, env=env) + env['HTTP_PROXY'] = proxy + env['http_proxy'] = proxy protocol = info_dict.get('protocol') @@ -442,25 +433,31 @@ class FFmpegFD(ExternalFD): if isinstance(conn, list): for entry in conn: args += ['-rtmp_conn', entry] - elif isinstance(conn, compat_str): + elif isinstance(conn, str): args += ['-rtmp_conn', conn] + start_time, end_time = info_dict.get('section_start') or 0, info_dict.get('section_end') + for i, url in enumerate(urls): - # We need to specify headers for each http input stream - # otherwise, it will only be applied to the first. 
- # https://github.com/yt-dlp/yt-dlp/issues/2696 if http_headers is not None and re.match(r'^https?://', url): args += http_headers + if start_time: + args += ['-ss', str(start_time)] + if end_time: + args += ['-t', str(end_time - start_time)] + args += self._configuration_args((f'_i{i + 1}', '_i')) + ['-i', url] - args += ['-c', 'copy'] + if not (start_time or end_time) or not self.params.get('force_keyframes_at_cuts'): + args += ['-c', 'copy'] + if info_dict.get('requested_formats') or protocol == 'http_dash_segments': for (i, fmt) in enumerate(info_dict.get('requested_formats') or [info_dict]): stream_number = fmt.get('manifest_stream_number', 0) args.extend(['-map', f'{i}:{stream_number}']) if self.params.get('test', False): - args += ['-fs', compat_str(self._TEST_FILE_SIZE)] + args += ['-fs', str(self._TEST_FILE_SIZE)] ext = info_dict['ext'] if protocol in ('m3u8', 'm3u8_native'): @@ -495,24 +492,23 @@ args.append(encodeFilename(ffpp._ffmpeg_filename_argument(tmpfilename), True)) self._debug_cmd(args) - proc = Popen(args, stdin=subprocess.PIPE, env=env) - if url in ('-', 'pipe:'): - self.on_process_started(proc, proc.stdin) - try: - retval = proc.wait() - except BaseException as e: - # subprocess.run would send the SIGKILL signal to ffmpeg and the - # mp4 file couldn't be played, but if we ask ffmpeg to quit it - # produces a file that is playable (this is mostly useful for live - # streams). Note that Windows is not affected and produces playable - # files (see https://github.com/ytdl-org/youtube-dl/issues/8300). - if isinstance(e, KeyboardInterrupt) and sys.platform != 'win32' and url not in ('-', 'pipe:'): - proc.communicate_or_kill(b'q') - else: - proc.kill() - proc.wait() - raise - return retval + with Popen(args, stdin=subprocess.PIPE, env=env) as proc: + if url in ('-', 'pipe:'): + self.on_process_started(proc, proc.stdin) + try: + retval = proc.wait() + except BaseException as e: + # subprocess.run would send the SIGKILL signal to ffmpeg and the + # mp4 file couldn't be played, but if we ask ffmpeg to quit it + # produces a file that is playable (this is mostly useful for live + # streams). Note that Windows is not affected and produces playable + # files (see https://github.com/ytdl-org/youtube-dl/issues/8300). + if isinstance(e, KeyboardInterrupt) and sys.platform != 'win32' and url not in ('-', 'pipe:'): + proc.communicate_or_kill(b'q') + else: + proc.kill(timeout=None) + raise + return retval class AVconvFD(FFmpegFD):
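The `section_start`/`section_end` hunk above is what lets FFmpegFD serve the new `--download-sections` option: the section is translated into ffmpeg input options. A rough sketch of the resulting argument order, with an illustrative URL and time range (not taken from the patch):

start_time, end_time = 70, 140  # would come from info_dict['section_start'] / info_dict['section_end']
args = []
if start_time:
    args += ['-ss', str(start_time)]            # seek the input to the section start
if end_time:
    args += ['-t', str(end_time - start_time)]  # ffmpeg's -t takes a duration, not an end timestamp
args += ['-i', 'https://example.com/stream.m3u8']
print(args)  # ['-ss', '70', '-t', '70', '-i', 'https://example.com/stream.m3u8']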
diff --git a/yt_dlp/downloader/f4m.py b/yt_dlp/downloader/f4m.py index 3629d63f5..770354de7 100644 --- a/yt_dlp/downloader/f4m.py +++ b/yt_dlp/downloader/f4m.py @@ -1,17 +1,13 @@ +import base64 import io import itertools +import struct import time +import urllib.error +import urllib.parse from .fragment import FragmentFD -from ..compat import ( - compat_b64decode, - compat_etree_fromstring, - compat_struct_pack, - compat_struct_unpack, - compat_urllib_error, - compat_urllib_parse_urlparse, - compat_urlparse, -) +from ..compat import compat_etree_fromstring from ..utils import fix_xml_ampersands, xpath_text @@ -35,13 +31,13 @@ class FlvReader(io.BytesIO): # Utility functions for reading numbers and strings def read_unsigned_long_long(self): - return compat_struct_unpack('!Q', self.read_bytes(8))[0] + return struct.unpack('!Q', self.read_bytes(8))[0] def read_unsigned_int(self): - return compat_struct_unpack('!I', self.read_bytes(4))[0] + return struct.unpack('!I', self.read_bytes(4))[0] def read_unsigned_char(self): - return compat_struct_unpack('!B', self.read_bytes(1))[0] + return struct.unpack('!B', self.read_bytes(1))[0] def read_string(self): res = b'' @@ -203,11 +199,11 @@ def build_fragments_list(boot_info): def write_unsigned_int(stream, val): - stream.write(compat_struct_pack('!I', val)) + stream.write(struct.pack('!I', val)) def write_unsigned_int_24(stream, val): - stream.write(compat_struct_pack('!I', val)[1:]) + stream.write(struct.pack('!I', val)[1:]) def write_flv_header(stream): @@ -301,12 +297,12 @@ class F4mFD(FragmentFD): # 1. http://live-1-1.rutube.ru/stream/1024/HDS/SD/C2NKsS85HQNckgn5HdEmOQ/1454167650/S-s604419906/move/four/dirs/upper/1024-576p.f4m bootstrap_url = node.get('url') if bootstrap_url: - bootstrap_url = compat_urlparse.urljoin( + bootstrap_url = urllib.parse.urljoin( base_url, bootstrap_url) boot_info = self._get_bootstrap_from_url(bootstrap_url) else: bootstrap_url = None - bootstrap = compat_b64decode(node.text) + bootstrap = base64.b64decode(node.text) boot_info = read_bootstrap_info(bootstrap) return boot_info, bootstrap_url @@ -336,14 +332,14 @@ class F4mFD(FragmentFD): # Prefer baseURL for relative URLs as per 11.2 of F4M 3.0 spec. 
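The compat shims dropped throughout this file are thin aliases for stdlib functions; a quick sanity check of the replacements used in this hunk:

import base64
import struct
import urllib.parse

# was compat_struct_unpack / compat_struct_pack ('!' = network byte order)
assert struct.unpack('!I', b'\x00\x00\x00\x2a') == (42,)
assert struct.pack('!I', 42) == b'\x00\x00\x00\x2a'
# was compat_b64decode
assert base64.b64decode('Zm9v') == b'foo'
# was compat_urlparse.urljoin
assert urllib.parse.urljoin('http://host/path/manifest.f4m', 'seg1.f4f') == 'http://host/path/seg1.f4f'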
man_base_url = get_base_url(doc) or man_url - base_url = compat_urlparse.urljoin(man_base_url, media.attrib['url']) + base_url = urllib.parse.urljoin(man_base_url, media.attrib['url']) bootstrap_node = doc.find(_add_ns('bootstrapInfo')) boot_info, bootstrap_url = self._parse_bootstrap_node( bootstrap_node, man_base_url) live = boot_info['live'] metadata_node = media.find(_add_ns('metadata')) if metadata_node is not None: - metadata = compat_b64decode(metadata_node.text) + metadata = base64.b64decode(metadata_node.text) else: metadata = None @@ -371,7 +367,7 @@ class F4mFD(FragmentFD): if not live: write_metadata_tag(dest_stream, metadata) - base_url_parsed = compat_urllib_parse_urlparse(base_url) + base_url_parsed = urllib.parse.urlparse(base_url) self._start_frag_download(ctx, info_dict) @@ -391,9 +387,10 @@ class F4mFD(FragmentFD): query.append(info_dict['extra_param_to_segment_url']) url_parsed = base_url_parsed._replace(path=base_url_parsed.path + name, query='&'.join(query)) try: - success, down_data = self._download_fragment(ctx, url_parsed.geturl(), info_dict) + success = self._download_fragment(ctx, url_parsed.geturl(), info_dict) if not success: return False + down_data = self._read_fragment(ctx) reader = FlvReader(down_data) while True: try: @@ -410,7 +407,7 @@ class F4mFD(FragmentFD): if box_type == b'mdat': self._append_fragment(ctx, box_data) break - except compat_urllib_error.HTTPError as err: + except urllib.error.HTTPError as err: if live and (err.code == 404 or err.code == 410): # We didn't keep up with the live window. Continue # with the next available fragment. diff --git a/yt_dlp/downloader/fragment.py b/yt_dlp/downloader/fragment.py index 4655f067f..3535e0e7d 100644 --- a/yt_dlp/downloader/fragment.py +++ b/yt_dlp/downloader/fragment.py @@ -4,12 +4,14 @@ import http.client import json import math import os +import struct import time +import urllib.error from .common import FileDownloader from .http import HttpFD from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7 -from ..compat import compat_os_name, compat_struct_pack, compat_urllib_error +from ..compat import compat_os_name from ..utils import ( DownloadError, encodeFilename, @@ -23,11 +25,7 @@ class HttpQuietDownloader(HttpFD): def to_screen(self, *args, **kargs): pass - console_title = to_screen - - def report_retry(self, err, count, retries): - super().to_screen( - f'[download] Got server HTTP error: {err}. Retrying (attempt {count} of {self.format_retries(retries)}) ...') + to_console_title = to_screen class FragmentFD(FileDownloader): @@ -70,6 +68,7 @@ class FragmentFD(FileDownloader): self.to_screen( '\r[download] Got server HTTP error: %s. Retrying fragment %d (attempt %d of %s) ...' 
% (error_to_compat_str(err), frag_index, count, self.format_retries(retries))) + self.sleep_retry('fragment', count) def report_skip_fragment(self, frag_index, err=None): err = f' {err};' if err else '' @@ -168,18 +167,11 @@ class FragmentFD: total_frags_str = 'unknown (live)' self.to_screen(f'[{self.FD_NAME}] Total fragments: {total_frags_str}') self.report_destination(ctx['filename']) - dl = HttpQuietDownloader( - self.ydl, - { - 'continuedl': self.params.get('continuedl', True), - 'quiet': self.params.get('quiet'), - 'noprogress': True, - 'ratelimit': self.params.get('ratelimit'), - 'retries': self.params.get('retries', 0), - 'nopart': self.params.get('nopart', False), - 'test': False, - } - ) + dl = HttpQuietDownloader(self.ydl, { + **self.params, + 'noprogress': True, + 'test': False, + }) tmpfilename = self.temp_name(ctx['filename']) open_mode = 'wb' resume_len = 0 @@ -252,6 +244,9 @@ class FragmentFD: if s['status'] not in ('downloading', 'finished'): return + if not total_frags and ctx.get('fragment_count'): + state['fragment_count'] = ctx['fragment_count'] + if ctx_id is not None and s.get('ctx_id') != ctx_id: return @@ -355,7 +350,7 @@ class FragmentFD: decrypt_info = fragment.get('decrypt_info') if not decrypt_info or decrypt_info['METHOD'] != 'AES-128': return frag_content - iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', fragment['media_sequence']) + iv = decrypt_info.get('IV') or struct.pack('>8xq', fragment['media_sequence']) decrypt_info['KEY'] = decrypt_info.get('KEY') or _get_key(info_dict.get('_decryption_key_url') or decrypt_info['URI']) # Don't decrypt the content in tests since the data is explicitly truncated and it's not to a valid block # size (see https://github.com/ytdl-org/youtube-dl/pull/27660). Tests only care that the correct data downloaded, @@ -460,10 +455,11 @@ class FragmentFD: fatal, count = is_fatal(fragment.get('index') or (frag_index - 1)), 0 while count <= fragment_retries: try: + ctx['fragment_count'] = fragment.get('fragment_count') if self._download_fragment(ctx, fragment['url'], info_dict, headers): break return - except (compat_urllib_error.HTTPError, http.client.IncompleteRead) as err: + except (urllib.error.HTTPError, http.client.IncompleteRead) as err: # Unavailable (possibly temporary) fragments may be served. # First we try to retry then either skip or abort. # See https://github.com/ytdl-org/youtube-dl/issues/10165, @@ -506,12 +502,20 @@ class FragmentFD: self.report_warning('The download speed shown is only of one thread. This is a known issue and patches are welcome') with tpe or concurrent.futures.ThreadPoolExecutor(max_workers) as pool: - for fragment, frag_index, frag_filename in pool.map(_download_fragment, fragments): - ctx['fragment_filename_sanitized'] = frag_filename - ctx['fragment_index'] = frag_index - result = append_fragment(decrypt_fragment(fragment, self._read_fragment(ctx)), frag_index, ctx) - if not result: - return False + try: + for fragment, frag_index, frag_filename in pool.map(_download_fragment, fragments): + ctx.update({ + 'fragment_filename_sanitized': frag_filename, + 'fragment_index': frag_index, + }) + if not append_fragment(decrypt_fragment(fragment, self._read_fragment(ctx)), frag_index, ctx): + return False + except KeyboardInterrupt: + self._finish_multiline_status() + self.report_error( 'Interrupted by user. Waiting for all threads to shutdown...', is_error=False, tb=False) + pool.shutdown(wait=False) + raise else: for fragment in fragments: if not interrupt_trigger[0]:
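The `KeyboardInterrupt` branch added above avoids blocking inside the executor while already-queued fragments drain; yt-dlp pairs `pool.shutdown(wait=False)` with the `interrupt_trigger` flag visible in the hunk so queued work exits quickly. A minimal sketch of the same pattern, with a sleep standing in for the fragment download:

import concurrent.futures
import time

interrupted = False  # stands in for yt-dlp's interrupt_trigger

def download_fragment(index):
    if interrupted:
        return None  # work queued before the interrupt falls through fast
    time.sleep(0.1)  # stand-in for the real fragment download
    return index

with concurrent.futures.ThreadPoolExecutor(4) as pool:
    try:
        for frag in pool.map(download_fragment, range(100)):
            pass  # decrypt/append each finished fragment here
    except KeyboardInterrupt:
        interrupted = True
        pool.shutdown(wait=False)  # return immediately instead of waiting on queued fragments
        raise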
diff --git a/yt_dlp/downloader/hls.py b/yt_dlp/downloader/hls.py index 0bd2f121c..f54b3f473 100644 --- a/yt_dlp/downloader/hls.py +++ b/yt_dlp/downloader/hls.py @@ -1,13 +1,13 @@ import binascii import io import re +import urllib.parse +from . import get_suitable_downloader from .external import FFmpegFD from .fragment import FragmentFD from .. import webvtt -from ..compat import compat_urlparse from ..dependencies import Cryptodome_AES -from ..downloader import get_suitable_downloader from ..utils import bug_reports_message, parse_m3u8_attributes, update_url_query @@ -61,12 +61,18 @@ class HlsFD(FragmentFD): s = urlh.read().decode('utf-8', 'ignore') can_download, message = self.can_download(s, info_dict, self.params.get('allow_unplayable_formats')), None - if can_download and not Cryptodome_AES and '#EXT-X-KEY:METHOD=AES-128' in s: - if FFmpegFD.available(): + if can_download: + has_ffmpeg = FFmpegFD.available() + no_crypto = not Cryptodome_AES and '#EXT-X-KEY:METHOD=AES-128' in s + if no_crypto and has_ffmpeg: can_download, message = False, 'The stream has AES-128 encryption and pycryptodome is not available' - else: + elif no_crypto: message = ('The stream has AES-128 encryption and neither ffmpeg nor pycryptodome are available; ' 'Decryption will be performed natively, but will be extremely slow') + elif re.search(r'#EXT-X-MEDIA-SEQUENCE:(?!0$)', s): + install_ffmpeg = '' if has_ffmpeg else 'install ffmpeg and ' + message = ('Live HLS streams are not supported by the native downloader. If this is a livestream, ' + f'please {install_ffmpeg}add "--downloader ffmpeg --hls-use-mpegts" to your command') if not can_download: has_drm = re.search('|'.join([ r'#EXT-X-FAXS-CM:', # Adobe Flash Access @@ -140,7 +146,7 @@ class HlsFD(FragmentFD): extra_query = None extra_param_to_segment_url = info_dict.get('extra_param_to_segment_url') if extra_param_to_segment_url: - extra_query = compat_urlparse.parse_qs(extra_param_to_segment_url) + extra_query = urllib.parse.parse_qs(extra_param_to_segment_url) i = 0 media_sequence = 0 decrypt_info = {'METHOD': 'NONE'} @@ -162,7 +168,7 @@ class HlsFD(FragmentFD): frag_url = ( line if re.match(r'^https?://', line) - else compat_urlparse.urljoin(man_url, line)) + else urllib.parse.urljoin(man_url, line)) if extra_query: frag_url = update_url_query(frag_url, extra_query) @@ -187,7 +193,7 @@ class HlsFD(FragmentFD): frag_url = ( map_info.get('URI') if re.match(r'^https?://', map_info.get('URI')) - else compat_urlparse.urljoin(man_url, map_info.get('URI'))) + else urllib.parse.urljoin(man_url, map_info.get('URI'))) if extra_query: frag_url = update_url_query(frag_url, extra_query) @@ -215,7 +221,7 @@ class HlsFD(FragmentFD): if 'IV' in decrypt_info: decrypt_info['IV'] = binascii.unhexlify(decrypt_info['IV'][2:].zfill(32)) if not re.match(r'^https?://', decrypt_info['URI']): - decrypt_info['URI'] = compat_urlparse.urljoin( + decrypt_info['URI'] = urllib.parse.urljoin( man_url, decrypt_info['URI']) if extra_query: decrypt_info['URI'] = update_url_query(decrypt_info['URI'], extra_query) diff --git a/yt_dlp/downloader/http.py b/yt_dlp/downloader/http.py index 12a2f0cc7..6b59320b8 100644 --- a/yt_dlp/downloader/http.py +++ b/yt_dlp/downloader/http.py @@ -1,11 +1,12 @@ +import http.client import os import random import socket import ssl import time +import urllib.error from .common import FileDownloader -from 
..compat import compat_http_client, compat_urllib_error from ..utils import ( ContentTooShortError, ThrottledDownload, @@ -24,7 +25,7 @@ RESPONSE_READ_EXCEPTIONS = ( socket.timeout, # compat: py < 3.10 ConnectionError, ssl.SSLError, - compat_http_client.HTTPException + http.client.HTTPException ) @@ -136,20 +137,18 @@ class HttpFD(FileDownloader): if has_range: content_range = ctx.data.headers.get('Content-Range') content_range_start, content_range_end, content_len = parse_http_range(content_range) - if content_range_start is not None and range_start == content_range_start: - # Content-Range is present and matches requested Range, resume is possible - accept_content_len = ( + # Content-Range is present and matches requested Range, resume is possible + if range_start == content_range_start and ( # Non-chunked download not ctx.chunk_size # Chunked download and requested piece or # its part is promised to be served or content_range_end == range_end - or content_len < range_end) - if accept_content_len: - ctx.content_len = content_len - if content_len or req_end: - ctx.data_len = min(content_len or req_end, req_end or content_len) - (req_start or 0) - return + or content_len < range_end): + ctx.content_len = content_len + if content_len or req_end: + ctx.data_len = min(content_len or req_end, req_end or content_len) - (req_start or 0) + return # Content-Range is either not present or invalid. Assuming remote webserver is # trying to send the whole file, resume is not possible, so wiping the local file # and performing entire redownload @@ -157,7 +156,7 @@ class HttpFD(FileDownloader): ctx.resume_len = 0 ctx.open_mode = 'wb' ctx.data_len = ctx.content_len = int_or_none(ctx.data.info().get('Content-length', None)) - except compat_urllib_error.HTTPError as err: + except urllib.error.HTTPError as err: if err.code == 416: # Unable to resume (requested range not satisfiable) try: @@ -165,7 +164,7 @@ class HttpFD(FileDownloader): ctx.data = self.ydl.urlopen( sanitized_Request(url, request_data, headers)) content_length = ctx.data.info()['Content-Length'] - except compat_urllib_error.HTTPError as err: + except urllib.error.HTTPError as err: if err.code < 500 or err.code >= 600: raise else: @@ -198,7 +197,7 @@ class HttpFD(FileDownloader): # Unexpected HTTP error raise raise RetryDownload(err) - except compat_urllib_error.URLError as err: + except urllib.error.URLError as err: if isinstance(err.reason, ssl.CertificateError): raise raise RetryDownload(err) diff --git a/yt_dlp/downloader/ism.py b/yt_dlp/downloader/ism.py index 9efc5e4d9..8a0071ab3 100644 --- a/yt_dlp/downloader/ism.py +++ b/yt_dlp/downloader/ism.py @@ -2,9 +2,9 @@ import binascii import io import struct import time +import urllib.error from .fragment import FragmentFD -from ..compat import compat_urllib_error u8 = struct.Struct('>B') u88 = struct.Struct('>Bx') @@ -268,7 +268,7 @@ class IsmFD(FragmentFD): extra_state['ism_track_written'] = True self._append_fragment(ctx, frag_content) break - except compat_urllib_error.HTTPError as err: + except urllib.error.HTTPError as err: count += 1 if count <= fragment_retries: self.report_retry_fragment(err, frag_index, count, fragment_retries) diff --git a/yt_dlp/downloader/niconico.py b/yt_dlp/downloader/niconico.py index 5e9dda03d..77ed39e5b 100644 --- a/yt_dlp/downloader/niconico.py +++ b/yt_dlp/downloader/niconico.py @@ -1,8 +1,7 @@ import threading +from . 
import get_suitable_downloader from .common import FileDownloader -from ..downloader import get_suitable_downloader -from ..extractor.niconico import NiconicoIE from ..utils import sanitized_Request @@ -10,8 +9,9 @@ class NiconicoDmcFD(FileDownloader): """ Downloading niconico douga from DMC with heartbeat """ def real_download(self, filename, info_dict): - self.to_screen('[%s] Downloading from DMC' % self.FD_NAME) + from ..extractor.niconico import NiconicoIE + self.to_screen('[%s] Downloading from DMC' % self.FD_NAME) ie = NiconicoIE(self.ydl) info_dict, heartbeat_info_dict = ie._get_heartbeat_info(info_dict) diff --git a/yt_dlp/downloader/rtmp.py b/yt_dlp/downloader/rtmp.py index 3464eeef9..0e0952599 100644 --- a/yt_dlp/downloader/rtmp.py +++ b/yt_dlp/downloader/rtmp.py @@ -4,7 +4,6 @@ import subprocess import time from .common import FileDownloader -from ..compat import compat_str from ..utils import ( Popen, check_executable, @@ -92,8 +91,7 @@ class RtmpFD(FileDownloader): self.to_screen('') return proc.wait() except BaseException: # Including KeyboardInterrupt - proc.kill() - proc.wait() + proc.kill(timeout=None) raise url = info_dict['url'] @@ -144,7 +142,7 @@ class RtmpFD(FileDownloader): if isinstance(conn, list): for entry in conn: basic_args += ['--conn', entry] - elif isinstance(conn, compat_str): + elif isinstance(conn, str): basic_args += ['--conn', conn] if protocol is not None: basic_args += ['--protocol', protocol] diff --git a/yt_dlp/downloader/youtube_live_chat.py b/yt_dlp/downloader/youtube_live_chat.py index cc528029d..5334c6c95 100644 --- a/yt_dlp/downloader/youtube_live_chat.py +++ b/yt_dlp/downloader/youtube_live_chat.py @@ -1,9 +1,8 @@ import json import time +import urllib.error from .fragment import FragmentFD -from ..compat import compat_urllib_error -from ..extractor.youtube import YoutubeBaseInfoExtractor as YT_BaseIE from ..utils import RegexNotFoundError, dict_get, int_or_none, try_get @@ -26,7 +25,9 @@ class YoutubeLiveChatFD(FragmentFD): 'total_frags': None, } - ie = YT_BaseIE(self.ydl) + from ..extractor.youtube import YoutubeBaseInfoExtractor + + ie = YoutubeBaseInfoExtractor(self.ydl) start_time = int(time.time() * 1000) @@ -127,7 +128,7 @@ class YoutubeLiveChatFD(FragmentFD): elif info_dict['protocol'] == 'youtube_live_chat': continuation_id, offset, click_tracking_params = parse_actions_live(live_chat_continuation) return True, continuation_id, offset, click_tracking_params - except compat_urllib_error.HTTPError as err: + except urllib.error.HTTPError as err: count += 1 if count <= fragment_retries: self.report_retry_fragment(err, frag_index, count, fragment_retries) diff --git a/yt_dlp/extractor/__init__.py b/yt_dlp/extractor/__init__.py index afd3d05ac..6bfa4bd7b 100644 --- a/yt_dlp/extractor/__init__.py +++ b/yt_dlp/extractor/__init__.py @@ -1,32 +1,15 @@ -import contextlib -import os +from ..compat.compat_utils import passthrough_module -from ..utils import load_plugins - -_LAZY_LOADER = False -if not os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'): - with contextlib.suppress(ImportError): - from .lazy_extractors import * # noqa: F403 - from .lazy_extractors import _ALL_CLASSES - _LAZY_LOADER = True - -if not _LAZY_LOADER: - from .extractors import * # noqa: F403 - _ALL_CLASSES = [ # noqa: F811 - klass - for name, klass in globals().items() - if name.endswith('IE') and name != 'GenericIE' - ] - _ALL_CLASSES.append(GenericIE) # noqa: F405 - -_PLUGIN_CLASSES = load_plugins('extractor', 'IE', globals()) -_ALL_CLASSES = list(_PLUGIN_CLASSES.values()) + _ALL_CLASSES +passthrough_module(__name__, '.extractors') +del passthrough_module def gen_extractor_classes(): """ Return a list of supported extractors. The order does matter; the first extractor matched is the one handling the URL. """ + from .extractors import _ALL_CLASSES + return _ALL_CLASSES @@ -39,10 +22,12 @@ def gen_extractors(): def list_extractor_classes(age_limit=None): """Return a list of extractors that are suitable for the given age, sorted by extractor name""" + from .generic import GenericIE + yield from sorted(filter( - lambda ie: ie.is_suitable(age_limit) and ie != GenericIE, # noqa: F405 + lambda ie: ie.is_suitable(age_limit) and ie != GenericIE, gen_extractor_classes()), key=lambda ie: ie.IE_NAME.lower()) - yield GenericIE # noqa: F405 + yield GenericIE def list_extractors(age_limit=None): @@ -52,4 +37,6 @@ def list_extractors(age_limit=None): def get_info_extractor(ie_name): """Returns the info extractor class with the given ie_name""" - return globals()[ie_name + 'IE'] + from . import extractors + + return getattr(extractors, f'{ie_name}IE')
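The rewritten `yt_dlp/extractor/__init__.py` above defers importing the ~2200 extractor classes until something actually asks for one ("[extractor] Import `_ALL_CLASSES` lazily" in the changelog). A rough sketch of the underlying idea, here modelled with a plain module-level `__getattr__` (PEP 562) rather than yt-dlp's own `passthrough_module` helper:

# lazy_pkg/__init__.py -- illustrative only
import importlib

def __getattr__(name):
    # Resolve unknown attributes against the heavy submodule on first access,
    # so `import lazy_pkg` stays cheap until an extractor is actually needed.
    extractors = importlib.import_module('.extractors', __package__)
    return getattr(extractors, name)

def get_info_extractor(ie_name):
    # Mirrors the patched get_info_extractor: look up <Name>IE lazily
    return __getattr__(f'{ie_name}IE')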
diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py new file mode 100644 index 000000000..37328dfc8 --- /dev/null +++ b/yt_dlp/extractor/_extractors.py @@ -0,0 +1,2198 @@ +# flake8: noqa: F401 + +from .abc import ( + ABCIE, + ABCIViewIE, + ABCIViewShowSeriesIE, +) +from .abcnews import ( + AbcNewsIE, + AbcNewsVideoIE, +) +from .abcotvs import ( + ABCOTVSIE, + ABCOTVSClipsIE, +) +from .abematv import ( + AbemaTVIE, + AbemaTVTitleIE, +) +from .academicearth import AcademicEarthCourseIE +from .acast import ( + ACastIE, + ACastChannelIE, +) +from .adn import ADNIE +from .adobeconnect import AdobeConnectIE +from .adobetv import ( + AdobeTVEmbedIE, + AdobeTVIE, + AdobeTVShowIE, + AdobeTVChannelIE, + AdobeTVVideoIE, +) +from .adultswim import AdultSwimIE +from .aenetworks import ( + AENetworksIE, + AENetworksCollectionIE, + AENetworksShowIE, + HistoryTopicIE, + HistoryPlayerIE, + BiographyIE, +) +from .afreecatv import ( + AfreecaTVIE, + AfreecaTVLiveIE, + AfreecaTVUserIE, +) +from .airmozilla import AirMozillaIE +from .aljazeera import AlJazeeraIE +from .alphaporno import AlphaPornoIE +from .amara import AmaraIE +from .alura import ( + AluraIE, + AluraCourseIE +) +from .amcnetworks import AMCNetworksIE +from .amazon import AmazonStoreIE +from .americastestkitchen import ( + AmericasTestKitchenIE, + AmericasTestKitchenSeasonIE, +) +from .animeondemand import AnimeOnDemandIE +from .anvato import AnvatoIE +from .aol import AolIE +from .allocine import AllocineIE +from .aliexpress import AliExpressLiveIE +from .alsace20tv import ( + Alsace20TVIE, + Alsace20TVEmbedIE, +) +from .apa import APAIE +from .aparat import AparatIE +from .appleconnect import AppleConnectIE +from .appletrailers import ( + AppleTrailersIE, + AppleTrailersSectionIE, +) +from .applepodcasts import ApplePodcastsIE +from .archiveorg import ( + ArchiveOrgIE, + YoutubeWebArchiveIE, +) +from .arcpublishing import ArcPublishingIE +from .arkena import ArkenaIE +from .ard import ( + ARDBetaMediathekIE, + ARDIE, + ARDMediathekIE, +) +from .arte import ( + ArteTVIE, + ArteTVEmbedIE, + ArteTVPlaylistIE, + ArteTVCategoryIE, +) +from .arnes import ArnesIE +from .asiancrush import ( + AsianCrushIE, + AsianCrushPlaylistIE, +) +from .atresplayer import AtresPlayerIE +from .atscaleconf import AtScaleConfEventIE +from .atttechchannel import ATTTechChannelIE +from .atvat import ATVAtIE +from .audimedia import AudiMediaIE +from .audioboom import AudioBoomIE 
+from .audiomack import AudiomackIE, AudiomackAlbumIE +from .audius import ( + AudiusIE, + AudiusTrackIE, + AudiusPlaylistIE, + AudiusProfileIE, +) +from .awaan import ( + AWAANIE, + AWAANVideoIE, + AWAANLiveIE, + AWAANSeasonIE, +) +from .azmedien import AZMedienIE +from .baidu import BaiduVideoIE +from .banbye import ( + BanByeIE, + BanByeChannelIE, +) +from .bandaichannel import BandaiChannelIE +from .bandcamp import ( + BandcampIE, + BandcampAlbumIE, + BandcampWeeklyIE, + BandcampUserIE, +) +from .bannedvideo import BannedVideoIE +from .bbc import ( + BBCCoUkIE, + BBCCoUkArticleIE, + BBCCoUkIPlayerEpisodesIE, + BBCCoUkIPlayerGroupIE, + BBCCoUkPlaylistIE, + BBCIE, +) +from .beeg import BeegIE +from .behindkink import BehindKinkIE +from .bellmedia import BellMediaIE +from .beatport import BeatportIE +from .bet import BetIE +from .bfi import BFIPlayerIE +from .bfmtv import ( + BFMTVIE, + BFMTVLiveIE, + BFMTVArticleIE, +) +from .bibeltv import BibelTVIE +from .bigflix import BigflixIE +from .bigo import BigoIE +from .bild import BildIE +from .bilibili import ( + BiliBiliIE, + BiliBiliSearchIE, + BilibiliCategoryIE, + BiliBiliBangumiIE, + BilibiliAudioIE, + BilibiliAudioAlbumIE, + BiliBiliPlayerIE, + BilibiliChannelIE, + BiliIntlIE, + BiliIntlSeriesIE, + BiliLiveIE, +) +from .biobiochiletv import BioBioChileTVIE +from .bitchute import ( + BitChuteIE, + BitChuteChannelIE, +) +from .bitwave import ( + BitwaveReplayIE, + BitwaveStreamIE, +) +from .biqle import BIQLEIE +from .blackboardcollaborate import BlackboardCollaborateIE +from .bleacherreport import ( + BleacherReportIE, + BleacherReportCMSIE, +) +from .blogger import BloggerIE +from .bloomberg import BloombergIE +from .bokecc import BokeCCIE +from .bongacams import BongaCamsIE +from .bostonglobe import BostonGlobeIE +from .box import BoxIE +from .bpb import BpbIE +from .br import ( + BRIE, + BRMediathekIE, +) +from .bravotv import BravoTVIE +from .breakcom import BreakIE +from .breitbart import BreitBartIE +from .brightcove import ( + BrightcoveLegacyIE, + BrightcoveNewIE, +) +from .businessinsider import BusinessInsiderIE +from .buzzfeed import BuzzFeedIE +from .byutv import BYUtvIE +from .c56 import C56IE +from .cableav import CableAVIE +from .callin import CallinIE +from .caltrans import CaltransIE +from .cam4 import CAM4IE +from .camdemy import ( + CamdemyIE, + CamdemyFolderIE +) +from .cammodels import CamModelsIE +from .camwithher import CamWithHerIE +from .canalalpha import CanalAlphaIE +from .canalplus import CanalplusIE +from .canalc2 import Canalc2IE +from .canvas import ( + CanvasIE, + CanvasEenIE, + VrtNUIE, + DagelijkseKostIE, +) +from .carambatv import ( + CarambaTVIE, + CarambaTVPageIE, +) +from .cartoonnetwork import CartoonNetworkIE +from .cbc import ( + CBCIE, + CBCPlayerIE, + CBCGemIE, + CBCGemPlaylistIE, + CBCGemLiveIE, +) +from .cbs import CBSIE +from .cbslocal import ( + CBSLocalIE, + CBSLocalArticleIE, +) +from .cbsinteractive import CBSInteractiveIE +from .cbsnews import ( + CBSNewsEmbedIE, + CBSNewsIE, + CBSNewsLiveVideoIE, +) +from .cbssports import ( + CBSSportsEmbedIE, + CBSSportsIE, + TwentyFourSevenSportsIE, +) +from .ccc import ( + CCCIE, + CCCPlaylistIE, +) +from .ccma import CCMAIE +from .cctv import CCTVIE +from .cda import CDAIE +from .ceskatelevize import CeskaTelevizeIE +from .cgtn import CGTNIE +from .channel9 import Channel9IE +from .charlierose import CharlieRoseIE +from .chaturbate import ChaturbateIE +from .chilloutzone import ChilloutzoneIE +from .chingari import ( + ChingariIE, + 
ChingariUserIE, +) +from .chirbit import ( + ChirbitIE, + ChirbitProfileIE, +) +from .cinchcast import CinchcastIE +from .cinemax import CinemaxIE +from .ciscolive import ( + CiscoLiveSessionIE, + CiscoLiveSearchIE, +) +from .ciscowebex import CiscoWebexIE +from .cjsw import CJSWIE +from .cliphunter import CliphunterIE +from .clippit import ClippitIE +from .cliprs import ClipRsIE +from .clipsyndicate import ClipsyndicateIE +from .closertotruth import CloserToTruthIE +from .cloudflarestream import CloudflareStreamIE +from .cloudy import CloudyIE +from .clubic import ClubicIE +from .clyp import ClypIE +from .cmt import CMTIE +from .cnbc import ( + CNBCIE, + CNBCVideoIE, +) +from .cnn import ( + CNNIE, + CNNBlogsIE, + CNNArticleIE, +) +from .coub import CoubIE +from .comedycentral import ( + ComedyCentralIE, + ComedyCentralTVIE, +) +from .commonmistakes import CommonMistakesIE, UnicodeBOMIE +from .commonprotocols import ( + MmsIE, + RtmpIE, + ViewSourceIE, +) +from .condenast import CondeNastIE +from .contv import CONtvIE +from .corus import CorusIE +from .cpac import ( + CPACIE, + CPACPlaylistIE, +) +from .cozytv import CozyTVIE +from .cracked import CrackedIE +from .crackle import CrackleIE +from .craftsy import CraftsyIE +from .crooksandliars import CrooksAndLiarsIE +from .crowdbunker import ( + CrowdBunkerIE, + CrowdBunkerChannelIE, +) +from .crunchyroll import ( + CrunchyrollIE, + CrunchyrollShowPlaylistIE, + CrunchyrollBetaIE, + CrunchyrollBetaShowIE, +) +from .cspan import CSpanIE, CSpanCongressIE +from .ctsnews import CtsNewsIE +from .ctv import CTVIE +from .ctvnews import CTVNewsIE +from .cultureunplugged import CultureUnpluggedIE +from .curiositystream import ( + CuriosityStreamIE, + CuriosityStreamCollectionsIE, + CuriosityStreamSeriesIE, +) +from .cwtv import CWTVIE +from .cybrary import ( + CybraryIE, + CybraryCourseIE +) +from .daftsex import DaftsexIE +from .dailymail import DailyMailIE +from .dailymotion import ( + DailymotionIE, + DailymotionPlaylistIE, + DailymotionUserIE, +) +from .dailywire import ( + DailyWireIE, + DailyWirePodcastIE, +) +from .damtomo import ( + DamtomoRecordIE, + DamtomoVideoIE, +) +from .daum import ( + DaumIE, + DaumClipIE, + DaumPlaylistIE, + DaumUserIE, +) +from .daystar import DaystarClipIE +from .dbtv import DBTVIE +from .dctp import DctpTvIE +from .deezer import ( + DeezerPlaylistIE, + DeezerAlbumIE, +) +from .democracynow import DemocracynowIE +from .dfb import DFBIE +from .dhm import DHMIE +from .digg import DiggIE +from .dotsub import DotsubIE +from .douyutv import ( + DouyuShowIE, + DouyuTVIE, +) +from .dplay import ( + DPlayIE, + DiscoveryPlusIE, + HGTVDeIE, + GoDiscoveryIE, + TravelChannelIE, + CookingChannelIE, + HGTVUsaIE, + FoodNetworkIE, + InvestigationDiscoveryIE, + DestinationAmericaIE, + AmHistoryChannelIE, + ScienceChannelIE, + DIYNetworkIE, + DiscoveryLifeIE, + AnimalPlanetIE, + TLCIE, + DiscoveryPlusIndiaIE, + DiscoveryNetworksDeIE, + DiscoveryPlusItalyIE, + DiscoveryPlusItalyShowIE, + DiscoveryPlusIndiaShowIE, +) +from .dreisat import DreiSatIE +from .drbonanza import DRBonanzaIE +from .drtuber import DrTuberIE +from .drtv import ( + DRTVIE, + DRTVLiveIE, +) +from .dtube import DTubeIE +from .dvtv import DVTVIE +from .duboku import ( + DubokuIE, + DubokuPlaylistIE +) +from .dumpert import DumpertIE +from .defense import DefenseGouvFrIE +from .digitalconcerthall import DigitalConcertHallIE +from .discovery import DiscoveryIE +from .disney import DisneyIE +from .dispeak import DigitallySpeakingIE +from .doodstream import 
DoodStreamIE +from .dropbox import DropboxIE +from .dropout import ( + DropoutSeasonIE, + DropoutIE +) +from .dw import ( + DWIE, + DWArticleIE, +) +from .eagleplatform import EaglePlatformIE +from .ebaumsworld import EbaumsWorldIE +from .echomsk import EchoMskIE +from .egghead import ( + EggheadCourseIE, + EggheadLessonIE, +) +from .ehow import EHowIE +from .eighttracks import EightTracksIE +from .einthusan import EinthusanIE +from .eitb import EitbIE +from .ellentube import ( + EllenTubeIE, + EllenTubeVideoIE, + EllenTubePlaylistIE, +) +from .elonet import ElonetIE +from .elpais import ElPaisIE +from .embedly import EmbedlyIE +from .engadget import EngadgetIE +from .epicon import ( + EpiconIE, + EpiconSeriesIE, +) +from .eporner import EpornerIE +from .eroprofile import ( + EroProfileIE, + EroProfileAlbumIE, +) +from .ertgr import ( + ERTFlixCodenameIE, + ERTFlixIE, + ERTWebtvEmbedIE, +) +from .escapist import EscapistIE +from .espn import ( + ESPNIE, + WatchESPNIE, + ESPNArticleIE, + FiveThirtyEightIE, + ESPNCricInfoIE, +) +from .esri import EsriVideoIE +from .europa import EuropaIE +from .europeantour import EuropeanTourIE +from .euscreen import EUScreenIE +from .expotv import ExpoTVIE +from .expressen import ExpressenIE +from .extremetube import ExtremeTubeIE +from .eyedotv import EyedoTVIE +from .facebook import ( + FacebookIE, + FacebookPluginsVideoIE, + FacebookRedirectURLIE, +) +from .fancode import ( + FancodeVodIE, + FancodeLiveIE +) + +from .faz import FazIE +from .fc2 import ( + FC2IE, + FC2EmbedIE, + FC2LiveIE, +) +from .fczenit import FczenitIE +from .fifa import FifaIE +from .filmmodu import FilmmoduIE +from .filmon import ( + FilmOnIE, + FilmOnChannelIE, +) +from .filmweb import FilmwebIE +from .firsttv import FirstTVIE +from .fivetv import FiveTVIE +from .flickr import FlickrIE +from .folketinget import FolketingetIE +from .footyroom import FootyRoomIE +from .formula1 import Formula1IE +from .fourtube import ( + FourTubeIE, + PornTubeIE, + PornerBrosIE, + FuxIE, +) +from .fourzerostudio import ( + FourZeroStudioArchiveIE, + FourZeroStudioClipIE, +) +from .fox import FOXIE +from .fox9 import ( + FOX9IE, + FOX9NewsIE, +) +from .foxgay import FoxgayIE +from .foxnews import ( + FoxNewsIE, + FoxNewsArticleIE, +) +from .foxsports import FoxSportsIE +from .fptplay import FptplayIE +from .franceinter import FranceInterIE +from .francetv import ( + FranceTVIE, + FranceTVSiteIE, + FranceTVInfoIE, +) +from .freesound import FreesoundIE +from .freespeech import FreespeechIE +from .frontendmasters import ( + FrontendMastersIE, + FrontendMastersLessonIE, + FrontendMastersCourseIE +) +from .freetv import ( + FreeTvIE, + FreeTvMoviesIE, +) +from .fujitv import FujiTVFODPlus7IE +from .funimation import ( + FunimationIE, + FunimationPageIE, + FunimationShowIE, +) +from .funk import FunkIE +from .fusion import FusionIE +from .fuyintv import FuyinTVIE +from .gab import ( + GabTVIE, + GabIE, +) +from .gaia import GaiaIE +from .gameinformer import GameInformerIE +from .gamejolt import ( + GameJoltIE, + GameJoltUserIE, + GameJoltGameIE, + GameJoltGameSoundtrackIE, + GameJoltCommunityIE, + GameJoltSearchIE, +) +from .gamespot import GameSpotIE +from .gamestar import GameStarIE +from .gaskrank import GaskrankIE +from .gazeta import GazetaIE +from .gdcvault import GDCVaultIE +from .gedidigital import GediDigitalIE +from .generic import GenericIE +from .gettr import ( + GettrIE, + GettrStreamingIE, +) +from .gfycat import GfycatIE +from .giantbomb import GiantBombIE +from .giga import GigaIE +from 
.glide import GlideIE +from .globo import ( + GloboIE, + GloboArticleIE, +) +from .go import GoIE +from .godtube import GodTubeIE +from .gofile import GofileIE +from .golem import GolemIE +from .goodgame import GoodGameIE +from .googledrive import ( + GoogleDriveIE, + GoogleDriveFolderIE, +) +from .googlepodcasts import ( + GooglePodcastsIE, + GooglePodcastsFeedIE, +) +from .googlesearch import GoogleSearchIE +from .gopro import GoProIE +from .goshgay import GoshgayIE +from .gotostage import GoToStageIE +from .gputechconf import GPUTechConfIE +from .gronkh import ( + GronkhIE, + GronkhFeedIE, + GronkhVodsIE +) +from .groupon import GrouponIE +from .hbo import HBOIE +from .hearthisat import HearThisAtIE +from .heise import HeiseIE +from .hellporno import HellPornoIE +from .helsinki import HelsinkiIE +from .hentaistigma import HentaiStigmaIE +from .hgtv import HGTVComShowIE +from .hketv import HKETVIE +from .hidive import HiDiveIE +from .historicfilms import HistoricFilmsIE +from .hitbox import HitboxIE, HitboxLiveIE +from .hitrecord import HitRecordIE +from .hotnewhiphop import HotNewHipHopIE +from .hotstar import ( + HotStarIE, + HotStarPrefixIE, + HotStarPlaylistIE, + HotStarSeriesIE, +) +from .howcast import HowcastIE +from .howstuffworks import HowStuffWorksIE +from .hrfensehen import HRFernsehenIE +from .hrti import ( + HRTiIE, + HRTiPlaylistIE, +) +from .hse import ( + HSEShowIE, + HSEProductIE, +) +from .huajiao import HuajiaoIE +from .huya import HuyaLiveIE +from .huffpost import HuffPostIE +from .hungama import ( + HungamaIE, + HungamaSongIE, + HungamaAlbumPlaylistIE, +) +from .hypem import HypemIE +from .icareus import IcareusIE +from .ichinanalive import ( + IchinanaLiveIE, + IchinanaLiveClipIE, +) +from .ign import ( + IGNIE, + IGNVideoIE, + IGNArticleIE, +) +from .iheart import ( + IHeartRadioIE, + IHeartRadioPodcastIE, +) +from .imdb import ( + ImdbIE, + ImdbListIE +) +from .imgur import ( + ImgurIE, + ImgurAlbumIE, + ImgurGalleryIE, +) +from .ina import InaIE +from .inc import IncIE +from .indavideo import IndavideoEmbedIE +from .infoq import InfoQIE +from .instagram import ( + InstagramIE, + InstagramIOSIE, + InstagramUserIE, + InstagramTagIE, + InstagramStoryIE, +) +from .internazionale import InternazionaleIE +from .internetvideoarchive import InternetVideoArchiveIE +from .iprima import ( + IPrimaIE, + IPrimaCNNIE +) +from .iqiyi import ( + IqiyiIE, + IqIE, + IqAlbumIE +) +from .itprotv import ( + ITProTVIE, + ITProTVCourseIE +) +from .itv import ( + ITVIE, + ITVBTCCIE, +) +from .ivi import ( + IviIE, + IviCompilationIE +) +from .ivideon import IvideonIE +from .iwara import ( + IwaraIE, + IwaraPlaylistIE, + IwaraUserIE, +) +from .ixigua import IxiguaIE +from .izlesene import IzleseneIE +from .jable import ( + JableIE, + JablePlaylistIE, +) +from .jamendo import ( + JamendoIE, + JamendoAlbumIE, +) +from .jeuxvideo import JeuxVideoIE +from .jove import JoveIE +from .joj import JojIE +from .jwplatform import JWPlatformIE +from .kakao import KakaoIE +from .kaltura import KalturaIE +from .karaoketv import KaraoketvIE +from .karrierevideos import KarriereVideosIE +from .keezmovies import KeezMoviesIE +from .kelbyone import KelbyOneIE +from .ketnet import KetnetIE +from .khanacademy import ( + KhanAcademyIE, + KhanAcademyUnitIE, +) +from .kicker import KickerIE +from .kickstarter import KickStarterIE +from .kinja import KinjaEmbedIE +from .kinopoisk import KinoPoiskIE +from .konserthusetplay import KonserthusetPlayIE +from .koo import KooIE +from .kth import KTHIE +from .krasview 
import KrasViewIE +from .ku6 import Ku6IE +from .kusi import KUSIIE +from .kuwo import ( + KuwoIE, + KuwoAlbumIE, + KuwoChartIE, + KuwoSingerIE, + KuwoCategoryIE, + KuwoMvIE, +) +from .la7 import ( + LA7IE, + LA7PodcastEpisodeIE, + LA7PodcastIE, +) +from .laola1tv import ( + Laola1TvEmbedIE, + Laola1TvIE, + EHFTVIE, + ITTFIE, +) +from .lastfm import ( + LastFMIE, + LastFMPlaylistIE, + LastFMUserIE, +) +from .lbry import ( + LBRYIE, + LBRYChannelIE, +) +from .lci import LCIIE +from .lcp import ( + LcpPlayIE, + LcpIE, +) +from .lecture2go import Lecture2GoIE +from .lecturio import ( + LecturioIE, + LecturioCourseIE, + LecturioDeCourseIE, +) +from .leeco import ( + LeIE, + LePlaylistIE, + LetvCloudIE, +) +from .lego import LEGOIE +from .lemonde import LemondeIE +from .lenta import LentaIE +from .libraryofcongress import LibraryOfCongressIE +from .libsyn import LibsynIE +from .lifenews import ( + LifeNewsIE, + LifeEmbedIE, +) +from .likee import ( + LikeeIE, + LikeeUserIE +) +from .limelight import ( + LimelightMediaIE, + LimelightChannelIE, + LimelightChannelListIE, +) +from .line import ( + LineLiveIE, + LineLiveChannelIE, +) +from .linkedin import ( + LinkedInIE, + LinkedInLearningIE, + LinkedInLearningCourseIE, +) +from .linuxacademy import LinuxAcademyIE +from .litv import LiTVIE +from .livejournal import LiveJournalIE +from .livestream import ( + LivestreamIE, + LivestreamOriginalIE, + LivestreamShortenerIE, +) +from .lnkgo import ( + LnkGoIE, + LnkIE, +) +from .localnews8 import LocalNews8IE +from .lovehomeporn import LoveHomePornIE +from .lrt import ( + LRTVODIE, + LRTStreamIE +) +from .lynda import ( + LyndaIE, + LyndaCourseIE +) +from .m6 import M6IE +from .magentamusik360 import MagentaMusik360IE +from .mailru import ( + MailRuIE, + MailRuMusicIE, + MailRuMusicSearchIE, +) +from .mainstreaming import MainStreamingIE +from .malltv import MallTVIE +from .mangomolo import ( + MangomoloVideoIE, + MangomoloLiveIE, +) +from .manoto import ( + ManotoTVIE, + ManotoTVShowIE, + ManotoTVLiveIE, +) +from .manyvids import ManyVidsIE +from .maoritv import MaoriTVIE +from .markiza import ( + MarkizaIE, + MarkizaPageIE, +) +from .massengeschmacktv import MassengeschmackTVIE +from .masters import MastersIE +from .matchtv import MatchTVIE +from .mdr import MDRIE +from .medaltv import MedalTVIE +from .mediaite import MediaiteIE +from .mediaklikk import MediaKlikkIE +from .mediaset import ( + MediasetIE, + MediasetShowIE, +) +from .mediasite import ( + MediasiteIE, + MediasiteCatalogIE, + MediasiteNamedCatalogIE, +) +from .medici import MediciIE +from .megaphone import MegaphoneIE +from .meipai import MeipaiIE +from .melonvod import MelonVODIE +from .meta import METAIE +from .metacafe import MetacafeIE +from .metacritic import MetacriticIE +from .mgoon import MgoonIE +from .mgtv import MGTVIE +from .miaopai import MiaoPaiIE +from .microsoftstream import MicrosoftStreamIE +from .microsoftvirtualacademy import ( + MicrosoftVirtualAcademyIE, + MicrosoftVirtualAcademyCourseIE, +) +from .mildom import ( + MildomIE, + MildomVodIE, + MildomClipIE, + MildomUserVodIE, +) +from .minds import ( + MindsIE, + MindsChannelIE, + MindsGroupIE, +) +from .ministrygrid import MinistryGridIE +from .minoto import MinotoIE +from .miomio import MioMioIE +from .mirrativ import ( + MirrativIE, + MirrativUserIE, +) +from .mirrorcouk import MirrorCoUKIE +from .mit import TechTVMITIE, OCWMITIE +from .mitele import MiTeleIE +from .mixch import ( + MixchIE, + MixchArchiveIE, +) +from .mixcloud import ( + MixcloudIE, + 
MixcloudUserIE, + MixcloudPlaylistIE, +) +from .mlb import ( + MLBIE, + MLBVideoIE, +) +from .mlssoccer import MLSSoccerIE +from .mnet import MnetIE +from .moevideo import MoeVideoIE +from .mofosex import ( + MofosexIE, + MofosexEmbedIE, +) +from .mojvideo import MojvideoIE +from .morningstar import MorningstarIE +from .motherless import ( + MotherlessIE, + MotherlessGroupIE +) +from .motorsport import MotorsportIE +from .movieclips import MovieClipsIE +from .moviepilot import MoviepilotIE +from .moviezine import MoviezineIE +from .movingimage import MovingImageIE +from .msn import MSNIE +from .mtv import ( + MTVIE, + MTVVideoIE, + MTVServicesEmbeddedIE, + MTVDEIE, + MTVJapanIE, + MTVItaliaIE, + MTVItaliaProgrammaIE, +) +from .muenchentv import MuenchenTVIE +from .murrtube import MurrtubeIE, MurrtubeUserIE +from .musescore import MuseScoreIE +from .musicdex import ( + MusicdexSongIE, + MusicdexAlbumIE, + MusicdexArtistIE, + MusicdexPlaylistIE, +) +from .mwave import MwaveIE, MwaveMeetGreetIE +from .mxplayer import ( + MxplayerIE, + MxplayerShowIE, +) +from .mychannels import MyChannelsIE +from .myspace import MySpaceIE, MySpaceAlbumIE +from .myspass import MySpassIE +from .myvi import ( + MyviIE, + MyviEmbedIE, +) +from .myvideoge import MyVideoGeIE +from .myvidster import MyVidsterIE +from .n1 import ( + N1InfoAssetIE, + N1InfoIIE, +) +from .nate import ( + NateIE, + NateProgramIE, +) +from .nationalgeographic import ( + NationalGeographicVideoIE, + NationalGeographicTVIE, +) +from .naver import ( + NaverIE, + NaverLiveIE, + NaverNowIE, +) +from .nba import ( + NBAWatchEmbedIE, + NBAWatchIE, + NBAWatchCollectionIE, + NBAEmbedIE, + NBAIE, + NBAChannelIE, +) +from .nbc import ( + NBCIE, + NBCNewsIE, + NBCOlympicsIE, + NBCOlympicsStreamIE, + NBCSportsIE, + NBCSportsStreamIE, + NBCSportsVPlayerIE, +) +from .ndr import ( + NDRIE, + NJoyIE, + NDREmbedBaseIE, + NDREmbedIE, + NJoyEmbedIE, +) +from .ndtv import NDTVIE +from .nebula import ( + NebulaIE, + NebulaSubscriptionsIE, + NebulaChannelIE, +) +from .nerdcubed import NerdCubedFeedIE +from .netzkino import NetzkinoIE +from .neteasemusic import ( + NetEaseMusicIE, + NetEaseMusicAlbumIE, + NetEaseMusicSingerIE, + NetEaseMusicListIE, + NetEaseMusicMvIE, + NetEaseMusicProgramIE, + NetEaseMusicDjRadioIE, +) +from .netverse import ( + NetverseIE, + NetversePlaylistIE, +) +from .newgrounds import ( + NewgroundsIE, + NewgroundsPlaylistIE, + NewgroundsUserIE, +) +from .newstube import NewstubeIE +from .newsy import NewsyIE +from .nextmedia import ( + NextMediaIE, + NextMediaActionNewsIE, + AppleDailyIE, + NextTVIE, +) +from .nexx import ( + NexxIE, + NexxEmbedIE, +) +from .nfb import NFBIE +from .nfhsnetwork import NFHSNetworkIE +from .nfl import ( + NFLIE, + NFLArticleIE, +) +from .nhk import ( + NhkVodIE, + NhkVodProgramIE, + NhkForSchoolBangumiIE, + NhkForSchoolSubjectIE, + NhkForSchoolProgramListIE, +) +from .nhl import NHLIE +from .nick import ( + NickIE, + NickBrIE, + NickDeIE, + NickNightIE, + NickRuIE, +) +from .niconico import ( + NiconicoIE, + NiconicoPlaylistIE, + NiconicoUserIE, + NiconicoSeriesIE, + NiconicoHistoryIE, + NicovideoSearchDateIE, + NicovideoSearchIE, + NicovideoSearchURLIE, + NicovideoTagURLIE, +) +from .ninecninemedia import ( + NineCNineMediaIE, + CPTwentyFourIE, +) +from .ninegag import NineGagIE +from .ninenow import NineNowIE +from .nintendo import NintendoIE +from .nitter import NitterIE +from .njpwworld import NJPWWorldIE +from .nobelprize import NobelPrizeIE +from .nonktube import NonkTubeIE +from .noodlemagazine 
import NoodleMagazineIE +from .noovo import NoovoIE +from .normalboots import NormalbootsIE +from .nosvideo import NosVideoIE +from .nova import ( + NovaEmbedIE, + NovaIE, +) +from .novaplay import NovaPlayIE +from .nowness import ( + NownessIE, + NownessPlaylistIE, + NownessSeriesIE, +) +from .noz import NozIE +from .npo import ( + AndereTijdenIE, + NPOIE, + NPOLiveIE, + NPORadioIE, + NPORadioFragmentIE, + SchoolTVIE, + HetKlokhuisIE, + VPROIE, + WNLIE, +) +from .npr import NprIE +from .nrk import ( + NRKIE, + NRKPlaylistIE, + NRKSkoleIE, + NRKTVIE, + NRKTVDirekteIE, + NRKRadioPodkastIE, + NRKTVEpisodeIE, + NRKTVEpisodesIE, + NRKTVSeasonIE, + NRKTVSeriesIE, +) +from .nrl import NRLTVIE +from .ntvcojp import NTVCoJpCUIE +from .ntvde import NTVDeIE +from .ntvru import NTVRuIE +from .nytimes import ( + NYTimesIE, + NYTimesArticleIE, + NYTimesCookingIE, +) +from .nuvid import NuvidIE +from .nzherald import NZHeraldIE +from .nzz import NZZIE +from .odatv import OdaTVIE +from .odnoklassniki import OdnoklassnikiIE +from .oktoberfesttv import OktoberfestTVIE +from .olympics import OlympicsReplayIE +from .on24 import On24IE +from .ondemandkorea import OnDemandKoreaIE +from .onefootball import OneFootballIE +from .onet import ( + OnetIE, + OnetChannelIE, + OnetMVPIE, + OnetPlIE, +) +from .onionstudios import OnionStudiosIE +from .ooyala import ( + OoyalaIE, + OoyalaExternalIE, +) +from .opencast import ( + OpencastIE, + OpencastPlaylistIE, +) +from .openrec import ( + OpenRecIE, + OpenRecCaptureIE, + OpenRecMovieIE, +) +from .ora import OraTVIE +from .orf import ( + ORFTVthekIE, + ORFFM4IE, + ORFFM4StoryIE, + ORFOE1IE, + ORFOE3IE, + ORFNOEIE, + ORFWIEIE, + ORFBGLIE, + ORFOOEIE, + ORFSTMIE, + ORFKTNIE, + ORFSBGIE, + ORFTIRIE, + ORFVBGIE, + ORFIPTVIE, +) +from .outsidetv import OutsideTVIE +from .packtpub import ( + PacktPubIE, + PacktPubCourseIE, +) +from .palcomp3 import ( + PalcoMP3IE, + PalcoMP3ArtistIE, + PalcoMP3VideoIE, +) +from .pandoratv import PandoraTVIE +from .panopto import ( + PanoptoIE, + PanoptoListIE, + PanoptoPlaylistIE +) +from .paramountplus import ( + ParamountPlusIE, + ParamountPlusSeriesIE, +) +from .parliamentliveuk import ParliamentLiveUKIE +from .parlview import ParlviewIE +from .patreon import ( + PatreonIE, + PatreonUserIE +) +from .pbs import PBSIE +from .pearvideo import PearVideoIE +from .peekvids import PeekVidsIE, PlayVidsIE +from .peertube import ( + PeerTubeIE, + PeerTubePlaylistIE, +) +from .peertv import PeerTVIE +from .peloton import ( + PelotonIE, + PelotonLiveIE +) +from .people import PeopleIE +from .performgroup import PerformGroupIE +from .periscope import ( + PeriscopeIE, + PeriscopeUserIE, +) +from .philharmoniedeparis import PhilharmonieDeParisIE +from .phoenix import PhoenixIE +from .photobucket import PhotobucketIE +from .piapro import PiaproIE +from .picarto import ( + PicartoIE, + PicartoVodIE, +) +from .piksel import PikselIE +from .pinkbike import PinkbikeIE +from .pinterest import ( + PinterestIE, + PinterestCollectionIE, +) +from .pixivsketch import ( + PixivSketchIE, + PixivSketchUserIE, +) +from .pladform import PladformIE +from .planetmarathi import PlanetMarathiIE +from .platzi import ( + PlatziIE, + PlatziCourseIE, +) +from .playfm import PlayFMIE +from .playplustv import PlayPlusTVIE +from .plays import PlaysTVIE +from .playstuff import PlayStuffIE +from .playsuisse import PlaySuisseIE +from .playtvak import PlaytvakIE +from .playvid import PlayvidIE +from .playwire import PlaywireIE +from .plutotv import PlutoTVIE +from .pluralsight import 
( + PluralsightIE, + PluralsightCourseIE, +) +from .podchaser import PodchaserIE +from .podomatic import PodomaticIE +from .pokemon import ( + PokemonIE, + PokemonWatchIE, +) +from .pokergo import ( + PokerGoIE, + PokerGoCollectionIE, +) +from .polsatgo import PolsatGoIE +from .polskieradio import ( + PolskieRadioIE, + PolskieRadioCategoryIE, + PolskieRadioPlayerIE, + PolskieRadioPodcastIE, + PolskieRadioPodcastListIE, + PolskieRadioRadioKierowcowIE, +) +from .popcorntimes import PopcorntimesIE +from .popcorntv import PopcornTVIE +from .porn91 import Porn91IE +from .porncom import PornComIE +from .pornflip import PornFlipIE +from .pornhd import PornHdIE +from .pornhub import ( + PornHubIE, + PornHubUserIE, + PornHubPlaylistIE, + PornHubPagedVideoListIE, + PornHubUserVideosUploadIE, +) +from .pornotube import PornotubeIE +from .pornovoisines import PornoVoisinesIE +from .pornoxo import PornoXOIE +from .pornez import PornezIE +from .puhutv import ( + PuhuTVIE, + PuhuTVSerieIE, +) +from .premiershiprugby import PremiershipRugbyIE +from .presstv import PressTVIE +from .projectveritas import ProjectVeritasIE +from .prosiebensat1 import ProSiebenSat1IE +from .prx import ( + PRXStoryIE, + PRXSeriesIE, + PRXAccountIE, + PRXStoriesSearchIE, + PRXSeriesSearchIE +) +from .puls4 import Puls4IE +from .pyvideo import PyvideoIE +from .qqmusic import ( + QQMusicIE, + QQMusicSingerIE, + QQMusicAlbumIE, + QQMusicToplistIE, + QQMusicPlaylistIE, +) +from .r7 import ( + R7IE, + R7ArticleIE, +) +from .radiko import RadikoIE, RadikoRadioIE +from .radiocanada import ( + RadioCanadaIE, + RadioCanadaAudioVideoIE, +) +from .radiode import RadioDeIE +from .radiojavan import RadioJavanIE +from .radiobremen import RadioBremenIE +from .radiofrance import FranceCultureIE, RadioFranceIE +from .radiozet import RadioZetPodcastIE +from .radiokapital import ( + RadioKapitalIE, + RadioKapitalShowIE, +) +from .radlive import ( + RadLiveIE, + RadLiveChannelIE, + RadLiveSeasonIE, +) +from .rai import ( + RaiPlayIE, + RaiPlayLiveIE, + RaiPlayPlaylistIE, + RaiPlaySoundIE, + RaiPlaySoundLiveIE, + RaiPlaySoundPlaylistIE, + RaiIE, +) +from .raywenderlich import ( + RayWenderlichIE, + RayWenderlichCourseIE, +) +from .rbmaradio import RBMARadioIE +from .rcs import ( + RCSIE, + RCSEmbedsIE, + RCSVariousIE, +) +from .rcti import ( + RCTIPlusIE, + RCTIPlusSeriesIE, + RCTIPlusTVIE, +) +from .rds import RDSIE +from .redbulltv import ( + RedBullTVIE, + RedBullEmbedIE, + RedBullTVRrnContentIE, + RedBullIE, +) +from .reddit import RedditIE +from .redgifs import ( + RedGifsIE, + RedGifsSearchIE, + RedGifsUserIE, +) +from .redtube import RedTubeIE +from .regiotv import RegioTVIE +from .rentv import ( + RENTVIE, + RENTVArticleIE, +) +from .restudy import RestudyIE +from .reuters import ReutersIE +from .reverbnation import ReverbNationIE +from .rice import RICEIE +from .rmcdecouverte import RMCDecouverteIE +from .rockstargames import RockstarGamesIE +from .rokfin import ( + RokfinIE, + RokfinStackIE, + RokfinChannelIE, + RokfinSearchIE, +) +from .roosterteeth import RoosterTeethIE, RoosterTeethSeriesIE +from .rottentomatoes import RottenTomatoesIE +from .rozhlas import RozhlasIE +from .rtbf import RTBFIE +from .rte import RteIE, RteRadioIE +from .rtlnl import RtlNlIE +from .rtl2 import ( + RTL2IE, + RTL2YouIE, + RTL2YouSeriesIE, +) +from .rtnews import ( + RTNewsIE, + RTDocumentryIE, + RTDocumentryPlaylistIE, + RuptlyIE, +) +from .rtp import RTPIE +from .rtrfm import RTRFMIE +from .rts import RTSIE +from .rtve import ( + RTVEALaCartaIE, + 
RTVEAudioIE, + RTVELiveIE, + RTVEInfantilIE, + RTVETelevisionIE, +) +from .rtvnh import RTVNHIE +from .rtvs import RTVSIE +from .ruhd import RUHDIE +from .rule34video import Rule34VideoIE +from .rumble import ( + RumbleEmbedIE, + RumbleChannelIE, +) +from .rutube import ( + RutubeIE, + RutubeChannelIE, + RutubeEmbedIE, + RutubeMovieIE, + RutubePersonIE, + RutubePlaylistIE, + RutubeTagsIE, +) +from .glomex import ( + GlomexIE, + GlomexEmbedIE, +) +from .megatvcom import ( + MegaTVComIE, + MegaTVComEmbedIE, +) +from .ant1newsgr import ( + Ant1NewsGrWatchIE, + Ant1NewsGrArticleIE, + Ant1NewsGrEmbedIE, +) +from .rutv import RUTVIE +from .ruutu import RuutuIE +from .ruv import ( + RuvIE, + RuvSpilaIE +) +from .safari import ( + SafariIE, + SafariApiIE, + SafariCourseIE, +) +from .saitosan import SaitosanIE +from .samplefocus import SampleFocusIE +from .sapo import SapoIE +from .savefrom import SaveFromIE +from .sbs import SBSIE +from .screencast import ScreencastIE +from .screencastomatic import ScreencastOMaticIE +from .scrippsnetworks import ( + ScrippsNetworksWatchIE, + ScrippsNetworksIE, +) +from .scte import ( + SCTEIE, + SCTECourseIE, +) +from .seeker import SeekerIE +from .senategov import SenateISVPIE, SenateGovIE +from .sendtonews import SendtoNewsIE +from .servus import ServusIE +from .sevenplus import SevenPlusIE +from .sexu import SexuIE +from .seznamzpravy import ( + SeznamZpravyIE, + SeznamZpravyArticleIE, +) +from .shahid import ( + ShahidIE, + ShahidShowIE, +) +from .shared import ( + SharedIE, + VivoIE, +) +from .shemaroome import ShemarooMeIE +from .showroomlive import ShowRoomLiveIE +from .simplecast import ( + SimplecastIE, + SimplecastEpisodeIE, + SimplecastPodcastIE, +) +from .sina import SinaIE +from .sixplay import SixPlayIE +from .skeb import SkebIE +from .skyit import ( + SkyItPlayerIE, + SkyItVideoIE, + SkyItVideoLiveIE, + SkyItIE, + SkyItAcademyIE, + SkyItArteIE, + CieloTVItIE, + TV8ItIE, +) +from .skylinewebcams import SkylineWebcamsIE +from .skynewsarabia import ( + SkyNewsArabiaIE, + SkyNewsArabiaArticleIE, +) +from .skynewsau import SkyNewsAUIE +from .sky import ( + SkyNewsIE, + SkyNewsStoryIE, + SkySportsIE, + SkySportsNewsIE, +) +from .slideshare import SlideshareIE +from .slideslive import SlidesLiveIE +from .slutload import SlutloadIE +from .snotr import SnotrIE +from .sohu import SohuIE +from .sonyliv import ( + SonyLIVIE, + SonyLIVSeriesIE, +) +from .soundcloud import ( + SoundcloudEmbedIE, + SoundcloudIE, + SoundcloudSetIE, + SoundcloudRelatedIE, + SoundcloudUserIE, + SoundcloudTrackStationIE, + SoundcloudPlaylistIE, + SoundcloudSearchIE, +) +from .soundgasm import ( + SoundgasmIE, + SoundgasmProfileIE +) +from .southpark import ( + SouthParkIE, + SouthParkDeIE, + SouthParkDkIE, + SouthParkEsIE, + SouthParkLatIE, + SouthParkNlIE +) +from .sovietscloset import ( + SovietsClosetIE, + SovietsClosetPlaylistIE +) +from .spankbang import ( + SpankBangIE, + SpankBangPlaylistIE, +) +from .spankwire import SpankwireIE +from .spiegel import SpiegelIE +from .spike import ( + BellatorIE, + ParamountNetworkIE, +) +from .stitcher import ( + StitcherIE, + StitcherShowIE, +) +from .sport5 import Sport5IE +from .sportbox import SportBoxIE +from .sportdeutschland import SportDeutschlandIE +from .spotify import ( + SpotifyIE, + SpotifyShowIE, +) +from .spreaker import ( + SpreakerIE, + SpreakerPageIE, + SpreakerShowIE, + SpreakerShowPageIE, +) +from .springboardplatform import SpringboardPlatformIE +from .sprout import SproutIE +from .srgssr import ( + SRGSSRIE, + 
SRGSSRPlayIE, +) +from .srmediathek import SRMediathekIE +from .stanfordoc import StanfordOpenClassroomIE +from .startv import StarTVIE +from .steam import SteamIE +from .storyfire import ( + StoryFireIE, + StoryFireUserIE, + StoryFireSeriesIE, +) +from .streamable import StreamableIE +from .streamanity import StreamanityIE +from .streamcloud import StreamcloudIE +from .streamcz import StreamCZIE +from .streamff import StreamFFIE +from .streetvoice import StreetVoiceIE +from .stretchinternet import StretchInternetIE +from .stripchat import StripchatIE +from .stv import STVPlayerIE +from .substack import SubstackIE +from .sunporno import SunPornoIE +from .sverigesradio import ( + SverigesRadioEpisodeIE, + SverigesRadioPublicationIE, +) +from .svt import ( + SVTIE, + SVTPageIE, + SVTPlayIE, + SVTSeriesIE, +) +from .swrmediathek import SWRMediathekIE +from .syfy import SyfyIE +from .sztvhu import SztvHuIE +from .tagesschau import TagesschauIE +from .tass import TassIE +from .tbs import TBSIE +from .tdslifeway import TDSLifewayIE +from .teachable import ( + TeachableIE, + TeachableCourseIE, +) +from .teachertube import ( + TeacherTubeIE, + TeacherTubeUserIE, +) +from .teachingchannel import TeachingChannelIE +from .teamcoco import TeamcocoIE +from .teamtreehouse import TeamTreeHouseIE +from .techtalks import TechTalksIE +from .ted import ( + TedEmbedIE, + TedPlaylistIE, + TedSeriesIE, + TedTalkIE, +) +from .tele5 import Tele5IE +from .tele13 import Tele13IE +from .telebruxelles import TeleBruxellesIE +from .telecinco import TelecincoIE +from .telegraaf import TelegraafIE +from .telegram import TelegramEmbedIE +from .telemb import TeleMBIE +from .telemundo import TelemundoIE +from .telequebec import ( + TeleQuebecIE, + TeleQuebecSquatIE, + TeleQuebecEmissionIE, + TeleQuebecLiveIE, + TeleQuebecVideoIE, +) +from .teletask import TeleTaskIE +from .telewebion import TelewebionIE +from .tennistv import TennisTVIE +from .tenplay import TenPlayIE +from .testurl import TestURLIE +from .tf1 import TF1IE +from .tfo import TFOIE +from .theintercept import TheInterceptIE +from .theplatform import ( + ThePlatformIE, + ThePlatformFeedIE, +) +from .thestar import TheStarIE +from .thesun import TheSunIE +from .theta import ( + ThetaVideoIE, + ThetaStreamIE, +) +from .theweatherchannel import TheWeatherChannelIE +from .thisamericanlife import ThisAmericanLifeIE +from .thisav import ThisAVIE +from .thisoldhouse import ThisOldHouseIE +from .threespeak import ( + ThreeSpeakIE, + ThreeSpeakUserIE, +) +from .threeqsdn import ThreeQSDNIE +from .tiktok import ( + TikTokIE, + TikTokUserIE, + TikTokSoundIE, + TikTokEffectIE, + TikTokTagIE, + TikTokVMIE, + DouyinIE, +) +from .tinypic import TinyPicIE +from .tmz import TMZIE +from .tnaflix import ( + TNAFlixNetworkEmbedIE, + TNAFlixIE, + EMPFlixIE, + MovieFapIE, +) +from .toggle import ( + ToggleIE, + MeWatchIE, +) +from .toggo import ( + ToggoIE, +) +from .tokentube import ( + TokentubeIE, + TokentubeChannelIE +) +from .tonline import TOnlineIE +from .toongoggles import ToonGogglesIE +from .toutv import TouTvIE +from .toypics import ToypicsUserIE, ToypicsIE +from .traileraddict import TrailerAddictIE +from .trilulilu import TriluliluIE +from .trovo import ( + TrovoIE, + TrovoVodIE, + TrovoChannelVodIE, + TrovoChannelClipIE, +) +from .trueid import TrueIDIE +from .trunews import TruNewsIE +from .trutv import TruTVIE +from .tube8 import Tube8IE +from .tubitv import ( + TubiTvIE, + TubiTvShowIE, +) +from .tumblr import TumblrIE +from .tunein import ( + TuneInClipIE, + 
TuneInStationIE, + TuneInProgramIE, + TuneInTopicIE, + TuneInShortenerIE, +) +from .tunepk import TunePkIE +from .turbo import TurboIE +from .tv2 import ( + TV2IE, + TV2ArticleIE, + KatsomoIE, + MTVUutisetArticleIE, +) +from .tv2dk import ( + TV2DKIE, + TV2DKBornholmPlayIE, +) +from .tv2hu import ( + TV2HuIE, + TV2HuSeriesIE, +) +from .tv4 import TV4IE +from .tv5mondeplus import TV5MondePlusIE +from .tv5unis import ( + TV5UnisVideoIE, + TV5UnisIE, +) +from .tva import ( + TVAIE, + QubIE, +) +from .tvanouvelles import ( + TVANouvellesIE, + TVANouvellesArticleIE, +) +from .tvc import ( + TVCIE, + TVCArticleIE, +) +from .tver import TVerIE +from .tvigle import TvigleIE +from .tvland import TVLandIE +from .tvn24 import TVN24IE +from .tvnet import TVNetIE +from .tvnoe import TVNoeIE +from .tvnow import ( + TVNowIE, + TVNowFilmIE, + TVNowNewIE, + TVNowSeasonIE, + TVNowAnnualIE, + TVNowShowIE, +) +from .tvopengr import ( + TVOpenGrWatchIE, + TVOpenGrEmbedIE, +) +from .tvp import ( + TVPEmbedIE, + TVPIE, + TVPStreamIE, + TVPWebsiteIE, +) +from .tvplay import ( + TVPlayIE, + ViafreeIE, + TVPlayHomeIE, +) +from .tvplayer import TVPlayerIE +from .tweakers import TweakersIE +from .twentyfourvideo import TwentyFourVideoIE +from .twentymin import TwentyMinutenIE +from .twentythreevideo import TwentyThreeVideoIE +from .twitcasting import ( + TwitCastingIE, + TwitCastingLiveIE, + TwitCastingUserIE, +) +from .twitch import ( + TwitchVodIE, + TwitchCollectionIE, + TwitchVideosIE, + TwitchVideosClipsIE, + TwitchVideosCollectionsIE, + TwitchStreamIE, + TwitchClipsIE, +) +from .twitter import ( + TwitterCardIE, + TwitterIE, + TwitterAmplifyIE, + TwitterBroadcastIE, + TwitterShortenerIE, +) +from .udemy import ( + UdemyIE, + UdemyCourseIE +) +from .udn import UDNEmbedIE +from .ufctv import ( + UFCTVIE, + UFCArabiaIE, +) +from .ukcolumn import UkColumnIE +from .uktvplay import UKTVPlayIE +from .digiteka import DigitekaIE +from .dlive import ( + DLiveVODIE, + DLiveStreamIE, +) +from .drooble import DroobleIE +from .umg import UMGDeIE +from .unistra import UnistraIE +from .unity import UnityIE +from .uol import UOLIE +from .uplynk import ( + UplynkIE, + UplynkPreplayIE, +) +from .urort import UrortIE +from .urplay import URPlayIE +from .usanetwork import USANetworkIE +from .usatoday import USATodayIE +from .ustream import UstreamIE, UstreamChannelIE +from .ustudio import ( + UstudioIE, + UstudioEmbedIE, +) +from .utreon import UtreonIE +from .varzesh3 import Varzesh3IE +from .vbox7 import Vbox7IE +from .veehd import VeeHDIE +from .veo import VeoIE +from .veoh import VeohIE +from .vesti import VestiIE +from .vevo import ( + VevoIE, + VevoPlaylistIE, +) +from .vgtv import ( + BTArticleIE, + BTVestlendingenIE, + VGTVIE, +) +from .vh1 import VH1IE +from .vice import ( + ViceIE, + ViceArticleIE, + ViceShowIE, +) +from .vidbit import VidbitIE +from .viddler import ViddlerIE +from .videa import VideaIE +from .videocampus_sachsen import VideocampusSachsenIE +from .videodetective import VideoDetectiveIE +from .videofyme import VideofyMeIE +from .videomore import ( + VideomoreIE, + VideomoreVideoIE, + VideomoreSeasonIE, +) +from .videopress import VideoPressIE +from .vidio import ( + VidioIE, + VidioPremierIE, + VidioLiveIE +) +from .vidlii import VidLiiIE +from .vier import VierIE, VierVideosIE +from .viewlift import ( + ViewLiftIE, + ViewLiftEmbedIE, +) +from .viidea import ViideaIE +from .vimeo import ( + VimeoIE, + VimeoAlbumIE, + VimeoChannelIE, + VimeoGroupsIE, + VimeoLikesIE, + VimeoOndemandIE, + VimeoReviewIE, + 
VimeoUserIE, + VimeoWatchLaterIE, + VHXEmbedIE, +) +from .vimm import ( + VimmIE, + VimmRecordingIE, +) +from .vimple import VimpleIE +from .vine import ( + VineIE, + VineUserIE, +) +from .viki import ( + VikiIE, + VikiChannelIE, +) +from .viqeo import ViqeoIE +from .viu import ( + ViuIE, + ViuPlaylistIE, + ViuOTTIE, +) +from .vk import ( + VKIE, + VKUserVideosIE, + VKWallPostIE, +) +from .vlive import ( + VLiveIE, + VLivePostIE, + VLiveChannelIE, +) +from .vodlocker import VodlockerIE +from .vodpl import VODPlIE +from .vodplatform import VODPlatformIE +from .voicerepublic import VoiceRepublicIE +from .voicy import ( + VoicyIE, + VoicyChannelIE, +) +from .voot import ( + VootIE, + VootSeriesIE, +) +from .voxmedia import ( + VoxMediaVolumeIE, + VoxMediaIE, +) +from .vrt import VRTIE +from .vrak import VrakIE +from .vrv import ( + VRVIE, + VRVSeriesIE, +) +from .vshare import VShareIE +from .vtm import VTMIE +from .medialaan import MedialaanIE +from .vuclip import VuClipIE +from .vupload import VuploadIE +from .vvvvid import ( + VVVVIDIE, + VVVVIDShowIE, +) +from .vyborymos import VyboryMosIE +from .vzaar import VzaarIE +from .wakanim import WakanimIE +from .walla import WallaIE +from .washingtonpost import ( + WashingtonPostIE, + WashingtonPostArticleIE, +) +from .wasdtv import ( + WASDTVStreamIE, + WASDTVRecordIE, + WASDTVClipIE, +) +from .wat import WatIE +from .watchbox import WatchBoxIE +from .watchindianporn import WatchIndianPornIE +from .wdr import ( + WDRIE, + WDRPageIE, + WDRElefantIE, + WDRMobileIE, +) +from .webcaster import ( + WebcasterIE, + WebcasterFeedIE, +) +from .webofstories import ( + WebOfStoriesIE, + WebOfStoriesPlaylistIE, +) +from .weibo import ( + WeiboIE, + WeiboMobileIE +) +from .weiqitv import WeiqiTVIE +from .willow import WillowIE +from .wimtv import WimTVIE +from .whowatch import WhoWatchIE +from .wistia import ( + WistiaIE, + WistiaPlaylistIE, +) +from .worldstarhiphop import WorldStarHipHopIE +from .wppilot import ( + WPPilotIE, + WPPilotChannelsIE, +) +from .wsj import ( + WSJIE, + WSJArticleIE, +) +from .wwe import WWEIE +from .xbef import XBefIE +from .xboxclips import XboxClipsIE +from .xfileshare import XFileShareIE +from .xhamster import ( + XHamsterIE, + XHamsterEmbedIE, + XHamsterUserIE, +) +from .xiami import ( + XiamiSongIE, + XiamiAlbumIE, + XiamiArtistIE, + XiamiCollectionIE +) +from .ximalaya import ( + XimalayaIE, + XimalayaAlbumIE +) +from .xinpianchang import XinpianchangIE +from .xminus import XMinusIE +from .xnxx import XNXXIE +from .xstream import XstreamIE +from .xtube import XTubeUserIE, XTubeIE +from .xuite import XuiteIE +from .xvideos import XVideosIE +from .xxxymovies import XXXYMoviesIE +from .yahoo import ( + YahooIE, + YahooSearchIE, + YahooGyaOPlayerIE, + YahooGyaOIE, + YahooJapanNewsIE, +) +from .yandexdisk import YandexDiskIE +from .yandexmusic import ( + YandexMusicTrackIE, + YandexMusicAlbumIE, + YandexMusicPlaylistIE, + YandexMusicArtistTracksIE, + YandexMusicArtistAlbumsIE, +) +from .yandexvideo import ( + YandexVideoIE, + YandexVideoPreviewIE, + ZenYandexIE, + ZenYandexChannelIE, +) +from .yapfiles import YapFilesIE +from .yesjapan import YesJapanIE +from .yinyuetai import YinYueTaiIE +from .ynet import YnetIE +from .youjizz import YouJizzIE +from .youku import ( + YoukuIE, + YoukuShowIE, +) +from .younow import ( + YouNowLiveIE, + YouNowChannelIE, + YouNowMomentIE, +) +from .youporn import YouPornIE +from .yourporn import YourPornIE +from .yourupload import YourUploadIE +from .youtube import ( + YoutubeIE, + 
YoutubeClipIE, + YoutubeFavouritesIE, + YoutubeNotificationsIE, + YoutubeHistoryIE, + YoutubeTabIE, + YoutubeLivestreamEmbedIE, + YoutubePlaylistIE, + YoutubeRecommendedIE, + YoutubeSearchDateIE, + YoutubeSearchIE, + YoutubeSearchURLIE, + YoutubeMusicSearchURLIE, + YoutubeSubscriptionsIE, + YoutubeStoriesIE, + YoutubeTruncatedIDIE, + YoutubeTruncatedURLIE, + YoutubeYtBeIE, + YoutubeYtUserIE, + YoutubeWatchLaterIE, +) +from .zapiks import ZapiksIE +from .zattoo import ( + BBVTVIE, + EinsUndEinsTVIE, + EWETVIE, + GlattvisionTVIE, + MNetTVIE, + NetPlusIE, + OsnatelTVIE, + QuantumTVIE, + SaltTVIE, + SAKTVIE, + VTXTVIE, + WalyTVIE, + ZattooIE, + ZattooLiveIE, + ZattooMoviesIE, + ZattooRecordingsIE, +) +from .zdf import ZDFIE, ZDFChannelIE +from .zee5 import ( + Zee5IE, + Zee5SeriesIE, +) +from .zhihu import ZhihuIE +from .zingmp3 import ( + ZingMp3IE, + ZingMp3AlbumIE, + ZingMp3ChartHomeIE, + ZingMp3WeekChartIE, + ZingMp3ChartMusicVideoIE, + ZingMp3UserIE, +) +from .zoom import ZoomIE +from .zype import ZypeIE diff --git a/yt_dlp/extractor/abematv.py b/yt_dlp/extractor/abematv.py index 1b9deeae8..a75efdd0f 100644 --- a/yt_dlp/extractor/abematv.py +++ b/yt_dlp/extractor/abematv.py @@ -7,16 +7,17 @@ import json import re import struct import time +import urllib.parse +import urllib.request import urllib.response import uuid from .common import InfoExtractor from ..aes import aes_ecb_decrypt -from ..compat import compat_urllib_parse_urlparse, compat_urllib_request from ..utils import ( ExtractorError, bytes_to_intlist, - decode_base, + decode_base_n, int_or_none, intlist_to_bytes, request_to_url, @@ -33,7 +34,7 @@ def add_opener(ydl, handler): ''' Add a handler for opening URLs, like _download_webpage ''' # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L426 # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L605 - assert isinstance(ydl._opener, compat_urllib_request.OpenerDirector) + assert isinstance(ydl._opener, urllib.request.OpenerDirector) ydl._opener.add_handler(handler) @@ -46,7 +47,7 @@ def remove_opener(ydl, handler): # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L426 # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L605 opener = ydl._opener - assert isinstance(ydl._opener, compat_urllib_request.OpenerDirector) + assert isinstance(ydl._opener, urllib.request.OpenerDirector) if isinstance(handler, (type, tuple)): find_cp = lambda x: isinstance(x, handler) else: @@ -96,7 +97,7 @@ def remove_opener(ydl, handler): opener.handlers[:] = [x for x in opener.handlers if not find_cp(x)] -class AbemaLicenseHandler(compat_urllib_request.BaseHandler): +class AbemaLicenseHandler(urllib.request.BaseHandler): handler_order = 499 STRTABLE = '123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz' HKEY = b'3AF0298C219469522A313570E8583005A642E73EDD58E3EA2FB7339D3DF1597E' @@ -109,7 +110,7 @@ class AbemaLicenseHandler(compat_urllib_request.BaseHandler): self.ie = ie def _get_videokey_from_ticket(self, ticket): - to_show = self.ie._downloader.params.get('verbose', False) + to_show = self.ie.get_param('verbose', False) media_token = self.ie._get_media_token(to_show=to_show) license_response = self.ie._download_json( @@ -123,7 +124,7 @@ class AbemaLicenseHandler(compat_urllib_request.BaseHandler): 'Content-Type': 'application/json', }) - res = decode_base(license_response['k'], self.STRTABLE) + res = decode_base_n(license_response['k'], table=self.STRTABLE) encvideokey = bytes_to_intlist(struct.pack('>QQ', res >> 64, res & 
0xffffffffffffffff)) h = hmac.new( @@ -136,7 +137,7 @@ class AbemaLicenseHandler(compat_urllib_request.BaseHandler): def abematv_license_open(self, url): url = request_to_url(url) - ticket = compat_urllib_parse_urlparse(url).netloc + ticket = urllib.parse.urlparse(url).netloc response_data = self._get_videokey_from_ticket(ticket) return urllib.response.addinfourl(io.BytesIO(response_data), headers={ 'Content-Length': len(response_data), diff --git a/yt_dlp/extractor/adobepass.py b/yt_dlp/extractor/adobepass.py index a8e6c4363..a2666c2b8 100644 --- a/yt_dlp/extractor/adobepass.py +++ b/yt_dlp/extractor/adobepass.py @@ -1,3 +1,4 @@ +import getpass import json import re import time @@ -5,19 +6,15 @@ import urllib.error import xml.etree.ElementTree as etree from .common import InfoExtractor -from ..compat import ( - compat_urlparse, - compat_getpass -) +from ..compat import compat_urlparse from ..utils import ( + NO_DEFAULT, + ExtractorError, unescapeHTML, - urlencode_postdata, unified_timestamp, - ExtractorError, - NO_DEFAULT, + urlencode_postdata, ) - MSO_INFO = { 'DTV': { 'name': 'DIRECTV', @@ -1431,7 +1428,7 @@ class AdobePassIE(InfoExtractor): guid = xml_text(resource, 'guid') if '<' in resource else resource count = 0 while count < 2: - requestor_info = self._downloader.cache.load(self._MVPD_CACHE, requestor_id) or {} + requestor_info = self.cache.load(self._MVPD_CACHE, requestor_id) or {} authn_token = requestor_info.get('authn_token') if authn_token and is_expired(authn_token, 'simpleTokenExpires'): authn_token = None @@ -1506,7 +1503,7 @@ class AdobePassIE(InfoExtractor): 'send_confirm_link': False, 'send_token': True })) - philo_code = compat_getpass('Type auth code you have received [Return]: ') + philo_code = getpass.getpass('Type auth code you have received [Return]: ') self._download_webpage( 'https://idp.philo.com/auth/update/login_code', video_id, 'Submitting token', data=urlencode_postdata({ 'token': philo_code @@ -1726,12 +1723,12 @@ class AdobePassIE(InfoExtractor): raise_mvpd_required() raise if '<pendingLogout' in session: - self._downloader.cache.store(self._MVPD_CACHE, requestor_id, {}) + self.cache.store(self._MVPD_CACHE, requestor_id, {}) count += 1 continue authn_token = unescapeHTML(xml_text(session, 'authnToken')) requestor_info['authn_token'] = authn_token - self._downloader.cache.store(self._MVPD_CACHE, requestor_id, requestor_info) + self.cache.store(self._MVPD_CACHE, requestor_id, requestor_info) authz_token = requestor_info.get(guid) if authz_token and is_expired(authz_token, 'simpleTokenTTL'): @@ -1747,14 +1744,14 @@ class AdobePassIE(InfoExtractor): 'userMeta': '1', }), headers=mvpd_headers) if '<pendingLogout' in authorize: - self._downloader.cache.store(self._MVPD_CACHE, requestor_id, {}) + self.cache.store(self._MVPD_CACHE, requestor_id, {}) count += 1 continue if '<error' in authorize: raise ExtractorError(xml_text(authorize, 'details'), expected=True) authz_token = unescapeHTML(xml_text(authorize, 'authzToken')) requestor_info[guid] = authz_token - self._downloader.cache.store(self._MVPD_CACHE, requestor_id, requestor_info) + self.cache.store(self._MVPD_CACHE, requestor_id, requestor_info) mvpd_headers.update({ 'ap_19': xml_text(authn_token, 'simpleSamlNameID'), @@ -1770,7 +1767,7 @@ class AdobePassIE(InfoExtractor): 'hashed_guid': 'false', }), headers=mvpd_headers) if '<pendingLogout' in short_authorize: - self._downloader.cache.store(self._MVPD_CACHE, requestor_id, {}) + self.cache.store(self._MVPD_CACHE, requestor_id, {}) count += 1 continue return 
short_authorize diff --git a/yt_dlp/extractor/animelab.py b/yt_dlp/extractor/animelab.py deleted file mode 100644 index fe2b70aed..000000000 --- a/yt_dlp/extractor/animelab.py +++ /dev/null @@ -1,270 +0,0 @@ -from .common import InfoExtractor - -from ..utils import ( - ExtractorError, - urlencode_postdata, - int_or_none, - str_or_none, - determine_ext, -) - -from ..compat import compat_HTTPError - - -class AnimeLabBaseIE(InfoExtractor): - _LOGIN_URL = 'https://www.animelab.com/login' - _NETRC_MACHINE = 'animelab' - _LOGGED_IN = False - - def _is_logged_in(self, login_page=None): - if not self._LOGGED_IN: - if not login_page: - login_page = self._download_webpage(self._LOGIN_URL, None, 'Downloading login page') - AnimeLabBaseIE._LOGGED_IN = 'Sign In' not in login_page - return self._LOGGED_IN - - def _perform_login(self, username, password): - if self._is_logged_in(): - return - - login_form = { - 'email': username, - 'password': password, - } - - try: - response = self._download_webpage( - self._LOGIN_URL, None, 'Logging in', 'Wrong login info', - data=urlencode_postdata(login_form), - headers={'Content-Type': 'application/x-www-form-urlencoded'}) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: - raise ExtractorError('Unable to log in (wrong credentials?)', expected=True) - raise - - if not self._is_logged_in(response): - raise ExtractorError('Unable to login (cannot verify if logged in)') - - def _real_initialize(self): - if not self._is_logged_in(): - self.raise_login_required('Login is required to access any AnimeLab content') - - -class AnimeLabIE(AnimeLabBaseIE): - _VALID_URL = r'https?://(?:www\.)?animelab\.com/player/(?P<id>[^/]+)' - - _TEST = { - 'url': 'https://www.animelab.com/player/fullmetal-alchemist-brotherhood-episode-42', - 'md5': '05bde4b91a5d1ff46ef5b94df05b0f7f', - 'info_dict': { - 'id': '383', - 'ext': 'mp4', - 'display_id': 'fullmetal-alchemist-brotherhood-episode-42', - 'title': 'Fullmetal Alchemist: Brotherhood - Episode 42 - Signs of a Counteroffensive', - 'description': 'md5:103eb61dd0a56d3dfc5dbf748e5e83f4', - 'series': 'Fullmetal Alchemist: Brotherhood', - 'episode': 'Signs of a Counteroffensive', - 'episode_number': 42, - 'duration': 1469, - 'season': 'Season 1', - 'season_number': 1, - 'season_id': '38', - }, - 'params': { - # Ensure the same video is downloaded whether the user is premium or not - 'format': '[format_id=21711_yeshardsubbed_ja-JP][height=480]', - }, - } - - def _real_extract(self, url): - display_id = self._match_id(url) - - # unfortunately we can get different URLs for the same formats - # e.g. 
if we are using a "free" account so no dubs available - # (so _remove_duplicate_formats is not effective) - # so we use a dictionary as a workaround - formats = {} - for language_option_url in ('https://www.animelab.com/player/%s/subtitles', - 'https://www.animelab.com/player/%s/dubbed'): - actual_url = language_option_url % display_id - webpage = self._download_webpage(actual_url, display_id, 'Downloading URL ' + actual_url) - - video_collection = self._parse_json(self._search_regex(r'new\s+?AnimeLabApp\.VideoCollection\s*?\((.*?)\);', webpage, 'AnimeLab VideoCollection'), display_id) - position = int_or_none(self._search_regex(r'playlistPosition\s*?=\s*?(\d+)', webpage, 'Playlist Position')) - - raw_data = video_collection[position]['videoEntry'] - - video_id = str_or_none(raw_data['id']) - - # create a title from many sources (while grabbing other info) - # TODO use more fallback sources to get some of these - series = raw_data.get('showTitle') - video_type = raw_data.get('videoEntryType', {}).get('name') - episode_number = raw_data.get('episodeNumber') - episode_name = raw_data.get('name') - - title_parts = (series, video_type, episode_number, episode_name) - if None not in title_parts: - title = '%s - %s %s - %s' % title_parts - else: - title = episode_name - - description = raw_data.get('synopsis') or self._og_search_description(webpage, default=None) - - duration = int_or_none(raw_data.get('duration')) - - thumbnail_data = raw_data.get('images', []) - thumbnails = [] - for thumbnail in thumbnail_data: - for instance in thumbnail['imageInstances']: - image_data = instance.get('imageInfo', {}) - thumbnails.append({ - 'id': str_or_none(image_data.get('id')), - 'url': image_data.get('fullPath'), - 'width': image_data.get('width'), - 'height': image_data.get('height'), - }) - - season_data = raw_data.get('season', {}) or {} - season = str_or_none(season_data.get('name')) - season_number = int_or_none(season_data.get('seasonNumber')) - season_id = str_or_none(season_data.get('id')) - - for video_data in raw_data['videoList']: - current_video_list = {} - current_video_list['language'] = video_data.get('language', {}).get('languageCode') - - is_hardsubbed = video_data.get('hardSubbed') - - for video_instance in video_data['videoInstances']: - httpurl = video_instance.get('httpUrl') - url = httpurl if httpurl else video_instance.get('rtmpUrl') - if url is None: - # this video format is unavailable to the user (not premium etc.) 
- continue - - current_format = current_video_list.copy() - - format_id_parts = [] - - format_id_parts.append(str_or_none(video_instance.get('id'))) - - if is_hardsubbed is not None: - if is_hardsubbed: - format_id_parts.append('yeshardsubbed') - else: - format_id_parts.append('nothardsubbed') - - format_id_parts.append(current_format['language']) - - format_id = '_'.join([x for x in format_id_parts if x is not None]) - - ext = determine_ext(url) - if ext == 'm3u8': - for format_ in self._extract_m3u8_formats( - url, video_id, m3u8_id=format_id, fatal=False): - formats[format_['format_id']] = format_ - continue - elif ext == 'mpd': - for format_ in self._extract_mpd_formats( - url, video_id, mpd_id=format_id, fatal=False): - formats[format_['format_id']] = format_ - continue - - current_format['url'] = url - quality_data = video_instance.get('videoQuality') - if quality_data: - quality = quality_data.get('name') or quality_data.get('description') - else: - quality = None - - height = None - if quality: - height = int_or_none(self._search_regex(r'(\d+)p?$', quality, 'Video format height', default=None)) - - if height is None: - self.report_warning('Could not get height of video') - else: - current_format['height'] = height - current_format['format_id'] = format_id - - formats[current_format['format_id']] = current_format - - formats = list(formats.values()) - self._sort_formats(formats) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'series': series, - 'episode': episode_name, - 'episode_number': int_or_none(episode_number), - 'thumbnails': thumbnails, - 'duration': duration, - 'formats': formats, - 'season': season, - 'season_number': season_number, - 'season_id': season_id, - } - - -class AnimeLabShowsIE(AnimeLabBaseIE): - _VALID_URL = r'https?://(?:www\.)?animelab\.com/shows/(?P<id>[^/]+)' - - _TEST = { - 'url': 'https://www.animelab.com/shows/attack-on-titan', - 'info_dict': { - 'id': '45', - 'title': 'Attack on Titan', - 'description': 'md5:989d95a2677e9309368d5cf39ba91469', - }, - 'playlist_count': 59, - 'skip': 'All AnimeLab content requires authentication', - } - - def _real_extract(self, url): - _BASE_URL = 'http://www.animelab.com' - _SHOWS_API_URL = '/api/videoentries/show/videos/' - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id, 'Downloading requested URL') - - show_data_str = self._search_regex(r'({"id":.*}),\svideoEntry', webpage, 'AnimeLab show data') - show_data = self._parse_json(show_data_str, display_id) - - show_id = str_or_none(show_data.get('id')) - title = show_data.get('name') - description = show_data.get('shortSynopsis') or show_data.get('longSynopsis') - - entries = [] - for season in show_data['seasons']: - season_id = season['id'] - get_data = urlencode_postdata({ - 'seasonId': season_id, - 'limit': 1000, - }) - # despite using urlencode_postdata, we are sending a GET request - target_url = _BASE_URL + _SHOWS_API_URL + show_id + "?" 
+ get_data.decode('utf-8') - response = self._download_webpage( - target_url, - None, 'Season id %s' % season_id) - - season_data = self._parse_json(response, display_id) - - for video_data in season_data['list']: - entries.append(self.url_result( - _BASE_URL + '/player/' + video_data['slug'], 'AnimeLab', - str_or_none(video_data.get('id')), video_data.get('name') - )) - - return { - '_type': 'playlist', - 'id': show_id, - 'title': title, - 'description': description, - 'entries': entries, - } - -# TODO implement myqueue diff --git a/yt_dlp/extractor/archiveorg.py b/yt_dlp/extractor/archiveorg.py index c85d5297d..1ca6ddc4d 100644 --- a/yt_dlp/extractor/archiveorg.py +++ b/yt_dlp/extractor/archiveorg.py @@ -1,36 +1,34 @@ -import re import json +import re +import urllib.parse + from .common import InfoExtractor -from .youtube import YoutubeIE, YoutubeBaseInfoExtractor -from ..compat import ( - compat_urllib_parse_unquote, - compat_urllib_parse_unquote_plus, - compat_HTTPError -) +from .youtube import YoutubeBaseInfoExtractor, YoutubeIE +from ..compat import compat_HTTPError, compat_urllib_parse_unquote from ..utils import ( + KNOWN_EXTENSIONS, + ExtractorError, + HEADRequest, bug_reports_message, clean_html, dict_get, extract_attributes, - ExtractorError, get_element_by_id, - HEADRequest, int_or_none, join_nonempty, - KNOWN_EXTENSIONS, merge_dicts, mimetype2ext, orderedSet, parse_duration, parse_qs, - str_to_int, str_or_none, + str_to_int, traverse_obj, try_get, unified_strdate, unified_timestamp, + url_or_none, urlhandle_detect_ext, - url_or_none ) @@ -143,7 +141,7 @@ class ArchiveOrgIE(InfoExtractor): return json.loads(extract_attributes(element)['value']) def _real_extract(self, url): - video_id = compat_urllib_parse_unquote_plus(self._match_id(url)) + video_id = urllib.parse.unquote_plus(self._match_id(url)) identifier, entry_id = (video_id.split('/', 1) + [None])[:2] # Archive.org metadata API doesn't clearly demarcate playlist entries @@ -442,9 +440,10 @@ class YoutubeWebArchiveIE(InfoExtractor): 'only_matching': True }, ] - _YT_INITIAL_DATA_RE = r'(?:(?:(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_DATA_RE - _YT_INITIAL_PLAYER_RESPONSE_RE = r'(?:(?:(?:window\s*\[\s*["\']ytInitialPlayerResponse["\']\s*\]|ytInitialPlayerResponse)\s*=[(\s]*({.+?})[)\s]*;)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_PLAYER_RESPONSE_RE - _YT_INITIAL_BOUNDARY_RE = r'(?:(?:var\s+meta|</script|\n)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_BOUNDARY_RE + _YT_INITIAL_DATA_RE = YoutubeBaseInfoExtractor._YT_INITIAL_DATA_RE + _YT_INITIAL_PLAYER_RESPONSE_RE = fr'''(?x) + (?:window\s*\[\s*["\']ytInitialPlayerResponse["\']\s*\]|ytInitialPlayerResponse)\s*=[(\s]*| + {YoutubeBaseInfoExtractor._YT_INITIAL_PLAYER_RESPONSE_RE}''' _YT_DEFAULT_THUMB_SERVERS = ['i.ytimg.com'] # thumbnails most likely archived on these servers _YT_ALL_THUMB_SERVERS = orderedSet( @@ -474,11 +473,6 @@ class YoutubeWebArchiveIE(InfoExtractor): elif not isinstance(res, list) or len(res) != 0: self.report_warning('Error while parsing CDX API response' + bug_reports_message()) - def _extract_yt_initial_variable(self, webpage, regex, video_id, name): - return self._parse_json(self._search_regex( - (fr'{regex}\s*{self._YT_INITIAL_BOUNDARY_RE}', - regex), webpage, name, default='{}'), video_id, fatal=False) - def _extract_webpage_title(self, webpage): page_title = self._html_extract_title(webpage, default='') # YouTube video pages appear to always have either 'YouTube -' as 
prefix or '- YouTube' as suffix. @@ -488,10 +482,11 @@ class YoutubeWebArchiveIE(InfoExtractor): def _extract_metadata(self, video_id, webpage): search_meta = ((lambda x: self._html_search_meta(x, webpage, default=None)) if webpage else (lambda x: None)) - player_response = self._extract_yt_initial_variable( - webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE, video_id, 'initial player response') or {} - initial_data = self._extract_yt_initial_variable( - webpage, self._YT_INITIAL_DATA_RE, video_id, 'initial player response') or {} + player_response = self._search_json( + self._YT_INITIAL_PLAYER_RESPONSE_RE, webpage, 'initial player response', + video_id, default={}) + initial_data = self._search_json( + self._YT_INITIAL_DATA_RE, webpage, 'initial data', video_id, default={}) initial_data_video = traverse_obj( initial_data, ('contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', ..., 'videoPrimaryInfoRenderer'), diff --git a/yt_dlp/extractor/arnes.py b/yt_dlp/extractor/arnes.py index 96b134fa0..c80ce2233 100644 --- a/yt_dlp/extractor/arnes.py +++ b/yt_dlp/extractor/arnes.py @@ -90,7 +90,7 @@ class ArnesIE(InfoExtractor): 'timestamp': parse_iso8601(video.get('creationTime')), 'channel': channel.get('name'), 'channel_id': channel_id, - 'channel_url': format_field(channel_id, template=f'{self._BASE_URL}/?channel=%s'), + 'channel_url': format_field(channel_id, None, f'{self._BASE_URL}/?channel=%s'), 'duration': float_or_none(video.get('duration'), 1000), 'view_count': int_or_none(video.get('views')), 'tags': video.get('hashtags'), diff --git a/yt_dlp/extractor/atscaleconf.py b/yt_dlp/extractor/atscaleconf.py new file mode 100644 index 000000000..3f7b1e9f8 --- /dev/null +++ b/yt_dlp/extractor/atscaleconf.py @@ -0,0 +1,34 @@ +import re + +from .common import InfoExtractor + + +class AtScaleConfEventIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?atscaleconference\.com/events/(?P<id>[^/&$?]+)' + + _TESTS = [{ + 'url': 'https://atscaleconference.com/events/data-scale-spring-2022/', + 'playlist_mincount': 13, + 'info_dict': { + 'id': 'data-scale-spring-2022', + 'title': 'Data @Scale Spring 2022', + 'description': 'md5:7d7ca1c42ac9c6d8a785092a1aea4b55' + }, + }, { + 'url': 'https://atscaleconference.com/events/video-scale-2021/', + 'playlist_mincount': 14, + 'info_dict': { + 'id': 'video-scale-2021', + 'title': 'Video @Scale 2021', + 'description': 'md5:7d7ca1c42ac9c6d8a785092a1aea4b55' + }, + }] + + def _real_extract(self, url): + id = self._match_id(url) + webpage = self._download_webpage(url, id) + + return self.playlist_from_matches( + re.findall(r'data-url\s*=\s*"(https?://(?:www\.)?atscaleconference\.com/videos/[^"]+)"', webpage), + ie='Generic', playlist_id=id, + title=self._og_search_title(webpage), description=self._og_search_description(webpage)) diff --git a/yt_dlp/extractor/audius.py b/yt_dlp/extractor/audius.py index 189d1224f..0105d9db8 100644 --- a/yt_dlp/extractor/audius.py +++ b/yt_dlp/extractor/audius.py @@ -1,8 +1,8 @@ import random from .common import InfoExtractor -from ..utils import ExtractorError, try_get, compat_str, str_or_none -from ..compat import compat_urllib_parse_unquote +from ..compat import compat_str, compat_urllib_parse_unquote +from ..utils import ExtractorError, str_or_none, try_get class AudiusBaseIE(InfoExtractor): diff --git a/yt_dlp/extractor/awaan.py b/yt_dlp/extractor/awaan.py index d289f6be3..6fc938de9 100644 --- a/yt_dlp/extractor/awaan.py +++ b/yt_dlp/extractor/awaan.py @@ -41,7 +41,7 @@ class AWAANBaseIE(InfoExtractor): 'id': 
video_id, 'title': title, 'description': video_data.get('description_en') or video_data.get('description_ar'), - 'thumbnail': format_field(img, template='http://admin.mangomolo.com/analytics/%s'), + 'thumbnail': format_field(img, None, 'http://admin.mangomolo.com/analytics/%s'), 'duration': int_or_none(video_data.get('duration')), 'timestamp': parse_iso8601(video_data.get('create_time'), ' '), 'is_live': is_live, diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index 9cb019a49..5ddeef7b5 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -1,16 +1,12 @@ -import xml.etree.ElementTree import functools import itertools import json import re +import urllib.error +import xml.etree.ElementTree from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_str, - compat_urllib_error, - compat_urlparse, -) +from ..compat import compat_HTTPError, compat_str, compat_urlparse from ..utils import ( ExtractorError, OnDemandPagedList, @@ -391,7 +387,7 @@ class BBCCoUkIE(InfoExtractor): href, programme_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False) except ExtractorError as e: - if not (isinstance(e.exc_info[1], compat_urllib_error.HTTPError) + if not (isinstance(e.exc_info[1], urllib.error.HTTPError) and e.exc_info[1].code in (403, 404)): raise fmts = [] diff --git a/yt_dlp/extractor/bellmedia.py b/yt_dlp/extractor/bellmedia.py index 8f9849d9b..5ae4b917a 100644 --- a/yt_dlp/extractor/bellmedia.py +++ b/yt_dlp/extractor/bellmedia.py @@ -24,7 +24,7 @@ class BellMediaIE(InfoExtractor): )/.*?(?:\b(?:vid(?:eoid)?|clipId)=|-vid|~|%7E|/(?:episode)?)(?P<id>[0-9]{6,})''' _TESTS = [{ 'url': 'https://www.bnnbloomberg.ca/video/david-cockfield-s-top-picks~1403070', - 'md5': '36d3ef559cfe8af8efe15922cd3ce950', + 'md5': '3e5b8e38370741d5089da79161646635', 'info_dict': { 'id': '1403070', 'ext': 'flv', @@ -32,6 +32,14 @@ class BellMediaIE(InfoExtractor): 'description': 'md5:810f7f8c6a83ad5b48677c3f8e5bb2c3', 'upload_date': '20180525', 'timestamp': 1527288600, + 'season_id': 73997, + 'season': '2018', + 'thumbnail': 'http://images2.9c9media.com/image_asset/2018_5_25_baf30cbd-b28d-4a18-9903-4bb8713b00f5_PNG_956x536.jpg', + 'tags': [], + 'categories': ['ETFs'], + 'season_number': 8, + 'duration': 272.038, + 'series': 'Market Call Tonight', }, }, { 'url': 'http://www.thecomedynetwork.ca/video/player?vid=923582', diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index ead0dd88b..d695d9b49 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -677,6 +677,11 @@ class BilibiliAudioIE(BilibiliAudioBaseIE): 'vcodec': 'none' }] + for a_format in formats: + a_format.setdefault('http_headers', {}).update({ + 'Referer': url, + }) + song = self._call_api('song/info', au_id) title = song['title'] statistic = song.get('statistic') or {} @@ -784,7 +789,8 @@ class BiliIntlBaseIE(InfoExtractor): def json2srt(self, json): data = '\n\n'.join( f'{i + 1}\n{srt_subtitles_timecode(line["from"])} --> {srt_subtitles_timecode(line["to"])}\n{line["content"]}' - for i, line in enumerate(json['body']) if line.get('content')) + for i, line in enumerate(traverse_obj(json, ( + 'body', lambda _, l: l['content'] and l['from'] and l['to'])))) return data def _get_subtitles(self, *, ep_id=None, aid=None): @@ -947,12 +953,11 @@ class BiliIntlIE(BiliIntlBaseIE): video_id = ep_id or aid webpage = self._download_webpage(url, video_id) # Bstation layout - initial_data = self._parse_json(self._search_regex( - 
r'window\.__INITIAL_(?:DATA|STATE)__\s*=\s*({.+?});', webpage, - 'preload state', default='{}'), video_id, fatal=False) or {} - video_data = ( - traverse_obj(initial_data, ('OgvVideo', 'epDetail'), expected_type=dict) - or traverse_obj(initial_data, ('UgcVideo', 'videoData'), expected_type=dict) or {}) + initial_data = ( + self._search_json(r'window\.__INITIAL_(?:DATA|STATE)__\s*=', webpage, 'preload state', video_id, default={}) + or self._search_nuxt_data(webpage, video_id, '__initialState', fatal=False, traverse=None)) + video_data = traverse_obj( + initial_data, ('OgvVideo', 'epDetail'), ('UgcVideo', 'videoData'), ('ugc', 'archive'), expected_type=dict) if season_id and not video_data: # Non-Bstation layout, read through episode list @@ -960,7 +965,7 @@ class BiliIntlIE(BiliIntlBaseIE): video_data = traverse_obj(season_json, ('sections', ..., 'episodes', lambda _, v: str(v['episode_id']) == ep_id), expected_type=dict, get_all=False) - return self._extract_video_info(video_data, ep_id=ep_id, aid=aid) + return self._extract_video_info(video_data or {}, ep_id=ep_id, aid=aid) class BiliIntlSeriesIE(BiliIntlBaseIE): diff --git a/yt_dlp/extractor/bloomberg.py b/yt_dlp/extractor/bloomberg.py index c0aaeae02..c842c342c 100644 --- a/yt_dlp/extractor/bloomberg.py +++ b/yt_dlp/extractor/bloomberg.py @@ -7,13 +7,11 @@ class BloombergIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?bloomberg\.com/(?:[^/]+/)*(?P<id>[^/?#]+)' _TESTS = [{ - 'url': 'http://www.bloomberg.com/news/videos/b/aaeae121-5949-481e-a1ce-4562db6f5df2', - # The md5 checksum changes + 'url': 'https://www.bloomberg.com/news/videos/2021-09-14/apple-unveils-the-new-iphone-13-stock-doesn-t-move-much-video', 'info_dict': { - 'id': 'qurhIVlJSB6hzkVi229d8g', + 'id': 'V8cFcYMxTHaMcEiiYVr39A', 'ext': 'flv', - 'title': 'Shah\'s Presentation on Foreign-Exchange Strategies', - 'description': 'md5:a8ba0302912d03d246979735c17d2761', + 'title': 'Apple Unveils the New IPhone 13, Stock Doesn\'t Move Much', }, 'params': { 'format': 'best[format_id^=hds]', @@ -57,7 +55,7 @@ class BloombergIE(InfoExtractor): title = re.sub(': Video$', '', self._og_search_title(webpage)) embed_info = self._download_json( - 'http://www.bloomberg.com/api/embed?id=%s' % video_id, video_id) + 'http://www.bloomberg.com/multimedia/api/embed?id=%s' % video_id, video_id) formats = [] for stream in embed_info['streams']: stream_url = stream.get('url') diff --git a/yt_dlp/extractor/brightcove.py b/yt_dlp/extractor/brightcove.py index 936c34e15..a5412897d 100644 --- a/yt_dlp/extractor/brightcove.py +++ b/yt_dlp/extractor/brightcove.py @@ -600,9 +600,9 @@ class BrightcoveNewIE(AdobePassIE): account_id, player_id, embed, content_type, video_id = self._match_valid_url(url).groups() policy_key_id = '%s_%s' % (account_id, player_id) - policy_key = self._downloader.cache.load('brightcove', policy_key_id) + policy_key = self.cache.load('brightcove', policy_key_id) policy_key_extracted = False - store_pk = lambda x: self._downloader.cache.store('brightcove', policy_key_id, x) + store_pk = lambda x: self.cache.store('brightcove', policy_key_id, x) def extract_policy_key(): base_url = 'http://players.brightcove.net/%s/%s_%s/' % (account_id, player_id, embed) diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py index cac3f1e9d..999b7bc53 100644 --- a/yt_dlp/extractor/cbc.py +++ b/yt_dlp/extractor/cbc.py @@ -304,13 +304,13 @@ class CBCGemIE(InfoExtractor): def _get_claims_token(self, email, password): if not self.claims_token_valid(): self._claims_token = 
self._new_claims_token(email, password) - self._downloader.cache.store(self._NETRC_MACHINE, 'claims_token', self._claims_token) + self.cache.store(self._NETRC_MACHINE, 'claims_token', self._claims_token) return self._claims_token def _real_initialize(self): if self.claims_token_valid(): return - self._claims_token = self._downloader.cache.load(self._NETRC_MACHINE, 'claims_token') + self._claims_token = self.cache.load(self._NETRC_MACHINE, 'claims_token') def _find_secret_formats(self, formats, video_id): """ Find a valid video url and convert it to the secret variant """ diff --git a/yt_dlp/extractor/ccc.py b/yt_dlp/extractor/ccc.py index b11e1f74e..1bc0f07f2 100644 --- a/yt_dlp/extractor/ccc.py +++ b/yt_dlp/extractor/ccc.py @@ -75,6 +75,7 @@ class CCCIE(InfoExtractor): 'thumbnail': event_data.get('thumb_url'), 'timestamp': parse_iso8601(event_data.get('date')), 'duration': int_or_none(event_data.get('length')), + 'view_count': int_or_none(event_data.get('view_count')), 'tags': event_data.get('tags'), 'formats': formats, } diff --git a/yt_dlp/extractor/cda.py b/yt_dlp/extractor/cda.py index 9b257bee9..6d01c60d5 100644 --- a/yt_dlp/extractor/cda.py +++ b/yt_dlp/extractor/cda.py @@ -1,13 +1,9 @@ import codecs -import re import json +import re from .common import InfoExtractor -from ..compat import ( - compat_chr, - compat_ord, - compat_urllib_parse_unquote, -) +from ..compat import compat_ord, compat_urllib_parse_unquote from ..utils import ( ExtractorError, float_or_none, @@ -16,8 +12,8 @@ from ..utils import ( multipart_encode, parse_duration, random_birthday, - urljoin, try_get, + urljoin, ) @@ -144,7 +140,7 @@ class CDAIE(InfoExtractor): b = [] for c in a: f = compat_ord(c) - b.append(compat_chr(33 + (f + 14) % 94) if 33 <= f <= 126 else compat_chr(f)) + b.append(chr(33 + (f + 14) % 94) if 33 <= f <= 126 else chr(f)) a = ''.join(b) a = a.replace('.cda.mp4', '') for p in ('.2cda.pl', '.3cda.pl'): diff --git a/yt_dlp/extractor/chingari.py b/yt_dlp/extractor/chingari.py index 7e8c0bfc9..e54d92a86 100644 --- a/yt_dlp/extractor/chingari.py +++ b/yt_dlp/extractor/chingari.py @@ -1,11 +1,11 @@ import itertools import json +import urllib.parse from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote_plus from ..utils import ( - clean_html, ExtractorError, + clean_html, int_or_none, str_to_int, url_or_none, @@ -47,8 +47,8 @@ class ChingariBaseIE(InfoExtractor): 'id': id, 'extractor_key': ChingariIE.ie_key(), 'extractor': 'Chingari', - 'title': compat_urllib_parse_unquote_plus(clean_html(post_data.get('caption'))), - 'description': compat_urllib_parse_unquote_plus(clean_html(post_data.get('caption'))), + 'title': urllib.parse.unquote_plus(clean_html(post_data.get('caption'))), + 'description': urllib.parse.unquote_plus(clean_html(post_data.get('caption'))), 'duration': media_data.get('duration'), 'thumbnail': url_or_none(thumbnail), 'like_count': post_data.get('likeCount'), diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index ebeca4395..4fbcfe203 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1,6 +1,10 @@ import base64 import collections +import getpass import hashlib +import http.client +import http.cookiejar +import http.cookies import itertools import json import math @@ -9,24 +13,12 @@ import os import random import sys import time +import urllib.parse +import urllib.request import xml.etree.ElementTree -from ..compat import ( - compat_cookiejar_Cookie, - compat_cookies_SimpleCookie, - compat_etree_fromstring, - 
compat_expanduser, - compat_getpass, - compat_http_client, - compat_os_name, - compat_str, - compat_urllib_error, - compat_urllib_parse_unquote, - compat_urllib_parse_urlencode, - compat_urllib_request, - compat_urlparse, - re, -) +from ..compat import functools, re # isort: split +from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name from ..downloader import FileDownloader from ..downloader.f4m import get_base_url, remove_encrypted_media from ..utils import ( @@ -35,6 +27,7 @@ from ..utils import ( ExtractorError, GeoRestrictedError, GeoUtils, + LenientJSONDecoder, RegexNotFoundError, UnsupportedError, age_restricted, @@ -384,6 +377,11 @@ class InfoExtractor: release_year: Year (YYYY) when the album was released. composer: Composer of the piece + The following fields should only be set for clips that should be cut from the original video: + + section_start: Start time of the section in seconds + section_end: End time of the section in seconds + Unless mentioned otherwise, the fields should be Unicode strings. Unless mentioned otherwise, None is equivalent to absence of information. @@ -610,8 +608,7 @@ class InfoExtractor: if ip_block: self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block) - self._downloader.write_debug( - '[debug] Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip) + self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For') return # Path 2: bypassing based on country code @@ -666,7 +663,7 @@ class InfoExtractor: if hasattr(e, 'countries'): kwargs['countries'] = e.countries raise type(e)(e.orig_msg, **kwargs) - except compat_http_client.IncompleteRead as e: + except http.client.IncompleteRead as e: raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url)) except (KeyError, StopIteration) as e: raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url)) @@ -690,6 +687,14 @@ class InfoExtractor: """Sets a YoutubeDL instance as the downloader for this IE.""" self._downloader = downloader + @property + def cache(self): + return self._downloader.cache + + @property + def cookiejar(self): + return self._downloader.cookiejar + def _initialize_pre_login(self): """ Intialization before login. Redefine in subclasses.""" pass @@ -717,7 +722,7 @@ class InfoExtractor: @staticmethod def __can_accept_status_code(err, expected_status): - assert isinstance(err, compat_urllib_error.HTTPError) + assert isinstance(err, urllib.error.HTTPError) if expected_status is None: return False elif callable(expected_status): @@ -725,7 +730,14 @@ class InfoExtractor: else: return err.code in variadic(expected_status) - def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None): + def _create_request(self, url_or_request, data=None, headers=None, query=None): + if isinstance(url_or_request, urllib.request.Request): + return update_Request(url_or_request, data=data, headers=headers, query=query) + if query: + url_or_request = update_url_query(url_or_request, query) + return sanitized_Request(url_or_request, data, headers or {}) + + def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None): """ Return the response handle. @@ -753,21 +765,13 @@ class InfoExtractor: # geo unrestricted country. We will do so once we encounter any # geo restriction error. 
if self._x_forwarded_for_ip: - if 'X-Forwarded-For' not in headers: - headers['X-Forwarded-For'] = self._x_forwarded_for_ip + headers = (headers or {}).copy() + headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip) - if isinstance(url_or_request, compat_urllib_request.Request): - url_or_request = update_Request( - url_or_request, data=data, headers=headers, query=query) - else: - if query: - url_or_request = update_url_query(url_or_request, query) - if data is not None or headers: - url_or_request = sanitized_Request(url_or_request, data, headers) try: - return self._downloader.urlopen(url_or_request) + return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query)) except network_exceptions as err: - if isinstance(err, compat_urllib_error.HTTPError): + if isinstance(err, urllib.error.HTTPError): if self.__can_accept_status_code(err, expected_status): # Retain reference to error to prevent file object from # being closed before it can be read. Works around the @@ -788,14 +792,42 @@ class InfoExtractor: self.report_warning(errmsg) return False - def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None): + def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, + encoding=None, data=None, headers={}, query={}, expected_status=None): """ Return a tuple (page content as string, URL handle). - See _download_webpage docstring for arguments specification. + Arguments: + url_or_request -- plain text URL as a string or + a urllib.request.Request object + video_id -- Video/playlist/item identifier (string) + + Keyword arguments: + note -- note printed before downloading (string) + errnote -- note printed in case of an error (string) + fatal -- flag denoting whether error should be considered fatal, + i.e. whether it should cause ExtractionError to be raised, + otherwise a warning will be reported and extraction continued + encoding -- encoding for a page content decoding, guessed automatically + when not explicitly specified + data -- POST data (bytes) + headers -- HTTP headers (dict) + query -- URL query (dict) + expected_status -- allows to accept failed HTTP requests (non 2xx + status code) by explicitly specifying a set of accepted status + codes. Can be any of the following entities: + - an integer type specifying an exact failed status code to + accept + - a list or a tuple of integer types specifying a list of + failed status codes to accept + - a callable accepting an actual failed status code and + returning True if it should be accepted + Note that this argument does not affect success status codes (2xx) + which are always accepted. 
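
To make the `expected_status` contract concrete, a hedged sketch inside an extractor method (URL and handling are illustrative):

```python
# Accept 404 so the error body can be parsed instead of raising.
# expected_status may equally be a list or a callable,
# e.g. expected_status=lambda code: code in (404, 410).
page, urlh = self._download_webpage_handle(
    'https://example.com/api/video/42', '42', expected_status=404)
if urlh.getcode() == 404:
    self.report_warning('video removed; parsing the error page instead')
```
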
""" + # Strip hashes from the URL (#1038) - if isinstance(url_or_request, (compat_str, str)): + if isinstance(url_or_request, str): url_or_request = url_or_request.partition('#')[0] urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status) @@ -850,140 +882,48 @@ class InfoExtractor: 'Visit http://blocklist.rkn.gov.ru/ for a block reason.', expected=True) + def _request_dump_filename(self, url, video_id): + basen = f'{video_id}_{url}' + trim_length = self.get_param('trim_file_name') or 240 + if len(basen) > trim_length: + h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest() + basen = basen[:trim_length - len(h)] + h + filename = sanitize_filename(f'{basen}.dump', restricted=True) + # Working around MAX_PATH limitation on Windows (see + # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx) + if compat_os_name == 'nt': + absfilepath = os.path.abspath(filename) + if len(absfilepath) > 259: + filename = fR'\\?\{absfilepath}' + return filename + + def __decode_webpage(self, webpage_bytes, encoding, headers): + if not encoding: + encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes) + try: + return webpage_bytes.decode(encoding, 'replace') + except LookupError: + return webpage_bytes.decode('utf-8', 'replace') + def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None): - content_type = urlh.headers.get('Content-Type', '') webpage_bytes = urlh.read() if prefix is not None: webpage_bytes = prefix + webpage_bytes - if not encoding: - encoding = self._guess_encoding_from_content(content_type, webpage_bytes) if self.get_param('dump_intermediate_pages', False): self.to_screen('Dumping request to ' + urlh.geturl()) dump = base64.b64encode(webpage_bytes).decode('ascii') self._downloader.to_screen(dump) - if self.get_param('write_pages', False): - basen = f'{video_id}_{urlh.geturl()}' - trim_length = self.get_param('trim_file_name') or 240 - if len(basen) > trim_length: - h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest() - basen = basen[:trim_length - len(h)] + h - raw_filename = basen + '.dump' - filename = sanitize_filename(raw_filename, restricted=True) - self.to_screen('Saving request to ' + filename) - # Working around MAX_PATH limitation on Windows (see - # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx) - if compat_os_name == 'nt': - absfilepath = os.path.abspath(filename) - if len(absfilepath) > 259: - filename = '\\\\?\\' + absfilepath + if self.get_param('write_pages'): + filename = self._request_dump_filename(urlh.geturl(), video_id) + self.to_screen(f'Saving request to {filename}') with open(filename, 'wb') as outf: outf.write(webpage_bytes) - try: - content = webpage_bytes.decode(encoding, 'replace') - except LookupError: - content = webpage_bytes.decode('utf-8', 'replace') - + content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers) self.__check_blocked(content) return content - def _download_webpage( - self, url_or_request, video_id, note=None, errnote=None, - fatal=True, tries=1, timeout=5, encoding=None, data=None, - headers={}, query={}, expected_status=None): - """ - Return the data of the page as a string. 
- - Arguments: - url_or_request -- plain text URL as a string or - a compat_urllib_request.Requestobject - video_id -- Video/playlist/item identifier (string) - - Keyword arguments: - note -- note printed before downloading (string) - errnote -- note printed in case of an error (string) - fatal -- flag denoting whether error should be considered fatal, - i.e. whether it should cause ExtractionError to be raised, - otherwise a warning will be reported and extraction continued - tries -- number of tries - timeout -- sleep interval between tries - encoding -- encoding for a page content decoding, guessed automatically - when not explicitly specified - data -- POST data (bytes) - headers -- HTTP headers (dict) - query -- URL query (dict) - expected_status -- allows to accept failed HTTP requests (non 2xx - status code) by explicitly specifying a set of accepted status - codes. Can be any of the following entities: - - an integer type specifying an exact failed status code to - accept - - a list or a tuple of integer types specifying a list of - failed status codes to accept - - a callable accepting an actual failed status code and - returning True if it should be accepted - Note that this argument does not affect success status codes (2xx) - which are always accepted. - """ - - success = False - try_count = 0 - while success is False: - try: - res = self._download_webpage_handle( - url_or_request, video_id, note, errnote, fatal, - encoding=encoding, data=data, headers=headers, query=query, - expected_status=expected_status) - success = True - except compat_http_client.IncompleteRead as e: - try_count += 1 - if try_count >= tries: - raise e - self._sleep(timeout, video_id) - if res is False: - return res - else: - content, _ = res - return content - - def _download_xml_handle( - self, url_or_request, video_id, note='Downloading XML', - errnote='Unable to download XML', transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}, - expected_status=None): - """ - Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle). - - See _download_webpage docstring for arguments specification. - """ - res = self._download_webpage_handle( - url_or_request, video_id, note, errnote, fatal=fatal, - encoding=encoding, data=data, headers=headers, query=query, - expected_status=expected_status) - if res is False: - return res - xml_string, urlh = res - return self._parse_xml( - xml_string, video_id, transform_source=transform_source, - fatal=fatal), urlh - - def _download_xml( - self, url_or_request, video_id, - note='Downloading XML', errnote='Unable to download XML', - transform_source=None, fatal=True, encoding=None, - data=None, headers={}, query={}, expected_status=None): - """ - Return the xml as an xml.etree.ElementTree.Element. - - See _download_webpage docstring for arguments specification. 
- """ - res = self._download_xml_handle( - url_or_request, video_id, note=note, errnote=errnote, - transform_source=transform_source, fatal=fatal, encoding=encoding, - data=data, headers=headers, query=query, - expected_status=expected_status) - return res if res is False else res[0] - def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True): if transform_source: xml_string = transform_source(xml_string) @@ -996,101 +936,126 @@ class InfoExtractor: else: self.report_warning(errmsg + str(ve)) - def _download_json_handle( - self, url_or_request, video_id, note='Downloading JSON metadata', - errnote='Unable to download JSON metadata', transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}, - expected_status=None): - """ - Return a tuple (JSON object, URL handle). - - See _download_webpage docstring for arguments specification. - """ - res = self._download_webpage_handle( - url_or_request, video_id, note, errnote, fatal=fatal, - encoding=encoding, data=data, headers=headers, query=query, - expected_status=expected_status) - if res is False: - return res - json_string, urlh = res - return self._parse_json( - json_string, video_id, transform_source=transform_source, - fatal=fatal), urlh - - def _download_json( - self, url_or_request, video_id, note='Downloading JSON metadata', - errnote='Unable to download JSON metadata', transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}, - expected_status=None): - """ - Return the JSON object as a dict. - - See _download_webpage docstring for arguments specification. - """ - res = self._download_json_handle( - url_or_request, video_id, note=note, errnote=errnote, - transform_source=transform_source, fatal=fatal, encoding=encoding, - data=data, headers=headers, query=query, - expected_status=expected_status) - return res if res is False else res[0] - - def _parse_json(self, json_string, video_id, transform_source=None, fatal=True): - if transform_source: - json_string = transform_source(json_string) + def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, **parser_kwargs): try: - return json.loads(json_string, strict=False) + return json.loads( + json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs) except ValueError as ve: - errmsg = '%s: Failed to parse JSON ' % video_id + errmsg = f'{video_id}: Failed to parse JSON' if fatal: raise ExtractorError(errmsg, cause=ve) else: - self.report_warning(errmsg + str(ve)) + self.report_warning(f'{errmsg}: {ve}') def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True): return self._parse_json( data[data.find('{'):data.rfind('}') + 1], video_id, transform_source, fatal) - def _download_socket_json_handle( - self, url_or_request, video_id, note='Polling socket', - errnote='Unable to poll socket', transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}, - expected_status=None): - """ - Return a tuple (JSON object, URL handle). 
+ def __create_download_methods(name, parser, note, errnote, return_value): + + def parse(ie, content, *args, **kwargs): + if parser is None: + return content + # parser is fetched by name so subclasses can override it + return getattr(ie, parser)(content, *args, **kwargs) + + def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None, + fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None): + res = self._download_webpage_handle( + url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding, + data=data, headers=headers, query=query, expected_status=expected_status) + if res is False: + return res + content, urlh = res + return parse(self, content, video_id, transform_source=transform_source, fatal=fatal), urlh + + def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None, + fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None): + if self.get_param('load_pages'): + url_or_request = self._create_request(url_or_request, data, headers, query) + filename = self._request_dump_filename(url_or_request.full_url, video_id) + self.to_screen(f'Loading request from {filename}') + try: + with open(filename, 'rb') as dumpf: + webpage_bytes = dumpf.read() + except OSError as e: + self.report_warning(f'Unable to load request from disk: {e}') + else: + content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers) + return parse(self, content, video_id, transform_source, fatal) + kwargs = { + 'note': note, + 'errnote': errnote, + 'transform_source': transform_source, + 'fatal': fatal, + 'encoding': encoding, + 'data': data, + 'headers': headers, + 'query': query, + 'expected_status': expected_status, + } + if parser is None: + kwargs.pop('transform_source') + # The method is fetched by name so subclasses can override _download_..._handle + res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs) + return res if res is False else res[0] + + def impersonate(func, name, return_value): + func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}' + func.__doc__ = f''' + @param transform_source Apply this transformation before parsing + @returns {return_value} + + See _download_webpage_handle docstring for other arguments specification + ''' + + impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)') + impersonate(download_content, f'_download_{name}', f'{return_value}') + return download_handle, download_content + + _download_xml_handle, _download_xml = __create_download_methods( + 'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element') + _download_json_handle, _download_json = __create_download_methods( + 'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict') + _download_socket_json_handle, _download_socket_json = __create_download_methods( + 'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict') + __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1] - See _download_webpage docstring for arguments specification. 
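
The factory above stamps out `_download_xml(_handle)`, `_download_json(_handle)` and `_download_socket_json(_handle)` with their historical signatures, so existing call sites keep working unchanged; a hedged sketch of a typical call inside an extractor:

```python
data = self._download_json(
    'https://example.com/api/meta', video_id,   # illustrative URL
    note='Downloading metadata JSON', fatal=False,
    query={'fields': 'title,duration'})
# --write-pages dumps the raw response via _request_dump_filename();
# the dev option --load-pages (download_content above) replays that
# dump instead of hitting the network.
```
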
- """ - res = self._download_webpage_handle( - url_or_request, video_id, note, errnote, fatal=fatal, - encoding=encoding, data=data, headers=headers, query=query, - expected_status=expected_status) - if res is False: - return res - webpage, urlh = res - return self._parse_socket_response_as_json( - webpage, video_id, transform_source=transform_source, - fatal=fatal), urlh - - def _download_socket_json( - self, url_or_request, video_id, note='Polling socket', - errnote='Unable to poll socket', transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}, - expected_status=None): + def _download_webpage( + self, url_or_request, video_id, note=None, errnote=None, + fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs): """ - Return the JSON object as a dict. + Return the data of the page as a string. - See _download_webpage docstring for arguments specification. + Keyword arguments: + tries -- number of tries + timeout -- sleep interval between tries + + See _download_webpage_handle docstring for other arguments specification. """ - res = self._download_socket_json_handle( - url_or_request, video_id, note=note, errnote=errnote, - transform_source=transform_source, fatal=fatal, encoding=encoding, - data=data, headers=headers, query=query, - expected_status=expected_status) - return res if res is False else res[0] + + R''' # NB: These are unused; should they be deprecated? + if tries != 1: + self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage') + if timeout is NO_DEFAULT: + timeout = 5 + else: + self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage') + ''' + + try_count = 0 + while True: + try: + return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs) + except http.client.IncompleteRead as e: + try_count += 1 + if try_count >= tries: + raise e + self._sleep(timeout, video_id) def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs): - idstr = format_field(video_id, template='%s: ') + idstr = format_field(video_id, None, '%s: ') msg = f'[{self.IE_NAME}] {idstr}{msg}' if only_once: if f'WARNING: {msg}' in self._printed_messages: @@ -1136,7 +1101,7 @@ class InfoExtractor: self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')): self.report_warning(msg) return - msg += format_field(self._login_hint(method), template='. %s') + msg += format_field(self._login_hint(method), None, '. 
%s') raise ExtractorError(msg, expected=True) def raise_geo_restricted( @@ -1228,6 +1193,33 @@ class InfoExtractor: self.report_warning('unable to extract %s' % _name + bug_reports_message()) return None + def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='', + contains_pattern='(?s:.+)', fatal=True, default=NO_DEFAULT, **kwargs): + """Searches string for the JSON object specified by start_pattern""" + # NB: end_pattern is only used to reduce the size of the initial match + if default is NO_DEFAULT: + default, has_default = {}, False + else: + fatal, has_default = False, True + + json_string = self._search_regex( + rf'{start_pattern}\s*(?P<json>{{\s*{contains_pattern}\s*}})\s*{end_pattern}', + string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT) + if not json_string: + return default + + _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS) + try: + return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs) + except ExtractorError as e: + if fatal: + raise ExtractorError( + f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id) + elif not has_default: + self.report_warning( + f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id) + return default + def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): """ Like _search_regex, but strips HTML tags and unescapes entities. @@ -1292,7 +1284,7 @@ class InfoExtractor: if tfa is not None: return tfa - return compat_getpass('Type %s and press [Return]: ' % note) + return getpass.getpass('Type %s and press [Return]: ' % note) # Helper functions for extracting OpenGraph info @staticmethod @@ -1343,7 +1335,7 @@ class InfoExtractor: return self._og_search_property('url', html, **kargs) def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs): - return self._html_search_regex(r'(?s)<title>([^<]+)</title>', html, name, fatal=fatal, **kwargs) + return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs) def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs): name = variadic(name) @@ -1400,27 +1392,25 @@ class InfoExtractor: return self._html_search_meta('twitter:player', html, 'twitter card player') - def _search_json_ld(self, html, video_id, expected_type=None, **kwargs): - json_ld_list = list(re.finditer(JSON_LD_RE, html)) - default = kwargs.get('default', NO_DEFAULT) - # JSON-LD may be malformed and thus `fatal` should be respected. - # At the same time `default` may be passed that assumes `fatal=False` - # for _search_regex. Let's simulate the same behavior here as well. 
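
The new `_search_json` combines `_search_regex` with the lenient JSON parsing shown earlier; a hedged usage sketch (the page pattern is made up — the dropbox.py hunk further down shows a real call):

```python
config = self._search_json(
    r'window\.__CONFIG__\s*=', webpage, 'site config', video_id,
    end_pattern=r';\s*</script>', default={})
title = config.get('title')  # default={} implies fatal=False, so a miss is safe
```
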
- fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False - json_ld = [] - for mobj in json_ld_list: - json_ld_item = self._parse_json( - mobj.group('json_ld'), video_id, fatal=fatal) - if not json_ld_item: - continue - if isinstance(json_ld_item, dict): - json_ld.append(json_ld_item) - elif isinstance(json_ld_item, (list, tuple)): - json_ld.extend(json_ld_item) - if json_ld: - json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type) - if json_ld: - return json_ld + def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT): + """Yield all json ld objects in the html""" + if default is not NO_DEFAULT: + fatal = False + for mobj in re.finditer(JSON_LD_RE, html): + json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal) + for json_ld in variadic(json_ld_item): + if isinstance(json_ld, dict): + yield json_ld + + def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT): + """Search for a video in any json ld in the html""" + if default is not NO_DEFAULT: + fatal = False + info = self._json_ld( + list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)), + video_id, fatal=fatal, expected_type=expected_type) + if info: + return info if default is not NO_DEFAULT: return default elif fatal: @@ -1430,7 +1420,7 @@ class InfoExtractor: return {} def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None): - if isinstance(json_ld, compat_str): + if isinstance(json_ld, str): json_ld = self._parse_json(json_ld, video_id, fatal=fatal) if not json_ld: return {} @@ -1451,6 +1441,10 @@ class InfoExtractor: 'ViewAction': 'view', } + def is_type(e, *expected_types): + type = variadic(traverse_obj(e, '@type')) + return any(x in type for x in expected_types) + def extract_interaction_type(e): interaction_type = e.get('interactionType') if isinstance(interaction_type, dict): @@ -1464,9 +1458,7 @@ class InfoExtractor: if not isinstance(interaction_statistic, list): return for is_e in interaction_statistic: - if not isinstance(is_e, dict): - continue - if is_e.get('@type') != 'InteractionCounter': + if not is_type(is_e, 'InteractionCounter'): continue interaction_type = extract_interaction_type(is_e) if not interaction_type: @@ -1503,22 +1495,23 @@ class InfoExtractor: info['chapters'] = chapters def extract_video_object(e): - assert e['@type'] == 'VideoObject' + assert is_type(e, 'VideoObject') author = e.get('author') info.update({ 'url': url_or_none(e.get('contentUrl')), 'title': unescapeHTML(e.get('name')), 'description': unescapeHTML(e.get('description')), - 'thumbnails': [{'url': url_or_none(url)} - for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))], + 'thumbnails': [{'url': url} + for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL')) + if url_or_none(url)], 'duration': parse_duration(e.get('duration')), 'timestamp': unified_timestamp(e.get('uploadDate')), # author can be an instance of 'Organization' or 'Person' types. # both types can have 'name' property(inherited from 'Thing' type). [1] # however some websites are using 'Text' type instead. # 1. 
https://schema.org/VideoObject - 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None, - 'filesize': float_or_none(e.get('contentSize')), + 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None, + 'filesize': int_or_none(float_or_none(e.get('contentSize'))), 'tbr': int_or_none(e.get('bitrate')), 'width': int_or_none(e.get('width')), 'height': int_or_none(e.get('height')), @@ -1534,13 +1527,12 @@ class InfoExtractor: if at_top_level and set(e.keys()) == {'@context', '@graph'}: traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False) break - item_type = e.get('@type') - if expected_type is not None and expected_type != item_type: + if expected_type is not None and not is_type(e, expected_type): continue rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none) if rating is not None: info['average_rating'] = rating - if item_type in ('TVEpisode', 'Episode'): + if is_type(e, 'TVEpisode', 'Episode'): episode_name = unescapeHTML(e.get('name')) info.update({ 'episode': episode_name, @@ -1550,37 +1542,39 @@ class InfoExtractor: if not info.get('title') and episode_name: info['title'] = episode_name part_of_season = e.get('partOfSeason') - if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'): + if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'): info.update({ 'season': unescapeHTML(part_of_season.get('name')), 'season_number': int_or_none(part_of_season.get('seasonNumber')), }) part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries') - if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'): + if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'): info['series'] = unescapeHTML(part_of_series.get('name')) - elif item_type == 'Movie': + elif is_type(e, 'Movie'): info.update({ 'title': unescapeHTML(e.get('name')), 'description': unescapeHTML(e.get('description')), 'duration': parse_duration(e.get('duration')), 'timestamp': unified_timestamp(e.get('dateCreated')), }) - elif item_type in ('Article', 'NewsArticle'): + elif is_type(e, 'Article', 'NewsArticle'): info.update({ 'timestamp': parse_iso8601(e.get('datePublished')), 'title': unescapeHTML(e.get('headline')), 'description': unescapeHTML(e.get('articleBody') or e.get('description')), }) - if traverse_obj(e, ('video', 0, '@type')) == 'VideoObject': + if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'): extract_video_object(e['video'][0]) - elif item_type == 'VideoObject': + elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'): + extract_video_object(e['subjectOf'][0]) + elif is_type(e, 'VideoObject'): extract_video_object(e) if expected_type is None: continue else: break video = e.get('video') - if isinstance(video, dict) and video.get('@type') == 'VideoObject': + if is_type(video, 'VideoObject'): extract_video_object(video) if expected_type is None: continue @@ -1597,15 +1591,13 @@ class InfoExtractor: webpage, 'next.js data', fatal=fatal, **kw), video_id, transform_source=transform_source, fatal=fatal) - def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__'): - ''' Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function. 
''' - # not all website do this, but it can be changed - # https://stackoverflow.com/questions/67463109/how-to-change-or-hide-nuxt-and-nuxt-keyword-in-page-source + def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)): + """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function""" rectx = re.escape(context_name) + FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)' js, arg_keys, arg_vals = self._search_regex( - (r'<script>window\.%s=\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.+?)\)\);?</script>' % rectx, - r'%s\(.*?\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)' % rectx), - webpage, context_name, group=['js', 'arg_keys', 'arg_vals']) + (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'), + webpage, context_name, group=('js', 'arg_keys', 'arg_vals'), fatal=fatal) args = dict(zip(arg_keys.split(','), arg_vals.split(','))) @@ -1613,7 +1605,8 @@ class InfoExtractor: if val in ('undefined', 'void 0'): args[key] = 'null' - return self._parse_json(js_to_json(js, args), video_id)['data'][0] + ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal) + return traverse_obj(ret, traverse) or {} @staticmethod def _hidden_inputs(html): @@ -2166,7 +2159,7 @@ class InfoExtractor: ]), m3u8_doc) def format_url(url): - return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url) + return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url) if self.get_param('hls_split_discontinuity', False): def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None): @@ -2539,7 +2532,7 @@ class InfoExtractor: }) continue - src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src) + src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src) src_url = src_url.strip() if proto == 'm3u8' or src_ext == 'm3u8': @@ -2562,7 +2555,7 @@ class InfoExtractor: 'plugin': 'flowplayer-3.2.0.1', } f4m_url += '&' if '?' in f4m_url else '?' 
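
Looping back to the `_search_nuxt_data` rework earlier in this hunk: it now takes `fatal` and a `traverse` path applied to the parsed `__NUXT__` payload (default `('data', 0)`). A hedged sketch with an invented site layout:

```python
nuxt = self._search_nuxt_data(
    webpage, video_id, fatal=False,   # new: don't raise on a miss
    traverse=('fetch', 0))            # new: custom path into the payload
title = nuxt.get('title')             # returns {} on failure, so .get() is safe
```
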
- f4m_url += compat_urllib_parse_urlencode(f4m_params) + f4m_url += urllib.parse.urlencode(f4m_params) formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)) elif src_ext == 'mpd': formats.extend(self._extract_mpd_formats( @@ -2803,13 +2796,18 @@ class InfoExtractor: mime_type = representation_attrib['mimeType'] content_type = representation_attrib.get('contentType', mime_type.split('/')[0]) - codecs = parse_codecs(representation_attrib.get('codecs', '')) + codec_str = representation_attrib.get('codecs', '') + # Some kind of binary subtitle found in some youtube livestreams + if mime_type == 'application/x-rawcc': + codecs = {'scodec': codec_str} + else: + codecs = parse_codecs(codec_str) if content_type not in ('video', 'audio', 'text'): if mime_type == 'image/jpeg': content_type = mime_type - elif codecs['vcodec'] != 'none': + elif codecs.get('vcodec', 'none') != 'none': content_type = 'video' - elif codecs['acodec'] != 'none': + elif codecs.get('acodec', 'none') != 'none': content_type = 'audio' elif codecs.get('scodec', 'none') != 'none': content_type = 'text' @@ -2827,7 +2825,7 @@ class InfoExtractor: if re.match(r'^https?://', base_url): break if mpd_base_url and base_url.startswith('/'): - base_url = compat_urlparse.urljoin(mpd_base_url, base_url) + base_url = urllib.parse.urljoin(mpd_base_url, base_url) elif mpd_base_url and not re.match(r'^https?://', base_url): if not mpd_base_url.endswith('/'): mpd_base_url += '/' @@ -3097,7 +3095,7 @@ class InfoExtractor: sampling_rate = int_or_none(track.get('SamplingRate')) track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern) - track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern) + track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern) fragments = [] fragment_ctx = { @@ -3116,7 +3114,7 @@ class InfoExtractor: fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat for _ in range(fragment_repeat): fragments.append({ - 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern), + 'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern), 'duration': fragment_ctx['duration'] / stream_timescale, }) fragment_ctx['time'] += fragment_ctx['duration'] @@ -3184,7 +3182,8 @@ class InfoExtractor: return f return {} - def _media_formats(src, cur_media_type, type_info={}): + def _media_formats(src, cur_media_type, type_info=None): + type_info = type_info or {} full_url = absolute_url(src) ext = type_info.get('ext') or determine_ext(full_url) if ext == 'm3u8': @@ -3202,6 +3201,7 @@ class InfoExtractor: formats = [{ 'url': full_url, 'vcodec': 'none' if cur_media_type == 'audio' else None, + 'ext': ext, }] return is_plain_url, formats @@ -3228,7 +3228,8 @@ class InfoExtractor: media_attributes = extract_attributes(media_tag) src = strip_or_none(media_attributes.get('src')) if src: - _, formats = _media_formats(src, media_type) + f = parse_content_type(media_attributes.get('type')) + _, formats = _media_formats(src, media_type, f) media_info['formats'].extend(formats) media_info['thumbnail'] = absolute_url(media_attributes.get('poster')) if media_content: @@ -3357,7 +3358,7 @@ class InfoExtractor: return formats, subtitles def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]): - query = compat_urlparse.urlparse(url).query + query = urllib.parse.urlparse(url).query url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', 
url) mobj = re.search( r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url) @@ -3463,7 +3464,7 @@ class InfoExtractor: if not isinstance(track, dict): continue track_kind = track.get('kind') - if not track_kind or not isinstance(track_kind, compat_str): + if not track_kind or not isinstance(track_kind, str): continue if track_kind.lower() not in ('captions', 'subtitles'): continue @@ -3536,7 +3537,7 @@ class InfoExtractor: # Often no height is provided but there is a label in # format like "1080p", "720p SD", or 1080. height = int_or_none(self._search_regex( - r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''), + r'^(\d{3,4})[pP]?(?:\b|$)', str(source.get('label') or ''), 'height', default=None)) a_format = { 'url': source_url, @@ -3588,17 +3589,15 @@ class InfoExtractor: def _set_cookie(self, domain, name, value, expire_time=None, port=None, path='/', secure=False, discard=False, rest={}, **kwargs): - cookie = compat_cookiejar_Cookie( + cookie = http.cookiejar.Cookie( 0, name, value, port, port is not None, domain, True, domain.startswith('.'), path, True, secure, expire_time, discard, None, None, rest) - self._downloader.cookiejar.set_cookie(cookie) + self.cookiejar.set_cookie(cookie) def _get_cookies(self, url): - """ Return a compat_cookies_SimpleCookie with the cookies for the url """ - req = sanitized_Request(url) - self._downloader.cookiejar.add_cookie_header(req) - return compat_cookies_SimpleCookie(req.get_header('Cookie')) + """ Return a http.cookies.SimpleCookie with the cookies for the url """ + return http.cookies.SimpleCookie(self._downloader._calc_cookies(url)) def _apply_first_set_cookie_header(self, url_handle, cookie): """ @@ -3742,7 +3741,7 @@ class InfoExtractor: def _get_automatic_captions(self, *args, **kwargs): raise NotImplementedError('This method must be implemented by subclasses') - @property + @functools.cached_property def _cookies_passed(self): """Whether cookies have been passed to YoutubeDL""" return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None @@ -3764,10 +3763,10 @@ class InfoExtractor: return headers def _generic_id(self, url): - return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0]) + return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0]) def _generic_title(self, url): - return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]) + return urllib.parse.unquote(os.path.splitext(url_basename(url))[0]) @staticmethod def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None): diff --git a/yt_dlp/extractor/commonprotocols.py b/yt_dlp/extractor/commonprotocols.py index e8f19b9e0..2f93e8ea5 100644 --- a/yt_dlp/extractor/commonprotocols.py +++ b/yt_dlp/extractor/commonprotocols.py @@ -1,5 +1,6 @@ +import urllib.parse + from .common import InfoExtractor -from ..compat import compat_urlparse class RtmpIE(InfoExtractor): @@ -23,7 +24,7 @@ class RtmpIE(InfoExtractor): 'formats': [{ 'url': url, 'ext': 'flv', - 'format_id': compat_urlparse.urlparse(url).scheme, + 'format_id': urllib.parse.urlparse(url).scheme, }], } diff --git a/yt_dlp/extractor/crunchyroll.py b/yt_dlp/extractor/crunchyroll.py index bb1dbbaad..6877e1a3f 100644 --- a/yt_dlp/extractor/crunchyroll.py +++ b/yt_dlp/extractor/crunchyroll.py @@ -1,19 +1,20 @@ import base64 -import re import json -import zlib - +import re +import urllib.request import xml.etree.ElementTree +import zlib from hashlib import sha1 
-from math import pow, sqrt, floor +from math import floor, pow, sqrt + from .common import InfoExtractor from .vrv import VRVBaseIE +from ..aes import aes_cbc_decrypt from ..compat import ( compat_b64decode, compat_etree_fromstring, compat_str, compat_urllib_parse_urlencode, - compat_urllib_request, compat_urlparse, ) from ..utils import ( @@ -22,8 +23,8 @@ from ..utils import ( extract_attributes, float_or_none, format_field, - intlist_to_bytes, int_or_none, + intlist_to_bytes, join_nonempty, lowercase_escape, merge_dicts, @@ -34,9 +35,6 @@ from ..utils import ( try_get, xpath_text, ) -from ..aes import ( - aes_cbc_decrypt, -) class CrunchyrollBaseIE(InfoExtractor): @@ -259,7 +257,7 @@ class CrunchyrollIE(CrunchyrollBaseIE, VRVBaseIE): } def _download_webpage(self, url_or_request, *args, **kwargs): - request = (url_or_request if isinstance(url_or_request, compat_urllib_request.Request) + request = (url_or_request if isinstance(url_or_request, urllib.request.Request) else sanitized_Request(url_or_request)) # Accept-Language must be set explicitly to accept any language to avoid issues # similar to https://github.com/ytdl-org/youtube-dl/issues/6797. @@ -728,11 +726,12 @@ class CrunchyrollBetaBaseIE(CrunchyrollBaseIE): headers={ 'Authorization': auth_response['token_type'] + ' ' + auth_response['access_token'] }) - bucket = policy_response['cms']['bucket'] + cms = traverse_obj(policy_response, 'cms_beta', 'cms') + bucket = cms['bucket'] params = { - 'Policy': policy_response['cms']['policy'], - 'Signature': policy_response['cms']['signature'], - 'Key-Pair-Id': policy_response['cms']['key_pair_id'] + 'Policy': cms['policy'], + 'Signature': cms['signature'], + 'Key-Pair-Id': cms['key_pair_id'] } locale = traverse_obj(initial_state, ('localization', 'locale')) if locale: diff --git a/yt_dlp/extractor/curiositystream.py b/yt_dlp/extractor/curiositystream.py index 5b76b29ff..a105b6ce2 100644 --- a/yt_dlp/extractor/curiositystream.py +++ b/yt_dlp/extractor/curiositystream.py @@ -1,12 +1,8 @@ import re from .common import InfoExtractor -from ..utils import ( - int_or_none, - urlencode_postdata, - compat_str, - ExtractorError, -) +from ..compat import compat_str +from ..utils import ExtractorError, int_or_none, urlencode_postdata class CuriosityStreamBaseIE(InfoExtractor): @@ -23,6 +19,11 @@ class CuriosityStreamBaseIE(InfoExtractor): def _call_api(self, path, video_id, query=None): headers = {} + if not self._auth_token: + auth_cookie = self._get_cookies('https://curiositystream.com').get('auth_token') + if auth_cookie: + self.write_debug('Obtained auth_token cookie') + self._auth_token = auth_cookie.value if self._auth_token: headers['X-Auth-Token'] = self._auth_token result = self._download_json( @@ -45,7 +46,7 @@ class CuriosityStreamIE(CuriosityStreamBaseIE): IE_NAME = 'curiositystream' _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/video/(?P<id>\d+)' _TESTS = [{ - 'url': 'https://app.curiositystream.com/video/2', + 'url': 'http://app.curiositystream.com/video/2', 'info_dict': { 'id': '2', 'ext': 'mp4', diff --git a/yt_dlp/extractor/cwtv.py b/yt_dlp/extractor/cwtv.py index 07239f39c..9b83264ee 100644 --- a/yt_dlp/extractor/cwtv.py +++ b/yt_dlp/extractor/cwtv.py @@ -91,4 +91,5 @@ class CWTVIE(InfoExtractor): 'timestamp': parse_iso8601(video_data.get('start_time')), 'age_limit': parse_age_limit(video_data.get('rating')), 'ie_key': 'ThePlatform', + 'thumbnail': video_data.get('large_thumbnail') } diff --git a/yt_dlp/extractor/dailymotion.py b/yt_dlp/extractor/dailymotion.py index 
3b090d5e0..46438891f 100644 --- a/yt_dlp/extractor/dailymotion.py +++ b/yt_dlp/extractor/dailymotion.py @@ -5,13 +5,15 @@ import re from .common import InfoExtractor from ..compat import compat_HTTPError from ..utils import ( + ExtractorError, + OnDemandPagedList, age_restricted, clean_html, - ExtractorError, int_or_none, - OnDemandPagedList, + traverse_obj, try_get, unescapeHTML, + unsmuggle_url, urlencode_postdata, ) @@ -220,6 +222,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor): return urls def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url) video_id, playlist_id = self._match_valid_url(url).groups() if playlist_id: @@ -252,7 +255,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor): metadata = self._download_json( 'https://www.dailymotion.com/player/metadata/video/' + xid, xid, 'Downloading metadata JSON', - query={'app': 'com.dailymotion.neon'}) + query=traverse_obj(smuggled_data, 'query') or {'app': 'com.dailymotion.neon'}) error = metadata.get('error') if error: diff --git a/yt_dlp/extractor/dailywire.py b/yt_dlp/extractor/dailywire.py new file mode 100644 index 000000000..1f27797ad --- /dev/null +++ b/yt_dlp/extractor/dailywire.py @@ -0,0 +1,114 @@ +from .common import InfoExtractor +from ..utils import ( + determine_ext, + float_or_none, + join_nonempty, + traverse_obj, + url_or_none, +) + + +class DailyWireBaseIE(InfoExtractor): + _JSON_PATH = { + 'episode': ('props', 'pageProps', 'episodeData', 'episode'), + 'videos': ('props', 'pageProps', 'videoData', 'video'), + 'podcasts': ('props', 'pageProps', 'episode'), + } + + def _get_json(self, url): + sites_type, slug = self._match_valid_url(url).group('sites_type', 'id') + json_data = self._search_nextjs_data(self._download_webpage(url, slug), slug) + return slug, traverse_obj(json_data, self._JSON_PATH[sites_type]) + + +class DailyWireIE(DailyWireBaseIE): + _VALID_URL = r'https?://(?:www\.)dailywire(?:\.com)/(?P<sites_type>episode|videos)/(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://www.dailywire.com/episode/1-fauci', + 'info_dict': { + 'id': 'ckzsl50xnqpy30850in3v4bu7', + 'ext': 'mp4', + 'display_id': '1-fauci', + 'title': '1. Fauci', + 'description': 'md5:9df630347ef85081b7e97dd30bc22853', + 'thumbnail': 'https://daily-wire-production.imgix.net/episodes/ckzsl50xnqpy30850in3v4bu7/ckzsl50xnqpy30850in3v4bu7-1648237399554.jpg', + 'creator': 'Caroline Roberts', + 'series_id': 'ckzplm0a097fn0826r2vc3j7h', + 'series': 'China: The Enemy Within', + } + }, { + 'url': 'https://www.dailywire.com/episode/ep-124-bill-maher', + 'info_dict': { + 'id': 'cl0ngbaalplc80894sfdo9edf', + 'ext': 'mp3', + 'display_id': 'ep-124-bill-maher', + 'title': 'Ep. 
124 - Bill Maher', + 'thumbnail': 'https://daily-wire-production.imgix.net/episodes/cl0ngbaalplc80894sfdo9edf/cl0ngbaalplc80894sfdo9edf-1647065568518.jpg', + 'creator': 'Caroline Roberts', + 'description': 'md5:adb0de584bcfa9c41374999d9e324e98', + 'series_id': 'cjzvep7270hp00786l9hwccob', + 'series': 'The Sunday Special', + } + }, { + 'url': 'https://www.dailywire.com/videos/the-hyperions', + 'only_matching': True, + }] + + def _real_extract(self, url): + slug, episode_info = self._get_json(url) + urls = traverse_obj( + episode_info, (('segments', 'videoUrl'), ..., ('video', 'audio')), expected_type=url_or_none) + + formats, subtitles = [], {} + for url in urls: + if determine_ext(url) != 'm3u8': + formats.append({'url': url}) + continue + format_, subs_ = self._extract_m3u8_formats_and_subtitles(url, slug) + formats.extend(format_) + self._merge_subtitles(subs_, target=subtitles) + self._sort_formats(formats) + return { + 'id': episode_info['id'], + 'display_id': slug, + 'title': traverse_obj(episode_info, 'title', 'name'), + 'description': episode_info.get('description'), + 'creator': join_nonempty(('createdBy', 'firstName'), ('createdBy', 'lastName'), from_dict=episode_info, delim=' '), + 'duration': float_or_none(episode_info.get('duration')), + 'is_live': episode_info.get('isLive'), + 'thumbnail': traverse_obj(episode_info, 'thumbnail', 'image', expected_type=url_or_none), + 'formats': formats, + 'subtitles': subtitles, + 'series_id': traverse_obj(episode_info, ('show', 'id')), + 'series': traverse_obj(episode_info, ('show', 'name')), + } + + +class DailyWirePodcastIE(DailyWireBaseIE): + _VALID_URL = r'https?://(?:www\.)dailywire(?:\.com)/(?P<sites_type>podcasts)/(?P<podcaster>[\w-]+/(?P<id>[\w-]+))' + _TESTS = [{ + 'url': 'https://www.dailywire.com/podcasts/morning-wire/get-ready-for-recession-6-15-22', + 'info_dict': { + 'id': 'cl4f01d0w8pbe0a98ydd0cfn1', + 'ext': 'm4a', + 'display_id': 'get-ready-for-recession-6-15-22', + 'title': 'Get Ready for Recession | 6.15.22', + 'description': 'md5:c4afbadda4e1c38a4496f6d62be55634', + 'thumbnail': 'https://daily-wire-production.imgix.net/podcasts/ckx4otgd71jm508699tzb6hf4-1639506575562.jpg', + 'duration': 900.117667, + } + }] + + def _real_extract(self, url): + slug, episode_info = self._get_json(url) + audio_id = traverse_obj(episode_info, 'audioMuxPlaybackId', 'VUsAipTrBVSgzw73SpC2DAJD401TYYwEp') + + return { + 'id': episode_info['id'], + 'url': f'https://stream.media.dailywire.com/{audio_id}/audio.m4a', + 'display_id': slug, + 'title': episode_info.get('title'), + 'duration': float_or_none(episode_info.get('duration')), + 'thumbnail': episode_info.get('thumbnail'), + 'description': episode_info.get('description'), + } diff --git a/yt_dlp/extractor/digitalconcerthall.py b/yt_dlp/extractor/digitalconcerthall.py index c891ad0a6..3813a51fe 100644 --- a/yt_dlp/extractor/digitalconcerthall.py +++ b/yt_dlp/extractor/digitalconcerthall.py @@ -86,7 +86,7 @@ class DigitalConcertHallIE(InfoExtractor): }) m3u8_url = traverse_obj( - stream_info, ('channel', lambda x: x.startswith('vod_mixed'), 'stream', 0, 'url'), get_all=False) + stream_info, ('channel', lambda k, _: k.startswith('vod_mixed'), 'stream', 0, 'url'), get_all=False) formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', 'm3u8_native', fatal=False) self._sort_formats(formats) diff --git a/yt_dlp/extractor/dropbox.py b/yt_dlp/extractor/dropbox.py index 6ac0c713a..0d12513b2 100644 --- a/yt_dlp/extractor/dropbox.py +++ b/yt_dlp/extractor/dropbox.py @@ -53,8 +53,8 @@ class 
DropboxIE(InfoExtractor): else: raise ExtractorError('Password protected video, use --video-password <password>', expected=True) - json_string = self._html_search_regex(r'InitReact\.mountComponent\(.*?,\s*(\{.+\})\s*?\)', webpage, 'Info JSON') - info_json = self._parse_json(json_string, video_id).get('props') + info_json = self._search_json(r'InitReact\.mountComponent\(.*?,', webpage, 'mountComponent', video_id, + contains_pattern=r'.+?"preview".+?', end_pattern=r'\)')['props'] transcode_url = traverse_obj(info_json, ((None, 'preview'), 'file', 'preview', 'content', 'transcode_url'), get_all=False) formats, subtitles = self._extract_m3u8_formats_and_subtitles(transcode_url, video_id) diff --git a/yt_dlp/extractor/dropout.py b/yt_dlp/extractor/dropout.py index 475825eb8..e280b1c9f 100644 --- a/yt_dlp/extractor/dropout.py +++ b/yt_dlp/extractor/dropout.py @@ -1,8 +1,8 @@ from .common import InfoExtractor from .vimeo import VHXEmbedIE from ..utils import ( - clean_html, ExtractorError, + clean_html, get_element_by_class, get_element_by_id, get_elements_by_class, @@ -96,11 +96,12 @@ class DropoutIE(InfoExtractor): def _login(self, display_id): username, password = self._get_login_info() - if not (username and password): - self.raise_login_required(method='password') + if not username: + return True response = self._download_webpage( - self._LOGIN_URL, display_id, note='Logging in', data=urlencode_postdata({ + self._LOGIN_URL, display_id, note='Logging in', fatal=False, + data=urlencode_postdata({ 'email': username, 'password': password, 'authenticity_token': self._get_authenticity_token(display_id), @@ -110,19 +111,25 @@ class DropoutIE(InfoExtractor): user_has_subscription = self._search_regex( r'user_has_subscription:\s*["\'](.+?)["\']', response, 'subscription status', default='none') if user_has_subscription.lower() == 'true': - return response + return elif user_has_subscription.lower() == 'false': - raise ExtractorError('Account is not subscribed') + return 'Account is not subscribed' else: - raise ExtractorError('Incorrect username/password') + return 'Incorrect username/password' def _real_extract(self, url): display_id = self._match_id(url) - try: - self._login(display_id) - webpage = self._download_webpage(url, display_id, note='Downloading video webpage') - finally: - self._download_webpage('https://www.dropout.tv/logout', display_id, note='Logging out', fatal=False) + + webpage = None + if self._get_cookies('https://www.dropout.tv').get('_session'): + webpage = self._download_webpage(url, display_id) + if not webpage or '<div id="watch-unauthorized"' in webpage: + login_err = self._login(display_id) + webpage = self._download_webpage(url, display_id) + if login_err and '<div id="watch-unauthorized"' in webpage: + if login_err is True: + self.raise_login_required(method='any') + raise ExtractorError(login_err, expected=True) embed_url = self._search_regex(r'embed_url:\s*["\'](.+?)["\']', webpage, 'embed url') thumbnail = self._og_search_thumbnail(webpage) @@ -137,7 +144,7 @@ class DropoutIE(InfoExtractor): return { '_type': 'url_transparent', 'ie_key': VHXEmbedIE.ie_key(), - 'url': embed_url, + 'url': VHXEmbedIE._smuggle_referrer(embed_url, 'https://www.dropout.tv'), 'id': self._search_regex(r'embed\.vhx\.tv/videos/(.+?)\?', embed_url, 'id'), 'display_id': display_id, 'title': title, diff --git a/yt_dlp/extractor/duboku.py b/yt_dlp/extractor/duboku.py index 24403842d..fb0546cae 100644 --- a/yt_dlp/extractor/duboku.py +++ b/yt_dlp/extractor/duboku.py @@ -51,31 +51,39 @@ def 
_get_element_by_tag_and_attrib(html, tag=None, attribute=None, value=None, e class DubokuIE(InfoExtractor): IE_NAME = 'duboku' - IE_DESC = 'www.duboku.co' + IE_DESC = 'www.duboku.io' - _VALID_URL = r'(?:https?://[^/]+\.duboku\.co/vodplay/)(?P<id>[0-9]+-[0-9-]+)\.html.*' + _VALID_URL = r'(?:https?://[^/]+\.duboku\.io/vodplay/)(?P<id>[0-9]+-[0-9-]+)\.html.*' _TESTS = [{ - 'url': 'https://www.duboku.co/vodplay/1575-1-1.html', + 'url': 'https://w.duboku.io/vodplay/1575-1-1.html', 'info_dict': { 'id': '1575-1-1', - 'ext': 'ts', + 'ext': 'mp4', 'series': '白色月光', 'title': 'contains:白色月光', 'season_number': 1, 'episode_number': 1, + 'season': 'Season 1', + 'episode_id': '1', + 'season_id': '1', + 'episode': 'Episode 1', }, 'params': { 'skip_download': 'm3u8 download', }, }, { - 'url': 'https://www.duboku.co/vodplay/1588-1-1.html', + 'url': 'https://w.duboku.io/vodplay/1588-1-1.html', 'info_dict': { 'id': '1588-1-1', - 'ext': 'ts', + 'ext': 'mp4', 'series': '亲爱的自己', - 'title': 'contains:预告片', + 'title': 'contains:第1集', 'season_number': 1, 'episode_number': 1, + 'episode': 'Episode 1', + 'season': 'Season 1', + 'episode_id': '1', + 'season_id': '1', }, 'params': { 'skip_download': 'm3u8 download', @@ -91,7 +99,7 @@ class DubokuIE(InfoExtractor): season_id = temp[1] episode_id = temp[2] - webpage_url = 'https://www.duboku.co/vodplay/%s.html' % video_id + webpage_url = 'https://w.duboku.io/vodplay/%s.html' % video_id webpage_html = self._download_webpage(webpage_url, video_id) # extract video url @@ -124,12 +132,13 @@ class DubokuIE(InfoExtractor): data_from = player_data.get('from') # if it is an embedded iframe, maybe it's an external source + headers = {'Referer': webpage_url} if data_from == 'iframe': # use _type url_transparent to retain the meaningful details # of the video. 
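
The comment above is the general pattern for forwarding headers to a delegated extractor via `smuggle_url`; a hedged, self-contained sketch of both ends (URL is illustrative):

```python
from yt_dlp.utils import smuggle_url, unsmuggle_url

# Producer: attach the Referer the target site requires.
url = smuggle_url('https://embed.example.com/v/1',
                  {'http_headers': {'Referer': 'https://w.duboku.io/'}})

# Consumer (e.g. DailymotionIE above): recover the URL and the payload.
url, data = unsmuggle_url(url, default={})
assert data['http_headers']['Referer'] == 'https://w.duboku.io/'
```
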
return { '_type': 'url_transparent', - 'url': smuggle_url(data_url, {'http_headers': {'Referer': webpage_url}}), + 'url': smuggle_url(data_url, {'http_headers': headers}), 'id': video_id, 'title': title, 'series': series_title, @@ -139,7 +148,7 @@ class DubokuIE(InfoExtractor): 'episode_id': episode_id, } - formats = self._extract_m3u8_formats(data_url, video_id, 'mp4') + formats = self._extract_m3u8_formats(data_url, video_id, 'mp4', headers=headers) return { 'id': video_id, @@ -150,36 +159,29 @@ class DubokuIE(InfoExtractor): 'episode_number': int_or_none(episode_id), 'episode_id': episode_id, 'formats': formats, - 'http_headers': {'Referer': 'https://www.duboku.co/static/player/videojs.html'} + 'http_headers': headers } class DubokuPlaylistIE(InfoExtractor): IE_NAME = 'duboku:list' - IE_DESC = 'www.duboku.co entire series' + IE_DESC = 'www.duboku.io entire series' - _VALID_URL = r'(?:https?://[^/]+\.duboku\.co/voddetail/)(?P<id>[0-9]+)\.html.*' + _VALID_URL = r'(?:https?://[^/]+\.duboku\.io/voddetail/)(?P<id>[0-9]+)\.html.*' _TESTS = [{ - 'url': 'https://www.duboku.co/voddetail/1575.html', + 'url': 'https://w.duboku.io/voddetail/1575.html', 'info_dict': { 'id': 'startswith:1575', 'title': '白色月光', }, 'playlist_count': 12, }, { - 'url': 'https://www.duboku.co/voddetail/1554.html', + 'url': 'https://w.duboku.io/voddetail/1554.html', 'info_dict': { 'id': 'startswith:1554', 'title': '以家人之名', }, 'playlist_mincount': 30, - }, { - 'url': 'https://www.duboku.co/voddetail/1554.html#playlist2', - 'info_dict': { - 'id': '1554#playlist2', - 'title': '以家人之名', - }, - 'playlist_mincount': 27, }] def _real_extract(self, url): @@ -189,7 +191,7 @@ class DubokuPlaylistIE(InfoExtractor): series_id = mobj.group('id') fragment = compat_urlparse.urlparse(url).fragment - webpage_url = 'https://www.duboku.co/voddetail/%s.html' % series_id + webpage_url = 'https://w.duboku.io/voddetail/%s.html' % series_id webpage_html = self._download_webpage(webpage_url, series_id) # extract title @@ -234,6 +236,6 @@ class DubokuPlaylistIE(InfoExtractor): # return url results return self.playlist_result([ self.url_result( - compat_urlparse.urljoin('https://www.duboku.co', x['href']), + compat_urlparse.urljoin('https://w.duboku.io', x['href']), ie=DubokuIE.ie_key(), video_title=x.get('title')) for x in playlist], series_id + '#' + playlist_id, title) diff --git a/yt_dlp/extractor/ertgr.py b/yt_dlp/extractor/ertgr.py index 507f0a5c1..276543653 100644 --- a/yt_dlp/extractor/ertgr.py +++ b/yt_dlp/extractor/ertgr.py @@ -119,7 +119,7 @@ class ERTFlixCodenameIE(ERTFlixBaseIE): class ERTFlixIE(ERTFlixBaseIE): IE_NAME = 'ertflix' IE_DESC = 'ERTFLIX videos' - _VALID_URL = r'https?://www\.ertflix\.gr/(?:series|vod)/(?P<id>[a-z]{3}\.\d+)' + _VALID_URL = r'https?://www\.ertflix\.gr/(?:[^/]+/)?(?:series|vod)/(?P<id>[a-z]{3}\.\d+)' _TESTS = [{ 'url': 'https://www.ertflix.gr/vod/vod.173258-aoratoi-ergates', 'md5': '6479d5e60fd7e520b07ba5411dcdd6e7', @@ -171,6 +171,9 @@ class ERTFlixIE(ERTFlixBaseIE): 'title': 'Το δίκτυο', }, 'playlist_mincount': 9, + }, { + 'url': 'https://www.ertflix.gr/en/vod/vod.127652-ta-kalytera-mas-chronia-ep1-mia-volta-sto-feggari', + 'only_matching': True, }] def _extract_episode(self, episode): diff --git a/yt_dlp/extractor/espn.py b/yt_dlp/extractor/espn.py index 8fad70e6b..451148636 100644 --- a/yt_dlp/extractor/espn.py +++ b/yt_dlp/extractor/espn.py @@ -1,8 +1,11 @@ +import base64 +import json import re +import urllib.parse +from .adobepass import AdobePassIE from .common import InfoExtractor from .once import 
OnceIE -from ..compat import compat_str from ..utils import ( determine_ext, dict_get, @@ -24,7 +27,6 @@ class ESPNIE(OnceIE): (?: (?: video/(?:clip|iframe/twitter)| - watch/player ) (?: .*?\?.*?\bid=| @@ -47,6 +49,8 @@ class ESPNIE(OnceIE): 'description': 'md5:39370c2e016cb4ecf498ffe75bef7f0f', 'timestamp': 1390936111, 'upload_date': '20140128', + 'duration': 1302, + 'thumbnail': r're:https://.+\.jpg', }, 'params': { 'skip_download': True, @@ -72,15 +76,6 @@ class ESPNIE(OnceIE): 'url': 'https://cdn.espn.go.com/video/clip/_/id/19771774', 'only_matching': True, }, { - 'url': 'http://www.espn.com/watch/player?id=19141491', - 'only_matching': True, - }, { - 'url': 'http://www.espn.com/watch/player?bucketId=257&id=19505875', - 'only_matching': True, - }, { - 'url': 'http://www.espn.com/watch/player/_/id/19141491', - 'only_matching': True, - }, { 'url': 'http://www.espn.com/video/clip?id=10365079', 'only_matching': True, }, { @@ -98,7 +93,13 @@ class ESPNIE(OnceIE): }, { 'url': 'http://www.espn.com/espnw/video/26066627/arkansas-gibson-completes-hr-cycle-four-innings', 'only_matching': True, - }] + }, { + 'url': 'http://www.espn.com/watch/player?id=19141491', + 'only_matching': True, + }, { + 'url': 'http://www.espn.com/watch/player?bucketId=257&id=19505875', + 'only_matching': True, + }, ] def _real_extract(self, url): video_id = self._match_id(url) @@ -116,7 +117,7 @@ class ESPNIE(OnceIE): for source_id, source in source.items(): if source_id == 'alert': continue - elif isinstance(source, compat_str): + elif isinstance(source, str): extract_source(source, base_source_id) elif isinstance(source, dict): traverse_source( @@ -196,7 +197,7 @@ class ESPNArticleIE(InfoExtractor): @classmethod def suitable(cls, url): - return False if ESPNIE.suitable(url) else super(ESPNArticleIE, cls).suitable(url) + return False if (ESPNIE.suitable(url) or WatchESPNIE.suitable(url)) else super().suitable(url) def _real_extract(self, url): video_id = self._match_id(url) @@ -277,3 +278,119 @@ class ESPNCricInfoIE(InfoExtractor): 'formats': formats, 'subtitles': subtitles, } + + +class WatchESPNIE(AdobePassIE): + _VALID_URL = r'https://www.espn.com/watch/player/_/id/(?P<id>[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})' + _TESTS = [{ + 'url': 'https://www.espn.com/watch/player/_/id/ba7d17da-453b-4697-bf92-76a99f61642b', + 'info_dict': { + 'id': 'ba7d17da-453b-4697-bf92-76a99f61642b', + 'ext': 'mp4', + 'title': 'Serbia vs. Turkey', + 'thumbnail': 'https://artwork.api.espn.com/artwork/collections/media/ba7d17da-453b-4697-bf92-76a99f61642b/default?width=640&apikey=1ngjw23osgcis1i1vbj96lmfqs', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.espn.com/watch/player/_/id/4e9b5bd1-4ceb-4482-9d28-1dd5f30d2f34', + 'info_dict': { + 'id': '4e9b5bd1-4ceb-4482-9d28-1dd5f30d2f34', + 'ext': 'mp4', + 'title': 'Real Madrid vs. 
Real Betis (LaLiga)', + 'thumbnail': 'https://s.secure.espncdn.com/stitcher/artwork/collections/media/bd1f3d12-0654-47d9-852e-71b85ea695c7/16x9.jpg?timestamp=202201112217&showBadge=true&cb=12&package=ESPN_PLUS', + }, + 'params': { + 'skip_download': True, + }, + }] + + _API_KEY = 'ZXNwbiZicm93c2VyJjEuMC4w.ptUt7QxsteaRruuPmGZFaJByOoqKvDP2a5YkInHrc7c' + + def _call_bamgrid_api(self, path, video_id, payload=None, headers={}): + if 'Authorization' not in headers: + headers['Authorization'] = f'Bearer {self._API_KEY}' + parse = urllib.parse.urlencode if path == 'token' else json.dumps + return self._download_json( + f'https://espn.api.edge.bamgrid.com/{path}', video_id, headers=headers, data=parse(payload).encode()) + + def _real_extract(self, url): + video_id = self._match_id(url) + video_data = self._download_json( + f'https://watch-cdn.product.api.espn.com/api/product/v3/watchespn/web/playback/event?id={video_id}', + video_id)['playbackState'] + + # ESPN+ subscription required, through cookies + if 'DTC' in video_data.get('sourceId'): + cookie = self._get_cookies(url).get('ESPN-ONESITE.WEB-PROD.token') + if not cookie: + self.raise_login_required(method='cookies') + + assertion = self._call_bamgrid_api( + 'devices', video_id, + headers={'Content-Type': 'application/json; charset=UTF-8'}, + payload={ + 'deviceFamily': 'android', + 'applicationRuntime': 'android', + 'deviceProfile': 'tv', + 'attributes': {}, + })['assertion'] + token = self._call_bamgrid_api( + 'token', video_id, payload={ + 'subject_token': assertion, + 'subject_token_type': 'urn:bamtech:params:oauth:token-type:device', + 'platform': 'android', + 'grant_type': 'urn:ietf:params:oauth:grant-type:token-exchange' + })['access_token'] + + assertion = self._call_bamgrid_api( + 'accounts/grant', video_id, payload={'id_token': cookie.value.split('|')[1]}, + headers={ + 'Authorization': token, + 'Content-Type': 'application/json; charset=UTF-8' + })['assertion'] + token = self._call_bamgrid_api( + 'token', video_id, payload={ + 'subject_token': assertion, + 'subject_token_type': 'urn:bamtech:params:oauth:token-type:account', + 'platform': 'android', + 'grant_type': 'urn:ietf:params:oauth:grant-type:token-exchange' + })['access_token'] + + playback = self._download_json( + video_data['videoHref'].format(scenario='browser~ssai'), video_id, + headers={ + 'Accept': 'application/vnd.media-service+json; version=5', + 'Authorization': token + }) + m3u8_url, headers = playback['stream']['complete'][0]['url'], {'authorization': token} + + # No login required + elif video_data.get('sourceId') == 'ESPN_FREE': + asset = self._download_json( + f'https://watch.auth.api.espn.com/video/auth/media/{video_id}/asset?apikey=uiqlbgzdwuru14v627vdusswb', + video_id) + m3u8_url, headers = asset['stream'], {} + + # TV Provider required + else: + resource = self._get_mvpd_resource('ESPN', video_data['name'], video_id, None) + auth = self._extract_mvpd_auth(url, video_id, 'ESPN', resource).encode() + + asset = self._download_json( + f'https://watch.auth.api.espn.com/video/auth/media/{video_id}/asset?apikey=uiqlbgzdwuru14v627vdusswb', + video_id, data=f'adobeToken={urllib.parse.quote_plus(base64.b64encode(auth))}&drmSupport=HLS'.encode()) + m3u8_url, headers = asset['stream'], {} + + formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4', m3u8_id='hls') + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_data.get('name'), + 'formats': formats, + 'subtitles': subtitles, + 'thumbnail': 
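# --- Illustrative aside (not part of the commit): WatchESPNIE's ESPN+ branch
# above is a double OAuth token exchange against the BAMTech ("bamgrid") API:
# an anonymous device assertion is traded for a device token, which authorizes
# an account grant built from the site cookie, which is in turn traded for the
# account token that unlocks playback. A condensed sketch of that flow;
# `post_json` is a hypothetical stand-in for InfoExtractor._download_json:
def bamgrid_account_token(post_json, api_key, id_token):
    device_assertion = post_json('devices', auth=api_key, payload={
        'deviceFamily': 'android', 'applicationRuntime': 'android',
        'deviceProfile': 'tv', 'attributes': {}})['assertion']
    device_token = post_json('token', auth=api_key, payload={
        'subject_token': device_assertion,
        'subject_token_type': 'urn:bamtech:params:oauth:token-type:device',
        'platform': 'android',
        'grant_type': 'urn:ietf:params:oauth:grant-type:token-exchange'})['access_token']
    account_assertion = post_json('accounts/grant', auth=device_token,
                                  payload={'id_token': id_token})['assertion']
    return post_json('token', auth=api_key, payload={
        'subject_token': account_assertion,
        'subject_token_type': 'urn:bamtech:params:oauth:token-type:account',
        'platform': 'android',
        'grant_type': 'urn:ietf:params:oauth:grant-type:token-exchange'})['access_token']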
video_data.get('posterHref'), + 'http_headers': headers, + } diff --git a/yt_dlp/extractor/expressen.py b/yt_dlp/extractor/expressen.py index a1b8e9bc9..5aba21ba7 100644 --- a/yt_dlp/extractor/expressen.py +++ b/yt_dlp/extractor/expressen.py @@ -19,9 +19,10 @@ class ExpressenIE(InfoExtractor): ''' _TESTS = [{ 'url': 'https://www.expressen.se/tv/ledare/ledarsnack/ledarsnack-om-arbetslosheten-bland-kvinnor-i-speciellt-utsatta-omraden/', - 'md5': '2fbbe3ca14392a6b1b36941858d33a45', + 'md5': 'deb2ca62e7b1dcd19fa18ba37523f66e', 'info_dict': { - 'id': '8690962', + 'id': 'ba90f5a9-78d1-4511-aa02-c177b9c99136', + 'display_id': 'ledarsnack-om-arbetslosheten-bland-kvinnor-i-speciellt-utsatta-omraden', 'ext': 'mp4', 'title': 'Ledarsnack: Om arbetslösheten bland kvinnor i speciellt utsatta områden', 'description': 'md5:f38c81ff69f3de4d269bbda012fcbbba', @@ -64,7 +65,7 @@ class ExpressenIE(InfoExtractor): display_id, transform_source=unescapeHTML) info = extract_data('video-tracking-info') - video_id = info['videoId'] + video_id = info['contentId'] data = extract_data('article-data') stream = data['stream'] diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py index 9c5a5f482..32818a024 100644 --- a/yt_dlp/extractor/extractors.py +++ b/yt_dlp/extractor/extractors.py @@ -1,2175 +1,23 @@ -# flake8: noqa: F401 +import contextlib +import os -from .abc import ( - ABCIE, - ABCIViewIE, - ABCIViewShowSeriesIE, -) -from .abcnews import ( - AbcNewsIE, - AbcNewsVideoIE, -) -from .abcotvs import ( - ABCOTVSIE, - ABCOTVSClipsIE, -) -from .abematv import ( - AbemaTVIE, - AbemaTVTitleIE, -) -from .academicearth import AcademicEarthCourseIE -from .acast import ( - ACastIE, - ACastChannelIE, -) -from .adn import ADNIE -from .adobeconnect import AdobeConnectIE -from .adobetv import ( - AdobeTVEmbedIE, - AdobeTVIE, - AdobeTVShowIE, - AdobeTVChannelIE, - AdobeTVVideoIE, -) -from .adultswim import AdultSwimIE -from .aenetworks import ( - AENetworksIE, - AENetworksCollectionIE, - AENetworksShowIE, - HistoryTopicIE, - HistoryPlayerIE, - BiographyIE, -) -from .afreecatv import ( - AfreecaTVIE, - AfreecaTVLiveIE, - AfreecaTVUserIE, -) -from .airmozilla import AirMozillaIE -from .aljazeera import AlJazeeraIE -from .alphaporno import AlphaPornoIE -from .amara import AmaraIE -from .alura import ( - AluraIE, - AluraCourseIE -) -from .amcnetworks import AMCNetworksIE -from .animelab import ( - AnimeLabIE, - AnimeLabShowsIE, -) -from .amazon import AmazonStoreIE -from .americastestkitchen import ( - AmericasTestKitchenIE, - AmericasTestKitchenSeasonIE, -) -from .animeondemand import AnimeOnDemandIE -from .anvato import AnvatoIE -from .aol import AolIE -from .allocine import AllocineIE -from .aliexpress import AliExpressLiveIE -from .alsace20tv import ( - Alsace20TVIE, - Alsace20TVEmbedIE, -) -from .apa import APAIE -from .aparat import AparatIE -from .appleconnect import AppleConnectIE -from .appletrailers import ( - AppleTrailersIE, - AppleTrailersSectionIE, -) -from .applepodcasts import ApplePodcastsIE -from .archiveorg import ( - ArchiveOrgIE, - YoutubeWebArchiveIE, -) -from .arcpublishing import ArcPublishingIE -from .arkena import ArkenaIE -from .ard import ( - ARDBetaMediathekIE, - ARDIE, - ARDMediathekIE, -) -from .arte import ( - ArteTVIE, - ArteTVEmbedIE, - ArteTVPlaylistIE, - ArteTVCategoryIE, -) -from .arnes import ArnesIE -from .asiancrush import ( - AsianCrushIE, - AsianCrushPlaylistIE, -) -from .atresplayer import AtresPlayerIE -from .atttechchannel import ATTTechChannelIE -from .atvat import 
ATVAtIE -from .audimedia import AudiMediaIE -from .audioboom import AudioBoomIE -from .audiomack import AudiomackIE, AudiomackAlbumIE -from .audius import ( - AudiusIE, - AudiusTrackIE, - AudiusPlaylistIE, - AudiusProfileIE, -) -from .awaan import ( - AWAANIE, - AWAANVideoIE, - AWAANLiveIE, - AWAANSeasonIE, -) -from .azmedien import AZMedienIE -from .baidu import BaiduVideoIE -from .banbye import ( - BanByeIE, - BanByeChannelIE, -) -from .bandaichannel import BandaiChannelIE -from .bandcamp import ( - BandcampIE, - BandcampAlbumIE, - BandcampWeeklyIE, - BandcampUserIE, -) -from .bannedvideo import BannedVideoIE -from .bbc import ( - BBCCoUkIE, - BBCCoUkArticleIE, - BBCCoUkIPlayerEpisodesIE, - BBCCoUkIPlayerGroupIE, - BBCCoUkPlaylistIE, - BBCIE, -) -from .beeg import BeegIE -from .behindkink import BehindKinkIE -from .bellmedia import BellMediaIE -from .beatport import BeatportIE -from .bet import BetIE -from .bfi import BFIPlayerIE -from .bfmtv import ( - BFMTVIE, - BFMTVLiveIE, - BFMTVArticleIE, -) -from .bibeltv import BibelTVIE -from .bigflix import BigflixIE -from .bigo import BigoIE -from .bild import BildIE -from .bilibili import ( - BiliBiliIE, - BiliBiliSearchIE, - BilibiliCategoryIE, - BiliBiliBangumiIE, - BilibiliAudioIE, - BilibiliAudioAlbumIE, - BiliBiliPlayerIE, - BilibiliChannelIE, - BiliIntlIE, - BiliIntlSeriesIE, - BiliLiveIE, -) -from .biobiochiletv import BioBioChileTVIE -from .bitchute import ( - BitChuteIE, - BitChuteChannelIE, -) -from .bitwave import ( - BitwaveReplayIE, - BitwaveStreamIE, -) -from .biqle import BIQLEIE -from .blackboardcollaborate import BlackboardCollaborateIE -from .bleacherreport import ( - BleacherReportIE, - BleacherReportCMSIE, -) -from .blogger import BloggerIE -from .bloomberg import BloombergIE -from .bokecc import BokeCCIE -from .bongacams import BongaCamsIE -from .bostonglobe import BostonGlobeIE -from .box import BoxIE -from .bpb import BpbIE -from .br import ( - BRIE, - BRMediathekIE, -) -from .bravotv import BravoTVIE -from .breakcom import BreakIE -from .breitbart import BreitBartIE -from .brightcove import ( - BrightcoveLegacyIE, - BrightcoveNewIE, -) -from .businessinsider import BusinessInsiderIE -from .buzzfeed import BuzzFeedIE -from .byutv import BYUtvIE -from .c56 import C56IE -from .cableav import CableAVIE -from .callin import CallinIE -from .caltrans import CaltransIE -from .cam4 import CAM4IE -from .camdemy import ( - CamdemyIE, - CamdemyFolderIE -) -from .cammodels import CamModelsIE -from .camwithher import CamWithHerIE -from .canalalpha import CanalAlphaIE -from .canalplus import CanalplusIE -from .canalc2 import Canalc2IE -from .canvas import ( - CanvasIE, - CanvasEenIE, - VrtNUIE, - DagelijkseKostIE, -) -from .carambatv import ( - CarambaTVIE, - CarambaTVPageIE, -) -from .cartoonnetwork import CartoonNetworkIE -from .cbc import ( - CBCIE, - CBCPlayerIE, - CBCGemIE, - CBCGemPlaylistIE, - CBCGemLiveIE, -) -from .cbs import CBSIE -from .cbslocal import ( - CBSLocalIE, - CBSLocalArticleIE, -) -from .cbsinteractive import CBSInteractiveIE -from .cbsnews import ( - CBSNewsEmbedIE, - CBSNewsIE, - CBSNewsLiveVideoIE, -) -from .cbssports import ( - CBSSportsEmbedIE, - CBSSportsIE, - TwentyFourSevenSportsIE, -) -from .ccc import ( - CCCIE, - CCCPlaylistIE, -) -from .ccma import CCMAIE -from .cctv import CCTVIE -from .cda import CDAIE -from .ceskatelevize import CeskaTelevizeIE -from .cgtn import CGTNIE -from .channel9 import Channel9IE -from .charlierose import CharlieRoseIE -from .chaturbate import ChaturbateIE -from 
.chilloutzone import ChilloutzoneIE -from .chingari import ( - ChingariIE, - ChingariUserIE, -) -from .chirbit import ( - ChirbitIE, - ChirbitProfileIE, -) -from .cinchcast import CinchcastIE -from .cinemax import CinemaxIE -from .ciscolive import ( - CiscoLiveSessionIE, - CiscoLiveSearchIE, -) -from .ciscowebex import CiscoWebexIE -from .cjsw import CJSWIE -from .cliphunter import CliphunterIE -from .clippit import ClippitIE -from .cliprs import ClipRsIE -from .clipsyndicate import ClipsyndicateIE -from .closertotruth import CloserToTruthIE -from .cloudflarestream import CloudflareStreamIE -from .cloudy import CloudyIE -from .clubic import ClubicIE -from .clyp import ClypIE -from .cmt import CMTIE -from .cnbc import ( - CNBCIE, - CNBCVideoIE, -) -from .cnn import ( - CNNIE, - CNNBlogsIE, - CNNArticleIE, -) -from .coub import CoubIE -from .comedycentral import ( - ComedyCentralIE, - ComedyCentralTVIE, -) -from .commonmistakes import CommonMistakesIE, UnicodeBOMIE -from .commonprotocols import ( - MmsIE, - RtmpIE, - ViewSourceIE, -) -from .condenast import CondeNastIE -from .contv import CONtvIE -from .corus import CorusIE -from .cpac import ( - CPACIE, - CPACPlaylistIE, -) -from .cozytv import CozyTVIE -from .cracked import CrackedIE -from .crackle import CrackleIE -from .craftsy import CraftsyIE -from .crooksandliars import CrooksAndLiarsIE -from .crowdbunker import ( - CrowdBunkerIE, - CrowdBunkerChannelIE, -) -from .crunchyroll import ( - CrunchyrollIE, - CrunchyrollShowPlaylistIE, - CrunchyrollBetaIE, - CrunchyrollBetaShowIE, -) -from .cspan import CSpanIE, CSpanCongressIE -from .ctsnews import CtsNewsIE -from .ctv import CTVIE -from .ctvnews import CTVNewsIE -from .cultureunplugged import CultureUnpluggedIE -from .curiositystream import ( - CuriosityStreamIE, - CuriosityStreamCollectionsIE, - CuriosityStreamSeriesIE, -) -from .cwtv import CWTVIE -from .cybrary import ( - CybraryIE, - CybraryCourseIE -) -from .daftsex import DaftsexIE -from .dailymail import DailyMailIE -from .dailymotion import ( - DailymotionIE, - DailymotionPlaylistIE, - DailymotionUserIE, -) -from .damtomo import ( - DamtomoRecordIE, - DamtomoVideoIE, -) -from .daum import ( - DaumIE, - DaumClipIE, - DaumPlaylistIE, - DaumUserIE, -) -from .daystar import DaystarClipIE -from .dbtv import DBTVIE -from .dctp import DctpTvIE -from .deezer import ( - DeezerPlaylistIE, - DeezerAlbumIE, -) -from .democracynow import DemocracynowIE -from .dfb import DFBIE -from .dhm import DHMIE -from .digg import DiggIE -from .dotsub import DotsubIE -from .douyutv import ( - DouyuShowIE, - DouyuTVIE, -) -from .dplay import ( - DPlayIE, - DiscoveryPlusIE, - HGTVDeIE, - GoDiscoveryIE, - TravelChannelIE, - CookingChannelIE, - HGTVUsaIE, - FoodNetworkIE, - InvestigationDiscoveryIE, - DestinationAmericaIE, - AmHistoryChannelIE, - ScienceChannelIE, - DIYNetworkIE, - DiscoveryLifeIE, - AnimalPlanetIE, - TLCIE, - DiscoveryPlusIndiaIE, - DiscoveryNetworksDeIE, - DiscoveryPlusItalyIE, - DiscoveryPlusItalyShowIE, - DiscoveryPlusIndiaShowIE, -) -from .dreisat import DreiSatIE -from .drbonanza import DRBonanzaIE -from .drtuber import DrTuberIE -from .drtv import ( - DRTVIE, - DRTVLiveIE, -) -from .dtube import DTubeIE -from .dvtv import DVTVIE -from .duboku import ( - DubokuIE, - DubokuPlaylistIE -) -from .dumpert import DumpertIE -from .defense import DefenseGouvFrIE -from .digitalconcerthall import DigitalConcertHallIE -from .discovery import DiscoveryIE -from .disney import DisneyIE -from .dispeak import DigitallySpeakingIE -from .doodstream import 
DoodStreamIE -from .dropbox import DropboxIE -from .dropout import ( - DropoutSeasonIE, - DropoutIE -) -from .dw import ( - DWIE, - DWArticleIE, -) -from .eagleplatform import EaglePlatformIE -from .ebaumsworld import EbaumsWorldIE -from .echomsk import EchoMskIE -from .egghead import ( - EggheadCourseIE, - EggheadLessonIE, -) -from .ehow import EHowIE -from .eighttracks import EightTracksIE -from .einthusan import EinthusanIE -from .eitb import EitbIE -from .ellentube import ( - EllenTubeIE, - EllenTubeVideoIE, - EllenTubePlaylistIE, -) -from .elonet import ElonetIE -from .elpais import ElPaisIE -from .embedly import EmbedlyIE -from .engadget import EngadgetIE -from .epicon import ( - EpiconIE, - EpiconSeriesIE, -) -from .eporner import EpornerIE -from .eroprofile import ( - EroProfileIE, - EroProfileAlbumIE, -) -from .ertgr import ( - ERTFlixCodenameIE, - ERTFlixIE, - ERTWebtvEmbedIE, -) -from .escapist import EscapistIE -from .espn import ( - ESPNIE, - ESPNArticleIE, - FiveThirtyEightIE, - ESPNCricInfoIE, -) -from .esri import EsriVideoIE -from .europa import EuropaIE -from .europeantour import EuropeanTourIE -from .euscreen import EUScreenIE -from .expotv import ExpoTVIE -from .expressen import ExpressenIE -from .extremetube import ExtremeTubeIE -from .eyedotv import EyedoTVIE -from .facebook import ( - FacebookIE, - FacebookPluginsVideoIE, - FacebookRedirectURLIE, -) -from .fancode import ( - FancodeVodIE, - FancodeLiveIE -) +from ..utils import load_plugins -from .faz import FazIE -from .fc2 import ( - FC2IE, - FC2EmbedIE, - FC2LiveIE, -) -from .fczenit import FczenitIE -from .fifa import FifaIE -from .filmmodu import FilmmoduIE -from .filmon import ( - FilmOnIE, - FilmOnChannelIE, -) -from .filmweb import FilmwebIE -from .firsttv import FirstTVIE -from .fivetv import FiveTVIE -from .flickr import FlickrIE -from .folketinget import FolketingetIE -from .footyroom import FootyRoomIE -from .formula1 import Formula1IE -from .fourtube import ( - FourTubeIE, - PornTubeIE, - PornerBrosIE, - FuxIE, -) -from .fox import FOXIE -from .fox9 import ( - FOX9IE, - FOX9NewsIE, -) -from .foxgay import FoxgayIE -from .foxnews import ( - FoxNewsIE, - FoxNewsArticleIE, -) -from .foxsports import FoxSportsIE -from .fptplay import FptplayIE -from .franceculture import FranceCultureIE -from .franceinter import FranceInterIE -from .francetv import ( - FranceTVIE, - FranceTVSiteIE, - FranceTVInfoIE, -) -from .freesound import FreesoundIE -from .freespeech import FreespeechIE -from .frontendmasters import ( - FrontendMastersIE, - FrontendMastersLessonIE, - FrontendMastersCourseIE -) -from .fujitv import FujiTVFODPlus7IE -from .funimation import ( - FunimationIE, - FunimationPageIE, - FunimationShowIE, -) -from .funk import FunkIE -from .fusion import FusionIE -from .gab import ( - GabTVIE, - GabIE, -) -from .gaia import GaiaIE -from .gameinformer import GameInformerIE -from .gamejolt import ( - GameJoltIE, - GameJoltUserIE, - GameJoltGameIE, - GameJoltGameSoundtrackIE, - GameJoltCommunityIE, - GameJoltSearchIE, -) -from .gamespot import GameSpotIE -from .gamestar import GameStarIE -from .gaskrank import GaskrankIE -from .gazeta import GazetaIE -from .gdcvault import GDCVaultIE -from .gedidigital import GediDigitalIE -from .generic import GenericIE -from .gettr import ( - GettrIE, - GettrStreamingIE, -) -from .gfycat import GfycatIE -from .giantbomb import GiantBombIE -from .giga import GigaIE -from .glide import GlideIE -from .globo import ( - GloboIE, - GloboArticleIE, -) -from .go import GoIE -from .godtube 
import GodTubeIE -from .gofile import GofileIE -from .golem import GolemIE -from .goodgame import GoodGameIE -from .googledrive import GoogleDriveIE -from .googlepodcasts import ( - GooglePodcastsIE, - GooglePodcastsFeedIE, -) -from .googlesearch import GoogleSearchIE -from .gopro import GoProIE -from .goshgay import GoshgayIE -from .gotostage import GoToStageIE -from .gputechconf import GPUTechConfIE -from .gronkh import ( - GronkhIE, - GronkhFeedIE, - GronkhVodsIE -) -from .groupon import GrouponIE -from .hbo import HBOIE -from .hearthisat import HearThisAtIE -from .heise import HeiseIE -from .hellporno import HellPornoIE -from .helsinki import HelsinkiIE -from .hentaistigma import HentaiStigmaIE -from .hgtv import HGTVComShowIE -from .hketv import HKETVIE -from .hidive import HiDiveIE -from .historicfilms import HistoricFilmsIE -from .hitbox import HitboxIE, HitboxLiveIE -from .hitrecord import HitRecordIE -from .hotnewhiphop import HotNewHipHopIE -from .hotstar import ( - HotStarIE, - HotStarPrefixIE, - HotStarPlaylistIE, - HotStarSeriesIE, -) -from .howcast import HowcastIE -from .howstuffworks import HowStuffWorksIE -from .hrfensehen import HRFernsehenIE -from .hrti import ( - HRTiIE, - HRTiPlaylistIE, -) -from .hse import ( - HSEShowIE, - HSEProductIE, -) -from .huajiao import HuajiaoIE -from .huya import HuyaLiveIE -from .huffpost import HuffPostIE -from .hungama import ( - HungamaIE, - HungamaSongIE, - HungamaAlbumPlaylistIE, -) -from .hypem import HypemIE -from .icareus import IcareusIE -from .ichinanalive import ( - IchinanaLiveIE, - IchinanaLiveClipIE, -) -from .ign import ( - IGNIE, - IGNVideoIE, - IGNArticleIE, -) -from .iheart import ( - IHeartRadioIE, - IHeartRadioPodcastIE, -) -from .imdb import ( - ImdbIE, - ImdbListIE -) -from .imgur import ( - ImgurIE, - ImgurAlbumIE, - ImgurGalleryIE, -) -from .ina import InaIE -from .inc import IncIE -from .indavideo import IndavideoEmbedIE -from .infoq import InfoQIE -from .instagram import ( - InstagramIE, - InstagramIOSIE, - InstagramUserIE, - InstagramTagIE, - InstagramStoryIE, -) -from .internazionale import InternazionaleIE -from .internetvideoarchive import InternetVideoArchiveIE -from .iprima import ( - IPrimaIE, - IPrimaCNNIE -) -from .iqiyi import ( - IqiyiIE, - IqIE, - IqAlbumIE -) +_LAZY_LOADER = False +if not os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'): + with contextlib.suppress(ImportError): + from .lazy_extractors import * # noqa: F403 + from .lazy_extractors import _ALL_CLASSES + _LAZY_LOADER = True -from .itprotv import ( - ITProTVIE, - ITProTVCourseIE -) +if not _LAZY_LOADER: + from ._extractors import * # noqa: F403 + _ALL_CLASSES = [ # noqa: F811 + klass + for name, klass in globals().items() + if name.endswith('IE') and name != 'GenericIE' + ] + _ALL_CLASSES.append(GenericIE) # noqa: F405 -from .itv import ( - ITVIE, - ITVBTCCIE, -) -from .ivi import ( - IviIE, - IviCompilationIE -) -from .ivideon import IvideonIE -from .iwara import ( - IwaraIE, - IwaraPlaylistIE, - IwaraUserIE, -) -from .izlesene import IzleseneIE -from .jable import ( - JableIE, - JablePlaylistIE, -) -from .jamendo import ( - JamendoIE, - JamendoAlbumIE, -) -from .jeuxvideo import JeuxVideoIE -from .jove import JoveIE -from .joj import JojIE -from .jwplatform import JWPlatformIE -from .kakao import KakaoIE -from .kaltura import KalturaIE -from .karaoketv import KaraoketvIE -from .karrierevideos import KarriereVideosIE -from .keezmovies import KeezMoviesIE -from .kelbyone import KelbyOneIE -from .ketnet import KetnetIE -from .khanacademy import 
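# --- Illustrative aside (not part of the commit): the rewritten extractors.py
# above prefers a pre-generated lazy_extractors module and only falls back to
# importing every extractor eagerly; in both cases _ALL_CLASSES keeps
# GenericIE last, because URL dispatch walks the list in order and
# GenericIE.suitable() accepts almost anything. A toy model of that ordering,
# with made-up extractor classes:
class FooIE:
    @classmethod
    def suitable(cls, url):
        return 'foo.example' in url

class GenericIE:
    @classmethod
    def suitable(cls, url):
        return True  # catch-all, so it must be consulted last

_ALL_CLASSES = [FooIE, GenericIE]

def pick_extractor(url):
    return next(ie for ie in _ALL_CLASSES if ie.suitable(url))

assert pick_extractor('https://foo.example/v/1') is FooIE
assert pick_extractor('https://unknown.example/v/2') is GenericIE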
( - KhanAcademyIE, - KhanAcademyUnitIE, -) -from .kickstarter import KickStarterIE -from .kinja import KinjaEmbedIE -from .kinopoisk import KinoPoiskIE -from .konserthusetplay import KonserthusetPlayIE -from .koo import KooIE -from .krasview import KrasViewIE -from .ku6 import Ku6IE -from .kusi import KUSIIE -from .kuwo import ( - KuwoIE, - KuwoAlbumIE, - KuwoChartIE, - KuwoSingerIE, - KuwoCategoryIE, - KuwoMvIE, -) -from .la7 import ( - LA7IE, - LA7PodcastEpisodeIE, - LA7PodcastIE, -) -from .laola1tv import ( - Laola1TvEmbedIE, - Laola1TvIE, - EHFTVIE, - ITTFIE, -) -from .lastfm import ( - LastFMIE, - LastFMPlaylistIE, - LastFMUserIE, -) -from .lbry import ( - LBRYIE, - LBRYChannelIE, -) -from .lci import LCIIE -from .lcp import ( - LcpPlayIE, - LcpIE, -) -from .lecture2go import Lecture2GoIE -from .lecturio import ( - LecturioIE, - LecturioCourseIE, - LecturioDeCourseIE, -) -from .leeco import ( - LeIE, - LePlaylistIE, - LetvCloudIE, -) -from .lego import LEGOIE -from .lemonde import LemondeIE -from .lenta import LentaIE -from .libraryofcongress import LibraryOfCongressIE -from .libsyn import LibsynIE -from .lifenews import ( - LifeNewsIE, - LifeEmbedIE, -) -from .likee import ( - LikeeIE, - LikeeUserIE -) -from .limelight import ( - LimelightMediaIE, - LimelightChannelIE, - LimelightChannelListIE, -) -from .line import ( - LineLiveIE, - LineLiveChannelIE, -) -from .linkedin import ( - LinkedInIE, - LinkedInLearningIE, - LinkedInLearningCourseIE, -) -from .linuxacademy import LinuxAcademyIE -from .litv import LiTVIE -from .livejournal import LiveJournalIE -from .livestream import ( - LivestreamIE, - LivestreamOriginalIE, - LivestreamShortenerIE, -) -from .lnkgo import ( - LnkGoIE, - LnkIE, -) -from .localnews8 import LocalNews8IE -from .lovehomeporn import LoveHomePornIE -from .lrt import ( - LRTVODIE, - LRTStreamIE -) -from .lynda import ( - LyndaIE, - LyndaCourseIE -) -from .m6 import M6IE -from .magentamusik360 import MagentaMusik360IE -from .mailru import ( - MailRuIE, - MailRuMusicIE, - MailRuMusicSearchIE, -) -from .mainstreaming import MainStreamingIE -from .malltv import MallTVIE -from .mangomolo import ( - MangomoloVideoIE, - MangomoloLiveIE, -) -from .manoto import ( - ManotoTVIE, - ManotoTVShowIE, - ManotoTVLiveIE, -) -from .manyvids import ManyVidsIE -from .maoritv import MaoriTVIE -from .markiza import ( - MarkizaIE, - MarkizaPageIE, -) -from .massengeschmacktv import MassengeschmackTVIE -from .masters import MastersIE -from .matchtv import MatchTVIE -from .mdr import MDRIE -from .medaltv import MedalTVIE -from .mediaite import MediaiteIE -from .mediaklikk import MediaKlikkIE -from .mediaset import ( - MediasetIE, - MediasetShowIE, -) -from .mediasite import ( - MediasiteIE, - MediasiteCatalogIE, - MediasiteNamedCatalogIE, -) -from .medici import MediciIE -from .megaphone import MegaphoneIE -from .meipai import MeipaiIE -from .melonvod import MelonVODIE -from .meta import METAIE -from .metacafe import MetacafeIE -from .metacritic import MetacriticIE -from .mgoon import MgoonIE -from .mgtv import MGTVIE -from .miaopai import MiaoPaiIE -from .microsoftstream import MicrosoftStreamIE -from .microsoftvirtualacademy import ( - MicrosoftVirtualAcademyIE, - MicrosoftVirtualAcademyCourseIE, -) -from .mildom import ( - MildomIE, - MildomVodIE, - MildomClipIE, - MildomUserVodIE, -) -from .minds import ( - MindsIE, - MindsChannelIE, - MindsGroupIE, -) -from .ministrygrid import MinistryGridIE -from .minoto import MinotoIE -from .miomio import MioMioIE -from .mirrativ import ( - 
MirrativIE, - MirrativUserIE, -) -from .mit import TechTVMITIE, OCWMITIE -from .mitele import MiTeleIE -from .mixch import ( - MixchIE, - MixchArchiveIE, -) -from .mixcloud import ( - MixcloudIE, - MixcloudUserIE, - MixcloudPlaylistIE, -) -from .mlb import ( - MLBIE, - MLBVideoIE, -) -from .mlssoccer import MLSSoccerIE -from .mnet import MnetIE -from .moevideo import MoeVideoIE -from .mofosex import ( - MofosexIE, - MofosexEmbedIE, -) -from .mojvideo import MojvideoIE -from .morningstar import MorningstarIE -from .motherless import ( - MotherlessIE, - MotherlessGroupIE -) -from .motorsport import MotorsportIE -from .movieclips import MovieClipsIE -from .moviepilot import MoviepilotIE -from .moviezine import MoviezineIE -from .movingimage import MovingImageIE -from .msn import MSNIE -from .mtv import ( - MTVIE, - MTVVideoIE, - MTVServicesEmbeddedIE, - MTVDEIE, - MTVJapanIE, - MTVItaliaIE, - MTVItaliaProgrammaIE, -) -from .muenchentv import MuenchenTVIE -from .murrtube import MurrtubeIE, MurrtubeUserIE -from .musescore import MuseScoreIE -from .musicdex import ( - MusicdexSongIE, - MusicdexAlbumIE, - MusicdexArtistIE, - MusicdexPlaylistIE, -) -from .mwave import MwaveIE, MwaveMeetGreetIE -from .mxplayer import ( - MxplayerIE, - MxplayerShowIE, -) -from .mychannels import MyChannelsIE -from .myspace import MySpaceIE, MySpaceAlbumIE -from .myspass import MySpassIE -from .myvi import ( - MyviIE, - MyviEmbedIE, -) -from .myvideoge import MyVideoGeIE -from .myvidster import MyVidsterIE -from .n1 import ( - N1InfoAssetIE, - N1InfoIIE, -) -from .nate import ( - NateIE, - NateProgramIE, -) -from .nationalgeographic import ( - NationalGeographicVideoIE, - NationalGeographicTVIE, -) -from .naver import ( - NaverIE, - NaverLiveIE, -) -from .nba import ( - NBAWatchEmbedIE, - NBAWatchIE, - NBAWatchCollectionIE, - NBAEmbedIE, - NBAIE, - NBAChannelIE, -) -from .nbc import ( - NBCIE, - NBCNewsIE, - NBCOlympicsIE, - NBCOlympicsStreamIE, - NBCSportsIE, - NBCSportsStreamIE, - NBCSportsVPlayerIE, -) -from .ndr import ( - NDRIE, - NJoyIE, - NDREmbedBaseIE, - NDREmbedIE, - NJoyEmbedIE, -) -from .ndtv import NDTVIE -from .nebula import ( - NebulaIE, - NebulaSubscriptionsIE, - NebulaChannelIE, -) -from .nerdcubed import NerdCubedFeedIE -from .netzkino import NetzkinoIE -from .neteasemusic import ( - NetEaseMusicIE, - NetEaseMusicAlbumIE, - NetEaseMusicSingerIE, - NetEaseMusicListIE, - NetEaseMusicMvIE, - NetEaseMusicProgramIE, - NetEaseMusicDjRadioIE, -) -from .newgrounds import ( - NewgroundsIE, - NewgroundsPlaylistIE, - NewgroundsUserIE, -) -from .newstube import NewstubeIE -from .newsy import NewsyIE -from .nextmedia import ( - NextMediaIE, - NextMediaActionNewsIE, - AppleDailyIE, - NextTVIE, -) -from .nexx import ( - NexxIE, - NexxEmbedIE, -) -from .nfb import NFBIE -from .nfhsnetwork import NFHSNetworkIE -from .nfl import ( - NFLIE, - NFLArticleIE, -) -from .nhk import ( - NhkVodIE, - NhkVodProgramIE, - NhkForSchoolBangumiIE, - NhkForSchoolSubjectIE, - NhkForSchoolProgramListIE, -) -from .nhl import NHLIE -from .nick import ( - NickIE, - NickBrIE, - NickDeIE, - NickNightIE, - NickRuIE, -) -from .niconico import ( - NiconicoIE, - NiconicoPlaylistIE, - NiconicoUserIE, - NiconicoSeriesIE, - NiconicoHistoryIE, - NicovideoSearchDateIE, - NicovideoSearchIE, - NicovideoSearchURLIE, - NicovideoTagURLIE, -) -from .ninecninemedia import ( - NineCNineMediaIE, - CPTwentyFourIE, -) -from .ninegag import NineGagIE -from .ninenow import NineNowIE -from .nintendo import NintendoIE -from .nitter import NitterIE -from 
.njpwworld import NJPWWorldIE -from .nobelprize import NobelPrizeIE -from .nonktube import NonkTubeIE -from .noodlemagazine import NoodleMagazineIE -from .noovo import NoovoIE -from .normalboots import NormalbootsIE -from .nosvideo import NosVideoIE -from .nova import ( - NovaEmbedIE, - NovaIE, -) -from .novaplay import NovaPlayIE -from .nowness import ( - NownessIE, - NownessPlaylistIE, - NownessSeriesIE, -) -from .noz import NozIE -from .npo import ( - AndereTijdenIE, - NPOIE, - NPOLiveIE, - NPORadioIE, - NPORadioFragmentIE, - SchoolTVIE, - HetKlokhuisIE, - VPROIE, - WNLIE, -) -from .npr import NprIE -from .nrk import ( - NRKIE, - NRKPlaylistIE, - NRKSkoleIE, - NRKTVIE, - NRKTVDirekteIE, - NRKRadioPodkastIE, - NRKTVEpisodeIE, - NRKTVEpisodesIE, - NRKTVSeasonIE, - NRKTVSeriesIE, -) -from .nrl import NRLTVIE -from .ntvcojp import NTVCoJpCUIE -from .ntvde import NTVDeIE -from .ntvru import NTVRuIE -from .nytimes import ( - NYTimesIE, - NYTimesArticleIE, - NYTimesCookingIE, -) -from .nuvid import NuvidIE -from .nzherald import NZHeraldIE -from .nzz import NZZIE -from .odatv import OdaTVIE -from .odnoklassniki import OdnoklassnikiIE -from .oktoberfesttv import OktoberfestTVIE -from .olympics import OlympicsReplayIE -from .on24 import On24IE -from .ondemandkorea import OnDemandKoreaIE -from .onefootball import OneFootballIE -from .onet import ( - OnetIE, - OnetChannelIE, - OnetMVPIE, - OnetPlIE, -) -from .onionstudios import OnionStudiosIE -from .ooyala import ( - OoyalaIE, - OoyalaExternalIE, -) -from .opencast import ( - OpencastIE, - OpencastPlaylistIE, -) -from .openrec import ( - OpenRecIE, - OpenRecCaptureIE, - OpenRecMovieIE, -) -from .ora import OraTVIE -from .orf import ( - ORFTVthekIE, - ORFFM4IE, - ORFFM4StoryIE, - ORFOE1IE, - ORFOE3IE, - ORFNOEIE, - ORFWIEIE, - ORFBGLIE, - ORFOOEIE, - ORFSTMIE, - ORFKTNIE, - ORFSBGIE, - ORFTIRIE, - ORFVBGIE, - ORFIPTVIE, -) -from .outsidetv import OutsideTVIE -from .packtpub import ( - PacktPubIE, - PacktPubCourseIE, -) -from .palcomp3 import ( - PalcoMP3IE, - PalcoMP3ArtistIE, - PalcoMP3VideoIE, -) -from .pandoratv import PandoraTVIE -from .panopto import ( - PanoptoIE, - PanoptoListIE, - PanoptoPlaylistIE -) -from .paramountplus import ( - ParamountPlusIE, - ParamountPlusSeriesIE, -) -from .parliamentliveuk import ParliamentLiveUKIE -from .parlview import ParlviewIE -from .patreon import ( - PatreonIE, - PatreonUserIE -) -from .pbs import PBSIE -from .pearvideo import PearVideoIE -from .peekvids import PeekVidsIE, PlayVidsIE -from .peertube import ( - PeerTubeIE, - PeerTubePlaylistIE, -) -from .peertv import PeerTVIE -from .peloton import ( - PelotonIE, - PelotonLiveIE -) -from .people import PeopleIE -from .performgroup import PerformGroupIE -from .periscope import ( - PeriscopeIE, - PeriscopeUserIE, -) -from .philharmoniedeparis import PhilharmonieDeParisIE -from .phoenix import PhoenixIE -from .photobucket import PhotobucketIE -from .piapro import PiaproIE -from .picarto import ( - PicartoIE, - PicartoVodIE, -) -from .piksel import PikselIE -from .pinkbike import PinkbikeIE -from .pinterest import ( - PinterestIE, - PinterestCollectionIE, -) -from .pixivsketch import ( - PixivSketchIE, - PixivSketchUserIE, -) -from .pladform import PladformIE -from .planetmarathi import PlanetMarathiIE -from .platzi import ( - PlatziIE, - PlatziCourseIE, -) -from .playfm import PlayFMIE -from .playplustv import PlayPlusTVIE -from .plays import PlaysTVIE -from .playstuff import PlayStuffIE -from .playtvak import PlaytvakIE -from .playvid import PlayvidIE -from 
.playwire import PlaywireIE -from .plutotv import PlutoTVIE -from .pluralsight import ( - PluralsightIE, - PluralsightCourseIE, -) -from .podchaser import PodchaserIE -from .podomatic import PodomaticIE -from .pokemon import ( - PokemonIE, - PokemonWatchIE, - PokemonSoundLibraryIE, -) -from .pokergo import ( - PokerGoIE, - PokerGoCollectionIE, -) -from .polsatgo import PolsatGoIE -from .polskieradio import ( - PolskieRadioIE, - PolskieRadioCategoryIE, - PolskieRadioPlayerIE, - PolskieRadioPodcastIE, - PolskieRadioPodcastListIE, - PolskieRadioRadioKierowcowIE, -) -from .popcorntimes import PopcorntimesIE -from .popcorntv import PopcornTVIE -from .porn91 import Porn91IE -from .porncom import PornComIE -from .pornflip import PornFlipIE -from .pornhd import PornHdIE -from .pornhub import ( - PornHubIE, - PornHubUserIE, - PornHubPlaylistIE, - PornHubPagedVideoListIE, - PornHubUserVideosUploadIE, -) -from .pornotube import PornotubeIE -from .pornovoisines import PornoVoisinesIE -from .pornoxo import PornoXOIE -from .pornez import PornezIE -from .puhutv import ( - PuhuTVIE, - PuhuTVSerieIE, -) -from .presstv import PressTVIE -from .projectveritas import ProjectVeritasIE -from .prosiebensat1 import ProSiebenSat1IE -from .prx import ( - PRXStoryIE, - PRXSeriesIE, - PRXAccountIE, - PRXStoriesSearchIE, - PRXSeriesSearchIE -) -from .puls4 import Puls4IE -from .pyvideo import PyvideoIE -from .qqmusic import ( - QQMusicIE, - QQMusicSingerIE, - QQMusicAlbumIE, - QQMusicToplistIE, - QQMusicPlaylistIE, -) -from .r7 import ( - R7IE, - R7ArticleIE, -) -from .radiko import RadikoIE, RadikoRadioIE -from .radiocanada import ( - RadioCanadaIE, - RadioCanadaAudioVideoIE, -) -from .radiode import RadioDeIE -from .radiojavan import RadioJavanIE -from .radiobremen import RadioBremenIE -from .radiofrance import RadioFranceIE -from .radiozet import RadioZetPodcastIE -from .radiokapital import ( - RadioKapitalIE, - RadioKapitalShowIE, -) -from .radlive import ( - RadLiveIE, - RadLiveChannelIE, - RadLiveSeasonIE, -) -from .rai import ( - RaiPlayIE, - RaiPlayLiveIE, - RaiPlayPlaylistIE, - RaiPlaySoundIE, - RaiPlaySoundLiveIE, - RaiPlaySoundPlaylistIE, - RaiIE, -) -from .raywenderlich import ( - RayWenderlichIE, - RayWenderlichCourseIE, -) -from .rbmaradio import RBMARadioIE -from .rcs import ( - RCSIE, - RCSEmbedsIE, - RCSVariousIE, -) -from .rcti import ( - RCTIPlusIE, - RCTIPlusSeriesIE, - RCTIPlusTVIE, -) -from .rds import RDSIE -from .redbulltv import ( - RedBullTVIE, - RedBullEmbedIE, - RedBullTVRrnContentIE, - RedBullIE, -) -from .reddit import RedditIE -from .redgifs import ( - RedGifsIE, - RedGifsSearchIE, - RedGifsUserIE, -) -from .redtube import RedTubeIE -from .regiotv import RegioTVIE -from .rentv import ( - RENTVIE, - RENTVArticleIE, -) -from .restudy import RestudyIE -from .reuters import ReutersIE -from .reverbnation import ReverbNationIE -from .rice import RICEIE -from .rmcdecouverte import RMCDecouverteIE -from .rockstargames import RockstarGamesIE -from .rokfin import ( - RokfinIE, - RokfinStackIE, - RokfinChannelIE, - RokfinSearchIE, -) -from .roosterteeth import RoosterTeethIE, RoosterTeethSeriesIE -from .rottentomatoes import RottenTomatoesIE -from .rozhlas import RozhlasIE -from .rtbf import RTBFIE -from .rte import RteIE, RteRadioIE -from .rtlnl import RtlNlIE -from .rtl2 import ( - RTL2IE, - RTL2YouIE, - RTL2YouSeriesIE, -) -from .rtnews import ( - RTNewsIE, - RTDocumentryIE, - RTDocumentryPlaylistIE, - RuptlyIE, -) -from .rtp import RTPIE -from .rtrfm import RTRFMIE -from .rts import RTSIE -from 
.rtve import ( - RTVEALaCartaIE, - RTVEAudioIE, - RTVELiveIE, - RTVEInfantilIE, - RTVETelevisionIE, -) -from .rtvnh import RTVNHIE -from .rtvs import RTVSIE -from .ruhd import RUHDIE -from .rule34video import Rule34VideoIE -from .rumble import ( - RumbleEmbedIE, - RumbleChannelIE, -) -from .rutube import ( - RutubeIE, - RutubeChannelIE, - RutubeEmbedIE, - RutubeMovieIE, - RutubePersonIE, - RutubePlaylistIE, - RutubeTagsIE, -) -from .glomex import ( - GlomexIE, - GlomexEmbedIE, -) -from .megatvcom import ( - MegaTVComIE, - MegaTVComEmbedIE, -) -from .ant1newsgr import ( - Ant1NewsGrWatchIE, - Ant1NewsGrArticleIE, - Ant1NewsGrEmbedIE, -) -from .rutv import RUTVIE -from .ruutu import RuutuIE -from .ruv import ( - RuvIE, - RuvSpilaIE -) -from .safari import ( - SafariIE, - SafariApiIE, - SafariCourseIE, -) -from .saitosan import SaitosanIE -from .samplefocus import SampleFocusIE -from .sapo import SapoIE -from .savefrom import SaveFromIE -from .sbs import SBSIE -from .screencast import ScreencastIE -from .screencastomatic import ScreencastOMaticIE -from .scrippsnetworks import ( - ScrippsNetworksWatchIE, - ScrippsNetworksIE, -) -from .scte import ( - SCTEIE, - SCTECourseIE, -) -from .seeker import SeekerIE -from .senategov import SenateISVPIE, SenateGovIE -from .sendtonews import SendtoNewsIE -from .servus import ServusIE -from .sevenplus import SevenPlusIE -from .sexu import SexuIE -from .seznamzpravy import ( - SeznamZpravyIE, - SeznamZpravyArticleIE, -) -from .shahid import ( - ShahidIE, - ShahidShowIE, -) -from .shared import ( - SharedIE, - VivoIE, -) -from .shemaroome import ShemarooMeIE -from .showroomlive import ShowRoomLiveIE -from .simplecast import ( - SimplecastIE, - SimplecastEpisodeIE, - SimplecastPodcastIE, -) -from .sina import SinaIE -from .sixplay import SixPlayIE -from .skeb import SkebIE -from .skyit import ( - SkyItPlayerIE, - SkyItVideoIE, - SkyItVideoLiveIE, - SkyItIE, - SkyItAcademyIE, - SkyItArteIE, - CieloTVItIE, - TV8ItIE, -) -from .skylinewebcams import SkylineWebcamsIE -from .skynewsarabia import ( - SkyNewsArabiaIE, - SkyNewsArabiaArticleIE, -) -from .skynewsau import SkyNewsAUIE -from .sky import ( - SkyNewsIE, - SkyNewsStoryIE, - SkySportsIE, - SkySportsNewsIE, -) -from .slideshare import SlideshareIE -from .slideslive import SlidesLiveIE -from .slutload import SlutloadIE -from .snotr import SnotrIE -from .sohu import SohuIE -from .sonyliv import ( - SonyLIVIE, - SonyLIVSeriesIE, -) -from .soundcloud import ( - SoundcloudEmbedIE, - SoundcloudIE, - SoundcloudSetIE, - SoundcloudRelatedIE, - SoundcloudUserIE, - SoundcloudTrackStationIE, - SoundcloudPlaylistIE, - SoundcloudSearchIE, -) -from .soundgasm import ( - SoundgasmIE, - SoundgasmProfileIE -) -from .southpark import ( - SouthParkIE, - SouthParkDeIE, - SouthParkDkIE, - SouthParkEsIE, - SouthParkNlIE -) -from .sovietscloset import ( - SovietsClosetIE, - SovietsClosetPlaylistIE -) -from .spankbang import ( - SpankBangIE, - SpankBangPlaylistIE, -) -from .spankwire import SpankwireIE -from .spiegel import SpiegelIE -from .spike import ( - BellatorIE, - ParamountNetworkIE, -) -from .stitcher import ( - StitcherIE, - StitcherShowIE, -) -from .sport5 import Sport5IE -from .sportbox import SportBoxIE -from .sportdeutschland import SportDeutschlandIE -from .spotify import ( - SpotifyIE, - SpotifyShowIE, -) -from .spreaker import ( - SpreakerIE, - SpreakerPageIE, - SpreakerShowIE, - SpreakerShowPageIE, -) -from .springboardplatform import SpringboardPlatformIE -from .sprout import SproutIE -from .srgssr import ( - 
SRGSSRIE, - SRGSSRPlayIE, -) -from .srmediathek import SRMediathekIE -from .stanfordoc import StanfordOpenClassroomIE -from .startv import StarTVIE -from .steam import SteamIE -from .storyfire import ( - StoryFireIE, - StoryFireUserIE, - StoryFireSeriesIE, -) -from .streamable import StreamableIE -from .streamanity import StreamanityIE -from .streamcloud import StreamcloudIE -from .streamcz import StreamCZIE -from .streamff import StreamFFIE -from .streetvoice import StreetVoiceIE -from .stretchinternet import StretchInternetIE -from .stripchat import StripchatIE -from .stv import STVPlayerIE -from .sunporno import SunPornoIE -from .sverigesradio import ( - SverigesRadioEpisodeIE, - SverigesRadioPublicationIE, -) -from .svt import ( - SVTIE, - SVTPageIE, - SVTPlayIE, - SVTSeriesIE, -) -from .swrmediathek import SWRMediathekIE -from .syfy import SyfyIE -from .sztvhu import SztvHuIE -from .tagesschau import TagesschauIE -from .tass import TassIE -from .tbs import TBSIE -from .tdslifeway import TDSLifewayIE -from .teachable import ( - TeachableIE, - TeachableCourseIE, -) -from .teachertube import ( - TeacherTubeIE, - TeacherTubeUserIE, -) -from .teachingchannel import TeachingChannelIE -from .teamcoco import TeamcocoIE -from .teamtreehouse import TeamTreeHouseIE -from .techtalks import TechTalksIE -from .ted import ( - TedEmbedIE, - TedPlaylistIE, - TedSeriesIE, - TedTalkIE, -) -from .tele5 import Tele5IE -from .tele13 import Tele13IE -from .telebruxelles import TeleBruxellesIE -from .telecinco import TelecincoIE -from .telegraaf import TelegraafIE -from .telegram import TelegramEmbedIE -from .telemb import TeleMBIE -from .telemundo import TelemundoIE -from .telequebec import ( - TeleQuebecIE, - TeleQuebecSquatIE, - TeleQuebecEmissionIE, - TeleQuebecLiveIE, - TeleQuebecVideoIE, -) -from .teletask import TeleTaskIE -from .telewebion import TelewebionIE -from .tennistv import TennisTVIE -from .tenplay import TenPlayIE -from .testurl import TestURLIE -from .tf1 import TF1IE -from .tfo import TFOIE -from .theintercept import TheInterceptIE -from .theplatform import ( - ThePlatformIE, - ThePlatformFeedIE, -) -from .thestar import TheStarIE -from .thesun import TheSunIE -from .theta import ( - ThetaVideoIE, - ThetaStreamIE, -) -from .theweatherchannel import TheWeatherChannelIE -from .thisamericanlife import ThisAmericanLifeIE -from .thisav import ThisAVIE -from .thisoldhouse import ThisOldHouseIE -from .threespeak import ( - ThreeSpeakIE, - ThreeSpeakUserIE, -) -from .threeqsdn import ThreeQSDNIE -from .tiktok import ( - TikTokIE, - TikTokUserIE, - TikTokSoundIE, - TikTokEffectIE, - TikTokTagIE, - TikTokVMIE, - DouyinIE, -) -from .tinypic import TinyPicIE -from .tmz import TMZIE -from .tnaflix import ( - TNAFlixNetworkEmbedIE, - TNAFlixIE, - EMPFlixIE, - MovieFapIE, -) -from .toggle import ( - ToggleIE, - MeWatchIE, -) -from .toggo import ( - ToggoIE, -) -from .tokentube import ( - TokentubeIE, - TokentubeChannelIE -) -from .tonline import TOnlineIE -from .toongoggles import ToonGogglesIE -from .toutv import TouTvIE -from .toypics import ToypicsUserIE, ToypicsIE -from .traileraddict import TrailerAddictIE -from .trilulilu import TriluliluIE -from .trovo import ( - TrovoIE, - TrovoVodIE, - TrovoChannelVodIE, - TrovoChannelClipIE, -) -from .trueid import TrueIDIE -from .trunews import TruNewsIE -from .trutv import TruTVIE -from .tube8 import Tube8IE -from .tubitv import ( - TubiTvIE, - TubiTvShowIE, -) -from .tumblr import TumblrIE -from .tunein import ( - TuneInClipIE, - TuneInStationIE, - 
TuneInProgramIE, - TuneInTopicIE, - TuneInShortenerIE, -) -from .tunepk import TunePkIE -from .turbo import TurboIE -from .tv2 import ( - TV2IE, - TV2ArticleIE, - KatsomoIE, - MTVUutisetArticleIE, -) -from .tv2dk import ( - TV2DKIE, - TV2DKBornholmPlayIE, -) -from .tv2hu import ( - TV2HuIE, - TV2HuSeriesIE, -) -from .tv4 import TV4IE -from .tv5mondeplus import TV5MondePlusIE -from .tv5unis import ( - TV5UnisVideoIE, - TV5UnisIE, -) -from .tva import ( - TVAIE, - QubIE, -) -from .tvanouvelles import ( - TVANouvellesIE, - TVANouvellesArticleIE, -) -from .tvc import ( - TVCIE, - TVCArticleIE, -) -from .tver import TVerIE -from .tvigle import TvigleIE -from .tvland import TVLandIE -from .tvn24 import TVN24IE -from .tvnet import TVNetIE -from .tvnoe import TVNoeIE -from .tvnow import ( - TVNowIE, - TVNowFilmIE, - TVNowNewIE, - TVNowSeasonIE, - TVNowAnnualIE, - TVNowShowIE, -) -from .tvopengr import ( - TVOpenGrWatchIE, - TVOpenGrEmbedIE, -) -from .tvp import ( - TVPEmbedIE, - TVPIE, - TVPStreamIE, - TVPWebsiteIE, -) -from .tvplay import ( - TVPlayIE, - ViafreeIE, - TVPlayHomeIE, -) -from .tvplayer import TVPlayerIE -from .tweakers import TweakersIE -from .twentyfourvideo import TwentyFourVideoIE -from .twentymin import TwentyMinutenIE -from .twentythreevideo import TwentyThreeVideoIE -from .twitcasting import ( - TwitCastingIE, - TwitCastingLiveIE, - TwitCastingUserIE, -) -from .twitch import ( - TwitchVodIE, - TwitchCollectionIE, - TwitchVideosIE, - TwitchVideosClipsIE, - TwitchVideosCollectionsIE, - TwitchStreamIE, - TwitchClipsIE, -) -from .twitter import ( - TwitterCardIE, - TwitterIE, - TwitterAmplifyIE, - TwitterBroadcastIE, - TwitterShortenerIE, -) -from .udemy import ( - UdemyIE, - UdemyCourseIE -) -from .udn import UDNEmbedIE -from .ufctv import ( - UFCTVIE, - UFCArabiaIE, -) -from .ukcolumn import UkColumnIE -from .uktvplay import UKTVPlayIE -from .digiteka import DigitekaIE -from .dlive import ( - DLiveVODIE, - DLiveStreamIE, -) -from .drooble import DroobleIE -from .umg import UMGDeIE -from .unistra import UnistraIE -from .unity import UnityIE -from .uol import UOLIE -from .uplynk import ( - UplynkIE, - UplynkPreplayIE, -) -from .urort import UrortIE -from .urplay import URPlayIE -from .usanetwork import USANetworkIE -from .usatoday import USATodayIE -from .ustream import UstreamIE, UstreamChannelIE -from .ustudio import ( - UstudioIE, - UstudioEmbedIE, -) -from .utreon import UtreonIE -from .varzesh3 import Varzesh3IE -from .vbox7 import Vbox7IE -from .veehd import VeeHDIE -from .veo import VeoIE -from .veoh import VeohIE -from .vesti import VestiIE -from .vevo import ( - VevoIE, - VevoPlaylistIE, -) -from .vgtv import ( - BTArticleIE, - BTVestlendingenIE, - VGTVIE, -) -from .vh1 import VH1IE -from .vice import ( - ViceIE, - ViceArticleIE, - ViceShowIE, -) -from .vidbit import VidbitIE -from .viddler import ViddlerIE -from .videa import VideaIE -from .videocampus_sachsen import VideocampusSachsenIE -from .videodetective import VideoDetectiveIE -from .videofyme import VideofyMeIE -from .videomore import ( - VideomoreIE, - VideomoreVideoIE, - VideomoreSeasonIE, -) -from .videopress import VideoPressIE -from .vidio import ( - VidioIE, - VidioPremierIE, - VidioLiveIE -) -from .vidlii import VidLiiIE -from .vier import VierIE, VierVideosIE -from .viewlift import ( - ViewLiftIE, - ViewLiftEmbedIE, -) -from .viidea import ViideaIE -from .vimeo import ( - VimeoIE, - VimeoAlbumIE, - VimeoChannelIE, - VimeoGroupsIE, - VimeoLikesIE, - VimeoOndemandIE, - VimeoReviewIE, - VimeoUserIE, - 
VimeoWatchLaterIE, - VHXEmbedIE, -) -from .vimm import ( - VimmIE, - VimmRecordingIE, -) -from .vimple import VimpleIE -from .vine import ( - VineIE, - VineUserIE, -) -from .viki import ( - VikiIE, - VikiChannelIE, -) -from .viqeo import ViqeoIE -from .viu import ( - ViuIE, - ViuPlaylistIE, - ViuOTTIE, -) -from .vk import ( - VKIE, - VKUserVideosIE, - VKWallPostIE, -) -from .vlive import ( - VLiveIE, - VLivePostIE, - VLiveChannelIE, -) -from .vodlocker import VodlockerIE -from .vodpl import VODPlIE -from .vodplatform import VODPlatformIE -from .voicerepublic import VoiceRepublicIE -from .voicy import ( - VoicyIE, - VoicyChannelIE, -) -from .voot import ( - VootIE, - VootSeriesIE, -) -from .voxmedia import ( - VoxMediaVolumeIE, - VoxMediaIE, -) -from .vrt import VRTIE -from .vrak import VrakIE -from .vrv import ( - VRVIE, - VRVSeriesIE, -) -from .vshare import VShareIE -from .vtm import VTMIE -from .medialaan import MedialaanIE -from .vuclip import VuClipIE -from .vupload import VuploadIE -from .vvvvid import ( - VVVVIDIE, - VVVVIDShowIE, -) -from .vyborymos import VyboryMosIE -from .vzaar import VzaarIE -from .wakanim import WakanimIE -from .walla import WallaIE -from .washingtonpost import ( - WashingtonPostIE, - WashingtonPostArticleIE, -) -from .wasdtv import ( - WASDTVStreamIE, - WASDTVRecordIE, - WASDTVClipIE, -) -from .wat import WatIE -from .watchbox import WatchBoxIE -from .watchindianporn import WatchIndianPornIE -from .wdr import ( - WDRIE, - WDRPageIE, - WDRElefantIE, - WDRMobileIE, -) -from .webcaster import ( - WebcasterIE, - WebcasterFeedIE, -) -from .webofstories import ( - WebOfStoriesIE, - WebOfStoriesPlaylistIE, -) -from .weibo import ( - WeiboIE, - WeiboMobileIE -) -from .weiqitv import WeiqiTVIE -from .willow import WillowIE -from .wimtv import WimTVIE -from .whowatch import WhoWatchIE -from .wistia import ( - WistiaIE, - WistiaPlaylistIE, -) -from .worldstarhiphop import WorldStarHipHopIE -from .wppilot import ( - WPPilotIE, - WPPilotChannelsIE, -) -from .wsj import ( - WSJIE, - WSJArticleIE, -) -from .wwe import WWEIE -from .xbef import XBefIE -from .xboxclips import XboxClipsIE -from .xfileshare import XFileShareIE -from .xhamster import ( - XHamsterIE, - XHamsterEmbedIE, - XHamsterUserIE, -) -from .xiami import ( - XiamiSongIE, - XiamiAlbumIE, - XiamiArtistIE, - XiamiCollectionIE -) -from .ximalaya import ( - XimalayaIE, - XimalayaAlbumIE -) -from .xinpianchang import XinpianchangIE -from .xminus import XMinusIE -from .xnxx import XNXXIE -from .xstream import XstreamIE -from .xtube import XTubeUserIE, XTubeIE -from .xuite import XuiteIE -from .xvideos import XVideosIE -from .xxxymovies import XXXYMoviesIE -from .yahoo import ( - YahooIE, - YahooSearchIE, - YahooGyaOPlayerIE, - YahooGyaOIE, - YahooJapanNewsIE, -) -from .yandexdisk import YandexDiskIE -from .yandexmusic import ( - YandexMusicTrackIE, - YandexMusicAlbumIE, - YandexMusicPlaylistIE, - YandexMusicArtistTracksIE, - YandexMusicArtistAlbumsIE, -) -from .yandexvideo import ( - YandexVideoIE, - YandexVideoPreviewIE, - ZenYandexIE, - ZenYandexChannelIE, -) -from .yapfiles import YapFilesIE -from .yesjapan import YesJapanIE -from .yinyuetai import YinYueTaiIE -from .ynet import YnetIE -from .youjizz import YouJizzIE -from .youku import ( - YoukuIE, - YoukuShowIE, -) -from .younow import ( - YouNowLiveIE, - YouNowChannelIE, - YouNowMomentIE, -) -from .youporn import YouPornIE -from .yourporn import YourPornIE -from .yourupload import YourUploadIE -from .youtube import ( - YoutubeIE, - YoutubeClipIE, - 
YoutubeFavouritesIE, - YoutubeNotificationsIE, - YoutubeHistoryIE, - YoutubeTabIE, - YoutubeLivestreamEmbedIE, - YoutubePlaylistIE, - YoutubeRecommendedIE, - YoutubeSearchDateIE, - YoutubeSearchIE, - YoutubeSearchURLIE, - YoutubeMusicSearchURLIE, - YoutubeSubscriptionsIE, - YoutubeStoriesIE, - YoutubeTruncatedIDIE, - YoutubeTruncatedURLIE, - YoutubeYtBeIE, - YoutubeYtUserIE, - YoutubeWatchLaterIE, -) -from .zapiks import ZapiksIE -from .zattoo import ( - BBVTVIE, - EinsUndEinsTVIE, - EWETVIE, - GlattvisionTVIE, - MNetTVIE, - NetPlusIE, - OsnatelTVIE, - QuantumTVIE, - SaltTVIE, - SAKTVIE, - VTXTVIE, - WalyTVIE, - ZattooIE, - ZattooLiveIE, - ZattooMoviesIE, - ZattooRecordingsIE, -) -from .zdf import ZDFIE, ZDFChannelIE -from .zee5 import ( - Zee5IE, - Zee5SeriesIE, -) -from .zhihu import ZhihuIE -from .zingmp3 import ( - ZingMp3IE, - ZingMp3AlbumIE, - ZingMp3ChartHomeIE, - ZingMp3WeekChartIE, - ZingMp3ChartMusicVideoIE, - ZingMp3UserIE, -) -from .zoom import ZoomIE -from .zype import ZypeIE +_PLUGIN_CLASSES = load_plugins('extractor', 'IE', globals()) +_ALL_CLASSES = list(_PLUGIN_CLASSES.values()) + _ALL_CLASSES diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py index de45f9298..5b34f3bff 100644 --- a/yt_dlp/extractor/facebook.py +++ b/yt_dlp/extractor/facebook.py @@ -1,18 +1,18 @@ import json import re +import urllib.parse from .common import InfoExtractor from ..compat import ( compat_etree_fromstring, compat_str, compat_urllib_parse_unquote, - compat_urllib_parse_unquote_plus, ) from ..utils import ( + ExtractorError, clean_html, determine_ext, error_to_compat_str, - ExtractorError, float_or_none, get_element_by_id, get_first, @@ -467,7 +467,7 @@ class FacebookIE(InfoExtractor): dash_manifest = video.get('dash_manifest') if dash_manifest: formats.extend(self._parse_mpd_formats( - compat_etree_fromstring(compat_urllib_parse_unquote_plus(dash_manifest)))) + compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)))) def process_formats(formats): # Downloads with browser's User-Agent are rate limited. 
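# --- Illustrative aside (not part of the commit): with Python 2 long gone,
# the facebook.py hunk above drops the compat_urllib_parse_unquote_plus shim
# in favour of the stdlib call. The DASH manifest arrives percent-encoded
# with '+' standing for spaces, hence unquote_plus rather than unquote;
# behaviour is identical to the old shim:
import urllib.parse

assert urllib.parse.unquote_plus('a+b%26c') == 'a b&c'
assert urllib.parse.unquote('a+b%26c') == 'a+b&c'  # '+' would survive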
Working around diff --git a/yt_dlp/extractor/fc2.py b/yt_dlp/extractor/fc2.py index 225677b00..3501c4cf6 100644 --- a/yt_dlp/extractor/fc2.py +++ b/yt_dlp/extractor/fc2.py @@ -1,16 +1,13 @@ import re from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, -) +from ..compat import compat_parse_qs from ..dependencies import websockets from ..utils import ( ExtractorError, WebSocketsWrapper, js_to_json, sanitized_Request, - std_headers, traverse_obj, update_url_query, urlencode_postdata, @@ -81,7 +78,7 @@ class FC2IE(InfoExtractor): webpage = None if not url.startswith('fc2:'): webpage = self._download_webpage(url, video_id) - self._downloader.cookiejar.clear_session_cookies() # must clear + self.cookiejar.clear_session_cookies() # must clear self._login() title, thumbnail, description = None, None, None @@ -207,10 +204,10 @@ class FC2LiveIE(InfoExtractor): 'Cookie': str(self._get_cookies('https://live.fc2.com/'))[12:], 'Origin': 'https://live.fc2.com', 'Accept': '*/*', - 'User-Agent': std_headers['User-Agent'], + 'User-Agent': self.get_param('http_headers')['User-Agent'], }) - self.write_debug('[debug] Sending HLS server request') + self.write_debug('Sending HLS server request') while True: recv = ws.recv() @@ -232,13 +229,10 @@ class FC2LiveIE(InfoExtractor): if not data or not isinstance(data, dict): continue if data.get('name') == '_response_' and data.get('id') == 1: - self.write_debug('[debug] Goodbye.') + self.write_debug('Goodbye') playlist_data = data break - elif self._downloader.params.get('verbose', False): - if len(recv) > 100: - recv = recv[:100] + '...' - self.to_screen('[debug] Server said: %s' % recv) + self.write_debug('Server said: %s%s' % (recv[:100], '...' if len(recv) > 100 else '')) if not playlist_data: raise ExtractorError('Unable to fetch HLS playlist info via WebSocket') diff --git a/yt_dlp/extractor/flickr.py b/yt_dlp/extractor/flickr.py index 552ecd43a..9f60a6b1f 100644 --- a/yt_dlp/extractor/flickr.py +++ b/yt_dlp/extractor/flickr.py @@ -94,7 +94,7 @@ class FlickrIE(InfoExtractor): owner = video_info.get('owner', {}) uploader_id = owner.get('nsid') uploader_path = owner.get('path_alias') or uploader_id - uploader_url = format_field(uploader_path, template='https://www.flickr.com/photos/%s/') + uploader_url = format_field(uploader_path, None, 'https://www.flickr.com/photos/%s/') return { 'id': video_id, diff --git a/yt_dlp/extractor/fourzerostudio.py b/yt_dlp/extractor/fourzerostudio.py new file mode 100644 index 000000000..e1804e39e --- /dev/null +++ b/yt_dlp/extractor/fourzerostudio.py @@ -0,0 +1,107 @@ +from .common import InfoExtractor +from ..utils import traverse_obj, unified_timestamp + + +class FourZeroStudioArchiveIE(InfoExtractor): + _VALID_URL = r'https?://0000\.studio/(?P<uploader_id>[^/]+)/broadcasts/(?P<id>[^/]+)/archive' + IE_NAME = '0000studio:archive' + _TESTS = [{ + 'url': 'https://0000.studio/mumeijiten/broadcasts/1290f433-fce0-4909-a24a-5f7df09665dc/archive', + 'info_dict': { + 'id': '1290f433-fce0-4909-a24a-5f7df09665dc', + 'title': 'noteで『canape』様へのファンレターを執筆します。(数秘術その2)', + 'timestamp': 1653802534, + 'release_timestamp': 1653796604, + 'thumbnails': 'count:1', + 'comments': 'count:7', + 'uploader': '『中崎雄心』の執務室。', + 'uploader_id': 'mumeijiten', + } + }] + + def _real_extract(self, url): + video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id') + webpage = self._download_webpage(url, video_id) + nuxt_data = self._search_nuxt_data(webpage, video_id, traverse=None) + + pcb = traverse_obj(nuxt_data, 
('ssrRefs', lambda _, v: v['__typename'] == 'PublicCreatorBroadcast'), get_all=False) + uploader_internal_id = traverse_obj(nuxt_data, ( + 'ssrRefs', lambda _, v: v['__typename'] == 'PublicUser', 'id'), get_all=False) + + formats, subs = self._extract_m3u8_formats_and_subtitles(pcb['archiveUrl'], video_id, ext='mp4') + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': pcb.get('title'), + 'age_limit': 18 if pcb.get('isAdult') else None, + 'timestamp': unified_timestamp(pcb.get('finishTime')), + 'release_timestamp': unified_timestamp(pcb.get('createdAt')), + 'thumbnails': [{ + 'url': pcb['thumbnailUrl'], + 'ext': 'png', + }] if pcb.get('thumbnailUrl') else None, + 'formats': formats, + 'subtitles': subs, + 'comments': [{ + 'author': c.get('username'), + 'author_id': c.get('postedUserId'), + 'author_thumbnail': c.get('userThumbnailUrl'), + 'id': c.get('id'), + 'text': c.get('body'), + 'timestamp': unified_timestamp(c.get('createdAt')), + 'like_count': c.get('likeCount'), + 'is_favorited': c.get('isLikedByOwner'), + 'author_is_uploader': c.get('postedUserId') == uploader_internal_id, + } for c in traverse_obj(nuxt_data, ( + 'ssrRefs', ..., lambda _, v: v['__typename'] == 'PublicCreatorBroadcastComment')) or []], + 'uploader_id': uploader_id, + 'uploader': traverse_obj(nuxt_data, ( + 'ssrRefs', lambda _, v: v['__typename'] == 'PublicUser', 'username'), get_all=False), + } + + +class FourZeroStudioClipIE(InfoExtractor): + _VALID_URL = r'https?://0000\.studio/(?P<uploader_id>[^/]+)/archive-clip/(?P<id>[^/]+)' + IE_NAME = '0000studio:clip' + _TESTS = [{ + 'url': 'https://0000.studio/soeji/archive-clip/e46b0278-24cd-40a8-92e1-b8fc2b21f34f', + 'info_dict': { + 'id': 'e46b0278-24cd-40a8-92e1-b8fc2b21f34f', + 'title': 'わたベーさんからイラスト差し入れいただきました。ありがとうございました!', + 'timestamp': 1652109105, + 'like_count': 1, + 'uploader': 'ソエジマケイタ', + 'uploader_id': 'soeji', + } + }] + + def _real_extract(self, url): + video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id') + webpage = self._download_webpage(url, video_id) + nuxt_data = self._search_nuxt_data(webpage, video_id, traverse=None) + + clip_info = traverse_obj(nuxt_data, ('ssrRefs', lambda _, v: v['__typename'] == 'PublicCreatorArchivedClip'), get_all=False) + + info = next(( + m for m in self._parse_html5_media_entries(url, webpage, video_id) + if 'mp4' in traverse_obj(m, ('formats', ..., 'ext')) + ), None) + if not info: + self.report_warning('Failed to find a desired media element. Falling back to using NUXT data.') + info = { + 'formats': [{ + 'ext': 'mp4', + 'url': url, + } for url in clip_info.get('mediaFiles') or [] if url], + } + return { + **info, + 'id': video_id, + 'title': clip_info.get('clipComment'), + 'timestamp': unified_timestamp(clip_info.get('createdAt')), + 'like_count': clip_info.get('likeCount'), + 'uploader_id': uploader_id, + 'uploader': traverse_obj(nuxt_data, ( + 'ssrRefs', lambda _, v: v['__typename'] == 'PublicUser', 'username'), get_all=False), + } diff --git a/yt_dlp/extractor/foxgay.py b/yt_dlp/extractor/foxgay.py index 4abc2cfd0..b285464ec 100644 --- a/yt_dlp/extractor/foxgay.py +++ b/yt_dlp/extractor/foxgay.py @@ -31,7 +31,7 @@ class FoxgayIE(InfoExtractor): description = get_element_by_id('inf_tit', webpage) # The default user-agent with foxgay cookies leads to pages without videos - self._downloader.cookiejar.clear('.foxgay.com') + self.cookiejar.clear('.foxgay.com') # Find the URL for the iFrame which contains the actual video. 
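# --- Illustrative aside (not part of the commit): the fourzerostudio
# extractor above mines the page's NUXT state with traverse_obj, using a
# (key, value) predicate as a path element to select list entries by their
# GraphQL __typename; get_all=False returns just the first hit. A toy run on
# made-up data, assuming yt-dlp's utils:
from yt_dlp.utils import traverse_obj

nuxt_data = {'ssrRefs': [
    {'__typename': 'PublicUser', 'id': 'u1', 'username': 'soeji'},
    {'__typename': 'PublicCreatorBroadcast', 'title': 'live!'},
]}
user = traverse_obj(
    nuxt_data, ('ssrRefs', lambda _, v: v['__typename'] == 'PublicUser'),
    get_all=False)
assert user['username'] == 'soeji'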
iframe_url = self._html_search_regex( r'<iframe[^>]+src=([\'"])(?P<url>[^\'"]+)\1', webpage, diff --git a/yt_dlp/extractor/foxnews.py b/yt_dlp/extractor/foxnews.py index cee4d6b49..e8513f2c2 100644 --- a/yt_dlp/extractor/foxnews.py +++ b/yt_dlp/extractor/foxnews.py @@ -59,10 +59,13 @@ class FoxNewsIE(AMPIE): @staticmethod def _extract_urls(webpage): return [ - mobj.group('url') + f'https://video.foxnews.com/v/video-embed.html?video_id={mobj.group("video_id")}' for mobj in re.finditer( - r'<(?:amp-)?iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//video\.foxnews\.com/v/video-embed\.html?.*?\bvideo_id=\d+.*?)\1', - webpage)] + r'''(?x) + <(?:script|(?:amp-)?iframe)[^>]+\bsrc=["\'] + (?:https?:)?//video\.foxnews\.com/v/(?:video-embed\.html|embed\.js)\? + (?:[^>"\']+&)?(?:video_)?id=(?P<video_id>\d+) + ''', webpage)] def _real_extract(self, url): host, video_id = self._match_valid_url(url).groups() diff --git a/yt_dlp/extractor/franceculture.py b/yt_dlp/extractor/franceculture.py deleted file mode 100644 index 6bd9912f3..000000000 --- a/yt_dlp/extractor/franceculture.py +++ /dev/null @@ -1,125 +0,0 @@ -import re -from .common import InfoExtractor -from ..utils import ( - determine_ext, - extract_attributes, - int_or_none, - traverse_obj, - unified_strdate, -) - - -class FranceCultureIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?franceculture\.fr/emissions/(?:[^/]+/)*(?P<id>[^/?#&]+)' - _TESTS = [{ - # playlist - 'url': 'https://www.franceculture.fr/emissions/serie/hasta-dente', - 'playlist_count': 12, - 'info_dict': { - 'id': 'hasta-dente', - 'title': 'Hasta Dente', - 'description': 'md5:57479af50648d14e9bb649e6b1f8f911', - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20201024', - }, - 'playlist': [{ - 'info_dict': { - 'id': '3c1c2e55-41a0-11e5-9fe0-005056a87c89', - 'ext': 'mp3', - 'title': 'Jeudi, vous avez dit bizarre ?', - 'description': 'md5:47cf1e00cc21c86b0210279996a812c6', - 'duration': 604, - 'upload_date': '20201024', - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1603576680 - }, - }, - ], - }, { - 'url': 'https://www.franceculture.fr/emissions/carnet-nomade/rendez-vous-au-pays-des-geeks', - 'info_dict': { - 'id': 'rendez-vous-au-pays-des-geeks', - 'display_id': 'rendez-vous-au-pays-des-geeks', - 'ext': 'mp3', - 'title': 'Rendez-vous au pays des geeks', - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20140301', - 'vcodec': 'none', - 'duration': 3569, - }, - }, { - # no thumbnail - 'url': 'https://www.franceculture.fr/emissions/la-recherche-montre-en-main/la-recherche-montre-en-main-du-mercredi-10-octobre-2018', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - info = { - 'id': display_id, - 'title': self._html_search_regex( - r'(?s)<h1[^>]*itemprop="[^"]*name[^"]*"[^>]*>(.+?)</h1>', - webpage, 'title', default=self._og_search_title(webpage)), - 'description': self._html_search_regex( - r'(?s)<div[^>]+class="excerpt"[^>]*>(.*?)</div>', webpage, 'description', default=None), - 'thumbnail': self._og_search_thumbnail(webpage), - 'uploader': self._html_search_regex( - r'(?s)<span class="author">(.*?)</span>', webpage, 'uploader', default=None), - 'upload_date': unified_strdate(self._html_search_regex( - r'(?s)class="teaser-text-date".*?(\d{2}/\d{2}/\d{4})', webpage, 'date', default=None)), - } - - playlist_data = self._search_regex( - r'''(?sx) - <section[^>]+data-xiti-place="[^"]*?liste_episodes[^"?]*?"[^>]*> - (.*?) 
- </section> - ''', - webpage, 'playlist data', fatal=False, default=None) - - if playlist_data: - entries = [] - for item, item_description in re.findall( - r'(?s)(<button[^<]*class="[^"]*replay-button[^>]*>).*?<p[^>]*class="[^"]*teaser-text-chapo[^>]*>(.*?)</p>', - playlist_data): - - item_attributes = extract_attributes(item) - entries.append({ - 'id': item_attributes.get('data-emission-uuid'), - 'url': item_attributes.get('data-url'), - 'title': item_attributes.get('data-diffusion-title'), - 'duration': int_or_none(traverse_obj(item_attributes, 'data-duration-seconds', 'data-duration-seconds')), - 'description': item_description, - 'timestamp': int_or_none(item_attributes.get('data-start-time')), - 'thumbnail': info['thumbnail'], - 'uploader': info['uploader'], - }) - - return { - '_type': 'playlist', - 'entries': entries, - **info - } - - video_data = extract_attributes(self._search_regex( - r'''(?sx) - (?: - </h1>| - <div[^>]+class="[^"]*?(?:title-zone-diffusion|heading-zone-(?:wrapper|player-button))[^"]*?"[^>]*> - ).*? - (<button[^>]+data-(?:url|asset-source)="[^"]+"[^>]+>) - ''', - webpage, 'video data')) - video_url = traverse_obj(video_data, 'data-url', 'data-asset-source') - ext = determine_ext(video_url.lower()) - - return { - 'display_id': display_id, - 'url': video_url, - 'ext': ext, - 'vcodec': 'none' if ext == 'mp3' else None, - 'duration': int_or_none(video_data.get('data-duration')), - **info - } diff --git a/yt_dlp/extractor/freetv.py b/yt_dlp/extractor/freetv.py new file mode 100644 index 000000000..f38bae90b --- /dev/null +++ b/yt_dlp/extractor/freetv.py @@ -0,0 +1,141 @@ +import itertools +import re + +from .common import InfoExtractor +from ..utils import int_or_none, traverse_obj, urlencode_postdata + + +class FreeTvBaseIE(InfoExtractor): + def _get_api_response(self, content_id, resource_type, postdata): + return self._download_json( + 'https://www.freetv.com/wordpress/wp-admin/admin-ajax.php', + content_id, data=urlencode_postdata(postdata), + note=f'Downloading {content_id} {resource_type} JSON')['data'] + + +class FreeTvMoviesIE(FreeTvBaseIE): + _VALID_URL = r'https?://(?:www\.)?freetv\.com/peliculas/(?P<id>[^/]+)' + _TESTS = [{ + 'url': 'https://www.freetv.com/peliculas/atrapame-si-puedes/', + 'md5': 'dc62d5abf0514726640077cd1591aa92', + 'info_dict': { + 'id': '428021', + 'title': 'Atrápame Si Puedes', + 'description': 'md5:ca63bc00898aeb2f64ec87c6d3a5b982', + 'ext': 'mp4', + } + }, { + 'url': 'https://www.freetv.com/peliculas/monstruoso/', + 'md5': '509c15c68de41cb708d1f92d071f20aa', + 'info_dict': { + 'id': '377652', + 'title': 'Monstruoso', + 'description': 'md5:333fc19ee327b457b980e54a911ea4a3', + 'ext': 'mp4', + } + }] + + def _extract_video(self, content_id, action='olyott_video_play'): + api_response = self._get_api_response(content_id, 'video', { + 'action': action, + 'contentID': content_id, + }) + + video_id, video_url = api_response['displayMeta']['contentID'], api_response['displayMeta']['streamURLVideo'] + formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_url, video_id, 'mp4') + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': traverse_obj(api_response, ('displayMeta', 'title')), + 'description': traverse_obj(api_response, ('displayMeta', 'desc')), + 'formats': formats, + 'subtitles': subtitles, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + return self._extract_video( + self._search_regex(( + 
r'class=["\'][^>]+postid-(?P<video_id>\d+)', + r'<link[^>]+freetv.com/\?p=(?P<video_id>\d+)', + r'<div[^>]+data-params=["\'][^>]+post_id=(?P<video_id>\d+)', + ), webpage, 'video id', group='video_id')) + + +class FreeTvIE(FreeTvBaseIE): + IE_NAME = 'freetv:series' + _VALID_URL = r'https?://(?:www\.)?freetv\.com/series/(?P<id>[^/]+)' + _TESTS = [{ + 'url': 'https://www.freetv.com/series/el-detective-l/', + 'info_dict': { + 'id': 'el-detective-l', + 'title': 'El Detective L', + 'description': 'md5:f9f1143bc33e9856ecbfcbfb97a759be' + }, + 'playlist_count': 24, + }, { + 'url': 'https://www.freetv.com/series/esmeraldas/', + 'info_dict': { + 'id': 'esmeraldas', + 'title': 'Esmeraldas', + 'description': 'md5:43d7ec45bd931d8268a4f5afaf4c77bf' + }, + 'playlist_count': 62, + }, { + 'url': 'https://www.freetv.com/series/las-aventuras-de-leonardo/', + 'info_dict': { + 'id': 'las-aventuras-de-leonardo', + 'title': 'Las Aventuras de Leonardo', + 'description': 'md5:0c47130846c141120a382aca059288f6' + }, + 'playlist_count': 13, + }, + ] + + def _extract_series_season(self, season_id, series_title): + episodes = self._get_api_response(season_id, 'series', { + 'contentID': season_id, + 'action': 'olyott_get_dynamic_series_content', + 'type': 'list', + 'perPage': '1000', + })['1'] + + for episode in episodes: + video_id = str(episode['contentID']) + formats, subtitles = self._extract_m3u8_formats_and_subtitles(episode['streamURL'], video_id, 'mp4') + self._sort_formats(formats) + + yield { + 'id': video_id, + 'title': episode.get('fullTitle'), + 'description': episode.get('description'), + 'formats': formats, + 'subtitles': subtitles, + 'thumbnail': episode.get('thumbnail'), + 'series': series_title, + 'series_id': traverse_obj(episode, ('contentMeta', 'displayMeta', 'seriesID')), + 'season_id': traverse_obj(episode, ('contentMeta', 'displayMeta', 'seasonID')), + 'season_number': traverse_obj( + episode, ('contentMeta', 'displayMeta', 'seasonNum'), expected_type=int_or_none), + 'episode_number': traverse_obj( + episode, ('contentMeta', 'displayMeta', 'episodeNum'), expected_type=int_or_none), + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + title = self._html_search_regex( + r'<h1[^>]+class=["\']synopis[^>]>(?P<title>[^<]+)', webpage, 'title', group='title', fatal=False) + description = self._html_search_regex( + r'<div[^>]+class=["\']+synopis content[^>]><p>(?P<description>[^<]+)', + webpage, 'description', group='description', fatal=False) + + return self.playlist_result( + itertools.chain.from_iterable( + self._extract_series_season(season_id, title) + for season_id in re.findall(r'<option[^>]+value=["\'](\d+)["\']', webpage)), + display_id, title, description) diff --git a/yt_dlp/extractor/fuyintv.py b/yt_dlp/extractor/fuyintv.py new file mode 100644 index 000000000..197901d57 --- /dev/null +++ b/yt_dlp/extractor/fuyintv.py @@ -0,0 +1,30 @@ +from .common import InfoExtractor +from ..utils import traverse_obj + + +class FuyinTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?fuyin\.tv/html/(?:\d+)/(?P<id>\d+)\.html' + _TESTS = [{ + 'url': 'https://www.fuyin.tv/html/2733/44129.html', + 'info_dict': { + 'id': '44129', + 'ext': 'mp4', + 'title': '第1集', + 'description': 'md5:21a3d238dc8d49608e1308e85044b9c3', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + json_data = self._download_json( + 'https://www.fuyin.tv/api/api/tv.movie/url', + video_id, query={'urlid': f'{video_id}'}) + webpage = 
self._download_webpage(url, video_id, fatal=False) + + return { + 'id': video_id, + 'title': traverse_obj(json_data, ('data', 'title')), + 'url': json_data['data']['url'], + 'ext': 'mp4', + 'description': self._html_search_meta('description', webpage), + } diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index f594d02c2..c2f754453 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -1,5 +1,6 @@ import os import re +import urllib.parse import xml.etree.ElementTree from .ant1newsgr import Ant1NewsGrEmbedIE @@ -69,11 +70,13 @@ from .spankwire import SpankwireIE from .sportbox import SportBoxIE from .spotify import SpotifyBaseIE from .springboardplatform import SpringboardPlatformIE +from .substack import SubstackIE from .svt import SVTIE from .teachable import TeachableIE from .ted import TedEmbedIE from .theplatform import ThePlatformIE from .threeqsdn import ThreeQSDNIE +from .tiktok import TikTokIE from .tnaflix import TNAFlixNetworkEmbedIE from .tube8 import Tube8IE from .tunein import TuneInBaseIE @@ -104,12 +107,7 @@ from .yapfiles import YapFilesIE from .youporn import YouPornIE from .youtube import YoutubeIE from .zype import ZypeIE -from ..compat import ( - compat_etree_fromstring, - compat_str, - compat_urllib_parse_unquote, - compat_urlparse, -) +from ..compat import compat_etree_fromstring from ..utils import ( KNOWN_EXTENSIONS, ExtractorError, @@ -129,6 +127,7 @@ from ..utils import ( sanitized_Request, smuggle_url, str_or_none, + try_call, unescapeHTML, unified_timestamp, unsmuggle_url, @@ -2526,6 +2525,118 @@ class GenericIE(InfoExtractor): 'upload_date': '20220504', }, }, + { + # Webpage contains double BOM + 'url': 'https://www.filmarkivet.se/movies/paris-d-moll/', + 'md5': 'df02cadc719dcc63d43288366f037754', + 'info_dict': { + 'id': 'paris-d-moll', + 'ext': 'mp4', + 'upload_date': '20220518', + 'title': 'Paris d-moll', + 'description': 'md5:319e37ea5542293db37e1e13072fe330', + 'thumbnail': 'https://www.filmarkivet.se/wp-content/uploads/parisdmoll2.jpg', + 'timestamp': 1652833414, + 'age_limit': 0, + } + }, + { + 'url': 'https://www.mollymovieclub.com/p/interstellar?s=r#details', + 'md5': '198bde8bed23d0b23c70725c83c9b6d9', + 'info_dict': { + 'id': '53602801', + 'ext': 'mpga', + 'title': 'Interstellar', + 'description': 'Listen now | Episode One', + 'thumbnail': 'md5:c30d9c83f738e16d8551d7219d321538', + 'uploader': 'Molly Movie Club', + 'uploader_id': '839621', + }, + }, + { + 'url': 'https://www.blockedandreported.org/p/episode-117-lets-talk-about-depp?s=r', + 'md5': 'c0cc44ee7415daeed13c26e5b56d6aa0', + 'info_dict': { + 'id': '57962052', + 'ext': 'mpga', + 'title': 'md5:855b2756f0ee10f6723fa00b16266f8d', + 'description': 'md5:fe512a5e94136ad260c80bde00ea4eef', + 'thumbnail': 'md5:2218f27dfe517bb5ac16c47d0aebac59', + 'uploader': 'Blocked and Reported', + 'uploader_id': '500230', + }, + }, + { + 'url': 'https://www.skimag.com/video/ski-people-1980/', + 'info_dict': { + 'id': 'ski-people-1980', + 'title': 'Ski People (1980)', + }, + 'playlist_count': 1, + 'playlist': [{ + 'md5': '022a7e31c70620ebec18deeab376ee03', + 'info_dict': { + 'id': 'YTmgRiNU', + 'ext': 'mp4', + 'title': '1980 Ski People', + 'timestamp': 1610407738, + 'description': 'md5:cf9c3d101452c91e141f292b19fe4843', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/YTmgRiNU/poster.jpg?width=720', + 'duration': 5688.0, + 'upload_date': '20210111', + } + }] + }, + { + 'note': 'Rumble embed', + 'url': 
'https://rumble.com/vdmum1-moose-the-dog-helps-girls-dig-a-snow-fort.html', + 'md5': '53af34098a7f92c4e51cf0bd1c33f009', + 'info_dict': { + 'id': 'vb0ofn', + 'ext': 'mp4', + 'timestamp': 1612662578, + 'uploader': 'LovingMontana', + 'channel': 'LovingMontana', + 'upload_date': '20210207', + 'title': 'Winter-loving dog helps girls dig a snow fort ', + 'channel_url': 'https://rumble.com/c/c-546523', + 'thumbnail': 'https://sp.rmbl.ws/s8/1/5/f/x/x/5fxxb.OvCc.1-small-Moose-The-Dog-Helps-Girls-D.jpg', + 'duration': 103, + } + }, + { + 'note': 'Rumble JS embed', + 'url': 'https://therightscoop.com/what-does-9-plus-1-plus-1-equal-listen-to-this-audio-of-attempted-kavanaugh-assassins-call-and-youll-get-it', + 'md5': '4701209ac99095592e73dbba21889690', + 'info_dict': { + 'id': 'v15eqxl', + 'ext': 'mp4', + 'channel': 'Mr Producer Media', + 'duration': 92, + 'title': '911 Audio From The Man Who Wanted To Kill Supreme Court Justice Kavanaugh', + 'channel_url': 'https://rumble.com/c/RichSementa', + 'thumbnail': 'https://sp.rmbl.ws/s8/1/P/j/f/A/PjfAe.OvCc-small-911-Audio-From-The-Man-Who-.jpg', + 'timestamp': 1654892716, + 'uploader': 'Mr Producer Media', + 'upload_date': '20220610', + } + }, + { + 'note': 'JSON LD with multiple @type', + 'url': 'https://www.nu.nl/280161/video/hoe-een-bladvlo-dit-verwoestende-japanse-onkruid-moet-vernietigen.html', + 'md5': 'c7949f34f57273013fb7ccb1156393db', + 'info_dict': { + 'id': 'ipy2AcGL', + 'ext': 'mp4', + 'description': 'md5:6a9d644bab0dc2dc06849c2505d8383d', + 'thumbnail': r're:https://media\.nu\.nl/m/.+\.jpg', + 'title': 'Hoe een bladvlo dit verwoestende Japanse onkruid moet vernietigen', + 'timestamp': 1586577474, + 'upload_date': '20200411', + 'age_limit': 0, + 'duration': 111.0, + } + }, ] def report_following_redirect(self, new_url): @@ -2536,66 +2647,44 @@ class GenericIE(InfoExtractor): self._downloader.write_debug(f'Identified a {name}') def _extract_rss(self, url, video_id, doc): - playlist_title = doc.find('./channel/title').text - playlist_desc_el = doc.find('./channel/description') - playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text - NS_MAP = { 'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd', } entries = [] for it in doc.findall('./channel/item'): - next_url = None - enclosure_nodes = it.findall('./enclosure') - for e in enclosure_nodes: - next_url = e.attrib.get('url') - if next_url: - break - - if not next_url: - next_url = xpath_text(it, 'link', fatal=False) - + next_url = next( + (e.attrib.get('url') for e in it.findall('./enclosure')), + xpath_text(it, 'link', fatal=False)) if not next_url: continue - if it.find('guid').text is not None: - next_url = smuggle_url(next_url, {'force_videoid': it.find('guid').text}) + guid = try_call(lambda: it.find('guid').text) + if guid: + next_url = smuggle_url(next_url, {'force_videoid': guid}) def itunes(key): - return xpath_text( - it, xpath_with_ns('./itunes:%s' % key, NS_MAP), - default=None) - - duration = itunes('duration') - explicit = (itunes('explicit') or '').lower() - if explicit in ('true', 'yes'): - age_limit = 18 - elif explicit in ('false', 'no'): - age_limit = 0 - else: - age_limit = None + return xpath_text(it, xpath_with_ns(f'./itunes:{key}', NS_MAP), default=None) entries.append({ '_type': 'url_transparent', 'url': next_url, - 'title': it.find('title').text, + 'title': try_call(lambda: it.find('title').text), 'description': xpath_text(it, 'description', default=None), - 'timestamp': unified_timestamp( - xpath_text(it, 'pubDate', default=None)), - 'duration': 
int_or_none(duration) or parse_duration(duration), + 'timestamp': unified_timestamp(xpath_text(it, 'pubDate', default=None)), + 'duration': parse_duration(itunes('duration')), 'thumbnail': url_or_none(xpath_attr(it, xpath_with_ns('./itunes:image', NS_MAP), 'href')), 'episode': itunes('title'), 'episode_number': int_or_none(itunes('episode')), 'season_number': int_or_none(itunes('season')), - 'age_limit': age_limit, + 'age_limit': {'true': 18, 'yes': 18, 'false': 0, 'no': 0}.get((itunes('explicit') or '').lower()), }) return { '_type': 'playlist', 'id': url, - 'title': playlist_title, - 'description': playlist_desc, + 'title': try_call(lambda: doc.find('./channel/title').text), + 'description': try_call(lambda: doc.find('./channel/description').text), 'entries': entries, } @@ -2610,7 +2699,7 @@ class GenericIE(InfoExtractor): title = self._html_search_meta('DC.title', webpage, fatal=True) - camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg) + camtasia_url = urllib.parse.urljoin(url, camtasia_cfg) camtasia_cfg = self._download_xml( camtasia_url, video_id, note='Downloading camtasia configuration', @@ -2626,7 +2715,7 @@ class GenericIE(InfoExtractor): entries.append({ 'id': os.path.splitext(url_n.text.rpartition('/')[2])[0], 'title': f'{title} - {n.tag}', - 'url': compat_urlparse.urljoin(url, url_n.text), + 'url': urllib.parse.urljoin(url, url_n.text), 'duration': float_or_none(n.find('./duration').text), }) @@ -2678,7 +2767,7 @@ class GenericIE(InfoExtractor): if url.startswith('//'): return self.url_result(self.http_scheme() + url) - parsed_url = compat_urlparse.urlparse(url) + parsed_url = urllib.parse.urlparse(url) if not parsed_url.scheme: default_search = self.get_param('default_search') if default_search is None: @@ -2754,7 +2843,7 @@ class GenericIE(InfoExtractor): m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type) if m: self.report_detected('direct video link') - format_id = compat_str(m.group('format_id')) + format_id = str(m.group('format_id')) subtitles = {} if format_id.endswith('mpegurl'): formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4') @@ -2873,7 +2962,7 @@ class GenericIE(InfoExtractor): # Unescaping the whole page allows to handle those cases in a generic way # FIXME: unescaping the whole page may break URLs, commenting out for now. # There probably should be a second run of generic extractor on unescaped webpage. 
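            # (On the compat cleanup in this hunk: on Python 3 these shims are
            # plain aliases of the stdlib, e.g.
            #     import urllib.parse
            #     urllib.parse.unquote('a%20b')              # -> 'a b'
            #     urllib.parse.urljoin('https://x/a/', 'b')  # -> 'https://x/a/b'
            # so replacing compat_urllib_parse_unquote and compat_urlparse with
            # urllib.parse throughout this file is a pure rename.)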
- # webpage = compat_urllib_parse_unquote(webpage) + # webpage = urllib.parse.unquote(webpage) # Unescape squarespace embeds to be detected by generic extractor, # see https://github.com/ytdl-org/youtube-dl/issues/21294 @@ -2975,7 +3064,7 @@ class GenericIE(InfoExtractor): if vimeo_urls: return self.playlist_from_matches(vimeo_urls, video_id, video_title, ie=VimeoIE.ie_key()) - vhx_url = VHXEmbedIE._extract_url(webpage) + vhx_url = VHXEmbedIE._extract_url(url, webpage) if vhx_url: return self.url_result(vhx_url, VHXEmbedIE.ie_key()) @@ -3023,6 +3112,7 @@ class GenericIE(InfoExtractor): wistia_urls = WistiaIE._extract_urls(webpage) if wistia_urls: playlist = self.playlist_from_matches(wistia_urls, video_id, video_title, ie=WistiaIE.ie_key()) + playlist['entries'] = list(playlist['entries']) for entry in playlist['entries']: entry.update({ '_type': 'url_transparent', @@ -3042,6 +3132,11 @@ class GenericIE(InfoExtractor): # Don't set the extractor because it can be a track url or an album return self.url_result(burl) + # Check for Substack custom domains + substack_url = SubstackIE._extract_url(webpage, url) + if substack_url: + return self.url_result(substack_url, SubstackIE) + # Look for embedded Vevo player mobj = re.search( r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage) @@ -3140,7 +3235,7 @@ class GenericIE(InfoExtractor): return self.url_result(mobj.group('url')) mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage) if mobj is not None: - return self.url_result(compat_urllib_parse_unquote(mobj.group('url'))) + return self.url_result(urllib.parse.unquote(mobj.group('url'))) # Look for funnyordie embed matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage) @@ -3393,7 +3488,7 @@ class GenericIE(InfoExtractor): r'<iframe[^>]+src="(?:https?:)?(?P<url>%s)"' % UDNEmbedIE._PROTOCOL_RELATIVE_VALID_URL, webpage) if mobj is not None: return self.url_result( - compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed') + urllib.parse.urljoin(url, mobj.group('url')), 'UDNEmbed') # Look for Senate ISVP iframe senate_isvp_url = SenateISVPIE._search_iframe_url(webpage) @@ -3626,7 +3721,7 @@ class GenericIE(InfoExtractor): if mediasite_urls: entries = [ self.url_result(smuggle_url( - compat_urlparse.urljoin(url, mediasite_url), + urllib.parse.urljoin(url, mediasite_url), {'UrlReferrer': url}), ie=MediasiteIE.ie_key()) for mediasite_url in mediasite_urls] return self.playlist_result(entries, video_id, video_title) @@ -3762,6 +3857,11 @@ class GenericIE(InfoExtractor): if ruutu_urls: return self.playlist_from_matches(ruutu_urls, video_id, video_title) + # Look for Tiktok embeds + tiktok_urls = TikTokIE._extract_urls(webpage) + if tiktok_urls: + return self.playlist_from_matches(tiktok_urls, video_id, video_title) + # Look for HTML5 media entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') if entries: @@ -3816,11 +3916,11 @@ class GenericIE(InfoExtractor): subtitles = {} for source in sources: src = source.get('src') - if not src or not isinstance(src, compat_str): + if not src or not isinstance(src, str): continue - src = compat_urlparse.urljoin(url, src) + src = urllib.parse.urljoin(url, src) src_type = source.get('type') - if isinstance(src_type, compat_str): + if isinstance(src_type, str): src_type = src_type.lower() ext = determine_ext(src).lower() if src_type == 'video/youtube': @@ -3854,7 +3954,7 @@ class GenericIE(InfoExtractor): if not 
src: continue subtitles.setdefault(dict_get(sub, ('language', 'srclang')) or 'und', []).append({ - 'url': compat_urlparse.urljoin(url, src), + 'url': urllib.parse.urljoin(url, src), 'name': sub.get('label'), 'http_headers': { 'Referer': full_response.geturl(), @@ -3871,22 +3971,17 @@ class GenericIE(InfoExtractor): json_ld = self._search_json_ld(webpage, video_id, default={}) if json_ld.get('url') not in (url, None): self.report_detected('JSON LD') - if determine_ext(json_ld['url']) == 'm3u8': - json_ld['formats'], json_ld['subtitles'] = self._extract_m3u8_formats_and_subtitles( - json_ld['url'], video_id, 'mp4') - json_ld.pop('url') - self._sort_formats(json_ld['formats']) - else: - json_ld['_type'] = 'url_transparent' - json_ld['url'] = smuggle_url(json_ld['url'], {'force_videoid': video_id, 'to_generic': True}) - return merge_dicts(json_ld, info_dict) + return merge_dicts({ + '_type': 'url_transparent', + 'url': smuggle_url(json_ld['url'], {'force_videoid': video_id, 'to_generic': True}), + }, json_ld, info_dict) def check_video(vurl): if YoutubeIE.suitable(vurl): return True if RtmpIE.suitable(vurl): return True - vpath = compat_urlparse.urlparse(vurl).path + vpath = urllib.parse.urlparse(vurl).path vext = determine_ext(vpath, None) return vext not in (None, 'swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js', 'xml') @@ -4014,7 +4109,7 @@ class GenericIE(InfoExtractor): if refresh_header: found = re.search(REDIRECT_REGEX, refresh_header) if found: - new_url = compat_urlparse.urljoin(url, unescapeHTML(found.group(1))) + new_url = urllib.parse.urljoin(url, unescapeHTML(found.group(1))) if new_url != url: self.report_following_redirect(new_url) return { @@ -4040,8 +4135,8 @@ class GenericIE(InfoExtractor): for video_url in orderedSet(found): video_url = unescapeHTML(video_url) video_url = video_url.replace('\\/', '/') - video_url = compat_urlparse.urljoin(url, video_url) - video_id = compat_urllib_parse_unquote(os.path.basename(video_url)) + video_url = urllib.parse.urljoin(url, video_url) + video_id = urllib.parse.unquote(os.path.basename(video_url)) # Sometimes, jwplayer extraction will result in a YouTube URL if YoutubeIE.suitable(video_url): diff --git a/yt_dlp/extractor/giga.py b/yt_dlp/extractor/giga.py index 9e835a6da..e728598f7 100644 --- a/yt_dlp/extractor/giga.py +++ b/yt_dlp/extractor/giga.py @@ -1,13 +1,8 @@ import itertools from .common import InfoExtractor -from ..utils import ( - qualities, - compat_str, - parse_duration, - parse_iso8601, - str_to_int, -) +from ..compat import compat_str +from ..utils import parse_duration, parse_iso8601, qualities, str_to_int class GigaIE(InfoExtractor): diff --git a/yt_dlp/extractor/googledrive.py b/yt_dlp/extractor/googledrive.py index c0905f86a..d7475b6da 100644 --- a/yt_dlp/extractor/googledrive.py +++ b/yt_dlp/extractor/googledrive.py @@ -264,7 +264,7 @@ class GoogleDriveIE(InfoExtractor): subtitles_id = ttsurl.encode('utf-8').decode( 'unicode_escape').split('=')[-1] - self._downloader.cookiejar.clear(domain='.google.com', path='/', name='NID') + self.cookiejar.clear(domain='.google.com', path='/', name='NID') return { 'id': video_id, @@ -276,3 +276,59 @@ class GoogleDriveIE(InfoExtractor): 'automatic_captions': self.extract_automatic_captions( video_id, subtitles_id, hl), } + + +class GoogleDriveFolderIE(InfoExtractor): + IE_NAME = 'GoogleDrive:Folder' + _VALID_URL = r'https?://(?:docs|drive)\.google\.com/drive/folders/(?P<id>[\w-]{28,})' + _TESTS = [{ + 'url': 
'https://drive.google.com/drive/folders/1dQ4sx0-__Nvg65rxTSgQrl7VyW_FZ9QI', + 'info_dict': { + 'id': '1dQ4sx0-__Nvg65rxTSgQrl7VyW_FZ9QI', + 'title': 'Forrest' + }, + 'playlist_count': 3, + }] + _BOUNDARY = '=====vc17a3rwnndj=====' + _REQUEST = "/drive/v2beta/files?openDrive=true&reason=102&syncType=0&errorRecovery=false&q=trashed%20%3D%20false%20and%20'{folder_id}'%20in%20parents&fields=kind%2CnextPageToken%2Citems(kind%2CmodifiedDate%2CmodifiedByMeDate%2ClastViewedByMeDate%2CfileSize%2Cowners(kind%2CpermissionId%2Cid)%2ClastModifyingUser(kind%2CpermissionId%2Cid)%2ChasThumbnail%2CthumbnailVersion%2Ctitle%2Cid%2CresourceKey%2Cshared%2CsharedWithMeDate%2CuserPermission(role)%2CexplicitlyTrashed%2CmimeType%2CquotaBytesUsed%2Ccopyable%2CfileExtension%2CsharingUser(kind%2CpermissionId%2Cid)%2Cspaces%2Cversion%2CteamDriveId%2ChasAugmentedPermissions%2CcreatedDate%2CtrashingUser(kind%2CpermissionId%2Cid)%2CtrashedDate%2Cparents(id)%2CshortcutDetails(targetId%2CtargetMimeType%2CtargetLookupStatus)%2Ccapabilities(canCopy%2CcanDownload%2CcanEdit%2CcanAddChildren%2CcanDelete%2CcanRemoveChildren%2CcanShare%2CcanTrash%2CcanRename%2CcanReadTeamDrive%2CcanMoveTeamDriveItem)%2Clabels(starred%2Ctrashed%2Crestricted%2Cviewed))%2CincompleteSearch&appDataFilter=NO_APP_DATA&spaces=drive&pageToken={page_token}&maxResults=50&supportsTeamDrives=true&includeItemsFromAllDrives=true&corpora=default&orderBy=folder%2Ctitle_natural%20asc&retryCount=0&key={key} HTTP/1.1" + _DATA = f'''--{_BOUNDARY} +content-type: application/http +content-transfer-encoding: binary + +GET %s + +--{_BOUNDARY} +''' + + def _call_api(self, folder_id, key, data, **kwargs): + response = self._download_webpage( + 'https://clients6.google.com/batch/drive/v2beta', + folder_id, data=data.encode('utf-8'), + headers={ + 'Content-Type': 'text/plain;charset=UTF-8;', + 'Origin': 'https://drive.google.com', + }, query={ + '$ct': f'multipart/mixed; boundary="{self._BOUNDARY}"', + 'key': key + }, **kwargs) + return self._search_json('', response, 'api response', folder_id, **kwargs) or {} + + def _get_folder_items(self, folder_id, key): + page_token = '' + while page_token is not None: + request = self._REQUEST.format(folder_id=folder_id, page_token=page_token, key=key) + page = self._call_api(folder_id, key, self._DATA % request) + yield from page['items'] + page_token = page.get('nextPageToken') + + def _real_extract(self, url): + folder_id = self._match_id(url) + + webpage = self._download_webpage(url, folder_id) + key = self._search_regex(r'"(\w{39})"', webpage, 'key') + + folder_info = self._call_api(folder_id, key, self._DATA % f'/drive/v2beta/files/{folder_id} HTTP/1.1', fatal=False) + + return self.playlist_from_matches( + self._get_folder_items(folder_id, key), folder_id, folder_info.get('title'), + ie=GoogleDriveIE, getter=lambda item: f'https://drive.google.com/file/d/{item["id"]}') diff --git a/yt_dlp/extractor/hitbox.py b/yt_dlp/extractor/hitbox.py index a7e4424b6..6ecdd390c 100644 --- a/yt_dlp/extractor/hitbox.py +++ b/yt_dlp/extractor/hitbox.py @@ -1,13 +1,13 @@ import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( clean_html, - parse_iso8601, + determine_ext, float_or_none, int_or_none, - compat_str, - determine_ext, + parse_iso8601, ) diff --git a/yt_dlp/extractor/ina.py b/yt_dlp/extractor/ina.py index 56038f1ca..9e2c9cf47 100644 --- a/yt_dlp/extractor/ina.py +++ b/yt_dlp/extractor/ina.py @@ -1,23 +1,19 @@ from .common import InfoExtractor -from ..utils import ( - determine_ext, - 
int_or_none, - strip_or_none, - xpath_attr, - xpath_text, -) +from ..utils import unified_strdate class InaIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www|m)\.)?ina\.fr/(?:video|audio)/(?P<id>[A-Z0-9_]+)' + _VALID_URL = r'https?://(?:(?:www|m)\.)?ina\.fr/(?:[^/]+/)?(?:video|audio)/(?P<id>\w+)' _TESTS = [{ - 'url': 'http://www.ina.fr/video/I12055569/francois-hollande-je-crois-que-c-est-clair-video.html', - 'md5': 'a667021bf2b41f8dc6049479d9bb38a3', + 'url': 'https://www.ina.fr/video/I12055569/francois-hollande-je-crois-que-c-est-clair-video.html', + 'md5': 'c5a09e5cb5604ed10709f06e7a377dda', 'info_dict': { 'id': 'I12055569', 'ext': 'mp4', 'title': 'François Hollande "Je crois que c\'est clair"', - 'description': 'md5:3f09eb072a06cb286b8f7e4f77109663', + 'description': 'md5:08201f1c86fb250611f0ba415d21255a', + 'upload_date': '20070712', + 'thumbnail': 'https://cdn-hub.ina.fr/notice/690x517/3c4/I12055569.jpeg', } }, { 'url': 'https://www.ina.fr/video/S806544_001/don-d-organes-des-avancees-mais-d-importants-besoins-video.html', @@ -31,53 +27,37 @@ class InaIE(InfoExtractor): }, { 'url': 'http://m.ina.fr/video/I12055569', 'only_matching': True, + }, { + 'url': 'https://www.ina.fr/ina-eclaire-actu/video/cpb8205116303/les-jeux-electroniques', + 'md5': '4b8284a9a3a184fdc7e744225b8251e7', + 'info_dict': { + 'id': 'CPB8205116303', + 'ext': 'mp4', + 'title': 'Les jeux électroniques', + 'description': 'md5:e09f7683dad1cc60b74950490127d233', + 'upload_date': '19821204', + 'duration': 657, + 'thumbnail': 'https://cdn-hub.ina.fr/notice/690x517/203/CPB8205116303.jpeg', + } }] def _real_extract(self, url): - video_id = self._match_id(url) - info_doc = self._download_xml( - 'http://player.ina.fr/notices/%s.mrss' % video_id, video_id) - item = info_doc.find('channel/item') - title = xpath_text(item, 'title', fatal=True) - media_ns_xpath = lambda x: self._xpath_ns(x, 'http://search.yahoo.com/mrss/') - content = item.find(media_ns_xpath('content')) + video_id = self._match_id(url).upper() + webpage = self._download_webpage(url, video_id) - get_furl = lambda x: xpath_attr(content, media_ns_xpath(x), 'url') - formats = [] - for q, w, h in (('bq', 400, 300), ('mq', 512, 384), ('hq', 768, 576)): - q_url = get_furl(q) - if not q_url: - continue - formats.append({ - 'format_id': q, - 'url': q_url, - 'width': w, - 'height': h, - }) - if not formats: - furl = get_furl('player') or content.attrib['url'] - ext = determine_ext(furl) - formats = [{ - 'url': furl, - 'vcodec': 'none' if ext == 'mp3' else None, - 'ext': ext, - }] + api_url = self._html_search_regex( + r'asset-details-url\s*=\s*["\'](?P<api_url>[^"\']+)', + webpage, 'api_url').replace(video_id, f'{video_id}.json') - thumbnails = [] - for thumbnail in content.findall(media_ns_xpath('thumbnail')): - thumbnail_url = thumbnail.get('url') - if not thumbnail_url: - continue - thumbnails.append({ - 'url': thumbnail_url, - 'height': int_or_none(thumbnail.get('height')), - 'width': int_or_none(thumbnail.get('width')), - }) + api_response = self._download_json(api_url, video_id) return { 'id': video_id, - 'formats': formats, - 'title': title, - 'description': strip_or_none(xpath_text(item, 'description')), - 'thumbnails': thumbnails, + 'url': api_response['resourceUrl'], + 'ext': {'video': 'mp4', 'audio': 'mp3'}.get(api_response.get('type')), + 'title': api_response.get('title'), + 'description': api_response.get('description'), + 'upload_date': unified_strdate(api_response.get('dateOfBroadcast')), + 'duration': api_response.get('duration'), + 'thumbnail': 
api_response.get('resourceThumbnail'), } diff --git a/yt_dlp/extractor/instagram.py b/yt_dlp/extractor/instagram.py index 05000e2fb..5a824b500 100644 --- a/yt_dlp/extractor/instagram.py +++ b/yt_dlp/extractor/instagram.py @@ -410,7 +410,7 @@ class InstagramIE(InstagramBaseIE): if nodes: return self.playlist_result( self._extract_nodes(nodes, True), video_id, - format_field(username, template='Post by %s'), description) + format_field(username, None, 'Post by %s'), description) video_url = self._og_search_video_url(webpage, secure=False) diff --git a/yt_dlp/extractor/iqiyi.py b/yt_dlp/extractor/iqiyi.py index a0298f1a1..5c316687c 100644 --- a/yt_dlp/extractor/iqiyi.py +++ b/yt_dlp/extractor/iqiyi.py @@ -351,7 +351,7 @@ class IqIE(InfoExtractor): ''' def _extract_vms_player_js(self, webpage, video_id): - player_js_cache = self._downloader.cache.load('iq', 'player_js') + player_js_cache = self.cache.load('iq', 'player_js') if player_js_cache: return player_js_cache webpack_js_url = self._proto_relative_url(self._search_regex( @@ -364,7 +364,7 @@ class IqIE(InfoExtractor): f'https://stc.iqiyipic.com/_next/static/chunks/{webpack_map1.get(module_index, module_index)}.{webpack_map2[module_index]}.js', video_id, note=f'Downloading #{module_index} module JS', errnote='Unable to download module JS', fatal=False) or '' if 'vms request' in module_js: - self._downloader.cache.store('iq', 'player_js', module_js) + self.cache.store('iq', 'player_js', module_js) return module_js raise ExtractorError('Unable to extract player JS') @@ -440,7 +440,7 @@ class IqIE(InfoExtractor): preview_time = traverse_obj( initial_format_data, ('boss_ts', (None, 'data'), ('previewTime', 'rtime')), expected_type=float_or_none, get_all=False) if traverse_obj(initial_format_data, ('boss_ts', 'data', 'prv'), expected_type=int_or_none): - self.report_warning('This preview video is limited%s' % format_field(preview_time, template=' to %s seconds')) + self.report_warning('This preview video is limited%s' % format_field(preview_time, None, ' to %s seconds')) # TODO: Extract audio-only formats for bid in set(traverse_obj(initial_format_data, ('program', 'video', ..., 'bid'), expected_type=str_or_none, default=[])): diff --git a/yt_dlp/extractor/iwara.py b/yt_dlp/extractor/iwara.py index 4b88da35f..f77c5d44d 100644 --- a/yt_dlp/extractor/iwara.py +++ b/yt_dlp/extractor/iwara.py @@ -1,15 +1,16 @@ +import itertools import re -import urllib +import urllib.parse from .common import InfoExtractor from ..utils import ( int_or_none, mimetype2ext, remove_end, + strip_or_none, + unified_strdate, url_or_none, urljoin, - unified_strdate, - strip_or_none, ) @@ -171,37 +172,70 @@ class IwaraUserIE(IwaraBaseIE): IE_NAME = 'iwara:user' _TESTS = [{ - 'url': 'https://ecchi.iwara.tv/users/CuteMMD', + 'note': 'number of all videos page is just 1 page. less than 40 videos', + 'url': 'https://ecchi.iwara.tv/users/infinityyukarip', 'info_dict': { - 'id': 'CuteMMD', + 'title': 'Uploaded videos from Infinity_YukariP', + 'id': 'infinityyukarip', + 'uploader': 'Infinity_YukariP', + 'uploader_id': 'infinityyukarip', }, - 'playlist_mincount': 198, + 'playlist_mincount': 39, }, { - # urlencoded - 'url': 'https://ecchi.iwara.tv/users/%E5%92%95%E5%98%BF%E5%98%BF', + 'note': 'no even all videos page. 
probably less than 10 videos', + 'url': 'https://ecchi.iwara.tv/users/mmd-quintet', 'info_dict': { - 'id': '咕嘿嘿', + 'title': 'Uploaded videos from mmd quintet', + 'id': 'mmd-quintet', + 'uploader': 'mmd quintet', + 'uploader_id': 'mmd-quintet', }, - 'playlist_mincount': 141, + 'playlist_mincount': 6, + }, { + 'note': 'has paging. more than 40 videos', + 'url': 'https://ecchi.iwara.tv/users/theblackbirdcalls', + 'info_dict': { + 'title': 'Uploaded videos from TheBlackbirdCalls', + 'id': 'theblackbirdcalls', + 'uploader': 'TheBlackbirdCalls', + 'uploader_id': 'theblackbirdcalls', + }, + 'playlist_mincount': 420, + }, { + 'note': 'foreign chars in URL. there must be foreign characters in URL', + 'url': 'https://ecchi.iwara.tv/users/ぶた丼', + 'info_dict': { + 'title': 'Uploaded videos from ぶた丼', + 'id': 'ぶた丼', + 'uploader': 'ぶた丼', + 'uploader_id': 'ぶた丼', + }, + 'playlist_mincount': 170, }] - def _entries(self, playlist_id, base_url, webpage): - yield from self._extract_playlist(base_url, webpage) - - page_urls = re.findall( - r'class="pager-item"[^>]*>\s*<a[^<]+href="([^"]+)', webpage) - - for n, path in enumerate(page_urls, 2): + def _entries(self, playlist_id, base_url): + webpage = self._download_webpage( + f'{base_url}/users/{playlist_id}', playlist_id) + videos_url = self._search_regex(r'<a href="(/users/[^/]+/videos)(?:\?[^"]+)?">', webpage, 'all videos url', default=None) + if not videos_url: + yield from self._extract_playlist(base_url, webpage) + return + + videos_url = urljoin(base_url, videos_url) + + for n in itertools.count(1): + page = self._download_webpage( + videos_url, playlist_id, note=f'Downloading playlist page {n}', + query={'page': str(n - 1)} if n > 1 else {}) yield from self._extract_playlist( - base_url, self._download_webpage( - urljoin(base_url, path), playlist_id, note=f'Downloading playlist page {n}')) + base_url, page) + + if f'page={n}' not in page: + break def _real_extract(self, url): playlist_id, base_url = self._match_valid_url(url).group('id', 'base_url') playlist_id = urllib.parse.unquote(playlist_id) - webpage = self._download_webpage( - f'{base_url}/users/{playlist_id}/videos', playlist_id) - return self.playlist_result( - self._entries(playlist_id, base_url, webpage), playlist_id) + self._entries(playlist_id, base_url), playlist_id) diff --git a/yt_dlp/extractor/ixigua.py b/yt_dlp/extractor/ixigua.py new file mode 100644 index 000000000..163edf480 --- /dev/null +++ b/yt_dlp/extractor/ixigua.py @@ -0,0 +1,84 @@ +import base64 + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + get_element_by_id, + int_or_none, + js_to_json, + str_or_none, + traverse_obj, +) + + +class IxiguaIE(InfoExtractor): + _VALID_URL = r'https?://(?:\w+\.)?ixigua\.com/(?:video/)?(?P<id>\d+).+' + _TESTS = [{ + 'url': 'https://www.ixigua.com/6996881461559165471', + 'info_dict': { + 'id': '6996881461559165471', + 'ext': 'mp4', + 'title': '盲目涉水风险大,亲身示范高水位行车注意事项', + 'description': 'md5:8c82f46186299add4a1c455430740229', + 'tags': ['video_car'], + 'like_count': int, + 'dislike_count': int, + 'view_count': int, + 'uploader': '懂车帝原创', + 'uploader_id': '6480145787', + 'thumbnail': r're:^https?://.+\.(avif|webp)', + 'timestamp': 1629088414, + 'duration': 1030, + } + }] + + def _get_json_data(self, webpage, video_id): + js_data = get_element_by_id('SSR_HYDRATED_DATA', webpage) + if not js_data: + if self._cookies_passed: + raise ExtractorError('Failed to get SSR_HYDRATED_DATA') + raise ExtractorError('Cookies (not necessarily logged in) are needed', expected=True) + + 
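        # The page is assumed to embed the payload as
        #     <script id="SSR_HYDRATED_DATA">window._SSR_HYDRATED_DATA={...}</script>
        # so stripping the assignment prefix below leaves a bare JS object
        # literal, which js_to_json() converts into parseable JSON: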
return self._parse_json( + js_data.replace('window._SSR_HYDRATED_DATA=', ''), video_id, transform_source=js_to_json) + + def _media_selector(self, json_data): + for path, override in ( + (('video_list', ), {}), + (('dynamic_video', 'dynamic_video_list'), {'acodec': 'none'}), + (('dynamic_video', 'dynamic_audio_list'), {'vcodec': 'none', 'ext': 'm4a'}), + ): + for media in traverse_obj(json_data, (..., *path, lambda _, v: v['main_url'])): + yield { + 'url': base64.b64decode(media['main_url']).decode(), + 'width': int_or_none(media.get('vwidth')), + 'height': int_or_none(media.get('vheight')), + 'fps': int_or_none(media.get('fps')), + 'vcodec': media.get('codec_type'), + 'format_id': str_or_none(media.get('quality_type')), + 'filesize': int_or_none(media.get('size')), + 'ext': 'mp4', + **override, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + json_data = self._get_json_data(webpage, video_id)['anyVideo']['gidInformation']['packerData']['video'] + + formats = list(self._media_selector(json_data.get('videoResource'))) + self._sort_formats(formats) + return { + 'id': video_id, + 'title': json_data.get('title'), + 'description': json_data.get('video_abstract'), + 'formats': formats, + 'like_count': json_data.get('video_like_count'), + 'duration': int_or_none(json_data.get('duration')), + 'tags': [json_data.get('tag')], + 'uploader_id': traverse_obj(json_data, ('user_info', 'user_id')), + 'uploader': traverse_obj(json_data, ('user_info', 'name')), + 'view_count': json_data.get('video_watch_count'), + 'dislike_count': json_data.get('video_unlike_count'), + 'timestamp': int_or_none(json_data.get('video_publish_time')), + } diff --git a/yt_dlp/extractor/joj.py b/yt_dlp/extractor/joj.py index a01411be1..1c4676e95 100644 --- a/yt_dlp/extractor/joj.py +++ b/yt_dlp/extractor/joj.py @@ -70,7 +70,7 @@ class JojIE(InfoExtractor): r'(\d+)[pP]\.', format_url, 'height', default=None) formats.append({ 'url': format_url, - 'format_id': format_field(height, template='%sp'), + 'format_id': format_field(height, None, '%sp'), 'height': int(height), }) if not formats: diff --git a/yt_dlp/extractor/jwplatform.py b/yt_dlp/extractor/jwplatform.py index 8dbbb2926..2cb7ca3d7 100644 --- a/yt_dlp/extractor/jwplatform.py +++ b/yt_dlp/extractor/jwplatform.py @@ -5,7 +5,7 @@ from ..utils import unsmuggle_url class JWPlatformIE(InfoExtractor): - _VALID_URL = r'(?:https?://(?:content\.jwplatform|cdn\.jwplayer)\.com/(?:(?:feed|player|thumb|preview)s|jw6|v2/media)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})' + _VALID_URL = r'(?:https?://(?:content\.jwplatform|cdn\.jwplayer)\.com/(?:(?:feed|player|thumb|preview|manifest)s|jw6|v2/media)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})' _TESTS = [{ 'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js', 'md5': 'fa8899fa601eb7c83a64e9d568bdf325', @@ -37,6 +37,9 @@ class JWPlatformIE(InfoExtractor): webpage) if ret: return ret + mobj = re.search(r'<div\b[^>]* data-video-jw-id="([a-zA-Z0-9]{8})"', webpage) + if mobj: + return [f'jwplatform:{mobj.group(1)}'] def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) diff --git a/yt_dlp/extractor/kaltura.py b/yt_dlp/extractor/kaltura.py index afad279bd..f4092aa71 100644 --- a/yt_dlp/extractor/kaltura.py +++ b/yt_dlp/extractor/kaltura.py @@ -382,5 +382,5 @@ class KalturaIE(InfoExtractor): 'duration': info.get('duration'), 'timestamp': info.get('createdAt'), 'uploader_id': format_field(info, 'userId', ignore=('None', None)), - 'view_count': 
info.get('plays'), + 'view_count': int_or_none(info.get('plays')), } diff --git a/yt_dlp/extractor/keezmovies.py b/yt_dlp/extractor/keezmovies.py index 79f9c7fa7..1c2d5c01c 100644 --- a/yt_dlp/extractor/keezmovies.py +++ b/yt_dlp/extractor/keezmovies.py @@ -68,7 +68,7 @@ class KeezMoviesIE(InfoExtractor): video_url, title, 32).decode('utf-8') formats.append({ 'url': format_url, - 'format_id': format_field(height, template='%dp'), + 'format_id': format_field(height, None, '%dp'), 'height': height, 'tbr': tbr, }) diff --git a/yt_dlp/extractor/kicker.py b/yt_dlp/extractor/kicker.py new file mode 100644 index 000000000..a2c7dd4e8 --- /dev/null +++ b/yt_dlp/extractor/kicker.py @@ -0,0 +1,55 @@ +from .common import InfoExtractor +from .dailymotion import DailymotionIE + + +class KickerIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)kicker\.(?:de)/(?P<id>[\w-]+)/video' + _TESTS = [{ + 'url': 'https://www.kicker.de/pogba-dembel-co-die-top-11-der-abloesefreien-spieler-905049/video', + 'info_dict': { + 'id': 'km04mrK0DrRAVxy2GcA', + 'title': 'md5:b91d145bac5745ac58d5479d8347a875', + 'ext': 'mp4', + 'duration': 350, + 'description': 'md5:a5a3dd77dbb6550dbfb997be100b9998', + 'uploader_id': 'x2dfupo', + 'timestamp': 1654677626, + 'like_count': int, + 'uploader': 'kicker.de', + 'view_count': int, + 'age_limit': 0, + 'thumbnail': r're:https://s\d+\.dmcdn\.net/v/T-x741YeYAx8aSZ0Z/x1080', + 'tags': ['published', 'category.InternationalSoccer'], + 'upload_date': '20220608' + } + }, { + 'url': 'https://www.kicker.de/ex-unioner-in-der-bezirksliga-felix-kroos-vereinschallenge-in-pankow-902825/video', + 'info_dict': { + 'id': 'k2omNsJKdZ3TxwxYSFJ', + 'title': 'md5:72ec24d7f84b8436fe1e89d198152adf', + 'ext': 'mp4', + 'uploader_id': 'x2dfupo', + 'duration': 331, + 'timestamp': 1652966015, + 'thumbnail': r're:https?://s\d+\.dmcdn\.net/v/TxU4Z1YYCmtisTbMq/x1080', + 'tags': ['FELIX KROOS', 'EINFACH MAL LUPPEN', 'KROOS', 'FSV FORTUNA PANKOW', 'published', 'category.Amateurs', 'marketingpreset.Spreekick'], + 'age_limit': 0, + 'view_count': int, + 'upload_date': '20220519', + 'uploader': 'kicker.de', + 'description': 'md5:0c2060c899a91c8bf40f578f78c5846f', + 'like_count': int, + } + }] + + def _real_extract(self, url): + video_slug = self._match_id(url) + + webpage = self._download_webpage(url, video_slug) + dailymotion_video_id = self._search_regex( + r'data-dmprivateid\s*=\s*[\'"](?P<video_id>\w+)', webpage, + 'video id', group='video_id') + + return self.url_result( + f'https://www.dailymotion.com/video/{dailymotion_video_id}', + ie=DailymotionIE, video_title=self._html_extract_title(webpage)) diff --git a/yt_dlp/extractor/kth.py b/yt_dlp/extractor/kth.py new file mode 100644 index 000000000..e17c6db91 --- /dev/null +++ b/yt_dlp/extractor/kth.py @@ -0,0 +1,28 @@ +from .common import InfoExtractor +from ..utils import smuggle_url + + +class KTHIE(InfoExtractor): + _VALID_URL = r'https?://play\.kth\.se/(?:[^/]+/)+(?P<id>[a-z0-9_]+)' + _TEST = { + 'url': 'https://play.kth.se/media/Lunch+breakA+De+nya+aff%C3%A4rerna+inom+Fordonsdalen/0_uoop6oz9', + 'md5': 'd83ada6d00ca98b73243a88efe19e8a6', + 'info_dict': { + 'id': '0_uoop6oz9', + 'ext': 'mp4', + 'title': 'md5:bd1d6931facb6828762a33e6ce865f37', + 'thumbnail': 're:https?://.+/thumbnail/.+', + 'duration': 3516, + 'timestamp': 1647345358, + 'upload_date': '20220315', + 'uploader_id': 'md5:0ec23e33a89e795a4512930c8102509f', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + result = self.url_result( + smuggle_url('kaltura:308:%s' % video_id, 
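            # ('kaltura:<partner_id>:<entry_id>' is KalturaIE's URL scheme, so
            # the 308 below reads as KTH's Kaltura partner id, an inference
            # from the scheme rather than anything stated in this file; the
            # smuggled service_url then points the Kaltura extractor at the
            # api.kaltura.nordu.net backend instead of the default host.)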
{ + 'service_url': 'https://api.kaltura.nordu.net'}), + 'Kaltura') + return result diff --git a/yt_dlp/extractor/kusi.py b/yt_dlp/extractor/kusi.py index f1221ef1b..4fec2c2b2 100644 --- a/yt_dlp/extractor/kusi.py +++ b/yt_dlp/extractor/kusi.py @@ -1,10 +1,10 @@ import random +import urllib.parse from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote_plus from ..utils import ( - int_or_none, float_or_none, + int_or_none, timeconvert, update_url_query, xpath_text, @@ -66,7 +66,7 @@ class KUSIIE(InfoExtractor): formats = [] for quality in quality_options: formats.append({ - 'url': compat_urllib_parse_unquote_plus(quality.attrib['url']), + 'url': urllib.parse.unquote_plus(quality.attrib['url']), 'height': int_or_none(quality.attrib.get('height')), 'width': int_or_none(quality.attrib.get('width')), 'vbr': float_or_none(quality.attrib.get('bitratebits'), scale=1000), diff --git a/yt_dlp/extractor/lastfm.py b/yt_dlp/extractor/lastfm.py index 7ba666d06..f14198cfd 100644 --- a/yt_dlp/extractor/lastfm.py +++ b/yt_dlp/extractor/lastfm.py @@ -15,7 +15,7 @@ class LastFMPlaylistBaseIE(InfoExtractor): for page_number in range(start_page_number, (last_page_number or start_page_number) + 1): webpage = self._download_webpage( url, playlist_id, - note='Downloading page %d%s' % (page_number, format_field(last_page_number, template=' of %d')), + note='Downloading page %d%s' % (page_number, format_field(last_page_number, None, ' of %d')), query={'page': page_number}) page_entries = [ self.url_result(player_url, 'Youtube') diff --git a/yt_dlp/extractor/lbry.py b/yt_dlp/extractor/lbry.py index 953ce2e18..909720e8b 100644 --- a/yt_dlp/extractor/lbry.py +++ b/yt_dlp/extractor/lbry.py @@ -192,10 +192,11 @@ class LBRYIE(LBRYBaseIE): claim_id, is_live = result['signing_channel']['claim_id'], True headers = {'referer': 'https://player.odysee.live/'} live_data = self._download_json( - f'https://api.live.odysee.com/v1/odysee/live/{claim_id}', claim_id, + 'https://api.odysee.live/livestream/is_live', claim_id, + query={'channel_claim_id': claim_id}, note='Downloading livestream JSON metadata')['data'] - streaming_url = final_url = live_data.get('url') - if not final_url and not live_data.get('live'): + streaming_url = final_url = live_data.get('VideoURL') + if not final_url and not live_data.get('Live'): self.raise_no_formats('This stream is not live', True, claim_id) else: raise UnsupportedError(url) diff --git a/yt_dlp/extractor/line.py b/yt_dlp/extractor/line.py index 63b6c002a..09c512e50 100644 --- a/yt_dlp/extractor/line.py +++ b/yt_dlp/extractor/line.py @@ -34,7 +34,7 @@ class LineLiveBaseIE(InfoExtractor): 'timestamp': int_or_none(item.get('createdAt')), 'channel': channel.get('name'), 'channel_id': channel_id, - 'channel_url': format_field(channel_id, template='https://live.line.me/channels/%s'), + 'channel_url': format_field(channel_id, None, 'https://live.line.me/channels/%s'), 'duration': int_or_none(item.get('archiveDuration')), 'view_count': int_or_none(item.get('viewerCount')), 'comment_count': int_or_none(item.get('chatCount')), diff --git a/yt_dlp/extractor/lnkgo.py b/yt_dlp/extractor/lnkgo.py index 3bb52777f..9ea08ec5a 100644 --- a/yt_dlp/extractor/lnkgo.py +++ b/yt_dlp/extractor/lnkgo.py @@ -1,7 +1,7 @@ from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( clean_html, - compat_str, format_field, int_or_none, parse_iso8601, diff --git a/yt_dlp/extractor/medaltv.py b/yt_dlp/extractor/medaltv.py index 527b50cb0..5f0a9b42f 100644 --- 
a/yt_dlp/extractor/medaltv.py +++ b/yt_dlp/extractor/medaltv.py @@ -116,7 +116,7 @@ class MedalTVIE(InfoExtractor): author = try_get( hydration_data, lambda x: list(x['profiles'].values())[0], dict) or {} author_id = str_or_none(author.get('id')) - author_url = format_field(author_id, template='https://medal.tv/users/%s') + author_url = format_field(author_id, None, 'https://medal.tv/users/%s') return { 'id': video_id, diff --git a/yt_dlp/extractor/mediaset.py b/yt_dlp/extractor/mediaset.py index 60c454dda..f396c1bd3 100644 --- a/yt_dlp/extractor/mediaset.py +++ b/yt_dlp/extractor/mediaset.py @@ -20,10 +20,10 @@ class MediasetIE(ThePlatformBaseIE): (?: mediaset:| https?:// - (?:(?:www|static3)\.)?mediasetplay\.mediaset\.it/ + (?:\w+\.)+mediaset\.it/ (?: (?:video|on-demand|movie)/(?:[^/]+/)+[^/]+_| - player/index\.html\?.*?\bprogramGuid= + player/(?:v\d+/)?index\.html\?.*?\bprogramGuid= ) )(?P<id>[0-9A-Z]{16,}) ''' @@ -159,6 +159,12 @@ class MediasetIE(ThePlatformBaseIE): }, { 'url': 'https://www.mediasetplay.mediaset.it/movie/herculeslaleggendahainizio/hercules-la-leggenda-ha-inizio_F305927501000102', 'only_matching': True, + }, { + 'url': 'https://mediasetinfinity.mediaset.it/video/braveandbeautiful/episodio-113_F310948005000402', + 'only_matching': True, + }, { + 'url': 'https://static3.mediasetplay.mediaset.it/player/v2/index.html?partnerId=wittytv&configId=&programGuid=FD00000000153323', + 'only_matching': True, }] @staticmethod @@ -286,7 +292,7 @@ class MediasetShowIE(MediasetIE): _VALID_URL = r'''(?x) (?: https?:// - (?:(?:www|static3)\.)?mediasetplay\.mediaset\.it/ + (\w+\.)+mediaset\.it/ (?: (?:fiction|programmi-tv|serie-tv|kids)/(?:.+?/)? (?:[a-z-]+)_SE(?P<id>\d{12}) diff --git a/yt_dlp/extractor/metacafe.py b/yt_dlp/extractor/metacafe.py index 31fec86d2..048c74e68 100644 --- a/yt_dlp/extractor/metacafe.py +++ b/yt_dlp/extractor/metacafe.py @@ -1,17 +1,14 @@ import json import re +import urllib.parse from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_urllib_parse, - compat_urllib_parse_unquote, -) +from ..compat import compat_parse_qs, compat_urllib_parse_unquote from ..utils import ( - determine_ext, ExtractorError, - int_or_none, + determine_ext, get_element_by_attribute, + int_or_none, mimetype2ext, ) @@ -143,7 +140,7 @@ class MetacafeIE(InfoExtractor): headers = { # Disable family filter - 'Cookie': 'user=%s; ' % compat_urllib_parse.quote(json.dumps({'ffilter': False})) + 'Cookie': 'user=%s; ' % urllib.parse.quote(json.dumps({'ffilter': False})) } # AnyClip videos require the flashversion cookie so that we get the link diff --git a/yt_dlp/extractor/minds.py b/yt_dlp/extractor/minds.py index 393d20604..8079bbb39 100644 --- a/yt_dlp/extractor/minds.py +++ b/yt_dlp/extractor/minds.py @@ -118,7 +118,7 @@ class MindsIE(MindsBaseIE): 'timestamp': int_or_none(entity.get('time_created')), 'uploader': strip_or_none(owner.get('name')), 'uploader_id': uploader_id, - 'uploader_url': format_field(uploader_id, template='https://www.minds.com/%s'), + 'uploader_url': format_field(uploader_id, None, 'https://www.minds.com/%s'), 'view_count': int_or_none(entity.get('play:count')), 'like_count': int_or_none(entity.get('thumbs:up:count')), 'dislike_count': int_or_none(entity.get('thumbs:down:count')), diff --git a/yt_dlp/extractor/mirrorcouk.py b/yt_dlp/extractor/mirrorcouk.py new file mode 100644 index 000000000..7b4f95b4b --- /dev/null +++ b/yt_dlp/extractor/mirrorcouk.py @@ -0,0 +1,98 @@ +from .common import InfoExtractor +from ..utils import unescapeHTML + + 
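# Inferred design of the extractor below: mirror.co.uk pages carry their video
# metadata in a <div class="json-placeholder" data-json="..."> attribute; the
# blob is HTML-unescaped, videoData.videoId is read out of it, and extraction
# is delegated to JWPlatform via a url_transparent 'jwplatform:<id>' result
# rather than parsing stream manifests here.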
+class MirrorCoUKIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?mirror\.co\.uk/[/+[\w-]+-(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.mirror.co.uk/tv/tv-news/love-island-fans-baffled-after-27163139', + 'info_dict': { + 'id': 'voyyS7SV', + 'ext': 'mp4', + 'title': 'Love Island: Gemma Owen enters the villa', + 'description': 'Love Island: Michael Owen\'s daughter Gemma Owen enters the villa.', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/voyyS7SV/poster.jpg?width=720', + 'display_id': '27163139', + 'timestamp': 1654547895, + 'duration': 57.0, + 'upload_date': '20220606', + }, + }, { + 'url': 'https://www.mirror.co.uk/3am/celebrity-news/michael-jacksons-son-blankets-new-25344890', + 'info_dict': { + 'id': 'jyXpdvxp', + 'ext': 'mp4', + 'title': 'Michael Jackson’s son Bigi calls for action on climate change', + 'description': 'md5:d39ceaba2b7a615b4ca6557e7bc40222', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/jyXpdvxp/poster.jpg?width=720', + 'display_id': '25344890', + 'timestamp': 1635749907, + 'duration': 56.0, + 'upload_date': '20211101', + }, + }, { + 'url': 'https://www.mirror.co.uk/sport/football/news/antonio-conte-next-tottenham-manager-25346042', + 'info_dict': { + 'id': 'q6FkKa4p', + 'ext': 'mp4', + 'title': 'Nuno sacked by Tottenham after fifth Premier League defeat of the season', + 'description': 'Nuno Espirito Santo has been sacked as Tottenham boss after only four months in charge.', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/q6FkKa4p/poster.jpg?width=720', + 'display_id': '25346042', + 'timestamp': 1635763157, + 'duration': 40.0, + 'upload_date': '20211101', + }, + }, { + 'url': 'https://www.mirror.co.uk/3am/celebrity-news/johnny-depp-splashes-50k-curry-27160737', + 'info_dict': { + 'id': 'IT0oa1nH', + 'ext': 'mp4', + 'title': 'Johnny Depp Leaves The Grand Hotel in Birmingham', + 'description': 'Johnny Depp Leaves The Grand Hotel in Birmingham.', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/IT0oa1nH/poster.jpg?width=720', + 'display_id': '27160737', + 'timestamp': 1654524120, + 'duration': 65.0, + 'upload_date': '20220606', + }, + }, { + 'url': 'https://www.mirror.co.uk/tv/tv-news/love-islands-liam-could-first-27162602', + 'info_dict': { + 'id': 'EaPr5Z2j', + 'ext': 'mp4', + 'title': 'Love Island: Davide reveals plot twist after receiving text', + 'description': 'Love Island: Davide reveals plot twist after receiving text', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/EaPr5Z2j/poster.jpg?width=720', + 'display_id': '27162602', + 'timestamp': 1654552597, + 'duration': 23.0, + 'upload_date': '20220606', + }, + }, { + 'url': 'https://www.mirror.co.uk/news/uk-news/william-kate-sent-message-george-27160572', + 'info_dict': { + 'id': 'ygtceXIu', + 'ext': 'mp4', + 'title': 'Prince William and Kate arrive in Wales with George and Charlotte', + 'description': 'Prince William and Kate Middleton arrive in Wales with children Prince George and Princess Charlotte.', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/ygtceXIu/poster.jpg?width=720', + 'display_id': '27160572', + 'timestamp': 1654349678, + 'duration': 106.0, + 'upload_date': '20220604', + }, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + data = self._search_json(r'div\s+class="json-placeholder"\s+data-json="', + webpage, 'data', display_id, transform_source=unescapeHTML)['videoData'] + + return { + '_type': 'url_transparent', + 'url': f'jwplatform:{data["videoId"]}', + 'ie_key': 'JWPlatform', + 'display_id': 
display_id, + } diff --git a/yt_dlp/extractor/mixcloud.py b/yt_dlp/extractor/mixcloud.py index 796f268f4..a77d7e682 100644 --- a/yt_dlp/extractor/mixcloud.py +++ b/yt_dlp/extractor/mixcloud.py @@ -3,7 +3,6 @@ import itertools from .common import InfoExtractor from ..compat import ( compat_b64decode, - compat_chr, compat_ord, compat_str, compat_urllib_parse_unquote, @@ -72,7 +71,7 @@ class MixcloudIE(MixcloudBaseIE): def _decrypt_xor_cipher(key, ciphertext): """Encrypt/Decrypt XOR cipher. Both ways are possible because it's XOR.""" return ''.join([ - compat_chr(compat_ord(ch) ^ compat_ord(k)) + chr(compat_ord(ch) ^ compat_ord(k)) for ch, k in zip(ciphertext, itertools.cycle(key))]) def _real_extract(self, url): diff --git a/yt_dlp/extractor/naver.py b/yt_dlp/extractor/naver.py index a230d9cdd..c3b063ffe 100644 --- a/yt_dlp/extractor/naver.py +++ b/yt_dlp/extractor/naver.py @@ -1,13 +1,19 @@ +import itertools import re +from urllib.parse import urlparse, parse_qs from .common import InfoExtractor from ..utils import ( + ExtractorError, clean_html, dict_get, - ExtractorError, int_or_none, + merge_dicts, parse_duration, + traverse_obj, + try_call, try_get, + unified_timestamp, update_url_query, ) @@ -247,3 +253,134 @@ class NaverLiveIE(InfoExtractor): 'categories': [meta.get('categoryId')], 'is_live': True } + + +class NaverNowIE(NaverBaseIE): + IE_NAME = 'navernow' + _VALID_URL = r'https?://now\.naver\.com/show/(?P<id>[0-9]+)' + _PAGE_SIZE = 30 + _API_URL = 'https://apis.naver.com/now_web/nowcms-api-xhmac/cms/v1' + _TESTS = [{ + 'url': 'https://now.naver.com/show/4759?shareReplayId=5901#replay=', + 'md5': 'e05854162c21c221481de16b2944a0bc', + 'info_dict': { + 'id': '4759-5901', + 'title': '아이키X노제\r\n💖꽁냥꽁냥💖(1)', + 'ext': 'mp4', + 'thumbnail': r're:^https?://.*\.jpg', + 'timestamp': 1650369600, + 'upload_date': '20220419', + 'uploader_id': 'now', + 'view_count': int, + }, + 'params': { + 'noplaylist': True, + } + }, { + 'url': 'https://now.naver.com/show/4759?shareHightlight=1078#highlight=', + 'md5': '9f6118e398aa0f22b2152f554ea7851b', + 'info_dict': { + 'id': '4759-1078', + 'title': '아이키: 나 리정한테 흔들렸어,,, 질투 폭발하는 노제 여보😾 [아이키의 떰즈업]ㅣ네이버 NOW.', + 'ext': 'mp4', + 'thumbnail': r're:^https?://.*\.jpg', + 'upload_date': '20220504', + 'timestamp': 1651648042, + 'uploader_id': 'now', + 'view_count': int, + }, + 'params': { + 'noplaylist': True, + }, + }, { + 'url': 'https://now.naver.com/show/4759', + 'info_dict': { + 'id': '4759', + 'title': '아이키의 떰즈업', + }, + 'playlist_mincount': 48 + }, { + 'url': 'https://now.naver.com/show/4759?shareReplayId=5901#replay', + 'info_dict': { + 'id': '4759', + 'title': '아이키의 떰즈업', + }, + 'playlist_mincount': 48, + }, { + 'url': 'https://now.naver.com/show/4759?shareHightlight=1078#highlight=', + 'info_dict': { + 'id': '4759', + 'title': '아이키의 떰즈업', + }, + 'playlist_mincount': 48, + }] + + def _extract_replay(self, show_id, replay_id): + vod_info = self._download_json(f'{self._API_URL}/shows/{show_id}/vod/{replay_id}', replay_id) + in_key = self._download_json(f'{self._API_URL}/shows/{show_id}/vod/{replay_id}/inkey', replay_id)['inKey'] + return merge_dicts({ + 'id': f'{show_id}-{replay_id}', + 'title': traverse_obj(vod_info, ('episode', 'title')), + 'timestamp': unified_timestamp(traverse_obj(vod_info, ('episode', 'start_time'))), + 'thumbnail': vod_info.get('thumbnail_image_url'), + }, self._extract_video_info(replay_id, vod_info['video_id'], in_key)) + + def _extract_show_replays(self, show_id): + page = 0 + while True: + show_vod_info = self._download_json( + 
f'{self._API_URL}/vod-shows/{show_id}', show_id, + query={'offset': page * self._PAGE_SIZE, 'limit': self._PAGE_SIZE}, + note=f'Downloading JSON vod list for show {show_id} - page {page}' + )['response']['result'] + for v in show_vod_info.get('vod_list') or []: + yield self._extract_replay(show_id, v['id']) + + if try_call(lambda: show_vod_info['count'] <= self._PAGE_SIZE * (page + 1)): + break + page += 1 + + def _extract_show_highlights(self, show_id, highlight_id=None): + page = 0 + while True: + highlights_videos = self._download_json( + f'{self._API_URL}/shows/{show_id}/highlights/videos/', show_id, + query={'offset': page * self._PAGE_SIZE, 'limit': self._PAGE_SIZE}, + note=f'Downloading JSON highlights for show {show_id} - page {page}') + + for highlight in highlights_videos.get('results') or []: + if highlight_id and highlight.get('id') != int(highlight_id): + continue + yield merge_dicts({ + 'id': f'{show_id}-{highlight["id"]}', + 'title': highlight.get('title'), + 'timestamp': unified_timestamp(highlight.get('regdate')), + 'thumbnail': highlight.get('thumbnail_url'), + }, self._extract_video_info(highlight['id'], highlight['video_id'], highlight['video_inkey'])) + + if try_call(lambda: highlights_videos['count'] <= self._PAGE_SIZE * (page + 1)): + break + page += 1 + + def _extract_highlight(self, show_id, highlight_id): + try: + return next(self._extract_show_highlights(show_id, highlight_id)) + except StopIteration: + raise ExtractorError(f'Unable to find highlight {highlight_id} for show {show_id}') + + def _real_extract(self, url): + show_id = self._match_id(url) + qs = parse_qs(urlparse(url).query) + + if not self._yes_playlist(show_id, qs.get('shareHightlight')): + return self._extract_highlight(show_id, qs['shareHightlight'][0]) + elif not self._yes_playlist(show_id, qs.get('shareReplayId')): + return self._extract_replay(show_id, qs['shareReplayId'][0]) + + show_info = self._download_json( + f'{self._API_URL}/shows/{show_id}', show_id, + note=f'Downloading JSON vod list for show {show_id}') + + return self.playlist_result( + itertools.chain(self._extract_show_replays(show_id), self._extract_show_highlights(show_id)), + show_id, show_info.get('title')) diff --git a/yt_dlp/extractor/ndr.py b/yt_dlp/extractor/ndr.py index de0142ccf..ad8dbd7a7 100644 --- a/yt_dlp/extractor/ndr.py +++ b/yt_dlp/extractor/ndr.py @@ -1,11 +1,15 @@ +import re + from .common import InfoExtractor +from ..compat import compat_urllib_parse_urlparse from ..utils import ( determine_ext, + ExtractorError, int_or_none, - parse_duration, + merge_dicts, + parse_iso8601, qualities, try_get, - unified_strdate, urljoin, ) @@ -14,120 +18,139 @@ class NDRBaseIE(InfoExtractor): def _real_extract(self, url): mobj = self._match_valid_url(url) display_id = next(group for group in mobj.groups() if group) - id = mobj.group('id') webpage = self._download_webpage(url, display_id) - return self._extract_embed(webpage, display_id, id) + return self._extract_embed(webpage, display_id, url) class NDRIE(NDRBaseIE): IE_NAME = 'ndr' IE_DESC = 'NDR.de - Norddeutscher Rundfunk' - _VALID_URL = r'https?://(?:www\.)?(?:daserste\.)?ndr\.de/(?:[^/]+/)*(?P<display_id>[^/?#]+),(?P<id>[\da-z]+)\.html' + _VALID_URL = r'https?://(?:\w+\.)*ndr\.de/(?:[^/]+/)*(?P<id>[^/?#]+),[\da-z]+\.html' _TESTS = [{ + # httpVideo, same content id 'url': 'http://www.ndr.de/fernsehen/Party-Poette-und-Parade,hafengeburtstag988.html', + 'md5': '6515bc255dc5c5f8c85bbc38e035a659', 'info_dict': { 'id': 'hafengeburtstag988', + 'display_id': 
'Party-Poette-und-Parade', 'ext': 'mp4', 'title': 'Party, Pötte und Parade', - 'thumbnail': 'https://www.ndr.de/fernsehen/hafengeburtstag990_v-contentxl.jpg', 'description': 'md5:ad14f9d2f91d3040b6930c697e5f6b4c', - 'series': None, - 'channel': 'NDR Fernsehen', - 'upload_date': '20150508', + 'uploader': 'ndrtv', + 'timestamp': 1431255671, + 'upload_date': '20150510', 'duration': 3498, }, - }, { - 'url': 'https://www.ndr.de/sport/fussball/Rostocks-Matchwinner-Froede-Ein-Hansa-Debuet-wie-im-Maerchen,hansa10312.html', - 'only_matching': True - }, { - 'url': 'https://www.ndr.de/nachrichten/niedersachsen/kommunalwahl_niedersachsen_2021/Grosse-Parteien-zufrieden-mit-Ergebnissen-der-Kommunalwahl,kommunalwahl1296.html', - 'info_dict': { - 'id': 'kommunalwahl1296', - 'ext': 'mp4', - 'title': 'Die Spitzenrunde: Die Wahl aus Sicht der Landespolitik', - 'thumbnail': 'https://www.ndr.de/fernsehen/screenshot1194912_v-contentxl.jpg', - 'description': 'md5:5c6e2ad744cef499135735a1036d7aa7', - 'series': 'Hallo Niedersachsen', - 'channel': 'NDR Fernsehen', - 'upload_date': '20210913', - 'duration': 438, + 'params': { + 'skip_download': True, }, + 'expected_warnings': ['Unable to download f4m manifest'], }, { - 'url': 'https://www.ndr.de/fernsehen/sendungen/extra_3/extra-3-Satiremagazin-mit-Christian-Ehring,sendung1091858.html', + # httpVideo, different content id + 'url': 'http://www.ndr.de/sport/fussball/40-Osnabrueck-spielt-sich-in-einen-Rausch,osna270.html', + 'md5': '1043ff203eab307f0c51702ec49e9a71', 'info_dict': { - 'id': 'sendung1091858', + 'id': 'osna272', + 'display_id': '40-Osnabrueck-spielt-sich-in-einen-Rausch', 'ext': 'mp4', - 'title': 'Extra 3 vom 11.11.2020 mit Christian Ehring', - 'thumbnail': 'https://www.ndr.de/fernsehen/screenshot983938_v-contentxl.jpg', - 'description': 'md5:700f6de264010585012a72f97b0ac0c9', - 'series': 'extra 3', - 'channel': 'NDR Fernsehen', - 'upload_date': '20201111', - 'duration': 1749, - } + 'title': 'Osnabrück - Wehen Wiesbaden: Die Highlights', + 'description': 'md5:32e9b800b3d2d4008103752682d5dc01', + 'uploader': 'ndrtv', + 'timestamp': 1442059200, + 'upload_date': '20150912', + 'duration': 510, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'No longer available', }, { + # httpAudio, same content id 'url': 'http://www.ndr.de/info/La-Valette-entgeht-der-Hinrichtung,audio51535.html', + 'md5': 'bb3cd38e24fbcc866d13b50ca59307b8', 'info_dict': { 'id': 'audio51535', + 'display_id': 'La-Valette-entgeht-der-Hinrichtung', 'ext': 'mp3', 'title': 'La Valette entgeht der Hinrichtung', - 'thumbnail': 'https://www.ndr.de/mediathek/mediathekbild140_v-podcast.jpg', 'description': 'md5:22f9541913a40fe50091d5cdd7c9f536', - 'upload_date': '20140729', - 'duration': 884.0, + 'uploader': 'ndrinfo', + 'timestamp': 1631711863, + 'upload_date': '20210915', + 'duration': 884, }, - 'expected_warnings': ['unable to extract json url'], + 'params': { + 'skip_download': True, + }, + }, { + # with subtitles + 'url': 'https://www.ndr.de/fernsehen/sendungen/extra_3/extra-3-Satiremagazin-mit-Christian-Ehring,sendung1091858.html', + 'info_dict': { + 'id': 'extra18674', + 'display_id': 'extra-3-Satiremagazin-mit-Christian-Ehring', + 'ext': 'mp4', + 'title': 'Extra 3 vom 11.11.2020 mit Christian Ehring', + 'description': 'md5:700f6de264010585012a72f97b0ac0c9', + 'uploader': 'ndrtv', + 'upload_date': '20201207', + 'timestamp': 1614349457, + 'duration': 1749, + 'subtitles': { + 'de': [{ + 'ext': 'ttml', + 'url': r're:^https://www\.ndr\.de.+', + }], + }, + }, + 'params': { + 
'skip_download': True, + }, + 'expected_warnings': ['Unable to download f4m manifest'], + }, { + 'url': 'https://www.ndr.de/Fettes-Brot-Ferris-MC-und-Thees-Uhlmann-live-on-stage,festivalsommer116.html', + 'only_matching': True, + }] - def _extract_embed(self, webpage, display_id, id): - formats = [] - base_url = 'https://www.ndr.de' - json_url = self._search_regex(r'<iframe[^>]+src=\"([^\"]+)_theme-ndrde[^\.]*\.html\"', webpage, - 'json url', fatal=False) - if json_url: - data_json = self._download_json(base_url + json_url.replace('ardplayer_image', 'ardjson_image') + '.json', - id, fatal=False) - info_json = data_json.get('_info', {}) - media_json = try_get(data_json, lambda x: x['_mediaArray'][0]['_mediaStreamArray']) - for media in media_json: - if media.get('_quality') == 'auto': - formats.extend(self._extract_m3u8_formats(media['_stream'], id)) - subtitles = {} - sub_url = data_json.get('_subtitleUrl') - if sub_url: - subtitles.setdefault('de', []).append({ - 'url': base_url + sub_url, - }) - self._sort_formats(formats) - return { - 'id': id, - 'title': info_json.get('clipTitle'), - 'thumbnail': base_url + data_json.get('_previewImage'), - 'description': info_json.get('clipDescription'), - 'series': info_json.get('seriesTitle') or None, - 'channel': info_json.get('channelTitle'), - 'upload_date': unified_strdate(info_json.get('clipDate')), - 'duration': data_json.get('_duration'), - 'formats': formats, - 'subtitles': subtitles, - } - else: - json_url = base_url + self._search_regex(r'apiUrl\s?=\s?\'([^\']+)\'', webpage, 'json url').replace( - '_belongsToPodcast-', '') - data_json = self._download_json(json_url, id, fatal=False) - return { - 'id': id, - 'title': data_json.get('title'), - 'thumbnail': base_url + data_json.get('poster'), - 'description': data_json.get('summary'), - 'upload_date': unified_strdate(data_json.get('publicationDate')), - 'duration': parse_duration(data_json.get('duration')), - 'formats': [{ - 'url': try_get(data_json, (lambda x: x['audio'][0]['url'], lambda x: x['files'][0]['url'])), - 'vcodec': 'none', - 'ext': 'mp3', - }], - } + def _extract_embed(self, webpage, display_id, url): + embed_url = ( + self._html_search_meta( + 'embedURL', webpage, 'embed URL', + default=None) + or self._search_regex( + r'\bembedUrl["\']\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'embed URL', group='url', default=None) + or self._search_regex( + r'\bvar\s*sophoraID\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'embed URL', group='url', default='')) + # some more work needed if we only found sophoraID + if re.match(r'^[a-z]+\d+$', embed_url): + # get the initial part of the URL path, e.g. /panorama/archiv/2022/ + parsed_url = compat_urllib_parse_urlparse(url) + path = self._search_regex(r'(.+/)%s' % display_id, parsed_url.path or '', 'embed URL', default='') + # find tell-tale image with the actual ID + ndr_id = self._search_regex(r'%s([a-z]+\d+)(?!\.)\b' % (path, ), webpage, 'embed URL', default=None) + # or try to use special knowledge!
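            # Editorial walk-through with hypothetical values: for
            # url = 'https://www.ndr.de/info/La-Valette-entgeht-der-Hinrichtung,audio51535.html'
            # the page may only expose sophoraID = 'audio51535'; path then becomes
            # '/info/' and the search above scans the HTML for '/info/<letters><digits>'.
            # If nothing matches, the template below falls back to
            # 'https://www.ndr.de/info/audio51535-player.html'.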
+ NDR_INFO_URL_TPL = 'https://www.ndr.de/info/%s-player.html' + embed_url = 'ndr:%s' % (ndr_id, ) if ndr_id else NDR_INFO_URL_TPL % (embed_url, ) + if not embed_url: + raise ExtractorError('Unable to extract embedUrl') + + description = self._search_regex( + r'<p[^>]+itemprop="description">([^<]+)</p>', + webpage, 'description', default=None) or self._og_search_description(webpage) + timestamp = parse_iso8601( + self._search_regex( + (r'<span[^>]+itemprop="(?:datePublished|uploadDate)"[^>]+content="(?P<cont>[^"]+)"', + r'\bvar\s*pdt\s*=\s*(?P<q>["\'])(?P<cont>(?:(?!(?P=q)).)+)(?P=q)', ), + webpage, 'upload date', group='cont', default=None)) + info = self._search_json_ld(webpage, display_id, default={}) + return merge_dicts({ + '_type': 'url_transparent', + 'url': embed_url, + 'display_id': display_id, + 'description': description, + 'timestamp': timestamp, + }, info) class NJoyIE(NDRBaseIE): @@ -151,19 +174,19 @@ class NJoyIE(NDRBaseIE): 'params': { 'skip_download': True, }, + 'skip': 'No longer available', }, { # httpVideo, different content id 'url': 'http://www.n-joy.de/musik/Das-frueheste-DJ-Set-des-Nordens-live-mit-Felix-Jaehn-,felixjaehn168.html', 'md5': '417660fffa90e6df2fda19f1b40a64d8', 'info_dict': { - 'id': 'dockville882', + 'id': 'livestream283', 'display_id': 'Das-frueheste-DJ-Set-des-Nordens-live-mit-Felix-Jaehn-', - 'ext': 'mp4', - 'title': '"Ich hab noch nie" mit Felix Jaehn', - 'description': 'md5:85dd312d53be1b99e1f998a16452a2f3', + 'ext': 'mp3', + 'title': 'Das frueheste DJ Set des Nordens live mit Felix Jaehn', + 'description': 'md5:681698f527b8601e511e7b79edde7d2c', 'uploader': 'njoy', - 'upload_date': '20150822', - 'duration': 211, + 'upload_date': '20210830', }, 'params': { 'skip_download': True, @@ -173,18 +196,25 @@ class NJoyIE(NDRBaseIE): 'only_matching': True, }] - def _extract_embed(self, webpage, display_id, id): + def _extract_embed(self, webpage, display_id, url=None): + # find tell-tale URL with the actual ID, or ... 
video_id = self._search_regex( - r'<iframe[^>]+id="pp_([\da-z]+)"', webpage, 'embed id') - description = self._search_regex( - r'<div[^>]+class="subline"[^>]*>[^<]+</div>\s*<p>([^<]+)</p>', - webpage, 'description', fatal=False) + (r'''\bsrc\s*=\s*["']?(?:/\w+)+/([a-z]+\d+)(?!\.)\b''', + r'<iframe[^>]+id="pp_([\da-z]+)"', ), + webpage, 'NDR id', default=None) + + description = ( + self._html_search_meta('description', webpage) + or self._search_regex( + r'<div[^>]+class="subline"[^>]*>[^<]+</div>\s*<p>([^<]+)</p>', + webpage, 'description', fatal=False)) return { '_type': 'url_transparent', 'ie_key': 'NDREmbedBase', 'url': 'ndr:%s' % video_id, 'display_id': display_id, 'description': description, + 'title': display_id.replace('-', ' ').strip(), } @@ -287,7 +317,7 @@ class NDREmbedBaseIE(InfoExtractor): class NDREmbedIE(NDREmbedBaseIE): IE_NAME = 'ndr:embed' - _VALID_URL = r'https?://(?:www\.)?(?:daserste\.)?ndr\.de/(?:[^/]+/)*(?P<id>[\da-z]+)-(?:player|externalPlayer)\.html' + _VALID_URL = r'https?://(?:\w+\.)*ndr\.de/(?:[^/]+/)*(?P<id>[\da-z]+)-(?:(?:ard)?player|externalPlayer)\.html' _TESTS = [{ 'url': 'http://www.ndr.de/fernsehen/sendungen/ndr_aktuell/ndraktuell28488-player.html', 'md5': '8b9306142fe65bbdefb5ce24edb6b0a9', @@ -300,6 +330,7 @@ class NDREmbedIE(NDREmbedBaseIE): 'upload_date': '20150907', 'duration': 132, }, + 'skip': 'No longer available', }, { 'url': 'http://www.ndr.de/ndr2/events/soundcheck/soundcheck3366-player.html', 'md5': '002085c44bae38802d94ae5802a36e78', @@ -315,6 +346,7 @@ class NDREmbedIE(NDREmbedBaseIE): 'params': { 'skip_download': True, }, + 'skip': 'No longer available', }, { 'url': 'http://www.ndr.de/info/audio51535-player.html', 'md5': 'bb3cd38e24fbcc866d13b50ca59307b8', @@ -324,7 +356,7 @@ class NDREmbedIE(NDREmbedBaseIE): 'title': 'La Valette entgeht der Hinrichtung', 'is_live': False, 'uploader': 'ndrinfo', - 'upload_date': '20140729', + 'upload_date': '20210915', 'duration': 884, }, 'params': { @@ -345,15 +377,17 @@ class NDREmbedIE(NDREmbedBaseIE): 'params': { 'skip_download': True, }, + 'skip': 'No longer available', }, { # httpVideoLive 'url': 'http://www.ndr.de/fernsehen/livestream/livestream217-externalPlayer.html', 'info_dict': { 'id': 'livestream217', - 'ext': 'flv', + 'ext': 'mp4', 'title': r're:^NDR Fernsehen Niedersachsen \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', 'is_live': True, - 'upload_date': '20150910', + 'upload_date': '20210409', + 'uploader': 'ndrtv', }, 'params': { 'skip_download': True, @@ -391,9 +425,10 @@ class NJoyEmbedIE(NDREmbedBaseIE): 'ext': 'mp4', 'title': 'Zehn Jahre Reeperbahn Festival - die Doku', 'is_live': False, - 'upload_date': '20150807', + 'upload_date': '20200826', 'duration': 1011, }, + 'expected_warnings': ['Unable to download f4m manifest'], }, { # httpAudio 'url': 'http://www.n-joy.de/news_wissen/stefanrichter100-player_image-d5e938b1-f21a-4b9a-86b8-aaba8bca3a13_theme-n-joy.html', @@ -410,6 +445,7 @@ class NJoyEmbedIE(NDREmbedBaseIE): 'params': { 'skip_download': True, }, + 'skip': 'No longer available', }, { # httpAudioLive, no explicit ext 'url': 'http://www.n-joy.de/news_wissen/webradioweltweit100-player_image-3fec0484-2244-4565-8fb8-ed25fd28b173_theme-n-joy.html', @@ -419,7 +455,7 @@ class NJoyEmbedIE(NDREmbedBaseIE): 'title': r're:^N-JOY Weltweit \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', 'is_live': True, 'uploader': 'njoy', - 'upload_date': '20150810', + 'upload_date': '20210830', }, 'params': { 'skip_download': True, diff --git a/yt_dlp/extractor/ndtv.py b/yt_dlp/extractor/ndtv.py index fbb033169..bfe52f77d 100644 --- 
a/yt_dlp/extractor/ndtv.py +++ b/yt_dlp/extractor/ndtv.py @@ -1,13 +1,7 @@ +import urllib.parse + from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_unquote_plus -) -from ..utils import ( - parse_duration, - remove_end, - unified_strdate, - urljoin -) +from ..utils import parse_duration, remove_end, unified_strdate, urljoin class NDTVIE(InfoExtractor): @@ -80,7 +74,7 @@ class NDTVIE(InfoExtractor): webpage = self._download_webpage(url, video_id) # '__title' does not contain extra words such as sub-site name, "Video" etc. - title = compat_urllib_parse_unquote_plus( + title = urllib.parse.unquote_plus( self._search_regex(r"__title\s*=\s*'([^']+)'", webpage, 'title', default=None) or self._og_search_title(webpage)) diff --git a/yt_dlp/extractor/nebula.py b/yt_dlp/extractor/nebula.py index ff9a2adf0..7057b8b26 100644 --- a/yt_dlp/extractor/nebula.py +++ b/yt_dlp/extractor/nebula.py @@ -1,14 +1,11 @@ import itertools import json import time -import urllib +import urllib.error +import urllib.parse -from ..utils import ( - ExtractorError, - parse_iso8601, - try_get, -) from .common import InfoExtractor +from ..utils import ExtractorError, parse_iso8601, try_get class NebulaBaseIE(InfoExtractor): diff --git a/yt_dlp/extractor/neteasemusic.py b/yt_dlp/extractor/neteasemusic.py index 4def7e76b..f9a67876a 100644 --- a/yt_dlp/extractor/neteasemusic.py +++ b/yt_dlp/extractor/neteasemusic.py @@ -1,18 +1,12 @@ -from hashlib import md5 +import itertools +import re from base64 import b64encode from datetime import datetime -import re +from hashlib import md5 from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_urlencode, - compat_str, - compat_itertools_count, -) -from ..utils import ( - sanitized_Request, - float_or_none, -) +from ..compat import compat_str, compat_urllib_parse_urlencode +from ..utils import float_or_none, sanitized_Request class NetEaseMusicBaseIE(InfoExtractor): @@ -449,7 +443,7 @@ class NetEaseMusicDjRadioIE(NetEaseMusicBaseIE): name = None desc = None entries = [] - for offset in compat_itertools_count(start=0, step=self._PAGE_SIZE): + for offset in itertools.count(start=0, step=self._PAGE_SIZE): info = self.query_api( 'dj/program/byradio?asc=false&limit=%d&radioId=%s&offset=%d' % (self._PAGE_SIZE, dj_id, offset), diff --git a/yt_dlp/extractor/netverse.py b/yt_dlp/extractor/netverse.py new file mode 100644 index 000000000..f529682a3 --- /dev/null +++ b/yt_dlp/extractor/netverse.py @@ -0,0 +1,176 @@ +import functools + +from .common import InfoExtractor +from .dailymotion import DailymotionIE +from ..utils import ( + InAdvancePagedList, + smuggle_url, + traverse_obj, +) + + +class NetverseBaseIE(InfoExtractor): + _ENDPOINTS = { + 'watch': 'watchvideo', + 'video': 'watchvideo', + 'webseries': 'webseries', + } + + def _call_api(self, url, query={}): + display_id, sites_type = self._match_valid_url(url).group('display_id', 'type') + + json_data = self._download_json( + f'https://api.netverse.id/medias/api/v2/{self._ENDPOINTS[sites_type]}/{display_id}', + display_id, query=query) + + return display_id, json_data + + +class NetverseIE(NetverseBaseIE): + _VALID_URL = r'https?://(?:\w+\.)?netverse\.id/(?P<type>watch|video)/(?P<display_id>[^/?#&]+)' + _TESTS = [{ + # Watch video + 'url': 'https://www.netverse.id/watch/waktu-indonesia-bercanda-edisi-spesial-lebaran-2016', + 'info_dict': { + 'id': 'k4yhqUwINAGtmHx3NkL', + 'title': 'Waktu Indonesia Bercanda - Edisi Spesial Lebaran 2016', + 'ext': 'mp4', + 'season': 'Season 2016', + 
'description': 'md5:fc27747c0aa85067b6967c816f01617c', + 'thumbnail': 'https://vplayed-uat.s3-ap-southeast-1.amazonaws.com/images/webseries/thumbnails/2021/11/619cfce45c827.jpeg', + 'episode_number': 22, + 'series': 'Waktu Indonesia Bercanda', + 'episode': 'Episode 22', + 'uploader_id': 'x2ir3vq', + 'age_limit': 0, + 'tags': [], + 'view_count': int, + 'display_id': 'waktu-indonesia-bercanda-edisi-spesial-lebaran-2016', + 'duration': 2990, + 'upload_date': '20210722', + 'timestamp': 1626919804, + 'like_count': int, + 'uploader': 'Net Prime', + } + }, { + # series + 'url': 'https://www.netverse.id/watch/jadoo-seorang-model', + 'info_dict': { + 'id': 'x88izwc', + 'title': 'Jadoo Seorang Model', + 'ext': 'mp4', + 'season': 'Season 2', + 'description': 'md5:c616e8e59d3edf2d3d506e3736120d99', + 'thumbnail': 'https://storage.googleapis.com/netprime-live/images/webseries/thumbnails/2021/11/619cf63f105d3.jpeg', + 'episode_number': 2, + 'series': 'Hello Jadoo', + 'episode': 'Episode 2', + 'view_count': int, + 'like_count': int, + 'display_id': 'jadoo-seorang-model', + 'uploader_id': 'x2ir3vq', + 'duration': 635, + 'timestamp': 1646372927, + 'tags': ['PG069497-hellojadooseason2eps2'], + 'upload_date': '20220304', + 'uploader': 'Net Prime', + 'age_limit': 0, + }, + 'skip': 'Video gets geo-blocked in some countries' + }, { + # non www host + 'url': 'https://netverse.id/watch/tetangga-baru', + 'info_dict': { + 'id': 'k4CNGz7V0HJ7vfwZbXy', + 'ext': 'mp4', + 'title': 'Tetangga Baru', + 'season': 'Season 1', + 'description': 'md5:ed6dd355bed84d139b1154c3d8d65957', + 'thumbnail': 'https://vplayed-uat.s3-ap-southeast-1.amazonaws.com/images/webseries/thumbnails/2021/11/619cfd9d32c5f.jpeg', + 'episode_number': 1, + 'series': 'Tetangga Masa Gitu', + 'episode': 'Episode 1', + 'timestamp': 1624538169, + 'view_count': int, + 'upload_date': '20210624', + 'age_limit': 0, + 'uploader_id': 'x2ir3vq', + 'like_count': int, + 'uploader': 'Net Prime', + 'tags': ['PG008534', 'tetangga', 'Baru'], + 'display_id': 'tetangga-baru', + 'duration': 1406, + }, + }, { + # /video url + 'url': 'https://www.netverse.id/video/pg067482-hellojadoo-season1', + 'info_dict': { + 'id': 'x887jzz', + 'ext': 'mp4', + 'thumbnail': 'https://storage.googleapis.com/netprime-live/images/webseries/thumbnails/2021/11/619cf63f105d3.jpeg', + 'season': 'Season 1', + 'episode_number': 1, + 'description': 'md5:c616e8e59d3edf2d3d506e3736120d99', + 'title': 'Namaku Choi Jadoo', + 'series': 'Hello Jadoo', + 'episode': 'Episode 1', + 'age_limit': 0, + 'like_count': int, + 'view_count': int, + 'tags': ['PG067482', 'PG067482-HelloJadoo-season1'], + 'duration': 780, + 'display_id': 'pg067482-hellojadoo-season1', + 'uploader_id': 'x2ir3vq', + 'uploader': 'Net Prime', + 'timestamp': 1645764984, + 'upload_date': '20220225', + }, + 'skip': 'This video gets geo-blocked in some countries' + }] + + def _real_extract(self, url): + display_id, program_json = self._call_api(url) + videos = program_json['response']['videos'] + + return { + '_type': 'url_transparent', + 'ie_key': DailymotionIE.ie_key(), + 'url': smuggle_url(videos['dailymotion_url'], {'query': {'embedder': 'https://www.netverse.id'}}), + 'display_id': display_id, + 'title': videos.get('title'), + 'season': videos.get('season_name'), + 'thumbnail': traverse_obj(videos, ('program_detail', 'thumbnail_image')), + 'description': traverse_obj(videos, ('program_detail', 'description')), + 'episode_number': videos.get('episode_order'), + 'series': traverse_obj(videos, ('program_detail', 
'title')), + } + + +class NetversePlaylistIE(NetverseBaseIE): + _VALID_URL = r'https?://(?:\w+\.)?netverse\.id/(?P<type>webseries)/(?P<display_id>[^/?#&]+)' + _TEST = { + 'url': 'https://netverse.id/webseries/tetangga-masa-gitu', + 'info_dict': { + 'id': 'tetangga-masa-gitu', + 'title': 'Tetangga Masa Gitu', + }, + 'playlist_count': 46, + } + + def parse_playlist(self, url, page_num): + _, playlist_json = self._call_api(url, query={'page': page_num + 1}) + for slug in traverse_obj(playlist_json, ('response', 'related', 'data', ..., 'slug')): + yield self.url_result(f'https://www.netverse.id/video/{slug}', NetverseIE) + + def _real_extract(self, url): + _, playlist_data = self._call_api(url) + webseries_related_info = playlist_data['response']['related'] + # TODO: get videos from the other seasons + # Each season has an id, and the next season's videos are located at api_url/<season_id>?page=<page> + return self.playlist_result( + InAdvancePagedList(functools.partial(self.parse_playlist, url), + webseries_related_info['last_page'], + webseries_related_info['to'] - webseries_related_info['from'] + 1), + traverse_obj(playlist_data, ('response', 'webseries_info', 'slug')), + traverse_obj(playlist_data, ('response', 'webseries_info', 'title'))) diff --git a/yt_dlp/extractor/nhk.py b/yt_dlp/extractor/nhk.py index cf2ec7b79..60d76d1b1 100644 --- a/yt_dlp/extractor/nhk.py +++ b/yt_dlp/extractor/nhk.py @@ -11,7 +11,7 @@ from ..utils import ( class NhkBaseIE(InfoExtractor): - _API_URL_TEMPLATE = 'https://api.nhk.or.jp/nhkworld/%sod%slist/v7a/%s/%s/%s/all%s.json' + _API_URL_TEMPLATE = 'https://nwapi.nhk.jp/nhkworld/%sod%slist/v7b/%s/%s/%s/all%s.json' _BASE_URL_REGEX = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/ondemand' _TYPE_REGEX = r'/(?P<type>video|audio)/' @@ -27,7 +27,7 @@ class NhkBaseIE(InfoExtractor): def _extract_episode_info(self, url, episode=None): fetch_episode = episode is None lang, m_type, episode_id = NhkVodIE._match_valid_url(url).groups() - if episode_id.isdigit(): + if len(episode_id) == 7: episode_id = episode_id[:4] + '-' + episode_id[4:] is_video = m_type == 'video' @@ -89,7 +89,8 @@ class NhkVodIE(NhkBaseIE): - _VALID_URL = r'%s%s(?P<id>\d{7}|[^/]+?-\d{8}-[0-9a-z]+)' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX) + # the 7-character IDs can have alphabetic chars too: assume [a-z] rather than just [a-f], e.g. 9999a34 + _VALID_URL = r'%s%s(?P<id>[0-9a-z]{7}|[^/]+?-\d{8}-[0-9a-z]+)' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX) # Content available only for a limited period of time. Visit # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples. 
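    # Editorial note: per _extract_episode_info above, a 7-character ID such as
    # '9999a34' (see the test below) is requested from the API as '9999-a34'.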
_TESTS = [{ @@ -129,6 +130,19 @@ class NhkVodIE(NhkBaseIE): }, { 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/j_art-20150903-1/', 'only_matching': True, + }, { + # video, alphabetic character in ID #29670 + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999a34/', + 'only_matching': True, + 'info_dict': { + 'id': 'qfjay6cg', + 'ext': 'mp4', + 'title': 'DESIGN TALKS plus - Fishermen’s Finery', + 'description': 'md5:8a8f958aaafb0d7cb59d38de53f1e448', + 'thumbnail': r're:^https?:/(/[a-z0-9.-]+)+\.jpg\?w=1920&h=1080$', + 'upload_date': '20210615', + 'timestamp': 1623722008, + } }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/niconico.py b/yt_dlp/extractor/niconico.py index a80b544f8..82fb27631 100644 --- a/yt_dlp/extractor/niconico.py +++ b/yt_dlp/extractor/niconico.py @@ -647,14 +647,14 @@ class NiconicoSeriesIE(InfoExtractor): 'id': '110226', 'title': 'ご立派ァ!のシリーズ', }, - 'playlist_mincount': 10, # as of 2021/03/17 + 'playlist_mincount': 10, }, { 'url': 'https://www.nicovideo.jp/series/12312/', 'info_dict': { 'id': '12312', 'title': 'バトルスピリッツ お勧めカード紹介(調整中)', }, - 'playlist_mincount': 97, # as of 2021/03/17 + 'playlist_mincount': 103, }, { 'url': 'https://nico.ms/series/203559', 'only_matching': True, @@ -672,7 +672,7 @@ class NiconicoSeriesIE(InfoExtractor): title = unescapeHTML(title) playlist = [ self.url_result(f'https://www.nicovideo.jp/watch/{v_id}', video_id=v_id) - for v_id in re.findall(r'href="/watch/([a-z0-9]+)" data-href="/watch/\1', webpage)] + for v_id in re.findall(r'data-href=[\'"](?:https://www\.nicovideo\.jp)?/watch/([a-z0-9]+)', webpage)] return self.playlist_result(playlist, list_id, title) diff --git a/yt_dlp/extractor/npr.py b/yt_dlp/extractor/npr.py index 6d93f154c..e677e862d 100644 --- a/yt_dlp/extractor/npr.py +++ b/yt_dlp/extractor/npr.py @@ -1,9 +1,5 @@ from .common import InfoExtractor -from ..utils import ( - int_or_none, - qualities, - url_or_none, -) +from ..utils import int_or_none, qualities, traverse_obj, url_or_none class NprIE(InfoExtractor): @@ -51,6 +47,15 @@ class NprIE(InfoExtractor): # multimedia, no formats, stream 'url': 'https://www.npr.org/2020/02/14/805476846/laura-stevenson-tiny-desk-concert', 'only_matching': True, + }, { + 'url': 'https://www.npr.org/2022/03/15/1084896560/bonobo-tiny-desk-home-concert', + 'info_dict': { + 'id': '1086468851', + 'ext': 'mp4', + 'title': 'Bonobo: Tiny Desk (Home) Concert', + 'duration': 1061, + 'thumbnail': r're:^https?://media.npr.org/assets/img/.*\.jpg$', + }, }] def _real_extract(self, url): @@ -110,6 +115,12 @@ class NprIE(InfoExtractor): formats.extend(self._extract_m3u8_formats( stream_url, stream_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + + if not formats: + raw_json_ld = self._yield_json_ld(self._download_webpage(url, playlist_id), playlist_id, fatal=False) + m3u8_url = traverse_obj(list(raw_json_ld), (..., 'subjectOf', ..., 'embedUrl'), get_all=False) + formats = self._extract_m3u8_formats(m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False) + self._sort_formats(formats) entries.append({ diff --git a/yt_dlp/extractor/nrk.py b/yt_dlp/extractor/nrk.py index 553c55132..fcbafe418 100644 --- a/yt_dlp/extractor/nrk.py +++ b/yt_dlp/extractor/nrk.py @@ -3,18 +3,17 @@ import random import re from .common import InfoExtractor -from ..compat import compat_str +from ..compat import compat_HTTPError, compat_str from ..utils import ( - compat_HTTPError, - determine_ext, ExtractorError, + determine_ext, int_or_none, parse_duration, parse_iso8601, str_or_none, try_get, 
- urljoin, url_or_none, + urljoin, ) diff --git a/yt_dlp/extractor/openload.py b/yt_dlp/extractor/openload.py index 61e3a8b86..79dad09e3 100644 --- a/yt_dlp/extractor/openload.py +++ b/yt_dlp/extractor/openload.py @@ -9,7 +9,6 @@ from ..utils import ( ExtractorError, Popen, check_executable, - encodeArgument, get_exe_version, is_outdated_version, ) @@ -132,7 +131,7 @@ class PhantomJSwrapper: os.remove(self._TMP_FILES[name].name) def _save_cookies(self, url): - cookies = cookie_jar_to_list(self.extractor._downloader.cookiejar) + cookies = cookie_jar_to_list(self.extractor.cookiejar) for cookie in cookies: if 'path' not in cookie: cookie['path'] = '/' @@ -213,16 +212,14 @@ class PhantomJSwrapper: else: self.extractor.to_screen(f'{video_id}: {note2}') - p = Popen( + stdout, stderr, returncode = Popen.run( [self.exe, '--ssl-protocol=any', self._TMP_FILES['script'].name], - stdout=subprocess.PIPE, stderr=subprocess.PIPE) - out, err = p.communicate_or_kill() - if p.returncode != 0: - raise ExtractorError( - 'Executing JS failed\n:' + encodeArgument(err)) + text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + if returncode: + raise ExtractorError(f'Executing JS failed:\n{stderr}') with open(self._TMP_FILES['html'].name, 'rb') as f: html = f.read().decode('utf-8') self._load_cookies() - return (html, encodeArgument(out)) + return html, stdout diff --git a/yt_dlp/extractor/peloton.py b/yt_dlp/extractor/peloton.py index 8e50ffc7f..3fc05d1f2 100644 --- a/yt_dlp/extractor/peloton.py +++ b/yt_dlp/extractor/peloton.py @@ -1,11 +1,9 @@ import json import re +import urllib.parse from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_urllib_parse, -) +from ..compat import compat_HTTPError from ..utils import ( ExtractorError, float_or_none, @@ -125,7 +123,7 @@ class PelotonIE(InfoExtractor): is_live = False if ride_data.get('content_format') == 'audio': - url = self._MANIFEST_URL_TEMPLATE % (ride_data.get('vod_stream_url'), compat_urllib_parse.quote(token)) + url = self._MANIFEST_URL_TEMPLATE % (ride_data.get('vod_stream_url'), urllib.parse.quote(token)) formats = [{ 'url': url, 'ext': 'm4a', @@ -138,9 +136,9 @@ class PelotonIE(InfoExtractor): url = 'https://members.onepeloton.com/.netlify/functions/m3u8-proxy?displayLanguage=en&acceptedSubtitles=%s&url=%s?hdnea=%s' % ( ','.join([re.sub('^([a-z]+)-([A-Z]+)$', r'\1', caption) for caption in ride_data['captions']]), ride_data['vod_stream_url'], - compat_urllib_parse.quote(compat_urllib_parse.quote(token))) + urllib.parse.quote(urllib.parse.quote(token))) elif ride_data.get('live_stream_url'): - url = self._MANIFEST_URL_TEMPLATE % (ride_data.get('live_stream_url'), compat_urllib_parse.quote(token)) + url = self._MANIFEST_URL_TEMPLATE % (ride_data.get('live_stream_url'), urllib.parse.quote(token)) is_live = True else: raise ExtractorError('Missing video URL') diff --git a/yt_dlp/extractor/playsuisse.py b/yt_dlp/extractor/playsuisse.py new file mode 100644 index 000000000..a635ac92f --- /dev/null +++ b/yt_dlp/extractor/playsuisse.py @@ -0,0 +1,147 @@ +import json + +from .common import InfoExtractor +from ..utils import int_or_none, traverse_obj + + +class PlaySuisseIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?playsuisse\.ch/watch/(?P<id>[0-9]+)' + _TESTS = [ + { + 'url': 'https://www.playsuisse.ch/watch/763211/0', + 'md5': '82df2a470b2dfa60c2d33772a8a60cf8', + 'info_dict': { + 'id': '763211', + 'ext': 'mp4', + 'title': 'Knochen', + 'description': 'md5:8ea7a8076ba000cd9e8bc132fd0afdd8', + 'duration': 3344, + 
'series': 'Wilder', + 'season': 'Season 1', + 'season_number': 1, + 'episode': 'Knochen', + 'episode_number': 1, + 'thumbnail': 'md5:9260abe0c0ec9b69914d0a10d54c5878' + } + }, + { + 'url': 'https://www.playsuisse.ch/watch/808675/0', + 'md5': '818b94c1d2d7c4beef953f12cb8f3e75', + 'info_dict': { + 'id': '808675', + 'ext': 'mp4', + 'title': 'Der Läufer', + 'description': 'md5:9f61265c7e6dcc3e046137a792b275fd', + 'duration': 5280, + 'episode': 'Der Läufer', + 'thumbnail': 'md5:44af7d65ee02bbba4576b131868bb783' + } + }, + { + 'url': 'https://www.playsuisse.ch/watch/817193/0', + 'md5': '1d6c066f92cd7fffd8b28a53526d6b59', + 'info_dict': { + 'id': '817193', + 'ext': 'mp4', + 'title': 'Die Einweihungsparty', + 'description': 'md5:91ebf04d3a42cb3ab70666acf750a930', + 'duration': 1380, + 'series': 'Nr. 47', + 'season': 'Season 1', + 'season_number': 1, + 'episode': 'Die Einweihungsparty', + 'episode_number': 1, + 'thumbnail': 'md5:637585fb106e3a4bcd991958924c7e44' + } + } + ] + + _GRAPHQL_QUERY = ''' + query AssetWatch($assetId: ID!) { + assetV2(id: $assetId) { + ...Asset + episodes { + ...Asset + } + } + } + fragment Asset on AssetV2 { + id + name + description + duration + episodeNumber + seasonNumber + seriesName + medias { + type + url + } + thumbnail16x9 { + ...ImageDetails + } + thumbnail2x3 { + ...ImageDetails + } + thumbnail16x9WithTitle { + ...ImageDetails + } + thumbnail2x3WithTitle { + ...ImageDetails + } + } + fragment ImageDetails on AssetImage { + id + url + }''' + + def _get_media_data(self, media_id): + # NOTE In the web app, the "locale" header is used to switch between languages, + # However this doesn't seem to take effect when passing the header here. + response = self._download_json( + 'https://4bbepzm4ef.execute-api.eu-central-1.amazonaws.com/prod/graphql', + media_id, data=json.dumps({ + 'operationName': 'AssetWatch', + 'query': self._GRAPHQL_QUERY, + 'variables': {'assetId': media_id} + }).encode('utf-8'), + headers={'Content-Type': 'application/json', 'locale': 'de'}) + + return response['data']['assetV2'] + + def _real_extract(self, url): + media_id = self._match_id(url) + media_data = self._get_media_data(media_id) + info = self._extract_single(media_data) + if media_data.get('episodes'): + info.update({ + '_type': 'playlist', + 'entries': map(self._extract_single, media_data['episodes']), + }) + return info + + def _extract_single(self, media_data): + thumbnails = traverse_obj(media_data, lambda k, _: k.startswith('thumbnail')) + + formats, subtitles = [], {} + for media in traverse_obj(media_data, 'medias', default=[]): + if not media.get('url') or media.get('type') != 'HLS': + continue + f, subs = self._extract_m3u8_formats_and_subtitles( + media['url'], media_data['id'], 'mp4', m3u8_id='HLS', fatal=False) + formats.extend(f) + self._merge_subtitles(subs, target=subtitles) + + return { + 'id': media_data['id'], + 'title': media_data.get('name'), + 'description': media_data.get('description'), + 'thumbnails': thumbnails, + 'duration': int_or_none(media_data.get('duration')), + 'formats': formats, + 'subtitles': subtitles, + 'series': media_data.get('seriesName'), + 'season_number': int_or_none(media_data.get('seasonNumber')), + 'episode': media_data.get('name'), + 'episode_number': int_or_none(media_data.get('episodeNumber')), + } diff --git a/yt_dlp/extractor/playvid.py b/yt_dlp/extractor/playvid.py index 5ffefc934..18aeda7de 100644 --- a/yt_dlp/extractor/playvid.py +++ b/yt_dlp/extractor/playvid.py @@ -1,14 +1,9 @@ import re +import urllib.parse from .common import 
InfoExtractor -from ..compat import ( - compat_urllib_parse_unquote, - compat_urllib_parse_unquote_plus, -) -from ..utils import ( - clean_html, - ExtractorError, -) +from ..compat import compat_urllib_parse_unquote +from ..utils import ExtractorError, clean_html class PlayvidIE(InfoExtractor): @@ -62,7 +57,7 @@ class PlayvidIE(InfoExtractor): val = videovars_match.group(2) if key == 'title': - video_title = compat_urllib_parse_unquote_plus(val) + video_title = urllib.parse.unquote_plus(val) if key == 'duration': try: duration = int(val) diff --git a/yt_dlp/extractor/pokemon.py b/yt_dlp/extractor/pokemon.py index eef0d02ca..0911893d4 100644 --- a/yt_dlp/extractor/pokemon.py +++ b/yt_dlp/extractor/pokemon.py @@ -1,5 +1,3 @@ -import re - from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -136,42 +134,3 @@ class PokemonWatchIE(InfoExtractor): 'episode': video_data.get('title'), 'episode_number': int_or_none(video_data.get('episode')), }) - - -class PokemonSoundLibraryIE(InfoExtractor): - _VALID_URL = r'https?://soundlibrary\.pokemon\.co\.jp' - - _TESTS = [{ - 'url': 'https://soundlibrary.pokemon.co.jp/', - 'info_dict': { - 'title': 'Pokémon Diamond and Pearl Sound Tracks', - }, - 'playlist_mincount': 149, - }] - - def _real_extract(self, url): - musicbox_webpage = self._download_webpage( - 'https://soundlibrary.pokemon.co.jp/musicbox', None, - 'Downloading list of songs') - song_titles = [x.group(1) for x in re.finditer(r'<span>([^>]+?)</span><br/>をてもち曲に加えます。', musicbox_webpage)] - song_titles = song_titles[4::2] - - # each songs don't have permalink; instead we return all songs at once - song_entries = [{ - 'id': f'pokemon-soundlibrary-{song_id}', - 'url': f'https://soundlibrary.pokemon.co.jp/api/assets/signing/sounds/wav/{song_id}.wav', - # note: the server always serves MP3 files, despite its extension of the URL above - 'ext': 'mp3', - 'acodec': 'mp3', - 'vcodec': 'none', - 'title': song_title, - 'track': song_title, - 'artist': 'Nintendo / Creatures Inc. 
/ GAME FREAK inc.', - 'uploader': 'Pokémon', - 'release_year': 2006, - 'release_date': '20060928', - 'track_number': song_id, - 'album': 'Pokémon Diamond and Pearl', - } for song_id, song_title in enumerate(song_titles, 1)] - - return self.playlist_result(song_entries, playlist_title='Pokémon Diamond and Pearl Sound Tracks') diff --git a/yt_dlp/extractor/popcorntimes.py b/yt_dlp/extractor/popcorntimes.py index ed741a07b..ddc5ec8c8 100644 --- a/yt_dlp/extractor/popcorntimes.py +++ b/yt_dlp/extractor/popcorntimes.py @@ -1,8 +1,5 @@ from .common import InfoExtractor -from ..compat import ( - compat_b64decode, - compat_chr, -) +from ..compat import compat_b64decode from ..utils import int_or_none @@ -50,7 +47,7 @@ class PopcorntimesIE(InfoExtractor): c_ord += 13 if upper < c_ord: c_ord -= 26 - loc_b64 += compat_chr(c_ord) + loc_b64 += chr(c_ord) video_url = compat_b64decode(loc_b64).decode('utf-8') diff --git a/yt_dlp/extractor/pornhub.py b/yt_dlp/extractor/pornhub.py index d296ccacb..35468b4fc 100644 --- a/yt_dlp/extractor/pornhub.py +++ b/yt_dlp/extractor/pornhub.py @@ -3,28 +3,26 @@ import itertools import math import operator import re +import urllib.request from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_str, - compat_urllib_request, -) from .openload import PhantomJSwrapper +from ..compat import compat_HTTPError, compat_str from ..utils import ( + NO_DEFAULT, + ExtractorError, clean_html, determine_ext, - ExtractorError, format_field, int_or_none, merge_dicts, - NO_DEFAULT, orderedSet, remove_quotes, + remove_start, str_to_int, update_url_query, - urlencode_postdata, url_or_none, + urlencode_postdata, ) @@ -49,7 +47,7 @@ class PornHubBaseIE(InfoExtractor): r'document\.location\.reload\(true\)')): url_or_request = args[0] url = (url_or_request.get_full_url() - if isinstance(url_or_request, compat_urllib_request.Request) + if isinstance(url_or_request, urllib.request.Request) else url_or_request) phantom = PhantomJSwrapper(self, required_version='2.0') phantom.get(url, html=webpage) @@ -199,6 +197,16 @@ class PornHubIE(PornHubBaseIE): }, 'skip': 'This video has been disabled', }, { + 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph601dc30bae19a', + 'info_dict': { + 'id': 'ph601dc30bae19a', + 'uploader': 'Projekt Melody', + 'uploader_id': 'projekt-melody', + 'upload_date': '20210205', + 'title': '"Welcome to My Pussy Mansion" - CB Stream (02/03/21)', + 'thumbnail': r're:https?://.+', + }, + }, { 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d', 'only_matching': True, }, { @@ -429,7 +437,7 @@ class PornHubIE(PornHubBaseIE): default=None)) formats.append({ 'url': format_url, - 'format_id': format_field(height, template='%dp'), + 'format_id': format_field(height, None, '%dp'), 'height': height, }) @@ -457,9 +465,11 @@ class PornHubIE(PornHubBaseIE): self._sort_formats( formats, field_preference=('height', 'width', 'fps', 'format_id')) + model_profile = self._search_json( + r'var\s+MODEL_PROFILE\s*=', webpage, 'model profile', video_id, fatal=False) video_uploader = self._html_search_regex( r'(?s)From: .+?<(?:a\b[^>]+\bhref=["\']/(?:(?:user|channel)s|model|pornstar)/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<', - webpage, 'uploader', default=None) + webpage, 'uploader', default=None) or model_profile.get('username') def extract_vote_count(kind, name): return self._extract_count( @@ -488,6 +498,7 @@ class PornHubIE(PornHubBaseIE): return merge_dicts({ 'id': video_id, 'uploader': video_uploader, + 'uploader_id': 
remove_start(model_profile.get('modelProfileLink'), '/model/'), 'upload_date': upload_date, 'title': title, 'thumbnail': thumbnail, diff --git a/yt_dlp/extractor/premiershiprugby.py b/yt_dlp/extractor/premiershiprugby.py new file mode 100644 index 000000000..67d41fdfd --- /dev/null +++ b/yt_dlp/extractor/premiershiprugby.py @@ -0,0 +1,39 @@ +from .common import InfoExtractor +from ..utils import int_or_none, traverse_obj + + +class PremiershipRugbyIE(InfoExtractor): + _VALID_URL = r'https?://(?:\w+\.)premiershiprugby\.(?:com)/watch/(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://www.premiershiprugby.com/watch/full-match-harlequins-v-newcastle-falcons', + 'info_dict': { + 'id': '0_mbkb7ldt', + 'title': 'Full Match: Harlequins v Newcastle Falcons', + 'ext': 'mp4', + 'thumbnail': 'https://open.http.mp.streamamg.com/p/3000914/sp/300091400/thumbnail/entry_id/0_mbkb7ldt//width/960/height/540/type/1/quality/75', + 'duration': 6093.0, + 'tags': ['video'], + 'categories': ['Full Match', 'Harlequins', 'Newcastle Falcons', 'gallaher premiership'], + } + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + json_data = self._download_json( + f'https://article-cms-api.incrowdsports.com/v2/articles/slug/{display_id}', + display_id, query={'clientId': 'PRL'})['data']['article'] + + formats, subs = self._extract_m3u8_formats_and_subtitles( + json_data['heroMedia']['content']['videoLink'], display_id) + + return { + 'id': json_data['heroMedia']['content']['sourceSystemId'], + 'display_id': display_id, + 'title': traverse_obj(json_data, ('heroMedia', 'title')), + 'formats': formats, + 'subtitles': subs, + 'thumbnail': traverse_obj(json_data, ('heroMedia', 'content', 'videoThumbnail')), + 'duration': int_or_none(traverse_obj(json_data, ('heroMedia', 'content', 'metadata', 'msDuration')), scale=1000), + 'tags': json_data.get('tags'), + 'categories': traverse_obj(json_data, ('categories', ..., 'text')), + } diff --git a/yt_dlp/extractor/puls4.py b/yt_dlp/extractor/puls4.py index 3c13d1f56..38c5d1109 100644 --- a/yt_dlp/extractor/puls4.py +++ b/yt_dlp/extractor/puls4.py @@ -1,9 +1,6 @@ from .prosiebensat1 import ProSiebenSat1BaseIE -from ..utils import ( - unified_strdate, - parse_duration, - compat_str, -) +from ..compat import compat_str +from ..utils import parse_duration, unified_strdate class Puls4IE(ProSiebenSat1BaseIE): diff --git a/yt_dlp/extractor/radiko.py b/yt_dlp/extractor/radiko.py index dbb748715..498cc6be9 100644 --- a/yt_dlp/extractor/radiko.py +++ b/yt_dlp/extractor/radiko.py @@ -43,7 +43,7 @@ class RadikoBaseIE(InfoExtractor): }).split(',')[0] auth_data = (auth_token, area_id) - self._downloader.cache.store('radiko', 'auth_data', auth_data) + self.cache.store('radiko', 'auth_data', auth_data) return auth_data def _extract_full_key(self): @@ -150,7 +150,7 @@ class RadikoIE(RadikoBaseIE): vid_int = unified_timestamp(video_id, False) prog, station_program, ft, radio_begin, radio_end = self._find_program(video_id, station, vid_int) - auth_cache = self._downloader.cache.load('radiko', 'auth_data') + auth_cache = self.cache.load('radiko', 'auth_data') for attempt in range(2): auth_token, area_id = (not attempt and auth_cache) or self._auth_client() formats = self._extract_formats( diff --git a/yt_dlp/extractor/radiofrance.py b/yt_dlp/extractor/radiofrance.py index 8fef54dab..7b60b2617 100644 --- a/yt_dlp/extractor/radiofrance.py +++ b/yt_dlp/extractor/radiofrance.py @@ -1,6 +1,7 @@ import re from .common import InfoExtractor +from ..utils import parse_duration, unified_strdate 
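# Editorial sketch, not part of the patch: FranceCultureIE, added further below,
# pulls the AudioObject JSON-LD out of the page by pattern (via _search_json
# with contains_pattern) instead of _search_json_ld. Roughly, under that
# assumption:
#
#   import json, re
#   page = '<script type="application/ld+json">{"@type": "AudioObject", "contentUrl": "https://media.example/ep.mp3", "encodingFormat": "mp3"}</script>'
#   blob = re.search(r'{[^<]*"@type"\s*:\s*"AudioObject"[^<]*}', page).group(0)
#   audio = json.loads(blob)
#   print(audio['contentUrl'], audio['encodingFormat'])  # -> https://media.example/ep.mp3 mp3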
class RadioFranceIE(InfoExtractor): @@ -54,3 +55,51 @@ class RadioFranceIE(InfoExtractor): 'description': description, 'uploader': uploader, } + + +class FranceCultureIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?radiofrance\.fr/(?:franceculture|fip|francemusique|mouv|franceinter)/podcasts/(?:[^?#]+/)?(?P<display_id>[^?#]+)-(?P<id>\d+)($|[?#])' + _TESTS = [ + { + 'url': 'https://www.radiofrance.fr/franceculture/podcasts/science-en-questions/la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau-8440487', + 'info_dict': { + 'id': '8440487', + 'display_id': 'la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau', + 'ext': 'mp3', + 'title': 'La physique d’Einstein aiderait-elle à comprendre le cerveau ?', + 'description': 'Existerait-il un pont conceptuel entre la physique de l’espace-temps et les neurosciences ?', + 'thumbnail': 'https://cdn.radiofrance.fr/s3/cruiser-production/2022/05/d184e7a3-4827-4494-bf94-04ed7b120db4/1200x630_gettyimages-200171095-001.jpg', + 'upload_date': '20220514', + 'duration': 2750, + }, + }, + { + 'url': 'https://www.radiofrance.fr/franceinter/podcasts/la-rafle-du-vel-d-hiv-une-affaire-d-etat/les-racines-du-crime-episode-1-3715507', + 'only_matching': True, + } + ] + + def _real_extract(self, url): + video_id, display_id = self._match_valid_url(url).group('id', 'display_id') + webpage = self._download_webpage(url, display_id) + + # _search_json_ld doesn't correctly handle this. See https://github.com/yt-dlp/yt-dlp/pull/3874#discussion_r891903846 + video_data = self._search_json('', webpage, 'audio data', display_id, contains_pattern=r'\s*"@type"\s*:\s*"AudioObject"\s*.+') + + return { + 'id': video_id, + 'display_id': display_id, + 'url': video_data['contentUrl'], + 'ext': video_data.get('encodingFormat'), + 'vcodec': 'none' if video_data.get('encodingFormat') == 'mp3' else None, + 'duration': parse_duration(video_data.get('duration')), + 'title': self._html_search_regex(r'(?s)<h1[^>]*itemprop="[^"]*name[^"]*"[^>]*>(.+?)</h1>', + webpage, 'title', default=self._og_search_title(webpage)), + 'description': self._html_search_regex( + r'(?s)<meta name="description"\s*content="([^"]+)', webpage, 'description', default=None), + 'thumbnail': self._og_search_thumbnail(webpage), + 'uploader': self._html_search_regex( + r'(?s)<span class="author">(.*?)</span>', webpage, 'uploader', default=None), + 'upload_date': unified_strdate(self._search_regex( + r'"datePublished"\s*:\s*"([^"]+)', webpage, 'timestamp', fatal=False)) + } diff --git a/yt_dlp/extractor/radlive.py b/yt_dlp/extractor/radlive.py index dc9897305..d89c9563b 100644 --- a/yt_dlp/extractor/radlive.py +++ b/yt_dlp/extractor/radlive.py @@ -80,7 +80,7 @@ class RadLiveIE(InfoExtractor): 'release_timestamp': release_date, 'channel': channel.get('name'), 'channel_id': channel_id, - 'channel_url': format_field(channel_id, template='https://rad.live/content/channel/%s'), + 'channel_url': format_field(channel_id, None, 'https://rad.live/content/channel/%s'), } if content_type == 'episode': diff --git a/yt_dlp/extractor/rokfin.py b/yt_dlp/extractor/rokfin.py index ad53d697e..119c5ea3c 100644 --- a/yt_dlp/extractor/rokfin.py +++ b/yt_dlp/extractor/rokfin.py @@ -146,7 +146,7 @@ class RokfinIE(InfoExtractor): for page_n in itertools.count(): raw_comments = self._download_json( f'{_API_BASE_URL}comment?postId={video_id[5:]}&page={page_n}&size=50', - video_id, note=f'Downloading viewer comments page {page_n + 1}{format_field(pages_total, template=" of %s")}', + video_id, note=f'Downloading viewer comments 
page {page_n + 1}{format_field(pages_total, None, " of %s")}', fatal=False) or {} for comment in raw_comments.get('content') or []: @@ -318,7 +318,7 @@ class RokfinChannelIE(RokfinPlaylistBaseIE): data_url = f'{_API_BASE_URL}post/search/{tab}?page={page_n}&size=50&creator={channel_id}' metadata = self._download_json( data_url, channel_name, - note=f'Downloading video metadata page {page_n + 1}{format_field(pages_total, template=" of %s")}') + note=f'Downloading video metadata page {page_n + 1}{format_field(pages_total, None, " of %s")}') yield from self._get_video_data(metadata) pages_total = int_or_none(metadata.get('totalPages')) or None @@ -360,7 +360,7 @@ class RokfinSearchIE(SearchInfoExtractor): _db_access_key = None def _real_initialize(self): - self._db_url, self._db_access_key = self._downloader.cache.load(self.ie_key(), 'auth', default=(None, None)) + self._db_url, self._db_access_key = self.cache.load(self.ie_key(), 'auth', default=(None, None)) if not self._db_url: self._get_db_access_credentials() @@ -369,7 +369,7 @@ class RokfinSearchIE(SearchInfoExtractor): for page_number in itertools.count(1): search_results = self._run_search_query( query, data={'query': query, 'page': {'size': 100, 'current': page_number}}, - note=f'Downloading page {page_number}{format_field(total_pages, template=" of ~%s")}') + note=f'Downloading page {page_number}{format_field(total_pages, None, " of ~%s")}') total_pages = traverse_obj(search_results, ('meta', 'page', 'total_pages'), expected_type=int_or_none) for result in search_results.get('results') or []: @@ -405,6 +405,6 @@ class RokfinSearchIE(SearchInfoExtractor): self._db_url = url_or_none(f'{auth_data["ENDPOINT_BASE"]}/api/as/v1/engines/rokfin-search/search.json') self._db_access_key = f'Bearer {auth_data["SEARCH_KEY"]}' - self._downloader.cache.store(self.ie_key(), 'auth', (self._db_url, self._db_access_key)) + self.cache.store(self.ie_key(), 'auth', (self._db_url, self._db_access_key)) return raise ExtractorError('Unable to extract access credentials') diff --git a/yt_dlp/extractor/rtve.py b/yt_dlp/extractor/rtve.py index 42a602968..798dde7fa 100644 --- a/yt_dlp/extractor/rtve.py +++ b/yt_dlp/extractor/rtve.py @@ -1,14 +1,12 @@ import base64 import io +import struct from .common import InfoExtractor -from ..compat import ( - compat_b64decode, - compat_struct_unpack, -) +from ..compat import compat_b64decode from ..utils import ( - determine_ext, ExtractorError, + determine_ext, float_or_none, qualities, remove_end, @@ -73,7 +71,7 @@ class RTVEALaCartaIE(InfoExtractor): def _decrypt_url(png): encrypted_data = io.BytesIO(compat_b64decode(png)[8:]) while True: - length = compat_struct_unpack('!I', encrypted_data.read(4))[0] + length = struct.unpack('!I', encrypted_data.read(4))[0] chunk_type = encrypted_data.read(4) if chunk_type == b'IEND': break diff --git a/yt_dlp/extractor/rumble.py b/yt_dlp/extractor/rumble.py index 50c383d79..924f9829f 100644 --- a/yt_dlp/extractor/rumble.py +++ b/yt_dlp/extractor/rumble.py @@ -24,6 +24,11 @@ class RumbleEmbedIE(InfoExtractor): 'title': 'WMAR 2 News Latest Headlines | October 20, 6pm', 'timestamp': 1571611968, 'upload_date': '20191020', + 'channel_url': 'https://rumble.com/c/WMAR', + 'channel': 'WMAR', + 'thumbnail': 'https://sp.rmbl.ws/s8/1/5/M/z/1/5Mz1a.OvCc-small-WMAR-2-News-Latest-Headline.jpg', + 'duration': 234, + 'uploader': 'WMAR', } }, { 'url': 'https://rumble.com/embed/vslb7v', @@ -38,19 +43,21 @@ class RumbleEmbedIE(InfoExtractor): 'channel': 'CTNews', 'thumbnail': 
'https://sp.rmbl.ws/s8/6/7/i/9/h/7i9hd.OvCc.jpg', 'duration': 901, + 'uploader': 'CTNews', } }, { 'url': 'https://rumble.com/embed/ufe9n.v5pv5f', 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r'(?:<(?:script|iframe)[^>]+\bsrc=|["\']embedUrl["\']\s*:\s*)["\'](?P<url>%s)' % RumbleEmbedIE._VALID_URL, - webpage)] + @classmethod + def _extract_urls(cls, webpage): + embeds = tuple(re.finditer( + fr'(?:<(?:script|iframe)[^>]+\bsrc=|["\']embedUrl["\']\s*:\s*)["\'](?P<url>{cls._VALID_URL})', webpage)) + if embeds: + return [mobj.group('url') for mobj in embeds] + return [f'https://rumble.com/embed/{mobj.group("id")}' for mobj in re.finditer( + r'<script>\s*Rumble\(\s*"play"\s*,\s*{\s*[\'"]video[\'"]\s*:\s*[\'"](?P<id>[0-9a-z]+)[\'"]', webpage)] def _real_extract(self, url): video_id = self._match_id(url) @@ -77,17 +84,26 @@ class RumbleEmbedIE(InfoExtractor): formats.append(f) self._sort_formats(formats) + subtitles = { + lang: [{ + 'url': sub_info['path'], + 'name': sub_info.get('language') or '', + }] for lang, sub_info in (video.get('cc') or {}).items() if sub_info.get('path') + } + author = video.get('author') or {} return { 'id': video_id, 'title': title, 'formats': formats, + 'subtitles': subtitles, 'thumbnail': video.get('i'), 'timestamp': parse_iso8601(video.get('pubDate')), 'channel': author.get('name'), 'channel_url': author.get('url'), 'duration': int_or_none(video.get('duration')), + 'uploader': author.get('name'), } diff --git a/yt_dlp/extractor/screencast.py b/yt_dlp/extractor/screencast.py index e3dbaab69..df5e79bef 100644 --- a/yt_dlp/extractor/screencast.py +++ b/yt_dlp/extractor/screencast.py @@ -1,11 +1,8 @@ +import urllib.request + from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_urllib_request, -) -from ..utils import ( - ExtractorError, -) +from ..compat import compat_parse_qs +from ..utils import ExtractorError class ScreencastIE(InfoExtractor): @@ -75,7 +72,7 @@ class ScreencastIE(InfoExtractor): flash_vars_s = flash_vars_s.replace(',', '&') if flash_vars_s: flash_vars = compat_parse_qs(flash_vars_s) - video_url_raw = compat_urllib_request.quote( + video_url_raw = urllib.request.quote( flash_vars['content'][0]) video_url = video_url_raw.replace('http%3A', 'http:') diff --git a/yt_dlp/extractor/shared.py b/yt_dlp/extractor/shared.py index 5bc097b0d..9a237b320 100644 --- a/yt_dlp/extractor/shared.py +++ b/yt_dlp/extractor/shared.py @@ -1,14 +1,13 @@ +import urllib.parse + from .common import InfoExtractor -from ..compat import ( - compat_b64decode, - compat_urllib_parse_unquote_plus, -) +from ..compat import compat_b64decode from ..utils import ( - determine_ext, + KNOWN_EXTENSIONS, ExtractorError, + determine_ext, int_or_none, js_to_json, - KNOWN_EXTENSIONS, parse_filesize, rot47, url_or_none, @@ -130,7 +129,7 @@ class VivoIE(SharedBaseIE): return stream_url def decode_url(encoded_url): - return rot47(compat_urllib_parse_unquote_plus(encoded_url)) + return rot47(urllib.parse.unquote_plus(encoded_url)) return decode_url(self._parse_json( self._search_regex( diff --git a/yt_dlp/extractor/soundcloud.py b/yt_dlp/extractor/soundcloud.py index 6dfa50c60..9e4c8cf25 100644 --- a/yt_dlp/extractor/soundcloud.py +++ b/yt_dlp/extractor/soundcloud.py @@ -67,7 +67,7 @@ class SoundcloudBaseIE(InfoExtractor): _HEADERS = {} def _store_client_id(self, client_id): - self._downloader.cache.store('soundcloud', 'client_id', client_id) + self.cache.store('soundcloud', 
'client_id', client_id) def _update_client_id(self): webpage = self._download_webpage('https://soundcloud.com/', None) @@ -104,7 +104,7 @@ class SoundcloudBaseIE(InfoExtractor): raise def _initialize_pre_login(self): - self._CLIENT_ID = self._downloader.cache.load('soundcloud', 'client_id') or 'a3e059563d7fd3372b49b37f00a00bcf' + self._CLIENT_ID = self.cache.load('soundcloud', 'client_id') or 'a3e059563d7fd3372b49b37f00a00bcf' def _perform_login(self, username, password): if username != 'oauth': diff --git a/yt_dlp/extractor/southpark.py b/yt_dlp/extractor/southpark.py index 855f1d6d3..7381ac362 100644 --- a/yt_dlp/extractor/southpark.py +++ b/yt_dlp/extractor/southpark.py @@ -109,6 +109,49 @@ class SouthParkDeIE(SouthParkIE): return +class SouthParkLatIE(SouthParkIE): + IE_NAME = 'southpark.lat' + _VALID_URL = r'https?://(?:www\.)?southpark\.lat/(?:en/)?(?:video-?clips?|collections|episod(?:e|io)s)/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.southpark.lat/en/video-clips/ct46op/south-park-tooth-fairy-cartman', + 'only_matching': True, + }, { + 'url': 'https://www.southpark.lat/episodios/9h0qbg/south-park-orgia-gatuna-temporada-3-ep-7', + 'only_matching': True, + }, { + 'url': 'https://www.southpark.lat/en/collections/29ve08/south-park-heating-up/lydbrc', + 'only_matching': True, + }, { + # clip + 'url': 'https://www.southpark.lat/en/video-clips/ct46op/south-park-tooth-fairy-cartman', + 'info_dict': { + 'id': 'e99d45ea-ed00-11e0-aca6-0026b9414f30', + 'ext': 'mp4', + 'title': 'Tooth Fairy Cartman', + 'description': 'md5:db02e23818b4dc9cb5f0c5a7e8833a68', + }, + }, { + # episode + 'url': 'https://www.southpark.lat/episodios/9h0qbg/south-park-orgia-gatuna-temporada-3-ep-7', + 'info_dict': { + 'id': 'f5fbd823-04bc-11eb-9b1b-0e40cf2fc285', + 'ext': 'mp4', + 'title': 'South Park', + 'description': 'md5:ae0d875eff169dcbed16b21531857ac1', + }, + }] + + def _get_feed_url(self, uri, url=None): + video_id = self._id_from_uri(uri) + config = self._download_json( + f'http://media.mtvnservices.com/pmt/e1/access/index.html?uri={uri}&configtype=edge&ref={url}', + video_id) + return self._remove_template_parameter(config['feedWithQueryParams']) + + def _get_feed_query(self, uri): + return + + class SouthParkNlIE(SouthParkIE): IE_NAME = 'southpark.nl' _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.nl/(?:clips|(?:full-)?episodes|collections)/(?P<id>.+?)(\?|#|$))' diff --git a/yt_dlp/extractor/spotify.py b/yt_dlp/extractor/spotify.py index a2068a1b6..fef8d8dd2 100644 --- a/yt_dlp/extractor/spotify.py +++ b/yt_dlp/extractor/spotify.py @@ -1,12 +1,15 @@ +import functools import json import re from .common import InfoExtractor from ..utils import ( + OnDemandPagedList, clean_podcast_url, float_or_none, int_or_none, strip_or_none, + traverse_obj, try_get, unified_strdate, ) @@ -25,7 +28,7 @@ class SpotifyBaseIE(InfoExtractor): self._ACCESS_TOKEN = self._download_json( 'https://open.spotify.com/get_access_token', None)['accessToken'] - def _call_api(self, operation, video_id, variables): + def _call_api(self, operation, video_id, variables, **kwargs): return self._download_json( 'https://api-partner.spotify.com/pathfinder/v1/query', video_id, query={ 'operationName': 'query' + operation, @@ -35,7 +38,8 @@ class SpotifyBaseIE(InfoExtractor): 'sha256Hash': self._OPERATION_HASHES[operation], }, }) - }, headers={'authorization': 'Bearer ' + self._ACCESS_TOKEN})['data'] + }, headers={'authorization': 'Bearer ' + self._ACCESS_TOKEN}, + **kwargs)['data'] def _extract_episode(self, episode, series): 
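# --- aside: the SpotifyShowIE hunk below drops the one-shot
# `'limit': 1000000000` request in favour of OnDemandPagedList, which
# fetches 100-episode pages only as the playlist is consumed. A rough
# standalone model of that lazy-paging idea (fetch_page and PER_PAGE are
# invented stand-ins for this sketch, not Spotify's API or yt-dlp's class):
import itertools

PER_PAGE = 3

def fetch_page(page):
    # pretend this is one network round-trip returning up to PER_PAGE items
    data = [{'id': n} for n in range(10)]
    return data[page * PER_PAGE:(page + 1) * PER_PAGE]

def lazy_entries():
    for page in itertools.count():
        chunk = fetch_page(page)
        yield from chunk
        if len(chunk) < PER_PAGE:  # a short page means we hit the end
            return

# a consumer that stops early (e.g. --playlist-items 1:4) never pays for
# the remaining pages:
print(list(itertools.islice(lazy_entries(), 4)))  # touches pages 0 and 1 only
# ---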
episode_id = episode['id'] @@ -143,22 +147,25 @@ class SpotifyShowIE(SpotifyBaseIE): }, 'playlist_mincount': 36, } + _PER_PAGE = 100 + + def _fetch_page(self, show_id, page=0): + return self._call_api('ShowEpisodes', show_id, { + 'limit': 100, + 'offset': page * self._PER_PAGE, + 'uri': f'spotify:show:{show_id}', + }, note=f'Downloading page {page + 1} JSON metadata')['podcast'] def _real_extract(self, url): show_id = self._match_id(url) - podcast = self._call_api('ShowEpisodes', show_id, { - 'limit': 1000000000, - 'offset': 0, - 'uri': 'spotify:show:' + show_id, - })['podcast'] - podcast_name = podcast.get('name') - - entries = [] - for item in (try_get(podcast, lambda x: x['episodes']['items']) or []): - episode = item.get('episode') - if not episode: - continue - entries.append(self._extract_episode(episode, podcast_name)) + first_page = self._fetch_page(show_id) + + def _entries(page): + podcast = self._fetch_page(show_id, page) if page else first_page + yield from map( + functools.partial(self._extract_episode, series=podcast.get('name')), + traverse_obj(podcast, ('episodes', 'items', ..., 'episode'))) return self.playlist_result( - entries, show_id, podcast_name, podcast.get('description')) + OnDemandPagedList(_entries, self._PER_PAGE), + show_id, first_page.get('name'), first_page.get('description')) diff --git a/yt_dlp/extractor/storyfire.py b/yt_dlp/extractor/storyfire.py index 716190220..035747c31 100644 --- a/yt_dlp/extractor/storyfire.py +++ b/yt_dlp/extractor/storyfire.py @@ -44,7 +44,7 @@ class StoryFireBaseIE(InfoExtractor): 'timestamp': int_or_none(video.get('publishDate')), 'uploader': video.get('username'), 'uploader_id': uploader_id, - 'uploader_url': format_field(uploader_id, template='https://storyfire.com/user/%s/video'), + 'uploader_url': format_field(uploader_id, None, 'https://storyfire.com/user/%s/video'), 'episode_number': int_or_none(video.get('episodeNumber') or video.get('episode_number')), } diff --git a/yt_dlp/extractor/streamcz.py b/yt_dlp/extractor/streamcz.py index 85fc3a3c3..849a9882d 100644 --- a/yt_dlp/extractor/streamcz.py +++ b/yt_dlp/extractor/streamcz.py @@ -52,8 +52,8 @@ class StreamCZIE(InfoExtractor): def _extract_formats(self, spl_url, video): for ext, pref, streams in ( - ('ts', -1, traverse_obj(video, ('http_stream', 'qualities'))), - ('mp4', 1, video.get('mp4'))): + ('ts', -1, traverse_obj(video, ('http_stream', 'qualities')) or {}), + ('mp4', 1, video.get('mp4') or {})): for format_id, stream in streams.items(): if not stream.get('url'): continue diff --git a/yt_dlp/extractor/stv.py b/yt_dlp/extractor/stv.py index 618dc4329..c879fb52e 100644 --- a/yt_dlp/extractor/stv.py +++ b/yt_dlp/extractor/stv.py @@ -1,6 +1,6 @@ from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( - compat_str, float_or_none, int_or_none, smuggle_url, diff --git a/yt_dlp/extractor/substack.py b/yt_dlp/extractor/substack.py new file mode 100644 index 000000000..70cf10515 --- /dev/null +++ b/yt_dlp/extractor/substack.py @@ -0,0 +1,100 @@ +import re +import urllib.parse + +from .common import InfoExtractor +from ..utils import str_or_none, traverse_obj + + +class SubstackIE(InfoExtractor): + _VALID_URL = r'https?://(?P<username>[\w-]+)\.substack\.com/p/(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://haleynahman.substack.com/p/i-made-a-vlog?s=r', + 'md5': 'f27e4fc6252001d48d479f45e65cdfd5', + 'info_dict': { + 'id': '47660949', + 'ext': 'mp4', + 'title': 'I MADE A VLOG', + 'description': 'md5:10c01ff93439a62e70ce963b2aa0b7f6', + 
'thumbnail': 'md5:bec758a34d8ee9142d43bcebdf33af18', + 'uploader': 'Maybe Baby', + 'uploader_id': '33628', + } + }, { + 'url': 'https://haleynahman.substack.com/p/-dear-danny-i-found-my-boyfriends?s=r', + 'md5': '0a63eacec877a1171a62cfa69710fcea', + 'info_dict': { + 'id': '51045592', + 'ext': 'mpga', + 'title': "🎧 Dear Danny: I found my boyfriend's secret Twitter account", + 'description': 'md5:a57f2439319e56e0af92dd0c95d75797', + 'thumbnail': 'md5:daa40b6b79249417c14ff8103db29639', + 'uploader': 'Maybe Baby', + 'uploader_id': '33628', + } + }, { + 'url': 'https://andrewzimmern.substack.com/p/mussels-with-black-bean-sauce-recipe', + 'md5': 'fd3c07077b02444ff0130715b5f632bb', + 'info_dict': { + 'id': '47368578', + 'ext': 'mp4', + 'title': 'Mussels with Black Bean Sauce: Recipe of the Week #7', + 'description': 'md5:b96234a2906c7d854d5229818d889515', + 'thumbnail': 'md5:e30bfaa9da40e82aa62354263a9dd232', + 'uploader': "Andrew Zimmern's Spilled Milk ", + 'uploader_id': '577659', + } + }] + + @classmethod + def _extract_url(cls, webpage, url): + if not re.search(r'<script[^>]+src=["\']https://substackcdn.com/[^"\']+\.js', webpage): + return + + mobj = re.search(r'{[^}]*["\']subdomain["\']\s*:\s*["\'](?P<subdomain>[^"]+)', webpage) + if mobj: + parsed = urllib.parse.urlparse(url) + return parsed._replace(netloc=f'{mobj.group("subdomain")}.substack.com').geturl() + + def _extract_video_formats(self, video_id, username): + formats, subtitles = [], {} + for video_format in ('hls', 'mp4'): + video_url = f'https://{username}.substack.com/api/v1/video/upload/{video_id}/src?type={video_format}' + + if video_format == 'hls': + fmts, subs = self._extract_m3u8_formats_and_subtitles(video_url, video_id, 'mp4', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + else: + formats.append({ + 'url': video_url, + 'ext': video_format, + }) + + return formats, subtitles + + def _real_extract(self, url): + display_id, username = self._match_valid_url(url).group('id', 'username') + webpage = self._download_webpage(url, display_id) + + webpage_info = self._search_json(r'<script[^>]*>\s*window\._preloads\s*=', webpage, 'preloads', display_id) + + post_type = webpage_info['post']['type'] + formats, subtitles = [], {} + if post_type == 'podcast': + formats, subtitles = [{'url': webpage_info['post']['podcast_url']}], {} + elif post_type == 'video': + formats, subtitles = self._extract_video_formats(webpage_info['post']['videoUpload']['id'], username) + else: + self.raise_no_formats(f'Page type "{post_type}" is not supported') + + self._sort_formats(formats) + return { + 'id': str(webpage_info['post']['id']), + 'formats': formats, + 'subtitles': subtitles, + 'title': traverse_obj(webpage_info, ('post', 'title')), + 'description': traverse_obj(webpage_info, ('post', 'description')), + 'thumbnail': traverse_obj(webpage_info, ('post', 'cover_image')), + 'uploader': traverse_obj(webpage_info, ('pub', 'name')), + 'uploader_id': str_or_none(traverse_obj(webpage_info, ('post', 'publication_id'))), + } diff --git a/yt_dlp/extractor/tennistv.py b/yt_dlp/extractor/tennistv.py index 80acaf190..3bd7ce3c4 100644 --- a/yt_dlp/extractor/tennistv.py +++ b/yt_dlp/extractor/tennistv.py @@ -1,16 +1,17 @@ -import json +import urllib.parse from .common import InfoExtractor - from ..utils import ( ExtractorError, + random_uuidv4, unified_timestamp, + urlencode_postdata, ) class TennisTVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?tennistv\.com/videos/(?P<id>[-a-z0-9]+)' - _TEST = { + _TESTS = [{ 
'url': 'https://www.tennistv.com/videos/indian-wells-2018-verdasco-fritz', 'info_dict': { 'id': 'indian-wells-2018-verdasco-fritz', @@ -25,86 +26,132 @@ class TennisTVIE(InfoExtractor): 'skip_download': True, }, 'skip': 'Requires email and password of a subscribed account', - } + }, { + 'url': 'https://www.tennistv.com/videos/2650480/best-matches-of-2022-part-5', + 'info_dict': { + 'id': '2650480', + 'ext': 'mp4', + 'title': 'Best Matches of 2022 - Part 5', + 'description': 'md5:36dec3bfae7ed74bd79e48045b17264c', + 'thumbnail': 'https://open.http.mp.streamamg.com/p/3001482/sp/300148200/thumbnail/entry_id/0_myef18pd/version/100001/height/1920', + }, + 'params': {'skip_download': 'm3u8'}, + 'skip': 'Requires email and password of a subscribed account', + }] _NETRC_MACHINE = 'tennistv' - _session_token = None - - def _perform_login(self, username, password): - login_form = { - 'Email': username, - 'Password': password, - } - login_json = json.dumps(login_form).encode('utf-8') - headers = { - 'content-type': 'application/json', - 'Referer': 'https://www.tennistv.com/login', - 'Origin': 'https://www.tennistv.com', - } - - login_result = self._download_json( - 'https://www.tennistv.com/api/users/v1/login', None, - note='Logging in', - errnote='Login failed (wrong password?)', - headers=headers, - data=login_json) + access_token, refresh_token = None, None + _PARTNER_ID = 3001482 + _FORMAT_URL = 'https://open.http.mp.streamamg.com/p/{partner}/sp/{partner}00/playManifest/entryId/{entry}/format/applehttp/protocol/https/a.m3u8?ks={session}' + _AUTH_BASE_URL = 'https://sso.tennistv.com/auth/realms/TennisTV/protocol/openid-connect' + _HEADERS = { + 'origin': 'https://www.tennistv.com', + 'referer': 'https://www.tennistv.com/', + 'content-Type': 'application/x-www-form-urlencoded' + } - if login_result['error']['errorCode']: - raise ExtractorError('Login failed, %s said: %r' % (self.IE_NAME, login_result['error']['errorMessage'])) + def _perform_login(self, username, password): + login_page = self._download_webpage( + f'{self._AUTH_BASE_URL}/auth', None, 'Downloading login page', + query={ + 'client_id': 'tennis-tv-web', + 'redirect_uri': 'https://tennistv.com', + 'response_mode': 'fragment', + 'response_type': 'code', + 'scope': 'openid' + }) + + post_url = self._html_search_regex(r'action=["\']([^"\']+?)["\']\s+method=["\']post["\']', login_page, 'login POST url') + temp_page = self._download_webpage( + post_url, None, 'Sending login data', 'Unable to send login data', + headers=self._HEADERS, data=urlencode_postdata({ + 'username': username, + 'password': password, + 'submitAction': 'Log In' + })) + if 'Your username or password was incorrect' in temp_page: + raise ExtractorError('Your username or password was incorrect', expected=True) + + handle = self._request_webpage( + f'{self._AUTH_BASE_URL}/auth', None, 'Logging in', headers=self._HEADERS, + query={ + 'client_id': 'tennis-tv-web', + 'redirect_uri': 'https://www.tennistv.com/resources/v1.1.10/html/silent-check-sso.html', + 'state': random_uuidv4(), + 'response_mode': 'fragment', + 'response_type': 'code', + 'scope': 'openid', + 'nonce': random_uuidv4(), + 'prompt': 'none' + }) + + self.get_token(None, { + 'code': urllib.parse.parse_qs(handle.geturl())['code'][-1], + 'grant_type': 'authorization_code', + 'client_id': 'tennis-tv-web', + 'redirect_uri': 'https://www.tennistv.com/resources/v1.1.10/html/silent-check-sso.html' + }) + + def get_token(self, video_id, payload): + res = self._download_json( + f'{self._AUTH_BASE_URL}/token', video_id, 
'Fetching tokens', + 'Unable to fetch tokens', headers=self._HEADERS, data=urlencode_postdata(payload)) + + self.access_token = res.get('access_token') or self.access_token + self.refresh_token = res.get('refresh_token') or self.refresh_token - if login_result['entitlement'] != 'SUBSCRIBED': - self.report_warning('%s may not be subscribed to %s.' % (username, self.IE_NAME)) + def _real_initialize(self): + if self.access_token and self.refresh_token: + return - self._session_token = login_result['sessionToken'] + cookies = self._get_cookies('https://www.tennistv.com/') + if not cookies.get('access_token') or not cookies.get('refresh_token'): + self.raise_login_required() + self.access_token, self.refresh_token = cookies['access_token'].value, cookies['refresh_token'].value - def _real_initialize(self): - if not self._session_token: - raise self.raise_login_required('Login info is needed for this website', method='password') + def _download_session_json(self, video_id, entryid,): + return self._download_json( + f'https://atppayments.streamamg.com/api/v1/session/ksession/?lang=en&apijwttoken={self.access_token}&entryId={entryid}', + video_id, 'Downloading ksession token', 'Failed to download ksession token', headers=self._HEADERS) def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - internal_id = self._search_regex(r'video=([\w-]+)', webpage, 'internal video id') + entryid = self._search_regex(r'data-entry-id=["\']([^"\']+)', webpage, 'entryID') + session_json = self._download_session_json(video_id, entryid) - headers = { - 'Origin': 'https://www.tennistv.com', - 'authorization': 'ATP %s' % self._session_token, - 'content-type': 'application/json', - 'Referer': url, - } - check_data = { - 'videoID': internal_id, - 'VideoUrlType': 'HLS', - } - check_json = json.dumps(check_data).encode('utf-8') - check_result = self._download_json( - 'https://www.tennistv.com/api/users/v1/entitlementchecknondiva', - video_id, note='Checking video authorization', headers=headers, data=check_json) - formats = self._extract_m3u8_formats(check_result['contentUrl'], video_id, ext='mp4') - self._sort_formats(formats) + k_session = session_json.get('KSession') + if k_session is None: + self.get_token(video_id, { + 'grant_type': 'refresh_token', + 'refresh_token': self.refresh_token, + 'client_id': 'tennis-tv-web' + }) + k_session = self._download_session_json(video_id, entryid).get('KSession') + if k_session is None: + raise ExtractorError('Failed to get KSession, possibly a premium video', expected=True) - vdata = self._download_json( - 'https://www.tennistv.com/api/en/v2/none/common/video/%s' % video_id, - video_id, headers=headers) + if session_json.get('ErrorMessage'): + self.report_warning(session_json['ErrorMessage']) - timestamp = unified_timestamp(vdata['timestamp']) - thumbnail = vdata['video']['thumbnailUrl'] - description = vdata['displayText']['description'] - title = vdata['video']['title'] + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + self._FORMAT_URL.format(partner=self._PARTNER_ID, entry=entryid, session=k_session), video_id) - series = vdata['tour'] - venue = vdata['displayText']['venue'] - round_str = vdata['seo']['round'] + self._sort_formats(formats) return { 'id': video_id, - 'title': title, - 'description': description, + 'title': self._html_extract_title(webpage) or self._og_search_title(webpage), + 'description': self._html_search_regex( + (r'<span itemprop="description" content=["\']([^"\']+)["\']>', 
*self._og_regexes('description')), + webpage, 'description', fatal=False), + 'thumbnail': f'https://open.http.mp.streamamg.com/p/{self._PARTNER_ID}/sp/{self._PARTNER_ID}00/thumbnail/entry_id/{entryid}/version/100001/height/1920', + 'timestamp': unified_timestamp(self._html_search_regex( + r'<span itemprop="description" content=["\']([^"\']+)["\']>', webpage, 'upload time')), + 'series': self._html_search_regex(r'data-series\s*?=\s*?"(.*?)"', webpage, 'series', fatal=False) or None, + 'season': self._html_search_regex(r'data-tournament-city\s*?=\s*?"(.*?)"', webpage, 'season', fatal=False) or None, + 'episode': self._html_search_regex(r'data-round\s*?=\s*?"(.*?)"', webpage, 'round', fatal=False) or None, 'formats': formats, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'series': series, - 'season': venue, - 'episode': round_str, + 'subtitles': subtitles, } diff --git a/yt_dlp/extractor/testurl.py b/yt_dlp/extractor/testurl.py index 32cae429e..d205fe053 100644 --- a/yt_dlp/extractor/testurl.py +++ b/yt_dlp/extractor/testurl.py @@ -11,7 +11,7 @@ class TestURLIE(InfoExtractor): _VALID_URL = r'test(?:url)?:(?P<extractor>.+?)(?:_(?P<num>[0-9]+))?$' def _real_extract(self, url): - from ..extractor import gen_extractor_classes + from . import gen_extractor_classes extractor_id, num = self._match_valid_url(url).group('extractor', 'num') diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index 4ba993582..680358d5e 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -1,28 +1,27 @@ import itertools +import json import random +import re import string import time -import json from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_unquote, - compat_urllib_parse_urlparse -) +from ..compat import compat_urllib_parse_unquote, compat_urllib_parse_urlparse from ..utils import ( ExtractorError, HEADRequest, + LazyList, UnsupportedError, + get_element_by_id, get_first, int_or_none, join_nonempty, - LazyList, + qualities, srt_subtitles_timecode, str_or_none, traverse_obj, try_get, url_or_none, - qualities, ) @@ -35,6 +34,21 @@ class TikTokBaseIE(InfoExtractor): _UPLOADER_URL_FORMAT = 'https://www.tiktok.com/@%s' _WEBPAGE_HOST = 'https://www.tiktok.com/' QUALITIES = ('360p', '540p', '720p', '1080p') + _session_initialized = False + + @staticmethod + def _create_url(user_id, video_id): + return f'https://www.tiktok.com/@{user_id or "_"}/video/{video_id}' + + def _get_sigi_state(self, webpage, display_id): + return self._parse_json(get_element_by_id( + 'SIGI_STATE|sigi-persisted-data', webpage, escape_value=False), display_id) + + def _real_initialize(self): + if self._session_initialized: + return + self._request_webpage(HEADRequest('https://www.tiktok.com'), None, note='Setting up session', fatal=False) + TikTokBaseIE._session_initialized = True def _call_api_impl(self, ep, query, manifest_app_version, video_id, fatal=True, note='Downloading API JSON', errnote='Unable to download API page'): @@ -261,6 +275,9 @@ class TikTokBaseIE(InfoExtractor): return { 'id': aweme_id, + 'extractor_key': TikTokIE.ie_key(), + 'extractor': TikTokIE.IE_NAME, + 'webpage_url': self._create_url(author_info.get('uid'), aweme_id), 'title': aweme_detail.get('desc'), 'description': aweme_detail.get('desc'), 'view_count': int_or_none(stats_info.get('play_count')), @@ -361,7 +378,7 @@ class TikTokBaseIE(InfoExtractor): class TikTokIE(TikTokBaseIE): - _VALID_URL = r'https?://www\.tiktok\.com/@[\w\.-]+/video/(?P<id>\d+)' + _VALID_URL = 
r'https?://www\.tiktok\.com/(?:embed|@(?P<user_id>[\w\.-]+)/video)/(?P<id>\d+)' _TESTS = [{ 'url': 'https://www.tiktok.com/@leenabhushan/video/6748451240264420610', @@ -459,14 +476,14 @@ class TikTokIE(TikTokBaseIE): 'repost_count': int, 'comment_count': int, }, - 'expected_warnings': ['Video not available'] + 'expected_warnings': ['trying with webpage', 'Unable to find video in feed'] }, { # Video without title and description 'url': 'https://www.tiktok.com/@pokemonlife22/video/7059698374567611694', 'info_dict': { 'id': '7059698374567611694', 'ext': 'mp4', - 'title': 'tiktok video #7059698374567611694', + 'title': 'TikTok video #7059698374567611694', 'description': '', 'uploader': 'pokemonlife22', 'creator': 'Pokemon', @@ -483,13 +500,40 @@ class TikTokIE(TikTokBaseIE): 'repost_count': int, 'comment_count': int, }, - 'expected_warnings': ['Video not available', 'Creating a generic title'] + }, { + # hydration JSON is sent in a <script> element + 'url': 'https://www.tiktok.com/@denidil6/video/7065799023130643713', + 'info_dict': { + 'id': '7065799023130643713', + 'ext': 'mp4', + 'title': '#denidil#денидил', + 'description': '#denidil#денидил', + 'uploader': 'denidil6', + 'uploader_id': '7046664115636405250', + 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAsvMSzFdQ4ikl3uR2TEJwMBbB2yZh2Zxwhx-WCo3rbDpAharE3GQCrFuJArI3C8QJ', + 'artist': 'Holocron Music', + 'album': 'Wolf Sounds (1 Hour) Enjoy the Company of the Animal That Is the Majestic King of the Night', + 'track': 'Wolf Sounds (1 Hour) Enjoy the Company of the Animal That Is the Majestic King of the Night', + 'timestamp': 1645134536, + 'duration': 26, + 'upload_date': '20220217', + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + }, + 'expected_warnings': ['trying feed workaround', 'Unable to find video in feed'] }, { # Auto-captions available 'url': 'https://www.tiktok.com/@hankgreen1/video/7047596209028074758', 'only_matching': True }] + @classmethod + def _extract_urls(cls, webpage): + return [mobj.group('url') for mobj in re.finditer( + rf'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>{cls._VALID_URL})', webpage)] + def _extract_aweme_app(self, aweme_id): try: aweme_detail = self._call_api('aweme/detail', {'aweme_id': aweme_id}, aweme_id, @@ -497,7 +541,7 @@ class TikTokIE(TikTokBaseIE): if not aweme_detail: raise ExtractorError('Video not available', video_id=aweme_id) except ExtractorError as e: - self.report_warning(f'{e}; Retrying with feed workaround') + self.report_warning(f'{e.orig_msg}; trying feed workaround') feed_list = self._call_api('feed', {'aweme_id': aweme_id}, aweme_id, note='Downloading video feed', errnote='Unable to download video feed').get('aweme_list') or [] aweme_detail = next((aweme for aweme in feed_list if str(aweme.get('aweme_id')) == aweme_id), None) @@ -506,26 +550,20 @@ class TikTokIE(TikTokBaseIE): return self._parse_aweme_video_app(aweme_detail) def _real_extract(self, url): - video_id = self._match_id(url) - + video_id, user_id = self._match_valid_url(url).group('id', 'user_id') try: return self._extract_aweme_app(video_id) except ExtractorError as e: - self.report_warning(f'{e}; Retrying with webpage') + self.report_warning(f'{e}; trying with webpage') - # If we only call once, we get a 403 when downlaoding the video. 
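# --- aside: when the API call fails, the fallback below scrapes the
# hydration JSON that TikTok embeds in the page (the SIGI_STATE script
# element read by _get_sigi_state further up). A minimal stdlib-only
# sketch of that idea; the sample HTML and key names are invented for
# the example and only mirror the general shape:
import json
import re

html = ('<script id="SIGI_STATE" type="application/json">'
        '{"ItemModule": {"123": {"desc": "some title"}}}</script>')

def get_hydration_state(webpage, element_id='SIGI_STATE'):
    m = re.search(
        rf'<script[^>]+\bid=["\']{element_id}["\'][^>]*>(.+?)</script>',
        webpage, re.DOTALL)
    return json.loads(m.group(1)) if m else None

state = get_hydration_state(html)
print(state['ItemModule']['123']['desc'])  # -> some title
# ---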
- self._download_webpage(url, video_id) - webpage = self._download_webpage(url, video_id, note='Downloading video webpage') + url = self._create_url(user_id, video_id) + webpage = self._download_webpage(url, video_id, headers={'User-Agent': 'User-Agent:Mozilla/5.0'}) next_data = self._search_nextjs_data(webpage, video_id, default='{}') - if next_data: status = traverse_obj(next_data, ('props', 'pageProps', 'statusCode'), expected_type=int) or 0 video_data = traverse_obj(next_data, ('props', 'pageProps', 'itemInfo', 'itemStruct'), expected_type=dict) else: - sigi_json = self._search_regex( - r'>\s*window\[[\'"]SIGI_STATE[\'"]\]\s*=\s*(?P<sigi_state>{.+});', - webpage, 'sigi data', group='sigi_state') - sigi_data = self._parse_json(sigi_json, video_id) + sigi_data = self._get_sigi_state(webpage, video_id) status = traverse_obj(sigi_data, ('VideoPage', 'statusCode'), expected_type=int) or 0 video_data = traverse_obj(sigi_data, ('ItemModule', video_id), expected_type=dict) @@ -841,7 +879,7 @@ class DouyinIE(TikTokIE): try: return self._extract_aweme_app(video_id) except ExtractorError as e: - self.report_warning(f'{e}; Retrying with webpage') + self.report_warning(f'{e}; trying with webpage') webpage = self._download_webpage(url, video_id) render_data_json = self._search_regex( diff --git a/yt_dlp/extractor/trovo.py b/yt_dlp/extractor/trovo.py index c049025a3..d43411928 100644 --- a/yt_dlp/extractor/trovo.py +++ b/yt_dlp/extractor/trovo.py @@ -38,7 +38,7 @@ class TrovoBaseIE(InfoExtractor): return { 'uploader': streamer_info.get('nickName'), 'uploader_id': str_or_none(streamer_info.get('uid')), - 'uploader_url': format_field(username, template='https://trovo.live/%s'), + 'uploader_url': format_field(username, None, 'https://trovo.live/%s'), } diff --git a/yt_dlp/extractor/tver.py b/yt_dlp/extractor/tver.py index b04575bd5..cebd027c8 100644 --- a/yt_dlp/extractor/tver.py +++ b/yt_dlp/extractor/tver.py @@ -54,9 +54,24 @@ class TVerIE(InfoExtractor): video_id = self._match_id(self._search_regex( (r'canonical"\s*href="(https?://tver\.jp/[^"]+)"', r'&link=(https?://tver\.jp/[^?&]+)[?&]'), webpage, 'url regex')) + + episode_info = self._download_json( + f'https://platform-api.tver.jp/service/api/v1/callEpisode/{video_id}?require_data=mylist,later[epefy106ur],good[epefy106ur],resume[epefy106ur]', + video_id, fatal=False, + query={ + 'platform_uid': self._PLATFORM_UID, + 'platform_token': self._PLATFORM_TOKEN, + }, headers={ + 'x-tver-platform-type': 'web' + }) + episode_content = traverse_obj( + episode_info, ('result', 'episode', 'content')) or {} + video_info = self._download_json( f'https://statics.tver.jp/content/episode/{video_id}.json', video_id, - query={'v': '5'}, headers={ + query={ + 'v': str_or_none(episode_content.get('version')) or '5', + }, headers={ 'Origin': 'https://tver.jp', 'Referer': 'https://tver.jp/', }) @@ -67,25 +82,13 @@ class TVerIE(InfoExtractor): if not r_id.isdigit(): r_id = f'ref:{r_id}' - additional_info = self._download_json( - f'https://platform-api.tver.jp/service/api/v1/callEpisode/{video_id}?require_data=mylist,later[epefy106ur],good[epefy106ur],resume[epefy106ur]', - video_id, fatal=False, - query={ - 'platform_uid': self._PLATFORM_UID, - 'platform_token': self._PLATFORM_TOKEN, - }, headers={ - 'x-tver-platform-type': 'web' - }) - - additional_content_info = traverse_obj( - additional_info, ('result', 'episode', 'content'), get_all=False) or {} - episode = strip_or_none(additional_content_info.get('title')) - series = 
str_or_none(additional_content_info.get('seriesTitle')) + episode = strip_or_none(episode_content.get('title')) + series = str_or_none(episode_content.get('seriesTitle')) title = ( join_nonempty(series, episode, delim=' ') or str_or_none(video_info.get('title'))) - provider = str_or_none(additional_content_info.get('productionProviderName')) - onair_label = str_or_none(additional_content_info.get('broadcastDateLabel')) + provider = str_or_none(episode_content.get('productionProviderName')) + onair_label = str_or_none(episode_content.get('broadcastDateLabel')) return { '_type': 'url_transparent', diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py index af6750333..d516aafa2 100644 --- a/yt_dlp/extractor/twitter.py +++ b/yt_dlp/extractor/twitter.py @@ -470,7 +470,7 @@ class TwitterIE(TwitterBaseIE): 'uploader': uploader, 'timestamp': unified_timestamp(status.get('created_at')), 'uploader_id': uploader_id, - 'uploader_url': format_field(uploader_id, template='https://twitter.com/%s'), + 'uploader_url': format_field(uploader_id, None, 'https://twitter.com/%s'), 'like_count': int_or_none(status.get('favorite_count')), 'repost_count': int_or_none(status.get('retweet_count')), 'comment_count': int_or_none(status.get('reply_count')), diff --git a/yt_dlp/extractor/udemy.py b/yt_dlp/extractor/udemy.py index d35cd0d43..1dc2dbdc4 100644 --- a/yt_dlp/extractor/udemy.py +++ b/yt_dlp/extractor/udemy.py @@ -1,16 +1,12 @@ import re +import urllib.request from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_str, - compat_urllib_request, - compat_urlparse, -) +from ..compat import compat_HTTPError, compat_str, compat_urlparse from ..utils import ( + ExtractorError, determine_ext, extract_attributes, - ExtractorError, float_or_none, int_or_none, js_to_json, @@ -148,14 +144,14 @@ class UdemyIE(InfoExtractor): 'X-Udemy-Snail-Case': 'true', 'X-Requested-With': 'XMLHttpRequest', } - for cookie in self._downloader.cookiejar: + for cookie in self.cookiejar: if cookie.name == 'client_id': headers['X-Udemy-Client-Id'] = cookie.value elif cookie.name == 'access_token': headers['X-Udemy-Bearer-Token'] = cookie.value headers['X-Udemy-Authorization'] = 'Bearer %s' % cookie.value - if isinstance(url_or_request, compat_urllib_request.Request): + if isinstance(url_or_request, urllib.request.Request): for header, value in headers.items(): url_or_request.add_header(header, value) else: diff --git a/yt_dlp/extractor/urort.py b/yt_dlp/extractor/urort.py index 296799d38..3f687f737 100644 --- a/yt_dlp/extractor/urort.py +++ b/yt_dlp/extractor/urort.py @@ -1,10 +1,7 @@ +import urllib.parse + from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse, -) -from ..utils import ( - unified_strdate, -) +from ..utils import unified_strdate class UrortIE(InfoExtractor): @@ -31,7 +28,7 @@ class UrortIE(InfoExtractor): def _real_extract(self, url): playlist_id = self._match_id(url) - fstr = compat_urllib_parse.quote("InternalBandUrl eq '%s'" % playlist_id) + fstr = urllib.parse.quote("InternalBandUrl eq '%s'" % playlist_id) json_url = 'http://urort.p3.no/breeze/urort/TrackDTOViews?$filter=%s&$orderby=Released%%20desc&$expand=Tags%%2CFiles' % fstr songs = self._download_json(json_url, playlist_id) entries = [] diff --git a/yt_dlp/extractor/vevo.py b/yt_dlp/extractor/vevo.py index bc0187511..825089f47 100644 --- a/yt_dlp/extractor/vevo.py +++ b/yt_dlp/extractor/vevo.py @@ -33,10 +33,124 @@ class VevoIE(VevoBaseIE): https?://cache\.vevo\.com/m/html/embed\.html\?video=| 
https?://videoplayer\.vevo\.com/embed/embedded\?videoId=| https?://embed\.vevo\.com/.*?[?&]isrc=| + https?://tv\.vevo\.com/watch/artist/(?:[^/]+)/| vevo:) (?P<id>[^&?#]+)''' - _TESTS = [] + _TESTS = [{ + 'url': 'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280', + 'md5': '95ee28ee45e70130e3ab02b0f579ae23', + 'info_dict': { + 'id': 'GB1101300280', + 'ext': 'mp4', + 'title': 'Hurts - Somebody to Die For', + 'timestamp': 1372057200, + 'upload_date': '20130624', + 'uploader': 'Hurts', + 'track': 'Somebody to Die For', + 'artist': 'Hurts', + 'genre': 'Pop', + }, + 'expected_warnings': ['Unable to download SMIL file', 'Unable to download info'], + }, { + 'note': 'v3 SMIL format', + 'url': 'http://www.vevo.com/watch/cassadee-pope/i-wish-i-could-break-your-heart/USUV71302923', + 'md5': 'f6ab09b034f8c22969020b042e5ac7fc', + 'info_dict': { + 'id': 'USUV71302923', + 'ext': 'mp4', + 'title': 'Cassadee Pope - I Wish I Could Break Your Heart', + 'timestamp': 1392796919, + 'upload_date': '20140219', + 'uploader': 'Cassadee Pope', + 'track': 'I Wish I Could Break Your Heart', + 'artist': 'Cassadee Pope', + 'genre': 'Country', + }, + 'expected_warnings': ['Unable to download SMIL file', 'Unable to download info'], + }, { + 'note': 'Age-limited video', + 'url': 'https://www.vevo.com/watch/justin-timberlake/tunnel-vision-explicit/USRV81300282', + 'info_dict': { + 'id': 'USRV81300282', + 'ext': 'mp4', + 'title': 'Justin Timberlake - Tunnel Vision (Explicit)', + 'age_limit': 18, + 'timestamp': 1372888800, + 'upload_date': '20130703', + 'uploader': 'Justin Timberlake', + 'track': 'Tunnel Vision (Explicit)', + 'artist': 'Justin Timberlake', + 'genre': 'Pop', + }, + 'expected_warnings': ['Unable to download SMIL file', 'Unable to download info'], + }, { + 'note': 'No video_info', + 'url': 'http://www.vevo.com/watch/k-camp-1/Till-I-Die/USUV71503000', + 'md5': '8b83cc492d72fc9cf74a02acee7dc1b0', + 'info_dict': { + 'id': 'USUV71503000', + 'ext': 'mp4', + 'title': 'K Camp ft. T.I. - Till I Die', + 'age_limit': 18, + 'timestamp': 1449468000, + 'upload_date': '20151207', + 'uploader': 'K Camp', + 'track': 'Till I Die', + 'artist': 'K Camp', + 'genre': 'Hip-Hop', + }, + 'expected_warnings': ['Unable to download SMIL file', 'Unable to download info'], + }, { + 'note': 'Featured test', + 'url': 'https://www.vevo.com/watch/lemaitre/Wait/USUV71402190', + 'md5': 'd28675e5e8805035d949dc5cf161071d', + 'info_dict': { + 'id': 'USUV71402190', + 'ext': 'mp4', + 'title': 'Lemaitre ft. 
LoLo - Wait', + 'age_limit': 0, + 'timestamp': 1413432000, + 'upload_date': '20141016', + 'uploader': 'Lemaitre', + 'track': 'Wait', + 'artist': 'Lemaitre', + 'genre': 'Electronic', + }, + 'expected_warnings': ['Unable to download SMIL file', 'Unable to download info'], + }, { + 'note': 'Only available via webpage', + 'url': 'http://www.vevo.com/watch/GBUV71600656', + 'md5': '67e79210613865b66a47c33baa5e37fe', + 'info_dict': { + 'id': 'GBUV71600656', + 'ext': 'mp4', + 'title': 'ABC - Viva Love', + 'age_limit': 0, + 'timestamp': 1461830400, + 'upload_date': '20160428', + 'uploader': 'ABC', + 'track': 'Viva Love', + 'artist': 'ABC', + 'genre': 'Pop', + }, + 'expected_warnings': ['Failed to download video versions info'], + }, { + # no genres available + 'url': 'http://www.vevo.com/watch/INS171400764', + 'only_matching': True, + }, { + # Another case available only via the webpage; using streams/streamsV3 formats + # Geo-restricted to Netherlands/Germany + 'url': 'http://www.vevo.com/watch/boostee/pop-corn-clip-officiel/FR1A91600909', + 'only_matching': True, + }, { + 'url': 'https://embed.vevo.com/?isrc=USH5V1923499&partnerId=4d61b777-8023-4191-9ede-497ed6c24647&partnerAdCode=', + 'only_matching': True, + }, { + 'url': 'https://tv.vevo.com/watch/artist/janet-jackson/US0450100550', + 'only_matching': True, + }] _VERSIONS = { 0: 'youtube', # only in AuthenticateVideo videoVersions 1: 'level3', @@ -138,6 +252,7 @@ class VevoIE(VevoBaseIE): fatal=False)) else: m = re.search(r'''(?xi) + _(?P<quality>[a-z0-9]+) _(?P<width>[0-9]+)x(?P<height>[0-9]+) _(?P<vcodec>[a-z0-9]+) _(?P<vbr>[0-9]+) @@ -149,7 +264,7 @@ class VevoIE(VevoBaseIE): formats.append({ 'url': version_url, - 'format_id': 'http-%s-%s' % (version, video_version['quality']), + 'format_id': f'http-{version}-{video_version.get("quality") or m.group("quality")}', 'vcodec': m.group('vcodec'), 'acodec': m.group('acodec'), 'vbr': int(m.group('vbr')), diff --git a/yt_dlp/extractor/videa.py b/yt_dlp/extractor/videa.py index 251eb78fe..9b05c86a5 100644 --- a/yt_dlp/extractor/videa.py +++ b/yt_dlp/extractor/videa.py @@ -1,8 +1,10 @@ import random import re import string +import struct from .common import InfoExtractor +from ..compat import compat_b64decode, compat_ord from ..utils import ( ExtractorError, int_or_none, @@ -14,11 +16,6 @@ from ..utils import ( xpath_element, xpath_text, ) -from ..compat import ( - compat_b64decode, - compat_ord, - compat_struct_pack, -) class VideaIE(InfoExtractor): @@ -102,7 +99,7 @@ class VideaIE(InfoExtractor): j = (j + S[i]) % 256 S[i], S[j] = S[j], S[i] k = S[(S[i] + S[j]) % 256] - res += compat_struct_pack('B', k ^ compat_ord(cipher_text[m])) + res += struct.pack('B', k ^ compat_ord(cipher_text[m])) return res.decode() diff --git a/yt_dlp/extractor/videocampus_sachsen.py b/yt_dlp/extractor/videocampus_sachsen.py index 906412f08..679574bd7 100644 --- a/yt_dlp/extractor/videocampus_sachsen.py +++ b/yt_dlp/extractor/videocampus_sachsen.py @@ -6,14 +6,18 @@ from ..utils import ExtractorError class VideocampusSachsenIE(InfoExtractor): - IE_NAME = 'Vimp' + IE_NAME = 'ViMP' _INSTANCES = ( + 'bergauf.tv', 'campus.demo.vimp.com', 'corporate.demo.vimp.com', 'dancehalldatabase.com', + 'drehzahl.tv', 'educhannel.hs-gesundheit.de', 'emedia.ls.haw-hamburg.de', 'globale-evolution.net', + 'hohu.tv', + 'htvideos.hightechhigh.org', 'k210039.vimp.mivitec.net', 'media.cmslegal.com', 'media.hs-furtwangen.de', @@ -25,6 +29,7 @@ class VideocampusSachsenIE(InfoExtractor): 'mportal.europa-uni.de', 'pacific.demo.vimp.com', 'slctv.com', 
+ 'streaming.prairiesouth.ca', 'tube.isbonline.cn', 'univideo.uni-kassel.de', 'ursula2.genetics.emory.edu', @@ -52,11 +57,15 @@ class VideocampusSachsenIE(InfoExtractor): 'vimp.weka-fachmedien.de', 'webtv.univ-montp3.fr', 'www.b-tu.de/media', + 'www.bergauf.tv', 'www.bigcitytv.de', 'www.cad-videos.de', + 'www.drehzahl.tv', 'www.fh-bielefeld.de/medienportal', + 'www.hohu.tv', 'www.orvovideo.com', 'www.rwe.tv', + 'www.salzi.tv', 'www.wenglor-media.com', 'www2.univ-sba.dz', ) @@ -73,6 +82,7 @@ class VideocampusSachsenIE(InfoExtractor): 'id': 'e6b9349905c1628631f175712250f2a1', 'title': 'Konstruktiver Entwicklungsprozess Vorlesung 7', 'description': 'Konstruktiver Entwicklungsprozess Vorlesung 7', + 'thumbnail': 'https://videocampus.sachsen.de/cache/1a985379ad3aecba8097a6902c7daa4e.jpg', 'ext': 'mp4', }, }, @@ -82,6 +92,7 @@ class VideocampusSachsenIE(InfoExtractor): 'id': 'fc99c527e4205b121cb7c74433469262', 'title': 'Was ist selbstgesteuertes Lernen?', 'description': 'md5:196aa3b0509a526db62f84679522a2f5', + 'thumbnail': 'https://videocampus.sachsen.de/cache/6f4a85096ba24cb398e6ce54446b57ae.jpg', 'display_id': 'Was-ist-selbstgesteuertes-Lernen', 'ext': 'mp4', }, @@ -92,6 +103,7 @@ class VideocampusSachsenIE(InfoExtractor): 'id': '09d4ed029002eb1bdda610f1103dd54c', 'title': 'Tutorial zur Nutzung von Adobe Connect aus Veranstalter-Sicht', 'description': 'md5:3d379ca3cc17b9da6784d7f58cca4d58', + 'thumbnail': 'https://videocampus.sachsen.de/cache/2452498fe8c2d5a7dc79a05d30f407b6.jpg', 'display_id': 'Tutorial-zur-Nutzung-von-Adobe-Connect-aus-Veranstalter-Sicht', 'ext': 'mp4', }, @@ -103,6 +115,7 @@ class VideocampusSachsenIE(InfoExtractor): 'id': '0183356e41af7bfb83d7667b20d9b6a3', 'title': 'Présentation de la Faculté de droit et des sciences politiques - Journée portes ouvertes 2021/22', 'description': 'md5:508958bd93e0ca002ac731d94182a54f', + 'thumbnail': 'https://www2.univ-sba.dz/cache/4d5d4a0b4189271a8cc6cb5328e14769.jpg', 'display_id': 'Presentation-de-la-Faculte-de-droit-et-des-sciences-politiques-Journee-portes-ouvertes-202122', 'ext': 'mp4', } @@ -113,6 +126,7 @@ class VideocampusSachsenIE(InfoExtractor): 'id': 'c8816f1cc942c12b6cce57c835cffd7c', 'title': 'Preisverleihung »Produkte des Jahres 2022«', 'description': 'md5:60c347568ca89aa25b772c4ea564ebd3', + 'thumbnail': 'https://vimp.weka-fachmedien.de/cache/da9f3090e9227b25beacf67ccf94de14.png', 'display_id': 'Preisverleihung-Produkte-des-Jahres-2022', 'ext': 'mp4', }, @@ -124,7 +138,7 @@ class VideocampusSachsenIE(InfoExtractor): 'title': 'Was ist selbstgesteuertes Lernen?', 'ext': 'mp4', }, - } + }, ] def _real_extract(self, url): @@ -139,12 +153,14 @@ class VideocampusSachsenIE(InfoExtractor): if not (display_id or tmp_id): # Title, description from embedded page's meta wouldn't be correct - title = self._html_search_regex(r'<img[^>]* title="([^"<]+)"', webpage, 'title', fatal=False) + title = self._html_search_regex(r'<video-js[^>]* data-piwik-title="([^"<]+)"', webpage, 'title', fatal=False) description = None + thumbnail = None else: title = self._html_search_meta(('og:title', 'twitter:title', 'title'), webpage, fatal=False) description = self._html_search_meta( - ('og:description', 'twitter:description', 'description'), webpage, default=None) + ('og:description', 'twitter:description', 'description'), webpage, fatal=False) + thumbnail = self._html_search_meta(('og:image', 'twitter:image'), webpage, fatal=False) formats, subtitles = [], {} try: @@ -162,7 +178,8 @@ class VideocampusSachsenIE(InfoExtractor): 'id': video_id, 'title': 
title, 'description': description, + 'thumbnail': thumbnail, 'display_id': display_id, 'formats': formats, - 'subtitles': subtitles + 'subtitles': subtitles, } diff --git a/yt_dlp/extractor/vidio.py b/yt_dlp/extractor/vidio.py index 599996bf9..8092d340e 100644 --- a/yt_dlp/extractor/vidio.py +++ b/yt_dlp/extractor/vidio.py @@ -152,7 +152,7 @@ class VidioIE(VidioBaseIE): 'uploader': user.get('name'), 'timestamp': parse_iso8601(video.get('created_at')), 'uploader_id': username, - 'uploader_url': format_field(username, template='https://www.vidio.com/@%s'), + 'uploader_url': format_field(username, None, 'https://www.vidio.com/@%s'), 'channel': channel.get('name'), 'channel_id': str_or_none(channel.get('id')), 'view_count': get_count('view_count'), @@ -283,5 +283,5 @@ class VidioLiveIE(VidioBaseIE): 'uploader': user.get('name'), 'timestamp': parse_iso8601(stream_meta.get('start_time')), 'uploader_id': username, - 'uploader_url': format_field(username, template='https://www.vidio.com/@%s'), + 'uploader_url': format_field(username, None, 'https://www.vidio.com/@%s'), } diff --git a/yt_dlp/extractor/vidlii.py b/yt_dlp/extractor/vidlii.py index b9845affd..69a75304e 100644 --- a/yt_dlp/extractor/vidlii.py +++ b/yt_dlp/extractor/vidlii.py @@ -100,7 +100,7 @@ class VidLiiIE(InfoExtractor): uploader = self._search_regex( r'<div[^>]+class=["\']wt_person[^>]+>\s*<a[^>]+\bhref=["\']/user/[^>]+>([^<]+)', webpage, 'uploader', fatal=False) - uploader_url = format_field(uploader, template='https://www.vidlii.com/user/%s') + uploader_url = format_field(uploader, None, 'https://www.vidlii.com/user/%s') upload_date = unified_strdate(self._html_search_meta( 'datePublished', webpage, default=None) or self._search_regex( diff --git a/yt_dlp/extractor/vimeo.py b/yt_dlp/extractor/vimeo.py index 59c5353ab..961734345 100644 --- a/yt_dlp/extractor/vimeo.py +++ b/yt_dlp/extractor/vimeo.py @@ -40,6 +40,18 @@ class VimeoBaseInfoExtractor(InfoExtractor): _LOGIN_REQUIRED = False _LOGIN_URL = 'https://vimeo.com/log_in' + @staticmethod + def _smuggle_referrer(url, referrer_url): + return smuggle_url(url, {'http_headers': {'Referer': referrer_url}}) + + def _unsmuggle_headers(self, url): + """@returns (url, smuggled_data, headers)""" + url, data = unsmuggle_url(url, {}) + headers = self.get_param('http_headers').copy() + if 'http_headers' in data: + headers.update(data['http_headers']) + return url, data, headers + def _perform_login(self, username, password): webpage = self._download_webpage( self._LOGIN_URL, None, 'Downloading login page') @@ -718,10 +730,6 @@ class VimeoIE(VimeoBaseInfoExtractor): ] @staticmethod - def _smuggle_referrer(url, referrer_url): - return smuggle_url(url, {'http_headers': {'Referer': referrer_url}}) - - @staticmethod def _extract_urls(url, webpage): urls = [] # Look for embedded (iframe) Vimeo player @@ -754,8 +762,8 @@ class VimeoIE(VimeoBaseInfoExtractor): 'Content-Type': 'application/x-www-form-urlencoded', }) checked = self._download_json( - url + '/check-password', video_id, - 'Verifying the password', data=data, headers=headers) + f'{compat_urlparse.urlsplit(url)._replace(query=None).geturl()}/check-password', + video_id, 'Verifying the password', data=data, headers=headers) if checked is False: raise ExtractorError('Wrong video password', expected=True) return checked @@ -830,10 +838,7 @@ class VimeoIE(VimeoBaseInfoExtractor): raise def _real_extract(self, url): - url, data = unsmuggle_url(url, {}) - headers = self.get_param('http_headers').copy() - if 'http_headers' in data: - 
headers.update(data['http_headers']) + url, data, headers = self._unsmuggle_headers(url) if 'Referer' not in headers: headers['Referer'] = url @@ -1383,14 +1388,15 @@ class VHXEmbedIE(VimeoBaseInfoExtractor): _VALID_URL = r'https?://embed\.vhx\.tv/videos/(?P<id>\d+)' @staticmethod - def _extract_url(webpage): + def _extract_url(url, webpage): mobj = re.search( r'<iframe[^>]+src="(https?://embed\.vhx\.tv/videos/\d+[^"]*)"', webpage) - return unescapeHTML(mobj.group(1)) if mobj else None + return VimeoIE._smuggle_referrer(unescapeHTML(mobj.group(1)), url) if mobj else None def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + url, _, headers = self._unsmuggle_headers(url) + webpage = self._download_webpage(url, video_id, headers=headers) config_url = self._parse_json(self._search_regex( r'window\.OTTData\s*=\s*({.+})', webpage, 'ott data'), video_id, js_to_json)['config_url'] diff --git a/yt_dlp/extractor/vine.py b/yt_dlp/extractor/vine.py index bbf43a83f..947f5cdb6 100644 --- a/yt_dlp/extractor/vine.py +++ b/yt_dlp/extractor/vine.py @@ -89,7 +89,7 @@ class VineIE(InfoExtractor): username = data.get('username') - alt_title = format_field(username, template='Vine by %s') + alt_title = format_field(username, None, 'Vine by %s') return { 'id': video_id, diff --git a/yt_dlp/extractor/voicy.py b/yt_dlp/extractor/voicy.py index e4570a03a..feab79138 100644 --- a/yt_dlp/extractor/voicy.py +++ b/yt_dlp/extractor/voicy.py @@ -1,3 +1,5 @@ +import itertools + from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -9,8 +11,6 @@ from ..utils import ( unsmuggle_url, ) -import itertools - class VoicyBaseIE(InfoExtractor): def _extract_from_playlist_data(self, value): @@ -105,7 +105,7 @@ class VoicyChannelIE(VoicyBaseIE): @classmethod def suitable(cls, url): - return not VoicyIE.suitable(url) and super(VoicyChannelIE, cls).suitable(url) + return not VoicyIE.suitable(url) and super().suitable(url) def _entries(self, channel_id): pager = '' diff --git a/yt_dlp/extractor/vrv.py b/yt_dlp/extractor/vrv.py index 35662753e..0b9bf2903 100644 --- a/yt_dlp/extractor/vrv.py +++ b/yt_dlp/extractor/vrv.py @@ -1,17 +1,14 @@ import base64 -import json import hashlib import hmac +import json import random import string import time +import urllib.parse from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_urllib_parse_urlencode, - compat_urllib_parse, -) +from ..compat import compat_HTTPError, compat_urllib_parse_urlencode from ..utils import ( ExtractorError, float_or_none, @@ -46,12 +43,12 @@ class VRVBaseIE(InfoExtractor): headers['Content-Type'] = 'application/json' base_string = '&'.join([ 'POST' if data else 'GET', - compat_urllib_parse.quote(base_url, ''), - compat_urllib_parse.quote(encoded_query, '')]) + urllib.parse.quote(base_url, ''), + urllib.parse.quote(encoded_query, '')]) oauth_signature = base64.b64encode(hmac.new( (self._API_PARAMS['oAuthSecret'] + '&' + self._TOKEN_SECRET).encode('ascii'), base_string.encode(), hashlib.sha1).digest()).decode() - encoded_query += '&oauth_signature=' + compat_urllib_parse.quote(oauth_signature, '') + encoded_query += '&oauth_signature=' + urllib.parse.quote(oauth_signature, '') try: return self._download_json( '?'.join([base_url, encoded_query]), video_id, diff --git a/yt_dlp/extractor/vshare.py b/yt_dlp/extractor/vshare.py index 8ef75d30e..fd5226bbc 100644 --- a/yt_dlp/extractor/vshare.py +++ b/yt_dlp/extractor/vshare.py @@ -1,11 +1,7 @@ import re 
from .common import InfoExtractor -from ..compat import compat_chr -from ..utils import ( - decode_packed_codes, - ExtractorError, -) +from ..utils import ExtractorError, decode_packed_codes class VShareIE(InfoExtractor): @@ -37,7 +33,7 @@ class VShareIE(InfoExtractor): digits = [int(digit) for digit in digits.split(',')] key_digit = self._search_regex( r'fromCharCode\(.+?(\d+)\)}', unpacked, 'key digit') - chars = [compat_chr(d - int(key_digit)) for d in digits] + chars = [chr(d - int(key_digit)) for d in digits] return ''.join(chars) def _real_extract(self, url): diff --git a/yt_dlp/extractor/wppilot.py b/yt_dlp/extractor/wppilot.py index 6349e5326..e1062b9b5 100644 --- a/yt_dlp/extractor/wppilot.py +++ b/yt_dlp/extractor/wppilot.py @@ -20,7 +20,7 @@ class WPPilotBaseIE(InfoExtractor): def _get_channel_list(self, cache=True): if cache is True: - cache_res = self._downloader.cache.load('wppilot', 'channel-list') + cache_res = self.cache.load('wppilot', 'channel-list') if cache_res: return cache_res, True webpage = self._download_webpage('https://pilot.wp.pl/tv/', None, 'Downloading webpage') @@ -35,7 +35,7 @@ class WPPilotBaseIE(InfoExtractor): channel_list = try_get(qhash_content, lambda x: x['data']['allChannels']['nodes']) if channel_list is None: continue - self._downloader.cache.store('wppilot', 'channel-list', channel_list) + self.cache.store('wppilot', 'channel-list', channel_list) return channel_list, False raise ExtractorError('Unable to find the channel list') @@ -101,7 +101,7 @@ class WPPilotIE(WPPilotBaseIE): channel = self._get_channel(video_id) video_id = str(channel['id']) - is_authorized = next((c for c in self._downloader.cookiejar if c.name == 'netviapisessid'), None) + is_authorized = next((c for c in self.cookiejar if c.name == 'netviapisessid'), None) # cookies starting with "g:" are assigned to guests is_authorized = True if is_authorized is not None and not is_authorized.value.startswith('g:') else False diff --git a/yt_dlp/extractor/xfileshare.py b/yt_dlp/extractor/xfileshare.py index 28b6ecb6e..63abe4a1f 100644 --- a/yt_dlp/extractor/xfileshare.py +++ b/yt_dlp/extractor/xfileshare.py @@ -1,11 +1,10 @@ import re from .common import InfoExtractor -from ..compat import compat_chr from ..utils import ( + ExtractorError, decode_packed_codes, determine_ext, - ExtractorError, int_or_none, js_to_json, urlencode_postdata, @@ -32,11 +31,11 @@ def aa_decode(aa_code): aa_char = aa_char.replace('+ ', '') m = re.match(r'^\d+', aa_char) if m: - ret += compat_chr(int(m.group(0), 8)) + ret += chr(int(m.group(0), 8)) else: m = re.match(r'^u([\da-f]+)', aa_char) if m: - ret += compat_chr(int(m.group(1), 16)) + ret += chr(int(m.group(1), 16)) return ret diff --git a/yt_dlp/extractor/xhamster.py b/yt_dlp/extractor/xhamster.py index ff15d3707..e42eed7d8 100644 --- a/yt_dlp/extractor/xhamster.py +++ b/yt_dlp/extractor/xhamster.py @@ -21,7 +21,7 @@ from ..utils import ( class XHamsterIE(InfoExtractor): - _DOMAINS = r'(?:xhamster\.(?:com|one|desi)|xhms\.pro|xhamster\d+\.com)' + _DOMAINS = r'(?:xhamster\.(?:com|one|desi)|xhms\.pro|xhamster\d+\.com|xhday\.com)' _VALID_URL = r'''(?x) https?:// (?:.+?\.)?%s/ @@ -32,7 +32,7 @@ class XHamsterIE(InfoExtractor): ''' % _DOMAINS _TESTS = [{ 'url': 'https://xhamster.com/videos/femaleagent-shy-beauty-takes-the-bait-1509445', - 'md5': '98b4687efb1ffd331c4197854dc09e8f', + 'md5': '34e1ab926db5dc2750fed9e1f34304bb', 'info_dict': { 'id': '1509445', 'display_id': 'femaleagent-shy-beauty-takes-the-bait', @@ -41,6 +41,7 @@ class XHamsterIE(InfoExtractor): 
'timestamp': 1350194821, 'upload_date': '20121014', 'uploader': 'Ruseful2011', + 'uploader_id': 'ruseful2011', 'duration': 893, 'age_limit': 18, }, @@ -70,6 +71,7 @@ class XHamsterIE(InfoExtractor): 'timestamp': 1454948101, 'upload_date': '20160208', 'uploader': 'parejafree', + 'uploader_id': 'parejafree', 'duration': 72, 'age_limit': 18, }, @@ -115,6 +117,9 @@ class XHamsterIE(InfoExtractor): }, { 'url': 'http://de.xhamster.com/videos/skinny-girl-fucks-herself-hard-in-the-forest-xhnBJZx', 'only_matching': True, + }, { + 'url': 'https://xhday.com/videos/strapless-threesome-xhh7yVf', + 'only_matching': True, }] def _real_extract(self, url): @@ -244,7 +249,6 @@ class XHamsterIE(InfoExtractor): categories = None uploader_url = url_or_none(try_get(video, lambda x: x['author']['pageURL'])) - return { 'id': video_id, 'display_id': display_id, @@ -263,7 +267,7 @@ class XHamsterIE(InfoExtractor): 'dislike_count': int_or_none(try_get( video, lambda x: x['rating']['dislikes'], int)), 'comment_count': int_or_none(video.get('views')), - 'age_limit': age_limit, + 'age_limit': age_limit if age_limit is not None else 18, 'categories': categories, 'formats': formats, } @@ -423,6 +427,9 @@ class XHamsterUserIE(InfoExtractor): 'id': 'firatkaan', }, 'playlist_mincount': 1, + }, { + 'url': 'https://xhday.com/users/mobhunter', + 'only_matching': True, }] def _entries(self, user_id): diff --git a/yt_dlp/extractor/yahoo.py b/yt_dlp/extractor/yahoo.py index 3fe6192bf..8811df6d8 100644 --- a/yt_dlp/extractor/yahoo.py +++ b/yt_dlp/extractor/yahoo.py @@ -1,15 +1,15 @@ import hashlib import itertools import re +import urllib.parse +from .brightcove import BrightcoveNewIE from .common import InfoExtractor, SearchInfoExtractor -from ..compat import ( - compat_str, - compat_urllib_parse, -) +from .youtube import YoutubeIE +from ..compat import compat_str from ..utils import ( - clean_html, ExtractorError, + clean_html, int_or_none, mimetype2ext, parse_iso8601, @@ -18,9 +18,6 @@ from ..utils import ( url_or_none, ) -from .brightcove import BrightcoveNewIE -from .youtube import YoutubeIE - class YahooIE(InfoExtractor): IE_DESC = 'Yahoo screen and movies' @@ -333,7 +330,7 @@ class YahooSearchIE(SearchInfoExtractor): def _search_results(self, query): for pagenum in itertools.count(0): - result_url = 'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30) + result_url = 'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (urllib.parse.quote_plus(query), pagenum * 30) info = self._download_json(result_url, query, note='Downloading results page ' + str(pagenum + 1)) yield from (self.url_result(result['rurl']) for result in info['results']) @@ -434,7 +431,7 @@ class YahooGyaOIE(InfoExtractor): page = 1 while True: playlist = self._download_json( - f'https://gyao.yahoo.co.jp/api/programs/{program_id}/videos?page={page}', program_id, + f'https://gyao.yahoo.co.jp/api/programs/{program_id}/videos?page={page}&serviceId=gy', program_id, note=f'Downloading JSON metadata page {page}') if not playlist: break diff --git a/yt_dlp/extractor/ynet.py b/yt_dlp/extractor/ynet.py index 444785947..27eda9721 100644 --- a/yt_dlp/extractor/ynet.py +++ b/yt_dlp/extractor/ynet.py @@ -1,8 +1,8 @@ -import re import json +import re +import urllib.parse from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote_plus class YnetIE(InfoExtractor): @@ -31,7 +31,7 @@ class YnetIE(InfoExtractor): video_id = self._match_id(url) webpage = 
self._download_webpage(url, video_id) - content = compat_urllib_parse_unquote_plus(self._og_search_video_url(webpage)) + content = urllib.parse.unquote_plus(self._og_search_video_url(webpage)) config = json.loads(self._search_regex(r'config=({.+?})$', content, 'video config')) f4m_url = config['clip']['url'] title = self._og_search_title(webpage) diff --git a/yt_dlp/extractor/younow.py b/yt_dlp/extractor/younow.py index 76d89f3ce..18112ba35 100644 --- a/yt_dlp/extractor/younow.py +++ b/yt_dlp/extractor/younow.py @@ -91,7 +91,7 @@ def _extract_moment(item, fatal=True): uploader = try_get(item, lambda x: x['owner']['name'], compat_str) uploader_id = try_get(item, lambda x: x['owner']['userId']) - uploader_url = format_field(uploader, template='https://www.younow.com/%s') + uploader_url = format_field(uploader, None, 'https://www.younow.com/%s') entry = { 'extractor_key': 'YouNowMoment', diff --git a/yt_dlp/extractor/youporn.py b/yt_dlp/extractor/youporn.py index 5aea82295..b484e08ec 100644 --- a/yt_dlp/extractor/youporn.py +++ b/yt_dlp/extractor/youporn.py @@ -135,9 +135,10 @@ class YouPornIE(InfoExtractor): r'(?s)<div[^>]+class=["\']submitByLink["\'][^>]*>(.+?)</div>', webpage, 'uploader', fatal=False) upload_date = unified_strdate(self._html_search_regex( - [r'UPLOADED:\s*<span>([^<]+)', + (r'UPLOADED:\s*<span>([^<]+)', r'Date\s+[Aa]dded:\s*<span>([^<]+)', - r'(?s)<div[^>]+class=["\']videoInfo(?:Date|Time)["\'][^>]*>(.+?)</div>'], + r'''(?s)<div[^>]+class=["']videoInfo(?:Date|Time)\b[^>]*>(.+?)</div>''', + r'(?s)<label\b[^>]*>Uploaded[^<]*</label>\s*<span\b[^>]*>(.+?)</span>'), webpage, 'upload date', fatal=False)) age_limit = self._rta_search(webpage) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 5546aa9a3..ebc3381a2 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -2,7 +2,6 @@ import base64 import calendar import copy import datetime -import functools import hashlib import itertools import json @@ -14,18 +13,11 @@ import sys import threading import time import traceback +import urllib.error +import urllib.parse from .common import InfoExtractor, SearchInfoExtractor -from ..compat import ( - compat_chr, - compat_HTTPError, - compat_parse_qs, - compat_str, - compat_urllib_parse_unquote_plus, - compat_urllib_parse_urlencode, - compat_urllib_parse_urlparse, - compat_urlparse, -) +from ..compat import functools from ..jsinterp import JSInterpreter from ..utils import ( NO_DEFAULT, @@ -382,11 +374,11 @@ class YoutubeBaseInfoExtractor(InfoExtractor): pref = {} if pref_cookie: try: - pref = dict(compat_urlparse.parse_qsl(pref_cookie.value)) + pref = dict(urllib.parse.parse_qsl(pref_cookie.value)) except ValueError: self.report_warning('Failed to parse user PREF cookie' + bug_reports_message()) pref.update({'hl': 'en', 'tz': 'UTC'}) - self._set_cookie('.youtube.com', name='PREF', value=compat_urllib_parse_urlencode(pref)) + self._set_cookie('.youtube.com', name='PREF', value=urllib.parse.urlencode(pref)) def _real_initialize(self): self._initialize_pref() @@ -397,9 +389,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor): if self._LOGIN_REQUIRED and not self._cookies_passed: self.raise_login_required('Login details are needed to download this content', method='cookies') - _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;' - _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;' - _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)' + _YT_INITIAL_DATA_RE = 
r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=' + _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=' def _get_default_ytcfg(self, client='web'): return copy.deepcopy(INNERTUBE_CLIENTS[client]) @@ -415,15 +406,19 @@ class YoutubeBaseInfoExtractor(InfoExtractor): def _extract_client_name(self, ytcfg, default_client='web'): return self._ytcfg_get_safe( ytcfg, (lambda x: x['INNERTUBE_CLIENT_NAME'], - lambda x: x['INNERTUBE_CONTEXT']['client']['clientName']), compat_str, default_client) + lambda x: x['INNERTUBE_CONTEXT']['client']['clientName']), str, default_client) def _extract_client_version(self, ytcfg, default_client='web'): return self._ytcfg_get_safe( ytcfg, (lambda x: x['INNERTUBE_CLIENT_VERSION'], - lambda x: x['INNERTUBE_CONTEXT']['client']['clientVersion']), compat_str, default_client) + lambda x: x['INNERTUBE_CONTEXT']['client']['clientVersion']), str, default_client) + + def _select_api_hostname(self, req_api_hostname, default_client=None): + return (self._configuration_arg('innertube_host', [''], ie_key=YoutubeIE.ie_key())[0] + or req_api_hostname or self._get_innertube_host(default_client or 'web')) def _extract_api_key(self, ytcfg=None, default_client='web'): - return self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_API_KEY'], compat_str, default_client) + return self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_API_KEY'], str, default_client) def _extract_context(self, ytcfg=None, default_client='web'): context = get_first( @@ -470,18 +465,16 @@ class YoutubeBaseInfoExtractor(InfoExtractor): real_headers.update({'content-type': 'application/json'}) if headers: real_headers.update(headers) + api_key = (self._configuration_arg('innertube_key', [''], ie_key=YoutubeIE.ie_key(), casesense=True)[0] + or api_key or self._extract_api_key(default_client=default_client)) return self._download_json( - f'https://{api_hostname or self._get_innertube_host(default_client)}/youtubei/v1/{ep}', + f'https://{self._select_api_hostname(api_hostname, default_client)}/youtubei/v1/{ep}', video_id=video_id, fatal=fatal, note=note, errnote=errnote, data=json.dumps(data).encode('utf8'), headers=real_headers, - query={'key': api_key or self._extract_api_key(), 'prettyPrint': 'false'}) + query={'key': api_key, 'prettyPrint': 'false'}) def extract_yt_initial_data(self, item_id, webpage, fatal=True): - data = self._search_regex( - (fr'{self._YT_INITIAL_DATA_RE}\s*{self._YT_INITIAL_BOUNDARY_RE}', - self._YT_INITIAL_DATA_RE), webpage, 'yt initial data', fatal=fatal) - if data: - return self._parse_json(data, item_id, fatal=fatal) + return self._search_json(self._YT_INITIAL_DATA_RE, webpage, 'yt initial data', item_id, fatal=fatal) @staticmethod def _extract_session_index(*data): @@ -497,7 +490,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): # Deprecated? 
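The `_search_json`-based extraction seen above no longer needs `_YT_INITIAL_BOUNDARY_RE`: the regexes now only locate the `ytInitialData =` / `ytInitialPlayerResponse =` prefix, and the JSON object itself is recovered by balanced-brace scanning. A minimal sketch of that idea, assuming the target object contains no braces inside string literals (the real helper also handles strings and escapes):

```python
import json
import re

def search_json_sketch(start_pattern, string):
    # Locate the opening brace right after the start pattern, then walk
    # the text keeping a nesting counter until it returns to zero.
    m = re.search(start_pattern + r'\s*(?={)', string)
    if not m:
        return None
    depth = 0
    for idx in range(m.end(), len(string)):
        if string[idx] == '{':
            depth += 1
        elif string[idx] == '}':
            depth -= 1
            if not depth:
                return json.loads(string[m.end():idx + 1])
    return None

# e.g. search_json_sketch(r'ytInitialData\s*=', webpage)
```

This is why the old boundary alternatives (`var meta`, `</script>`, newline) could be dropped: the scan terminates on the object itself rather than on whatever happens to follow it.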
def _extract_identity_token(self, ytcfg=None, webpage=None): if ytcfg: - token = try_get(ytcfg, lambda x: x['ID_TOKEN'], compat_str) + token = try_get(ytcfg, lambda x: x['ID_TOKEN'], str) if token: return token if webpage: @@ -513,12 +506,12 @@ class YoutubeBaseInfoExtractor(InfoExtractor): """ for data in args: # ytcfg includes channel_syncid if on secondary channel - delegated_sid = try_get(data, lambda x: x['DELEGATED_SESSION_ID'], compat_str) + delegated_sid = try_get(data, lambda x: x['DELEGATED_SESSION_ID'], str) if delegated_sid: return delegated_sid sync_ids = (try_get( data, (lambda x: x['responseContext']['mainAppWebResponseContext']['datasyncId'], - lambda x: x['DATASYNC_ID']), compat_str) or '').split('||') + lambda x: x['DATASYNC_ID']), str) or '').split('||') if len(sync_ids) >= 2 and sync_ids[1]: # datasyncid is of the form "channel_syncid||user_syncid" for secondary channel # and just "user_syncid||" for primary channel. We only want the channel_syncid @@ -534,7 +527,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): args, [('VISITOR_DATA', ('INNERTUBE_CONTEXT', 'client', 'visitorData'), ('responseContext', 'visitorData'))], expected_type=str) - @property + @functools.cached_property def is_authenticated(self): return bool(self._generate_sapisidhash_header()) @@ -550,9 +543,9 @@ class YoutubeBaseInfoExtractor(InfoExtractor): self, *, ytcfg=None, account_syncid=None, session_index=None, visitor_data=None, identity_token=None, api_hostname=None, default_client='web'): - origin = 'https://' + (api_hostname if api_hostname else self._get_innertube_host(default_client)) + origin = 'https://' + (self._select_api_hostname(api_hostname, default_client)) headers = { - 'X-YouTube-Client-Name': compat_str( + 'X-YouTube-Client-Name': str( self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CONTEXT_CLIENT_NAME'], default_client=default_client)), 'X-YouTube-Client-Version': self._extract_client_version(ytcfg, default_client), 'Origin': origin, @@ -612,7 +605,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): def _extract_continuation_ep_data(cls, continuation_ep: dict): if isinstance(continuation_ep, dict): continuation = try_get( - continuation_ep, lambda x: x['continuationCommand']['token'], compat_str) + continuation_ep, lambda x: x['continuationCommand']['token'], str) if not continuation: return ctp = continuation_ep.get('clickTrackingParams') @@ -672,7 +665,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): def _extract_badges(self, renderer: dict): badges = set() for badge in try_get(renderer, lambda x: x['badges'], list) or []: - label = try_get(badge, lambda x: x['metadataBadgeRenderer']['label'], compat_str) + label = try_get(badge, lambda x: x['metadataBadgeRenderer']['label'], str) if label: badges.add(label.lower()) return badges @@ -687,7 +680,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): if not any(key is ... 
or isinstance(key, (list, tuple)) for key in variadic(path)):
            obj = [obj]
        for item in obj:
-            text = try_get(item, lambda x: x['simpleText'], compat_str)
+            text = try_get(item, lambda x: x['simpleText'], str)
            if text:
                return text
            runs = try_get(item, lambda x: x['runs'], list) or []
@@ -789,20 +782,20 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
                    note='%s%s' % (note, ' (retry #%d)' % count if count else ''))
            except ExtractorError as e:
                if isinstance(e.cause, network_exceptions):
-                    if isinstance(e.cause, compat_HTTPError):
+                    if isinstance(e.cause, urllib.error.HTTPError):
                        first_bytes = e.cause.read(512)
                        if not is_html(first_bytes):
                            yt_error = try_get(
                                self._parse_json(
                                    self._webpage_read_content(e.cause, None, item_id, prefix=first_bytes) or '{}', item_id, fatal=False),
-                                lambda x: x['error']['message'], compat_str)
+                                lambda x: x['error']['message'], str)
                            if yt_error:
                                self._report_alerts([('ERROR', yt_error)], fatal=False)
                    # Downloading page may result in intermittent 5xx HTTP error
                    # Sometimes a 404 is also received. See: https://github.com/ytdl-org/youtube-dl/issues/28289
                    # We also want to catch all other network exceptions since errors in later pages can be troublesome
                    # See https://github.com/yt-dlp/yt-dlp/issues/507#issuecomment-880188210
-                    if not isinstance(e.cause, compat_HTTPError) or e.cause.code not in (403, 429):
+                    if not isinstance(e.cause, urllib.error.HTTPError) or e.cause.code not in (403, 429):
                        last_error = error_to_compat_str(e.cause or e.msg)
                        if count < retries:
                            continue
@@ -2212,28 +2205,54 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
    }, {
        # Story. Requires specific player params to work.
        # Note: stories get removed after some period of time
-        'url': 'https://www.youtube.com/watch?v=yN3x1t3sieA',
+        'url': 'https://www.youtube.com/watch?v=vv8qTUWmulI',
        'info_dict': {
-            'id': 'yN3x1t3sieA',
+            'id': 'vv8qTUWmulI',
            'ext': 'mp4',
-            'uploader': 'Linus Tech Tips',
-            'duration': 13,
-            'channel': 'Linus Tech Tips',
+            'availability': 'unlisted',
+            'view_count': int,
+            'channel_id': 'UCzIZ8HrzDgc-pNQDUG6avBA',
+            'upload_date': '20220526',
+            'categories': ['Education'],
+            'title': 'Story',
+            'channel': 'IT\'S HISTORY',
+            'description': '',
+            'uploader_id': 'BlastfromthePast',
+            'duration': 12,
+            'uploader': 'IT\'S HISTORY',
            'playable_in_embed': True,
-            'tags': [],
            'age_limit': 0,
-            'uploader_url': 'http://www.youtube.com/user/LinusTechTips',
-            'upload_date': '20220402',
-            'thumbnail': 'https://i.ytimg.com/vi_webp/yN3x1t3sieA/maxresdefault.webp',
-            'title': 'Story',
            'live_status': 'not_live',
-            'uploader_id': 'LinusTechTips',
+            'tags': [],
+            'thumbnail': 'https://i.ytimg.com/vi_webp/vv8qTUWmulI/maxresdefault.webp',
+            'uploader_url': 'http://www.youtube.com/user/BlastfromthePast',
+            'channel_url': 'https://www.youtube.com/channel/UCzIZ8HrzDgc-pNQDUG6avBA',
+        }
    }, {
+        'url': 'https://www.youtube.com/watch?v=tjjjtzRLHvA',
+        'info_dict': {
+            'id': 'tjjjtzRLHvA',
+            'ext': 'mp4',
+            'title': 'ハッシュタグ無し };if window.ytcsi',
+            'upload_date': '20220323',
+            'like_count': int,
+            'availability': 'unlisted',
+            'channel': 'nao20010128nao',
+            'thumbnail': 'https://i.ytimg.com/vi_webp/tjjjtzRLHvA/maxresdefault.webp',
+            'age_limit': 0,
+            'uploader': 'nao20010128nao',
+            'uploader_id': 'nao20010128nao',
+            'categories': ['Music'],
            'view_count': int,
            'description': '',
-            'channel_id': 'UCXuqSBlHAE6Xw-yeJA0Tunw',
-            'categories': ['Science & Technology'],
-            'channel_url': 'https://www.youtube.com/channel/UCXuqSBlHAE6Xw-yeJA0Tunw',
-            'availability': 'unlisted',
+            'channel_url': 
'https://www.youtube.com/channel/UCdqltm_7iv1Vs6kp6Syke5A', + 'channel_id': 'UCdqltm_7iv1Vs6kp6Syke5A', + 'live_status': 'not_live', + 'playable_in_embed': True, + 'channel_follower_count': int, + 'duration': 6, + 'tags': [], + 'uploader_url': 'http://www.youtube.com/user/nao20010128nao', } } ] @@ -2319,7 +2338,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Obtain from MPD's maximum seq value old_mpd_url = mpd_url last_error = ctx.pop('last_error', None) - expire_fast = immediate or last_error and isinstance(last_error, compat_HTTPError) and last_error.code == 403 + expire_fast = immediate or last_error and isinstance(last_error, urllib.error.HTTPError) and last_error.code == 403 mpd_url, stream_number, is_live = (mpd_feed(format_id, 5 if expire_fast else 18000) or (mpd_url, stream_number, False)) if not refresh_sequence: @@ -2386,6 +2405,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): last_segment_url = urljoin(fragment_base_url, 'sq/%d' % idx) yield { 'url': last_segment_url, + 'fragment_count': last_seq, } if known_idx == last_seq: no_fragment_score += 5 @@ -2400,7 +2420,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _extract_player_url(self, *ytcfgs, webpage=None): player_url = traverse_obj( ytcfgs, (..., 'PLAYER_JS_URL'), (..., 'WEB_PLAYER_CONTEXT_CONFIGS', ..., 'jsUrl'), - get_all=False, expected_type=compat_str) + get_all=False, expected_type=str) if not player_url: return return urljoin('https://www.youtube.com', player_url) @@ -2417,7 +2437,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _signature_cache_id(self, example_sig): """ Return a string representation of a signature """ - return '.'.join(compat_str(len(part)) for part in example_sig.split('.')) + return '.'.join(str(len(part)) for part in example_sig.split('.')) @classmethod def _extract_player_info(cls, player_url): @@ -2447,7 +2467,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): func_id = f'js_{player_id}_{self._signature_cache_id(example_sig)}' assert os.path.basename(func_id) == func_id - cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id) + cache_spec = self.cache.load('youtube-sigfuncs', func_id) if cache_spec is not None: return lambda s: ''.join(s[i] for i in cache_spec) @@ -2455,11 +2475,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if code: res = self._parse_sig_js(code) - test_string = ''.join(map(compat_chr, range(len(example_sig)))) + test_string = ''.join(map(chr, range(len(example_sig)))) cache_res = res(test_string) cache_spec = [ord(c) for c in cache_res] - self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec) + self.cache.store('youtube-sigfuncs', func_id, cache_spec) return res def _print_sig_code(self, func, example_sig): @@ -2494,12 +2514,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: yield _genslice(start, i, step) - test_string = ''.join(map(compat_chr, range(len(example_sig)))) + test_string = ''.join(map(chr, range(len(example_sig)))) cache_res = func(test_string) cache_spec = [ord(c) for c in cache_res] expr_code = ' + '.join(gen_sig_code(cache_spec)) signature_id_tuple = '(%s)' % ( - ', '.join(compat_str(len(p)) for p in example_sig.split('.'))) + ', '.join(str(len(p)) for p in example_sig.split('.'))) code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n' ' return %s\n') % (signature_id_tuple, expr_code) self.to_screen('Extracted signature function:\n' + code) @@ -2530,22 +2550,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _decrypt_signature(self, s, video_id, player_url): """Turn the encrypted s field into a working 
signature"""
-
-        if player_url is None:
-            raise ExtractorError('Cannot decrypt signature without player_url')
-
        try:
            player_id = (player_url, self._signature_cache_id(s))
            if player_id not in self._player_cache:
-                func = self._extract_signature_function(
-                    video_id, player_url, s
-                )
+                func = self._extract_signature_function(video_id, player_url, s)
                self._player_cache[player_id] = func
            func = self._player_cache[player_id]
            self._print_sig_code(func, s)
            return func(s)
        except Exception as e:
-            raise ExtractorError('Signature extraction failed: ' + traceback.format_exc(), cause=e)
+            raise ExtractorError(traceback.format_exc(), cause=e, video_id=video_id)

    def _decrypt_nsig(self, s, video_id, player_url):
        """Turn the encrypted n field into a working signature"""
@@ -2580,7 +2594,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):

    def _extract_n_function(self, video_id, player_url):
        player_id = self._extract_player_info(player_url)
-        func_code = self._downloader.cache.load('youtube-nsig', player_id)
+        func_code = self.cache.load('youtube-nsig', player_id)

        if func_code:
            jsi = JSInterpreter(func_code)
@@ -2589,7 +2603,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
            funcname = self._extract_n_function_name(jscode)
            jsi = JSInterpreter(jscode)
            func_code = jsi.extract_function_code(funcname)
-            self._downloader.cache.store('youtube-nsig', player_id, func_code)
+            self.cache.store('youtube-nsig', player_id, func_code)

        if self.get_param('youtube_print_sig_code'):
            self.to_screen(f'Extracted nsig function from {player_id}:\n{func_code[1]}\n')
@@ -2621,30 +2635,45 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
        return sts

    def _mark_watched(self, video_id, player_responses):
-        playback_url = get_first(
-            player_responses, ('playbackTracking', 'videostatsPlaybackUrl', 'baseUrl'),
-            expected_type=url_or_none)
-        if not playback_url:
-            self.report_warning('Unable to mark watched')
-            return
-        parsed_playback_url = compat_urlparse.urlparse(playback_url)
-        qs = compat_urlparse.parse_qs(parsed_playback_url.query)
+        for is_full, key in enumerate(('videostatsPlaybackUrl', 'videostatsWatchtimeUrl')):
+            label = 'fully ' if is_full else ''
+            url = get_first(player_responses, ('playbackTracking', key, 'baseUrl'),
+                            expected_type=url_or_none)
+            if not url:
+                self.report_warning(f'Unable to mark {label}watched')
+                return
+            parsed_url = urllib.parse.urlparse(url)
+            qs = urllib.parse.parse_qs(parsed_url.query)
+
+            # cpn generation algorithm is reverse engineered from base.js.
+            # In fact it works even with dummy cpn.
+            CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
+            cpn = ''.join(CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16))
+
+            # more consistent results setting it to right before the end
+            video_length = [str(float((qs.get('len') or ['1.5'])[0]) - 1)]
+
+            qs.update({
+                'ver': ['2'],
+                'cpn': [cpn],
+                'cmt': video_length,
+                'el': 'detailpage',  # otherwise defaults to "shorts"
+            })
-        # cpn generation algorithm is reverse engineered from base.js.
-        # In fact it works even with dummy cpn.
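Read in isolation, the CPN (client playback nonce) lines added above amount to drawing 16 characters from a 64-symbol alphabet, with `& 63` clamping whatever the RNG returns into a valid index. A standalone sketch:

```python
import random

CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'

def generate_cpn(length=16):
    # & 63 maps any integer into 0..63, the index range of the alphabet,
    # so the exact bounds passed to randint() do not matter.
    return ''.join(CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(length))
```

As the original comment notes, the server accepts even a dummy nonce, so the statistical quality of the generator is not critical here.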
- CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_' - cpn = ''.join(CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)) + if is_full: + # these seem to mark watchtime "history" in the real world + # they're required, so send in a single value + qs.update({ + 'st': video_length, + 'et': video_length, + }) - qs.update({ - 'ver': ['2'], - 'cpn': [cpn], - }) - playback_url = compat_urlparse.urlunparse( - parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True))) + url = urllib.parse.urlunparse( + parsed_url._replace(query=urllib.parse.urlencode(qs, True))) - self._download_webpage( - playback_url, video_id, 'Marking watched', - 'Unable to mark watched', fatal=False) + self._download_webpage( + url, video_id, f'Marking {label}watched', + 'Unable to mark watched', fatal=False) @staticmethod def _extract_urls(webpage): @@ -2713,39 +2742,38 @@ class YoutubeIE(YoutubeBaseInfoExtractor): chapter_time = lambda chapter: parse_duration(self._get_text(chapter, 'timeDescription')) chapter_title = lambda chapter: self._get_text(chapter, 'title') - return next(( - filter(None, ( - self._extract_chapters( - traverse_obj(contents, (..., 'macroMarkersListItemRenderer')), - chapter_time, chapter_title, duration) - for contents in content_list - ))), []) - - def _extract_chapters(self, chapter_list, chapter_time, chapter_title, duration): - chapters = [] - last_chapter = {'start_time': 0} - for idx, chapter in enumerate(chapter_list or []): - title = chapter_title(chapter) - start_time = chapter_time(chapter) - if start_time is None: - continue - last_chapter['end_time'] = start_time - if start_time < last_chapter['start_time']: - if idx == 1: - chapters.pop() - self.report_warning('Invalid start time for chapter "%s"' % last_chapter['title']) - else: - self.report_warning(f'Invalid start time for chapter "{title}"') - continue - last_chapter = {'start_time': start_time, 'title': title} - chapters.append(last_chapter) - last_chapter['end_time'] = duration - return chapters + return next(filter(None, ( + self._extract_chapters(traverse_obj(contents, (..., 'macroMarkersListItemRenderer')), + chapter_time, chapter_title, duration) + for contents in content_list)), []) - def _extract_yt_initial_variable(self, webpage, regex, video_id, name): - return self._parse_json(self._search_regex( - (fr'{regex}\s*{self._YT_INITIAL_BOUNDARY_RE}', - regex), webpage, name, default='{}'), video_id, fatal=False) + def _extract_chapters_from_description(self, description, duration): + return self._extract_chapters( + re.findall(r'(?m)^((?:\d+:)?\d{1,2}:\d{2})\b\W*\s(.+?)\s*$', description or ''), + chapter_time=lambda x: parse_duration(x[0]), chapter_title=lambda x: x[1], + duration=duration, strict=False) + + def _extract_chapters(self, chapter_list, chapter_time, chapter_title, duration, strict=True): + if not duration: + return + chapter_list = [{ + 'start_time': chapter_time(chapter), + 'title': chapter_title(chapter), + } for chapter in chapter_list or []] + if not strict: + chapter_list.sort(key=lambda c: c['start_time'] or 0) + + chapters = [{'start_time': 0, 'title': '<Untitled>'}] + for idx, chapter in enumerate(chapter_list): + if chapter['start_time'] is None or not chapter['title']: + self.report_warning(f'Incomplete chapter {idx}') + elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration: + chapters[-1]['end_time'] = chapter['start_time'] + chapters.append(chapter) + else: + self.report_warning(f'Invalid start time for chapter 
"{chapter["title"]}"') + chapters[-1]['end_time'] = duration + return chapters if len(chapters) > 1 and chapters[1]['start_time'] else chapters[1:] def _extract_comment(self, comment_renderer, parent=None): comment_id = comment_renderer.get('commentId') @@ -2758,12 +2786,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): timestamp, time_text = self._extract_time_text(comment_renderer, 'publishedTimeText') author = self._get_text(comment_renderer, 'authorText') author_id = try_get(comment_renderer, - lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], compat_str) + lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], str) votes = parse_count(try_get(comment_renderer, (lambda x: x['voteCount']['simpleText'], - lambda x: x['likeCount']), compat_str)) or 0 + lambda x: x['likeCount']), str)) or 0 author_thumbnail = try_get(comment_renderer, - lambda x: x['authorThumbnail']['thumbnails'][-1]['url'], compat_str) + lambda x: x['authorThumbnail']['thumbnails'][-1]['url'], str) author_is_uploader = try_get(comment_renderer, lambda x: x['authorIsChannelOwner'], bool) is_favorited = 'creatorHeart' in (try_get( @@ -3028,9 +3056,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg): initial_pr = None if webpage: - initial_pr = self._extract_yt_initial_variable( - webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE, - video_id, 'initial player response') + initial_pr = self._search_json( + self._YT_INITIAL_PLAYER_RESPONSE_RE, webpage, 'initial player response', video_id, fatal=False) all_clients = set(clients) clients = clients[::-1] @@ -3144,16 +3171,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor): fmt_url = fmt.get('url') if not fmt_url: - sc = compat_parse_qs(fmt.get('signatureCipher')) + sc = urllib.parse.parse_qs(fmt.get('signatureCipher')) fmt_url = url_or_none(try_get(sc, lambda x: x['url'][0])) encrypted_sig = try_get(sc, lambda x: x['s'][0]) - if not (sc and fmt_url and encrypted_sig): + if not all((sc, fmt_url, player_url, encrypted_sig)): continue - if not player_url: + try: + fmt_url += '&%s=%s' % ( + traverse_obj(sc, ('sp', -1)) or 'signature', + self._decrypt_signature(encrypted_sig, video_id, player_url) + ) + except ExtractorError as e: + self.report_warning('Signature extraction failed: Some formats may be missing', only_once=True) + self.write_debug(e, only_once=True) continue - signature = self._decrypt_signature(sc['s'][0], video_id, player_url) - sp = try_get(sc, lambda x: x['sp'][0]) or 'signature' - fmt_url += '&' + sp + '=' + signature query = parse_qs(fmt_url) throttled = False @@ -3164,7 +3195,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): except ExtractorError as e: self.report_warning( 'nsig extraction failed: You may experience throttling for some formats\n' - f'n = {query["n"][0]} ; player = {player_url}\n{e}', only_once=True) + f'n = {query["n"][0]} ; player = {player_url}', only_once=True) + self.write_debug(e, only_once=True) throttled = True if itag: @@ -3380,12 +3412,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Unquote should take place before split on comma (,) since textual # fields may contain comma as well (see # https://github.com/ytdl-org/youtube-dl/issues/8536) - feed_data = compat_parse_qs( - compat_urllib_parse_unquote_plus(feed)) + feed_data = urllib.parse.parse_qs( + urllib.parse.unquote_plus(feed)) def feed_entry(name): return try_get( - feed_data, lambda x: x[name][0], compat_str) + feed_data, lambda x: x[name][0], str) feed_id = feed_entry('id') if not feed_id: @@ 
-3414,6 +3446,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
            or get_first(microformats, 'lengthSeconds')
            or parse_duration(search_meta('duration'))) or None

+        if get_first(video_details, 'isPostLiveDvr'):
+            self.write_debug('Video is in Post-Live Manifestless mode')
+            if (duration or 0) > 4 * 3600:
+                self.report_warning(
+                    'The livestream has not finished processing. Only 4 hours of the video can be currently downloaded. '
+                    'This is a known issue and patches are welcome')
+
        live_broadcast_details, is_live, streaming_data, formats = self._list_formats(
            video_id, microformats, video_details, player_responses, player_url, duration)

@@ -3523,7 +3562,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
            'uploader_id': self._search_regex(r'/(?:channel|user)/([^/?&#]+)', owner_profile_url, 'uploader id') if owner_profile_url else None,
            'uploader_url': owner_profile_url,
            'channel_id': channel_id,
-            'channel_url': format_field(channel_id, template='https://www.youtube.com/channel/%s'),
+            'channel_url': format_field(channel_id, None, 'https://www.youtube.com/channel/%s'),
            'duration': duration,
            'view_count': int_or_none(
                get_first((video_details, microformats), (..., 'viewCount'))
@@ -3593,7 +3632,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                    if 'translated_subs' in self._configuration_arg('skip'):
                        continue
                    trans_code += f'-{lang_code}'
-                    trans_name += format_field(lang_name, template=' from %s')
+                    trans_name += format_field(lang_name, None, ' from %s')
                    # Add an "-orig" label to the original language so that it can be distinguished.
                    # The subs are returned without "-orig" as well for compatibility
                    if lang_code == f'a-{orig_trans_code}':
@@ -3605,9 +3644,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
        info['automatic_captions'] = automatic_captions
        info['subtitles'] = subtitles

-        parsed_url = compat_urllib_parse_urlparse(url)
+        parsed_url = urllib.parse.urlparse(url)
        for component in [parsed_url.fragment, parsed_url.query]:
-            query = compat_parse_qs(component)
+            query = urllib.parse.parse_qs(component)
            for k, v in query.items():
                for d_k, s_ks in [('start', ('start', 't')), ('end', ('end',))]:
                    d_k += '_time'
@@ -3616,7 +3655,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
        # Youtube Music Auto-generated description
        if video_description:
-            mobj = re.search(r'(?s)(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description)
+            mobj = re.search(
+                r'''(?xs)
+                    (?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+
+                    (?P<album>[^\n]+)
+                    (?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?
+                    (?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?
+                    (.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))? 
+ .+\nAuto-generated\ by\ YouTube\.\s*$ + ''', video_description) if mobj: release_year = mobj.group('release_year') release_date = mobj.group('release_date') @@ -3634,9 +3681,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): initial_data = None if webpage: - initial_data = self._extract_yt_initial_variable( - webpage, self._YT_INITIAL_DATA_RE, video_id, - 'yt initial data') + initial_data = self.extract_yt_initial_data(video_id, webpage, fatal=False) if not initial_data: query = {'videoId': video_id} query.update(self._get_checkok_params()) @@ -3646,13 +3691,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor): headers=self.generate_api_headers(ytcfg=master_ytcfg), note='Downloading initial data API JSON') + info['comment_count'] = traverse_obj(initial_data, ( + 'contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', ..., 'itemSectionRenderer', + 'contents', ..., 'commentsEntryPointHeaderRenderer', 'commentCount', 'simpleText' + ), ( + 'engagementPanels', lambda _, v: v['engagementPanelSectionListRenderer']['panelIdentifier'] == 'comment-item-section', + 'engagementPanelSectionListRenderer', 'header', 'engagementPanelTitleHeaderRenderer', 'contextualInfo', 'runs', ..., 'text' + ), expected_type=int_or_none, get_all=False) + try: # This will error if there is no livechat initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation'] except (KeyError, IndexError, TypeError): pass else: info.setdefault('subtitles', {})['live_chat'] = [{ - 'url': f'https://www.youtube.com/watch?v={video_id}', # url is needed to set cookies + # url is needed to set cookies + 'url': f'https://www.youtube.com/watch?v={video_id}&bpctr=9999999999&has_verified=1', 'video_id': video_id, 'ext': 'json', 'protocol': 'youtube_live_chat' if is_live or is_upcoming else 'youtube_live_chat_replay', @@ -3662,6 +3716,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): info['chapters'] = ( self._extract_chapters_from_json(initial_data, duration) or self._extract_chapters_from_engagement_panel(initial_data, duration) + or self._extract_chapters_from_description(video_description, duration) or None) contents = traverse_obj( @@ -3884,7 +3939,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): # generic endpoint URL support ep_url = urljoin('https://www.youtube.com/', try_get( renderer, lambda x: x['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'], - compat_str)) + str)) if ep_url: for ie in (YoutubeTabIE, YoutubePlaylistIE, YoutubeIE): if ie.suitable(ep_url): @@ -3928,7 +3983,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): def _shelf_entries(self, shelf_renderer, skip_channels=False): ep = try_get( shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'], - compat_str) + str) shelf_url = urljoin('https://www.youtube.com', ep) if shelf_url: # Skipping links to another channels, note that checking for @@ -3988,7 +4043,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): yield entry # playlist attachment playlist_id = try_get( - post_renderer, lambda x: x['backstageAttachment']['playlistRenderer']['playlistId'], compat_str) + post_renderer, lambda x: x['backstageAttachment']['playlistRenderer']['playlistId'], str) if playlist_id: yield self.url_result( 'https://www.youtube.com/playlist?list=%s' % playlist_id, @@ -3999,7 +4054,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): if not isinstance(run, dict): continue ep_url = 
try_get( - run, lambda x: x['navigationEndpoint']['urlEndpoint']['url'], compat_str) + run, lambda x: x['navigationEndpoint']['urlEndpoint']['url'], str) if not ep_url: continue if not YoutubeIE.suitable(ep_url): @@ -4015,9 +4070,12 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): return for content in contents: renderer = content.get('backstagePostThreadRenderer') - if not isinstance(renderer, dict): + if isinstance(renderer, dict): + yield from self._post_thread_entries(renderer) continue - yield from self._post_thread_entries(renderer) + renderer = content.get('videoRenderer') + if isinstance(renderer, dict): + yield self._video_entry(renderer) r''' # unused def _rich_grid_entries(self, contents): @@ -4173,10 +4231,10 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): uploader['uploader'] = self._search_regex( r'^by (.+) and \d+ others?$', owner_text, 'uploader', default=owner_text) uploader['uploader_id'] = try_get( - owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], compat_str) + owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], str) uploader['uploader_url'] = urljoin( 'https://www.youtube.com/', - try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str)) + try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], str)) return {k: v for k, v in uploader.items() if v is not None} def _extract_from_tabs(self, item_id, ytcfg, data, tabs): @@ -4304,13 +4362,13 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): def _extract_from_playlist(self, item_id, url, data, playlist, ytcfg): title = playlist.get('title') or try_get( - data, lambda x: x['titleText']['simpleText'], compat_str) + data, lambda x: x['titleText']['simpleText'], str) playlist_id = playlist.get('playlistId') or item_id # Delegating everything except mix playlists to regular tab-based playlist URL playlist_url = urljoin(url, try_get( playlist, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'], - compat_str)) + str)) # Some playlists are unviewable but YouTube still provides a link to the (broken) playlist page [1] # [1] MLCT, RLTDwFCb4jeqaKWnciAYM-ZVHg @@ -4381,7 +4439,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): continue nav_item_renderer = menu_item.get('menuNavigationItemRenderer') text = try_get( - nav_item_renderer, lambda x: x['text']['simpleText'], compat_str) + nav_item_renderer, lambda x: x['text']['simpleText'], str) if not text or text.lower() != 'show unavailable videos': continue browse_endpoint = try_get( @@ -4402,7 +4460,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): check_get_keys='contents', fatal=False, ytcfg=ytcfg, note='Downloading API JSON with unavailable videos') - @property + @functools.cached_property def skip_webpage(self): return 'webpage' in self._configuration_arg('skip', ie_key=YoutubeTabIE.ie_key()) @@ -4423,7 +4481,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): data = self.extract_yt_initial_data(item_id, webpage or '', fatal=fatal) or {} except ExtractorError as e: if isinstance(e.cause, network_exceptions): - if not isinstance(e.cause, compat_HTTPError) or e.cause.code not in (403, 429): + if not isinstance(e.cause, urllib.error.HTTPError) or e.cause.code not in (403, 429): last_error = error_to_compat_str(e.cause or e.msg) if count < retries: continue @@ -5236,8 +5294,8 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 
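Several members in this diff (`is_authenticated` and `skip_webpage` just above) switch from `@property` to `@functools.cached_property` (routed through `..compat.functools` in the import hunk earlier), so the underlying check is computed once per instance instead of on every access. A minimal illustration:

```python
import functools

class Client:
    @functools.cached_property
    def authenticated(self):
        print('checking credentials...')  # executed only on first access
        return True

c = Client()
c.authenticated  # prints, then caches the result on the instance
c.authenticated  # served from the cache; no recomputation
```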
@YoutubeTabBaseInfoExtractor.passthrough_smuggled_data def _real_extract(self, url, smuggled_data): item_id = self._match_id(url) - url = compat_urlparse.urlunparse( - compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com')) + url = urllib.parse.urlunparse( + urllib.parse.urlparse(url)._replace(netloc='www.youtube.com')) compat_opts = self.get_param('compat_opts', []) def get_mobj(url): @@ -5257,7 +5315,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): mdata = self._extract_tab_endpoint( f'https://music.youtube.com/channel/{item_id}', item_id, default_client='web_music') murl = traverse_obj(mdata, ('microformat', 'microformatDataRenderer', 'urlCanonical'), - get_all=False, expected_type=compat_str) + get_all=False, expected_type=str) if not murl: raise ExtractorError('Failed to resolve album to playlist') return self.url_result(murl, ie=YoutubeTabIE.ie_key()) @@ -5622,11 +5680,13 @@ class YoutubeNotificationsIE(YoutubeTabBaseInfoExtractor): channel = traverse_obj( notification, ('contextualMenu', 'menuRenderer', 'items', 1, 'menuServiceItemRenderer', 'text', 'runs', 1, 'text'), expected_type=str) + notification_title = self._get_text(notification, 'shortMessage') + if notification_title: + notification_title = notification_title.replace('\xad', '') # remove soft hyphens + # TODO: handle recommended videos title = self._search_regex( - rf'{re.escape(channel)} [^:]+: (.+)', self._get_text(notification, 'shortMessage'), + rf'{re.escape(channel or "")}[^:]+: (.+)', notification_title, 'video title', default=None) - if title: - title = title.replace('\xad', '') # remove soft hyphens upload_date = (strftime_or_none(self._extract_time_text(notification, 'sentTimeText')[0], '%Y%m%d') if self._configuration_arg('approximate_date', ie_key=YoutubeTabIE.ie_key()) else None) @@ -5778,7 +5838,7 @@ class YoutubeMusicSearchURLIE(YoutubeTabBaseInfoExtractor): if params: section = next((k for k, v in self._SECTIONS.items() if v == params), params) else: - section = compat_urllib_parse_unquote_plus((url.split('#') + [''])[1]).lower() + section = urllib.parse.unquote_plus((url.split('#') + [''])[1]).lower() params = self._SECTIONS.get(section) if not params: section = None @@ -5925,14 +5985,43 @@ class YoutubeTruncatedURLIE(InfoExtractor): expected=True) -class YoutubeClipIE(InfoExtractor): +class YoutubeClipIE(YoutubeTabBaseInfoExtractor): IE_NAME = 'youtube:clip' - IE_DESC = False # Do not list - _VALID_URL = r'https?://(?:www\.)?youtube\.com/clip/' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/clip/(?P<id>[^/?#]+)' + _TESTS = [{ + # FIXME: Other metadata should be extracted from the clip, not from the base video + 'url': 'https://www.youtube.com/clip/UgytZKpehg-hEMBSn3F4AaABCQ', + 'info_dict': { + 'id': 'UgytZKpehg-hEMBSn3F4AaABCQ', + 'ext': 'mp4', + 'section_start': 29.0, + 'section_end': 39.7, + 'duration': 10.7, + } + }] def _real_extract(self, url): - self.report_warning('YouTube clips are not currently supported. 
The entire video will be downloaded instead') - return self.url_result(url, 'Generic') + clip_id = self._match_id(url) + _, data = self._extract_webpage(url, clip_id) + + video_id = traverse_obj(data, ('currentVideoEndpoint', 'watchEndpoint', 'videoId')) + if not video_id: + raise ExtractorError('Unable to find video ID') + + clip_data = traverse_obj(data, ( + 'engagementPanels', ..., 'engagementPanelSectionListRenderer', 'content', 'clipSectionRenderer', + 'contents', ..., 'clipAttributionRenderer', 'onScrubExit', 'commandExecutorCommand', 'commands', ..., + 'openPopupAction', 'popup', 'notificationActionRenderer', 'actionButton', 'buttonRenderer', 'command', + 'commandExecutorCommand', 'commands', ..., 'loopCommand'), get_all=False) + + return { + '_type': 'url_transparent', + 'url': f'https://www.youtube.com/watch?v={video_id}', + 'ie_key': YoutubeIE.ie_key(), + 'id': clip_id, + 'section_start': int(clip_data['startTimeMs']) / 1000, + 'section_end': int(clip_data['endTimeMs']) / 1000, + } class YoutubeTruncatedIDIE(InfoExtractor): diff --git a/yt_dlp/extractor/zattoo.py b/yt_dlp/extractor/zattoo.py index 16f827a7e..2a7e85472 100644 --- a/yt_dlp/extractor/zattoo.py +++ b/yt_dlp/extractor/zattoo.py @@ -220,7 +220,7 @@ class ZattooPlatformBaseIE(InfoExtractor): 'id': channel_name, 'title': channel_name, 'is_live': True, - 'format': formats, + 'formats': formats, 'subtitles': subtitles } diff --git a/yt_dlp/extractor/zdf.py b/yt_dlp/extractor/zdf.py index a388ff562..3a7f01f7a 100644 --- a/yt_dlp/extractor/zdf.py +++ b/yt_dlp/extractor/zdf.py @@ -69,6 +69,7 @@ class ZDFBaseIE(InfoExtractor): f.update({ 'url': format_url, 'format_id': join_nonempty('http', meta.get('type'), meta.get('quality')), + 'tbr': int_or_none(self._search_regex(r'_(\d+)k_', format_url, 'tbr', default=None)) }) new_formats = [f] formats.extend(merge_dicts(f, { @@ -108,7 +109,7 @@ class ZDFBaseIE(InfoExtractor): 'class': track.get('class'), 'language': track.get('language'), }) - self._sort_formats(formats, ('hasaud', 'res', 'quality', 'language_preference')) + self._sort_formats(formats, ('tbr', 'res', 'quality', 'language_preference')) duration = float_or_none(try_get( ptmd, lambda x: x['attributes']['duration']['value']), scale=1000) @@ -187,7 +188,7 @@ class ZDFIE(ZDFBaseIE): }, }, { 'url': 'https://www.zdf.de/funk/druck-11790/funk-alles-ist-verzaubert-102.html', - 'md5': '3d6f1049e9682178a11c54b91f3dd065', + 'md5': '57af4423db0455a3975d2dc4578536bc', 'info_dict': { 'ext': 'mp4', 'id': 'video_funk_1770473', @@ -230,6 +231,19 @@ class ZDFIE(ZDFBaseIE): 'timestamp': 1641355200, 'upload_date': '20220105', }, + 'skip': 'No longer available "Diese Seite wurde leider nicht gefunden"' + }, { + 'url': 'https://www.zdf.de/serien/soko-stuttgart/das-geld-anderer-leute-100.html', + 'info_dict': { + 'id': '191205_1800_sendung_sok8', + 'ext': 'mp4', + 'title': 'Das Geld anderer Leute', + 'description': 'md5:cb6f660850dc5eb7d1ab776ea094959d', + 'duration': 2581.0, + 'timestamp': 1654790700, + 'upload_date': '20220609', + 'thumbnail': 'https://epg-image.zdf.de/fotobase-webdelivery/images/e2d7e55a-09f0-424e-ac73-6cac4dd65f35?layout=2400x1350', + }, }] def _extract_entry(self, url, player, content, video_id): diff --git a/yt_dlp/extractor/zhihu.py b/yt_dlp/extractor/zhihu.py index 70eb3ccd1..d8d259dd6 100644 --- a/yt_dlp/extractor/zhihu.py +++ b/yt_dlp/extractor/zhihu.py @@ -58,7 +58,7 @@ class ZhihuIE(InfoExtractor): 'uploader': author.get('name'), 'timestamp': int_or_none(zvideo.get('published_at')), 'uploader_id': author.get('id'), 
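The `format_field(value, None, template)` rewrites in this diff (younow.py and youtube.py above, zhihu.py just below) pass the template positionally as the third argument. Roughly, and ignoring the real helper's extra parameters, the behavior can be sketched as:

```python
def format_field(obj, field=None, template='%s', default=''):
    # Simplified sketch: with field=None the object itself is the value;
    # a missing value falls back to `default` instead of raising.
    value = obj if field is None else obj.get(field)
    return template % value if value is not None else default

# Hypothetical token, for illustration only:
format_field('some-user', None, 'https://www.zhihu.com/people/%s')
# -> 'https://www.zhihu.com/people/some-user'
```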
- 'uploader_url': format_field(url_token, template='https://www.zhihu.com/people/%s'), + 'uploader_url': format_field(url_token, None, 'https://www.zhihu.com/people/%s'), 'duration': float_or_none(video.get('duration')), 'view_count': int_or_none(zvideo.get('play_count')), 'like_count': int_or_none(zvideo.get('liked_count')), diff --git a/yt_dlp/jsinterp.py b/yt_dlp/jsinterp.py index 70857b798..c95a0ff57 100644 --- a/yt_dlp/jsinterp.py +++ b/yt_dlp/jsinterp.py @@ -6,24 +6,22 @@ import re from .utils import ExtractorError, remove_quotes -_OPERATORS = [ - ('|', operator.or_), - ('^', operator.xor), - ('&', operator.and_), - ('>>', operator.rshift), - ('<<', operator.lshift), - ('-', operator.sub), - ('+', operator.add), - ('%', operator.mod), - ('/', operator.truediv), - ('*', operator.mul), -] -_ASSIGN_OPERATORS = [(op + '=', opfunc) for op, opfunc in _OPERATORS] -_ASSIGN_OPERATORS.append(('=', (lambda cur, right: right))) - -_NAME_RE = r'[a-zA-Z_$][a-zA-Z_$0-9]*' +_NAME_RE = r'[a-zA-Z_$][\w$]*' +_OPERATORS = { + '|': operator.or_, + '^': operator.xor, + '&': operator.and_, + '>>': operator.rshift, + '<<': operator.lshift, + '-': operator.sub, + '+': operator.add, + '%': operator.mod, + '/': operator.truediv, + '*': operator.mul, +} _MATCHING_PARENS = dict(zip('({[', ')}]')) +_QUOTES = '\'"' class JS_Break(ExtractorError): @@ -49,13 +47,11 @@ class LocalNameSpace(collections.ChainMap): class JSInterpreter: + __named_object_counter = 0 + def __init__(self, code, objects=None): - if objects is None: - objects = {} - self.code = code - self._functions = {} - self._objects = objects - self.__named_object_counter = 0 + self.code, self._functions = code, {} + self._objects = {} if objects is None else objects def _named_object(self, namespace, obj): self.__named_object_counter += 1 @@ -69,12 +65,17 @@ class JSInterpreter: return counters = {k: 0 for k in _MATCHING_PARENS.values()} start, splits, pos, delim_len = 0, 0, 0, len(delim) - 1 + in_quote, escaping = None, False for idx, char in enumerate(expr): if char in _MATCHING_PARENS: counters[_MATCHING_PARENS[char]] += 1 elif char in counters: counters[char] -= 1 - if char != delim[pos] or any(counters.values()): + elif not escaping and char in _QUOTES and in_quote in (char, None): + in_quote = None if in_quote else char + escaping = not escaping and in_quote and char == '\\' + + if char != delim[pos] or any(counters.values()) or in_quote: pos = 0 continue elif pos != delim_len: @@ -87,9 +88,9 @@ class JSInterpreter: break yield expr[start:] - @staticmethod - def _separate_at_paren(expr, delim): - separated = list(JSInterpreter._separate(expr, delim, 1)) + @classmethod + def _separate_at_paren(cls, expr, delim): + separated = list(cls._separate(expr, delim, 1)) if len(separated) < 2: raise ExtractorError(f'No terminating paren {delim} in {expr}') return separated[0][1:].strip(), separated[1].strip() @@ -98,33 +99,29 @@ class JSInterpreter: if allow_recursion < 0: raise ExtractorError('Recursion limit reached') - sub_statements = list(self._separate(stmt, ';')) - stmt = (sub_statements or ['']).pop() + should_abort = False + sub_statements = list(self._separate(stmt, ';')) or [''] + stmt = sub_statements.pop().lstrip() + for sub_stmt in sub_statements: ret, should_abort = self.interpret_statement(sub_stmt, local_vars, allow_recursion - 1) if should_abort: - return ret + return ret, should_abort - should_abort = False - stmt = stmt.lstrip() - stmt_m = re.match(r'var\s', stmt) - if stmt_m: - expr = stmt[len(stmt_m.group(0)):] + m = 
re.match(r'(?P<var>var\s)|return(?:\s+|$)', stmt) + if not m: # Try interpreting it as an expression + expr = stmt + elif m.group('var'): + expr = stmt[len(m.group(0)):] else: - return_m = re.match(r'return(?:\s+|$)', stmt) - if return_m: - expr = stmt[len(return_m.group(0)):] - should_abort = True - else: - # Try interpreting it as an expression - expr = stmt + expr = stmt[len(m.group(0)):] + should_abort = True - v = self.interpret_expression(expr, local_vars, allow_recursion) - return v, should_abort + return self.interpret_expression(expr, local_vars, allow_recursion), should_abort def interpret_expression(self, expr, local_vars, allow_recursion): expr = expr.strip() - if expr == '': # Empty expression + if not expr: return None if expr.startswith('{'): @@ -150,8 +147,8 @@ class JSInterpreter: for item in self._separate(inner)]) expr = name + outer - m = re.match(r'try\s*', expr) - if m: + m = re.match(r'(?P<try>try)\s*|(?:(?P<catch>catch)|(?P<for>for)|(?P<switch>switch))\s*\(', expr) + if m and m.group('try'): if expr[m.end()] == '{': try_expr, expr = self._separate_at_paren(expr[m.end():], '}') else: @@ -161,21 +158,19 @@ class JSInterpreter: return ret return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0] - m = re.match(r'catch\s*\(', expr) - if m: + elif m and m.group('catch'): # We ignore the catch block _, expr = self._separate_at_paren(expr, '}') return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0] - m = re.match(r'for\s*\(', expr) - if m: + elif m and m.group('for'): constructor, remaining = self._separate_at_paren(expr[m.end() - 1:], ')') if remaining.startswith('{'): body, expr = self._separate_at_paren(remaining, '}') else: - m = re.match(r'switch\s*\(', remaining) # FIXME - if m: - switch_val, remaining = self._separate_at_paren(remaining[m.end() - 1:], ')') + switch_m = re.match(r'switch\s*\(', remaining) # FIXME + if switch_m: + switch_val, remaining = self._separate_at_paren(remaining[switch_m.end() - 1:], ')') body, expr = self._separate_at_paren(remaining, '}') body = 'switch(%s){%s}' % (switch_val, body) else: @@ -200,8 +195,7 @@ class JSInterpreter: f'Premature return in the initialization of a for loop in {constructor!r}') return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0] - m = re.match(r'switch\s*\(', expr) - if m: + elif m and m.group('switch'): switch_val, remaining = self._separate_at_paren(expr[m.end() - 1:], ')') switch_val = self.interpret_expression(switch_val, local_vars, allow_recursion) body, expr = self._separate_at_paren(remaining, '}') @@ -244,55 +238,63 @@ class JSInterpreter: ret = local_vars[var] expr = expr[:start] + json.dumps(ret) + expr[end:] - for op, opfunc in _ASSIGN_OPERATORS: - m = re.match(rf'''(?x) - (?P<out>{_NAME_RE})(?:\[(?P<index>[^\]]+?)\])? - \s*{re.escape(op)} - (?P<expr>.*)$''', expr) - if not m: - continue - right_val = self.interpret_expression(m.group('expr'), local_vars, allow_recursion) + if not expr: + return None - if m.groupdict().get('index'): - lvar = local_vars[m.group('out')] - idx = self.interpret_expression(m.group('index'), local_vars, allow_recursion) - if not isinstance(idx, int): - raise ExtractorError(f'List indices must be integers: {idx}') - cur = lvar[idx] - val = opfunc(cur, right_val) - lvar[idx] = val - return val + m = re.match(fr'''(?x) + (?P<assign> + (?P<out>{_NAME_RE})(?:\[(?P<index>[^\]]+?)\])?\s* + (?P<op>{"|".join(map(re.escape, _OPERATORS))})? 
+ =(?P<expr>.*)$ + )|(?P<return> + (?!if|return|true|false|null)(?P<name>{_NAME_RE})$ + )|(?P<indexing> + (?P<in>{_NAME_RE})\[(?P<idx>.+)\]$ + )|(?P<attribute> + (?P<var>{_NAME_RE})(?:\.(?P<member>[^(]+)|\[(?P<member2>[^\]]+)\])\s* + )|(?P<function> + (?P<fname>{_NAME_RE})\((?P<args>[\w$,]*)\)$ + )''', expr) + if m and m.group('assign'): + if not m.group('op'): + opfunc = lambda curr, right: right else: - cur = local_vars.get(m.group('out')) - val = opfunc(cur, right_val) - local_vars[m.group('out')] = val - return val + opfunc = _OPERATORS[m.group('op')] + right_val = self.interpret_expression(m.group('expr'), local_vars, allow_recursion) + left_val = local_vars.get(m.group('out')) + + if not m.group('index'): + local_vars[m.group('out')] = opfunc(left_val, right_val) + return local_vars[m.group('out')] + elif left_val is None: + raise ExtractorError(f'Cannot index undefined variable: {m.group("out")}') + + idx = self.interpret_expression(m.group('index'), local_vars, allow_recursion) + if not isinstance(idx, int): + raise ExtractorError(f'List indices must be integers: {idx}') + left_val[idx] = opfunc(left_val[idx], right_val) + return left_val[idx] - if expr.isdigit(): + elif expr.isdigit(): return int(expr) - if expr == 'break': + elif expr == 'break': raise JS_Break() elif expr == 'continue': raise JS_Continue() - var_m = re.match( - r'(?!if|return|true|false|null)(?P<name>%s)$' % _NAME_RE, - expr) - if var_m: - return local_vars[var_m.group('name')] + elif m and m.group('return'): + return local_vars[m.group('name')] with contextlib.suppress(ValueError): return json.loads(expr) - m = re.match( - r'(?P<in>%s)\[(?P<idx>.+)\]$' % _NAME_RE, expr) - if m: + if m and m.group('indexing'): val = local_vars[m.group('in')] idx = self.interpret_expression(m.group('idx'), local_vars, allow_recursion) return val[idx] - for op, opfunc in _OPERATORS: + for op, opfunc in _OPERATORS.items(): separated = list(self._separate(expr, op)) if len(separated) < 2: continue @@ -308,10 +310,7 @@ class JSInterpreter: raise ExtractorError(f'Premature right-side return of {op} in {expr!r}') return opfunc(left_val or 0, right_val) - m = re.match( - r'(?P<var>%s)(?:\.(?P<member>[^(]+)|\[(?P<member2>[^]]+)\])\s*' % _NAME_RE, - expr) - if m: + if m and m.group('attribute'): variable = m.group('var') member = remove_quotes(m.group('member') or m.group('member2')) arg_str = expr[m.end():] @@ -326,7 +325,6 @@ class JSInterpreter: raise ExtractorError(f'{member} {msg}: {expr}') def eval_method(): - nonlocal member if variable == 'String': obj = str elif variable in local_vars: @@ -336,8 +334,8 @@ class JSInterpreter: self._objects[variable] = self.extract_object(variable) obj = self._objects[variable] + # Member access if arg_str is None: - # Member access if member == 'length': return len(obj) return obj[member] @@ -412,9 +410,7 @@ class JSInterpreter: except ValueError: return -1 - if isinstance(obj, list): - member = int(member) - return obj[member](argvals) + return obj[int(member) if isinstance(obj, list) else member](argvals) if remaining: return self.interpret_expression( @@ -423,9 +419,8 @@ class JSInterpreter: else: return eval_method() - m = re.match(r'^(?P<func>%s)\((?P<args>[a-zA-Z0-9_$,]*)\)$' % _NAME_RE, expr) - if m: - fname = m.group('func') + elif m and m.group('function'): + fname = m.group('fname') argvals = tuple( int(v) if v.isdigit() else local_vars[v] for v in self._separate(m.group('args'))) @@ -435,8 +430,7 @@ class JSInterpreter: self._functions[fname] = self.extract_function(fname) return 
self._functions[fname](argvals) - if expr: - raise ExtractorError('Unsupported JS expression %r' % expr) + raise ExtractorError(f'Unsupported JS expression {expr!r}') def extract_object(self, objname): _FUNC_NAME_RE = r'''(?:[a-zA-Z$0-9]+|"[a-zA-Z$0-9]+"|'[a-zA-Z$0-9]+')''' @@ -465,14 +459,17 @@ class JSInterpreter: """ @returns argnames, code """ func_m = re.search( r'''(?x) - (?:function\s+%s|[{;,]\s*%s\s*=\s*function|var\s+%s\s*=\s*function)\s* + (?: + function\s+%(name)s| + [{;,]\s*%(name)s\s*=\s*function| + var\s+%(name)s\s*=\s*function + )\s* \((?P<args>[^)]*)\)\s* - (?P<code>\{(?:(?!};)[^"]|"([^"]|\\")*")+\})''' % ( - re.escape(funcname), re.escape(funcname), re.escape(funcname)), + (?P<code>{(?:(?!};)[^"]|"([^"]|\\")*")+})''' % {'name': re.escape(funcname)}, self.code) code, _ = self._separate_at_paren(func_m.group('code'), '}') # refine the match if func_m is None: - raise ExtractorError('Could not find JS function %r' % funcname) + raise ExtractorError(f'Could not find JS function "{funcname}"') return func_m.group('args').split(','), code def extract_function(self, funcname): @@ -486,11 +483,9 @@ class JSInterpreter: break start, body_start = mobj.span() body, remaining = self._separate_at_paren(code[body_start - 1:], '}') - name = self._named_object( - local_vars, - self.extract_function_from_code( - [str.strip(x) for x in mobj.group('args').split(',')], - body, local_vars, *global_stack)) + name = self._named_object(local_vars, self.extract_function_from_code( + [x.strip() for x in mobj.group('args').split(',')], + body, local_vars, *global_stack)) code = code[:start] + name + remaining return self.build_function(argnames, code, local_vars, *global_stack) diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 91095f7f1..dddd5b15b 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -1,29 +1,15 @@ -from __future__ import unicode_literals - -import os.path +import collections +import contextlib import optparse +import os.path import re +import shlex +import shutil +import string import sys -from .compat import ( - compat_expanduser, - compat_get_terminal_size, - compat_getenv, - compat_kwargs, - compat_shlex_split, -) -from .utils import ( - Config, - expand_path, - get_executable_path, - OUTTMPL_TYPES, - POSTPROCESS_WHEN, - remove_end, - write_string, -) +from .compat import compat_expanduser from .cookies import SUPPORTED_BROWSERS, SUPPORTED_KEYRINGS -from .version import __version__ - from .downloader.external import list_external_downloaders from .postprocessor import ( FFmpegExtractAudioPP, @@ -33,22 +19,27 @@ from .postprocessor import ( SponsorBlockPP, ) from .postprocessor.modify_chapters import DEFAULT_SPONSORBLOCK_CHAPTER_TITLE +from .utils import ( + OUTTMPL_TYPES, + POSTPROCESS_WHEN, + Config, + expand_path, + get_executable_path, + join_nonempty, + remove_end, + write_string, +) +from .version import __version__ def parseOpts(overrideArguments=None, ignore_config_files='if_override'): - parser = create_parser() - root = Config(parser) - + root = Config(create_parser()) if ignore_config_files == 'if_override': ignore_config_files = overrideArguments is not None - if overrideArguments: - root.append_config(overrideArguments, label='Override') - else: - root.append_config(sys.argv[1:], label='Command-line') def _readUserConf(package_name, default=[]): # .config - xdg_config_home = compat_getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config') + xdg_config_home = os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config') userConfFile = 
os.path.join(xdg_config_home, package_name, 'config') if not os.path.isfile(userConfFile): userConfFile = os.path.join(xdg_config_home, '%s.conf' % package_name) @@ -57,7 +48,7 @@ def parseOpts(overrideArguments=None, ignore_config_files='if_override'): return userConf, userConfFile # appdata - appdata_dir = compat_getenv('appdata') + appdata_dir = os.getenv('appdata') if appdata_dir: userConfFile = os.path.join(appdata_dir, package_name, 'config') userConf = Config.read_file(userConfFile, default=None) @@ -80,7 +71,7 @@ def parseOpts(overrideArguments=None, ignore_config_files='if_override'): def add_config(label, path, user=False): """ Adds config and returns whether to continue """ - if root.parse_args()[0].ignoreconfig: + if root.parse_known_args()[0].ignoreconfig: return False # Multiple package names can be given here # Eg: ('yt-dlp', 'youtube-dlc', 'youtube-dl') will look for @@ -99,55 +90,131 @@ def parseOpts(overrideArguments=None, ignore_config_files='if_override'): def load_configs(): yield not ignore_config_files yield add_config('Portable', get_executable_path()) - yield add_config('Home', expand_path(root.parse_args()[0].paths.get('home', '')).strip()) + yield add_config('Home', expand_path(root.parse_known_args()[0].paths.get('home', '')).strip()) yield add_config('User', None, user=True) yield add_config('System', '/etc') - if all(load_configs()): - # If ignoreconfig is found inside the system configuration file, - # the user configuration is removed - if root.parse_args()[0].ignoreconfig: - user_conf = next((i for i, conf in enumerate(root.configs) if conf.label == 'User'), None) - if user_conf is not None: - root.configs.pop(user_conf) + opts = optparse.Values({'verbose': True, 'print_help': False}) + try: + try: + if overrideArguments: + root.append_config(overrideArguments, label='Override') + else: + root.append_config(sys.argv[1:], label='Command-line') + loaded_all_configs = all(load_configs()) + except ValueError as err: + raise root.parser.error(err) + + if loaded_all_configs: + # If ignoreconfig is found inside the system configuration file, + # the user configuration is removed + if root.parse_known_args()[0].ignoreconfig: + user_conf = next((i for i, conf in enumerate(root.configs) if conf.label == 'User'), None) + if user_conf is not None: + root.configs.pop(user_conf) + + opts, args = root.parse_args() + except optparse.OptParseError: + with contextlib.suppress(optparse.OptParseError): + opts, _ = root.parse_known_args(strict=False) + raise + except (SystemExit, KeyboardInterrupt): + opts.verbose = False + raise + finally: + verbose = opts.verbose and f'\n{root}'.replace('\n| ', '\n[debug] ')[1:] + if verbose: + write_string(f'{verbose}\n') + if opts.print_help: + if verbose: + write_string('\n') + root.parser.print_help() + if opts.print_help: + sys.exit() + return root.parser, opts, args - opts, args = root.parse_args() - if opts.verbose: - write_string(f'\n{root}'.replace('\n| ', '\n[debug] ')[1:] + '\n') - return parser, opts, args + +class _YoutubeDLHelpFormatter(optparse.IndentedHelpFormatter): + def __init__(self): + # No need to wrap help messages if we're on a wide console + max_width = shutil.get_terminal_size().columns or 80 + # The % is chosen to get a pretty output in README.md + super().__init__(width=max_width, max_help_position=int(0.45 * max_width)) + + @staticmethod + def format_option_strings(option): + """ ('-o', '--option') -> -o, --format METAVAR """ + opts = join_nonempty( + option._short_opts and option._short_opts[0], + 
option._long_opts and option._long_opts[0], + delim=', ') + if option.takes_value(): + opts += f' {option.metavar}' + return opts class _YoutubeDLOptionParser(optparse.OptionParser): # optparse is deprecated since python 3.2. So assume a stable interface even for private methods + ALIAS_TRIGGER_LIMIT = 100 + + def __init__(self): + super().__init__( + prog='yt-dlp' if detect_variant() == 'source' else None, + version=__version__, + usage='%prog [OPTIONS] URL [URL...]', + epilog='See full documentation at https://github.com/yt-dlp/yt-dlp#readme', + formatter=_YoutubeDLHelpFormatter(), + conflict_handler='resolve', + ) + + _UNKNOWN_OPTION = (optparse.BadOptionError, optparse.AmbiguousOptionError) + _BAD_OPTION = optparse.OptionValueError + + def parse_known_args(self, args=None, values=None, strict=True): + """Same as parse_args, but ignore unknown switches. Similar to argparse.parse_known_args""" + self.rargs, self.largs = self._get_args(args), [] + self.values = values or self.get_default_values() + while self.rargs: + arg = self.rargs[0] + try: + if arg == '--': + del self.rargs[0] + break + elif arg.startswith('--'): + self._process_long_opt(self.rargs, self.values) + elif arg.startswith('-') and arg != '-': + self._process_short_opts(self.rargs, self.values) + elif self.allow_interspersed_args: + self.largs.append(self.rargs.pop(0)) + else: + break + except optparse.OptParseError as err: + if isinstance(err, self._UNKNOWN_OPTION): + self.largs.append(err.opt_str) + elif strict: + if isinstance(err, self._BAD_OPTION): + self.error(str(err)) + raise + return self.check_values(self.values, self.largs) + + def error(self, msg): + msg = f'{self.get_prog_name()}: error: {str(msg).strip()}\n' + raise optparse.OptParseError(f'{self.get_usage()}\n{msg}' if self.usage else msg) + + def _get_args(self, args): + return sys.argv[1:] if args is None else list(args) def _match_long_opt(self, opt): """Improve ambiguous argument resolution by comparing option objects instead of argument strings""" try: return super()._match_long_opt(opt) except optparse.AmbiguousOptionError as e: - if len(set(self._long_opt[p] for p in e.possibilities)) == 1: + if len({self._long_opt[p] for p in e.possibilities}) == 1: return e.possibilities[0] raise def create_parser(): - def _format_option_string(option): - ''' ('-o', '--option') -> -o, --format METAVAR''' - - opts = [] - - if option._short_opts: - opts.append(option._short_opts[0]) - if option._long_opts: - opts.append(option._long_opts[0]) - if len(opts) > 1: - opts.insert(1, ', ') - - if option.takes_value(): - opts.append(' %s' % option.metavar) - - return ''.join(opts) - def _list_from_options_callback(option, opt_str, value, parser, append=True, delim=',', process=str.strip): # append can be True, False or -1 (prepend) current = list(getattr(parser.values, option.dest)) if append else [] @@ -190,9 +257,9 @@ def create_parser(): out_dict = dict(getattr(parser.values, option.dest)) multiple_args = not isinstance(value, str) if multiple_keys: - allowed_keys = r'(%s)(,(%s))*' % (allowed_keys, allowed_keys) + allowed_keys = fr'({allowed_keys})(,({allowed_keys}))*' mobj = re.match( - r'(?i)(?P<keys>%s)%s(?P<val>.*)$' % (allowed_keys, delimiter), + fr'(?i)(?P<keys>{allowed_keys}){delimiter}(?P<val>.*)$', value[0] if multiple_args else value) if mobj is not None: keys, val = mobj.group('keys').split(','), mobj.group('val') @@ -202,7 +269,7 @@ def create_parser(): keys, val = [default_key], value else: raise optparse.OptionValueError( - 'wrong %s formatting; it should be 
%s, not "%s"' % (opt_str, option.metavar, value)) + f'wrong {opt_str} formatting; it should be {option.metavar}, not "{value}"') try: keys = map(process_key, keys) if process_key else keys val = process(val) if process else val @@ -212,30 +279,45 @@ def create_parser(): out_dict[key] = out_dict.get(key, []) + [val] if append else val setattr(parser.values, option.dest, out_dict) - # No need to wrap help messages if we're on a wide console - columns = compat_get_terminal_size().columns - max_width = columns if columns else 80 - # 47% is chosen because that is how README.md is currently formatted - # and moving help text even further to the right is undesirable. - # This can be reduced in the future to get a prettier output - max_help_position = int(0.47 * max_width) + parser = _YoutubeDLOptionParser() + alias_group = optparse.OptionGroup(parser, 'Aliases') + Formatter = string.Formatter() - fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position) - fmt.format_option_strings = _format_option_string + def _create_alias(option, opt_str, value, parser): + aliases, opts = value + try: + nargs = len({i if f == '' else f + for i, (_, f, _, _) in enumerate(Formatter.parse(opts)) if f is not None}) + opts.format(*map(str, range(nargs))) # validate + except Exception as err: + raise optparse.OptionValueError(f'wrong {opt_str} OPTIONS formatting; {err}') + if alias_group not in parser.option_groups: + parser.add_option_group(alias_group) - kw = { - 'version': __version__, - 'formatter': fmt, - 'usage': '%prog [OPTIONS] URL [URL...]', - 'conflict_handler': 'resolve', - } + aliases = (x if x.startswith('-') else f'--{x}' for x in map(str.strip, aliases.split(','))) + try: + alias_group.add_option( + *aliases, help=opts, nargs=nargs, type='str' if nargs else None, + dest='_triggered_aliases', default=collections.defaultdict(int), + metavar=' '.join(f'ARG{i}' for i in range(nargs)), action='callback', + callback=_alias_callback, callback_kwargs={'opts': opts, 'nargs': nargs}) + except Exception as err: + raise optparse.OptionValueError(f'wrong {opt_str} formatting; {err}') - parser = _YoutubeDLOptionParser(**compat_kwargs(kw)) + def _alias_callback(option, opt_str, value, parser, opts, nargs): + counter = getattr(parser.values, option.dest) + counter[opt_str] += 1 + if counter[opt_str] > parser.ALIAS_TRIGGER_LIMIT: + raise optparse.OptionValueError(f'Alias {opt_str} exceeded invocation limit') + if nargs == 1: + value = [value] + assert (nargs == 0 and value is None) or len(value) == nargs + parser.rargs[:0] = shlex.split( + opts if value is None else opts.format(*map(shlex.quote, value))) general = optparse.OptionGroup(parser, 'General Options') general.add_option( - '-h', '--help', - action='help', + '-h', '--help', dest='print_help', action='store_true', help='Print this help text and exit') general.add_option( '--version', @@ -272,7 +354,12 @@ def create_parser(): general.add_option( '--default-search', dest='default_search', metavar='PREFIX', - help='Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for the search term "large apple". Use the value "auto" to let yt-dlp guess ("auto_warning" to emit a warning when guessing). "error" just throws an error. The default value "fixup_error" repairs broken URLs, but emits an error if this is not possible instead of searching') + help=( + 'Use this prefix for unqualified URLs. ' + 'Eg: "gvsearch2:python" downloads two videos from google videos for the search term "python". 
' 'Use the value "auto" to let yt-dlp guess ("auto_warning" to emit a warning when guessing). ' + '"error" just throws an error. The default value "fixup_error" repairs broken URLs, ' + 'but emits an error if this is not possible instead of searching')) general.add_option( '--ignore-config', '--no-config', action='store_true', dest='ignoreconfig', @@ -290,8 +377,8 @@ def create_parser(): '--config-locations', dest='config_locations', metavar='PATH', action='append', help=( - 'Location of the main configuration file; either the path to the config or its containing directory. ' - 'Can be used multiple times and inside other configuration files')) + 'Location of the main configuration file; either the path to the config or its containing directory ' + '("-" for stdin). Can be used multiple times and inside other configuration files')) general.add_option( '--flat-playlist', action='store_const', dest='extract_flat', const='in_playlist', default=False, @@ -348,16 +435,26 @@ def create_parser(): 'Options that can help keep compatibility with youtube-dl or youtube-dlc ' 'configurations by reverting some of the changes made in yt-dlp. ' 'See "Differences in default behavior" for details')) + general.add_option( + '--alias', metavar='ALIASES OPTIONS', dest='_', type='str', nargs=2, + action='callback', callback=_create_alias, + help=( + 'Create aliases for an option string. Unless an alias starts with a dash "-", it is prefixed with "--". ' + 'Arguments are parsed according to the Python string formatting mini-language. ' + 'Eg: --alias get-audio,-X "-S=aext:{0},abr -x --audio-format {0}" creates options ' + '"--get-audio" and "-X" that take an argument (ARG0) and expand to ' + '"-S=aext:ARG0,abr -x --audio-format ARG0". All defined aliases are listed in the --help output. ' + 'Alias options can trigger more aliases; so be careful to avoid defining recursive options. ' + f'As a safety measure, each alias may be triggered a maximum of {_YoutubeDLOptionParser.ALIAS_TRIGGER_LIMIT} times. ' + 'This option can be used multiple times'))
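To make the --alias machinery above concrete, here is a minimal, self-contained sketch of the validate-then-expand idea, loosely mirroring the _create_alias/_alias_callback hunk earlier in this file. It is not the actual yt-dlp code path, and the option values are made up:

    import shlex
    import string

    opts_template = '-S=aext:{0},abr -x --audio-format {0}'
    # Count distinct placeholders to learn how many arguments the alias takes;
    # {0} occurring twice still means a single argument (ARG0)
    nargs = len({field for _, field, _, _ in string.Formatter().parse(opts_template)
                 if field is not None})
    assert nargs == 1
    # On e.g. "--get-audio m4a", each argument is shell-quoted and the expansion
    # is spliced back into the not-yet-parsed argument list
    expansion = shlex.split(opts_template.format(*map(shlex.quote, ['m4a'])))
    print(expansion)  # ['-S=aext:m4a,abr', '-x', '--audio-format', 'm4a']

Because the expansion is re-tokenized with shlex, an alias can safely produce several options at once, which is why the recursion guard (ALIAS_TRIGGER_LIMIT) exists.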
network = optparse.OptionGroup(parser, 'Network Options') network.add_option( '--proxy', dest='proxy', default=None, metavar='URL', help=( - 'Use the specified HTTP/HTTPS/SOCKS proxy. To enable ' - 'SOCKS proxy, specify a proper scheme. For example ' - 'socks5://user:pass@127.0.0.1:1080/. Pass in an empty string (--proxy "") ' - 'for direct connection')) + 'Use the specified HTTP/HTTPS/SOCKS proxy. To enable SOCKS proxy, specify a proper scheme. ' + 'Eg: socks5://user:pass@127.0.0.1:1080/. Pass in an empty string (--proxy "") for direct connection')) network.add_option( '--socket-timeout', dest='socket_timeout', type=float, default=None, metavar='SECONDS', @@ -410,15 +507,19 @@ def create_parser(): selection.add_option( '--playlist-start', dest='playliststart', metavar='NUMBER', default=1, type=int, - help='Playlist video to start at (default is %default)') + help=optparse.SUPPRESS_HELP) selection.add_option( '--playlist-end', dest='playlistend', metavar='NUMBER', default=None, type=int, - help='Playlist video to end at (default is last)') + help=optparse.SUPPRESS_HELP) selection.add_option( - '--playlist-items', + '-I', '--playlist-items', dest='playlist_items', metavar='ITEM_SPEC', default=None, - help='Playlist video items to download. Specify indices of the videos in the playlist separated by commas like: "--playlist-items 1,2,5,8" if you want to download videos indexed 1, 2, 5, 8 in the playlist. You can specify range: "--playlist-items 1-3,7,10-13", it will download the videos at index 1, 2, 3, 7, 10, 11, 12 and 13') + help=( + 'Comma separated playlist_index of the videos to download. ' + 'You can specify a range using "[START]:[STOP][:STEP]". For backward compatibility, START-STOP is also supported. ' + 'Use negative indices to count from the right and negative STEP to download in reverse order. ' + 'Eg: "-I 1:3,7,-5::2" used on a playlist of size 15 will download the videos at index 1,2,3,7,11,13,15')) selection.add_option( '--match-title', dest='matchtitle', metavar='REGEX', @@ -439,9 +540,8 @@ def create_parser(): '--date', metavar='DATE', dest='date', default=None, help=( - 'Download only videos uploaded on this date. ' - 'The date can be "YYYYMMDD" or in the format ' - '"(now|today)[+-][0-9](day|week|month|year)(s)?"')) + 'Download only videos uploaded on this date. The date can be "YYYYMMDD" or in the format ' + '[now|today|yesterday][-N[day|week|month|year]]. Eg: --date today-2weeks')) selection.add_option( '--datebefore', metavar='DATE', dest='datebefore', default=None, @@ -466,7 +566,7 @@ def create_parser(): '--match-filters', metavar='FILTER', dest='match_filter', action='append', help=( - 'Generic video filter. Any field (see "OUTPUT TEMPLATE") can be compared with a ' + 'Generic video filter. Any "OUTPUT TEMPLATE" field can be compared with a ' 'number or a string using the operators defined in "Filtering formats". ' 'You can also simply specify a field to match if the field is present, ' 'use "!field" to check if the field is not present, and "&" to check multiple conditions. ' @@ -475,7 +575,8 @@ def create_parser(): '!is_live --match-filter "like_count>?100 & description~=\'(?i)\\bcats \\& dogs\\b\'" ' 'matches only videos that are not live OR those that have a like count more than 100 ' '(or the like field is not available) and also has a description ' - 'that contains the phrase "cats & dogs" (ignoring case)')) + 'that contains the phrase "cats & dogs" (caseless). ' + 'Use "--match-filter -" to interactively ask whether to download each video')) selection.add_option( '--no-match-filter', metavar='FILTER', dest='match_filter', action='store_const', const=None, @@ -515,11 +616,11 @@ def create_parser(): selection.add_option( '--break-per-input', action='store_true', dest='break_per_url', default=False, - help='Make --break-on-existing and --break-on-reject act only on the current input URL') + help='Make --break-on-existing, --break-on-reject and --max-downloads act only on the current input URL') selection.add_option( '--no-break-per-input', action='store_false', dest='break_per_url', - help='--break-on-existing and --break-on-reject terminates the entire download queue') + help='--break-on-existing and similar options terminate the entire download queue') selection.add_option( '--skip-playlist-after-errors', metavar='N', dest='skip_playlist_after_errors', default=None, type=int, @@ -574,6 +675,19 @@ def create_parser(): '--ap-list-mso', action='store_true', dest='ap_list_mso', default=False, help='List all supported multiple-system operators') + authentication.add_option( + '--client-certificate', + dest='client_certificate', metavar='CERTFILE', + help='Path to client certificate file in PEM format. 
May include the private key') + authentication.add_option( + '--client-certificate-key', + dest='client_certificate_key', metavar='KEYFILE', + help='Path to private key file for client certificate') + authentication.add_option( + '--client-certificate-password', + dest='client_certificate_password', metavar='PASSWORD', + help='Password for client certificate private key, if encrypted. ' + 'If not provided, and the key is encrypted, yt-dlp will ask interactively') video_format = optparse.OptionGroup(parser, 'Video Format Options') video_format.add_option( @@ -590,13 +704,11 @@ def create_parser(): action='store_true', dest='format_sort_force', metavar='FORMAT', default=False, help=( 'Force user specified sort order to have precedence over all fields, ' - 'see "Sorting Formats" for more details')) + 'see "Sorting Formats" for more details (Alias: --S-force)')) video_format.add_option( '--no-format-sort-force', action='store_false', dest='format_sort_force', metavar='FORMAT', default=False, - help=( - 'Some fields have precedence over the user specified sort order (default), ' - 'see "Sorting Formats" for more details')) + help='Some fields have precedence over the user specified sort order (default)') video_format.add_option( '--video-multistreams', action='store_true', dest='allow_multiple_video_streams', default=None, @@ -695,14 +807,14 @@ def create_parser(): subtitles.add_option( '--sub-format', action='store', dest='subtitlesformat', metavar='FORMAT', default='best', - help='Subtitle format, accepts formats preference, for example: "srt" or "ass/srt/best"') + help='Subtitle format; accepts formats preference, Eg: "srt" or "ass/srt/best"') subtitles.add_option( '--sub-langs', '--srt-langs', action='callback', dest='subtitleslangs', metavar='LANGS', type='str', default=[], callback=_list_from_options_callback, help=( 'Languages of the subtitles to download (can be regex) or "all" separated by commas. (Eg: --sub-langs "en.*,ja") ' - 'You can prefix the language code with a "-" to exempt it from the requested languages. (Eg: --sub-langs all,-live_chat) ' + 'You can prefix the language code with a "-" to exclude it from the requested languages. (Eg: --sub-langs all,-live_chat) ' 'Use --list-subs for a list of available language tags')) downloader = optparse.OptionGroup(parser, 'Download Options') @@ -731,13 +843,26 @@ def create_parser(): dest='fragment_retries', metavar='RETRIES', default=10, help='Number of retries for a fragment (default is %default), or "infinite" (DASH, hlsnative and ISM)') downloader.add_option( + '--retry-sleep', + dest='retry_sleep', metavar='[TYPE:]EXPR', default={}, type='str', + action='callback', callback=_dict_from_options_callback, + callback_kwargs={ + 'allowed_keys': 'http|fragment|file_access', + 'default_key': 'http', + }, help=( + 'An expression for the time to sleep between retries in seconds (optionally) prefixed ' + 'by the type of retry (file_access, fragment, http (default)) to apply the sleep to. ' + 'EXPR can be a number, linear=START[:END[:STEP=1]] or exp=START[:END[:BASE=2]]. ' + 'This option can be used multiple times to set the sleep for the different retry types. 
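A hedged sketch of the sleep schedules the three EXPR shapes describe, i.e. a plain number, linear=START[:END[:STEP=1]] and exp=START[:END[:BASE=2]] as in the Eg values just after this note; the real schedule generator lives elsewhere in yt-dlp and may differ in detail:

    import itertools

    def sleep_schedule(expr):
        """Yield successive sleep intervals, capped at END when one is given."""
        kind, _, args = expr.partition('=')
        if not args:  # plain number: constant sleep
            yield from itertools.repeat(float(kind))
            return
        start, _, rest = args.partition(':')
        end, _, last = rest.partition(':')
        start, end = float(start), float(end) if end else None
        if kind == 'linear':
            times = itertools.count(start, float(last or 1))
        else:  # 'exp'
            times = (start * float(last or 2) ** n for n in itertools.count())
        for t in times:
            yield min(t, end) if end is not None else t

    print(list(itertools.islice(sleep_schedule('linear=1::2'), 4)))  # [1.0, 3.0, 5.0, 7.0]
    print(list(itertools.islice(sleep_schedule('exp=1:20'), 6)))  # [1.0, 2.0, 4.0, 8.0, 16.0, 20.0]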
' + 'Eg: --retry-sleep linear=1::2 --retry-sleep fragment:exp=1:20')) + downloader.add_option( '--skip-unavailable-fragments', '--no-abort-on-unavailable-fragment', action='store_true', dest='skip_unavailable_fragments', default=True, - help='Skip unavailable fragments for DASH, hlsnative and ISM (default) (Alias: --no-abort-on-unavailable-fragment)') + help='Skip unavailable fragments for DASH, hlsnative and ISM downloads (default) (Alias: --no-abort-on-unavailable-fragment)') downloader.add_option( '--abort-on-unavailable-fragment', '--no-skip-unavailable-fragments', action='store_false', dest='skip_unavailable_fragments', - help='Abort downloading if a fragment is unavailable (Alias: --no-skip-unavailable-fragments)') + help='Abort download if a fragment is unavailable (Alias: --no-skip-unavailable-fragments)') downloader.add_option( '--keep-fragments', action='store_true', dest='keep_fragments', default=False, @@ -770,17 +895,25 @@ def create_parser(): help=optparse.SUPPRESS_HELP) downloader.add_option( '--playlist-reverse', - action='store_true', - help='Download playlist videos in reverse order') + action='store_true', dest='playlist_reverse', + help=optparse.SUPPRESS_HELP) downloader.add_option( '--no-playlist-reverse', action='store_false', dest='playlist_reverse', - help='Download playlist videos in default order (default)') + help=optparse.SUPPRESS_HELP) downloader.add_option( '--playlist-random', - action='store_true', + action='store_true', dest='playlist_random', help='Download playlist videos in random order') downloader.add_option( + '--lazy-playlist', + action='store_true', dest='lazy_playlist', + help='Process entries in the playlist as they are received. This disables n_entries, --playlist-random and --playlist-reverse') + downloader.add_option( + '--no-lazy-playlist', + action='store_false', dest='lazy_playlist', + help='Process videos in the playlist only after the entire playlist is parsed (default)') + downloader.add_option( '--xattr-set-filesize', dest='xattr_set_filesize', action='store_true', help='Set file xattribute ytdl.filesize with expected file size') @@ -807,6 +940,14 @@ def create_parser(): 'Do not use the mpegts container for HLS videos. ' 'This is default when not downloading live streams')) downloader.add_option( + '--download-sections', + metavar='REGEX', dest='download_ranges', action='append', + help=( + 'Download only chapters whose title matches the given regular expression. ' + 'Time ranges prefixed by a "*" can also be used in place of chapters to download the specified range. ' + 'Eg: --download-sections "*10:15-15:00" --download-sections "intro". ' + 'Needs ffmpeg. This option can be used multiple times to download multiple sections')) + downloader.add_option( '--downloader', '--external-downloader', dest='external_downloader', metavar='[PROTO:]NAME', default={}, type='str', action='callback', callback=_dict_from_options_callback, @@ -817,11 +958,11 @@ def create_parser(): }, help=( 'Name or path of the external downloader to use (optionally) prefixed by ' 'the protocols (http, ftp, m3u8, dash, rstp, rtmp, mms) to use it for. ' - 'Currently supports native, %s (Recommended: aria2c). ' + f'Currently supports native, {", ".join(list_external_downloaders())}. ' 'You can use this option multiple times to set different downloaders for different protocols. 
' 'For example, --downloader aria2c --downloader "dash,m3u8:native" will use ' 'aria2c for http/ftp downloads, and the native downloader for dash/m3u8 downloads ' - '(Alias: --external-downloader)' % ', '.join(list_external_downloaders()))) + '(Alias: --external-downloader)')) downloader.add_option( '--downloader-args', '--external-downloader-args', metavar='NAME:ARGS', dest='external_downloader_args', default={}, type='str', @@ -829,7 +970,7 @@ def create_parser(): callback_kwargs={ 'allowed_keys': r'ffmpeg_[io]\d*|%s' % '|'.join(map(re.escape, list_external_downloaders())), 'default_key': 'default', - 'process': compat_shlex_split + 'process': shlex.split }, help=( 'Give these arguments to the external downloader. ' 'Specify the downloader name and the arguments separated by a colon ":". ' @@ -936,7 +1077,8 @@ def create_parser(): }, help=( 'Field name or output template to print to screen, optionally prefixed with when to print it, separated by a ":". ' 'Supported values of "WHEN" are the same as that of --use-postprocessor, and "video" (default). ' - 'Implies --quiet and --simulate (unless --no-simulate is used). This option can be used multiple times')) + 'Implies --quiet. Implies --simulate unless --no-simulate or later stages of WHEN are used. ' + 'This option can be used multiple times')) verbosity.add_option( '--print-to-file', metavar='[WHEN:]TEMPLATE FILE', dest='print_to_file', default={}, type='str', nargs=2, @@ -1044,6 +1186,10 @@ def create_parser(): action='store_true', dest='write_pages', default=False, help='Write downloaded intermediary pages to files in the current directory to debug problems') verbosity.add_option( + '--load-pages', + action='store_true', dest='load_pages', default=False, + help=optparse.SUPPRESS_HELP) + verbosity.add_option( '--youtube-print-sig-code', action='store_true', dest='youtube_print_sig_code', default=False, help=optparse.SUPPRESS_HELP) @@ -1054,7 +1200,7 @@ def create_parser(): verbosity.add_option( '-C', '--call-home', dest='call_home', action='store_true', default=False, - # help='[Broken] Contact the yt-dlp server for debugging') + # help='Contact the yt-dlp server for debugging') help=optparse.SUPPRESS_HELP) verbosity.add_option( '--no-call-home', @@ -1102,7 +1248,7 @@ def create_parser(): filesystem.add_option( '--output-na-placeholder', dest='outtmpl_na_placeholder', metavar='TEXT', default='NA', - help=('Placeholder value for unavailable meta fields in output filename template (default: "%default")')) + help=('Placeholder for unavailable fields in "OUTPUT TEMPLATE" (default: "%default")')) filesystem.add_option( '--autonumber-size', dest='autonumber_size', metavar='NUMBER', type=int, @@ -1308,26 +1454,27 @@ def create_parser(): postproc.add_option( '--audio-format', metavar='FORMAT', dest='audioformat', default='best', help=( - 'Specify audio format to convert the audio to when -x is used. Currently supported formats are: ' - 'best (default) or one of %s' % ', '.join(FFmpegExtractAudioPP.SUPPORTED_EXTS))) + 'Format to convert the audio to when -x is used. ' + f'(currently supported: best (default), {", ".join(FFmpegExtractAudioPP.SUPPORTED_EXTS)}). ' + 'You can specify multiple rules using similar syntax as --remux-video')) postproc.add_option( '--audio-quality', metavar='QUALITY', dest='audioquality', default='5', - help='Specify ffmpeg audio quality to use when converting the audio with -x. 
Insert a value between 0 (best) and 10 (worst) for VBR or a specific bitrate like 128K (default %default)') + help=( + 'Specify ffmpeg audio quality to use when converting the audio with -x. ' + 'Insert a value between 0 (best) and 10 (worst) for VBR or a specific bitrate like 128K (default %default)')) postproc.add_option( '--remux-video', metavar='FORMAT', dest='remuxvideo', default=None, help=( - 'Remux the video into another container if necessary (currently supported: %s). ' - 'If target container does not support the video/audio codec, remuxing will fail. ' - 'You can specify multiple rules; Eg. "aac>m4a/mov>mp4/mkv" will remux aac to m4a, mov to mp4 ' - 'and anything else to mkv.' % ', '.join(FFmpegVideoRemuxerPP.SUPPORTED_EXTS))) + 'Remux the video into another container if necessary ' + f'(currently supported: {", ".join(FFmpegVideoRemuxerPP.SUPPORTED_EXTS)}). ' + 'If target container does not support the video/audio codec, remuxing will fail. You can specify multiple rules; ' + 'Eg. "aac>m4a/mov>mp4/mkv" will remux aac to m4a, mov to mp4 and anything else to mkv')) postproc.add_option( '--recode-video', metavar='FORMAT', dest='recodevideo', default=None, - help=( - 'Re-encode the video into another format if re-encoding is necessary. ' - 'The syntax and supported formats are the same as --remux-video')) + help='Re-encode the video into another format if necessary. The syntax and supported formats are the same as --remux-video') postproc.add_option( '--postprocessor-args', '--ppa', metavar='NAME:ARGS', dest='postprocessor_args', default={}, type='str', @@ -1335,7 +1482,7 @@ def create_parser(): callback_kwargs={ 'allowed_keys': r'\w+(?:\+\w+)?', 'default_key': 'default-compat', - 'process': compat_shlex_split, + 'process': shlex.split, 'multiple_keys': False }, help=( 'Give these arguments to the postprocessors. ' @@ -1424,7 +1571,7 @@ def create_parser(): dest='parse_metadata', metavar='FIELDS REGEX REPLACE', action='append', nargs=3, help='Replace text in a metadata field using the given regex. This option can be used multiple times') postproc.add_option( - '--xattrs', + '--xattrs', '--xattr', action='store_true', dest='xattrs', default=False, help='Write metadata to the video file\'s xattrs (using dublin core and xdg standards)') postproc.add_option( @@ -1497,7 +1644,8 @@ def create_parser(): metavar='FORMAT', dest='convertthumbnails', default=None, help=( 'Convert the thumbnails to another format ' - '(currently supported: %s) ' % ', '.join(FFmpegThumbnailsConvertorPP.SUPPORTED_EXTS))) + f'(currently supported: {", ".join(FFmpegThumbnailsConvertorPP.SUPPORTED_EXTS)}). ' + 'You can specify multiple rules using similar syntax as --remux-video')) postproc.add_option( '--split-chapters', '--split-tracks', dest='split_chapters', action='store_true', default=False, @@ -1514,9 +1662,7 @@ def create_parser(): metavar='REGEX', dest='remove_chapters', action='append', help=( 'Remove chapters whose title matches the given regular expression. ' - 'Time ranges prefixed by a "*" can also be used in place of chapters to remove the specified range. ' - 'Eg: --remove-chapters "*10:15-15:00" --remove-chapters "intro". ' - 'This option can be used multiple times')) + 'The syntax is the same as --download-sections. 
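The "multiple rules" syntax shared by --remux-video, --audio-format and --convert-thumbnails above is resolved by the new resolve_mapping helper added to ffmpeg.py further down in this diff. A condensed sketch of its first-match-wins semantics, where a bare format acts as a catch-all (skip-message handling omitted):

    def resolve(source, mapping):
        """Return the target for source in a mapping string like 'aac>m4a/mov>mp4/mkv'."""
        for pair in mapping.lower().split('/'):
            kv = pair.split('>', 1)
            if len(kv) == 1 or kv[0].strip() == source:
                return kv[-1].strip()
        return None  # no applicable rule

    assert resolve('aac', 'aac>m4a/mov>mp4/mkv') == 'm4a'
    assert resolve('flv', 'aac>m4a/mov>mp4/mkv') == 'mkv'  # catch-all rule
    assert resolve('avi', 'aac>m4a/mov>mp4') is None  # nothing matched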
This option can be used multiple times')) postproc.add_option( '--no-remove-chapters', dest='remove_chapters', action='store_const', const=None, help='Do not remove any chapters from the file (default)') @@ -1524,9 +1670,8 @@ def create_parser(): '--force-keyframes-at-cuts', action='store_true', dest='force_keyframes_at_cuts', default=False, help=( - 'Force keyframes around the chapters before removing/splitting them. ' - 'Requires a re-encode and thus is very slow, but the resulting video ' - 'may have fewer artifacts around the cuts')) + 'Force keyframes at cuts when downloading/splitting/removing sections. ' + 'This is slow due to needing a re-encode, but the resulting video may have fewer artifacts around the cuts')) postproc.add_option( '--no-force-keyframes-at-cuts', action='store_false', dest='force_keyframes_at_cuts', @@ -1564,8 +1709,8 @@ def create_parser(): 'aliases': {'default': ['all']} }, help=( 'SponsorBlock categories to create chapters for, separated by commas. ' - f'Available categories are all, default(=all), {", ".join(SponsorBlockPP.CATEGORIES.keys())}. ' - 'You can prefix the category with a "-" to exempt it. See [1] for description of the categories. ' + f'Available categories are {", ".join(SponsorBlockPP.CATEGORIES.keys())}, all and default (=all). ' + 'You can prefix the category with a "-" to exclude it. See [1] for description of the categories. ' 'Eg: --sponsorblock-mark all,-preview [1] https://wiki.sponsor.ajay.app/w/Segment_Categories')) sponsorblock.add_option( '--sponsorblock-remove', metavar='CATS', @@ -1586,9 +1731,9 @@ def create_parser(): '--sponsorblock-chapter-title', metavar='TEMPLATE', default=DEFAULT_SPONSORBLOCK_CHAPTER_TITLE, dest='sponsorblock_chapter_title', help=( - 'The title template for SponsorBlock chapters created by --sponsorblock-mark. ' - 'The same syntax as the output template is used, but the only available fields are ' - 'start_time, end_time, category, categories, name, category_names. Defaults to "%default"')) + 'An output template for the title of the SponsorBlock chapters created by --sponsorblock-mark. ' + 'The only available fields are start_time, end_time, category, categories, name, category_names. ' + 'Defaults to "%default"')) sponsorblock.add_option( '--no-sponsorblock', default=False, action='store_true', dest='no_sponsorblock', diff --git a/yt_dlp/postprocessor/common.py b/yt_dlp/postprocessor/common.py index addc46e5b..7c63fe8a4 100644 --- a/yt_dlp/postprocessor/common.py +++ b/yt_dlp/postprocessor/common.py @@ -45,9 +45,6 @@ class PostProcessor(metaclass=PostProcessorMetaClass): an initial argument and then with the returned value of the previous PostProcessor. - The chain will be stopped if one of them ever returns None or the end - of the chain is reached. - PostProcessor objects follow a "mutual registration" process similar to InfoExtractor objects. 
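For the --sponsorblock-chapter-title option above, a toy illustration of the six template fields each generated chapter exposes; the values here are hypothetical, and yt-dlp's output-template engine offers richer conversions than the plain %-formatting shown:

    chapter = {
        'start_time': 97.0, 'end_time': 121.5, 'name': '',
        'category': 'sponsor', 'categories': ['sponsor'],
        'category_names': ['Sponsor'],
    }
    print('[SB %(category)s] %(start_time)d-%(end_time)d' % chapter)
    # -> [SB sponsor] 97-121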
@@ -176,6 +173,8 @@ class PostProcessor(metaclass=PostProcessorMetaClass): def report_progress(self, s): s['_default_template'] = '%(postprocessor)s %(status)s' % s + if not self._downloader: + return progress_dict = s.copy() progress_dict.pop('info_dict') @@ -184,7 +183,8 @@ class PostProcessor(metaclass=PostProcessorMetaClass): progress_template = self.get_param('progress_template', {}) tmpl = progress_template.get('postprocess') if tmpl: - self._downloader.to_stdout(self._downloader.evaluate_outtmpl(tmpl, progress_dict)) + self._downloader.to_screen( + self._downloader.evaluate_outtmpl(tmpl, progress_dict), skip_eol=True, quiet=False) self._downloader.to_console_title(self._downloader.evaluate_outtmpl( progress_template.get('postprocess-title') or 'yt-dlp %(progress._default_template)s', @@ -213,5 +213,5 @@ class PostProcessor(metaclass=PostProcessorMetaClass): raise PostProcessingError(f'Unable to communicate with {self.PP_NAME} API: {e}') -class AudioConversionError(PostProcessingError): +class AudioConversionError(PostProcessingError): # Deprecated pass diff --git a/yt_dlp/postprocessor/embedthumbnail.py b/yt_dlp/postprocessor/embedthumbnail.py index d36e0008e..606d90d3d 100644 --- a/yt_dlp/postprocessor/embedthumbnail.py +++ b/yt_dlp/postprocessor/embedthumbnail.py @@ -1,11 +1,11 @@ import base64 -import imghdr import os import re import subprocess from .common import PostProcessor from .ffmpeg import FFmpegPostProcessor, FFmpegThumbnailsConvertorPP +from ..compat import imghdr from ..dependencies import mutagen from ..utils import ( Popen, @@ -157,14 +157,12 @@ class EmbedThumbnailPP(FFmpegPostProcessor): self._report_run('atomicparsley', filename) self.write_debug('AtomicParsley command line: %s' % shell_quote(cmd)) - p = Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - stdout, stderr = p.communicate_or_kill() - if p.returncode != 0: - msg = stderr.decode('utf-8', 'replace').strip() - self.report_warning(f'Unable to embed thumbnails using AtomicParsley; {msg}') + stdout, stderr, returncode = Popen.run(cmd, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + if returncode: + self.report_warning(f'Unable to embed thumbnails using AtomicParsley; {stderr.strip()}') # for formats that don't support thumbnails (like 3gp) AtomicParsley # won't create to the temporary file - if b'No changes' in stdout: + if 'No changes' in stdout: self.report_warning('The file format doesn\'t support embedding a thumbnail') success = False diff --git a/yt_dlp/postprocessor/ffmpeg.py b/yt_dlp/postprocessor/ffmpeg.py index d1d8e1687..d0a917379 100644 --- a/yt_dlp/postprocessor/ffmpeg.py +++ b/yt_dlp/postprocessor/ffmpeg.py @@ -6,8 +6,8 @@ import re import subprocess import time -from .common import AudioConversionError, PostProcessor -from ..compat import compat_str +from .common import PostProcessor +from ..compat import functools, imghdr from ..utils import ( ISO639Utils, Popen, @@ -18,6 +18,7 @@ from ..utils import ( dfxp2srt, encodeArgument, encodeFilename, + filter_dict, float_or_none, is_outdated_version, orderedSet, @@ -27,6 +28,7 @@ from ..utils import ( traverse_obj, variadic, write_json_file, + write_string, ) EXT_TO_OUT_FORMATS = { @@ -43,17 +45,37 @@ EXT_TO_OUT_FORMATS = { 'vtt': 'webvtt', } ACODECS = { - 'mp3': 'libmp3lame', - 'aac': 'aac', - 'flac': 'flac', - 'm4a': 'aac', - 'opus': 'libopus', - 'vorbis': 'libvorbis', - 'wav': None, - 'alac': None, + # name: (ext, encoder, opts) + 'mp3': ('mp3', 'libmp3lame', ()), + 'aac': ('m4a', 'aac', ('-f', 'adts')), + 'm4a': ('m4a', 
'aac', ('-bsf:a', 'aac_adtstoasc')), + 'opus': ('opus', 'libopus', ()), + 'vorbis': ('ogg', 'libvorbis', ()), + 'flac': ('flac', 'flac', ()), + 'alac': ('m4a', None, ('-acodec', 'alac')), + 'wav': ('wav', None, ('-f', 'wav')), } +def create_mapping_re(supported): + return re.compile(r'{0}(?:/{0})*$'.format(r'(?:\s*\w+\s*>)?\s*(?:%s)\s*' % '|'.join(supported))) + + +def resolve_mapping(source, mapping): + """ + Get corresponding item from a mapping string like 'A>B/C>D/E' + @returns (target, error_message) + """ + for pair in mapping.lower().split('/'): + kv = pair.split('>', 1) + if len(kv) == 1 or kv[0].strip() == source: + target = kv[-1].strip() + if target == source: + return target, f'already is in target format {source}' + return target, None + return None, f'could not find a mapping for {source}' + + class FFmpegPostProcessorError(PostProcessingError): pass @@ -61,16 +83,8 @@ class FFmpegPostProcessorError(PostProcessingError): class FFmpegPostProcessor(PostProcessor): def __init__(self, downloader=None): PostProcessor.__init__(self, downloader) - self._determine_executables() - - def check_version(self): - if not self.available: - raise FFmpegPostProcessorError('ffmpeg not found. Please install or provide the path using --ffmpeg-location') - - required_version = '10-0' if self.basename == 'avconv' else '1.0' - if is_outdated_version(self._versions[self.basename], required_version): - self.report_warning(f'Your copy of {self.basename} is outdated, update {self.basename} ' - f'to version {required_version} or newer if you encounter any errors') + self._prefer_ffmpeg = self.get_param('prefer_ffmpeg', True) + self._paths = self._determine_executables() @staticmethod def get_versions_and_features(downloader=None): @@ -81,88 +95,99 @@ class FFmpegPostProcessor(PostProcessor): def get_versions(downloader=None): return FFmpegPostProcessor.get_versions_and_features(downloader)[0] - _version_cache, _features_cache = {}, {} + _ffmpeg_to_avconv = {'ffmpeg': 'avconv', 'ffprobe': 'avprobe'} def _determine_executables(self): - programs = ['avprobe', 'avconv', 'ffmpeg', 'ffprobe'] - - def get_ffmpeg_version(path, prog): - if path in self._version_cache: - self._versions[prog], self._features = self._version_cache[path], self._features_cache.get(path, {}) - return - out = _get_exe_version_output(path, ['-bsfs'], to_screen=self.write_debug) - ver = detect_exe_version(out) if out else False - if ver: - regexs = [ - r'(?:\d+:)?([0-9.]+)-[0-9]+ubuntu[0-9.]+$', # Ubuntu, see [1] - r'n([0-9.]+)$', # Arch Linux - # 1. http://www.ducea.com/2006/06/17/ubuntu-package-version-naming-explanation/ - ] - for regex in regexs: - mobj = re.match(regex, ver) - if mobj: - ver = mobj.group(1) - self._versions[prog] = self._version_cache[path] = ver - if prog != 'ffmpeg' or not out: - return + programs = [*self._ffmpeg_to_avconv.keys(), *self._ffmpeg_to_avconv.values()] - mobj = re.search(r'(?m)^\s+libavformat\s+(?:[0-9. ]+)\s+/\s+(?P<runtime>[0-9. 
]+)', out) - lavf_runtime_version = mobj.group('runtime').replace(' ', '') if mobj else None - self._features = self._features_cache[path] = { - 'fdk': '--enable-libfdk-aac' in out, - 'setts': 'setts' in out.splitlines(), - 'needs_adtstoasc': is_outdated_version(lavf_runtime_version, '57.56.100', False), - } - - self.basename = None - self.probe_basename = None - self._paths = None - self._versions = None - self._features = {} - - prefer_ffmpeg = self.get_param('prefer_ffmpeg', True) location = self.get_param('ffmpeg_location') if location is None: - self._paths = {p: p for p in programs} + return {p: p for p in programs} + + if not os.path.exists(location): + self.report_warning(f'ffmpeg-location {location} does not exist! Continuing without ffmpeg') + return {} + elif os.path.isdir(location): + dirname, basename = location, None else: - if not os.path.exists(location): - self.report_warning( - 'ffmpeg-location %s does not exist! ' - 'Continuing without ffmpeg.' % (location)) - self._versions = {} - return - elif os.path.isdir(location): - dirname, basename = location, None - else: - basename = os.path.splitext(os.path.basename(location))[0] - basename = next((p for p in programs if basename.startswith(p)), 'ffmpeg') - dirname = os.path.dirname(os.path.abspath(location)) - if basename in ('ffmpeg', 'ffprobe'): - prefer_ffmpeg = True - - self._paths = { - p: os.path.join(dirname, p) for p in programs} - if basename: - self._paths[basename] = location - - self._versions = {} - # NB: probe must be first for _features to be poulated correctly - executables = {'probe_basename': ('ffprobe', 'avprobe'), 'basename': ('ffmpeg', 'avconv')} - if prefer_ffmpeg is False: - executables = {k: v[::-1] for k, v in executables.items()} - for var, prefs in executables.items(): - for p in prefs: - get_ffmpeg_version(self._paths[p], p) - if self._versions[p]: - setattr(self, var, p) - break - - if self.basename == 'avconv': - self.deprecation_warning( - 'Support for avconv is deprecated and may be removed in a future version. Use ffmpeg instead') - if self.probe_basename == 'avprobe': + basename = os.path.splitext(os.path.basename(location))[0] + basename = next((p for p in programs if basename.startswith(p)), 'ffmpeg') + dirname = os.path.dirname(os.path.abspath(location)) + if basename in self._ffmpeg_to_avconv.keys(): + self._prefer_ffmpeg = True + + paths = {p: os.path.join(dirname, p) for p in programs} + if basename: + paths[basename] = location + return paths + + _version_cache, _features_cache = {None: None}, {} + + def _get_ffmpeg_version(self, prog): + path = self._paths.get(prog) + if path in self._version_cache: + return self._version_cache[path], self._features_cache.get(path, {}) + out = _get_exe_version_output(path, ['-bsfs'], to_screen=self.write_debug) + ver = detect_exe_version(out) if out else False + if ver: + regexs = [ + r'(?:\d+:)?([0-9.]+)-[0-9]+ubuntu[0-9.]+$', # Ubuntu, see [1] + r'n([0-9.]+)$', # Arch Linux + # 1. http://www.ducea.com/2006/06/17/ubuntu-package-version-naming-explanation/ + ] + for regex in regexs: + mobj = re.match(regex, ver) + if mobj: + ver = mobj.group(1) + self._version_cache[path] = ver + if prog != 'ffmpeg' or not out: + return ver, {} + + mobj = re.search(r'(?m)^\s+libavformat\s+(?:[0-9. ]+)\s+/\s+(?P<runtime>[0-9. 
]+)', out) + lavf_runtime_version = mobj.group('runtime').replace(' ', '') if mobj else None + self._features_cache[path] = features = { + 'fdk': '--enable-libfdk-aac' in out, + 'setts': 'setts' in out.splitlines(), + 'needs_adtstoasc': is_outdated_version(lavf_runtime_version, '57.56.100', False), + } + return ver, features + + @property + def _versions(self): + return filter_dict({self.basename: self._version, self.probe_basename: self._probe_version}) + + @functools.cached_property + def basename(self): + self._version # run property + return self.basename + + @functools.cached_property + def probe_basename(self): + self._probe_version # run property + return self.probe_basename + + def _get_version(self, kind): + executables = (kind, self._ffmpeg_to_avconv[kind]) + if not self._prefer_ffmpeg: + executables = reversed(executables) + basename, version, features = next(filter( + lambda x: x[1], ((p, *self._get_ffmpeg_version(p)) for p in executables)), (None, None, {})) + if kind == 'ffmpeg': + self.basename, self._features = basename, features + else: + self.probe_basename = basename + if basename == self._ffmpeg_to_avconv[kind]: self.deprecation_warning( - 'Support for avprobe is deprecated and may be removed in a future version. Use ffprobe instead') + f'Support for {self._ffmpeg_to_avconv[kind]} is deprecated and may be removed in a future version. Use {kind} instead') + return version + + @functools.cached_property + def _version(self): + return self._get_version('ffmpeg') + + @functools.cached_property + def _probe_version(self): + return self._get_version('ffprobe') @property def available(self): @@ -170,7 +195,7 @@ class FFmpegPostProcessor(PostProcessor): @property def executable(self): - return self._paths[self.basename] + return self._paths.get(self.basename) @property def probe_available(self): @@ -178,7 +203,7 @@ class FFmpegPostProcessor(PostProcessor): @property def probe_executable(self): - return self._paths[self.probe_basename] + return self._paths.get(self.probe_basename) @staticmethod def stream_copy_opts(copy=True, *, ext=None): @@ -191,6 +216,15 @@ class FFmpegPostProcessor(PostProcessor): if ext in ('mp4', 'mov', 'm4a'): yield from ('-c:s', 'mov_text') + def check_version(self): + if not self.available: + raise FFmpegPostProcessorError('ffmpeg not found. Please install or provide the path using --ffmpeg-location') + + required_version = '10-0' if self.basename == 'avconv' else '1.0' + if is_outdated_version(self._version, required_version): + self.report_warning(f'Your copy of {self.basename} is outdated, update {self.basename} ' + f'to version {required_version} or newer if you encounter any errors') + def get_audio_codec(self, path): if not self.probe_available and not self.available: raise PostProcessingError('ffprobe and ffmpeg not found. 
Please install or provide the path using --ffmpeg-location') @@ -205,14 +239,13 @@ class FFmpegPostProcessor(PostProcessor): encodeArgument('-i')] cmd.append(encodeFilename(self._ffmpeg_filename_argument(path), True)) self.write_debug(f'{self.basename} command line: {shell_quote(cmd)}') - handle = Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - stdout_data, stderr_data = handle.communicate_or_kill() - expected_ret = 0 if self.probe_available else 1 - if handle.wait() != expected_ret: + stdout, stderr, returncode = Popen.run( + cmd, text=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + if returncode != (0 if self.probe_available else 1): return None except OSError: return None - output = (stdout_data if self.probe_available else stderr_data).decode('ascii', 'ignore') + output = stdout if self.probe_available else stderr if self.probe_available: audio_codec = None for line in output.split('\n'): @@ -246,11 +279,10 @@ class FFmpegPostProcessor(PostProcessor): ] cmd += opts - cmd.append(encodeFilename(self._ffmpeg_filename_argument(path), True)) - self.write_debug('ffprobe command line: %s' % shell_quote(cmd)) - p = Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE) - stdout, stderr = p.communicate() - return json.loads(stdout.decode('utf-8', 'replace')) + cmd.append(self._ffmpeg_filename_argument(path)) + self.write_debug(f'ffprobe command line: {shell_quote(cmd)}') + stdout, _, _ = Popen.run(cmd, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE) + return json.loads(stdout) def get_stream_number(self, path, keys, value): streams = self.get_metadata_object(path)['streams'] @@ -270,12 +302,12 @@ class FFmpegPostProcessor(PostProcessor): if fatal: raise PostProcessingError(f'Unable to determine video duration: {e.msg}') - def _duration_mismatch(self, d1, d2): + def _duration_mismatch(self, d1, d2, tolerance=2): if not d1 or not d2: return None # The duration is often only known to nearest second. So there can be <1sec disparity naturally. # Further excuse an additional <1sec difference. 
- return abs(d1 - d2) > 2 + return abs(d1 - d2) > tolerance def run_ffmpeg_multiple_files(self, input_paths, out_path, opts, **kwargs): return self.real_run_ffmpeg( @@ -312,16 +344,15 @@ class FFmpegPostProcessor(PostProcessor): for i, (path, opts) in enumerate(path_opts) if path) self.write_debug('ffmpeg command line: %s' % shell_quote(cmd)) - p = Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE) - stdout, stderr = p.communicate_or_kill() - if p.returncode not in variadic(expected_retcodes): - stderr = stderr.decode('utf-8', 'replace').strip() + _, stderr, returncode = Popen.run( + cmd, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE) + if returncode not in variadic(expected_retcodes): self.write_debug(stderr) - raise FFmpegPostProcessorError(stderr.split('\n')[-1]) + raise FFmpegPostProcessorError(stderr.strip().splitlines()[-1]) for out_path, _ in output_path_opts: if out_path: self.try_utime(out_path, oldest_mtime, oldest_mtime) - return stderr.decode('utf-8', 'replace') + return stderr def run_ffmpeg(self, path, out_path, opts, **kwargs): return self.run_ffmpeg_multiple_files([path], out_path, opts, **kwargs) @@ -391,11 +422,12 @@ class FFmpegPostProcessor(PostProcessor): class FFmpegExtractAudioPP(FFmpegPostProcessor): COMMON_AUDIO_EXTS = ('wav', 'flac', 'm4a', 'aiff', 'mp3', 'ogg', 'mka', 'opus', 'wma') - SUPPORTED_EXTS = ('aac', 'flac', 'mp3', 'm4a', 'opus', 'vorbis', 'wav', 'alac') + SUPPORTED_EXTS = tuple(ACODECS.keys()) + FORMAT_RE = create_mapping_re(('best', *SUPPORTED_EXTS)) def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, nopostoverwrites=False): FFmpegPostProcessor.__init__(self, downloader) - self._preferredcodec = preferredcodec or 'best' + self.mapping = preferredcodec or 'best' self._preferredquality = float_or_none(preferredquality) self._nopostoverwrites = nopostoverwrites @@ -430,71 +462,47 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor): try: FFmpegPostProcessor.run_ffmpeg(self, path, out_path, opts) except FFmpegPostProcessorError as err: - raise AudioConversionError(err.msg) + raise PostProcessingError(f'audio conversion failed: {err.msg}') @PostProcessor._restrict_to(images=False) def run(self, information): orig_path = path = information['filepath'] - orig_ext = information['ext'] - - if self._preferredcodec == 'best' and orig_ext in self.COMMON_AUDIO_EXTS: - self.to_screen('Skipping audio extraction since the file is already in a common audio format') + target_format, _skip_msg = resolve_mapping(information['ext'], self.mapping) + if target_format == 'best' and information['ext'] in self.COMMON_AUDIO_EXTS: + target_format, _skip_msg = None, 'the file is already in a common audio format' + if not target_format: + self.to_screen(f'Not converting audio {orig_path}; {_skip_msg}') return [], information filecodec = self.get_audio_codec(path) if filecodec is None: raise PostProcessingError('WARNING: unable to obtain file audio codec with ffprobe') - more_opts = [] - if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'): - if filecodec == 'aac' and self._preferredcodec in ['m4a', 'best']: - # Lossless, but in another container - acodec = 'copy' - extension = 'm4a' - more_opts = ['-bsf:a', 'aac_adtstoasc'] - elif filecodec in ['aac', 'flac', 'mp3', 'vorbis', 'opus']: - # Lossless if possible - acodec = 'copy' - extension = filecodec - if filecodec == 'aac': - more_opts = ['-f', 'adts'] - if 
filecodec == 'vorbis': - extension = 'ogg' - elif filecodec == 'alac': - acodec = None - extension = 'm4a' - more_opts += ['-acodec', 'alac'] - else: - # MP3 otherwise. - acodec = 'libmp3lame' - extension = 'mp3' - more_opts = self._quality_args(acodec) + if filecodec == 'aac' and target_format in ('m4a', 'best'): + # Lossless, but in another container + extension, _, more_opts, acodec = *ACODECS['m4a'], 'copy' + elif target_format == 'best' or target_format == filecodec: + # Lossless if possible + try: + extension, _, more_opts, acodec = *ACODECS[filecodec], 'copy' + except KeyError: + extension, acodec, more_opts = ACODECS['mp3'] else: # We convert the audio (lossy if codec is lossy) - acodec = ACODECS[self._preferredcodec] + extension, acodec, more_opts = ACODECS[target_format] if acodec == 'aac' and self._features.get('fdk'): - acodec = 'libfdk_aac' - extension = self._preferredcodec + acodec, more_opts = 'libfdk_aac', [] + + more_opts = list(more_opts) + if acodec != 'copy': more_opts = self._quality_args(acodec) - if self._preferredcodec == 'aac': - more_opts += ['-f', 'adts'] - elif self._preferredcodec == 'm4a': - more_opts += ['-bsf:a', 'aac_adtstoasc'] - elif self._preferredcodec == 'vorbis': - extension = 'ogg' - elif self._preferredcodec == 'wav': - extension = 'wav' - more_opts += ['-f', 'wav'] - elif self._preferredcodec == 'alac': - extension = 'm4a' - more_opts += ['-acodec', 'alac'] - - prefix, sep, ext = path.rpartition('.') # not os.path.splitext, since the latter does not work on unicode in all setups - temp_path = new_path = prefix + sep + extension + + # not os.path.splitext, since the latter does not work on unicode in all setups + temp_path = new_path = f'{path.rpartition(".")[0]}.{extension}' if new_path == path: if acodec == 'copy': - self.to_screen(f'File is already in target format {self._preferredcodec}, skipping') + self.to_screen(f'Not converting audio {orig_path}; file is already in target format {target_format}') return [], information orig_path = prepend_extension(path, 'orig') temp_path = prepend_extension(path, 'temp') @@ -503,14 +511,8 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor): self.to_screen('Post-process file %s exists, skipping' % new_path) return [], information - try: - self.to_screen(f'Destination: {new_path}') - self.run_ffmpeg(path, temp_path, acodec, more_opts) - except AudioConversionError as e: - raise PostProcessingError( - 'audio conversion failed: ' + e.msg) - except Exception: - raise PostProcessingError('error running ' + self.basename) + self.to_screen(f'Destination: {new_path}') + self.run_ffmpeg(path, temp_path, acodec, more_opts) os.replace(path, orig_path) os.replace(temp_path, new_path) @@ -520,26 +522,19 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor): # Try to update the date time for extracted audio file. 
if information.get('filetime') is not None: self.try_utime( - new_path, time.time(), information['filetime'], - errnote='Cannot update utime of audio file') + new_path, time.time(), information['filetime'], errnote='Cannot update utime of audio file') return [orig_path], information class FFmpegVideoConvertorPP(FFmpegPostProcessor): SUPPORTED_EXTS = ('mp4', 'mkv', 'flv', 'webm', 'mov', 'avi', 'mka', 'ogg', *FFmpegExtractAudioPP.SUPPORTED_EXTS) - FORMAT_RE = re.compile(r'{0}(?:/{0})*$'.format(r'(?:\w+>)?(?:%s)' % '|'.join(SUPPORTED_EXTS))) + FORMAT_RE = create_mapping_re(SUPPORTED_EXTS) _ACTION = 'converting' def __init__(self, downloader=None, preferedformat=None): super().__init__(downloader) - self._preferedformats = preferedformat.lower().split('/') - - def _target_ext(self, source_ext): - for pair in self._preferedformats: - kv = pair.split('>') - if len(kv) == 1 or kv[0].strip() == source_ext: - return kv[-1].strip() + self.mapping = preferedformat @staticmethod def _options(target_ext): @@ -550,11 +545,7 @@ class FFmpegVideoConvertorPP(FFmpegPostProcessor): @PostProcessor._restrict_to(images=False) def run(self, info): filename, source_ext = info['filepath'], info['ext'].lower() - target_ext = self._target_ext(source_ext) - _skip_msg = ( - f'could not find a mapping for {source_ext}' if not target_ext - else f'already is in target format {source_ext}' if source_ext == target_ext - else None) + target_ext, _skip_msg = resolve_mapping(source_ext, self.mapping) if _skip_msg: self.to_screen(f'Not {self._ACTION} media file "{filename}"; {_skip_msg}') return [], info @@ -762,7 +753,7 @@ class FFmpegMetadataPP(FFmpegPostProcessor): for key, value in info.items(): mobj = re.fullmatch(meta_regex, key) if value is not None and mobj: - metadata[mobj.group('i') or 'common'][mobj.group('key')] = value + metadata[mobj.group('i') or 'common'][mobj.group('key')] = value.replace('\0', '') # Write id3v1 metadata also since Windows Explorer can't handle id3v2 tags yield ('-write_id3v1', '1') @@ -1030,8 +1021,8 @@ class FFmpegSplitChaptersPP(FFmpegPostProcessor): self.to_screen('Chapter %03d; Destination: %s' % (number, destination)) return ( destination, - ['-ss', compat_str(chapter['start_time']), - '-t', compat_str(chapter['end_time'] - chapter['start_time'])]) + ['-ss', str(chapter['start_time']), + '-t', str(chapter['end_time'] - chapter['start_time'])]) @PostProcessor._restrict_to(images=False) def run(self, info): @@ -1054,23 +1045,22 @@ class FFmpegSplitChaptersPP(FFmpegPostProcessor): class FFmpegThumbnailsConvertorPP(FFmpegPostProcessor): SUPPORTED_EXTS = ('jpg', 'png', 'webp') + FORMAT_RE = create_mapping_re(SUPPORTED_EXTS) def __init__(self, downloader=None, format=None): super().__init__(downloader) - self.format = format + self.mapping = format - @staticmethod - def is_webp(path): - with open(encodeFilename(path), 'rb') as f: - b = f.read(12) - return b[0:4] == b'RIFF' and b[8:] == b'WEBP' + @classmethod + def is_webp(cls, path): + write_string(f'DeprecationWarning: {cls.__module__}.{cls.__name__}.is_webp is deprecated') + return imghdr.what(path) == 'webp' def fixup_webp(self, info, idx=-1): thumbnail_filename = info['thumbnails'][idx]['filepath'] _, thumbnail_ext = os.path.splitext(thumbnail_filename) if thumbnail_ext: - thumbnail_ext = thumbnail_ext[1:].lower() - if thumbnail_ext != 'webp' and self.is_webp(thumbnail_filename): + if thumbnail_ext.lower() != '.webp' and imghdr.what(thumbnail_filename) == 'webp': self.to_screen('Correcting thumbnail "%s" extension to webp' % 
thumbnail_filename) webp_filename = replace_extension(thumbnail_filename, 'webp') os.replace(thumbnail_filename, webp_filename) @@ -1103,18 +1093,17 @@ class FFmpegThumbnailsConvertorPP(FFmpegPostProcessor): continue has_thumbnail = True self.fixup_webp(info, idx) - _, thumbnail_ext = os.path.splitext(original_thumbnail) - if thumbnail_ext: - thumbnail_ext = thumbnail_ext[1:].lower() + thumbnail_ext = os.path.splitext(original_thumbnail)[1][1:].lower() if thumbnail_ext == 'jpeg': thumbnail_ext = 'jpg' - if thumbnail_ext == self.format: - self.to_screen('Thumbnail "%s" is already in the requested format' % original_thumbnail) + target_ext, _skip_msg = resolve_mapping(thumbnail_ext, self.mapping) + if _skip_msg: + self.to_screen(f'Not converting thumbnail "{original_thumbnail}"; {_skip_msg}') continue - thumbnail_dict['filepath'] = self.convert_thumbnail(original_thumbnail, self.format) + thumbnail_dict['filepath'] = self.convert_thumbnail(original_thumbnail, target_ext) files_to_delete.append(original_thumbnail) info['__files_to_move'][thumbnail_dict['filepath']] = replace_extension( - info['__files_to_move'][original_thumbnail], self.format) + info['__files_to_move'][original_thumbnail], target_ext) if not has_thumbnail: self.to_screen('There aren\'t any thumbnails to convert') diff --git a/yt_dlp/postprocessor/modify_chapters.py b/yt_dlp/postprocessor/modify_chapters.py index 8a2ef9065..de3505e11 100644 --- a/yt_dlp/postprocessor/modify_chapters.py +++ b/yt_dlp/postprocessor/modify_chapters.py @@ -32,13 +32,13 @@ class ModifyChaptersPP(FFmpegPostProcessor): real_duration = self._get_real_video_duration(info['filepath']) if not chapters: - chapters = [{'start_time': 0, 'end_time': real_duration, 'title': info['title']}] + chapters = [{'start_time': 0, 'end_time': info.get('duration') or real_duration, 'title': info['title']}] info['chapters'], cuts = self._remove_marked_arrange_sponsors(chapters + sponsor_chapters) if not cuts: return [], info - if self._duration_mismatch(real_duration, info.get('duration')): + if self._duration_mismatch(real_duration, info.get('duration'), 1): if not self._duration_mismatch(real_duration, info['chapters'][-1]['end_time']): self.to_screen(f'Skipping {self.pp_key()} since the video appears to be already cut') return [], info diff --git a/yt_dlp/postprocessor/sponskrub.py b/yt_dlp/postprocessor/sponskrub.py index 1a9f5dc66..ff50d5b4f 100644 --- a/yt_dlp/postprocessor/sponskrub.py +++ b/yt_dlp/postprocessor/sponskrub.py @@ -84,17 +84,15 @@ class SponSkrubPP(PostProcessor): cmd = [encodeArgument(i) for i in cmd] self.write_debug('sponskrub command line: %s' % shell_quote(cmd)) - pipe = None if self.get_param('verbose') else subprocess.PIPE - p = Popen(cmd, stdout=pipe) - stdout = p.communicate_or_kill()[0] + stdout, _, returncode = Popen.run(cmd, text=True, stdout=None if self.get_param('verbose') else subprocess.PIPE) - if p.returncode == 0: + if not returncode: os.replace(temp_filename, filename) self.to_screen('Sponsor sections have been %s' % ('removed' if self.cutout else 'marked')) - elif p.returncode == 3: + elif returncode == 3: self.to_screen('No segments in the SponsorBlock database') else: - msg = stdout.decode('utf-8', 'replace').strip() if stdout else '' - msg = msg.split('\n')[0 if msg.lower().startswith('unrecognised') else -1] - raise PostProcessingError(msg if msg else 'sponskrub failed with error code %s' % p.returncode) + raise PostProcessingError( + stdout.strip().splitlines()[0 if stdout.strip().lower().startswith('unrecognised') else -1] 
diff --git a/yt_dlp/postprocessor/sponsorblock.py b/yt_dlp/postprocessor/sponsorblock.py
index 7f75561db..d79ed7ae7 100644
--- a/yt_dlp/postprocessor/sponsorblock.py
+++ b/yt_dlp/postprocessor/sponsorblock.py
@@ -1,9 +1,9 @@
 import hashlib
 import json
 import re
+import urllib.parse
 
 from .ffmpeg import FFmpegPostProcessor
-from ..compat import compat_urllib_parse_urlencode
 
 
 class SponsorBlockPP(FFmpegPostProcessor):
@@ -86,7 +86,7 @@ class SponsorBlockPP(FFmpegPostProcessor):
     def _get_sponsor_segments(self, video_id, service):
         hash = hashlib.sha256(video_id.encode('ascii')).hexdigest()
         # SponsorBlock API recommends using first 4 hash characters.
-        url = f'{self._API_URL}/api/skipSegments/{hash[:4]}?' + compat_urllib_parse_urlencode({
+        url = f'{self._API_URL}/api/skipSegments/{hash[:4]}?' + urllib.parse.urlencode({
            'service': service,
            'categories': json.dumps(self._categories),
            'actionTypes': json.dumps(['skip', 'poi'])
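
`urllib.parse.urlencode` is a drop-in replacement for the removed `compat_urllib_parse_urlencode` shim on Python 3. A small sketch of the query string built here; the URL prefix and category values are illustrative, not the module's defaults:

    import json
    import urllib.parse

    query = urllib.parse.urlencode({
        'service': 'YouTube',
        'categories': json.dumps(['sponsor', 'selfpromo']),  # example categories
        'actionTypes': json.dumps(['skip', 'poi']),
    })
    print('https://sponsor.ajay.app/api/skipSegments/abcd?' + query)  # illustrative prefix
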
diff --git a/yt_dlp/socks.py b/yt_dlp/socks.py
index 34ba1394a..f93328f63 100644
--- a/yt_dlp/socks.py
+++ b/yt_dlp/socks.py
@@ -8,8 +8,9 @@
 
 import collections
 import socket
+import struct
 
-from .compat import compat_ord, compat_struct_pack, compat_struct_unpack
+from .compat import compat_ord
 
 __author__ = 'Timo Schmid <coding@timoschmid.de>'
 
@@ -19,7 +20,7 @@ SOCKS4_REPLY_VERSION = 0x00
 # if the client cannot resolve the destination host's domain name to find its
 # IP address, it should set the first three bytes of DSTIP to NULL and the last
 # byte to a non-zero value.
-SOCKS4_DEFAULT_DSTIP = compat_struct_pack('!BBBB', 0, 0, 0, 0xFF)
+SOCKS4_DEFAULT_DSTIP = struct.pack('!BBBB', 0, 0, 0, 0xFF)
 
 SOCKS5_VERSION = 5
 SOCKS5_USER_AUTH_VERSION = 0x01
@@ -122,11 +123,11 @@ class sockssocket(socket.socket):
 
     def _recv_bytes(self, cnt):
         data = self.recvall(cnt)
-        return compat_struct_unpack(f'!{cnt}B', data)
+        return struct.unpack(f'!{cnt}B', data)
 
     @staticmethod
     def _len_and_data(data):
-        return compat_struct_pack('!B', len(data)) + data
+        return struct.pack('!B', len(data)) + data
 
     def _check_response_version(self, expected_version, got_version):
         if got_version != expected_version:
@@ -147,7 +148,7 @@ class sockssocket(socket.socket):
 
         ipaddr = self._resolve_address(destaddr, SOCKS4_DEFAULT_DSTIP, use_remote_dns=is_4a)
 
-        packet = compat_struct_pack('!BBH', SOCKS4_VERSION, Socks4Command.CMD_CONNECT, port) + ipaddr
+        packet = struct.pack('!BBH', SOCKS4_VERSION, Socks4Command.CMD_CONNECT, port) + ipaddr
 
         username = (self._proxy.username or '').encode()
         packet += username + b'\x00'
@@ -157,7 +158,7 @@ class sockssocket(socket.socket):
 
         self.sendall(packet)
 
-        version, resp_code, dstport, dsthost = compat_struct_unpack('!BBHI', self.recvall(8))
+        version, resp_code, dstport, dsthost = struct.unpack('!BBHI', self.recvall(8))
 
         self._check_response_version(SOCKS4_REPLY_VERSION, version)
 
@@ -171,14 +172,14 @@ class sockssocket(socket.socket):
         self._setup_socks4(address, is_4a=True)
 
     def _socks5_auth(self):
-        packet = compat_struct_pack('!B', SOCKS5_VERSION)
+        packet = struct.pack('!B', SOCKS5_VERSION)
 
         auth_methods = [Socks5Auth.AUTH_NONE]
         if self._proxy.username and self._proxy.password:
             auth_methods.append(Socks5Auth.AUTH_USER_PASS)
 
-        packet += compat_struct_pack('!B', len(auth_methods))
-        packet += compat_struct_pack(f'!{len(auth_methods)}B', *auth_methods)
+        packet += struct.pack('!B', len(auth_methods))
+        packet += struct.pack(f'!{len(auth_methods)}B', *auth_methods)
 
         self.sendall(packet)
 
@@ -194,7 +195,7 @@ class sockssocket(socket.socket):
         if method == Socks5Auth.AUTH_USER_PASS:
             username = self._proxy.username.encode()
             password = self._proxy.password.encode()
-            packet = compat_struct_pack('!B', SOCKS5_USER_AUTH_VERSION)
+            packet = struct.pack('!B', SOCKS5_USER_AUTH_VERSION)
             packet += self._len_and_data(username) + self._len_and_data(password)
 
             self.sendall(packet)
@@ -214,14 +215,14 @@ class sockssocket(socket.socket):
         self._socks5_auth()
 
         reserved = 0
-        packet = compat_struct_pack('!BBB', SOCKS5_VERSION, Socks5Command.CMD_CONNECT, reserved)
+        packet = struct.pack('!BBB', SOCKS5_VERSION, Socks5Command.CMD_CONNECT, reserved)
         if ipaddr is None:
             destaddr = destaddr.encode()
-            packet += compat_struct_pack('!B', Socks5AddressType.ATYP_DOMAINNAME)
+            packet += struct.pack('!B', Socks5AddressType.ATYP_DOMAINNAME)
             packet += self._len_and_data(destaddr)
         else:
-            packet += compat_struct_pack('!B', Socks5AddressType.ATYP_IPV4) + ipaddr
-        packet += compat_struct_pack('!H', port)
+            packet += struct.pack('!B', Socks5AddressType.ATYP_IPV4) + ipaddr
+        packet += struct.pack('!H', port)
 
         self.sendall(packet)
 
@@ -240,7 +241,7 @@ class sockssocket(socket.socket):
             destaddr = self.recvall(alen)
         elif atype == Socks5AddressType.ATYP_IPV6:
             destaddr = self.recvall(16)
-        destport = compat_struct_unpack('!H', self.recvall(2))[0]
+        destport = struct.unpack('!H', self.recvall(2))[0]
 
         return (destaddr, destport)
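
Every `compat_struct_*` call in socks.py becomes a direct `struct` call with identical semantics. For instance, the SOCKS4 CONNECT header packed above can be reproduced standalone (the destination address and port are made up for the example):

    import struct

    SOCKS4_VERSION = 4
    CMD_CONNECT = 1
    port, ipaddr = 8080, struct.pack('!BBBB', 127, 0, 0, 1)  # example destination
    packet = struct.pack('!BBH', SOCKS4_VERSION, CMD_CONNECT, port) + ipaddr
    assert packet == b'\x04\x01\x1f\x90\x7f\x00\x00\x01'
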
diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py
index 8c8ea384b..fba64be5a 100644
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
 import atexit
 import base64
 import binascii
@@ -11,10 +10,13 @@ import datetime
 import email.header
 import email.utils
 import errno
-import functools
 import gzip
 import hashlib
 import hmac
+import html.entities
+import html.parser
+import http.client
+import http.cookiejar
 import importlib.util
 import io
 import itertools
@@ -30,41 +32,28 @@ import re
 import shlex
 import socket
 import ssl
+import struct
 import subprocess
 import sys
 import tempfile
 import time
 import traceback
+import types
+import urllib.error
 import urllib.parse
+import urllib.request
 import xml.etree.ElementTree
 import zlib
 
+from .compat import asyncio, functools  # isort: split
 from .compat import (
-    asyncio,
-    compat_chr,
-    compat_cookiejar,
     compat_etree_fromstring,
     compat_expanduser,
-    compat_html_entities,
-    compat_html_entities_html5,
     compat_HTMLParseError,
-    compat_HTMLParser,
-    compat_http_client,
-    compat_HTTPError,
     compat_os_name,
-    compat_parse_qs,
     compat_shlex_quote,
-    compat_str,
-    compat_struct_pack,
-    compat_struct_unpack,
-    compat_urllib_error,
-    compat_urllib_parse_unquote_plus,
-    compat_urllib_parse_urlencode,
-    compat_urllib_parse_urlparse,
-    compat_urllib_request,
-    compat_urlparse,
 )
-from .dependencies import brotli, certifi, websockets
+from .dependencies import brotli, certifi, websockets, xattr
 from .socks import ProxyType, sockssocket
 
 
@@ -73,8 +62,8 @@ def register_socks_protocols():
     # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
     # URLs with protocols not in urlparse.uses_netloc are not handled correctly
     for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
-        if scheme not in compat_urlparse.uses_netloc:
-            compat_urlparse.uses_netloc.append(scheme)
+        if scheme not in urllib.parse.uses_netloc:
+            urllib.parse.uses_netloc.append(scheme)
 
 
 # This is not clearly defined otherwise
@@ -146,6 +135,7 @@ USER_AGENTS = {
 
 
 NO_DEFAULT = object()
+IDENTITY = lambda x: x
 
 ENGLISH_MONTH_NAMES = [
     'January', 'February', 'March', 'April', 'May', 'June',
@@ -248,6 +238,7 @@ JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<jso
 
 NUMBER_RE = r'\d+(?:\.\d+)?'
 
 
+@functools.cache
 def preferredencoding():
     """Get preferred encoding.
 
@@ -315,7 +306,7 @@ def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
     def _find_xpath(xpath):
         return node.find(xpath)
 
-    if isinstance(xpath, (str, compat_str)):
+    if isinstance(xpath, str):
         n = _find_xpath(xpath)
     else:
         for xp in xpath:
@@ -362,14 +353,14 @@ def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
     return n.attrib[key]
 
 
-def get_element_by_id(id, html):
+def get_element_by_id(id, html, **kwargs):
     """Return the content of the tag with the specified ID in the passed HTML document"""
-    return get_element_by_attribute('id', id, html)
+    return get_element_by_attribute('id', id, html, **kwargs)
 
 
-def get_element_html_by_id(id, html):
+def get_element_html_by_id(id, html, **kwargs):
     """Return the html of the tag with the specified ID in the passed HTML document"""
-    return get_element_html_by_attribute('id', id, html)
+    return get_element_html_by_attribute('id', id, html, **kwargs)
 
 
 def get_element_by_class(class_name, html):
@@ -384,27 +375,27 @@ def get_element_html_by_class(class_name, html):
     return retval[0] if retval else None
 
 
-def get_element_by_attribute(attribute, value, html, escape_value=True):
-    retval = get_elements_by_attribute(attribute, value, html, escape_value)
+def get_element_by_attribute(attribute, value, html, **kwargs):
+    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
     return retval[0] if retval else None
 
 
-def get_element_html_by_attribute(attribute, value, html, escape_value=True):
-    retval = get_elements_html_by_attribute(attribute, value, html, escape_value)
+def get_element_html_by_attribute(attribute, value, html, **kargs):
+    retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
     return retval[0] if retval else None
 
 
-def get_elements_by_class(class_name, html):
+def get_elements_by_class(class_name, html, **kargs):
     """Return the content of all tags with the specified class in the passed HTML document as a list"""
     return get_elements_by_attribute(
-        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
+        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
         html, escape_value=False)
 
 
 def get_elements_html_by_class(class_name, html):
     """Return the html of all tags with the specified class in the passed HTML document as a list"""
     return get_elements_html_by_attribute(
-        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
+        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
         html, escape_value=False)
 
 
@@ -443,7 +434,7 @@ def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value
     )
 
 
-class HTMLBreakOnClosingTagParser(compat_HTMLParser):
+class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
     """
     HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
     closing tag for the first opening tag it has encountered, and can be used
@@ -455,7 +446,7 @@ class HTMLBreakOnClosingTagParser(compat_HTMLParser):
 
     def __init__(self):
         self.tagstack = collections.deque()
-        compat_HTMLParser.__init__(self)
+        html.parser.HTMLParser.__init__(self)
 
     def __enter__(self):
         return self
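
The class-matching change above swaps `\b` word boundaries for quote/whitespace lookarounds, so a requested class name only matches a whole class token. A simplified illustration of why `\b` produced false positives; the attribute string is invented, with quotes included as the real matcher sees them:

    import re

    def class_re(name, word_boundary):
        if word_boundary:  # old pattern
            return r'[^\'"]*\b%s\b[^\'"]*' % re.escape(name)
        return r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(name)  # new pattern

    attr = '"foo-bar baz"'  # made-up class attribute value
    assert re.search(class_re('foo', True), attr)          # old: wrongly matches 'foo-bar'
    assert not re.search(class_re('foo', False), attr)     # new: whole token required
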
@@ -520,22 +511,22 @@ def get_element_text_and_html_by_tag(tag, html):
         raise compat_HTMLParseError('unexpected end of html')
 
 
-class HTMLAttributeParser(compat_HTMLParser):
+class HTMLAttributeParser(html.parser.HTMLParser):
     """Trivial HTML parser to gather the attributes for a single element"""
 
     def __init__(self):
         self.attrs = {}
-        compat_HTMLParser.__init__(self)
+        html.parser.HTMLParser.__init__(self)
 
     def handle_starttag(self, tag, attrs):
         self.attrs = dict(attrs)
 
 
-class HTMLListAttrsParser(compat_HTMLParser):
+class HTMLListAttrsParser(html.parser.HTMLParser):
     """HTML parser to gather the attributes for the elements of a list"""
 
     def __init__(self):
-        compat_HTMLParser.__init__(self)
+        html.parser.HTMLParser.__init__(self)
         self.items = []
         self._level = 0
 
@@ -594,6 +585,19 @@ def clean_html(html):
     return html.strip()
 
 
+class LenientJSONDecoder(json.JSONDecoder):
+    def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
+        self.transform_source, self.ignore_extra = transform_source, ignore_extra
+        super().__init__(*args, **kwargs)
+
+    def decode(self, s):
+        if self.transform_source:
+            s = self.transform_source(s)
+        if self.ignore_extra:
+            return self.raw_decode(s.lstrip())[0]
+        return super().decode(s)
+
+
 def sanitize_open(filename, open_mode):
     """Try to open the given filename, and slightly tweak it if this fails.
 
@@ -619,9 +623,9 @@ def sanitize_open(filename, open_mode):
                     # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                     raise LockingUnsupportedError()
                 stream = locked_file(filename, open_mode, block=False).__enter__()
-            except LockingUnsupportedError:
+            except OSError:
                 stream = open(filename, open_mode)
-            return (stream, filename)
+            return stream, filename
         except OSError as err:
             if attempt or err.errno in (errno.EACCES,):
                 raise
@@ -714,7 +718,9 @@ def sanitize_path(s, force=False):
 def sanitize_url(url):
     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
     # the number of unwanted failures due to missing protocol
-    if url.startswith('//'):
+    if url is None:
+        return
+    elif url.startswith('//'):
         return 'http:%s' % url
     # Fix some common typos seen so far
     COMMON_TYPOS = (
@@ -730,10 +736,10 @@ def sanitize_url(url):
 
 
 def extract_basic_auth(url):
-    parts = compat_urlparse.urlsplit(url)
+    parts = urllib.parse.urlsplit(url)
     if parts.username is None:
         return url, None
-    url = compat_urlparse.urlunsplit(parts._replace(netloc=(
+    url = urllib.parse.urlunsplit(parts._replace(netloc=(
         parts.hostname if parts.port is None
         else '%s:%d' % (parts.hostname, parts.port))))
     auth_payload = base64.b64encode(
@@ -746,7 +752,7 @@ def sanitized_Request(url, *args, **kwargs):
     if auth_header is not None:
         headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
         headers['Authorization'] = auth_header
-    return compat_urllib_request.Request(url, *args, **kwargs)
+    return urllib.request.Request(url, *args, **kwargs)
 
 
 def expand_path(s):
@@ -754,13 +760,16 @@ def expand_path(s):
     return os.path.expandvars(compat_expanduser(s))
 
 
-def orderedSet(iterable):
-    """ Remove all duplicates from the input iterable """
-    res = []
-    for el in iterable:
-        if el not in res:
-            res.append(el)
-    return res
+def orderedSet(iterable, *, lazy=False):
+    """Remove all duplicates from the input iterable"""
+    def _iter():
+        seen = []  # Do not use set since the items can be unhashable
+        for x in iterable:
+            if x not in seen:
+                seen.append(x)
+                yield x
+
+    return _iter() if lazy else list(_iter())
 
 
 def _htmlentity_transform(entity_with_semicolon):
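
`LenientJSONDecoder` is new in this commit. A minimal usage sketch (the input string is invented); note that `json.loads` forwards unknown keyword arguments to the decoder class:

    import json
    from yt_dlp.utils import LenientJSONDecoder

    s = '{"a": 1}; trailing junk'
    # ignore_extra=True discards anything after the first complete JSON value
    assert json.loads(s, cls=LenientJSONDecoder, ignore_extra=True) == {'a': 1}
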
@@ -768,13 +777,13 @@ def _htmlentity_transform(entity_with_semicolon):
     entity = entity_with_semicolon[:-1]
 
     # Known non-numeric HTML entity
-    if entity in compat_html_entities.name2codepoint:
-        return compat_chr(compat_html_entities.name2codepoint[entity])
+    if entity in html.entities.name2codepoint:
+        return chr(html.entities.name2codepoint[entity])
 
     # TODO: HTML5 allows entities without a semicolon. For example,
     # '&Eacute;ric' should be decoded as 'Éric'.
-    if entity_with_semicolon in compat_html_entities_html5:
-        return compat_html_entities_html5[entity_with_semicolon]
+    if entity_with_semicolon in html.entities.html5:
+        return html.entities.html5[entity_with_semicolon]
 
     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
     if mobj is not None:
@@ -786,7 +795,7 @@ def _htmlentity_transform(entity_with_semicolon):
             base = 10
         # See https://github.com/ytdl-org/youtube-dl/issues/7518
         with contextlib.suppress(ValueError):
-            return compat_chr(int(numstr, base))
+            return chr(int(numstr, base))
 
     # Unknown entity in name, return its literal representation
     return '&%s;' % entity
@@ -813,12 +822,9 @@ def escapeHTML(text):
 
 
 def process_communicate_or_kill(p, *args, **kwargs):
-    try:
-        return p.communicate(*args, **kwargs)
-    except BaseException:  # Including KeyboardInterrupt
-        p.kill()
-        p.wait()
-        raise
+    write_string('DeprecationWarning: yt_dlp.utils.process_communicate_or_kill is deprecated '
+                 'and may be removed in a future version. Use yt_dlp.utils.Popen.communicate_or_kill instead')
+    return Popen.communicate_or_kill(p, *args, **kwargs)
 
 
 class Popen(subprocess.Popen):
@@ -828,11 +834,30 @@ class Popen(subprocess.Popen):
     else:
         _startupinfo = None
 
-    def __init__(self, *args, **kwargs):
+    def __init__(self, *args, text=False, **kwargs):
+        if text is True:
+            kwargs['universal_newlines'] = True  # For 3.6 compatibility
+            kwargs.setdefault('encoding', 'utf-8')
+            kwargs.setdefault('errors', 'replace')
         super().__init__(*args, **kwargs, startupinfo=self._startupinfo)
 
     def communicate_or_kill(self, *args, **kwargs):
-        return process_communicate_or_kill(self, *args, **kwargs)
+        try:
+            return self.communicate(*args, **kwargs)
+        except BaseException:  # Including KeyboardInterrupt
+            self.kill(timeout=None)
+            raise
+
+    def kill(self, *, timeout=0):
+        super().kill()
+        if timeout != 0:
+            self.wait(timeout=timeout)
+
+    @classmethod
+    def run(cls, *args, **kwargs):
+        with cls(*args, **kwargs) as proc:
+            stdout, stderr = proc.communicate_or_kill()
+            return stdout or '', stderr or '', proc.returncode
 
 
 def get_subprocess_encoding():
@@ -859,7 +884,7 @@ def decodeFilename(b, for_subprocess=False):
 
 def encodeArgument(s):
     # Legacy code that uses byte strings
     # Uncomment the following line after fixing all post processors
-    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
+    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
     return s if isinstance(s, str) else s.decode('ascii')
 
@@ -873,7 +898,7 @@ def decodeOption(optval):
     if isinstance(optval, bytes):
         optval = optval.decode(preferredencoding())
 
-    assert isinstance(optval, compat_str)
+    assert isinstance(optval, str)
     return optval
 
@@ -919,22 +944,23 @@ def make_HTTPS_handler(params, **kwargs):
         context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
         # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
         context.set_ciphers('DEFAULT')
+
+    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
     if opts_check_certificate:
         if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
             context.load_verify_locations(cafile=certifi.where())
-        else:
-            try:
-                context.load_default_certs()
-                # Work around the issue in load_default_certs when there are bad certificates. See:
See: - # https://github.com/yt-dlp/yt-dlp/issues/1060, - # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312 - except ssl.SSLError: - # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151 - if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'): - for storename in ('CA', 'ROOT'): - _ssl_load_windows_store_certs(context, storename) - context.set_default_verify_paths() + try: + context.load_default_certs() + # Work around the issue in load_default_certs when there are bad certificates. See: + # https://github.com/yt-dlp/yt-dlp/issues/1060, + # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312 + except ssl.SSLError: + # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151 + if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'): + for storename in ('CA', 'ROOT'): + _ssl_load_windows_store_certs(context, storename) + context.set_default_verify_paths() + client_certfile = params.get('client_certificate') if client_certfile: try: @@ -943,6 +969,13 @@ def make_HTTPS_handler(params, **kwargs): password=params.get('client_certificate_password')) except ssl.SSLError: raise YoutubeDLError('Unable to load client certificate') + + # Some servers may reject requests if ALPN extension is not sent. See: + # https://github.com/python/cpython/issues/85140 + # https://github.com/yt-dlp/yt-dlp/issues/3878 + with contextlib.suppress(NotImplementedError): + context.set_alpn_protocols(['http/1.1']) + return YoutubeDLHTTPSHandler(params, context=context, **kwargs) @@ -970,7 +1003,7 @@ class YoutubeDLError(Exception): super().__init__(self.msg) -network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error] +network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error] if hasattr(ssl, 'CertificateError'): network_exceptions.append(ssl.CertificateError) network_exceptions = tuple(network_exceptions) @@ -993,12 +1026,14 @@ class ExtractorError(YoutubeDLError): self.video_id = video_id self.ie = ie self.exc_info = sys.exc_info() # preserve original exception + if isinstance(self.exc_info[1], ExtractorError): + self.exc_info = self.exc_info[1].exc_info super().__init__(''.join(( - format_field(ie, template='[%s] '), - format_field(video_id, template='%s: '), + format_field(ie, None, '[%s] '), + format_field(video_id, None, '%s: '), msg, - format_field(cause, template=' (caused by %r)'), + format_field(cause, None, ' (caused by %r)'), '' if expected else bug_reports_message()))) def format_traceback(self): @@ -1220,7 +1255,7 @@ def handle_youtubedl_headers(headers): return filtered_headers -class YoutubeDLHandler(compat_urllib_request.HTTPHandler): +class YoutubeDLHandler(urllib.request.HTTPHandler): """Handler for HTTP requests and responses. 
@@ -970,7 +1003,7 @@ class YoutubeDLError(Exception):
         super().__init__(self.msg)
 
 
-network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
+network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
 if hasattr(ssl, 'CertificateError'):
     network_exceptions.append(ssl.CertificateError)
 network_exceptions = tuple(network_exceptions)
@@ -993,12 +1026,14 @@ class ExtractorError(YoutubeDLError):
         self.video_id = video_id
         self.ie = ie
         self.exc_info = sys.exc_info()  # preserve original exception
+        if isinstance(self.exc_info[1], ExtractorError):
+            self.exc_info = self.exc_info[1].exc_info
 
         super().__init__(''.join((
-            format_field(ie, template='[%s] '),
-            format_field(video_id, template='%s: '),
+            format_field(ie, None, '[%s] '),
+            format_field(video_id, None, '%s: '),
             msg,
-            format_field(cause, template=' (caused by %r)'),
+            format_field(cause, None, ' (caused by %r)'),
             '' if expected else bug_reports_message())))
 
     def format_traceback(self):
@@ -1220,7 +1255,7 @@ def handle_youtubedl_headers(headers):
     return filtered_headers
 
 
-class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
+class YoutubeDLHandler(urllib.request.HTTPHandler):
     """Handler for HTTP requests and responses.
 
     This class, when installed with an OpenerDirector, automatically adds
@@ -1239,11 +1274,11 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
     """
 
     def __init__(self, params, *args, **kwargs):
-        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
+        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
         self._params = params
 
     def http_open(self, req):
-        conn_class = compat_http_client.HTTPConnection
+        conn_class = http.client.HTTPConnection
 
         socks_proxy = req.headers.get('Ytdl-socks-proxy')
         if socks_proxy:
@@ -1296,7 +1331,7 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 
         req.headers = handle_youtubedl_headers(req.headers)
 
-        return req
+        return super().do_request_(req)
 
     def http_response(self, req, resp):
         old_resp = resp
@@ -1318,18 +1353,18 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
                     break
                 else:
                     raise original_ioerror
-            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
+            resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
             resp.msg = old_resp.msg
             del resp.headers['Content-encoding']
         # deflate
         if resp.headers.get('Content-encoding', '') == 'deflate':
             gz = io.BytesIO(self.deflate(resp.read()))
-            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
+            resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
             resp.msg = old_resp.msg
             del resp.headers['Content-encoding']
         # brotli
         if resp.headers.get('Content-encoding', '') == 'br':
-            resp = compat_urllib_request.addinfourl(
+            resp = urllib.request.addinfourl(
                 io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
             resp.msg = old_resp.msg
             del resp.headers['Content-encoding']
@@ -1352,9 +1387,9 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 
 def make_socks_conn_class(base_class, socks_proxy):
     assert issubclass(base_class, (
-        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
+        http.client.HTTPConnection, http.client.HTTPSConnection))
 
-    url_components = compat_urlparse.urlparse(socks_proxy)
+    url_components = urllib.parse.urlparse(socks_proxy)
     if url_components.scheme.lower() == 'socks5':
         socks_type = ProxyType.SOCKS5
     elif url_components.scheme.lower() in ('socks', 'socks4'):
@@ -1365,7 +1400,7 @@ def make_socks_conn_class(base_class, socks_proxy):
     def unquote_if_non_empty(s):
         if not s:
             return s
-        return compat_urllib_parse_unquote_plus(s)
+        return urllib.parse.unquote_plus(s)
 
     proxy_args = (
         socks_type,
@@ -1383,7 +1418,7 @@ def make_socks_conn_class(base_class, socks_proxy):
             self.sock.settimeout(self.timeout)
             self.sock.connect((self.host, self.port))
 
-            if isinstance(self, compat_http_client.HTTPSConnection):
+            if isinstance(self, http.client.HTTPSConnection):
                 if hasattr(self, '_context'):  # Python > 2.6
                     self.sock = self._context.wrap_socket(
                         self.sock, server_hostname=self.host)
@@ -1393,10 +1428,10 @@ def make_socks_conn_class(base_class, socks_proxy):
     return SocksConnection
 
 
-class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
+class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
     def __init__(self, params, https_conn_class=None, *args, **kwargs):
-        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
-        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
+        urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
+        self._https_conn_class = https_conn_class or http.client.HTTPSConnection
         self._params = params
 
     def https_open(self, req):
@@ -1423,7 +1458,7 @@ class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
             raise
 
 
-class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
+class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
     """
     See [1] for cookie file format.
 
@@ -1494,7 +1529,7 @@ class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
         if self.filename is not None:
             filename = self.filename
         else:
-            raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)
+            raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)
 
         # Store session cookies with `expires` set to 0 instead of an empty string
         for cookie in self:
@@ -1511,7 +1546,7 @@ class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
         if self.filename is not None:
             filename = self.filename
         else:
-            raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)
+            raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)
 
         def prepare_line(line):
             if line.startswith(self._HTTPONLY_PREFIX):
@@ -1521,10 +1556,10 @@ class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
                 return line
             cookie_list = line.split('\t')
             if len(cookie_list) != self._ENTRY_LEN:
-                raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
+                raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
             cookie = self._CookieFileEntry(*cookie_list)
             if cookie.expires_at and not cookie.expires_at.isdigit():
-                raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
+                raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
             return line
 
         cf = io.StringIO()
@@ -1532,9 +1567,9 @@ class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
             for line in f:
                 try:
                     cf.write(prepare_line(line))
-                except compat_cookiejar.LoadError as e:
+                except http.cookiejar.LoadError as e:
                     if f'{line.strip()} '[0] in '[{"':
-                        raise compat_cookiejar.LoadError(
+                        raise http.cookiejar.LoadError(
                             'Cookies file must be Netscape formatted, not JSON. See '
                             'https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl')
                     write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
@@ -1557,18 +1592,18 @@ class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
             cookie.discard = True
 
 
-class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
+class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
     def __init__(self, cookiejar=None):
-        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
+        urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)
 
     def http_response(self, request, response):
-        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
+        return urllib.request.HTTPCookieProcessor.http_response(self, request, response)
 
-    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
+    https_request = urllib.request.HTTPCookieProcessor.http_request
     https_response = http_response
 
 
-class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
+class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
     """YoutubeDL redirect handler
 
     The code is based on HTTPRedirectHandler implementation from CPython [1].
@@ -1583,7 +1618,7 @@ class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
     3. https://github.com/ytdl-org/youtube-dl/issues/28768
     """
 
-    http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302
+    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302
 
     def redirect_request(self, req, fp, code, msg, headers, newurl):
         """Return a Request or None in response to a redirect.
@@ -1598,7 +1633,7 @@ class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
         m = req.get_method()
         if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                  or code in (301, 302, 303) and m == "POST")):
-            raise compat_HTTPError(req.full_url, code, msg, headers, fp)
+            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
         # Strictly (according to RFC 2616), 301 or 302 in response to
         # a POST MUST NOT cause a redirection without confirmation
         # from the user (of urllib.request, in this case).  In practice,
@@ -1625,7 +1660,7 @@ class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
         if code in (301, 302) and m == 'POST':
             m = 'GET'
 
-        return compat_urllib_request.Request(
+        return urllib.request.Request(
             newurl, headers=newheaders, origin_req_host=req.origin_req_host,
             unverifiable=True, method=m)
 
@@ -1698,7 +1733,7 @@ def unified_strdate(date_str, day_first=True):
     with contextlib.suppress(ValueError):
         upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
     if upload_date is not None:
-        return compat_str(upload_date)
+        return str(upload_date)
 
 
 def unified_timestamp(date_str, day_first=True):
@@ -1872,21 +1907,22 @@ class DateRange:
 
 
 def platform_name():
-    """ Returns the platform name as a compat_str """
+    """ Returns the platform name as a str """
     res = platform.platform()
     if isinstance(res, bytes):
         res = res.decode(preferredencoding())
 
-    assert isinstance(res, compat_str)
+    assert isinstance(res, str)
     return res
 
 
+@functools.cache
 def get_windows_version():
-    ''' Get Windows version. None if it's not running on Windows '''
+    ''' Get Windows version. returns () if it's not running on Windows '''
     if compat_os_name == 'nt':
         return version_tuple(platform.win32_ver()[1])
     else:
-        return None
+        return ()
 
 
 def write_string(s, out=None, encoding=None):
@@ -1896,15 +1932,14 @@ def write_string(s, out=None, encoding=None):
     if compat_os_name == 'nt' and supports_terminal_sequences(out):
         s = re.sub(r'([\r\n]+)', r' \1', s)
 
+    enc, buffer = None, out
     if 'b' in getattr(out, 'mode', ''):
-        byt = s.encode(encoding or preferredencoding(), 'ignore')
-        out.write(byt)
+        enc = encoding or preferredencoding()
     elif hasattr(out, 'buffer'):
+        buffer = out.buffer
         enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
-        byt = s.encode(enc, 'ignore')
-        out.buffer.write(byt)
-    else:
-        out.write(s)
+
+    buffer.write(s.encode(enc, 'ignore') if enc else s)
     out.flush()
 
@@ -1920,11 +1955,11 @@ def bytes_to_intlist(bs):
 def intlist_to_bytes(xs):
     if not xs:
         return b''
-    return compat_struct_pack('%dB' % len(xs), *xs)
+    return struct.pack('%dB' % len(xs), *xs)
 
 
-class LockingUnsupportedError(IOError):
-    msg = 'File locking is not supported on this platform'
+class LockingUnsupportedError(OSError):
+    msg = 'File locking is not supported'
 
     def __init__(self):
         super().__init__(self.msg)
@@ -1977,7 +2012,8 @@ if sys.platform == 'win32':
         if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                           (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                           0, whole_low, whole_high, f._lock_file_overlapped_p):
-            raise BlockingIOError('Locking file failed: %r' % ctypes.FormatError())
+            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
+            raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
 
     def _unlock_file(f):
         assert f._lock_file_overlapped_p
@@ -2049,8 +2085,11 @@ class locked_file:
             try:
                 self.f.truncate()
             except OSError as e:
-                if e.errno != 29:  # Illegal seek, expected when self.f is a FIFO
-                    raise e
+                if e.errno not in (
+                    errno.ESPIPE,  # Illegal seek - expected for FIFO
+                    errno.EINVAL,  # Invalid argument - expected for /dev/null
+                ):
+                    raise
         return self
 
     def unlock(self):
@@ -2077,6 +2116,7 @@ class locked_file:
         return iter(self.f)
 
 
+@functools.cache
 def get_filesystem_encoding():
     encoding = sys.getfilesystemencoding()
     return encoding if encoding is not None else 'utf-8'
@@ -2098,7 +2138,7 @@ def smuggle_url(url, data):
 
     url, idata = unsmuggle_url(url, {})
     data.update(idata)
-    sdata = compat_urllib_parse_urlencode(
+    sdata = urllib.parse.urlencode(
         {'__youtubedl_smuggle': json.dumps(data)})
     return url + '#' + sdata
 
@@ -2107,7 +2147,7 @@ def unsmuggle_url(smug_url, default=None):
     if '#__youtubedl_smuggle' not in smug_url:
         return smug_url, default
     url, _, sdata = smug_url.rpartition('#')
-    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
+    jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
     data = json.loads(jsond)
     return url, data
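
`smuggle_url`/`unsmuggle_url` keep their round-trip behaviour; only the query-string helpers underneath changed. A quick check (the payload is invented):

    from yt_dlp.utils import smuggle_url, unsmuggle_url

    url = smuggle_url('https://example.com/video', {'referer': 'https://example.org'})
    clean_url, data = unsmuggle_url(url)
    assert clean_url == 'https://example.com/video'
    assert data == {'referer': 'https://example.org'}
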
@@ -2267,7 +2307,7 @@ def parse_resolution(s, *, lenient=False):
 
 
 def parse_bitrate(s):
-    if not isinstance(s, compat_str):
+    if not isinstance(s, str):
         return
     mobj = re.search(r'\b(\d+)\s*kbps', s)
     if mobj:
@@ -2304,7 +2344,7 @@ def fix_xml_ampersands(xml_str):
 
 
 def setproctitle(title):
-    assert isinstance(title, compat_str)
+    assert isinstance(title, str)
 
     # ctypes in Jython is not complete
     # http://bugs.jython.org/issue2148
@@ -2352,7 +2392,7 @@ def get_domain(url):
 
 
 def url_basename(url):
-    path = compat_urlparse.urlparse(url).path
+    path = urllib.parse.urlparse(url).path
     return path.strip('/').split('/')[-1]
 
 
@@ -2363,24 +2403,24 @@ def base_url(url):
 def urljoin(base, path):
     if isinstance(path, bytes):
         path = path.decode()
-    if not isinstance(path, compat_str) or not path:
+    if not isinstance(path, str) or not path:
         return None
     if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
         return path
     if isinstance(base, bytes):
         base = base.decode()
-    if not isinstance(base, compat_str) or not re.match(
+    if not isinstance(base, str) or not re.match(
             r'^(?:https?:)?//', base):
         return None
-    return compat_urlparse.urljoin(base, path)
+    return urllib.parse.urljoin(base, path)
 
 
-class HEADRequest(compat_urllib_request.Request):
+class HEADRequest(urllib.request.Request):
     def get_method(self):
         return 'HEAD'
 
 
-class PUTRequest(compat_urllib_request.Request):
+class PUTRequest(urllib.request.Request):
     def get_method(self):
         return 'PUT'
 
@@ -2395,14 +2435,14 @@ def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
 
 
 def str_or_none(v, default=None):
-    return default if v is None else compat_str(v)
+    return default if v is None else str(v)
 
 
 def str_to_int(int_str):
     """ A more relaxed version of int_or_none """
     if isinstance(int_str, int):
         return int_str
-    elif isinstance(int_str, compat_str):
+    elif isinstance(int_str, str):
         int_str = re.sub(r'[,\.\+]', '', int_str)
         return int_or_none(int_str)
 
@@ -2421,18 +2461,18 @@ def bool_or_none(v, default=None):
 
 
 def strip_or_none(v, default=None):
-    return v.strip() if isinstance(v, compat_str) else default
+    return v.strip() if isinstance(v, str) else default
 
 
 def url_or_none(url):
-    if not url or not isinstance(url, compat_str):
+    if not url or not isinstance(url, str):
         return None
     url = url.strip()
     return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
 
 
 def request_to_url(req):
-    if isinstance(req, compat_urllib_request.Request):
+    if isinstance(req, urllib.request.Request):
         return req.get_full_url()
     else:
         return req
 
@@ -2443,7 +2483,7 @@ def strftime_or_none(timestamp, date_format, default=None):
     try:
         if isinstance(timestamp, (int, float)):  # unix timestamp
             datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
-        elif isinstance(timestamp, compat_str):  # assume YYYYMMDD
+        elif isinstance(timestamp, str):  # assume YYYYMMDD
             datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
         return datetime_object.strftime(date_format)
     except (ValueError, TypeError, AttributeError):
@@ -2525,7 +2565,7 @@ def check_executable(exe, args=[]):
     """ Checks if the given binary is installed somewhere in PATH, and returns its name.
     args can be a list of arguments for a short output (like -version) """
     try:
-        Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate_or_kill()
+        Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     except OSError:
         return False
     return exe
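
`url_or_none` is unchanged here apart from the `compat_str` swap; its contract is easiest to see by example:

    from yt_dlp.utils import url_or_none

    assert url_or_none('https://example.com/v') == 'https://example.com/v'
    assert url_or_none('example.com/v') is None  # a scheme or '//' is required
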
@@ -2538,18 +2578,15 @@ def _get_exe_version_output(exe, args, *, to_screen=None):
         # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
         # SIGTTOU if yt-dlp is run in the background.
         # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
-        out, _ = Popen(
-            [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
-            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate_or_kill()
+        stdout, _, _ = Popen.run([encodeArgument(exe)] + args, text=True,
+                                 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
     except OSError:
         return False
-    if isinstance(out, bytes):  # Python 2.x
-        out = out.decode('ascii', 'ignore')
-    return out
+    return stdout
 
 
 def detect_exe_version(output, version_re=None, unrecognized='present'):
-    assert isinstance(output, compat_str)
+    assert isinstance(output, str)
     if version_re is None:
         version_re = r'version\s+([-0-9._a-zA-Z]+)'
     m = re.search(version_re, output)
@@ -2567,6 +2604,16 @@ def get_exe_version(exe, args=['--version'],
     return detect_exe_version(out, version_re, unrecognized) if out else False
 
 
+def frange(start=0, stop=None, step=1):
+    """Float range"""
+    if stop is None:
+        start, stop = 0, start
+    sign = [-1, 1][step > 0] if step else 0
+    while sign * start < sign * stop:
+        yield start
+        start += step
+
+
 class LazyList(collections.abc.Sequence):
     """Lazy immutable list from an iterable
     Note that slices of a LazyList are lists and not LazyList"""
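
`frange` is a new float-aware counterpart to `range`; negative steps work through the `sign` trick. For instance:

    from yt_dlp.utils import frange

    assert list(frange(3)) == [0, 1, 2]              # behaves like range(3)
    assert list(frange(0, 1, 0.25)) == [0, 0.25, 0.5, 0.75]
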
@@ -2763,6 +2810,140 @@ class InAdvancePagedList(PagedList):
             yield from page_results
 
 
+class PlaylistEntries:
+    MissingEntry = object()
+    is_exhausted = False
+
+    def __init__(self, ydl, info_dict):
+        self.ydl = ydl
+
+        # _entries must be assigned now since infodict can change during iteration
+        entries = info_dict.get('entries')
+        if entries is None:
+            raise EntryNotInPlaylist('There are no entries')
+        elif isinstance(entries, list):
+            self.is_exhausted = True
+
+        requested_entries = info_dict.get('requested_entries')
+        self.is_incomplete = bool(requested_entries)
+        if self.is_incomplete:
+            assert self.is_exhausted
+            self._entries = [self.MissingEntry] * max(requested_entries)
+            for i, entry in zip(requested_entries, entries):
+                self._entries[i - 1] = entry
+        elif isinstance(entries, (list, PagedList, LazyList)):
+            self._entries = entries
+        else:
+            self._entries = LazyList(entries)
+
+    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
+        (?P<start>[+-]?\d+)?
+        (?P<range>[:-]
+            (?P<end>[+-]?\d+|inf(?:inite)?)?
+            (?::(?P<step>[+-]?\d+))?
+        )?''')
+
+    @classmethod
+    def parse_playlist_items(cls, string):
+        for segment in string.split(','):
+            if not segment:
+                raise ValueError('There is two or more consecutive commas')
+            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
+            if not mobj:
+                raise ValueError(f'{segment!r} is not a valid specification')
+            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
+            if int_or_none(step) == 0:
+                raise ValueError(f'Step in {segment!r} cannot be zero')
+            yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
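
`parse_playlist_items` converts each comma-separated segment into either a plain index or a slice; this backs the new `--playlist-items` slicing notation mentioned in the changelog. For example:

    from yt_dlp.utils import PlaylistEntries

    specs = list(PlaylistEntries.parse_playlist_items('2,5:10,-3::2'))
    # [2, slice(5, 10.0, None), slice(-3, None, 2)]
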
+    def get_requested_items(self):
+        playlist_items = self.ydl.params.get('playlist_items')
+        playlist_start = self.ydl.params.get('playliststart', 1)
+        playlist_end = self.ydl.params.get('playlistend')
+        # For backwards compatibility, interpret -1 as whole list
+        if playlist_end in (-1, None):
+            playlist_end = ''
+        if not playlist_items:
+            playlist_items = f'{playlist_start}:{playlist_end}'
+        elif playlist_start != 1 or playlist_end:
+            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
+
+        for index in self.parse_playlist_items(playlist_items):
+            for i, entry in self[index]:
+                yield i, entry
+                if not entry:
+                    continue
+                try:
+                    # TODO: Add auto-generated fields
+                    self.ydl._match_entry(entry, incomplete=True, silent=True)
+                except (ExistingVideoReached, RejectedVideoReached):
+                    return
+
+    def get_full_count(self):
+        if self.is_exhausted and not self.is_incomplete:
+            return len(self)
+        elif isinstance(self._entries, InAdvancePagedList):
+            if self._entries._pagesize == 1:
+                return self._entries._pagecount
+
+    @functools.cached_property
+    def _getter(self):
+        if isinstance(self._entries, list):
+            def get_entry(i):
+                try:
+                    entry = self._entries[i]
+                except IndexError:
+                    entry = self.MissingEntry
+                    if not self.is_incomplete:
+                        raise self.IndexError()
+                if entry is self.MissingEntry:
+                    raise EntryNotInPlaylist(f'Entry {i} cannot be found')
+                return entry
+        else:
+            def get_entry(i):
+                try:
+                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
+                except (LazyList.IndexError, PagedList.IndexError):
+                    raise self.IndexError()
+        return get_entry
+
+    def __getitem__(self, idx):
+        if isinstance(idx, int):
+            idx = slice(idx, idx)
+
+        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
+        step = 1 if idx.step is None else idx.step
+        if idx.start is None:
+            start = 0 if step > 0 else len(self) - 1
+        else:
+            start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
+
+        # NB: Do not call len(self) when idx == [:]
+        if idx.stop is None:
+            stop = 0 if step < 0 else float('inf')
+        else:
+            stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
+        stop += [-1, 1][step > 0]
+
+        for i in frange(start, stop, step):
+            if i < 0:
+                continue
+            try:
+                entry = self._getter(i)
+            except self.IndexError:
+                self.is_exhausted = True
+                if step > 0:
+                    break
+                continue
+            yield i + 1, entry
+
+    def __len__(self):
+        return len(tuple(self[:]))
+
+    class IndexError(IndexError):
+        pass
+
+
 def uppercase_escape(s):
     unicode_escape = codecs.getdecoder('unicode_escape')
     return re.sub(
@@ -2786,7 +2967,7 @@ def escape_rfc3986(s):
 
 def escape_url(url):
     """Escape URL as suggested by RFC 3986"""
-    url_parsed = compat_urllib_parse_urlparse(url)
+    url_parsed = urllib.parse.urlparse(url)
     return url_parsed._replace(
         netloc=url_parsed.netloc.encode('idna').decode('ascii'),
         path=escape_rfc3986(url_parsed.path),
@@ -2797,12 +2978,12 @@ def escape_url(url):
 
 
 def parse_qs(url):
-    return compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+    return urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
 
 
 def read_batch_urls(batch_fd):
     def fixup(url):
-        if not isinstance(url, compat_str):
+        if not isinstance(url, str):
            url = url.decode('utf-8', 'replace')
         BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
         for bom in BOM_UTF8:
@@ -2820,22 +3001,22 @@ def read_batch_urls(batch_fd):
 
 
 def urlencode_postdata(*args, **kargs):
-    return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
+    return urllib.parse.urlencode(*args, **kargs).encode('ascii')
 
 
 def update_url_query(url, query):
     if not query:
         return url
-    parsed_url = compat_urlparse.urlparse(url)
-    qs = compat_parse_qs(parsed_url.query)
+    parsed_url = urllib.parse.urlparse(url)
+    qs = urllib.parse.parse_qs(parsed_url.query)
     qs.update(query)
-    return compat_urlparse.urlunparse(parsed_url._replace(
-        query=compat_urllib_parse_urlencode(qs, True)))
+    return urllib.parse.urlunparse(parsed_url._replace(
+        query=urllib.parse.urlencode(qs, True)))
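
`update_url_query` behaves exactly as before under the stdlib helpers. For example:

    from yt_dlp.utils import update_url_query

    assert update_url_query('https://example.com/path?a=1', {'b': 2}) \
        == 'https://example.com/path?a=1&b=2'
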
-def update_Request(req, url=None, data=None, headers={}, query={}):
+def update_Request(req, url=None, data=None, headers=None, query=None):
     req_headers = req.headers.copy()
-    req_headers.update(headers)
+    req_headers.update(headers or {})
     req_data = data or req.data
     req_url = update_url_query(url or req.get_full_url(), query)
     req_get_method = req.get_method()
@@ -2844,7 +3025,7 @@ def update_Request(req, url=None, data=None, headers={}, query={}):
     elif req_get_method == 'PUT':
         req_type = PUTRequest
     else:
-        req_type = compat_urllib_request.Request
+        req_type = urllib.request.Request
     new_req = req_type(
         req_url, data=req_data, headers=req_headers,
         origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
@@ -2859,9 +3040,9 @@ def _multipart_encode_impl(data, boundary):
     out = b''
     for k, v in data.items():
         out += b'--' + boundary.encode('ascii') + b'\r\n'
-        if isinstance(k, compat_str):
+        if isinstance(k, str):
             k = k.encode()
-        if isinstance(v, compat_str):
+        if isinstance(v, str):
             v = v.encode()
         # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
         # suggests sending UTF-8 directly. Firefox sends UTF-8, too
@@ -2942,7 +3123,7 @@ def merge_dicts(*dicts):
 
 
 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
-    return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
+    return string if isinstance(string, str) else str(string, encoding, errors)
 
 
 US_RATINGS = {
@@ -2966,7 +3147,7 @@ TV_PARENTAL_GUIDELINES = {
 
 def parse_age_limit(s):
     # isinstance(False, int) is True. So type() must be used instead
-    if type(s) is int:
+    if type(s) is int:  # noqa: E721
         return s if 0 <= s <= 21 else None
     elif not isinstance(s, str):
         return None
@@ -3029,7 +3210,11 @@ def js_to_json(code, vars={}):
             return '"%s"' % v
 
+    def create_map(mobj):
+        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
+
     code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
+    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
 
     return re.sub(r'''(?sx)
         "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
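
The new `create_map` substitution lets `js_to_json` handle JavaScript `new Map(...)` literals by converting the pair list to a JSON object. For example:

    from yt_dlp.utils import js_to_json

    assert js_to_json('new Map([["a", 1], ["b", 2]])') == '{"a": 1, "b": 2}'
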
@@ -3052,7 +3237,7 @@ def qualities(quality_ids):
     return q
 
 
-POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist')
+POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
 
 
 DEFAULT_OUTTMPL = {
@@ -3290,14 +3475,13 @@ def is_html(first_bytes):
         (b'\xff\xfe', 'utf-16-le'),
         (b'\xfe\xff', 'utf-16-be'),
     ]
+
+    encoding = 'utf-8'
     for bom, enc in BOMS:
-        if first_bytes.startswith(bom):
-            s = first_bytes[len(bom):].decode(enc, 'replace')
-            break
-    else:
-        s = first_bytes.decode('utf-8', 'replace')
+        while first_bytes.startswith(bom):
+            encoding, first_bytes = enc, first_bytes[len(bom):]
 
-    return re.match(r'^\s*<', s)
+    return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
 
 
 def determine_protocol(info_dict):
@@ -3319,7 +3503,7 @@ def determine_protocol(info_dict):
     elif ext == 'f4m':
         return 'f4m'
 
-    return compat_urllib_parse_urlparse(url).scheme
+    return urllib.parse.urlparse(url).scheme
 
 
 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
@@ -3376,16 +3560,15 @@ def _match_one(filter_part, dct, incomplete):
     else:
         is_incomplete = lambda k: k in incomplete
 
-    operator_rex = re.compile(r'''(?x)\s*
+    operator_rex = re.compile(r'''(?x)
         (?P<key>[a-z_]+)
         \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
         (?:
             (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
             (?P<strval>.+?)
         )
-        \s*$
         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
-    m = operator_rex.search(filter_part)
+    m = operator_rex.fullmatch(filter_part.strip())
     if m:
         m = m.groupdict()
         unnegated_op = COMPARISON_OPERATORS[m['op']]
@@ -3421,11 +3604,10 @@ def _match_one(filter_part, dct, incomplete):
         '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
         '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
     }
-    operator_rex = re.compile(r'''(?x)\s*
+    operator_rex = re.compile(r'''(?x)
         (?P<op>%s)\s*(?P<key>[a-z_]+)
-        \s*$
         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
-    m = operator_rex.search(filter_part)
+    m = operator_rex.fullmatch(filter_part.strip())
     if m:
         op = UNARY_OPERATORS[m.group('op')]
         actual_value = dct.get(m.group('key'))
@@ -3467,6 +3649,23 @@ def match_filter_func(filters):
     return _match_func
 
 
+def download_range_func(chapters, ranges):
+    def inner(info_dict, ydl):
+        warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
+                   else 'Cannot match chapters since chapter information is unavailable')
+        for regex in chapters or []:
+            for i, chapter in enumerate(info_dict.get('chapters') or []):
+                if re.search(regex, chapter['title']):
+                    warning = None
+                    yield {**chapter, 'index': i}
+        if chapters and warning:
+            ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
+
+        yield from ({'start_time': start, 'end_time': end} for start, end in ranges or [])
+
+    return inner
+
+
 def parse_dfxp_time_expr(time_expr):
     if not time_expr:
         return
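
`download_range_func` builds the callable behind the new `--download-sections` option. A sketch with a plain time range; the `info_dict` is a stand-in, not a real extraction result:

    from yt_dlp.utils import download_range_func

    get_ranges = download_range_func(None, [(60, 120)])  # seconds 60-120, no chapter regexes
    assert list(get_ranges({'id': 'xyz', 'chapters': []}, ydl=None)) \
        == [{'start_time': 60, 'end_time': 120}]
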
@@ -3653,26 +3852,21 @@ def dfxp2srt(dfxp_data):
     return ''.join(out)
 
 
-def cli_option(params, command_option, param):
+def cli_option(params, command_option, param, separator=None):
     param = params.get(param)
-    if param:
-        param = compat_str(param)
-    return [command_option, param] if param is not None else []
+    return ([] if param is None
+            else [command_option, str(param)] if separator is None
+            else [f'{command_option}{separator}{param}'])
 
 
 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
     param = params.get(param)
-    if param is None:
-        return []
-    assert isinstance(param, bool)
-    if separator:
-        return [command_option + separator + (true_value if param else false_value)]
-    return [command_option, true_value if param else false_value]
+    assert param in (True, False, None)
+    return cli_option({True: true_value, False: false_value}, command_option, param, separator)
 
 
 def cli_valueless_option(params, command_option, param, expected_value=True):
-    param = params.get(param)
-    return [command_option] if param == expected_value else []
+    return [command_option] if params.get(param) == expected_value else []
 
 
 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
@@ -4165,6 +4359,9 @@ class ISO3166Utils:
         'YE': 'Yemen',
         'ZM': 'Zambia',
         'ZW': 'Zimbabwe',
+        # Not ISO 3166 codes, but used for IP blocks
+        'AP': 'Asia/Pacific Region',
+        'EU': 'Europe',
     }
 
     @classmethod
@@ -4427,20 +4624,20 @@ class GeoUtils:
         else:
             block = code_or_block
         addr, preflen = block.split('/')
-        addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
+        addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
         addr_max = addr_min | (0xffffffff >> int(preflen))
-        return compat_str(socket.inet_ntoa(
-            compat_struct_pack('!L', random.randint(addr_min, addr_max))))
+        return str(socket.inet_ntoa(
+            struct.pack('!L', random.randint(addr_min, addr_max))))
 
 
-class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
+class PerRequestProxyHandler(urllib.request.ProxyHandler):
     def __init__(self, proxies=None):
         # Set default handlers
         for type in ('http', 'https'):
             setattr(self, '%s_open' % type,
                     lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                         meth(r, proxy, type))
-        compat_urllib_request.ProxyHandler.__init__(self, proxies)
+        urllib.request.ProxyHandler.__init__(self, proxies)
 
     def proxy_open(self, req, proxy, type):
         req_proxy = req.headers.get('Ytdl-request-proxy')
@@ -4450,11 +4647,11 @@ class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
 
         if proxy == '__noproxy__':
             return None  # No Proxy
-        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
+        if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
             req.add_header('Ytdl-socks-proxy', proxy)
             # yt-dlp's http/https handlers do wrapping the socket with socks
             return None
-        return compat_urllib_request.ProxyHandler.proxy_open(
+        return urllib.request.ProxyHandler.proxy_open(
             self, req, proxy, type)
 
 
@@ -4474,7 +4671,7 @@ def long_to_bytes(n, blocksize=0):
     s = b''
     n = int(n)
     while n > 0:
-        s = compat_struct_pack('>I', n & 0xffffffff) + s
+        s = struct.pack('>I', n & 0xffffffff) + s
         n = n >> 32
     # strip off leading zeros
     for i in range(len(s)):
@@ -4505,7 +4702,7 @@ def bytes_to_long(s):
         s = b'\000' * extra + s
         length = length + extra
     for i in range(0, length, 4):
-        acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
+        acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
     return acc
 
 
@@ -4541,22 +4738,42 @@ def pkcs1pad(data, length):
     return [0, 2] + pseudo_random + [0] + data
 
 
-def encode_base_n(num, n, table=None):
-    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
-    if not table:
-        table = FULL_TABLE[:n]
+def _base_n_table(n, table):
+    if not table and not n:
+        raise ValueError('Either table or n must be specified')
+    table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
 
-    if n > len(table):
-        raise ValueError('base %d exceeds table length %d' % (n, len(table)))
+    if n != len(table):
+        raise ValueError(f'base {n} exceeds table length {len(table)}')
+    return table
 
-    if num == 0:
+
+def encode_base_n(num, n=None, table=None):
+    """Convert given int to a base-n string"""
+    table = _base_n_table(n, table)
+    if not num:
         return table[0]
 
-    ret = ''
+    result, base = '', len(table)
     while num:
-        ret = table[num % n] + ret
-        num = num // n
-    return ret
+        result = table[num % base] + result
+        num = num // base
+    return result
+
+
+def decode_base_n(string, n=None, table=None):
+    """Convert given base-n string to int"""
+    table = {char: index for index, char in enumerate(_base_n_table(n, table))}
+    result, base = 0, len(table)
+    for char in string:
+        result = result * base + table[char]
+    return result
+
+
+def decode_base(value, digits):
+    write_string('DeprecationWarning: yt_dlp.utils.decode_base is deprecated '
+                 'and may be removed in a future version. Use yt_dlp.decode_base_n instead')
+    return decode_base_n(value, table=digits)
 
 
 def decode_packed_codes(code):
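
`decode_base_n` is the new inverse of `encode_base_n`; both now resolve their alphabet through `_base_n_table`. A round-trip example:

    from yt_dlp.utils import decode_base_n, encode_base_n

    s = encode_base_n(255, 16)
    assert s == 'ff' and decode_base_n(s, 16) == 255
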
@@ -4613,7 +4830,7 @@ def decode_png(png_data):
         raise OSError('Not a valid PNG file.')
 
     int_map = {1: '>B', 2: '>H', 4: '>I'}
-    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
+    unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0]
 
     chunks = []
 
@@ -4725,7 +4942,6 @@ def write_xattr(path, key, value):
             return
 
     # UNIX Method 1. Use xattrs/pyxattrs modules
-    from .dependencies import xattr
 
     setxattr = None
     if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
@@ -4753,14 +4969,13 @@ def write_xattr(path, key, value):
             value = value.decode()
 
     try:
-        p = Popen(
+        _, stderr, returncode = Popen.run(
             [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
-            stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
+            text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
     except OSError as e:
         raise XAttrMetadataError(e.errno, e.strerror)
-    stderr = p.communicate_or_kill()[1].decode('utf-8', 'replace')
-    if p.returncode:
-        raise XAttrMetadataError(p.returncode, stderr)
+    if returncode:
+        raise XAttrMetadataError(returncode, stderr)
 
 
 def random_birthday(year_field, month_field, day_field):
@@ -4815,7 +5030,7 @@ def iri_to_uri(iri):
     The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
     """
 
-    iri_parts = compat_urllib_parse_urlparse(iri)
+    iri_parts = urllib.parse.urlparse(iri)
 
     if '[' in iri_parts.netloc:
         raise ValueError('IPv6 URIs are not, yet, supported.')
@@ -4860,11 +5075,11 @@ def to_high_limit_path(path):
     return path
 
 
-def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
+def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
     val = traverse_obj(obj, *variadic(field))
-    if val in ignore:
+    if (not val and val != 0) if ignore is NO_DEFAULT else val in variadic(ignore):
         return default
-    return template % (func(val) if func else val)
+    return template % func(val)
 
 
 def clean_podcast_url(url):
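
`format_field`'s new defaults (`ignore=NO_DEFAULT`, `func=IDENTITY`) make it skip every falsy value except 0. For example (the dict is invented):

    from yt_dlp.utils import format_field

    info = {'uploader': 'someone'}
    assert format_field(info, 'uploader', '[%s] ') == '[someone] '
    assert format_field(info, 'missing', '[%s] ') == ''  # falls back to default
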
@@ -5088,10 +5285,13 @@ def jwt_decode_hs256(jwt):
     return payload_data


+WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
+
+
+@functools.cache
 def supports_terminal_sequences(stream):
     if compat_os_name == 'nt':
-        from .compat import WINDOWS_VT_MODE  # Must be imported locally
-        if not WINDOWS_VT_MODE or get_windows_version() < (10, 0, 10586):
+        if not WINDOWS_VT_MODE:
             return False
     elif not os.getenv('TERM'):
         return False
@@ -5101,6 +5301,19 @@
         return False


+def windows_enable_vt_mode():  # TODO: Do this the proper way https://bugs.python.org/issue30075
+    if get_windows_version() < (10, 0, 10586):
+        return
+    global WINDOWS_VT_MODE
+    try:
+        Popen.run('', shell=True)
+    except Exception:
+        return
+
+    WINDOWS_VT_MODE = True
+    supports_terminal_sequences.cache_clear()
+
+
 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
@@ -5114,7 +5327,7 @@ def number_of_digits(number):

 def join_nonempty(*values, delim='-', from_dict=None):
     if from_dict is not None:
-        values = map(from_dict.get, values)
+        values = (traverse_obj(from_dict, variadic(v)) for v in values)
     return delim.join(map(str, filter(None, values)))

@@ -5150,13 +5363,20 @@ def parse_http_range(range):
     return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))


+def read_stdin(what):
+    eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
+    write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
+    return sys.stdin
+
+
 class Config:
     own_args = None
+    parsed_args = None
     filename = None
     __initialized = False

     def __init__(self, parser, label=None):
-        self._parser, self.label = parser, label
+        self.parser, self.label = parser, label
         self._loaded_paths, self.configs = set(), []

     def init(self, args=None, filename=None):
@@ -5169,14 +5389,19 @@ class Config:
                 return False
             self._loaded_paths.add(location)

-        self.__initialized = True
-        self.own_args, self.filename = args, filename
-        for location in self._parser.parse_args(args)[0].config_locations or []:
+        self.own_args, self.__initialized = args, True
+        opts, _ = self.parser.parse_known_args(args)
+        self.parsed_args, self.filename = args, filename
+
+        for location in opts.config_locations or []:
+            if location == '-':
+                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
+                continue
             location = os.path.join(directory, expand_path(location))
             if os.path.isdir(location):
                 location = os.path.join(location, 'yt-dlp.conf')
             if not os.path.exists(location):
-                self._parser.error(f'config location {location} does not exist')
+                self.parser.error(f'config location {location} does not exist')
             self.append_config(self.read_file(location), location)
         return True
@@ -5199,6 +5424,8 @@ class Config:
                 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
                 contents = optionf.read()
                 res = shlex.split(contents, comments=True)
+        except Exception as err:
+            raise ValueError(f'Unable to parse "{filename}": {err}')
         finally:
             optionf.close()
         return res
@@ -5222,7 +5449,7 @@ class Config:
         return opts

     def append_config(self, *args, label=None):
-        config = type(self)(self._parser, label)
+        config = type(self)(self.parser, label)
         config._loaded_paths = self._loaded_paths
         if config.init(*args):
             self.configs.append(config)
@@ -5231,10 +5458,13 @@ class Config:
     def all_args(self):
         for config in reversed(self.configs):
             yield from config.all_args
-        yield from self.own_args or []
+        yield from self.parsed_args or []
+
+    def parse_known_args(self, **kwargs):
+        return self.parser.parse_known_args(self.all_args, **kwargs)

     def parse_args(self):
-        return self._parser.parse_args(self.all_args)
+        return self.parser.parse_args(self.all_args)


 class WebSocketsWrapper():
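The new `--config-location -` branch above pipes everything read from STDIN through `shlex.split(..., comments=True)`, so interactively typed options obey the same quoting and comment rules as a config file. A small illustration of that parsing step (the file-like object here is a stand-in for `sys.stdin`):

```python
import io
import shlex

# Simulate what Config does for '--config-location -': read to EOF,
# then split shell-style with '#' comments stripped.
fake_stdin = io.StringIO('--retry-sleep 5  # wait between retries\n-o "%(title)s.%(ext)s"\n')
args = shlex.split(fake_stdin.read(), comments=True)
print(args)  # ['--retry-sleep', '5', '-o', '%(title)s.%(ext)s']
```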
@@ -5314,16 +5544,25 @@ def merge_headers(*dicts):


 class classproperty:
-    def __init__(self, f):
-        functools.update_wrapper(self, f)
-        self.f = f
+    """classmethod(property(func)) that works in py < 3.9"""
+
+    def __init__(self, func):
+        functools.update_wrapper(self, func)
+        self.func = func

     def __get__(self, _, cls):
-        return self.f(cls)
+        return self.func(cls)
+
+
+class Namespace(types.SimpleNamespace):
+    """Immutable namespace"""

-def Namespace(**kwargs):
-    return collections.namedtuple('Namespace', kwargs)(**kwargs)
+    def __iter__(self):
+        return iter(self.__dict__.values())
+
+    @property
+    def items_(self):
+        return self.__dict__.items()  # Deprecated
diff --git a/yt_dlp/version.py b/yt_dlp/version.py
index fb3ec8c6d..0ebc96f8d 100644
--- a/yt_dlp/version.py
+++ b/yt_dlp/version.py
@@ -1,5 +1,5 @@
 # Autogenerated by devscripts/update-version.py
-__version__ = '2022.04.08'
+__version__ = '2022.06.22.1'

-RELEASE_GIT_HEAD = '7884ade65'
+RELEASE_GIT_HEAD = 'a86e01e74'
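The `classproperty` rework above emulates `classmethod(property(func))` for Python versions before 3.9, where the two decorators cannot be chained directly; the descriptor's `__get__` ignores the instance and always binds to the class. A self-contained sketch of the behaviour (the `Widget` class is a made-up example):

```python
import functools

class classproperty:
    """Property that is resolved against the class rather than an instance."""

    def __init__(self, func):
        functools.update_wrapper(self, func)
        self.func = func

    def __get__(self, _, cls):
        return self.func(cls)  # instance is discarded; cls is always passed

class Widget:
    _name = 'widget'

    @classproperty
    def display_name(cls):
        return cls._name.title()

print(Widget.display_name)    # 'Widget' -- works without an instance
print(Widget().display_name)  # 'Widget' -- and on instances too
```

The new `Namespace` likewise defines `__iter__` over its stored values, so iterating a namespace yields the objects themselves rather than their attribute names.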