author    | Jesús <heckyel@hyperbola.info> | 2021-12-29 19:12:28 -0500
committer | Jesús <heckyel@hyperbola.info> | 2021-12-29 19:12:28 -0500
commit    | 5aac4e0267e32d98eb68692afedafda3b41ea629 (patch)
tree      | c3b0f52d6a8cf4ad74e7f17f1ccd7653e1071471
parent    | 4f0875462ee497cc13c02d0b852f52f4887b5cea (diff)
parent    | 96f13f01a609add83555ca86fbf35d11441361d8 (diff)
download  | hypervideo-pre-5aac4e0267e32d98eb68692afedafda3b41ea629.tar.lz
          | hypervideo-pre-5aac4e0267e32d98eb68692afedafda3b41ea629.tar.xz
          | hypervideo-pre-5aac4e0267e32d98eb68692afedafda3b41ea629.zip
updated from upstream | 29/12/2021 at 19:12
49 files changed, 2583 insertions, 557 deletions
diff --git a/.gitignore b/.gitignore
index 8a72e3ea9..fb09c3d6d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,27 +1,32 @@
 # Config
 *.conf
-*.spec
 cookies
 *cookies.txt
 .netrc
 
 # Downloaded
-*.3gp
 *.annotations.xml
-*.ape
 *.aria2
-*.avi
 *.description
-*.desktop
 *.dump
-*.flac
-*.flv
 *.frag
+*.frag.aria2
 *.frag.urls
 *.info.json
+*.live_chat.json
+*.part*
+*.unknown_video
+*.ytdl
+.cache/
+
+*.3gp
+*.ape
+*.avi
+*.desktop
+*.flac
+*.flv
 *.jpeg
 *.jpg
-*.live_chat.json
 *.m4a
 *.m4v
 *.mhtml
@@ -31,23 +36,18 @@ cookies
 *.mp4
 *.ogg
 *.opus
-*.part
-*.part-*
 *.png
 *.sbv
 *.srt
 *.swf
 *.swp
 *.ttml
-*.unknown_video
 *.url
 *.vtt
 *.wav
 *.webloc
 *.webm
 *.webp
-*.ytdl
-.cache/
 
 # Allow config/media files in testdata
 !test/**
@@ -86,7 +86,6 @@ CONTRIBUTING.md
 *.1
 *.bash-completion
 *.fish
-*.exe
 *.tar.gz
 *.zsh
 test/testdata/player-*.js

diff --git a/CONTRIBUTORS b/CONTRIBUTORS
index b3e3e97fa..35a0764a2 100644
--- a/CONTRIBUTORS
+++ b/CONTRIBUTORS
@@ -155,3 +155,26 @@ staubichsauger
 xenova
 Yakabuff
 zulaport
+ehoogeveen-medweb
+PilzAdam
+zmousm
+iw0nderhow
+unit193
+TwoThousandHedgehogs
+Jertzukka
+cypheron
+Hyeeji
+bwildenhain
+C0D3D3V
+kebianizao
+Lapin0t
+abdullah-if
+DavidSkrundz
+mkubecek
+raleeper
+YuenSzeHong
+Sematre
+jaller94
+r5d
+julien-hadleyjack
+git-anony-mouse

diff --git a/Changelog.md b/Changelog.md
index 4e9a448cb..f46c22a32 100644
--- a/Changelog.md
+++ b/Changelog.md
@@ -10,6 +10,124 @@
 * Dispatch the workflow https://github.com/yt-dlp/yt-dlp/actions/workflows/build.yml on master
 -->
+
+### 2021.12.27
+
+* Avoid recursion error when re-extracting info
+* [ffmpeg] Fix position of `--ppa`
+* [aria2c] Don't show progress when `--no-progress`
+* [cookies] Support other keyrings by [mbway](https://github.com/mbway)
+* [EmbedThumbnail] Prefer AtomicParsley over ffmpeg if available
+* [generic] Fix HTTP KVS Player by [git-anony-mouse](https://github.com/git-anony-mouse)
+* [ThumbnailsConvertor] Fix for when there are no thumbnails
+* [docs] Add examples for using `TYPES:` in `-P`/`-o`
+* [PixivSketch] Add extractors by [nao20010128nao](https://github.com/nao20010128nao)
+* [tiktok] Add music, sticker and tag IEs by [MinePlayersPE](https://github.com/MinePlayersPE)
+* [BiliIntl] Fix extractor by [MinePlayersPE](https://github.com/MinePlayersPE)
+* [CBC] Fix URL regex
+* [tiktok] Fix `extractor_key` used in archive
+* [youtube] **End `live-from-start` properly when stream ends with 403**
+* [Zee5] Fix VALID_URL for tv-shows by [Ashish0804](https://github.com/Ashish0804)
+
+### 2021.12.25
+
+* [dash,youtube] **Download live from start to end** by [nao20010128nao](https://github.com/nao20010128nao), [pukkandan](https://github.com/pukkandan)
+    * Add option `--live-from-start` to enable downloading live videos from start
+    * Add key `is_from_start` in formats to identify formats (of live videos) that downloads from start
+    * [dash] Create protocol `http_dash_segments_generator` that allows a function to be passed instead of fragments
+    * [fragment] Allow multiple live dash formats to download simultaneously
+    * [youtube] Implement fragment re-fetching for the live dash formats
+    * [youtube] Re-extract dash manifest every 5 hours (manifest expires in 6hrs)
+    * [postprocessor/ffmpeg] Add `FFmpegFixupDuplicateMoovPP` to fixup duplicated moov atoms
+    * Known issues:
+        * Ctrl+C doesn't work on Windows when downloading multiple formats
+        * If video becomes private, download hangs
+* [SponsorBlock] Add `Filler` and `Highlight` categories by [nihil-admirari](https://github.com/nihil-admirari), [pukkandan](https://github.com/pukkandan)
+    * Change `--sponsorblock-cut all` to `--sponsorblock-cut default` if you do not want filler sections to be removed
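The `--live-from-start` option described above is exposed to embedders through the params dict; a minimal sketch of using it from the Python API (the URL is a placeholder):

```python
# Minimal sketch, assuming this release is importable as yt_dlp;
# 'live_from_start' is the params counterpart of the new --live-from-start flag.
from yt_dlp import YoutubeDL

with YoutubeDL({'live_from_start': True}) as ydl:
    ydl.download(['https://www.youtube.com/watch?v=LIVE_STREAM_ID'])  # placeholder URL
```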
+* Add field `webpage_url_domain`
+* Add interactive format selection with `-f -`
+* Add option `--file-access-retries` by [ehoogeveen-medweb](https://github.com/ehoogeveen-medweb)
+* [outtmpl] Add alternate forms `S`, `D` and improve `id` detection
+* [outtmpl] Add operator `&` for replacement text by [PilzAdam](https://github.com/PilzAdam)
+* [EmbedSubtitle] Disable duration check temporarily
+* [extractor] Add `_search_nuxt_data` by [nao20010128nao](https://github.com/nao20010128nao)
+* [extractor] Ignore errors in comment extraction when `-i` is given
+* [extractor] Standardize `_live_title`
+* [FormatSort] Prevent incorrect deprecation warning
+* [generic] Extract m3u8 formats from JSON-LD
+* [postprocessor/ffmpeg] Always add `faststart`
+* [utils] Fix parsing `YYYYMMDD` dates in Nov/Dec by [wlritchi](https://github.com/wlritchi)
+* [utils] Improve `parse_count`
+* [utils] Update `std_headers` by [kikuyan](https://github.com/kikuyan), [fstirlitz](https://github.com/fstirlitz)
+* [lazy_extractors] Fix for search IEs
+* [extractor] Support default implicit graph in JSON-LD by [zmousm](https://github.com/zmousm)
+* Allow `--no-write-thumbnail` to override `--write-all-thumbnail`
+* Fix `--throttled-rate`
+* Fix control characters being printed to `--console-title`
+* Fix PostProcessor hooks not registered for some PPs
+* Pre-process when using `--flat-playlist`
+* Remove known invalid thumbnails from `info_dict`
+* Add warning when using `-f best`
+* Use `parse_duration` for `--wait-for-video` and some minor fix
+* [test/download] Add more fields
+* [test/download] Ignore field `webpage_url_domain` by [std-move](https://github.com/std-move)
+* [compat] Suppress errors in enabling VT mode
+* [docs] Improve manpage format by [iw0nderhow](https://github.com/iw0nderhow), [pukkandan](https://github.com/pukkandan)
+* [docs,cleanup] Minor fixes and cleanup
+* [cleanup] Fix some typos by [unit193](https://github.com/unit193)
+* [ABC:iview] Add show extractor by [pabs3](https://github.com/pabs3)
+* [dropout] Add extractor by [TwoThousandHedgehogs](https://github.com/TwoThousandHedgehogs), [pukkandan](https://github.com/pukkandan)
+* [GameJolt] Add extractors by [MinePlayersPE](https://github.com/MinePlayersPE)
+* [gofile] Add extractor by [Jertzukka](https://github.com/Jertzukka), [Ashish0804](https://github.com/Ashish0804)
+* [hse] Add extractors by [cypheron](https://github.com/cypheron), [pukkandan](https://github.com/pukkandan)
+* [NateTV] Add NateIE and NateProgramIE by [Ashish0804](https://github.com/Ashish0804), [Hyeeji](https://github.com/Hyeeji)
+* [OpenCast] Add extractors by [bwildenhain](https://github.com/bwildenhain), [C0D3D3V](https://github.com/C0D3D3V)
+* [rtve] Add `RTVEAudioIE` by [kebianizao](https://github.com/kebianizao)
+* [Rutube] Add RutubeChannelIE by [Ashish0804](https://github.com/Ashish0804)
+* [skeb] Add extractor by [nao20010128nao](https://github.com/nao20010128nao)
+* [soundcloud] Add related tracks extractor by [Lapin0t](https://github.com/Lapin0t)
+* [toggo] Add extractor by [nyuszika7h](https://github.com/nyuszika7h)
+* [TrueID] Add extractor by [MinePlayersPE](https://github.com/MinePlayersPE)
+* [audiomack] Update album and song VALID_URL by [abdullah-if](https://github.com/abdullah-if), [dirkf](https://github.com/dirkf)
+* [CBC Gem] Extract 1080p formats by [DavidSkrundz](https://github.com/DavidSkrundz)
+* [ceskatelevize] Fetch iframe from nextJS data by [mkubecek](https://github.com/mkubecek)
+* [crackle] Look for non-DRM formats by [raleeper](https://github.com/raleeper)
+* [dplay] Temporary fix for `discoveryplus.com/it`
+* [DiscoveryPlusShowBaseIE] yield actual video id by [Ashish0804](https://github.com/Ashish0804)
+* [Facebook] Handle redirect URLs
+* [fujitv] Extract 1080p from `tv_android` m3u8 by [YuenSzeHong](https://github.com/YuenSzeHong)
+* [gronkh] Support new URL pattern by [Sematre](https://github.com/Sematre)
+* [instagram] Expand valid URL by [u-spec-png](https://github.com/u-spec-png)
+* [Instagram] Try bypassing login wall with embed page by [MinePlayersPE](https://github.com/MinePlayersPE)
+* [Jamendo] Fix use of `_VALID_URL_RE` by [jaller94](https://github.com/jaller94)
+* [LBRY] Support livestreams by [Ashish0804](https://github.com/Ashish0804), [pukkandan](https://github.com/pukkandan)
+* [NJPWWorld] Extract formats from m3u8 by [aarubui](https://github.com/aarubui)
+* [NovaEmbed] update player regex by [std-move](https://github.com/std-move)
+* [npr] Make SMIL extraction non-fatal by [r5d](https://github.com/r5d)
+* [ntvcojp] Extract NUXT data by [nao20010128nao](https://github.com/nao20010128nao)
+* [ok.ru] add mobile fallback by [nao20010128nao](https://github.com/nao20010128nao)
+* [olympics] Add uploader and cleanup by [u-spec-png](https://github.com/u-spec-png)
+* [ondemandkorea] Update `jw_config` regex by [julien-hadleyjack](https://github.com/julien-hadleyjack)
+* [PlutoTV] Expand `_VALID_URL`
+* [RaiNews] Fix extractor by [nixxo](https://github.com/nixxo)
+* [RCTIPlusSeries] Lazy extraction and video type selection by [MinePlayersPE](https://github.com/MinePlayersPE)
+* [redtube] Handle formats delivered inside a JSON by [dirkf](https://github.com/dirkf), [nixxo](https://github.com/nixxo)
+* [SonyLiv] Add OTP login support by [Ashish0804](https://github.com/Ashish0804)
+* [Steam] Fix extractor by [u-spec-png](https://github.com/u-spec-png)
+* [TikTok] Pass cookies to mobile API by [MinePlayersPE](https://github.com/MinePlayersPE)
+* [trovo] Fix inheritance of `TrovoChannelBaseIE`
+* [TVer] Extract better thumbnails by [YuenSzeHong](https://github.com/YuenSzeHong)
+* [vimeo] Extract chapters
+* [web.archive:youtube] Improve metadata extraction by [coletdjnz](https://github.com/coletdjnz)
+* [youtube:comments] Add more options for limiting number of comments extracted by [coletdjnz](https://github.com/coletdjnz)
+* [youtube:tab] Extract more metadata from feeds/channels/playlists by [coletdjnz](https://github.com/coletdjnz)
+* [youtube:tab] Extract video thumbnails from playlist by [coletdjnz](https://github.com/coletdjnz), [pukkandan](https://github.com/pukkandan)
+* [youtube:tab] Ignore query when redirecting channel to playlist and cleanup of related code
+* [youtube] Fix `ytsearchdate`
+* [zdf] Support videos with different ptmd location by [iw0nderhow](https://github.com/iw0nderhow)
+* [zee5] Support /episodes in URL
+
+
 ### 2021.12.01
 
 * **Add option `--wait-for-video` to wait for scheduled streams**
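The `--wait-for-video` option shown in the context lines above (whose argument this release now parses with `parse_duration`) maps to a `(min_secs, max_secs)` tuple in the embedded API; a hedged sketch:

```python
# Sketch, assuming this release's yt_dlp; 'wait_for_video' mirrors --wait-for-video MIN[-MAX].
from yt_dlp import YoutubeDL

with YoutubeDL({'wait_for_video': (60, 600)}) as ydl:  # poll between 1 and 10 minutes
    ydl.download(['https://www.youtube.com/watch?v=SCHEDULED_STREAM_ID'])  # placeholder URL
```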
diff --git a/docs/Contributing.md b/docs/Contributing.md
new file mode 100644
index 000000000..60fe46909
--- /dev/null
+++ b/docs/Contributing.md
@@ -0,0 +1,5 @@
+---
+orphan: true
+---
+```{include} ../Contributing.md
+```

diff --git a/supportedsites.md b/supportedsites.md
index 2c13a28b7..9dc94f27d 100644
--- a/supportedsites.md
+++ b/supportedsites.md
@@ -21,6 +21,7 @@
 - **9now.com.au**
 - **abc.net.au**
 - **abc.net.au:iview**
+ - **abc.net.au:iview:showseries**
 - **abcnews**
 - **abcnews:video**
 - **abcotvs**: ABC Owned Television Stations
@@ -273,6 +274,7 @@
 - **DiscoveryPlus**
 - **DiscoveryPlusIndia**
 - **DiscoveryPlusIndiaShow**
+ - **DiscoveryPlusItaly**
 - **DiscoveryPlusItalyShow**
 - **DiscoveryVR**
 - **Disney**
@@ -287,6 +289,8 @@
 - **DPlay**
 - **DRBonanza**
 - **Dropbox**
+ - **Dropout**
+ - **DropoutSeason**
 - **DrTuber**
 - **drtv**
 - **drtv:live**
@@ -379,6 +383,12 @@
 - **GabTV**
 - **Gaia**
 - **GameInformer**
+ - **GameJolt**
+ - **GameJoltCommunity**
+ - **GameJoltGame**
+ - **GameJoltGameSoundtrack**
+ - **GameJoltSearch**
+ - **GameJoltUser**
 - **GameSpot**
 - **GameStar**
 - **Gaskrank**
@@ -399,6 +409,7 @@
 - **GloboArticle**
 - **Go**
 - **GodTube**
+ - **Gofile**
 - **Golem**
 - **google:podcasts**
 - **google:podcasts:feed**
@@ -436,6 +447,8 @@
 - **hrfernsehen**
 - **HRTi**
 - **HRTiPlaylist**
+ - **HSEProduct**
+ - **HSEShow**
 - **Huajiao**: 花椒直播
 - **HuffPost**: Huffington Post
 - **Hungama**
@@ -652,6 +665,8 @@
 - **n-tv.de**
 - **N1Info:article**
 - **N1InfoAsset**
+ - **Nate**
+ - **NateProgram**
 - **natgeo:video**
 - **NationalGeographicTV**
 - **Naver**
@@ -766,6 +781,8 @@
 - **OnionStudios**
 - **Ooyala**
 - **OoyalaExternal**
+ - **Opencast**
+ - **OpencastPlaylist**
 - **openrec**
 - **openrec:capture**
 - **OraTV**
@@ -819,6 +836,8 @@
 - **Pinkbike**
 - **Pinterest**
 - **PinterestCollection**
+ - **pixiv:sketch**
+ - **pixiv:sketch:user**
 - **Pladform**
 - **PlanetMarathi**
 - **Platzi**
@@ -941,6 +960,7 @@
 - **RTRFM**
 - **RTS**: RTS.ch
 - **rtve.es:alacarta**: RTVE a la carta
+ - **rtve.es:audio**: RTVE audio
 - **rtve.es:infantil**: RTVE infantil
 - **rtve.es:live**: RTVE.es live streams
 - **rtve.es:television**
@@ -950,11 +970,12 @@
 - **RumbleChannel**
 - **RumbleEmbed**
 - **rutube**: Rutube videos
- - **rutube:channel**: Rutube channels
+ - **rutube:channel**: Rutube channel
 - **rutube:embed**: Rutube embedded videos
 - **rutube:movie**: Rutube movies
 - **rutube:person**: Rutube person videos
 - **rutube:playlist**: Rutube playlists
+ - **rutube:tags**: Rutube tags
 - **RUTV**: RUTV.RU
 - **Ruutu**
 - **Ruv**
@@ -994,6 +1015,7 @@
 - **simplecast:episode**
 - **simplecast:podcast**
 - **Sina**
+ - **Skeb**
 - **sky.it**
 - **sky:news**
 - **sky:news:story**
@@ -1013,6 +1035,7 @@
 - **SonyLIVSeries**
 - **soundcloud**
 - **soundcloud:playlist**
+ - **soundcloud:related**
 - **soundcloud:search**: Soundcloud search; "scsearch:" prefix
 - **soundcloud:set**
 - **soundcloud:trackstation**
@@ -1120,12 +1143,16 @@
 - **ThreeSpeak**
 - **ThreeSpeakUser**
 - **TikTok**
+ - **tiktok:effect**
+ - **tiktok:sound**
+ - **tiktok:tag**
 - **tiktok:user**
 - **tinypic**: tinypic.com videos
 - **TMZ**
 - **TNAFlix**
 - **TNAFlixNetworkEmbed**
 - **toggle**
+ - **toggo**
 - **Tokentube**
 - **Tokentube:channel**
 - **ToonGoggles**
@@ -1138,6 +1165,7 @@
 - **TrovoChannelClip**: All Clips of a trovo.live channel; "trovoclip:" prefix
 - **TrovoChannelVod**: All VODs of a trovo.live channel; "trovovod:" prefix
 - **TrovoVod**
+ - **TrueID**
 - **TruNews**
 - **TruTV**
 - **Tube8**
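The test changes that follow exercise the new `D` (decimal suffix), `S` (filename sanitization) and `&` (replacement text) output-template forms from this release; they can be tried directly through `YoutubeDL.evaluate_outtmpl`. A sketch, with expected values taken from those tests:

```python
# Sketch of the new outtmpl forms, assuming this release's yt_dlp is installed.
from yt_dlp import YoutubeDL

ydl = YoutubeDL()
info = {'id': 'x', 'height': 1080}  # no 'title', so title-based templates fall back

print(ydl.evaluate_outtmpl('%(height)D', info))             # '1K'     (decimal suffix)
print(ydl.evaluate_outtmpl('%(height)5.2D', info))          # ' 1.08K'
print(ydl.evaluate_outtmpl('%(id&foo)s.bar', info))         # 'foo.bar' (field set -> replacement)
print(ydl.evaluate_outtmpl('%(title&foo|baz)s.bar', info))  # 'baz.bar' (field unset -> default)
```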
diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py
index 6c2530046..61923513e 100644
--- a/test/test_YoutubeDL.py
+++ b/test/test_YoutubeDL.py
@@ -717,6 +717,7 @@ class TestYoutubeDL(unittest.TestCase):
         test('%(id)s', '.abcd', info={'id': '.abcd'})
         test('%(id)s', 'ab__cd', info={'id': 'ab__cd'})
         test('%(id)s', ('ab:cd', 'ab -cd'), info={'id': 'ab:cd'})
+        test('%(id.0)s', '-', info={'id': '--'})
 
         # Invalid templates
self.assertTrue(isinstance(YoutubeDL.validate_outtmpl('%(title)'), ValueError)) @@ -777,6 +778,10 @@ class TestYoutubeDL(unittest.TestCase): test('%(title5)#U', 'a\u0301e\u0301i\u0301 𝐀') test('%(title5)+U', 'áéí A') test('%(title5)+#U', 'a\u0301e\u0301i\u0301 A') + test('%(height)D', '1K') + test('%(height)5.2D', ' 1.08K') + test('%(title4)#S', 'foo_bar_test') + test('%(title4).10S', ('foo \'bar\' ', 'foo \'bar\'' + ('#' if compat_os_name == 'nt' else ' '))) if compat_os_name == 'nt': test('%(title4)q', ('"foo \\"bar\\" test"', "'foo _'bar_' test'")) test('%(formats.:.id)#q', ('"id 1" "id 2" "id 3"', "'id 1' 'id 2' 'id 3'")) @@ -808,6 +813,11 @@ class TestYoutubeDL(unittest.TestCase): test('%(width-100,height+width|def)s', 'def') test('%(timestamp-x>%H\\,%M\\,%S,timestamp>%H\\,%M\\,%S)s', '12,00,00') + # Replacement + test('%(id&foo)s.bar', 'foo.bar') + test('%(title&foo)s.bar', 'NA.bar') + test('%(title&foo|baz)s.bar', 'baz.bar') + # Laziness def gen(): yield from range(5) @@ -836,11 +846,6 @@ class TestYoutubeDL(unittest.TestCase): test('%(title3)s', ('foo/bar\\test', 'foo_bar_test')) test('folder/%(title3)s', ('folder/foo/bar\\test', 'folder%sfoo_bar_test' % os.path.sep)) - # Replacement - test('%(id&foo)s.bar', 'foo.bar') - test('%(title&foo)s.bar', 'NA.bar') - test('%(title&foo|baz)s.bar', 'baz.bar') - def test_format_note(self): ydl = YoutubeDL() self.assertEqual(ydl._format_note({}), '') diff --git a/test/test_cookies.py b/test/test_cookies.py index 7d509ebe8..842ebcb99 100644 --- a/test/test_cookies.py +++ b/test/test_cookies.py @@ -8,6 +8,8 @@ from yt_dlp.cookies import ( WindowsChromeCookieDecryptor, parse_safari_cookies, pbkdf2_sha1, + _get_linux_desktop_environment, + _LinuxDesktopEnvironment, ) @@ -42,6 +44,37 @@ class MonkeyPatch: class TestCookies(unittest.TestCase): + def test_get_desktop_environment(self): + """ based on https://chromium.googlesource.com/chromium/src/+/refs/heads/main/base/nix/xdg_util_unittest.cc """ + test_cases = [ + ({}, _LinuxDesktopEnvironment.OTHER), + + ({'DESKTOP_SESSION': 'gnome'}, _LinuxDesktopEnvironment.GNOME), + ({'DESKTOP_SESSION': 'mate'}, _LinuxDesktopEnvironment.GNOME), + ({'DESKTOP_SESSION': 'kde4'}, _LinuxDesktopEnvironment.KDE), + ({'DESKTOP_SESSION': 'kde'}, _LinuxDesktopEnvironment.KDE), + ({'DESKTOP_SESSION': 'xfce'}, _LinuxDesktopEnvironment.XFCE), + + ({'GNOME_DESKTOP_SESSION_ID': 1}, _LinuxDesktopEnvironment.GNOME), + ({'KDE_FULL_SESSION': 1}, _LinuxDesktopEnvironment.KDE), + + ({'XDG_CURRENT_DESKTOP': 'X-Cinnamon'}, _LinuxDesktopEnvironment.CINNAMON), + ({'XDG_CURRENT_DESKTOP': 'GNOME'}, _LinuxDesktopEnvironment.GNOME), + ({'XDG_CURRENT_DESKTOP': 'GNOME:GNOME-Classic'}, _LinuxDesktopEnvironment.GNOME), + ({'XDG_CURRENT_DESKTOP': 'GNOME : GNOME-Classic'}, _LinuxDesktopEnvironment.GNOME), + + ({'XDG_CURRENT_DESKTOP': 'Unity', 'DESKTOP_SESSION': 'gnome-fallback'}, _LinuxDesktopEnvironment.GNOME), + ({'XDG_CURRENT_DESKTOP': 'KDE', 'KDE_SESSION_VERSION': '5'}, _LinuxDesktopEnvironment.KDE), + ({'XDG_CURRENT_DESKTOP': 'KDE'}, _LinuxDesktopEnvironment.KDE), + ({'XDG_CURRENT_DESKTOP': 'Pantheon'}, _LinuxDesktopEnvironment.PANTHEON), + ({'XDG_CURRENT_DESKTOP': 'Unity'}, _LinuxDesktopEnvironment.UNITY), + ({'XDG_CURRENT_DESKTOP': 'Unity:Unity7'}, _LinuxDesktopEnvironment.UNITY), + ({'XDG_CURRENT_DESKTOP': 'Unity:Unity8'}, _LinuxDesktopEnvironment.UNITY), + ] + + for env, expected_desktop_environment in test_cases: + self.assertEqual(_get_linux_desktop_environment(env), expected_desktop_environment) + def 
test_chrome_cookie_decryptor_linux_derive_key(self): key = LinuxChromeCookieDecryptor.derive_key(b'abc') self.assertEqual(key, b'7\xa1\xec\xd4m\xfcA\xc7\xb19Z\xd0\x19\xdcM\x17') @@ -58,8 +91,7 @@ class TestCookies(unittest.TestCase): self.assertEqual(decryptor.decrypt(encrypted_value), value) def test_chrome_cookie_decryptor_linux_v11(self): - with MonkeyPatch(cookies, {'_get_linux_keyring_password': lambda *args, **kwargs: b'', - 'KEYRING_AVAILABLE': True}): + with MonkeyPatch(cookies, {'_get_linux_keyring_password': lambda *args, **kwargs: b''}): encrypted_value = b'v11#\x81\x10>`w\x8f)\xc0\xb2\xc1\r\xf4\x1al\xdd\x93\xfd\xf8\xf8N\xf2\xa9\x83\xf1\xe9o\x0elVQd' value = 'tz=Europe.London' decryptor = LinuxChromeCookieDecryptor('Chrome', Logger()) diff --git a/test/test_utils.py b/test/test_utils.py index 22dda4f37..2e33308c7 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -37,6 +37,7 @@ from yt_dlp.utils import ( ExtractorError, find_xpath_attr, fix_xml_ampersands, + format_bytes, float_or_none, get_element_by_class, get_element_by_attribute, @@ -1156,9 +1157,16 @@ class TestUtil(unittest.TestCase): self.assertEqual(parse_count('1000'), 1000) self.assertEqual(parse_count('1.000'), 1000) self.assertEqual(parse_count('1.1k'), 1100) + self.assertEqual(parse_count('1.1 k'), 1100) + self.assertEqual(parse_count('1,1 k'), 1100) self.assertEqual(parse_count('1.1kk'), 1100000) self.assertEqual(parse_count('1.1kk '), 1100000) + self.assertEqual(parse_count('1,1kk'), 1100000) + self.assertEqual(parse_count('100 views'), 100) + self.assertEqual(parse_count('1,100 views'), 1100) self.assertEqual(parse_count('1.1kk views'), 1100000) + self.assertEqual(parse_count('10M views'), 10000000) + self.assertEqual(parse_count('has 10M views'), 10000000) def test_parse_resolution(self): self.assertEqual(parse_resolution(None), {}) @@ -1681,6 +1689,18 @@ Line 1 ll = reversed(ll) test(ll, -15, 14, range(15)) + def test_format_bytes(self): + self.assertEqual(format_bytes(0), '0.00B') + self.assertEqual(format_bytes(1000), '1000.00B') + self.assertEqual(format_bytes(1024), '1.00KiB') + self.assertEqual(format_bytes(1024**2), '1.00MiB') + self.assertEqual(format_bytes(1024**3), '1.00GiB') + self.assertEqual(format_bytes(1024**4), '1.00TiB') + self.assertEqual(format_bytes(1024**5), '1.00PiB') + self.assertEqual(format_bytes(1024**6), '1.00EiB') + self.assertEqual(format_bytes(1024**7), '1.00ZiB') + self.assertEqual(format_bytes(1024**8), '1.00YiB') + if __name__ == '__main__': unittest.main() diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 3359ac457..5f8114a1c 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -82,6 +82,10 @@ _NSIG_TESTS = [ 'https://www.youtube.com/s/player/f1ca6900/player_ias.vflset/en_US/base.js', 'cu3wyu6LQn2hse', 'jvxetvmlI9AN9Q', ), + ( + 'https://www.youtube.com/s/player/8040e515/player_ias.vflset/en_US/base.js', + 'wvOFaY-yjgDuIEg5', 'HkfBFDHmgw4rsw', + ), ] diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index d542d22e6..ed1881da5 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -67,6 +67,7 @@ from .utils import ( float_or_none, format_bytes, format_field, + format_decimal_suffix, formatSeconds, GeoRestrictedError, get_domain, @@ -315,10 +316,10 @@ class YoutubeDL(object): break_per_url: Whether break_on_reject and break_on_existing should act on each input URL as opposed to for the entire queue cookiefile: File name where cookies should be read from and dumped to - cookiesfrombrowser: A tuple 
containing the name of the browser and the profile - name/path from where cookies are loaded. - Eg: ('chrome', ) or ('vivaldi', 'default') - nocheckcertificate:Do not verify SSL certificates + cookiesfrombrowser: A tuple containing the name of the browser, the profile + name/pathfrom where cookies are loaded, and the name of the + keyring. Eg: ('chrome', ) or ('vivaldi', 'default', 'BASICTEXT') + nocheckcertificate: Do not verify SSL certificates prefer_insecure: Use HTTP instead of HTTPS to retrieve information. At the moment, this is only supported by YouTube. proxy: URL of the proxy server to use @@ -448,8 +449,8 @@ class YoutubeDL(object): The following parameters are not used by YoutubeDL itself, they are used by the downloader (see yt_dlp/downloader/common.py): nopart, updatetime, buffersize, ratelimit, throttledratelimit, min_filesize, - max_filesize, test, noresizebuffer, retries, fragment_retries, continuedl, - noprogress, xattr_set_filesize, hls_use_mpegts, http_chunk_size, + max_filesize, test, noresizebuffer, retries, file_access_retries, fragment_retries, + continuedl, noprogress, xattr_set_filesize, hls_use_mpegts, http_chunk_size, external_downloader_args, concurrent_fragment_downloads. The following options are used by the post processors: @@ -1004,7 +1005,7 @@ class YoutubeDL(object): def validate_outtmpl(cls, outtmpl): ''' @return None or Exception object ''' outtmpl = re.sub( - STR_FORMAT_RE_TMPL.format('[^)]*', '[ljqBU]'), + STR_FORMAT_RE_TMPL.format('[^)]*', '[ljqBUDS]'), lambda mobj: f'{mobj.group(0)[:-1]}s', cls._outtmpl_expandpath(outtmpl)) try: @@ -1020,8 +1021,12 @@ class YoutubeDL(object): info_dict.pop(key, None) return info_dict - def prepare_outtmpl(self, outtmpl, info_dict, sanitize=None): - """ Make the outtmpl and info_dict suitable for substitution: ydl.escape_outtmpl(outtmpl) % info_dict """ + def prepare_outtmpl(self, outtmpl, info_dict, sanitize=False): + """ Make the outtmpl and info_dict suitable for substitution: ydl.escape_outtmpl(outtmpl) % info_dict + @param sanitize Whether to sanitize the output as a filename. + For backward compatibility, a function can also be passed + """ + info_dict.setdefault('epoch', int(time.time())) # keep epoch consistent once set info_dict = self._copy_infodict(info_dict) @@ -1042,7 +1047,7 @@ class YoutubeDL(object): } TMPL_DICT = {} - EXTERNAL_FORMAT_RE = re.compile(STR_FORMAT_RE_TMPL.format('[^)]*', f'[{STR_FORMAT_TYPES}ljqBU]')) + EXTERNAL_FORMAT_RE = re.compile(STR_FORMAT_RE_TMPL.format('[^)]*', f'[{STR_FORMAT_TYPES}ljqBUDS]')) MATH_FUNCTIONS = { '+': float.__add__, '-': float.__sub__, @@ -1050,7 +1055,7 @@ class YoutubeDL(object): # Field is of the form key1.key2... # where keys (except first) can be string, int or slice FIELD_RE = r'\w*(?:\.(?:\w+|{num}|{num}?(?::{num}?){{1,2}}))*'.format(num=r'(?:-?\d+)') - MATH_FIELD_RE = r'''{field}|{num}'''.format(field=FIELD_RE, num=r'-?\d+(?:.\d+)?') + MATH_FIELD_RE = r'''(?:{field}|{num})'''.format(field=FIELD_RE, num=r'-?\d+(?:.\d+)?') MATH_OPERATORS_RE = r'(?:%s)' % '|'.join(map(re.escape, MATH_FUNCTIONS.keys())) INTERNAL_FORMAT_RE = re.compile(r'''(?x) (?P<negate>-)? 
@@ -1106,6 +1111,13 @@ class YoutubeDL(object): na = self.params.get('outtmpl_na_placeholder', 'NA') + def filename_sanitizer(key, value, restricted=self.params.get('restrictfilenames')): + return sanitize_filename(str(value), restricted=restricted, + is_id=re.search(r'(^|[_.])id(\.|$)', key)) + + sanitizer = sanitize if callable(sanitize) else filename_sanitizer + sanitize = bool(sanitize) + def _dumpjson_default(obj): if isinstance(obj, (set, LazyList)): return list(obj) @@ -1116,7 +1128,7 @@ class YoutubeDL(object): return outer_mobj.group(0) key = outer_mobj.group('key') mobj = re.match(INTERNAL_FORMAT_RE, key) - initial_field = mobj.group('fields').split('.')[-1] if mobj else '' + initial_field = mobj.group('fields') if mobj else '' value, replacement, default = None, None, na while mobj: mobj = mobj.groupdict() @@ -1138,7 +1150,7 @@ class YoutubeDL(object): str_fmt = f'{fmt[:-1]}s' if fmt[-1] == 'l': # list delim = '\n' if '#' in flags else ', ' - value, fmt = delim.join(variadic(value)), str_fmt + value, fmt = delim.join(variadic(value, allowed_types=(str, bytes))), str_fmt elif fmt[-1] == 'j': # json value, fmt = json.dumps(value, default=_dumpjson_default, indent=4 if '#' in flags else None), str_fmt elif fmt[-1] == 'q': # quoted @@ -1152,6 +1164,10 @@ class YoutubeDL(object): # "+" = compatibility equivalence, "#" = NFD 'NF%s%s' % ('K' if '+' in flags else '', 'D' if '#' in flags else 'C'), value), str_fmt + elif fmt[-1] == 'D': # decimal suffix + value, fmt = format_decimal_suffix(value, f'%{fmt[:-1]}f%s' if fmt[:-1] else '%d%s'), 's' + elif fmt[-1] == 'S': # filename sanitization + value, fmt = filename_sanitizer(initial_field, value, restricted='#' in flags), str_fmt elif fmt[-1] == 'c': if value: value = str(value)[0] @@ -1168,7 +1184,7 @@ class YoutubeDL(object): # So we convert it to repr first value, fmt = repr(value), str_fmt if fmt[-1] in 'csr': - value = sanitize(initial_field, value) + value = sanitizer(initial_field, value) key = '%s\0%s' % (key.replace('%', '%\0'), outer_mobj.group('format')) TMPL_DICT[key] = value @@ -1182,12 +1198,8 @@ class YoutubeDL(object): def _prepare_filename(self, info_dict, tmpl_type='default'): try: - sanitize = lambda k, v: sanitize_filename( - compat_str(v), - restricted=self.params.get('restrictfilenames'), - is_id=(k == 'id' or k.endswith('_id'))) outtmpl = self._outtmpl_expandpath(self.outtmpl_dict.get(tmpl_type, self.outtmpl_dict['default'])) - filename = self.evaluate_outtmpl(outtmpl, info_dict, sanitize) + filename = self.evaluate_outtmpl(outtmpl, info_dict, True) force_ext = OUTTMPL_TYPES.get(tmpl_type) if filename and force_ext is not None: @@ -1335,31 +1347,33 @@ class YoutubeDL(object): def __handle_extraction_exceptions(func): @functools.wraps(func) def wrapper(self, *args, **kwargs): - try: - return func(self, *args, **kwargs) - except GeoRestrictedError as e: - msg = e.msg - if e.countries: - msg += '\nThis video is available in %s.' % ', '.join( - map(ISO3166Utils.short2full, e.countries)) - msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.' 
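This hunk rewrites `__handle_extraction_exceptions` (the changelog's "Avoid recursion error when re-extracting info"): the `+` lines that follow retry inside a `while True` loop instead of calling the wrapper recursively. A condensed sketch of the pattern, using a stand-in exception class rather than the real `yt_dlp.utils.ReExtractInfo`:

```python
import functools

class ReExtractInfo(Exception):
    """Stand-in for yt_dlp.utils.ReExtractInfo, for illustration only."""

def handle_extraction_exceptions(func):
    # Condensed from the hunk: loop-and-continue replaces the old
    # `return wrapper(self, *args, **kwargs)`, which grew the call stack
    # on every re-extraction and could end in RecursionError.
    @functools.wraps(func)
    def wrapper(self, *args, **kwargs):
        while True:
            try:
                return func(self, *args, **kwargs)
            except ReExtractInfo:
                continue  # retry without recursing
            except Exception:
                raise  # the real code reports or ignores depending on params
    return wrapper
```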
- self.report_error(msg) - except ExtractorError as e: # An error we somewhat expected - self.report_error(compat_str(e), e.format_traceback()) - except ReExtractInfo as e: - if e.expected: - self.to_screen(f'{e}; Re-extracting data') - else: - self.to_stderr('\r') - self.report_warning(f'{e}; Re-extracting data') - return wrapper(self, *args, **kwargs) - except (DownloadCancelled, LazyList.IndexError, PagedList.IndexError): - raise - except Exception as e: - if self.params.get('ignoreerrors'): - self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc())) - else: + while True: + try: + return func(self, *args, **kwargs) + except (DownloadCancelled, LazyList.IndexError, PagedList.IndexError): raise + except ReExtractInfo as e: + if e.expected: + self.to_screen(f'{e}; Re-extracting data') + else: + self.to_stderr('\r') + self.report_warning(f'{e}; Re-extracting data') + continue + except GeoRestrictedError as e: + msg = e.msg + if e.countries: + msg += '\nThis video is available in %s.' % ', '.join( + map(ISO3166Utils.short2full, e.countries)) + msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.' + self.report_error(msg) + except ExtractorError as e: # An error we somewhat expected + self.report_error(str(e), e.format_traceback()) + except Exception as e: + if self.params.get('ignoreerrors'): + self.report_error(str(e), tb=encode_compat_str(traceback.format_exc())) + else: + raise + break return wrapper def _wait_for_video(self, ie_result): @@ -1482,7 +1496,7 @@ class YoutubeDL(object): self.write_debug('Additional URLs: "%s"' % '", "'.join(additional_urls)) ie_result['additional_entries'] = [ self.extract_info( - url, download, extra_info, + url, download, extra_info=extra_info, force_generic_extractor=self.params.get('force_generic_extractor')) for url in additional_urls ] @@ -2461,10 +2475,7 @@ class YoutubeDL(object): info_dict['id'], automatic_captions, 'automatic captions') self.list_subtitles(info_dict['id'], subtitles, 'subtitles') if self.params.get('listformats') or interactive_format_selection: - if not info_dict.get('formats') and not info_dict.get('url'): - self.to_screen('%s has no formats' % info_dict['id']) - else: - self.list_formats(info_dict) + self.list_formats(info_dict) if list_only: # Without this printing, -F --print-json will not work self.__forced_printings(info_dict, self.prepare_filename(info_dict), incomplete=True) @@ -3135,9 +3146,8 @@ class YoutubeDL(object): 'requested_formats', 'requested_subtitles', 'requested_entries', 'entries', 'filepath', 'infojson_filename', 'original_url', 'playlist_autonumber', } - empty_values = (None, {}, [], set(), tuple()) reject = lambda k, v: k not in keep_keys and ( - k.startswith('_') or k in remove_keys or v in empty_values) + k.startswith('_') or k in remove_keys or v is None) else: reject = lambda k, v: k in remove_keys @@ -3348,6 +3358,11 @@ class YoutubeDL(object): return headers def list_formats(self, info_dict): + if not info_dict.get('formats') and not info_dict.get('url'): + self.to_screen('%s has no formats' % info_dict['id']) + return + self.to_screen('[info] Available formats for %s:' % info_dict['id']) + formats = info_dict.get('formats', [info_dict]) new_format = self.params.get('listformats_table', True) is not False if new_format: @@ -3362,7 +3377,7 @@ class YoutubeDL(object): delim, format_field(f, 'filesize', ' \t%s', func=format_bytes) + format_field(f, 'filesize_approx', '~\t%s', func=format_bytes), format_field(f, 'tbr', '\t%dk'), - 
shorten_protocol_name(f.get('protocol', '').replace('native', 'n')), + shorten_protocol_name(f.get('protocol', '')), delim, format_field(f, 'vcodec', default='unknown').replace( 'none', @@ -3398,8 +3413,6 @@ class YoutubeDL(object): if f.get('preference') is None or f['preference'] >= -1000] header_line = ['format code', 'extension', 'resolution', 'note'] - self.to_screen( - '[info] Available formats for %s:' % info_dict['id']) self.to_stdout(render_table( header_line, table, extra_gap=(0 if new_format else 1), @@ -3527,11 +3540,11 @@ class YoutubeDL(object): from .downloader.websocket import has_websockets from .postprocessor.embedthumbnail import has_mutagen - from .cookies import SQLITE_AVAILABLE, KEYRING_AVAILABLE + from .cookies import SQLITE_AVAILABLE, SECRETSTORAGE_AVAILABLE lib_str = join_nonempty( compat_pycrypto_AES and compat_pycrypto_AES.__name__.split('.')[0], - KEYRING_AVAILABLE and 'keyring', + SECRETSTORAGE_AVAILABLE and 'secretstorage', has_mutagen and 'mutagen', SQLITE_AVAILABLE and 'sqlite', has_websockets and 'websockets', diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 198962aa5..4fa2e2d8c 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -16,10 +16,11 @@ from .options import ( ) from .compat import ( compat_getpass, + compat_os_name, compat_shlex_quote, workaround_optparse_bug9161, ) -from .cookies import SUPPORTED_BROWSERS +from .cookies import SUPPORTED_BROWSERS, SUPPORTED_KEYRINGS from .utils import ( DateRange, decodeOption, @@ -92,7 +93,8 @@ def _real_main(argv=None): if opts.batchfile is not None: try: if opts.batchfile == '-': - write_string('Reading URLs from stdin:\n') + write_string('Reading URLs from stdin - EOF (%s) to end:\n' % ( + 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D')) batchfd = sys.stdin else: batchfd = io.open( @@ -134,10 +136,10 @@ def _real_main(argv=None): # Conflicting, missing and erroneous options if opts.format == 'best': - warnings.append('.\n '.join( + warnings.append('.\n '.join(( '"-f best" selects the best pre-merged format which is often not the best option', 'To let yt-dlp download and merge the best available formats, simply do not pass any format selection', - 'If you know what you are doing and want only the best pre-merged format, use "-f b" instead to suppress this warning')) + 'If you know what you are doing and want only the best pre-merged format, use "-f b" instead to suppress this warning'))) if opts.usenetrc and (opts.username is not None or opts.password is not None): parser.error('using .netrc conflicts with giving username/password') if opts.password is not None and opts.username is None: @@ -217,6 +219,8 @@ def _real_main(argv=None): return parsed_retries if opts.retries is not None: opts.retries = parse_retries(opts.retries) + if opts.file_access_retries is not None: + opts.file_access_retries = parse_retries(opts.file_access_retries, 'file access ') if opts.fragment_retries is not None: opts.fragment_retries = parse_retries(opts.fragment_retries, 'fragment ') if opts.extractor_retries is not None: @@ -259,10 +263,20 @@ def _real_main(argv=None): if opts.convertthumbnails not in FFmpegThumbnailsConvertorPP.SUPPORTED_EXTS: parser.error('invalid thumbnail format specified') if opts.cookiesfrombrowser is not None: - opts.cookiesfrombrowser = [ - part.strip() or None for part in opts.cookiesfrombrowser.split(':', 1)] - if opts.cookiesfrombrowser[0].lower() not in SUPPORTED_BROWSERS: - parser.error('unsupported browser specified for cookies') + mobj = 
re.match(r'(?P<name>[^+:]+)(\s*\+\s*(?P<keyring>[^:]+))?(\s*:(?P<profile>.+))?', opts.cookiesfrombrowser) + if mobj is None: + parser.error(f'invalid cookies from browser arguments: {opts.cookiesfrombrowser}') + browser_name, keyring, profile = mobj.group('name', 'keyring', 'profile') + browser_name = browser_name.lower() + if browser_name not in SUPPORTED_BROWSERS: + parser.error(f'unsupported browser specified for cookies: "{browser_name}". ' + f'Supported browsers are: {", ".join(sorted(SUPPORTED_BROWSERS))}') + if keyring is not None: + keyring = keyring.upper() + if keyring not in SUPPORTED_KEYRINGS: + parser.error(f'unsupported keyring specified for cookies: "{keyring}". ' + f'Supported keyrings are: {", ".join(sorted(SUPPORTED_KEYRINGS))}') + opts.cookiesfrombrowser = (browser_name, profile, keyring) geo_bypass_code = opts.geo_bypass_ip_block or opts.geo_bypass_country if geo_bypass_code is not None: try: @@ -515,7 +529,7 @@ def _real_main(argv=None): if len(dur) == 2 and all(t is not None for t in dur): remove_ranges.append(tuple(dur)) continue - parser.error(f'invalid --remove-chapters time range {regex!r}. Must be of the form ?start-end') + parser.error(f'invalid --remove-chapters time range {regex!r}. Must be of the form *start-end') try: remove_chapters_patterns.append(re.compile(regex)) except re.error as err: @@ -668,6 +682,7 @@ def _real_main(argv=None): 'throttledratelimit': opts.throttledratelimit, 'overwrites': opts.overwrites, 'retries': opts.retries, + 'file_access_retries': opts.file_access_retries, 'fragment_retries': opts.fragment_retries, 'extractor_retries': opts.extractor_retries, 'skip_unavailable_fragments': opts.skip_unavailable_fragments, diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index ec68a809d..74e133bc9 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -1,3 +1,4 @@ +import contextlib import ctypes import json import os @@ -7,6 +8,7 @@ import subprocess import sys import tempfile from datetime import datetime, timedelta, timezone +from enum import Enum, auto from hashlib import pbkdf2_hmac from .aes import aes_cbc_decrypt_bytes, aes_gcm_decrypt_and_verify_bytes @@ -15,7 +17,6 @@ from .compat import ( compat_cookiejar_Cookie, ) from .utils import ( - bug_reports_message, expand_path, Popen, YoutubeDLCookieJar, @@ -31,19 +32,16 @@ except ImportError: try: - import keyring - KEYRING_AVAILABLE = True - KEYRING_UNAVAILABLE_REASON = f'due to unknown reasons{bug_reports_message()}' + import secretstorage + SECRETSTORAGE_AVAILABLE = True except ImportError: - KEYRING_AVAILABLE = False - KEYRING_UNAVAILABLE_REASON = ( - 'as the `keyring` module is not installed. ' - 'Please install by running `python3 -m pip install keyring`. ' - 'Depending on your platform, additional packages may be required ' - 'to access the keyring; see https://pypi.org/project/keyring') + SECRETSTORAGE_AVAILABLE = False + SECRETSTORAGE_UNAVAILABLE_REASON = ( + 'as the `secretstorage` module is not installed. ' + 'Please install by running `python3 -m pip install secretstorage`.') except Exception as _err: - KEYRING_AVAILABLE = False - KEYRING_UNAVAILABLE_REASON = 'as the `keyring` module could not be initialized: %s' % _err + SECRETSTORAGE_AVAILABLE = False + SECRETSTORAGE_UNAVAILABLE_REASON = f'as the `secretstorage` module could not be initialized. 
{_err}' CHROMIUM_BASED_BROWSERS = {'brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi'} @@ -74,8 +72,8 @@ class YDLLogger: def load_cookies(cookie_file, browser_specification, ydl): cookie_jars = [] if browser_specification is not None: - browser_name, profile = _parse_browser_specification(*browser_specification) - cookie_jars.append(extract_cookies_from_browser(browser_name, profile, YDLLogger(ydl))) + browser_name, profile, keyring = _parse_browser_specification(*browser_specification) + cookie_jars.append(extract_cookies_from_browser(browser_name, profile, YDLLogger(ydl), keyring=keyring)) if cookie_file is not None: cookie_file = expand_path(cookie_file) @@ -87,13 +85,13 @@ def load_cookies(cookie_file, browser_specification, ydl): return _merge_cookie_jars(cookie_jars) -def extract_cookies_from_browser(browser_name, profile=None, logger=YDLLogger()): +def extract_cookies_from_browser(browser_name, profile=None, logger=YDLLogger(), *, keyring=None): if browser_name == 'firefox': return _extract_firefox_cookies(profile, logger) elif browser_name == 'safari': return _extract_safari_cookies(profile, logger) elif browser_name in CHROMIUM_BASED_BROWSERS: - return _extract_chrome_cookies(browser_name, profile, logger) + return _extract_chrome_cookies(browser_name, profile, keyring, logger) else: raise ValueError('unknown browser: {}'.format(browser_name)) @@ -207,7 +205,7 @@ def _get_chromium_based_browser_settings(browser_name): } -def _extract_chrome_cookies(browser_name, profile, logger): +def _extract_chrome_cookies(browser_name, profile, keyring, logger): logger.info('Extracting cookies from {}'.format(browser_name)) if not SQLITE_AVAILABLE: @@ -234,7 +232,7 @@ def _extract_chrome_cookies(browser_name, profile, logger): raise FileNotFoundError('could not find {} cookies database in "{}"'.format(browser_name, search_root)) logger.debug('Extracting cookies from: "{}"'.format(cookie_database_path)) - decryptor = get_cookie_decryptor(config['browser_dir'], config['keyring_name'], logger) + decryptor = get_cookie_decryptor(config['browser_dir'], config['keyring_name'], logger, keyring=keyring) with tempfile.TemporaryDirectory(prefix='yt_dlp') as tmpdir: cursor = None @@ -247,6 +245,7 @@ def _extract_chrome_cookies(browser_name, profile, logger): 'expires_utc, {} FROM cookies'.format(secure_column)) jar = YoutubeDLCookieJar() failed_cookies = 0 + unencrypted_cookies = 0 for host_key, name, value, encrypted_value, path, expires_utc, is_secure in cursor.fetchall(): host_key = host_key.decode('utf-8') name = name.decode('utf-8') @@ -258,6 +257,8 @@ def _extract_chrome_cookies(browser_name, profile, logger): if value is None: failed_cookies += 1 continue + else: + unencrypted_cookies += 1 cookie = compat_cookiejar_Cookie( version=0, name=name, value=value, port=None, port_specified=False, @@ -270,6 +271,9 @@ def _extract_chrome_cookies(browser_name, profile, logger): else: failed_message = '' logger.info('Extracted {} cookies from {}{}'.format(len(jar), browser_name, failed_message)) + counts = decryptor.cookie_counts.copy() + counts['unencrypted'] = unencrypted_cookies + logger.debug('cookie version breakdown: {}'.format(counts)) return jar finally: if cursor is not None: @@ -305,10 +309,14 @@ class ChromeCookieDecryptor: def decrypt(self, encrypted_value): raise NotImplementedError + @property + def cookie_counts(self): + raise NotImplementedError + -def get_cookie_decryptor(browser_root, browser_keyring_name, logger): +def get_cookie_decryptor(browser_root, browser_keyring_name, logger, 
*, keyring=None): if sys.platform in ('linux', 'linux2'): - return LinuxChromeCookieDecryptor(browser_keyring_name, logger) + return LinuxChromeCookieDecryptor(browser_keyring_name, logger, keyring=keyring) elif sys.platform == 'darwin': return MacChromeCookieDecryptor(browser_keyring_name, logger) elif sys.platform == 'win32': @@ -319,13 +327,12 @@ def get_cookie_decryptor(browser_root, browser_keyring_name, logger): class LinuxChromeCookieDecryptor(ChromeCookieDecryptor): - def __init__(self, browser_keyring_name, logger): + def __init__(self, browser_keyring_name, logger, *, keyring=None): self._logger = logger self._v10_key = self.derive_key(b'peanuts') - if KEYRING_AVAILABLE: - self._v11_key = self.derive_key(_get_linux_keyring_password(browser_keyring_name)) - else: - self._v11_key = None + password = _get_linux_keyring_password(browser_keyring_name, keyring, logger) + self._v11_key = None if password is None else self.derive_key(password) + self._cookie_counts = {'v10': 0, 'v11': 0, 'other': 0} @staticmethod def derive_key(password): @@ -333,20 +340,27 @@ class LinuxChromeCookieDecryptor(ChromeCookieDecryptor): # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_linux.cc return pbkdf2_sha1(password, salt=b'saltysalt', iterations=1, key_length=16) + @property + def cookie_counts(self): + return self._cookie_counts + def decrypt(self, encrypted_value): version = encrypted_value[:3] ciphertext = encrypted_value[3:] if version == b'v10': + self._cookie_counts['v10'] += 1 return _decrypt_aes_cbc(ciphertext, self._v10_key, self._logger) elif version == b'v11': + self._cookie_counts['v11'] += 1 if self._v11_key is None: - self._logger.warning(f'cannot decrypt cookie {KEYRING_UNAVAILABLE_REASON}', only_once=True) + self._logger.warning('cannot decrypt v11 cookies: no key found', only_once=True) return None return _decrypt_aes_cbc(ciphertext, self._v11_key, self._logger) else: + self._cookie_counts['other'] += 1 return None @@ -355,6 +369,7 @@ class MacChromeCookieDecryptor(ChromeCookieDecryptor): self._logger = logger password = _get_mac_keyring_password(browser_keyring_name, logger) self._v10_key = None if password is None else self.derive_key(password) + self._cookie_counts = {'v10': 0, 'other': 0} @staticmethod def derive_key(password): @@ -362,11 +377,16 @@ class MacChromeCookieDecryptor(ChromeCookieDecryptor): # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_mac.mm return pbkdf2_sha1(password, salt=b'saltysalt', iterations=1003, key_length=16) + @property + def cookie_counts(self): + return self._cookie_counts + def decrypt(self, encrypted_value): version = encrypted_value[:3] ciphertext = encrypted_value[3:] if version == b'v10': + self._cookie_counts['v10'] += 1 if self._v10_key is None: self._logger.warning('cannot decrypt v10 cookies: no key found', only_once=True) return None @@ -374,6 +394,7 @@ class MacChromeCookieDecryptor(ChromeCookieDecryptor): return _decrypt_aes_cbc(ciphertext, self._v10_key, self._logger) else: + self._cookie_counts['other'] += 1 # other prefixes are considered 'old data' which were stored as plaintext # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_mac.mm return encrypted_value @@ -383,12 +404,18 @@ class WindowsChromeCookieDecryptor(ChromeCookieDecryptor): def __init__(self, browser_root, logger): self._logger = logger self._v10_key = _get_windows_v10_key(browser_root, logger) + self._cookie_counts = {'v10': 0, 
'other': 0} + + @property + def cookie_counts(self): + return self._cookie_counts def decrypt(self, encrypted_value): version = encrypted_value[:3] ciphertext = encrypted_value[3:] if version == b'v10': + self._cookie_counts['v10'] += 1 if self._v10_key is None: self._logger.warning('cannot decrypt v10 cookies: no key found', only_once=True) return None @@ -408,6 +435,7 @@ class WindowsChromeCookieDecryptor(ChromeCookieDecryptor): return _decrypt_aes_gcm(ciphertext, self._v10_key, nonce, authentication_tag, self._logger) else: + self._cookie_counts['other'] += 1 # any other prefix means the data is DPAPI encrypted # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_win.cc return _decrypt_windows_dpapi(encrypted_value, self._logger).decode('utf-8') @@ -577,42 +605,221 @@ def parse_safari_cookies(data, jar=None, logger=YDLLogger()): return jar -def _get_linux_keyring_password(browser_keyring_name): - password = keyring.get_password('{} Keys'.format(browser_keyring_name), - '{} Safe Storage'.format(browser_keyring_name)) - if password is None: - # this sometimes occurs in KDE because chrome does not check hasEntry and instead - # just tries to read the value (which kwallet returns "") whereas keyring checks hasEntry - # to verify this: - # dbus-monitor "interface='org.kde.KWallet'" "type=method_return" - # while starting chrome. - # this may be a bug as the intended behaviour is to generate a random password and store - # it, but that doesn't matter here. - password = '' - return password.encode('utf-8') +class _LinuxDesktopEnvironment(Enum): + """ + https://chromium.googlesource.com/chromium/src/+/refs/heads/main/base/nix/xdg_util.h + DesktopEnvironment + """ + OTHER = auto() + CINNAMON = auto() + GNOME = auto() + KDE = auto() + PANTHEON = auto() + UNITY = auto() + XFCE = auto() -def _get_mac_keyring_password(browser_keyring_name, logger): - if KEYRING_AVAILABLE: - logger.debug('using keyring to obtain password') - password = keyring.get_password('{} Safe Storage'.format(browser_keyring_name), browser_keyring_name) - return password.encode('utf-8') +class _LinuxKeyring(Enum): + """ + https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/key_storage_util_linux.h + SelectedLinuxBackend + """ + KWALLET = auto() + GNOMEKEYRING = auto() + BASICTEXT = auto() + + +SUPPORTED_KEYRINGS = _LinuxKeyring.__members__.keys() + + +def _get_linux_desktop_environment(env): + """ + https://chromium.googlesource.com/chromium/src/+/refs/heads/main/base/nix/xdg_util.cc + GetDesktopEnvironment + """ + xdg_current_desktop = env.get('XDG_CURRENT_DESKTOP', None) + desktop_session = env.get('DESKTOP_SESSION', None) + if xdg_current_desktop is not None: + xdg_current_desktop = xdg_current_desktop.split(':')[0].strip() + + if xdg_current_desktop == 'Unity': + if desktop_session is not None and 'gnome-fallback' in desktop_session: + return _LinuxDesktopEnvironment.GNOME + else: + return _LinuxDesktopEnvironment.UNITY + elif xdg_current_desktop == 'GNOME': + return _LinuxDesktopEnvironment.GNOME + elif xdg_current_desktop == 'X-Cinnamon': + return _LinuxDesktopEnvironment.CINNAMON + elif xdg_current_desktop == 'KDE': + return _LinuxDesktopEnvironment.KDE + elif xdg_current_desktop == 'Pantheon': + return _LinuxDesktopEnvironment.PANTHEON + elif xdg_current_desktop == 'XFCE': + return _LinuxDesktopEnvironment.XFCE + elif desktop_session is not None: + if desktop_session in ('mate', 'gnome'): + return _LinuxDesktopEnvironment.GNOME + elif 'kde' in 
desktop_session: + return _LinuxDesktopEnvironment.KDE + elif 'xfce' in desktop_session: + return _LinuxDesktopEnvironment.XFCE + else: + if 'GNOME_DESKTOP_SESSION_ID' in env: + return _LinuxDesktopEnvironment.GNOME + elif 'KDE_FULL_SESSION' in env: + return _LinuxDesktopEnvironment.KDE + else: + return _LinuxDesktopEnvironment.OTHER + + +def _choose_linux_keyring(logger): + """ + https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/key_storage_util_linux.cc + SelectBackend + """ + desktop_environment = _get_linux_desktop_environment(os.environ) + logger.debug('detected desktop environment: {}'.format(desktop_environment.name)) + if desktop_environment == _LinuxDesktopEnvironment.KDE: + linux_keyring = _LinuxKeyring.KWALLET + elif desktop_environment == _LinuxDesktopEnvironment.OTHER: + linux_keyring = _LinuxKeyring.BASICTEXT else: - logger.debug('using find-generic-password to obtain password') + linux_keyring = _LinuxKeyring.GNOMEKEYRING + return linux_keyring + + +def _get_kwallet_network_wallet(logger): + """ The name of the wallet used to store network passwords. + + https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/kwallet_dbus.cc + KWalletDBus::NetworkWallet + which does a dbus call to the following function: + https://api.kde.org/frameworks/kwallet/html/classKWallet_1_1Wallet.html + Wallet::NetworkWallet + """ + default_wallet = 'kdewallet' + try: + proc = Popen([ + 'dbus-send', '--session', '--print-reply=literal', + '--dest=org.kde.kwalletd5', + '/modules/kwalletd5', + 'org.kde.KWallet.networkWallet' + ], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) + + stdout, stderr = proc.communicate_or_kill() + if proc.returncode != 0: + logger.warning('failed to read NetworkWallet') + return default_wallet + else: + network_wallet = stdout.decode('utf-8').strip() + logger.debug('NetworkWallet = "{}"'.format(network_wallet)) + return network_wallet + except BaseException as e: + logger.warning('exception while obtaining NetworkWallet: {}'.format(e)) + return default_wallet + + +def _get_kwallet_password(browser_keyring_name, logger): + logger.debug('using kwallet-query to obtain password from kwallet') + + if shutil.which('kwallet-query') is None: + logger.error('kwallet-query command not found. KWallet and kwallet-query ' + 'must be installed to read from KWallet. kwallet-query should be' + 'included in the kwallet package for your distribution') + return b'' + + network_wallet = _get_kwallet_network_wallet(logger) + + try: + proc = Popen([ + 'kwallet-query', + '--read-password', '{} Safe Storage'.format(browser_keyring_name), + '--folder', '{} Keys'.format(browser_keyring_name), + network_wallet + ], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) + + stdout, stderr = proc.communicate_or_kill() + if proc.returncode != 0: + logger.error('kwallet-query failed with return code {}. Please consult ' + 'the kwallet-query man page for details'.format(proc.returncode)) + return b'' + else: + if stdout.lower().startswith(b'failed to read'): + logger.debug('failed to read password from kwallet. Using empty string instead') + # this sometimes occurs in KDE because chrome does not check hasEntry and instead + # just tries to read the value (which kwallet returns "") whereas kwallet-query + # checks hasEntry. To verify this: + # dbus-monitor "interface='org.kde.KWallet'" "type=method_return" + # while starting chrome. 
+ # this may be a bug as the intended behaviour is to generate a random password and store + # it, but that doesn't matter here. + return b'' + else: + logger.debug('password found') + if stdout[-1:] == b'\n': + stdout = stdout[:-1] + return stdout + except BaseException as e: + logger.warning(f'exception running kwallet-query: {type(e).__name__}({e})') + return b'' + + +def _get_gnome_keyring_password(browser_keyring_name, logger): + if not SECRETSTORAGE_AVAILABLE: + logger.error('secretstorage not available {}'.format(SECRETSTORAGE_UNAVAILABLE_REASON)) + return b'' + # the Gnome keyring does not seem to organise keys in the same way as KWallet, + # using `dbus-monitor` during startup, it can be observed that chromium lists all keys + # and presumably searches for its key in the list. It appears that we must do the same. + # https://github.com/jaraco/keyring/issues/556 + with contextlib.closing(secretstorage.dbus_init()) as con: + col = secretstorage.get_default_collection(con) + for item in col.get_all_items(): + if item.get_label() == '{} Safe Storage'.format(browser_keyring_name): + return item.get_secret() + else: + logger.error('failed to read from keyring') + return b'' + + +def _get_linux_keyring_password(browser_keyring_name, keyring, logger): + # note: chrome/chromium can be run with the following flags to determine which keyring backend + # it has chosen to use + # chromium --enable-logging=stderr --v=1 2>&1 | grep key_storage_ + # Chromium supports a flag: --password-store=<basic|gnome|kwallet> so the automatic detection + # will not be sufficient in all cases. + + keyring = _LinuxKeyring[keyring] or _choose_linux_keyring(logger) + logger.debug(f'Chosen keyring: {keyring.name}') + + if keyring == _LinuxKeyring.KWALLET: + return _get_kwallet_password(browser_keyring_name, logger) + elif keyring == _LinuxKeyring.GNOMEKEYRING: + return _get_gnome_keyring_password(browser_keyring_name, logger) + elif keyring == _LinuxKeyring.BASICTEXT: + # when basic text is chosen, all cookies are stored as v10 (so no keyring password is required) + return None + assert False, f'Unknown keyring {keyring}' + + +def _get_mac_keyring_password(browser_keyring_name, logger): + logger.debug('using find-generic-password to obtain password from OSX keychain') + try: proc = Popen( ['security', 'find-generic-password', '-w', # write password to stdout '-a', browser_keyring_name, # match 'account' '-s', '{} Safe Storage'.format(browser_keyring_name)], # match 'service' stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) - try: - stdout, stderr = proc.communicate_or_kill() - if stdout[-1:] == b'\n': - stdout = stdout[:-1] - return stdout - except BaseException as e: - logger.warning(f'exception running find-generic-password: {type(e).__name__}({e})') - return None + + stdout, stderr = proc.communicate_or_kill() + if stdout[-1:] == b'\n': + stdout = stdout[:-1] + return stdout + except BaseException as e: + logger.warning(f'exception running find-generic-password: {type(e).__name__}({e})') + return None def _get_windows_v10_key(browser_root, logger): @@ -736,10 +943,11 @@ def _is_path(value): return os.path.sep in value -def _parse_browser_specification(browser_name, profile=None): - browser_name = browser_name.lower() +def _parse_browser_specification(browser_name, profile=None, keyring=None): if browser_name not in SUPPORTED_BROWSERS: raise ValueError(f'unsupported browser: "{browser_name}"') + if keyring not in (None, *SUPPORTED_KEYRINGS): + raise ValueError(f'unsupported keyring: "{keyring}"') if profile is 
not None and _is_path(profile): profile = os.path.expanduser(profile) - return browser_name, profile + return browser_name, profile, keyring diff --git a/yt_dlp/downloader/common.py b/yt_dlp/downloader/common.py index d0c9c223f..37321e34b 100644 --- a/yt_dlp/downloader/common.py +++ b/yt_dlp/downloader/common.py @@ -4,12 +4,14 @@ import os import re import time import random +import errno from ..utils import ( decodeArgument, encodeFilename, error_to_compat_str, format_bytes, + sanitize_open, shell_quote, timeconvert, timetuple_from_msec, @@ -39,6 +41,7 @@ class FileDownloader(object): ratelimit: Download speed limit, in bytes/sec. throttledratelimit: Assume the download is being throttled below this speed (bytes/sec) retries: Number of times to retry for HTTP error 5xx + file_access_retries: Number of times to retry on file access error buffersize: Size of download buffer in bytes. noresizebuffer: Do not automatically resize the download buffer. continuedl: Try to continue downloads if possible. @@ -207,6 +210,21 @@ class FileDownloader(object): def ytdl_filename(self, filename): return filename + '.ytdl' + def sanitize_open(self, filename, open_mode): + file_access_retries = self.params.get('file_access_retries', 10) + retry = 0 + while True: + try: + return sanitize_open(filename, open_mode) + except (IOError, OSError) as err: + retry = retry + 1 + if retry > file_access_retries or err.errno not in (errno.EACCES,): + raise + self.to_screen( + '[download] Got file access error. Retrying (attempt %d of %s) ...' + % (retry, self.format_retries(file_access_retries))) + time.sleep(0.01) + def try_rename(self, old_filename, new_filename): if old_filename == new_filename: return @@ -397,6 +415,7 @@ class FileDownloader(object): 'status': 'finished', 'total_bytes': os.path.getsize(encodeFilename(filename)), }, info_dict) + self._finish_multiline_status() return True, False if subtitle is False: diff --git a/yt_dlp/downloader/dash.py b/yt_dlp/downloader/dash.py index 4c23edd32..a845ee7d3 100644 --- a/yt_dlp/downloader/dash.py +++ b/yt_dlp/downloader/dash.py @@ -57,7 +57,7 @@ class DashSegmentsFD(FragmentFD): def _resolve_fragments(self, fragments, ctx): fragments = fragments(ctx) if callable(fragments) else fragments - return [next(fragments)] if self.params.get('test') else fragments + return [next(iter(fragments))] if self.params.get('test') else fragments def _get_fragments(self, fmt, ctx): fragment_base_url = fmt.get('fragment_base_url') diff --git a/yt_dlp/downloader/external.py b/yt_dlp/downloader/external.py index da69423f7..17be3c46f 100644 --- a/yt_dlp/downloader/external.py +++ b/yt_dlp/downloader/external.py @@ -22,7 +22,6 @@ from ..utils import ( handle_youtubedl_headers, check_executable, Popen, - sanitize_open, ) @@ -144,11 +143,11 @@ class ExternalFD(FragmentFD): return -1 decrypt_fragment = self.decrypter(info_dict) - dest, _ = sanitize_open(tmpfilename, 'wb') + dest, _ = self.sanitize_open(tmpfilename, 'wb') for frag_index, fragment in enumerate(info_dict['fragments']): fragment_filename = '%s-Frag%d' % (tmpfilename, frag_index) try: - src, _ = sanitize_open(fragment_filename, 'rb') + src, _ = self.sanitize_open(fragment_filename, 'rb') except IOError as err: if skip_unavailable_fragments and frag_index > 1: self.report_skip_fragment(frag_index, err) @@ -266,6 +265,7 @@ class Aria2cFD(ExternalFD): cmd += self._option('--all-proxy', 'proxy') cmd += self._bool_option('--check-certificate', 'nocheckcertificate', 'false', 'true', '=') cmd += self._bool_option('--remote-time', 
diff --git a/yt_dlp/downloader/dash.py b/yt_dlp/downloader/dash.py
index 4c23edd32..a845ee7d3 100644
--- a/yt_dlp/downloader/dash.py
+++ b/yt_dlp/downloader/dash.py
@@ -57,7 +57,7 @@ class DashSegmentsFD(FragmentFD):
 
     def _resolve_fragments(self, fragments, ctx):
         fragments = fragments(ctx) if callable(fragments) else fragments
-        return [next(fragments)] if self.params.get('test') else fragments
+        return [next(iter(fragments))] if self.params.get('test') else fragments
 
     def _get_fragments(self, fmt, ctx):
         fragment_base_url = fmt.get('fragment_base_url')
diff --git a/yt_dlp/downloader/external.py b/yt_dlp/downloader/external.py
index da69423f7..17be3c46f 100644
--- a/yt_dlp/downloader/external.py
+++ b/yt_dlp/downloader/external.py
@@ -22,7 +22,6 @@ from ..utils import (
     handle_youtubedl_headers,
     check_executable,
     Popen,
-    sanitize_open,
 )
 
 
@@ -144,11 +143,11 @@ class ExternalFD(FragmentFD):
             return -1
 
         decrypt_fragment = self.decrypter(info_dict)
-        dest, _ = sanitize_open(tmpfilename, 'wb')
+        dest, _ = self.sanitize_open(tmpfilename, 'wb')
         for frag_index, fragment in enumerate(info_dict['fragments']):
             fragment_filename = '%s-Frag%d' % (tmpfilename, frag_index)
             try:
-                src, _ = sanitize_open(fragment_filename, 'rb')
+                src, _ = self.sanitize_open(fragment_filename, 'rb')
             except IOError as err:
                 if skip_unavailable_fragments and frag_index > 1:
                     self.report_skip_fragment(frag_index, err)
@@ -266,6 +265,7 @@ class Aria2cFD(ExternalFD):
         cmd += self._option('--all-proxy', 'proxy')
         cmd += self._bool_option('--check-certificate', 'nocheckcertificate', 'false', 'true', '=')
         cmd += self._bool_option('--remote-time', 'updatetime', 'true', 'false', '=')
+        cmd += self._bool_option('--show-console-readout', 'noprogress', 'false', 'true', '=')
         cmd += self._configuration_args()
 
         # aria2c strips out spaces from the beginning/end of filenames and paths.
@@ -290,7 +290,7 @@ class Aria2cFD(ExternalFD):
             for frag_index, fragment in enumerate(info_dict['fragments']):
                 fragment_filename = '%s-Frag%d' % (os.path.basename(tmpfilename), frag_index)
                 url_list.append('%s\n\tout=%s' % (fragment['url'], fragment_filename))
-            stream, _ = sanitize_open(url_list_file, 'wb')
+            stream, _ = self.sanitize_open(url_list_file, 'wb')
             stream.write('\n'.join(url_list).encode('utf-8'))
             stream.close()
             cmd += ['-i', url_list_file]
diff --git a/yt_dlp/downloader/fragment.py b/yt_dlp/downloader/fragment.py
index 79c6561c7..d4f112b0f 100644
--- a/yt_dlp/downloader/fragment.py
+++ b/yt_dlp/downloader/fragment.py
@@ -24,7 +24,6 @@ from ..utils import (
     DownloadError,
     error_to_compat_str,
     encodeFilename,
-    sanitize_open,
     sanitized_Request,
 )
 
@@ -96,7 +95,7 @@ class FragmentFD(FileDownloader):
 
     def _read_ytdl_file(self, ctx):
         assert 'ytdl_corrupt' not in ctx
-        stream, _ = sanitize_open(self.ytdl_filename(ctx['filename']), 'r')
+        stream, _ = self.sanitize_open(self.ytdl_filename(ctx['filename']), 'r')
         try:
             ytdl_data = json.loads(stream.read())
             ctx['fragment_index'] = ytdl_data['downloader']['current_fragment']['index']
@@ -108,7 +107,7 @@ class FragmentFD(FileDownloader):
             stream.close()
 
     def _write_ytdl_file(self, ctx):
-        frag_index_stream, _ = sanitize_open(self.ytdl_filename(ctx['filename']), 'w')
+        frag_index_stream, _ = self.sanitize_open(self.ytdl_filename(ctx['filename']), 'w')
         try:
             downloader = {
                 'current_fragment': {
@@ -140,7 +139,7 @@ class FragmentFD(FileDownloader):
         return True, self._read_fragment(ctx)
 
     def _read_fragment(self, ctx):
-        down, frag_sanitized = sanitize_open(ctx['fragment_filename_sanitized'], 'rb')
+        down, frag_sanitized = self.sanitize_open(ctx['fragment_filename_sanitized'], 'rb')
         ctx['fragment_filename_sanitized'] = frag_sanitized
         frag_content = down.read()
         down.close()
@@ -216,7 +215,7 @@ class FragmentFD(FileDownloader):
             self._write_ytdl_file(ctx)
             assert ctx['fragment_index'] == 0
 
-        dest_stream, tmpfilename = sanitize_open(tmpfilename, open_mode)
+        dest_stream, tmpfilename = self.sanitize_open(tmpfilename, open_mode)
 
         ctx.update({
             'dl': dl,
@@ -434,6 +433,7 @@ class FragmentFD(FileDownloader):
 
         def download_fragment(fragment, ctx):
             frag_index = ctx['fragment_index'] = fragment['frag_index']
+            ctx['last_error'] = None
             if not interrupt_trigger[0]:
                 return False, frag_index
             headers = info_dict.get('http_headers', {}).copy()
@@ -456,6 +456,7 @@ class FragmentFD(FileDownloader):
                     # See https://github.com/ytdl-org/youtube-dl/issues/10165,
                     # https://github.com/ytdl-org/youtube-dl/issues/10448).
                     count += 1
+                    ctx['last_error'] = err
                     if count <= fragment_retries:
                         self.report_retry_fragment(err, frag_index, count, fragment_retries)
             except DownloadError:
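A note on the one-character `dash.py` fix above: with the new `http_dash_segments_generator` protocol, `fragments` may be a plain list as well as a generator, and `next()` alone only accepts iterators. A quick illustration:

    # Why next(iter(x)) instead of next(x):
    fragments_list = [{'frag_index': 1}, {'frag_index': 2}]
    fragments_gen = (f for f in fragments_list)

    next(fragments_gen)         # fine: a generator is already an iterator
    next(iter(fragments_list))  # fine: iter() adapts the list first
    # next(fragments_list)      # TypeError: 'list' object is not an iterator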
diff --git a/yt_dlp/downloader/http.py b/yt_dlp/downloader/http.py
index 6290884a8..34a1eb59b 100644
--- a/yt_dlp/downloader/http.py
+++ b/yt_dlp/downloader/http.py
@@ -16,7 +16,6 @@ from ..utils import (
     ContentTooShortError,
     encodeFilename,
     int_or_none,
-    sanitize_open,
     sanitized_Request,
     ThrottledDownload,
     write_xattr,
@@ -263,7 +262,7 @@ class HttpFD(FileDownloader):
             # Open destination file just in time
             if ctx.stream is None:
                 try:
-                    ctx.stream, ctx.tmpfilename = sanitize_open(
+                    ctx.stream, ctx.tmpfilename = self.sanitize_open(
                         ctx.tmpfilename, ctx.open_mode)
                     assert ctx.stream is not None
                     ctx.filename = self.undo_temp_name(ctx.tmpfilename)
diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py
index e019ec6a8..2cb01ff83 100644
--- a/yt_dlp/extractor/bilibili.py
+++ b/yt_dlp/extractor/bilibili.py
@@ -19,14 +19,15 @@ from ..utils import (
     parse_iso8601,
     traverse_obj,
     try_get,
+    parse_count,
     smuggle_url,
     srt_subtitles_timecode,
     str_or_none,
-    str_to_int,
     strip_jsonp,
     unified_timestamp,
     unsmuggle_url,
     urlencode_postdata,
+    url_or_none,
     OnDemandPagedList
 )
 
@@ -722,10 +723,10 @@ class BiliBiliPlayerIE(InfoExtractor):
 
 
 class BiliIntlBaseIE(InfoExtractor):
-    _API_URL = 'https://api.bili{}/intl/gateway{}'
+    _API_URL = 'https://api.bilibili.tv/intl/gateway'
 
-    def _call_api(self, type, endpoint, id):
-        return self._download_json(self._API_URL.format(type, endpoint), id)['data']
+    def _call_api(self, endpoint, *args, **kwargs):
+        return self._download_json(self._API_URL + endpoint, *args, **kwargs)['data']
 
     def json2srt(self, json):
         data = '\n\n'.join(
@@ -733,29 +734,40 @@ class BiliIntlBaseIE(InfoExtractor):
             for i, line in enumerate(json['body']))
         return data
 
-    def _get_subtitles(self, type, ep_id):
-        sub_json = self._call_api(type, f'/m/subtitle?ep_id={ep_id}&platform=web', ep_id)
+    def _get_subtitles(self, ep_id):
+        sub_json = self._call_api(f'/web/v2/subtitle?episode_id={ep_id}&platform=web', ep_id)
         subtitles = {}
-        for sub in sub_json.get('subtitles', []):
+        for sub in sub_json.get('subtitles') or []:
             sub_url = sub.get('url')
             if not sub_url:
                 continue
-            sub_data = self._download_json(sub_url, ep_id, fatal=False)
+            sub_data = self._download_json(
+                sub_url, ep_id, errnote='Unable to download subtitles', fatal=False,
+                note='Downloading subtitles%s' % f' for {sub["lang"]}' if sub.get('lang') else '')
             if not sub_data:
                 continue
-            subtitles.setdefault(sub.get('key', 'en'), []).append({
+            subtitles.setdefault(sub.get('lang_key', 'en'), []).append({
                 'ext': 'srt',
                 'data': self.json2srt(sub_data)
             })
         return subtitles
 
-    def _get_formats(self, type, ep_id):
-        video_json = self._call_api(type, f'/web/playurl?ep_id={ep_id}&platform=web', ep_id)
-        if not video_json:
-            self.raise_login_required(method='cookies')
+    def _get_formats(self, ep_id):
+        video_json = self._call_api(f'/web/playurl?ep_id={ep_id}&platform=web', ep_id,
+                                    note='Downloading video formats', errnote='Unable to download video formats')
+        if video_json.get('code'):
+            if video_json['code'] in (10004004, 10004005, 10023006):
+                self.raise_login_required(method='cookies')
+            elif video_json['code'] == 10004001:
+                self.raise_geo_restricted()
+            elif video_json.get('message') and str(video_json['code']) != video_json['message']:
+                raise ExtractorError(
+                    f'Unable to download video formats: {self.IE_NAME} said: {video_json["message"]}', expected=True)
+            else:
+                raise ExtractorError('Unable to download video formats')
         video_json = video_json['playurl']
         formats = []
-        for vid in video_json.get('video', []):
+        for vid in video_json.get('video') or []:
             video_res = vid.get('video_resource') or {}
             video_info = vid.get('stream_info') or {}
             if not video_res.get('url'):
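`json2srt`, shown only as context above, turns Bilibili's JSON subtitle body into SRT text. A sketch of the conversion, assuming each entry carries `from`/`to` timestamps in seconds and a `content` string (field names per the Bilibili intl subtitle JSON; the timecode helper mirrors `yt_dlp.utils.srt_subtitles_timecode`):

    def srt_timecode(seconds):
        return '%02d:%02d:%02d,%03d' % (
            seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000)

    def json2srt(sub_json):
        return '\n\n'.join(
            '%d\n%s --> %s\n%s' % (
                i + 1, srt_timecode(line['from']), srt_timecode(line['to']), line['content'])
            for i, line in enumerate(sub_json['body']))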
@@ -771,7 +783,7 @@ class BiliIntlBaseIE(InfoExtractor):
                 'vcodec': video_res.get('codecs'),
                 'filesize': video_res.get('size'),
             })
-        for aud in video_json.get('audio_resource', []):
+        for aud in video_json.get('audio_resource') or []:
             if not aud.get('url'):
                 continue
             formats.append({
@@ -786,85 +798,93 @@ class BiliIntlBaseIE(InfoExtractor):
         self._sort_formats(formats)
         return formats
 
-    def _extract_ep_info(self, type, episode_data, ep_id):
+    def _extract_ep_info(self, episode_data, ep_id):
         return {
             'id': ep_id,
-            'title': episode_data.get('long_title') or episode_data['title'],
+            'title': episode_data.get('title_display') or episode_data['title'],
             'thumbnail': episode_data.get('cover'),
-            'episode_number': str_to_int(episode_data.get('title')),
-            'formats': self._get_formats(type, ep_id),
-            'subtitles': self._get_subtitles(type, ep_id),
+            'episode_number': int_or_none(self._search_regex(
+                r'^E(\d+)(?:$| - )', episode_data.get('title_display'), 'episode number', default=None)),
+            'formats': self._get_formats(ep_id),
+            'subtitles': self._get_subtitles(ep_id),
             'extractor_key': BiliIntlIE.ie_key(),
         }
 
 
 class BiliIntlIE(BiliIntlBaseIE):
-    _VALID_URL = r'https?://(?:www\.)?bili(?P<type>bili\.tv|intl.com)/(?:[a-z]{2}/)?play/(?P<season_id>\d+)/(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-z]{2}/)?play/(?P<season_id>\d+)/(?P<id>\d+)'
     _TESTS = [{
         'url': 'https://www.bilibili.tv/en/play/34613/341736',
         'info_dict': {
             'id': '341736',
             'ext': 'mp4',
-            'title': 'The First Night',
-            'thumbnail': 'https://i0.hdslb.com/bfs/intl/management/91e30e5521235d9b163339a26a0b030ebda54310.png',
+            'title': 'E2 - The First Night',
+            'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$',
             'episode_number': 2,
-        },
-        'params': {
-            'format': 'bv',
-        },
+        }
     }, {
-        'url': 'https://www.biliintl.com/en/play/34613/341736',
+        'url': 'https://www.bilibili.tv/en/play/1033760/11005006',
         'info_dict': {
-            'id': '341736',
+            'id': '11005006',
             'ext': 'mp4',
-            'title': 'The First Night',
-            'thumbnail': 'https://i0.hdslb.com/bfs/intl/management/91e30e5521235d9b163339a26a0b030ebda54310.png',
-            'episode_number': 2,
-        },
-        'params': {
-            'format': 'bv',
-        },
+            'title': 'E3 - Who?',
+            'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$',
+            'episode_number': 3,
+        }
+    }, {
+        'url': 'https://www.biliintl.com/en/play/34613/341736',
+        'only_matching': True,
    }]
 
     def _real_extract(self, url):
-        type, season_id, id = self._match_valid_url(url).groups()
-        data_json = self._call_api(type, f'/web/view/ogv_collection?season_id={season_id}', id)
-        episode_data = next(
-            episode for episode in data_json.get('episodes', [])
-            if str(episode.get('ep_id')) == id)
-        return self._extract_ep_info(type, episode_data, id)
+        season_id, video_id = self._match_valid_url(url).groups()
+        webpage = self._download_webpage(url, video_id)
+        # Bstation layout
+        initial_data = self._parse_json(self._search_regex(
+            r'window\.__INITIAL_DATA__\s*=\s*({.+?});', webpage,
+            'preload state', default='{}'), video_id, fatal=False) or {}
+        episode_data = traverse_obj(initial_data, ('OgvVideo', 'epDetail'), expected_type=dict)
+
+        if not episode_data:
+            # Non-Bstation layout, read through episode list
+            season_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={season_id}&platform=web', video_id)
+            episode_data = next(
+                episode for episode in traverse_obj(season_json, ('sections', ..., 'episodes', ...), expected_type=dict)
+                if str(episode.get('episode_id')) == video_id)
+        return self._extract_ep_info(episode_data, video_id)
 
 
 class BiliIntlSeriesIE(BiliIntlBaseIE):
-    _VALID_URL = r'https?://(?:www\.)?bili(?P<type>bili\.tv|intl.com)/(?:[a-z]{2}/)?play/(?P<id>\d+)$'
+    _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-z]{2}/)?play/(?P<id>\d+)$'
     _TESTS = [{
         'url': 'https://www.bilibili.tv/en/play/34613',
         'playlist_mincount': 15,
         'info_dict': {
             'id': '34613',
+            'title': 'Fly Me to the Moon',
+            'description': 'md5:a861ee1c4dc0acfad85f557cc42ac627',
+            'categories': ['Romance', 'Comedy', 'Slice of life'],
+            'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$',
+            'view_count': int,
         },
         'params': {
             'skip_download': True,
-            'format': 'bv',
         },
     }, {
         'url': 'https://www.biliintl.com/en/play/34613',
-        'playlist_mincount': 15,
-        'info_dict': {
-            'id': '34613',
-        },
-        'params': {
-            'skip_download': True,
-            'format': 'bv',
-        },
+        'only_matching': True,
     }]
 
-    def _entries(self, id, type):
-        data_json = self._call_api(type, f'/web/view/ogv_collection?season_id={id}', id)
-        for episode in data_json.get('episodes', []):
-            episode_id = str(episode.get('ep_id'))
-            yield self._extract_ep_info(type, episode, episode_id)
+    def _entries(self, series_id):
+        series_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={series_id}&platform=web', series_id)
+        for episode in traverse_obj(series_json, ('sections', ..., 'episodes', ...), expected_type=dict, default=[]):
+            episode_id = str(episode.get('episode_id'))
+            yield self._extract_ep_info(episode, episode_id)
 
     def _real_extract(self, url):
-        type, id = self._match_valid_url(url).groups()
-        return self.playlist_result(self._entries(id, type), playlist_id=id)
+        series_id = self._match_id(url)
+        series_info = self._call_api(f'/web/v2/ogv/play/season_info?season_id={series_id}&platform=web', series_id).get('season') or {}
+        return self.playlist_result(
+            self._entries(series_id), series_id, series_info.get('title'), series_info.get('description'),
+            categories=traverse_obj(series_info, ('styles', ..., 'title'), expected_type=str_or_none),
+            thumbnail=url_or_none(series_info.get('horizontal_cover')), view_count=parse_count(series_info.get('view')))
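The `traverse_obj(season_json, ('sections', ..., 'episodes', ...))` calls above walk nested dicts and lists, with `...` fanning out over every element at that level. Roughly, on a simplified stand-in for the real API shape:

    from yt_dlp.utils import traverse_obj

    season_json = {'sections': [
        {'episodes': [{'episode_id': 341735}, {'episode_id': 341736}]},
        {'episodes': [{'episode_id': 341737}]},
    ]}
    traverse_obj(season_json, ('sections', ..., 'episodes', ...))
    # -> [{'episode_id': 341735}, {'episode_id': 341736}, {'episode_id': 341737}]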
diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py
index 392c77884..ac1272f7b 100644
--- a/yt_dlp/extractor/cbc.py
+++ b/yt_dlp/extractor/cbc.py
@@ -340,7 +340,8 @@ class CBCGemIE(InfoExtractor):
             yield {
                 **base_format,
                 'format_id': join_nonempty('sec', height),
-                'url': re.sub(r'(QualityLevels\()\d+(\))', fr'\1{bitrate}\2', base_url),
+                # Note: \g<1> is necessary instead of \1 since bitrate is a number
+                'url': re.sub(r'(QualityLevels\()\d+(\))', fr'\g<1>{bitrate}\2', base_url),
                 'width': int_or_none(video_quality.attrib.get('MaxWidth')),
                 'tbr': bitrate / 1000.0,
                 'height': height,
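The `\g<1>` note deserves a concrete illustration: with a numeric replacement interpolated by the f-string, `\1` fuses with the following digits into a different (and here invalid) group reference.

    import re

    base_url = 'https://example.invalid/QualityLevels(128000)/Manifest'  # illustrative URL
    bitrate = 512000

    re.sub(r'(QualityLevels\()\d+(\))', rf'\g<1>{bitrate}\2', base_url)
    # -> 'https://example.invalid/QualityLevels(512000)/Manifest'

    # rf'\1{bitrate}\2' would instead contain '\1512000', which re reads as a
    # reference to group 15 followed by literal '12000' and rejects with
    # "invalid group reference", since the pattern only has two groups.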
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index 9abbaf04f..3260399cb 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -616,7 +616,7 @@ class InfoExtractor(object):
                 kwargs = {
                     'video_id': e.video_id or self.get_temp_id(url),
                     'ie': self.IE_NAME,
-                    'tb': e.traceback,
+                    'tb': e.traceback or sys.exc_info()[2],
                     'expected': e.expected,
                     'cause': e.cause
                 }
@@ -1574,7 +1574,7 @@ class InfoExtractor(object):
         'vcodec': {'type': 'ordered', 'regex': True,
                    'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
         'acodec': {'type': 'ordered', 'regex': True,
-                   'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
+                   'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
         'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
         'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
@@ -2332,7 +2332,7 @@ class InfoExtractor(object):
 
         if smil is False:
             assert not fatal
-            return []
+            return [], {}
 
         namespace = self._parse_smil_namespace(smil)
 
@@ -3663,7 +3663,7 @@ class InfoExtractor(object):
             else 'public' if all_known
             else None)
 
-    def _configuration_arg(self, key, default=NO_DEFAULT, casesense=False):
+    def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
         '''
         @returns            A list of values for the extractor argument given by "key"
                             or "default" if no such key is present
@@ -3671,7 +3671,7 @@ class InfoExtractor(object):
         @param casesense    When false, the values are converted to lower case
         '''
         val = traverse_obj(
-            self._downloader.params, ('extractor_args', self.ie_key().lower(), key))
+            self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key))
         if val is None:
             return [] if default is NO_DEFAULT else default
         return list(val) if casesense else [x.lower() for x in val]
diff --git a/yt_dlp/extractor/dplay.py b/yt_dlp/extractor/dplay.py
index 51e1f8f3c..e1f5e9dc8 100644
--- a/yt_dlp/extractor/dplay.py
+++ b/yt_dlp/extractor/dplay.py
@@ -348,7 +348,7 @@ class HGTVDeIE(DPlayBaseIE):
 
 
 class DiscoveryPlusIE(DPlayBaseIE):
-    _VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/(?:\w{2}/)?video' + DPlayBaseIE._PATH_REGEX
+    _VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/(?!it/)(?:\w{2}/)?video' + DPlayBaseIE._PATH_REGEX
     _TESTS = [{
         'url': 'https://www.discoveryplus.com/video/property-brothers-forever-home/food-and-family',
         'info_dict': {
@@ -575,6 +575,18 @@ class DiscoveryPlusShowBaseIE(DPlayBaseIE):
         return self.playlist_result(self._entries(show_name), playlist_id=show_name)
 
 
+class DiscoveryPlusItalyIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/it/video' + DPlayBaseIE._PATH_REGEX
+    _TESTS = [{
+        'url': 'https://www.discoveryplus.com/it/video/i-signori-della-neve/stagione-2-episodio-1-i-preparativi',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        return self.url_result(f'https://discoveryplus.it/video/{video_id}', DPlayIE.ie_key(), video_id)
+
+
 class DiscoveryPlusItalyShowIE(DiscoveryPlusShowBaseIE):
     _VALID_URL = r'https?://(?:www\.)?discoveryplus\.it/programmi/(?P<show_name>[^/]+)/?(?:[?#]|$)'
     _TESTS = [{
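The `(?!it/)` added to `DiscoveryPlusIE._VALID_URL` is a negative lookahead that cedes the Italian site to the new `DiscoveryPlusItalyIE`. A quick check with simplified patterns (paths are illustrative):

    import re

    generic = re.compile(r'https?://(?:www\.)?discoveryplus\.com/(?!it/)(?:\w{2}/)?video/')
    italy = re.compile(r'https?://(?:www\.)?discoveryplus\.com/it/video/')

    assert generic.match('https://www.discoveryplus.com/de/video/show/episode')
    assert not generic.match('https://www.discoveryplus.com/it/video/show/episode')
    assert italy.match('https://www.discoveryplus.com/it/video/show/episode')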
diff --git a/yt_dlp/extractor/drooble.py b/yt_dlp/extractor/drooble.py
new file mode 100644
index 000000000..058425095
--- /dev/null
+++ b/yt_dlp/extractor/drooble.py
@@ -0,0 +1,116 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    try_get,
+)
+
+
+class DroobleIE(InfoExtractor):
+    _VALID_URL = r'''(?x)https?://drooble\.com/(?:
+        (?:(?P<user>[^/]+)/)?(?P<kind>song|videos|music/albums)/(?P<id>\d+)|
+        (?P<user_2>[^/]+)/(?P<kind_2>videos|music))
+    '''
+    _TESTS = [{
+        'url': 'https://drooble.com/song/2858030',
+        'md5': '5ffda90f61c7c318dc0c3df4179eb064',
+        'info_dict': {
+            'id': '2858030',
+            'ext': 'mp3',
+            'title': 'Skankocillin',
+            'upload_date': '20200801',
+            'timestamp': 1596241390,
+            'uploader_id': '95894',
+            'uploader': 'Bluebeat Shelter',
+        }
+    }, {
+        'url': 'https://drooble.com/karl340758/videos/2859183',
+        'info_dict': {
+            'id': 'J6QCQY_I5Tk',
+            'ext': 'mp4',
+            'title': 'Skankocillin',
+            'uploader_id': 'UCrSRoI5vVyeYihtWEYua7rg',
+            'description': 'md5:ffc0bd8ba383db5341a86a6cd7d9bcca',
+            'upload_date': '20200731',
+            'uploader': 'Bluebeat Shelter',
+        }
+    }, {
+        'url': 'https://drooble.com/karl340758/music/albums/2858031',
+        'info_dict': {
+            'id': '2858031',
+        },
+        'playlist_mincount': 8,
+    }, {
+        'url': 'https://drooble.com/karl340758/music',
+        'info_dict': {
+            'id': 'karl340758',
+        },
+        'playlist_mincount': 8,
+    }, {
+        'url': 'https://drooble.com/karl340758/videos',
+        'info_dict': {
+            'id': 'karl340758',
+        },
+        'playlist_mincount': 8,
+    }]
+
+    def _call_api(self, method, video_id, data=None):
+        response = self._download_json(
+            f'https://drooble.com/api/dt/{method}', video_id, data=json.dumps(data).encode())
+        if not response[0]:
+            raise ExtractorError('Unable to download JSON metadata')
+        return response[1]
+
+    def _real_extract(self, url):
+        mobj = self._match_valid_url(url)
+        user = mobj.group('user') or mobj.group('user_2')
+        kind = mobj.group('kind') or mobj.group('kind_2')
+        display_id = mobj.group('id') or user
+
+        if mobj.group('kind_2') == 'videos':
+            data = {'from_user': display_id, 'album': -1, 'limit': 18, 'offset': 0, 'order': 'new2old', 'type': 'video'}
+        elif kind in ('music/albums', 'music'):
+            data = {'user': user, 'public_only': True, 'individual_limit': {'singles': 1, 'albums': 1, 'playlists': 1}}
+        else:
+            data = {'url_slug': display_id, 'children': 10, 'order': 'old2new'}
+
+        method = 'getMusicOverview' if kind in ('music/albums', 'music') else 'getElements'
+        json_data = self._call_api(method, display_id, data=data)
+        if kind in ('music/albums', 'music'):
+            json_data = json_data['singles']['list']
+
+        entities = []
+        for media in json_data:
+            url = media.get('external_media_url') or media.get('link')
+            if url.startswith('https://www.youtube.com'):
+                entities.append({
+                    '_type': 'url',
+                    'url': url,
+                    'ie_key': 'Youtube'
+                })
+                continue
+            is_audio = (media.get('type') or '').lower() == 'audio'
+            entities.append({
+                'url': url,
+                'id': media['id'],
+                'title': media['title'],
+                'duration': int_or_none(media.get('duration')),
+                'timestamp': int_or_none(media.get('timestamp')),
+                'album': try_get(media, lambda x: x['album']['title']),
+                'uploader': try_get(media, lambda x: x['creator']['display_name']),
+                'uploader_id': try_get(media, lambda x: x['creator']['id']),
+                'thumbnail': media.get('image_comment'),
+                'like_count': int_or_none(media.get('likes')),
+                'vcodec': 'none' if is_audio else None,
+                'ext': 'mp3' if is_audio else None,
+            })
+
+        if len(entities) > 1:
+            return self.playlist_result(entities, display_id)
+
+        return entities[0]
diff --git a/yt_dlp/extractor/dropout.py b/yt_dlp/extractor/dropout.py
new file mode 100644
index 000000000..a7442d8f0
--- /dev/null
+++ b/yt_dlp/extractor/dropout.py
@@ -0,0 +1,212 @@
+# coding: utf-8
+from .common import InfoExtractor
+from .vimeo import VHXEmbedIE
+from ..utils import (
+    clean_html,
+    ExtractorError,
+    get_element_by_class,
+    get_element_by_id,
+    get_elements_by_class,
+    int_or_none,
+    join_nonempty,
+    unified_strdate,
+    urlencode_postdata,
+)
+
+
+class DropoutIE(InfoExtractor):
+    _LOGIN_URL = 'https://www.dropout.tv/login'
+    _NETRC_MACHINE = 'dropout'
+
+    _VALID_URL = r'https?://(?:www\.)?dropout\.tv/(?:[^/]+/)*videos/(?P<id>[^/]+)/?$'
+    _TESTS = [
+        {
+            'url': 'https://www.dropout.tv/game-changer/season:2/videos/yes-or-no',
+            'note': 'Episode in a series',
+            'md5': '5e000fdfd8d8fa46ff40456f1c2af04a',
+            'info_dict': {
+                'id': '738153',
+                'display_id': 'yes-or-no',
+                'ext': 'mp4',
+                'title': 'Yes or No',
+                'description': 'Ally, Brennan, and Zac are asked a simple question, but is there a correct answer?',
+                'release_date': '20200508',
+                'thumbnail': 'https://vhx.imgix.net/chuncensoredstaging/assets/351e3f24-c4a3-459a-8b79-dc80f1e5b7fd.jpg',
+                'series': 'Game Changer',
+                'season_number': 2,
+                'season': 'Season 2',
+                'episode_number': 6,
+                'episode': 'Yes or No',
+                'duration': 1180,
+                'uploader_id': 'user80538407',
+                'uploader_url': 'https://vimeo.com/user80538407',
+                'uploader': 'OTT Videos'
+            },
+            'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest']
+        },
+        {
+            'url': 'https://www.dropout.tv/dimension-20-fantasy-high/season:1/videos/episode-1',
+            'note': 'Episode in a series (missing release_date)',
+            'md5': '712caf7c191f1c47c8f1879520c2fa5c',
+            'info_dict': {
+                'id': '320562',
+                'display_id': 'episode-1',
+                'ext': 'mp4',
+                'title': 'The Beginning Begins',
+                'description': 'The cast introduces their PCs, including a neurotic elf, a goblin PI, and a corn-worshipping cleric.',
+                'thumbnail': 'https://vhx.imgix.net/chuncensoredstaging/assets/4421ed0d-f630-4c88-9004-5251b2b8adfa.jpg',
+                'series': 'Dimension 20: Fantasy High',
+                'season_number': 1,
+                'season': 'Season 1',
+                'episode_number': 1,
+                'episode': 'The Beginning Begins',
+                'duration': 6838,
+                'uploader_id': 'user80538407',
+                'uploader_url': 'https://vimeo.com/user80538407',
+                'uploader': 'OTT Videos'
+            },
+            'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest']
+        },
+        {
+            'url': 'https://www.dropout.tv/videos/misfits-magic-holiday-special',
+            'note': 'Episode not in a series',
+            'md5': 'c30fa18999c5880d156339f13c953a26',
+            'info_dict': {
+                'id': '1915774',
+                'display_id': 'misfits-magic-holiday-special',
+                'ext': 'mp4',
+                'title': 'Misfits & Magic Holiday Special',
+                'description': 'The magical misfits spend Christmas break at Gowpenny, with an unwelcome visitor.',
+                'release_date': '20211215',
+                'thumbnail': 'https://vhx.imgix.net/chuncensoredstaging/assets/d91ea8a6-b250-42ed-907e-b30fb1c65176-8e24b8e5.jpg',
+                'duration': 11698,
+                'uploader_id': 'user80538407',
+                'uploader_url': 'https://vimeo.com/user80538407',
+                'uploader': 'OTT Videos'
+            },
+            'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest']
+        }
+    ]
+
+    def _get_authenticity_token(self, display_id):
+        signin_page = self._download_webpage(
+            self._LOGIN_URL, display_id, note='Getting authenticity token')
+        return self._html_search_regex(
+            r'name=["\']authenticity_token["\'] value=["\'](.+?)["\']',
+            signin_page, 'authenticity_token')
+
+    def _login(self, display_id):
+        username, password = self._get_login_info()
+        if not (username and password):
+            self.raise_login_required(method='password')
+
+        response = self._download_webpage(
+            self._LOGIN_URL, display_id, note='Logging in', data=urlencode_postdata({
+                'email': username,
+                'password': password,
+                'authenticity_token': self._get_authenticity_token(display_id),
+                'utf8': True
+            }))
+
+        user_has_subscription = self._search_regex(
+            r'user_has_subscription:\s*["\'](.+?)["\']', response, 'subscription status', default='none')
+        if user_has_subscription.lower() == 'true':
+            return response
+        elif user_has_subscription.lower() == 'false':
+            raise ExtractorError('Account is not subscribed')
+        else:
+            raise ExtractorError('Incorrect username/password')
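Since `_NETRC_MACHINE = 'dropout'` is set above, credentials can also come from a `.netrc` file (enabled with `--netrc`) instead of `--username`/`--password`. A sample entry, with placeholder credentials:

    machine dropout login you@example.com password hunter2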
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        try:
+            self._login(display_id)
+            webpage = self._download_webpage(url, display_id, note='Downloading video webpage')
+        finally:
+            self._download_webpage('https://www.dropout.tv/logout', display_id, note='Logging out')
+
+        embed_url = self._search_regex(r'embed_url:\s*["\'](.+?)["\']', webpage, 'embed url')
+        thumbnail = self._og_search_thumbnail(webpage)
+        watch_info = get_element_by_id('watch-info', webpage) or ''
+
+        title = clean_html(get_element_by_class('video-title', watch_info))
+        season_episode = get_element_by_class(
+            'site-font-secondary-color', get_element_by_class('text', watch_info))
+        episode_number = int_or_none(self._search_regex(
+            r'Episode (\d+)', season_episode or '', 'episode', default=None))
+
+        return {
+            '_type': 'url_transparent',
+            'ie_key': VHXEmbedIE.ie_key(),
+            'url': embed_url,
+            'id': self._search_regex(r'embed.vhx.tv/videos/(.+?)\?', embed_url, 'id'),
+            'display_id': display_id,
+            'title': title,
+            'description': self._html_search_meta('description', webpage, fatal=False),
+            'thumbnail': thumbnail.split('?')[0] if thumbnail else None,  # Ignore crop/downscale
+            'series': clean_html(get_element_by_class('series-title', watch_info)),
+            'episode_number': episode_number,
+            'episode': title if episode_number else None,
+            'season_number': int_or_none(self._search_regex(
+                r'Season (\d+),', season_episode or '', 'season', default=None)),
+            'release_date': unified_strdate(self._search_regex(
+                r'data-meta-field-name=["\']release_dates["\'] data-meta-field-value=["\'](.+?)["\']',
+                watch_info, 'release date', default=None)),
+        }
+
+
+class DropoutSeasonIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?dropout\.tv/(?P<id>[^\/$&?#]+)(?:/?$|/season:[0-9]+/?$)'
+    _TESTS = [
+        {
+            'url': 'https://www.dropout.tv/dimension-20-fantasy-high/season:1',
+            'note': 'Multi-season series with the season in the url',
+            'playlist_count': 17,
+            'info_dict': {
+                'id': 'dimension-20-fantasy-high-season-1',
+                'title': 'Dimension 20 Fantasy High - Season 1'
+            }
+        },
+        {
+            'url': 'https://www.dropout.tv/dimension-20-fantasy-high',
+            'note': 'Multi-season series with the season not in the url',
+            'playlist_count': 17,
+            'info_dict': {
+                'id': 'dimension-20-fantasy-high-season-1',
+                'title': 'Dimension 20 Fantasy High - Season 1'
+            }
+        },
+        {
+            'url': 'https://www.dropout.tv/dimension-20-shriek-week',
+            'note': 'Single-season series',
+            'playlist_count': 4,
+            'info_dict': {
+                'id': 'dimension-20-shriek-week-season-1',
+                'title': 'Dimension 20 Shriek Week - Season 1'
+            }
+        }
+    ]
+
+    def _real_extract(self, url):
+        season_id = self._match_id(url)
+        season_title = season_id.replace('-', ' ').title()
+        webpage = self._download_webpage(url, season_id)
+
+        entries = [
+            self.url_result(
+                url=self._search_regex(r'<a href=["\'](.+?)["\'] class=["\']browse-item-link["\']',
+                                       item, 'item_url'),
+                ie=DropoutIE.ie_key()
+            ) for item in get_elements_by_class('js-collection-item', webpage)
+        ]
+
+        seasons = (get_element_by_class('select-dropdown-wrapper', webpage) or '').strip().replace('\n', '')
+        current_season = self._search_regex(r'<option[^>]+selected>([^<]+)</option>',
+                                            seasons, 'current_season', default='').strip()
+
+        return {
+            '_type': 'playlist',
+            'id': join_nonempty(season_id, current_season.lower().replace(' ', '-')),
+            'title': join_nonempty(season_title, current_season, delim=' - '),
+            'entries': entries
+        }
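`join_nonempty` (from `yt_dlp.utils`) is what builds the playlist id and title above without a dangling delimiter when the season dropdown is absent; roughly:

    from yt_dlp.utils import join_nonempty

    # delim defaults to '-'
    join_nonempty('dimension-20-shriek-week', 'season-1')               # -> 'dimension-20-shriek-week-season-1'
    join_nonempty('Dimension 20 Shriek Week', 'Season 1', delim=' - ')  # -> 'Dimension 20 Shriek Week - Season 1'
    join_nonempty('Dimension 20 Shriek Week', '', delim=' - ')          # -> 'Dimension 20 Shriek Week' (empty values are skipped)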
diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py
index 0b359a253..1b32efc47 100644
--- a/yt_dlp/extractor/extractors.py
+++ b/yt_dlp/extractor/extractors.py
@@ -357,6 +357,7 @@ from .dplay import (
     AnimalPlanetIE,
     DiscoveryPlusIndiaIE,
     DiscoveryNetworksDeIE,
+    DiscoveryPlusItalyIE,
     DiscoveryPlusItalyShowIE,
     DiscoveryPlusIndiaShowIE,
 )
@@ -385,6 +386,10 @@ from .disney import DisneyIE
 from .dispeak import DigitallySpeakingIE
 from .doodstream import DoodStreamIE
 from .dropbox import DropboxIE
+from .dropout import (
+    DropoutSeasonIE,
+    DropoutIE
+)
 from .dw import (
     DWIE,
     DWArticleIE,
@@ -507,6 +512,14 @@ from .gab import (
 )
 from .gaia import GaiaIE
 from .gameinformer import GameInformerIE
+from .gamejolt import (
+    GameJoltIE,
+    GameJoltUserIE,
+    GameJoltGameIE,
+    GameJoltGameSoundtrackIE,
+    GameJoltCommunityIE,
+    GameJoltSearchIE,
+)
 from .gamespot import GameSpotIE
 from .gamestar import GameStarIE
 from .gaskrank import GaskrankIE
@@ -608,6 +621,7 @@ from .instagram import (
     InstagramIOSIE,
     InstagramUserIE,
     InstagramTagIE,
+    InstagramStoryIE,
 )
 from .internazionale import InternazionaleIE
 from .internetvideoarchive import InternetVideoArchiveIE
@@ -1036,6 +1050,10 @@ from .ooyala import (
     OoyalaIE,
     OoyalaExternalIE,
 )
+from .opencast import (
+    OpencastIE,
+    OpencastPlaylistIE,
+)
 from .openrec import (
     OpenRecIE,
     OpenRecCaptureIE,
@@ -1109,6 +1127,10 @@ from .pinterest import (
     PinterestIE,
     PinterestCollectionIE,
 )
+from .pixivsketch import (
+    PixivSketchIE,
+    PixivSketchUserIE,
+)
 from .pladform import PladformIE
 from .planetmarathi import PlanetMarathiIE
 from .platzi import (
@@ -1517,6 +1539,9 @@ from .threeqsdn import ThreeQSDNIE
 from .tiktok import (
     TikTokIE,
     TikTokUserIE,
+    TikTokSoundIE,
+    TikTokEffectIE,
+    TikTokTagIE,
     DouyinIE,
 )
 from .tinypic import TinyPicIE
@@ -1667,6 +1692,7 @@ from .dlive import (
     DLiveVODIE,
     DLiveStreamIE,
 )
+from .drooble import DroobleIE
 from .umg import UMGDeIE
 from .unistra import UnistraIE
 from .unity import UnityIE
diff --git a/yt_dlp/extractor/fancode.py b/yt_dlp/extractor/fancode.py
index f6733b124..978df31ff 100644
--- a/yt_dlp/extractor/fancode.py
+++ b/yt_dlp/extractor/fancode.py
@@ -41,7 +41,7 @@ class FancodeVodIE(InfoExtractor):
 
     _ACCESS_TOKEN = None
     _NETRC_MACHINE = 'fancode'
-    _LOGIN_HINT = 'Use "--user refresh --password <refresh_token>" to login using a refresh token'
+    _LOGIN_HINT = 'Use "--username refresh --password <refresh_token>" to login using a refresh token'
 
     headers = {
         'content-type': 'application/json',
diff --git a/yt_dlp/extractor/gamejolt.py b/yt_dlp/extractor/gamejolt.py
new file mode 100644
index 000000000..7f2f6f3e1
--- /dev/null
+++ b/yt_dlp/extractor/gamejolt.py
@@ -0,0 +1,540 @@
+# coding: utf-8
+import itertools
+import json
+import math
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    str_or_none,
+    traverse_obj,
+    try_get
+)
+
+
+class GameJoltBaseIE(InfoExtractor):
+    _API_BASE = 'https://gamejolt.com/site-api/'
+
+    def _call_api(self, endpoint, *args, **kwargs):
+        kwargs.setdefault('headers', {}).update({'Accept': 'image/webp,*/*'})
+        return self._download_json(self._API_BASE + endpoint, *args, **kwargs)['payload']
+
+    def _parse_content_as_text(self, content):
+        outer_contents, joined_contents = content.get('content') or [], []
+        for outer_content in outer_contents:
+            if outer_content.get('type') != 'paragraph':
+                joined_contents.append(self._parse_content_as_text(outer_content))
+                continue
+            inner_contents, inner_content_text = outer_content.get('content') or [], ''
+            for inner_content in inner_contents:
+                if inner_content.get('text'):
+                    inner_content_text += inner_content['text']
+                elif inner_content.get('type') == 'hardBreak':
+                    inner_content_text += '\n'
+            joined_contents.append(inner_content_text)
+
+        return '\n'.join(joined_contents)
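A quick input/output illustration of the recursive walk above, on a minimal lead-content payload (the nested `content` layout mirrors the ProseMirror-style JSON Game Jolt serves; the sample values are made up):

    lead_content = {'content': [
        {'type': 'paragraph', 'content': [
            {'text': 'Introducing Ramses Jackson'},
            {'type': 'hardBreak'},
            {'text': '#fnfmod'},
        ]},
    ]}
    # self._parse_content_as_text(lead_content)
    # -> 'Introducing Ramses Jackson\n#fnfmod'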
+    def _get_comments(self, post_num_id, post_hash_id):
+        sort_by, scroll_id = self._configuration_arg('comment_sort', ['hot'], ie_key=GameJoltIE.ie_key())[0], -1
+        is_scrolled = sort_by in ('new', 'you')
+        for page in itertools.count(1):
+            comments_data = self._call_api(
+                'comments/Fireside_Post/%s/%s?%s=%d' % (
+                    post_num_id, sort_by,
+                    'scroll_id' if is_scrolled else 'page', scroll_id if is_scrolled else page),
+                post_hash_id, note='Downloading comments list page %d' % page)
+            if not comments_data.get('comments'):
+                break
+            for comment in traverse_obj(comments_data, (('comments', 'childComments'), ...), expected_type=dict, default=[]):
+                yield {
+                    'id': comment['id'],
+                    'text': self._parse_content_as_text(
+                        self._parse_json(comment['comment_content'], post_hash_id)),
+                    'timestamp': int_or_none(comment.get('posted_on'), scale=1000),
+                    'like_count': comment.get('votes'),
+                    'author': traverse_obj(comment, ('user', ('display_name', 'name')), expected_type=str_or_none, get_all=False),
+                    'author_id': traverse_obj(comment, ('user', 'username'), expected_type=str_or_none),
+                    'author_thumbnail': traverse_obj(comment, ('user', 'image_avatar'), expected_type=str_or_none),
+                    'parent': comment.get('parent_id') or None,
+                }
+            scroll_id = int_or_none(comments_data['comments'][-1].get('posted_on'))
+
+    def _parse_post(self, post_data):
+        post_id = post_data['hash']
+        lead_content = self._parse_json(post_data.get('lead_content') or '{}', post_id, fatal=False) or {}
+        description, full_description = post_data.get('leadStr') or self._parse_content_as_text(
+            self._parse_json(post_data.get('lead_content'), post_id)), None
+        if post_data.get('has_article'):
+            article_content = self._parse_json(
+                post_data.get('article_content')
+                or self._call_api(f'web/posts/article/{post_data.get("id", post_id)}', post_id,
+                                  note='Downloading article metadata', errnote='Unable to download article metadata', fatal=False).get('article'),
+                post_id, fatal=False)
+            full_description = self._parse_content_as_text(article_content)
+
+        user_data = post_data.get('user') or {}
+        info_dict = {
+            'extractor_key': GameJoltIE.ie_key(),
+            'extractor': 'GameJolt',
+            'webpage_url': str_or_none(post_data.get('url')) or f'https://gamejolt.com/p/{post_id}',
+            'id': post_id,
+            'title': description,
+            'description': full_description or description,
+            'display_id': post_data.get('slug'),
+            'uploader': user_data.get('display_name') or user_data.get('name'),
+            'uploader_id': user_data.get('username'),
+            'uploader_url': 'https://gamejolt.com' + user_data['url'] if user_data.get('url') else None,
+            'categories': [try_get(category, lambda x: '%s - %s' % (x['community']['name'], x['channel'].get('display_title') or x['channel']['title']))
+                           for category in post_data.get('communities') or []],
+            'tags': traverse_obj(
+                lead_content, ('content', ..., 'content', ..., 'marks', ..., 'attrs', 'tag'), expected_type=str_or_none),
+            'like_count': int_or_none(post_data.get('like_count')),
+            'comment_count': int_or_none(post_data.get('comment_count'), default=0),
+            'timestamp': int_or_none(post_data.get('added_on'), scale=1000),
+            'release_timestamp': int_or_none(post_data.get('published_on'), scale=1000),
+            '__post_extractor': self.extract_comments(post_data.get('id'), post_id)
+        }
+
+        # TODO: Handle multiple videos/embeds?
+        video_data = traverse_obj(post_data, ('videos', ...), expected_type=dict, get_all=False) or {}
+        formats, subtitles, thumbnails = [], {}, []
+        for media in video_data.get('media') or []:
+            media_url, mimetype, ext, media_id = media['img_url'], media.get('filetype', ''), determine_ext(media['img_url']), media.get('type')
+            if mimetype == 'application/vnd.apple.mpegurl' or ext == 'm3u8':
+                hls_formats, hls_subs = self._extract_m3u8_formats_and_subtitles(media_url, post_id, 'mp4', m3u8_id=media_id)
+                formats.extend(hls_formats)
+                subtitles.update(hls_subs)
+            elif mimetype == 'application/dash+xml' or ext == 'mpd':
+                dash_formats, dash_subs = self._extract_mpd_formats_and_subtitles(media_url, post_id, mpd_id=media_id)
+                formats.extend(dash_formats)
+                subtitles.update(dash_subs)
+            elif 'image' in mimetype:
+                thumbnails.append({
+                    'id': media_id,
+                    'url': media_url,
+                    'width': media.get('width'),
+                    'height': media.get('height'),
+                    'filesize': media.get('filesize'),
+                })
+            else:
+                formats.append({
+                    'format_id': media_id,
+                    'url': media_url,
+                    'width': media.get('width'),
+                    'height': media.get('height'),
+                    'filesize': media.get('filesize'),
+                    'acodec': 'none' if 'video-card' in media_url else None,
+                })
+
+        if formats:
+            return {
+                **info_dict,
+                'formats': formats,
+                'subtitles': subtitles,
+                'thumbnails': thumbnails,
+                'view_count': int_or_none(video_data.get('view_count')),
+            }
+
+        gif_entries = []
+        for media in post_data.get('media', []):
+            if determine_ext(media['img_url']) != 'gif' or 'gif' not in media.get('filetype', ''):
+                continue
+            gif_entries.append({
+                'id': media['hash'],
+                'title': media['filename'].split('.')[0],
+                'formats': [{
+                    'format_id': url_key,
+                    'url': media[url_key],
+                    'width': media.get('width') if url_key == 'img_url' else None,
+                    'height': media.get('height') if url_key == 'img_url' else None,
+                    'filesize': media.get('filesize') if url_key == 'img_url' else None,
+                    'acodec': 'none',
+                } for url_key in ('img_url', 'mediaserver_url', 'mediaserver_url_mp4', 'mediaserver_url_webm') if media.get(url_key)]
+            })
+        if gif_entries:
+            return {
+                '_type': 'playlist',
+                **info_dict,
+                'entries': gif_entries,
+            }
+
+        embed_url = traverse_obj(post_data, ('embeds', ..., 'url'), expected_type=str_or_none, get_all=False)
+        if embed_url:
+            return self.url_result(embed_url)
+        return info_dict
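Both `added_on` and `published_on` arrive from the API as epoch milliseconds, hence the `scale=1000` conversions above:

    from yt_dlp.utils import int_or_none

    int_or_none('1633499590318', scale=1000)  # -> 1633499590 (milliseconds -> seconds)
    int_or_none(None, scale=1000)             # -> None rather than raising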
+class GameJoltIE(GameJoltBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?gamejolt\.com/p/(?:[\w-]*-)?(?P<id>\w{8})'
+    _TESTS = [{
+        # No audio
+        'url': 'https://gamejolt.com/p/introducing-ramses-jackson-some-fnf-himbo-i-ve-been-animating-fo-c6achnzu',
+        'md5': 'cd5f733258f6678b0ce500dd88166d86',
+        'info_dict': {
+            'id': 'c6achnzu',
+            'ext': 'mp4',
+            'display_id': 'introducing-ramses-jackson-some-fnf-himbo-i-ve-been-animating-fo-c6achnzu',
+            'title': 'Introducing Ramses Jackson, some FNF himbo I’ve been animating for the past few days, hehe.\n#fnfmod #fridaynightfunkin',
+            'description': 'Introducing Ramses Jackson, some FNF himbo I’ve been animating for the past few days, hehe.\n#fnfmod #fridaynightfunkin',
+            'uploader': 'Jakeneutron',
+            'uploader_id': 'Jakeneutron',
+            'uploader_url': 'https://gamejolt.com/@Jakeneutron',
+            'categories': ['Friday Night Funkin\' - Videos'],
+            'tags': ['fnfmod', 'fridaynightfunkin'],
+            'timestamp': 1633499590,
+            'upload_date': '20211006',
+            'release_timestamp': 1633499655,
+            'release_date': '20211006',
+            'thumbnail': 're:^https?://.+wgch9mhq.png$',
+            'like_count': int,
+            'comment_count': int,
+            'view_count': int,
+        }
+    }, {
+        # YouTube embed
+        'url': 'https://gamejolt.com/p/hey-hey-if-there-s-anyone-who-s-looking-to-get-into-learning-a-n6g4jzpq',
+        'md5': '79a931ff500a5c783ef6c3bda3272e32',
+        'info_dict': {
+            'id': 'XsNA_mzC0q4',
+            'title': 'Adobe Animate CC 2021 Tutorial || Part 1 - The Basics',
+            'description': 'md5:9d1ab9e2625b3fe1f42b2a44c67fdd13',
+            'uploader': 'Jakeneutron',
+            'uploader_id': 'Jakeneutron',
+            'uploader_url': 'http://www.youtube.com/user/Jakeneutron',
+            'ext': 'mp4',
+            'duration': 1749,
+            'tags': ['Adobe Animate CC', 'Tutorial', 'Animation', 'The Basics', 'For Beginners'],
+            'like_count': int,
+            'playable_in_embed': True,
+            'categories': ['Education'],
+            'availability': 'public',
+            'thumbnail': 'https://i.ytimg.com/vi_webp/XsNA_mzC0q4/maxresdefault.webp',
+            'age_limit': 0,
+            'live_status': 'not_live',
+            'channel_url': 'https://www.youtube.com/channel/UC6_L7fnczNalFZyBthUE9oA',
+            'channel': 'Jakeneutron',
+            'channel_id': 'UC6_L7fnczNalFZyBthUE9oA',
+            'upload_date': '20211015',
+            'view_count': int,
+            'chapters': 'count:18',
+        }
+    }, {
+        # Article
+        'url': 'https://gamejolt.com/p/i-fuckin-broke-chaos-d56h3eue',
+        'md5': '786c1ccf98fde02c03a2768acb4258d0',
+        'info_dict': {
+            'id': 'd56h3eue',
+            'ext': 'mp4',
+            'display_id': 'i-fuckin-broke-chaos-d56h3eue',
+            'title': 'I fuckin broke Chaos.',
+            'description': 'I moved my tab durning the cutscene so now it\'s stuck like this.',
+            'uploader': 'Jeff____________',
+            'uploader_id': 'The_Nyesh_Man',
+            'uploader_url': 'https://gamejolt.com/@The_Nyesh_Man',
+            'categories': ['Friday Night Funkin\' - Videos'],
+            'timestamp': 1639800264,
+            'upload_date': '20211218',
+            'release_timestamp': 1639800330,
+            'release_date': '20211218',
+            'thumbnail': 're:^https?://.+euksy8bd.png$',
+            'like_count': int,
+            'comment_count': int,
+            'view_count': int,
+        }
+    }, {
+        # Single GIF
+        'url': 'https://gamejolt.com/p/hello-everyone-i-m-developing-a-pixel-art-style-mod-for-fnf-and-i-vs4gdrd8',
+        'info_dict': {
+            'id': 'vs4gdrd8',
+            'display_id': 'hello-everyone-i-m-developing-a-pixel-art-style-mod-for-fnf-and-i-vs4gdrd8',
+            'title': 'md5:cc3d8b031d9bc7ec2ec5a9ffc707e1f9',
+            'description': 'md5:cc3d8b031d9bc7ec2ec5a9ffc707e1f9',
+            'uploader': 'Quesoguy',
+            'uploader_id': 'CheeseguyDev',
+            'uploader_url': 'https://gamejolt.com/@CheeseguyDev',
+            'categories': ['Game Dev - General', 'Arts n\' Crafts - Creations', 'Pixel Art - showcase',
+                           'Friday Night Funkin\' - Mods', 'Newgrounds - Friday Night Funkin (13+)'],
+            'timestamp': 1639517122,
+            'release_timestamp': 1639519966,
+            'like_count': int,
+            'comment_count': int,
+        },
+        'playlist': [{
+            'info_dict': {
+                'id': 'dszyjnwi',
+                'ext': 'webm',
+                'title': 'gif-presentacion-mejorado-dszyjnwi',
+                'n_entries': 1,
+            }
+        }]
+    }, {
+        # Multiple GIFs
+        'url': 'https://gamejolt.com/p/gif-yhsqkumq',
+        'playlist_count': 35,
+        'info_dict': {
+            'id': 'yhsqkumq',
+            'display_id': 'gif-yhsqkumq',
+            'title': 'GIF',
+            'description': 'GIF',
+            'uploader': 'DaniilTvman',
+            'uploader_id': 'DaniilTvman',
+            'uploader_url': 'https://gamejolt.com/@DaniilTvman',
+            'categories': ['Five Nights At The AGK Studio Comunity - NEWS game'],
+            'timestamp': 1638721559,
+            'release_timestamp': 1638722276,
+            'like_count': int,
+            'comment_count': int,
+        },
+    }]
+
+    def _real_extract(self, url):
+        post_id = self._match_id(url)
+        post_data = self._call_api(
+            f'web/posts/view/{post_id}', post_id)['post']
+        return self._parse_post(post_data)
+class GameJoltPostListBaseIE(GameJoltBaseIE):
+    def _entries(self, endpoint, list_id, note='Downloading post list', errnote='Unable to download post list', initial_items=[]):
+        page_num, scroll_id = 1, None
+        items = initial_items or self._call_api(endpoint, list_id, note=note, errnote=errnote)['items']
+        while items:
+            for item in items:
+                yield self._parse_post(item['action_resource_model'])
+            scroll_id = items[-1]['scroll_id']
+            page_num += 1
+            items = self._call_api(
+                endpoint, list_id, note=f'{note} page {page_num}', errnote=errnote, data=json.dumps({
+                    'scrollDirection': 'from',
+                    'scrollId': scroll_id,
+                }).encode('utf-8')).get('items')
+
+
+class GameJoltUserIE(GameJoltPostListBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?gamejolt\.com/@(?P<id>[\w-]+)'
+    _TESTS = [{
+        'url': 'https://gamejolt.com/@BlazikenSuperStar',
+        'playlist_mincount': 1,
+        'info_dict': {
+            'id': '6116784',
+            'title': 'S. Blaze',
+            'description': 'md5:5ba7fbbb549e8ea2545aafbfe22eb03a',
+        },
+        'params': {
+            'ignore_no_formats_error': True,
+        },
+        'expected_warnings': ['skipping format', 'No video formats found', 'Requested format is not available'],
+    }]
+
+    def _real_extract(self, url):
+        user_id = self._match_id(url)
+        user_data = self._call_api(
+            f'web/profile/@{user_id}', user_id, note='Downloading user info', errnote='Unable to download user info')['user']
+        bio = self._parse_content_as_text(
+            self._parse_json(user_data.get('bio_content', '{}'), user_id, fatal=False) or {})
+        return self.playlist_result(
+            self._entries(f'web/posts/fetch/user/@{user_id}?tab=active', user_id, 'Downloading user posts', 'Unable to download user posts'),
+            str_or_none(user_data.get('id')), user_data.get('display_name') or user_data.get('name'), bio)
+
+
+class GameJoltGameIE(GameJoltPostListBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?gamejolt\.com/games/[\w-]+/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://gamejolt.com/games/Friday4Fun/655124',
+        'playlist_mincount': 2,
+        'info_dict': {
+            'id': '655124',
+            'title': 'Friday Night Funkin\': Friday 4 Fun',
+            'description': 'md5:576a7dd87912a2dcf33c50d2bd3966d3'
+        },
+        'params': {
+            'ignore_no_formats_error': True,
+        },
+        'expected_warnings': ['skipping format', 'No video formats found', 'Requested format is not available'],
+    }]
+
+    def _real_extract(self, url):
+        game_id = self._match_id(url)
+        game_data = self._call_api(
+            f'web/discover/games/{game_id}', game_id, note='Downloading game info', errnote='Unable to download game info')['game']
+        description = self._parse_content_as_text(
+            self._parse_json(game_data.get('description_content', '{}'), game_id, fatal=False) or {})
+        return self.playlist_result(
+            self._entries(f'web/posts/fetch/game/{game_id}', game_id, 'Downloading game posts', 'Unable to download game posts'),
+            game_id, game_data.get('title'), description)
+class GameJoltGameSoundtrackIE(GameJoltBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?gamejolt\.com/get/soundtrack(?:\?|\#!?)(?:.*?[&;])??game=(?P<id>(?:\d+)+)'
+    _TESTS = [{
+        'url': 'https://gamejolt.com/get/soundtrack?foo=bar&game=657899',
+        'info_dict': {
+            'id': '657899',
+            'title': 'Friday Night Funkin\': Vs Oswald',
+            'n_entries': None,
+        },
+        'playlist': [{
+            'info_dict': {
+                'id': '184434',
+                'ext': 'mp3',
+                'title': 'Gettin\' Lucky (Menu Music)',
+                'url': r're:^https://.+vs-oswald-menu-music\.mp3$',
+                'release_timestamp': 1635190816,
+                'release_date': '20211025',
+                'n_entries': 3,
+            }
+        }, {
+            'info_dict': {
+                'id': '184435',
+                'ext': 'mp3',
+                'title': 'Rabbit\'s Luck (Extended Version)',
+                'url': r're:^https://.+rabbit-s-luck--full-version-\.mp3$',
+                'release_timestamp': 1635190841,
+                'release_date': '20211025',
+                'n_entries': 3,
+            }
+        }, {
+            'info_dict': {
+                'id': '185228',
+                'ext': 'mp3',
+                'title': 'Last Straw',
+                'url': r're:^https://.+last-straw\.mp3$',
+                'release_timestamp': 1635881104,
+                'release_date': '20211102',
+                'n_entries': 3,
+            }
+        }]
+    }]
+
+    def _real_extract(self, url):
+        game_id = self._match_id(url)
+        game_overview = self._call_api(
+            f'web/discover/games/overview/{game_id}', game_id, note='Downloading soundtrack info', errnote='Unable to download soundtrack info')
+        return self.playlist_result([{
+            'id': str_or_none(song.get('id')),
+            'title': str_or_none(song.get('title')),
+            'url': str_or_none(song.get('url')),
+            'release_timestamp': int_or_none(song.get('posted_on'), scale=1000),
+        } for song in game_overview.get('songs') or []], game_id, traverse_obj(
+            game_overview, ('microdata', 'name'), (('twitter', 'fb'), 'title'), expected_type=str_or_none, get_all=False))
+
+
+class GameJoltCommunityIE(GameJoltPostListBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?gamejolt\.com/c/(?P<id>(?P<community>[\w-]+)(?:/(?P<channel>[\w-]+))?)(?:(?:\?|\#!?)(?:.*?[&;])??sort=(?P<sort>\w+))?'
+    _TESTS = [{
+        'url': 'https://gamejolt.com/c/fnf/videos',
+        'playlist_mincount': 50,
+        'info_dict': {
+            'id': 'fnf/videos',
+            'title': 'Friday Night Funkin\' - Videos',
+            'description': 'md5:6d8c06f27460f7d35c1554757ffe53c8'
+        },
+        'params': {
+            'playlistend': 50,
+            'ignore_no_formats_error': True,
+        },
+        'expected_warnings': ['skipping format', 'No video formats found', 'Requested format is not available'],
+    }, {
+        'url': 'https://gamejolt.com/c/youtubers',
+        'playlist_mincount': 50,
+        'info_dict': {
+            'id': 'youtubers/featured',
+            'title': 'Youtubers - featured',
+            'description': 'md5:53e5582c93dcc467ab597bfca4db17d4'
+        },
+        'params': {
+            'playlistend': 50,
+            'ignore_no_formats_error': True,
+        },
+        'expected_warnings': ['skipping format', 'No video formats found', 'Requested format is not available'],
+    }]
+
+    def _real_extract(self, url):
+        display_id, community_id, channel_id, sort_by = self._match_valid_url(url).group('id', 'community', 'channel', 'sort')
+        channel_id, sort_by = channel_id or 'featured', sort_by or 'new'
+
+        community_data = self._call_api(
+            f'web/communities/view/{community_id}', display_id,
+            note='Downloading community info', errnote='Unable to download community info')['community']
+        channel_data = traverse_obj(self._call_api(
+            f'web/communities/view-channel/{community_id}/{channel_id}', display_id,
+            note='Downloading channel info', errnote='Unable to download channel info', fatal=False), 'channel') or {}
+
+        title = f'{community_data.get("name") or community_id} - {channel_data.get("display_title") or channel_id}'
+        description = self._parse_content_as_text(
+            self._parse_json(community_data.get('description_content') or '{}', display_id, fatal=False) or {})
+        return self.playlist_result(
+            self._entries(
+                f'web/posts/fetch/community/{community_id}?channels[]={sort_by}&channels[]={channel_id}',
+                display_id, 'Downloading community posts', 'Unable to download community posts'),
+            f'{community_id}/{channel_id}', title, description)
+class GameJoltSearchIE(GameJoltPostListBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?gamejolt\.com/search(?:/(?P<filter>communities|users|games))?(?:\?|\#!?)(?:.*?[&;])??q=(?P<id>(?:[^&#]+)+)'
+    _URL_FORMATS = {
+        'users': 'https://gamejolt.com/@{username}',
+        'communities': 'https://gamejolt.com/c/{path}',
+        'games': 'https://gamejolt.com/games/{slug}/{id}',
+    }
+    _TESTS = [{
+        'url': 'https://gamejolt.com/search?foo=bar&q=%23fnf',
+        'playlist_mincount': 50,
+        'info_dict': {
+            'id': '#fnf',
+            'title': '#fnf',
+        },
+        'params': {
+            'playlistend': 50,
+            'ignore_no_formats_error': True,
+        },
+        'expected_warnings': ['skipping format', 'No video formats found', 'Requested format is not available'],
+    }, {
+        'url': 'https://gamejolt.com/search/communities?q=cookie%20run',
+        'playlist_mincount': 10,
+        'info_dict': {
+            'id': 'cookie run',
+            'title': 'cookie run',
+        },
+    }, {
+        'url': 'https://gamejolt.com/search/users?q=mlp',
+        'playlist_mincount': 278,
+        'info_dict': {
+            'id': 'mlp',
+            'title': 'mlp',
+        },
+    }, {
+        'url': 'https://gamejolt.com/search/games?q=roblox',
+        'playlist_mincount': 688,
+        'info_dict': {
+            'id': 'roblox',
+            'title': 'roblox',
+        },
+    }]
+
+    def _search_entries(self, query, filter_mode, display_query):
+        initial_search_data = self._call_api(
+            f'web/search/{filter_mode}?q={query}', display_query,
+            note=f'Downloading {filter_mode} list', errnote=f'Unable to download {filter_mode} list')
+        entries_num = traverse_obj(initial_search_data, 'count', f'{filter_mode}Count')
+        if not entries_num:
+            return
+        for page in range(1, math.ceil(entries_num / initial_search_data['perPage']) + 1):
+            search_results = self._call_api(
+                f'web/search/{filter_mode}?q={query}&page={page}', display_query,
+                note=f'Downloading {filter_mode} list page {page}', errnote=f'Unable to download {filter_mode} list')
+            for result in search_results[filter_mode]:
+                yield self.url_result(self._URL_FORMATS[filter_mode].format(**result))
+
+    def _real_extract(self, url):
+        filter_mode, query = self._match_valid_url(url).group('filter', 'id')
+        display_query = compat_urllib_parse_unquote(query)
+        return self.playlist_result(
+            self._search_entries(query, filter_mode, display_query) if filter_mode else self._entries(
+                f'web/posts/fetch/search/{query}', display_query, initial_items=self._call_api(
+                    f'web/search?q={query}', display_query,
+                    note='Downloading initial post list', errnote='Unable to download initial post list')['posts']),
+            display_query, display_query)
diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py
index 1ec0ce986..5dafef283 100644
--- a/yt_dlp/extractor/generic.py
+++ b/yt_dlp/extractor/generic.py
@@ -2345,6 +2345,18 @@ class GenericIE(InfoExtractor):
             }
         },
         {
+            # KVS Player (for sites that serve kt_player.js via non-https urls)
+            'url': 'http://www.camhub.world/embed/389508',
+            'md5': 'fbe89af4cfb59c8fd9f34a202bb03e32',
+            'info_dict': {
+                'id': '389508',
+                'display_id': 'syren-de-mer-onlyfans-05-07-2020have-a-happy-safe-holiday5f014e68a220979bdb8cd-source',
+                'ext': 'mp4',
+                'title': 'Syren De Mer onlyfans_05-07-2020Have_a_happy_safe_holiday5f014e68a220979bdb8cd_source / Embed плеер',
+                'thumbnail': 'http://www.camhub.world/contents/videos_screenshots/389000/389508/preview.mp4.jpg',
+            }
+        },
+        {
             # Reddit-hosted video that will redirect and be processed by RedditIE
             # Redirects to https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/
             'url': 'https://v.redd.it/zv89llsvexdz',
@@ -3689,7 +3701,7 @@ class GenericIE(InfoExtractor):
             self.report_detected('JW Player embed')
         if not found:
             # Look for generic KVS player
-            found = re.search(r'<script [^>]*?src="https://.+?/kt_player\.js\?v=(?P<ver>(?P<maj_ver>\d+)(\.\d+)+)".*?>', webpage)
+            found = re.search(r'<script [^>]*?src="https?://.+?/kt_player\.js\?v=(?P<ver>(?P<maj_ver>\d+)(\.\d+)+)".*?>', webpage)
             if found:
                 self.report_detected('KWS Player')
                 if found.group('maj_ver') not in ['4', '5']:
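The one-character change above (`https` to `https?`) is what makes the new embed test pass: the optional `?` lets the player-script detection match plain-HTTP sites as well. Checked against the script tag from the test's own site:

    import re

    pattern = r'<script [^>]*?src="https?://.+?/kt_player\.js\?v=(?P<ver>(?P<maj_ver>\d+)(\.\d+)+)".*?>'
    tag = '<script type="text/javascript" src="http://www.camhub.world/player/kt_player.js?v=5.5.1"></script>'

    m = re.search(pattern, tag)
    assert m and m.group('maj_ver') == '5'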
diff --git a/yt_dlp/extractor/gfycat.py b/yt_dlp/extractor/gfycat.py
index 18a30fe67..56a6dc03d 100644
--- a/yt_dlp/extractor/gfycat.py
+++ b/yt_dlp/extractor/gfycat.py
@@ -24,9 +24,10 @@ class GfycatIE(InfoExtractor):
             'duration': 10.4,
             'view_count': int,
             'like_count': int,
-            'dislike_count': int,
             'categories': list,
             'age_limit': 0,
+            'uploader_id': 'anonymous',
+            'description': '',
         }
     }, {
         'url': 'http://gfycat.com/ifr/JauntyTimelyAmazontreeboa',
@@ -40,9 +41,27 @@ class GfycatIE(InfoExtractor):
             'duration': 3.52,
             'view_count': int,
             'like_count': int,
-            'dislike_count': int,
             'categories': list,
             'age_limit': 0,
+            'uploader_id': 'anonymous',
+            'description': '',
+        }
+    }, {
+        'url': 'https://gfycat.com/alienatedsolidgreathornedowl',
+        'info_dict': {
+            'id': 'alienatedsolidgreathornedowl',
+            'ext': 'mp4',
+            'upload_date': '20211226',
+            'uploader_id': 'reactions',
+            'timestamp': 1640536930,
+            'like_count': int,
+            'description': '',
+            'title': 'Ingrid Michaelson, Zooey Deschanel - Merry Christmas Happy New Year',
+            'categories': list,
+            'age_limit': 0,
+            'duration': 2.9583333333333335,
+            'uploader': 'Reaction GIFs',
+            'view_count': int,
         }
     }, {
         'url': 'https://gfycat.com/ru/RemarkableDrearyAmurstarfish',
@@ -74,7 +93,7 @@ class GfycatIE(InfoExtractor):
         title = gfy.get('title') or gfy['gfyName']
         description = gfy.get('description')
         timestamp = int_or_none(gfy.get('createDate'))
-        uploader = gfy.get('userName')
+        uploader = gfy.get('userName') or gfy.get('username')
         view_count = int_or_none(gfy.get('views'))
         like_count = int_or_none(gfy.get('likes'))
         dislike_count = int_or_none(gfy.get('dislikes'))
@@ -114,7 +133,8 @@ class GfycatIE(InfoExtractor):
             'title': title,
             'description': description,
             'timestamp': timestamp,
-            'uploader': uploader,
+            'uploader': gfy.get('userDisplayName') or uploader,
+            'uploader_id': uploader,
             'duration': duration,
             'view_count': view_count,
             'like_count': like_count,
diff --git a/yt_dlp/extractor/instagram.py b/yt_dlp/extractor/instagram.py
index 84c1daca6..ab14e5b0a 100644
--- a/yt_dlp/extractor/instagram.py
+++ b/yt_dlp/extractor/instagram.py
@@ -17,6 +17,7 @@ from ..utils import (
     int_or_none,
     lowercase_escape,
     std_headers,
+    str_to_int,
     traverse_obj,
     url_or_none,
     urlencode_postdata,
@@ -293,7 +294,10 @@ class InstagramIE(InstagramBaseIE):
         video_id, url = self._match_valid_url(url).group('id', 'url')
         webpage, urlh = self._download_webpage_handle(url, video_id)
         if 'www.instagram.com/accounts/login' in urlh.geturl():
-            self.raise_login_required('You need to log in to access this content')
+            self.report_warning('Main webpage is locked behind the login page. '
+                                'Retrying with embed webpage (Note that some metadata might be missing)')
+            webpage = self._download_webpage(
+                'https://www.instagram.com/p/%s/embed/' % video_id, video_id, note='Downloading embed webpage')
 
         shared_data = self._parse_json(
             self._search_regex(
@@ -314,7 +318,10 @@ class InstagramIE(InstagramBaseIE):
                 r'window\.__additionalDataLoaded\s*\(\s*[^,]+,\s*({.+?})\s*\)\s*;',
                 webpage, 'additional data', default='{}'),
             video_id, fatal=False)
-        media = traverse_obj(additional_data, ('graphql', 'shortcode_media'), expected_type=dict) or {}
+        media = traverse_obj(additional_data, ('graphql', 'shortcode_media'), 'shortcode_media', expected_type=dict) or {}
+
+        if not media and 'www.instagram.com/accounts/login' in urlh.geturl():
+            self.raise_login_required('You need to log in to access this content')
 
         uploader_id = traverse_obj(media, ('owner', 'username')) or self._search_regex(
             r'"owner"\s*:\s*{\s*"username"\s*:\s*"(.+?)"', webpage, 'uploader id', fatal=False)
@@ -348,13 +355,14 @@ class InstagramIE(InstagramBaseIE):
             formats.extend(self._parse_mpd_formats(self._parse_xml(dash, video_id), mpd_id='dash'))
         self._sort_formats(formats)
 
+        comment_data = traverse_obj(media, ('edge_media_to_parent_comment', 'edges'))
         comments = [{
             'author': traverse_obj(comment_dict, ('node', 'owner', 'username')),
             'author_id': traverse_obj(comment_dict, ('node', 'owner', 'id')),
             'id': traverse_obj(comment_dict, ('node', 'id')),
             'text': traverse_obj(comment_dict, ('node', 'text')),
             'timestamp': traverse_obj(comment_dict, ('node', 'created_at'), expected_type=int_or_none),
-        } for comment_dict in traverse_obj(media, ('edge_media_to_parent_comment', 'edges'))]
+        } for comment_dict in comment_data] if comment_data else None
 
         display_resources = (
             media.get('display_resources')
@@ -375,7 +383,8 @@ class InstagramIE(InstagramBaseIE):
             'timestamp': traverse_obj(media, 'taken_at_timestamp', 'date', expected_type=int_or_none),
             'uploader_id': uploader_id,
             'uploader': traverse_obj(media, ('owner', 'full_name')),
-            'like_count': self._get_count(media, 'likes', 'preview_like'),
+            'like_count': self._get_count(media, 'likes', 'preview_like') or str_to_int(self._search_regex(
+                r'data-log-event="likeCountClick"[^>]*>[^\d]*([\d,\.]+)', webpage, 'like count', fatal=False)),
             'comment_count': self._get_count(media, 'comments', 'preview_comment', 'to_comment', 'to_parent_comment'),
             'comments': comments,
             'thumbnails': thumbnails,
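The like-count fallback above scrapes a human-formatted number out of the markup, which is why it goes through `str_to_int` (from `yt_dlp.utils`) rather than `int`:

    from yt_dlp.utils import str_to_int

    str_to_int('1,234')  # -> 1234
    str_to_int('1.234')  # -> 1234 (thousands separator stripped)
    str_to_int(None)     # -> None, safe when the regex found nothing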
else f'highlight:{story_id}' + videos = self._download_json(f'https://i.instagram.com/api/v1/feed/reels_media/?reel_ids={story_info_url}', story_id, headers={ + 'X-IG-App-ID': 936619743392459, + 'X-ASBD-ID': 198387, + 'X-IG-WWW-Claim': 0, + })['reels'] + entities = [] + + videos = traverse_obj(videos, (f'highlight:{story_id}', 'items'), (str(user_id), 'items')) + for video_info in videos: + formats = [] + if isinstance(video_info, list): + video_info = video_info[0] + vcodec = video_info.get('video_codec') + dash_manifest_raw = video_info.get('video_dash_manifest') + videos_list = video_info.get('video_versions') + if not (dash_manifest_raw or videos_list): + continue + for format in videos_list: + formats.append({ + 'url': format.get('url'), + 'width': format.get('width'), + 'height': format.get('height'), + 'vcodec': vcodec, + }) + if dash_manifest_raw: + formats.extend(self._parse_mpd_formats(self._parse_xml(dash_manifest_raw, story_id), mpd_id='dash')) + self._sort_formats(formats) + thumbnails = [{ + 'url': thumbnail.get('url'), + 'width': thumbnail.get('width'), + 'height': thumbnail.get('height') + } for thumbnail in traverse_obj(video_info, ('image_versions2', 'candidates')) or []] + entities.append({ + 'id': video_info.get('id'), + 'title': f'Story by {username}', + 'timestamp': int_or_none(video_info.get('taken_at')), + 'uploader': traverse_obj(videos, ('user', 'full_name')), + 'duration': float_or_none(video_info.get('video_duration')), + 'uploader_id': user_id, + 'thumbnails': thumbnails, + 'formats': formats, + }) + + return self.playlist_result(entities, playlist_id=story_id, playlist_title=highlight_title) diff --git a/yt_dlp/extractor/lbry.py b/yt_dlp/extractor/lbry.py index 0f87bf1d7..1405ce0c7 100644 --- a/yt_dlp/extractor/lbry.py +++ b/yt_dlp/extractor/lbry.py @@ -184,28 +184,38 @@ class LBRYIE(LBRYBaseIE): display_id = compat_urllib_parse_unquote(display_id) uri = 'lbry://' + display_id result = self._resolve_url(uri, display_id, 'stream') - result_value = result['value'] - if result_value.get('stream_type') not in self._SUPPORTED_STREAM_TYPES: + if result['value'].get('stream_type') in self._SUPPORTED_STREAM_TYPES: + claim_id, is_live, headers = result['claim_id'], False, None + streaming_url = self._call_api_proxy( + 'get', claim_id, {'uri': uri}, 'streaming url')['streaming_url'] + final_url = self._request_webpage( + streaming_url, display_id, note='Downloading streaming redirect url info').geturl() + elif result.get('value_type') == 'stream': + claim_id, is_live = result['signing_channel']['claim_id'], True + headers = {'referer': 'https://player.odysee.live/'} + live_data = self._download_json( + f'https://api.live.odysee.com/v1/odysee/live/{claim_id}', claim_id, + note='Downloading livestream JSON metadata')['data'] + if not live_data['live']: + raise ExtractorError('This stream is not live', expected=True) + streaming_url = final_url = live_data['url'] + else: raise ExtractorError('Unsupported URL', expected=True) - claim_id = result['claim_id'] - title = result_value['title'] - streaming_url = self._call_api_proxy( - 'get', claim_id, {'uri': uri}, 'streaming url')['streaming_url'] + info = self._parse_stream(result, url) - urlh = self._request_webpage( - streaming_url, display_id, note='Downloading streaming redirect url info') - if determine_ext(urlh.geturl()) == 'm3u8': + if determine_ext(final_url) == 'm3u8': info['formats'] = self._extract_m3u8_formats( - urlh.geturl(), display_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls') + final_url, display_id, 
'mp4', 'm3u8_native', m3u8_id='hls', live=is_live, headers=headers) self._sort_formats(info['formats']) else: info['url'] = streaming_url - info.update({ + return { + **info, 'id': claim_id, - 'title': title, - }) - return info + 'title': result['value']['title'], + 'is_live': is_live, + 'http_headers': headers, + } class LBRYChannelIE(LBRYBaseIE): diff --git a/yt_dlp/extractor/njpwworld.py b/yt_dlp/extractor/njpwworld.py index 3639d142f..89380d039 100644 --- a/yt_dlp/extractor/njpwworld.py +++ b/yt_dlp/extractor/njpwworld.py @@ -77,13 +77,8 @@ class NJPWWorldIE(InfoExtractor): for kind, vid in re.findall(r'if\s+\(\s*imageQualityType\s*==\s*\'([^\']+)\'\s*\)\s*{\s*video_id\s*=\s*"(\d+)"', webpage): player_path = '/intent?id=%s&type=url' % vid player_url = compat_urlparse.urljoin(url, player_path) - formats.append({ - 'url': player_url, - 'format_id': kind, - 'ext': 'mp4', - 'protocol': 'm3u8', - 'quality': 2 if kind == 'high' else 1, - }) + formats += self._extract_m3u8_formats( + player_url, video_id, 'mp4', 'm3u8_native', m3u8_id=kind, fatal=False, quality=int(kind == 'high')) self._sort_formats(formats) diff --git a/yt_dlp/extractor/npr.py b/yt_dlp/extractor/npr.py index 9d1122f0c..49f062d7a 100644 --- a/yt_dlp/extractor/npr.py +++ b/yt_dlp/extractor/npr.py @@ -91,7 +91,8 @@ class NprIE(InfoExtractor): elif format_id == 'smil': smil_formats = self._extract_smil_formats( format_url, media_id, transform_source=lambda s: s.replace( - 'rtmp://flash.npr.org/ondemand/', 'https://ondemand.npr.org/')) + 'rtmp://flash.npr.org/ondemand/', 'https://ondemand.npr.org/'), + fatal=False) self._check_formats(smil_formats, media_id) formats.extend(smil_formats) else: diff --git a/yt_dlp/extractor/opencast.py b/yt_dlp/extractor/opencast.py new file mode 100644 index 000000000..cf8d91717 --- /dev/null +++ b/yt_dlp/extractor/opencast.py @@ -0,0 +1,177 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + parse_iso8601, + traverse_obj, + variadic, +) + + +class OpencastBaseIE(InfoExtractor): + _INSTANCES_RE = r'''(?: + opencast\.informatik\.kit\.edu| + electures\.uni-muenster\.de| + oc-presentation\.ltcc\.tuwien\.ac\.at| + medien\.ph-noe\.ac\.at| + oc-video\.ruhr-uni-bochum\.de| + oc-video1\.ruhr-uni-bochum\.de| + opencast\.informatik\.uni-goettingen\.de| + heicast\.uni-heidelberg\.de| + opencast\.hawk\.de:8080| + opencast\.hs-osnabrueck\.de| + video[0-9]+\.virtuos\.uni-osnabrueck\.de| + opencast\.uni-koeln\.de| + media\.opencast\.hochschule-rhein-waal\.de| + matterhorn\.dce\.harvard\.edu| + hs-harz\.opencast\.uni-halle\.de| + videocampus\.urz\.uni-leipzig\.de| + media\.uct\.ac\.za| + vid\.igb\.illinois\.edu| + cursosabertos\.c3sl\.ufpr\.br| + mcmedia\.missioncollege\.org| + clases\.odon\.edu\.uy + )''' + _UUID_RE = r'[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}' + + def _call_api(self, host, video_id, **kwargs): + return self._download_json(self._API_BASE % (host, video_id), video_id, **kwargs) + + def _parse_mediapackage(self, video): + video_id = video.get('id') + if video_id is None: + raise ExtractorError('Video id was not found') + + formats = [] + for track in variadic(traverse_obj(video, ('media', 'track')) or []): + href = track.get('url') + if href is None: + continue + ext = determine_ext(href, None) + + transport = track.get('transport') + + if transport == 'DASH' or ext == 'mpd': + 
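# Note: Opencast's episode.json returns the 'media' -> 'track' node as a single
# object when an episode has one track and as a list when it has several, which is
# why the loop above wraps it in variadic(). A minimal standalone sketch of that
# normalisation, mirroring yt_dlp.utils.variadic as amended later in this diff; the
# sample track dicts and URL are invented for illustration:
import collections.abc

def variadic(x, allowed_types=(str, bytes, dict)):
    # wrap str/bytes/dict and non-iterables in a 1-tuple so callers can always iterate
    return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)

for track in variadic({'url': 'https://example.invalid/a.mp4'}):  # single track object
    print(track['url'])
for track in variadic([{'url': 'u1'}, {'url': 'u2'}]):  # list of track objects
    print(track['url'])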
formats.extend(self._extract_mpd_formats_and_subtitles(href, video_id, mpd_id='dash', fatal=False)) + elif transport == 'HLS' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats_and_subtitles( + href, video_id, m3u8_id='hls', entry_protocol='m3u8_native', fatal=False)) + elif transport == 'HDS' or ext == 'f4m': + formats.extend(self._extract_f4m_formats(href, video_id, f4m_id='hds', fatal=False)) + elif transport == 'SMOOTH': + formats.extend(self._extract_ism_formats(href, video_id, ism_id='smooth', fatal=False)) + elif ext == 'smil': + formats.extend(self._extract_smil_formats(href, video_id, fatal=False)) + else: + track_obj = { + 'url': href, + 'ext': ext, + 'format_note': track.get('transport'), + 'resolution': traverse_obj(track, ('video', 'resolution')), + 'fps': int_or_none(traverse_obj(track, ('video', 'framerate'))), + 'vbr': int_or_none(traverse_obj(track, ('video', 'bitrate')), scale=1000), + 'vcodec': traverse_obj(track, ('video', 'encoder', 'type')) if track.get('video') else 'none', + 'abr': int_or_none(traverse_obj(track, ('audio', 'bitrate')), scale=1000), + 'asr': int_or_none(traverse_obj(track, ('audio', 'samplingrate'))), + 'acodec': traverse_obj(track, ('audio', 'encoder', 'type')) if track.get('audio') else 'none', + } + + if transport == 'RTMP': + m_obj = re.search(r'(?:rtmp://[^/]+/(?P<app>[^/]+))/(?P<ext>.+):(?P<playpath>.+)', href) + if not m_obj: + continue + track_obj.update({ + 'app': m_obj.group('app'), + 'ext': m_obj.group('ext'), + 'play_path': m_obj.group('ext') + ':' + m_obj.group('playpath'), + 'rtmp_live': True, + 'preference': -2, + }) + formats.append(track_obj) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'formats': formats, + 'title': video.get('title'), + 'series': video.get('seriestitle'), + 'season_id': video.get('series'), + 'creator': traverse_obj(video, ('creators', 'creator')), + 'timestamp': parse_iso8601(video.get('start')), + 'thumbnail': traverse_obj(video, ('attachments', 'attachment', ..., 'url'), get_all=False), + } + + +class OpencastIE(OpencastBaseIE): + _VALID_URL = r'''(?x) + https?://(?P<host>%s)/paella/ui/watch.html\?.*? + id=(?P<id>%s) + ''' % (OpencastBaseIE._INSTANCES_RE, OpencastBaseIE._UUID_RE) + + _API_BASE = 'https://%s/search/episode.json?id=%s' + + _TESTS = [ + { + 'url': 'https://oc-video1.ruhr-uni-bochum.de/paella/ui/watch.html?id=ed063cd5-72c8-46b5-a60a-569243edcea8', + 'md5': '554c8e99a90f7be7e874619fcf2a3bc9', + 'info_dict': { + 'id': 'ed063cd5-72c8-46b5-a60a-569243edcea8', + 'ext': 'mp4', + 'title': '11 - Kryptographie - 24.11.2015', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1606208400, + 'upload_date': '20201124', + }, + } + ] + + def _real_extract(self, url): + host, video_id = self._match_valid_url(url).group('host', 'id') + return self._parse_mediapackage( + self._call_api(host, video_id)['search-results']['result']['mediapackage']) + + +class OpencastPlaylistIE(OpencastBaseIE): + _VALID_URL = r'''(?x) + https?://(?P<host>%s)/engage/ui/index.html\?.*? 
+ epFrom=(?P<id>%s) + ''' % (OpencastBaseIE._INSTANCES_RE, OpencastBaseIE._UUID_RE) + + _API_BASE = 'https://%s/search/episode.json?sid=%s' + + _TESTS = [ + { + 'url': 'https://oc-video1.ruhr-uni-bochum.de/engage/ui/index.html?epFrom=cf68a4a1-36b1-4a53-a6ba-61af5705a0d0', + 'info_dict': { + 'id': 'cf68a4a1-36b1-4a53-a6ba-61af5705a0d0', + 'title': 'Kryptographie - WiSe 15/16', + }, + 'playlist_mincount': 28, + }, + { + 'url': 'https://oc-video.ruhr-uni-bochum.de/engage/ui/index.html?e=1&p=1&epFrom=b1a54262-3684-403f-9731-8e77c3766f9a', + 'info_dict': { + 'id': 'b1a54262-3684-403f-9731-8e77c3766f9a', + 'title': 'inSTUDIES-Social movements and prefigurative politics in a global perspective', + }, + 'playlist_mincount': 6, + }, + ] + + def _real_extract(self, url): + host, video_id = self._match_valid_url(url).group('host', 'id') + + entries = [ + self._parse_mediapackage(episode['mediapackage']) + for episode in variadic(self._call_api(host, video_id)['search-results']['result']) + if episode.get('mediapackage') + ] + + return self.playlist_result(entries, video_id, traverse_obj(entries, (0, 'series'))) diff --git a/yt_dlp/extractor/pixivsketch.py b/yt_dlp/extractor/pixivsketch.py new file mode 100644 index 000000000..f0ad0b24a --- /dev/null +++ b/yt_dlp/extractor/pixivsketch.py @@ -0,0 +1,122 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + traverse_obj, + unified_timestamp, +) + + +class PixivSketchBaseIE(InfoExtractor): + def _call_api(self, video_id, path, referer, note='Downloading JSON metadata'): + response = self._download_json(f'https://sketch.pixiv.net/api/{path}', video_id, note=note, headers={ + 'Referer': referer, + 'X-Requested-With': referer, + }) + errors = traverse_obj(response, ('errors', ..., 'message')) + if errors: + raise ExtractorError(' '.join(f'{e}.' for e in errors)) + return response.get('data') or {} + + +class PixivSketchIE(PixivSketchBaseIE): + IE_NAME = 'pixiv:sketch' + _VALID_URL = r'https?://sketch\.pixiv\.net/@(?P<uploader_id>[a-zA-Z0-9_-]+)/lives/(?P<id>\d+)/?' + _TESTS = [{ + 'url': 'https://sketch.pixiv.net/@nuhutya/lives/3654620468641830507', + 'info_dict': { + 'id': '7370666691623196569', + 'title': 'まにあえクリスマス!', + 'uploader': 'ぬふちゃ', + 'uploader_id': 'nuhutya', + 'channel_id': '9844815', + 'age_limit': 0, + 'timestamp': 1640351536, + }, + 'skip': True, + }, { + # these two (age_limit > 0) require you to log in on the website, but it's actually not required for download + 'url': 'https://sketch.pixiv.net/@namahyou/lives/4393103321546851377', + 'info_dict': { + 'id': '4907995960957946943', + 'title': 'クリスマスなんて知らん🖕', + 'uploader': 'すゃもり', + 'uploader_id': 'suya2mori2', + 'channel_id': '31169300', + 'age_limit': 15, + 'timestamp': 1640347640, + }, + 'skip': True, + }, { + 'url': 'https://sketch.pixiv.net/@8aki/lives/3553803162487249670', + 'info_dict': { + 'id': '1593420639479156945', + 'title': 'おまけ本作業(リョナ有)', + 'uploader': 'おぶい / Obui', + 'uploader_id': 'oving', + 'channel_id': '17606', + 'age_limit': 18, + 'timestamp': 1640330263, + }, + 'skip': True, + }] + + def _real_extract(self, url): + video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id') + data = self._call_api(video_id, f'lives/{video_id}.json', url) + + if not traverse_obj(data, 'is_broadcasting'): + raise ExtractorError(f'This live is offline. 
Use https://sketch.pixiv.net/@{uploader_id} for ongoing live.', expected=True) + + m3u8_url = traverse_obj(data, ('owner', 'hls_movie', 'url')) + formats = self._extract_m3u8_formats( + m3u8_url, video_id, ext='mp4', + entry_protocol='m3u8_native', m3u8_id='hls') + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': data.get('name'), + 'formats': formats, + 'uploader': traverse_obj(data, ('user', 'name'), ('owner', 'user', 'name')), + 'uploader_id': traverse_obj(data, ('user', 'unique_name'), ('owner', 'user', 'unique_name')), + 'channel_id': str(traverse_obj(data, ('user', 'pixiv_user_id'), ('owner', 'user', 'pixiv_user_id'))), + 'age_limit': 18 if data.get('is_r18') else 15 if data.get('is_r15') else 0, + 'timestamp': unified_timestamp(data.get('created_at')), + 'is_live': True + } + + +class PixivSketchUserIE(PixivSketchBaseIE): + IE_NAME = 'pixiv:sketch:user' + _VALID_URL = r'https?://sketch\.pixiv\.net/@(?P<id>[a-zA-Z0-9_-]+)/?' + _TESTS = [{ + 'url': 'https://sketch.pixiv.net/@nuhutya', + 'only_matching': True, + }, { + 'url': 'https://sketch.pixiv.net/@namahyou', + 'only_matching': True, + }, { + 'url': 'https://sketch.pixiv.net/@8aki', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return super(PixivSketchUserIE, cls).suitable(url) and not PixivSketchIE.suitable(url) + + def _real_extract(self, url): + user_id = self._match_id(url) + data = self._call_api(user_id, f'lives/users/@{user_id}.json', url) + + if not traverse_obj(data, 'is_broadcasting'): + try: + self._call_api(user_id, 'users/current.json', url, 'Investigating reason for request failure') + except ExtractorError as ex: + if ex.cause and ex.cause.code == 401: + self.raise_login_required(f'Please log in, or use direct link like https://sketch.pixiv.net/@{user_id}/1234567890', method='cookies') + raise ExtractorError('This user is offline', expected=True) + + return self.url_result(f'https://sketch.pixiv.net/@{user_id}/lives/{data["id"]}') diff --git a/yt_dlp/extractor/pornhub.py b/yt_dlp/extractor/pornhub.py index 6d894affd..4357c79df 100644 --- a/yt_dlp/extractor/pornhub.py +++ b/yt_dlp/extractor/pornhub.py @@ -258,8 +258,7 @@ class PornHubIE(PornHubBaseIE): webpage) def _extract_count(self, pattern, webpage, name): - return str_to_int(self._search_regex( - pattern, webpage, '%s count' % name, fatal=False)) + return str_to_int(self._search_regex(pattern, webpage, '%s count' % name, default=None)) def _real_extract(self, url): mobj = self._match_valid_url(url) diff --git a/yt_dlp/extractor/rcti.py b/yt_dlp/extractor/rcti.py index 19b2f451c..ac42e58d9 100644 --- a/yt_dlp/extractor/rcti.py +++ b/yt_dlp/extractor/rcti.py @@ -1,7 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import itertools import json import random import time @@ -12,6 +11,7 @@ from ..utils import ( dict_get, ExtractorError, strip_or_none, + traverse_obj, try_get ) @@ -26,7 +26,7 @@ class RCTIPlusBaseIE(InfoExtractor): json = self._download_json( url, video_id, note=note, headers={'Authorization': self._AUTH_KEY}) if json.get('status', {}).get('code', 0) != 0: - raise ExtractorError('%s said: %s' % (self.IE_NAME, json["status"]["message_client"]), cause=json) + raise ExtractorError(f'{self.IE_NAME} said: {json["status"]["message_client"]}', cause=json) return json.get('data'), json.get('meta') @@ -223,18 +223,30 @@ class RCTIPlusIE(RCTIPlusBaseIE): class RCTIPlusSeriesIE(RCTIPlusBaseIE): - _VALID_URL = r'https://www\.rctiplus\.com/programs/(?P<id>\d+)/(?P<display_id>[^/?#&]+)' + _VALID_URL = 
r'https://www\.rctiplus\.com/programs/(?P<id>\d+)/(?P<display_id>[^/?#&]+)(?:/(?P<type>episodes|extras|clips))?' _TESTS = [{ - 'url': 'https://www.rctiplus.com/programs/540/upin-ipin', - 'playlist_mincount': 417, + 'url': 'https://www.rctiplus.com/programs/829/putri-untuk-pangeran', + 'playlist_mincount': 1019, 'info_dict': { - 'id': '540', - 'title': 'Upin & Ipin', - 'description': 'md5:22cc912381f389664416844e1ec4f86b', + 'id': '829', + 'title': 'Putri Untuk Pangeran', + 'description': 'md5:aca7b54d05bd95a67d4f4613cc1d622d', + 'age_limit': 2, + 'cast': ['Verrel Bramasta', 'Ranty Maria', 'Riza Syah', 'Ivan Fadilla', 'Nicole Parham', 'Dll', 'Aviv Elham'], + 'display_id': 'putri-untuk-pangeran', + 'tag': 'count:18', }, - }, { - 'url': 'https://www.rctiplus.com/programs/540/upin-ipin/episodes?utm_source=Rplusdweb&utm_medium=share_copy&utm_campaign=programsupin-ipin', - 'only_matching': True, + }, { # No episodes + 'url': 'https://www.rctiplus.com/programs/615/inews-pagi', + 'playlist_mincount': 388, + 'info_dict': { + 'id': '615', + 'title': 'iNews Pagi', + 'description': 'md5:f18ee3d4643cfb41c358e5a9b693ee04', + 'age_limit': 2, + 'tag': 'count:11', + 'display_id': 'inews-pagi', + } }] _AGE_RATINGS = { # Based off https://id.wikipedia.org/wiki/Sistem_rating_konten_televisi with additional ratings 'S-SU': 2, @@ -269,47 +281,63 @@ class RCTIPlusSeriesIE(RCTIPlusBaseIE): display_id, '%s page %s' % (note, page_num))[0] or [] for video_json in episode_list: - link = video_json['share_link'] - url_res = self.url_result(link, 'RCTIPlus', video_json.get('product_id'), video_json.get('title')) - url_res.update(metadata) - yield url_res + yield { + '_type': 'url', + 'url': video_json['share_link'], + 'ie_key': RCTIPlusIE.ie_key(), + 'id': video_json.get('product_id'), + 'title': video_json.get('title'), + 'display_id': video_json.get('title_code').replace('_', '-'), + 'description': video_json.get('summary'), + 'timestamp': video_json.get('release_date'), + 'duration': video_json.get('duration'), + 'season_number': video_json.get('season'), + 'episode_number': video_json.get('episode'), + **metadata + } + + def _series_entries(self, series_id, display_id=None, video_type=None, metadata={}): + if not video_type or video_type in 'episodes': + try: + seasons_list = self._call_api( + f'https://api.rctiplus.com/api/v1/program/{series_id}/season', + display_id, 'Downloading seasons list JSON')[0] + except ExtractorError as e: + if 'not found' not in str(e): + raise + seasons_list = [] + for season in seasons_list: + yield from self._entries( + f'https://api.rctiplus.com/api/v2/program/{series_id}/episode?season={season["season"]}', + display_id, f'Downloading season {season["season"]} episode entries', metadata) + if not video_type or video_type in 'extras': + yield from self._entries( + f'https://api.rctiplus.com/api/v2/program/{series_id}/extra?content_id=0', + display_id, 'Downloading extra entries', metadata) + if not video_type or video_type in 'clips': + yield from self._entries( + f'https://api.rctiplus.com/api/v2/program/{series_id}/clip?content_id=0', + display_id, 'Downloading clip entries', metadata) def _real_extract(self, url): - series_id, display_id = self._match_valid_url(url).groups() + series_id, display_id, video_type = self._match_valid_url(url).group('id', 'display_id', 'type') + if video_type: + self.report_warning( + f'Only {video_type} will be downloaded. 
' + f'To download everything from the series, remove "/{video_type}" from the URL') series_meta, meta_paths = self._call_api( - 'https://api.rctiplus.com/api/v1/program/%s/detail' % series_id, display_id, 'Downloading series metadata') + f'https://api.rctiplus.com/api/v1/program/{series_id}/detail', display_id, 'Downloading series metadata') metadata = { - 'age_limit': try_get(series_meta, lambda x: self._AGE_RATINGS[x['age_restriction'][0]['code']]) + 'age_limit': try_get(series_meta, lambda x: self._AGE_RATINGS[x['age_restriction'][0]['code']]), + 'cast': traverse_obj(series_meta, (('starring', 'creator', 'writer'), ..., 'name'), + expected_type=lambda x: strip_or_none(x) or None), + 'tag': traverse_obj(series_meta, ('tag', ..., 'name'), + expected_type=lambda x: strip_or_none(x) or None), } - - cast = [] - for star in series_meta.get('starring', []): - cast.append(strip_or_none(star.get('name'))) - for star in series_meta.get('creator', []): - cast.append(strip_or_none(star.get('name'))) - for star in series_meta.get('writer', []): - cast.append(strip_or_none(star.get('name'))) - metadata['cast'] = cast - - tags = [] - for tag in series_meta.get('tag', []): - tags.append(strip_or_none(tag.get('name'))) - metadata['tag'] = tags - - entries = [] - seasons_list = self._call_api( - 'https://api.rctiplus.com/api/v1/program/%s/season' % series_id, display_id, 'Downloading seasons list JSON')[0] - for season in seasons_list: - entries.append(self._entries('https://api.rctiplus.com/api/v2/program/%s/episode?season=%s' % (series_id, season['season']), - display_id, 'Downloading season %s episode entries' % season['season'], metadata)) - - entries.append(self._entries('https://api.rctiplus.com/api/v2/program/%s/clip?content_id=0' % series_id, - display_id, 'Downloading clip entries', metadata)) - entries.append(self._entries('https://api.rctiplus.com/api/v2/program/%s/extra?content_id=0' % series_id, - display_id, 'Downloading extra entries', metadata)) - - return self.playlist_result(itertools.chain(*entries), series_id, series_meta.get('title'), series_meta.get('summary'), **metadata) + return self.playlist_result( + self._series_entries(series_id, display_id, video_type, metadata), series_id, + series_meta.get('title'), series_meta.get('summary'), display_id=display_id, **metadata) class RCTIPlusTVIE(RCTIPlusBaseIE): @@ -345,5 +373,6 @@ class RCTIPlusTVIE(RCTIPlusBaseIE): tv_id = match.get('tvname') or match.get('eventname') webpage = self._download_webpage(url, tv_id) video_type, video_id = self._search_regex( - r'url\s*:\s*["\']https://api\.rctiplus\.com/api/v./(?P<type>[^/]+)/(?P<id>\d+)/url', webpage, 'video link', group=('type', 'id')) + r'url\s*:\s*["\']https://api\.rctiplus\.com/api/v./(?P<type>[^/]+)/(?P<id>\d+)/url', + webpage, 'video link', group=('type', 'id')) return self.url_result(f'https://www.rctiplus.com/{video_type}/{video_id}/{tv_id}', 'RCTIPlus') diff --git a/yt_dlp/extractor/roosterteeth.py b/yt_dlp/extractor/roosterteeth.py index 18672b2e3..652fdd116 100644 --- a/yt_dlp/extractor/roosterteeth.py +++ b/yt_dlp/extractor/roosterteeth.py @@ -99,7 +99,7 @@ class RoosterTeethIE(RoosterTeethBaseIE): 'series': 'Million Dollars, But...', 'episode': 'Million Dollars, But... 
The Game Announcement', }, - 'skip_download': 'm3u8', + 'params': {'skip_download': True}, }, { 'url': 'https://roosterteeth.com/watch/rwby-bonus-25', 'info_dict': { @@ -112,7 +112,7 @@ class RoosterTeethIE(RoosterTeethBaseIE): 'thumbnail': r're:^https?://.*\.(png|jpe?g)$', 'ext': 'mp4', }, - 'skip_download': 'm3u8', + 'params': {'skip_download': True}, }, { 'url': 'http://achievementhunter.roosterteeth.com/episode/off-topic-the-achievement-hunter-podcast-2016-i-didn-t-think-it-would-pass-31', 'only_matching': True, diff --git a/yt_dlp/extractor/soundcloud.py b/yt_dlp/extractor/soundcloud.py index f251e5599..8146b3ef5 100644 --- a/yt_dlp/extractor/soundcloud.py +++ b/yt_dlp/extractor/soundcloud.py @@ -130,7 +130,7 @@ class SoundcloudBaseIE(InfoExtractor): elif username is not None: self.report_warning( 'Login using username and password is not currently supported. ' - 'Use "--user oauth --password <oauth_token>" to login using an oauth token') + 'Use "--username oauth --password <oauth_token>" to login using an oauth token') r''' def genDevId(): diff --git a/yt_dlp/extractor/steam.py b/yt_dlp/extractor/steam.py index 7f777c40b..4ed0fb592 100644 --- a/yt_dlp/extractor/steam.py +++ b/yt_dlp/extractor/steam.py @@ -7,14 +7,13 @@ from ..utils import ( extract_attributes, ExtractorError, get_element_by_class, - js_to_json, ) class SteamIE(InfoExtractor): _VALID_URL = r"""(?x) - https?://store\.steampowered\.com/ - (agecheck/)? + https?://(?:store\.steampowered|steamcommunity)\.com/ + (?:agecheck/)? (?P<urltype>video|app)/ #If the page is only for videos or for a game (?P<gameID>\d+)/? (?P<videoID>\d*)(?P<extra>\??) # For urltype == video we sometimes get the videoID @@ -27,21 +26,24 @@ class SteamIE(InfoExtractor): 'url': 'http://store.steampowered.com/video/105600/', 'playlist': [ { - 'md5': '6a294ee0c4b1f47f5bb76a65e31e3592', + 'md5': '695242613303ffa2a4c44c9374ddc067', 'info_dict': { - 'id': '2040428', + 'id': '256785003', 'ext': 'mp4', - 'title': 'Terraria 1.3 Trailer', - 'playlist_index': 1, + 'title': 'Terraria video 256785003', + 'thumbnail': r're:^https://cdn\.[^\.]+\.steamstatic\.com', + 'n_entries': 2, } }, { - 'md5': '911672b20064ca3263fa89650ba5a7aa', + 'md5': '6a294ee0c4b1f47f5bb76a65e31e3592', 'info_dict': { - 'id': '2029566', + 'id': '2040428', 'ext': 'mp4', - 'title': 'Terraria 1.2 Trailer', + 'title': 'Terraria video 2040428', 'playlist_index': 2, + 'thumbnail': r're:^https://cdn\.[^\.]+\.steamstatic\.com', + 'n_entries': 2, } } ], @@ -53,96 +55,76 @@ class SteamIE(InfoExtractor): 'playlistend': 2, } }, { - 'url': 'http://steamcommunity.com/sharedfiles/filedetails/?id=242472205', + 'url': 'https://store.steampowered.com/app/271590/Grand_Theft_Auto_V/', 'info_dict': { - 'id': 'X8kpJBlzD2E', + 'id': '256757115', + 'title': 'Grand Theft Auto V video 256757115', 'ext': 'mp4', - 'upload_date': '20140617', - 'title': 'FRONTIERS - Trapping', - 'description': 'md5:bf6f7f773def614054089e5769c12a6e', - 'uploader': 'AAD Productions', - 'uploader_id': 'AtomicAgeDogGames', - } + 'thumbnail': r're:^https://cdn\.[^\.]+\.steamstatic\.com', + 'n_entries': 20, + }, }] def _real_extract(self, url): m = self._match_valid_url(url) fileID = m.group('fileID') if fileID: - videourl = url + video_url = url playlist_id = fileID else: gameID = m.group('gameID') playlist_id = gameID - videourl = self._VIDEO_PAGE_TEMPLATE % playlist_id + video_url = self._VIDEO_PAGE_TEMPLATE % playlist_id - self._set_cookie('steampowered.com', 'mature_content', '1') + self._set_cookie('steampowered.com', 
'wants_mature_content', '1') + self._set_cookie('steampowered.com', 'birthtime', '944006401') + self._set_cookie('steampowered.com', 'lastagecheckage', '1-0-2000') - webpage = self._download_webpage(videourl, playlist_id) + webpage = self._download_webpage(video_url, playlist_id) - if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None: - videourl = self._AGECHECK_TEMPLATE % playlist_id + if re.search('<div[^>]+>Please enter your birth date to continue:</div>', webpage) is not None: + video_url = self._AGECHECK_TEMPLATE % playlist_id self.report_age_confirmation() - webpage = self._download_webpage(videourl, playlist_id) - - flash_vars = self._parse_json(self._search_regex( - r'(?s)rgMovieFlashvars\s*=\s*({.+?});', webpage, - 'flash vars'), playlist_id, js_to_json) + webpage = self._download_webpage(video_url, playlist_id) - playlist_title = None + videos = re.findall(r'(<div[^>]+id=[\'"]highlight_movie_(\d+)[\'"][^>]+>)', webpage) entries = [] - if fileID: - playlist_title = get_element_by_class('workshopItemTitle', webpage) - for movie in flash_vars.values(): - if not movie: - continue - youtube_id = movie.get('YOUTUBE_VIDEO_ID') - if not youtube_id: - continue + playlist_title = get_element_by_class('apphub_AppName', webpage) + for movie, movie_id in videos: + if not movie: + continue + movie = extract_attributes(movie) + if not movie_id: + continue + entry = { + 'id': movie_id, + 'title': f'{playlist_title} video {movie_id}', + } + formats = [] + if movie: + entry['thumbnail'] = movie.get('data-poster') + for quality in ('', '-hd'): + for ext in ('webm', 'mp4'): + video_url = movie.get('data-%s%s-source' % (ext, quality)) + if video_url: + formats.append({ + 'format_id': ext + quality, + 'url': video_url, + }) + self._sort_formats(formats) + entry['formats'] = formats + entries.append(entry) + embedded_videos = re.findall(r'(<iframe[^>]+>)', webpage) + for evideos in embedded_videos: + evideos = extract_attributes(evideos).get('src') + video_id = self._search_regex(r'youtube\.com/embed/([0-9A-Za-z_-]{11})', evideos, 'youtube_video_id', default=None) + if video_id: entries.append({ - '_type': 'url', - 'url': youtube_id, + '_type': 'url_transparent', + 'id': video_id, + 'url': video_id, 'ie_key': 'Youtube', }) - else: - playlist_title = get_element_by_class('apphub_AppName', webpage) - for movie_id, movie in flash_vars.items(): - if not movie: - continue - video_id = self._search_regex(r'movie_(\d+)', movie_id, 'video id', fatal=False) - title = movie.get('MOVIE_NAME') - if not title or not video_id: - continue - entry = { - 'id': video_id, - 'title': title.replace('+', ' '), - } - formats = [] - flv_url = movie.get('FILENAME') - if flv_url: - formats.append({ - 'format_id': 'flv', - 'url': flv_url, - }) - highlight_element = self._search_regex( - r'(<div[^>]+id="highlight_movie_%s"[^>]+>)' % video_id, - webpage, 'highlight element', fatal=False) - if highlight_element: - highlight_attribs = extract_attributes(highlight_element) - if highlight_attribs: - entry['thumbnail'] = highlight_attribs.get('data-poster') - for quality in ('', '-hd'): - for ext in ('webm', 'mp4'): - video_url = highlight_attribs.get('data-%s%s-source' % (ext, quality)) - if video_url: - formats.append({ - 'format_id': ext + quality, - 'url': video_url, - }) - if not formats and not self.get_param('ignore_no_formats'): - continue - entry['formats'] = formats - entries.append(entry) if not entries: raise ExtractorError('Could not find any videos') diff --git 
a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index 2cd7ba02e..18f1c5630 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -22,8 +22,8 @@ from ..utils import ( class TikTokBaseIE(InfoExtractor): - _APP_VERSION = '20.9.3' - _MANIFEST_APP_VERSION = '291' + _APP_VERSION = '20.1.0' + _MANIFEST_APP_VERSION = '200' _APP_NAME = 'trill' _AID = 1180 _API_HOSTNAME = 'api-h2.tiktokv.com' @@ -342,16 +342,66 @@ class TikTokIE(TikTokBaseIE): 'comment_count': int, } }, { - # Promoted content/ad - 'url': 'https://www.tiktok.com/@MS4wLjABAAAAAR29F6J2Ktu0Daw03BJyXPNoRQ-W7U5a0Mn3lVCq2rQhjOd_WNLclHUoFgwX8Eno/video/6932675057474981122', - 'only_matching': True, + # Banned audio, only available on the app + 'url': 'https://www.tiktok.com/@barudakhb_/video/6984138651336838402', + 'info_dict': { + 'id': '6984138651336838402', + 'ext': 'mp4', + 'title': 'Balas @yolaaftwsr hayu yu ? #SquadRandom_ 🔥', + 'description': 'Balas @yolaaftwsr hayu yu ? #SquadRandom_ 🔥', + 'uploader': 'barudakhb_', + 'creator': 'md5:29f238c49bc0c176cb3cef1a9cea9fa6', + 'uploader_id': '6974687867511718913', + 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAbhBwQC-R1iKoix6jDFsF-vBdfx2ABoDjaZrM9fX6arU3w71q3cOWgWuTXn1soZ7d', + 'track': 'Boka Dance', + 'artist': 'md5:29f238c49bc0c176cb3cef1a9cea9fa6', + 'timestamp': 1626121503, + 'duration': 18, + 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?', + 'upload_date': '20210712', + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + } + }, { + # Sponsored video, only available with feed workaround + 'url': 'https://www.tiktok.com/@MS4wLjABAAAATh8Vewkn0LYM7Fo03iec3qKdeCUOcBIouRk1mkiag6h3o_pQu_dUXvZ2EZlGST7_/video/7042692929109986561', + 'info_dict': { + 'id': '7042692929109986561', + 'ext': 'mp4', + 'title': 'Slap and Run!', + 'description': 'Slap and Run!', + 'uploader': 'user440922249', + 'creator': 'Slap And Run', + 'uploader_id': '7036055384943690754', + 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAATh8Vewkn0LYM7Fo03iec3qKdeCUOcBIouRk1mkiag6h3o_pQu_dUXvZ2EZlGST7_', + 'track': 'Promoted Music', + 'timestamp': 1639754738, + 'duration': 30, + 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?', + 'upload_date': '20211217', + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + }, + 'expected_warnings': ['Video not available'] }] def _extract_aweme_app(self, aweme_id): - aweme_detail = self._call_api('aweme/detail', {'aweme_id': aweme_id}, aweme_id, - note='Downloading video details', errnote='Unable to download video details').get('aweme_detail') - if not aweme_detail: - raise ExtractorError('Video not available', video_id=aweme_id) + try: + aweme_detail = self._call_api('aweme/detail', {'aweme_id': aweme_id}, aweme_id, + note='Downloading video details', errnote='Unable to download video details').get('aweme_detail') + if not aweme_detail: + raise ExtractorError('Video not available', video_id=aweme_id) + except ExtractorError as e: + self.report_warning(f'{e}; Retrying with feed workaround') + feed_list = self._call_api('feed', {'aweme_id': aweme_id}, aweme_id, + note='Downloading video feed', errnote='Unable to download video feed').get('aweme_list') or [] + aweme_detail = next((aweme for aweme in feed_list if str(aweme.get('aweme_id')) == aweme_id), None) + if not aweme_detail: + raise ExtractorError('Unable to find video in feed', video_id=aweme_id) return self._parse_aweme_video_app(aweme_detail) def _real_extract(self, url): @@ -447,7 +497,7 @@ class 
TikTokUserIE(TikTokBaseIE): for video in post_list.get('aweme_list', []): yield { **self._parse_aweme_video_app(video), - 'ie_key': TikTokIE.ie_key(), + 'extractor_key': TikTokIE.ie_key(), 'extractor': 'TikTok', 'webpage_url': f'https://tiktok.com/@{user_id}/video/{video["aweme_id"]}', } @@ -464,6 +514,114 @@ class TikTokUserIE(TikTokBaseIE): return self.playlist_result(self._entries_api(webpage, user_id, user_name), user_id, user_name) +class TikTokBaseListIE(TikTokBaseIE): + def _entries(self, list_id, display_id): + query = { + self._QUERY_NAME: list_id, + 'cursor': 0, + 'count': 20, + 'type': 5, + 'device_id': ''.join(random.choice(string.digits) for i in range(19)) + } + + max_retries = self.get_param('extractor_retries', 3) + for page in itertools.count(1): + for retries in itertools.count(): + try: + post_list = self._call_api(self._API_ENDPOINT, query, display_id, + note='Downloading video list page %d%s' % (page, f' (attempt {retries})' if retries != 0 else ''), + errnote='Unable to download video list') + except ExtractorError as e: + if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0 and retries != max_retries: + self.report_warning('%s. Retrying...' % str(e.cause or e.msg)) + continue + raise + break + for video in post_list.get('aweme_list', []): + yield { + **self._parse_aweme_video_app(video), + 'extractor_key': TikTokIE.ie_key(), + 'extractor': 'TikTok', + 'webpage_url': f'https://tiktok.com/@_/video/{video["aweme_id"]}', + } + if not post_list.get('has_more'): + break + query['cursor'] = post_list['cursor'] + + def _real_extract(self, url): + list_id = self._match_id(url) + return self.playlist_result(self._entries(list_id, list_id), list_id) + + +class TikTokSoundIE(TikTokBaseListIE): + IE_NAME = 'tiktok:sound' + _VALID_URL = r'https?://(?:www\.)?tiktok\.com/music/[\w\.-]+-(?P<id>[\d]+)[/?#&]?' + _QUERY_NAME = 'music_id' + _API_ENDPOINT = 'music/aweme' + _TESTS = [{ + 'url': 'https://www.tiktok.com/music/Build-a-Btch-6956990112127585029?lang=en', + 'playlist_mincount': 100, + 'info_dict': { + 'id': '6956990112127585029' + }, + 'expected_warnings': ['Retrying'] + }, { + # Actual entries are less than listed video count + 'url': 'https://www.tiktok.com/music/jiefei-soap-remix-7036843036118469381', + 'playlist_mincount': 2182, + 'info_dict': { + 'id': '7036843036118469381' + }, + 'expected_warnings': ['Retrying'] + }] + + +class TikTokEffectIE(TikTokBaseListIE): + IE_NAME = 'tiktok:effect' + _VALID_URL = r'https?://(?:www\.)?tiktok\.com/sticker/[\w\.-]+-(?P<id>[\d]+)[/?#&]?' 
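# TikTokSoundIE, TikTokEffectIE and TikTokTagIE all reuse the cursor walk from
# TikTokBaseListIE._entries above: fetch a page, retry a bounded number of times when
# the API answers with an empty or invalid JSON body, yield the page's videos, then
# continue from the returned cursor until 'has_more' goes false. A condensed
# standalone sketch of that pattern; fetch_page and max_retries are hypothetical
# stand-ins, and ValueError stands in for json.JSONDecodeError (its subclass):
import itertools

def paginate(fetch_page, max_retries=3):
    cursor = 0
    while True:
        for attempt in itertools.count():
            try:
                data = fetch_page(cursor)  # may raise ValueError on a bad response body
            except ValueError:
                if attempt < max_retries:
                    continue  # transient empty/invalid body: retry the same page
                raise
            break
        yield from data.get('aweme_list', [])
        if not data.get('has_more'):
            return
        cursor = data['cursor']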
+ _QUERY_NAME = 'sticker_id' + _API_ENDPOINT = 'sticker/aweme' + _TESTS = [{ + 'url': 'https://www.tiktok.com/sticker/MATERIAL-GWOOORL-1258156', + 'playlist_mincount': 100, + 'info_dict': { + 'id': '1258156', + }, + 'expected_warnings': ['Retrying'] + }, { + # Different entries between mobile and web, depending on region + 'url': 'https://www.tiktok.com/sticker/Elf-Friend-479565', + 'only_matching': True + }] + + +class TikTokTagIE(TikTokBaseListIE): + IE_NAME = 'tiktok:tag' + _VALID_URL = r'https?://(?:www\.)?tiktok\.com/tag/(?P<id>[^/?#&]+)' + _QUERY_NAME = 'ch_id' + _API_ENDPOINT = 'challenge/aweme' + _TESTS = [{ + 'url': 'https://tiktok.com/tag/hello2018', + 'playlist_mincount': 39, + 'info_dict': { + 'id': '46294678', + 'title': 'hello2018', + }, + 'expected_warnings': ['Retrying'] + }, { + 'url': 'https://tiktok.com/tag/fypシ?is_copy_url=0&is_from_webapp=v1', + 'only_matching': True + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id, headers={ + 'User-Agent': 'facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)' + }) + tag_id = self._html_search_regex(r'snssdk\d*://challenge/detail/(\d+)', webpage, 'tag ID') + return self.playlist_result(self._entries(tag_id, display_id), tag_id, display_id) + + class DouyinIE(TikTokIE): _VALID_URL = r'https?://(?:www\.)?douyin\.com/video/(?P<id>[0-9]+)' _TESTS = [{ diff --git a/yt_dlp/extractor/voicy.py b/yt_dlp/extractor/voicy.py index 11ebe76e1..37c7d5685 100644 --- a/yt_dlp/extractor/voicy.py +++ b/yt_dlp/extractor/voicy.py @@ -6,9 +6,10 @@ from ..compat import compat_str from ..utils import ( ExtractorError, smuggle_url, + str_or_none, traverse_obj, - unsmuggle_url, unified_strdate, + unsmuggle_url, ) import itertools @@ -25,9 +26,9 @@ class VoicyBaseIE(InfoExtractor): 'id': voice_id, 'title': compat_str(value.get('PlaylistName')), 'uploader': value.get('SpeakerName'), - 'uploader_id': compat_str(value.get('SpeakerId')), + 'uploader_id': str_or_none(value.get('SpeakerId')), 'channel': value.get('ChannelName'), - 'channel_id': compat_str(value.get('ChannelId')), + 'channel_id': str_or_none(value.get('ChannelId')), 'upload_date': upload_date, } diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 1f5009399..852fbd78e 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -668,6 +668,30 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return text @staticmethod + def _extract_thumbnails(data, *path_list): + """ + Extract thumbnails from thumbnails dict + @param path_list: path list to level that contains 'thumbnails' key + """ + thumbnails = [] + for path in path_list or [()]: + for thumbnail in traverse_obj(data, (*variadic(path), 'thumbnails', ...), default=[]): + thumbnail_url = url_or_none(thumbnail.get('url')) + if not thumbnail_url: + continue + # Sometimes youtube gives a wrong thumbnail URL. 
See: + # https://github.com/yt-dlp/yt-dlp/issues/233 + # https://github.com/ytdl-org/youtube-dl/issues/28023 + if 'maxresdefault' in thumbnail_url: + thumbnail_url = thumbnail_url.split('?')[0] + thumbnails.append({ + 'url': thumbnail_url, + 'height': int_or_none(thumbnail.get('height')), + 'width': int_or_none(thumbnail.get('width')), + }) + return thumbnails + + @staticmethod def extract_relative_time(relative_time_text): """ Extracts a relative time from string and converts to dt object @@ -783,6 +807,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor): overlay_style = traverse_obj( renderer, ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'style'), get_all=False, expected_type=str) badges = self._extract_badges(renderer) + thumbnails = self._extract_thumbnails(renderer, 'thumbnail') + return { '_type': 'url', 'ie_key': YoutubeIE.ie_key(), @@ -794,6 +820,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): 'view_count': view_count, 'uploader': uploader, 'channel_id': channel_id, + 'thumbnails': thumbnails, 'upload_date': strftime_or_none(timestamp, '%Y%m%d'), 'live_status': ('is_upcoming' if scheduled_timestamp is not None else 'was_live' if 'streamed' in time_text.lower() @@ -1750,16 +1777,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self._player_cache = {} def _prepare_live_from_start_formats(self, formats, video_id, live_start_time, url, webpage_url, smuggled_data): - EXPIRATION_DURATION = 18_000 lock = threading.Lock() is_live = True - expiration_time = time.time() + EXPIRATION_DURATION + start_time = time.time() formats = [f for f in formats if f.get('is_from_start')] - def refetch_manifest(format_id): - nonlocal formats, expiration_time, is_live - if time.time() <= expiration_time: + def refetch_manifest(format_id, delay): + nonlocal formats, start_time, is_live + if time.time() <= start_time + delay: return _, _, prs, player_url = self._download_player_responses(url, smuggled_data, video_id, webpage_url) @@ -1769,19 +1795,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor): prs, (..., 'microformat', 'playerMicroformatRenderer'), expected_type=dict, default=[]) _, is_live, _, formats = self._list_formats(video_id, microformats, video_details, prs, player_url) - expiration_time = time.time() + EXPIRATION_DURATION + start_time = time.time() - def mpd_feed(format_id): + def mpd_feed(format_id, delay): """ @returns (manifest_url, manifest_stream_number, is_live) or None """ with lock: - refetch_manifest(format_id) + refetch_manifest(format_id, delay) f = next((f for f in formats if f['format_id'] == format_id), None) if not f: - self.report_warning( - f'Cannot find refreshed manifest for format {format_id}{bug_reports_message()}') + if not is_live: + self.to_screen(f'{video_id}: Video is no longer live') + else: + self.report_warning( + f'Cannot find refreshed manifest for format {format_id}{bug_reports_message()}') return None return f['manifest_url'], f['manifest_stream_number'], is_live @@ -1812,9 +1841,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor): nonlocal mpd_url, stream_number, is_live, no_fragment_score, fragments, fragment_base_url # Obtain from MPD's maximum seq value old_mpd_url = mpd_url - mpd_url, stream_number, is_live = mpd_feed(format_id) or (mpd_url, stream_number, False) - if old_mpd_url == mpd_url and not refresh_sequence: - return True, last_seq + last_error = ctx.pop('last_error', None) + expire_fast = last_error and isinstance(last_error, compat_HTTPError) and last_error.code == 403 + mpd_url, stream_number, is_live = (mpd_feed(format_id, 5 if 
expire_fast else 18000) + or (mpd_url, stream_number, False)) + if not refresh_sequence: + if expire_fast and not is_live: + return False, last_seq + elif old_mpd_url == mpd_url: + return True, last_seq try: fmts, _ = self._extract_mpd_formats_and_subtitles( mpd_url, None, note=False, errnote=False, fatal=False) @@ -1848,8 +1883,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): last_segment_url = None continue else: - should_retry, last_seq = _extract_sequence_from_mpd(True) - if not should_retry: + should_continue, last_seq = _extract_sequence_from_mpd(True) + if not should_continue: continue if known_idx > last_seq: @@ -1866,9 +1901,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): try: for idx in range(known_idx, last_seq): # do not update sequence here or you'll get skipped some part of it - should_retry, _ = _extract_sequence_from_mpd(False) - if not should_retry: - # retry when it gets weird state + should_continue, _ = _extract_sequence_from_mpd(False) + if not should_continue: known_idx = idx - 1 raise ExtractorError('breaking out of outer loop') last_segment_url = urljoin(fragment_base_url, 'sq/%d' % idx) @@ -2903,25 +2937,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if f.get('vcodec') != 'none': f['stretched_ratio'] = ratio break - - thumbnails = [] - thumbnail_dicts = traverse_obj( - (video_details, microformats), (..., ..., 'thumbnail', 'thumbnails', ...), - expected_type=dict, default=[]) - for thumbnail in thumbnail_dicts: - thumbnail_url = thumbnail.get('url') - if not thumbnail_url: - continue - # Sometimes youtube gives a wrong thumbnail URL. See: - # https://github.com/yt-dlp/yt-dlp/issues/233 - # https://github.com/ytdl-org/youtube-dl/issues/28023 - if 'maxresdefault' in thumbnail_url: - thumbnail_url = thumbnail_url.split('?')[0] - thumbnails.append({ - 'url': thumbnail_url, - 'height': int_or_none(thumbnail.get('height')), - 'width': int_or_none(thumbnail.get('width')), - }) + thumbnails = self._extract_thumbnails((video_details, microformats), (..., ..., 'thumbnail')) thumbnail_url = search_meta(['og:image', 'twitter:image']) if thumbnail_url: thumbnails.append({ @@ -3584,7 +3600,6 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): def _extract_from_tabs(self, item_id, ytcfg, data, tabs): playlist_id = title = description = channel_url = channel_name = channel_id = None - thumbnails_list = [] tags = [] selected_tab = self._extract_selected_tab(tabs) @@ -3603,26 +3618,13 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): description = renderer.get('description', '') playlist_id = channel_id tags = renderer.get('keywords', '').split() - thumbnails_list = ( - try_get(renderer, lambda x: x['avatar']['thumbnails'], list) - or try_get( - self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer'), - lambda x: x['thumbnailRenderer']['playlistVideoThumbnailRenderer']['thumbnail']['thumbnails'], - list) - or []) - thumbnails = [] - for t in thumbnails_list: - if not isinstance(t, dict): - continue - thumbnail_url = url_or_none(t.get('url')) - if not thumbnail_url: - continue - thumbnails.append({ - 'url': thumbnail_url, - 'width': int_or_none(t.get('width')), - 'height': int_or_none(t.get('height')), - }) + thumbnails = ( + self._extract_thumbnails(renderer, 'avatar') + or self._extract_thumbnails( + self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer'), + ('thumbnailRenderer', 'playlistVideoThumbnailRenderer', 'thumbnail'))) + if playlist_id is None: playlist_id = item_id if title is None: diff --git 
a/yt_dlp/extractor/zee5.py b/yt_dlp/extractor/zee5.py index 5a5eebd30..a3a705bdd 100644 --- a/yt_dlp/extractor/zee5.py +++ b/yt_dlp/extractor/zee5.py @@ -23,7 +23,7 @@ class Zee5IE(InfoExtractor): zee5:| https?://(?:www\.)?zee5\.com/(?:[^#?]+/)? (?: - (?:tvshows|kids|zee5originals)(?:/[^#/?]+){3} + (?:tv-shows|kids|zee5originals)(?:/[^#/?]+){3} |movies/[^#/?]+ )/(?P<display_id>[^#/?]+)/ ) @@ -37,48 +37,50 @@ class Zee5IE(InfoExtractor): 'display_id': 'krishna-the-birth', 'title': 'Krishna - The Birth', 'duration': 4368, - 'average_rating': 4, 'description': compat_str, 'alt_title': 'Krishna - The Birth', 'uploader': 'Zee Entertainment Enterprises Ltd', 'release_date': '20060101', 'upload_date': '20060101', 'timestamp': 1136073600, - 'thumbnail': 'https://akamaividz.zee5.com/resources/0-0-63098/list/270x152/0063098_list_80888170.jpg', + 'thumbnail': r're:^https?://.*\.jpg$', + 'episode_number': 0, + 'episode': 'Episode 0', 'tags': list }, 'params': { 'format': 'bv', }, }, { - 'url': 'https://zee5.com/tvshows/details/krishna-balram/0-6-1871/episode-1-the-test-of-bramha/0-1-233402', + 'url': 'https://www.zee5.com/kids/kids-shows/bandbudh-aur-budbak/0-6-1899/yoga-se-hoga-bandbudh-aur-budbak/0-1-239839', 'info_dict': { - 'id': '0-1-233402', + 'id': '0-1-239839', 'ext': 'mp4', - 'display_id': 'episode-1-the-test-of-bramha', - 'title': 'Episode 1 - The Test Of Bramha', - 'duration': 1336, - 'average_rating': 4, + 'display_id': 'yoga-se-hoga-bandbudh-aur-budbak', + 'title': 'Yoga Se Hoga-Bandbudh aur Budbak', + 'duration': 659, 'description': compat_str, - 'alt_title': 'Episode 1 - The Test Of Bramha', + 'alt_title': 'Yoga Se Hoga-Bandbudh aur Budbak', 'uploader': 'Zee Entertainment Enterprises Ltd', - 'release_date': '20090101', - 'upload_date': '20090101', - 'timestamp': 1230768000, - 'thumbnail': 'https://akamaividz.zee5.com/resources/0-1-233402/list/270x152/01233402_list.jpg', - 'series': 'Krishna Balram', + 'release_date': '20150101', + 'upload_date': '20150101', + 'timestamp': 1420070400, + 'thumbnail': r're:^https?://.*\.jpg$', + 'series': 'Bandbudh Aur Budbak', 'season_number': 1, 'episode_number': 1, + 'episode': 'Episode 1', + 'season': 'Season 1', 'tags': list, }, 'params': { 'format': 'bv', }, }, { - 'url': 'https://www.zee5.com/hi/tvshows/details/kundali-bhagya/0-6-366/kundali-bhagya-march-08-2021/0-1-manual_7g9jv1os7730?country=IN', + 'url': 'https://www.zee5.com/hi/tv-shows/details/kundali-bhagya/0-6-366/kundali-bhagya-march-08-2021/0-1-manual_7g9jv1os7730?country=IN', 'only_matching': True }, { - 'url': 'https://www.zee5.com/global/hi/tvshows/details/kundali-bhagya/0-6-366/kundali-bhagya-march-08-2021/0-1-manual_7g9jv1os7730', + 'url': 'https://www.zee5.com/global/hi/tv-shows/details/kundali-bhagya/0-6-366/kundali-bhagya-march-08-2021/0-1-manual_7g9jv1os7730', 'only_matching': True }] _DETAIL_API_URL = 'https://spapi.zee5.com/singlePlayback/getDetails?content_id={}&device_id={}&platform_name=desktop_web&country=IN&check_parental_control=false' @@ -153,7 +155,6 @@ class Zee5IE(InfoExtractor): 'formats': formats, 'subtitles': subtitles, 'duration': int_or_none(asset_data.get('duration')), - 'average_rating': int_or_none(asset_data.get('rating')), 'description': str_or_none(asset_data.get('description')), 'alt_title': str_or_none(asset_data.get('original_title')), 'uploader': str_or_none(asset_data.get('content_owner')), @@ -175,42 +176,42 @@ class Zee5SeriesIE(InfoExtractor): (?: zee5:series:| https?://(?:www\.)?zee5\.com/(?:[^#?]+/)? 
- (?:tvshows|kids|zee5originals)(?:/[^#/?]+){2}/ + (?:tv-shows|kids|zee5originals)(?:/[^#/?]+){2}/ ) (?P<id>[^#/?]+)(?:/episodes)?/?(?:$|[?#]) ''' _TESTS = [{ - 'url': 'https://www.zee5.com/kids/kids-shows/krishna-balram/0-6-1871', - 'playlist_mincount': 43, + 'url': 'https://www.zee5.com/kids/kids-shows/bandbudh-aur-budbak/0-6-1899', + 'playlist_mincount': 156, 'info_dict': { - 'id': '0-6-1871', + 'id': '0-6-1899', }, }, { - 'url': 'https://www.zee5.com/tvshows/details/bhabi-ji-ghar-par-hai/0-6-199', + 'url': 'https://www.zee5.com/tv-shows/details/bhabi-ji-ghar-par-hai/0-6-199', 'playlist_mincount': 1500, 'info_dict': { 'id': '0-6-199', }, }, { - 'url': 'https://www.zee5.com/tvshows/details/agent-raghav-crime-branch/0-6-965', + 'url': 'https://www.zee5.com/tv-shows/details/agent-raghav-crime-branch/0-6-965', 'playlist_mincount': 24, 'info_dict': { 'id': '0-6-965', }, }, { - 'url': 'https://www.zee5.com/ta/tvshows/details/nagabhairavi/0-6-3201', + 'url': 'https://www.zee5.com/ta/tv-shows/details/nagabhairavi/0-6-3201', 'playlist_mincount': 3, 'info_dict': { 'id': '0-6-3201', }, }, { - 'url': 'https://www.zee5.com/global/hi/tvshows/details/khwaabon-ki-zamin-par/0-6-270', + 'url': 'https://www.zee5.com/global/hi/tv-shows/details/khwaabon-ki-zamin-par/0-6-270', 'playlist_mincount': 150, 'info_dict': { 'id': '0-6-270', }, }, { - 'url': 'https://www.zee5.com/tvshows/details/chala-hawa-yeu-dya-ladies-zindabaad/0-6-2943/episodes', + 'url': 'https://www.zee5.com/tv-shows/details/chala-hawa-yeu-dya-ladies-zindabaad/0-6-2943/episodes', 'only_matching': True, }] diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 168821a68..971c51515 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -20,7 +20,7 @@ from .utils import ( remove_end, write_string, ) -from .cookies import SUPPORTED_BROWSERS +from .cookies import SUPPORTED_BROWSERS, SUPPORTED_KEYRINGS from .version import __version__ from .downloader.external import list_external_downloaders @@ -664,7 +664,7 @@ def parseOpts(overrideArguments=None): downloader.add_option( '-N', '--concurrent-fragments', dest='concurrent_fragment_downloads', metavar='N', default=1, type=int, - help='Number of fragments of a dash/hlsnative video that should be download concurrently (default is %default)') + help='Number of fragments of a dash/hlsnative video that should be downloaded concurrently (default is %default)') downloader.add_option( '-r', '--limit-rate', '--rate-limit', dest='ratelimit', metavar='RATE', @@ -678,6 +678,10 @@ def parseOpts(overrideArguments=None): dest='retries', metavar='RETRIES', default=10, help='Number of retries (default is %default), or "infinite"') downloader.add_option( + '--file-access-retries', + dest='file_access_retries', metavar='RETRIES', default=10, + help='Number of times to retry on file access error (default is %default), or "infinite"') + downloader.add_option( '--fragment-retries', dest='fragment_retries', metavar='RETRIES', default=10, help='Number of retries for a fragment (default is %default), or "infinite" (DASH, hlsnative and ISM)') @@ -1015,7 +1019,7 @@ def parseOpts(overrideArguments=None): }, help=( 'The paths where the files should be downloaded. ' 'Specify the type of file and the path separated by a colon ":". ' - 'All the same types as --output are supported. ' + 'All the same TYPES as --output are supported. ' 'Additionally, you can also provide "home" (default) and "temp" paths. 
' 'All intermediary files are first downloaded to the temp path and ' 'then the final files are moved over to the home path after download is finished. ' @@ -1166,14 +1170,15 @@ def parseOpts(overrideArguments=None): help='Do not read/dump cookies from/to file (default)') filesystem.add_option( '--cookies-from-browser', - dest='cookiesfrombrowser', metavar='BROWSER[:PROFILE]', + dest='cookiesfrombrowser', metavar='BROWSER[+KEYRING][:PROFILE]', help=( - 'Load cookies from a user profile of the given web browser. ' - 'Currently supported browsers are: {}. ' - 'You can specify the user profile name or directory using ' - '"BROWSER:PROFILE_NAME" or "BROWSER:PROFILE_PATH". ' - 'If no profile is given, the most recently accessed one is used'.format( - ', '.join(sorted(SUPPORTED_BROWSERS))))) + 'The name of the browser and (optionally) the name/path of ' + 'the profile to load cookies from, separated by a ":". ' + f'Currently supported browsers are: {", ".join(sorted(SUPPORTED_BROWSERS))}. ' + 'By default, the most recently accessed profile is used. ' + 'The keyring used for decrypting Chromium cookies on Linux can be ' + '(optionally) specified after the browser name separated by a "+". ' + f'Currently supported keyrings are: {", ".join(map(str.lower, sorted(SUPPORTED_KEYRINGS)))}')) filesystem.add_option( '--no-cookies-from-browser', action='store_const', const=None, dest='cookiesfrombrowser', diff --git a/yt_dlp/postprocessor/embedthumbnail.py b/yt_dlp/postprocessor/embedthumbnail.py index 918d3e788..e199a1cdd 100644 --- a/yt_dlp/postprocessor/embedthumbnail.py +++ b/yt_dlp/postprocessor/embedthumbnail.py @@ -145,8 +145,43 @@ class EmbedThumbnailPP(FFmpegPostProcessor): self.report_warning('unable to embed using mutagen; %s' % error_to_compat_str(err)) success = False - # Method 2: Use ffmpeg+ffprobe - if not success and not prefer_atomicparsley: + # Method 2: Use AtomicParsley + if not success: + success = True + atomicparsley = next(( + x for x in ['AtomicParsley', 'atomicparsley'] + if check_executable(x, ['-v'])), None) + if atomicparsley is None: + self.to_screen('Neither mutagen nor AtomicParsley was found. Falling back to ffmpeg') + success = False + else: + if not prefer_atomicparsley: + self.to_screen('mutagen was not found. 
Falling back to AtomicParsley') + cmd = [encodeFilename(atomicparsley, True), + encodeFilename(filename, True), + encodeArgument('--artwork'), + encodeFilename(thumbnail_filename, True), + encodeArgument('-o'), + encodeFilename(temp_filename, True)] + cmd += [encodeArgument(o) for o in self._configuration_args('AtomicParsley')] + + self._report_run('atomicparsley', filename) + self.write_debug('AtomicParsley command line: %s' % shell_quote(cmd)) + p = Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout, stderr = p.communicate_or_kill() + if p.returncode != 0: + msg = stderr.decode('utf-8', 'replace').strip() + self.report_warning(f'Unable to embed thumbnails using AtomicParsley; {msg}') + # for formats that don't support thumbnails (like 3gp) AtomicParsley + # won't create the temporary file + if b'No changes' in stdout: + self.report_warning('The file format doesn\'t support embedding a thumbnail') + success = False + + # Method 3: Use ffmpeg+ffprobe + # Thumbnails attached using this method don't show up as cover in some cases + # See https://github.com/yt-dlp/yt-dlp/issues/2125, https://github.com/yt-dlp/yt-dlp/issues/411 + if not success: success = True try: options = ['-c', 'copy', '-map', '0', '-dn', '-map', '1'] @@ -161,38 +196,8 @@ class EmbedThumbnailPP(FFmpegPostProcessor): self._report_run('ffmpeg', filename) self.run_ffmpeg_multiple_files([filename, thumbnail_filename], temp_filename, options) except PostProcessingError as err: - self.report_warning('unable to embed using ffprobe & ffmpeg; %s' % error_to_compat_str(err)) - success = False - - # Method 3: Use AtomicParsley - if not success: - success = True - atomicparsley = next(( - x for x in ['AtomicParsley', 'atomicparsley'] - if check_executable(x, ['-v'])), None) - if atomicparsley is None: - raise EmbedThumbnailPPError('AtomicParsley was not found. 
Please install') - - cmd = [encodeFilename(atomicparsley, True), - encodeFilename(filename, True), - encodeArgument('--artwork'), - encodeFilename(thumbnail_filename, True), - encodeArgument('-o'), - encodeFilename(temp_filename, True)] - cmd += [encodeArgument(o) for o in self._configuration_args('AtomicParsley')] - - self._report_run('atomicparsley', filename) - self.write_debug('AtomicParsley command line: %s' % shell_quote(cmd)) - p = Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - stdout, stderr = p.communicate_or_kill() - if p.returncode != 0: - msg = stderr.decode('utf-8', 'replace').strip() - raise EmbedThumbnailPPError(msg) - # for formats that don't support thumbnails (like 3gp) AtomicParsley - # won't create to the temporary file - if b'No changes' in stdout: - self.report_warning('The file format doesn\'t support embedding a thumbnail') success = False + raise EmbedThumbnailPPError(f'Unable to embed using ffprobe & ffmpeg; {err}') elif info['ext'] in ['ogg', 'opus', 'flac']: if not has_mutagen: diff --git a/yt_dlp/postprocessor/ffmpeg.py b/yt_dlp/postprocessor/ffmpeg.py index 594762974..96b48ded5 100644 --- a/yt_dlp/postprocessor/ffmpeg.py +++ b/yt_dlp/postprocessor/ffmpeg.py @@ -296,8 +296,8 @@ class FFmpegPostProcessor(PostProcessor): keys = ['_%s%d' % (name, number), '_%s' % name] if name == 'o': args += ['-movflags', '+faststart'] - elif number == 1: - keys.append('') + if number == 1: + keys.append('') args += self._configuration_args(self.basename, keys) if name == 'i': args.append('-i') @@ -1092,12 +1092,12 @@ class FFmpegThumbnailsConvertorPP(FFmpegPostProcessor): files_to_delete = [] has_thumbnail = False - for idx, thumbnail_dict in enumerate(info['thumbnails']): - if 'filepath' not in thumbnail_dict: + for idx, thumbnail_dict in enumerate(info.get('thumbnails') or []): + original_thumbnail = thumbnail_dict.get('filepath') + if not original_thumbnail: continue has_thumbnail = True self.fixup_webp(info, idx) - original_thumbnail = thumbnail_dict['filepath'] _, thumbnail_ext = os.path.splitext(original_thumbnail) if thumbnail_ext: thumbnail_ext = thumbnail_ext[1:].lower() diff --git a/yt_dlp/postprocessor/metadataparser.py b/yt_dlp/postprocessor/metadataparser.py index 807cd305d..646659e75 100644 --- a/yt_dlp/postprocessor/metadataparser.py +++ b/yt_dlp/postprocessor/metadataparser.py @@ -99,7 +99,7 @@ class MetadataParserPP(PostProcessor): class MetadataFromFieldPP(MetadataParserPP): @classmethod def to_action(cls, f): - match = re.match(r'(?P<in>.*?)(?<!\\):(?P<out>.+)$', f) + match = re.match(r'(?s)(?P<in>.*?)(?<!\\):(?P<out>.+)$', f) if match is None: raise ValueError(f'it should be FROM:TO, not {f!r}') return ( diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index d34e5b545..0c3c6c401 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -210,6 +210,7 @@ DATE_FORMATS = ( '%Y/%m/%d %H:%M:%S', '%Y%m%d%H%M', '%Y%m%d%H%M%S', + '%Y%m%d', '%Y-%m-%d %H:%M', '%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M:%S.%f', @@ -304,7 +305,7 @@ def write_json_file(obj, fn): try: with tf: - json.dump(obj, tf) + json.dump(obj, tf, ensure_ascii=False) if sys.platform == 'win32': # Need to remove existing file on Windows, else os.rename raises # WindowsError or FileExistsError. 
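The `ensure_ascii=False` change to `write_json_file` above makes info-json output keep non-ASCII characters literal rather than as `\uXXXX` escapes. A minimal sketch of the difference using plain `json.dumps` (the real helper also handles atomic renames, omitted here):

    import json

    info = {'title': 'København'}
    # default: non-ASCII is escaped, ASCII-safe but hard to read
    print(json.dumps(info))                      # {"title": "K\u00f8benhavn"}
    # new behaviour: characters are written as-is (file is UTF-8)
    print(json.dumps(info, ensure_ascii=False))  # {"title": "København"}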
@@ -1862,7 +1863,6 @@ def _windows_write_string(s, out):
False if it has yet to be written out."""
# Adapted from http://stackoverflow.com/a/3259271/35070
- import ctypes
import ctypes.wintypes
WIN_OUTPUT_IDS = {
@@ -2110,18 +2110,19 @@ def unsmuggle_url(smug_url, default=None):
return url, data
+def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
+ """ Formats numbers with decimal suffixes like K, M, etc. """
+ num, factor = float_or_none(num), float(factor)
+ if num is None:
+ return None
+ exponent = 0 if num == 0 else int(math.log(num, factor))
+ suffix = ['', *'KMGTPEZY'][exponent]
+ converted = num / (factor ** exponent)
+ return fmt % (converted, f'{suffix}i' if suffix and factor == 1024 else suffix)
+
+
def format_bytes(bytes):
- if bytes is None:
- return 'N/A'
- if type(bytes) is str:
- bytes = float(bytes)
- if bytes == 0.0:
- exponent = 0
- else:
- exponent = int(math.log(bytes, 1024.0))
- suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
- converted = float(bytes) / float(1024 ** exponent)
- return '%.2f%s' % (converted, suffix)
+ return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
def lookup_unit_table(unit_table, s):
@@ -2210,7 +2211,7 @@ def parse_count(s):
if s is None:
return None
- s = s.strip()
+ s = re.sub(r'^[^\d]+\s', '', s).strip()
if re.match(r'^[\d,.]+$', s):
return str_to_int(s)
@@ -2222,9 +2223,17 @@
'M': 1000 ** 2,
'kk': 1000 ** 2,
'KK': 1000 ** 2,
+ 'b': 1000 ** 3,
+ 'B': 1000 ** 3,
}
- return lookup_unit_table(_UNIT_TABLE, s)
+ ret = lookup_unit_table(_UNIT_TABLE, s)
+ if ret is not None:
+ return ret
+
+ mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
+ if mobj:
+ return str_to_int(mobj.group(1))
def parse_resolution(s):
@@ -3192,30 +3201,29 @@ def parse_codecs(codecs_str):
if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
if not vcodec:
- vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1') else full_codec
+ vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
if codec in ('dvh1', 'dvhe'):
hdr = 'DV'
elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
hdr = 'HDR10'
elif full_codec.replace('0', '').startswith('vp9.2'):
hdr = 'HDR10'
- elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
+ elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
if not acodec:
acodec = full_codec
else:
write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
- if not vcodec and not acodec:
- if len(split_codecs) == 2:
- return {
- 'vcodec': split_codecs[0],
- 'acodec': split_codecs[1],
- }
- else:
+ if vcodec or acodec:
return {
'vcodec': vcodec or 'none',
'acodec': acodec or 'none',
'dynamic_range': hdr,
}
+ elif len(split_codecs) == 2:
+ return {
+ 'vcodec': split_codecs[0],
+ 'acodec': split_codecs[1],
+ }
return {}
@@ -5024,7 +5032,7 @@ def traverse_dict(dictn, keys, casesense=True):
return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
-def variadic(x, allowed_types=(str, bytes)):
+def variadic(x, allowed_types=(str, bytes, dict)):
return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
diff --git a/yt_dlp/version.py b/yt_dlp/version.py
index 8c07d099e..7b5732595 100644
--- a/yt_dlp/version.py
+++ b/yt_dlp/version.py
@@ -1,5 +1,5 @@
# Autogenerated by devscripts/update-version.py
-__version__ = '2021.12.01'
+__version__ = '2021.12.27'
-RELEASE_GIT_HEAD = '91f071af6'
+RELEASE_GIT_HEAD = '6223f67a8'
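The extended `--cookies-from-browser` syntax documented in the options.py change above takes an optional keyring after a `+` and an optional profile after a `:`. A sketch of invocations, shown with the upstream `yt-dlp` binary name; the profile name is hypothetical, and the exact keyring list should be checked against `--help`:

    # most recently accessed Firefox profile
    yt-dlp --cookies-from-browser firefox URL
    # a named Chromium profile (hypothetical name "Profile 1")
    yt-dlp --cookies-from-browser "chromium:Profile 1" URL
    # pick the keyring used to decrypt Chromium cookies on Linux
    yt-dlp --cookies-from-browser chromium+gnomekeyring URL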
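The `(?s)` flag added in the metadataparser.py hunk makes `.` also match newlines (`re.DOTALL`), so a `FROM:TO` interpreter spec whose FROM side spans lines now parses instead of failing. A minimal sketch using just the regex, outside the class:

    import re

    # same pattern as the new MetadataFromFieldPP.to_action
    pattern = r'(?s)(?P<in>.*?)(?<!\\):(?P<out>.+)$'
    spec = '%(series)s\n%(title)s:%(meta_comment)s'

    # without (?s), '.' cannot cross the newline and re.match returns None
    match = re.match(pattern, spec)
    print(match.group('in'))   # '%(series)s\n%(title)s'
    print(match.group('out'))  # '%(meta_comment)s'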
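The new `format_decimal_suffix` helper in the utils.py hunk generalises `format_bytes` to arbitrary formats and factors. Assuming this build of `yt_dlp` is on the import path, the behaviour implied by the definition is:

    from yt_dlp.utils import format_bytes, format_decimal_suffix

    print(format_decimal_suffix(1500))               # '1K' (default '%d%s', factor 1000)
    print(format_decimal_suffix(2500000, '%.1f%s'))  # '2.5M'
    print(format_bytes(2048))                        # '2.00KiB' (factor 1024 appends the 'i')
    print(format_bytes(None))                        # 'N/A' (None falls through to the `or`)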
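The `parse_count` changes in the same file add billion (`b`/`B`) units, strip a leading non-numeric label, and fall back to the leading number when the trailing word is not a known unit; roughly:

    from yt_dlp.utils import parse_count

    print(parse_count('1.5M'))          # 1500000
    print(parse_count('2B'))            # 2000000000, via the new 'B' entry
    print(parse_count('Views: 1,234'))  # 1234, leading label stripped
    print(parse_count('1,234 views'))   # 1234, fallback on the leading number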
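Finally, the `parse_codecs` hunk truncates `hvc1` codec strings to four parts (as already done for `vp9`/`av1`) and recognises `flac` as an audio codec; expected results per the code above:

    from yt_dlp.utils import parse_codecs

    print(parse_codecs('hvc1.2.4.L153.B0,mp4a.40.2'))
    # {'vcodec': 'hvc1.2.4.L153', 'acodec': 'mp4a.40.2', 'dynamic_range': None}
    print(parse_codecs('flac'))
    # {'vcodec': 'none', 'acodec': 'flac', 'dynamic_range': None}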