author      Jesús <heckyel@hyperbola.info>  2022-02-05 10:48:13 -0500
committer   Jesús <heckyel@hyperbola.info>  2022-02-05 10:48:13 -0500
commit      c4b763b19f54ed5dfc2fd408adb9ed74126f6740 (patch)
tree        1bbf4450644370608f97bf6d4d7db818c5039f55
parent      5aac4e0267e32d98eb68692afedafda3b41ea629 (diff)
parent      a3125791c7a5cdf2c8c025b99788bf686edd1a8a (diff)
download    hypervideo-pre-c4b763b19f54ed5dfc2fd408adb9ed74126f6740.tar.lz
            hypervideo-pre-c4b763b19f54ed5dfc2fd408adb9ed74126f6740.tar.xz
            hypervideo-pre-c4b763b19f54ed5dfc2fd408adb9ed74126f6740.zip
updated from upstream | 05/02/2022 at 10:48
-rw-r--r--  CONTRIBUTORS | 18
-rw-r--r--  Changelog.md | 187
-rw-r--r--  supportedsites.md | 79
-rw-r--r--  test/helper.py | 10
-rw-r--r--  test/test_InfoExtractor.py | 85
-rw-r--r--  test/test_YoutubeDL.py | 10
-rwxr-xr-x  test/test_download.py | 2
-rw-r--r--  test/test_options.py | 26
-rw-r--r--  test/test_subtitles.py | 4
-rw-r--r--  test/test_utils.py | 121
-rw-r--r--  test/test_verbose_output.py | 16
-rw-r--r--  test/test_youtube_lists.py | 30
-rw-r--r--  test/test_youtube_signature.py | 13
-rw-r--r--  yt_dlp/YoutubeDL.py | 540
-rw-r--r--  yt_dlp/__init__.py | 45
-rw-r--r--  yt_dlp/aes.py | 18
-rw-r--r--  yt_dlp/compat.py | 7
-rw-r--r--  yt_dlp/cookies.py | 16
-rw-r--r--  yt_dlp/downloader/external.py | 13
-rw-r--r--  yt_dlp/downloader/fragment.py | 5
-rw-r--r--  yt_dlp/downloader/websocket.py | 7
-rw-r--r--  yt_dlp/extractor/abc.py | 7
-rw-r--r--  yt_dlp/extractor/adn.py | 16
-rw-r--r--  yt_dlp/extractor/afreecatv.py | 97
-rw-r--r--  yt_dlp/extractor/aparat.py | 15
-rw-r--r--  yt_dlp/extractor/archiveorg.py | 74
-rw-r--r--  yt_dlp/extractor/ard.py | 15
-rw-r--r--  yt_dlp/extractor/arnes.py | 3
-rw-r--r--  yt_dlp/extractor/awaan.py | 3
-rw-r--r--  yt_dlp/extractor/bilibili.py | 253
-rw-r--r--  yt_dlp/extractor/callin.py | 114
-rw-r--r--  yt_dlp/extractor/cam4.py | 3
-rw-r--r--  yt_dlp/extractor/canalalpha.py | 4
-rw-r--r--  yt_dlp/extractor/canvas.py | 5
-rw-r--r--  yt_dlp/extractor/carambatv.py | 3
-rw-r--r--  yt_dlp/extractor/cctv.py | 3
-rw-r--r--  yt_dlp/extractor/ceskatelevize.py | 1
-rw-r--r--  yt_dlp/extractor/common.py | 127
-rw-r--r--  yt_dlp/extractor/crowdbunker.py | 113
-rw-r--r--  yt_dlp/extractor/crunchyroll.py | 195
-rw-r--r--  yt_dlp/extractor/ctvnews.py | 5
-rw-r--r--  yt_dlp/extractor/daftsex.py | 79
-rw-r--r--  yt_dlp/extractor/dailymotion.py | 4
-rw-r--r--  yt_dlp/extractor/daum.py | 5
-rw-r--r--  yt_dlp/extractor/digitalconcerthall.py | 143
-rw-r--r--  yt_dlp/extractor/dispeak.py | 3
-rw-r--r--  yt_dlp/extractor/doodstream.py | 37
-rw-r--r--  yt_dlp/extractor/dplay.py | 421
-rw-r--r--  yt_dlp/extractor/dropbox.py | 44
-rw-r--r--  yt_dlp/extractor/drtv.py | 16
-rw-r--r--  yt_dlp/extractor/ertgr.py | 316
-rw-r--r--  yt_dlp/extractor/europeantour.py | 37
-rw-r--r--  yt_dlp/extractor/extractors.py | 112
-rw-r--r--  yt_dlp/extractor/facebook.py | 105
-rw-r--r--  yt_dlp/extractor/fc2.py | 53
-rw-r--r--  yt_dlp/extractor/flickr.py | 3
-rw-r--r--  yt_dlp/extractor/fox.py | 39
-rw-r--r--  yt_dlp/extractor/fujitv.py | 62
-rw-r--r--  yt_dlp/extractor/funk.py | 2
-rw-r--r--  yt_dlp/extractor/gamejolt.py | 3
-rw-r--r--  yt_dlp/extractor/generic.py | 197
-rw-r--r--  yt_dlp/extractor/gfycat.py | 15
-rw-r--r--  yt_dlp/extractor/globo.py | 26
-rw-r--r--  yt_dlp/extractor/glomex.py | 220
-rw-r--r--  yt_dlp/extractor/googlesearch.py | 21
-rw-r--r--  yt_dlp/extractor/hotstar.py | 3
-rw-r--r--  yt_dlp/extractor/hrfensehen.py | 10
-rw-r--r--  yt_dlp/extractor/imggaming.py | 5
-rw-r--r--  yt_dlp/extractor/instagram.py | 154
-rw-r--r--  yt_dlp/extractor/itv.py | 4
-rw-r--r--  yt_dlp/extractor/joj.py | 3
-rw-r--r--  yt_dlp/extractor/kakao.py | 46
-rw-r--r--  yt_dlp/extractor/kaltura.py | 3
-rw-r--r--  yt_dlp/extractor/keezmovies.py | 3
-rw-r--r--  yt_dlp/extractor/kelbyone.py | 84
-rw-r--r--  yt_dlp/extractor/line.py | 110
-rw-r--r--  yt_dlp/extractor/litv.py | 23
-rw-r--r--  yt_dlp/extractor/lnkgo.py | 88
-rw-r--r--  yt_dlp/extractor/mainstreaming.py | 219
-rw-r--r--  yt_dlp/extractor/medaltv.py | 3
-rw-r--r--  yt_dlp/extractor/mediaset.py | 121
-rw-r--r--  yt_dlp/extractor/megatvcom.py | 173
-rw-r--r--  yt_dlp/extractor/mildom.py | 71
-rw-r--r--  yt_dlp/extractor/minds.py | 3
-rw-r--r--  yt_dlp/extractor/mixch.py | 32
-rw-r--r--  yt_dlp/extractor/mixcloud.py | 16
-rw-r--r--  yt_dlp/extractor/musicdex.py | 175
-rw-r--r--  yt_dlp/extractor/myspass.py | 63
-rw-r--r--  yt_dlp/extractor/nba.py | 12
-rw-r--r--  yt_dlp/extractor/nbc.py | 23
-rw-r--r--  yt_dlp/extractor/neteasemusic.py | 13
-rw-r--r--  yt_dlp/extractor/newstube.py | 10
-rw-r--r--  yt_dlp/extractor/newsy.py | 51
-rw-r--r--  yt_dlp/extractor/nexx.py | 143
-rw-r--r--  yt_dlp/extractor/nfb.py | 62
-rw-r--r--  yt_dlp/extractor/noodlemagazine.py | 67
-rw-r--r--  yt_dlp/extractor/novaplay.py | 4
-rw-r--r--  yt_dlp/extractor/odnoklassniki.py | 44
-rw-r--r--  yt_dlp/extractor/onet.py | 7
-rw-r--r--  yt_dlp/extractor/openrec.py | 104
-rw-r--r--  yt_dlp/extractor/orf.py | 225
-rw-r--r--  yt_dlp/extractor/pbs.py | 5
-rw-r--r--  yt_dlp/extractor/peertube.py | 4
-rw-r--r--  yt_dlp/extractor/pladform.py | 26
-rw-r--r--  yt_dlp/extractor/pokergo.py | 111
-rw-r--r--  yt_dlp/extractor/pornez.py | 43
-rw-r--r--  yt_dlp/extractor/pornhub.py | 13
-rw-r--r--  yt_dlp/extractor/prx.py | 431
-rw-r--r--  yt_dlp/extractor/radlive.py | 10
-rw-r--r--  yt_dlp/extractor/rai.py | 244
-rw-r--r--  yt_dlp/extractor/redbulltv.py | 3
-rw-r--r--  yt_dlp/extractor/rtl2.py | 16
-rw-r--r--  yt_dlp/extractor/rtnews.py | 199
-rw-r--r--  yt_dlp/extractor/rule34video.py | 65
-rw-r--r--  yt_dlp/extractor/shemaroome.py | 6
-rw-r--r--  yt_dlp/extractor/skyit.py | 5
-rw-r--r--  yt_dlp/extractor/sportdeutschland.py | 8
-rw-r--r--  yt_dlp/extractor/storyfire.py | 17
-rw-r--r--  yt_dlp/extractor/streamcz.py | 157
-rw-r--r--  yt_dlp/extractor/stv.py | 5
-rw-r--r--  yt_dlp/extractor/ted.py | 477
-rw-r--r--  yt_dlp/extractor/telemundo.py | 3
-rw-r--r--  yt_dlp/extractor/theta.py | 10
-rw-r--r--  yt_dlp/extractor/thisoldhouse.py | 17
-rw-r--r--  yt_dlp/extractor/tiktok.py | 182
-rw-r--r--  yt_dlp/extractor/trovo.py | 3
-rw-r--r--  yt_dlp/extractor/tumblr.py | 80
-rw-r--r--  yt_dlp/extractor/tver.py | 22
-rw-r--r--  yt_dlp/extractor/tvopengr.py | 143
-rw-r--r--  yt_dlp/extractor/twitcasting.py | 143
-rw-r--r--  yt_dlp/extractor/twitter.py | 8
-rw-r--r--  yt_dlp/extractor/veoh.py | 62
-rw-r--r--  yt_dlp/extractor/vidio.py | 5
-rw-r--r--  yt_dlp/extractor/vidlii.py | 3
-rw-r--r--  yt_dlp/extractor/viki.py | 9
-rw-r--r--  yt_dlp/extractor/vimm.py | 69
-rw-r--r--  yt_dlp/extractor/vine.py | 3
-rw-r--r--  yt_dlp/extractor/viu.py | 6
-rw-r--r--  yt_dlp/extractor/vk.py | 95
-rw-r--r--  yt_dlp/extractor/vlive.py | 32
-rw-r--r--  yt_dlp/extractor/xvideos.py | 23
-rw-r--r--  yt_dlp/extractor/yahoo.py | 42
-rw-r--r--  yt_dlp/extractor/yandexvideo.py | 40
-rw-r--r--  yt_dlp/extractor/younow.py | 3
-rw-r--r--  yt_dlp/extractor/youtube.py | 996
-rw-r--r--  yt_dlp/extractor/zdf.py | 37
-rw-r--r--  yt_dlp/extractor/zee5.py | 3
-rw-r--r--  yt_dlp/extractor/zhihu.py | 4
-rw-r--r--  yt_dlp/options.py | 304
-rw-r--r--  yt_dlp/postprocessor/__init__.py | 2
-rw-r--r--  yt_dlp/postprocessor/embedthumbnail.py | 4
-rw-r--r--  yt_dlp/postprocessor/exec.py | 12
-rw-r--r--  yt_dlp/postprocessor/ffmpeg.py | 138
-rw-r--r--  yt_dlp/postprocessor/metadataparser.py | 4
-rw-r--r--  yt_dlp/postprocessor/modify_chapters.py | 3
-rw-r--r--  yt_dlp/version.py | 4
156 files changed, 8344 insertions, 2463 deletions
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
index 35a0764a2..fd93e7df3 100644
--- a/CONTRIBUTORS
+++ b/CONTRIBUTORS
@@ -2,6 +2,7 @@ pukkandan (owner)
shirt-dev (collaborator)
coletdjnz/colethedj (collaborator)
Ashish0804 (collaborator)
+nao20010128nao/Lesmiscore (collaborator)
h-h-h-h
pauldubois98
nixxo
@@ -19,7 +20,6 @@ samiksome
alxnull
FelixFrog
Zocker1999NET
-nao20010128nao
kurumigi
bbepis
animelover1984/horahoradev
@@ -178,3 +178,19 @@ jaller94
r5d
julien-hadleyjack
git-anony-mouse
+mdawar
+trassshhub
+foghawk
+k3ns1n
+teridon
+mozlima
+timendum
+ischmidt20
+CreaValix
+sian1468
+arkamar
+hyano
+KiberInfinity
+tejing1
+Bricio
+lazypete365
diff --git a/Changelog.md b/Changelog.md
index f46c22a32..0a76f65be 100644
--- a/Changelog.md
+++ b/Changelog.md
@@ -11,6 +11,193 @@
-->
+### 2022.02.04
+
+* [youtube:search] Fix extractor by [coletdjnz](https://github.com/coletdjnz)
+* [youtube:search] Add tests
+* [twitcasting] Enforce UTF-8 for POST payload by [Lesmiscore](https://github.com/Lesmiscore)
+* [mediaset] Fix extractor by [nixxo](https://github.com/nixxo)
+* [websocket] Make syntax error in `websockets` module non-fatal
+
+### 2022.02.03
+
+* Merge youtube-dl: Upto [commit/78ce962](https://github.com/ytdl-org/youtube-dl/commit/78ce962f4fe020994c216dd2671546fbe58a5c67)
+* Add option `--print-to-file`
+* Make nested --config-locations relative to parent file
+* Ensure `_type` is present in `info.json`
+* Fix `--compat-options list-formats`
+* Fix/improve `InAdvancePagedList`
+* [downloader/ffmpeg] Handle unknown formats better
+* [outtmpl] Handle `-o ""` better
+* [outtmpl] Handle hard-coded file extension better
+* [extractor] Add convenience function `_yes_playlist`
+* [extractor] Allow non-fatal `title` extraction
+* [extractor] Extract video inside `Article` json_ld
+* [generic] Allow further processing of json_ld URL
+* [cookies] Fix keyring selection for unsupported desktops
+* [utils] Strip double spaces in `clean_html` by [dirkf](https://github.com/dirkf)
+* [aes] Add `unpad_pkcs7` (a sketch follows this changelog diff)
+* [test] Fix `test_youtube_playlist_noplaylist`
+* [docs,cleanup] Misc cleanup
+* [dplay] Add extractors for site changes by [Sipherdrakon](https://github.com/Sipherdrakon)
+* [ertgr] Add extractors by [zmousm](https://github.com/zmousm), [dirkf](https://github.com/dirkf)
+* [Musicdex] Add extractors by [Ashish0804](https://github.com/Ashish0804)
+* [YandexVideoPreview] Add extractor by [KiberInfinity](https://github.com/KiberInfinity)
+* [youtube] Add extractor `YoutubeMusicSearchURLIE`
+* [archive.org] Ignore unnecessary files
+* [Bilibili] Add 8k support by [u-spec-png](https://github.com/u-spec-png)
+* [bilibili] Fix extractor, make anthology title non-fatal
+* [CAM4] Add thumbnail extraction by [alerikaisattera](https://github.com/alerikaisattera)
+* [cctv] De-prioritize sample format
+* [crunchyroll:beta] Add cookies support by [tejing1](https://github.com/tejing1)
+* [crunchyroll] Fix login by [tejing1](https://github.com/tejing1)
+* [doodstream] Fix extractor
+* [fc2] Fix extraction by [Lesmiscore](https://github.com/Lesmiscore)
+* [FFmpegConcat] Abort on --skip-download and download errors
+* [Fujitv] Extract metadata and support premium by [YuenSzeHong](https://github.com/YuenSzeHong)
+* [globo] Fix extractor by [Bricio](https://github.com/Bricio)
+* [glomex] Simplify embed detection
+* [GoogleSearch] Fix extractor
+* [Instagram] Fix extraction when logged in by [MinePlayersPE](https://github.com/MinePlayersPE)
+* [iq.com] Add VIP support by [MinePlayersPE](https://github.com/MinePlayersPE)
+* [mildom] Fix extractor by [lazypete365](https://github.com/lazypete365)
+* [MySpass] Fix video url processing by [trassshhub](https://github.com/trassshhub)
+* [Odnoklassniki] Improve embedded players extraction by [KiberInfinity](https://github.com/KiberInfinity)
+* [orf:tvthek] Lazy playlist extraction and obey --no-playlist
+* [Pladform] Fix redirection to external player by [KiberInfinity](https://github.com/KiberInfinity)
+* [ThisOldHouse] Improve Premium URL check by [Ashish0804](https://github.com/Ashish0804)
+* [TikTok] Iterate through app versions by [MinePlayersPE](https://github.com/MinePlayersPE)
+* [tumblr] Fix 403 errors and handle vimeo embeds by [foghawk](https://github.com/foghawk)
+* [viki] Fix "Bad request" for manifest by [nyuszika7h](https://github.com/nyuszika7h)
+* [Vimm] add recording extractor by [alerikaisattera](https://github.com/alerikaisattera)
+* [web.archive:youtube] Add `ytarchive:` prefix and misc cleanup
+* [youtube:api] Do not use seek when reading HTTPError response by [coletdjnz](https://github.com/coletdjnz)
+* [youtube] Fix n-sig for player e06dea74
+* [youtube, cleanup] Misc fixes and cleanup
+
+
+### 2022.01.21
+
+* Add option `--concat-playlist` to **concat videos in a playlist**
+* Allow **multiple and nested configuration files**
+* Add more post-processing stages (`after_video`, `playlist`)
+* Allow `--exec` to be run at any post-processing stage (Deprecates `--exec-before-download`)
+* Allow `--print` to be run at any post-processing stage
+* Allow listing formats, thumbnails, subtitles using `--print` by [pukkandan](https://github.com/pukkandan), [Zirro](https://github.com/Zirro)
+* Add fields `video_autonumber`, `modified_date`, `modified_timestamp`, `playlist_count`, `channel_follower_count`
+* Add key `requested_downloads` in the root `info_dict`
+* Write `download_archive` only after all formats are downloaded
+* [FfmpegMetadata] Allow setting metadata of individual streams using `meta<n>_` prefix
+* Add option `--legacy-server-connect` by [xtkoba](https://github.com/xtkoba)
+* Allow escaped `,` in `--extractor-args`
+* Allow unicode characters in `info.json`
+* Check for existing thumbnail/subtitle in final directory
+* Don't treat empty containers as `None` in `sanitize_info`
+* Fix `-s --ignore-no-formats --force-write-archive`
+* Fix live title for multiple formats
+* List playlist thumbnails in `--list-thumbnails`
+* Raise error if subtitle download fails
+* [cookies] Fix bug when keyring is unspecified
+* [ffmpeg] Ignore unknown streams, standardize use of `-map 0`
+* [outtmpl] Alternate form for `D` and fix suffix's case
+* [utils] Add `Sec-Fetch-Mode` to `std_headers`
+* [utils] Fix `format_bytes` output for Bytes by [pukkandan](https://github.com/pukkandan), [mdawar](https://github.com/mdawar)
+* [utils] Handle `ss:xxx` in `parse_duration`
+* [utils] Improve parsing for nested HTML elements by [zmousm](https://github.com/zmousm), [pukkandan](https://github.com/pukkandan)
+* [utils] Use key `None` in `traverse_obj` to return as-is
+* [extractor] Detect more subtitle codecs in MPD manifests by [fstirlitz](https://github.com/fstirlitz)
+* [extractor] Extract chapters from JSON-LD by [iw0nderhow](https://github.com/iw0nderhow), [pukkandan](https://github.com/pukkandan)
+* [extractor] Extract thumbnails from JSON-LD by [nixxo](https://github.com/nixxo)
+* [extractor] Improve `url_result` and related
+* [generic] Improve KVS player extraction by [trassshhub](https://github.com/trassshhub)
+* [build] Reduce dependency on third party workflows
+* [extractor,cleanup] Use `_search_nextjs_data`, `format_field`
+* [cleanup] Minor fixes and cleanup
+* [docs] Improvements
+* [test] Fix TestVerboseOutput
+* [afreecatv] Add livestreams extractor by [wlritchi](https://github.com/wlritchi)
+* [callin] Add extractor by [foghawk](https://github.com/foghawk)
+* [CrowdBunker] Add extractors by [Ashish0804](https://github.com/Ashish0804)
+* [daftsex] Add extractors by [k3ns1n](https://github.com/k3ns1n)
+* [digitalconcerthall] Add extractor by [teridon](https://github.com/teridon)
+* [Drooble] Add extractor by [u-spec-png](https://github.com/u-spec-png)
+* [EuropeanTour] Add extractor by [Ashish0804](https://github.com/Ashish0804)
+* [iq.com] Add extractors by [MinePlayersPE](https://github.com/MinePlayersPE)
+* [KelbyOne] Add extractor by [Ashish0804](https://github.com/Ashish0804)
+* [LnkIE] Add extractor by [Ashish0804](https://github.com/Ashish0804)
+* [MainStreaming] Add extractor by [coletdjnz](https://github.com/coletdjnz)
+* [megatvcom] Add extractors by [zmousm](https://github.com/zmousm)
+* [Newsy] Add extractor by [Ashish0804](https://github.com/Ashish0804)
+* [noodlemagazine] Add extractor by [trassshhub](https://github.com/trassshhub)
+* [PokerGo] Add extractors by [Ashish0804](https://github.com/Ashish0804)
+* [Pornez] Add extractor by [mozlima](https://github.com/mozlima)
+* [PRX] Add Extractors by [coletdjnz](https://github.com/coletdjnz)
+* [RTNews] Add extractor by [Ashish0804](https://github.com/Ashish0804)
+* [Rule34video] Add extractor by [trassshhub](https://github.com/trassshhub)
+* [tvopengr] Add extractors by [zmousm](https://github.com/zmousm)
+* [Vimm] Add extractor by [alerikaisattera](https://github.com/alerikaisattera)
+* [glomex] Add extractors by [zmousm](https://github.com/zmousm)
+* [instagram] Add story/highlight extractor by [u-spec-png](https://github.com/u-spec-png)
+* [openrec] Add movie extractor by [Lesmiscore](https://github.com/Lesmiscore)
+* [rai] Add Raiplaysound extractors by [nixxo](https://github.com/nixxo), [pukkandan](https://github.com/pukkandan)
+* [aparat] Fix extractor
+* [ard] Extract subtitles by [fstirlitz](https://github.com/fstirlitz)
+* [BiliIntl] Add login by [MinePlayersPE](https://github.com/MinePlayersPE)
+* [CeskaTelevize] Use `http` for manifests
+* [CTVNewsIE] Add fallback for video search by [Ashish0804](https://github.com/Ashish0804)
+* [dplay] Migrate DiscoveryPlusItaly to DiscoveryPlus by [timendum](https://github.com/timendum)
+* [dplay] Re-structure DiscoveryPlus extractors
+* [Dropbox] Support password protected files and more formats by [zenerdi0de](https://github.com/zenerdi0de)
+* [facebook] Fix extraction from groups
+* [facebook] Improve title and uploader extraction
+* [facebook] Parse dash manifests
+* [fox] Extract m3u8 from preview by [ischmidt20](https://github.com/ischmidt20)
+* [funk] Support origin URLs
+* [gfycat] Fix `uploader`
+* [gfycat] Support embeds by [coletdjnz](https://github.com/coletdjnz)
+* [hotstar] Add extractor args to ignore tags by [Ashish0804](https://github.com/Ashish0804)
+* [hrfernsehen] Fix ardloader extraction by [CreaValix](https://github.com/CreaValix)
+* [instagram] Fix username extraction for stories and highlights by [nyuszika7h](https://github.com/nyuszika7h)
+* [kakao] Detect geo-restriction
+* [line] Remove `tv.line.me` by [sian1468](https://github.com/sian1468)
+* [mixch] Add `MixchArchiveIE` by [Lesmiscore](https://github.com/Lesmiscore)
+* [mixcloud] Detect restrictions by [llacb47](https://github.com/llacb47)
+* [NBCSports] Fix extraction of platform URLs by [ischmidt20](https://github.com/ischmidt20)
+* [Nexx] Extract more metadata by [MinePlayersPE](https://github.com/MinePlayersPE)
+* [Nexx] Support 3q CDN by [MinePlayersPE](https://github.com/MinePlayersPE)
+* [pbs] de-prioritize AD formats
+* [PornHub,YouTube] Refresh onion addresses by [unit193](https://github.com/unit193)
+* [RedBullTV] Parse subtitles from manifest by [Ashish0804](https://github.com/Ashish0804)
+* [streamcz] Fix extractor by [arkamar](https://github.com/arkamar), [pukkandan](https://github.com/pukkandan)
+* [Ted] Rewrite extractor by [pukkandan](https://github.com/pukkandan), [trassshhub](https://github.com/trassshhub)
+* [Theta] Fix valid URL by [alerikaisattera](https://github.com/alerikaisattera)
+* [ThisOldHouseIE] Add support for premium videos by [Ashish0804](https://github.com/Ashish0804)
+* [TikTok] Fix extraction for sigi-based webpages, add API fallback by [MinePlayersPE](https://github.com/MinePlayersPE)
+* [TikTok] Pass cookies to formats, and misc fixes by [MinePlayersPE](https://github.com/MinePlayersPE)
+* [TikTok] Extract captions, user thumbnail by [MinePlayersPE](https://github.com/MinePlayersPE)
+* [TikTok] Change app version by [MinePlayersPE](https://github.com/MinePlayersPE), [llacb47](https://github.com/llacb47)
+* [TVer] Extract message for unaired live by [Lesmiscore](https://github.com/Lesmiscore)
+* [twitcasting] Refactor extractor by [Lesmiscore](https://github.com/Lesmiscore)
+* [twitter] Fix video in quoted tweets
+* [veoh] Improve extractor by [foghawk](https://github.com/foghawk)
+* [vk] Capture `clip` URLs
+* [vk] Fix VKUserVideosIE by [Ashish0804](https://github.com/Ashish0804)
+* [vk] Improve `_VALID_URL` by [k3ns1n](https://github.com/k3ns1n)
+* [VrtNU] Handle empty title by [pgaig](https://github.com/pgaig)
+* [XVideos] Check HLS formats by [MinePlayersPE](https://github.com/MinePlayersPE)
+* [yahoo:gyao] Improved playlist handling by [hyano](https://github.com/hyano)
+* [youtube:tab] Extract more playlist metadata by [coletdjnz](https://github.com/coletdjnz), [pukkandan](https://github.com/pukkandan)
+* [youtube:tab] Raise error on tab redirect by [krichbanana](https://github.com/krichbanana), [coletdjnz](https://github.com/coletdjnz)
+* [youtube] Update Innertube clients by [coletdjnz](https://github.com/coletdjnz)
+* [youtube] Detect live-stream embeds
+* [youtube] Do not return `upload_date` for playlists
+* [youtube] Extract channel subscriber count by [coletdjnz](https://github.com/coletdjnz)
+* [youtube] Make invalid storyboard URL non-fatal
+* [youtube] Enforce UTC, update innertube clients and tests by [coletdjnz](https://github.com/coletdjnz)
+* [zdf] Add chapter extraction by [iw0nderhow](https://github.com/iw0nderhow)
+* [zee5] Add geo-bypass
+
+
### 2021.12.27
* Avoid recursion error when re-extracting info
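A minimal sketch of what PKCS#7 unpadding does, for the `unpad_pkcs7` entry flagged above; the function body here is illustrative, not the library's own implementation:

    def unpad_pkcs7(data: bytes) -> bytes:
        # In PKCS#7, the last byte encodes how many padding bytes were added,
        # so stripping that many bytes recovers the original plaintext.
        return data[:-data[-1]]

    assert unpad_pkcs7(b'hello\x03\x03\x03') == b'hello'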
diff --git a/supportedsites.md b/supportedsites.md
index 9dc94f27d..7c4b9bee9 100644
--- a/supportedsites.md
+++ b/supportedsites.md
@@ -41,6 +41,7 @@
- **aenetworks:collection**
- **aenetworks:show**
- **afreecatv**: afreecatv.com
+ - **afreecatv:live**: afreecatv.com
- **AirMozilla**
- **AliExpressLive**
- **AlJazeera**
@@ -53,6 +54,7 @@
- **AMCNetworks**
- **AmericasTestKitchen**
- **AmericasTestKitchenSeason**
+ - **AmHistoryChannel**
- **anderetijden**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl
- **AnimalPlanet**
- **AnimeLab**
@@ -162,6 +164,7 @@
- **BuzzFeed**
- **BYUtv**
- **CableAV**
+ - **Callin**
- **CAM4**
- **Camdemy**
- **CamdemyFolder**
@@ -225,6 +228,7 @@
- **ComedyCentralTV**
- **CondeNast**: Condé Nast media group: Allure, Architectural Digest, Ars Technica, Bon Appétit, Brides, Condé Nast, Condé Nast Traveler, Details, Epicurious, GQ, Glamour, Golf Digest, SELF, Teen Vogue, The New Yorker, Vanity Fair, Vogue, W Magazine, WIRED
- **CONtv**
+ - **CookingChannel**
- **Corus**
- **Coub**
- **CozyTV**
@@ -232,6 +236,8 @@
- **Cracked**
- **Crackle**
- **CrooksAndLiars**
+ - **CrowdBunker**
+ - **CrowdBunkerChannel**
- **crunchyroll**
- **crunchyroll:beta**
- **crunchyroll:playlist**
@@ -246,6 +252,7 @@
- **curiositystream:collections**
- **curiositystream:series**
- **CWTV**
+ - **Daftsex**
- **DagelijkseKost**: dagelijksekost.een.be
- **DailyMail**
- **dailymotion**
@@ -263,20 +270,20 @@
- **DeezerPlaylist**
- **defense.gouv.fr**
- **democracynow**
+ - **DestinationAmerica**
- **DHM**: Filmarchiv - Deutsches Historisches Museum
- **Digg**
+ - **DigitalConcertHall**: DigitalConcertHall extractor
- **DigitallySpeaking**
- **Digiteka**
- **Discovery**
- - **DiscoveryGo**
- - **DiscoveryGoPlaylist**
+ - **DiscoveryLife**
- **DiscoveryNetworksDe**
- **DiscoveryPlus**
- **DiscoveryPlusIndia**
- **DiscoveryPlusIndiaShow**
- **DiscoveryPlusItaly**
- **DiscoveryPlusItalyShow**
- - **DiscoveryVR**
- **Disney**
- **DIYNetwork**
- **dlive:stream**
@@ -288,6 +295,7 @@
- **DouyuTV**: 斗鱼
- **DPlay**
- **DRBonanza**
+ - **Drooble**
- **Dropbox**
- **Dropout**
- **DropoutSeason**
@@ -324,12 +332,16 @@
- **Eporner**
- **EroProfile**
- **EroProfile:album**
+ - **ertflix**: ERTFLIX videos
+ - **ertflix:codename**: ERTFLIX videos by codename
+ - **ertwebtv:embed**: ert.gr webtv embedded videos
- **Escapist**
- **ESPN**
- **ESPNArticle**
- **ESPNCricInfo**
- **EsriVideo**
- **Europa**
+ - **EuropeanTour**
- **EUScreen**
- **EWETV**
- **ExpoTV**
@@ -352,6 +364,7 @@
- **FiveTV**
- **Flickr**
- **Folketinget**: Folketinget (ft.dk; Danish parliament)
+ - **FoodNetwork**
- **FootyRoom**
- **Formula1**
- **FOX**
@@ -407,7 +420,10 @@
- **Glide**: Glide mobile video messages (glide.me)
- **Globo**
- **GloboArticle**
+ - **glomex**: Glomex videos
+ - **glomex:embed**: Glomex embedded videos
- **Go**
+ - **GoDiscovery**
- **GodTube**
- **Gofile**
- **Golem**
@@ -429,6 +445,7 @@
- **hetklokhuis**
- **hgtv.com:show**
- **HGTVDe**
+ - **HGTVUsa**
- **HiDive**
- **HistoricFilms**
- **history:player**
@@ -470,13 +487,17 @@
- **IndavideoEmbed**
- **InfoQ**
- **Instagram**
+ - **instagram:story**
- **instagram:tag**: Instagram hashtag search URLs
- **instagram:user**: Instagram user profile
- **InstagramIOS**: IOS instagram:// URL
- **Internazionale**
- **InternetVideoArchive**
+ - **InvestigationDiscovery**
- **IPrima**
- **IPrimaCNN**
+ - **iq.com**: International version of iQiyi
+ - **iq.com:album**
- **iqiyi**: 爱奇艺
- **Ir90Tv**
- **ITTF**
@@ -500,6 +521,7 @@
- **KarriereVideos**
- **Katsomo**
- **KeezMovies**
+ - **KelbyOne**
- **Ketnet**
- **khanacademy**
- **khanacademy:unit**
@@ -545,7 +567,6 @@
- **limelight:channel_list**
- **LineLive**
- **LineLiveChannel**
- - **LineTV**
- **LinkedIn**
- **linkedin:learning**
- **linkedin:learning:course**
@@ -554,6 +575,7 @@
- **LiveJournal**
- **livestream**
- **livestream:original**
+ - **Lnk**
- **LnkGo**
- **loc**: Library of Congress
- **LocalNews8**
@@ -566,6 +588,7 @@
- **mailru**: Видео@Mail.Ru
- **mailru:music**: Музыка@Mail.Ru
- **mailru:music:search**: Музыка@Mail.Ru
+ - **MainStreaming**: MainStreaming Player
- **MallTV**
- **mangomolo:live**
- **mangomolo:video**
@@ -592,6 +615,8 @@
- **MediasiteNamedCatalog**
- **Medici**
- **megaphone.fm**: megaphone.fm embedded players
+ - **megatvcom**: megatv.com videos
+ - **megatvcom:embed**: megatv.com embedded videos
- **Meipai**: 美拍
- **MelonVOD**
- **META**
@@ -615,6 +640,7 @@
- **mirrativ:user**
- **MiTele**: mitele.es
- **mixch**
+ - **mixch:archive**
- **mixcloud**
- **mixcloud:playlist**
- **mixcloud:user**
@@ -647,6 +673,10 @@
- **MTVUutisetArticle**
- **MuenchenTV**: münchen.tv
- **MuseScore**
+ - **MusicdexAlbum**
+ - **MusicdexArtist**
+ - **MusicdexPlaylist**
+ - **MusicdexSong**
- **mva**: Microsoft Virtual Academy videos
- **mva:course**: Microsoft Virtual Academy courses
- **Mwave**
@@ -704,6 +734,7 @@
- **Newgrounds:playlist**
- **Newgrounds:user**
- **Newstube**
+ - **Newsy**
- **NextMedia**: 蘋果日報
- **NextMediaActionNews**: 蘋果日報 - 動新聞
- **NextTV**: 壹電視
@@ -733,6 +764,7 @@
- **NJPWWorld**: 新日本プロレスワールド
- **NobelPrize**
- **NonkTube**
+ - **NoodleMagazine**
- **Noovo**
- **Normalboots**
- **NosVideo**
@@ -785,6 +817,7 @@
- **OpencastPlaylist**
- **openrec**
- **openrec:capture**
+ - **openrec:movie**
- **OraTV**
- **orf:burgenland**: Radio Burgenland
- **orf:fm4**: radio FM4
@@ -856,6 +889,8 @@
- **podomatic**
- **Pokemon**
- **PokemonWatch**
+ - **PokerGo**
+ - **PokerGoCollection**
- **PolsatGo**
- **PolskieRadio**
- **polskieradio:kierowcow**
@@ -867,6 +902,7 @@
- **PopcornTV**
- **PornCom**
- **PornerBros**
+ - **Pornez**
- **PornFlip**
- **PornHd**
- **PornHub**: PornHub and Thumbzilla
@@ -881,6 +917,11 @@
- **PressTV**
- **ProjectVeritas**
- **prosiebensat1**: ProSiebenSat.1 Digital
+ - **PRXAccount**
+ - **PRXSeries**
+ - **prxseries:search**: PRX Series Search; "prxseries:" prefix
+ - **prxstories:search**: PRX Stories Search; "prxstories:" prefix
+ - **PRXStory**
- **puhutv**
- **puhutv:serie**
- **Puls4**
@@ -914,8 +955,9 @@
- **RaiPlay**
- **RaiPlayLive**
- **RaiPlayPlaylist**
- - **RaiPlayRadio**
- - **RaiPlayRadioPlaylist**
+ - **RaiPlaySound**
+ - **RaiPlaySoundLive**
+ - **RaiPlaySoundPlaylist**
- **RayWenderlich**
- **RayWenderlichCourse**
- **RBMARadio**
@@ -950,12 +992,15 @@
- **Roxwel**
- **Rozhlas**
- **RTBF**
+ - **RTDocumentry**
+ - **RTDocumentryPlaylist**
- **rte**: Raidió Teilifís Éireann TV
- **rte:radio**: Raidió Teilifís Éireann radio
- **rtl.nl**: rtl.nl and rtlxl.nl
- **rtl2**
- **rtl2:you**
- **rtl2:you:series**
+ - **RTNews**
- **RTP**
- **RTRFM**
- **RTS**: RTS.ch
@@ -967,8 +1012,10 @@
- **RTVNH**
- **RTVS**
- **RUHD**
+ - **Rule34Video**
- **RumbleChannel**
- **RumbleEmbed**
+ - **Ruptly**
- **rutube**: Rutube videos
- **rutube:channel**: Rutube channel
- **rutube:embed**: Rutube embedded videos
@@ -1109,7 +1156,10 @@
- **TeamTreeHouse**
- **TechTalks**
- **techtv.mit.edu**
- - **ted**
+ - **TedEmbed**
+ - **TedPlaylist**
+ - **TedSeries**
+ - **TedTalk**
- **Tele13**
- **Tele5**
- **TeleBruxelles**
@@ -1148,6 +1198,7 @@
- **tiktok:tag**
- **tiktok:user**
- **tinypic**: tinypic.com videos
+ - **TLC**
- **TMZ**
- **TNAFlix**
- **TNAFlixNetworkEmbed**
@@ -1160,6 +1211,7 @@
- **Toypics**: Toypics video
- **ToypicsUser**: Toypics user profile
- **TrailerAddict** (Currently broken)
+ - **TravelChannel**
- **Trilulilu**
- **Trovo**
- **TrovoChannelClip**: All Clips of a trovo.live channel; "trovoclip:" prefix
@@ -1207,6 +1259,8 @@
- **TVNowNew**
- **TVNowSeason**
- **TVNowShow**
+ - **tvopengr:embed**: tvopen.gr embedded videos
+ - **tvopengr:watch**: tvopen.gr (and ethnos.gr) videos
- **tvp**: Telewizja Polska
- **tvp:embed**: Telewizja Polska
- **tvp:series**
@@ -1270,7 +1324,7 @@
- **Viddler**
- **Videa**
- **video.arnes.si**: Arnes Video
- - **video.google:search**: Google Video search; "gvsearch:" prefix (Currently broken)
+ - **video.google:search**: Google Video search; "gvsearch:" prefix
- **video.sky.it**
- **video.sky.it:live**
- **VideoDetective**
@@ -1299,6 +1353,8 @@
- **vimeo:review**: Review pages on vimeo
- **vimeo:user**
- **vimeo:watchlater**: Vimeo watch later list, "vimeowatchlater" keyword (requires authentication)
+ - **Vimm:recording**
+ - **Vimm:stream**
- **Vimple**: Vimple - one-click video hosting
- **Vine**
- **vine:user**
@@ -1351,7 +1407,7 @@
- **wdr:mobile**
- **WDRElefant**
- **WDRPage**
- - **web.archive:youtube**: web.archive.org saved youtube videos
+ - **web.archive:youtube**: web.archive.org saved youtube videos, "ytarchive:" prefix
- **Webcaster**
- **WebcasterFeed**
- **WebOfStories**
@@ -1402,6 +1458,7 @@
- **yandexmusic:playlist**: Яндекс.Музыка - Плейлист
- **yandexmusic:track**: Яндекс.Музыка - Трек
- **YandexVideo**
+ - **YandexVideoPreview**
- **YapFiles**
- **YesJapan**
- **yinyuetai:video**: 音悦Tai
@@ -1418,6 +1475,7 @@
- **youtube**: YouTube
- **youtube:favorites**: YouTube liked videos; ":ytfav" keyword (requires cookies)
- **youtube:history**: Youtube watch history; ":ythis" keyword (requires cookies)
+ - **youtube:music:search_url**: YouTube music search URLs with selectable sections (Eg: #songs)
- **youtube:playlist**: YouTube playlists
- **youtube:recommended**: YouTube recommended videos; ":ytrec" keyword
- **youtube:search**: YouTube search; "ytsearch:" prefix
@@ -1425,9 +1483,10 @@
- **youtube:search_url**: YouTube search URLs with sorting and filter support
- **youtube:subscriptions**: YouTube subscriptions feed; ":ytsubs" keyword (requires cookies)
- **youtube:tab**: YouTube Tabs
+ - **youtube:user**: YouTube user videos; "ytuser:" prefix
- **youtube:watchlater**: Youtube watch later list; ":ytwatchlater" keyword (requires cookies)
+ - **YoutubeLivestreamEmbed**: YouTube livestream embeds
- **YoutubeYtBe**: youtu.be
- - **YoutubeYtUser**: YouTube user videos; "ytuser:" prefix
- **Zapiks**
- **Zattoo**
- **ZattooLive**
diff --git a/test/helper.py b/test/helper.py
index b63a5c897..1070e0668 100644
--- a/test/helper.py
+++ b/test/helper.py
@@ -211,7 +211,7 @@ def sanitize_got_info_dict(got_dict):
# Auto-generated
'autonumber', 'playlist', 'format_index', 'video_ext', 'audio_ext', 'duration_string', 'epoch',
- 'fulltitle', 'extractor', 'extractor_key', 'filepath', 'infojson_filename', 'original_url',
+ 'fulltitle', 'extractor', 'extractor_key', 'filepath', 'infojson_filename', 'original_url', 'n_entries',
# Only live_status needs to be checked
'is_live', 'was_live',
@@ -220,10 +220,12 @@ def sanitize_got_info_dict(got_dict):
IGNORED_PREFIXES = ('', 'playlist', 'requested', 'webpage')
def sanitize(key, value):
- if isinstance(value, str) and len(value) > 100:
+ if isinstance(value, str) and len(value) > 100 and key != 'thumbnail':
return f'md5:{md5(value)}'
elif isinstance(value, list) and len(value) > 10:
return f'count:{len(value)}'
+ elif key.endswith('_count') and isinstance(value, int):
+ return int
return value
test_info_dict = {
@@ -233,7 +235,7 @@ def sanitize_got_info_dict(got_dict):
}
# display_id may be generated from id
- if test_info_dict.get('display_id') == test_info_dict['id']:
+ if test_info_dict.get('display_id') == test_info_dict.get('id'):
test_info_dict.pop('display_id')
return test_info_dict
@@ -259,6 +261,8 @@ def expect_info_dict(self, got_dict, expected_dict):
def _repr(v):
if isinstance(v, compat_str):
return "'%s'" % v.replace('\\', '\\\\').replace("'", "\\'").replace('\n', '\\n')
+ elif isinstance(v, type):
+ return v.__name__
else:
return repr(v)
info_dict_str = ''
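Taken together, the hunks above make the test sanitizer normalize volatile fields before comparison. A self-contained restatement of the new `sanitize()` behaviour (not an import from the helper):

    from hashlib import md5

    def sanitize(key, value):
        # Long strings (except thumbnails) collapse to a digest...
        if isinstance(value, str) and len(value) > 100 and key != 'thumbnail':
            return f'md5:{md5(value.encode()).hexdigest()}'
        # ...long lists to a count...
        elif isinstance(value, list) and len(value) > 10:
            return f'count:{len(value)}'
        # ...and any integer *_count field to the `int` type itself, which
        # the `_repr` change above prints as the bare name 'int',
        # effectively meaning "any int matches".
        elif key.endswith('_count') and isinstance(value, int):
            return int
        return value

    assert sanitize('like_count', 42) is int
    assert sanitize('tags', list(range(11))) == 'count:11'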
diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py
index cf06dbde4..866ded243 100644
--- a/test/test_InfoExtractor.py
+++ b/test/test_InfoExtractor.py
@@ -208,6 +208,91 @@ class TestInfoExtractor(unittest.TestCase):
},
{'expected_type': 'NewsArticle'},
),
+ (
+ r'''<script type="application/ld+json">
+ {"url":"/vrtnu/a-z/het-journaal/2021/het-journaal-het-journaal-19u-20211231/",
+ "name":"Het journaal 19u",
+ "description":"Het journaal 19u van vrijdag 31 december 2021.",
+ "potentialAction":{"url":"https://vrtnu.page.link/pfVy6ihgCAJKgHqe8","@type":"ShareAction"},
+ "mainEntityOfPage":{"@id":"1640092242445","@type":"WebPage"},
+ "publication":[{
+ "startDate":"2021-12-31T19:00:00.000+01:00",
+ "endDate":"2022-01-30T23:55:00.000+01:00",
+ "publishedBy":{"name":"een","@type":"Organization"},
+ "publishedOn":{"url":"https://www.vrt.be/vrtnu/","name":"VRT NU","@type":"BroadcastService"},
+ "@id":"pbs-pub-3a7ec233-da95-4c1e-9b2b-cf5fdfebcbe8",
+ "@type":"BroadcastEvent"
+ }],
+ "video":{
+ "name":"Het journaal - Aflevering 365 (Seizoen 2021)",
+ "description":"Het journaal 19u van vrijdag 31 december 2021. Bekijk aflevering 365 van seizoen 2021 met VRT NU via de site of app.",
+ "thumbnailUrl":"//images.vrt.be/width1280/2021/12/31/80d5ed00-6a64-11ec-b07d-02b7b76bf47f.jpg",
+ "expires":"2022-01-30T23:55:00.000+01:00",
+ "hasPart":[
+ {"name":"Explosie Turnhout","startOffset":70,"@type":"Clip"},
+ {"name":"Jaarwisseling","startOffset":440,"@type":"Clip"},
+ {"name":"Natuurbranden Colorado","startOffset":1179,"@type":"Clip"},
+ {"name":"Klimaatverandering","startOffset":1263,"@type":"Clip"},
+ {"name":"Zacht weer","startOffset":1367,"@type":"Clip"},
+ {"name":"Financiële balans","startOffset":1383,"@type":"Clip"},
+ {"name":"Club Brugge","startOffset":1484,"@type":"Clip"},
+ {"name":"Mentale gezondheid bij topsporters","startOffset":1575,"@type":"Clip"},
+ {"name":"Olympische Winterspelen","startOffset":1728,"@type":"Clip"},
+ {"name":"Sober oudjaar in Nederland","startOffset":1873,"@type":"Clip"}
+ ],
+ "duration":"PT34M39.23S",
+ "uploadDate":"2021-12-31T19:00:00.000+01:00",
+ "@id":"vid-9457d0c6-b8ac-4aba-b5e1-15aa3a3295b5",
+ "@type":"VideoObject"
+ },
+ "genre":["Nieuws en actua"],
+ "episodeNumber":365,
+ "partOfSeries":{"name":"Het journaal","@id":"222831405527","@type":"TVSeries"},
+ "partOfSeason":{"name":"Seizoen 2021","@id":"961809365527","@type":"TVSeason"},
+ "@context":"https://schema.org","@id":"961685295527","@type":"TVEpisode"}</script>
+ ''',
+ {
+ 'chapters': [
+ {"title": "Explosie Turnhout", "start_time": 70, "end_time": 440},
+ {"title": "Jaarwisseling", "start_time": 440, "end_time": 1179},
+ {"title": "Natuurbranden Colorado", "start_time": 1179, "end_time": 1263},
+ {"title": "Klimaatverandering", "start_time": 1263, "end_time": 1367},
+ {"title": "Zacht weer", "start_time": 1367, "end_time": 1383},
+ {"title": "Financiële balans", "start_time": 1383, "end_time": 1484},
+ {"title": "Club Brugge", "start_time": 1484, "end_time": 1575},
+ {"title": "Mentale gezondheid bij topsporters", "start_time": 1575, "end_time": 1728},
+ {"title": "Olympische Winterspelen", "start_time": 1728, "end_time": 1873},
+ {"title": "Sober oudjaar in Nederland", "start_time": 1873, "end_time": 2079.23}
+ ],
+ 'title': 'Het journaal - Aflevering 365 (Seizoen 2021)'
+ }, {}
+ ),
+ (
+ # test multiple thumbnails in a list
+ r'''
+<script type="application/ld+json">
+{"@context":"https://schema.org",
+"@type":"VideoObject",
+"thumbnailUrl":["https://www.rainews.it/cropgd/640x360/dl/img/2021/12/30/1640886376927_GettyImages.jpg"]}
+</script>''',
+ {
+ 'thumbnails': [{'url': 'https://www.rainews.it/cropgd/640x360/dl/img/2021/12/30/1640886376927_GettyImages.jpg'}],
+ },
+ {},
+ ),
+ (
+ # test single thumbnail
+ r'''
+<script type="application/ld+json">
+{"@context":"https://schema.org",
+"@type":"VideoObject",
+"thumbnailUrl":"https://www.rainews.it/cropgd/640x360/dl/img/2021/12/30/1640886376927_GettyImages.jpg"}
+</script>''',
+ {
+ 'thumbnails': [{'url': 'https://www.rainews.it/cropgd/640x360/dl/img/2021/12/30/1640886376927_GettyImages.jpg'}],
+ },
+ {},
+ )
]
for html, expected_dict, search_json_ld_kwargs in _TESTS:
expect_dict(
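The chapter expectations in the JSON-LD test above follow directly from the `hasPart` clips: each `startOffset` opens a chapter, the next clip's offset closes it, and the final chapter ends at the video duration (PT34M39.23S = 34*60 + 39.23 = 2079.23 s). A sketch of that arithmetic, independent of the extractor code:

    offsets = [70, 440, 1179, 1263, 1367, 1383, 1484, 1575, 1728, 1873]
    duration = 34 * 60 + 39.23  # PT34M39.23S -> 2079.23
    chapters = [{'start_time': s, 'end_time': e}
                for s, e in zip(offsets, offsets[1:] + [duration])]
    assert chapters[0] == {'start_time': 70, 'end_time': 440}
    assert chapters[-1] == {'start_time': 1873, 'end_time': 2079.23}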
diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py
index 61923513e..34ed814b4 100644
--- a/test/test_YoutubeDL.py
+++ b/test/test_YoutubeDL.py
@@ -30,6 +30,7 @@ class YDL(FakeYDL):
self.msgs = []
def process_info(self, info_dict):
+ info_dict = info_dict.copy()
info_dict.pop('__original_infodict', None)
self.downloaded_info_dicts.append(info_dict)
@@ -645,6 +646,7 @@ class TestYoutubeDL(unittest.TestCase):
'ext': 'mp4',
'width': None,
'height': 1080,
+ 'filesize': 1024,
'title1': '$PATH',
'title2': '%PATH%',
'title3': 'foo/bar\\test',
@@ -778,8 +780,9 @@ class TestYoutubeDL(unittest.TestCase):
test('%(title5)#U', 'a\u0301e\u0301i\u0301 𝐀')
test('%(title5)+U', 'áéí A')
test('%(title5)+#U', 'a\u0301e\u0301i\u0301 A')
- test('%(height)D', '1K')
- test('%(height)5.2D', ' 1.08K')
+ test('%(height)D', '1k')
+ test('%(filesize)#D', '1Ki')
+ test('%(height)5.2D', ' 1.08k')
test('%(title4)#S', 'foo_bar_test')
test('%(title4).10S', ('foo \'bar\' ', 'foo \'bar\'' + ('#' if compat_os_name == 'nt' else ' ')))
if compat_os_name == 'nt':
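The changed expectations above reflect the reworked `D` conversion: plain `D` now renders an SI suffix in lowercase (factor 1000, '1k'), while the alternate form `#D` switches to factor 1024 with binary suffixes ('1Ki'). A rough sketch of the behaviour, assuming the `format_decimal_suffix` helper in `yt_dlp.utils` keeps the signature used by the YoutubeDL.py hunk below:

    from yt_dlp.utils import format_decimal_suffix

    format_decimal_suffix(1080)                         # '1k'     (factor 1000)
    format_decimal_suffix(1024, '%.2f%s', factor=1024)  # '1.00Ki' (binary)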
@@ -906,7 +909,7 @@ class TestYoutubeDL(unittest.TestCase):
def _match_entry(self, info_dict, incomplete=False):
res = super(FilterYDL, self)._match_entry(info_dict, incomplete)
if res is None:
- self.downloaded_info_dicts.append(info_dict)
+ self.downloaded_info_dicts.append(info_dict.copy())
return res
first = {
@@ -1151,6 +1154,7 @@ class TestYoutubeDL(unittest.TestCase):
self.assertTrue(entries[1] is None)
self.assertEqual(len(ydl.downloaded_info_dicts), 1)
downloaded = ydl.downloaded_info_dicts[0]
+ entries[2].pop('requested_downloads', None)
self.assertEqual(entries[2], downloaded)
self.assertEqual(downloaded['url'], TEST_URL)
self.assertEqual(downloaded['title'], 'Video Transparent 2')
diff --git a/test/test_download.py b/test/test_download.py
index d7c469f3d..818a670fb 100755
--- a/test/test_download.py
+++ b/test/test_download.py
@@ -53,7 +53,7 @@ class YoutubeDL(yt_dlp.YoutubeDL):
raise ExtractorError(message)
def process_info(self, info_dict):
- self.processed_info_dicts.append(info_dict)
+ self.processed_info_dicts.append(info_dict.copy())
return super(YoutubeDL, self).process_info(info_dict)
diff --git a/test/test_options.py b/test/test_options.py
deleted file mode 100644
index 42d9183a9..000000000
--- a/test/test_options.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# coding: utf-8
-
-from __future__ import unicode_literals
-
-# Allow direct execution
-import os
-import sys
-import unittest
-sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-
-from yt_dlp.options import _hide_login_info
-
-
-class TestOptions(unittest.TestCase):
- def test_hide_login_info(self):
- self.assertEqual(_hide_login_info(['-u', 'foo', '-p', 'bar']),
- ['-u', 'PRIVATE', '-p', 'PRIVATE'])
- self.assertEqual(_hide_login_info(['-u']), ['-u'])
- self.assertEqual(_hide_login_info(['-u', 'foo', '-u', 'bar']),
- ['-u', 'PRIVATE', '-u', 'PRIVATE'])
- self.assertEqual(_hide_login_info(['--username=foo']),
- ['--username=PRIVATE'])
-
-
-if __name__ == '__main__':
- unittest.main()
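These assertions are not lost: `_hide_login_info` now lives on `utils.Config`, and an equivalent `test_hide_login_info` is added to test_utils.py below. A quick usage sketch of the relocated helper:

    from yt_dlp.utils import Config

    Config.hide_login_info(['-u', 'foo', '-p', 'bar'])
    # -> ['-u', 'PRIVATE', '-p', 'PRIVATE']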
diff --git a/test/test_subtitles.py b/test/test_subtitles.py
index 9b39dbd39..95e33e54a 100644
--- a/test/test_subtitles.py
+++ b/test/test_subtitles.py
@@ -13,7 +13,7 @@ from test.helper import FakeYDL, md5, is_download_test
from yt_dlp.extractor import (
YoutubeIE,
DailymotionIE,
- TEDIE,
+ TedTalkIE,
VimeoIE,
WallaIE,
CeskaTelevizeIE,
@@ -141,7 +141,7 @@ class TestDailymotionSubtitles(BaseTestSubtitles):
@is_download_test
class TestTedSubtitles(BaseTestSubtitles):
url = 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html'
- IE = TEDIE
+ IE = TedTalkIE
def test_allsubtitles(self):
self.DL.params['writesubtitles'] = True
diff --git a/test/test_utils.py b/test/test_utils.py
index 2e33308c7..6be5bb642 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -23,6 +23,7 @@ from yt_dlp.utils import (
caesar,
clean_html,
clean_podcast_url,
+ Config,
date_from_str,
datetime_from_str,
DateRange,
@@ -43,6 +44,12 @@ from yt_dlp.utils import (
get_element_by_attribute,
get_elements_by_class,
get_elements_by_attribute,
+ get_element_html_by_class,
+ get_element_html_by_attribute,
+ get_elements_html_by_class,
+ get_elements_html_by_attribute,
+ get_elements_text_and_html_by_attribute,
+ get_element_text_and_html_by_tag,
InAdvancePagedList,
int_or_none,
intlist_to_bytes,
@@ -117,6 +124,7 @@ from yt_dlp.compat import (
compat_chr,
compat_etree_fromstring,
compat_getenv,
+ compat_HTMLParseError,
compat_os_name,
compat_setenv,
)
@@ -635,6 +643,8 @@ class TestUtil(unittest.TestCase):
self.assertEqual(parse_duration('PT1H0.040S'), 3600.04)
self.assertEqual(parse_duration('PT00H03M30SZ'), 210)
self.assertEqual(parse_duration('P0Y0M0DT0H4M20.880S'), 260.88)
+ self.assertEqual(parse_duration('01:02:03:050'), 3723.05)
+ self.assertEqual(parse_duration('103:050'), 103.05)
def test_fix_xml_ampersands(self):
self.assertEqual(
@@ -1123,7 +1133,7 @@ class TestUtil(unittest.TestCase):
def test_clean_html(self):
self.assertEqual(clean_html('a:\nb'), 'a: b')
- self.assertEqual(clean_html('a:\n "b"'), 'a: "b"')
+ self.assertEqual(clean_html('a:\n "b"'), 'a: "b"')
self.assertEqual(clean_html('a<br>\xa0b'), 'a\nb')
def test_intlist_to_bytes(self):
@@ -1574,46 +1584,116 @@ Line 1
self.assertEqual(urshift(3, 1), 1)
self.assertEqual(urshift(-3, 1), 2147483646)
+ GET_ELEMENT_BY_CLASS_TEST_STRING = '''
+ <span class="foo bar">nice</span>
+ '''
+
def test_get_element_by_class(self):
- html = '''
- <span class="foo bar">nice</span>
- '''
+ html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
self.assertEqual(get_element_by_class('foo', html), 'nice')
self.assertEqual(get_element_by_class('no-such-class', html), None)
+ def test_get_element_html_by_class(self):
+ html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
+
+ self.assertEqual(get_element_html_by_class('foo', html), html.strip())
+ self.assertEqual(get_element_by_class('no-such-class', html), None)
+
+ GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING = '''
+ <div itemprop="author" itemscope>foo</div>
+ '''
+
def test_get_element_by_attribute(self):
- html = '''
- <span class="foo bar">nice</span>
- '''
+ html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
self.assertEqual(get_element_by_attribute('class', 'foo bar', html), 'nice')
self.assertEqual(get_element_by_attribute('class', 'foo', html), None)
self.assertEqual(get_element_by_attribute('class', 'no-such-foo', html), None)
- html = '''
- <div itemprop="author" itemscope>foo</div>
- '''
+ html = self.GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING
self.assertEqual(get_element_by_attribute('itemprop', 'author', html), 'foo')
+ def test_get_element_html_by_attribute(self):
+ html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
+
+ self.assertEqual(get_element_html_by_attribute('class', 'foo bar', html), html.strip())
+ self.assertEqual(get_element_html_by_attribute('class', 'foo', html), None)
+ self.assertEqual(get_element_html_by_attribute('class', 'no-such-foo', html), None)
+
+ html = self.GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING
+
+ self.assertEqual(get_element_html_by_attribute('itemprop', 'author', html), html.strip())
+
+ GET_ELEMENTS_BY_CLASS_TEST_STRING = '''
+ <span class="foo bar">nice</span><span class="foo bar">also nice</span>
+ '''
+ GET_ELEMENTS_BY_CLASS_RES = ['<span class="foo bar">nice</span>', '<span class="foo bar">also nice</span>']
+
def test_get_elements_by_class(self):
- html = '''
- <span class="foo bar">nice</span><span class="foo bar">also nice</span>
- '''
+ html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
self.assertEqual(get_elements_by_class('foo', html), ['nice', 'also nice'])
self.assertEqual(get_elements_by_class('no-such-class', html), [])
+ def test_get_elements_html_by_class(self):
+ html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
+
+ self.assertEqual(get_elements_html_by_class('foo', html), self.GET_ELEMENTS_BY_CLASS_RES)
+ self.assertEqual(get_elements_html_by_class('no-such-class', html), [])
+
def test_get_elements_by_attribute(self):
- html = '''
- <span class="foo bar">nice</span><span class="foo bar">also nice</span>
- '''
+ html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
self.assertEqual(get_elements_by_attribute('class', 'foo bar', html), ['nice', 'also nice'])
self.assertEqual(get_elements_by_attribute('class', 'foo', html), [])
self.assertEqual(get_elements_by_attribute('class', 'no-such-foo', html), [])
+ def test_get_elements_html_by_attribute(self):
+ html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
+
+ self.assertEqual(get_elements_html_by_attribute('class', 'foo bar', html), self.GET_ELEMENTS_BY_CLASS_RES)
+ self.assertEqual(get_elements_html_by_attribute('class', 'foo', html), [])
+ self.assertEqual(get_elements_html_by_attribute('class', 'no-such-foo', html), [])
+
+ def test_get_elements_text_and_html_by_attribute(self):
+ html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
+
+ self.assertEqual(
+ list(get_elements_text_and_html_by_attribute('class', 'foo bar', html)),
+ list(zip(['nice', 'also nice'], self.GET_ELEMENTS_BY_CLASS_RES)))
+ self.assertEqual(list(get_elements_text_and_html_by_attribute('class', 'foo', html)), [])
+ self.assertEqual(list(get_elements_text_and_html_by_attribute('class', 'no-such-foo', html)), [])
+
+ GET_ELEMENT_BY_TAG_TEST_STRING = '''
+ random text lorem ipsum</p>
+ <div>
+ this should be returned
+ <span>this should also be returned</span>
+ <div>
+ this should also be returned
+ </div>
+ closing tag above should not trick, so this should also be returned
+ </div>
+ but this text should not be returned
+ '''
+ GET_ELEMENT_BY_TAG_RES_OUTERDIV_HTML = GET_ELEMENT_BY_TAG_TEST_STRING.strip()[32:276]
+ GET_ELEMENT_BY_TAG_RES_OUTERDIV_TEXT = GET_ELEMENT_BY_TAG_RES_OUTERDIV_HTML[5:-6]
+ GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML = GET_ELEMENT_BY_TAG_TEST_STRING.strip()[78:119]
+ GET_ELEMENT_BY_TAG_RES_INNERSPAN_TEXT = GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML[6:-7]
+
+ def test_get_element_text_and_html_by_tag(self):
+ html = self.GET_ELEMENT_BY_TAG_TEST_STRING
+
+ self.assertEqual(
+ get_element_text_and_html_by_tag('div', html),
+ (self.GET_ELEMENT_BY_TAG_RES_OUTERDIV_TEXT, self.GET_ELEMENT_BY_TAG_RES_OUTERDIV_HTML))
+ self.assertEqual(
+ get_element_text_and_html_by_tag('span', html),
+ (self.GET_ELEMENT_BY_TAG_RES_INNERSPAN_TEXT, self.GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML))
+ self.assertRaises(compat_HTMLParseError, get_element_text_and_html_by_tag, 'article', html)
+
def test_iri_to_uri(self):
self.assertEqual(
iri_to_uri('https://www.google.com/search?q=foo&ie=utf-8&oe=utf-8&client=firefox-b'),
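The new element helpers added above differ from the existing ones in what they return: the `*_html_*` variants yield the element's outer HTML rather than its inner text, and `get_element_text_and_html_by_tag` returns both at once. An illustrative sketch, assuming the `yt_dlp.utils` names exercised by these tests:

    from yt_dlp.utils import (
        get_element_by_class,
        get_element_html_by_class,
        get_element_text_and_html_by_tag,
    )

    html = '<span class="foo bar">nice</span>'
    get_element_by_class('foo', html)               # 'nice' (inner text)
    get_element_html_by_class('foo', html)          # '<span class="foo bar">nice</span>'
    get_element_text_and_html_by_tag('span', html)  # ('nice', '<span class="foo bar">nice</span>')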
@@ -1701,6 +1781,15 @@ Line 1
self.assertEqual(format_bytes(1024**7), '1.00ZiB')
self.assertEqual(format_bytes(1024**8), '1.00YiB')
+ def test_hide_login_info(self):
+ self.assertEqual(Config.hide_login_info(['-u', 'foo', '-p', 'bar']),
+ ['-u', 'PRIVATE', '-p', 'PRIVATE'])
+ self.assertEqual(Config.hide_login_info(['-u']), ['-u'])
+ self.assertEqual(Config.hide_login_info(['-u', 'foo', '-u', 'bar']),
+ ['-u', 'PRIVATE', '-u', 'PRIVATE'])
+ self.assertEqual(Config.hide_login_info(['--username=foo']),
+ ['--username=PRIVATE'])
+
if __name__ == '__main__':
unittest.main()
diff --git a/test/test_verbose_output.py b/test/test_verbose_output.py
index 86b039a4a..cc606115f 100644
--- a/test/test_verbose_output.py
+++ b/test/test_verbose_output.py
@@ -19,52 +19,52 @@ class TestVerboseOutput(unittest.TestCase):
[
sys.executable, 'yt_dlp/__main__.py', '-v',
'--username', 'johnsmith@gmail.com',
- '--password', 'secret',
+ '--password', 'my_secret_password',
], cwd=rootDir, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
sout, serr = outp.communicate()
self.assertTrue(b'--username' in serr)
self.assertTrue(b'johnsmith' not in serr)
self.assertTrue(b'--password' in serr)
- self.assertTrue(b'secret' not in serr)
+ self.assertTrue(b'my_secret_password' not in serr)
def test_private_info_shortarg(self):
outp = subprocess.Popen(
[
sys.executable, 'yt_dlp/__main__.py', '-v',
'-u', 'johnsmith@gmail.com',
- '-p', 'secret',
+ '-p', 'my_secret_password',
], cwd=rootDir, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
sout, serr = outp.communicate()
self.assertTrue(b'-u' in serr)
self.assertTrue(b'johnsmith' not in serr)
self.assertTrue(b'-p' in serr)
- self.assertTrue(b'secret' not in serr)
+ self.assertTrue(b'my_secret_password' not in serr)
def test_private_info_eq(self):
outp = subprocess.Popen(
[
sys.executable, 'yt_dlp/__main__.py', '-v',
'--username=johnsmith@gmail.com',
- '--password=secret',
+ '--password=my_secret_password',
], cwd=rootDir, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
sout, serr = outp.communicate()
self.assertTrue(b'--username' in serr)
self.assertTrue(b'johnsmith' not in serr)
self.assertTrue(b'--password' in serr)
- self.assertTrue(b'secret' not in serr)
+ self.assertTrue(b'my_secret_password' not in serr)
def test_private_info_shortarg_eq(self):
outp = subprocess.Popen(
[
sys.executable, 'yt_dlp/__main__.py', '-v',
'-u=johnsmith@gmail.com',
- '-p=secret',
+ '-p=my_secret_password',
], cwd=rootDir, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
sout, serr = outp.communicate()
self.assertTrue(b'-u' in serr)
self.assertTrue(b'johnsmith' not in serr)
self.assertTrue(b'-p' in serr)
- self.assertTrue(b'secret' not in serr)
+ self.assertTrue(b'my_secret_password' not in serr)
if __name__ == '__main__':
diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py
index d9638658d..455192b1f 100644
--- a/test/test_youtube_lists.py
+++ b/test/test_youtube_lists.py
@@ -9,11 +9,9 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from test.helper import FakeYDL, is_download_test
-
from yt_dlp.extractor import (
- YoutubePlaylistIE,
- YoutubeTabIE,
YoutubeIE,
+ YoutubeTabIE,
)
@@ -27,21 +25,10 @@ class TestYoutubeLists(unittest.TestCase):
dl = FakeYDL()
dl.params['noplaylist'] = True
ie = YoutubeTabIE(dl)
- result = ie.extract('https://www.youtube.com/watch?v=FXxLjLQi3Fg&list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re')
+ result = ie.extract('https://www.youtube.com/watch?v=OmJ-4B-mS-Y&list=PLydZ2Hrp_gPRJViZjLFKaBMgCQOYEEkyp&index=2')
self.assertEqual(result['_type'], 'url')
- self.assertEqual(YoutubeIE.extract_id(result['url']), 'FXxLjLQi3Fg')
-
- def test_youtube_course(self):
- print('Skipping: Course URLs no longer exists')
- return
- dl = FakeYDL()
- ie = YoutubePlaylistIE(dl)
- # TODO find a > 100 (paginating?) videos course
- result = ie.extract('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
- entries = list(result['entries'])
- self.assertEqual(YoutubeIE.extract_id(entries[0]['url']), 'j9WZyLZCBzs')
- self.assertEqual(len(entries), 25)
- self.assertEqual(YoutubeIE.extract_id(entries[-1]['url']), 'rYefUsYuEp0')
+ self.assertEqual(result['ie_key'], YoutubeIE.ie_key())
+ self.assertEqual(YoutubeIE.extract_id(result['url']), 'OmJ-4B-mS-Y')
def test_youtube_mix(self):
dl = FakeYDL()
@@ -52,15 +39,6 @@ class TestYoutubeLists(unittest.TestCase):
original_video = entries[0]
self.assertEqual(original_video['id'], 'tyITL_exICo')
- def test_youtube_toptracks(self):
- print('Skipping: The playlist page gives error 500')
- return
- dl = FakeYDL()
- ie = YoutubePlaylistIE(dl)
- result = ie.extract('https://www.youtube.com/playlist?list=MCUS')
- entries = result['entries']
- self.assertEqual(len(entries), 100)
-
def test_youtube_flat_playlist_extraction(self):
dl = FakeYDL()
dl.params['extract_flat'] = True
diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py
index 5f8114a1c..cb07d3e23 100644
--- a/test/test_youtube_signature.py
+++ b/test/test_youtube_signature.py
@@ -86,6 +86,10 @@ _NSIG_TESTS = [
'https://www.youtube.com/s/player/8040e515/player_ias.vflset/en_US/base.js',
'wvOFaY-yjgDuIEg5', 'HkfBFDHmgw4rsw',
),
+ (
+ 'https://www.youtube.com/s/player/e06dea74/player_ias.vflset/en_US/base.js',
+ 'AiuodmaDDYw8d3y4bf', 'ankd8eza2T6Qmw',
+ ),
]
@@ -116,10 +120,17 @@ class TestPlayerInfo(unittest.TestCase):
class TestSignature(unittest.TestCase):
def setUp(self):
TEST_DIR = os.path.dirname(os.path.abspath(__file__))
- self.TESTDATA_DIR = os.path.join(TEST_DIR, 'testdata')
+ self.TESTDATA_DIR = os.path.join(TEST_DIR, 'testdata/sigs')
if not os.path.exists(self.TESTDATA_DIR):
os.mkdir(self.TESTDATA_DIR)
+ def tearDown(self):
+ try:
+ for f in os.listdir(self.TESTDATA_DIR):
+                os.remove(os.path.join(self.TESTDATA_DIR, f))
+ except OSError:
+ pass
+
def t_factory(name, sig_func, url_pattern):
def make_tfunc(url, sig_input, expected_sig):
diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py
index ed1881da5..ac45a5160 100644
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@@ -72,6 +72,7 @@ from .utils import (
GeoRestrictedError,
get_domain,
HEADRequest,
+ InAdvancePagedList,
int_or_none,
iri_to_uri,
ISO3166Utils,
@@ -91,6 +92,7 @@ from .utils import (
PerRequestProxyHandler,
platform_name,
Popen,
+ POSTPROCESS_WHEN,
PostProcessingError,
preferredencoding,
prepend_extension,
@@ -198,7 +200,12 @@ class YoutubeDL(object):
verbose: Print additional info to stdout.
quiet: Do not print messages to stdout.
no_warnings: Do not print out anything for warnings.
- forceprint: A list of templates to force print
+ forceprint: A dict with keys WHEN mapped to a list of templates to
+ print to stdout. The allowed keys are video or any of the
+ items in utils.POSTPROCESS_WHEN.
+ For compatibility, a single list is also accepted
+ print_to_file: A dict with keys WHEN (same as forceprint) mapped to
+ a list of tuples with (template, filename)
forceurl: Force printing final URL. (Deprecated)
forcetitle: Force printing title. (Deprecated)
forceid: Force printing ID. (Deprecated)
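An illustrative params sketch for the dict forms documented above (keys other than 'video' come from `utils.POSTPROCESS_WHEN`; the templates and filename are made up for the example):

    ydl_opts = {
        'forceprint': {'video': ['%(title)s'], 'after_move': ['%(filepath)s']},
        'print_to_file': {'video': [('%(title)s', 'titles.txt')]},
    }
    # Older single-list syntax, still accepted and treated as {'video': [...]}:
    legacy_opts = {'forceprint': ['%(title)s']}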
@@ -319,6 +326,8 @@ class YoutubeDL(object):
cookiesfrombrowser: A tuple containing the name of the browser, the profile
                       name/path from where cookies are loaded, and the name of the
keyring. Eg: ('chrome', ) or ('vivaldi', 'default', 'BASICTEXT')
+ legacyserverconnect: Explicitly allow HTTPS connection to servers that do not
+ support RFC 5746 secure renegotiation
nocheckcertificate: Do not verify SSL certificates
prefer_insecure: Use HTTP instead of HTTPS to retrieve information.
At the moment, this is only supported by YouTube.
@@ -342,8 +351,8 @@ class YoutubeDL(object):
postprocessors: A list of dictionaries, each with an entry
* key: The name of the postprocessor. See
yt_dlp/postprocessor/__init__.py for a list.
- * when: When to run the postprocessor. Can be one of
- pre_process|before_dl|post_process|after_move.
+ * when: When to run the postprocessor. Allowed values are
+ the entries of utils.POSTPROCESS_WHEN
Assumed to be 'post_process' if not given
post_hooks: Deprecated - Register a custom postprocessor instead
A list of functions that get called as the final step
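A sketch of postprocessor entries using the `when` key described above; the stage names come from `utils.POSTPROCESS_WHEN`, and the chosen postprocessors are only examples:

    ydl_opts = {
        'postprocessors': [
            {'key': 'FFmpegMetadata'},                       # defaults to 'post_process'
            {'key': 'SponsorBlock', 'when': 'pre_process'},  # runs before download
        ],
    }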
@@ -474,6 +483,7 @@ class YoutubeDL(object):
extractor_args: A dictionary of arguments to be passed to the extractors.
See "EXTRACTOR ARGUMENTS" for details.
Eg: {'youtube': {'skip': ['dash', 'hls']}}
+ mark_watched: Mark videos watched (even with --simulate). Only for YouTube
youtube_include_dash_manifest: Deprecated - Use extractor_args instead.
If True (default), DASH manifests and related
data will be downloaded and processed by extractor.
@@ -504,7 +514,7 @@ class YoutubeDL(object):
params = None
_ies = {}
- _pps = {'pre_process': [], 'before_dl': [], 'after_move': [], 'post_process': []}
+ _pps = {k: [] for k in POSTPROCESS_WHEN}
_printed_messages = set()
_first_webpage_request = True
_download_retcode = None
@@ -522,7 +532,7 @@ class YoutubeDL(object):
params = {}
self._ies = {}
self._ies_instances = {}
- self._pps = {'pre_process': [], 'before_dl': [], 'after_move': [], 'post_process': []}
+ self._pps = {k: [] for k in POSTPROCESS_WHEN}
self._printed_messages = set()
self._first_webpage_request = True
self._post_hooks = []
@@ -530,6 +540,7 @@ class YoutubeDL(object):
self._postprocessor_hooks = []
self._download_retcode = 0
self._num_downloads = 0
+ self._num_videos = 0
self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
self._err_file = sys.stderr
self.params = params
@@ -584,7 +595,14 @@ class YoutubeDL(object):
else:
self.params['nooverwrites'] = not self.params['overwrites']
- if params.get('bidi_workaround', False):
+ self.params.setdefault('forceprint', {})
+ self.params.setdefault('print_to_file', {})
+
+ # Compatibility with older syntax
+ if not isinstance(params['forceprint'], dict):
+ self.params['forceprint'] = {'video': params['forceprint']}
+
+ if self.params.get('bidi_workaround', False):
try:
import pty
master, slave = pty.openpty()
@@ -612,7 +630,7 @@ class YoutubeDL(object):
if (sys.platform != 'win32'
and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
- and not params.get('restrictfilenames', False)):
+ and not self.params.get('restrictfilenames', False)):
# Unicode filesystem API will throw errors (#1474, #13027)
self.report_warning(
'Assuming --restrict-filenames since file system encoding '
@@ -1035,6 +1053,7 @@ class YoutubeDL(object):
if info_dict.get('duration', None) is not None
else None)
info_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads
+ info_dict['video_autonumber'] = self._num_videos
if info_dict.get('resolution') is None:
info_dict['resolution'] = self.format_resolution(info_dict, default=None)
@@ -1150,7 +1169,7 @@ class YoutubeDL(object):
str_fmt = f'{fmt[:-1]}s'
if fmt[-1] == 'l': # list
delim = '\n' if '#' in flags else ', '
- value, fmt = delim.join(variadic(value, allowed_types=(str, bytes))), str_fmt
+ value, fmt = delim.join(map(str, variadic(value, allowed_types=(str, bytes)))), str_fmt
elif fmt[-1] == 'j': # json
value, fmt = json.dumps(value, default=_dumpjson_default, indent=4 if '#' in flags else None), str_fmt
elif fmt[-1] == 'q': # quoted
@@ -1165,7 +1184,9 @@ class YoutubeDL(object):
'NF%s%s' % ('K' if '+' in flags else '', 'D' if '#' in flags else 'C'),
value), str_fmt
elif fmt[-1] == 'D': # decimal suffix
- value, fmt = format_decimal_suffix(value, f'%{fmt[:-1]}f%s' if fmt[:-1] else '%d%s'), 's'
+ num_fmt, fmt = fmt[:-1].replace('#', ''), 's'
+ value = format_decimal_suffix(value, f'%{num_fmt}f%s' if num_fmt else '%d%s',
+ factor=1024 if '#' in flags else 1000)
elif fmt[-1] == 'S': # filename sanitization
value, fmt = filename_sanitizer(initial_field, value, restricted='#' in flags), str_fmt
elif fmt[-1] == 'c':
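The hunk above changes two output-template conversions: `%(...)l` now stringifies list items before joining, and the `#` flag on `%(...)D` switches from decimal (1000) to binary (1024) multipliers. A rough interactive sketch; exact renderings are version-dependent:

    from yt_dlp import YoutubeDL

    ydl = YoutubeDL()
    info = {'id': 'x', 'title': 't', 'filesize': 123456789, 'tags': ['news', 4]}
    ydl.evaluate_outtmpl('%(filesize)D', info)   # decimal suffix, roughly '123M'
    ydl.evaluate_outtmpl('%(filesize)#D', info)  # '#' now selects factor 1024, roughly '117Mi'
    ydl.evaluate_outtmpl('%(tags)l', info)       # non-string items no longer break the join: 'news, 4'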
@@ -1200,10 +1221,17 @@ class YoutubeDL(object):
try:
outtmpl = self._outtmpl_expandpath(self.outtmpl_dict.get(tmpl_type, self.outtmpl_dict['default']))
filename = self.evaluate_outtmpl(outtmpl, info_dict, True)
+ if not filename:
+ return None
- force_ext = OUTTMPL_TYPES.get(tmpl_type)
- if filename and force_ext is not None:
- filename = replace_extension(filename, force_ext, info_dict.get('ext'))
+ if tmpl_type in ('default', 'temp'):
+ final_ext, ext = self.params.get('final_ext'), info_dict.get('ext')
+ if final_ext and ext and final_ext != ext and filename.endswith(f'.{final_ext}'):
+ filename = replace_extension(filename, ext, final_ext)
+ else:
+ force_ext = OUTTMPL_TYPES[tmpl_type]
+ if force_ext:
+ filename = replace_extension(filename, force_ext, info_dict.get('ext'))
# https://github.com/blackjack4494/youtube-dlc/issues/85
trim_file_name = self.params.get('trim_file_name', False)
@@ -1583,6 +1611,19 @@ class YoutubeDL(object):
def _ensure_dir_exists(self, path):
return make_dir(path, self.report_error)
+ @staticmethod
+ def _playlist_infodict(ie_result, **kwargs):
+ return {
+ **ie_result,
+ 'playlist': ie_result.get('title') or ie_result.get('id'),
+ 'playlist_id': ie_result.get('id'),
+ 'playlist_title': ie_result.get('title'),
+ 'playlist_uploader': ie_result.get('uploader'),
+ 'playlist_uploader_id': ie_result.get('uploader_id'),
+ 'playlist_index': 0,
+ **kwargs,
+ }
+
def __process_playlist(self, ie_result, download):
# We process each entry in the playlist
playlist = ie_result.get('title') or ie_result.get('id')
@@ -1623,23 +1664,27 @@ class YoutubeDL(object):
playlistitems = orderedSet(iter_playlistitems(playlistitems_str))
ie_entries = ie_result['entries']
- msg = (
- 'Downloading %d videos' if not isinstance(ie_entries, list)
- else 'Collected %d videos; downloading %%d of them' % len(ie_entries))
-
if isinstance(ie_entries, list):
+ playlist_count = len(ie_entries)
+ msg = f'Collected {playlist_count} videos; downloading %d of them'
+ ie_result['playlist_count'] = ie_result.get('playlist_count') or playlist_count
+
def get_entry(i):
return ie_entries[i - 1]
else:
+ msg = 'Downloading %d videos'
if not isinstance(ie_entries, (PagedList, LazyList)):
ie_entries = LazyList(ie_entries)
+ elif isinstance(ie_entries, InAdvancePagedList):
+ if ie_entries._pagesize == 1:
+ playlist_count = ie_entries._pagecount
def get_entry(i):
return YoutubeDL.__handle_extraction_exceptions(
lambda self, i: ie_entries[i - 1]
)(self, i)
- entries = []
+ entries, broken = [], False
items = playlistitems if playlistitems is not None else itertools.count(playliststart)
for i in items:
if i == 0:
@@ -1661,6 +1706,7 @@ class YoutubeDL(object):
if entry is not None:
self._match_entry(entry, incomplete=True, silent=True)
except (ExistingVideoReached, RejectedVideoReached):
+ broken = True
break
ie_result['entries'] = entries
@@ -1671,23 +1717,19 @@ class YoutubeDL(object):
if entry is not None]
n_entries = len(entries)
+ if not (ie_result.get('playlist_count') or broken or playlistitems or playlistend):
+ ie_result['playlist_count'] = n_entries
+
if not playlistitems and (playliststart != 1 or playlistend):
playlistitems = list(range(playliststart, playliststart + n_entries))
ie_result['requested_entries'] = playlistitems
_infojson_written = False
- if not self.params.get('simulate') and self.params.get('allow_playlist_files', True):
- ie_copy = {
- 'playlist': playlist,
- 'playlist_id': ie_result.get('id'),
- 'playlist_title': ie_result.get('title'),
- 'playlist_uploader': ie_result.get('uploader'),
- 'playlist_uploader_id': ie_result.get('uploader_id'),
- 'playlist_index': 0,
- 'n_entries': n_entries,
- }
- ie_copy.update(dict(ie_result))
-
+ write_playlist_files = self.params.get('allow_playlist_files', True)
+ if write_playlist_files and self.params.get('list_thumbnails'):
+ self.list_thumbnails(ie_result)
+ if write_playlist_files and not self.params.get('simulate'):
+ ie_copy = self._playlist_infodict(ie_result, n_entries=n_entries)
_infojson_written = self._write_info_json(
'playlist', ie_result, self.prepare_filename(ie_copy, 'pl_infojson'))
if _infojson_written is None:
@@ -1720,6 +1762,7 @@ class YoutubeDL(object):
extra = {
'n_entries': n_entries,
'_last_playlist_index': max(playlistitems) if playlistitems else (playlistend or n_entries),
+ 'playlist_count': ie_result.get('playlist_count'),
'playlist_index': playlist_index,
'playlist_autonumber': i,
'playlist': playlist,
@@ -1752,7 +1795,9 @@ class YoutubeDL(object):
'updated playlist', ie_result,
self.prepare_filename(ie_copy, 'pl_infojson'), overwrite=True) is None:
return
- self.to_screen('[download] Finished downloading playlist: %s' % playlist)
+
+ ie_result = self.run_all_pps('playlist', ie_result)
+ self.to_screen(f'[download] Finished downloading playlist: {playlist}')
return ie_result
@__handle_extraction_exceptions
@@ -2194,10 +2239,7 @@ class YoutubeDL(object):
def _calc_headers(self, info_dict):
res = std_headers.copy()
-
- add_headers = info_dict.get('http_headers')
- if add_headers:
- res.update(add_headers)
+ res.update(info_dict.get('http_headers') or {})
cookies = self._calc_cookies(info_dict)
if cookies:
@@ -2257,12 +2299,20 @@ class YoutubeDL(object):
def process_video_result(self, info_dict, download=True):
assert info_dict.get('_type', 'video') == 'video'
+ self._num_videos += 1
if 'id' not in info_dict:
- raise ExtractorError('Missing "id" field in extractor result')
+ raise ExtractorError('Missing "id" field in extractor result', ie=info_dict['extractor'])
+ elif not info_dict.get('id'):
+ raise ExtractorError('Extractor failed to obtain "id"', ie=info_dict['extractor'])
+
+ info_dict['fulltitle'] = info_dict.get('title')
if 'title' not in info_dict:
raise ExtractorError('Missing "title" field in extractor result',
video_id=info_dict['id'], ie=info_dict['extractor'])
+ elif not info_dict.get('title'):
+ self.report_warning('Extractor failed to obtain "title". Creating a generic title instead')
+ info_dict['title'] = f'{info_dict["extractor"]} video #{info_dict["id"]}'
def report_force_conversion(field, field_not, conversion):
self.report_warning(
@@ -2310,6 +2360,7 @@ class YoutubeDL(object):
for ts_key, date_key in (
('timestamp', 'upload_date'),
('release_timestamp', 'release_date'),
+ ('modified_timestamp', 'modified_date'),
):
if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None:
# Working around out-of-range timestamp values (e.g. negative ones on Windows,
@@ -2372,6 +2423,8 @@ class YoutubeDL(object):
if info_dict.get('is_live'):
get_from_start = bool(self.params.get('live_from_start'))
formats = [f for f in formats if bool(f.get('is_from_start')) == get_from_start]
+ if not get_from_start:
+ info_dict['title'] += ' ' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
if not formats:
self.raise_no_formats(info_dict)
@@ -2533,24 +2586,46 @@ class YoutubeDL(object):
if not self.params.get('ignore_no_formats_error'):
raise ExtractorError('Requested format is not available', expected=True,
video_id=info_dict['id'], ie=info_dict['extractor'])
- else:
- self.report_warning('Requested format is not available')
- # Process what we can, even without any available formats.
- self.process_info(dict(info_dict))
- elif download:
- self.to_screen(
- '[info] %s: Downloading %d format(s): %s' % (
- info_dict['id'], len(formats_to_download),
- ", ".join([f['format_id'] for f in formats_to_download])))
- for fmt in formats_to_download:
- new_info = dict(info_dict)
+ self.report_warning('Requested format is not available')
+ # Process what we can, even without any available formats.
+ formats_to_download = [{}]
+
+ best_format = formats_to_download[-1]
+ if download:
+ if best_format:
+ self.to_screen(
+ f'[info] {info_dict["id"]}: Downloading {len(formats_to_download)} format(s): '
+ + ', '.join([f['format_id'] for f in formats_to_download]))
+ max_downloads_reached = False
+ for i, fmt in enumerate(formats_to_download):
+ formats_to_download[i] = new_info = dict(info_dict)
# Save a reference to the original info_dict so that it can be modified in process_info if needed
- new_info['__original_infodict'] = info_dict
new_info.update(fmt)
- self.process_info(new_info)
+ new_info['__original_infodict'] = info_dict
+ try:
+ self.process_info(new_info)
+ except MaxDownloadsReached:
+ max_downloads_reached = True
+ new_info.pop('__original_infodict')
+ # Remove copied info
+ for key, val in tuple(new_info.items()):
+ if info_dict.get(key) == val:
+ new_info.pop(key)
+ if max_downloads_reached:
+ break
+
+ write_archive = set(f.get('__write_download_archive', False) for f in formats_to_download)
+ assert write_archive.issubset({True, False, 'ignore'})
+ if True in write_archive and False not in write_archive:
+ self.record_download_archive(info_dict)
+
+ info_dict['requested_downloads'] = formats_to_download
+ info_dict = self.run_all_pps('after_video', info_dict)
+ if max_downloads_reached:
+ raise MaxDownloadsReached()
+
# We update the info dict with the selected best quality format (backwards compatibility)
- if formats_to_download:
- info_dict.update(formats_to_download[-1])
+ info_dict.update(best_format)
return info_dict
def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
@@ -2621,6 +2696,33 @@ class YoutubeDL(object):
subs[lang] = f
return subs
+ def _forceprint(self, key, info_dict):
+ if info_dict is None:
+ return
+ info_copy = info_dict.copy()
+ info_copy['formats_table'] = self.render_formats_table(info_dict)
+ info_copy['thumbnails_table'] = self.render_thumbnails_table(info_dict)
+ info_copy['subtitles_table'] = self.render_subtitles_table(info_dict.get('id'), info_dict.get('subtitles'))
+ info_copy['automatic_captions_table'] = self.render_subtitles_table(info_dict.get('id'), info_dict.get('automatic_captions'))
+
+ def format_tmpl(tmpl):
+ mobj = re.match(r'\w+(=?)$', tmpl)
+ if mobj and mobj.group(1):
+ return f'{tmpl[:-1]} = %({tmpl[:-1]})r'
+ elif mobj:
+ return f'%({tmpl})s'
+ return tmpl
+
+ for tmpl in self.params['forceprint'].get(key, []):
+ self.to_stdout(self.evaluate_outtmpl(format_tmpl(tmpl), info_copy))
+
+ for tmpl, file_tmpl in self.params['print_to_file'].get(key, []):
+ filename = self.evaluate_outtmpl(file_tmpl, info_dict)
+ tmpl = format_tmpl(tmpl)
+ self.to_screen(f'[info] Writing {tmpl!r} to: {filename}')
+ with io.open(filename, 'a', encoding='utf-8') as f:
+ f.write(self.evaluate_outtmpl(tmpl, info_copy) + '\n')
+
def __forced_printings(self, info_dict, filename, incomplete):
def print_mandatory(field, actual_field=None):
if actual_field is None:
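`_forceprint` above is the engine behind the now stage-keyed `forceprint` and the new `print_to_file`. A configuration sketch (the filename is illustrative):

    params = {
        'skip_download': True,
        # 'id=' uses the new 'field=' shorthand and prints:  id = 'xxxx'
        'forceprint': {'video': ['title', 'id=']},
        # each entry is a (template, filename) tuple; files are opened for append
        'print_to_file': {'video': [('%(title)s\t%(webpage_url)s', 'seen.tsv')]},
    }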
@@ -2643,15 +2745,11 @@ class YoutubeDL(object):
elif 'url' in info_dict:
info_dict['urls'] = info_dict['url'] + info_dict.get('play_path', '')
- if self.params.get('forceprint') or self.params.get('forcejson'):
+ if (self.params.get('forcejson')
+ or self.params['forceprint'].get('video')
+ or self.params['print_to_file'].get('video')):
self.post_extract(info_dict)
- for tmpl in self.params.get('forceprint', []):
- mobj = re.match(r'\w+(=?)$', tmpl)
- if mobj and mobj.group(1):
- tmpl = f'{tmpl[:-1]} = %({tmpl[:-1]})s'
- elif mobj:
- tmpl = '%({})s'.format(tmpl)
- self.to_stdout(self.evaluate_outtmpl(tmpl, info_dict))
+ self._forceprint('video', info_dict)
print_mandatory('title')
print_mandatory('id')
@@ -2689,7 +2787,9 @@ class YoutubeDL(object):
if not test:
for ph in self._progress_hooks:
fd.add_progress_hook(ph)
- urls = '", "'.join([f['url'] for f in info.get('requested_formats', [])] or [info['url']])
+ urls = '", "'.join(
+ (f['url'].split(',')[0] + ',<data>' if f['url'].startswith('data:') else f['url'])
+ for f in info.get('requested_formats', []) or [info])
self.write_debug('Invoking downloader on "%s"' % urls)
# Note: Ideally info should be a deep-copied so that hooks cannot modify it.
@@ -2699,26 +2799,27 @@ class YoutubeDL(object):
new_info['http_headers'] = self._calc_headers(new_info)
return fd.download(name, new_info, subtitle)
- def process_info(self, info_dict):
- """Process a single resolved IE result."""
-
- assert info_dict.get('_type', 'video') == 'video'
+ def existing_file(self, filepaths, *, default_overwrite=True):
+ existing_files = list(filter(os.path.exists, orderedSet(filepaths)))
+ if existing_files and not self.params.get('overwrites', default_overwrite):
+ return existing_files[0]
- max_downloads = self.params.get('max_downloads')
- if max_downloads is not None:
- if self._num_downloads >= int(max_downloads):
- raise MaxDownloadsReached()
+ for file in existing_files:
+ self.report_file_delete(file)
+ os.remove(file)
+ return None
- if info_dict.get('is_live') and not self.params.get('live_from_start'):
- info_dict['title'] += ' ' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
+ def process_info(self, info_dict):
+ """Process a single resolved IE result. (Modified it in-place)"""
- # TODO: backward compatibility, to be removed
- info_dict['fulltitle'] = info_dict['title']
+ assert info_dict.get('_type', 'video') == 'video'
+ original_infodict = info_dict
if 'format' not in info_dict and 'ext' in info_dict:
info_dict['format'] = info_dict['ext']
if self._match_entry(info_dict) is not None:
+ info_dict['__write_download_archive'] = 'ignore'
return
self.post_extract(info_dict)
@@ -2733,9 +2834,7 @@ class YoutubeDL(object):
self.__forced_printings(info_dict, full_filename, incomplete=('format' not in info_dict))
if self.params.get('simulate'):
- if self.params.get('force_write_download_archive', False):
- self.record_download_archive(info_dict)
- # Do nothing else if in simulate mode
+ info_dict['__write_download_archive'] = self.params.get('force_write_download_archive')
return
if full_filename is None:
@@ -2830,43 +2929,39 @@ class YoutubeDL(object):
for link_type, should_write in write_links.items()):
return
+ def replace_info_dict(new_info):
+ nonlocal info_dict
+ if new_info == info_dict:
+ return
+ info_dict.clear()
+ info_dict.update(new_info)
+
try:
- info_dict, files_to_move = self.pre_process(info_dict, 'before_dl', files_to_move)
+ new_info, files_to_move = self.pre_process(info_dict, 'before_dl', files_to_move)
+ replace_info_dict(new_info)
except PostProcessingError as err:
self.report_error('Preprocessing: %s' % str(err))
return
- must_record_download_archive = False
- if self.params.get('skip_download', False):
+ if self.params.get('skip_download'):
info_dict['filepath'] = temp_filename
info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename)))
info_dict['__files_to_move'] = files_to_move
- info_dict = self.run_pp(MoveFilesAfterDownloadPP(self, False), info_dict)
+ replace_info_dict(self.run_pp(MoveFilesAfterDownloadPP(self, False), info_dict))
+ info_dict['__write_download_archive'] = self.params.get('force_write_download_archive')
else:
# Download
info_dict.setdefault('__postprocessors', [])
try:
- def existing_file(*filepaths):
+ def existing_video_file(*filepaths):
ext = info_dict.get('ext')
- final_ext = self.params.get('final_ext', ext)
- existing_files = []
- for file in orderedSet(filepaths):
- if final_ext != ext:
- converted = replace_extension(file, final_ext, ext)
- if os.path.exists(encodeFilename(converted)):
- existing_files.append(converted)
- if os.path.exists(encodeFilename(file)):
- existing_files.append(file)
-
- if not existing_files or self.params.get('overwrites', False):
- for file in orderedSet(existing_files):
- self.report_file_delete(file)
- os.remove(encodeFilename(file))
- return None
-
- info_dict['ext'] = os.path.splitext(existing_files[0])[1][1:]
- return existing_files[0]
+ converted = lambda file: replace_extension(file, self.params.get('final_ext') or ext, ext)
+ file = self.existing_file(itertools.chain(*zip(map(converted, filepaths), filepaths)),
+ default_overwrite=False)
+ if file:
+ info_dict['ext'] = os.path.splitext(file)[1][1:]
+ return file
success = True
if info_dict.get('requested_formats') is not None:
@@ -2920,7 +3015,7 @@ class YoutubeDL(object):
# Ensure filename always has a correct extension for successful merge
full_filename = correct_ext(full_filename)
temp_filename = correct_ext(temp_filename)
- dl_filename = existing_file(full_filename, temp_filename)
+ dl_filename = existing_video_file(full_filename, temp_filename)
info_dict['__real_download'] = False
downloaded = []
@@ -2983,7 +3078,7 @@ class YoutubeDL(object):
files_to_move[file] = None
else:
# Just a single file
- dl_filename = existing_file(full_filename, temp_filename)
+ dl_filename = existing_video_file(full_filename, temp_filename)
if dl_filename is None or dl_filename == temp_filename:
# dl_filename == temp_filename could mean that the file was partially downloaded with --no-part.
# So we should try to resume the download
@@ -3060,7 +3155,7 @@ class YoutubeDL(object):
fixup()
try:
- info_dict = self.post_process(dl_filename, info_dict, files_to_move)
+ replace_info_dict(self.post_process(dl_filename, info_dict, files_to_move))
except PostProcessingError as err:
self.report_error('Postprocessing: %s' % str(err))
return
@@ -3070,10 +3165,14 @@ class YoutubeDL(object):
except Exception as err:
self.report_error('post hooks: %s' % str(err))
return
- must_record_download_archive = True
+ info_dict['__write_download_archive'] = True
+
+ if self.params.get('force_write_download_archive'):
+ info_dict['__write_download_archive'] = True
+
+ # Make sure the info_dict was modified in-place
+ assert info_dict is original_infodict
- if must_record_download_archive or self.params.get('force_write_download_archive', False):
- self.record_download_archive(info_dict)
max_downloads = self.params.get('max_downloads')
if max_downloads is not None and self._num_downloads >= int(max_downloads):
raise MaxDownloadsReached()
@@ -3139,12 +3238,13 @@ class YoutubeDL(object):
if info_dict is None:
return info_dict
info_dict.setdefault('epoch', int(time.time()))
+ info_dict.setdefault('_type', 'video')
remove_keys = {'__original_infodict'} # Always remove this since this may contain a copy of the entire dict
keep_keys = ['_type'] # Always keep this to facilitate load-info-json
if remove_private_keys:
remove_keys |= {
- 'requested_formats', 'requested_subtitles', 'requested_entries', 'entries',
- 'filepath', 'infojson_filename', 'original_url', 'playlist_autonumber',
+ 'requested_downloads', 'requested_formats', 'requested_subtitles', 'requested_entries',
+ 'entries', 'filepath', 'infojson_filename', 'original_url', 'playlist_autonumber',
}
reject = lambda k, v: k not in keep_keys and (
k.startswith('_') or k in remove_keys or v is None)
@@ -3168,6 +3268,25 @@ class YoutubeDL(object):
''' Alias of sanitize_info for backward compatibility '''
return YoutubeDL.sanitize_info(info_dict, actually_filter)
+ @staticmethod
+ def post_extract(info_dict):
+ def actual_post_extract(info_dict):
+ if info_dict.get('_type') in ('playlist', 'multi_video'):
+ for video_dict in info_dict.get('entries', {}):
+ actual_post_extract(video_dict or {})
+ return
+
+ post_extractor = info_dict.get('__post_extractor') or (lambda: {})
+ extra = post_extractor().items()
+ info_dict.update(extra)
+ info_dict.pop('__post_extractor', None)
+
+ original_infodict = info_dict.get('__original_infodict') or {}
+ original_infodict.update(extra)
+ original_infodict.pop('__post_extractor', None)
+
+ actual_post_extract(info_dict or {})
+
def run_pp(self, pp, infodict):
files_to_delete = []
if '__files_to_move' not in infodict:
@@ -3197,45 +3316,26 @@ class YoutubeDL(object):
del infodict['__files_to_move'][old_filename]
return infodict
- @staticmethod
- def post_extract(info_dict):
- def actual_post_extract(info_dict):
- if info_dict.get('_type') in ('playlist', 'multi_video'):
- for video_dict in info_dict.get('entries', {}):
- actual_post_extract(video_dict or {})
- return
-
- post_extractor = info_dict.get('__post_extractor') or (lambda: {})
- extra = post_extractor().items()
- info_dict.update(extra)
- info_dict.pop('__post_extractor', None)
-
- original_infodict = info_dict.get('__original_infodict') or {}
- original_infodict.update(extra)
- original_infodict.pop('__post_extractor', None)
-
- actual_post_extract(info_dict or {})
+ def run_all_pps(self, key, info, *, additional_pps=None):
+ self._forceprint(key, info)
+ for pp in (additional_pps or []) + self._pps[key]:
+ info = self.run_pp(pp, info)
+ return info
def pre_process(self, ie_info, key='pre_process', files_to_move=None):
info = dict(ie_info)
info['__files_to_move'] = files_to_move or {}
- for pp in self._pps[key]:
- info = self.run_pp(pp, info)
+ info = self.run_all_pps(key, info)
return info, info.pop('__files_to_move', None)
- def post_process(self, filename, ie_info, files_to_move=None):
+ def post_process(self, filename, info, files_to_move=None):
"""Run all the postprocessors on the given file."""
- info = dict(ie_info)
info['filepath'] = filename
info['__files_to_move'] = files_to_move or {}
-
- for pp in ie_info.get('__postprocessors', []) + self._pps['post_process']:
- info = self.run_pp(pp, info)
+ info = self.run_all_pps('post_process', info, additional_pps=info.get('__postprocessors'))
info = self.run_pp(MoveFilesAfterDownloadPP(self), info)
del info['__files_to_move']
- for pp in self._pps['after_move']:
- info = self.run_pp(pp, info)
- return info
+ return self.run_all_pps('after_move', info)
def _make_archive_id(self, info_dict):
video_id = info_dict.get('id')
@@ -3274,6 +3374,7 @@ class YoutubeDL(object):
return
vid_id = self._make_archive_id(info_dict)
assert vid_id
+ self.write_debug(f'Adding to archive: {vid_id}')
with locked_file(fn, 'a', encoding='utf-8') as archive_file:
archive_file.write(vid_id + '\n')
self.archive.add(vid_id)
@@ -3292,6 +3393,11 @@ class YoutubeDL(object):
return '%dx?' % format['width']
return default
+ def _list_format_headers(self, *headers):
+ if self.params.get('listformats_table', True) is not False:
+ return [self._format_screen(header, self.Styles.HEADERS) for header in headers]
+ return headers
+
def _format_note(self, fdict):
res = ''
if fdict.get('ext') in ['f4f', 'f4m']:
@@ -3352,102 +3458,97 @@ class YoutubeDL(object):
res += '~' + format_bytes(fdict['filesize_approx'])
return res
- def _list_format_headers(self, *headers):
- if self.params.get('listformats_table', True) is not False:
- return [self._format_screen(header, self.Styles.HEADERS) for header in headers]
- return headers
-
- def list_formats(self, info_dict):
+ def render_formats_table(self, info_dict):
if not info_dict.get('formats') and not info_dict.get('url'):
- self.to_screen('%s has no formats' % info_dict['id'])
- return
- self.to_screen('[info] Available formats for %s:' % info_dict['id'])
+ return None
formats = info_dict.get('formats', [info_dict])
- new_format = self.params.get('listformats_table', True) is not False
- if new_format:
- delim = self._format_screen('\u2502', self.Styles.DELIM, '|', test_encoding=True)
- table = [
- [
- self._format_screen(format_field(f, 'format_id'), self.Styles.ID),
- format_field(f, 'ext'),
- format_field(f, func=self.format_resolution, ignore=('audio only', 'images')),
- format_field(f, 'fps', '\t%d'),
- format_field(f, 'dynamic_range', '%s', ignore=(None, 'SDR')).replace('HDR', ''),
- delim,
- format_field(f, 'filesize', ' \t%s', func=format_bytes) + format_field(f, 'filesize_approx', '~\t%s', func=format_bytes),
- format_field(f, 'tbr', '\t%dk'),
- shorten_protocol_name(f.get('protocol', '')),
- delim,
- format_field(f, 'vcodec', default='unknown').replace(
- 'none',
- 'images' if f.get('acodec') == 'none'
- else self._format_screen('audio only', self.Styles.SUPPRESS)),
- format_field(f, 'vbr', '\t%dk'),
- format_field(f, 'acodec', default='unknown').replace(
- 'none',
- '' if f.get('vcodec') == 'none'
- else self._format_screen('video only', self.Styles.SUPPRESS)),
- format_field(f, 'abr', '\t%dk'),
- format_field(f, 'asr', '\t%dHz'),
- join_nonempty(
- self._format_screen('UNSUPPORTED', 'light red') if f.get('ext') in ('f4f', 'f4m') else None,
- format_field(f, 'language', '[%s]'),
- join_nonempty(
- format_field(f, 'format_note'),
- format_field(f, 'container', ignore=(None, f.get('ext'))),
- delim=', '),
- delim=' '),
- ] for f in formats if f.get('preference') is None or f['preference'] >= -1000]
- header_line = self._list_format_headers(
- 'ID', 'EXT', 'RESOLUTION', '\tFPS', 'HDR', delim, '\tFILESIZE', '\tTBR', 'PROTO',
- delim, 'VCODEC', '\tVBR', 'ACODEC', '\tABR', '\tASR', 'MORE INFO')
- else:
+        if self.params.get('listformats_table', True) is False:
table = [
[
format_field(f, 'format_id'),
format_field(f, 'ext'),
self.format_resolution(f),
- self._format_note(f)]
- for f in formats
- if f.get('preference') is None or f['preference'] >= -1000]
- header_line = ['format code', 'extension', 'resolution', 'note']
-
- self.to_stdout(render_table(
- header_line, table,
- extra_gap=(0 if new_format else 1),
- hide_empty=new_format,
- delim=new_format and self._format_screen('\u2500', self.Styles.DELIM, '-', test_encoding=True)))
-
- def list_thumbnails(self, info_dict):
- thumbnails = list(info_dict.get('thumbnails'))
+ self._format_note(f)
+ ] for f in formats if f.get('preference') is None or f['preference'] >= -1000]
+ return render_table(['format code', 'extension', 'resolution', 'note'], table, extra_gap=1)
+
+ delim = self._format_screen('\u2502', self.Styles.DELIM, '|', test_encoding=True)
+ table = [
+ [
+ self._format_screen(format_field(f, 'format_id'), self.Styles.ID),
+ format_field(f, 'ext'),
+ format_field(f, func=self.format_resolution, ignore=('audio only', 'images')),
+ format_field(f, 'fps', '\t%d'),
+ format_field(f, 'dynamic_range', '%s', ignore=(None, 'SDR')).replace('HDR', ''),
+ delim,
+ format_field(f, 'filesize', ' \t%s', func=format_bytes) + format_field(f, 'filesize_approx', '~\t%s', func=format_bytes),
+ format_field(f, 'tbr', '\t%dk'),
+ shorten_protocol_name(f.get('protocol', '')),
+ delim,
+ format_field(f, 'vcodec', default='unknown').replace(
+ 'none', 'images' if f.get('acodec') == 'none'
+ else self._format_screen('audio only', self.Styles.SUPPRESS)),
+ format_field(f, 'vbr', '\t%dk'),
+ format_field(f, 'acodec', default='unknown').replace(
+ 'none', '' if f.get('vcodec') == 'none'
+ else self._format_screen('video only', self.Styles.SUPPRESS)),
+ format_field(f, 'abr', '\t%dk'),
+ format_field(f, 'asr', '\t%dHz'),
+ join_nonempty(
+ self._format_screen('UNSUPPORTED', 'light red') if f.get('ext') in ('f4f', 'f4m') else None,
+ format_field(f, 'language', '[%s]'),
+ join_nonempty(format_field(f, 'format_note'),
+ format_field(f, 'container', ignore=(None, f.get('ext'))),
+ delim=', '),
+ delim=' '),
+ ] for f in formats if f.get('preference') is None or f['preference'] >= -1000]
+ header_line = self._list_format_headers(
+ 'ID', 'EXT', 'RESOLUTION', '\tFPS', 'HDR', delim, '\tFILESIZE', '\tTBR', 'PROTO',
+ delim, 'VCODEC', '\tVBR', 'ACODEC', '\tABR', '\tASR', 'MORE INFO')
+
+ return render_table(
+ header_line, table, hide_empty=True,
+ delim=self._format_screen('\u2500', self.Styles.DELIM, '-', test_encoding=True))
+
+ def render_thumbnails_table(self, info_dict):
+ thumbnails = list(info_dict.get('thumbnails') or [])
if not thumbnails:
- self.to_screen('[info] No thumbnails present for %s' % info_dict['id'])
- return
-
- self.to_screen(
- '[info] Thumbnails for %s:' % info_dict['id'])
- self.to_stdout(render_table(
+ return None
+ return render_table(
self._list_format_headers('ID', 'Width', 'Height', 'URL'),
- [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
-
- def list_subtitles(self, video_id, subtitles, name='subtitles'):
- if not subtitles:
- self.to_screen('%s has no %s' % (video_id, name))
- return
- self.to_screen(
- 'Available %s for %s:' % (name, video_id))
+ [[t.get('id'), t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails])
+ def render_subtitles_table(self, video_id, subtitles):
def _row(lang, formats):
exts, names = zip(*((f['ext'], f.get('name') or 'unknown') for f in reversed(formats)))
if len(set(names)) == 1:
names = [] if names[0] == 'unknown' else names[:1]
return [lang, ', '.join(names), ', '.join(exts)]
- self.to_stdout(render_table(
+ if not subtitles:
+ return None
+ return render_table(
self._list_format_headers('Language', 'Name', 'Formats'),
[_row(lang, formats) for lang, formats in subtitles.items()],
- hide_empty=True))
+ hide_empty=True)
+
+ def __list_table(self, video_id, name, func, *args):
+ table = func(*args)
+ if not table:
+ self.to_screen(f'{video_id} has no {name}')
+ return
+ self.to_screen(f'[info] Available {name} for {video_id}:')
+ self.to_stdout(table)
+
+ def list_formats(self, info_dict):
+ self.__list_table(info_dict['id'], 'formats', self.render_formats_table, info_dict)
+
+ def list_thumbnails(self, info_dict):
+ self.__list_table(info_dict['id'], 'thumbnails', self.render_thumbnails_table, info_dict)
+
+ def list_subtitles(self, video_id, subtitles, name='subtitles'):
+ self.__list_table(video_id, name, self.render_subtitles_table, video_id, subtitles)
def urlopen(self, req):
""" Start an HTTP download """
@@ -3696,10 +3797,11 @@ class YoutubeDL(object):
sub_format = sub_info['ext']
sub_filename = subtitles_filename(filename, sub_lang, sub_format, info_dict.get('ext'))
sub_filename_final = subtitles_filename(sub_filename_base, sub_lang, sub_format, info_dict.get('ext'))
- if not self.params.get('overwrites', True) and os.path.exists(sub_filename):
+ existing_sub = self.existing_file((sub_filename_final, sub_filename))
+ if existing_sub:
self.to_screen(f'[info] Video subtitle {sub_lang}.{sub_format} is already present')
- sub_info['filepath'] = sub_filename
- ret.append((sub_filename, sub_filename_final))
+ sub_info['filepath'] = existing_sub
+ ret.append((existing_sub, sub_filename_final))
continue
self.to_screen(f'[info] Writing video subtitles to: {sub_filename}')
@@ -3722,9 +3824,10 @@ class YoutubeDL(object):
self.dl(sub_filename, sub_copy, subtitle=True)
sub_info['filepath'] = sub_filename
ret.append((sub_filename, sub_filename_final))
- except (ExtractorError, IOError, OSError, ValueError) + network_exceptions as err:
+ except (DownloadError, ExtractorError, IOError, OSError, ValueError) + network_exceptions as err:
+ if self.params.get('ignoreerrors') is not True: # False or 'only_download'
+ raise DownloadError(f'Unable to download video subtitles for {sub_lang!r}: {err}', err)
self.report_warning(f'Unable to download video subtitles for {sub_lang!r}: {err}')
- continue
return ret
def _write_thumbnails(self, label, info_dict, filename, thumb_filename_base=None):
@@ -3747,11 +3850,12 @@ class YoutubeDL(object):
thumb_filename = replace_extension(filename, thumb_ext, info_dict.get('ext'))
thumb_filename_final = replace_extension(thumb_filename_base, thumb_ext, info_dict.get('ext'))
- if not self.params.get('overwrites', True) and os.path.exists(thumb_filename):
- ret.append((thumb_filename, thumb_filename_final))
- t['filepath'] = thumb_filename
+ existing_thumb = self.existing_file((thumb_filename_final, thumb_filename))
+ if existing_thumb:
self.to_screen('[info] %s is already present' % (
thumb_display_id if multiple else f'{label} thumbnail').capitalize())
+ t['filepath'] = existing_thumb
+ ret.append((existing_thumb, thumb_filename_final))
else:
self.to_screen(f'[info] Downloading {thumb_display_id} ...')
try:
diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py
index 4fa2e2d8c..7469b0f61 100644
--- a/yt_dlp/__init__.py
+++ b/yt_dlp/__init__.py
@@ -140,6 +140,8 @@ def _real_main(argv=None):
'"-f best" selects the best pre-merged format which is often not the best option',
'To let yt-dlp download and merge the best available formats, simply do not pass any format selection',
'If you know what you are doing and want only the best pre-merged format, use "-f b" instead to suppress this warning')))
+ if opts.exec_cmd.get('before_dl') and opts.exec_before_dl_cmd:
+ parser.error('using "--exec-before-download" conflicts with "--exec before_dl:"')
if opts.usenetrc and (opts.username is not None or opts.password is not None):
parser.error('using .netrc conflicts with giving username/password')
if opts.password is not None and opts.username is None:
@@ -330,6 +332,9 @@ def _real_main(argv=None):
if _video_multistreams_set is False and _audio_multistreams_set is False:
_unused_compat_opt('multistreams')
outtmpl_default = opts.outtmpl.get('default')
+ if outtmpl_default == '':
+ outtmpl_default, opts.skip_download = None, True
+ del opts.outtmpl['default']
if opts.useid:
if outtmpl_default is None:
outtmpl_default = opts.outtmpl['default'] = '%(id)s.%(ext)s'
@@ -348,9 +353,13 @@ def _real_main(argv=None):
for k, tmpl in opts.outtmpl.items():
validate_outtmpl(tmpl, f'{k} output template')
- opts.forceprint = opts.forceprint or []
- for tmpl in opts.forceprint or []:
- validate_outtmpl(tmpl, 'print template')
+ for type_, tmpl_list in opts.forceprint.items():
+ for tmpl in tmpl_list:
+ validate_outtmpl(tmpl, f'{type_} print template')
+ for type_, tmpl_list in opts.print_to_file.items():
+ for tmpl, file in tmpl_list:
+ validate_outtmpl(tmpl, f'{type_} print-to-file template')
+ validate_outtmpl(file, f'{type_} print-to-file filename')
validate_outtmpl(opts.sponsorblock_chapter_title, 'SponsorBlock chapter title')
for k, tmpl in opts.progress_template.items():
k = f'{k[:-6]} console title' if '-title' in k else f'{k} progress'
@@ -392,7 +401,10 @@ def _real_main(argv=None):
opts.parse_metadata.append('title:%s' % opts.metafromtitle)
opts.parse_metadata = list(itertools.chain(*map(metadataparser_actions, opts.parse_metadata)))
- any_getting = opts.forceprint or opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration or opts.dumpjson or opts.dump_single_json
+ any_getting = (any(opts.forceprint.values()) or opts.dumpjson or opts.dump_single_json
+ or opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail
+ or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration)
+
any_printing = opts.print_json
download_archive_fn = expand_path(opts.download_archive) if opts.download_archive is not None else opts.download_archive
@@ -483,13 +495,6 @@ def _real_main(argv=None):
# Run this before the actual video download
'when': 'before_dl'
})
- # Must be after all other before_dl
- if opts.exec_before_dl_cmd:
- postprocessors.append({
- 'key': 'Exec',
- 'exec_cmd': opts.exec_before_dl_cmd,
- 'when': 'before_dl'
- })
if opts.extractaudio:
postprocessors.append({
'key': 'FFmpegExtractAudio',
@@ -590,13 +595,21 @@ def _real_main(argv=None):
# XAttrMetadataPP should be run after post-processors that may change file contents
if opts.xattrs:
postprocessors.append({'key': 'XAttrMetadata'})
- # Exec must be the last PP
- if opts.exec_cmd:
+ if opts.concat_playlist != 'never':
+ postprocessors.append({
+ 'key': 'FFmpegConcat',
+ 'only_multi_video': opts.concat_playlist != 'always',
+ 'when': 'playlist',
+ })
+ # Exec must be the last PP of each category
+ if opts.exec_before_dl_cmd:
+ opts.exec_cmd.setdefault('before_dl', opts.exec_before_dl_cmd)
+ for when, exec_cmd in opts.exec_cmd.items():
postprocessors.append({
'key': 'Exec',
- 'exec_cmd': opts.exec_cmd,
+ 'exec_cmd': exec_cmd,
# Run this only after the files have been moved to their final locations
- 'when': 'after_move'
+ 'when': when,
})
def report_args_compat(arg, name):
@@ -654,6 +667,7 @@ def _real_main(argv=None):
'forcefilename': opts.getfilename,
'forceformat': opts.getformat,
'forceprint': opts.forceprint,
+ 'print_to_file': opts.print_to_file,
'forcejson': opts.dumpjson or opts.print_json,
'dump_single_json': opts.dump_single_json,
'force_write_download_archive': opts.force_write_download_archive,
@@ -747,6 +761,7 @@ def _real_main(argv=None):
'skip_playlist_after_errors': opts.skip_playlist_after_errors,
'cookiefile': opts.cookiefile,
'cookiesfrombrowser': opts.cookiesfrombrowser,
+ 'legacyserverconnect': opts.legacy_server_connect,
'nocheckcertificate': opts.no_check_certificate,
'prefer_insecure': opts.prefer_insecure,
'proxy': opts.proxy,
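The per-stage Exec wiring above means `--exec before_dl:CMD` now becomes just another postprocessor entry. An equivalent `params` sketch (the commands are illustrative):

    params = {
        'postprocessors': [
            # from: --exec 'before_dl:echo fetching %(id)q'
            {'key': 'Exec', 'exec_cmd': 'echo fetching %(id)q', 'when': 'before_dl'},
            # from: --exec 'echo moved to %(filepath)q'  (after_move is the default stage)
            {'key': 'Exec', 'exec_cmd': 'echo moved to %(filepath)q', 'when': 'after_move'},
        ],
    }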
diff --git a/yt_dlp/aes.py b/yt_dlp/aes.py
index 8503e3dfd..b37f0dd39 100644
--- a/yt_dlp/aes.py
+++ b/yt_dlp/aes.py
@@ -2,8 +2,15 @@ from __future__ import unicode_literals
from math import ceil
-from .compat import compat_b64decode, compat_pycrypto_AES
-from .utils import bytes_to_intlist, intlist_to_bytes
+from .compat import (
+ compat_b64decode,
+ compat_ord,
+ compat_pycrypto_AES,
+)
+from .utils import (
+ bytes_to_intlist,
+ intlist_to_bytes,
+)
if compat_pycrypto_AES:
@@ -25,6 +32,10 @@ else:
return intlist_to_bytes(aes_gcm_decrypt_and_verify(*map(bytes_to_intlist, (data, key, tag, nonce))))
+def unpad_pkcs7(data):
+ return data[:-compat_ord(data[-1])]
+
+
BLOCK_SIZE_BYTES = 16
@@ -506,5 +517,6 @@ __all__ = [
'aes_encrypt',
'aes_gcm_decrypt_and_verify',
'aes_gcm_decrypt_and_verify_bytes',
- 'key_expansion'
+ 'key_expansion',
+ 'unpad_pkcs7',
]
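For reference, `unpad_pkcs7` inverts standard PKCS#7 padding, where every padding byte holds the padding length. A stdlib-only round trip using the same slice:

    def pad_pkcs7(data, block_size=16):
        n = block_size - len(data) % block_size
        return data + bytes([n]) * n

    padded = pad_pkcs7(b'hello')             # eleven b'\x0b' bytes appended
    assert padded[:-padded[-1]] == b'hello'  # exactly the slice unpad_pkcs7 performs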
diff --git a/yt_dlp/compat.py b/yt_dlp/compat.py
index 79c8e3494..b97d4512e 100644
--- a/yt_dlp/compat.py
+++ b/yt_dlp/compat.py
@@ -2,6 +2,7 @@
import asyncio
import base64
+import collections
import ctypes
import getpass
import html
@@ -180,14 +181,17 @@ def windows_enable_vt_mode(): # TODO: Do this the proper way https://bugs.pytho
compat_basestring = str
compat_chr = chr
+compat_filter = filter
compat_input = input
compat_integer_types = (int, )
compat_kwargs = lambda kwargs: kwargs
+compat_map = map
compat_numeric_types = (int, float, complex)
compat_str = str
compat_xpath = lambda xpath: xpath
compat_zip = zip
+compat_collections_abc = collections.abc
compat_HTMLParser = html.parser.HTMLParser
compat_HTTPError = urllib.error.HTTPError
compat_Struct = struct.Struct
@@ -245,6 +249,7 @@ __all__ = [
'compat_b64decode',
'compat_basestring',
'compat_chr',
+ 'compat_collections_abc',
'compat_cookiejar',
'compat_cookiejar_Cookie',
'compat_cookies',
@@ -254,6 +259,7 @@ __all__ = [
'compat_etree_fromstring',
'compat_etree_register_namespace',
'compat_expanduser',
+ 'compat_filter',
'compat_get_terminal_size',
'compat_getenv',
'compat_getpass',
@@ -265,6 +271,7 @@ __all__ = [
'compat_integer_types',
'compat_itertools_count',
'compat_kwargs',
+ 'compat_map',
'compat_numeric_types',
'compat_ord',
'compat_os_name',
diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py
index 74e133bc9..fc033a8ae 100644
--- a/yt_dlp/cookies.py
+++ b/yt_dlp/cookies.py
@@ -11,7 +11,11 @@ from datetime import datetime, timedelta, timezone
from enum import Enum, auto
from hashlib import pbkdf2_hmac
-from .aes import aes_cbc_decrypt_bytes, aes_gcm_decrypt_and_verify_bytes
+from .aes import (
+ aes_cbc_decrypt_bytes,
+ aes_gcm_decrypt_and_verify_bytes,
+ unpad_pkcs7,
+)
from .compat import (
compat_b64decode,
compat_cookiejar_Cookie,
@@ -669,8 +673,7 @@ def _get_linux_desktop_environment(env):
return _LinuxDesktopEnvironment.GNOME
elif 'KDE_FULL_SESSION' in env:
return _LinuxDesktopEnvironment.KDE
- else:
- return _LinuxDesktopEnvironment.OTHER
+ return _LinuxDesktopEnvironment.OTHER
def _choose_linux_keyring(logger):
@@ -790,7 +793,7 @@ def _get_linux_keyring_password(browser_keyring_name, keyring, logger):
# Chromium supports a flag: --password-store=<basic|gnome|kwallet> so the automatic detection
# will not be sufficient in all cases.
- keyring = _LinuxKeyring[keyring] or _choose_linux_keyring(logger)
+ keyring = _LinuxKeyring[keyring] if keyring else _choose_linux_keyring(logger)
logger.debug(f'Chosen keyring: {keyring.name}')
if keyring == _LinuxKeyring.KWALLET:
@@ -847,10 +850,9 @@ def pbkdf2_sha1(password, salt, iterations, key_length):
def _decrypt_aes_cbc(ciphertext, key, logger, initialization_vector=b' ' * 16):
- plaintext = aes_cbc_decrypt_bytes(ciphertext, key, initialization_vector)
- padding_length = plaintext[-1]
+ plaintext = unpad_pkcs7(aes_cbc_decrypt_bytes(ciphertext, key, initialization_vector))
try:
- return plaintext[:-padding_length].decode('utf-8')
+ return plaintext.decode('utf-8')
except UnicodeDecodeError:
logger.warning('failed to decrypt cookie (AES-CBC) because UTF-8 decoding failed. Possibly the key is wrong?', only_once=True)
return None
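The keyring one-liner above fixes a real trap: `Enum[...]` raises KeyError on a falsy key rather than returning something falsy, so the old `or` fallback could never run. A self-contained illustration (the enum is a stand-in for the real `_LinuxKeyring`):

    from enum import Enum

    class Keyring(Enum):
        KWALLET = 1
        BASICTEXT = 2

    keyring = None
    # old pattern: Keyring[keyring] or default  -> KeyError before 'or' is evaluated
    chosen = Keyring[keyring] if keyring else Keyring.BASICTEXT
    assert chosen is Keyring.BASICTEXT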
diff --git a/yt_dlp/downloader/external.py b/yt_dlp/downloader/external.py
index 17be3c46f..f4fdcf120 100644
--- a/yt_dlp/downloader/external.py
+++ b/yt_dlp/downloader/external.py
@@ -17,11 +17,13 @@ from ..utils import (
cli_valueless_option,
cli_bool_option,
_configuration_args,
+ determine_ext,
encodeFilename,
encodeArgument,
handle_youtubedl_headers,
check_executable,
Popen,
+ remove_end,
)
@@ -304,7 +306,7 @@ class HttpieFD(ExternalFD):
@classmethod
def available(cls, path=None):
- return ExternalFD.available(cls, path or 'http')
+ return super().available(path or 'http')
def _make_cmd(self, tmpfilename, info_dict):
cmd = ['http', '--download', '--output', tmpfilename, info_dict['url']]
@@ -463,6 +465,15 @@ class FFmpegFD(ExternalFD):
args += ['-f', 'flv']
elif ext == 'mp4' and tmpfilename == '-':
args += ['-f', 'mpegts']
+ elif ext == 'unknown_video':
+ ext = determine_ext(remove_end(tmpfilename, '.part'))
+ if ext == 'unknown_video':
+ self.report_warning(
+ 'The video format is unknown and cannot be downloaded by ffmpeg. '
+ 'Explicitly set the extension in the filename to attempt download in that format')
+ else:
+ self.report_warning(f'The video format is unknown. Trying to download as {ext} according to the filename')
+ args += ['-f', EXT_TO_OUT_FORMATS.get(ext, ext)]
else:
args += ['-f', EXT_TO_OUT_FORMATS.get(ext, ext)]
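The `unknown_video` branch above tries to recover a container format from the target filename once the `.part` suffix is stripped. Roughly:

    from yt_dlp.utils import determine_ext, remove_end

    determine_ext(remove_end('clip.mp4.part', '.part'))  # -> 'mp4'
    determine_ext(remove_end('clip.part', '.part'))      # -> 'unknown_video' again; only the warning remains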
diff --git a/yt_dlp/downloader/fragment.py b/yt_dlp/downloader/fragment.py
index d4f112b0f..19c0990d3 100644
--- a/yt_dlp/downloader/fragment.py
+++ b/yt_dlp/downloader/fragment.py
@@ -14,7 +14,7 @@ except ImportError:
from .common import FileDownloader
from .http import HttpFD
-from ..aes import aes_cbc_decrypt_bytes
+from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7
from ..compat import (
compat_os_name,
compat_urllib_error,
@@ -366,8 +366,7 @@ class FragmentFD(FileDownloader):
# not what it decrypts to.
if self.params.get('test', False):
return frag_content
- decrypted_data = aes_cbc_decrypt_bytes(frag_content, decrypt_info['KEY'], iv)
- return decrypted_data[:-decrypted_data[-1]]
+ return unpad_pkcs7(aes_cbc_decrypt_bytes(frag_content, decrypt_info['KEY'], iv))
return decrypt_fragment
diff --git a/yt_dlp/downloader/websocket.py b/yt_dlp/downloader/websocket.py
index 088222046..daac34884 100644
--- a/yt_dlp/downloader/websocket.py
+++ b/yt_dlp/downloader/websocket.py
@@ -5,9 +5,12 @@ import threading
try:
import websockets
- has_websockets = True
-except ImportError:
+except (ImportError, SyntaxError):
+ # websockets 3.10 on python 3.6 causes SyntaxError
+ # See https://github.com/yt-dlp/yt-dlp/issues/2633
has_websockets = False
+else:
+ has_websockets = True
from .common import FileDownloader
from .external import FFmpegFD
diff --git a/yt_dlp/extractor/abc.py b/yt_dlp/extractor/abc.py
index 354453a27..9d6f5a435 100644
--- a/yt_dlp/extractor/abc.py
+++ b/yt_dlp/extractor/abc.py
@@ -300,11 +300,10 @@ class ABCIViewShowSeriesIE(InfoExtractor):
unescapeHTML(webpage_data).encode('utf-8').decode('unicode_escape'), show_id)
video_data = video_data['route']['pageData']['_embedded']
- if self.get_param('noplaylist') and 'highlightVideo' in video_data:
- self.to_screen('Downloading just the highlight video because of --no-playlist')
- return self.url_result(video_data['highlightVideo']['shareUrl'], ie=ABCIViewIE.ie_key())
+ highlight = try_get(video_data, lambda x: x['highlightVideo']['shareUrl'])
+ if not self._yes_playlist(show_id, bool(highlight), video_label='highlight video'):
+ return self.url_result(highlight, ie=ABCIViewIE.ie_key())
- self.to_screen(f'Downloading playlist {show_id} - add --no-playlist to just download the highlight video')
series = video_data['selectedSeries']
return {
'_type': 'playlist',
diff --git a/yt_dlp/extractor/adn.py b/yt_dlp/extractor/adn.py
index 5a1283baa..0863e0d85 100644
--- a/yt_dlp/extractor/adn.py
+++ b/yt_dlp/extractor/adn.py
@@ -8,11 +8,10 @@ import os
import random
from .common import InfoExtractor
-from ..aes import aes_cbc_decrypt
+from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7
from ..compat import (
compat_HTTPError,
compat_b64decode,
- compat_ord,
)
from ..utils import (
ass_subtitles_timecode,
@@ -84,14 +83,11 @@ class ADNIE(InfoExtractor):
return None
# http://animedigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js
- dec_subtitles = intlist_to_bytes(aes_cbc_decrypt(
- bytes_to_intlist(compat_b64decode(enc_subtitles[24:])),
- bytes_to_intlist(binascii.unhexlify(self._K + 'ab9f52f5baae7c72')),
- bytes_to_intlist(compat_b64decode(enc_subtitles[:24]))
- ))
- subtitles_json = self._parse_json(
- dec_subtitles[:-compat_ord(dec_subtitles[-1])].decode(),
- None, fatal=False)
+ dec_subtitles = unpad_pkcs7(aes_cbc_decrypt_bytes(
+ compat_b64decode(enc_subtitles[24:]),
+ binascii.unhexlify(self._K + 'ab9f52f5baae7c72'),
+ compat_b64decode(enc_subtitles[:24])))
+ subtitles_json = self._parse_json(dec_subtitles.decode(), None, fatal=False)
if not subtitles_json:
return None
diff --git a/yt_dlp/extractor/afreecatv.py b/yt_dlp/extractor/afreecatv.py
index 063872b4f..80853487e 100644
--- a/yt_dlp/extractor/afreecatv.py
+++ b/yt_dlp/extractor/afreecatv.py
@@ -10,7 +10,11 @@ from ..utils import (
determine_ext,
ExtractorError,
int_or_none,
+ qualities,
+ traverse_obj,
unified_strdate,
+ unified_timestamp,
+ update_url_query,
url_or_none,
urlencode_postdata,
xpath_text,
@@ -380,3 +384,96 @@ class AfreecaTVIE(InfoExtractor):
})
return info
+
+
+class AfreecaTVLiveIE(AfreecaTVIE):
+
+ IE_NAME = 'afreecatv:live'
+ _VALID_URL = r'https?://play\.afreeca(?:tv)?\.com/(?P<id>[^/]+)(?:/(?P<bno>\d+))?'
+ _TESTS = [{
+ 'url': 'https://play.afreecatv.com/pyh3646/237852185',
+ 'info_dict': {
+ 'id': '237852185',
+ 'ext': 'mp4',
+ 'title': '【 우루과이 오늘은 무슨일이? 】',
+ 'uploader': '박진우[JINU]',
+ 'uploader_id': 'pyh3646',
+ 'timestamp': 1640661495,
+ 'is_live': True,
+ },
+ 'skip': 'Livestream has ended',
+ }, {
+ 'url': 'http://play.afreeca.com/pyh3646/237852185',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://play.afreeca.com/pyh3646',
+ 'only_matching': True,
+ }]
+
+ _LIVE_API_URL = 'https://live.afreecatv.com/afreeca/player_live_api.php'
+
+ _QUALITIES = ('sd', 'hd', 'hd2k', 'original')
+
+ def _real_extract(self, url):
+ broadcaster_id, broadcast_no = self._match_valid_url(url).group('id', 'bno')
+
+ info = self._download_json(self._LIVE_API_URL, broadcaster_id, fatal=False,
+ data=urlencode_postdata({'bid': broadcaster_id})) or {}
+ channel_info = info.get('CHANNEL') or {}
+ broadcaster_id = channel_info.get('BJID') or broadcaster_id
+ broadcast_no = channel_info.get('BNO') or broadcast_no
+ if not broadcast_no:
+ raise ExtractorError(f'Unable to extract broadcast number ({broadcaster_id} may not be live)', expected=True)
+
+ formats = []
+ quality_key = qualities(self._QUALITIES)
+ for quality_str in self._QUALITIES:
+ aid_response = self._download_json(
+ self._LIVE_API_URL, broadcast_no, fatal=False,
+ data=urlencode_postdata({
+ 'bno': broadcast_no,
+ 'stream_type': 'common',
+ 'type': 'aid',
+ 'quality': quality_str,
+ }),
+ note=f'Downloading access token for {quality_str} stream',
+ errnote=f'Unable to download access token for {quality_str} stream')
+ aid = traverse_obj(aid_response, ('CHANNEL', 'AID'))
+ if not aid:
+ continue
+
+ stream_base_url = channel_info.get('RMD') or 'https://livestream-manager.afreecatv.com'
+ stream_info = self._download_json(
+ f'{stream_base_url}/broad_stream_assign.html', broadcast_no, fatal=False,
+ query={
+ 'return_type': channel_info.get('CDN', 'gcp_cdn'),
+ 'broad_key': f'{broadcast_no}-common-{quality_str}-hls',
+ },
+ note=f'Downloading metadata for {quality_str} stream',
+ errnote=f'Unable to download metadata for {quality_str} stream') or {}
+
+ if stream_info.get('view_url'):
+ formats.append({
+ 'format_id': quality_str,
+ 'url': update_url_query(stream_info['view_url'], {'aid': aid}),
+ 'ext': 'mp4',
+ 'protocol': 'm3u8',
+ 'quality': quality_key(quality_str),
+ })
+
+ self._sort_formats(formats)
+
+ station_info = self._download_json(
+ 'https://st.afreecatv.com/api/get_station_status.php', broadcast_no,
+ query={'szBjId': broadcaster_id}, fatal=False,
+ note='Downloading channel metadata', errnote='Unable to download channel metadata') or {}
+
+ return {
+ 'id': broadcast_no,
+ 'title': channel_info.get('TITLE') or station_info.get('station_title'),
+ 'uploader': channel_info.get('BJNICK') or station_info.get('station_name'),
+ 'uploader_id': broadcaster_id,
+ 'timestamp': unified_timestamp(station_info.get('broad_start')),
+ 'formats': formats,
+ 'is_live': True,
+ }
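`qualities(self._QUALITIES)` above turns tuple position into a sort preference, so 'original' outranks 'sd'. A small sketch:

    from yt_dlp.utils import qualities

    q = qualities(('sd', 'hd', 'hd2k', 'original'))
    assert q('sd') == 0 and q('original') == 3  # higher index = preferred format
    assert q('bogus') == -1                     # unknown IDs sort below everything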
diff --git a/yt_dlp/extractor/aparat.py b/yt_dlp/extractor/aparat.py
index da06a3cac..1057233cf 100644
--- a/yt_dlp/extractor/aparat.py
+++ b/yt_dlp/extractor/aparat.py
@@ -33,19 +33,22 @@ class AparatIE(InfoExtractor):
'only_matching': True,
}]
+ def _parse_options(self, webpage, video_id, fatal=True):
+ return self._parse_json(self._search_regex(
+ r'options\s*=\s*({.+?})\s*;', webpage, 'options', default='{}'), video_id)
+
def _real_extract(self, url):
video_id = self._match_id(url)
- # Provides more metadata
+ # If available, provides more metadata
webpage = self._download_webpage(url, video_id, fatal=False)
+ options = self._parse_options(webpage, video_id, fatal=False)
- if not webpage:
+ if not options:
webpage = self._download_webpage(
'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id,
- video_id)
-
- options = self._parse_json(self._search_regex(
- r'options\s*=\s*({.+?})\s*;', webpage, 'options'), video_id)
+ video_id, 'Downloading embed webpage')
+ options = self._parse_options(webpage, video_id)
formats = []
for sources in (options.get('multiSRC') or []):
diff --git a/yt_dlp/extractor/archiveorg.py b/yt_dlp/extractor/archiveorg.py
index 467fe4875..2a25c0713 100644
--- a/yt_dlp/extractor/archiveorg.py
+++ b/yt_dlp/extractor/archiveorg.py
@@ -19,6 +19,7 @@ from ..utils import (
get_element_by_id,
HEADRequest,
int_or_none,
+ join_nonempty,
KNOWN_EXTENSIONS,
merge_dicts,
mimetype2ext,
@@ -64,7 +65,7 @@ class ArchiveOrgIE(InfoExtractor):
'description': 'md5:43a603fd6c5b4b90d12a96b921212b9c',
'uploader': 'yorkmba99@hotmail.com',
'timestamp': 1387699629,
- 'upload_date': "20131222",
+ 'upload_date': '20131222',
},
}, {
'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
@@ -150,8 +151,7 @@ class ArchiveOrgIE(InfoExtractor):
# Archive.org metadata API doesn't clearly demarcate playlist entries
# or subtitle tracks, so we get them from the embeddable player.
- embed_page = self._download_webpage(
- 'https://archive.org/embed/' + identifier, identifier)
+ embed_page = self._download_webpage(f'https://archive.org/embed/{identifier}', identifier)
playlist = self._playlist_data(embed_page)
entries = {}
@@ -166,17 +166,17 @@ class ArchiveOrgIE(InfoExtractor):
'thumbnails': [],
'artist': p.get('artist'),
'track': p.get('title'),
- 'subtitles': {}}
+ 'subtitles': {},
+ }
for track in p.get('tracks', []):
if track['kind'] != 'subtitles':
continue
-
entries[p['orig']][track['label']] = {
- 'url': 'https://archive.org/' + track['file'].lstrip('/')}
+ 'url': 'https://archive.org/' + track['file'].lstrip('/')
+ }
- metadata = self._download_json(
- 'http://archive.org/metadata/' + identifier, identifier)
+ metadata = self._download_json('http://archive.org/metadata/' + identifier, identifier)
m = metadata['metadata']
identifier = m['identifier']
@@ -189,7 +189,7 @@ class ArchiveOrgIE(InfoExtractor):
'license': m.get('licenseurl'),
'release_date': unified_strdate(m.get('date')),
'timestamp': unified_timestamp(dict_get(m, ['publicdate', 'addeddate'])),
- 'webpage_url': 'https://archive.org/details/' + identifier,
+ 'webpage_url': f'https://archive.org/details/{identifier}',
'location': m.get('venue'),
'release_year': int_or_none(m.get('year'))}
@@ -207,7 +207,7 @@ class ArchiveOrgIE(InfoExtractor):
'discnumber': int_or_none(f.get('disc')),
'release_year': int_or_none(f.get('year'))})
entry = entries[f['name']]
- elif f.get('original') in entries:
+ elif traverse_obj(f, 'original', expected_type=str) in entries:
entry = entries[f['original']]
else:
continue
@@ -230,13 +230,12 @@ class ArchiveOrgIE(InfoExtractor):
'filesize': int_or_none(f.get('size')),
'protocol': 'https'})
- # Sort available formats by filesize
for entry in entries.values():
- entry['formats'] = list(sorted(entry['formats'], key=lambda x: x.get('filesize', -1)))
+ self._sort_formats(entry['formats'])
if len(entries) == 1:
# If there's only one item, use it as the main info dict
- only_video = entries[list(entries.keys())[0]]
+ only_video = next(iter(entries.values()))
if entry_id:
info = merge_dicts(only_video, info)
else:
@@ -261,19 +260,19 @@ class ArchiveOrgIE(InfoExtractor):
class YoutubeWebArchiveIE(InfoExtractor):
IE_NAME = 'web.archive:youtube'
- IE_DESC = 'web.archive.org saved youtube videos'
- _VALID_URL = r"""(?x)^
- (?:https?://)?web\.archive\.org/
- (?:web/)?
- (?:(?P<date>[0-9]{14})?[0-9A-Za-z_*]*/)? # /web and the version index is optional
-
- (?:https?(?::|%3[Aa])//)?
- (?:
- (?:\w+\.)?youtube\.com(?::(?:80|443))?/watch(?:\.php)?(?:\?|%3[fF])(?:[^\#]+(?:&|%26))?v(?:=|%3[dD]) # Youtube URL
- |(?:wayback-fakeurl\.archive\.org/yt/) # Or the internal fake url
- )
- (?P<id>[0-9A-Za-z_-]{11})(?:%26|\#|&|$)
- """
+ IE_DESC = 'web.archive.org saved youtube videos, "ytarchive:" prefix'
+ _VALID_URL = r'''(?x)(?:(?P<prefix>ytarchive:)|
+ (?:https?://)?web\.archive\.org/
+ (?:web/)?(?:(?P<date>[0-9]{14})?[0-9A-Za-z_*]*/)? # /web and the version index is optional
+ (?:https?(?::|%3[Aa])//)?(?:
+ (?:\w+\.)?youtube\.com(?::(?:80|443))?/watch(?:\.php)?(?:\?|%3[fF])(?:[^\#]+(?:&|%26))?v(?:=|%3[dD]) # Youtube URL
+ |(?:wayback-fakeurl\.archive\.org/yt/) # Or the internal fake url
+ )
+ )(?P<id>[0-9A-Za-z_-]{11})
+ (?(prefix)
+ (?::(?P<date2>[0-9]{14}))?$|
+ (?:%26|[#&]|$)
+ )'''
_TESTS = [
{
@@ -438,7 +437,13 @@ class YoutubeWebArchiveIE(InfoExtractor):
}, {
'url': 'https://web.archive.org/http://www.youtube.com:80/watch?v=-05VVye-ffg',
'only_matching': True
- }
+ }, {
+ 'url': 'ytarchive:BaW_jenozKc:20050214000000',
+ 'only_matching': True
+ }, {
+ 'url': 'ytarchive:BaW_jenozKc',
+ 'only_matching': True
+ },
]
_YT_INITIAL_DATA_RE = r'(?:(?:(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_DATA_RE
_YT_INITIAL_PLAYER_RESPONSE_RE = r'(?:(?:(?:window\s*\[\s*["\']ytInitialPlayerResponse["\']\s*\]|ytInitialPlayerResponse)\s*=[(\s]*({.+?})[)\s]*;)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_PLAYER_RESPONSE_RE
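The rewritten `_VALID_URL` relies on a conditional group, `(?(prefix)A|B)`, which takes branch A only if the named group participated in the match. A reduced sketch of the same shape:

    import re

    pat = re.compile(r'(?:(?P<prefix>ytarchive:))?(?P<id>[0-9A-Za-z_-]{11})'
                     r'(?(prefix)(?::(?P<date>[0-9]{14}))?$|$)')
    pat.match('ytarchive:BaW_jenozKc:20050214000000').group('id', 'date')
    # -> ('BaW_jenozKc', '20050214000000')
    pat.match('BaW_jenozKc').group('date')  # -> None; plain IDs take the other branch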
@@ -484,7 +489,6 @@ class YoutubeWebArchiveIE(InfoExtractor):
page_title, 'title', default='')
def _extract_metadata(self, video_id, webpage):
-
search_meta = ((lambda x: self._html_search_meta(x, webpage, default=None)) if webpage else (lambda x: None))
player_response = self._extract_yt_initial_variable(
webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE, video_id, 'initial player response') or {}
@@ -596,7 +600,7 @@ class YoutubeWebArchiveIE(InfoExtractor):
# Prefer the new polymer UI captures as we support extracting more metadata from them
# WBM captures seem to all switch to this layout ~July 2020
- modern_captures = list(filter(lambda x: x >= 20200701000000, all_captures))
+ modern_captures = [x for x in all_captures if x >= 20200701000000]
if modern_captures:
capture_dates.append(modern_captures[0])
capture_dates.append(url_date)
@@ -608,11 +612,11 @@ class YoutubeWebArchiveIE(InfoExtractor):
# Fallbacks if any of the above fail
capture_dates.extend([self._OLDEST_CAPTURE_DATE, self._NEWEST_CAPTURE_DATE])
- return orderedSet(capture_dates)
+ return orderedSet(filter(None, capture_dates))
def _real_extract(self, url):
-
- url_date, video_id = self._match_valid_url(url).groups()
+ video_id, url_date, url_date_2 = self._match_valid_url(url).group('id', 'date', 'date2')
+ url_date = url_date or url_date_2
urlh = None
try:
@@ -629,11 +633,9 @@ class YoutubeWebArchiveIE(InfoExtractor):
raise
capture_dates = self._get_capture_dates(video_id, int_or_none(url_date))
- self.write_debug('Captures to try: ' + ', '.join(str(i) for i in capture_dates if i is not None))
+ self.write_debug('Captures to try: ' + join_nonempty(*capture_dates, delim=', '))
info = {'id': video_id}
for capture in capture_dates:
- if not capture:
- continue
webpage = self._download_webpage(
(self._WAYBACK_BASE_URL + 'http://www.youtube.com/watch?v=%s') % (capture, video_id),
video_id=video_id, fatal=False, errnote='unable to download capture webpage (it may not be archived)',
@@ -648,7 +650,7 @@ class YoutubeWebArchiveIE(InfoExtractor):
info['thumbnails'] = self._extract_thumbnails(video_id)
if urlh:
- url = compat_urllib_parse_unquote(urlh.url)
+ url = compat_urllib_parse_unquote(urlh.geturl())
video_file_url_qs = parse_qs(url)
# Attempt to recover any ext & format info from playback url & response headers
format = {'url': url, 'filesize': int_or_none(urlh.headers.get('x-archive-orig-content-length'))}
diff --git a/yt_dlp/extractor/ard.py b/yt_dlp/extractor/ard.py
index 1aff0361c..4ad5d6ddd 100644
--- a/yt_dlp/extractor/ard.py
+++ b/yt_dlp/extractor/ard.py
@@ -376,9 +376,24 @@ class ARDIE(InfoExtractor):
formats.append(f)
self._sort_formats(formats)
+ _SUB_FORMATS = (
+ ('./dataTimedText', 'ttml'),
+ ('./dataTimedTextNoOffset', 'ttml'),
+ ('./dataTimedTextVtt', 'vtt'),
+ )
+
+ subtitles = {}
+ for subsel, subext in _SUB_FORMATS:
+ for node in video_node.findall(subsel):
+ subtitles.setdefault('de', []).append({
+ 'url': node.attrib['url'],
+ 'ext': subext,
+ })
+
return {
'id': xpath_text(video_node, './videoId', default=display_id),
'formats': formats,
+ 'subtitles': subtitles,
'display_id': display_id,
'title': video_node.find('./title').text,
'duration': parse_duration(video_node.find('./duration').text),
diff --git a/yt_dlp/extractor/arnes.py b/yt_dlp/extractor/arnes.py
index c0032fcab..050c252e3 100644
--- a/yt_dlp/extractor/arnes.py
+++ b/yt_dlp/extractor/arnes.py
@@ -7,6 +7,7 @@ from ..compat import (
compat_urllib_parse_urlparse,
)
from ..utils import (
+ format_field,
float_or_none,
int_or_none,
parse_iso8601,
@@ -92,7 +93,7 @@ class ArnesIE(InfoExtractor):
'timestamp': parse_iso8601(video.get('creationTime')),
'channel': channel.get('name'),
'channel_id': channel_id,
- 'channel_url': self._BASE_URL + '/?channel=' + channel_id if channel_id else None,
+ 'channel_url': format_field(channel_id, template=f'{self._BASE_URL}/?channel=%s'),
'duration': float_or_none(video.get('duration'), 1000),
'view_count': int_or_none(video.get('views')),
'tags': video.get('hashtags'),
diff --git a/yt_dlp/extractor/awaan.py b/yt_dlp/extractor/awaan.py
index b5d1b57af..f5e559c9f 100644
--- a/yt_dlp/extractor/awaan.py
+++ b/yt_dlp/extractor/awaan.py
@@ -9,6 +9,7 @@ from ..compat import (
compat_str,
)
from ..utils import (
+ format_field,
int_or_none,
parse_iso8601,
smuggle_url,
@@ -43,7 +44,7 @@ class AWAANBaseIE(InfoExtractor):
'id': video_id,
'title': title,
'description': video_data.get('description_en') or video_data.get('description_ar'),
- 'thumbnail': 'http://admin.mangomolo.com/analytics/%s' % img if img else None,
+ 'thumbnail': format_field(img, template='http://admin.mangomolo.com/analytics/%s'),
'duration': int_or_none(video_data.get('duration')),
'timestamp': parse_iso8601(video_data.get('create_time'), ' '),
'is_live': is_live,
diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py
index 2cb01ff83..a775aa97f 100644
--- a/yt_dlp/extractor/bilibili.py
+++ b/yt_dlp/extractor/bilibili.py
@@ -1,5 +1,6 @@
# coding: utf-8
+import base64
import hashlib
import itertools
import functools
@@ -16,9 +17,9 @@ from ..utils import (
ExtractorError,
int_or_none,
float_or_none,
+ mimetype2ext,
parse_iso8601,
traverse_obj,
- try_get,
parse_count,
smuggle_url,
srt_subtitles_timecode,
@@ -51,16 +52,14 @@ class BiliBiliIE(InfoExtractor):
'url': 'http://www.bilibili.com/video/av1074402/',
'md5': '5f7d29e1a2872f3df0cf76b1f87d3788',
'info_dict': {
- 'id': '1074402',
- 'ext': 'flv',
+ 'id': '1074402_part1',
+ 'ext': 'mp4',
'title': '【金坷垃】金泡沫',
+ 'uploader_id': '156160',
+ 'uploader': '菊子桑',
+ 'upload_date': '20140420',
'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923',
- 'duration': 308.067,
'timestamp': 1398012678,
- 'upload_date': '20140420',
- 'thumbnail': r're:^https?://.+\.jpg',
- 'uploader': '菊子桑',
- 'uploader_id': '156160',
},
}, {
# Tested in BiliBiliBangumiIE
@@ -74,49 +73,27 @@ class BiliBiliIE(InfoExtractor):
'url': 'http://bangumi.bilibili.com/anime/5802/play#100643',
'md5': '3f721ad1e75030cc06faf73587cfec57',
'info_dict': {
- 'id': '100643',
+ 'id': '100643_part1',
'ext': 'mp4',
'title': 'CHAOS;CHILD',
'description': '如果你是神明,并且能够让妄想成为现实。那你会进行怎么样的妄想?是淫靡的世界?独裁社会?毁灭性的制裁?还是……2015年,涩谷。从6年前发生的大灾害“涩谷地震”之后复兴了的这个街区里新设立的私立高中...',
},
'skip': 'Geo-restricted to China',
}, {
- # Title with double quotes
'url': 'http://www.bilibili.com/video/av8903802/',
'info_dict': {
- 'id': '8903802',
+ 'id': '8903802_part1',
+ 'ext': 'mp4',
'title': '阿滴英文|英文歌分享#6 "Closer',
+ 'upload_date': '20170301',
'description': '滴妹今天唱Closer給你聽! 有史以来,被推最多次也是最久的歌曲,其实歌词跟我原本想像差蛮多的,不过还是好听! 微博@阿滴英文',
+ 'timestamp': 1488382634,
+ 'uploader_id': '65880958',
+ 'uploader': '阿滴英文',
+ },
+ 'params': {
+ 'skip_download': True,
},
- 'playlist': [{
- 'info_dict': {
- 'id': '8903802_part1',
- 'ext': 'flv',
- 'title': '阿滴英文|英文歌分享#6 "Closer',
- 'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a',
- 'uploader': '阿滴英文',
- 'uploader_id': '65880958',
- 'timestamp': 1488382634,
- 'upload_date': '20170301',
- },
- 'params': {
- 'skip_download': True,
- },
- }, {
- 'info_dict': {
- 'id': '8903802_part2',
- 'ext': 'flv',
- 'title': '阿滴英文|英文歌分享#6 "Closer',
- 'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a',
- 'uploader': '阿滴英文',
- 'uploader_id': '65880958',
- 'timestamp': 1488382634,
- 'upload_date': '20170301',
- },
- 'params': {
- 'skip_download': True,
- },
- }]
}, {
# new BV video id format
'url': 'https://www.bilibili.com/video/BV1JE411F741',
@@ -151,6 +128,7 @@ class BiliBiliIE(InfoExtractor):
av_id, bv_id = self._get_video_id_set(video_id, mobj.group('id_bv') is not None)
video_id = av_id
+ info = {}
anime_id = mobj.group('anime_id')
page_id = mobj.group('page')
webpage = self._download_webpage(url, video_id)
@@ -202,35 +180,48 @@ class BiliBiliIE(InfoExtractor):
}
headers.update(self.geo_verification_headers())
+ video_info = self._parse_json(
+ self._search_regex(r'window\.__playinfo__\s*=\s*({.+?})</script>', webpage, 'video info', default=None) or '{}',
+ video_id, fatal=False)
+ video_info = video_info.get('data') or {}
+
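+ # Prefer the stream info embedded in the page; fall back to the playurl API below when it is missing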
+ durl = traverse_obj(video_info, ('dash', 'video'))
+ audios = traverse_obj(video_info, ('dash', 'audio')) or []
entries = []
RENDITIONS = ('qn=80&quality=80&type=', 'quality=2&type=mp4')
for num, rendition in enumerate(RENDITIONS, start=1):
payload = 'appkey=%s&cid=%s&otype=json&%s' % (self._APP_KEY, cid, rendition)
sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest()
-
- video_info = self._download_json(
- 'http://interface.bilibili.com/v2/playurl?%s&sign=%s' % (payload, sign),
- video_id, note='Downloading video info page',
- headers=headers, fatal=num == len(RENDITIONS))
-
if not video_info:
- continue
+ video_info = self._download_json(
+ 'http://interface.bilibili.com/v2/playurl?%s&sign=%s' % (payload, sign),
+ video_id, note='Downloading video info page',
+ headers=headers, fatal=num == len(RENDITIONS))
+ if not video_info:
+ continue
- if 'durl' not in video_info:
+ if not durl and 'durl' not in video_info:
if num < len(RENDITIONS):
continue
self._report_error(video_info)
- for idx, durl in enumerate(video_info['durl']):
- formats = [{
- 'url': durl['url'],
- 'filesize': int_or_none(durl['size']),
- }]
- for backup_url in durl.get('backup_url', []):
+ formats = []
+ for idx, durl in enumerate(durl or video_info['durl']):
+ formats.append({
+ 'url': durl.get('baseUrl') or durl.get('base_url') or durl.get('url'),
+ 'ext': mimetype2ext(durl.get('mimeType') or durl.get('mime_type')),
+ 'fps': int_or_none(durl.get('frameRate') or durl.get('frame_rate')),
+ 'width': int_or_none(durl.get('width')),
+ 'height': int_or_none(durl.get('height')),
+ 'vcodec': durl.get('codecs'),
+ 'acodec': 'none' if audios else None,
+ 'tbr': float_or_none(durl.get('bandwidth'), scale=1000),
+ 'filesize': int_or_none(durl.get('size')),
+ })
+ for backup_url in traverse_obj(durl, 'backup_url', expected_type=list) or []:
formats.append({
'url': backup_url,
- # backup URLs have lower priorities
'quality': -2 if 'hd.mp4' in backup_url else -3,
})
@@ -238,30 +229,47 @@ class BiliBiliIE(InfoExtractor):
a_format.setdefault('http_headers', {}).update({
'Referer': url,
})
-
- self._sort_formats(formats)
-
- entries.append({
- 'id': '%s_part%s' % (video_id, idx),
- 'duration': float_or_none(durl.get('length'), 1000),
- 'formats': formats,
+ for audio in audios:
+ formats.append({
+ 'url': audio.get('baseUrl') or audio.get('base_url') or audio.get('url'),
+ 'ext': mimetype2ext(audio.get('mimeType') or audio.get('mime_type')),
+ 'fps': int_or_none(audio.get('frameRate') or audio.get('frame_rate')),
+ 'width': int_or_none(audio.get('width')),
+ 'height': int_or_none(audio.get('height')),
+ 'acodec': audio.get('codecs'),
+ 'vcodec': 'none',
+ 'tbr': float_or_none(audio.get('bandwidth'), scale=1000),
+ 'filesize': int_or_none(audio.get('size'))
})
+ for backup_url in traverse_obj(audio, 'backup_url', expected_type=list) or []:
+ formats.append({
+ 'url': backup_url,
+ # backup URLs have lower priorities
+ 'quality': -3,
+ })
+
+ info.update({
+ 'id': video_id,
+ 'duration': float_or_none(durl.get('length'), 1000),
+ 'formats': formats,
+ })
break
- title = self._html_search_regex(
- (r'<h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1',
- r'(?s)<h1[^>]*>(?P<title>.+?)</h1>'), webpage, 'title',
- group='title')
+ self._sort_formats(formats)
+
+ title = self._html_search_regex((
+ r'<h1[^>]+title=(["\'])(?P<content>[^"\']+)',
+ r'(?s)<h1[^>]*>(?P<content>.+?)</h1>',
+ self._meta_regex('title')
+ ), webpage, 'title', group='content', fatal=False)
# Get part title for anthologies
if page_id is not None:
- # TODO: The json is already downloaded by _extract_anthology_entries. Don't redownload for each video
- part_title = try_get(
- self._download_json(
- f'https://api.bilibili.com/x/player/pagelist?bvid={bv_id}&jsonp=jsonp',
- video_id, note='Extracting videos in anthology'),
- lambda x: x['data'][int(page_id) - 1]['part'])
- title = part_title or title
+ # TODO: The json is already downloaded by _extract_anthology_entries. Don't redownload for each video.
+ part_info = traverse_obj(self._download_json(
+ f'https://api.bilibili.com/x/player/pagelist?bvid={bv_id}&jsonp=jsonp',
+ video_id, note='Extracting videos in anthology'), 'data', expected_type=list)
+ title = title if len(part_info) == 1 else traverse_obj(part_info, (int(page_id) - 1, 'part')) or title
description = self._html_search_meta('description', webpage)
timestamp = unified_timestamp(self._html_search_regex(
@@ -271,15 +279,15 @@ class BiliBiliIE(InfoExtractor):
thumbnail = self._html_search_meta(['og:image', 'thumbnailUrl'], webpage)
# TODO 'view_count' requires deobfuscating Javascript
- info = {
- 'id': str(video_id) if page_id is None else '%s_part%s' % (video_id, page_id),
+ info.update({
+ 'id': f'{video_id}_part{page_id or 1}',
'cid': cid,
'title': title,
'description': description,
'timestamp': timestamp,
'thumbnail': thumbnail,
'duration': float_or_none(video_info.get('timelength'), scale=1000),
- }
+ })
uploader_mobj = re.search(
r'<a[^>]+href="(?:https?:)?//space\.bilibili\.com/(?P<id>\d+)"[^>]*>\s*(?P<name>[^<]+?)\s*<',
@@ -300,7 +308,7 @@ class BiliBiliIE(InfoExtractor):
video_id, fatal=False, note='Downloading tags'), ('data', ..., 'tag_name')),
}
- entries[0]['subtitles'] = {
+ info['subtitles'] = {
'danmaku': [{
'ext': 'xml',
'url': f'https://comment.bilibili.com/{cid}.xml',
@@ -335,12 +343,10 @@ class BiliBiliIE(InfoExtractor):
entry['id'] = '%s_part%d' % (video_id, (idx + 1))
return {
- '_type': 'multi_video',
'id': str(video_id),
'bv_id': bv_id,
'title': title,
'description': description,
- 'entries': entries,
**info, **top_level_info
}
@@ -481,9 +487,9 @@ class BilibiliChannelIE(InfoExtractor):
data = self._download_json(
self._API_URL % (list_id, page_num), list_id, note=f'Downloading page {page_num}')['data']
- max_count = max_count or try_get(data, lambda x: x['page']['count'])
+ max_count = max_count or traverse_obj(data, ('page', 'count'))
- entries = try_get(data, lambda x: x['list']['vlist'])
+ entries = traverse_obj(data, ('list', 'vlist'))
if not entries:
return
for entry in entries:
@@ -521,7 +527,7 @@ class BilibiliCategoryIE(InfoExtractor):
api_url, query, query={'Search_key': query, 'pn': page_num},
note='Extracting results from page %s of %s' % (page_num, num_pages))
- video_list = try_get(parsed_json, lambda x: x['data']['archives'], list)
+ video_list = traverse_obj(parsed_json, ('data', 'archives'), expected_type=list)
if not video_list:
raise ExtractorError('Failed to retrieve video list for page %d' % page_num)
@@ -551,7 +557,7 @@ class BilibiliCategoryIE(InfoExtractor):
api_url = 'https://api.bilibili.com/x/web-interface/newlist?rid=%d&type=1&ps=20&jsonp=jsonp' % rid_value
page_json = self._download_json(api_url, query, query={'Search_key': query, 'pn': '1'})
- page_data = try_get(page_json, lambda x: x['data']['page'], dict)
+ page_data = traverse_obj(page_json, ('data', 'page'), expected_type=dict)
count, size = int_or_none(page_data.get('count')), int_or_none(page_data.get('size'))
if count is None or not size:
raise ExtractorError('Failed to calculate either page count or size')
@@ -724,14 +730,30 @@ class BiliBiliPlayerIE(InfoExtractor):
class BiliIntlBaseIE(InfoExtractor):
_API_URL = 'https://api.bilibili.tv/intl/gateway'
+ _NETRC_MACHINE = 'biliintl'
def _call_api(self, endpoint, *args, **kwargs):
- return self._download_json(self._API_URL + endpoint, *args, **kwargs)['data']
+ json = self._download_json(self._API_URL + endpoint, *args, **kwargs)
+ if json.get('code'):
+ if json['code'] in (10004004, 10004005, 10023006):
+ self.raise_login_required()
+ elif json['code'] == 10004001:
+ self.raise_geo_restricted()
+ else:
+ if json.get('message') and str(json['code']) != json['message']:
+ errmsg = f'{kwargs.get("errnote", "Unable to download JSON metadata")}: {self.IE_NAME} said: {json["message"]}'
+ else:
+ errmsg = kwargs.get('errnote', 'Unable to download JSON metadata')
+ if kwargs.get('fatal'):
+ raise ExtractorError(errmsg)
+ else:
+ self.report_warning(errmsg)
+ return json.get('data')
def json2srt(self, json):
data = '\n\n'.join(
f'{i + 1}\n{srt_subtitles_timecode(line["from"])} --> {srt_subtitles_timecode(line["to"])}\n{line["content"]}'
- for i, line in enumerate(json['body']))
+ for i, line in enumerate(json['body']) if line.get('content'))
return data
def _get_subtitles(self, ep_id):
@@ -755,16 +777,6 @@ class BiliIntlBaseIE(InfoExtractor):
def _get_formats(self, ep_id):
video_json = self._call_api(f'/web/playurl?ep_id={ep_id}&platform=web', ep_id,
note='Downloading video formats', errnote='Unable to download video formats')
- if video_json.get('code'):
- if video_json['code'] in (10004004, 10004005, 10023006):
- self.raise_login_required(method='cookies')
- elif video_json['code'] == 10004001:
- self.raise_geo_restricted()
- elif video_json.get('message') and str(video_json['code']) != video_json['message']:
- raise ExtractorError(
- f'Unable to download video formats: {self.IE_NAME} said: {video_json["message"]}', expected=True)
- else:
- raise ExtractorError('Unable to download video formats')
video_json = video_json['playurl']
formats = []
for vid in video_json.get('video') or []:
@@ -810,10 +822,49 @@ class BiliIntlBaseIE(InfoExtractor):
'extractor_key': BiliIntlIE.ie_key(),
}
+ def _login(self):
+ username, password = self._get_login_info()
+ if username is None:
+ return
+
+ try:
+ from Cryptodome.PublicKey import RSA
+ from Cryptodome.Cipher import PKCS1_v1_5
+ except ImportError:
+ try:
+ from Crypto.PublicKey import RSA
+ from Crypto.Cipher import PKCS1_v1_5
+ except ImportError:
+ raise ExtractorError('pycryptodomex not found. Please install it.', expected=True)
+
+ key_data = self._download_json(
+ 'https://passport.bilibili.tv/x/intl/passport-login/web/key?lang=en-US', None,
+ note='Downloading login key', errnote='Unable to download login key')['data']
+
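+ # The password is prefixed with the server-supplied hash before being RSA-encrypted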
+ public_key = RSA.importKey(key_data['key'])
+ password_hash = PKCS1_v1_5.new(public_key).encrypt((key_data['hash'] + password).encode('utf-8'))
+ login_post = self._download_json(
+ 'https://passport.bilibili.tv/x/intl/passport-login/web/login/password?lang=en-US', None, data=urlencode_postdata({
+ 'username': username,
+ 'password': base64.b64encode(password_hash).decode('ascii'),
+ 'keep_me': 'true',
+ 's_locale': 'en_US',
+ 'isTrusted': 'true'
+ }), note='Logging in', errnote='Unable to log in')
+ if login_post.get('code'):
+ if login_post.get('message'):
+ raise ExtractorError(f'Unable to log in: {self.IE_NAME} said: {login_post["message"]}', expected=True)
+ else:
+ raise ExtractorError('Unable to log in')
+
+ def _real_initialize(self):
+ self._login()
+
class BiliIntlIE(BiliIntlBaseIE):
_VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-z]{2}/)?play/(?P<season_id>\d+)/(?P<id>\d+)'
_TESTS = [{
+ # Bstation page
'url': 'https://www.bilibili.tv/en/play/34613/341736',
'info_dict': {
'id': '341736',
@@ -823,6 +874,7 @@ class BiliIntlIE(BiliIntlBaseIE):
'episode_number': 2,
}
}, {
+ # Non-Bstation page
'url': 'https://www.bilibili.tv/en/play/1033760/11005006',
'info_dict': {
'id': '11005006',
@@ -832,6 +884,17 @@ class BiliIntlIE(BiliIntlBaseIE):
'episode_number': 3,
}
}, {
+ # Subtitle with empty content
+ 'url': 'https://www.bilibili.tv/en/play/1005144/10131790',
+ 'info_dict': {
+ 'id': '10131790',
+ 'ext': 'mp4',
+ 'title': 'E140 - Two Heartbeats: Kabuto\'s Trap',
+ 'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$',
+ 'episode_number': 140,
+ },
+ 'skip': 'According to the copyright owner\'s request, you may only watch the video after you log in.'
+ }, {
'url': 'https://www.biliintl.com/en/play/34613/341736',
'only_matching': True,
}]
diff --git a/yt_dlp/extractor/callin.py b/yt_dlp/extractor/callin.py
new file mode 100644
index 000000000..acf327ace
--- /dev/null
+++ b/yt_dlp/extractor/callin.py
@@ -0,0 +1,114 @@
+# coding: utf-8
+from .common import InfoExtractor
+from ..utils import (
+ traverse_obj,
+ float_or_none,
+ int_or_none
+)
+
+
+class CallinIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?callin\.com/(episode)/(?P<id>[-a-zA-Z]+)'
+ _TESTS = [{
+ 'url': 'https://www.callin.com/episode/the-title-ix-regime-and-the-long-march-through-EBfXYSrsjc',
+ 'info_dict': {
+ 'id': '218b979630a35ead12c6fd096f2996c56c37e4d0dc1f6dc0feada32dcf7b31cd',
+ 'title': 'The Title IX Regime and the Long March Through and Beyond the Institutions',
+ 'ext': 'ts',
+ 'display_id': 'the-title-ix-regime-and-the-long-march-through-EBfXYSrsjc',
+ 'thumbnail': 're:https://.+\\.png',
+ 'description': 'First episode',
+ 'uploader': 'Wesley Yang',
+ 'timestamp': 1639404128.65,
+ 'upload_date': '20211213',
+ 'uploader_id': 'wesyang',
+ 'uploader_url': 'http://wesleyyang.substack.com',
+ 'channel': 'Conversations in Year Zero',
+ 'channel_id': '436d1f82ddeb30cd2306ea9156044d8d2cfdc3f1f1552d245117a42173e78553',
+ 'channel_url': 'https://callin.com/show/conversations-in-year-zero-oJNllRFSfx',
+ 'duration': 9951.936,
+ 'view_count': int,
+ 'categories': ['News & Politics', 'History', 'Technology'],
+ 'cast': ['Wesley Yang', 'KC Johnson', 'Gabi Abramovich'],
+ 'series': 'Conversations in Year Zero',
+ 'series_id': '436d1f82ddeb30cd2306ea9156044d8d2cfdc3f1f1552d245117a42173e78553',
+ 'episode': 'The Title IX Regime and the Long March Through and Beyond the Institutions',
+ 'episode_number': 1,
+ 'episode_id': '218b979630a35ead12c6fd096f2996c56c37e4d0dc1f6dc0feada32dcf7b31cd'
+ }
+ }]
+
+ def try_get_user_name(self, d):
+ names = [d.get(n) for n in ('first', 'last')]
+ if None in names:
+ return next((n for n in names if n), None)
+ return ' '.join(names)
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+
+ next_data = self._search_nextjs_data(webpage, display_id)
+ episode = next_data['props']['pageProps']['episode']
+
+ id = episode['id']
+ title = (episode.get('title')
+ or self._og_search_title(webpage, fatal=False)
+ or self._html_search_regex('<title>(.*?)</title>', webpage, 'title'))
+ url = episode['m3u8']
+ formats = self._extract_m3u8_formats(url, display_id, ext='ts')
+ self._sort_formats(formats)
+
+ show = traverse_obj(episode, ('show', 'title'))
+ show_id = traverse_obj(episode, ('show', 'id'))
+
+ show_json = None
+ app_slug = (self._html_search_regex(
+ '<script\\s+src=["\']/_next/static/([-_a-zA-Z0-9]+)/_',
+ webpage, 'app slug', fatal=False) or next_data.get('buildId'))
+ show_slug = traverse_obj(episode, ('show', 'linkObj', 'resourceUrl'))
+ if app_slug and show_slug and '/' in show_slug:
+ show_slug = show_slug.rsplit('/', 1)[1]
+ show_json_url = f'https://www.callin.com/_next/data/{app_slug}/show/{show_slug}.json'
+ show_json = self._download_json(show_json_url, display_id, fatal=False)
+
+ host = (traverse_obj(show_json, ('pageProps', 'show', 'hosts', 0))
+ or traverse_obj(episode, ('speakers', 0)))
+
+ host_nick = traverse_obj(host, ('linkObj', 'resourceUrl'))
+ host_nick = host_nick.rsplit('/', 1)[1] if (host_nick and '/' in host_nick) else None
+
+ cast = list(filter(None, [
+ self.try_get_user_name(u) for u in
+ traverse_obj(episode, (('speakers', 'callerTags'), ...)) or []
+ ]))
+
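+ # The episode list appears newest-first, so count back from the end for the 1-based episode number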
+ episode_list = traverse_obj(show_json, ('pageProps', 'show', 'episodes')) or []
+ episode_number = next(
+ (len(episode_list) - i for (i, e) in enumerate(episode_list) if e.get('id') == id),
+ None)
+
+ return {
+ 'id': id,
+ 'display_id': display_id,
+ 'title': title,
+ 'formats': formats,
+ 'thumbnail': traverse_obj(episode, ('show', 'photo')),
+ 'description': episode.get('description'),
+ 'uploader': self.try_get_user_name(host) if host else None,
+ 'timestamp': episode.get('publishedAt'),
+ 'uploader_id': host_nick,
+ 'uploader_url': traverse_obj(show_json, ('pageProps', 'show', 'url')),
+ 'channel': show,
+ 'channel_id': show_id,
+ 'channel_url': traverse_obj(episode, ('show', 'linkObj', 'resourceUrl')),
+ 'duration': float_or_none(episode.get('runtime')),
+ 'view_count': int_or_none(episode.get('plays')),
+ 'categories': traverse_obj(episode, ('show', 'categorizations', ..., 'name')),
+ 'cast': cast if cast else None,
+ 'series': show,
+ 'series_id': show_id,
+ 'episode': title,
+ 'episode_number': episode_number,
+ 'episode_id': id
+ }
diff --git a/yt_dlp/extractor/cam4.py b/yt_dlp/extractor/cam4.py
index f47de9176..2a3931fd0 100644
--- a/yt_dlp/extractor/cam4.py
+++ b/yt_dlp/extractor/cam4.py
@@ -13,6 +13,8 @@ class CAM4IE(InfoExtractor):
'ext': 'mp4',
'title': 're:^foxynesss [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'age_limit': 18,
+ 'live_status': 'is_live',
+ 'thumbnail': 'https://snapshots.xcdnpro.com/thumbnails/foxynesss',
}
}
@@ -29,4 +31,5 @@ class CAM4IE(InfoExtractor):
'is_live': True,
'age_limit': 18,
'formats': formats,
+ 'thumbnail': f'https://snapshots.xcdnpro.com/thumbnails/{channel_id}',
}
diff --git a/yt_dlp/extractor/canalalpha.py b/yt_dlp/extractor/canalalpha.py
index 51d30a321..0365cb2f6 100644
--- a/yt_dlp/extractor/canalalpha.py
+++ b/yt_dlp/extractor/canalalpha.py
@@ -78,11 +78,11 @@ class CanalAlphaIE(InfoExtractor):
'height': try_get(video, lambda x: x['res']['height'], expected_type=int),
} for video in try_get(data_json, lambda x: x['video']['mp4'], expected_type=list) or [] if video.get('$url')]
if manifests.get('hls'):
- m3u8_frmts, m3u8_subs = self._parse_m3u8_formats_and_subtitles(manifests['hls'], id)
+ m3u8_frmts, m3u8_subs = self._parse_m3u8_formats_and_subtitles(manifests['hls'], video_id=id)
formats.extend(m3u8_frmts)
subtitles = self._merge_subtitles(subtitles, m3u8_subs)
if manifests.get('dash'):
- dash_frmts, dash_subs = self._parse_mpd_formats_and_subtitles(manifests['dash'], id)
+ dash_frmts, dash_subs = self._parse_mpd_formats_and_subtitles(manifests['dash'])
formats.extend(dash_frmts)
subtitles = self._merge_subtitles(subtitles, dash_subs)
self._sort_formats(formats)
diff --git a/yt_dlp/extractor/canvas.py b/yt_dlp/extractor/canvas.py
index e97c91929..82fded4e1 100644
--- a/yt_dlp/extractor/canvas.py
+++ b/yt_dlp/extractor/canvas.py
@@ -76,7 +76,7 @@ class CanvasIE(InfoExtractor):
'vrtPlayerToken': vrtPlayerToken,
'client': 'null',
}, expected_status=400)
- if not data.get('title'):
+ if 'title' not in data:
code = data.get('code')
if code == 'AUTHENTICATION_REQUIRED':
self.raise_login_required()
@@ -84,7 +84,8 @@ class CanvasIE(InfoExtractor):
self.raise_geo_restricted(countries=['BE'])
raise ExtractorError(data.get('message') or code, expected=True)
- title = data['title']
+ # Note: The title may be an empty string
+ title = data['title'] or f'{site_id} {video_id}'
description = data.get('description')
formats = []
diff --git a/yt_dlp/extractor/carambatv.py b/yt_dlp/extractor/carambatv.py
index b57b86af7..7e5cc90fb 100644
--- a/yt_dlp/extractor/carambatv.py
+++ b/yt_dlp/extractor/carambatv.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
+ format_field,
float_or_none,
int_or_none,
try_get,
@@ -43,7 +44,7 @@ class CarambaTVIE(InfoExtractor):
formats = [{
'url': base_url + f['fn'],
'height': int_or_none(f.get('height')),
- 'format_id': '%sp' % f['height'] if f.get('height') else None,
+ 'format_id': format_field(f, 'height', '%sp'),
} for f in video['qualities'] if f.get('fn')]
self._sort_formats(formats)
diff --git a/yt_dlp/extractor/cctv.py b/yt_dlp/extractor/cctv.py
index 9b8612138..0ed5f327b 100644
--- a/yt_dlp/extractor/cctv.py
+++ b/yt_dlp/extractor/cctv.py
@@ -162,7 +162,8 @@ class CCTVIE(InfoExtractor):
'url': video_url,
'format_id': 'http',
'quality': quality,
- 'source_preference': -10
+ # Sample clip
+ 'preference': -10
})
hls_url = try_get(data, lambda x: x['hls_url'], compat_str)
diff --git a/yt_dlp/extractor/ceskatelevize.py b/yt_dlp/extractor/ceskatelevize.py
index 6c90b247e..ddf66b207 100644
--- a/yt_dlp/extractor/ceskatelevize.py
+++ b/yt_dlp/extractor/ceskatelevize.py
@@ -177,6 +177,7 @@ class CeskaTelevizeIE(InfoExtractor):
is_live = item.get('type') == 'LIVE'
formats = []
for format_id, stream_url in item.get('streamUrls', {}).items():
+ stream_url = stream_url.replace('https://', 'http://')
if 'playerType=flash' in stream_url:
stream_formats = self._extract_m3u8_formats(
stream_url, playlist_id, 'mp4', 'm3u8_native',
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index 3260399cb..ac9e28560 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -45,6 +45,7 @@ from ..utils import (
determine_ext,
determine_protocol,
dict_get,
+ encode_data_uri,
error_to_compat_str,
extract_attributes,
ExtractorError,
@@ -243,11 +244,16 @@ class InfoExtractor(object):
uploader: Full name of the video uploader.
license: License name the video is licensed under.
creator: The creator of the video.
- release_timestamp: UNIX timestamp of the moment the video was released.
- release_date: The date (YYYYMMDD) when the video was released.
timestamp: UNIX timestamp of the moment the video was uploaded
upload_date: Video upload date (YYYYMMDD).
- If not explicitly set, calculated from timestamp.
+ If not explicitly set, calculated from timestamp
+ release_timestamp: UNIX timestamp of the moment the video was released.
+ If it is not clear whether to use timestamp or this, use the former
+ release_date: The date (YYYYMMDD) when the video was released.
+ If not explicitly set, calculated from release_timestamp
+ modified_timestamp: UNIX timestamp of the moment the video was last modified.
+ modified_date: The date (YYYYMMDD) when the video was last modified.
+ If not explicitly set, calculated from modified_timestamp
uploader_id: Nickname or id of the video uploader.
uploader_url: Full URL to a personal webpage of the video uploader.
channel: Full name of the channel the video is uploaded on.
@@ -255,6 +261,7 @@ class InfoExtractor(object):
fields. This depends on a particular extractor.
channel_id: Id of the channel.
channel_url: Full URL to a channel webpage.
+ channel_follower_count: Number of followers of the channel.
location: Physical location where the video was filmed.
subtitles: The available subtitles as a dictionary in the format
{tag: subformats}. "tag" is usually a language code, and
@@ -370,6 +377,7 @@ class InfoExtractor(object):
disc_number: Number of the disc or other physical medium the track belongs to,
as an integer.
release_year: Year (YYYY) when the album was released.
+ composer: Composer of the piece
Unless mentioned otherwise, the fields should be Unicode strings.
@@ -383,6 +391,11 @@ class InfoExtractor(object):
Additionally, playlists can have "id", "title", and any other relevant
attributes with the same semantics as videos (see above).
+ It can also have the following optional fields:
+
+ playlist_count: The total number of videos in a playlist. If not given,
+ YoutubeDL tries to calculate it from "entries"
+
_type "multi_video" indicates that there are multiple videos that
form a single show, for example, multiple acts of an opera or TV episode.
@@ -1108,39 +1121,39 @@ class InfoExtractor(object):
# Methods for following #608
@staticmethod
- def url_result(url, ie=None, video_id=None, video_title=None, **kwargs):
+ def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
"""Returns a URL that points to a page that should be processed"""
- # TODO: ie should be the class used for getting the info
- video_info = {'_type': 'url',
- 'url': url,
- 'ie_key': ie}
- video_info.update(kwargs)
+ if ie is not None:
+ kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
if video_id is not None:
- video_info['id'] = video_id
+ kwargs['id'] = video_id
if video_title is not None:
- video_info['title'] = video_title
- return video_info
+ kwargs['title'] = video_title
+ return {
+ **kwargs,
+ '_type': 'url_transparent' if url_transparent else 'url',
+ 'url': url,
+ }
- def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
- urls = orderedSet(
- self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
- for m in matches)
- return self.playlist_result(
- urls, playlist_id=playlist_id, playlist_title=playlist_title)
+ def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None, **kwargs):
+ urls = (self.url_result(self._proto_relative_url(m), ie)
+ for m in orderedSet(map(getter, matches) if getter else matches))
+ return self.playlist_result(urls, playlist_id, playlist_title, **kwargs)
@staticmethod
- def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
+ def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
"""Returns a playlist"""
- video_info = {'_type': 'playlist',
- 'entries': entries}
- video_info.update(kwargs)
if playlist_id:
- video_info['id'] = playlist_id
+ kwargs['id'] = playlist_id
if playlist_title:
- video_info['title'] = playlist_title
+ kwargs['title'] = playlist_title
if playlist_description is not None:
- video_info['description'] = playlist_description
- return video_info
+ kwargs['description'] = playlist_description
+ return {
+ **kwargs,
+ '_type': 'multi_video' if multi_video else 'playlist',
+ 'entries': entries,
+ }
def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
"""
@@ -1278,6 +1291,7 @@ class InfoExtractor(object):
return self._og_search_property('description', html, fatal=False, **kargs)
def _og_search_title(self, html, **kargs):
+ kargs.setdefault('fatal', False)
return self._og_search_property('title', html, **kargs)
def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
@@ -1429,6 +1443,23 @@ class InfoExtractor(object):
continue
info[count_key] = interaction_count
+ def extract_chapter_information(e):
+ chapters = [{
+ 'title': part.get('name'),
+ 'start_time': part.get('startOffset'),
+ 'end_time': part.get('endOffset'),
+ } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
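+ # Fill in missing start/end times from the neighbouring chapters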
+ for idx, (last_c, current_c, next_c) in enumerate(zip(
+ [{'end_time': 0}] + chapters, chapters, chapters[1:])):
+ current_c['end_time'] = current_c['end_time'] or next_c['start_time']
+ current_c['start_time'] = current_c['start_time'] or last_c['end_time']
+ if None in current_c.values():
+ self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
+ return
+ if chapters:
+ chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
+ info['chapters'] = chapters
+
def extract_video_object(e):
assert e['@type'] == 'VideoObject'
author = e.get('author')
@@ -1436,7 +1467,8 @@ class InfoExtractor(object):
'url': url_or_none(e.get('contentUrl')),
'title': unescapeHTML(e.get('name')),
'description': unescapeHTML(e.get('description')),
- 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
+ 'thumbnails': [{'url': url_or_none(url)}
+ for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))],
'duration': parse_duration(e.get('duration')),
'timestamp': unified_timestamp(e.get('uploadDate')),
# author can be an instance of 'Organization' or 'Person' types.
@@ -1451,6 +1483,7 @@ class InfoExtractor(object):
'view_count': int_or_none(e.get('interactionCount')),
})
extract_interaction_statistic(e)
+ extract_chapter_information(e)
def traverse_json_ld(json_ld, at_top_level=True):
for e in json_ld:
@@ -1496,6 +1529,8 @@ class InfoExtractor(object):
'title': unescapeHTML(e.get('headline')),
'description': unescapeHTML(e.get('articleBody') or e.get('description')),
})
+ if traverse_obj(e, ('video', 0, '@type')) == 'VideoObject':
+ extract_video_object(e['video'][0])
elif item_type == 'VideoObject':
extract_video_object(e)
if expected_type is None:
@@ -1513,12 +1548,12 @@ class InfoExtractor(object):
return dict((k, v) for k, v in info.items() if v is not None)
- def _search_nextjs_data(self, webpage, video_id, **kw):
+ def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
return self._parse_json(
self._search_regex(
r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
- webpage, 'next.js data', **kw),
- video_id, **kw)
+ webpage, 'next.js data', fatal=fatal, **kw),
+ video_id, transform_source=transform_source, fatal=fatal)
def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__'):
''' Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function. '''
@@ -2076,7 +2111,7 @@ class InfoExtractor(object):
headers=headers, query=query, video_id=video_id)
def _parse_m3u8_formats_and_subtitles(
- self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8_native',
+ self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
preference=None, quality=None, m3u8_id=None, live=False, note=None,
errnote=None, fatal=True, data=None, headers={}, query={},
video_id=None):
@@ -2126,7 +2161,7 @@ class InfoExtractor(object):
formats = [{
'format_id': join_nonempty(m3u8_id, idx),
'format_index': idx,
- 'url': m3u8_url,
+ 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
'ext': ext,
'protocol': entry_protocol,
'preference': preference,
@@ -2712,11 +2747,15 @@ class InfoExtractor(object):
mime_type = representation_attrib['mimeType']
content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
- codecs = representation_attrib.get('codecs', '')
+ codecs = parse_codecs(representation_attrib.get('codecs', ''))
if content_type not in ('video', 'audio', 'text'):
if mime_type == 'image/jpeg':
content_type = mime_type
- elif codecs.split('.')[0] == 'stpp':
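+ # Infer the content type from the parsed codec strings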
+ elif codecs['vcodec'] != 'none':
+ content_type = 'video'
+ elif codecs['acodec'] != 'none':
+ content_type = 'audio'
+ elif codecs.get('tcodec', 'none') != 'none':
content_type = 'text'
elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
content_type = 'text'
@@ -2762,8 +2801,8 @@ class InfoExtractor(object):
'format_note': 'DASH %s' % content_type,
'filesize': filesize,
'container': mimetype2ext(mime_type) + '_dash',
+ **codecs
}
- f.update(parse_codecs(codecs))
elif content_type == 'text':
f = {
'ext': mimetype2ext(mime_type),
@@ -3468,8 +3507,6 @@ class InfoExtractor(object):
def _int(self, v, name, fatal=False, **kwargs):
res = int_or_none(v, **kwargs)
- if 'get_attr' in kwargs:
- print(getattr(v, kwargs['get_attr']))
if res is None:
msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
if fatal:
@@ -3676,6 +3713,22 @@ class InfoExtractor(object):
return [] if default is NO_DEFAULT else default
return list(val) if casesense else [x.lower() for x in val]
+ def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
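+ # Returns True when the full playlist should be downloaded, False to download just the single video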
+ if not playlist_id or not video_id:
+ return not video_id
+
+ no_playlist = (smuggled_data or {}).get('force_noplaylist')
+ if no_playlist is not None:
+ return not no_playlist
+
+ video_id = '' if video_id is True else f' {video_id}'
+ playlist_id = '' if playlist_id is True else f' {playlist_id}'
+ if self.get_param('noplaylist'):
+ self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
+ return False
+ self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
+ return True
+
class SearchInfoExtractor(InfoExtractor):
"""
diff --git a/yt_dlp/extractor/crowdbunker.py b/yt_dlp/extractor/crowdbunker.py
new file mode 100644
index 000000000..72906afef
--- /dev/null
+++ b/yt_dlp/extractor/crowdbunker.py
@@ -0,0 +1,113 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import itertools
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ try_get,
+ unified_strdate,
+)
+
+
+class CrowdBunkerIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?crowdbunker\.com/v/(?P<id>[^/?#$&]+)'
+
+ _TESTS = [{
+ 'url': 'https://crowdbunker.com/v/0z4Kms8pi8I',
+ 'info_dict': {
+ 'id': '0z4Kms8pi8I',
+ 'ext': 'mp4',
+ 'title': '117) Pass vax et solutions',
+ 'description': 'md5:86bcb422c29475dbd2b5dcfa6ec3749c',
+ 'view_count': int,
+ 'duration': 5386,
+ 'uploader': 'Jérémie Mercier',
+ 'uploader_id': 'UCeN_qQV829NYf0pvPJhW5dQ',
+ 'like_count': int,
+ 'upload_date': '20211218',
+ 'thumbnail': 'https://scw.divulg.org/cb-medias4/images/0z4Kms8pi8I/maxres.jpg'
+ },
+ 'params': {'skip_download': True}
+ }]
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ data_json = self._download_json(f'https://api.divulg.org/post/{id}/details',
+ id, headers={'accept': 'application/json, text/plain, */*'})
+ video_json = data_json['video']
+ formats, subtitles = [], {}
+ for sub in video_json.get('captions') or []:
+ sub_url = try_get(sub, lambda x: x['file']['url'])
+ if not sub_url:
+ continue
+ subtitles.setdefault(sub.get('languageCode', 'fr'), []).append({
+ 'url': sub_url,
+ })
+
+ mpd_url = try_get(video_json, lambda x: x['dashManifest']['url'])
+ if mpd_url:
+ fmts, subs = self._extract_mpd_formats_and_subtitles(mpd_url, id)
+ formats.extend(fmts)
+ subtitles = self._merge_subtitles(subtitles, subs)
+ m3u8_url = try_get(video_json, lambda x: x['hlsManifest']['url'])
+ if m3u8_url:
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, id)
+ formats.extend(fmts)
+ subtitles = self._merge_subtitles(subtitles, subs)
+
+ thumbnails = [{
+ 'url': image['url'],
+ 'height': int_or_none(image.get('height')),
+ 'width': int_or_none(image.get('width')),
+ } for image in video_json.get('thumbnails') or [] if image.get('url')]
+
+ self._sort_formats(formats)
+ return {
+ 'id': id,
+ 'title': video_json.get('title'),
+ 'description': video_json.get('description'),
+ 'view_count': video_json.get('viewCount'),
+ 'duration': video_json.get('duration'),
+ 'uploader': try_get(data_json, lambda x: x['channel']['name']),
+ 'uploader_id': try_get(data_json, lambda x: x['channel']['id']),
+ 'like_count': data_json.get('likesCount'),
+ 'upload_date': unified_strdate(video_json.get('publishedAt') or video_json.get('createdAt')),
+ 'thumbnails': thumbnails,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
+
+
+class CrowdBunkerChannelIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?crowdbunker\.com/@(?P<id>[^/?#$&]+)'
+
+ _TESTS = [{
+ 'url': 'https://crowdbunker.com/@Milan_UHRIN',
+ 'playlist_mincount': 14,
+ 'info_dict': {
+ 'id': 'Milan_UHRIN',
+ },
+ }]
+
+ def _entries(self, id):
+ last = None
+
+ for page in itertools.count():
+ channel_json = self._download_json(
+ f'https://api.divulg.org/organization/{id}/posts', id, headers={'accept': 'application/json, text/plain, */*'},
+ query={'after': last} if last else {}, note=f'Downloading Page {page}')
+ for item in channel_json.get('items') or []:
+ v_id = item.get('uid')
+ if not v_id:
+ continue
+ yield self.url_result(
+ 'https://crowdbunker.com/v/%s' % v_id, ie=CrowdBunkerIE.ie_key(), video_id=v_id)
+ last = channel_json.get('last')
+ if not last:
+ break
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ return self.playlist_result(self._entries(id), playlist_id=id)
diff --git a/yt_dlp/extractor/crunchyroll.py b/yt_dlp/extractor/crunchyroll.py
index cd35728e5..ffe291098 100644
--- a/yt_dlp/extractor/crunchyroll.py
+++ b/yt_dlp/extractor/crunchyroll.py
@@ -1,6 +1,7 @@
# coding: utf-8
from __future__ import unicode_literals
+import base64
import re
import json
import zlib
@@ -23,15 +24,17 @@ from ..utils import (
bytes_to_intlist,
extract_attributes,
float_or_none,
+ format_field,
intlist_to_bytes,
int_or_none,
+ join_nonempty,
lowercase_escape,
merge_dicts,
qualities,
remove_end,
sanitized_Request,
+ traverse_obj,
try_get,
- urlencode_postdata,
xpath_text,
)
from ..aes import (
@@ -40,8 +43,8 @@ from ..aes import (
class CrunchyrollBaseIE(InfoExtractor):
- _LOGIN_URL = 'https://www.crunchyroll.com/login'
- _LOGIN_FORM = 'login_form'
+ _LOGIN_URL = 'https://www.crunchyroll.com/welcome/login'
+ _API_BASE = 'https://api.crunchyroll.com'
_NETRC_MACHINE = 'crunchyroll'
def _call_rpc_api(self, method, video_id, note=None, data=None):
@@ -58,50 +61,33 @@ class CrunchyrollBaseIE(InfoExtractor):
username, password = self._get_login_info()
if username is None:
return
-
- login_page = self._download_webpage(
- self._LOGIN_URL, None, 'Downloading login page')
-
- def is_logged(webpage):
- return 'href="/logout"' in webpage
-
- # Already logged in
- if is_logged(login_page):
+ if self._get_cookies(self._LOGIN_URL).get('etp_rt'):
return
- login_form_str = self._search_regex(
- r'(?P<form><form[^>]+?id=(["\'])%s\2[^>]*>)' % self._LOGIN_FORM,
- login_page, 'login form', group='form')
-
- post_url = extract_attributes(login_form_str).get('action')
- if not post_url:
- post_url = self._LOGIN_URL
- elif not post_url.startswith('http'):
- post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url)
-
- login_form = self._form_hidden_inputs(self._LOGIN_FORM, login_page)
-
- login_form.update({
- 'login_form[name]': username,
- 'login_form[password]': password,
- })
-
- response = self._download_webpage(
- post_url, None, 'Logging in', 'Wrong login info',
- data=urlencode_postdata(login_form),
- headers={'Content-Type': 'application/x-www-form-urlencoded'})
-
- # Successful login
- if is_logged(response):
- return
-
- error = self._html_search_regex(
- '(?s)<ul[^>]+class=["\']messages["\'][^>]*>(.+?)</ul>',
- response, 'error message', default=None)
- if error:
- raise ExtractorError('Unable to login: %s' % error, expected=True)
-
- raise ExtractorError('Unable to log in')
+ upsell_response = self._download_json(
+ f'{self._API_BASE}/get_upsell_data.0.json', None, 'Getting session id',
+ query={
+ 'sess_id': 1,
+ 'device_id': 'whatvalueshouldbeforweb',
+ 'device_type': 'com.crunchyroll.static',
+ 'access_token': 'giKq5eY27ny3cqz',
+ 'referer': self._LOGIN_URL
+ })
+ if upsell_response['code'] != 'ok':
+ raise ExtractorError('Could not get session id')
+ session_id = upsell_response['data']['session_id']
+
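+ # A successful login sets the etp_rt cookie, which the rest of the extractor relies on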
+ login_response = self._download_json(
+ f'{self._API_BASE}/login.1.json', None, 'Logging in',
+ data=compat_urllib_parse_urlencode({
+ 'account': username,
+ 'password': password,
+ 'session_id': session_id
+ }).encode('ascii'))
+ if login_response['code'] != 'ok':
+ raise ExtractorError('Login failed. Bad username or password?', expected=True)
+ if not self._get_cookies(self._LOGIN_URL).get('etp_rt'):
+ raise ExtractorError('Login succeeded but did not set etp_rt cookie')
def _real_initialize(self):
self._login()
@@ -733,13 +719,118 @@ class CrunchyrollBetaIE(CrunchyrollBaseIE):
def _real_extract(self, url):
lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'internal_id', 'id')
webpage = self._download_webpage(url, display_id)
- episode_data = self._parse_json(
- self._search_regex(r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'episode data'),
- display_id)['content']['byId'][internal_id]
- video_id = episode_data['external_id'].split('.')[1]
- series_id = episode_data['episode_metadata']['series_slug_title']
- return self.url_result(f'https://www.crunchyroll.com/{lang}{series_id}/{display_id}-{video_id}',
- CrunchyrollIE.ie_key(), video_id)
+ initial_state = self._parse_json(
+ self._search_regex(r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'initial state'),
+ display_id)
+ episode_data = initial_state['content']['byId'][internal_id]
+ if not self._get_cookies(url).get('etp_rt'):
+ video_id = episode_data['external_id'].split('.')[1]
+ series_id = episode_data['episode_metadata']['series_slug_title']
+ return self.url_result(f'https://www.crunchyroll.com/{lang}{series_id}/{display_id}-{video_id}',
+ CrunchyrollIE.ie_key(), video_id)
+
+ app_config = self._parse_json(
+ self._search_regex(r'__APP_CONFIG__\s*=\s*({.+?})\s*;', webpage, 'app config'),
+ display_id)
+ client_id = app_config['cxApiParams']['accountAuthClientId']
+ api_domain = app_config['cxApiParams']['apiDomain']
+ basic_token = str(base64.b64encode(('%s:' % client_id).encode('ascii')), 'ascii')
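+ # Exchange the etp_rt cookie for a Bearer token (client id with empty secret as HTTP Basic auth)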
+ auth_response = self._download_json(
+ f'{api_domain}/auth/v1/token', display_id,
+ note='Authenticating with cookie',
+ headers={
+ 'Authorization': 'Basic ' + basic_token
+ }, data='grant_type=etp_rt_cookie'.encode('ascii'))
+ policy_response = self._download_json(
+ f'{api_domain}/index/v2', display_id,
+ note='Retrieving signed policy',
+ headers={
+ 'Authorization': auth_response['token_type'] + ' ' + auth_response['access_token']
+ })
+ bucket = policy_response['cms']['bucket']
+ params = {
+ 'Policy': policy_response['cms']['policy'],
+ 'Signature': policy_response['cms']['signature'],
+ 'Key-Pair-Id': policy_response['cms']['key_pair_id']
+ }
+ locale = traverse_obj(initial_state, ('localization', 'locale'))
+ if locale:
+ params['locale'] = locale
+ episode_response = self._download_json(
+ f'{api_domain}/cms/v2{bucket}/episodes/{internal_id}', display_id,
+ note='Retrieving episode metadata',
+ query=params)
+ if episode_response.get('is_premium_only') and not episode_response.get('playback'):
+ raise ExtractorError('This video is for premium members only.', expected=True)
+ stream_response = self._download_json(
+ episode_response['playback'], display_id,
+ note='Retrieving stream info')
+
+ thumbnails = []
+ for thumbnails_data in traverse_obj(episode_response, ('images', 'thumbnail')):
+ for thumbnail_data in thumbnails_data:
+ thumbnails.append({
+ 'url': thumbnail_data.get('source'),
+ 'width': thumbnail_data.get('width'),
+ 'height': thumbnail_data.get('height'),
+ })
+ subtitles = {}
+ for lang, subtitle_data in stream_response.get('subtitles', {}).items():
+ subtitles[lang] = [{
+ 'url': subtitle_data.get('url'),
+ 'ext': subtitle_data.get('format')
+ }]
+
+ requested_hardsubs = [('' if val == 'none' else val) for val in (self._configuration_arg('hardsub') or ['none'])]
+ hardsub_preference = qualities(requested_hardsubs[::-1])
+ requested_formats = self._configuration_arg('format') or ['adaptive_hls']
+
+ formats = []
+ for stream_type, streams in stream_response.get('streams', {}).items():
+ if stream_type not in requested_formats:
+ continue
+ for stream in streams.values():
+ hardsub_lang = stream.get('hardsub_locale') or ''
+ if hardsub_lang.lower() not in requested_hardsubs:
+ continue
+ format_id = join_nonempty(
+ stream_type,
+ format_field(stream, 'hardsub_locale', 'hardsub-%s'))
+ if not stream.get('url'):
+ continue
+ if stream_type.split('_')[-1] == 'hls':
+ adaptive_formats = self._extract_m3u8_formats(
+ stream['url'], display_id, 'mp4', m3u8_id=format_id,
+ note='Downloading %s information' % format_id,
+ fatal=False)
+ elif stream_type.split('_')[-1] == 'dash':
+ adaptive_formats = self._extract_mpd_formats(
+ stream['url'], display_id, mpd_id=format_id,
+ note='Downloading %s information' % format_id,
+ fatal=False)
+ for f in adaptive_formats:
+ if f.get('acodec') != 'none':
+ f['language'] = stream_response.get('audio_locale')
+ f['quality'] = hardsub_preference(hardsub_lang.lower())
+ formats.extend(adaptive_formats)
+ self._sort_formats(formats)
+
+ return {
+ 'id': internal_id,
+ 'title': '%s Episode %s – %s' % (episode_response.get('season_title'), episode_response.get('episode'), episode_response.get('title')),
+ 'description': (episode_response.get('description') or '').replace(r'\r\n', '\n'),
+ 'duration': float_or_none(episode_response.get('duration_ms'), 1000),
+ 'thumbnails': thumbnails,
+ 'series': episode_response.get('series_title'),
+ 'series_id': episode_response.get('series_id'),
+ 'season': episode_response.get('season_title'),
+ 'season_id': episode_response.get('season_id'),
+ 'season_number': episode_response.get('season_number'),
+ 'episode': episode_response.get('title'),
+ 'episode_number': episode_response.get('sequence_number'),
+ 'subtitles': subtitles,
+ 'formats': formats
+ }
class CrunchyrollBetaShowIE(CrunchyrollBaseIE):
diff --git a/yt_dlp/extractor/ctvnews.py b/yt_dlp/extractor/ctvnews.py
index 03f8cefb7..952f4c747 100644
--- a/yt_dlp/extractor/ctvnews.py
+++ b/yt_dlp/extractor/ctvnews.py
@@ -65,4 +65,9 @@ class CTVNewsIE(InfoExtractor):
})
entries = [ninecninemedia_url_result(clip_id) for clip_id in orderedSet(
re.findall(r'clip\.id\s*=\s*(\d+);', webpage))]
+ if not entries:
+ webpage = self._download_webpage(url, page_id)
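+ # Fall back to the comma-separated clip ids exposed via getAuthStates(...)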
+ if 'getAuthStates("' in webpage:
+ entries = [ninecninemedia_url_result(clip_id) for clip_id in
+ self._search_regex(r'getAuthStates\("([\d+,]+)"', webpage, 'clip ids').split(',')]
return self.playlist_result(entries, page_id)
diff --git a/yt_dlp/extractor/daftsex.py b/yt_dlp/extractor/daftsex.py
new file mode 100644
index 000000000..03672b35d
--- /dev/null
+++ b/yt_dlp/extractor/daftsex.py
@@ -0,0 +1,79 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_b64decode
+from ..utils import (
+ get_elements_by_class,
+ int_or_none,
+ js_to_json,
+ parse_count,
+ parse_duration,
+ try_get,
+)
+
+
+class DaftsexIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?daftsex\.com/watch/(?P<id>-?\d+_\d+)'
+ _TESTS = [{
+ 'url': 'https://daftsex.com/watch/-156601359_456242791',
+ 'info_dict': {
+ 'id': '-156601359_456242791',
+ 'ext': 'mp4',
+ 'title': 'Skye Blue - Dinner And A Show',
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ title = get_elements_by_class('heading', webpage)[-1]
+ duration = parse_duration(self._search_regex(
+ r'Duration: ((?:[0-9]{2}:){0,2}[0-9]{2})',
+ webpage, 'duration', fatal=False))
+ views = parse_count(self._search_regex(
+ r'Views: ([0-9 ]+)',
+ webpage, 'views', fatal=False))
+
+ player_hash = self._search_regex(
+ r'DaxabPlayer\.Init\({[\s\S]*hash:\s*"([0-9a-zA-Z_\-]+)"[\s\S]*}',
+ webpage, 'player hash')
+ player_color = self._search_regex(
+ r'DaxabPlayer\.Init\({[\s\S]*color:\s*"([0-9a-z]+)"[\s\S]*}',
+ webpage, 'player color', fatal=False) or ''
+
+ embed_page = self._download_webpage(
+ 'https://daxab.com/player/%s?color=%s' % (player_hash, player_color),
+ video_id, headers={'Referer': url})
+ video_params = self._parse_json(
+ self._search_regex(
+ r'window\.globParams\s*=\s*({[\S\s]+})\s*;\s*<\/script>',
+ embed_page, 'video parameters'),
+ video_id, transform_source=js_to_json)
+
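+ # The CDN hostname is delivered as a reversed base64 string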
+ server_domain = 'https://%s' % compat_b64decode(video_params['server'][::-1]).decode('utf-8')
+ formats = []
+ for format_id, format_data in video_params['video']['cdn_files'].items():
+ ext, height = format_id.split('_')
+ extra_quality_data = format_data.split('.')[-1]
+ url = f'{server_domain}/videos/{video_id.replace("_", "/")}/{height}.mp4?extra={extra_quality_data}'
+ formats.append({
+ 'format_id': format_id,
+ 'url': url,
+ 'height': int_or_none(height),
+ 'ext': ext,
+ })
+ self._sort_formats(formats)
+
+ thumbnail = try_get(video_params,
+ lambda vi: 'https:' + compat_b64decode(vi['video']['thumb']).decode('utf-8'))
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'duration': duration,
+ 'thumbnail': thumbnail,
+ 'view_count': views,
+ 'age_limit': 18,
+ }
diff --git a/yt_dlp/extractor/dailymotion.py b/yt_dlp/extractor/dailymotion.py
index b4211e1e4..e71462061 100644
--- a/yt_dlp/extractor/dailymotion.py
+++ b/yt_dlp/extractor/dailymotion.py
@@ -207,12 +207,10 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
video_id, playlist_id = self._match_valid_url(url).groups()
if playlist_id:
- if not self.get_param('noplaylist'):
- self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % playlist_id)
+ if self._yes_playlist(playlist_id, video_id):
return self.url_result(
'http://www.dailymotion.com/playlist/' + playlist_id,
'DailymotionPlaylist', playlist_id)
- self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
password = self.get_param('videopassword')
media = self._call_api(
diff --git a/yt_dlp/extractor/daum.py b/yt_dlp/extractor/daum.py
index 8aa2af9a8..4362e92cb 100644
--- a/yt_dlp/extractor/daum.py
+++ b/yt_dlp/extractor/daum.py
@@ -157,11 +157,8 @@ class DaumListIE(InfoExtractor):
query_dict = parse_qs(url)
if 'clipid' in query_dict:
clip_id = query_dict['clipid'][0]
- if self.get_param('noplaylist'):
- self.to_screen('Downloading just video %s because of --no-playlist' % clip_id)
+ if not self._yes_playlist(list_id, clip_id):
return self.url_result(DaumClipIE._URL_TEMPLATE % clip_id, 'DaumClip')
- else:
- self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % list_id)
class DaumPlaylistIE(DaumListIE):
diff --git a/yt_dlp/extractor/digitalconcerthall.py b/yt_dlp/extractor/digitalconcerthall.py
new file mode 100644
index 000000000..9b302a9a0
--- /dev/null
+++ b/yt_dlp/extractor/digitalconcerthall.py
@@ -0,0 +1,143 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+from ..utils import (
+ ExtractorError,
+ parse_resolution,
+ traverse_obj,
+ try_get,
+ urlencode_postdata,
+)
+
+
+class DigitalConcertHallIE(InfoExtractor):
+ IE_DESC = 'DigitalConcertHall extractor'
+ _VALID_URL = r'https?://(?:www\.)?digitalconcerthall\.com/(?P<language>[a-z]+)/concert/(?P<id>[0-9]+)'
+ _OAUTH_URL = 'https://api.digitalconcerthall.com/v2/oauth2/token'
+ _ACCESS_TOKEN = None
+ _NETRC_MACHINE = 'digitalconcerthall'
+ _TESTS = [{
+ 'note': 'Playlist with only one video',
+ 'url': 'https://www.digitalconcerthall.com/en/concert/53201',
+ 'info_dict': {
+ 'id': '53201-1',
+ 'ext': 'mp4',
+ 'composer': 'Kurt Weill',
+ 'title': '[Magic Night]',
+ 'thumbnail': r're:^https?://images.digitalconcerthall.com/cms/thumbnails.*\.jpg$',
+ 'upload_date': '20210624',
+ 'timestamp': 1624548600,
+ 'duration': 2798,
+ 'album_artist': 'Members of the Berliner Philharmoniker / Simon Rössler',
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }, {
+ 'note': 'Concert with several works and an interview',
+ 'url': 'https://www.digitalconcerthall.com/en/concert/53785',
+ 'info_dict': {
+ 'id': '53785',
+ 'album_artist': 'Berliner Philharmoniker / Kirill Petrenko',
+ 'title': 'Kirill Petrenko conducts Mendelssohn and Shostakovich',
+ },
+ 'params': {'skip_download': 'm3u8'},
+ 'playlist_count': 3,
+ }]
+
+ def _login(self):
+ username, password = self._get_login_info()
+ if not username:
+ self.raise_login_required()
+ token_response = self._download_json(
+ self._OAUTH_URL,
+ None, 'Obtaining token', errnote='Unable to obtain token', data=urlencode_postdata({
+ 'affiliate': 'none',
+ 'grant_type': 'device',
+ 'device_vendor': 'unknown',
+ 'app_id': 'dch.webapp',
+ 'app_version': '1.0.0',
+ 'client_secret': '2ySLN+2Fwb',
+ }), headers={
+ 'Content-Type': 'application/x-www-form-urlencoded',
+ })
+ self._ACCESS_TOKEN = token_response['access_token']
+ try:
+ self._download_json(
+ self._OAUTH_URL,
+ None, note='Logging in', errnote='Unable to log in', data=urlencode_postdata({
+ 'grant_type': 'password',
+ 'username': username,
+ 'password': password,
+ }), headers={
+ 'Content-Type': 'application/x-www-form-urlencoded',
+ 'Referer': 'https://www.digitalconcerthall.com',
+ 'Authorization': f'Bearer {self._ACCESS_TOKEN}'
+ })
+ except ExtractorError:
+ self.raise_login_required(msg='Login info incorrect')
+
+ def _real_initialize(self):
+ self._login()
+
+ def _entries(self, items, language, **kwargs):
+ for item in items:
+ video_id = item['id']
+ stream_info = self._download_json(
+ self._proto_relative_url(item['_links']['streams']['href']), video_id, headers={
+ 'Accept': 'application/json',
+ 'Authorization': f'Bearer {self._ACCESS_TOKEN}',
+ 'Accept-Language': language
+ })
+
+ m3u8_url = traverse_obj(
+ stream_info, ('channel', lambda x: x.startswith('vod_mixed'), 'stream', 0, 'url'), get_all=False)
+ formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', 'm3u8_native', fatal=False)
+ self._sort_formats(formats)
+
+ yield {
+ 'id': video_id,
+ 'title': item.get('title'),
+ 'composer': item.get('name_composer'),
+ 'url': m3u8_url,
+ 'formats': formats,
+ 'duration': item.get('duration_total'),
+ 'timestamp': traverse_obj(item, ('date', 'published')),
+ 'description': item.get('short_description') or stream_info.get('short_description'),
+ **kwargs,
+ 'chapters': [{
+ 'start_time': chapter.get('time'),
+ 'end_time': try_get(chapter, lambda x: x['time'] + x['duration']),
+ 'title': chapter.get('text'),
+ } for chapter in item['cuepoints']] if item.get('cuepoints') else None,
+ }
+
+ def _real_extract(self, url):
+ language, video_id = self._match_valid_url(url).group('language', 'id')
+ if not language:
+ language = 'en'
+
+ thumbnail_url = self._html_search_regex(
+ r'(https?://images\.digitalconcerthall\.com/cms/thumbnails/.*\.jpg)',
+ self._download_webpage(url, video_id), 'thumbnail')
+ thumbnails = [{
+ 'url': thumbnail_url,
+ **parse_resolution(thumbnail_url)
+ }]
+
+ vid_info = self._download_json(
+ f'https://api.digitalconcerthall.com/v2/concert/{video_id}', video_id, headers={
+ 'Accept': 'application/json',
+ 'Accept-Language': language
+ })
+ album_artist = ' / '.join(traverse_obj(vid_info, ('_links', 'artist', ..., 'name')) or '')
+
+ return {
+ '_type': 'playlist',
+ 'id': video_id,
+ 'title': vid_info.get('title'),
+ 'entries': self._entries(traverse_obj(vid_info, ('_embedded', ..., ...)), language,
+ thumbnails=thumbnails, album_artist=album_artist),
+ 'thumbnails': thumbnails,
+ 'album_artist': album_artist,
+ }
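The login above is a two-step OAuth exchange: a device grant first yields a bearer token, and the username/password grant is then sent under that token. A minimal sketch with requests (endpoint, field names and client secret as in the extractor above; the credentials are placeholders):

import requests

OAUTH_URL = 'https://api.digitalconcerthall.com/v2/oauth2/token'

def login(username, password):
    # step 1: device grant, no user credentials involved yet
    token = requests.post(OAUTH_URL, data={
        'affiliate': 'none', 'grant_type': 'device', 'device_vendor': 'unknown',
        'app_id': 'dch.webapp', 'app_version': '1.0.0', 'client_secret': '2ySLN+2Fwb',
    }).json()['access_token']
    # step 2: password grant, authorized with the device token
    requests.post(OAUTH_URL, data={
        'grant_type': 'password', 'username': username, 'password': password,
    }, headers={
        'Authorization': 'Bearer %s' % token,
        'Referer': 'https://www.digitalconcerthall.com',
    }).raise_for_status()
    return token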
diff --git a/yt_dlp/extractor/dispeak.py b/yt_dlp/extractor/dispeak.py
index be7ad1202..3d651f3ab 100644
--- a/yt_dlp/extractor/dispeak.py
+++ b/yt_dlp/extractor/dispeak.py
@@ -74,13 +74,11 @@ class DigitallySpeakingIE(InfoExtractor):
tbr = int_or_none(bitrate)
vbr = int_or_none(self._search_regex(
r'-(\d+)\.mp4', video_path, 'vbr', default=None))
- abr = tbr - vbr if tbr and vbr else None
video_formats.append({
'format_id': bitrate,
'url': url,
'tbr': tbr,
'vbr': vbr,
- 'abr': abr,
})
return video_formats
@@ -121,6 +119,7 @@ class DigitallySpeakingIE(InfoExtractor):
video_formats = self._parse_mp4(metadata)
if video_formats is None:
video_formats = self._parse_flv(metadata)
+ self._sort_formats(video_formats)
return {
'id': video_id,
diff --git a/yt_dlp/extractor/doodstream.py b/yt_dlp/extractor/doodstream.py
index 2c9ea6898..f692127c2 100644
--- a/yt_dlp/extractor/doodstream.py
+++ b/yt_dlp/extractor/doodstream.py
@@ -21,6 +21,16 @@ class DoodStreamIE(InfoExtractor):
'thumbnail': 'https://img.doodcdn.com/snaps/flyus84qgl2fsk4g.jpg',
}
}, {
+ 'url': 'http://dood.watch/d/5s1wmbdacezb',
+ 'md5': '4568b83b31e13242b3f1ff96c55f0595',
+ 'info_dict': {
+ 'id': '5s1wmbdacezb',
+ 'ext': 'mp4',
+ 'title': 'Kat Wonders - Monthly May 2020',
+ 'description': 'Kat Wonders - Monthly May 2020 | DoodStream.com',
+ 'thumbnail': 'https://img.doodcdn.com/snaps/flyus84qgl2fsk4g.jpg',
+ }
+ }, {
'url': 'https://dood.to/d/jzrxn12t2s7n',
'md5': '3207e199426eca7c2aa23c2872e6728a',
'info_dict': {
@@ -34,31 +44,26 @@ class DoodStreamIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
+ url = f'https://dood.to/e/{video_id}'
webpage = self._download_webpage(url, video_id)
- if '/d/' in url:
- url = "https://dood.to" + self._html_search_regex(
- r'<iframe src="(/e/[a-z0-9]+)"', webpage, 'embed')
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
-
- title = self._html_search_meta(['og:title', 'twitter:title'],
- webpage, default=None)
- thumb = self._html_search_meta(['og:image', 'twitter:image'],
- webpage, default=None)
+ title = self._html_search_meta(['og:title', 'twitter:title'], webpage, default=None)
+ thumb = self._html_search_meta(['og:image', 'twitter:image'], webpage, default=None)
token = self._html_search_regex(r'[?&]token=([a-z0-9]+)[&\']', webpage, 'token')
description = self._html_search_meta(
- ['og:description', 'description', 'twitter:description'],
- webpage, default=None)
- auth_url = 'https://dood.to' + self._html_search_regex(
- r'(/pass_md5.*?)\'', webpage, 'pass_md5')
+ ['og:description', 'description', 'twitter:description'], webpage, default=None)
+
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/66.0',
'referer': url
}
- webpage = self._download_webpage(auth_url, video_id, headers=headers)
- final_url = webpage + ''.join([random.choice(string.ascii_letters + string.digits) for _ in range(10)]) + "?token=" + token + "&expiry=" + str(int(time.time() * 1000))
+ pass_md5 = self._html_search_regex(r'(/pass_md5.*?)\'', webpage, 'pass_md5')
+ final_url = ''.join((
+ self._download_webpage(f'https://dood.to{pass_md5}', video_id, headers=headers),
+ *(random.choice(string.ascii_letters + string.digits) for _ in range(10)),
+ f'?token={token}&expiry={int(time.time() * 1000)}',
+ ))
return {
'id': video_id,
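The rewritten flow always loads the /e/ embed page, then builds the stream URL from the /pass_md5 response body plus a random suffix and a millisecond expiry. A standalone sketch of that final step (base and token are the values scraped above):

import random
import string
import time

def build_final_url(base, token):
    # append 10 random alphanumerics and an expiry timestamp, as above
    suffix = ''.join(random.choices(string.ascii_letters + string.digits, k=10))
    return f'{base}{suffix}?token={token}&expiry={int(time.time() * 1000)}'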
diff --git a/yt_dlp/extractor/dplay.py b/yt_dlp/extractor/dplay.py
index e1f5e9dc8..a25f27c3a 100644
--- a/yt_dlp/extractor/dplay.py
+++ b/yt_dlp/extractor/dplay.py
@@ -347,33 +347,7 @@ class HGTVDeIE(DPlayBaseIE):
url, display_id, 'eu1-prod.disco-api.com', 'hgtv', 'de')
-class DiscoveryPlusIE(DPlayBaseIE):
- _VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/(?!it/)(?:\w{2}/)?video' + DPlayBaseIE._PATH_REGEX
- _TESTS = [{
- 'url': 'https://www.discoveryplus.com/video/property-brothers-forever-home/food-and-family',
- 'info_dict': {
- 'id': '1140794',
- 'display_id': 'property-brothers-forever-home/food-and-family',
- 'ext': 'mp4',
- 'title': 'Food and Family',
- 'description': 'The brothers help a Richmond family expand their single-level home.',
- 'duration': 2583.113,
- 'timestamp': 1609304400,
- 'upload_date': '20201230',
- 'creator': 'HGTV',
- 'series': 'Property Brothers: Forever Home',
- 'season_number': 1,
- 'episode_number': 1,
- },
- 'skip': 'Available for Premium users',
- }, {
- 'url': 'https://discoveryplus.com/ca/video/bering-sea-gold-discovery-ca/goldslingers',
- 'only_matching': True,
- }]
-
- _PRODUCT = 'dplus_us'
- _API_URL = 'us1-prod-direct.discoveryplus.com'
-
+class DiscoveryPlusBaseIE(DPlayBaseIE):
def _update_disco_api_headers(self, headers, disco_base, display_id, realm):
headers['x-disco-client'] = f'WEB:UNKNOWN:{self._PRODUCT}:25.2.6'
@@ -392,12 +366,226 @@ class DiscoveryPlusIE(DPlayBaseIE):
}).encode('utf-8'))['data']['attributes']['streaming']
def _real_extract(self, url):
- display_id = self._match_id(url)
- return self._get_disco_api_info(
- url, display_id, self._API_URL, 'go', 'us')
+ return self._get_disco_api_info(url, self._match_id(url), **self._DISCO_API_PARAMS)
+
+
+class GoDiscoveryIE(DiscoveryPlusBaseIE):
+ _VALID_URL = r'https?://(?:go\.)?discovery\.com/video' + DPlayBaseIE._PATH_REGEX
+ _TESTS = [{
+ 'url': 'https://go.discovery.com/video/dirty-jobs-discovery-atve-us/rodbuster-galvanizer',
+ 'info_dict': {
+ 'id': '4164906',
+ 'display_id': 'dirty-jobs-discovery-atve-us/rodbuster-galvanizer',
+ 'ext': 'mp4',
+ 'title': 'Rodbuster / Galvanizer',
+ 'description': 'Mike installs rebar with a team of rodbusters, then he galvanizes steel.',
+ 'season_number': 9,
+ 'episode_number': 1,
+ },
+ 'skip': 'Available for Premium users',
+ }, {
+ 'url': 'https://discovery.com/video/dirty-jobs-discovery-atve-us/rodbuster-galvanizer',
+ 'only_matching': True,
+ }]
+
+ _PRODUCT = 'dsc'
+ _DISCO_API_PARAMS = {
+ 'disco_host': 'us1-prod-direct.go.discovery.com',
+ 'realm': 'go',
+ 'country': 'us',
+ }
+
+
+class TravelChannelIE(DiscoveryPlusBaseIE):
+ _VALID_URL = r'https?://(?:watch\.)?travelchannel\.com/video' + DPlayBaseIE._PATH_REGEX
+ _TESTS = [{
+ 'url': 'https://watch.travelchannel.com/video/ghost-adventures-travel-channel/ghost-train-of-ely',
+ 'info_dict': {
+ 'id': '2220256',
+ 'display_id': 'ghost-adventures-travel-channel/ghost-train-of-ely',
+ 'ext': 'mp4',
+ 'title': 'Ghost Train of Ely',
+ 'description': 'The crew investigates the dark history of the Nevada Northern Railway.',
+ 'season_number': 24,
+ 'episode_number': 1,
+ },
+ 'skip': 'Available for Premium users',
+ }, {
+ 'url': 'https://watch.travelchannel.com/video/ghost-adventures-travel-channel/ghost-train-of-ely',
+ 'only_matching': True,
+ }]
+
+ _PRODUCT = 'trav'
+ _DISCO_API_PARAMS = {
+ 'disco_host': 'us1-prod-direct.watch.travelchannel.com',
+ 'realm': 'go',
+ 'country': 'us',
+ }
+
+
+class CookingChannelIE(DiscoveryPlusBaseIE):
+ _VALID_URL = r'https?://(?:watch\.)?cookingchanneltv\.com/video' + DPlayBaseIE._PATH_REGEX
+ _TESTS = [{
+ 'url': 'https://watch.cookingchanneltv.com/video/carnival-eats-cooking-channel/the-postman-always-brings-rice-2348634',
+ 'info_dict': {
+ 'id': '2348634',
+ 'display_id': 'carnival-eats-cooking-channel/the-postman-always-brings-rice-2348634',
+ 'ext': 'mp4',
+ 'title': 'The Postman Always Brings Rice',
+ 'description': 'Noah visits the Maui Fair and the Aurora Winter Festival in Vancouver.',
+ 'season_number': 9,
+ 'episode_number': 1,
+ },
+ 'skip': 'Available for Premium users',
+ }, {
+ 'url': 'https://watch.cookingchanneltv.com/video/carnival-eats-cooking-channel/the-postman-always-brings-rice-2348634',
+ 'only_matching': True,
+ }]
+
+ _PRODUCT = 'cook'
+ _DISCO_API_PARAMS = {
+ 'disco_host': 'us1-prod-direct.watch.cookingchanneltv.com',
+ 'realm': 'go',
+ 'country': 'us',
+ }
+
+
+class HGTVUsaIE(DiscoveryPlusBaseIE):
+ _VALID_URL = r'https?://(?:watch\.)?hgtv\.com/video' + DPlayBaseIE._PATH_REGEX
+ _TESTS = [{
+ 'url': 'https://watch.hgtv.com/video/home-inspector-joe-hgtv-atve-us/this-mold-house',
+ 'info_dict': {
+ 'id': '4289736',
+ 'display_id': 'home-inspector-joe-hgtv-atve-us/this-mold-house',
+ 'ext': 'mp4',
+ 'title': 'This Mold House',
+ 'description': 'Joe and Noel help take a familys dream home from hazardous to fabulous.',
+ 'season_number': 1,
+ 'episode_number': 1,
+ },
+ 'skip': 'Available for Premium users',
+ }, {
+ 'url': 'https://watch.hgtv.com/video/home-inspector-joe-hgtv-atve-us/this-mold-house',
+ 'only_matching': True,
+ }]
+
+ _PRODUCT = 'hgtv'
+ _DISCO_API_PARAMS = {
+ 'disco_host': 'us1-prod-direct.watch.hgtv.com',
+ 'realm': 'go',
+ 'country': 'us',
+ }
+
+
+class FoodNetworkIE(DiscoveryPlusBaseIE):
+ _VALID_URL = r'https?://(?:watch\.)?foodnetwork\.com/video' + DPlayBaseIE._PATH_REGEX
+ _TESTS = [{
+ 'url': 'https://watch.foodnetwork.com/video/kids-baking-championship-food-network/float-like-a-butterfly',
+ 'info_dict': {
+ 'id': '4116449',
+ 'display_id': 'kids-baking-championship-food-network/float-like-a-butterfly',
+ 'ext': 'mp4',
+ 'title': 'Float Like a Butterfly',
+ 'description': 'The 12 kid bakers create colorful carved butterfly cakes.',
+ 'season_number': 10,
+ 'episode_number': 1,
+ },
+ 'skip': 'Available for Premium users',
+ }, {
+ 'url': 'https://watch.foodnetwork.com/video/kids-baking-championship-food-network/float-like-a-butterfly',
+ 'only_matching': True,
+ }]
+
+ _PRODUCT = 'food'
+ _DISCO_API_PARAMS = {
+ 'disco_host': 'us1-prod-direct.watch.foodnetwork.com',
+ 'realm': 'go',
+ 'country': 'us',
+ }
+
+
+class DestinationAmericaIE(DiscoveryPlusBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?destinationamerica\.com/video' + DPlayBaseIE._PATH_REGEX
+ _TESTS = [{
+ 'url': 'https://www.destinationamerica.com/video/alaska-monsters-destination-america-atve-us/central-alaskas-bigfoot',
+ 'info_dict': {
+ 'id': '4210904',
+ 'display_id': 'alaska-monsters-destination-america-atve-us/central-alaskas-bigfoot',
+ 'ext': 'mp4',
+ 'title': 'Central Alaskas Bigfoot',
+ 'description': 'A team heads to central Alaska to investigate an aggressive Bigfoot.',
+ 'season_number': 1,
+ 'episode_number': 1,
+ },
+ 'skip': 'Available for Premium users',
+ }, {
+ 'url': 'https://www.destinationamerica.com/video/alaska-monsters-destination-america-atve-us/central-alaskas-bigfoot',
+ 'only_matching': True,
+ }]
+
+ _PRODUCT = 'dam'
+ _DISCO_API_PARAMS = {
+ 'disco_host': 'us1-prod-direct.destinationamerica.com',
+ 'realm': 'go',
+ 'country': 'us',
+ }
+
+
+class InvestigationDiscoveryIE(DiscoveryPlusBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?investigationdiscovery\.com/video' + DPlayBaseIE._PATH_REGEX
+ _TESTS = [{
+ 'url': 'https://www.investigationdiscovery.com/video/unmasked-investigation-discovery/the-killer-clown',
+ 'info_dict': {
+ 'id': '2139409',
+ 'display_id': 'unmasked-investigation-discovery/the-killer-clown',
+ 'ext': 'mp4',
+ 'title': 'The Killer Clown',
+ 'description': 'A wealthy Florida woman is fatally shot in the face by a clown at her door.',
+ 'season_number': 1,
+ 'episode_number': 1,
+ },
+ 'skip': 'Available for Premium users',
+ }, {
+ 'url': 'https://www.investigationdiscovery.com/video/unmasked-investigation-discovery/the-killer-clown',
+ 'only_matching': True,
+ }]
+ _PRODUCT = 'ids'
+ _DISCO_API_PARAMS = {
+ 'disco_host': 'us1-prod-direct.investigationdiscovery.com',
+ 'realm': 'go',
+ 'country': 'us',
+ }
-class ScienceChannelIE(DiscoveryPlusIE):
+
+class AmHistoryChannelIE(DiscoveryPlusBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?ahctv\.com/video' + DPlayBaseIE._PATH_REGEX
+ _TESTS = [{
+ 'url': 'https://www.ahctv.com/video/modern-sniper-ahc/army',
+ 'info_dict': {
+ 'id': '2309730',
+ 'display_id': 'modern-sniper-ahc/army',
+ 'ext': 'mp4',
+ 'title': 'Army',
+ 'description': 'Snipers today face challenges their predecessors couldve only dreamed of.',
+ 'season_number': 1,
+ 'episode_number': 1,
+ },
+ 'skip': 'Available for Premium users',
+ }, {
+ 'url': 'https://www.ahctv.com/video/modern-sniper-ahc/army',
+ 'only_matching': True,
+ }]
+
+ _PRODUCT = 'ahc'
+ _DISCO_API_PARAMS = {
+ 'disco_host': 'us1-prod-direct.ahctv.com',
+ 'realm': 'go',
+ 'country': 'us',
+ }
+
+
+class ScienceChannelIE(DiscoveryPlusBaseIE):
_VALID_URL = r'https?://(?:www\.)?sciencechannel\.com/video' + DPlayBaseIE._PATH_REGEX
_TESTS = [{
'url': 'https://www.sciencechannel.com/video/strangest-things-science-atve-us/nazi-mystery-machine',
@@ -411,13 +599,20 @@ class ScienceChannelIE(DiscoveryPlusIE):
'episode_number': 1,
},
'skip': 'Available for Premium users',
+ }, {
+ 'url': 'https://www.sciencechannel.com/video/strangest-things-science-atve-us/nazi-mystery-machine',
+ 'only_matching': True,
}]
_PRODUCT = 'sci'
- _API_URL = 'us1-prod-direct.sciencechannel.com'
+ _DISCO_API_PARAMS = {
+ 'disco_host': 'us1-prod-direct.sciencechannel.com',
+ 'realm': 'go',
+ 'country': 'us',
+ }
-class DIYNetworkIE(DiscoveryPlusIE):
+class DIYNetworkIE(DiscoveryPlusBaseIE):
_VALID_URL = r'https?://(?:watch\.)?diynetwork\.com/video' + DPlayBaseIE._PATH_REGEX
_TESTS = [{
'url': 'https://watch.diynetwork.com/video/pool-kings-diy-network/bringing-beach-life-to-texas',
@@ -431,13 +626,47 @@ class DIYNetworkIE(DiscoveryPlusIE):
'episode_number': 2,
},
'skip': 'Available for Premium users',
+ }, {
+ 'url': 'https://watch.diynetwork.com/video/pool-kings-diy-network/bringing-beach-life-to-texas',
+ 'only_matching': True,
}]
_PRODUCT = 'diy'
- _API_URL = 'us1-prod-direct.watch.diynetwork.com'
+ _DISCO_API_PARAMS = {
+ 'disco_host': 'us1-prod-direct.watch.diynetwork.com',
+ 'realm': 'go',
+ 'country': 'us',
+ }
+
+
+class DiscoveryLifeIE(DiscoveryPlusBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?discoverylife\.com/video' + DPlayBaseIE._PATH_REGEX
+ _TESTS = [{
+ 'url': 'https://www.discoverylife.com/video/surviving-death-discovery-life-atve-us/bodily-trauma',
+ 'info_dict': {
+ 'id': '2218238',
+ 'display_id': 'surviving-death-discovery-life-atve-us/bodily-trauma',
+ 'ext': 'mp4',
+ 'title': 'Bodily Trauma',
+ 'description': 'Meet three people who tested the limits of the human body.',
+ 'season_number': 1,
+ 'episode_number': 2,
+ },
+ 'skip': 'Available for Premium users',
+ }, {
+ 'url': 'https://www.discoverylife.com/video/surviving-death-discovery-life-atve-us/bodily-trauma',
+ 'only_matching': True,
+ }]
+ _PRODUCT = 'dlf'
+ _DISCO_API_PARAMS = {
+ 'disco_host': 'us1-prod-direct.discoverylife.com',
+ 'realm': 'go',
+ 'country': 'us',
+ }
-class AnimalPlanetIE(DiscoveryPlusIE):
+
+class AnimalPlanetIE(DiscoveryPlusBaseIE):
_VALID_URL = r'https?://(?:www\.)?animalplanet\.com/video' + DPlayBaseIE._PATH_REGEX
_TESTS = [{
'url': 'https://www.animalplanet.com/video/north-woods-law-animal-planet/squirrel-showdown',
@@ -451,13 +680,79 @@ class AnimalPlanetIE(DiscoveryPlusIE):
'episode_number': 11,
},
'skip': 'Available for Premium users',
+ }, {
+ 'url': 'https://www.animalplanet.com/video/north-woods-law-animal-planet/squirrel-showdown',
+ 'only_matching': True,
}]
_PRODUCT = 'apl'
- _API_URL = 'us1-prod-direct.animalplanet.com'
+ _DISCO_API_PARAMS = {
+ 'disco_host': 'us1-prod-direct.animalplanet.com',
+ 'realm': 'go',
+ 'country': 'us',
+ }
-class DiscoveryPlusIndiaIE(DPlayBaseIE):
+class TLCIE(DiscoveryPlusBaseIE):
+ _VALID_URL = r'https?://(?:go\.)?tlc\.com/video' + DPlayBaseIE._PATH_REGEX
+ _TESTS = [{
+ 'url': 'https://go.tlc.com/video/my-600-lb-life-tlc/melissas-story-part-1',
+ 'info_dict': {
+ 'id': '2206540',
+ 'display_id': 'my-600-lb-life-tlc/melissas-story-part-1',
+ 'ext': 'mp4',
+ 'title': 'Melissas Story (Part 1)',
+ 'description': 'At 650 lbs, Melissa is ready to begin her seven-year weight loss journey.',
+ 'season_number': 1,
+ 'episode_number': 1,
+ },
+ 'skip': 'Available for Premium users',
+ }, {
+ 'url': 'https://go.tlc.com/video/my-600-lb-life-tlc/melissas-story-part-1',
+ 'only_matching': True,
+ }]
+
+ _PRODUCT = 'tlc'
+ _DISCO_API_PARAMS = {
+ 'disco_host': 'us1-prod-direct.tlc.com',
+ 'realm': 'go',
+ 'country': 'us',
+ }
+
+
+class DiscoveryPlusIE(DiscoveryPlusBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/(?!it/)(?:\w{2}/)?video' + DPlayBaseIE._PATH_REGEX
+ _TESTS = [{
+ 'url': 'https://www.discoveryplus.com/video/property-brothers-forever-home/food-and-family',
+ 'info_dict': {
+ 'id': '1140794',
+ 'display_id': 'property-brothers-forever-home/food-and-family',
+ 'ext': 'mp4',
+ 'title': 'Food and Family',
+ 'description': 'The brothers help a Richmond family expand their single-level home.',
+ 'duration': 2583.113,
+ 'timestamp': 1609304400,
+ 'upload_date': '20201230',
+ 'creator': 'HGTV',
+ 'series': 'Property Brothers: Forever Home',
+ 'season_number': 1,
+ 'episode_number': 1,
+ },
+ 'skip': 'Available for Premium users',
+ }, {
+ 'url': 'https://discoveryplus.com/ca/video/bering-sea-gold-discovery-ca/goldslingers',
+ 'only_matching': True,
+ }]
+
+ _PRODUCT = 'dplus_us'
+ _DISCO_API_PARAMS = {
+ 'disco_host': 'us1-prod-direct.discoveryplus.com',
+ 'realm': 'go',
+ 'country': 'us',
+ }
+
+
+class DiscoveryPlusIndiaIE(DiscoveryPlusBaseIE):
_VALID_URL = r'https?://(?:www\.)?discoveryplus\.in/videos?' + DPlayBaseIE._PATH_REGEX
_TESTS = [{
'url': 'https://www.discoveryplus.in/videos/how-do-they-do-it/fugu-and-more?seasonId=8&type=EPISODE',
@@ -467,41 +762,38 @@ class DiscoveryPlusIndiaIE(DPlayBaseIE):
'display_id': 'how-do-they-do-it/fugu-and-more',
'title': 'Fugu and More',
'description': 'The Japanese catch, prepare and eat the deadliest fish on the planet.',
- 'duration': 1319,
+ 'duration': 1319.32,
'timestamp': 1582309800,
'upload_date': '20200221',
'series': 'How Do They Do It?',
'season_number': 8,
'episode_number': 2,
'creator': 'Discovery Channel',
+ 'thumbnail': r're:https://.+\.jpeg',
+ 'episode': 'Episode 2',
+ 'season': 'Season 8',
+ 'tags': [],
},
'params': {
'skip_download': True,
}
}]
+ _PRODUCT = 'dplus-india'
+ _DISCO_API_PARAMS = {
+ 'disco_host': 'ap2-prod-direct.discoveryplus.in',
+ 'realm': 'dplusindia',
+ 'country': 'in',
+ 'domain': 'https://www.discoveryplus.in/',
+ }
+
def _update_disco_api_headers(self, headers, disco_base, display_id, realm):
headers.update({
'x-disco-params': 'realm=%s' % realm,
- 'x-disco-client': 'WEB:UNKNOWN:dplus-india:17.0.0',
+ 'x-disco-client': f'WEB:UNKNOWN:{self._PRODUCT}:17.0.0',
'Authorization': self._get_auth(disco_base, display_id, realm),
})
- def _download_video_playback_info(self, disco_base, video_id, headers):
- return self._download_json(
- disco_base + 'playback/v3/videoPlaybackInfo',
- video_id, headers=headers, data=json.dumps({
- 'deviceInfo': {
- 'adBlocker': False,
- },
- 'videoId': video_id,
- }).encode('utf-8'))['data']['attributes']['streaming']
-
- def _real_extract(self, url):
- display_id = self._match_id(url)
- return self._get_disco_api_info(
- url, display_id, 'ap2-prod-direct.discoveryplus.in', 'dplusindia', 'in', 'https://www.discoveryplus.in/')
-
class DiscoveryNetworksDeIE(DPlayBaseIE):
_VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:tlc|dmax)\.de|dplay\.co\.uk)/(?:programme|show|sendungen)/(?P<programme>[^/]+)/(?:video/)?(?P<alternate_id>[^/]+)'
@@ -515,6 +807,16 @@ class DiscoveryNetworksDeIE(DPlayBaseIE):
'description': 'md5:61033c12b73286e409d99a41742ef608',
'timestamp': 1554069600,
'upload_date': '20190331',
+ 'creator': 'TLC',
+ 'season': 'Season 1',
+ 'series': 'Breaking Amish',
+ 'episode_number': 1,
+ 'tags': ['new york', 'großstadt', 'amische', 'landleben', 'modern', 'infos', 'tradition', 'herausforderung'],
+ 'display_id': 'breaking-amish/die-welt-da-drauen',
+ 'episode': 'Episode 1',
+ 'duration': 2625.024,
+ 'season_number': 1,
+ 'thumbnail': r're:https://.+\.jpg',
},
'params': {
'skip_download': True,
@@ -575,16 +877,19 @@ class DiscoveryPlusShowBaseIE(DPlayBaseIE):
return self.playlist_result(self._entries(show_name), playlist_id=show_name)
-class DiscoveryPlusItalyIE(InfoExtractor):
+class DiscoveryPlusItalyIE(DiscoveryPlusBaseIE):
_VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/it/video' + DPlayBaseIE._PATH_REGEX
_TESTS = [{
'url': 'https://www.discoveryplus.com/it/video/i-signori-della-neve/stagione-2-episodio-1-i-preparativi',
'only_matching': True,
}]
- def _real_extract(self, url):
- video_id = self._match_id(url)
- return self.url_result(f'https://discoveryplus.it/video/{video_id}', DPlayIE.ie_key(), video_id)
+ _PRODUCT = 'dplus_us'
+ _DISCO_API_PARAMS = {
+ 'disco_host': 'eu1-prod-direct.discoveryplus.com',
+ 'realm': 'dplay',
+ 'country': 'it',
+ }
class DiscoveryPlusItalyShowIE(DiscoveryPlusShowBaseIE):
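After this refactor, each Discovery-family site is just a subclass declaring _PRODUCT and _DISCO_API_PARAMS; _real_extract in DiscoveryPlusBaseIE does the rest. A sketch of what one more (hypothetical) network would look like in this file:

class ExampleDiscoverySiteIE(DiscoveryPlusBaseIE):  # hypothetical network
    _VALID_URL = r'https?://(?:www\.)?example-network\.com/video' + DPlayBaseIE._PATH_REGEX

    _PRODUCT = 'example'
    _DISCO_API_PARAMS = {
        'disco_host': 'us1-prod-direct.example-network.com',  # assumed host layout
        'realm': 'go',
        'country': 'us',
    }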
diff --git a/yt_dlp/extractor/dropbox.py b/yt_dlp/extractor/dropbox.py
index 6a7d050aa..3ae3a8d3d 100644
--- a/yt_dlp/extractor/dropbox.py
+++ b/yt_dlp/extractor/dropbox.py
@@ -6,7 +6,12 @@ import re
from .common import InfoExtractor
from ..compat import compat_urllib_parse_unquote
-from ..utils import url_basename
+from ..utils import (
+ ExtractorError,
+ traverse_obj,
+ try_get,
+ url_basename,
+)
class DropboxIE(InfoExtractor):
@@ -28,13 +33,44 @@ class DropboxIE(InfoExtractor):
def _real_extract(self, url):
mobj = self._match_valid_url(url)
video_id = mobj.group('id')
+ webpage = self._download_webpage(url, video_id)
fn = compat_urllib_parse_unquote(url_basename(url))
title = os.path.splitext(fn)[0]
- video_url = re.sub(r'[?&]dl=0', '', url)
- video_url += ('?' if '?' not in video_url else '&') + 'dl=1'
+
+ password = self.get_param('videopassword')
+ if (self._og_search_title(webpage) == 'Dropbox - Password Required'
+ or 'Enter the password for this link' in webpage):
+
+ if password:
+ content_id = self._search_regex(r'content_id=(.*?)["\']', webpage, 'content_id')
+ payload = f'is_xhr=true&t={self._get_cookies("https://www.dropbox.com").get("t").value}&content_id={content_id}&password={password}&url={url}'
+ response = self._download_json(
+ 'https://www.dropbox.com/sm/auth', video_id, 'POSTing video password', data=payload.encode('UTF-8'),
+ headers={'content-type': 'application/x-www-form-urlencoded; charset=UTF-8'})
+
+ if response.get('status') != 'authed':
+ raise ExtractorError('Authentication failed!', expected=True)
+ webpage = self._download_webpage(url, video_id)
+ elif self._get_cookies('https://dropbox.com').get('sm_auth'):
+ webpage = self._download_webpage(url, video_id)
+ else:
+ raise ExtractorError('Password protected video, use --video-password <password>', expected=True)
+
+ json_string = self._html_search_regex(r'InitReact\.mountComponent.+ "props":(.+), "elem_id"', webpage, 'Info JSON')
+ info_json = self._parse_json(json_string, video_id)
+ transcode_url = traverse_obj(info_json, ((None, 'preview'), 'file', 'preview', 'content', 'transcode_url'), get_all=False)
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(transcode_url, video_id)
+
+ # if downloads are enabled, the original file can be fetched directly
+ if 'anonymous' in (try_get(info_json, lambda x: x['sharePermission']['canDownloadRoles']) or []):
+ video_url = re.sub(r'[?&]dl=0', '', url)
+ video_url += ('?' if '?' not in video_url else '&') + 'dl=1'
+ formats.append({'url': video_url, 'format_id': 'original', 'format_note': 'Original', 'quality': 1})
+ self._sort_formats(formats)
return {
'id': video_id,
'title': title,
- 'url': video_url,
+ 'formats': formats,
+ 'subtitles': subtitles
}
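The transcode URL may sit at the top level of the props JSON or under a 'preview' key; traverse_obj with a (None, 'preview') branch tries both in one pass. A small self-contained illustration (the payload shape is made up for the example):

from yt_dlp.utils import traverse_obj

info = {'preview': {'file': {'preview': {'content': {
    'transcode_url': 'https://example.com/master.m3u8'}}}}}  # hypothetical shape
url = traverse_obj(
    info, ((None, 'preview'), 'file', 'preview', 'content', 'transcode_url'),
    get_all=False)
assert url == 'https://example.com/master.m3u8'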
diff --git a/yt_dlp/extractor/drtv.py b/yt_dlp/extractor/drtv.py
index 70134204c..37e4d5b26 100644
--- a/yt_dlp/extractor/drtv.py
+++ b/yt_dlp/extractor/drtv.py
@@ -7,13 +7,11 @@ import re
from .common import InfoExtractor
-from ..aes import aes_cbc_decrypt
+from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7
from ..compat import compat_urllib_parse_unquote
from ..utils import (
- bytes_to_intlist,
ExtractorError,
int_or_none,
- intlist_to_bytes,
float_or_none,
mimetype2ext,
str_or_none,
@@ -191,13 +189,11 @@ class DRTVIE(InfoExtractor):
def decrypt_uri(e):
n = int(e[2:10], 16)
a = e[10 + n:]
- data = bytes_to_intlist(hex_to_bytes(e[10:10 + n]))
- key = bytes_to_intlist(hashlib.sha256(
- ('%s:sRBzYNXBzkKgnjj8pGtkACch' % a).encode('utf-8')).digest())
- iv = bytes_to_intlist(hex_to_bytes(a))
- decrypted = aes_cbc_decrypt(data, key, iv)
- return intlist_to_bytes(
- decrypted[:-decrypted[-1]]).decode('utf-8').split('?')[0]
+ data = hex_to_bytes(e[10:10 + n])
+ key = hashlib.sha256(('%s:sRBzYNXBzkKgnjj8pGtkACch' % a).encode('utf-8')).digest()
+ iv = hex_to_bytes(a)
+ decrypted = unpad_pkcs7(aes_cbc_decrypt_bytes(data, key, iv))
+ return decrypted.decode('utf-8').split('?')[0]
for asset in assets:
kind = asset.get('Kind')
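The decryption now works directly on bytes: the payload and IV are hex-encoded inside the input string, and the AES-CBC key is the SHA-256 digest of the IV hex joined with a static secret. A standalone sketch of the same field layout using the new helpers:

import hashlib
from yt_dlp.aes import aes_cbc_decrypt_bytes, unpad_pkcs7

def decrypt_uri(e):
    # e = 2-char prefix + 8 hex chars giving payload length n
    #   + n hex chars of ciphertext + IV in hex, as handled above
    n = int(e[2:10], 16)
    data = bytes.fromhex(e[10:10 + n])
    iv_hex = e[10 + n:]
    key = hashlib.sha256(('%s:sRBzYNXBzkKgnjj8pGtkACch' % iv_hex).encode()).digest()
    plain = unpad_pkcs7(aes_cbc_decrypt_bytes(data, key, bytes.fromhex(iv_hex)))
    return plain.decode('utf-8').split('?')[0]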
diff --git a/yt_dlp/extractor/ertgr.py b/yt_dlp/extractor/ertgr.py
new file mode 100644
index 000000000..19ce23f01
--- /dev/null
+++ b/yt_dlp/extractor/ertgr.py
@@ -0,0 +1,316 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ clean_html,
+ determine_ext,
+ ExtractorError,
+ dict_get,
+ int_or_none,
+ merge_dicts,
+ parse_qs,
+ parse_age_limit,
+ parse_iso8601,
+ str_or_none,
+ try_get,
+ unescapeHTML,
+ url_or_none,
+ variadic,
+)
+
+
+class ERTFlixBaseIE(InfoExtractor):
+ def _call_api(
+ self, video_id, method='Player/AcquireContent', api_version=1,
+ param_headers=None, data=None, headers=None, **params):
+ platform_codename = {'platformCodename': 'www'}
+ headers_as_param = {'X-Api-Date-Format': 'iso', 'X-Api-Camel-Case': False}
+ headers_as_param.update(param_headers or {})
+ headers = headers or {}
+ if data:
+ headers['Content-Type'] = headers_as_param['Content-Type'] = 'application/json;charset=utf-8'
+ data = json.dumps(merge_dicts(platform_codename, data)).encode('utf-8')
+ query = merge_dicts(
+ {} if data else platform_codename,
+ {'$headers': json.dumps(headers_as_param)},
+ params)
+ response = self._download_json(
+ 'https://api.app.ertflix.gr/v%s/%s' % (str(api_version), method),
+ video_id, fatal=False, query=query, data=data, headers=headers)
+ if try_get(response, lambda x: x['Result']['Success']) is True:
+ return response
+
+ def _call_api_get_tiles(self, video_id, *tile_ids):
+ requested_tile_ids = [video_id] + list(tile_ids)
+ requested_tiles = [{'Id': tile_id} for tile_id in requested_tile_ids]
+ tiles_response = self._call_api(
+ video_id, method='Tile/GetTiles', api_version=2,
+ data={'RequestedTiles': requested_tiles})
+ tiles = try_get(tiles_response, lambda x: x['Tiles'], list) or []
+ if tile_ids:
+ if sorted([tile['Id'] for tile in tiles]) != sorted(requested_tile_ids):
+ raise ExtractorError('Requested tiles not found', video_id=video_id)
+ return tiles
+ try:
+ return next(tile for tile in tiles if tile['Id'] == video_id)
+ except StopIteration:
+ raise ExtractorError('No matching tile found', video_id=video_id)
+
+
+class ERTFlixCodenameIE(ERTFlixBaseIE):
+ IE_NAME = 'ertflix:codename'
+ IE_DESC = 'ERTFLIX videos by codename'
+ _VALID_URL = r'ertflix:(?P<id>[\w-]+)'
+ _TESTS = [{
+ 'url': 'ertflix:monogramma-praxitelis-tzanoylinos',
+ 'md5': '5b9c2cd171f09126167e4082fc1dd0ef',
+ 'info_dict': {
+ 'id': 'monogramma-praxitelis-tzanoylinos',
+ 'ext': 'mp4',
+ 'title': 'md5:ef0b439902963d56c43ac83c3f41dd0e',
+ },
+ },
+ ]
+
+ def _extract_formats_and_subs(self, video_id, allow_none=True):
+ media_info = self._call_api(video_id, codename=video_id)
+ formats, subs = [], {}
+ for media_file in try_get(media_info, lambda x: x['MediaFiles'], list) or []:
+ for media in try_get(media_file, lambda x: x['Formats'], list) or []:
+ fmt_url = url_or_none(try_get(media, lambda x: x['Url']))
+ if not fmt_url:
+ continue
+ ext = determine_ext(fmt_url)
+ if ext == 'm3u8':
+ formats_, subs_ = self._extract_m3u8_formats_and_subtitles(
+ fmt_url, video_id, m3u8_id='hls', ext='mp4', fatal=False)
+ elif ext == 'mpd':
+ formats_, subs_ = self._extract_mpd_formats_and_subtitles(
+ fmt_url, video_id, mpd_id='dash', fatal=False)
+ else:
+ formats.append({
+ 'url': fmt_url,
+ 'format_id': str_or_none(media.get('Id')),
+ })
+ continue
+ formats.extend(formats_)
+ self._merge_subtitles(subs_, target=subs)
+
+ if formats or not allow_none:
+ self._sort_formats(formats)
+ return formats, subs
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ formats, subs = self._extract_formats_and_subs(video_id)
+
+ if formats:
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'subtitles': subs,
+ 'title': self._generic_title(url),
+ }
+
+
+class ERTFlixIE(ERTFlixBaseIE):
+ IE_NAME = 'ertflix'
+ IE_DESC = 'ERTFLIX videos'
+ _VALID_URL = r'https?://www\.ertflix\.gr/(?:series|vod)/(?P<id>[a-z]{3}\.\d+)'
+ _TESTS = [{
+ 'url': 'https://www.ertflix.gr/vod/vod.173258-aoratoi-ergates',
+ 'md5': '6479d5e60fd7e520b07ba5411dcdd6e7',
+ 'info_dict': {
+ 'id': 'aoratoi-ergates',
+ 'ext': 'mp4',
+ 'title': 'md5:c1433d598fbba0211b0069021517f8b4',
+ 'description': 'md5:01a64d113c31957eb7eb07719ab18ff4',
+ 'thumbnail': r're:https?://.+\.jpg',
+ 'episode_id': 'vod.173258',
+ 'timestamp': 1639648800,
+ 'upload_date': '20211216',
+ 'duration': 3166,
+ 'age_limit': 8,
+ },
+ }, {
+ 'url': 'https://www.ertflix.gr/series/ser.3448-monogramma',
+ 'info_dict': {
+ 'id': 'ser.3448',
+ 'age_limit': 8,
+ 'description': 'Η εκπομπή σαράντα ετών που σημάδεψε τον πολιτισμό μας.',
+ 'title': 'Μονόγραμμα',
+ },
+ 'playlist_mincount': 64,
+ }, {
+ 'url': 'https://www.ertflix.gr/series/ser.3448-monogramma?season=1',
+ 'info_dict': {
+ 'id': 'ser.3448',
+ 'age_limit': 8,
+ 'description': 'Η εκπομπή σαράντα ετών που σημάδεψε τον πολιτισμό μας.',
+ 'title': 'Μονόγραμμα',
+ },
+ 'playlist_count': 22,
+ }, {
+ 'url': 'https://www.ertflix.gr/series/ser.3448-monogramma?season=1&season=2021%20-%202022',
+ 'info_dict': {
+ 'id': 'ser.3448',
+ 'age_limit': 8,
+ 'description': 'Η εκπομπή σαράντα ετών που σημάδεψε τον πολιτισμό μας.',
+ 'title': 'Μονόγραμμα',
+ },
+ 'playlist_mincount': 36,
+ }, {
+ 'url': 'https://www.ertflix.gr/series/ser.164991-to-diktuo-1?season=1-9',
+ 'info_dict': {
+ 'id': 'ser.164991',
+ 'age_limit': 8,
+ 'description': 'Η πρώτη ελληνική εκπομπή με θεματολογία αποκλειστικά γύρω από το ίντερνετ.',
+ 'title': 'Το δίκτυο',
+ },
+ 'playlist_mincount': 9,
+ }]
+
+ def _extract_episode(self, episode):
+ codename = try_get(episode, lambda x: x['Codename'], compat_str)
+ title = episode.get('Title')
+ description = clean_html(dict_get(episode, ('ShortDescription', 'TinyDescription', )))
+ if not codename or not title or not episode.get('HasPlayableStream', True):
+ return
+ thumbnail = next((
+ url_or_none(thumb.get('Url'))
+ for thumb in variadic(dict_get(episode, ('Images', 'Image')) or {})
+ if thumb.get('IsMain')),
+ None)
+ return {
+ '_type': 'url_transparent',
+ 'thumbnail': thumbnail,
+ 'id': codename,
+ 'episode_id': episode.get('Id'),
+ 'title': title,
+ 'alt_title': episode.get('Subtitle'),
+ 'description': description,
+ 'timestamp': parse_iso8601(episode.get('PublishDate')),
+ 'duration': episode.get('DurationSeconds'),
+ 'age_limit': self._parse_age_rating(episode),
+ 'url': 'ertflix:%s' % (codename, ),
+ }
+
+ @staticmethod
+ def _parse_age_rating(info_dict):
+ return parse_age_limit(
+ info_dict.get('AgeRating')
+ or (info_dict.get('IsAdultContent') and 18)
+ or (info_dict.get('IsKidsContent') and 0))
+
+ def _extract_series(self, video_id, season_titles=None, season_numbers=None):
+ media_info = self._call_api(video_id, method='Tile/GetSeriesDetails', id=video_id)
+
+ series = try_get(media_info, lambda x: x['Series'], dict) or {}
+ series_info = {
+ 'age_limit': self._parse_age_rating(series),
+ 'title': series.get('Title'),
+ 'description': dict_get(series, ('ShortDescription', 'TinyDescription', )),
+ }
+ if season_numbers:
+ season_titles = season_titles or []
+ for season in try_get(series, lambda x: x['Seasons'], list) or []:
+ if season.get('SeasonNumber') in season_numbers and season.get('Title'):
+ season_titles.append(season['Title'])
+
+ def gen_episode(m_info, season_titles):
+ for episode_group in try_get(m_info, lambda x: x['EpisodeGroups'], list) or []:
+ if season_titles and episode_group.get('Title') not in season_titles:
+ continue
+ episodes = try_get(episode_group, lambda x: x['Episodes'], list)
+ if not episodes:
+ continue
+ season_info = {
+ 'season': episode_group.get('Title'),
+ 'season_number': int_or_none(episode_group.get('SeasonNumber')),
+ }
+ try:
+ episodes = [(int(ep['EpisodeNumber']), ep) for ep in episodes]
+ episodes.sort()
+ except (KeyError, ValueError):
+ episodes = enumerate(episodes, 1)
+ for n, episode in episodes:
+ info = self._extract_episode(episode)
+ if info is None:
+ continue
+ info['episode_number'] = n
+ info.update(season_info)
+ yield info
+
+ return self.playlist_result(
+ gen_episode(media_info, season_titles), playlist_id=video_id, **series_info)
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ if video_id.startswith('ser.'):
+ param_season = parse_qs(url).get('season', [None])
+ param_season = [
+ (have_number, int_or_none(v) if have_number else str_or_none(v))
+ for have_number, v in
+ [(int_or_none(ps) is not None, ps) for ps in param_season]
+ if v is not None
+ ]
+ season_kwargs = {
+ k: [v for is_num, v in param_season if is_num is c] or None
+ for k, c in
+ [('season_titles', False), ('season_numbers', True)]
+ }
+ return self._extract_series(video_id, **season_kwargs)
+
+ return self._extract_episode(self._call_api_get_tiles(video_id))
+
+
+class ERTWebtvEmbedIE(InfoExtractor):
+ IE_NAME = 'ertwebtv:embed'
+ IE_DESC = 'ert.gr webtv embedded videos'
+ _BASE_PLAYER_URL_RE = re.escape('//www.ert.gr/webtv/live-uni/vod/dt-uni-vod.php')
+ _VALID_URL = rf'https?:{_BASE_PLAYER_URL_RE}\?([^#]+&)?f=(?P<id>[^#&]+)'
+
+ _TESTS = [{
+ 'url': 'https://www.ert.gr/webtv/live-uni/vod/dt-uni-vod.php?f=trailers/E2251_TO_DIKTYO_E09_16-01_1900.mp4&bgimg=/photos/2022/1/to_diktio_ep09_i_istoria_tou_diadiktiou_stin_Ellada_1021x576.jpg',
+ 'md5': 'f9e9900c25c26f4ecfbddbb4b6305854',
+ 'info_dict': {
+ 'id': 'trailers/E2251_TO_DIKTYO_E09_16-01_1900.mp4',
+ 'title': 'md5:914f06a73cd8b62fbcd6fb90c636e497',
+ 'ext': 'mp4',
+ 'thumbnail': 'https://program.ert.gr/photos/2022/1/to_diktio_ep09_i_istoria_tou_diadiktiou_stin_Ellada_1021x576.jpg'
+ },
+ }]
+
+ @classmethod
+ def _extract_urls(cls, webpage):
+ EMBED_URL_RE = rf'(?:https?:)?{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+'
+ EMBED_RE = rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>{EMBED_URL_RE})(?P=_q1)'
+
+ for mobj in re.finditer(EMBED_RE, webpage):
+ url = unescapeHTML(mobj.group('url'))
+ if not cls.suitable(url):
+ continue
+ yield url
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ formats, subs = self._extract_m3u8_formats_and_subtitles(
+ f'https://mediastream.ert.gr/vodedge/_definst_/mp4:dvrorigin/{video_id}/playlist.m3u8',
+ video_id, 'mp4')
+ self._sort_formats(formats)
+ thumbnail_id = parse_qs(url).get('bgimg', [None])[0]
+ if thumbnail_id and not thumbnail_id.startswith('http'):
+ thumbnail_id = f'https://program.ert.gr{thumbnail_id}'
+ return {
+ 'id': video_id,
+ 'title': f'VOD - {video_id}',
+ 'thumbnail': thumbnail_id,
+ 'formats': formats,
+ 'subtitles': subs,
+ }
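The ?season= values in ERTFlixIE._real_extract are split into numeric season numbers and season-title strings before filtering episodes; the tuple juggling above is equivalent to this more direct sketch:

from yt_dlp.utils import int_or_none, parse_qs

def split_season_params(url):
    titles, numbers = [], []
    for value in parse_qs(url).get('season') or []:
        num = int_or_none(value)
        if num is not None:
            numbers.append(num)
        else:
            titles.append(value)
    return {'season_titles': titles or None, 'season_numbers': numbers or None}

# e.g. ?season=1&season=2021%20-%202022 -> numbers [1], titles ['2021 - 2022']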
diff --git a/yt_dlp/extractor/europeantour.py b/yt_dlp/extractor/europeantour.py
new file mode 100644
index 000000000..e28f067be
--- /dev/null
+++ b/yt_dlp/extractor/europeantour.py
@@ -0,0 +1,37 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class EuropeanTourIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?europeantour\.com/dpworld-tour/news/video/(?P<id>[^/&?#$]+)'
+
+ _TESTS = [{
+ 'url': 'https://www.europeantour.com/dpworld-tour/news/video/the-best-shots-of-the-2021-seasons/',
+ 'info_dict': {
+ 'id': '6287788195001',
+ 'ext': 'mp4',
+ 'title': 'The best shots of the 2021 seasons',
+ 'duration': 2416.512,
+ 'timestamp': 1640010141,
+ 'uploader_id': '5136026580001',
+ 'tags': ['prod-imported'],
+ 'thumbnail': 'md5:fdac52bc826548860edf8145ee74e71a',
+ 'upload_date': '20211220'
+ },
+ 'params': {'skip_download': True}
+ }]
+
+ BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s'
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ webpage = self._download_webpage(url, id)
+ vid, aid = re.search(r'(?s)brightcove-player\s?video-id="([^"]+)".*"ACCOUNT_ID":"([^"]+)"', webpage).groups()
+ if not aid:
+ aid = '5136026580001'
+ return self.url_result(
+ self.BRIGHTCOVE_URL_TEMPLATE % (aid, vid), 'BrightcoveNew')
diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py
index 1b32efc47..bd514f958 100644
--- a/yt_dlp/extractor/extractors.py
+++ b/yt_dlp/extractor/extractors.py
@@ -37,7 +37,10 @@ from .aenetworks import (
HistoryPlayerIE,
BiographyIE,
)
-from .afreecatv import AfreecaTVIE
+from .afreecatv import (
+ AfreecaTVIE,
+ AfreecaTVLiveIE,
+)
from .airmozilla import AirMozillaIE
from .aljazeera import AlJazeeraIE
from .alphaporno import AlphaPornoIE
@@ -190,6 +193,7 @@ from .buzzfeed import BuzzFeedIE
from .byutv import BYUtvIE
from .c56 import C56IE
from .cableav import CableAVIE
+from .callin import CallinIE
from .cam4 import CAM4IE
from .camdemy import (
CamdemyIE,
@@ -300,6 +304,10 @@ from .cozytv import CozyTVIE
from .cracked import CrackedIE
from .crackle import CrackleIE
from .crooksandliars import CrooksAndLiarsIE
+from .crowdbunker import (
+ CrowdBunkerIE,
+ CrowdBunkerChannelIE,
+)
from .crunchyroll import (
CrunchyrollIE,
CrunchyrollShowPlaylistIE,
@@ -317,6 +325,7 @@ from .curiositystream import (
CuriosityStreamSeriesIE,
)
from .cwtv import CWTVIE
+from .daftsex import DaftsexIE
from .dailymail import DailyMailIE
from .dailymotion import (
DailymotionIE,
@@ -352,9 +361,19 @@ from .dplay import (
DPlayIE,
DiscoveryPlusIE,
HGTVDeIE,
+ GoDiscoveryIE,
+ TravelChannelIE,
+ CookingChannelIE,
+ HGTVUsaIE,
+ FoodNetworkIE,
+ InvestigationDiscoveryIE,
+ DestinationAmericaIE,
+ AmHistoryChannelIE,
ScienceChannelIE,
DIYNetworkIE,
+ DiscoveryLifeIE,
AnimalPlanetIE,
+ TLCIE,
DiscoveryPlusIndiaIE,
DiscoveryNetworksDeIE,
DiscoveryPlusItalyIE,
@@ -376,12 +395,8 @@ from .duboku import (
)
from .dumpert import DumpertIE
from .defense import DefenseGouvFrIE
+from .digitalconcerthall import DigitalConcertHallIE
from .discovery import DiscoveryIE
-from .discoverygo import (
- DiscoveryGoIE,
- DiscoveryGoPlaylistIE,
-)
-from .discoveryvr import DiscoveryVRIE
from .disney import DisneyIE
from .dispeak import DigitallySpeakingIE
from .doodstream import DoodStreamIE
@@ -423,6 +438,11 @@ from .eroprofile import (
EroProfileIE,
EroProfileAlbumIE,
)
+from .ertgr import (
+ ERTFlixCodenameIE,
+ ERTFlixIE,
+ ERTWebtvEmbedIE,
+)
from .escapist import EscapistIE
from .espn import (
ESPNIE,
@@ -432,6 +452,7 @@ from .espn import (
)
from .esri import EsriVideoIE
from .europa import EuropaIE
+from .europeantour import EuropeanTourIE
from .euscreen import EUScreenIE
from .expotv import ExpoTVIE
from .expressen import ExpressenIE
@@ -629,7 +650,11 @@ from .iprima import (
IPrimaIE,
IPrimaCNNIE
)
-from .iqiyi import IqiyiIE
+from .iqiyi import (
+ IqiyiIE,
+ IqIE,
+ IqAlbumIE
+)
from .ir90tv import Ir90TvIE
from .itv import (
ITVIE,
@@ -656,6 +681,7 @@ from .kankan import KankanIE
from .karaoketv import KaraoketvIE
from .karrierevideos import KarriereVideosIE
from .keezmovies import KeezMoviesIE
+from .kelbyone import KelbyOneIE
from .ketnet import KetnetIE
from .khanacademy import (
KhanAcademyIE,
@@ -723,7 +749,6 @@ from .limelight import (
LimelightChannelListIE,
)
from .line import (
- LineTVIE,
LineLiveIE,
LineLiveChannelIE,
)
@@ -740,7 +765,10 @@ from .livestream import (
LivestreamOriginalIE,
LivestreamShortenerIE,
)
-from .lnkgo import LnkGoIE
+from .lnkgo import (
+ LnkGoIE,
+ LnkIE,
+)
from .localnews8 import LocalNews8IE
from .lovehomeporn import LoveHomePornIE
from .lrt import LRTIE
@@ -755,6 +783,7 @@ from .mailru import (
MailRuMusicIE,
MailRuMusicSearchIE,
)
+from .mainstreaming import MainStreamingIE
from .malltv import MallTVIE
from .mangomolo import (
MangomoloVideoIE,
@@ -820,7 +849,10 @@ from .mirrativ import (
)
from .mit import TechTVMITIE, OCWMITIE
from .mitele import MiTeleIE
-from .mixch import MixchIE
+from .mixch import (
+ MixchIE,
+ MixchArchiveIE,
+)
from .mixcloud import (
MixcloudIE,
MixcloudUserIE,
@@ -859,6 +891,12 @@ from .mtv import (
)
from .muenchentv import MuenchenTVIE
from .musescore import MuseScoreIE
+from .musicdex import (
+ MusicdexSongIE,
+ MusicdexAlbumIE,
+ MusicdexArtistIE,
+ MusicdexPlaylistIE,
+)
from .mwave import MwaveIE, MwaveMeetGreetIE
from .mxplayer import (
MxplayerIE,
@@ -935,6 +973,7 @@ from .newgrounds import (
NewgroundsUserIE,
)
from .newstube import NewstubeIE
+from .newsy import NewsyIE
from .nextmedia import (
NextMediaIE,
NextMediaActionNewsIE,
@@ -945,6 +984,7 @@ from .nexx import (
NexxIE,
NexxEmbedIE,
)
+from .nfb import NFBIE
from .nfhsnetwork import NFHSNetworkIE
from .nfl import (
NFLIE,
@@ -982,6 +1022,7 @@ from .nitter import NitterIE
from .njpwworld import NJPWWorldIE
from .nobelprize import NobelPrizeIE
from .nonktube import NonkTubeIE
+from .noodlemagazine import NoodleMagazineIE
from .noovo import NoovoIE
from .normalboots import NormalbootsIE
from .nosvideo import NosVideoIE
@@ -1057,6 +1098,7 @@ from .opencast import (
from .openrec import (
OpenRecIE,
OpenRecCaptureIE,
+ OpenRecMovieIE,
)
from .ora import OraTVIE
from .orf import (
@@ -1154,6 +1196,10 @@ from .pokemon import (
PokemonIE,
PokemonWatchIE,
)
+from .pokergo import (
+ PokerGoIE,
+ PokerGoCollectionIE,
+)
from .polsatgo import PolsatGoIE
from .polskieradio import (
PolskieRadioIE,
@@ -1179,6 +1225,7 @@ from .pornhub import (
from .pornotube import PornotubeIE
from .pornovoisines import PornoVoisinesIE
from .pornoxo import PornoXOIE
+from .pornez import PornezIE
from .puhutv import (
PuhuTVIE,
PuhuTVSerieIE,
@@ -1186,6 +1233,13 @@ from .puhutv import (
from .presstv import PressTVIE
from .projectveritas import ProjectVeritasIE
from .prosiebensat1 import ProSiebenSat1IE
+from .prx import (
+ PRXStoryIE,
+ PRXSeriesIE,
+ PRXAccountIE,
+ PRXStoriesSearchIE,
+ PRXSeriesSearchIE
+)
from .puls4 import Puls4IE
from .pyvideo import PyvideoIE
from .qqmusic import (
@@ -1222,9 +1276,10 @@ from .rai import (
RaiPlayIE,
RaiPlayLiveIE,
RaiPlayPlaylistIE,
+ RaiPlaySoundIE,
+ RaiPlaySoundLiveIE,
+ RaiPlaySoundPlaylistIE,
RaiIE,
- RaiPlayRadioIE,
- RaiPlayRadioPlaylistIE,
)
from .raywenderlich import (
RayWenderlichIE,
@@ -1279,6 +1334,12 @@ from .rtl2 import (
RTL2YouIE,
RTL2YouSeriesIE,
)
+from .rtnews import (
+ RTNewsIE,
+ RTDocumentryIE,
+ RTDocumentryPlaylistIE,
+ RuptlyIE,
+)
from .rtp import RTPIE
from .rtrfm import RTRFMIE
from .rts import RTSIE
@@ -1292,6 +1353,7 @@ from .rtve import (
from .rtvnh import RTVNHIE
from .rtvs import RTVSIE
from .ruhd import RUHDIE
+from .rule34video import Rule34VideoIE
from .rumble import (
RumbleEmbedIE,
RumbleChannelIE,
@@ -1305,6 +1367,14 @@ from .rutube import (
RutubePlaylistIE,
RutubeTagsIE,
)
+from .glomex import (
+ GlomexIE,
+ GlomexEmbedIE,
+)
+from .megatvcom import (
+ MegaTVComIE,
+ MegaTVComEmbedIE,
+)
from .rutv import RUTVIE
from .ruutu import RuutuIE
from .ruv import RuvIE
@@ -1493,7 +1563,12 @@ from .teachingchannel import TeachingChannelIE
from .teamcoco import TeamcocoIE
from .teamtreehouse import TeamTreeHouseIE
from .techtalks import TechTalksIE
-from .ted import TEDIE
+from .ted import (
+ TedEmbedIE,
+ TedPlaylistIE,
+ TedSeriesIE,
+ TedTalkIE,
+)
from .tele5 import Tele5IE
from .tele13 import Tele13IE
from .telebruxelles import TeleBruxellesIE
@@ -1639,6 +1714,10 @@ from .tvnow import (
TVNowAnnualIE,
TVNowShowIE,
)
+from .tvopengr import (
+ TVOpenGrWatchIE,
+ TVOpenGrEmbedIE,
+)
from .tvp import (
TVPEmbedIE,
TVPIE,
@@ -1767,6 +1846,10 @@ from .vimeo import (
VimeoWatchLaterIE,
VHXEmbedIE,
)
+from .vimm import (
+ VimmIE,
+ VimmRecordingIE,
+)
from .vimple import VimpleIE
from .vine import (
VineIE,
@@ -1913,6 +1996,7 @@ from .yandexmusic import (
)
from .yandexvideo import (
YandexVideoIE,
+ YandexVideoPreviewIE,
ZenYandexIE,
ZenYandexChannelIE,
)
@@ -1939,11 +2023,13 @@ from .youtube import (
YoutubeFavouritesIE,
YoutubeHistoryIE,
YoutubeTabIE,
+ YoutubeLivestreamEmbedIE,
YoutubePlaylistIE,
YoutubeRecommendedIE,
YoutubeSearchDateIE,
YoutubeSearchIE,
YoutubeSearchURLIE,
+ YoutubeMusicSearchURLIE,
YoutubeSubscriptionsIE,
YoutubeTruncatedIDIE,
YoutubeTruncatedURLIE,
diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py
index 6dbcd690d..d39dcc058 100644
--- a/yt_dlp/extractor/facebook.py
+++ b/yt_dlp/extractor/facebook.py
@@ -13,23 +13,25 @@ from ..compat import (
)
from ..utils import (
clean_html,
+ determine_ext,
error_to_compat_str,
ExtractorError,
float_or_none,
get_element_by_id,
int_or_none,
js_to_json,
- limit_length,
merge_dicts,
network_exceptions,
parse_count,
parse_qs,
qualities,
sanitized_Request,
+ traverse_obj,
try_get,
url_or_none,
urlencode_postdata,
urljoin,
+ variadic,
)
@@ -163,7 +165,7 @@ class FacebookIE(InfoExtractor):
'info_dict': {
'id': '1417995061575415',
'ext': 'mp4',
- 'title': 'Yaroslav Korpan - Довгоочікуване відео',
+ 'title': 'Ukrainian Scientists Worldwide | Довгоочікуване відео',
'description': 'Довгоочікуване відео',
'timestamp': 1486648771,
'upload_date': '20170209',
@@ -194,8 +196,8 @@ class FacebookIE(InfoExtractor):
'info_dict': {
'id': '202882990186699',
'ext': 'mp4',
- 'title': 'Elisabeth Ahtn - Hello? Yes your uber ride is here\n* Jukin...',
- 'description': 'Hello? Yes your uber ride is here\n* Jukin Media Verified *\nFind this video and others like it by visiting...',
+ 'title': 'birb (O v O") | Hello? Yes your uber ride is here',
+ 'description': 'Hello? Yes your uber ride is here * Jukin Media Verified * Find this video and others like it by visiting...',
'timestamp': 1486035513,
'upload_date': '20170202',
'uploader': 'Elisabeth Ahtn',
@@ -397,28 +399,31 @@ class FacebookIE(InfoExtractor):
url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id)
def extract_metadata(webpage):
- video_title = self._html_search_regex(
- r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage,
- 'title', default=None)
- if not video_title:
- video_title = self._html_search_regex(
- r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(.*?)</span>',
- webpage, 'alternative title', default=None)
- if not video_title:
- video_title = self._html_search_meta(
- ['og:title', 'twitter:title', 'description'],
- webpage, 'title', default=None)
- if video_title:
- video_title = limit_length(video_title, 80)
- else:
- video_title = 'Facebook video #%s' % video_id
- description = self._html_search_meta(
+ post_data = [self._parse_json(j, video_id, fatal=False) for j in re.findall(
+ r'handleWithCustomApplyEach\(\s*ScheduledApplyEach\s*,\s*(\{.+?\})\s*\);', webpage)]
+ post = traverse_obj(post_data, (
+ ..., 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or []
+ media = [m for m in traverse_obj(post, (..., 'attachments', ..., 'media'), expected_type=dict) or []
+ if str(m.get('id')) == video_id and m.get('__typename') == 'Video']
+ title = traverse_obj(media, (..., 'title', 'text'), get_all=False)
+ description = traverse_obj(media, (
+ ..., 'creation_story', 'comet_sections', 'message', 'story', 'message', 'text'), get_all=False)
+ uploader_data = (traverse_obj(media, (..., 'owner'), get_all=False)
+ or traverse_obj(post, (..., 'node', 'actors', ...), get_all=False) or {})
+
+ page_title = title or self._html_search_regex((
+ r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>(?P<content>[^<]*)</h2>',
+ r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(?P<content>.*?)</span>',
+ self._meta_regex('og:title'), self._meta_regex('twitter:title'), r'<title>(?P<content>.+?)</title>'
+ ), webpage, 'title', default=None, group='content')
+ description = description or self._html_search_meta(
['description', 'og:description', 'twitter:description'],
webpage, 'description', default=None)
- uploader = clean_html(get_element_by_id(
- 'fbPhotoPageAuthorName', webpage)) or self._search_regex(
- r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader',
- default=None) or self._og_search_title(webpage, fatal=False)
+ uploader = uploader_data.get('name') or (
+ clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage))
+ or self._search_regex(
+ (r'ownerName\s*:\s*"([^"]+)"', *self._og_regexes('title')), webpage, 'uploader', fatal=False))
+
timestamp = int_or_none(self._search_regex(
r'<abbr[^>]+data-utime=["\'](\d+)', webpage,
'timestamp', default=None))
@@ -433,17 +438,17 @@ class FacebookIE(InfoExtractor):
r'\bviewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count',
default=None))
info_dict = {
- 'title': video_title,
'description': description,
'uploader': uploader,
+ 'uploader_id': uploader_data.get('id'),
'timestamp': timestamp,
'thumbnail': thumbnail,
'view_count': view_count,
}
+
info_json_ld = self._search_json_ld(webpage, video_id, default={})
- if info_json_ld.get('title'):
- info_json_ld['title'] = limit_length(
- re.sub(r'\s*\|\s*Facebook$', '', info_json_ld['title']), 80)
+ info_json_ld['title'] = (re.sub(r'\s*\|\s*Facebook$', '', title or info_json_ld.get('title') or page_title or '')
+ or (description or '').replace('\n', ' ') or f'Facebook video #{video_id}')
return merge_dicts(info_json_ld, info_dict)
video_data = None
@@ -510,15 +515,19 @@ class FacebookIE(InfoExtractor):
def parse_graphql_video(video):
formats = []
q = qualities(['sd', 'hd'])
- for (suffix, format_id) in [('', 'sd'), ('_quality_hd', 'hd')]:
- playable_url = video.get('playable_url' + suffix)
+ for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'),
+ ('playable_url_dash', '')):
+ playable_url = video.get(key)
if not playable_url:
continue
- formats.append({
- 'format_id': format_id,
- 'quality': q(format_id),
- 'url': playable_url,
- })
+ if determine_ext(playable_url) == 'mpd':
+ formats.extend(self._extract_mpd_formats(playable_url, video_id))
+ else:
+ formats.append({
+ 'format_id': format_id,
+ 'quality': q(format_id),
+ 'url': playable_url,
+ })
extract_dash_manifest(video, formats)
process_formats(formats)
v_id = video.get('videoId') or video.get('id') or video_id
@@ -546,22 +555,15 @@ class FacebookIE(InfoExtractor):
if media.get('__typename') == 'Video':
return parse_graphql_video(media)
- nodes = data.get('nodes') or []
- node = data.get('node') or {}
- if not nodes and node:
- nodes.append(node)
- for node in nodes:
- story = try_get(node, lambda x: x['comet_sections']['content']['story'], dict) or {}
- attachments = try_get(story, [
- lambda x: x['attached_story']['attachments'],
- lambda x: x['attachments']
- ], list) or []
- for attachment in attachments:
- attachment = try_get(attachment, lambda x: x['style_type_renderer']['attachment'], dict)
- ns = try_get(attachment, lambda x: x['all_subattachments']['nodes'], list) or []
- for n in ns:
- parse_attachment(n)
- parse_attachment(attachment)
+ nodes = variadic(traverse_obj(data, 'nodes', 'node') or [])
+ attachments = traverse_obj(nodes, (
+ ..., 'comet_sections', 'content', 'story', (None, 'attached_story'), 'attachments',
+ ..., ('styles', 'style_type_renderer'), 'attachment'), expected_type=dict) or []
+ for attachment in attachments:
+ ns = try_get(attachment, lambda x: x['all_subattachments']['nodes'], list) or []
+ for n in ns:
+ parse_attachment(n)
+ parse_attachment(attachment)
edges = try_get(data, lambda x: x['mediaset']['currMedia']['edges'], list) or []
for edge in edges:
@@ -730,6 +732,7 @@ class FacebookPluginsVideoIE(InfoExtractor):
'info_dict': {
'id': '10154383743583686',
'ext': 'mp4',
+ # TODO: Fix title, uploader
'title': 'What to do during the haze?',
'uploader': 'Gov.sg',
'upload_date': '20160826',
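The nested try_get loops are collapsed into one branching traverse_obj path: it walks either 'nodes' or a single 'node', tries both the story and its 'attached_story', and accepts both the 'styles' and 'style_type_renderer' wrappers. A self-contained illustration on a made-up payload:

from yt_dlp.utils import traverse_obj, variadic

data = {'node': {'comet_sections': {'content': {'story': {  # hypothetical payload
    'attachments': [{'styles': {'attachment': {'media': {'id': '123'}}}}]}}}}}
nodes = variadic(traverse_obj(data, 'nodes', 'node') or [])
attachments = traverse_obj(nodes, (
    ..., 'comet_sections', 'content', 'story', (None, 'attached_story'),
    'attachments', ..., ('styles', 'style_type_renderer'), 'attachment'),
    expected_type=dict) or []
assert attachments == [{'media': {'id': '123'}}]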
diff --git a/yt_dlp/extractor/fc2.py b/yt_dlp/extractor/fc2.py
index 4d85e62fe..a407ba158 100644
--- a/yt_dlp/extractor/fc2.py
+++ b/yt_dlp/extractor/fc2.py
@@ -1,18 +1,16 @@
# coding: utf-8
from __future__ import unicode_literals
-import hashlib
-
from .common import InfoExtractor
from ..compat import (
compat_parse_qs,
- compat_urllib_request,
- compat_urlparse,
)
from ..utils import (
ExtractorError,
sanitized_Request,
+ traverse_obj,
urlencode_postdata,
+ urljoin,
)
@@ -82,41 +80,32 @@ class FC2IE(InfoExtractor):
self._downloader.cookiejar.clear_session_cookies() # must clear
self._login()
- title = 'FC2 video %s' % video_id
- thumbnail = None
+ title, thumbnail, description = None, None, None
if webpage is not None:
- title = self._og_search_title(webpage)
+ title = self._html_search_regex(
+ (r'<h2\s+class="videoCnt_title">([^<]+?)</h2>',
+ r'\s+href="[^"]+"\s*title="([^"]+?)"\s*rel="nofollow">\s*<img',
+ # there are two matches in the webpage
+ r'\s+href="[^"]+"\s*title="([^"]+?)"\s*rel="nofollow">\s*\1'),
+ webpage,
+ 'title', fatal=False)
thumbnail = self._og_search_thumbnail(webpage)
- refer = url.replace('/content/', '/a/content/') if '/a/content/' not in url else url
-
- mimi = hashlib.md5((video_id + '_gGddgPfeaf_gzyr').encode('utf-8')).hexdigest()
-
- info_url = (
- 'http://video.fc2.com/ginfo.php?mimi={1:s}&href={2:s}&v={0:s}&fversion=WIN%2011%2C6%2C602%2C180&from=2&otag=0&upid={0:s}&tk=null&'.
- format(video_id, mimi, compat_urllib_request.quote(refer, safe=b'').replace('.', '%2E')))
-
- info_webpage = self._download_webpage(
- info_url, video_id, note='Downloading info page')
- info = compat_urlparse.parse_qs(info_webpage)
+ description = self._og_search_description(webpage)
- if 'err_code' in info:
- # most of the time we can still download wideo even if err_code is 403 or 602
- self.report_warning(
- 'Error code was: %s... but still trying' % info['err_code'][0])
-
- if 'filepath' not in info:
- raise ExtractorError('Cannot download file. Are you logged in?')
-
- video_url = info['filepath'][0] + '?mid=' + info['mid'][0]
- title_info = info.get('title')
- if title_info:
- title = title_info[0]
+ vidplaylist = self._download_json(
+ 'https://video.fc2.com/api/v3/videoplaylist/%s?sh=1&fs=0' % video_id, video_id,
+ note='Downloading info page')
+ vid_url = traverse_obj(vidplaylist, ('playlist', 'nq'))
+ if not vid_url:
+ raise ExtractorError('Unable to extract video URL')
+ vid_url = urljoin('https://video.fc2.com/', vid_url)
return {
'id': video_id,
'title': title,
- 'url': video_url,
- 'ext': 'flv',
+ 'url': vid_url,
+ 'ext': 'mp4',
+ 'description': description,
'thumbnail': thumbnail,
}
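
Note on the fc2.py rewrite: the Flash-era ginfo.php endpoint (with its hard-coded md5 salt) is gone in favour of the JSON videoplaylist API; the `nq` entry comes back as a relative path, hence the urljoin. A hedged sketch of the new flow -- the response shape `{"playlist": {"nq": ...}}` is taken from the extractor code above, not from any FC2 documentation:

    import json
    from urllib.parse import urljoin
    from urllib.request import urlopen

    def fc2_video_url(video_id):
        api = 'https://video.fc2.com/api/v3/videoplaylist/%s?sh=1&fs=0' % video_id
        with urlopen(api) as resp:
            playlist = json.load(resp)
        path = (playlist.get('playlist') or {}).get('nq')  # 'nq' = normal quality, presumably
        if not path:
            raise ValueError('Unable to extract video URL')
        return urljoin('https://video.fc2.com/', path)
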
diff --git a/yt_dlp/extractor/flickr.py b/yt_dlp/extractor/flickr.py
index 6c82fae3c..2ed6c2bdc 100644
--- a/yt_dlp/extractor/flickr.py
+++ b/yt_dlp/extractor/flickr.py
@@ -7,6 +7,7 @@ from ..compat import (
)
from ..utils import (
ExtractorError,
+ format_field,
int_or_none,
qualities,
)
@@ -95,7 +96,7 @@ class FlickrIE(InfoExtractor):
owner = video_info.get('owner', {})
uploader_id = owner.get('nsid')
uploader_path = owner.get('path_alias') or uploader_id
- uploader_url = 'https://www.flickr.com/photos/%s/' % uploader_path if uploader_path else None
+ uploader_url = format_field(uploader_path, template='https://www.flickr.com/photos/%s/')
return {
'id': video_id,
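
Several files in this commit (flickr, gamejolt, joj, kaltura, keezmovies, line, lnkgo) replace the `template % value if value else None` idiom with the format_field helper. Its behaviour as exercised by these call sites; note that the fallback is the helper's default argument (an empty string by default), not None:

    from yt_dlp.utils import format_field

    format_field('someuser', template='https://www.flickr.com/photos/%s/')
    # -> 'https://www.flickr.com/photos/someuser/'
    format_field(None, template='https://www.flickr.com/photos/%s/')
    # -> '' (None is in the default `ignore` tuple)
    format_field({'userId': 'None'}, 'userId', ignore=('None', None))
    # -> '' (the literal string 'None', as in the kaltura change below, is skipped)
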
diff --git a/yt_dlp/extractor/fox.py b/yt_dlp/extractor/fox.py
index 04f4bdba6..4c52b9ac6 100644
--- a/yt_dlp/extractor/fox.py
+++ b/yt_dlp/extractor/fox.py
@@ -4,7 +4,7 @@ from __future__ import unicode_literals
import json
import uuid
-from .adobepass import AdobePassIE
+from .common import InfoExtractor
from ..compat import (
compat_HTTPError,
compat_str,
@@ -20,7 +20,7 @@ from ..utils import (
)
-class FOXIE(AdobePassIE):
+class FOXIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?fox\.com/watch/(?P<id>[\da-fA-F]+)'
_TESTS = [{
# clip
@@ -37,6 +37,7 @@ class FOXIE(AdobePassIE):
'creator': 'FOX',
'series': 'Gotham',
'age_limit': 14,
+ 'episode': 'Aftermath: Bruce Wayne Develops Into The Dark Knight'
},
'params': {
'skip_download': True,
@@ -46,14 +47,15 @@ class FOXIE(AdobePassIE):
'url': 'https://www.fox.com/watch/087036ca7f33c8eb79b08152b4dd75c1/',
'only_matching': True,
}, {
- # episode, geo-restricted, tv provided required
- 'url': 'https://www.fox.com/watch/30056b295fb57f7452aeeb4920bc3024/',
+ # sports event, geo-restricted
+ 'url': 'https://www.fox.com/watch/b057484dade738d1f373b3e46216fa2c/',
'only_matching': True,
}]
_GEO_BYPASS = False
_HOME_PAGE_URL = 'https://www.fox.com/'
- _API_KEY = 'abdcbed02c124d393b39e818a4312055'
+ _API_KEY = '6E9S4bmcoNnZwVLOHywOv8PJEdu76cM9'
_access_token = None
+ _device_id = compat_str(uuid.uuid4())
def _call_api(self, path, video_id, data=None):
headers = {
@@ -63,7 +65,7 @@ class FOXIE(AdobePassIE):
headers['Authorization'] = 'Bearer ' + self._access_token
try:
return self._download_json(
- 'https://api2.fox.com/v2.0/' + path,
+ 'https://api3.fox.com/v2.0/' + path,
video_id, data=data, headers=headers)
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
@@ -87,16 +89,37 @@ class FOXIE(AdobePassIE):
if not self._access_token:
self._access_token = self._call_api(
'login', None, json.dumps({
- 'deviceId': compat_str(uuid.uuid4()),
+ 'deviceId': self._device_id,
}).encode())['accessToken']
def _real_extract(self, url):
video_id = self._match_id(url)
- video = self._call_api('vodplayer/' + video_id, video_id)
+ self._access_token = self._call_api(
+ 'previewpassmvpd?device_id=%s&mvpd_id=TempPass_fbcfox_60min' % self._device_id,
+ video_id)['accessToken']
+
+ video = self._call_api('watch', video_id, data=json.dumps({
+ 'capabilities': ['drm/widevine', 'fsdk/yo'],
+ 'deviceWidth': 1280,
+ 'deviceHeight': 720,
+ 'maxRes': '720p',
+ 'os': 'macos',
+ 'osv': '',
+ 'provider': {
+ 'freewheel': {'did': self._device_id},
+ 'vdms': {'rays': ''},
+ 'dmp': {'kuid': '', 'seg': ''}
+ },
+ 'playlist': '',
+ 'privacy': {'us': '1---'},
+ 'siteSection': '',
+ 'streamType': 'vod',
+ 'streamId': video_id}).encode('utf-8'))
title = video['name']
release_url = video['url']
+
try:
m3u8_url = self._download_json(release_url, video_id)['playURL']
except ExtractorError as e:
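
Note on the fox.py change: AdobePassIE is dropped entirely. The extractor now mints a 60-minute temp pass tied to a stable per-run device ID, then POSTs a capabilities payload to the new watch endpoint. A condensed sketch of the three calls (login, previewpassmvpd, watch); the API-key header name is an assumption here, since the hunks above only show the key value and the Authorization header:

    import json
    import uuid
    from urllib.request import Request, urlopen

    DEVICE_ID = str(uuid.uuid4())

    def call_api(path, token=None, data=None):
        req = Request('https://api3.fox.com/v2.0/' + path, data=data, headers={
            'x-api-key': '6E9S4bmcoNnZwVLOHywOv8PJEdu76cM9',  # header name assumed
            'content-type': 'application/json',
        })
        if token:
            req.add_header('Authorization', 'Bearer ' + token)
        return json.load(urlopen(req))

    token = call_api('login', data=json.dumps({'deviceId': DEVICE_ID}).encode())['accessToken']
    token = call_api('previewpassmvpd?device_id=%s&mvpd_id=TempPass_fbcfox_60min'
                     % DEVICE_ID, token)['accessToken']
    video = call_api('watch', token, data=json.dumps({
        'capabilities': ['drm/widevine', 'fsdk/yo'],
        'deviceWidth': 1280, 'deviceHeight': 720, 'maxRes': '720p',
        'os': 'macos', 'osv': '',
        'provider': {'freewheel': {'did': DEVICE_ID},
                     'vdms': {'rays': ''}, 'dmp': {'kuid': '', 'seg': ''}},
        'playlist': '', 'privacy': {'us': '1---'},
        'siteSection': '', 'streamType': 'vod',
        'streamId': 'VIDEO_ID'}).encode())  # VIDEO_ID is a placeholder
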
diff --git a/yt_dlp/extractor/fujitv.py b/yt_dlp/extractor/fujitv.py
index 1cea62609..f3cb9a6f4 100644
--- a/yt_dlp/extractor/fujitv.py
+++ b/yt_dlp/extractor/fujitv.py
@@ -1,48 +1,52 @@
# coding: utf-8
from __future__ import unicode_literals
-
+from ..utils import HEADRequest
from .common import InfoExtractor
class FujiTVFODPlus7IE(InfoExtractor):
- _VALID_URL = r'https?://fod\.fujitv\.co\.jp/title/[0-9a-z]{4}/(?P<id>[0-9a-z]+)'
- _BASE_URL = 'http://i.fod.fujitv.co.jp/'
- _BITRATE_MAP = {
- 300: (320, 180),
- 800: (640, 360),
- 1200: (1280, 720),
- 2000: (1280, 720),
- 4000: (1920, 1080),
- }
+ _VALID_URL = r'https?://fod\.fujitv\.co\.jp/title/(?P<sid>[0-9a-z]{4})/(?P<id>[0-9a-z]+)'
+ _BASE_URL = 'https://i.fod.fujitv.co.jp/'
_TESTS = [{
- 'url': 'https://fod.fujitv.co.jp/title/5d40/5d40810075',
+ 'url': 'https://fod.fujitv.co.jp/title/5d40/5d40110076',
'info_dict': {
- 'id': '5d40810075',
- 'title': '5d40810075',
+ 'id': '5d40110076',
'ext': 'mp4',
- 'format_id': '4000',
- 'thumbnail': 'http://i.fod.fujitv.co.jp/pc/image/wbtn/wbtn_5d40810075.jpg'
+ 'title': '#1318 『まる子、まぼろしの洋館を見る』の巻',
+ 'series': 'ちびまる子ちゃん',
+ 'series_id': '5d40',
+ 'description': 'md5:b3f51dbfdda162ac4f789e0ff4d65750',
+ 'thumbnail': 'https://i.fod.fujitv.co.jp/img/program/5d40/episode/5d40110076_a.jpg',
},
- 'skip': 'Expires after a week'
}]
def _real_extract(self, url):
- video_id = self._match_id(url)
- formats = self._extract_m3u8_formats(
- self._BASE_URL + 'abr/tv_android/%s.m3u8' % video_id, video_id, 'mp4')
- for f in formats:
- wh = self._BITRATE_MAP.get(f.get('tbr'))
- if wh:
- f.update({
- 'width': wh[0],
- 'height': wh[1],
- })
- self._sort_formats(formats)
+ series_id, video_id = self._match_valid_url(url).groups()
+ self._request_webpage(HEADRequest(url), video_id)
+ json_info = {}
+ token = self._get_cookies(url).get('CT')
+ if token:
+ json_info = self._download_json('https://fod-sp.fujitv.co.jp/apps/api/episode/detail/?ep_id=%s&is_premium=false' % video_id, video_id, headers={'x-authorization': f'Bearer {token.value}'}, fatal=False)
+ else:
+ self.report_warning(f'The token cookie is needed to extract video metadata. {self._LOGIN_HINTS["cookies"]}')
+ formats, subtitles = [], {}
+ src_json = self._download_json(f'{self._BASE_URL}abrjson_v2/tv_android/{video_id}', video_id)
+ for src in src_json['video_selector']:
+ if not src.get('url'):
+ continue
+ fmt, subs = self._extract_m3u8_formats_and_subtitles(src['url'], video_id, 'mp4')
+ formats.extend(fmt)
+ subtitles = self._merge_subtitles(subtitles, subs)
+ self._sort_formats(formats, ['tbr'])
return {
'id': video_id,
- 'title': video_id,
+ 'title': json_info.get('ep_title'),
+ 'series': json_info.get('lu_title'),
+ 'series_id': series_id,
+ 'description': json_info.get('ep_description'),
'formats': formats,
- 'thumbnail': self._BASE_URL + 'pc/image/wbtn/wbtn_%s.jpg' % video_id,
+ 'subtitles': subtitles,
+ 'thumbnail': f'{self._BASE_URL}img/program/{series_id}/episode/{video_id}_a.jpg',
}
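
Note on the fujitv.py rewrite: the HEAD request against the episode page exists purely to populate the cookie jar, after which the CT cookie is replayed as a bearer token for the metadata API (metadata is optional; a missing token only triggers a warning). The cookie-to-header handoff in isolation -- whether anonymous sessions receive CT at all is not established by this diff:

    from http.cookiejar import CookieJar
    from urllib.request import HTTPCookieProcessor, Request, build_opener

    jar = CookieJar()
    opener = build_opener(HTTPCookieProcessor(jar))
    opener.open(Request('https://fod.fujitv.co.jp/title/5d40/5d40110076', method='HEAD'))

    token = next((c.value for c in jar if c.name == 'CT'), None)
    headers = {'x-authorization': f'Bearer {token}'} if token else {}
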
diff --git a/yt_dlp/extractor/funk.py b/yt_dlp/extractor/funk.py
index e5e32608f..2c5cfe864 100644
--- a/yt_dlp/extractor/funk.py
+++ b/yt_dlp/extractor/funk.py
@@ -11,7 +11,7 @@ from ..utils import (
class FunkIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?funk\.net/(?:channel|playlist)/[^/]+/(?P<display_id>[0-9a-z-]+)-(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www\.|origin\.)?funk\.net/(?:channel|playlist)/[^/]+/(?P<display_id>[0-9a-z-]+)-(?P<id>\d+)'
_TESTS = [{
'url': 'https://www.funk.net/channel/ba-793/die-lustigsten-instrumente-aus-dem-internet-teil-2-1155821',
'md5': '8dd9d9ab59b4aa4173b3197f2ea48e81',
diff --git a/yt_dlp/extractor/gamejolt.py b/yt_dlp/extractor/gamejolt.py
index 7f2f6f3e1..a13e528f5 100644
--- a/yt_dlp/extractor/gamejolt.py
+++ b/yt_dlp/extractor/gamejolt.py
@@ -7,6 +7,7 @@ from .common import InfoExtractor
from ..compat import compat_urllib_parse_unquote
from ..utils import (
determine_ext,
+ format_field,
int_or_none,
str_or_none,
traverse_obj,
@@ -86,7 +87,7 @@ class GameJoltBaseIE(InfoExtractor):
'display_id': post_data.get('slug'),
'uploader': user_data.get('display_name') or user_data.get('name'),
'uploader_id': user_data.get('username'),
- 'uploader_url': 'https://gamejolt.com' + user_data['url'] if user_data.get('url') else None,
+ 'uploader_url': format_field(user_data, 'url', 'https://gamejolt.com%s'),
'categories': [try_get(category, lambda x: '%s - %s' % (x['community']['name'], x['channel'].get('display_title') or x['channel']['title']))
for category in post_data.get('communities' or [])],
'tags': traverse_obj(
diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py
index 5dafef283..2b59d076f 100644
--- a/yt_dlp/extractor/generic.py
+++ b/yt_dlp/extractor/generic.py
@@ -28,6 +28,7 @@ from ..utils import (
mimetype2ext,
orderedSet,
parse_duration,
+ parse_resolution,
sanitized_Request,
smuggle_url,
unescapeHTML,
@@ -100,6 +101,8 @@ from .ustream import UstreamIE
from .arte import ArteTVEmbedIE
from .videopress import VideoPressIE
from .rutube import RutubeIE
+from .glomex import GlomexEmbedIE
+from .megatvcom import MegaTVComEmbedIE
from .limelight import LimelightBaseIE
from .anvato import AnvatoIE
from .washingtonpost import WashingtonPostIE
@@ -112,6 +115,7 @@ from .channel9 import Channel9IE
from .vshare import VShareIE
from .mediasite import MediasiteIE
from .springboardplatform import SpringboardPlatformIE
+from .ted import TedEmbedIE
from .yapfiles import YapFilesIE
from .vice import ViceIE
from .xfileshare import XFileShareIE
@@ -135,8 +139,12 @@ from .arcpublishing import ArcPublishingIE
from .medialaan import MedialaanIE
from .simplecast import SimplecastIE
from .wimtv import WimTVIE
+from .tvopengr import TVOpenGrEmbedIE
+from .ertgr import ERTWebtvEmbedIE
from .tvp import TVPEmbedIE
from .blogger import BloggerIE
+from .mainstreaming import MainStreamingIE
+from .gfycat import GfycatIE
class GenericIE(InfoExtractor):
@@ -1870,6 +1878,62 @@ class GenericIE(InfoExtractor):
'add_ie': [RutubeIE.ie_key()],
},
{
+ # glomex:embed
+ 'url': 'https://www.skai.gr/news/world/iatrikos-syllogos-tourkias-to-turkovac-aplo-dialyma-erntogan-eiste-apateones-kai-pseytes',
+ 'info_dict': {
+ 'id': 'v-ch2nkhcirwc9-sf',
+ 'ext': 'mp4',
+ 'title': 'md5:786e1e24e06c55993cee965ef853a0c1',
+ 'description': 'md5:8b517a61d577efe7e36fde72fd535995',
+ 'timestamp': 1641885019,
+ 'upload_date': '20220111',
+ 'duration': 460000,
+ 'thumbnail': 'https://i3thumbs.glomex.com/dC1idjJwdndiMjRzeGwvMjAyMi8wMS8xMS8wNy8xMF8zNV82MWRkMmQ2YmU5ZTgyLmpwZw==/profile:player-960x540',
+ },
+ },
+ {
+ # megatvcom:embed
+ 'url': 'https://www.in.gr/2021/12/18/greece/apokalypsi-mega-poios-parelave-tin-ereyna-tsiodra-ek-merous-tis-kyvernisis-o-prothypourgos-telika-gnorize/',
+ 'info_dict': {
+ 'id': 'apokalypsi-mega-poios-parelave-tin-ereyna-tsiodra-ek-merous-tis-kyvernisis-o-prothypourgos-telika-gnorize',
+ 'title': 'md5:5e569cf996ec111057c2764ec272848f',
+ },
+ 'playlist': [{
+ 'md5': '1afa26064ff00ccb91617957dbc73dc1',
+ 'info_dict': {
+ 'ext': 'mp4',
+ 'id': '564916',
+ 'display_id': 'md5:6cdf22d3a2e7bacb274b7295089a1770',
+ 'title': 'md5:33b9dd39584685b62873043670eb52a6',
+ 'description': 'md5:c1db7310f390518ac36dd69d947ef1a1',
+ 'timestamp': 1639753145,
+ 'upload_date': '20211217',
+ 'thumbnail': 'https://www.megatv.com/wp-content/uploads/2021/12/prezerakos-1024x597.jpg',
+ },
+ }, {
+ 'md5': '4a1c220695f1ef865a8b7966a53e2474',
+ 'info_dict': {
+ 'ext': 'mp4',
+ 'id': '564905',
+ 'display_id': 'md5:ead15695e485e649aed2b81ebd699b88',
+ 'title': 'md5:2b71fd54249a3ca34609fe39ae31c47b',
+ 'description': 'md5:c42e12f638d0a97d6de4508e2c4df982',
+ 'timestamp': 1639753047,
+ 'upload_date': '20211217',
+ 'thumbnail': 'https://www.megatv.com/wp-content/uploads/2021/12/tsiodras-mitsotakis-1024x545.jpg',
+ },
+ }]
+ },
+ {
+ 'url': 'https://www.ertnews.gr/video/manolis-goyalles-o-anthropos-piso-apo-ti-diadiktyaki-vasilopita/',
+ 'info_dict': {
+ 'id': '2022/tv/news-themata-ianouarios/20220114-apotis6-gouales-pita.mp4',
+ 'ext': 'mp4',
+ 'title': 'md5:df64f5b61c06d0e9556c0cdd5cf14464',
+ 'thumbnail': 'https://www.ert.gr/themata/photos/2021/20220114-apotis6-gouales-pita.jpg',
+ },
+ },
+ {
# ThePlatform embedded with whitespaces in URLs
'url': 'http://www.golfchannel.com/topics/shows/golftalkcentral.htm',
'only_matching': True,
@@ -2175,6 +2239,22 @@ class GenericIE(InfoExtractor):
},
},
{
+ # tvopengr:embed
+ 'url': 'https://www.ethnos.gr/World/article/190604/hparosiaxekinoynoisynomiliessthgeneyhmethskiatoypolemoypanoapothnoykrania',
+ 'md5': 'eb0c3995d0a6f18f6538c8e057865d7d',
+ 'info_dict': {
+ 'id': '101119',
+ 'ext': 'mp4',
+ 'display_id': 'oikarpoitondiapragmateyseonhparosias',
+ 'title': 'md5:b979f4d640c568617d6547035528a149',
+ 'description': 'md5:e54fc1977c7159b01cc11cd7d9d85550',
+ 'timestamp': 1641772800,
+ 'upload_date': '20220110',
+ 'thumbnail': 'https://opentv-static.siliconweb.com/imgHandler/1920/70bc39fa-895b-4918-a364-c39d2135fc6d.jpg',
+
+ }
+ },
+ {
# blogger embed
'url': 'https://blog.tomeuvizoso.net/2019/01/a-panfrost-milestone.html',
'md5': 'f1bc19b6ea1b0fd1d81e84ca9ec467ac',
@@ -2382,8 +2462,47 @@ class GenericIE(InfoExtractor):
'timestamp': 1636788683.0,
'upload_date': '20211113'
}
+ },
+ {
+ # MainStreaming player
+ 'url': 'https://www.lactv.it/2021/10/03/lac-news24-la-settimana-03-10-2021/',
+ 'info_dict': {
+ 'id': 'EUlZfGWkGpOd',
+ 'title': 'La Settimana ',
+ 'description': '03 Ottobre ore 02:00',
+ 'ext': 'mp4',
+ 'live_status': 'not_live',
+ 'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
+ 'duration': 1512
+ }
+ },
+ {
+ # Multiple gfycat iframe embeds
+ 'url': 'https://www.gezip.net/bbs/board.php?bo_table=entertaine&wr_id=613422',
+ 'info_dict': {
+ 'title': '재이, 윤, 세은 황금 드레스를 입고 빛난다',
+ 'id': 'board'
+ },
+ 'playlist_count': 8,
+ },
+ {
+ # Multiple gfycat gifs (direct links)
+ 'url': 'https://www.gezip.net/bbs/board.php?bo_table=entertaine&wr_id=612199',
+ 'info_dict': {
+ 'title': '옳게 된 크롭 니트 스테이씨 아이사',
+ 'id': 'board'
+ },
+ 'playlist_count': 6
+ },
+ {
+ # Multiple gfycat embeds, with uppercase "IFR" in urls
+ 'url': 'https://kkzz.kr/?vid=2295',
+ 'info_dict': {
+ 'title': '지방시 앰버서더 에스파 카리나 움짤',
+ 'id': '?vid=2295'
+ },
+ 'playlist_count': 9
}
- #
]
def report_following_redirect(self, new_url):
@@ -3083,10 +3202,9 @@ class GenericIE(InfoExtractor):
return self.url_result(mobj.group('url'), 'Tvigle')
# Look for embedded TED player
- mobj = re.search(
- r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
- if mobj is not None:
- return self.url_result(mobj.group('url'), 'TED')
+ ted_urls = TedEmbedIE._extract_urls(webpage)
+ if ted_urls:
+ return self.playlist_from_matches(ted_urls, video_id, video_title, ie=TedEmbedIE.ie_key())
# Look for embedded Ustream videos
ustream_url = UstreamIE._extract_url(webpage)
@@ -3422,6 +3540,18 @@ class GenericIE(InfoExtractor):
return self.playlist_from_matches(
rutube_urls, video_id, video_title, ie=RutubeIE.ie_key())
+ # Look for Glomex embeds
+ glomex_urls = list(GlomexEmbedIE._extract_urls(webpage, url))
+ if glomex_urls:
+ return self.playlist_from_matches(
+ glomex_urls, video_id, video_title, ie=GlomexEmbedIE.ie_key())
+
+ # Look for megatv.com embeds
+ megatvcom_urls = list(MegaTVComEmbedIE._extract_urls(webpage))
+ if megatvcom_urls:
+ return self.playlist_from_matches(
+ megatvcom_urls, video_id, video_title, ie=MegaTVComEmbedIE.ie_key())
+
# Look for WashingtonPost embeds
wapo_urls = WashingtonPostIE._extract_urls(webpage)
if wapo_urls:
@@ -3568,10 +3698,32 @@ class GenericIE(InfoExtractor):
return self.playlist_from_matches(
rumble_urls, video_id, video_title, ie=RumbleEmbedIE.ie_key())
+ # Look for (tvopen|ethnos).gr embeds
+ tvopengr_urls = list(TVOpenGrEmbedIE._extract_urls(webpage))
+ if tvopengr_urls:
+ return self.playlist_from_matches(tvopengr_urls, video_id, video_title, ie=TVOpenGrEmbedIE.ie_key())
+
+ # Look for ert.gr webtv embeds
+ ertwebtv_urls = list(ERTWebtvEmbedIE._extract_urls(webpage))
+ if len(ertwebtv_urls) == 1:
+ return self.url_result(self._proto_relative_url(ertwebtv_urls[0]), video_title=video_title, url_transparent=True)
+ elif ertwebtv_urls:
+ return self.playlist_from_matches(ertwebtv_urls, video_id, video_title, ie=ERTWebtvEmbedIE.ie_key())
+
tvp_urls = TVPEmbedIE._extract_urls(webpage)
if tvp_urls:
return self.playlist_from_matches(tvp_urls, video_id, video_title, ie=TVPEmbedIE.ie_key())
+ # Look for MainStreaming embeds
+ mainstreaming_urls = MainStreamingIE._extract_urls(webpage)
+ if mainstreaming_urls:
+ return self.playlist_from_matches(mainstreaming_urls, video_id, video_title, ie=MainStreamingIE.ie_key())
+
+ # Look for Gfycat Embeds
+ gfycat_urls = GfycatIE._extract_urls(webpage)
+ if gfycat_urls:
+ return self.playlist_from_matches(gfycat_urls, video_id, video_title, ie=GfycatIE.ie_key())
+
# Look for HTML5 media
entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
if entries:
@@ -3663,12 +3815,16 @@ class GenericIE(InfoExtractor):
# Looking for http://schema.org/VideoObject
json_ld = self._search_json_ld(webpage, video_id, default={})
- if json_ld.get('url'):
+ if json_ld.get('url') not in (url, None):
self.report_detected('JSON LD')
- if determine_ext(json_ld.get('url')) == 'm3u8':
+ if determine_ext(json_ld['url']) == 'm3u8':
json_ld['formats'], json_ld['subtitles'] = self._extract_m3u8_formats_and_subtitles(
json_ld['url'], video_id, 'mp4')
json_ld.pop('url')
+ self._sort_formats(json_ld['formats'])
+ else:
+ json_ld['_type'] = 'url_transparent'
+ json_ld['url'] = smuggle_url(json_ld['url'], {'force_videoid': video_id, 'to_generic': True})
return merge_dicts(json_ld, info_dict)
def check_video(vurl):
@@ -3723,20 +3879,21 @@ class GenericIE(InfoExtractor):
protocol, _, _ = url.partition('/')
thumbnail = protocol + thumbnail
+ url_keys = list(filter(re.compile(r'video_url|video_alt_url\d*').fullmatch, flashvars.keys()))
formats = []
- for key in ('video_url', 'video_alt_url', 'video_alt_url2'):
- if key in flashvars and '/get_file/' in flashvars[key]:
- next_format = {
- 'url': self._kvs_getrealurl(flashvars[key], flashvars['license_code']),
- 'format_id': flashvars.get(key + '_text', key),
- 'ext': 'mp4',
- }
- height = re.search(r'%s_(\d+)p\.mp4(?:/[?].*)?$' % flashvars['video_id'], flashvars[key])
- if height:
- next_format['height'] = int(height.group(1))
- else:
- next_format['quality'] = 1
- formats.append(next_format)
+ for key in url_keys:
+ if '/get_file/' not in flashvars[key]:
+ continue
+ format_id = flashvars.get(f'{key}_text', key)
+ formats.append({
+ 'url': self._kvs_getrealurl(flashvars[key], flashvars['license_code']),
+ 'format_id': format_id,
+ 'ext': 'mp4',
+ **(parse_resolution(format_id) or parse_resolution(flashvars[key]))
+ })
+ if not formats[-1].get('height'):
+ formats[-1]['quality'] = 1
+
self._sort_formats(formats)
return {
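
Note on the KVS flashvars hunk above: instead of a fixed `video_url`/`video_alt_url`/`video_alt_url2` trio, any `video_alt_url\d*` key is now accepted, and dimensions are recovered from the format label (or, failing that, the URL) via parse_resolution; only formats with no recognizable height keep the old `quality: 1` fallback. What parse_resolution yields for typical inputs:

    from yt_dlp.utils import parse_resolution

    parse_resolution('720p')         # {'height': 720}
    parse_resolution('1280x720')     # {'width': 1280, 'height': 720}
    parse_resolution('no-res-here')  # {} -- caller falls back to quality=1
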
diff --git a/yt_dlp/extractor/gfycat.py b/yt_dlp/extractor/gfycat.py
index 56a6dc03d..2ad03e2b2 100644
--- a/yt_dlp/extractor/gfycat.py
+++ b/yt_dlp/extractor/gfycat.py
@@ -1,6 +1,8 @@
# coding: utf-8
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
from ..utils import (
int_or_none,
@@ -11,7 +13,7 @@ from ..utils import (
class GfycatIE(InfoExtractor):
- _VALID_URL = r'https?://(?:(?:www|giant|thumbs)\.)?gfycat\.com/(?:ru/|ifr/|gifs/detail/)?(?P<id>[^-/?#\.]+)'
+ _VALID_URL = r'(?i)https?://(?:(?:www|giant|thumbs)\.)?gfycat\.com/(?:ru/|ifr/|gifs/detail/)?(?P<id>[^-/?#\."\']+)'
_TESTS = [{
'url': 'http://gfycat.com/DeadlyDecisiveGermanpinscher',
'info_dict': {
@@ -78,8 +80,19 @@ class GfycatIE(InfoExtractor):
}, {
'url': 'https://giant.gfycat.com/acceptablehappygoluckyharborporpoise.mp4',
'only_matching': True
+ }, {
+ 'url': 'http://gfycat.com/IFR/JauntyTimelyAmazontreeboa',
+ 'only_matching': True
}]
+ @staticmethod
+ def _extract_urls(webpage):
+ return [
+ mobj.group('url')
+ for mobj in re.finditer(
+ r'<(?:iframe|source)[^>]+\bsrc=["\'](?P<url>%s)' % GfycatIE._VALID_URL,
+ webpage)]
+
def _real_extract(self, url):
video_id = self._match_id(url)
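
Note on the gfycat.py change: the pattern becomes case-insensitive (for the uppercase /IFR/ embeds) and gains an `_extract_urls` hook consumed by the generic extractor above. One caveat: interpolating a `(?i)`-prefixed `_VALID_URL` into the middle of the iframe regex relies on Python accepting non-leading global flags, which newer interpreters (3.11+) reject; the toy below scopes the flag explicitly instead. The HTML is made up:

    import re

    VALID_URL = r'https?://(?:(?:www|giant|thumbs)\.)?gfycat\.com/(?:ru/|ifr/|gifs/detail/)?(?P<id>[^-/?#\."\']+)'
    webpage = '''
    <iframe src="https://gfycat.com/IFR/JauntyTimelyAmazontreeboa"></iframe>
    <source src="https://giant.gfycat.com/acceptablehappygoluckyharborporpoise.mp4">
    '''
    urls = [m.group('url') for m in re.finditer(
        r'<(?:iframe|source)[^>]+\bsrc=["\'](?P<url>%s)' % VALID_URL,
        webpage, flags=re.IGNORECASE)]
    print(urls)
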
diff --git a/yt_dlp/extractor/globo.py b/yt_dlp/extractor/globo.py
index a3f024157..8624a160a 100644
--- a/yt_dlp/extractor/globo.py
+++ b/yt_dlp/extractor/globo.py
@@ -12,6 +12,7 @@ from ..compat import (
compat_str,
)
from ..utils import (
+ HEADRequest,
ExtractorError,
float_or_none,
orderedSet,
@@ -67,11 +68,28 @@ class GloboIE(InfoExtractor):
}, {
'url': 'globo:3607726',
'only_matching': True,
+ }, {
+ 'url': 'https://globoplay.globo.com/v/10248083/',
+ 'info_dict': {
+ 'id': '10248083',
+ 'ext': 'mp4',
+ 'title': 'Melhores momentos: Equador 1 x 1 Brasil pelas Eliminatórias da Copa do Mundo 2022',
+ 'duration': 530.964,
+ 'uploader': 'SporTV',
+ 'uploader_id': '698',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
}]
def _real_extract(self, url):
video_id = self._match_id(url)
+ self._request_webpage(
+ HEADRequest('https://globo-ab.globo.com/v2/selected-alternatives?experiments=player-isolated-experiment-02&skipImpressions=true'),
+ video_id, 'Getting cookies')
+
video = self._download_json(
'http://api.globovideos.com/videos/%s/playlist' % video_id,
video_id)['videos'][0]
@@ -82,7 +100,7 @@ class GloboIE(InfoExtractor):
formats = []
security = self._download_json(
- 'https://playback.video.globo.com/v1/video-session', video_id, 'Downloading security hash for %s' % video_id,
+ 'https://playback.video.globo.com/v2/video-session', video_id, 'Downloading security hash for %s' % video_id,
headers={'content-type': 'application/json'}, data=json.dumps({
"player_type": "desktop",
"video_id": video_id,
@@ -92,7 +110,9 @@ class GloboIE(InfoExtractor):
"tz": "-3.0:00"
}).encode())
- security_hash = security['source']['token']
+ self._request_webpage(HEADRequest(security['sources'][0]['url_template']), video_id, 'Getting locksession cookie')
+
+ security_hash = security['sources'][0]['token']
if not security_hash:
message = security.get('message')
if message:
@@ -115,7 +135,7 @@ class GloboIE(InfoExtractor):
md5_data = (received_md5 + padded_sign_time + '0xAC10FD').encode()
signed_md5 = base64.urlsafe_b64encode(hashlib.md5(md5_data).digest()).decode().strip('=')
signed_hash = hash_prefix + padded_sign_time + signed_md5
- source = security['source']['url_parts']
+ source = security['sources'][0]['url_parts']
resource_url = source['scheme'] + '://' + source['domain'] + source['path']
signed_url = '%s?h=%s&k=html5&a=%s' % (resource_url, signed_hash, 'F' if video.get('subscriber_only') else 'A')
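
Note on the globo.py change: it moves to the v2 video-session endpoint (tokens now live under sources[0]) and primes a locksession cookie with a HEAD request before signing. The signing arithmetic itself is unchanged and fully visible in the hunk above; worked through with placeholder inputs -- only the transformation is taken from the code:

    import base64
    import hashlib

    received_md5 = '0123456789abcdef'               # placeholder
    hash_prefix, padded_sign_time = 'ABCD', '0000001644000000'  # placeholders

    md5_data = (received_md5 + padded_sign_time + '0xAC10FD').encode()
    signed_md5 = base64.urlsafe_b64encode(hashlib.md5(md5_data).digest()).decode().strip('=')
    signed_hash = hash_prefix + padded_sign_time + signed_md5
    signed_url = '%s?h=%s&k=html5&a=%s' % ('https://example.invalid/video', signed_hash, 'A')
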
diff --git a/yt_dlp/extractor/glomex.py b/yt_dlp/extractor/glomex.py
new file mode 100644
index 000000000..d9ef4338f
--- /dev/null
+++ b/yt_dlp/extractor/glomex.py
@@ -0,0 +1,220 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import urllib.parse
+
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ extract_attributes,
+ ExtractorError,
+ int_or_none,
+ parse_qs,
+ smuggle_url,
+ unescapeHTML,
+ unsmuggle_url,
+)
+
+
+class GlomexBaseIE(InfoExtractor):
+ _DEFAULT_ORIGIN_URL = 'https://player.glomex.com/'
+ _API_URL = 'https://integration-cloudfront-eu-west-1.mes.glomex.cloud/'
+
+ @staticmethod
+ def _smuggle_origin_url(url, origin_url):
+ if origin_url is None:
+ return url
+ return smuggle_url(url, {'origin': origin_url})
+
+ @classmethod
+ def _unsmuggle_origin_url(cls, url, fallback_origin_url=None):
+ defaults = {'origin': fallback_origin_url or cls._DEFAULT_ORIGIN_URL}
+ unsmuggled_url, data = unsmuggle_url(url, default=defaults)
+ return unsmuggled_url, data['origin']
+
+ def _get_videoid_type(self, video_id):
+ _VIDEOID_TYPES = {
+ 'v': 'video',
+ 'pl': 'playlist',
+ 'rl': 'related videos playlist',
+ 'cl': 'curated playlist',
+ }
+ prefix = video_id.split('-')[0]
+ return _VIDEOID_TYPES.get(prefix, 'unknown type')
+
+ def _download_api_data(self, video_id, integration, current_url=None):
+ query = {
+ 'integration_id': integration,
+ 'playlist_id': video_id,
+ 'current_url': current_url or self._DEFAULT_ORIGIN_URL,
+ }
+ video_id_type = self._get_videoid_type(video_id)
+ return self._download_json(
+ self._API_URL,
+ video_id, 'Downloading %s JSON' % video_id_type,
+ 'Unable to download %s JSON' % video_id_type,
+ query=query)
+
+ def _download_and_extract_api_data(self, video_id, integration, current_url):
+ api_data = self._download_api_data(video_id, integration, current_url)
+ videos = api_data['videos']
+ if not videos:
+ raise ExtractorError('no videos found for %s' % video_id)
+ videos = [self._extract_api_data(video, video_id) for video in videos]
+ return videos[0] if len(videos) == 1 else self.playlist_result(videos, video_id)
+
+ def _extract_api_data(self, video, video_id):
+ if video.get('error_code') == 'contentGeoblocked':
+ self.raise_geo_restricted(countries=video['geo_locations'])
+
+ formats, subs = [], {}
+ for format_id, format_url in video['source'].items():
+ ext = determine_ext(format_url)
+ if ext == 'm3u8':
+ formats_, subs_ = self._extract_m3u8_formats_and_subtitles(
+ format_url, video_id, 'mp4', m3u8_id=format_id,
+ fatal=False)
+ formats.extend(formats_)
+ self._merge_subtitles(subs_, target=subs)
+ else:
+ formats.append({
+ 'url': format_url,
+ 'format_id': format_id,
+ })
+ if video.get('language'):
+ for fmt in formats:
+ fmt['language'] = video['language']
+ self._sort_formats(formats)
+
+ images = (video.get('images') or []) + [video.get('image') or {}]
+ thumbnails = [{
+ 'id': image.get('id'),
+ 'url': f'{image["url"]}/profile:player-960x540',
+ 'width': 960,
+ 'height': 540,
+ } for image in images if image.get('url')]
+ self._remove_duplicate_formats(thumbnails)
+
+ return {
+ 'id': video.get('clip_id') or video_id,
+ 'title': video.get('title'),
+ 'description': video.get('description'),
+ 'thumbnails': thumbnails,
+ 'duration': int_or_none(video.get('clip_duration')),
+ 'timestamp': video.get('created_at'),
+ 'formats': formats,
+ 'subtitles': subs,
+ }
+
+
+class GlomexIE(GlomexBaseIE):
+ IE_NAME = 'glomex'
+ IE_DESC = 'Glomex videos'
+ _VALID_URL = r'https?://video\.glomex\.com/[^/]+/(?P<id>v-[^-]+)'
+ _INTEGRATION_ID = '19syy24xjn1oqlpc'
+
+ _TESTS = [{
+ 'url': 'https://video.glomex.com/sport/v-cb24uwg77hgh-nach-2-0-sieg-guardiola-mit-mancity-vor-naechstem-titel',
+ 'md5': 'cec33a943c4240c9cb33abea8c26242e',
+ 'info_dict': {
+ 'id': 'v-cb24uwg77hgh',
+ 'ext': 'mp4',
+ 'title': 'md5:38a90cedcfadd72982c81acf13556e0c',
+ 'description': 'md5:1ea6b6caff1443fcbbba159e432eedb8',
+ 'duration': 29600,
+ 'timestamp': 1619895017,
+ 'upload_date': '20210501',
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ return self.url_result(
+ GlomexEmbedIE.build_player_url(video_id, self._INTEGRATION_ID, url),
+ GlomexEmbedIE.ie_key(), video_id)
+
+
+class GlomexEmbedIE(GlomexBaseIE):
+ IE_NAME = 'glomex:embed'
+ IE_DESC = 'Glomex embedded videos'
+ _BASE_PLAYER_URL = '//player.glomex.com/integration/1/iframe-player.html'
+ _BASE_PLAYER_URL_RE = re.escape(_BASE_PLAYER_URL).replace('/1/', r'/[^/]/')
+ _VALID_URL = rf'https?:{_BASE_PLAYER_URL_RE}\?([^#]+&)?playlistId=(?P<id>[^#&]+)'
+
+ _TESTS = [{
+ 'url': 'https://player.glomex.com/integration/1/iframe-player.html?integrationId=4059a013k56vb2yd&playlistId=v-cfa6lye0dkdd-sf',
+ 'md5': '68f259b98cc01918ac34180142fce287',
+ 'info_dict': {
+ 'id': 'v-cfa6lye0dkdd-sf',
+ 'ext': 'mp4',
+ 'timestamp': 1635337199,
+ 'duration': 133080,
+ 'upload_date': '20211027',
+ 'description': 'md5:e741185fc309310ff5d0c789b437be66',
+ 'title': 'md5:35647293513a6c92363817a0fb0a7961',
+ },
+ }, {
+ 'url': 'https://player.glomex.com/integration/1/iframe-player.html?origin=fullpage&integrationId=19syy24xjn1oqlpc&playlistId=rl-vcb49w1fb592p&playlistIndex=0',
+ 'info_dict': {
+ 'id': 'rl-vcb49w1fb592p',
+ },
+ 'playlist_count': 100,
+ }, {
+ 'url': 'https://player.glomex.com/integration/1/iframe-player.html?playlistId=cl-bgqaata6aw8x&integrationId=19syy24xjn1oqlpc',
+ 'info_dict': {
+ 'id': 'cl-bgqaata6aw8x',
+ },
+ 'playlist_mincount': 2,
+ }]
+
+ @classmethod
+ def build_player_url(cls, video_id, integration, origin_url=None):
+ query_string = urllib.parse.urlencode({
+ 'playlistId': video_id,
+ 'integrationId': integration,
+ })
+ return cls._smuggle_origin_url(f'https:{cls._BASE_PLAYER_URL}?{query_string}', origin_url)
+
+ @classmethod
+ def _extract_urls(cls, webpage, origin_url):
+ # https://docs.glomex.com/publisher/video-player-integration/javascript-api/
+ quot_re = r'["\']'
+
+ regex = fr'''(?x)
+ <iframe[^>]+?src=(?P<q>{quot_re})(?P<url>
+ (?:https?:)?{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=q)).)+
+ )(?P=q)'''
+ for mobj in re.finditer(regex, webpage):
+ url = unescapeHTML(mobj.group('url'))
+ if cls.suitable(url):
+ yield cls._smuggle_origin_url(url, origin_url)
+
+ regex = fr'''(?x)
+ <glomex-player [^>]+?>|
+ <div[^>]* data-glomex-player=(?P<q>{quot_re})true(?P=q)[^>]*>'''
+ for mobj in re.finditer(regex, webpage):
+ attrs = extract_attributes(mobj.group(0))
+ if attrs.get('data-integration-id') and attrs.get('data-playlist-id'):
+ yield cls.build_player_url(attrs['data-playlist-id'], attrs['data-integration-id'], origin_url)
+
+ # naive parsing of inline scripts for hard-coded integration parameters
+ regex = fr'''(?x)
+ (?P<is_js>dataset\.)?%s\s*(?(is_js)=|:)\s*
+ (?P<q>{quot_re})(?P<id>(?:(?!(?P=q)).)+)(?P=q)\s'''
+ for mobj in re.finditer(r'(?x)<script[^<]*>.+?</script>', webpage):
+ script = mobj.group(0)
+ integration_id = re.search(regex % 'integrationId', script)
+ if not integration_id:
+ continue
+ playlist_id = re.search(regex % 'playlistId', script)
+ if playlist_id:
+                yield cls.build_player_url(playlist_id.group('id'), integration_id.group('id'), origin_url)  # pass the captured ids, not the Match objects
+
+ def _real_extract(self, url):
+ url, origin_url = self._unsmuggle_origin_url(url)
+ playlist_id = self._match_id(url)
+ integration = parse_qs(url).get('integrationId', [None])[0]
+ if not integration:
+ raise ExtractorError('No integrationId in URL', expected=True)
+ return self._download_and_extract_api_data(playlist_id, integration, origin_url)
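
Note on the new glomex.py: the API wants the URL of the page the player was embedded in (current_url), context that would normally be lost when GenericIE hands a bare player URL to GlomexEmbedIE -- so the base class smuggles the origin into the URL and unsmuggles it with a sane default. A round trip of that mechanism:

    from yt_dlp.utils import smuggle_url, unsmuggle_url

    player = ('https://player.glomex.com/integration/1/iframe-player.html'
              '?integrationId=xyz&playlistId=v-abc')
    smuggled = smuggle_url(player, {'origin': 'https://www.example.com/article'})
    url, data = unsmuggle_url(smuggled, default={'origin': 'https://player.glomex.com/'})
    assert url == player and data['origin'] == 'https://www.example.com/article'
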
diff --git a/yt_dlp/extractor/googlesearch.py b/yt_dlp/extractor/googlesearch.py
index f605c0c35..4b8b1bcbb 100644
--- a/yt_dlp/extractor/googlesearch.py
+++ b/yt_dlp/extractor/googlesearch.py
@@ -8,36 +8,33 @@ from .common import SearchInfoExtractor
class GoogleSearchIE(SearchInfoExtractor):
IE_DESC = 'Google Video search'
- _MAX_RESULTS = 1000
IE_NAME = 'video.google:search'
_SEARCH_KEY = 'gvsearch'
- _WORKING = False
- _TEST = {
+ _TESTS = [{
'url': 'gvsearch15:python language',
'info_dict': {
'id': 'python language',
'title': 'python language',
},
'playlist_count': 15,
- }
+ }]
+ _PAGE_SIZE = 100
def _search_results(self, query):
for pagenum in itertools.count():
webpage = self._download_webpage(
- 'http://www.google.com/search',
- 'gvsearch:' + query,
- note='Downloading result page %s' % (pagenum + 1),
+ 'http://www.google.com/search', f'gvsearch:{query}',
+ note=f'Downloading result page {pagenum + 1}',
query={
'tbm': 'vid',
'q': query,
- 'start': pagenum * 10,
+ 'start': pagenum * self._PAGE_SIZE,
+ 'num': self._PAGE_SIZE,
'hl': 'en',
})
- for hit_idx, mobj in enumerate(re.finditer(
- r'<h3 class="r"><a href="([^"]+)"', webpage)):
- if re.search(f'id="vidthumb{hit_idx + 1}"', webpage):
- yield self.url_result(mobj.group(1))
+ for url in re.findall(r'<div[^>]* class="dXiKIc"[^>]*><a href="([^"]+)"', webpage):
+ yield self.url_result(url)
if not re.search(r'id="pnnext"', webpage):
return
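
Note on googlesearch.py: the extractor is un-broken (`_WORKING = False` removed). It now pulls 100 results per request via `num`, pages with `start`, scrapes the current result-card markup, and stops when the pnnext pager disappears. The loop skeleton, with fetching abstracted out; both the dXiKIc class name and the pager id are Google-internal markup and liable to change without notice:

    import itertools
    import re

    PAGE_SIZE = 100

    def search_results(query, fetch):  # fetch(url, query_dict) -> HTML string
        for pagenum in itertools.count():
            webpage = fetch('http://www.google.com/search', {
                'tbm': 'vid', 'q': query, 'hl': 'en',
                'start': pagenum * PAGE_SIZE, 'num': PAGE_SIZE})
            yield from re.findall(
                r'<div[^>]* class="dXiKIc"[^>]*><a href="([^"]+)"', webpage)
            if not re.search(r'id="pnnext"', webpage):
                return
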
diff --git a/yt_dlp/extractor/hotstar.py b/yt_dlp/extractor/hotstar.py
index de2b30cf7..a0ce1f10a 100644
--- a/yt_dlp/extractor/hotstar.py
+++ b/yt_dlp/extractor/hotstar.py
@@ -203,6 +203,9 @@ class HotStarIE(HotStarBaseIE):
format_url = re.sub(
r'(?<=//staragvod)(\d)', r'web\1', format_url)
tags = str_or_none(playback_set.get('tagsCombination')) or ''
+            ignored_res, ignored_vcodec, ignored_dr = self._configuration_arg('res'), self._configuration_arg('vcodec'), self._configuration_arg('dr')
+            if any(f'resolution:{ig_res}' in tags for ig_res in ignored_res) or any(f'video_codec:{ig_vc}' in tags for ig_vc in ignored_vcodec) or any(f'dynamic_range:{ig_dr}' in tags for ig_dr in ignored_dr):
+ continue
ext = determine_ext(format_url)
current_formats, current_subs = [], {}
try:
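
Note on the hotstar.py addition: the new block reads `res`/`vcodec`/`dr` extractor arguments (i.e. `--extractor-args "hotstar:res=sd"` on the CLI) and skips any playback set whose tagsCombination mentions an ignored value. The predicate in isolation; the `key:value` tag syntax is inferred from the checks themselves:

    def should_skip(tags, ignored_res=(), ignored_vcodec=(), ignored_dr=()):
        return (any(f'resolution:{r}' in tags for r in ignored_res)
                or any(f'video_codec:{vc}' in tags for vc in ignored_vcodec)
                or any(f'dynamic_range:{dr}' in tags for dr in ignored_dr))

    assert should_skip('resolution:sd;video_codec:h264', ignored_res=['sd'])
    assert not should_skip('resolution:hd;dynamic_range:sdr', ignored_dr=['hdr10'])
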
diff --git a/yt_dlp/extractor/hrfensehen.py b/yt_dlp/extractor/hrfensehen.py
index 2a994d471..e39ded254 100644
--- a/yt_dlp/extractor/hrfensehen.py
+++ b/yt_dlp/extractor/hrfensehen.py
@@ -26,13 +26,7 @@ class HRFernsehenIE(InfoExtractor):
}]},
'timestamp': 1598470200,
'upload_date': '20200826',
- 'thumbnails': [{
- 'url': 'https://www.hessenschau.de/tv-sendung/hs_ganz-1554~_t-1598465545029_v-16to9.jpg',
- 'id': '0'
- }, {
- 'url': 'https://www.hessenschau.de/tv-sendung/hs_ganz-1554~_t-1598465545029_v-16to9__medium.jpg',
- 'id': '1'
- }],
+ 'thumbnail': 'https://www.hessenschau.de/tv-sendung/hs_ganz-1554~_t-1598465545029_v-16to9__medium.jpg',
'title': 'hessenschau vom 26.08.2020'
}
}, {
@@ -81,7 +75,7 @@ class HRFernsehenIE(InfoExtractor):
description = self._html_search_meta(
['description'], webpage)
- loader_str = unescapeHTML(self._search_regex(r"data-hr-mediaplayer-loader='([^']*)'", webpage, "ardloader"))
+ loader_str = unescapeHTML(self._search_regex(r"data-new-hr-mediaplayer-loader='([^']*)'", webpage, "ardloader"))
loader_data = json.loads(loader_str)
info = {
diff --git a/yt_dlp/extractor/imggaming.py b/yt_dlp/extractor/imggaming.py
index 14d3fad55..bae74b290 100644
--- a/yt_dlp/extractor/imggaming.py
+++ b/yt_dlp/extractor/imggaming.py
@@ -64,10 +64,7 @@ class ImgGamingBaseIE(InfoExtractor):
domain, media_type, media_id, playlist_id = self._match_valid_url(url).groups()
if playlist_id:
- if self.get_param('noplaylist'):
- self.to_screen('Downloading just video %s because of --no-playlist' % media_id)
- else:
- self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % playlist_id)
+ if self._yes_playlist(playlist_id, media_id):
media_type, media_id = 'playlist', playlist_id
if media_type == 'playlist':
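
Note on the imggaming.py change (litv below gets the same treatment): six lines of hand-rolled --no-playlist prompting become one call to the shared `_yes_playlist` helper. A minimal sketch of the pattern the helper encapsulates; the real method also takes customizable labels:

    def yes_playlist(params, playlist_id, video_id, smuggled_data=None):
        """Return True to extract the playlist, False for just the video."""
        force = (smuggled_data or {}).get('force_noplaylist')
        if force is not None:
            return not force
        if params.get('noplaylist'):
            print(f'Downloading just video {video_id} because of --no-playlist')
            return False
        print(f'Downloading playlist {playlist_id} - add --no-playlist to download just the video')
        return True
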
diff --git a/yt_dlp/extractor/instagram.py b/yt_dlp/extractor/instagram.py
index ab14e5b0a..a2cc9f748 100644
--- a/yt_dlp/extractor/instagram.py
+++ b/yt_dlp/extractor/instagram.py
@@ -12,11 +12,13 @@ from ..compat import (
)
from ..utils import (
ExtractorError,
+ format_field,
float_or_none,
get_element_by_attribute,
int_or_none,
lowercase_escape,
std_headers,
+ str_or_none,
str_to_int,
traverse_obj,
url_or_none,
@@ -126,6 +128,74 @@ class InstagramBaseIE(InfoExtractor):
'like_count': self._get_count(node, 'likes', 'preview_like'),
}
+ def _extract_product_media(self, product_media):
+ media_id = product_media.get('code') or product_media.get('id')
+ vcodec = product_media.get('video_codec')
+ dash_manifest_raw = product_media.get('video_dash_manifest')
+ videos_list = product_media.get('video_versions')
+ if not (dash_manifest_raw or videos_list):
+ return {}
+
+ formats = [{
+ 'format_id': format.get('id'),
+ 'url': format.get('url'),
+ 'width': format.get('width'),
+ 'height': format.get('height'),
+ 'vcodec': vcodec,
+ } for format in videos_list or []]
+ if dash_manifest_raw:
+ formats.extend(self._parse_mpd_formats(self._parse_xml(dash_manifest_raw, media_id), mpd_id='dash'))
+ self._sort_formats(formats)
+
+ thumbnails = [{
+ 'url': thumbnail.get('url'),
+ 'width': thumbnail.get('width'),
+ 'height': thumbnail.get('height')
+ } for thumbnail in traverse_obj(product_media, ('image_versions2', 'candidates')) or []]
+ return {
+ 'id': media_id,
+ 'duration': float_or_none(product_media.get('video_duration')),
+ 'formats': formats,
+ 'thumbnails': thumbnails
+ }
+
+ def _extract_product(self, product_info):
+ if isinstance(product_info, list):
+ product_info = product_info[0]
+
+ user_info = product_info.get('user') or {}
+ info_dict = {
+ 'id': product_info.get('code') or product_info.get('id'),
+ 'title': product_info.get('title') or f'Video by {user_info.get("username")}',
+ 'description': traverse_obj(product_info, ('caption', 'text'), expected_type=str_or_none),
+ 'timestamp': int_or_none(product_info.get('taken_at')),
+ 'channel': user_info.get('username'),
+ 'uploader': user_info.get('full_name'),
+ 'uploader_id': str_or_none(user_info.get('pk')),
+ 'view_count': int_or_none(product_info.get('view_count')),
+ 'like_count': int_or_none(product_info.get('like_count')),
+ 'comment_count': int_or_none(product_info.get('comment_count')),
+ 'http_headers': {
+ 'Referer': 'https://www.instagram.com/',
+ }
+ }
+ carousel_media = product_info.get('carousel_media')
+ if carousel_media:
+ return {
+ '_type': 'playlist',
+ **info_dict,
+ 'title': f'Post by {user_info.get("username")}',
+ 'entries': [{
+ **info_dict,
+ **self._extract_product_media(product_media),
+ } for product_media in carousel_media],
+ }
+
+ return {
+ **info_dict,
+ **self._extract_product_media(product_info)
+ }
+
class InstagramIOSIE(InfoExtractor):
IE_DESC = 'IOS instagram:// URL'
@@ -184,8 +254,9 @@ class InstagramIE(InstagramBaseIE):
'duration': 0,
'timestamp': 1371748545,
'upload_date': '20130620',
- 'uploader_id': 'naomipq',
+ 'uploader_id': '2815873',
'uploader': 'B E A U T Y F O R A S H E S',
+ 'channel': 'naomipq',
'like_count': int,
'comment_count': int,
'comments': list,
@@ -201,8 +272,9 @@ class InstagramIE(InstagramBaseIE):
'duration': 0,
'timestamp': 1453760977,
'upload_date': '20160125',
- 'uploader_id': 'britneyspears',
+ 'uploader_id': '12246775',
'uploader': 'Britney Spears',
+ 'channel': 'britneyspears',
'like_count': int,
'comment_count': int,
'comments': list,
@@ -248,8 +320,9 @@ class InstagramIE(InstagramBaseIE):
'duration': 53.83,
'timestamp': 1530032919,
'upload_date': '20180626',
- 'uploader_id': 'instagram',
+ 'uploader_id': '25025320',
'uploader': 'Instagram',
+ 'channel': 'instagram',
'like_count': int,
'comment_count': int,
'comments': list,
@@ -315,16 +388,19 @@ class InstagramIE(InstagramBaseIE):
if not media:
additional_data = self._parse_json(
self._search_regex(
- r'window\.__additionalDataLoaded\s*\(\s*[^,]+,\s*({.+?})\s*\)\s*;',
+ r'window\.__additionalDataLoaded\s*\(\s*[^,]+,\s*({.+?})\s*\);',
webpage, 'additional data', default='{}'),
video_id, fatal=False)
+ product_item = traverse_obj(additional_data, ('items', 0), expected_type=dict)
+ if product_item:
+ return self._extract_product(product_item)
media = traverse_obj(additional_data, ('graphql', 'shortcode_media'), 'shortcode_media', expected_type=dict) or {}
if not media and 'www.instagram.com/accounts/login' in urlh.geturl():
self.raise_login_required('You need to log in to access this content')
- uploader_id = traverse_obj(media, ('owner', 'username')) or self._search_regex(
- r'"owner"\s*:\s*{\s*"username"\s*:\s*"(.+?)"', webpage, 'uploader id', fatal=False)
+ username = traverse_obj(media, ('owner', 'username')) or self._search_regex(
+ r'"owner"\s*:\s*{\s*"username"\s*:\s*"(.+?)"', webpage, 'username', fatal=False)
description = (
traverse_obj(media, ('edge_media_to_caption', 'edges', 0, 'node', 'text'), expected_type=str)
@@ -341,7 +417,7 @@ class InstagramIE(InstagramBaseIE):
if nodes:
return self.playlist_result(
self._extract_nodes(nodes, True), video_id,
- 'Post by %s' % uploader_id if uploader_id else None, description)
+ format_field(username, template='Post by %s'), description)
video_url = self._og_search_video_url(webpage, secure=False)
@@ -377,12 +453,13 @@ class InstagramIE(InstagramBaseIE):
return {
'id': video_id,
'formats': formats,
- 'title': media.get('title') or 'Video by %s' % uploader_id,
+ 'title': media.get('title') or 'Video by %s' % username,
'description': description,
'duration': float_or_none(media.get('video_duration')),
'timestamp': traverse_obj(media, 'taken_at_timestamp', 'date', expected_type=int_or_none),
- 'uploader_id': uploader_id,
+ 'uploader_id': traverse_obj(media, ('owner', 'id')),
'uploader': traverse_obj(media, ('owner', 'full_name')),
+ 'channel': username,
'like_count': self._get_count(media, 'likes', 'preview_like') or str_to_int(self._search_regex(
r'data-log-event="likeCountClick"[^>]*>[^\d]*([\d,\.]+)', webpage, 'like count', fatal=False)),
'comment_count': self._get_count(media, 'comments', 'preview_comment', 'to_comment', 'to_parent_comment'),
@@ -577,42 +654,23 @@ class InstagramStoryIE(InstagramBaseIE):
'X-ASBD-ID': 198387,
'X-IG-WWW-Claim': 0,
})['reels']
- entites = []
-
- videos = traverse_obj(videos, (f'highlight:{story_id}', 'items'), (str(user_id), 'items'))
- for video_info in videos:
- formats = []
- if isinstance(video_info, list):
- video_info = video_info[0]
- vcodec = video_info.get('video_codec')
- dash_manifest_raw = video_info.get('video_dash_manifest')
- videos_list = video_info.get('video_versions')
- if not (dash_manifest_raw or videos_list):
- continue
- for format in videos_list:
- formats.append({
- 'url': format.get('url'),
- 'width': format.get('width'),
- 'height': format.get('height'),
- 'vcodec': vcodec,
- })
- if dash_manifest_raw:
- formats.extend(self._parse_mpd_formats(self._parse_xml(dash_manifest_raw, story_id), mpd_id='dash'))
- self._sort_formats(formats)
- thumbnails = [{
- 'url': thumbnail.get('url'),
- 'width': thumbnail.get('width'),
- 'height': thumbnail.get('height')
- } for thumbnail in traverse_obj(video_info, ('image_versions2', 'candidates')) or []]
- entites.append({
- 'id': video_info.get('id'),
- 'title': f'Story by {username}',
- 'timestamp': int_or_none(video_info.get('taken_at')),
- 'uploader': traverse_obj(videos, ('user', 'full_name')),
- 'duration': float_or_none(video_info.get('video_duration')),
- 'uploader_id': user_id,
- 'thumbnails': thumbnails,
- 'formats': formats,
- })
-
- return self.playlist_result(entites, playlist_id=story_id, playlist_title=highlight_title)
+
+ full_name = traverse_obj(videos, ('user', 'full_name'))
+
+ user_info = {}
+ if not (username and username != 'highlights' and full_name):
+ user_info = self._download_json(
+ f'https://i.instagram.com/api/v1/users/{user_id}/info/', story_id, headers={
+ 'User-Agent': 'Mozilla/5.0 (Linux; Android 11; SM-A505F Build/RP1A.200720.012; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/96.0.4664.45 Mobile Safari/537.36 Instagram 214.1.0.29.120 Android (30/11; 450dpi; 1080x2122; samsung; SM-A505F; a50; exynos9610; en_US; 333717274)',
+ }, note='Downloading user info')
+
+ username = traverse_obj(user_info, ('user', 'username')) or username
+ full_name = traverse_obj(user_info, ('user', 'full_name')) or full_name
+
+ highlights = traverse_obj(videos, (f'highlight:{story_id}', 'items'), (str(user_id), 'items'))
+ return self.playlist_result([{
+ **self._extract_product(highlight),
+ 'title': f'Story by {username}',
+ 'uploader': full_name,
+ 'uploader_id': user_id,
+ } for highlight in highlights], playlist_id=story_id, playlist_title=highlight_title)
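
Note on the instagram.py refactor: post parsing is centralized in `_extract_product`/`_extract_product_media`, which the story extractor above now reuses; per the updated tests, uploader_id becomes the numeric owner id while the handle moves to the new channel field. For carousel posts, post-level metadata is copied into every entry and per-media fields override it -- the merge pattern in miniature:

    # dict-unpacking precedence: the later ** wins, so media fields override post fields
    info = {'id': 'POST', 'uploader': 'someone', 'timestamp': 1600000000}
    media = [{'id': 'm1', 'duration': 12.0}, {'id': 'm2', 'duration': 8.5}]
    playlist = {'_type': 'playlist', **info,
                'entries': [{**info, **m} for m in media]}
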
diff --git a/yt_dlp/extractor/itv.py b/yt_dlp/extractor/itv.py
index bdd6af688..f1591403f 100644
--- a/yt_dlp/extractor/itv.py
+++ b/yt_dlp/extractor/itv.py
@@ -243,8 +243,8 @@ class ITVBTCCIE(InfoExtractor):
webpage = self._download_webpage(url, playlist_id)
- json_map = try_get(self._parse_json(self._html_search_regex(
- '(?s)<script[^>]+id=[\'"]__NEXT_DATA__[^>]*>([^<]+)</script>', webpage, 'json_map'), playlist_id),
+ json_map = try_get(
+ self._search_nextjs_data(webpage, playlist_id),
lambda x: x['props']['pageProps']['article']['body']['content']) or []
entries = []
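
Note on the itv.py change: the inline `__NEXT_DATA__` scraping is replaced by the `_search_nextjs_data` helper. Functionally it boils down to the regex the old code used (the real helper layers yt-dlp's JSON parsing and error handling on top):

    import json
    import re

    def search_nextjs_data(webpage):
        mobj = re.search(
            r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[^>]*>([^<]+)</script>', webpage)
        return json.loads(mobj.group(1)) if mobj else None
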
diff --git a/yt_dlp/extractor/joj.py b/yt_dlp/extractor/joj.py
index 637618183..7350f537c 100644
--- a/yt_dlp/extractor/joj.py
+++ b/yt_dlp/extractor/joj.py
@@ -6,6 +6,7 @@ import re
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
+ format_field,
int_or_none,
js_to_json,
try_get,
@@ -72,7 +73,7 @@ class JojIE(InfoExtractor):
r'(\d+)[pP]\.', format_url, 'height', default=None)
formats.append({
'url': format_url,
- 'format_id': '%sp' % height if height else None,
+ 'format_id': format_field(height, template='%sp'),
'height': int(height),
})
if not formats:
diff --git a/yt_dlp/extractor/kakao.py b/yt_dlp/extractor/kakao.py
index 97c986d8c..483ab7128 100644
--- a/yt_dlp/extractor/kakao.py
+++ b/yt_dlp/extractor/kakao.py
@@ -3,10 +3,12 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import compat_str
+from ..compat import compat_HTTPError
from ..utils import (
+ ExtractorError,
int_or_none,
strip_or_none,
+ str_or_none,
traverse_obj,
unified_timestamp,
)
@@ -24,10 +26,17 @@ class KakaoIE(InfoExtractor):
'id': '301965083',
'ext': 'mp4',
'title': '乃木坂46 バナナマン 「3期生紹介コーナーが始動!顔高低差GPも!」 『乃木坂工事中』',
- 'uploader_id': 2671005,
+ 'description': '',
+ 'uploader_id': '2671005',
'uploader': '그랑그랑이',
'timestamp': 1488160199,
'upload_date': '20170227',
+ 'like_count': int,
+ 'thumbnail': r're:http://.+/thumb\.png',
+ 'tags': ['乃木坂'],
+ 'view_count': int,
+ 'duration': 1503,
+ 'comment_count': int,
}
}, {
'url': 'http://tv.kakao.com/channel/2653210/cliplink/300103180',
@@ -37,11 +46,21 @@ class KakaoIE(InfoExtractor):
'ext': 'mp4',
'description': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)\r\n\r\n[쇼! 음악중심] 20160611, 507회',
'title': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)',
- 'uploader_id': 2653210,
+ 'uploader_id': '2653210',
'uploader': '쇼! 음악중심',
'timestamp': 1485684628,
'upload_date': '20170129',
+ 'like_count': int,
+ 'thumbnail': r're:http://.+/thumb\.png',
+ 'tags': 'count:28',
+ 'view_count': int,
+ 'duration': 184,
+ 'comment_count': int,
}
+ }, {
+ # geo restricted
+ 'url': 'https://tv.kakao.com/channel/3643855/cliplink/412069491',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -73,19 +92,24 @@ class KakaoIE(InfoExtractor):
title = clip.get('title') or clip_link.get('displayTitle')
formats = []
- for fmt in clip.get('videoOutputList', []):
+ for fmt in clip.get('videoOutputList') or []:
profile_name = fmt.get('profile')
if not profile_name or profile_name == 'AUDIO':
continue
query.update({
'profile': profile_name,
- 'fields': '-*,url',
+ 'fields': '-*,code,message,url',
})
+ try:
+ fmt_url_json = self._download_json(
+ cdn_api_base, video_id, query=query,
+ note='Downloading video URL for profile %s' % profile_name)
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+ resp = self._parse_json(e.cause.read().decode(), video_id)
+ if resp.get('code') == 'GeoBlocked':
+ self.raise_geo_restricted()
- fmt_url_json = self._download_json(
- cdn_api_base, video_id,
- 'Downloading video URL for profile %s' % profile_name,
- query=query, fatal=False)
fmt_url = traverse_obj(fmt_url_json, ('videoLocation', 'url'))
if not fmt_url:
continue
@@ -105,7 +129,7 @@ class KakaoIE(InfoExtractor):
for thumb in clip.get('clipChapterThumbnailList') or []:
thumbs.append({
'url': thumb.get('thumbnailUrl'),
- 'id': compat_str(thumb.get('timeInSec')),
+ 'id': str(thumb.get('timeInSec')),
'preference': -1 if thumb.get('isDefault') else 0
})
top_thumbnail = clip.get('thumbnailUrl')
@@ -120,7 +144,7 @@ class KakaoIE(InfoExtractor):
'title': title,
'description': strip_or_none(clip.get('description')),
'uploader': traverse_obj(clip_link, ('channel', 'name')),
- 'uploader_id': clip_link.get('channelId'),
+ 'uploader_id': str_or_none(clip_link.get('channelId')),
'thumbnails': thumbs,
'timestamp': unified_timestamp(clip_link.get('createTime')),
'duration': int_or_none(clip.get('duration')),
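
Note on the kakao.py change: the CDN API is now asked for code and message alongside url, and a 403 GeoBlocked response becomes the standard geo-restriction error instead of a silently skipped format (videoOutputList being null rather than absent is also handled). The error-body inspection stripped of yt-dlp plumbing -- compat_HTTPError is file-like, so the JSON body can be read straight off the exception:

    import json
    from urllib.error import HTTPError
    from urllib.request import urlopen

    def fetch_format(url):
        try:
            return json.load(urlopen(url))
        except HTTPError as e:
            if e.code == 403:
                resp = json.loads(e.read().decode())
                if resp.get('code') == 'GeoBlocked':
                    raise RuntimeError('This video is geo-restricted') from e
            raise
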
diff --git a/yt_dlp/extractor/kaltura.py b/yt_dlp/extractor/kaltura.py
index c8f60ef45..c58216458 100644
--- a/yt_dlp/extractor/kaltura.py
+++ b/yt_dlp/extractor/kaltura.py
@@ -12,6 +12,7 @@ from ..compat import (
from ..utils import (
clean_html,
ExtractorError,
+ format_field,
int_or_none,
unsmuggle_url,
smuggle_url,
@@ -372,6 +373,6 @@ class KalturaIE(InfoExtractor):
'thumbnail': info.get('thumbnailUrl'),
'duration': info.get('duration'),
'timestamp': info.get('createdAt'),
- 'uploader_id': info.get('userId') if info.get('userId') != 'None' else None,
+ 'uploader_id': format_field(info, 'userId', ignore=('None', None)),
'view_count': info.get('plays'),
}
diff --git a/yt_dlp/extractor/keezmovies.py b/yt_dlp/extractor/keezmovies.py
index 027f43cf0..06dbcbb40 100644
--- a/yt_dlp/extractor/keezmovies.py
+++ b/yt_dlp/extractor/keezmovies.py
@@ -8,6 +8,7 @@ from ..compat import compat_urllib_parse_unquote
from ..utils import (
determine_ext,
ExtractorError,
+ format_field,
int_or_none,
str_to_int,
strip_or_none,
@@ -69,7 +70,7 @@ class KeezMoviesIE(InfoExtractor):
video_url, title, 32).decode('utf-8')
formats.append({
'url': format_url,
- 'format_id': '%dp' % height if height else None,
+ 'format_id': format_field(height, template='%dp'),
'height': height,
'tbr': tbr,
})
diff --git a/yt_dlp/extractor/kelbyone.py b/yt_dlp/extractor/kelbyone.py
new file mode 100644
index 000000000..20c26cf48
--- /dev/null
+++ b/yt_dlp/extractor/kelbyone.py
@@ -0,0 +1,84 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class KelbyOneIE(InfoExtractor):
+ _VALID_URL = r'https?://members\.kelbyone\.com/course/(?P<id>[^$&?#/]+)'
+
+ _TESTS = [{
+ 'url': 'https://members.kelbyone.com/course/glyn-dewis-mastering-selections/',
+ 'playlist_mincount': 1,
+ 'info_dict': {
+ 'id': 'glyn-dewis-mastering-selections',
+ 'title': 'Trailer - Mastering Selections in Photoshop',
+ },
+ 'playlist': [{
+ 'info_dict': {
+ 'id': 'MkiOnLqK',
+ 'ext': 'mp4',
+ 'title': 'Trailer - Mastering Selections in Photoshop',
+ 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+ 'thumbnail': 'https://content.jwplatform.com/v2/media/MkiOnLqK/poster.jpg?width=720',
+ 'timestamp': 1601568639,
+ 'duration': 90,
+ 'upload_date': '20201001',
+ },
+ }]
+ }]
+
+ def _entries(self, playlist):
+ for item in playlist:
+ video_id = item['mediaid']
+ thumbnails = [{
+ 'url': image.get('src'),
+ 'width': int_or_none(image.get('width')),
+ } for image in item.get('images') or []]
+ formats, subtitles = [], {}
+ for source in item.get('sources') or []:
+ if not source.get('file'):
+ continue
+ if source.get('type') == 'application/vnd.apple.mpegurl':
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(source['file'], video_id)
+ formats.extend(fmts)
+ subtitles = self._merge_subtitles(subs, subtitles)
+ elif source.get('type') == 'audio/mp4':
+ formats.append({
+ 'format_id': source.get('label'),
+ 'url': source['file'],
+ 'vcodec': 'none',
+ })
+ else:
+ formats.append({
+ 'format_id': source.get('label'),
+ 'height': source.get('height'),
+ 'width': source.get('width'),
+ 'url': source['file'],
+ })
+        for track in item.get('tracks') or []:
+ if track.get('kind') == 'captions' and track.get('file'):
+ subtitles.setdefault('en', []).append({
+ 'url': track['file'],
+ })
+ self._sort_formats(formats)
+ yield {
+ 'id': video_id,
+ 'title': item['title'],
+ 'description': item.get('description'),
+ 'thumbnails': thumbnails,
+ 'thumbnail': item.get('image'),
+ 'timestamp': item.get('pubdate'),
+ 'duration': item.get('duration'),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
+
+ def _real_extract(self, url):
+ item_id = self._match_id(url)
+ webpage = self._download_webpage(url, item_id)
+ playlist_url = self._html_search_regex(r'playlist"\:"(https.*content\.jwplatform\.com.*json)"', webpage, 'playlist url').replace('\\', '')
+ course_data = self._download_json(playlist_url, item_id)
+ return self.playlist_result(self._entries(course_data['playlist']), item_id,
+ course_data.get('title'), course_data.get('description'))
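
Note on the new kelbyone.py: the course page embeds a JWPlatform playlist URL JSON-escaped in the markup, which the regex captures and the `.replace('\\', '')` un-escapes; each playlist item then carries sources (HLS, MP4 and audio-only renditions) plus caption tracks. The unescaping step with an illustrative value:

    import re

    page = r'{"playlist":"https:\/\/content.jwplatform.com\/v2\/playlists\/MkiOnLqK?format=json"}'
    mobj = re.search(r'playlist"\:"(https.*content\.jwplatform\.com.*json)"', page)
    print(mobj.group(1).replace('\\', ''))
    # https://content.jwplatform.com/v2/playlists/MkiOnLqK?format=json
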
diff --git a/yt_dlp/extractor/line.py b/yt_dlp/extractor/line.py
index e1d5f21e1..987c43430 100644
--- a/yt_dlp/extractor/line.py
+++ b/yt_dlp/extractor/line.py
@@ -5,95 +5,12 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
+ format_field,
int_or_none,
- js_to_json,
str_or_none,
)
-class LineTVIE(InfoExtractor):
- _VALID_URL = r'https?://tv\.line\.me/v/(?P<id>\d+)_[^/]+-(?P<segment>ep\d+-\d+)'
-
- _TESTS = [{
- 'url': 'https://tv.line.me/v/793123_goodbye-mrblack-ep1-1/list/69246',
- 'info_dict': {
- 'id': '793123_ep1-1',
- 'ext': 'mp4',
- 'title': 'Goodbye Mr.Black | EP.1-1',
- 'thumbnail': r're:^https?://.*\.jpg$',
- 'duration': 998.509,
- 'view_count': int,
- },
- }, {
- 'url': 'https://tv.line.me/v/2587507_%E6%B4%BE%E9%81%A3%E5%A5%B3%E9%86%ABx-ep1-02/list/185245',
- 'only_matching': True,
- }]
-
- def _real_extract(self, url):
- series_id, segment = self._match_valid_url(url).groups()
- video_id = '%s_%s' % (series_id, segment)
-
- webpage = self._download_webpage(url, video_id)
-
- player_params = self._parse_json(self._search_regex(
- r'naver\.WebPlayer\(({[^}]+})\)', webpage, 'player parameters'),
- video_id, transform_source=js_to_json)
-
- video_info = self._download_json(
- 'https://global-nvapis.line.me/linetv/rmcnmv/vod_play_videoInfo.json',
- video_id, query={
- 'videoId': player_params['videoId'],
- 'key': player_params['key'],
- })
-
- stream = video_info['streams'][0]
- extra_query = '?__gda__=' + stream['key']['value']
- formats = self._extract_m3u8_formats(
- stream['source'] + extra_query, video_id, ext='mp4',
- entry_protocol='m3u8_native', m3u8_id='hls')
-
- for a_format in formats:
- a_format['url'] += extra_query
-
- duration = None
- for video in video_info.get('videos', {}).get('list', []):
- encoding_option = video.get('encodingOption', {})
- abr = video['bitrate']['audio']
- vbr = video['bitrate']['video']
- tbr = abr + vbr
- formats.append({
- 'url': video['source'],
- 'format_id': 'http-%d' % int(tbr),
- 'height': encoding_option.get('height'),
- 'width': encoding_option.get('width'),
- 'abr': abr,
- 'vbr': vbr,
- 'filesize': video.get('size'),
- })
- if video.get('duration') and duration is None:
- duration = video['duration']
-
- self._sort_formats(formats)
-
- if formats and not formats[0].get('width'):
- formats[0]['vcodec'] = 'none'
-
- title = self._og_search_title(webpage)
-
- # like_count requires an additional API request https://tv.line.me/api/likeit/getCount
-
- return {
- 'id': video_id,
- 'title': title,
- 'formats': formats,
- 'extra_param_to_segment_url': extra_query[1:],
- 'duration': duration,
- 'thumbnails': [{'url': thumbnail['source']}
- for thumbnail in video_info.get('thumbnails', {}).get('list', [])],
- 'view_count': video_info.get('meta', {}).get('count'),
- }
-
-
class LineLiveBaseIE(InfoExtractor):
_API_BASE_URL = 'https://live-api.line-apps.com/web/v4.0/channel/'
@@ -121,7 +38,7 @@ class LineLiveBaseIE(InfoExtractor):
'timestamp': int_or_none(item.get('createdAt')),
'channel': channel.get('name'),
'channel_id': channel_id,
- 'channel_url': 'https://live.line.me/channels/' + channel_id if channel_id else None,
+ 'channel_url': format_field(channel_id, template='https://live.line.me/channels/%s'),
'duration': int_or_none(item.get('archiveDuration')),
'view_count': int_or_none(item.get('viewerCount')),
'comment_count': int_or_none(item.get('chatCount')),
@@ -132,16 +49,19 @@ class LineLiveBaseIE(InfoExtractor):
class LineLiveIE(LineLiveBaseIE):
_VALID_URL = r'https?://live\.line\.me/channels/(?P<channel_id>\d+)/broadcast/(?P<id>\d+)'
_TESTS = [{
- 'url': 'https://live.line.me/channels/4867368/broadcast/16331360',
- 'md5': 'bc931f26bf1d4f971e3b0982b3fab4a3',
+ 'url': 'https://live.line.me/channels/5833718/broadcast/18373277',
+ 'md5': '2c15843b8cb3acd55009ddcb2db91f7c',
'info_dict': {
- 'id': '16331360',
- 'title': '振りコピ講座😙😙😙',
+ 'id': '18373277',
+ 'title': '2021/12/05 (15分犬)定例譲渡会🐶',
'ext': 'mp4',
- 'timestamp': 1617095132,
- 'upload_date': '20210330',
- 'channel': '白川ゆめか',
- 'channel_id': '4867368',
+ 'timestamp': 1638674925,
+ 'upload_date': '20211205',
+ 'thumbnail': 'md5:e1f5817e60f4a72b7e43377cf308d7ef',
+ 'channel_url': 'https://live.line.me/channels/5833718',
+ 'channel': 'Yahooニュース掲載🗞プロフ見てね🐕🐕',
+ 'channel_id': '5833718',
+ 'duration': 937,
'view_count': int,
'comment_count': int,
'is_live': False,
@@ -193,8 +113,8 @@ class LineLiveChannelIE(LineLiveBaseIE):
'url': 'https://live.line.me/channels/5893542',
'info_dict': {
'id': '5893542',
- 'title': 'いくらちゃん',
- 'description': 'md5:c3a4af801f43b2fac0b02294976580be',
+ 'title': 'いくらちゃんだよぉ🦒',
+ 'description': 'md5:4d418087973ad081ceb1b3481f0b1816',
},
'playlist_mincount': 29
}
diff --git a/yt_dlp/extractor/litv.py b/yt_dlp/extractor/litv.py
index 18d237ef9..16b475a44 100644
--- a/yt_dlp/extractor/litv.py
+++ b/yt_dlp/extractor/litv.py
@@ -7,6 +7,7 @@ from .common import InfoExtractor
from ..utils import (
ExtractorError,
int_or_none,
+ traverse_obj,
smuggle_url,
unsmuggle_url,
)
@@ -55,9 +56,6 @@ class LiTVIE(InfoExtractor):
episode_title = program_info['title']
content_id = season_list['contentId']
- if prompt:
- self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (content_id, video_id))
-
all_episodes = [
self.url_result(smuggle_url(
self._URL_TEMPLATE % (program_info['contentType'], episode['contentId']),
@@ -67,16 +65,10 @@ class LiTVIE(InfoExtractor):
return self.playlist_result(all_episodes, content_id, episode_title)
def _real_extract(self, url):
- url, data = unsmuggle_url(url, {})
+ url, smuggled_data = unsmuggle_url(url, {})
video_id = self._match_id(url)
- noplaylist = self.get_param('noplaylist')
- noplaylist_prompt = True
- if 'force_noplaylist' in data:
- noplaylist = data['force_noplaylist']
- noplaylist_prompt = False
-
webpage = self._download_webpage(url, video_id)
program_info = self._parse_json(self._search_regex(
@@ -84,14 +76,9 @@ class LiTVIE(InfoExtractor):
video_id)
season_list = list(program_info.get('seasonList', {}).values())
- if season_list:
- if not noplaylist:
- return self._extract_playlist(
- season_list[0], video_id, program_info,
- prompt=noplaylist_prompt)
-
- if noplaylist_prompt:
- self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
+ playlist_id = traverse_obj(season_list, 0, 'contentId')
+ if self._yes_playlist(playlist_id, video_id, smuggled_data):
+ return self._extract_playlist(season_list[0], video_id, program_info)
# In browsers `getMainUrl` request is always issued. Usually this
# endpoint gives the same result as the data embedded in the webpage.
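
This hunk, like the nba.py and neteasemusic.py hunks below, routes the --no-playlist decision through the shared _yes_playlist helper; traverse_obj(season_list, 0, 'contentId') safely digs season_list[0]['contentId'] and returns None if any step is missing, so a falsy playlist_id short-circuits to downloading just the video. An approximate sketch of the helper, under the signature implied by the call sites:

    # Approximate behavior of InfoExtractor._yes_playlist (simplified sketch)
    def _yes_playlist(self, playlist_id, video_id, smuggled_data=None):
        if not playlist_id or not video_id:
            return not video_id  # nothing to choose between
        no_playlist = (smuggled_data or {}).get('force_noplaylist')
        if no_playlist is not None:
            return not no_playlist  # a smuggled override beats --no-playlist, silently
        if self.get_param('noplaylist'):
            self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
            return False
        self.to_screen('Downloading playlist %s; add --no-playlist to download just the video' % playlist_id)
        return True
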
diff --git a/yt_dlp/extractor/lnkgo.py b/yt_dlp/extractor/lnkgo.py
index 14675968e..bd2dffac0 100644
--- a/yt_dlp/extractor/lnkgo.py
+++ b/yt_dlp/extractor/lnkgo.py
@@ -6,8 +6,10 @@ from .common import InfoExtractor
from ..utils import (
clean_html,
compat_str,
+ format_field,
int_or_none,
parse_iso8601,
+ unified_strdate,
)
@@ -71,17 +73,97 @@ class LnkGoIE(InfoExtractor):
video_id, 'mp4', 'm3u8_native')
self._sort_formats(formats)
- poster_image = video_info.get('posterImage')
-
return {
'id': video_id,
'display_id': display_id,
'title': title,
'formats': formats,
- 'thumbnail': 'https://lnk.lt/all-images/' + poster_image if poster_image else None,
+ 'thumbnail': format_field(video_info, 'posterImage', 'https://lnk.lt/all-images/%s'),
'duration': int_or_none(video_info.get('duration')),
'description': clean_html(video_info.get('htmlDescription')),
'age_limit': self._AGE_LIMITS.get(video_info.get('pgRating'), 0),
'timestamp': parse_iso8601(video_info.get('airDate')),
'view_count': int_or_none(video_info.get('viewsCount')),
}
+
+
+class LnkIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?lnk\.lt/[^/]+/(?P<id>\d+)'
+
+ _TESTS = [{
+ 'url': 'https://lnk.lt/zinios/79791',
+ 'info_dict': {
+ 'id': '79791',
+ 'ext': 'mp4',
+ 'title': 'LNK.lt: Viešintų gyventojai sukilo prieš radijo bangų siųstuvą',
+ 'description': 'Svarbiausios naujienos trumpai, LNK žinios ir Info dienos pokalbiai.',
+ 'view_count': int,
+ 'duration': 233,
+ 'upload_date': '20191123',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'episode_number': 13431,
+ 'series': 'Naujausi žinių reportažai',
+ 'episode': 'Episode 13431'
+ },
+ 'params': {'skip_download': True}
+ }, {
+ 'url': 'https://lnk.lt/istorijos-trumpai/152546',
+ 'info_dict': {
+ 'id': '152546',
+ 'ext': 'mp4',
+ 'title': 'Radžio koncertas gaisre ',
+ 'description': 'md5:0666b5b85cb9fc7c1238dec96f71faba',
+ 'view_count': int,
+ 'duration': 54,
+ 'upload_date': '20220105',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'episode_number': 1036,
+ 'series': 'Istorijos trumpai',
+ 'episode': 'Episode 1036'
+ },
+ 'params': {'skip_download': True}
+ }, {
+ 'url': 'https://lnk.lt/gyvunu-pasaulis/151549',
+ 'info_dict': {
+ 'id': '151549',
+ 'ext': 'mp4',
+ 'title': 'Gyvūnų pasaulis',
+ 'description': '',
+ 'view_count': int,
+ 'duration': 1264,
+ 'upload_date': '20220108',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'episode_number': 16,
+ 'series': 'Gyvūnų pasaulis',
+ 'episode': 'Episode 16'
+ },
+ 'params': {'skip_download': True}
+ }]
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ video_json = self._download_json(f'https://lnk.lt/api/video/video-config/{id}', id)['videoInfo']
+ formats, subtitles = [], {}
+ if video_json.get('videoUrl'):
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(video_json['videoUrl'], id)
+ formats.extend(fmts)
+ subtitles = self._merge_subtitles(subtitles, subs)
+ if video_json.get('videoFairplayUrl') and not video_json.get('drm'):
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(video_json['videoFairplayUrl'], id)
+ formats.extend(fmts)
+ subtitles = self._merge_subtitles(subtitles, subs)
+
+ self._sort_formats(formats)
+ return {
+ 'id': id,
+ 'title': video_json.get('title'),
+ 'description': video_json.get('description'),
+ 'view_count': video_json.get('viewsCount'),
+ 'duration': video_json.get('duration'),
+ 'upload_date': unified_strdate(video_json.get('airDate')),
+ 'thumbnail': format_field(video_json, 'posterImage', 'https://lnk.lt/all-images/%s'),
+ 'episode_number': int_or_none(video_json.get('episodeNumber')),
+ 'series': video_json.get('programTitle'),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
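
The two branches in LnkIE._real_extract are near-identical; they could equally be written as one loop over the candidate keys, for example (a sketch with the same behavior, not the committed code):

    for key in ('videoUrl', 'videoFairplayUrl'):
        # the FairPlay URL is only usable when the video is not DRM-protected
        if not video_json.get(key) or (key == 'videoFairplayUrl' and video_json.get('drm')):
            continue
        fmts, subs = self._extract_m3u8_formats_and_subtitles(video_json[key], id)
        formats.extend(fmts)
        subtitles = self._merge_subtitles(subtitles, subs)
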
diff --git a/yt_dlp/extractor/mainstreaming.py b/yt_dlp/extractor/mainstreaming.py
new file mode 100644
index 000000000..0f349a7a3
--- /dev/null
+++ b/yt_dlp/extractor/mainstreaming.py
@@ -0,0 +1,219 @@
+# coding: utf-8
+import re
+
+from .common import InfoExtractor
+
+from ..utils import (
+ int_or_none,
+ js_to_json,
+ parse_duration,
+ traverse_obj,
+ try_get,
+ urljoin
+)
+
+
+class MainStreamingIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:webtools-?)?(?P<host>[A-Za-z0-9-]*\.msvdn.net)/(?:embed|amp_embed|content)/(?P<id>\w+)'
+ IE_DESC = 'MainStreaming Player'
+
+ _TESTS = [
+ {
+ # Live stream offline, has alternative content id
+ 'url': 'https://webtools-e18da6642b684f8aa9ae449862783a56.msvdn.net/embed/53EN6GxbWaJC',
+ 'info_dict': {
+ 'id': '53EN6GxbWaJC',
+ 'title': 'Diretta homepage 2021-12-31 12:00',
+ 'description': '',
+ 'live_status': 'was_live',
+ 'ext': 'mp4',
+ 'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
+ },
+ 'expected_warnings': [
+ 'Ignoring alternative content ID: WDAF1KOWUpH3',
+ 'MainStreaming said: Live event is OFFLINE'
+ ],
+ 'skip': 'live stream offline'
+ }, {
+ # playlist
+ 'url': 'https://webtools-e18da6642b684f8aa9ae449862783a56.msvdn.net/embed/WDAF1KOWUpH3',
+ 'info_dict': {
+ 'id': 'WDAF1KOWUpH3',
+ 'title': 'Playlist homepage',
+ },
+ 'playlist_mincount': 2
+ }, {
+ # livestream
+ 'url': 'https://webtools-859c1818ed614cc5b0047439470927b0.msvdn.net/embed/tDoFkZD3T1Lw',
+ 'info_dict': {
+ 'id': 'tDoFkZD3T1Lw',
+ 'title': r're:Class CNBC Live \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
+ 'live_status': 'is_live',
+ 'ext': 'mp4',
+ 'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
+ },
+ 'skip': 'live stream'
+ }, {
+ 'url': 'https://webtools-f5842579ff984c1c98d63b8d789673eb.msvdn.net/embed/EUlZfGWkGpOd?autoPlay=false',
+ 'info_dict': {
+ 'id': 'EUlZfGWkGpOd',
+ 'title': 'La Settimana ',
+ 'description': '03 Ottobre ore 02:00',
+ 'ext': 'mp4',
+ 'live_status': 'not_live',
+ 'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
+ 'duration': 1512
+ }
+ }, {
+ # video without webtools- prefix
+ 'url': 'https://f5842579ff984c1c98d63b8d789673eb.msvdn.net/embed/MfuWmzL2lGkA?autoplay=false&T=1635860445',
+ 'info_dict': {
+ 'id': 'MfuWmzL2lGkA',
+ 'title': 'TG Mattina',
+ 'description': '06 Ottobre ore 08:00',
+ 'ext': 'mp4',
+ 'live_status': 'not_live',
+ 'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
+ 'duration': 789.04
+ }
+ }, {
+ # always-on livestream with DVR
+ 'url': 'https://webtools-f5842579ff984c1c98d63b8d789673eb.msvdn.net/embed/HVvPMzy',
+ 'info_dict': {
+ 'id': 'HVvPMzy',
+ 'title': r're:^Diretta LaC News24 \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
+ 'description': 'canale all news',
+ 'live_status': 'is_live',
+ 'ext': 'mp4',
+ 'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # no host
+ 'url': 'https://webtools.msvdn.net/embed/MfuWmzL2lGkA',
+ 'only_matching': True
+ }, {
+ 'url': 'https://859c1818ed614cc5b0047439470927b0.msvdn.net/amp_embed/tDoFkZD3T1Lw',
+ 'only_matching': True
+ }, {
+ 'url': 'https://859c1818ed614cc5b0047439470927b0.msvdn.net/content/tDoFkZD3T1Lw#',
+ 'only_matching': True
+ }
+ ]
+
+ @staticmethod
+ def _extract_urls(webpage):
+ mobj = re.findall(
+ r'<iframe[^>]+?src=["\']?(?P<url>%s)["\']?' % MainStreamingIE._VALID_URL, webpage)
+ if mobj:
+ return [group[0] for group in mobj]
+
+ def _playlist_entries(self, host, playlist_content):
+ for entry in playlist_content:
+ content_id = entry.get('contentID')
+ yield {
+ '_type': 'url',
+ 'ie_key': MainStreamingIE.ie_key(),
+ 'id': content_id,
+ 'duration': int_or_none(traverse_obj(entry, ('duration', 'totalSeconds'))),
+ 'title': entry.get('title'),
+ 'url': f'https://{host}/embed/{content_id}'
+ }
+
+ @staticmethod
+ def _get_webtools_host(host):
+ if not host.startswith('webtools'):
+ host = 'webtools' + ('-' if not host.startswith('.') else '') + host
+ return host
+
+ def _get_webtools_base_url(self, host):
+ return f'{self.http_scheme()}//{self._get_webtools_host(host)}'
+
+ def _call_api(self, host: str, path: str, item_id: str, query=None, note='Downloading API JSON', fatal=False):
+ # JSON API, does not appear to be documented
+ return self._call_webtools_api(host, '/api/v2/' + path, item_id, query, note, fatal)
+
+ def _call_webtools_api(self, host: str, path: str, item_id: str, query=None, note='Downloading webtools API JSON', fatal=False):
+ # webtools docs: https://webtools.msvdn.net/
+ return self._download_json(
+ urljoin(self._get_webtools_base_url(host), path), item_id, query=query, note=note, fatal=fatal)
+
+ def _real_extract(self, url):
+ host, video_id = self._match_valid_url(url).groups()
+ content_info = try_get(
+ self._call_api(
+ host, f'content/{video_id}', video_id, note='Downloading content info API JSON'), lambda x: x['playerContentInfo'])
+ # Fallback
+ if not content_info:
+ webpage = self._download_webpage(url, video_id)
+ player_config = self._parse_json(
+ self._search_regex(
+ r'config\s*=\s*({.+?})\s*;', webpage, 'mainstreaming player config',
+ default='{}', flags=re.DOTALL),
+ video_id, transform_source=js_to_json, fatal=False) or {}
+ content_info = player_config['contentInfo']
+
+ host = content_info.get('host') or host
+ video_id = content_info.get('contentID') or video_id
+ title = content_info.get('title')
+ description = traverse_obj(content_info, 'longDescription', 'shortDescription', expected_type=str)
+ live_status = 'not_live'
+ if content_info.get('drmEnabled'):
+ self.report_drm(video_id)
+
+ alternative_content_id = content_info.get('alternativeContentID')
+ if alternative_content_id:
+ self.report_warning(f'Ignoring alternative content ID: {alternative_content_id}')
+
+ content_type = int_or_none(content_info.get('contentType'))
+ format_base_url = None
+ formats = []
+ subtitles = {}
+ # Live content
+ if content_type == 20:
+ dvr_enabled = traverse_obj(content_info, ('playerSettings', 'dvrEnabled'), expected_type=bool)
+ format_base_url = f"https://{host}/live/{content_info['liveSourceID']}/{video_id}/%s{'?DVR' if dvr_enabled else ''}"
+ live_status = 'is_live'
+ heartbeat = self._call_api(host, f'heartbeat/{video_id}', video_id, note='Checking stream status') or {}
+ if heartbeat.get('heartBeatUp') is False:
+ self.raise_no_formats(f'MainStreaming said: {heartbeat.get("responseMessage")}', expected=True)
+ live_status = 'was_live'
+
+ # Playlist
+ elif content_type == 31:
+ return self.playlist_result(
+ self._playlist_entries(host, content_info.get('playlistContents')), video_id, title, description)
+ # Normal video content?
+ elif content_type == 10:
+ format_base_url = f'https://{host}/vod/{video_id}/%s'
+ # Progressive format
+            # Note: https://webtools.msvdn.net/loader/playerV2.js mentions an original.mp3 format;
+            # however, it appears to be identical to original.mp4
+ formats.append({'url': format_base_url % 'original.mp4', 'format_note': 'original', 'quality': 1})
+ else:
+ self.raise_no_formats(f'Unknown content type {content_type}')
+
+ if format_base_url:
+ m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
+ format_base_url % 'playlist.m3u8', video_id=video_id, fatal=False)
+ mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles(
+ format_base_url % 'manifest.mpd', video_id=video_id, fatal=False)
+
+ subtitles = self._merge_subtitles(m3u8_subs, mpd_subs)
+ formats.extend(m3u8_formats + mpd_formats)
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'formats': formats,
+ 'live_status': live_status,
+ 'duration': parse_duration(content_info.get('duration')),
+ 'tags': content_info.get('tags'),
+ 'subtitles': subtitles,
+ 'thumbnail': urljoin(self._get_webtools_base_url(host), f'image/{video_id}/poster')
+ }
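
The %s placeholder in format_base_url is what lets one template serve the progressive, HLS and DASH variants. For a VOD (content_type 10), with the host and content ID from the tests above, it expands like this:

    host, video_id = 'webtools-f5842579ff984c1c98d63b8d789673eb.msvdn.net', 'MfuWmzL2lGkA'
    format_base_url = f'https://{host}/vod/{video_id}/%s'
    format_base_url % 'original.mp4'   # progressive download
    format_base_url % 'playlist.m3u8'  # HLS
    format_base_url % 'manifest.mpd'   # DASH
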
diff --git a/yt_dlp/extractor/medaltv.py b/yt_dlp/extractor/medaltv.py
index 2ece5aac4..59cc30736 100644
--- a/yt_dlp/extractor/medaltv.py
+++ b/yt_dlp/extractor/medaltv.py
@@ -7,6 +7,7 @@ from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
ExtractorError,
+ format_field,
float_or_none,
int_or_none,
str_or_none,
@@ -118,7 +119,7 @@ class MedalTVIE(InfoExtractor):
author = try_get(
hydration_data, lambda x: list(x['profiles'].values())[0], dict) or {}
author_id = str_or_none(author.get('id'))
- author_url = 'https://medal.tv/users/{0}'.format(author_id) if author_id else None
+ author_url = format_field(author_id, template='https://medal.tv/users/%s')
return {
'id': video_id,
diff --git a/yt_dlp/extractor/mediaset.py b/yt_dlp/extractor/mediaset.py
index 119b39997..d6b456c5d 100644
--- a/yt_dlp/extractor/mediaset.py
+++ b/yt_dlp/extractor/mediaset.py
@@ -7,6 +7,7 @@ import re
from .theplatform import ThePlatformBaseIE
from ..utils import (
ExtractorError,
+ GeoRestrictedError,
int_or_none,
OnDemandPagedList,
parse_qs,
@@ -37,7 +38,7 @@ class MediasetIE(ThePlatformBaseIE):
'id': 'F310575103000102',
'ext': 'mp4',
'title': 'Episodio 1',
- 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+ 'description': 'md5:e8017b7d7194e9bfb75299c2b8d81e02',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 2682.0,
'upload_date': '20210530',
@@ -45,6 +46,11 @@ class MediasetIE(ThePlatformBaseIE):
'timestamp': 1622413946,
'uploader': 'Canale 5',
'uploader_id': 'C5',
+ 'season': 'Season 1',
+ 'episode': 'Episode 1',
+ 'season_number': 1,
+ 'episode_number': 1,
+ 'chapters': [{'start_time': 0.0, 'end_time': 439.88}, {'start_time': 439.88, 'end_time': 1685.84}, {'start_time': 1685.84, 'end_time': 2682.0}],
},
}, {
'url': 'https://www.mediasetplay.mediaset.it/video/matrix/puntata-del-25-maggio_F309013801000501',
@@ -53,7 +59,7 @@ class MediasetIE(ThePlatformBaseIE):
'id': 'F309013801000501',
'ext': 'mp4',
'title': 'Puntata del 25 maggio',
- 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+ 'description': 'md5:ee2e456e3eb1dba5e814596655bb5296',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 6565.008,
'upload_date': '20200903',
@@ -61,6 +67,11 @@ class MediasetIE(ThePlatformBaseIE):
'timestamp': 1599172492,
'uploader': 'Canale 5',
'uploader_id': 'C5',
+ 'season': 'Season 5',
+ 'episode': 'Episode 5',
+ 'season_number': 5,
+ 'episode_number': 5,
+ 'chapters': [{'start_time': 0.0, 'end_time': 3409.08}, {'start_time': 3409.08, 'end_time': 6565.008}],
},
}, {
'url': 'https://www.mediasetplay.mediaset.it/video/cameracafe5/episodio-69-pezzo-di-luna_F303843101017801',
@@ -69,7 +80,7 @@ class MediasetIE(ThePlatformBaseIE):
'id': 'F303843101017801',
'ext': 'mp4',
'title': 'Episodio 69 - Pezzo di luna',
- 'description': '',
+ 'description': 'md5:7c32c8ec4118b72588b9412f11353f73',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 263.008,
'upload_date': '20200902',
@@ -77,6 +88,11 @@ class MediasetIE(ThePlatformBaseIE):
'timestamp': 1599064700,
'uploader': 'Italia 1',
'uploader_id': 'I1',
+ 'season': 'Season 5',
+ 'episode': 'Episode 178',
+ 'season_number': 5,
+ 'episode_number': 178,
+ 'chapters': [{'start_time': 0.0, 'end_time': 261.88}, {'start_time': 261.88, 'end_time': 263.008}],
},
}, {
'url': 'https://www.mediasetplay.mediaset.it/video/cameracafe5/episodio-51-tu-chi-sei_F303843107000601',
@@ -85,7 +101,7 @@ class MediasetIE(ThePlatformBaseIE):
'id': 'F303843107000601',
'ext': 'mp4',
'title': 'Episodio 51 - Tu chi sei?',
- 'description': '',
+ 'description': 'md5:42ef006e56824cc31787a547590923f4',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 367.021,
'upload_date': '20200902',
@@ -93,6 +109,28 @@ class MediasetIE(ThePlatformBaseIE):
'timestamp': 1599069817,
'uploader': 'Italia 1',
'uploader_id': 'I1',
+ 'season': 'Season 5',
+ 'episode': 'Episode 6',
+ 'season_number': 5,
+ 'episode_number': 6,
+ 'chapters': [{'start_time': 0.0, 'end_time': 358.68}, {'start_time': 358.68, 'end_time': 367.021}],
+ },
+ }, {
+ # movie
+ 'url': 'https://www.mediasetplay.mediaset.it/movie/selvaggi/selvaggi_F006474501000101',
+ 'md5': '720440187a2ae26af8148eb9e6b901ed',
+ 'info_dict': {
+ 'id': 'F006474501000101',
+ 'ext': 'mp4',
+ 'title': 'Selvaggi',
+ 'description': 'md5:cfdedbbfdd12d4d0e5dcf1fa1b75284f',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 5233.01,
+ 'upload_date': '20210729',
+ 'timestamp': 1627594716,
+ 'uploader': 'Cine34',
+ 'uploader_id': 'B6',
+ 'chapters': [{'start_time': 0.0, 'end_time': 1938.56}, {'start_time': 1938.56, 'end_time': 5233.01}],
},
}, {
# clip
@@ -160,6 +198,22 @@ class MediasetIE(ThePlatformBaseIE):
video.attrib['src'] = re.sub(r'(https?://vod05)t(-mediaset-it\.akamaized\.net/.+?.mpd)\?.+', r'\1\2', video.attrib['src'])
return super(MediasetIE, self)._parse_smil_formats(smil, smil_url, video_id, namespace, f4m_params, transform_rtmp_url)
+ def _check_drm_formats(self, tp_formats, video_id):
+ has_nondrm, drm_manifest = False, ''
+ for f in tp_formats:
+ if '_sampleaes/' in (f.get('manifest_url') or ''):
+ drm_manifest = drm_manifest or f['manifest_url']
+ f['has_drm'] = True
+ if not f.get('has_drm') and f.get('manifest_url'):
+ has_nondrm = True
+
+ nodrm_manifest = re.sub(r'_sampleaes/(\w+)_fp_', r'/\1_no_', drm_manifest)
+ if has_nondrm or nodrm_manifest == drm_manifest:
+ return
+
+ tp_formats.extend(self._extract_m3u8_formats(
+ nodrm_manifest, video_id, m3u8_id='hls', fatal=False) or [])
+
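
The re.sub in _check_drm_formats rewrites a SAMPLE-AES (FairPlay) HLS manifest path into its unencrypted sibling. With an invented manifest URL whose layout is an assumption inferred from the regex:

    import re
    drm_manifest = 'https://example.akamaized.net/hls_sampleaes/F006474501000101_fp_/master.m3u8'
    nodrm_manifest = re.sub(r'_sampleaes/(\w+)_fp_', r'/\1_no_', drm_manifest)
    # -> 'https://example.akamaized.net/hls/F006474501000101_no_/master.m3u8'
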
def _real_extract(self, url):
guid = self._match_id(url)
tp_path = 'PR1GhC/media/guid/2702976343/' + guid
@@ -167,10 +221,10 @@ class MediasetIE(ThePlatformBaseIE):
formats = []
subtitles = {}
- first_e = None
+ first_e = geo_e = None
asset_type = 'geoNo:HD,browser,geoIT|geoNo:HD,geoIT|geoNo:SD,browser,geoIT|geoNo:SD,geoIT|geoNo|HD|SD'
# TODO: fixup ISM+none manifest URLs
- for f in ('MPEG4', 'MPEG-DASH+none', 'M3U+none'):
+ for f in ('MPEG4', 'M3U'):
try:
tp_formats, tp_subtitles = self._extract_theplatform_smil(
update_url_query('http://link.theplatform.%s/s/%s' % (self._TP_TLD, tp_path), {
@@ -179,13 +233,19 @@ class MediasetIE(ThePlatformBaseIE):
'assetTypes': asset_type,
}), guid, 'Downloading %s SMIL data' % (f.split('+')[0]))
except ExtractorError as e:
+ if not geo_e and isinstance(e, GeoRestrictedError):
+ geo_e = e
if not first_e:
first_e = e
- break
+ continue
+ self._check_drm_formats(tp_formats, guid)
formats.extend(tp_formats)
subtitles = self._merge_subtitles(subtitles, tp_subtitles)
- if first_e and not formats:
- raise first_e
+
+ # check for errors and report them
+ if (first_e or geo_e) and not formats:
+ raise geo_e or first_e
+
self._sort_formats(formats)
feed_data = self._download_json(
@@ -201,15 +261,22 @@ class MediasetIE(ThePlatformBaseIE):
break
info.update({
- 'episode_number': int_or_none(feed_data.get('tvSeasonEpisodeNumber')),
- 'season_number': int_or_none(feed_data.get('tvSeasonNumber')),
- 'series': feed_data.get('mediasetprogram$brandTitle'),
+ 'description': info.get('description') or feed_data.get('description') or feed_data.get('longDescription'),
'uploader': publish_info.get('description'),
'uploader_id': publish_info.get('channel'),
'view_count': int_or_none(feed_data.get('mediasetprogram$numberOfViews')),
'thumbnail': thumbnail,
})
+ if feed_data.get('programType') == 'episode':
+ info.update({
+ 'episode_number': int_or_none(
+ feed_data.get('tvSeasonEpisodeNumber')),
+ 'season_number': int_or_none(
+ feed_data.get('tvSeasonNumber')),
+ 'series': feed_data.get('mediasetprogram$brandTitle'),
+ })
+
info.update({
'id': guid,
'formats': formats,
@@ -224,37 +291,29 @@ class MediasetShowIE(MediasetIE):
https?://
(?:(?:www|static3)\.)?mediasetplay\.mediaset\.it/
(?:
- (?:fiction|programmi-tv|serie-tv)/(?:.+?/)?
- (?:[a-z]+)_SE(?P<id>\d{12})
+ (?:fiction|programmi-tv|serie-tv|kids)/(?:.+?/)?
+ (?:[a-z-]+)_SE(?P<id>\d{12})
(?:,ST(?P<st>\d{12}))?
(?:,sb(?P<sb>\d{9}))?$
)
)
'''
_TESTS = [{
- # TV Show webpage (with a single playlist)
- 'url': 'https://www.mediasetplay.mediaset.it/serie-tv/fireforce/episodi_SE000000001556',
+ # TV Show webpage (general webpage)
+ 'url': 'https://www.mediasetplay.mediaset.it/programmi-tv/leiene/leiene_SE000000000061',
'info_dict': {
- 'id': '000000001556',
- 'title': 'Fire Force',
+ 'id': '000000000061',
+ 'title': 'Le Iene',
},
- 'playlist_count': 1,
+ 'playlist_mincount': 7,
}, {
- # TV Show webpage (with multiple playlists)
+ # TV Show webpage (specific season)
'url': 'https://www.mediasetplay.mediaset.it/programmi-tv/leiene/leiene_SE000000000061,ST000000002763',
'info_dict': {
'id': '000000002763',
'title': 'Le Iene',
},
- 'playlist_count': 7,
- }, {
- # TV Show specific playlist (single page)
- 'url': 'https://www.mediasetplay.mediaset.it/serie-tv/fireforce/episodi_SE000000001556,ST000000002738,sb100013107',
- 'info_dict': {
- 'id': '100013107',
- 'title': 'Episodi',
- },
- 'playlist_count': 4,
+ 'playlist_mincount': 7,
}, {
# TV Show specific playlist (with multiple pages)
'url': 'https://www.mediasetplay.mediaset.it/programmi-tv/leiene/iservizi_SE000000000061,ST000000002763,sb100013375',
@@ -262,7 +321,7 @@ class MediasetShowIE(MediasetIE):
'id': '100013375',
'title': 'I servizi',
},
- 'playlist_count': 53,
+ 'playlist_mincount': 50,
}]
_BY_SUBBRAND = 'https://feed.entertainment.tv.theplatform.eu/f/PR1GhC/mediaset-prod-all-programs-v2?byCustomValue={subBrandId}{%s}&sort=:publishInfo_lastPublished|desc,tvSeasonEpisodeNumber|desc&range=%d-%d'
@@ -281,7 +340,7 @@ class MediasetShowIE(MediasetIE):
def _real_extract(self, url):
playlist_id, st, sb = self._match_valid_url(url).group('id', 'st', 'sb')
if not sb:
- page = self._download_webpage(url, playlist_id)
+ page = self._download_webpage(url, st or playlist_id)
entries = [self.url_result(urljoin('https://www.mediasetplay.mediaset.it', url))
for url in re.findall(r'href="([^<>=]+SE\d{12},ST\d{12},sb\d{9})">[^<]+<', page)]
title = (self._html_search_regex(r'(?s)<h1[^>]*>(.+?)</h1>', page, 'title', default=None)
diff --git a/yt_dlp/extractor/megatvcom.py b/yt_dlp/extractor/megatvcom.py
new file mode 100644
index 000000000..0d6793acd
--- /dev/null
+++ b/yt_dlp/extractor/megatvcom.py
@@ -0,0 +1,173 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ clean_html,
+ determine_ext,
+ ExtractorError,
+ extract_attributes,
+ get_element_by_class,
+ get_element_html_by_id,
+ HEADRequest,
+ parse_qs,
+ unescapeHTML,
+ unified_timestamp,
+)
+
+
+class MegaTVComBaseIE(InfoExtractor):
+ _PLAYER_DIV_ID = 'player_div_id'
+
+ def _extract_player_attrs(self, webpage):
+ player_el = get_element_html_by_id(self._PLAYER_DIV_ID, webpage)
+ return {
+ re.sub(r'^data-(?:kwik_)?', '', k): v
+ for k, v in extract_attributes(player_el).items()
+ if k not in ('id',)
+ }
+
+
+class MegaTVComIE(MegaTVComBaseIE):
+ IE_NAME = 'megatvcom'
+ IE_DESC = 'megatv.com videos'
+ _VALID_URL = r'https?://(?:www\.)?megatv\.com/(?:\d{4}/\d{2}/\d{2}|[^/]+/(?P<id>\d+))/(?P<slug>[^/]+)'
+
+ _TESTS = [{
+ 'url': 'https://www.megatv.com/2021/10/23/egkainia-gia-ti-nea-skini-omega-tou-dimotikou-theatrou-peiraia/',
+ 'md5': '6546a1a37fff0dd51c9dce5f490b7d7d',
+ 'info_dict': {
+ 'id': '520979',
+ 'ext': 'mp4',
+ 'title': 'md5:70eef71a9cd2c1ecff7ee428354dded2',
+ 'description': 'md5:0209fa8d318128569c0d256a5c404db1',
+ 'timestamp': 1634975747,
+ 'upload_date': '20211023',
+ 'display_id': 'egkainia-gia-ti-nea-skini-omega-tou-dimotikou-theatrou-peiraia',
+ 'thumbnail': 'https://www.megatv.com/wp-content/uploads/2021/10/ΠΕΙΡΑΙΑΣ-1024x450.jpg',
+ },
+ }, {
+ 'url': 'https://www.megatv.com/tvshows/527800/epeisodio-65-12/',
+ 'md5': 'cba2085d45c1abeb8e7e9b7e1d6c0072',
+ 'info_dict': {
+ 'id': '527800',
+ 'ext': 'mp4',
+ 'title': 'md5:fc322cb51f682eecfe2f54cd5ab3a157',
+ 'description': 'md5:b2b7ed3690a78f2a0156eb790fdc00df',
+ 'timestamp': 1636048859,
+ 'upload_date': '20211104',
+ 'display_id': 'epeisodio-65-12',
+ 'thumbnail': 'https://www.megatv.com/wp-content/uploads/2021/11/16-1-1.jpg',
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id, display_id = self._match_valid_url(url).group('id', 'slug')
+ _is_article = video_id is None
+ webpage = self._download_webpage(url, video_id or display_id)
+ if _is_article:
+ video_id = self._search_regex(
+ r'<article[^>]*\sid=["\']Article_(\d+)["\']', webpage, 'article id')
+ player_attrs = self._extract_player_attrs(webpage)
+ title = player_attrs.get('label') or self._og_search_title(webpage)
+ description = get_element_by_class(
+ 'article-wrapper' if _is_article else 'story_content',
+ webpage)
+        description = clean_html(re.sub(r'<script[^>]*>[^<]+</script>', '', description or ''))
+ if not description:
+ description = self._og_search_description(webpage)
+ thumbnail = player_attrs.get('image') or self._og_search_thumbnail(webpage)
+ timestamp = unified_timestamp(self._html_search_meta(
+ 'article:published_time', webpage))
+ source = player_attrs.get('source')
+ if not source:
+ raise ExtractorError('No source found', video_id=video_id)
+ if determine_ext(source) == 'm3u8':
+ formats, subs = self._extract_m3u8_formats_and_subtitles(source, video_id, 'mp4')
+ else:
+ formats, subs = [{'url': source}], {}
+ if player_attrs.get('subs'):
+ self._merge_subtitles({'und': [{'url': player_attrs['subs']}]}, target=subs)
+ self._sort_formats(formats)
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'timestamp': timestamp,
+ 'formats': formats,
+ 'subtitles': subs,
+ }
+
+
+class MegaTVComEmbedIE(MegaTVComBaseIE):
+ IE_NAME = 'megatvcom:embed'
+ IE_DESC = 'megatv.com embedded videos'
+ _VALID_URL = r'(?:https?:)?//(?:www\.)?megatv\.com/embed/?\?p=(?P<id>\d+)'
+ _EMBED_RE = re.compile(rf'''<iframe[^>]+?src=(?P<_q1>["'])(?P<url>{_VALID_URL})(?P=_q1)''')
+
+ _TESTS = [{
+ 'url': 'https://www.megatv.com/embed/?p=2020520979',
+ 'md5': '6546a1a37fff0dd51c9dce5f490b7d7d',
+ 'info_dict': {
+ 'id': '520979',
+ 'ext': 'mp4',
+ 'title': 'md5:70eef71a9cd2c1ecff7ee428354dded2',
+ 'description': 'md5:0209fa8d318128569c0d256a5c404db1',
+ 'timestamp': 1634975747,
+ 'upload_date': '20211023',
+ 'display_id': 'egkainia-gia-ti-nea-skini-omega-tou-dimotikou-theatrou-peiraia',
+ 'thumbnail': 'https://www.megatv.com/wp-content/uploads/2021/10/ΠΕΙΡΑΙΑΣ-1024x450.jpg',
+ },
+ }, {
+ 'url': 'https://www.megatv.com/embed/?p=2020534081',
+ 'md5': '6ac8b3ce4dc6120c802f780a1e6b3812',
+ 'info_dict': {
+ 'id': '534081',
+ 'ext': 'mp4',
+ 'title': 'md5:062e9d5976ef854d8bdc1f5724d9b2d0',
+ 'description': 'md5:36dbe4c3762d2ede9513eea8d07f6d52',
+ 'timestamp': 1636376351,
+ 'upload_date': '20211108',
+ 'display_id': 'neo-rekor-stin-timi-tou-ilektrikou-reymatos-pano-apo-ta-200e-i-xondriki-timi-tou-ilektrikou',
+ 'thumbnail': 'https://www.megatv.com/wp-content/uploads/2021/11/Capture-266.jpg',
+ },
+ }]
+
+ @classmethod
+ def _extract_urls(cls, webpage):
+ for mobj in cls._EMBED_RE.finditer(webpage):
+ yield unescapeHTML(mobj.group('url'))
+
+ def _match_canonical_url(self, webpage):
+ LINK_RE = r'''(?x)
+ <link(?:
+ rel=(?P<_q1>["'])(?P<canonical>canonical)(?P=_q1)|
+ href=(?P<_q2>["'])(?P<href>(?:(?!(?P=_q2)).)+)(?P=_q2)|
+ [^>]*?
+ )+>
+ '''
+ for mobj in re.finditer(LINK_RE, webpage):
+ canonical, href = mobj.group('canonical', 'href')
+ if canonical and href:
+ return unescapeHTML(href)
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ player_attrs = self._extract_player_attrs(webpage)
+ canonical_url = player_attrs.get('share_url') or self._match_canonical_url(webpage)
+ if not canonical_url:
+ raise ExtractorError('canonical URL not found')
+ video_id = parse_qs(canonical_url)['p'][0]
+
+        # Defer to megatvcom, as the metadata extracted from the embeddable
+        # page is sometimes slightly different for the same video
+ canonical_url = self._request_webpage(
+ HEADRequest(canonical_url), video_id,
+ note='Resolve canonical URL',
+ errnote='Could not resolve canonical URL').geturl()
+ return self.url_result(canonical_url, MegaTVComIE.ie_key(), video_id)
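
The HEADRequest/geturl() pair resolves the canonical URL through any redirects without fetching the page body. A standalone stdlib equivalent of that one step (the embed URL comes from the tests; the redirect behavior is an assumption about the site):

    import urllib.request
    req = urllib.request.Request('https://www.megatv.com/embed/?p=2020534081', method='HEAD')
    final_url = urllib.request.urlopen(req).geturl()  # urlopen follows redirects; geturl() reports the final URL
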
diff --git a/yt_dlp/extractor/mildom.py b/yt_dlp/extractor/mildom.py
index c147cbbf6..ddeaa7021 100644
--- a/yt_dlp/extractor/mildom.py
+++ b/yt_dlp/extractor/mildom.py
@@ -12,6 +12,8 @@ from ..utils import (
update_url_query,
random_uuidv4,
try_get,
+ float_or_none,
+ dict_get
)
from ..compat import (
compat_str,
@@ -22,9 +24,18 @@ class MildomBaseIE(InfoExtractor):
_GUEST_ID = None
_DISPATCHER_CONFIG = None
- def _call_api(self, url, video_id, query={}, note='Downloading JSON metadata', init=False):
+ def _call_api(self, url, video_id, query=None, note='Downloading JSON metadata', init=False):
+ query = query or {}
+ if query:
+ query['__platform'] = 'web'
url = update_url_query(url, self._common_queries(query, init=init))
- return self._download_json(url, video_id, note=note)['body']
+ content = self._download_json(url, video_id, note=note)
+ if content['code'] == 0:
+ return content['body']
+ else:
+ self.raise_no_formats(
+ f'Video not found or premium content. {content["code"]} - {content["message"]}',
+ expected=True)
def _common_queries(self, query={}, init=False):
dc = self._fetch_dispatcher_config()
@@ -148,6 +159,7 @@ class MildomIE(MildomBaseIE):
'id': result_video_id,
'title': title,
'description': description,
+ 'timestamp': float_or_none(enterstudio.get('live_start_ms'), scale=1000),
'uploader': uploader,
'uploader_id': video_id,
'formats': formats,
@@ -158,7 +170,50 @@ class MildomIE(MildomBaseIE):
class MildomVodIE(MildomBaseIE):
IE_NAME = 'mildom:vod'
IE_DESC = 'Download a VOD in Mildom'
- _VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/playback/(?P<user_id>\d+)/(?P<id>(?P=user_id)-[a-zA-Z0-9]+)'
+ _VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/playback/(?P<user_id>\d+)/(?P<id>(?P=user_id)-[a-zA-Z0-9]+-?[0-9]*)'
+ _TESTS = [{
+ 'url': 'https://www.mildom.com/playback/10882672/10882672-1597662269',
+ 'info_dict': {
+ 'id': '10882672-1597662269',
+ 'ext': 'mp4',
+ 'title': '始めてのミルダム配信じゃぃ!',
+ 'thumbnail': r're:^https?://.*\.(png|jpg)$',
+ 'upload_date': '20200817',
+ 'duration': 4138.37,
+ 'description': 'ゲームをしたくて!',
+ 'timestamp': 1597662269.0,
+ 'uploader_id': '10882672',
+ 'uploader': 'kson組長(けいそん)',
+ },
+ }, {
+ 'url': 'https://www.mildom.com/playback/10882672/10882672-1597758589870-477',
+ 'info_dict': {
+ 'id': '10882672-1597758589870-477',
+ 'ext': 'mp4',
+ 'title': '【kson】感染メイズ!麻酔銃で無双する',
+ 'thumbnail': r're:^https?://.*\.(png|jpg)$',
+ 'timestamp': 1597759093.0,
+ 'uploader': 'kson組長(けいそん)',
+ 'duration': 4302.58,
+ 'uploader_id': '10882672',
+ 'description': 'このステージ絶対乗り越えたい',
+ 'upload_date': '20200818',
+ },
+ }, {
+ 'url': 'https://www.mildom.com/playback/10882672/10882672-buha9td2lrn97fk2jme0',
+ 'info_dict': {
+ 'id': '10882672-buha9td2lrn97fk2jme0',
+ 'ext': 'mp4',
+ 'title': '【kson組長】CART RACER!!!',
+ 'thumbnail': r're:^https?://.*\.(png|jpg)$',
+ 'uploader_id': '10882672',
+ 'uploader': 'kson組長(けいそん)',
+ 'upload_date': '20201104',
+ 'timestamp': 1604494797.0,
+ 'duration': 4657.25,
+ 'description': 'WTF',
+ },
+ }]
def _real_extract(self, url):
m = self._match_valid_url(url)
@@ -213,6 +268,9 @@ class MildomVodIE(MildomBaseIE):
'id': video_id,
'title': title,
'description': description,
+ 'timestamp': float_or_none(autoplay['publish_time'], scale=1000),
+ 'duration': float_or_none(autoplay['video_length'], scale=1000),
+ 'thumbnail': dict_get(autoplay, ('upload_pic', 'video_pic')),
'uploader': uploader,
'uploader_id': user_id,
'formats': formats,
@@ -230,6 +288,13 @@ class MildomUserVodIE(MildomBaseIE):
'title': 'Uploads from ねこばたけ',
},
'playlist_mincount': 351,
+ }, {
+ 'url': 'https://www.mildom.com/profile/10882672',
+ 'info_dict': {
+ 'id': '10882672',
+ 'title': 'Uploads from kson組長(けいそん)',
+ },
+ 'playlist_mincount': 191,
}]
def _entries(self, user_id):
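
Mildom reports start times and lengths in milliseconds, so float_or_none with scale=1000 converts them to the seconds yt-dlp expects, while leaving a missing field unset:

    from yt_dlp.utils import float_or_none
    float_or_none(1597662269000, scale=1000)  # -> 1597662269.0
    float_or_none(None, scale=1000)           # -> None
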
diff --git a/yt_dlp/extractor/minds.py b/yt_dlp/extractor/minds.py
index 8e9f0f825..9da07207b 100644
--- a/yt_dlp/extractor/minds.py
+++ b/yt_dlp/extractor/minds.py
@@ -5,6 +5,7 @@ from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
clean_html,
+ format_field,
int_or_none,
str_or_none,
strip_or_none,
@@ -120,7 +121,7 @@ class MindsIE(MindsBaseIE):
'timestamp': int_or_none(entity.get('time_created')),
'uploader': strip_or_none(owner.get('name')),
'uploader_id': uploader_id,
- 'uploader_url': 'https://www.minds.com/' + uploader_id if uploader_id else None,
+ 'uploader_url': format_field(uploader_id, template='https://www.minds.com/%s'),
'view_count': int_or_none(entity.get('play:count')),
'like_count': int_or_none(entity.get('thumbs:up:count')),
'dislike_count': int_or_none(entity.get('thumbs:down:count')),
diff --git a/yt_dlp/extractor/mixch.py b/yt_dlp/extractor/mixch.py
index a99ddd172..31f450dfa 100644
--- a/yt_dlp/extractor/mixch.py
+++ b/yt_dlp/extractor/mixch.py
@@ -11,7 +11,7 @@ class MixchIE(InfoExtractor):
IE_NAME = 'mixch'
_VALID_URL = r'https?://(?:www\.)?mixch\.tv/u/(?P<id>\d+)'
- TESTS = [{
+ _TESTS = [{
'url': 'https://mixch.tv/u/16236849/live',
'skip': 'don\'t know if this live persists',
'info_dict': {
@@ -53,3 +53,33 @@ class MixchIE(InfoExtractor):
}],
'is_live': True,
}
+
+
+class MixchArchiveIE(InfoExtractor):
+ IE_NAME = 'mixch:archive'
+ _VALID_URL = r'https?://(?:www\.)?mixch\.tv/archive/(?P<id>\d+)'
+
+ _TESTS = [{
+ 'url': 'https://mixch.tv/archive/421',
+ 'skip': 'paid video, no DRM. expires at Jan 23',
+ 'info_dict': {
+ 'id': '421',
+ 'title': '96NEKO SHOW TIME',
+ }
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ html5_videos = self._parse_html5_media_entries(
+ url, webpage.replace('video-js', 'video'), video_id, 'hls')
+ if not html5_videos:
+ self.raise_login_required(method='cookies')
+ infodict = html5_videos[0]
+ infodict.update({
+ 'id': video_id,
+ 'title': self._html_search_regex(r'class="archive-title">(.+?)</', webpage, 'title')
+ })
+
+ return infodict
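
The webpage.replace('video-js', 'video') trick is needed because _parse_html5_media_entries only recognizes the standard <video>/<audio> tags, while these archive pages use Video.js custom elements. A toy illustration with invented markup:

    webpage = '<video-js><source src="https://example.com/archive/421.m3u8"></video-js>'
    webpage.replace('video-js', 'video')
    # -> '<video><source src="https://example.com/archive/421.m3u8"></video>'
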
diff --git a/yt_dlp/extractor/mixcloud.py b/yt_dlp/extractor/mixcloud.py
index a0c043d4b..c2dd078ac 100644
--- a/yt_dlp/extractor/mixcloud.py
+++ b/yt_dlp/extractor/mixcloud.py
@@ -12,6 +12,7 @@ from ..compat import (
compat_zip
)
from ..utils import (
+ ExtractorError,
int_or_none,
parse_iso8601,
strip_or_none,
@@ -125,7 +126,20 @@ class MixcloudIE(MixcloudBaseIE):
tag {
name
}
- }''', track_id, username, slug)
+ }
+ restrictedReason
+ id''', track_id, username, slug)
+
+ if not cloudcast:
+ raise ExtractorError('Track not found', expected=True)
+
+ reason = cloudcast.get('restrictedReason')
+ if reason == 'tracklist':
+ raise ExtractorError('Track unavailable in your country due to licensing restrictions', expected=True)
+ elif reason == 'repeat_play':
+ raise ExtractorError('You have reached your play limit for this track', expected=True)
+ elif reason:
+ raise ExtractorError('Track is restricted', expected=True)
title = cloudcast['name']
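
The chained elifs above could also be table-driven, which keeps the fallback message in one place; a sketch of that alternative (same behavior, not the committed code):

    _RESTRICTION_MESSAGES = {
        'tracklist': 'Track unavailable in your country due to licensing restrictions',
        'repeat_play': 'You have reached your play limit for this track',
    }
    reason = cloudcast.get('restrictedReason')
    if reason:
        raise ExtractorError(_RESTRICTION_MESSAGES.get(reason, 'Track is restricted'), expected=True)
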
diff --git a/yt_dlp/extractor/musicdex.py b/yt_dlp/extractor/musicdex.py
new file mode 100644
index 000000000..05f722091
--- /dev/null
+++ b/yt_dlp/extractor/musicdex.py
@@ -0,0 +1,175 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ date_from_str,
+ format_field,
+ try_get,
+ unified_strdate,
+)
+
+
+class MusicdexBaseIE(InfoExtractor):
+ def _return_info(self, track_json, album_json, id):
+ return {
+ 'id': str(id),
+ 'title': track_json.get('name'),
+ 'track': track_json.get('name'),
+ 'description': track_json.get('description'),
+ 'track_number': track_json.get('number'),
+ 'url': format_field(track_json, 'url', 'https://www.musicdex.org/%s'),
+ 'duration': track_json.get('duration'),
+ 'genre': [genre.get('name') for genre in track_json.get('genres') or []],
+ 'like_count': track_json.get('likes_count'),
+ 'view_count': track_json.get('plays'),
+ 'artist': [artist.get('name') for artist in track_json.get('artists') or []],
+ 'album_artist': [artist.get('name') for artist in album_json.get('artists') or []],
+ 'thumbnail': format_field(album_json, 'image', 'https://www.musicdex.org/%s'),
+ 'album': album_json.get('name'),
+ 'release_year': try_get(album_json, lambda x: date_from_str(unified_strdate(x['release_date'])).year),
+ 'extractor_key': MusicdexSongIE.ie_key(),
+ 'extractor': 'MusicdexSong',
+ }
+
+
+class MusicdexSongIE(MusicdexBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?musicdex\.org/track/(?P<id>\d+)'
+
+ _TESTS = [{
+ 'url': 'https://www.musicdex.org/track/306/dual-existence',
+ 'info_dict': {
+ 'id': '306',
+ 'ext': 'mp3',
+ 'title': 'dual existence',
+ 'description': '#NIPPONSEI @ IRC.RIZON.NET',
+ 'track': 'dual existence',
+ 'track_number': 1,
+ 'duration': 266000,
+ 'genre': ['Anime'],
+ 'like_count': int,
+ 'view_count': int,
+ 'artist': ['fripSide'],
+ 'album_artist': ['fripSide'],
+ 'thumbnail': 'https://www.musicdex.org/storage/album/9iDIam1DHTVqUG4UclFIEq1WAFGXfPW4y0TtZa91.png',
+ 'album': 'To Aru Kagaku no Railgun T OP2 Single - dual existence',
+ 'release_year': 2020
+ },
+ 'params': {'skip_download': True}
+ }]
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ data_json = self._download_json(f'https://www.musicdex.org/secure/tracks/{id}?defaultRelations=true', id)['track']
+ return self._return_info(data_json, data_json.get('album') or {}, id)
+
+
+class MusicdexAlbumIE(MusicdexBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?musicdex\.org/album/(?P<id>\d+)'
+
+ _TESTS = [{
+ 'url': 'https://www.musicdex.org/album/56/tenmon-and-eiichiro-yanagi-minori/ef-a-tale-of-memories-original-soundtrack-2-fortissimo',
+ 'playlist_mincount': 28,
+ 'info_dict': {
+ 'id': '56',
+ 'genre': ['OST'],
+ 'view_count': int,
+ 'artist': ['TENMON & Eiichiro Yanagi / minori'],
+ 'title': 'ef - a tale of memories Original Soundtrack 2 ~fortissimo~',
+ 'release_year': 2008,
+ 'thumbnail': 'https://www.musicdex.org/storage/album/2rSHkyYBYfB7sbvElpEyTMcUn6toY7AohOgJuDlE.jpg',
+ },
+ }]
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ data_json = self._download_json(f'https://www.musicdex.org/secure/albums/{id}?defaultRelations=true', id)['album']
+ entries = [self._return_info(track, data_json, track['id']) for track in data_json.get('tracks') or [] if track.get('id')]
+
+ return {
+ '_type': 'playlist',
+ 'id': id,
+ 'title': data_json.get('name'),
+ 'description': data_json.get('description'),
+ 'genre': [genre.get('name') for genre in data_json.get('genres') or []],
+ 'view_count': data_json.get('plays'),
+ 'artist': [artist.get('name') for artist in data_json.get('artists') or []],
+ 'thumbnail': format_field(data_json, 'image', 'https://www.musicdex.org/%s'),
+ 'release_year': try_get(data_json, lambda x: date_from_str(unified_strdate(x['release_date'])).year),
+ 'entries': entries,
+ }
+
+
+class MusicdexPageIE(MusicdexBaseIE):
+ def _entries(self, id):
+ next_page_url = self._API_URL % id
+ while next_page_url:
+ data_json = self._download_json(next_page_url, id)['pagination']
+ for data in data_json.get('data') or []:
+ yield data
+ next_page_url = data_json.get('next_page_url')
+
+
+class MusicdexArtistIE(MusicdexPageIE):
+ _VALID_URL = r'https?://(?:www\.)?musicdex\.org/artist/(?P<id>\d+)'
+ _API_URL = 'https://www.musicdex.org/secure/artists/%s/albums?page=1'
+
+ _TESTS = [{
+ 'url': 'https://www.musicdex.org/artist/11/fripside',
+ 'playlist_mincount': 28,
+ 'info_dict': {
+ 'id': '11',
+ 'view_count': int,
+ 'title': 'fripSide',
+ 'thumbnail': 'https://www.musicdex.org/storage/artist/ZmOz0lN2vsweegB660em3xWffCjLPmTQHqJls5Xx.jpg',
+ },
+ }]
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ data_json = self._download_json(f'https://www.musicdex.org/secure/artists/{id}', id)['artist']
+ entries = []
+ for album in self._entries(id):
+ entries.extend(self._return_info(track, album, track['id']) for track in album.get('tracks') or [] if track.get('id'))
+
+ return {
+ '_type': 'playlist',
+ 'id': id,
+ 'title': data_json.get('name'),
+ 'view_count': data_json.get('plays'),
+ 'thumbnail': format_field(data_json, 'image_small', 'https://www.musicdex.org/%s'),
+ 'entries': entries,
+ }
+
+
+class MusicdexPlaylistIE(MusicdexPageIE):
+ _VALID_URL = r'https?://(?:www\.)?musicdex\.org/playlist/(?P<id>\d+)'
+ _API_URL = 'https://www.musicdex.org/secure/playlists/%s/tracks?perPage=10000&page=1'
+
+ _TESTS = [{
+ 'url': 'https://www.musicdex.org/playlist/9/test',
+ 'playlist_mincount': 73,
+ 'info_dict': {
+ 'id': '9',
+ 'view_count': int,
+ 'title': 'Test',
+ 'thumbnail': 'https://www.musicdex.org/storage/album/jXATI79f0IbQ2sgsKYOYRCW3zRwF3XsfHhzITCuJ.jpg',
+ 'description': 'Test 123 123 21312 32121321321321312',
+ },
+ }]
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ data_json = self._download_json(f'https://www.musicdex.org/secure/playlists/{id}', id)['playlist']
+ entries = [self._return_info(track, track.get('album') or {}, track['id'])
+ for track in self._entries(id) or [] if track.get('id')]
+
+ return {
+ '_type': 'playlist',
+ 'id': id,
+ 'title': data_json.get('name'),
+ 'description': data_json.get('description'),
+ 'view_count': data_json.get('plays'),
+ 'thumbnail': format_field(data_json, 'image', 'https://www.musicdex.org/%s'),
+ 'entries': entries,
+ }
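
release_year is derived by normalising the API's release_date to YYYYMMDD with unified_strdate and parsing that with date_from_str; the surrounding try_get swallows any malformed input. With an illustrative date string:

    from yt_dlp.utils import date_from_str, unified_strdate
    date_from_str(unified_strdate('2020-07-01')).year  # -> 2020
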
diff --git a/yt_dlp/extractor/myspass.py b/yt_dlp/extractor/myspass.py
index db7ebc94c..1775d5f0b 100644
--- a/yt_dlp/extractor/myspass.py
+++ b/yt_dlp/extractor/myspass.py
@@ -1,8 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
@@ -13,33 +11,74 @@ from ..utils import (
class MySpassIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?myspass\.de/([^/]+/)*(?P<id>\d+)'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?myspass\.de/(?:[^/]+/)*(?P<id>\d+)/?[^/]*$'
+ _TESTS = [{
'url': 'http://www.myspass.de/myspass/shows/tvshows/absolute-mehrheit/Absolute-Mehrheit-vom-17022013-Die-Highlights-Teil-2--/11741/',
'md5': '0b49f4844a068f8b33f4b7c88405862b',
'info_dict': {
'id': '11741',
'ext': 'mp4',
- 'description': 'Wer kann in die Fußstapfen von Wolfgang Kubicki treten und die Mehrheit der Zuschauer hinter sich versammeln? Wird vielleicht sogar die Absolute Mehrheit geknackt und der Jackpot von 200.000 Euro mit nach Hause genommen?',
+ 'description': 'md5:9f0db5044c8fe73f528a390498f7ce9b',
'title': '17.02.2013 - Die Highlights, Teil 2',
+ 'thumbnail': r're:.*\.jpg',
+ 'duration': 323.0,
+ 'episode': '17.02.2013 - Die Highlights, Teil 2',
+ 'season_id': '544',
+ 'episode_number': 1,
+ 'series': 'Absolute Mehrheit',
+ 'season_number': 2,
+ 'season': 'Season 2',
+ },
+ },
+ {
+ 'url': 'https://www.myspass.de/shows/tvshows/tv-total/Novak-Puffovic-bei-bester-Laune--/44996/',
+ 'md5': 'eb28b7c5e254192046e86ebaf7deac8f',
+ 'info_dict': {
+ 'id': '44996',
+ 'ext': 'mp4',
+ 'description': 'md5:74c7f886e00834417f1e427ab0da6121',
+ 'title': 'Novak Puffovic bei bester Laune',
+ 'thumbnail': r're:.*\.jpg',
+ 'episode_number': 8,
+ 'episode': 'Novak Puffovic bei bester Laune',
+ 'series': 'TV total',
+ 'season': 'Season 19',
+ 'season_id': '987',
+ 'duration': 2941.0,
+ 'season_number': 19,
+ },
+ },
+ {
+ 'url': 'https://www.myspass.de/channels/tv-total-raabigramm/17033/20831/',
+ 'md5': '7b293a6b9f3a7acdd29304c8d0dbb7cc',
+ 'info_dict': {
+ 'id': '20831',
+ 'ext': 'mp4',
+ 'description': 'Gefühle pur: Schaut euch die ungeschnittene Version von Stefans Liebesbeweis an die Moderationsgrazie von Welt, Verona Feldbusch, an.',
+ 'title': 'Raabigramm Verona Feldbusch',
+ 'thumbnail': r're:.*\.jpg',
+ 'episode_number': 6,
+ 'episode': 'Raabigramm Verona Feldbusch',
+ 'series': 'TV total',
+ 'season': 'Season 1',
+ 'season_id': '34',
+ 'duration': 105.0,
+ 'season_number': 1,
},
- }
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
- metadata = self._download_xml(
- 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=' + video_id,
- video_id)
+ metadata = self._download_xml('http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=' + video_id, video_id)
title = xpath_text(metadata, 'title', fatal=True)
video_url = xpath_text(metadata, 'url_flv', 'download url', True)
video_id_int = int(video_id)
- for group in re.search(r'/myspass2009/\d+/(\d+)/(\d+)/(\d+)/', video_url).groups():
+ for group in self._search_regex(r'/myspass2009/\d+/(\d+)/(\d+)/(\d+)/', video_url, 'myspass', group=(1, 2, 3), default=[]):
group_int = int(group)
if group_int > video_id_int:
- video_url = video_url.replace(
- group, compat_str(group_int // video_id_int))
+ video_url = video_url.replace(group, compat_str(group_int // video_id_int))
return {
'id': video_id,
diff --git a/yt_dlp/extractor/nba.py b/yt_dlp/extractor/nba.py
index 7390ef8bc..359cc52b7 100644
--- a/yt_dlp/extractor/nba.py
+++ b/yt_dlp/extractor/nba.py
@@ -165,14 +165,10 @@ class NBAWatchIE(NBAWatchBaseIE):
def _real_extract(self, url):
display_id = self._match_id(url)
collection_id = parse_qs(url).get('collection', [None])[0]
- if collection_id:
- if self.get_param('noplaylist'):
- self.to_screen('Downloading just video %s because of --no-playlist' % display_id)
- else:
- self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % collection_id)
- return self.url_result(
- 'https://www.nba.com/watch/list/collection/' + collection_id,
- NBAWatchCollectionIE.ie_key(), collection_id)
+ if self._yes_playlist(collection_id, display_id):
+ return self.url_result(
+ 'https://www.nba.com/watch/list/collection/' + collection_id,
+ NBAWatchCollectionIE.ie_key(), collection_id)
return self._extract_video('seoName', display_id)
diff --git a/yt_dlp/extractor/nbc.py b/yt_dlp/extractor/nbc.py
index cd573690b..109403440 100644
--- a/yt_dlp/extractor/nbc.py
+++ b/yt_dlp/extractor/nbc.py
@@ -197,9 +197,12 @@ class NBCSportsVPlayerIE(InfoExtractor):
'timestamp': 1426270238,
'upload_date': '20150313',
'uploader': 'NBCU-SPORTS',
+ 'duration': 72.818,
+ 'chapters': [],
+ 'thumbnail': r're:^https?://.*\.jpg$'
}
}, {
- 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/media/_hqLjQ95yx8Z',
+ 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/media/PEgOtlNcC_y2',
'only_matching': True,
}, {
'url': 'https://www.nbcsports.com/vplayer/p/BxmELC/nbcsports/select/PHJSaFWbrTY9?form=html&autoPlay=true',
@@ -208,16 +211,15 @@ class NBCSportsVPlayerIE(InfoExtractor):
@staticmethod
def _extract_url(webpage):
- iframe_m = re.search(
- r'<(?:iframe[^>]+|div[^>]+data-(?:mpx-)?)src="(?P<url>%s[^"]+)"' % NBCSportsVPlayerIE._VALID_URL_BASE, webpage)
- if iframe_m:
- return iframe_m.group('url')
+ video_urls = re.search(
+ r'(?:iframe[^>]+|var video|div[^>]+data-(?:mpx-)?)[sS]rc\s?=\s?"(?P<url>%s[^\"]+)' % NBCSportsVPlayerIE._VALID_URL_BASE, webpage)
+ if video_urls:
+ return video_urls.group('url')
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- theplatform_url = self._og_search_video_url(webpage).replace(
- 'vplayer.nbcsports.com', 'player.theplatform.com')
+ theplatform_url = self._html_search_regex(r'tp:releaseUrl="(.+?)"', webpage, 'url')
return self.url_result(theplatform_url, 'ThePlatform')
@@ -235,6 +237,9 @@ class NBCSportsIE(InfoExtractor):
'uploader': 'NBCU-SPORTS',
'upload_date': '20150330',
'timestamp': 1427726529,
+ 'chapters': [],
+ 'thumbnail': 'https://hdliveextra-a.akamaihd.net/HD/image_sports/NBCU_Sports_Group_-_nbcsports/253/303/izzodps.jpg',
+ 'duration': 528.395,
}
}, {
# data-mpx-src
@@ -403,9 +408,7 @@ class NBCNewsIE(ThePlatformIE):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- data = self._parse_json(self._search_regex(
- r'<script[^>]+id="__NEXT_DATA__"[^>]*>({.+?})</script>',
- webpage, 'bootstrap json'), video_id)['props']['initialState']
+ data = self._search_nextjs_data(webpage, video_id)['props']['initialState']
video_data = try_get(data, lambda x: x['video']['current'], dict)
if not video_data:
video_data = data['article']['content'][0]['primaryMedia']['video']
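
_search_nextjs_data wraps exactly the regex-plus-_parse_json dance that the removed lines performed, so any Next.js-based site can reuse it. An approximate, simplified form of the helper:

    # Approximate sketch of InfoExtractor._search_nextjs_data (simplified)
    def _search_nextjs_data(self, webpage, video_id, fatal=True, **kw):
        return self._parse_json(
            self._search_regex(
                r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
                webpage, 'next.js data', fatal=fatal, **kw),
            video_id, fatal=fatal)
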
diff --git a/yt_dlp/extractor/neteasemusic.py b/yt_dlp/extractor/neteasemusic.py
index 7652371b3..57b4774b6 100644
--- a/yt_dlp/extractor/neteasemusic.py
+++ b/yt_dlp/extractor/neteasemusic.py
@@ -405,17 +405,12 @@ class NetEaseMusicProgramIE(NetEaseMusicBaseIE):
name = info['name']
description = info['description']
- if not info['songs'] or self.get_param('noplaylist'):
- if info['songs']:
- self.to_screen(
- 'Downloading just the main audio %s because of --no-playlist'
- % info['mainSong']['id'])
-
+ if not self._yes_playlist(info['songs'] and program_id, info['mainSong']['id']):
formats = self.extract_formats(info['mainSong'])
self._sort_formats(formats)
return {
- 'id': program_id,
+ 'id': info['mainSong']['id'],
'title': name,
'description': description,
'creator': info['dj']['brand'],
@@ -425,10 +420,6 @@ class NetEaseMusicProgramIE(NetEaseMusicBaseIE):
'formats': formats,
}
- self.to_screen(
- 'Downloading playlist %s - add --no-playlist to just download the main audio %s'
- % (program_id, info['mainSong']['id']))
-
song_ids = [info['mainSong']['id']]
song_ids.extend([song['id'] for song in info['songs']])
entries = [
diff --git a/yt_dlp/extractor/newstube.py b/yt_dlp/extractor/newstube.py
index dab4aec44..479141ae0 100644
--- a/yt_dlp/extractor/newstube.py
+++ b/yt_dlp/extractor/newstube.py
@@ -5,11 +5,9 @@ import base64
import hashlib
from .common import InfoExtractor
-from ..aes import aes_cbc_decrypt
+from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7
from ..utils import (
- bytes_to_intlist,
int_or_none,
- intlist_to_bytes,
parse_codecs,
parse_duration,
)
@@ -47,10 +45,8 @@ class NewstubeIE(InfoExtractor):
}))
key = hashlib.pbkdf2_hmac(
'sha1', video_guid.replace('-', '').encode(), enc_data[:16], 1)[:16]
- dec_data = aes_cbc_decrypt(
- bytes_to_intlist(enc_data[32:]), bytes_to_intlist(key),
- bytes_to_intlist(enc_data[16:32]))
- sources = self._parse_json(intlist_to_bytes(dec_data[:-dec_data[-1]]), video_guid)
+ dec_data = unpad_pkcs7(aes_cbc_decrypt_bytes(enc_data[32:], key, enc_data[16:32]))
+ sources = self._parse_json(dec_data, video_guid)
formats = []
for source in sources:
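
aes_cbc_decrypt_bytes and unpad_pkcs7 fold the bytes_to_intlist/intlist_to_bytes round-trips and the manual padding strip into yt_dlp/aes.py. Their approximate definitions, sketched in terms of the older primitives:

    def aes_cbc_decrypt_bytes(data, key, iv):
        # byte-level wrapper around the int-list based aes_cbc_decrypt
        return intlist_to_bytes(aes_cbc_decrypt(*map(bytes_to_intlist, (data, key, iv))))

    def unpad_pkcs7(data):
        return data[:-data[-1]]  # PKCS#7: the final byte encodes the padding length
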
diff --git a/yt_dlp/extractor/newsy.py b/yt_dlp/extractor/newsy.py
new file mode 100644
index 000000000..cf3164100
--- /dev/null
+++ b/yt_dlp/extractor/newsy.py
@@ -0,0 +1,51 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ js_to_json,
+ merge_dicts,
+)
+
+
+class NewsyIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?newsy\.com/stories/(?P<id>[^/?#$&]+)'
+
+ _TESTS = [{
+ 'url': 'https://www.newsy.com/stories/nft-trend-leads-to-fraudulent-art-auctions/',
+ 'info_dict': {
+ 'id': '609d65125b086c24fb529312',
+ 'ext': 'mp4',
+ 'title': 'NFT Art Auctions Have A Piracy Problem',
+ 'description': 'md5:971e52ab8bc97e50305475cde8284c83',
+ 'display_id': 'nft-trend-leads-to-fraudulent-art-auctions',
+ 'timestamp': 1621339200,
+ 'duration': 339630,
+ 'thumbnail': 'https://cdn.newsy.com/images/videos/x/1620927824_xyrrP4.jpg',
+ 'upload_date': '20210518'
+ },
+ 'params': {'skip_download': True}
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ data_json = self._parse_json(self._html_search_regex(
+ r'data-video-player\s?=\s?"({[^"]+})">', webpage, 'data'), display_id, js_to_json)
+ ld_json = self._search_json_ld(webpage, display_id, fatal=False)
+
+ formats, subtitles = [], {}
+ if data_json.get('stream'):
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(data_json['stream'], display_id)
+ formats.extend(fmts)
+ subtitles = self._merge_subtitles(subtitles, subs)
+ self._sort_formats(formats)
+ return merge_dicts(ld_json, {
+ 'id': data_json['id'],
+ 'display_id': display_id,
+ 'title': data_json.get('headline'),
+ 'duration': data_json.get('duration'),
+ 'thumbnail': data_json.get('image'),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ })
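
merge_dicts gives precedence to earlier arguments, so the JSON-LD metadata wins over the explicitly built dict except where a key is absent or holds an empty string (illustrative values):

    from yt_dlp.utils import merge_dicts
    merge_dicts({'title': 'JSON-LD title', 'description': ''},
                {'title': 'fallback', 'description': 'from player data'})
    # -> {'title': 'JSON-LD title', 'description': 'from player data'}
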
diff --git a/yt_dlp/extractor/nexx.py b/yt_dlp/extractor/nexx.py
index 8aceebd49..a521bb6e4 100644
--- a/yt_dlp/extractor/nexx.py
+++ b/yt_dlp/extractor/nexx.py
@@ -12,6 +12,8 @@ from ..utils import (
ExtractorError,
int_or_none,
parse_duration,
+ srt_subtitles_timecode,
+ traverse_obj,
try_get,
urlencode_postdata,
)
@@ -20,7 +22,7 @@ from ..utils import (
class NexxIE(InfoExtractor):
_VALID_URL = r'''(?x)
(?:
- https?://api\.nexx(?:\.cloud|cdn\.com)/v3/(?P<domain_id>\d+)/videos/byid/|
+ https?://api\.nexx(?:\.cloud|cdn\.com)/v3(?:\.\d)?/(?P<domain_id>\d+)/videos/byid/|
nexx:(?:(?P<domain_id_s>\d+):)?|
https?://arc\.nexx\.cloud/api/video/
)
@@ -42,35 +44,37 @@ class NexxIE(InfoExtractor):
'timestamp': 1384264416,
'upload_date': '20131112',
},
+ 'skip': 'Spiegel nexx CDNs are now disabled'
}, {
- # episode
- 'url': 'https://api.nexx.cloud/v3/741/videos/byid/247858',
+ # episode with captions
+ 'url': 'https://api.nexx.cloud/v3.1/741/videos/byid/1701834',
'info_dict': {
- 'id': '247858',
+ 'id': '1701834',
'ext': 'mp4',
- 'title': 'Return of the Golden Child (OV)',
- 'description': 'md5:5d969537509a92b733de21bae249dc63',
- 'release_year': 2017,
+ 'title': 'Mein Leben mit \'nem TikTok E-Boy 😤',
+ 'alt_title': 'Mein Leben mit \'nem TikTok E-Boy 😤',
+ 'description': 'md5:f84f395a881fd143f952c892deab528d',
'thumbnail': r're:^https?://.*\.jpg$',
- 'duration': 1397,
- 'timestamp': 1495033267,
- 'upload_date': '20170517',
+ 'duration': 770,
+ 'timestamp': 1595600027,
+ 'upload_date': '20200724',
'episode_number': 2,
'season_number': 2,
+ 'episode': 'Episode 2',
+ 'season': 'Season 2',
},
'params': {
'skip_download': True,
},
- 'skip': 'HTTP Error 404: Not Found',
}, {
- # does not work via arc
'url': 'nexx:741:1269984',
- 'md5': 'c714b5b238b2958dc8d5642addba6886',
+ 'md5': 'd5f14e14b592501e51addd5abef95a7f',
'info_dict': {
'id': '1269984',
'ext': 'mp4',
- 'title': '1 TAG ohne KLO... wortwörtlich! 😑',
- 'alt_title': '1 TAG ohne KLO... wortwörtlich! 😑',
+ 'title': '1 TAG ohne KLO... wortwörtlich! ?',
+ 'alt_title': '1 TAG ohne KLO... wortwörtlich! ?',
+ 'description': 'md5:2016393a31991a900946432ccdd09a6f',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 607,
'timestamp': 1518614955,
@@ -91,6 +95,7 @@ class NexxIE(InfoExtractor):
'timestamp': 1527874460,
'upload_date': '20180601',
},
+ 'skip': 'Spiegel nexx CDNs are now disabled'
}, {
'url': 'https://api.nexxcdn.com/v3/748/videos/byid/128907',
'only_matching': True,
@@ -138,6 +143,8 @@ class NexxIE(InfoExtractor):
return NexxIE._extract_urls(webpage)[0]
def _handle_error(self, response):
+ if traverse_obj(response, ('metadata', 'notice'), expected_type=str):
+ self.report_warning('%s said: %s' % (self.IE_NAME, response['metadata']['notice']))
status = int_or_none(try_get(
response, lambda x: x['metadata']['status']) or 200)
if 200 <= status < 300:
@@ -220,6 +227,65 @@ class NexxIE(InfoExtractor):
return formats
+ def _extract_3q_formats(self, video, video_id):
+ stream_data = video['streamdata']
+ cdn = stream_data['cdnType']
+ assert cdn == '3q'
+
+ q_acc, q_prefix, q_locator, q_hash = stream_data['qAccount'], stream_data['qPrefix'], stream_data['qLocator'], stream_data['qHash']
+ protection_key = traverse_obj(
+ video, ('protectiondata', 'key'), expected_type=str)
+
+ def get_cdn_shield_base(shield_type=''):
+ for secure in ('', 's'):
+ cdn_shield = stream_data.get('cdnShield%sHTTP%s' % (shield_type, secure.upper()))
+ if cdn_shield:
+ return 'http%s://%s' % (secure, cdn_shield)
+ return f'http://sdn-global-{"prog" if shield_type.lower() == "prog" else "streaming"}-cache.3qsdn.com/' + (f's/{protection_key}/' if protection_key else '')
+
+ stream_base = get_cdn_shield_base()
+
+ formats = []
+ formats.extend(self._extract_m3u8_formats(
+ f'{stream_base}{q_acc}/files/{q_prefix}/{q_locator}/{q_acc}-{stream_data.get("qHEVCHash") or q_hash}.ism/manifest.m3u8',
+ video_id, 'mp4', m3u8_id=f'{cdn}-hls', fatal=False))
+ formats.extend(self._extract_mpd_formats(
+ f'{stream_base}{q_acc}/files/{q_prefix}/{q_locator}/{q_acc}-{q_hash}.ism/manifest.mpd',
+ video_id, mpd_id=f'{cdn}-dash', fatal=False))
+
+ progressive_base = get_cdn_shield_base('Prog')
+ q_references = stream_data.get('qReferences') or ''
+ fds = q_references.split(',')
+ for fd in fds:
+ ss = fd.split(':')
+ if len(ss) != 3:
+ continue
+ tbr = int_or_none(ss[1], scale=1000)
+ formats.append({
+ 'url': f'{progressive_base}{q_acc}/uploads/{q_acc}-{ss[2]}.webm',
+ 'format_id': f'{cdn}-{ss[0]}{"-%s" % tbr if tbr else ""}',
+ 'tbr': tbr,
+ })
+
+ azure_file_distribution = stream_data.get('azureFileDistribution') or ''
+ fds = azure_file_distribution.split(',')
+ for fd in fds:
+ ss = fd.split(':')
+ if len(ss) != 3:
+ continue
+ tbr = int_or_none(ss[0])
+ width, height = ss[1].split('x') if len(ss[1].split('x')) == 2 else (None, None)
+ f = {
+ 'url': f'{progressive_base}{q_acc}/files/{q_prefix}/{q_locator}/{ss[2]}.mp4',
+ 'format_id': f'{cdn}-http{"-%s" % tbr if tbr else ""}',
+ 'tbr': tbr,
+ 'width': int_or_none(width),
+ 'height': int_or_none(height),
+ }
+ formats.append(f)
+
+ return formats
+
def _extract_azure_formats(self, video, video_id):
stream_data = video['streamdata']
cdn = stream_data['cdnType']
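For context, the 3q azureFileDistribution handling added above parses a comma-separated
list of 'bitrate:WIDTHxHEIGHT:locator' triplets into progressive MP4 formats. A minimal
standalone sketch of that step (the sample string is invented, not a real API response):

    def parse_3q_distribution(distribution):
        # Each well-formed entry is assumed to look like '1500:1280x720:abc123',
        # mirroring the extractor code above; malformed entries are skipped
        for entry in (distribution or '').split(','):
            parts = entry.split(':')
            if len(parts) != 3:
                continue
            bitrate, resolution, locator = parts
            width, height = resolution.split('x') if resolution.count('x') == 1 else (None, None)
            yield {
                'tbr': int(bitrate) if bitrate.isdigit() else None,
                'width': int(width) if width and width.isdigit() else None,
                'height': int(height) if height and height.isdigit() else None,
                'locator': locator,
            }

    print(list(parse_3q_distribution('1500:1280x720:abc123,malformed')))
    # [{'tbr': 1500, 'width': 1280, 'height': 720, 'locator': 'abc123'}]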
@@ -345,10 +411,11 @@ class NexxIE(InfoExtractor):
# md5( operation + domain_id + domain_secret )
# where domain_secret is a static value that will be given by nexx.tv
# as per [1]. Here is how this "secret" is generated (reversed
- # from _play.api.init function, search for clienttoken). So it's
- # actually not static and not that much of a secret.
+ # from _play._factory.data.getDomainData function, search for
+ # domaintoken or enableAPIAccess). So it's actually not static
+ # and not that much of a secret.
# 1. https://nexxtvstorage.blob.core.windows.net/files/201610/27.pdf
- secret = result['device']['clienttoken'][int(device_id[0]):]
+ secret = result['device']['domaintoken'][int(device_id[0]):]
secret = secret[0:len(secret) - int(device_id[-1])]
op = 'byid'
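To make the token handshake above concrete: the "secret" is carved out of the
domaintoken by dropping int(device_id[0]) characters from the front and
int(device_id[-1]) characters from the back, and the request token is an MD5 over
operation + domain_id + secret. A standalone sketch with invented values (the real
domaintoken comes from the domain-data API response):

    import hashlib

    device_id = '4162709126'              # hypothetical
    domaintoken = 'aabbccddeeff00112233'  # hypothetical

    secret = domaintoken[int(device_id[0]):]
    secret = secret[:len(secret) - int(device_id[-1])]

    op, domain_id = 'byid', '741'
    request_token = hashlib.md5(
        ''.join((op, domain_id, secret)).encode('utf-8')).hexdigest()
    print(request_token)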
@@ -360,15 +427,18 @@ class NexxIE(InfoExtractor):
result = self._call_api(
domain_id, 'videos/%s/%s' % (op, video_id), video_id, data={
- 'additionalfields': 'language,channel,actors,studio,licenseby,slug,subtitle,teaser,description',
+ 'additionalfields': 'language,channel,format,licenseby,slug,fileversion,episode,season',
'addInteractionOptions': '1',
'addStatusDetails': '1',
'addStreamDetails': '1',
- 'addCaptions': '1',
+ 'addFeatures': '1',
+ # Caption format selection doesn't seem to be enforced?
+ 'addCaptions': 'vtt',
'addScenes': '1',
+ 'addChapters': '1',
'addHotSpots': '1',
+ 'addConnectedMedia': 'persons',
'addBumpers': '1',
- 'captionFormat': 'data',
}, headers={
'X-Request-CID': cid,
'X-Request-Token': request_token,
@@ -384,27 +454,48 @@ class NexxIE(InfoExtractor):
formats = self._extract_azure_formats(video, video_id)
elif cdn == 'free':
formats = self._extract_free_formats(video, video_id)
+ elif cdn == '3q':
+ formats = self._extract_3q_formats(video, video_id)
else:
self.raise_no_formats(f'{cdn} formats are currently not supported', video_id)
self._sort_formats(formats)
+ subtitles = {}
+ for sub in video.get('captiondata') or []:
+ if sub.get('data'):
+ subtitles.setdefault(sub.get('language', 'en'), []).append({
+ 'ext': 'srt',
+ 'data': '\n\n'.join(
+ f'{i + 1}\n{srt_subtitles_timecode(line["fromms"] / 1000)} --> {srt_subtitles_timecode(line["toms"] / 1000)}\n{line["caption"]}'
+ for i, line in enumerate(sub['data'])),
+ 'name': sub.get('language_long') or sub.get('title')
+ })
+ elif sub.get('url'):
+ subtitles.setdefault(sub.get('language', 'en'), []).append({
+ 'url': sub['url'],
+ 'ext': sub.get('format'),
+ 'name': sub.get('language_long') or sub.get('title')
+ })
+
return {
'id': video_id,
'title': title,
'alt_title': general.get('subtitle'),
'description': general.get('description'),
'release_year': int_or_none(general.get('year')),
- 'creator': general.get('studio') or general.get('studio_adref'),
+ 'creator': general.get('studio') or general.get('studio_adref') or None,
'thumbnail': try_get(
video, lambda x: x['imagedata']['thumb'], compat_str),
'duration': parse_duration(general.get('runtime')),
'timestamp': int_or_none(general.get('uploaded')),
- 'episode_number': int_or_none(try_get(
- video, lambda x: x['episodedata']['episode'])),
- 'season_number': int_or_none(try_get(
- video, lambda x: x['episodedata']['season'])),
+ 'episode_number': traverse_obj(
+ video, (('episodedata', 'general'), 'episode'), expected_type=int, get_all=False),
+ 'season_number': traverse_obj(
+ video, (('episodedata', 'general'), 'season'), expected_type=int, get_all=False),
+ 'cast': traverse_obj(video, ('connectedmedia', ..., 'title'), expected_type=str),
'formats': formats,
+ 'subtitles': subtitles,
}
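For reference, the caption branch added above assembles an SRT document from the API's
per-line fromms/toms values; srt_subtitles_timecode renders seconds as HH:MM:SS,mmm.
The same transformation in isolation (sample caption data is invented):

    def srt_timecode(seconds):
        # Same output shape as yt_dlp.utils.srt_subtitles_timecode
        return '%02d:%02d:%02d,%03d' % (
            seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000)

    lines = [{'fromms': 1000, 'toms': 2500, 'caption': 'Hello'},
             {'fromms': 3000, 'toms': 4000, 'caption': 'World'}]
    print('\n\n'.join(
        f'{i + 1}\n{srt_timecode(l["fromms"] / 1000)} --> {srt_timecode(l["toms"] / 1000)}\n{l["caption"]}'
        for i, l in enumerate(lines)))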
diff --git a/yt_dlp/extractor/nfb.py b/yt_dlp/extractor/nfb.py
new file mode 100644
index 000000000..a12e503de
--- /dev/null
+++ b/yt_dlp/extractor/nfb.py
@@ -0,0 +1,62 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class NFBIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?nfb\.ca/film/(?P<id>[^/?#&]+)'
+ _TESTS = [{
+ 'url': 'https://www.nfb.ca/film/trafficopter/',
+ 'info_dict': {
+ 'id': 'trafficopter',
+ 'ext': 'mp4',
+ 'title': 'Trafficopter',
+ 'description': 'md5:060228455eb85cf88785c41656776bc0',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'uploader': 'Barrie Howells',
+ 'release_year': 1972,
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage('https://www.nfb.ca/film/%s/' % video_id, video_id)
+
+ iframe = self._html_search_regex(
+ r'<[^>]+\bid=["\']player-iframe["\'][^>]*src=["\']([^"\']+)',
+ webpage, 'iframe', fatal=True)
+ if iframe.startswith('/'):
+ iframe = f'https://www.nfb.ca{iframe}'
+
+ player = self._download_webpage(iframe, video_id)
+
+ source = self._html_search_regex(
+ r'source:\s*\'([^\']+)',
+ player, 'source', fatal=True)
+
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(source, video_id, ext='mp4')
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': self._html_search_regex(
+ r'<[^>]+\bid=["\']titleHeader["\'][^>]*>\s*<h1[^>]*>\s*([^<]+?)\s*</h1>',
+ webpage, 'title', default=None),
+ 'description': self._html_search_regex(
+ r'<[^>]+\bid=["\']tabSynopsis["\'][^>]*>\s*<p[^>]*>\s*([^<]+)',
+ webpage, 'description', default=None),
+ 'thumbnail': self._html_search_regex(
+ r'poster:\s*\'([^\']+)',
+ player, 'thumbnail', default=None),
+ 'uploader': self._html_search_regex(
+ r'<[^>]+\bitemprop=["\']name["\'][^>]*>([^<]+)',
+ webpage, 'uploader', default=None),
+ 'release_year': int_or_none(self._html_search_regex(
+ r'<[^>]+\bitemprop=["\']datePublished["\'][^>]*>([^<]+)',
+ webpage, 'release_year', default=None)),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
diff --git a/yt_dlp/extractor/noodlemagazine.py b/yt_dlp/extractor/noodlemagazine.py
new file mode 100644
index 000000000..2f170bbfe
--- /dev/null
+++ b/yt_dlp/extractor/noodlemagazine.py
@@ -0,0 +1,67 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ parse_duration,
+ parse_count,
+ unified_strdate
+)
+
+
+class NoodleMagazineIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www|adult\.)?noodlemagazine\.com/watch/(?P<id>[0-9-_]+)'
+ _TEST = {
+ 'url': 'https://adult.noodlemagazine.com/watch/-67421364_456239604',
+ 'md5': '9e02aa763612929d0b4b850591a9248b',
+ 'info_dict': {
+ 'id': '-67421364_456239604',
+ 'title': 'Aria alexander manojob',
+ 'thumbnail': r're:^https://.*\.jpg',
+ 'ext': 'mp4',
+ 'duration': 903,
+ 'view_count': int,
+ 'like_count': int,
+ 'description': 'Aria alexander manojob',
+ 'tags': ['aria', 'alexander', 'manojob'],
+ 'upload_date': '20190218',
+ 'age_limit': 18
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ title = self._og_search_title(webpage)
+ duration = parse_duration(self._html_search_meta('video:duration', webpage, 'duration', default=None))
+ description = self._og_search_property('description', webpage, default='').replace(' watch online hight quality video', '')
+ tags = self._html_search_meta('video:tag', webpage, default='').split(', ')
+ view_count = parse_count(self._html_search_meta('ya:ovs:views_total', webpage, default=None))
+ like_count = parse_count(self._html_search_meta('ya:ovs:likes', webpage, default=None))
+ upload_date = unified_strdate(self._html_search_meta('ya:ovs:upload_date', webpage, default=''))
+
+ key = self._html_search_regex(rf'/{video_id}\?(?:.*&)?m=([^&"\'\s,]+)', webpage, 'key')
+ playlist_info = self._download_json(f'https://adult.noodlemagazine.com/playlist/{video_id}?m={key}', video_id)
+ thumbnail = self._og_search_property('image', webpage, default=None) or playlist_info.get('image')
+
+ formats = [{
+ 'url': source.get('file'),
+ 'quality': source.get('label'),
+ 'ext': source.get('type'),
+ } for source in playlist_info.get('sources') or []]
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'description': description,
+ 'tags': tags,
+ 'view_count': view_count,
+ 'like_count': like_count,
+ 'upload_date': upload_date,
+ 'age_limit': 18
+ }
diff --git a/yt_dlp/extractor/novaplay.py b/yt_dlp/extractor/novaplay.py
index 724986a06..bfb2c8751 100644
--- a/yt_dlp/extractor/novaplay.py
+++ b/yt_dlp/extractor/novaplay.py
@@ -41,9 +41,7 @@ class NovaPlayIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- video_props = self._parse_json(self._search_regex(
- r'<script\s?id=\"__NEXT_DATA__\"\s?type=\"application/json\">({.+})</script>',
- webpage, 'video_props'), video_id)['props']['pageProps']['video']
+ video_props = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['video']
m3u8_url = self._download_json(
f'https://nbg-api.fite.tv/api/v2/videos/{video_id}/streams',
video_id, headers={'x-flipps-user-agent': 'Flipps/75/9.7'})[0]['url']
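The change above swaps the hand-rolled __NEXT_DATA__ regex for the shared
_search_nextjs_data helper. Conceptually the helper does roughly this (simplified
sketch, not the exact yt-dlp implementation):

    import json
    import re

    def search_nextjs_data(webpage):
        # Next.js pages embed their server-rendered state in this script tag
        mobj = re.search(
            r'<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>(?P<json>[^<]+)</script>',
            webpage)
        return json.loads(mobj.group('json')) if mobj else None

    page = '<script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{"video":{}}}}</script>'
    print(search_nextjs_data(page)['props']['pageProps'])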
diff --git a/yt_dlp/extractor/odnoklassniki.py b/yt_dlp/extractor/odnoklassniki.py
index 42f210a9b..293f1aa60 100644
--- a/yt_dlp/extractor/odnoklassniki.py
+++ b/yt_dlp/extractor/odnoklassniki.py
@@ -35,6 +35,38 @@ class OdnoklassnikiIE(InfoExtractor):
(?P<id>[\d-]+)
'''
_TESTS = [{
+ 'note': 'Coub embedded',
+ 'url': 'http://ok.ru/video/1484130554189',
+ 'info_dict': {
+ 'id': '1keok9',
+ 'ext': 'mp4',
+ 'timestamp': 1545580896,
+ 'view_count': int,
+ 'thumbnail': 'https://coub-anubis-a.akamaized.net/coub_storage/coub/simple/cw_image/c5ac87553bd/608e806a1239c210ab692/1545580913_00026.jpg',
+ 'title': 'Народная забава',
+ 'uploader': 'Nevata',
+ 'upload_date': '20181223',
+ 'age_limit': 0,
+ 'uploader_id': 'nevata.s',
+ 'like_count': int,
+ 'duration': 8.08,
+ 'repost_count': int,
+ },
+ }, {
+ 'note': 'vk.com embedded',
+ 'url': 'https://ok.ru/video/3568183087575',
+ 'info_dict': {
+ 'id': '-165101755_456243749',
+ 'ext': 'mp4',
+ 'uploader_id': '-165101755',
+ 'duration': 132,
+ 'timestamp': 1642869935,
+ 'upload_date': '20220122',
+ 'thumbnail': str,
+ 'title': str,
+ 'uploader': str,
+ },
+ }, {
# metadata in JSON
'url': 'http://ok.ru/video/20079905452',
'md5': '0b62089b479e06681abaaca9d204f152',
@@ -171,6 +203,10 @@ class OdnoklassnikiIE(InfoExtractor):
webpage, 'player', group='player')),
video_id)
+ # embedded external player
+ if player.get('isExternalPlayer') and player.get('url'):
+ return self.url_result(player['url'])
+
flashvars = player['flashvars']
metadata = flashvars.get('metadata')
@@ -226,6 +262,14 @@ class OdnoklassnikiIE(InfoExtractor):
'start_time': start_time,
}
+ # pladform
+ if provider == 'OPEN_GRAPH':
+ info.update({
+ '_type': 'url_transparent',
+ 'url': movie['contentId'],
+ })
+ return info
+
if provider == 'USER_YOUTUBE':
info.update({
'_type': 'url_transparent',
diff --git a/yt_dlp/extractor/onet.py b/yt_dlp/extractor/onet.py
index bf53ea0b0..95177a213 100644
--- a/yt_dlp/extractor/onet.py
+++ b/yt_dlp/extractor/onet.py
@@ -182,14 +182,9 @@ class OnetChannelIE(OnetBaseIE):
video_id = remove_start(current_clip_info['ckmId'], 'mvp:')
video_name = url_basename(current_clip_info['url'])
- if self.get_param('noplaylist'):
- self.to_screen(
- 'Downloading just video %s because of --no-playlist' % video_name)
+ if not self._yes_playlist(channel_id, video_name, playlist_label='channel'):
return self._extract_from_id(video_id, webpage)
- self.to_screen(
- 'Downloading channel %s - add --no-playlist to just download video %s' % (
- channel_id, video_name))
matches = re.findall(
r'<a[^>]+href=[\'"](%s[a-z]+/[0-9a-z-]+/[0-9a-z]+)' % self._URL_BASE_RE,
webpage)
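The onet change above folds the duplicated --no-playlist messaging into _yes_playlist,
which makes the decision in one place. Its observable behaviour is roughly the
following (simplified sketch; the real helper also honours smuggled force_noplaylist
data and prints via the extractor's to_screen):

    def yes_playlist(playlist_id, video_id, noplaylist=False, playlist_label='playlist'):
        # True means the caller should extract the whole playlist/channel
        if playlist_id and video_id and noplaylist:
            print(f'Downloading just video {video_id} because of --no-playlist')
            return False
        if playlist_id:
            print(f'Downloading {playlist_label} {playlist_id} - add --no-playlist to download just video {video_id}')
        return True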
diff --git a/yt_dlp/extractor/openrec.py b/yt_dlp/extractor/openrec.py
index d7073ab44..7cdc7d17c 100644
--- a/yt_dlp/extractor/openrec.py
+++ b/yt_dlp/extractor/openrec.py
@@ -4,51 +4,41 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
ExtractorError,
+ int_or_none,
traverse_obj,
- try_get,
- unified_strdate
+ unified_strdate,
+ unified_timestamp
)
from ..compat import compat_str
-class OpenRecIE(InfoExtractor):
- IE_NAME = 'openrec'
- _VALID_URL = r'https?://(?:www\.)?openrec\.tv/live/(?P<id>[^/]+)'
- _TESTS = [{
- 'url': 'https://www.openrec.tv/live/2p8v31qe4zy',
- 'only_matching': True,
- }, {
- 'url': 'https://www.openrec.tv/live/wez93eqvjzl',
- 'only_matching': True,
- }]
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage('https://www.openrec.tv/live/%s' % video_id, video_id)
-
- window_stores = self._parse_json(
+class OpenRecBaseIE(InfoExtractor):
+ def _extract_pagestore(self, webpage, video_id):
+ return self._parse_json(
self._search_regex(r'(?m)window\.pageStore\s*=\s*(\{.+?\});$', webpage, 'window.pageStore'), video_id)
+
+ def _extract_movie(self, webpage, video_id, name, is_live):
+ window_stores = self._extract_pagestore(webpage, video_id)
movie_store = traverse_obj(
window_stores,
('v8', 'state', 'movie'),
('v8', 'movie'),
expected_type=dict)
if not movie_store:
- raise ExtractorError('Failed to extract live info')
+ raise ExtractorError(f'Failed to extract {name} info')
title = movie_store.get('title')
description = movie_store.get('introduction')
thumbnail = movie_store.get('thumbnailUrl')
- channel_user = movie_store.get('channel', {}).get('user')
- uploader = try_get(channel_user, lambda x: x['name'], compat_str)
- uploader_id = try_get(channel_user, lambda x: x['id'], compat_str)
+ uploader = traverse_obj(movie_store, ('channel', 'user', 'name'), expected_type=compat_str)
+ uploader_id = traverse_obj(movie_store, ('channel', 'user', 'id'), expected_type=compat_str)
- timestamp = traverse_obj(movie_store, ('startedAt', 'time'), expected_type=int)
+ timestamp = int_or_none(traverse_obj(movie_store, ('publishedAt', 'time')), scale=1000)
- m3u8_playlists = movie_store.get('media')
+ m3u8_playlists = movie_store.get('media') or {}
formats = []
- for (name, m3u8_url) in m3u8_playlists.items():
+ for name, m3u8_url in m3u8_playlists.items():
if not m3u8_url:
continue
formats.extend(self._extract_m3u8_formats(
@@ -66,11 +56,29 @@ class OpenRecIE(InfoExtractor):
'uploader': uploader,
'uploader_id': uploader_id,
'timestamp': timestamp,
- 'is_live': True,
+ 'is_live': is_live,
}
-class OpenRecCaptureIE(InfoExtractor):
+class OpenRecIE(OpenRecBaseIE):
+ IE_NAME = 'openrec'
+ _VALID_URL = r'https?://(?:www\.)?openrec\.tv/live/(?P<id>[^/]+)'
+ _TESTS = [{
+ 'url': 'https://www.openrec.tv/live/2p8v31qe4zy',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.openrec.tv/live/wez93eqvjzl',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage('https://www.openrec.tv/live/%s' % video_id, video_id)
+
+ return self._extract_movie(webpage, video_id, 'live', True)
+
+
+class OpenRecCaptureIE(OpenRecBaseIE):
IE_NAME = 'openrec:capture'
_VALID_URL = r'https?://(?:www\.)?openrec\.tv/capture/(?P<id>[^/]+)'
_TESTS = [{
@@ -91,8 +99,7 @@ class OpenRecCaptureIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage('https://www.openrec.tv/capture/%s' % video_id, video_id)
- window_stores = self._parse_json(
- self._search_regex(r'(?m)window\.pageStore\s*=\s*(\{.+?\});$', webpage, 'window.pageStore'), video_id)
+ window_stores = self._extract_pagestore(webpage, video_id)
movie_store = window_stores.get('movie')
capture_data = window_stores.get('capture')
@@ -102,17 +109,14 @@ class OpenRecCaptureIE(InfoExtractor):
thumbnail = capture_data.get('thumbnailUrl')
upload_date = unified_strdate(capture_data.get('createdAt'))
- channel_info = movie_store.get('channel') or {}
- uploader = channel_info.get('name')
- uploader_id = channel_info.get('id')
+ uploader = traverse_obj(movie_store, ('channel', 'name'), expected_type=compat_str)
+ uploader_id = traverse_obj(movie_store, ('channel', 'id'), expected_type=compat_str)
- m3u8_url = capture_data.get('source')
- if not m3u8_url:
- raise ExtractorError('Cannot extract m3u8 url')
- formats = self._extract_m3u8_formats(
- m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native',
- m3u8_id='hls')
+ timestamp = traverse_obj(movie_store, 'createdAt', expected_type=compat_str)
+ timestamp = unified_timestamp(timestamp)
+ formats = self._extract_m3u8_formats(
+ capture_data.get('source'), video_id, ext='mp4')
self._sort_formats(formats)
return {
@@ -120,7 +124,31 @@ class OpenRecCaptureIE(InfoExtractor):
'title': title,
'thumbnail': thumbnail,
'formats': formats,
+ 'timestamp': timestamp,
'uploader': uploader,
'uploader_id': uploader_id,
'upload_date': upload_date,
}
+
+
+class OpenRecMovieIE(OpenRecBaseIE):
+ IE_NAME = 'openrec:movie'
+ _VALID_URL = r'https?://(?:www\.)?openrec\.tv/movie/(?P<id>[^/]+)'
+ _TESTS = [{
+ 'url': 'https://www.openrec.tv/movie/nqz5xl5km8v',
+ 'info_dict': {
+ 'id': 'nqz5xl5km8v',
+ 'title': '限定コミュニティ(Discord)参加方法ご説明動画',
+ 'description': 'md5:ebd563e5f5b060cda2f02bf26b14d87f',
+ 'thumbnail': r're:https://.+',
+ 'uploader': 'タイキとカズヒロ',
+ 'uploader_id': 'taiki_to_kazuhiro',
+ 'timestamp': 1638856800,
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage('https://www.openrec.tv/movie/%s' % video_id, video_id)
+
+ return self._extract_movie(webpage, video_id, 'movie', False)
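The refactor above relies on traverse_obj trying alternative paths in order,
('v8', 'state', 'movie') first and then ('v8', 'movie'), instead of chained try_get
lambdas. A small illustration of that fallback (assuming a yt-dlp checkout on the
import path):

    from yt_dlp.utils import traverse_obj

    new_layout = {'v8': {'state': {'movie': {'title': 'a'}}}}
    old_layout = {'v8': {'movie': {'title': 'b'}}}

    for stores in (new_layout, old_layout):
        # The first path that yields a value wins
        print(traverse_obj(stores, ('v8', 'state', 'movie'), ('v8', 'movie'),
                           expected_type=dict))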
diff --git a/yt_dlp/extractor/orf.py b/yt_dlp/extractor/orf.py
index e2b703880..0628977a0 100644
--- a/yt_dlp/extractor/orf.py
+++ b/yt_dlp/extractor/orf.py
@@ -1,23 +1,26 @@
# coding: utf-8
from __future__ import unicode_literals
+import functools
import re
from .common import InfoExtractor
-from ..compat import compat_str
from ..utils import (
clean_html,
determine_ext,
float_or_none,
HEADRequest,
+ InAdvancePagedList,
int_or_none,
join_nonempty,
orderedSet,
remove_end,
+ smuggle_url,
str_or_none,
strip_jsonp,
unescapeHTML,
unified_strdate,
+ unsmuggle_url,
url_or_none,
)
@@ -25,9 +28,40 @@ from ..utils import (
class ORFTVthekIE(InfoExtractor):
IE_NAME = 'orf:tvthek'
IE_DESC = 'ORF TVthek'
- _VALID_URL = r'https?://tvthek\.orf\.at/(?:[^/]+/)+(?P<id>\d+)'
+ _VALID_URL = r'(?P<url>https?://tvthek\.orf\.at/(?:(?:[^/]+/){2}){1,2}(?P<id>\d+))(/[^/]+/(?P<vid>\d+))?(?:$|[?#])'
_TESTS = [{
+ 'url': 'https://tvthek.orf.at/profile/ZIB-2/1211/ZIB-2/14121079',
+ 'info_dict': {
+ 'id': '14121079',
+ },
+ 'playlist_count': 11,
+ 'params': {'noplaylist': True}
+ }, {
+ 'url': 'https://tvthek.orf.at/profile/ZIB-2/1211/ZIB-2/14121079/Umfrage-Welches-Tier-ist-Sebastian-Kurz/15083150',
+ 'info_dict': {
+ 'id': '14121079',
+ },
+ 'playlist_count': 1,
+ 'params': {'playlist_items': '5'}
+ }, {
+ 'url': 'https://tvthek.orf.at/profile/ZIB-2/1211/ZIB-2/14121079/Umfrage-Welches-Tier-ist-Sebastian-Kurz/15083150',
+ 'info_dict': {
+ 'id': '14121079',
+ 'playlist_count': 1
+ },
+ 'playlist': [{
+ 'info_dict': {
+ 'id': '15083150',
+ 'ext': 'mp4',
+ 'description': 'md5:7be1c485425f5f255a5e4e4815e77d04',
+ 'thumbnail': 'https://api-tvthek.orf.at/uploads/media/segments/0130/59/824271ea35cd8931a0fb08ab316a5b0a1562342c.jpeg',
+ 'title': 'Umfrage: Welches Tier ist Sebastian Kurz?',
+ }
+ }],
+ 'playlist_count': 1,
+ 'params': {'noplaylist': True, 'skip_download': 'm3u8'}
+ }, {
'url': 'http://tvthek.orf.at/program/Aufgetischt/2745173/Aufgetischt-Mit-der-Steirischen-Tafelrunde/8891389',
'playlist': [{
'md5': '2942210346ed779588f428a92db88712',
@@ -62,8 +96,90 @@ class ORFTVthekIE(InfoExtractor):
'only_matching': True,
}]
+ def _pagefunc(self, url, data_jsb, n, *, image=None):
+ sd = data_jsb[n]
+ video_id, title = str(sd['id']), sd['title']
+ formats = []
+ for fd in sd['sources']:
+ src = url_or_none(fd.get('src'))
+ if not src:
+ continue
+ format_id = join_nonempty('delivery', 'quality', 'quality_string', from_dict=fd)
+ ext = determine_ext(src)
+ if ext == 'm3u8':
+ m3u8_formats = self._extract_m3u8_formats(
+ src, video_id, 'mp4', m3u8_id=format_id, fatal=False, note=f'Downloading {format_id} m3u8 manifest')
+ if any('/geoprotection' in f['url'] for f in m3u8_formats):
+ self.raise_geo_restricted()
+ formats.extend(m3u8_formats)
+ elif ext == 'f4m':
+ formats.extend(self._extract_f4m_formats(
+ src, video_id, f4m_id=format_id, fatal=False))
+ elif ext == 'mpd':
+ formats.extend(self._extract_mpd_formats(
+ src, video_id, mpd_id=format_id, fatal=False, note=f'Downloading {format_id} mpd manifest'))
+ else:
+ formats.append({
+ 'format_id': format_id,
+ 'url': src,
+ 'protocol': fd.get('protocol'),
+ })
+
+ # Check for geoblocking.
+ # There is a property is_geoprotection, but that's always false
+ geo_str = sd.get('geoprotection_string')
+ http_url = next(
+ (f['url'] for f in formats if re.match(r'^https?://.*\.mp4$', f['url'])),
+ None) if geo_str else None
+ if http_url:
+ self._request_webpage(
+ HEADRequest(http_url), video_id, fatal=False, note='Testing for geoblocking',
+ errnote=f'This video seems to be blocked outside of {geo_str}. You may want to try the streaming-* formats')
+
+ self._sort_formats(formats)
+
+ subtitles = {}
+ for sub in sd.get('subtitles', []):
+ sub_src = sub.get('src')
+ if not sub_src:
+ continue
+ subtitles.setdefault(sub.get('lang', 'de-AT'), []).append({
+ 'url': sub_src,
+ })
+
+ upload_date = unified_strdate(sd.get('created_date'))
+
+ thumbnails = []
+ preview = sd.get('preview_image_url')
+ if preview:
+ thumbnails.append({
+ 'id': 'preview',
+ 'url': preview,
+ 'preference': 0,
+ })
+ image = sd.get('image_full_url') or image
+ if image:
+ thumbnails.append({
+ 'id': 'full',
+ 'url': image,
+ 'preference': 1,
+ })
+
+ yield {
+ 'id': video_id,
+ 'title': title,
+ 'webpage_url': smuggle_url(f'{url}/part/{video_id}', {'force_noplaylist': True}),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'description': sd.get('description'),
+ 'duration': int_or_none(sd.get('duration_in_seconds')),
+ 'upload_date': upload_date,
+ 'thumbnails': thumbnails,
+ }
+
def _real_extract(self, url):
- playlist_id = self._match_id(url)
+ url, smuggled_data = unsmuggle_url(url)
+ playlist_id, video_id, base_url = self._match_valid_url(url).group('id', 'vid', 'url')
webpage = self._download_webpage(url, playlist_id)
data_jsb = self._parse_json(
@@ -72,107 +188,16 @@ class ORFTVthekIE(InfoExtractor):
webpage, 'playlist', group='json'),
playlist_id, transform_source=unescapeHTML)['playlist']['videos']
- entries = []
- for sd in data_jsb:
- video_id, title = sd.get('id'), sd.get('title')
- if not video_id or not title:
- continue
- video_id = compat_str(video_id)
- formats = []
- for fd in sd['sources']:
- src = url_or_none(fd.get('src'))
- if not src:
- continue
- format_id = join_nonempty('delivery', 'quality', 'quality_string', from_dict=fd)
- ext = determine_ext(src)
- if ext == 'm3u8':
- m3u8_formats = self._extract_m3u8_formats(
- src, video_id, 'mp4', m3u8_id=format_id, fatal=False)
- if any('/geoprotection' in f['url'] for f in m3u8_formats):
- self.raise_geo_restricted()
- formats.extend(m3u8_formats)
- elif ext == 'f4m':
- formats.extend(self._extract_f4m_formats(
- src, video_id, f4m_id=format_id, fatal=False))
- elif ext == 'mpd':
- formats.extend(self._extract_mpd_formats(
- src, video_id, mpd_id=format_id, fatal=False))
- else:
- formats.append({
- 'format_id': format_id,
- 'url': src,
- 'protocol': fd.get('protocol'),
- })
+ if not self._yes_playlist(playlist_id, video_id, smuggled_data):
+ data_jsb = [sd for sd in data_jsb if str(sd.get('id')) == video_id]
- # Check for geoblocking.
- # There is a property is_geoprotection, but that's always false
- geo_str = sd.get('geoprotection_string')
- if geo_str:
- try:
- http_url = next(
- f['url']
- for f in formats
- if re.match(r'^https?://.*\.mp4$', f['url']))
- except StopIteration:
- pass
- else:
- req = HEADRequest(http_url)
- self._request_webpage(
- req, video_id,
- note='Testing for geoblocking',
- errnote=((
- 'This video seems to be blocked outside of %s. '
- 'You may want to try the streaming-* formats.')
- % geo_str),
- fatal=False)
-
- self._check_formats(formats, video_id)
- self._sort_formats(formats)
-
- subtitles = {}
- for sub in sd.get('subtitles', []):
- sub_src = sub.get('src')
- if not sub_src:
- continue
- subtitles.setdefault(sub.get('lang', 'de-AT'), []).append({
- 'url': sub_src,
- })
-
- upload_date = unified_strdate(sd.get('created_date'))
-
- thumbnails = []
- preview = sd.get('preview_image_url')
- if preview:
- thumbnails.append({
- 'id': 'preview',
- 'url': preview,
- 'preference': 0,
- })
- image = sd.get('image_full_url')
- if not image and len(data_jsb) == 1:
- image = self._og_search_thumbnail(webpage)
- if image:
- thumbnails.append({
- 'id': 'full',
- 'url': image,
- 'preference': 1,
- })
-
- entries.append({
- '_type': 'video',
- 'id': video_id,
- 'title': title,
- 'formats': formats,
- 'subtitles': subtitles,
- 'description': sd.get('description'),
- 'duration': int_or_none(sd.get('duration_in_seconds')),
- 'upload_date': upload_date,
- 'thumbnails': thumbnails,
- })
+ playlist_count = len(data_jsb)
+ image = self._og_search_thumbnail(webpage) if playlist_count == 1 else None
+ page_func = functools.partial(self._pagefunc, base_url, data_jsb, image=image)
return {
'_type': 'playlist',
- 'entries': entries,
+ 'entries': InAdvancePagedList(page_func, playlist_count, 1),
'id': playlist_id,
}
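The net effect of the ORF rewrite above is laziness: with
InAdvancePagedList(page_func, playlist_count, 1) each segment becomes its own one-entry
page, so format extraction and the geoblocking probe only run for entries that are
actually consumed. A toy illustration (assuming a yt-dlp checkout on the import path):

    from yt_dlp.utils import InAdvancePagedList

    def page_func(n):
        print(f'building page {n}')  # runs lazily, once per requested page
        yield {'id': str(n)}

    pages = InAdvancePagedList(page_func, 5, 1)
    print(pages.getslice(0, 2))  # only pages 0 and 1 are built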
diff --git a/yt_dlp/extractor/pbs.py b/yt_dlp/extractor/pbs.py
index ffaa6bf92..e48a2b8e0 100644
--- a/yt_dlp/extractor/pbs.py
+++ b/yt_dlp/extractor/pbs.py
@@ -545,7 +545,7 @@ class PBSIE(InfoExtractor):
for vid_id in video_id]
return self.playlist_result(entries, display_id)
- info = None
+ info = {}
redirects = []
redirect_urls = set()
@@ -660,6 +660,9 @@ class PBSIE(InfoExtractor):
'protocol': 'http',
})
formats.append(f)
+ for f in formats:
+ if (f.get('format_note') or '').endswith(' AD'): # Audio description
+ f['language_preference'] = -10
self._sort_formats(formats)
rating_str = info.get('rating')
diff --git a/yt_dlp/extractor/peertube.py b/yt_dlp/extractor/peertube.py
index 1e22f24e3..e0b2ab982 100644
--- a/yt_dlp/extractor/peertube.py
+++ b/yt_dlp/extractor/peertube.py
@@ -7,6 +7,7 @@ import re
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
+ format_field,
int_or_none,
parse_resolution,
str_or_none,
@@ -1386,8 +1387,7 @@ class PeerTubePlaylistIE(InfoExtractor):
playlist_timestamp = unified_timestamp(info.get('createdAt'))
channel = try_get(info, lambda x: x['ownerAccount']['name']) or info.get('displayName')
channel_id = try_get(info, lambda x: x['ownerAccount']['id']) or info.get('id')
- thumbnail = info.get('thumbnailPath')
- thumbnail = f'https://{host}{thumbnail}' if thumbnail else None
+ thumbnail = format_field(info, 'thumbnailPath', f'https://{host}%s')
entries = OnDemandPagedList(functools.partial(
self.fetch_page, host, id, type), self._PAGE_SIZE)
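The peertube change (and the matching pornhub/radlive ones below) replaces a
None-guarding ternary with format_field, which only applies the template when the
value is actually present. A simplified sketch of its behaviour (the real helper lives
in yt_dlp.utils and takes a few more options):

    def format_field(obj, field=None, template='%s', ignore=(None, ''), default=''):
        value = obj.get(field) if field is not None else obj
        return template % value if value not in ignore else default

    print(format_field({'thumbnailPath': '/thumb.jpg'}, 'thumbnailPath',
                       'https://example.org%s'))  # https://example.org/thumb.jpg
    print(format_field(1080, template='%dp'))     # 1080p
    print(format_field(None, template='%dp'))     # '' (template skipped)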
diff --git a/yt_dlp/extractor/pladform.py b/yt_dlp/extractor/pladform.py
index dc2030017..99ade85ec 100644
--- a/yt_dlp/extractor/pladform.py
+++ b/yt_dlp/extractor/pladform.py
@@ -28,6 +28,24 @@ class PladformIE(InfoExtractor):
(?P<id>\d+)
'''
_TESTS = [{
+ 'url': 'http://out.pladform.ru/player?pl=18079&type=html5&videoid=100231282',
+ 'info_dict': {
+ 'id': '6216d548e755edae6e8280667d774791',
+ 'ext': 'mp4',
+ 'timestamp': 1406117012,
+ 'title': 'Гарик Мартиросян и Гарик Харламов - Кастинг на концерт ко Дню милиции',
+ 'age_limit': 0,
+ 'upload_date': '20140723',
+ 'thumbnail': str,
+ 'view_count': int,
+ 'description': str,
+ 'category': list,
+ 'uploader_id': '12082',
+ 'uploader': 'Comedy Club',
+ 'duration': 367,
+ },
+ 'expected_warnings': ['HTTP Error 404: Not Found']
+ }, {
'url': 'https://out.pladform.ru/player?pl=64471&videoid=3777899&vk_puid15=0&vk_puid34=0',
'md5': '53362fac3a27352da20fa2803cc5cd6f',
'info_dict': {
@@ -63,13 +81,19 @@ class PladformIE(InfoExtractor):
'http://out.pladform.ru/getVideo', video_id, query={
'pl': pl,
'videoid': video_id,
- })
+ }, fatal=False)
def fail(text):
raise ExtractorError(
'%s returned error: %s' % (self.IE_NAME, text),
expected=True)
+ if not video:
+ target_url = self._request_webpage(url, video_id, note='Resolving final URL').geturl()
+ if target_url == url:
+ raise ExtractorError('Can\'t parse page')
+ return self.url_result(target_url)
+
if video.tag == 'error':
fail(video.text)
diff --git a/yt_dlp/extractor/pokergo.py b/yt_dlp/extractor/pokergo.py
new file mode 100644
index 000000000..d27031c91
--- /dev/null
+++ b/yt_dlp/extractor/pokergo.py
@@ -0,0 +1,111 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import base64
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ try_get,
+)
+
+
+class PokerGoBaseIE(InfoExtractor):
+ _NETRC_MACHINE = 'pokergo'
+ _AUTH_TOKEN = None
+ _PROPERTY_ID = '1dfb3940-7d53-4980-b0b0-f28b369a000d'
+
+ def _login(self):
+ username, password = self._get_login_info()
+ if not username:
+ self.raise_login_required(method='password')
+
+ self.report_login()
+ PokerGoBaseIE._AUTH_TOKEN = self._download_json(
+ f'https://subscription.pokergo.com/properties/{self._PROPERTY_ID}/sign-in', None,
+ headers={'authorization': f'Basic {base64.b64encode(f"{username}:{password}".encode()).decode()}'},
+ data=b'')['meta']['token']
+ if not self._AUTH_TOKEN:
+ raise ExtractorError('Unable to get Auth Token.', expected=True)
+
+ def _real_initialize(self):
+ if not self._AUTH_TOKEN:
+ self._login()
+
+
+class PokerGoIE(PokerGoBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?pokergo\.com/videos/(?P<id>[^&$#/?]+)'
+
+ _TESTS = [{
+ 'url': 'https://www.pokergo.com/videos/2a70ec4e-4a80-414b-97ec-725d9b72a7dc',
+ 'info_dict': {
+ 'id': 'aVLOxDzY',
+ 'ext': 'mp4',
+ 'title': 'Poker After Dark | Season 12 (2020) | Cry Me a River | Episode 2',
+ 'description': 'md5:c7a8c29556cbfb6eb3c0d5d622251b71',
+ 'thumbnail': 'https://cdn.jwplayer.com/v2/media/aVLOxDzY/poster.jpg?width=720',
+ 'timestamp': 1608085715,
+ 'duration': 2700.12,
+ 'season_number': 12,
+ 'episode_number': 2,
+ 'series': 'poker after dark',
+ 'upload_date': '20201216',
+ 'season': 'Season 12',
+ 'episode': 'Episode 2',
+ 'display_id': '2a70ec4e-4a80-414b-97ec-725d9b72a7dc',
+ },
+ 'params': {'skip_download': True}
+ }]
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ data_json = self._download_json(f'https://api.pokergo.com/v2/properties/{self._PROPERTY_ID}/videos/{id}', id,
+ headers={'authorization': f'Bearer {self._AUTH_TOKEN}'})['data']
+ v_id = data_json['source']
+
+ thumbnails = [{
+ 'url': image['url'],
+ 'id': image.get('label'),
+ 'width': image.get('width'),
+ 'height': image.get('height')
+ } for image in data_json.get('images') or [] if image.get('url')]
+ series_json = next((dct for dct in data_json.get('show_tags') or [] if dct.get('video_id') == id), None) or {}
+
+ return {
+ '_type': 'url_transparent',
+ 'display_id': id,
+ 'title': data_json.get('title'),
+ 'description': data_json.get('description'),
+ 'duration': data_json.get('duration'),
+ 'thumbnails': thumbnails,
+ 'season_number': series_json.get('season'),
+ 'episode_number': series_json.get('episode_number'),
+ 'series': try_get(series_json, lambda x: x['tag']['name']),
+ 'url': f'https://cdn.jwplayer.com/v2/media/{v_id}'
+ }
+
+
+class PokerGoCollectionIE(PokerGoBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?pokergo\.com/collections/(?P<id>[^&$#/?]+)'
+
+ _TESTS = [{
+ 'url': 'https://www.pokergo.com/collections/19ffe481-5dae-481a-8869-75cc0e3c4700',
+ 'playlist_mincount': 13,
+ 'info_dict': {
+ 'id': '19ffe481-5dae-481a-8869-75cc0e3c4700',
+ },
+ }]
+
+ def _entries(self, id):
+ data_json = self._download_json(f'https://api.pokergo.com/v2/properties/{self._PROPERTY_ID}/collections/{id}?include=entities',
+ id, headers={'authorization': f'Bearer {self._AUTH_TOKEN}'})['data']
+ for video in data_json.get('collection_video') or []:
+ video_id = video.get('id')
+ if video_id:
+ yield self.url_result(
+ f'https://www.pokergo.com/videos/{video_id}',
+ ie=PokerGoIE.ie_key(), video_id=video_id)
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ return self.playlist_result(self._entries(id), playlist_id=id)
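One detail worth noting in the sign-in above: the credentials travel as a standard
HTTP Basic authorization header, and every later API call switches to the Bearer token
returned in the response. The header construction on its own, with placeholder
credentials:

    import base64

    username, password = 'user@example.com', 'hunter2'  # placeholders
    auth = base64.b64encode(f'{username}:{password}'.encode()).decode()
    headers = {'authorization': f'Basic {auth}'}
    print(headers)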
diff --git a/yt_dlp/extractor/pornez.py b/yt_dlp/extractor/pornez.py
new file mode 100644
index 000000000..713dc0080
--- /dev/null
+++ b/yt_dlp/extractor/pornez.py
@@ -0,0 +1,43 @@
+# coding: utf-8
+from __future__ import unicode_literals
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class PornezIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?pornez\.net/video(?P<id>[0-9]+)/'
+ _TEST = {
+ 'url': 'https://pornez.net/video344819/mistresst-funny_penis_names-wmv/',
+ 'md5': '2e19a0a1cff3a5dbea0ef1b9e80bcbbc',
+ 'info_dict': {
+ 'id': '344819',
+ 'ext': 'mp4',
+ 'title': r'mistresst funny_penis_names wmv',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'age_limit': 18,
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ iframe_src = self._html_search_regex(
+ r'<iframe[^>]+src="(https?://pornez\.net/player/\?[^"]+)"', webpage, 'iframe', fatal=True)
+ title = self._html_search_meta(['name', 'twitter:title', 'og:title'], webpage, 'title', default=None)
+ if title is None:
+ title = self._search_regex(r'<h1>(.*?)</h1>', webpage, 'title', fatal=True)
+ thumbnail = self._html_search_meta(['thumbnailUrl'], webpage, 'thumbnail', default=None)
+ webpage = self._download_webpage(iframe_src, video_id)
+ entries = self._parse_html5_media_entries(iframe_src, webpage, video_id)[0]
+ for fmt in entries['formats']:
+ height = self._search_regex(r'_(\d+)\.m3u8', fmt['url'], 'height')
+ fmt['format_id'] = '%sp' % height
+ fmt['height'] = int_or_none(height)
+
+ entries.update({
+ 'id': video_id,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'age_limit': 18
+ })
+ return entries
diff --git a/yt_dlp/extractor/pornhub.py b/yt_dlp/extractor/pornhub.py
index 4357c79df..17c8c9100 100644
--- a/yt_dlp/extractor/pornhub.py
+++ b/yt_dlp/extractor/pornhub.py
@@ -18,6 +18,7 @@ from ..utils import (
clean_html,
determine_ext,
ExtractorError,
+ format_field,
int_or_none,
merge_dicts,
NO_DEFAULT,
@@ -32,7 +33,7 @@ from ..utils import (
class PornHubBaseIE(InfoExtractor):
_NETRC_MACHINE = 'pornhub'
- _PORNHUB_HOST_RE = r'(?:(?P<host>pornhub(?:premium)?\.(?:com|net|org))|pornhubthbh7ap3u\.onion)'
+ _PORNHUB_HOST_RE = r'(?:(?P<host>pornhub(?:premium)?\.(?:com|net|org))|pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd\.onion)'
def _download_webpage_handle(self, *args, **kwargs):
def dl(*args, **kwargs):
@@ -247,7 +248,7 @@ class PornHubIE(PornHubBaseIE):
'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5a9813bfa7156',
'only_matching': True,
}, {
- 'url': 'http://pornhubthbh7ap3u.onion/view_video.php?viewkey=ph5a9813bfa7156',
+ 'url': 'http://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/view_video.php?viewkey=ph5a9813bfa7156',
'only_matching': True,
}]
@@ -431,7 +432,7 @@ class PornHubIE(PornHubBaseIE):
default=None))
formats.append({
'url': format_url,
- 'format_id': '%dp' % height if height else None,
+ 'format_id': format_field(height, template='%dp'),
'height': height,
})
@@ -561,7 +562,7 @@ class PornHubUserIE(PornHubPlaylistBaseIE):
'url': 'https://www.pornhubpremium.com/pornstar/lily-labeau',
'only_matching': True,
}, {
- 'url': 'https://pornhubthbh7ap3u.onion/model/zoe_ph',
+ 'url': 'https://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/model/zoe_ph',
'only_matching': True,
}]
@@ -732,7 +733,7 @@ class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE):
'url': 'https://www.pornhub.com/video/incategories/60fps-1/hd-porn',
'only_matching': True,
}, {
- 'url': 'https://pornhubthbh7ap3u.onion/model/zoe_ph/videos',
+ 'url': 'https://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/model/zoe_ph/videos',
'only_matching': True,
}]
@@ -755,7 +756,7 @@ class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE):
'url': 'https://www.pornhub.com/model/zoe_ph/videos/upload',
'only_matching': True,
}, {
- 'url': 'http://pornhubthbh7ap3u.onion/pornstar/jenny-blighe/videos/upload',
+ 'url': 'http://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/pornstar/jenny-blighe/videos/upload',
'only_matching': True,
}]
diff --git a/yt_dlp/extractor/prx.py b/yt_dlp/extractor/prx.py
new file mode 100644
index 000000000..80561b80a
--- /dev/null
+++ b/yt_dlp/extractor/prx.py
@@ -0,0 +1,431 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import itertools
+from .common import InfoExtractor, SearchInfoExtractor
+from ..utils import (
+ urljoin,
+ traverse_obj,
+ int_or_none,
+ mimetype2ext,
+ clean_html,
+ url_or_none,
+ unified_timestamp,
+ str_or_none,
+)
+
+
+class PRXBaseIE(InfoExtractor):
+ PRX_BASE_URL_RE = r'https?://(?:(?:beta|listen)\.)?prx\.org/%s'
+
+ def _call_api(self, item_id, path, query=None, fatal=True, note='Downloading CMS API JSON'):
+ return self._download_json(
+ urljoin('https://cms.prx.org/api/v1/', path), item_id, query=query, fatal=fatal, note=note)
+
+ @staticmethod
+ def _get_prx_embed_response(response, section):
+ return traverse_obj(response, ('_embedded', f'prx:{section}'))
+
+ @staticmethod
+ def _extract_file_link(response):
+ return url_or_none(traverse_obj(
+ response, ('_links', 'enclosure', 'href'), expected_type=str))
+
+ @classmethod
+ def _extract_image(cls, image_response):
+ if not isinstance(image_response, dict):
+ return
+ return {
+ 'id': str_or_none(image_response.get('id')),
+ 'filesize': image_response.get('size'),
+ 'width': image_response.get('width'),
+ 'height': image_response.get('height'),
+ 'url': cls._extract_file_link(image_response)
+ }
+
+ @classmethod
+ def _extract_base_info(cls, response):
+ if not isinstance(response, dict):
+ return
+ item_id = str_or_none(response.get('id'))
+ if not item_id:
+ return
+ thumbnail_dict = cls._extract_image(cls._get_prx_embed_response(response, 'image'))
+ description = (
+ clean_html(response.get('description'))
+ or response.get('shortDescription'))
+ return {
+ 'id': item_id,
+ 'title': response.get('title') or item_id,
+ 'thumbnails': [thumbnail_dict] if thumbnail_dict else None,
+ 'description': description,
+ 'release_timestamp': unified_timestamp(response.get('releasedAt')),
+ 'timestamp': unified_timestamp(response.get('createdAt')),
+ 'modified_timestamp': unified_timestamp(response.get('updatedAt')),
+ 'duration': int_or_none(response.get('duration')),
+ 'tags': response.get('tags'),
+ 'episode_number': int_or_none(response.get('episodeIdentifier')),
+ 'season_number': int_or_none(response.get('seasonIdentifier'))
+ }
+
+ @classmethod
+ def _extract_series_info(cls, series_response):
+ base_info = cls._extract_base_info(series_response)
+ if not base_info:
+ return
+ account_info = cls._extract_account_info(
+ cls._get_prx_embed_response(series_response, 'account')) or {}
+ return {
+ **base_info,
+ 'channel_id': account_info.get('channel_id'),
+ 'channel_url': account_info.get('channel_url'),
+ 'channel': account_info.get('channel'),
+ 'series': base_info.get('title'),
+ 'series_id': base_info.get('id'),
+ }
+
+ @classmethod
+ def _extract_account_info(cls, account_response):
+ base_info = cls._extract_base_info(account_response)
+ if not base_info:
+ return
+ name = account_response.get('name')
+ return {
+ **base_info,
+ 'title': name,
+ 'channel_id': base_info.get('id'),
+ 'channel_url': 'https://beta.prx.org/accounts/%s' % base_info.get('id'),
+ 'channel': name,
+ }
+
+ @classmethod
+ def _extract_story_info(cls, story_response):
+ base_info = cls._extract_base_info(story_response)
+ if not base_info:
+ return
+ series = cls._extract_series_info(
+ cls._get_prx_embed_response(story_response, 'series')) or {}
+ account = cls._extract_account_info(
+ cls._get_prx_embed_response(story_response, 'account')) or {}
+ return {
+ **base_info,
+ 'series': series.get('series'),
+ 'series_id': series.get('series_id'),
+ 'channel_id': account.get('channel_id'),
+ 'channel_url': account.get('channel_url'),
+ 'channel': account.get('channel')
+ }
+
+ def _entries(self, item_id, endpoint, entry_func, query=None):
+ """
+ Extract entries from paginated list API
+ @param entry_func: Function to generate entry from response item
+ """
+ total = 0
+ for page in itertools.count(1):
+ response = self._call_api(f'{item_id}: page {page}', endpoint, query={
+ **(query or {}),
+ 'page': page,
+ 'per': 100
+ })
+ items = self._get_prx_embed_response(response, 'items')
+ if not response or not items:
+ break
+
+ yield from filter(None, map(entry_func, items))
+
+ total += response['count']
+ if total >= response['total']:
+ break
+
+ def _story_playlist_entry(self, response):
+ story = self._extract_story_info(response)
+ if not story:
+ return
+ story.update({
+ '_type': 'url',
+ 'url': 'https://beta.prx.org/stories/%s' % story['id'],
+ 'ie_key': PRXStoryIE.ie_key()
+ })
+ return story
+
+ def _series_playlist_entry(self, response):
+ series = self._extract_series_info(response)
+ if not series:
+ return
+ series.update({
+ '_type': 'url',
+ 'url': 'https://beta.prx.org/series/%s' % series['id'],
+ 'ie_key': PRXSeriesIE.ie_key()
+ })
+ return series
+
+
+class PRXStoryIE(PRXBaseIE):
+ _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'stories/(?P<id>\d+)'
+
+ _TESTS = [
+ {
+ # Story with season and episode details
+ 'url': 'https://beta.prx.org/stories/399200',
+ 'info_dict': {
+ 'id': '399200',
+ 'title': 'Fly Me To The Moon',
+ 'description': 'md5:43230168390b95d3322048d8a56bf2bb',
+ 'release_timestamp': 1640250000,
+ 'timestamp': 1640208972,
+ 'modified_timestamp': 1641318202,
+ 'duration': 1004,
+ 'tags': 'count:7',
+ 'episode_number': 8,
+ 'season_number': 5,
+ 'series': 'AirSpace',
+ 'series_id': '38057',
+ 'channel_id': '220986',
+ 'channel_url': 'https://beta.prx.org/accounts/220986',
+ 'channel': 'Air and Space Museum',
+ },
+ 'playlist': [{
+ 'info_dict': {
+ 'id': '399200_part1',
+ 'title': 'Fly Me To The Moon',
+ 'description': 'md5:43230168390b95d3322048d8a56bf2bb',
+ 'release_timestamp': 1640250000,
+ 'timestamp': 1640208972,
+ 'modified_timestamp': 1641318202,
+ 'duration': 530,
+ 'tags': 'count:7',
+ 'episode_number': 8,
+ 'season_number': 5,
+ 'series': 'AirSpace',
+ 'series_id': '38057',
+ 'channel_id': '220986',
+ 'channel_url': 'https://beta.prx.org/accounts/220986',
+ 'channel': 'Air and Space Museum',
+ 'ext': 'mp3',
+ 'upload_date': '20211222',
+ 'episode': 'Episode 8',
+ 'release_date': '20211223',
+ 'season': 'Season 5',
+ 'modified_date': '20220104'
+ }
+ }, {
+ 'info_dict': {
+ 'id': '399200_part2',
+ 'title': 'Fly Me To The Moon',
+ 'description': 'md5:43230168390b95d3322048d8a56bf2bb',
+ 'release_timestamp': 1640250000,
+ 'timestamp': 1640208972,
+ 'modified_timestamp': 1641318202,
+ 'duration': 474,
+ 'tags': 'count:7',
+ 'episode_number': 8,
+ 'season_number': 5,
+ 'series': 'AirSpace',
+ 'series_id': '38057',
+ 'channel_id': '220986',
+ 'channel_url': 'https://beta.prx.org/accounts/220986',
+ 'channel': 'Air and Space Museum',
+ 'ext': 'mp3',
+ 'upload_date': '20211222',
+ 'episode': 'Episode 8',
+ 'release_date': '20211223',
+ 'season': 'Season 5',
+ 'modified_date': '20220104'
+ }
+ }
+
+ ]
+ }, {
+ # Story with only split audio
+ 'url': 'https://beta.prx.org/stories/326414',
+ 'info_dict': {
+ 'id': '326414',
+ 'title': 'Massachusetts v EPA',
+ 'description': 'md5:744fffba08f19f4deab69fa8d49d5816',
+ 'timestamp': 1592509124,
+ 'modified_timestamp': 1592510457,
+ 'duration': 3088,
+ 'tags': 'count:0',
+ 'series': 'Outside/In',
+ 'series_id': '36252',
+ 'channel_id': '206',
+ 'channel_url': 'https://beta.prx.org/accounts/206',
+ 'channel': 'New Hampshire Public Radio',
+ },
+ 'playlist_count': 4
+ }, {
+ # Story with single combined audio
+ 'url': 'https://beta.prx.org/stories/400404',
+ 'info_dict': {
+ 'id': '400404',
+ 'title': 'Cafe Chill (Episode 2022-01)',
+ 'thumbnails': 'count:1',
+ 'description': 'md5:9f1b5a3cbd64fb159d08c3baa31f1539',
+ 'timestamp': 1641233952,
+ 'modified_timestamp': 1641234248,
+ 'duration': 3540,
+ 'series': 'Café Chill',
+ 'series_id': '37762',
+ 'channel_id': '5767',
+ 'channel_url': 'https://beta.prx.org/accounts/5767',
+ 'channel': 'C89.5 - KNHC Seattle',
+ 'ext': 'mp3',
+ 'tags': 'count:0',
+ 'thumbnail': r're:https?://cms\.prx\.org/pub/\w+/0/web/story_image/767965/medium/Aurora_Over_Trees\.jpg',
+ 'upload_date': '20220103',
+ 'modified_date': '20220103'
+ }
+ }, {
+ 'url': 'https://listen.prx.org/stories/399200',
+ 'only_matching': True
+ }
+ ]
+
+ def _extract_audio_pieces(self, audio_response):
+ return [{
+ 'format_id': str_or_none(piece_response.get('id')),
+ 'format_note': str_or_none(piece_response.get('label')),
+ 'filesize': int_or_none(piece_response.get('size')),
+ 'duration': int_or_none(piece_response.get('duration')),
+ 'ext': mimetype2ext(piece_response.get('contentType')),
+ 'asr': int_or_none(piece_response.get('frequency'), scale=1000),
+ 'abr': int_or_none(piece_response.get('bitRate')),
+ 'url': self._extract_file_link(piece_response),
+ 'vcodec': 'none'
+ } for piece_response in sorted(
+ self._get_prx_embed_response(audio_response, 'items') or [],
+ key=lambda p: int_or_none(p.get('position')) or 0)]
+
+ def _extract_story(self, story_response):
+ info = self._extract_story_info(story_response)
+ if not info:
+ return
+ audio_pieces = self._extract_audio_pieces(
+ self._get_prx_embed_response(story_response, 'audio'))
+ if len(audio_pieces) == 1:
+ return {
+ 'formats': audio_pieces,
+ **info
+ }
+
+ entries = [{
+ **info,
+ 'id': '%s_part%d' % (info['id'], (idx + 1)),
+ 'formats': [fmt],
+ } for idx, fmt in enumerate(audio_pieces)]
+ return {
+ '_type': 'multi_video',
+ 'entries': entries,
+ **info
+ }
+
+ def _real_extract(self, url):
+ story_id = self._match_id(url)
+ response = self._call_api(story_id, f'stories/{story_id}')
+ return self._extract_story(response)
+
+
+class PRXSeriesIE(PRXBaseIE):
+ _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'series/(?P<id>\d+)'
+ _TESTS = [
+ {
+ 'url': 'https://beta.prx.org/series/36252',
+ 'info_dict': {
+ 'id': '36252',
+ 'title': 'Outside/In',
+ 'thumbnails': 'count:1',
+ 'description': 'md5:a6bedc5f810777bcb09ab30ff9059114',
+ 'timestamp': 1470684964,
+ 'modified_timestamp': 1582308830,
+ 'channel_id': '206',
+ 'channel_url': 'https://beta.prx.org/accounts/206',
+ 'channel': 'New Hampshire Public Radio',
+ 'series': 'Outside/In',
+ 'series_id': '36252'
+ },
+ 'playlist_mincount': 39
+ }, {
+ # Blank series
+ 'url': 'https://beta.prx.org/series/25038',
+ 'info_dict': {
+ 'id': '25038',
+ 'title': '25038',
+ 'timestamp': 1207612800,
+ 'modified_timestamp': 1207612800,
+ 'channel_id': '206',
+ 'channel_url': 'https://beta.prx.org/accounts/206',
+ 'channel': 'New Hampshire Public Radio',
+ 'series': '25038',
+ 'series_id': '25038'
+ },
+ 'playlist_count': 0
+ }
+ ]
+
+ def _extract_series(self, series_response):
+ info = self._extract_series_info(series_response)
+ return {
+ '_type': 'playlist',
+ 'entries': self._entries(info['id'], 'series/%s/stories' % info['id'], self._story_playlist_entry),
+ **info
+ }
+
+ def _real_extract(self, url):
+ series_id = self._match_id(url)
+ response = self._call_api(series_id, f'series/{series_id}')
+ return self._extract_series(response)
+
+
+class PRXAccountIE(PRXBaseIE):
+ _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'accounts/(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://beta.prx.org/accounts/206',
+ 'info_dict': {
+ 'id': '206',
+ 'title': 'New Hampshire Public Radio',
+ 'description': 'md5:277f2395301d0aca563c80c70a18ee0a',
+ 'channel_id': '206',
+ 'channel_url': 'https://beta.prx.org/accounts/206',
+ 'channel': 'New Hampshire Public Radio',
+ 'thumbnails': 'count:1'
+ },
+ 'playlist_mincount': 380
+ }]
+
+ def _extract_account(self, account_response):
+ info = self._extract_account_info(account_response)
+ series = self._entries(
+ info['id'], f'accounts/{info["id"]}/series', self._series_playlist_entry)
+ stories = self._entries(
+ info['id'], f'accounts/{info["id"]}/stories', self._story_playlist_entry)
+ return {
+ '_type': 'playlist',
+ 'entries': itertools.chain(series, stories),
+ **info
+ }
+
+ def _real_extract(self, url):
+ account_id = self._match_id(url)
+ response = self._call_api(account_id, f'accounts/{account_id}')
+ return self._extract_account(response)
+
+
+class PRXStoriesSearchIE(PRXBaseIE, SearchInfoExtractor):
+ IE_DESC = 'PRX Stories Search'
+ IE_NAME = 'prxstories:search'
+ _SEARCH_KEY = 'prxstories'
+
+ def _search_results(self, query):
+ yield from self._entries(
+ f'query {query}', 'stories/search', self._story_playlist_entry, query={'q': query})
+
+
+class PRXSeriesSearchIE(PRXBaseIE, SearchInfoExtractor):
+ IE_DESC = 'PRX Series Search'
+ IE_NAME = 'prxseries:search'
+ _SEARCH_KEY = 'prxseries'
+
+ def _search_results(self, query):
+ yield from self._entries(
+ f'query {query}', 'series/search', self._series_playlist_entry, query={'q': query})
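The _entries pagination in PRXBaseIE above keeps requesting pages until the running
item count reaches the total reported by the CMS API. The same loop in isolation, with
a stub standing in for _call_api:

    import itertools

    def fetch_page(page, per=100):
        # Stub for the CMS API: pretends there are 250 items in total
        total = 250
        start = (page - 1) * per
        items = list(range(start, min(start + per, total)))
        return {'count': len(items), 'total': total, 'items': items}

    def entries():
        total = 0
        for page in itertools.count(1):
            response = fetch_page(page)
            if not response['items']:
                break
            yield from response['items']
            total += response['count']
            if total >= response['total']:
                break

    print(sum(1 for _ in entries()))  # 250 items across 3 pages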
diff --git a/yt_dlp/extractor/radlive.py b/yt_dlp/extractor/radlive.py
index 2de7ab04a..dc9897305 100644
--- a/yt_dlp/extractor/radlive.py
+++ b/yt_dlp/extractor/radlive.py
@@ -1,6 +1,12 @@
import json
-from ..utils import ExtractorError, traverse_obj, try_get, unified_timestamp
+from ..utils import (
+ ExtractorError,
+ format_field,
+ traverse_obj,
+ try_get,
+ unified_timestamp
+)
from .common import InfoExtractor
@@ -74,7 +80,7 @@ class RadLiveIE(InfoExtractor):
'release_timestamp': release_date,
'channel': channel.get('name'),
'channel_id': channel_id,
- 'channel_url': f'https://rad.live/content/channel/{channel_id}' if channel_id else None,
+ 'channel_url': format_field(channel_id, template='https://rad.live/content/channel/%s'),
}
if content_type == 'episode':
diff --git a/yt_dlp/extractor/rai.py b/yt_dlp/extractor/rai.py
index 39e57decd..34f127285 100644
--- a/yt_dlp/extractor/rai.py
+++ b/yt_dlp/extractor/rai.py
@@ -14,16 +14,14 @@ from ..utils import (
find_xpath_attr,
fix_xml_ampersands,
GeoRestrictedError,
- get_element_by_class,
HEADRequest,
int_or_none,
join_nonempty,
parse_duration,
- parse_list,
remove_start,
strip_or_none,
+ traverse_obj,
try_get,
- unescapeHTML,
unified_strdate,
unified_timestamp,
update_url_query,
@@ -37,7 +35,7 @@ class RaiBaseIE(InfoExtractor):
_GEO_COUNTRIES = ['IT']
_GEO_BYPASS = False
- def _extract_relinker_info(self, relinker_url, video_id):
+ def _extract_relinker_info(self, relinker_url, video_id, audio_only=False):
if not re.match(r'https?://', relinker_url):
return {'formats': [{'url': relinker_url}]}
@@ -80,7 +78,15 @@ class RaiBaseIE(InfoExtractor):
if (ext == 'm3u8' and platform != 'mon') or (ext == 'f4m' and platform != 'flash'):
continue
- if ext == 'm3u8' or 'format=m3u8' in media_url or platform == 'mon':
+ if ext == 'mp3':
+ formats.append({
+ 'url': media_url,
+ 'vcodec': 'none',
+ 'acodec': 'mp3',
+ 'format_id': 'http-mp3',
+ })
+ break
+ elif ext == 'm3u8' or 'format=m3u8' in media_url or platform == 'mon':
formats.extend(self._extract_m3u8_formats(
media_url, video_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False))
@@ -101,7 +107,8 @@ class RaiBaseIE(InfoExtractor):
if not formats and geoprotection is True:
self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True)
- formats.extend(self._create_http_urls(relinker_url, formats))
+ if not audio_only:
+ formats.extend(self._create_http_urls(relinker_url, formats))
return dict((k, v) for k, v in {
'is_live': is_live,
@@ -359,26 +366,44 @@ class RaiPlayLiveIE(RaiPlayIE):
class RaiPlayPlaylistIE(InfoExtractor):
- _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/programmi/(?P<id>[^/?#&]+))'
+ _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/programmi/(?P<id>[^/?#&]+))(?:/(?P<extra_id>[^?#&]+))?'
_TESTS = [{
- 'url': 'http://www.raiplay.it/programmi/nondirloalmiocapo/',
+ 'url': 'https://www.raiplay.it/programmi/nondirloalmiocapo/',
'info_dict': {
'id': 'nondirloalmiocapo',
'title': 'Non dirlo al mio capo',
'description': 'md5:98ab6b98f7f44c2843fd7d6f045f153b',
},
'playlist_mincount': 12,
+ }, {
+ 'url': 'https://www.raiplay.it/programmi/nondirloalmiocapo/episodi/stagione-2/',
+ 'info_dict': {
+ 'id': 'nondirloalmiocapo',
+ 'title': 'Non dirlo al mio capo - Stagione 2',
+ 'description': 'md5:98ab6b98f7f44c2843fd7d6f045f153b',
+ },
+ 'playlist_mincount': 12,
}]
def _real_extract(self, url):
- base, playlist_id = self._match_valid_url(url).groups()
+ base, playlist_id, extra_id = self._match_valid_url(url).groups()
program = self._download_json(
base + '.json', playlist_id, 'Downloading program JSON')
+ if extra_id:
+ extra_id = extra_id.upper().rstrip('/')
+
+ playlist_title = program.get('name')
entries = []
for b in (program.get('blocks') or []):
for s in (b.get('sets') or []):
+ if extra_id:
+ if extra_id != join_nonempty(
+ b.get('name'), s.get('name'), delim='/').replace(' ', '-').upper():
+ continue
+ playlist_title = join_nonempty(playlist_title, s.get('name'), delim=' - ')
+
s_id = s.get('id')
if not s_id:
continue
@@ -397,10 +422,128 @@ class RaiPlayPlaylistIE(InfoExtractor):
video_id=RaiPlayIE._match_id(video_url)))
return self.playlist_result(
- entries, playlist_id, program.get('name'),
+ entries, playlist_id, playlist_title,
try_get(program, lambda x: x['program_info']['description']))
+class RaiPlaySoundIE(RaiBaseIE):
+ _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplaysound\.it/.+?-(?P<id>%s))\.(?:html|json)' % RaiBaseIE._UUID_RE
+ _TESTS = [{
+ 'url': 'https://www.raiplaysound.it/audio/2021/12/IL-RUGGITO-DEL-CONIGLIO-1ebae2a7-7cdb-42bb-842e-fe0d193e9707.html',
+ 'md5': '8970abf8caf8aef4696e7b1f2adfc696',
+ 'info_dict': {
+ 'id': '1ebae2a7-7cdb-42bb-842e-fe0d193e9707',
+ 'ext': 'mp3',
+ 'title': 'Il Ruggito del Coniglio del 10/12/2021',
+ 'description': 'md5:2a17d2107e59a4a8faa0e18334139ee2',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'uploader': 'rai radio 2',
+ 'duration': 5685,
+ 'series': 'Il Ruggito del Coniglio',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }]
+
+ def _real_extract(self, url):
+ base, audio_id = self._match_valid_url(url).group('base', 'id')
+ media = self._download_json(f'{base}.json', audio_id, 'Downloading audio JSON')
+ uid = try_get(media, lambda x: remove_start(remove_start(x['uniquename'], 'ContentItem-'), 'Page-'))
+
+ info = {}
+ formats = []
+ relinkers = set(traverse_obj(media, (('downloadable_audio', 'audio', ('live', 'cards', 0, 'audio')), 'url')) or [])
+ for r in relinkers:
+ info = self._extract_relinker_info(r, audio_id, True)
+ formats.extend(info.get('formats'))
+
+ date_published = try_get(media, (lambda x: f'{x["create_date"]} {x.get("create_time") or ""}',
+ lambda x: x['live']['create_date']))
+
+ podcast_info = traverse_obj(media, 'podcast_info', ('live', 'cards', 0)) or {}
+ thumbnails = [{
+ 'url': urljoin(url, thumb_url),
+ } for thumb_url in (podcast_info.get('images') or {}).values() if thumb_url]
+
+ return {
+ **info,
+ 'id': uid or audio_id,
+ 'display_id': audio_id,
+ 'title': traverse_obj(media, 'title', 'episode_title'),
+ 'alt_title': traverse_obj(media, ('track_info', 'media_name')),
+ 'description': media.get('description'),
+ 'uploader': traverse_obj(media, ('track_info', 'channel'), expected_type=strip_or_none),
+ 'creator': traverse_obj(media, ('track_info', 'editor'), expected_type=strip_or_none),
+ 'timestamp': unified_timestamp(date_published),
+ 'thumbnails': thumbnails,
+ 'series': podcast_info.get('title'),
+ 'season_number': int_or_none(media.get('season')),
+ 'episode': media.get('episode_title'),
+ 'episode_number': int_or_none(media.get('episode')),
+ 'formats': formats,
+ }
+
+
+class RaiPlaySoundLiveIE(RaiPlaySoundIE):
+ _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplaysound\.it/(?P<id>[^/?#&]+)$)'
+ _TESTS = [{
+ 'url': 'https://www.raiplaysound.it/radio2',
+ 'info_dict': {
+ 'id': 'b00a50e6-f404-4af6-8f8c-ff3b9af73a44',
+ 'display_id': 'radio2',
+ 'ext': 'mp4',
+ 'title': 'Rai Radio 2',
+ 'uploader': 'rai radio 2',
+ 'creator': 'raiplaysound',
+ 'is_live': True,
+ },
+ 'params': {
+ 'skip_download': 'live',
+ },
+ }]
+
+
+class RaiPlaySoundPlaylistIE(InfoExtractor):
+ _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplaysound\.it/(?:programmi|playlist|audiolibri)/(?P<id>[^/?#&]+))(?:/(?P<extra_id>[^?#&]+))?'
+ _TESTS = [{
+ 'url': 'https://www.raiplaysound.it/programmi/ilruggitodelconiglio',
+ 'info_dict': {
+ 'id': 'ilruggitodelconiglio',
+ 'title': 'Il Ruggito del Coniglio',
+ 'description': 'md5:1bbaf631245a7ab1ec4d9fbb3c7aa8f3',
+ },
+ 'playlist_mincount': 65,
+ }, {
+ 'url': 'https://www.raiplaysound.it/programmi/ilruggitodelconiglio/puntate/prima-stagione-1995',
+ 'info_dict': {
+ 'id': 'ilruggitodelconiglio_puntate_prima-stagione-1995',
+ 'title': 'Prima Stagione 1995',
+ },
+ 'playlist_count': 1,
+ }]
+
+ def _real_extract(self, url):
+ base, playlist_id, extra_id = self._match_valid_url(url).group('base', 'id', 'extra_id')
+ url = f'{base}.json'
+ program = self._download_json(url, playlist_id, 'Downloading program JSON')
+
+ if extra_id:
+ extra_id = extra_id.rstrip('/')
+ playlist_id += '_' + extra_id.replace('/', '_')
+ path = next(c['path_id'] for c in program.get('filters') or [] if extra_id in c.get('weblink'))
+ program = self._download_json(
+ urljoin('https://www.raiplaysound.it', path), playlist_id, 'Downloading program secondary JSON')
+
+ entries = [
+ self.url_result(urljoin(base, c['path_id']), ie=RaiPlaySoundIE.ie_key())
+ for c in traverse_obj(program, 'cards', ('block', 'cards')) or []
+ if c.get('path_id')]
+
+ return self.playlist_result(entries, playlist_id, program.get('title'),
+ traverse_obj(program, ('podcast_info', 'description')))
+
+
class RaiIE(RaiBaseIE):
_VALID_URL = r'https?://[^/]+\.(?:rai\.(?:it|tv)|rainews\.it)/.+?-(?P<id>%s)(?:-.+?)?\.html' % RaiBaseIE._UUID_RE
_TESTS = [{
@@ -593,84 +736,3 @@ class RaiIE(RaiBaseIE):
info.update(relinker_info)
return info
-
-
-class RaiPlayRadioBaseIE(InfoExtractor):
- _BASE = 'https://www.raiplayradio.it'
-
- def get_playlist_iter(self, url, uid):
- webpage = self._download_webpage(url, uid)
- for attrs in parse_list(webpage):
- title = attrs['data-title'].strip()
- audio_url = urljoin(url, attrs['data-mediapolis'])
- entry = {
- 'url': audio_url,
- 'id': attrs['data-uniquename'].lstrip('ContentItem-'),
- 'title': title,
- 'ext': 'mp3',
- 'language': 'it',
- }
- if 'data-image' in attrs:
- entry['thumbnail'] = urljoin(url, attrs['data-image'])
- yield entry
-
-
-class RaiPlayRadioIE(RaiPlayRadioBaseIE):
- _VALID_URL = r'%s/audio/.+?-(?P<id>%s)\.html' % (
- RaiPlayRadioBaseIE._BASE, RaiBaseIE._UUID_RE)
- _TEST = {
- 'url': 'https://www.raiplayradio.it/audio/2019/07/RADIO3---LEZIONI-DI-MUSICA-36b099ff-4123-4443-9bf9-38e43ef5e025.html',
- 'info_dict': {
- 'id': '36b099ff-4123-4443-9bf9-38e43ef5e025',
- 'ext': 'mp3',
- 'title': 'Dal "Chiaro di luna" al "Clair de lune", prima parte con Giovanni Bietti',
- 'thumbnail': r're:^https?://.*\.jpg$',
- 'language': 'it',
- }
- }
-
- def _real_extract(self, url):
- audio_id = self._match_id(url)
- list_url = url.replace('.html', '-list.html')
- return next(entry for entry in self.get_playlist_iter(list_url, audio_id) if entry['id'] == audio_id)
-
-
-class RaiPlayRadioPlaylistIE(RaiPlayRadioBaseIE):
- _VALID_URL = r'%s/playlist/.+?-(?P<id>%s)\.html' % (
- RaiPlayRadioBaseIE._BASE, RaiBaseIE._UUID_RE)
- _TEST = {
- 'url': 'https://www.raiplayradio.it/playlist/2017/12/Alice-nel-paese-delle-meraviglie-72371d3c-d998-49f3-8860-d168cfdf4966.html',
- 'info_dict': {
- 'id': '72371d3c-d998-49f3-8860-d168cfdf4966',
- 'title': "Alice nel paese delle meraviglie",
- 'description': "di Lewis Carrol letto da Aldo Busi",
- },
- 'playlist_count': 11,
- }
-
- def _real_extract(self, url):
- playlist_id = self._match_id(url)
- playlist_webpage = self._download_webpage(url, playlist_id)
- playlist_title = unescapeHTML(self._html_search_regex(
- r'data-playlist-title="(.+?)"', playlist_webpage, 'title'))
- playlist_creator = self._html_search_meta(
- 'nomeProgramma', playlist_webpage)
- playlist_description = get_element_by_class(
- 'textDescriptionProgramma', playlist_webpage)
-
- player_href = self._html_search_regex(
- r'data-player-href="(.+?)"', playlist_webpage, 'href')
- list_url = urljoin(url, player_href)
-
- entries = list(self.get_playlist_iter(list_url, playlist_id))
- for index, entry in enumerate(entries, start=1):
- entry.update({
- 'track': entry['title'],
- 'track_number': index,
- 'artist': playlist_creator,
- 'album': playlist_title
- })
-
- return self.playlist_result(
- entries, playlist_id, playlist_title, playlist_description,
- creator=playlist_creator)
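The season filter added to RaiPlayPlaylistIE above builds a block/set slug with join_nonempty and compares it to the path fragment from the URL. A small illustration of that helper, which is part of yt_dlp.utils (the sample values here are made up):

from yt_dlp.utils import join_nonempty

# join_nonempty drops falsy values and joins the rest with the delimiter
join_nonempty('Episodi', 'Stagione 2', delim='/')  # -> 'Episodi/Stagione 2'
join_nonempty(None, 'Stagione 2', delim='/')       # -> 'Stagione 2'

# The extractor then normalizes the slug for comparison against the URL:
# 'Episodi/Stagione 2'.replace(' ', '-').upper() -> 'EPISODI/STAGIONE-2'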
diff --git a/yt_dlp/extractor/redbulltv.py b/yt_dlp/extractor/redbulltv.py
index e7fdcce3e..756a3666b 100644
--- a/yt_dlp/extractor/redbulltv.py
+++ b/yt_dlp/extractor/redbulltv.py
@@ -81,12 +81,11 @@ class RedBullTVIE(InfoExtractor):
title = video['title'].strip()
- formats = self._extract_m3u8_formats(
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(
'https://dms.redbull.tv/v3/%s/%s/playlist.m3u8' % (video_id, token),
video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls')
self._sort_formats(formats)
- subtitles = {}
for resource in video.get('resources', []):
if resource.startswith('closed_caption_'):
splitted_resource = resource.split('_')
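This Red Bull TV change is one instance of a pattern repeated across the commit: _extract_m3u8_formats_and_subtitles walks the HLS manifest once and returns both formats and subtitle tracks, removing the need to seed an empty subtitles dict. The resulting idiom inside an InfoExtractor method, as a sketch (manifest URL illustrative):

formats, subtitles = self._extract_m3u8_formats_and_subtitles(
    'https://example.com/playlist.m3u8',  # illustrative manifest URL
    video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls')
self._sort_formats(formats)
# tracks discovered elsewhere are then merged into the same dict, e.g.
# subtitles.setdefault(lang, []).append({'url': caption_url})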
diff --git a/yt_dlp/extractor/rtl2.py b/yt_dlp/extractor/rtl2.py
index 4e3aa0398..e29171474 100644
--- a/yt_dlp/extractor/rtl2.py
+++ b/yt_dlp/extractor/rtl2.py
@@ -4,16 +4,13 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..aes import aes_cbc_decrypt
+from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7
from ..compat import (
compat_b64decode,
- compat_ord,
compat_str,
)
from ..utils import (
- bytes_to_intlist,
ExtractorError,
- intlist_to_bytes,
int_or_none,
strip_or_none,
)
@@ -142,17 +139,12 @@ class RTL2YouIE(RTL2YouBaseIE):
self._BACKWERK_BASE_URL + 'stream/video/' + video_id, video_id)
data, iv = compat_b64decode(stream_data['streamUrl']).decode().split(':')
- stream_url = intlist_to_bytes(aes_cbc_decrypt(
- bytes_to_intlist(compat_b64decode(data)),
- bytes_to_intlist(self._AES_KEY),
- bytes_to_intlist(compat_b64decode(iv))
- ))
+ stream_url = unpad_pkcs7(aes_cbc_decrypt_bytes(
+ compat_b64decode(data), self._AES_KEY, compat_b64decode(iv)))
if b'rtl2_you_video_not_found' in stream_url:
raise ExtractorError('video not found', expected=True)
- formats = self._extract_m3u8_formats(
- stream_url[:-compat_ord(stream_url[-1])].decode(),
- video_id, 'mp4', 'm3u8_native')
+ formats = self._extract_m3u8_formats(stream_url.decode(), video_id, 'mp4', 'm3u8_native')
self._sort_formats(formats)
video_data = self._download_json(
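Here and in the shemaroome hunk below, manual padding removal (slicing off stream_url[-1] bytes) gives way to the unpad_pkcs7 helper. A minimal sketch of what PKCS#7 unpadding does, assuming well-formed input:

def unpad_pkcs7(data):
    # PKCS#7 stores the pad length in every padding byte, so the value
    # of the final byte says how many bytes to strip from the end
    return data[:-data[-1]]

assert unpad_pkcs7(b'url\x05\x05\x05\x05\x05') == b'url'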
diff --git a/yt_dlp/extractor/rtnews.py b/yt_dlp/extractor/rtnews.py
new file mode 100644
index 000000000..68b6044b6
--- /dev/null
+++ b/yt_dlp/extractor/rtnews.py
@@ -0,0 +1,199 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import js_to_json
+
+
+class RTNewsIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?rt\.com/[^/]+/(?:[^/]+/)?(?P<id>\d+)'
+
+ _TESTS = [{
+ 'url': 'https://www.rt.com/sport/546301-djokovic-arrives-belgrade-crowds/',
+ 'playlist_mincount': 2,
+ 'info_dict': {
+ 'id': '546301',
+ 'title': 'Crowds gather to greet deported Djokovic as he returns to Serbia (VIDEO)',
+ 'description': 'md5:1d5bfe1a988d81fd74227cfdf93d314d',
+ 'thumbnail': 'https://cdni.rt.com/files/2022.01/article/61e587a085f540102c3386c1.png'
+ },
+ }, {
+ 'url': 'https://www.rt.com/shows/in-question/535980-plot-to-assassinate-julian-assange/',
+ 'playlist_mincount': 1,
+ 'info_dict': {
+ 'id': '535980',
+ 'title': 'The plot to assassinate Julian Assange',
+ 'description': 'md5:55279ce5e4441dc1d16e2e4a730152cd',
+ 'thumbnail': 'https://cdni.rt.com/files/2021.09/article/615226f42030274e8879b53d.png'
+ },
+ 'playlist': [{
+ 'info_dict': {
+ 'id': '6152271d85f5400464496162',
+ 'ext': 'mp4',
+ 'title': '6152271d85f5400464496162',
+ },
+ }]
+ }]
+
+ def _entries(self, webpage):
+ video_urls = set(re.findall(r'https://cdnv\.rt\.com/.*[a-f0-9]+\.mp4', webpage))
+ for v_url in video_urls:
+ v_id = re.search(r'([a-f0-9]+)\.mp4', v_url).group(1)
+ if v_id:
+ yield {
+ 'id': v_id,
+ 'title': v_id,
+ 'url': v_url,
+ }
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ webpage = self._download_webpage(url, id)
+
+ return {
+ '_type': 'playlist',
+ 'id': id,
+ 'entries': self._entries(webpage),
+ 'title': self._og_search_title(webpage),
+ 'description': self._og_search_description(webpage),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ }
+
+
+class RTDocumentryIE(InfoExtractor):
+ _VALID_URL = r'https?://rtd\.rt\.com/(?:(?:series|shows)/[^/]+|films)/(?P<id>[^/?$&#]+)'
+
+ _TESTS = [{
+ 'url': 'https://rtd.rt.com/films/escobars-hitman/',
+ 'info_dict': {
+ 'id': 'escobars-hitman',
+ 'ext': 'mp4',
+ 'title': "Escobar's Hitman. Former drug-gang killer, now loved and loathed in Colombia",
+ 'description': 'md5:647c76984b7cb9a8b52a567e87448d88',
+ 'thumbnail': 'https://cdni.rt.com/rtd-files/films/escobars-hitman/escobars-hitman_11.jpg',
+ 'average_rating': 8.53,
+ 'duration': 3134.0
+ },
+ 'params': {'skip_download': True}
+ }, {
+ 'url': 'https://rtd.rt.com/shows/the-kalashnikova-show-military-secrets-anna-knishenko/iskander-tactical-system-natos-headache/',
+ 'info_dict': {
+ 'id': 'iskander-tactical-system-natos-headache',
+ 'ext': 'mp4',
+ 'title': "Iskander tactical system. NATO's headache | The Kalashnikova Show. Episode 10",
+ 'description': 'md5:da7c24a0aa67bc2bb88c86658508ca87',
+ 'thumbnail': 'md5:89de8ce38c710b7c501ff02d47e2aa89',
+ 'average_rating': 9.27,
+ 'duration': 274.0,
+ 'timestamp': 1605726000,
+ 'view_count': int,
+ 'upload_date': '20201118'
+ },
+ 'params': {'skip_download': True}
+ }, {
+ 'url': 'https://rtd.rt.com/series/i-am-hacked-trailer/introduction-to-safe-digital-life-ep2/',
+ 'info_dict': {
+ 'id': 'introduction-to-safe-digital-life-ep2',
+ 'ext': 'mp4',
+ 'title': 'How to Keep your Money away from Hackers | I am Hacked. Episode 2',
+ 'description': 'md5:c46fa9a5af86c0008c45a3940a8cce87',
+ 'thumbnail': 'md5:a5e81b9bf5aed8f5e23d9c053601b825',
+ 'average_rating': 10.0,
+ 'duration': 1524.0,
+ 'timestamp': 1636977600,
+ 'view_count': int,
+ 'upload_date': '20211115'
+ },
+ 'params': {'skip_download': True}
+ }]
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ webpage = self._download_webpage(url, id)
+ ld_json = self._search_json_ld(webpage, None, fatal=False)
+ if not ld_json:
+ self.raise_no_formats('No video/audio found at the provided url.', expected=True)
+ media_json = self._parse_json(
+ self._search_regex(r'(?s)\'Med\'\s*:\s*\[\s*({.+})\s*\]\s*};', webpage, 'media info'),
+ id, transform_source=js_to_json)
+ if 'title' not in ld_json and 'title' in media_json:
+ ld_json['title'] = media_json['title']
+ formats = [{'url': src['file']} for src in media_json.get('sources') or [] if src.get('file')]
+
+ return {
+ 'id': id,
+ 'thumbnail': media_json.get('image'),
+ 'formats': formats,
+ **ld_json
+ }
+
+
+class RTDocumentryPlaylistIE(InfoExtractor):
+ _VALID_URL = r'https?://rtd\.rt\.com/(?:series|shows)/(?P<id>[^/]+)/$'
+
+ _TESTS = [{
+ 'url': 'https://rtd.rt.com/series/i-am-hacked-trailer/',
+ 'playlist_mincount': 6,
+ 'info_dict': {
+ 'id': 'i-am-hacked-trailer',
+ },
+ }, {
+ 'url': 'https://rtd.rt.com/shows/the-kalashnikova-show-military-secrets-anna-knishenko/',
+ 'playlist_mincount': 34,
+ 'info_dict': {
+ 'id': 'the-kalashnikova-show-military-secrets-anna-knishenko',
+ },
+ }]
+
+ def _entries(self, webpage, id):
+ video_urls = set(re.findall(r'list-2__link\s*"\s*href="([^"]+)"', webpage))
+ for v_url in video_urls:
+ if id not in v_url:
+ continue
+ yield self.url_result(
+ 'https://rtd.rt.com%s' % v_url,
+ ie=RTDocumentryIE.ie_key())
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ webpage = self._download_webpage(url, id)
+
+ return {
+ '_type': 'playlist',
+ 'id': id,
+ 'entries': self._entries(webpage, id),
+ }
+
+
+class RuptlyIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?ruptly\.tv/[a-z]{2}/videos/(?P<id>\d+-\d+)'
+
+ _TESTS = [{
+ 'url': 'https://www.ruptly.tv/en/videos/20220112-020-Japan-Double-trouble-Tokyo-zoo-presents-adorable-panda-twins',
+ 'info_dict': {
+ 'id': '20220112-020',
+ 'ext': 'mp4',
+ 'title': 'Japan: Double trouble! Tokyo zoo presents adorable panda twins | Video Ruptly',
+ 'description': 'md5:85a8da5fdb31486f0562daf4360ce75a',
+ 'thumbnail': 'https://storage.ruptly.tv/thumbnails/20220112-020/i6JQKnTNpYuqaXsR/i6JQKnTNpYuqaXsR.jpg'
+ },
+ 'params': {'skip_download': True}
+ }]
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ webpage = self._download_webpage(url, id)
+ m3u8_url = self._search_regex(r'preview_url"\s?:\s?"(https?://storage\.ruptly\.tv/video_projects/.+\.m3u8)"', webpage, 'm3u8 url', fatal=False)
+ if not m3u8_url:
+ self.raise_no_formats('No video/audio found at the provided url.', expected=True)
+ formats, subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, id, ext='mp4')
+ return {
+ 'id': id,
+ 'formats': formats,
+ 'subtitles': subs,
+ 'title': self._og_search_title(webpage),
+ 'description': self._og_search_description(webpage),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ }
diff --git a/yt_dlp/extractor/rule34video.py b/yt_dlp/extractor/rule34video.py
new file mode 100644
index 000000000..522d4ccd5
--- /dev/null
+++ b/yt_dlp/extractor/rule34video.py
@@ -0,0 +1,65 @@
+# coding: utf-8
+from __future__ import unicode_literals
+import re
+
+from ..utils import parse_duration
+from .common import InfoExtractor
+
+
+class Rule34VideoIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?rule34video\.com/videos/(?P<id>\d+)'
+ _TESTS = [
+ {
+ 'url': 'https://rule34video.com/videos/3065157/shot-it-mmd-hmv/',
+ 'md5': 'ffccac2c23799dabbd192621ae4d04f3',
+ 'info_dict': {
+ 'id': '3065157',
+ 'ext': 'mp4',
+ 'title': 'Shot It-(mmd hmv)',
+ 'thumbnail': 'https://rule34video.com/contents/videos_screenshots/3065000/3065157/preview.jpg',
+ 'duration': 347.0,
+ 'age_limit': 18
+ }
+ },
+ {
+ 'url': 'https://rule34video.com/videos/3065296/lara-in-trouble-ep-7-wildeerstudio/',
+ 'md5': '6bb5169f9f6b38cd70882bf2e64f6b86',
+ 'info_dict': {
+ 'id': '3065296',
+ 'ext': 'mp4',
+ 'title': 'Lara in Trouble Ep. 7 [WildeerStudio]',
+ 'thumbnail': 'https://rule34video.com/contents/videos_screenshots/3065000/3065296/preview.jpg',
+ 'duration': 938.0,
+ 'age_limit': 18
+ }
+ },
+ ]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ formats = []
+
+ for mobj in re.finditer(r'<a[^>]+href="(?P<video_url>[^"]+download=true[^"]+)".*>(?P<ext>[^\s]+) (?P<quality>[^<]+)p</a>', webpage):
+ url, ext, quality = mobj.groups()
+ formats.append({
+ 'url': url,
+ 'ext': ext.lower(),
+ 'quality': quality,
+ })
+
+ title = self._html_search_regex(r'<title>([^<]+)</title>', webpage, 'title')
+ thumbnail = self._html_search_regex(r'preview_url:\s+\'([^\']+)\'', webpage, 'thumbnail', default=None)
+ duration = self._html_search_regex(r'"icon-clock"></i>\s+<span>((?:\d+:?)+)', webpage, 'duration', default=None)
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'duration': parse_duration(duration),
+ 'age_limit': 18
+ }
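The duration regex above captures a clock-style string such as '15:38', which parse_duration from yt_dlp.utils turns into seconds (the values below mirror the tests):

from yt_dlp.utils import parse_duration

parse_duration('5:47')   # -> 347.0
parse_duration('15:38')  # -> 938.0
parse_duration(None)     # -> None, so the field is simply omitted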
diff --git a/yt_dlp/extractor/shemaroome.py b/yt_dlp/extractor/shemaroome.py
index 00a5b00cd..45c12915a 100644
--- a/yt_dlp/extractor/shemaroome.py
+++ b/yt_dlp/extractor/shemaroome.py
@@ -2,10 +2,9 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..aes import aes_cbc_decrypt
+from ..aes import aes_cbc_decrypt, unpad_pkcs7
from ..compat import (
compat_b64decode,
- compat_ord,
)
from ..utils import (
bytes_to_intlist,
@@ -76,8 +75,7 @@ class ShemarooMeIE(InfoExtractor):
url_data = bytes_to_intlist(compat_b64decode(data_json['new_play_url']))
key = bytes_to_intlist(compat_b64decode(data_json['key']))
iv = [0] * 16
- m3u8_url = intlist_to_bytes(aes_cbc_decrypt(url_data, key, iv))
- m3u8_url = m3u8_url[:-compat_ord((m3u8_url[-1]))].decode('ascii')
+ m3u8_url = unpad_pkcs7(intlist_to_bytes(aes_cbc_decrypt(url_data, key, iv))).decode('ascii')
formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, fatal=False, headers={'stream_key': data_json['stream_key']})
self._sort_formats(formats)
diff --git a/yt_dlp/extractor/skyit.py b/yt_dlp/extractor/skyit.py
index 496bb42a2..ddb43c075 100644
--- a/yt_dlp/extractor/skyit.py
+++ b/yt_dlp/extractor/skyit.py
@@ -3,7 +3,6 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import (
- compat_str,
compat_parse_qs,
compat_urllib_parse_urlparse,
)
@@ -125,9 +124,7 @@ class SkyItVideoLiveIE(SkyItPlayerIE):
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
- asset_id = compat_str(self._parse_json(self._search_regex(
- r'<script[^>]+id="__NEXT_DATA__"[^>]*>({.+?})</script>',
- webpage, 'next data'), display_id)['props']['initialState']['livePage']['content']['asset_id'])
+ asset_id = str(self._search_nextjs_data(webpage, display_id)['props']['initialState']['livePage']['content']['asset_id'])
livestream = self._download_json(
'https://apid.sky.it/vdp/v1/getLivestream',
asset_id, query={'id': asset_id})
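sky.it is one of several extractors in this commit (see also stv, telemundo, ted and tiktok below) that replace a hand-rolled __NEXT_DATA__ regex with the shared _search_nextjs_data helper. Roughly, the helper is equivalent to the following simplified sketch, not the exact implementation:

def _search_nextjs_data(self, webpage, video_id, **kwargs):
    # Next.js pages serialize their server-side state as JSON inside a
    # <script id="__NEXT_DATA__"> tag; find it and parse it
    return self._parse_json(
        self._search_regex(
            r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
            webpage, 'next.js data', **kwargs),
        video_id)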
diff --git a/yt_dlp/extractor/sportdeutschland.py b/yt_dlp/extractor/sportdeutschland.py
index 94bcaba44..15b488ab7 100644
--- a/yt_dlp/extractor/sportdeutschland.py
+++ b/yt_dlp/extractor/sportdeutschland.py
@@ -59,12 +59,8 @@ class SportDeutschlandIE(InfoExtractor):
videos = asset.get('videos') or []
if len(videos) > 1:
playlist_id = parse_qs(url).get('playlistId', [None])[0]
- if playlist_id:
- if self.get_param('noplaylist'):
- videos = [videos[int(playlist_id)]]
- self.to_screen('Downloading just a single video because of --no-playlist')
- else:
- self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % asset_id)
+ if not self._yes_playlist(playlist_id, asset_id):
+ videos = [videos[int(playlist_id)]]
def entries():
for i, video in enumerate(videos, 1):
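The --no-playlist branching that extractors used to spell out by hand is now centralised in _yes_playlist, which also prints the usual hints. Its behaviour is approximately this (a simplified sketch; the real helper additionally accepts labels and smuggled data):

def _yes_playlist(self, playlist_id, video_id):
    # Returns True when the whole playlist should be downloaded
    if not playlist_id or not video_id:
        return not video_id
    if self.get_param('noplaylist'):
        self.to_screen(f'Downloading just video {video_id} because of --no-playlist')
        return False
    self.to_screen(f'Downloading playlist {playlist_id} - add --no-playlist to just download video {video_id}')
    return True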
diff --git a/yt_dlp/extractor/storyfire.py b/yt_dlp/extractor/storyfire.py
index 9c698626f..e18a59a49 100644
--- a/yt_dlp/extractor/storyfire.py
+++ b/yt_dlp/extractor/storyfire.py
@@ -5,7 +5,7 @@ import functools
from .common import InfoExtractor
from ..utils import (
- # HEADRequest,
+ format_field,
int_or_none,
OnDemandPagedList,
smuggle_url,
@@ -26,18 +26,6 @@ class StoryFireBaseIE(InfoExtractor):
r'https?://player\.vimeo\.com/external/(\d+)',
video['vimeoVideoURL'], 'vimeo id')
- # video_url = self._request_webpage(
- # HEADRequest(video['vimeoVideoURL']), video_id).geturl()
- # formats = []
- # for v_url, suffix in [(video_url, '_sep'), (video_url.replace('/sep/video/', '/video/'), '')]:
- # formats.extend(self._extract_m3u8_formats(
- # v_url, video_id, 'mp4', 'm3u8_native',
- # m3u8_id='hls' + suffix, fatal=False))
- # formats.extend(self._extract_mpd_formats(
- # v_url.replace('.m3u8', '.mpd'), video_id,
- # mpd_id='dash' + suffix, fatal=False))
- # self._sort_formats(formats)
-
uploader_id = video.get('hostID')
return {
@@ -51,7 +39,6 @@ class StoryFireBaseIE(InfoExtractor):
'Referer': 'https://storyfire.com/',
}
}),
- # 'formats': formats,
'thumbnail': video.get('storyImage'),
'view_count': int_or_none(video.get('views')),
'like_count': int_or_none(video.get('likesCount')),
@@ -60,7 +47,7 @@ class StoryFireBaseIE(InfoExtractor):
'timestamp': int_or_none(video.get('publishDate')),
'uploader': video.get('username'),
'uploader_id': uploader_id,
- 'uploader_url': 'https://storyfire.com/user/%s/video' % uploader_id if uploader_id else None,
+ 'uploader_url': format_field(uploader_id, template='https://storyfire.com/user/%s/video'),
'episode_number': int_or_none(video.get('episodeNumber') or video.get('episode_number')),
}
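format_field from yt_dlp.utils collapses the common "fill a template only if the value exists" conditional. Illustrative behaviour for the call above (sample value made up):

from yt_dlp.utils import format_field

format_field('u123', template='https://storyfire.com/user/%s/video')
# -> 'https://storyfire.com/user/u123/video'

# For None (or ''), the template is left unfilled and a default is
# returned instead, so the metadata field stays effectively empty
format_field(None, template='https://storyfire.com/user/%s/video')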
diff --git a/yt_dlp/extractor/streamcz.py b/yt_dlp/extractor/streamcz.py
index 58e0b4c80..0191c77de 100644
--- a/yt_dlp/extractor/streamcz.py
+++ b/yt_dlp/extractor/streamcz.py
@@ -1,105 +1,108 @@
# coding: utf-8
-from __future__ import unicode_literals
-
-import hashlib
-import time
+import json
from .common import InfoExtractor
from ..utils import (
+ float_or_none,
int_or_none,
- sanitized_Request,
+ parse_codecs,
+ traverse_obj,
+ urljoin,
)
-def _get_api_key(api_path):
- if api_path.endswith('?'):
- api_path = api_path[:-1]
-
- api_key = 'fb5f58a820353bd7095de526253c14fd'
- a = '{0:}{1:}{2:}'.format(api_key, api_path, int(round(time.time() / 24 / 3600)))
- return hashlib.md5(a.encode('ascii')).hexdigest()
-
-
class StreamCZIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?stream\.cz/.+/(?P<id>[0-9]+)'
- _API_URL = 'http://www.stream.cz/API'
-
+ _VALID_URL = r'https?://(?:www\.)?(?:stream|televizeseznam)\.cz/[^?#]+/(?P<display_id>[^?#]+)-(?P<id>[0-9]+)'
_TESTS = [{
- 'url': 'http://www.stream.cz/peklonataliri/765767-ecka-pro-deti',
- 'md5': '934bb6a6d220d99c010783c9719960d5',
+ 'url': 'https://www.televizeseznam.cz/video/lajna/buh-57953890',
+ 'md5': '40c41ade1464a390a0b447e333df4239',
'info_dict': {
- 'id': '765767',
+ 'id': '57953890',
'ext': 'mp4',
- 'title': 'Peklo na talíři: Éčka pro děti',
- 'description': 'Taška s grónskou pomazánkou a další pekelnosti ZDE',
- 'thumbnail': 're:^http://im.stream.cz/episode/52961d7e19d423f8f06f0100',
- 'duration': 256,
- },
+ 'title': 'Bůh',
+ 'display_id': 'buh',
+ 'description': 'md5:8f5f09b9b7bc67df910486cdd88f7165',
+ }
}, {
- 'url': 'http://www.stream.cz/blanik/10002447-tri-roky-pro-mazanka',
- 'md5': '849a88c1e1ca47d41403c2ba5e59e261',
+ 'url': 'https://www.stream.cz/tajemno/znicehonic-jim-skrz-strechu-prolitnul-zahadny-predmet-badatele-vse-objasnili-64147267',
+ 'md5': '3ee4d0be040e8f4a543e67e509d55e3f',
'info_dict': {
- 'id': '10002447',
+ 'id': '64147267',
'ext': 'mp4',
- 'title': 'Kancelář Blaník: Tři roky pro Mazánka',
- 'description': 'md5:3862a00ba7bf0b3e44806b544032c859',
- 'thumbnail': 're:^http://im.stream.cz/episode/537f838c50c11f8d21320000',
- 'duration': 368,
- },
+ 'title': 'Zničehonic jim skrz střechu prolítnul záhadný předmět. Badatelé vše objasnili',
+ 'display_id': 'znicehonic-jim-skrz-strechu-prolitnul-zahadny-predmet-badatele-vse-objasnili',
+ 'description': 'md5:1dcb5e010eb697dedc5942f76c5b3744',
+ }
}]
- def _real_extract(self, url):
- video_id = self._match_id(url)
- api_path = '/episode/%s' % video_id
-
- req = sanitized_Request(self._API_URL + api_path)
- req.add_header('Api-Password', _get_api_key(api_path))
- data = self._download_json(req, video_id)
+ def _extract_formats(self, spl_url, video):
+ for ext, pref, streams in (
+ ('ts', -1, traverse_obj(video, ('http_stream', 'qualities'))),
+ ('mp4', 1, video.get('mp4'))):
+ for format_id, stream in (streams or {}).items():
+ if not stream.get('url'):
+ continue
+ yield {
+ 'format_id': f'{format_id}-{ext}',
+ 'ext': ext,
+ 'source_preference': pref,
+ 'url': urljoin(spl_url, stream['url']),
+ 'tbr': float_or_none(stream.get('bandwidth'), scale=1000),
+ 'duration': float_or_none(stream.get('duration'), scale=1000),
+ 'width': traverse_obj(stream, ('resolution', 0)),
+ 'height': traverse_obj(stream, ('resolution', 1)) or int_or_none(format_id.replace('p', '')),
+ **parse_codecs(stream.get('codec')),
+ }
- formats = []
- for quality, video in enumerate(data['video_qualities']):
- for f in video['formats']:
- typ = f['type'].partition('/')[2]
- qlabel = video.get('quality_label')
- formats.append({
- 'format_note': '%s-%s' % (qlabel, typ) if qlabel else typ,
- 'format_id': '%s-%s' % (typ, f['quality']),
- 'url': f['source'],
- 'height': int_or_none(f['quality'].rstrip('p')),
- 'quality': quality,
- })
- self._sort_formats(formats)
+ def _real_extract(self, url):
+ display_id, video_id = self._match_valid_url(url).groups()
- image = data.get('image')
- if image:
- thumbnail = self._proto_relative_url(
- image.replace('{width}', '1240').replace('{height}', '697'),
- scheme='http:',
- )
- else:
- thumbnail = None
+ data = self._download_json(
+ 'https://www.televizeseznam.cz/api/graphql', video_id, 'Downloading GraphQL result',
+ data=json.dumps({
+ 'variables': {'urlName': video_id},
+ 'query': '''
+ query LoadEpisode($urlName : String){ episode(urlName: $urlName){ ...VideoDetailFragmentOnEpisode } }
+ fragment VideoDetailFragmentOnEpisode on Episode {
+ id
+ spl
+ urlName
+ name
+ perex
+ duration
+ views
+ }'''
+ }).encode('utf-8'),
+ headers={'Content-Type': 'application/json;charset=UTF-8'}
+ )['data']['episode']
- stream = data.get('_embedded', {}).get('stream:show', {}).get('name')
- if stream:
- title = '%s: %s' % (stream, data['name'])
- else:
- title = data['name']
+ spl_url = data['spl'] + 'spl2,3'
+ metadata = self._download_json(spl_url, video_id, 'Downloading playlist')
+ if 'Location' in metadata and 'data' not in metadata:
+ spl_url = metadata['Location']
+ metadata = self._download_json(spl_url, video_id, 'Downloading redirected playlist')
+ video = metadata['data']
subtitles = {}
- srt_url = data.get('subtitles_srt')
- if srt_url:
- subtitles['cs'] = [{
- 'ext': 'srt',
- 'url': srt_url,
- }]
+ for subs in video.get('subtitles', {}).values():
+ if not subs.get('language'):
+ continue
+ for ext, sub_url in (subs.get('urls') or {}).items():
+ subtitles.setdefault(subs['language'], []).append({
+ 'ext': ext,
+ 'url': urljoin(spl_url, sub_url)
+ })
+
+ formats = list(self._extract_formats(spl_url, video))
+ self._sort_formats(formats)
return {
'id': video_id,
- 'title': title,
- 'thumbnail': thumbnail,
- 'formats': formats,
- 'description': data.get('web_site_text'),
- 'duration': int_or_none(data.get('duration')),
+ 'display_id': display_id,
+ 'title': data.get('name'),
+ 'description': data.get('perex'),
+ 'duration': float_or_none(data.get('duration')),
'view_count': int_or_none(data.get('views')),
+ 'formats': formats,
'subtitles': subtitles,
}
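The rewritten Stream.cz extractor exchanges the retired time-salted API key for a GraphQL lookup that returns the 'spl' playlist base. The same request can be reproduced standalone; this sketch inlines the fragment fields from the query above (the urlName value is illustrative):

import json
import urllib.request

payload = json.dumps({
    'variables': {'urlName': '57953890'},  # numeric id from the page URL
    'query': 'query LoadEpisode($urlName: String){ episode(urlName: $urlName){ id spl urlName name perex duration views } }',
}).encode('utf-8')
req = urllib.request.Request(
    'https://www.televizeseznam.cz/api/graphql', data=payload,
    headers={'Content-Type': 'application/json;charset=UTF-8'})
episode = json.load(urllib.request.urlopen(req))['data']['episode']
print(episode['spl'])  # base URL; the extractor appends 'spl2,3'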
diff --git a/yt_dlp/extractor/stv.py b/yt_dlp/extractor/stv.py
index d36a4b6e9..ba5661d74 100644
--- a/yt_dlp/extractor/stv.py
+++ b/yt_dlp/extractor/stv.py
@@ -45,10 +45,7 @@ class STVPlayerIE(InfoExtractor):
ptype, video_id = self._match_valid_url(url).groups()
webpage = self._download_webpage(url, video_id, fatal=False) or ''
- props = (self._parse_json(self._search_regex(
- r'<script[^>]+id="__NEXT_DATA__"[^>]*>({.+?})</script>',
- webpage, 'next data', default='{}'), video_id,
- fatal=False) or {}).get('props') or {}
+ props = self._search_nextjs_data(webpage, video_id, default='{}').get('props') or {}
player_api_cache = try_get(
props, lambda x: x['initialReduxState']['playerApiCache']) or {}
diff --git a/yt_dlp/extractor/ted.py b/yt_dlp/extractor/ted.py
index f09f1a3f9..b5c7e35ac 100644
--- a/yt_dlp/extractor/ted.py
+++ b/yt_dlp/extractor/ted.py
@@ -1,274 +1,105 @@
-from __future__ import unicode_literals
-
-import json
+import itertools
import re
from .common import InfoExtractor
-from ..compat import (
- compat_str,
- compat_urlparse
-)
from ..utils import (
- extract_attributes,
- float_or_none,
int_or_none,
+ str_to_int,
try_get,
url_or_none,
+ unified_strdate,
+ parse_duration,
)
-class TEDIE(InfoExtractor):
- IE_NAME = 'ted'
- _VALID_URL = r'''(?x)
- (?P<proto>https?://)
- (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
- (
- (?P<type_playlist>playlists(?:/(?P<playlist_id>\d+))?) # We have a playlist
- |
- ((?P<type_talk>talks)) # We have a simple talk
- |
- (?P<type_watch>watch)/[^/]+/[^/]+
- )
- (/lang/(.*?))? # The url may contain the language
- /(?P<name>[\w-]+) # Here goes the name and then ".html"
- .*)$
- '''
+class TedBaseIE(InfoExtractor):
+ _VALID_URL_BASE = r'https?://www\.ted\.com/(?:{type})(?:/lang/[^/#?]+)?/(?P<id>[\w-]+)'
+
+ def _parse_playlist(self, playlist):
+ for entry in try_get(playlist, lambda x: x['videos']['nodes'], list):
+ if entry.get('__typename') == 'Video' and entry.get('canonicalUrl'):
+ yield self.url_result(entry['canonicalUrl'], TedTalkIE.ie_key())
+
+
+class TedTalkIE(TedBaseIE):
+ _VALID_URL = TedBaseIE._VALID_URL_BASE.format(type='talks')
_TESTS = [{
- 'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
- 'md5': 'b0ce2b05ca215042124fbc9e3886493a',
- 'info_dict': {
- 'id': '102',
- 'ext': 'mp4',
- 'title': 'The illusion of consciousness',
- 'description': ('Philosopher Dan Dennett makes a compelling '
- 'argument that not only don\'t we understand our own '
- 'consciousness, but that half the time our brains are '
- 'actively fooling us.'),
- 'uploader': 'Dan Dennett',
- 'width': 853,
- 'duration': 1308,
- 'view_count': int,
- 'comment_count': int,
- 'tags': list,
- },
- 'params': {
- 'skip_download': True,
- },
- }, {
- # missing HTTP bitrates
- 'url': 'https://www.ted.com/talks/vishal_sikka_the_beauty_and_power_of_algorithms',
- 'info_dict': {
- 'id': '6069',
- 'ext': 'mp4',
- 'title': 'The beauty and power of algorithms',
- 'thumbnail': r're:^https?://.+\.jpg',
- 'description': 'md5:734e352710fb00d840ab87ae31aaf688',
- 'uploader': 'Vishal Sikka',
- },
- 'params': {
- 'skip_download': True,
- },
- }, {
- 'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
- 'md5': 'e6b9617c01a7970ceac8bb2c92c346c0',
- 'info_dict': {
- 'id': '1972',
- 'ext': 'mp4',
- 'title': 'Be passionate. Be courageous. Be your best.',
- 'uploader': 'Gabby Giffords and Mark Kelly',
- 'description': 'md5:5174aed4d0f16021b704120360f72b92',
- 'duration': 1128,
- },
- 'params': {
- 'skip_download': True,
- },
- }, {
- 'url': 'http://www.ted.com/playlists/who_are_the_hackers',
- 'info_dict': {
- 'id': '10',
- 'title': 'Who are the hackers?',
- 'description': 'md5:49a0dbe8fb76d81a0e64b4a80af7f15a'
- },
- 'playlist_mincount': 6,
- }, {
- # contains a youtube video
- 'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
- 'add_ie': ['Youtube'],
- 'info_dict': {
- 'id': '_ZG8HBuDjgc',
- 'ext': 'webm',
- 'title': 'Douglas Adams: Parrots the Universe and Everything',
- 'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
- 'uploader': 'University of California Television (UCTV)',
- 'uploader_id': 'UCtelevision',
- 'upload_date': '20080522',
- },
- 'params': {
- 'skip_download': True,
- },
- }, {
- # no nativeDownloads
- 'url': 'https://www.ted.com/talks/tom_thum_the_orchestra_in_my_mouth',
+ 'url': 'https://www.ted.com/talks/candace_parker_how_to_break_down_barriers_and_not_accept_limits',
+ 'md5': '47e82c666d9c3261d4fe74748a90aada',
'info_dict': {
- 'id': '1792',
+ 'id': '86532',
'ext': 'mp4',
- 'title': 'The orchestra in my mouth',
- 'description': 'md5:5d1d78650e2f8dfcbb8ebee2951ac29a',
- 'uploader': 'Tom Thum',
+ 'title': 'How to break down barriers and not accept limits',
+ 'description': 'md5:000707cece219d1e165b11550d612331',
'view_count': int,
- 'comment_count': int,
- 'tags': list,
+ 'tags': ['personal growth', 'equality', 'activism', 'motivation', 'social change', 'sports'],
+ 'uploader': 'Candace Parker',
+ 'duration': 676.0,
+ 'upload_date': '20220114',
+ 'release_date': '20211201',
+ 'thumbnail': r're:http.*\.jpg',
},
- 'params': {
- 'skip_download': True,
- },
- }, {
- # with own formats and private Youtube external
- 'url': 'https://www.ted.com/talks/spencer_wells_a_family_tree_for_humanity',
- 'only_matching': True,
}]
- _NATIVE_FORMATS = {
- 'low': {'width': 320, 'height': 180},
- 'medium': {'width': 512, 'height': 288},
- 'high': {'width': 854, 'height': 480},
- }
-
- def _extract_info(self, webpage):
- info_json = self._search_regex(
- r'(?s)q\(\s*"\w+.init"\s*,\s*({.+?})\)\s*</script>',
- webpage, 'info json')
- return json.loads(info_json)
-
def _real_extract(self, url):
- m = re.match(self._VALID_URL, url, re.VERBOSE)
- if m.group('type').startswith('embed'):
- desktop_url = m.group('proto') + 'www' + m.group('urlmain')
- return self.url_result(desktop_url, 'TED')
- name = m.group('name')
- if m.group('type_talk'):
- return self._talk_info(url, name)
- elif m.group('type_watch'):
- return self._watch_info(url, name)
- else:
- return self._playlist_videos_info(url, name)
-
- def _playlist_videos_info(self, url, name):
- '''Returns the videos of the playlist'''
-
- webpage = self._download_webpage(url, name,
- 'Downloading playlist webpage')
-
- playlist_entries = []
- for entry in re.findall(r'(?s)<[^>]+data-ga-context=["\']playlist["\'][^>]*>', webpage):
- attrs = extract_attributes(entry)
- entry_url = compat_urlparse.urljoin(url, attrs['href'])
- playlist_entries.append(self.url_result(entry_url, self.ie_key()))
-
- final_url = self._og_search_url(webpage, fatal=False)
- playlist_id = (
- re.match(self._VALID_URL, final_url).group('playlist_id')
- if final_url else None)
-
- return self.playlist_result(
- playlist_entries, playlist_id=playlist_id,
- playlist_title=self._og_search_title(webpage, fatal=False),
- playlist_description=self._og_search_description(webpage))
-
- def _talk_info(self, url, video_name):
- webpage = self._download_webpage(url, video_name)
-
- info = self._extract_info(webpage)
-
- data = try_get(info, lambda x: x['__INITIAL_DATA__'], dict) or info
- talk_info = data['talks'][0]
-
- title = talk_info['title'].strip()
-
- downloads = talk_info.get('downloads') or {}
- native_downloads = downloads.get('nativeDownloads') or talk_info.get('nativeDownloads') or {}
-
- formats = [{
- 'url': format_url,
- 'format_id': format_id,
- } for (format_id, format_url) in native_downloads.items() if format_url is not None]
-
- subtitled_downloads = downloads.get('subtitledDownloads') or {}
- for lang, subtitled_download in subtitled_downloads.items():
- for q in self._NATIVE_FORMATS:
- q_url = subtitled_download.get(q)
- if not q_url:
- continue
- formats.append({
- 'url': q_url,
- 'format_id': '%s-%s' % (q, lang),
- 'language': lang,
- })
-
- if formats:
- for f in formats:
- finfo = self._NATIVE_FORMATS.get(f['format_id'].split('-')[0])
- if finfo:
- f.update(finfo)
-
- player_talk = talk_info['player_talks'][0]
-
- resources_ = player_talk.get('resources') or talk_info.get('resources')
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ talk_info = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['videoData']
+ video_id = talk_info['id']
+ playerData = self._parse_json(talk_info.get('playerData'), video_id)
http_url = None
- for format_id, resources in resources_.items():
+ formats, subtitles = [], {}
+ for format_id, resources in (playerData.get('resources') or {}).items():
if format_id == 'hls':
- if not isinstance(resources, dict):
- continue
- stream_url = url_or_none(resources.get('stream'))
+ stream_url = url_or_none(try_get(resources, lambda x: x['stream']))
if not stream_url:
continue
- formats.extend(self._extract_m3u8_formats(
- stream_url, video_name, 'mp4', m3u8_id=format_id,
- fatal=False))
- else:
- if not isinstance(resources, list):
- continue
- if format_id == 'h264':
- for resource in resources:
- h264_url = resource.get('file')
- if not h264_url:
- continue
- bitrate = int_or_none(resource.get('bitrate'))
- formats.append({
- 'url': h264_url,
- 'format_id': '%s-%sk' % (format_id, bitrate),
- 'tbr': bitrate,
- })
- if re.search(r'\d+k', h264_url):
- http_url = h264_url
- elif format_id == 'rtmp':
- streamer = talk_info.get('streamer')
- if not streamer:
+ m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
+ stream_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)
+ formats.extend(m3u8_formats)
+ subtitles = self._merge_subtitles(subtitles, m3u8_subs)
+ continue
+
+ if not isinstance(resources, list):
+ continue
+ if format_id == 'h264':
+ for resource in resources:
+ h264_url = resource.get('file')
+ if not h264_url:
continue
- for resource in resources:
- formats.append({
- 'format_id': '%s-%s' % (format_id, resource.get('name')),
- 'url': streamer,
- 'play_path': resource['file'],
- 'ext': 'flv',
- 'width': int_or_none(resource.get('width')),
- 'height': int_or_none(resource.get('height')),
- 'tbr': int_or_none(resource.get('bitrate')),
- })
+ bitrate = int_or_none(resource.get('bitrate'))
+ formats.append({
+ 'url': h264_url,
+ 'format_id': '%s-%sk' % (format_id, bitrate),
+ 'tbr': bitrate,
+ })
+ if re.search(r'\d+k', h264_url):
+ http_url = h264_url
+ elif format_id == 'rtmp':
+ streamer = talk_info.get('streamer')
+ if not streamer:
+ continue
+ formats.extend({
+ 'format_id': '%s-%s' % (format_id, resource.get('name')),
+ 'url': streamer,
+ 'play_path': resource['file'],
+ 'ext': 'flv',
+ 'width': int_or_none(resource.get('width')),
+ 'height': int_or_none(resource.get('height')),
+ 'tbr': int_or_none(resource.get('bitrate')),
+ } for resource in resources if resource.get('file'))
- m3u8_formats = list(filter(
- lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none',
- formats))
if http_url:
+ m3u8_formats = [f for f in formats if f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none']
for m3u8_format in m3u8_formats:
bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None)
if not bitrate:
continue
bitrate_url = re.sub(r'\d+k', bitrate, http_url)
if not self._is_valid_url(
- bitrate_url, video_name, '%s bitrate' % bitrate):
+ bitrate_url, video_id, '%s bitrate' % bitrate):
continue
f = m3u8_format.copy()
f.update({
@@ -289,79 +120,123 @@ class TEDIE(InfoExtractor):
})
if not formats:
- external = player_talk.get('external')
- if isinstance(external, dict):
- service = external.get('service')
- if isinstance(service, compat_str):
- ext_url = None
- if service.lower() == 'youtube':
- ext_url = external.get('code')
- return self.url_result(ext_url or external['uri'])
+ external = playerData.get('external') or {}
+ service = external.get('service') or ''
+ ext_url = external.get('code') if service.lower() == 'youtube' else None
+ return self.url_result(ext_url or external['uri'])
self._sort_formats(formats)
- video_id = compat_str(talk_info['id'])
+ thumbnail = playerData.get('thumb') or self._og_search_property('image', webpage)
+ if thumbnail:
+ # trim thumbnail resize parameters
+ thumbnail = thumbnail.split('?')[0]
return {
'id': video_id,
- 'title': title,
- 'uploader': player_talk.get('speaker') or talk_info.get('speaker'),
- 'thumbnail': player_talk.get('thumb') or talk_info.get('thumb'),
- 'description': self._og_search_description(webpage),
- 'subtitles': self._get_subtitles(video_id, talk_info),
+ 'title': talk_info.get('title') or self._og_search_title(webpage),
+ 'uploader': talk_info.get('presenterDisplayName'),
+ 'thumbnail': thumbnail,
+ 'description': talk_info.get('description') or self._og_search_description(webpage),
+ 'subtitles': subtitles,
'formats': formats,
- 'duration': float_or_none(talk_info.get('duration')),
- 'view_count': int_or_none(data.get('viewed_count')),
- 'comment_count': int_or_none(
- try_get(data, lambda x: x['comments']['count'])),
- 'tags': try_get(talk_info, lambda x: x['tags'], list),
+ 'duration': talk_info.get('duration') or parse_duration(self._og_search_property('video:duration', webpage)),
+ 'view_count': str_to_int(talk_info.get('viewedCount')),
+ 'upload_date': unified_strdate(talk_info.get('publishedAt')),
+ 'release_date': unified_strdate(talk_info.get('recordedOn')),
+ 'tags': try_get(playerData, lambda x: x['targeting']['tag'].split(',')),
}
- def _get_subtitles(self, video_id, talk_info):
- sub_lang_list = {}
- for language in try_get(
- talk_info,
- (lambda x: x['downloads']['languages'],
- lambda x: x['languages']), list):
- lang_code = language.get('languageCode') or language.get('ianaCode')
- if not lang_code:
- continue
- sub_lang_list[lang_code] = [
- {
- 'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, lang_code, ext),
- 'ext': ext,
- }
- for ext in ['ted', 'srt']
- ]
- return sub_lang_list
- def _watch_info(self, url, name):
- webpage = self._download_webpage(url, name)
+class TedSeriesIE(TedBaseIE):
+ _VALID_URL = fr'{TedBaseIE._VALID_URL_BASE.format(type=r"series")}(?:#season_(?P<season>\d+))?'
+ _TESTS = [{
+ 'url': 'https://www.ted.com/series/small_thing_big_idea',
+ 'info_dict': {
+ 'id': '3',
+ 'title': 'Small Thing Big Idea',
+ 'series': 'Small Thing Big Idea',
+ 'description': 'md5:6869ca52cec661aef72b3e9f7441c55c'
+ },
+ 'playlist_mincount': 16,
+ }, {
+ 'url': 'https://www.ted.com/series/the_way_we_work#season_2',
+ 'info_dict': {
+ 'id': '8_2',
+ 'title': 'The Way We Work Season 2',
+ 'series': 'The Way We Work',
+ 'description': 'md5:59469256e533e1a48c4aa926a382234c',
+ 'season_number': 2
+ },
+ 'playlist_mincount': 8,
+ }]
- config_json = self._html_search_regex(
- r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
- webpage, 'config', default=None)
- if not config_json:
- embed_url = self._search_regex(
- r"<iframe[^>]+class='pages-video-embed__video__object'[^>]+src='([^']+)'", webpage, 'embed url')
- return self.url_result(self._proto_relative_url(embed_url))
- config = json.loads(config_json)['config']
- video_url = config['video']['url']
- thumbnail = config.get('image', {}).get('url')
+ def _real_extract(self, url):
+ display_id, season = self._match_valid_url(url).group('id', 'season')
+ webpage = self._download_webpage(url, display_id, 'Downloading series webpage')
+ info = self._search_nextjs_data(webpage, display_id)['props']['pageProps']
- title = self._html_search_regex(
- r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
- description = self._html_search_regex(
- [
- r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
- r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
- ],
- webpage, 'description', fatal=False)
+ entries = itertools.chain.from_iterable(
+ self._parse_playlist(s) for s in info['seasons'] if season in [None, s.get('seasonNumber')])
- return {
- 'id': name,
- 'url': video_url,
- 'title': title,
- 'thumbnail': thumbnail,
- 'description': description,
- }
+ series_id = try_get(info, lambda x: x['series']['id'])
+ series_name = try_get(info, lambda x: x['series']['name']) or self._og_search_title(webpage, fatal=False)
+
+ return self.playlist_result(
+ entries,
+ f'{series_id}_{season}' if season and series_id else series_id,
+ f'{series_name} Season {season}' if season else series_name,
+ self._og_search_description(webpage),
+ series=series_name, season_number=int_or_none(season))
+
+
+class TedPlaylistIE(TedBaseIE):
+ _VALID_URL = TedBaseIE._VALID_URL_BASE.format(type=r'playlists(?:/\d+)?')
+ _TESTS = [{
+ 'url': 'https://www.ted.com/playlists/171/the_most_popular_talks_of_all',
+ 'info_dict': {
+ 'id': '171',
+ 'title': 'The most popular talks of all time',
+ 'description': 'md5:d2f22831dc86c7040e733a3cb3993d78'
+ },
+ 'playlist_mincount': 25,
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ playlist = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['playlist']
+
+ return self.playlist_result(
+ self._parse_playlist(playlist), playlist.get('id'),
+ playlist.get('title') or self._og_search_title(webpage, default='').replace(' | TED Talks', '') or None,
+ self._og_search_description(webpage))
+
+
+class TedEmbedIE(InfoExtractor):
+ _VALID_URL = r'https?://embed(?:-ssl)?\.ted\.com/'
+
+ _TESTS = [{
+ 'url': 'https://embed.ted.com/talks/janet_stovall_how_to_get_serious_about_diversity_and_inclusion_in_the_workplace',
+ 'info_dict': {
+ 'id': '21802',
+ 'ext': 'mp4',
+ 'title': 'How to get serious about diversity and inclusion in the workplace',
+ 'description': 'md5:0978aafe396e05341f8ecc795d22189d',
+ 'view_count': int,
+ 'tags': list,
+ 'uploader': 'Janet Stovall',
+ 'duration': 664.0,
+ 'upload_date': '20180822',
+ 'release_date': '20180719',
+ 'thumbnail': r're:http.*\.jpg',
+ },
+ }]
+
+ @classmethod
+ def _extract_urls(cls, webpage):
+ return [mobj.group('url') for mobj in re.finditer(
+ fr'<iframe[^>]+?src=(["\'])(?P<url>{cls._VALID_URL}.+?)\1', webpage)]
+
+ def _real_extract(self, url):
+ return self.url_result(re.sub(r'://embed(-ssl)?', '://www', url), TedTalkIE.ie_key())
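The TED rewrite splits one monolithic extractor into a small hierarchy that shares a single URL template, specialised per class via str.format. The same pattern in isolation (domain and names illustrative):

import re

_VALID_URL_BASE = r'https?://www\.example\.com/(?:{type})/(?P<id>[\w-]+)'

TALK_URL = _VALID_URL_BASE.format(type='talks')
PLAYLIST_URL = _VALID_URL_BASE.format(type=r'playlists(?:/\d+)?')

assert re.match(TALK_URL, 'https://www.example.com/talks/some-talk')
assert re.match(PLAYLIST_URL, 'https://www.example.com/playlists/171/some-list')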
diff --git a/yt_dlp/extractor/telemundo.py b/yt_dlp/extractor/telemundo.py
index e326bbdd5..ebcecf55f 100644
--- a/yt_dlp/extractor/telemundo.py
+++ b/yt_dlp/extractor/telemundo.py
@@ -34,8 +34,7 @@ class TelemundoIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- metadata = self._parse_json(
- self._search_regex(r'<[^>]+id="__NEXT_DATA__"[^>]+>([^<]+)', webpage, 'JSON metadata'), video_id)
+ metadata = self._search_nextjs_data(webpage, video_id)
redirect_url = try_get(
metadata,
lambda x: x['props']['initialState']['video']['associatedPlaylists'][0]['videos'][0]['videoAssets'][0]['publicUrl'])
diff --git a/yt_dlp/extractor/theta.py b/yt_dlp/extractor/theta.py
index 3b6543629..8b6d70a9f 100644
--- a/yt_dlp/extractor/theta.py
+++ b/yt_dlp/extractor/theta.py
@@ -6,7 +6,7 @@ from ..utils import try_get
class ThetaStreamIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?theta\.tv/(?!video/)(?P<id>[a-z0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?theta\.tv/(?!video/)(?P<id>[a-z0-9-]+)'
_TESTS = [{
'url': 'https://www.theta.tv/davirus',
'skip': 'The live may have ended',
@@ -25,6 +25,14 @@ class ThetaStreamIE(InfoExtractor):
'title': 'Mystery Science Theatre 3000 24/7 Powered by the THETA Network.',
'thumbnail': r're:https://user-prod-theta-tv\.imgix\.net/.+\.jpg',
}
+ }, {
+ 'url': 'https://www.theta.tv/contv-anime',
+ 'info_dict': {
+ 'id': 'ConTVAnime',
+ 'ext': 'mp4',
+ 'title': 'CONTV ANIME 24/7. Powered by THETA Network.',
+ 'thumbnail': r're:https://user-prod-theta-tv\.imgix\.net/.+\.jpg',
+ }
}]
def _real_extract(self, url):
diff --git a/yt_dlp/extractor/thisoldhouse.py b/yt_dlp/extractor/thisoldhouse.py
index a3d9b4017..8a1d17311 100644
--- a/yt_dlp/extractor/thisoldhouse.py
+++ b/yt_dlp/extractor/thisoldhouse.py
@@ -2,6 +2,7 @@
from __future__ import unicode_literals
from .common import InfoExtractor
+from ..utils import HEADRequest
class ThisOldHouseIE(InfoExtractor):
@@ -15,6 +16,11 @@ class ThisOldHouseIE(InfoExtractor):
'description': 'In the workshop, Tom Silva and Kevin O\'Connor build a storage bench for an entryway.',
'timestamp': 1442548800,
'upload_date': '20150918',
+ 'duration': 674,
+ 'view_count': int,
+ 'average_rating': 0,
+ 'thumbnail': r're:^https?://.*\.jpg\?\d+$',
+ 'display_id': 'how-to-build-a-storage-bench',
},
'params': {
'skip_download': True,
@@ -41,7 +47,12 @@ class ThisOldHouseIE(InfoExtractor):
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
- video_id = self._search_regex(
- r'<iframe[^>]+src=[\'"](?:https?:)?//(?:www\.)?thisoldhouse\.(?:chorus\.build|com)/videos/zype/([0-9a-f]{24})',
- webpage, 'video id')
+ if 'To Unlock This content' in webpage:
+ self.raise_login_required(method='cookies')
+ video_url = self._search_regex(
+ r'<iframe[^>]+src=[\'"]((?:https?:)?//(?:www\.)?thisoldhouse\.(?:chorus\.build|com)/videos/zype/([0-9a-f]{24})[^\'"]*)[\'"]',
+ webpage, 'video url')
+ if 'subscription_required=true' in video_url or 'c-entry-group-labels__image' in webpage:
+ return self.url_result(self._request_webpage(HEADRequest(video_url), display_id).geturl(), 'Zype', display_id)
+ video_id = self._search_regex(r'(?:https?:)?//(?:www\.)?thisoldhouse\.(?:chorus\.build|com)/videos/zype/([0-9a-f]{24})', video_url, 'video id')
return self.url_result(self._ZYPE_TMPL % video_id, 'Zype', video_id)
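HEADRequest (yt_dlp.utils) turns _request_webpage into a HEAD round-trip, so the subscription redirect can be resolved to the final Zype URL without fetching a response body. The pattern in isolation, as a sketch (URL illustrative):

from yt_dlp.utils import HEADRequest

# HEAD requests still follow Location headers; geturl() then reports
# the final URL after all redirects
final_url = self._request_webpage(
    HEADRequest('https://www.thisoldhouse.com/videos/zype/0123456789abcdef01234567'),  # illustrative
    display_id).geturl()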
diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py
index 18f1c5630..4150c3ff3 100644
--- a/yt_dlp/extractor/tiktok.py
+++ b/yt_dlp/extractor/tiktok.py
@@ -8,11 +8,16 @@ import time
import json
from .common import InfoExtractor
-from ..compat import compat_urllib_parse_unquote
+from ..compat import (
+ compat_urllib_parse_unquote,
+ compat_urllib_parse_urlparse
+)
from ..utils import (
ExtractorError,
int_or_none,
join_nonempty,
+ LazyList,
+ srt_subtitles_timecode,
str_or_none,
traverse_obj,
try_get,
@@ -22,24 +27,36 @@ from ..utils import (
class TikTokBaseIE(InfoExtractor):
- _APP_VERSION = '20.1.0'
- _MANIFEST_APP_VERSION = '200'
+ _APP_VERSIONS = [('20.9.3', '293'), ('20.4.3', '243'), ('20.2.1', '221'), ('20.1.2', '212'), ('20.0.4', '204')]
+ _WORKING_APP_VERSION = None
_APP_NAME = 'trill'
_AID = 1180
_API_HOSTNAME = 'api-h2.tiktokv.com'
_UPLOADER_URL_FORMAT = 'https://www.tiktok.com/@%s'
_WEBPAGE_HOST = 'https://www.tiktok.com/'
- QUALITIES = ('360p', '540p', '720p')
+ QUALITIES = ('360p', '540p', '720p', '1080p')
- def _call_api(self, ep, query, video_id, fatal=True,
- note='Downloading API JSON', errnote='Unable to download API page'):
- real_query = {
+ def _call_api_impl(self, ep, query, manifest_app_version, video_id, fatal=True,
+ note='Downloading API JSON', errnote='Unable to download API page'):
+ self._set_cookie(self._API_HOSTNAME, 'odin_tt', ''.join(random.choice('0123456789abcdef') for _ in range(160)))
+ webpage_cookies = self._get_cookies(self._WEBPAGE_HOST)
+ if webpage_cookies.get('sid_tt'):
+ self._set_cookie(self._API_HOSTNAME, 'sid_tt', webpage_cookies['sid_tt'].value)
+ return self._download_json(
+ 'https://%s/aweme/v1/%s/' % (self._API_HOSTNAME, ep), video_id=video_id,
+ fatal=fatal, note=note, errnote=errnote, headers={
+ 'User-Agent': f'com.ss.android.ugc.trill/{manifest_app_version} (Linux; U; Android 10; en_US; Pixel 4; Build/QQ3A.200805.001; Cronet/58.0.2991.0)',
+ 'Accept': 'application/json',
+ }, query=query)
+
+ def _build_api_query(self, query, app_version, manifest_app_version):
+ return {
**query,
- 'version_name': self._APP_VERSION,
- 'version_code': self._MANIFEST_APP_VERSION,
- 'build_number': self._APP_VERSION,
- 'manifest_version_code': self._MANIFEST_APP_VERSION,
- 'update_version_code': self._MANIFEST_APP_VERSION,
+ 'version_name': app_version,
+ 'version_code': manifest_app_version,
+ 'build_number': app_version,
+ 'manifest_version_code': manifest_app_version,
+ 'update_version_code': manifest_app_version,
'openudid': ''.join(random.choice('0123456789abcdef') for _ in range(16)),
'uuid': ''.join([random.choice(string.digits) for _ in range(16)]),
'_rticket': int(time.time() * 1000),
@@ -68,16 +85,61 @@ class TikTokBaseIE(InfoExtractor):
'as': 'a1qwert123',
'cp': 'cbfhckdckkde1',
}
- self._set_cookie(self._API_HOSTNAME, 'odin_tt', ''.join(random.choice('0123456789abcdef') for _ in range(160)))
- webpage_cookies = self._get_cookies(self._WEBPAGE_HOST)
- if webpage_cookies.get('sid_tt'):
- self._set_cookie(self._API_HOSTNAME, 'sid_tt', webpage_cookies['sid_tt'].value)
- return self._download_json(
- 'https://%s/aweme/v1/%s/' % (self._API_HOSTNAME, ep), video_id=video_id,
- fatal=fatal, note=note, errnote=errnote, headers={
- 'User-Agent': f'com.ss.android.ugc.trill/{self._MANIFEST_APP_VERSION} (Linux; U; Android 10; en_US; Pixel 4; Build/QQ3A.200805.001; Cronet/58.0.2991.0)',
- 'Accept': 'application/json',
- }, query=real_query)
+
+ def _call_api(self, ep, query, video_id, fatal=True,
+ note='Downloading API JSON', errnote='Unable to download API page'):
+ if not self._WORKING_APP_VERSION:
+ app_version = self._configuration_arg('app_version', [''], ie_key=TikTokIE.ie_key())[0]
+ manifest_app_version = self._configuration_arg('manifest_app_version', [''], ie_key=TikTokIE.ie_key())[0]
+ if app_version and manifest_app_version:
+ self._WORKING_APP_VERSION = (app_version, manifest_app_version)
+ self.write_debug('Imported app version combo from extractor arguments')
+ elif app_version or manifest_app_version:
+ self.report_warning('Only one of the two required version params is passed as extractor arguments', only_once=True)
+
+ if self._WORKING_APP_VERSION:
+ app_version, manifest_app_version = self._WORKING_APP_VERSION
+ real_query = self._build_api_query(query, app_version, manifest_app_version)
+ return self._call_api_impl(ep, real_query, manifest_app_version, video_id, fatal, note, errnote)
+
+ for count, (app_version, manifest_app_version) in enumerate(self._APP_VERSIONS, start=1):
+ real_query = self._build_api_query(query, app_version, manifest_app_version)
+ try:
+ res = self._call_api_impl(ep, real_query, manifest_app_version, video_id, fatal, note, errnote)
+ self._WORKING_APP_VERSION = (app_version, manifest_app_version)
+ return res
+ except ExtractorError as e:
+ if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0:
+ if count == len(self._APP_VERSIONS):
+ if fatal:
+ raise e
+ else:
+ self.report_warning(str(e.cause or e.msg))
+ return
+ self.report_warning('%s. Retrying... (attempt %s of %s)' % (str(e.cause or e.msg), count, len(self._APP_VERSIONS)))
+ continue
+ raise e
+
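Since _call_api now probes _APP_VERSIONS until one works, a known-good pair can also be pinned from the command line; _configuration_arg reads values supplied via --extractor-args, using the argument names introduced above (version numbers illustrative):

# pin one combination instead of probing all five built-in pairs
yt-dlp --extractor-args "tiktok:app_version=20.9.3;manifest_app_version=293" URL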
+ def _get_subtitles(self, aweme_detail, aweme_id):
+ # TODO: Extract text positioning info
+ subtitles = {}
+ captions_info = traverse_obj(
+ aweme_detail, ('interaction_stickers', ..., 'auto_video_caption_info', 'auto_captions', ...), expected_type=dict, default=[])
+ for caption in captions_info:
+ caption_url = traverse_obj(caption, ('url', 'url_list', ...), expected_type=url_or_none, get_all=False)
+ if not caption_url:
+ continue
+ caption_json = self._download_json(
+ caption_url, aweme_id, note='Downloading captions', errnote='Unable to download captions', fatal=False)
+ if not caption_json:
+ continue
+ subtitles.setdefault(caption.get('language', 'en'), []).append({
+ 'ext': 'srt',
+ 'data': '\n\n'.join(
+ f'{i + 1}\n{srt_subtitles_timecode(line["start_time"] / 1000)} --> {srt_subtitles_timecode(line["end_time"] / 1000)}\n{line["text"]}'
+ for i, line in enumerate(caption_json['utterances']) if line.get('text'))
+ })
+ return subtitles
def _parse_aweme_video_app(self, aweme_detail):
aweme_id = aweme_detail['aweme_id']
@@ -123,7 +185,7 @@ class TikTokBaseIE(InfoExtractor):
'format_id': 'play_addr',
'format_note': 'Direct video',
'vcodec': 'h265' if traverse_obj(
- video_info, 'is_bytevc1', 'is_h265') else 'h264', # Always h264?
+ video_info, 'is_bytevc1', 'is_h265') else 'h264', # TODO: Check for "direct iOS" videos, like https://www.tiktok.com/@cookierun_dev/video/7039716639834656002
'width': video_info.get('width'),
'height': video_info.get('height'),
}))
@@ -161,6 +223,10 @@ class TikTokBaseIE(InfoExtractor):
}))
self._remove_duplicate_formats(formats)
+ auth_cookie = self._get_cookies(self._WEBPAGE_HOST).get('sid_tt')
+ if auth_cookie:
+ for f in formats:
+ self._set_cookie(compat_urllib_parse_urlparse(f['url']).hostname, 'sid_tt', auth_cookie.value)
self._sort_formats(formats, ('quality', 'codec', 'size', 'br'))
thumbnails = []
@@ -210,6 +276,7 @@ class TikTokBaseIE(InfoExtractor):
'artist': music_author,
'timestamp': int_or_none(aweme_detail.get('create_time')),
'formats': formats,
+ 'subtitles': self.extract_subtitles(aweme_detail, aweme_id),
'thumbnails': thumbnails,
'duration': int_or_none(traverse_obj(video_info, 'duration', ('download_addr', 'duration')), scale=1000),
'availability': self._availability(
@@ -220,12 +287,13 @@ class TikTokBaseIE(InfoExtractor):
def _parse_aweme_video_web(self, aweme_detail, webpage_url):
video_info = aweme_detail['video']
- author_info = traverse_obj(aweme_detail, 'author', 'authorInfo', default={})
+ author_info = traverse_obj(aweme_detail, 'authorInfo', 'author', expected_type=dict, default={})
music_info = aweme_detail.get('music') or {}
stats_info = aweme_detail.get('stats') or {}
user_url = self._UPLOADER_URL_FORMAT % (traverse_obj(author_info,
'secUid', 'id', 'uid', 'uniqueId',
- expected_type=str_or_none, get_all=False))
+ expected_type=str_or_none, get_all=False)
+ or aweme_detail.get('authorSecId'))
formats = []
play_url = video_info.get('playAddr')
@@ -277,8 +345,8 @@ class TikTokBaseIE(InfoExtractor):
'comment_count': int_or_none(stats_info.get('commentCount')),
'timestamp': int_or_none(aweme_detail.get('createTime')),
'creator': str_or_none(author_info.get('nickname')),
- 'uploader': str_or_none(author_info.get('uniqueId')),
- 'uploader_id': str_or_none(author_info.get('id')),
+ 'uploader': str_or_none(author_info.get('uniqueId') or aweme_detail.get('author')),
+ 'uploader_id': str_or_none(author_info.get('id') or aweme_detail.get('authorId')),
'uploader_url': user_url,
'track': str_or_none(music_info.get('title')),
'album': str_or_none(music_info.get('album')) or None,
@@ -387,6 +455,10 @@ class TikTokIE(TikTokBaseIE):
'comment_count': int,
},
'expected_warnings': ['Video not available']
+ }, {
+ # Auto-captions available
+ 'url': 'https://www.tiktok.com/@hankgreen1/video/7047596209028074758',
+ 'only_matching': True
}]
def _extract_aweme_app(self, aweme_id):
@@ -399,7 +471,7 @@ class TikTokIE(TikTokBaseIE):
self.report_warning(f'{e}; Retrying with feed workaround')
feed_list = self._call_api('feed', {'aweme_id': aweme_id}, aweme_id,
note='Downloading video feed', errnote='Unable to download video feed').get('aweme_list') or []
- aweme_detail = next(aweme for aweme in feed_list if str(aweme.get('aweme_id')) == aweme_id)
+ aweme_detail = next((aweme for aweme in feed_list if str(aweme.get('aweme_id')) == aweme_id), None)
if not aweme_detail:
raise ExtractorError('Unable to find video in feed', video_id=aweme_id)
return self._parse_aweme_video_app(aweme_detail)
@@ -415,19 +487,23 @@ class TikTokIE(TikTokBaseIE):
# If we only call once, we get a 403 when downloading the video.
self._download_webpage(url, video_id)
webpage = self._download_webpage(url, video_id, note='Downloading video webpage')
- json_string = self._search_regex(
- r'id=\"__NEXT_DATA__\"\s+type=\"application\/json\"\s*[^>]+>\s*(?P<json_string_ld>[^<]+)',
- webpage, 'json_string', group='json_string_ld')
- json_data = self._parse_json(json_string, video_id)
- props_data = try_get(json_data, lambda x: x['props'], expected_type=dict)
-
- # Chech statusCode for success
- status = props_data.get('pageProps').get('statusCode')
+ next_data = self._search_nextjs_data(webpage, video_id, default='{}')
+
+ if next_data:
+ status = traverse_obj(next_data, ('props', 'pageProps', 'statusCode'), expected_type=int) or 0
+ video_data = traverse_obj(next_data, ('props', 'pageProps', 'itemInfo', 'itemStruct'), expected_type=dict)
+ else:
+ sigi_json = self._search_regex(
+ r'>\s*window\[[\'"]SIGI_STATE[\'"]\]\s*=\s*(?P<sigi_state>{.+});',
+ webpage, 'sigi data', group='sigi_state')
+ sigi_data = self._parse_json(sigi_json, video_id)
+ status = traverse_obj(sigi_data, ('VideoPage', 'statusCode'), expected_type=int) or 0
+ video_data = traverse_obj(sigi_data, ('ItemModule', video_id), expected_type=dict)
+
if status == 0:
- return self._parse_aweme_video_web(props_data['pageProps']['itemInfo']['itemStruct'], url)
+ return self._parse_aweme_video_web(video_data, url)
elif status == 10216:
raise ExtractorError('This video is private', expected=True)
-
raise ExtractorError('Video not available', video_id=video_id)
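In isolation, the SIGI_STATE fallback added above boils down to the following sketch (the embedded HTML is invented):

    import json, re

    webpage = '<script>window[\'SIGI_STATE\'] = {"VideoPage": {"statusCode": 0}};</script>'  # hypothetical
    sigi_json = re.search(
        r'window\[[\'"]SIGI_STATE[\'"]\]\s*=\s*({.+?});', webpage).group(1)
    sigi_data = json.loads(sigi_json)
    assert sigi_data['VideoPage']['statusCode'] == 0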
@@ -440,6 +516,7 @@ class TikTokUserIE(TikTokBaseIE):
'info_dict': {
'id': '6935371178089399301',
'title': 'corgibobaa',
+ 'thumbnail': r're:https://.+_1080x1080\.webp'
},
'expected_warnings': ['Retrying']
}, {
@@ -448,6 +525,7 @@ class TikTokUserIE(TikTokBaseIE):
'info_dict': {
'id': '79005827461758976',
'title': 'meme',
+ 'thumbnail': r're:https://.+_1080x1080\.webp'
},
'expected_warnings': ['Retrying']
}]
@@ -471,7 +549,7 @@ class TikTokUserIE(TikTokBaseIE):
cursor = data_json['cursor']
'''
- def _entries_api(self, webpage, user_id, username):
+ def _video_entries_api(self, webpage, user_id, username):
query = {
'user_id': user_id,
'count': 21,
@@ -494,24 +572,31 @@ class TikTokUserIE(TikTokBaseIE):
continue
raise
break
- for video in post_list.get('aweme_list', []):
- yield {
- **self._parse_aweme_video_app(video),
- 'extractor_key': TikTokIE.ie_key(),
- 'extractor': 'TikTok',
- 'webpage_url': f'https://tiktok.com/@{user_id}/video/{video["aweme_id"]}',
- }
+ yield from post_list.get('aweme_list', [])
if not post_list.get('has_more'):
break
query['max_cursor'] = post_list['max_cursor']
+ def _entries_api(self, user_id, videos):
+ for video in videos:
+ yield {
+ **self._parse_aweme_video_app(video),
+ 'extractor_key': TikTokIE.ie_key(),
+ 'extractor': 'TikTok',
+ 'webpage_url': f'https://tiktok.com/@{user_id}/video/{video["aweme_id"]}',
+ }
+
def _real_extract(self, url):
user_name = self._match_id(url)
webpage = self._download_webpage(url, user_name, headers={
'User-Agent': 'facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)'
})
user_id = self._html_search_regex(r'snssdk\d*://user/profile/(\d+)', webpage, 'user ID')
- return self.playlist_result(self._entries_api(webpage, user_id, user_name), user_id, user_name)
+
+ videos = LazyList(self._video_entries_api(webpage, user_id, user_name))
+ thumbnail = traverse_obj(videos, (0, 'author', 'avatar_larger', 'url_list', 0))
+
+ return self.playlist_result(self._entries_api(user_id, videos), user_id, user_name, thumbnail=thumbnail)
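LazyList is what makes peeking at videos[0] for the avatar safe: items consumed by the lookup stay cached and are re-yielded on full iteration. A toy demonstration (data invented):

    from yt_dlp.utils import LazyList

    videos = LazyList({'id': i} for i in range(3))
    first = videos[0]                              # consumes and caches the first item
    assert first['id'] == 0
    assert [v['id'] for v in videos] == [0, 1, 2]  # nothing is lost on iteration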
class TikTokBaseListIE(TikTokBaseIE):
@@ -705,8 +790,7 @@ class DouyinIE(TikTokIE):
'comment_count': int,
}
}]
- _APP_VERSION = '9.6.0'
- _MANIFEST_APP_VERSION = '960'
+ _APP_VERSIONS = [('9.6.0', '960')]
_APP_NAME = 'aweme'
_AID = 1128
_API_HOSTNAME = 'aweme.snssdk.com'
diff --git a/yt_dlp/extractor/trovo.py b/yt_dlp/extractor/trovo.py
index 580cb533b..65ea13ddb 100644
--- a/yt_dlp/extractor/trovo.py
+++ b/yt_dlp/extractor/trovo.py
@@ -7,6 +7,7 @@ import json
from .common import InfoExtractor
from ..utils import (
ExtractorError,
+ format_field,
int_or_none,
str_or_none,
try_get,
@@ -28,7 +29,7 @@ class TrovoBaseIE(InfoExtractor):
return {
'uploader': streamer_info.get('nickName'),
'uploader_id': str_or_none(streamer_info.get('uid')),
- 'uploader_url': 'https://trovo.live/' + username if username else None,
+ 'uploader_url': format_field(username, template='https://trovo.live/%s'),
}
diff --git a/yt_dlp/extractor/tumblr.py b/yt_dlp/extractor/tumblr.py
index adc370127..a9ad2e513 100644
--- a/yt_dlp/extractor/tumblr.py
+++ b/yt_dlp/extractor/tumblr.py
@@ -21,28 +21,20 @@ class TumblrIE(InfoExtractor):
'id': '54196191430',
'ext': 'mp4',
'title': 'tatiana maslany news, Orphan Black || DVD extra - behind the scenes ↳...',
- 'description': 'md5:37db8211e40b50c7c44e95da14f630b7',
- 'thumbnail': r're:http://.*\.jpg',
+ 'description': 'md5:390ab77358960235b6937ab3b8528956',
+ 'thumbnail': r're:^https?://.*\.jpg',
+ 'duration': 127,
}
}, {
- 'url': 'http://5sostrum.tumblr.com/post/90208453769/yall-forgetting-the-greatest-keek-of-them-all',
- 'md5': 'bf348ef8c0ef84fbf1cbd6fa6e000359',
+ 'url': 'https://maskofthedragon.tumblr.com/post/626907179849564160/mona-talking-in-english',
+ 'md5': 'f43ff8a8861712b6cf0e0c2bd84cfc68',
'info_dict': {
- 'id': '90208453769',
+ 'id': '626907179849564160',
'ext': 'mp4',
- 'title': '5SOS STRUM ;]',
- 'description': 'md5:dba62ac8639482759c8eb10ce474586a',
- 'thumbnail': r're:http://.*\.jpg',
- }
- }, {
- 'url': 'http://hdvideotest.tumblr.com/post/130323439814/test-description-for-my-hd-video',
- 'md5': '7ae503065ad150122dc3089f8cf1546c',
- 'info_dict': {
- 'id': '130323439814',
- 'ext': 'mp4',
- 'title': 'HD Video Testing \u2014 Test description for my HD video',
- 'description': 'md5:97cc3ab5fcd27ee4af6356701541319c',
- 'thumbnail': r're:http://.*\.jpg',
+ 'title': 'Me roast is buggered!, Mona\xa0“talking” in\xa0“english”',
+ 'description': 'md5:082a3a621530cb786ad2b7592a6d9e2c',
+ 'thumbnail': r're:^https?://.*\.jpg',
+ 'duration': 7,
},
'params': {
'format': 'hd',
@@ -60,16 +52,20 @@ class TumblrIE(InfoExtractor):
'uploader_id': '1638622',
'uploader': 'naked-yogi',
},
- 'add_ie': ['Vidme'],
+ # 'add_ie': ['Vidme'],
+ 'skip': 'dead embedded video host'
}, {
- 'url': 'http://camdamage.tumblr.com/post/98846056295/',
- 'md5': 'a9e0c8371ea1ca306d6554e3fecf50b6',
+ 'url': 'https://dominustempori.tumblr.com/post/673572712813297664/youtubes-all-right-for-some-pretty-cool',
+ 'md5': '5e45724c70b748f64f5a1731ac72c84a',
'info_dict': {
- 'id': '105463834',
+ 'id': '87816359',
'ext': 'mp4',
- 'title': 'Cam Damage-HD 720p',
- 'uploader': 'John Moyer',
- 'uploader_id': 'user32021558',
+ 'title': 'Harold Ramis',
+ 'uploader': 'Resolution Productions Group',
+ 'uploader_id': 'resolutionproductions',
+ 'uploader_url': 'https://vimeo.com/resolutionproductions',
+ 'thumbnail': r're:^https?://i.vimeocdn.com/video/.*',
+ 'duration': 291,
},
'add_ie': ['Vimeo'],
}, {
@@ -86,18 +82,27 @@ class TumblrIE(InfoExtractor):
'like_count': int,
'comment_count': int,
'repost_count': int,
+ 'thumbnail': r're:^https?://.*\.jpg',
+ 'timestamp': 1455940159,
+ 'view_count': int,
},
'add_ie': ['Vine'],
}, {
- 'url': 'http://vitasidorkina.tumblr.com/post/134652425014/joskriver-victoriassecret-invisibility-or',
- 'md5': '01c12ceb82cbf6b2fe0703aa56b3ad72',
+ 'url': 'https://silami.tumblr.com/post/84250043974/my-bad-river-flows-in-you-impression-on-maschine',
+ 'md5': '3c92d7c3d867f14ccbeefa2119022277',
'info_dict': {
- 'id': '-7LnUPGlSo',
+ 'id': 'nYtvtTPuTl',
'ext': 'mp4',
- 'title': 'Video by victoriassecret',
- 'description': 'Invisibility or flight…which superpower would YOU choose? #VSFashionShow #ThisOrThat',
- 'uploader_id': 'victoriassecret',
- 'thumbnail': r're:^https?://.*\.jpg'
+ 'title': 'Video by silbulterman',
+ 'description': '#maschine',
+ 'uploader_id': '242859024',
+ 'thumbnail': r're:^https?://.*\.jpg',
+ 'timestamp': 1398801174,
+ 'like_count': int,
+ 'uploader': 'Sil',
+ 'channel': 'silbulterman',
+ 'comment_count': int,
+ 'upload_date': '20140429',
},
'add_ie': ['Instagram'],
}]
@@ -161,9 +166,14 @@ class TumblrIE(InfoExtractor):
r'src=\'(https?://www\.tumblr\.com/video/[^\']+)\'',
webpage, 'iframe url', default=None)
if iframe_url is None:
- return self.url_result(redirect_url, 'Generic')
-
- iframe = self._download_webpage(iframe_url, video_id, 'Downloading iframe page')
+ iframe_url = self._search_regex(
+ r'src=["\'](https?://safe\.txmblr\.com/svc/embed/inline/[^"\']+)["\']',
+ webpage, 'embed iframe url', default=None)
+ return self.url_result(iframe_url or redirect_url, 'Generic')
+
+ iframe = self._download_webpage(
+ iframe_url, video_id, 'Downloading iframe page',
+ headers={'Referer': redirect_url})
duration = None
sources = []
diff --git a/yt_dlp/extractor/tver.py b/yt_dlp/extractor/tver.py
index 943b3ebdd..b8ac41483 100644
--- a/yt_dlp/extractor/tver.py
+++ b/yt_dlp/extractor/tver.py
@@ -5,10 +5,11 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
+ ExtractorError,
int_or_none,
remove_start,
smuggle_url,
- try_get,
+ traverse_obj,
)
@@ -38,13 +39,18 @@ class TVerIE(InfoExtractor):
def _real_extract(self, url):
path, video_id = self._match_valid_url(url).groups()
- main = self._download_json(
+ api_response = self._download_json(
'https://api.tver.jp/v4/' + path, video_id,
- query={'token': self._TOKEN})['main']
- p_id = main['publisher_id']
- service = remove_start(main['service'], 'ts_')
+ query={'token': self._TOKEN})
+ p_id = traverse_obj(api_response, ('main', 'publisher_id'))
+ if not p_id:
+ error_msg = traverse_obj(api_response, ('episode', 0, 'textbar', 0, ('text', 'longer')), get_all=False)
+ expected = bool(error_msg)
+ if not error_msg:
+ error_msg = 'Failed to extract publisher ID'
+ raise ExtractorError(error_msg, expected=expected)
+ service = remove_start(traverse_obj(api_response, ('main', 'service')), 'ts_')
- r_id = main['reference_id']
+ r_id = traverse_obj(api_response, ('main', 'reference_id'))
if service not in ('tx', 'russia2018', 'sebare2018live', 'gorin'):
r_id = 'ref:' + r_id
bc_url = smuggle_url(
@@ -53,8 +59,8 @@ class TVerIE(InfoExtractor):
return {
'_type': 'url_transparent',
- 'description': try_get(main, lambda x: x['note'][0]['text'], compat_str),
- 'episode_number': int_or_none(try_get(main, lambda x: x['ext']['episode_number'])),
+ 'description': traverse_obj(api_response, ('main', 'note', 0, 'text'), expected_type=compat_str),
+ 'episode_number': int_or_none(traverse_obj(api_response, ('main', 'ext', 'episode_number'), expected_type=compat_str)),
'url': bc_url,
'ie_key': 'BrightcoveNew',
}
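The branched traverse_obj path used for the error message above tries alternative keys at one level and, with get_all=False, returns the first hit; a toy example (data invented):

    from yt_dlp.utils import traverse_obj

    api_response = {'episode': [{'textbar': [{'longer': 'This episode is no longer available'}]}]}
    error_msg = traverse_obj(
        api_response, ('episode', 0, 'textbar', 0, ('text', 'longer')), get_all=False)
    assert error_msg == 'This episode is no longer available'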
diff --git a/yt_dlp/extractor/tvopengr.py b/yt_dlp/extractor/tvopengr.py
new file mode 100644
index 000000000..667f6660f
--- /dev/null
+++ b/yt_dlp/extractor/tvopengr.py
@@ -0,0 +1,143 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ get_elements_text_and_html_by_attribute,
+ merge_dicts,
+ unescapeHTML,
+)
+
+
+class TVOpenGrBaseIE(InfoExtractor):
+ def _return_canonical_url(self, url, video_id):
+ webpage = self._download_webpage(url, video_id)
+ canonical_url = self._og_search_url(webpage)
+ title = self._og_search_title(webpage)
+ return self.url_result(canonical_url, ie=TVOpenGrWatchIE.ie_key(), video_id=video_id, video_title=title)
+
+
+class TVOpenGrWatchIE(TVOpenGrBaseIE):
+ IE_NAME = 'tvopengr:watch'
+ IE_DESC = 'tvopen.gr (and ethnos.gr) videos'
+ _VALID_URL = r'https?://(?P<netloc>(?:www\.)?(?:tvopen|ethnos)\.gr)/watch/(?P<id>\d+)/(?P<slug>[^/]+)'
+ _API_ENDPOINT = 'https://www.tvopen.gr/templates/data/player'
+
+ _TESTS = [{
+ 'url': 'https://www.ethnos.gr/watch/101009/nikoskaprabelosdenexoymekanenanasthenhsemethmethmetallaxhomikron',
+ 'md5': '8728570e3a72e0f8d9475ba94859fdc1',
+ 'info_dict': {
+ 'id': '101009',
+ 'title': 'md5:51f68773dcb6c70498cd326f45fefdf0',
+ 'display_id': 'nikoskaprabelosdenexoymekanenanasthenhsemethmethmetallaxhomikron',
+ 'description': 'md5:78fff49f18fb3effe41b070e5c7685d6',
+ 'thumbnail': 'https://opentv-static.siliconweb.com/imgHandler/1920/d573ba71-ec5f-43c6-b4cb-d181f327d3a8.jpg',
+ 'ext': 'mp4',
+ 'upload_date': '20220109',
+ 'timestamp': 1641686400,
+ },
+ }, {
+ 'url': 'https://www.tvopen.gr/watch/100979/se28099agapaomenalla7cepeisodio267cmhthrargiapashskakias',
+ 'md5': '38f98a1be0c577db4ea2d1b1c0770c48',
+ 'info_dict': {
+ 'id': '100979',
+ 'title': 'md5:e021f3001e16088ee40fa79b20df305b',
+ 'display_id': 'se28099agapaomenalla7cepeisodio267cmhthrargiapashskakias',
+ 'description': 'md5:ba17db53954134eb8d625d199e2919fb',
+ 'thumbnail': 'https://opentv-static.siliconweb.com/imgHandler/1920/9bb71cf1-21da-43a9-9d65-367950fde4e3.jpg',
+ 'ext': 'mp4',
+ 'upload_date': '20220108',
+ 'timestamp': 1641600000,
+ },
+ }]
+
+ def _extract_formats_and_subs(self, response, video_id):
+ formats, subs = [], {}
+ for format_id, format_url in response.items():
+ if format_id not in ('stream', 'httpstream', 'mpegdash'):
+ continue
+ ext = determine_ext(format_url)
+ if ext == 'm3u8':
+ formats_, subs_ = self._extract_m3u8_formats_and_subtitles(
+ format_url, video_id, 'mp4', m3u8_id=format_id,
+ fatal=False)
+ elif ext == 'mpd':
+ formats_, subs_ = self._extract_mpd_formats_and_subtitles(
+ format_url, video_id, 'mp4', fatal=False)
+ else:
+ formats.append({
+ 'url': format_url,
+ 'format_id': format_id,
+ })
+ continue
+ formats.extend(formats_)
+ self._merge_subtitles(subs_, target=subs)
+ self._sort_formats(formats)
+ return formats, subs
+
+ @staticmethod
+ def _scale_thumbnails_to_max_width(formats, thumbnails, url_width_re):
+ _keys = ('width', 'height')
+ max_dimensions = max(
+ [tuple(format.get(k) or 0 for k in _keys) for format in formats],
+ default=(0, 0))
+ if not max_dimensions[0]:
+ return thumbnails
+ return [
+ merge_dicts(
+ {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
+ dict(zip(_keys, max_dimensions)), thumbnail)
+ for thumbnail in thumbnails
+ ]
+
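A hedged usage example of the helper above (values invented): given formats topping out at 1920px, a thumbnail URL carrying a /imgHandler/<width>/ segment is rewritten to that width and tagged with the matching dimensions, equivalent to:

    import re

    formats = [{'width': 1280, 'height': 720}, {'width': 1920, 'height': 1080}]
    thumb = {'url': 'https://example.com/imgHandler/480/abc.jpg'}  # hypothetical CDN path
    max_w, max_h = max((f.get('width') or 0, f.get('height') or 0) for f in formats)
    scaled = dict(thumb, url=re.sub(r'(?<=/imgHandler/)\d+', str(max_w), thumb['url']),
                  width=max_w, height=max_h)
    assert scaled['url'] == 'https://example.com/imgHandler/1920/abc.jpg'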
+ def _real_extract(self, url):
+ netloc, video_id, display_id = self._match_valid_url(url).group('netloc', 'id', 'slug')
+ if 'tvopen.gr' not in netloc:
+ return self._return_canonical_url(url, video_id)
+ webpage = self._download_webpage(url, video_id)
+ info = self._search_json_ld(webpage, video_id, expected_type='VideoObject')
+ info['formats'], info['subtitles'] = self._extract_formats_and_subs(
+ self._download_json(self._API_ENDPOINT, video_id, query={'cid': video_id}),
+ video_id)
+ info['thumbnails'] = self._scale_thumbnails_to_max_width(
+ info['formats'], info['thumbnails'], r'(?<=/imgHandler/)\d+')
+ description, _html = next(get_elements_text_and_html_by_attribute('class', 'description', webpage), ('', ''))
+ if description and _html.startswith('<span '):
+ info['description'] = description
+ info['id'] = video_id
+ info['display_id'] = display_id
+ return info
+
+
+class TVOpenGrEmbedIE(TVOpenGrBaseIE):
+ IE_NAME = 'tvopengr:embed'
+ IE_DESC = 'tvopen.gr embedded videos'
+ _VALID_URL = r'(?:https?:)?//(?:www\.|cdn\.)?(?:tvopen|ethnos)\.gr/embed/(?P<id>\d+)'
+ _EMBED_RE = re.compile(rf'''<iframe[^>]+?src=(?P<_q1>["'])(?P<url>{_VALID_URL})(?P=_q1)''')
+
+ _TESTS = [{
+ 'url': 'https://cdn.ethnos.gr/embed/100963',
+ 'md5': '2da147881f45571d81662d94d086628b',
+ 'info_dict': {
+ 'id': '100963',
+ 'display_id': 'koronoiosapotoysdieythyntestonsxoleionselftestgiaosoysdenbrhkan',
+ 'title': 'md5:2c71876fadf0cda6043da0da5fca2936',
+ 'description': 'md5:17482b4432e5ed30eccd93b05d6ea509',
+ 'thumbnail': 'https://opentv-static.siliconweb.com/imgHandler/1920/5804e07f-799a-4247-a696-33842c94ca37.jpg',
+ 'ext': 'mp4',
+ 'upload_date': '20220108',
+ 'timestamp': 1641600000,
+ },
+ }]
+
+ @classmethod
+ def _extract_urls(cls, webpage):
+ for mobj in cls._EMBED_RE.finditer(webpage):
+ yield unescapeHTML(mobj.group('url'))
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ return self._return_canonical_url(url, video_id)
diff --git a/yt_dlp/extractor/twitcasting.py b/yt_dlp/extractor/twitcasting.py
index 3acf1b118..8c2235a8e 100644
--- a/yt_dlp/extractor/twitcasting.py
+++ b/yt_dlp/extractor/twitcasting.py
@@ -8,22 +8,27 @@ from .common import InfoExtractor
from ..downloader.websocket import has_websockets
from ..utils import (
clean_html,
+ ExtractorError,
float_or_none,
get_element_by_class,
get_element_by_id,
parse_duration,
qualities,
str_to_int,
+ traverse_obj,
try_get,
unified_timestamp,
urlencode_postdata,
urljoin,
- ExtractorError,
)
class TwitCastingIE(InfoExtractor):
_VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P<uploader_id>[^/]+)/(?:movie|twplayer)/(?P<id>\d+)'
+ _M3U8_HEADERS = {
+ 'Origin': 'https://twitcasting.tv',
+ 'Referer': 'https://twitcasting.tv/',
+ }
_TESTS = [{
'url': 'https://twitcasting.tv/ivetesangalo/movie/2357609',
'md5': '745243cad58c4681dc752490f7540d7f',
@@ -60,6 +65,16 @@ class TwitCastingIE(InfoExtractor):
'skip_download': True,
'videopassword': 'abc',
},
+ }, {
+ 'note': 'archive is split into 2 parts',
+ 'url': 'https://twitcasting.tv/loft_heaven/movie/685979292',
+ 'info_dict': {
+ 'id': '685979292',
+ 'ext': 'mp4',
+ 'title': '南波一海のhear_here “ナタリー望月哲さんに聞く編集と「渋谷系狂騒曲」”',
+ 'duration': 6964.599334,
+ },
+ 'playlist_mincount': 2,
}]
def _real_extract(self, url):
@@ -70,7 +85,7 @@ class TwitCastingIE(InfoExtractor):
if video_password:
request_data = urlencode_postdata({
'password': video_password,
- })
+ }, encoding='utf-8')
webpage = self._download_webpage(
url, video_id, data=request_data,
headers={'Origin': 'https://twitcasting.tv'})
@@ -78,58 +93,33 @@ class TwitCastingIE(InfoExtractor):
title = (clean_html(get_element_by_id('movietitle', webpage))
or self._html_search_meta(['og:title', 'twitter:title'], webpage, fatal=True))
- video_js_data = {}
- m3u8_url = self._search_regex(
- r'data-movie-url=(["\'])(?P<url>(?:(?!\1).)+)\1',
- webpage, 'm3u8 url', group='url', default=None)
- if not m3u8_url:
- video_js_data = self._parse_json(self._search_regex(
- r'data-movie-playlist=(["\'])(?P<url>(?:(?!\1).)+)',
- webpage, 'movie playlist', group='url', default='[{}]'), video_id)
- if isinstance(video_js_data, dict):
- video_js_data = list(video_js_data.values())[0]
- video_js_data = video_js_data[0]
- m3u8_url = try_get(video_js_data, lambda x: x['source']['url'])
+ video_js_data = try_get(
+ webpage,
+ lambda x: self._parse_json(self._search_regex(
+ r'data-movie-playlist=\'([^\']+?)\'',
+ x, 'movie playlist', default=None), video_id)['2'], list)
- stream_server_data = self._download_json(
- 'https://twitcasting.tv/streamserver.php?target=%s&mode=client' % uploader_id, video_id,
- 'Downloading live info', fatal=False)
-
- is_live = 'data-status="online"' in webpage
- formats = []
- if is_live and not m3u8_url:
- m3u8_url = 'https://twitcasting.tv/%s/metastream.m3u8' % uploader_id
- if is_live and has_websockets and stream_server_data:
- qq = qualities(['base', 'mobilesource', 'main'])
- for mode, ws_url in stream_server_data['llfmp4']['streams'].items():
- formats.append({
- 'url': ws_url,
- 'format_id': 'ws-%s' % mode,
- 'ext': 'mp4',
- 'quality': qq(mode),
- 'protocol': 'websocket_frag', # TwitCasting simply sends moof atom directly over WS
- })
-
- thumbnail = video_js_data.get('thumbnailUrl') or self._og_search_thumbnail(webpage)
+ thumbnail = traverse_obj(video_js_data, (0, 'thumbnailUrl')) or self._og_search_thumbnail(webpage)
description = clean_html(get_element_by_id(
'authorcomment', webpage)) or self._html_search_meta(
['description', 'og:description', 'twitter:description'], webpage)
- duration = float_or_none(video_js_data.get(
- 'duration'), 1000) or parse_duration(clean_html(
- get_element_by_class('tw-player-duration-time', webpage)))
+ duration = (try_get(video_js_data, lambda x: sum(float_or_none(y.get('duration')) for y in x) / 1000)
+ or parse_duration(clean_html(get_element_by_class('tw-player-duration-time', webpage))))
view_count = str_to_int(self._search_regex(
- r'Total\s*:\s*([\d,]+)\s*Views', webpage, 'views', None))
+ (r'Total\s*:\s*([\d,]+)\s*Views', r'総視聴者\s*:\s*([\d,]+)\s*</'), webpage, 'views', None))
timestamp = unified_timestamp(self._search_regex(
r'data-toggle="true"[^>]+datetime="([^"]+)"',
webpage, 'datetime', None))
- if m3u8_url:
- formats.extend(self._extract_m3u8_formats(
- m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', live=is_live))
- self._sort_formats(formats)
+ stream_server_data = self._download_json(
+ 'https://twitcasting.tv/streamserver.php?target=%s&mode=client' % uploader_id, video_id,
+ 'Downloading live info', fatal=False)
- return {
- 'id': video_id,
+ is_live = 'data-status="online"' in webpage
+ if not traverse_obj(stream_server_data, 'llfmp4') and is_live:
+ self.raise_login_required(method='cookies')
+
+ base_dict = {
'title': title,
'description': description,
'thumbnail': thumbnail,
@@ -137,10 +127,73 @@ class TwitCastingIE(InfoExtractor):
'uploader_id': uploader_id,
'duration': duration,
'view_count': view_count,
- 'formats': formats,
'is_live': is_live,
}
+ def find_dmu(x):
+ data_movie_url = self._search_regex(
+ r'data-movie-url=(["\'])(?P<url>(?:(?!\1).)+)\1',
+ x, 'm3u8 url', group='url', default=None)
+ if data_movie_url:
+ return [data_movie_url]
+
+ m3u8_urls = (try_get(webpage, find_dmu, list)
+ or traverse_obj(video_js_data, (..., 'source', 'url'))
+ or ([f'https://twitcasting.tv/{uploader_id}/metastream.m3u8'] if is_live else None))
+ if not m3u8_urls:
+ raise ExtractorError('Failed to get m3u8 playlist')
+
+ if is_live:
+ m3u8_url = m3u8_urls[0]
+ formats = self._extract_m3u8_formats(
+ m3u8_url, video_id, ext='mp4', m3u8_id='hls',
+ live=True, headers=self._M3U8_HEADERS)
+
+ formats.extend(self._extract_m3u8_formats(
+ m3u8_url, video_id, ext='mp4', m3u8_id='source',
+ live=True, query={'mode': 'source'},
+ note='Downloading source quality m3u8',
+ headers=self._M3U8_HEADERS, fatal=False))
+
+ if has_websockets:
+ qq = qualities(['base', 'mobilesource', 'main'])
+ streams = traverse_obj(stream_server_data, ('llfmp4', 'streams')) or {}
+ for mode, ws_url in streams.items():
+ formats.append({
+ 'url': ws_url,
+ 'format_id': 'ws-%s' % mode,
+ 'ext': 'mp4',
+ 'quality': qq(mode),
+ # TwitCasting simply sends moof atom directly over WS
+ 'protocol': 'websocket_frag',
+ })
+
+ self._sort_formats(formats)
+
+ infodict = {
+ 'formats': formats
+ }
+ else:
+ infodict = {
+ '_type': 'multi_video',
+ 'entries': [{
+ 'id': f'{video_id}-{num}',
+ 'url': m3u8_url,
+ 'ext': 'mp4',
+ # Requesting the manifests here will cause download to fail.
+ # So use ffmpeg instead. See: https://github.com/yt-dlp/yt-dlp/issues/382
+ 'protocol': 'm3u8',
+ 'http_headers': self._M3U8_HEADERS,
+ **base_dict,
+ } for (num, m3u8_url) in enumerate(m3u8_urls)],
+ }
+
+ return {
+ 'id': video_id,
+ **base_dict,
+ **infodict,
+ }
+
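For split VODs like the two-part test above, the extractor now returns a multi_video stub whose entries each point at one m3u8 part; schematically (IDs and URLs invented):

    info = {
        '_type': 'multi_video',
        'id': '685979292',
        'entries': [
            {'id': '685979292-0', 'url': 'https://twitcasting.tv/example/part0.m3u8',
             'ext': 'mp4', 'protocol': 'm3u8'},  # fetched via ffmpeg, per the comment above
            {'id': '685979292-1', 'url': 'https://twitcasting.tv/example/part1.m3u8',
             'ext': 'mp4', 'protocol': 'm3u8'},
        ],
    }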
class TwitCastingLiveIE(InfoExtractor):
_VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P<id>[^/]+)/?(?:[#?]|$)'
diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py
index 0749263d9..8565a7c46 100644
--- a/yt_dlp/extractor/twitter.py
+++ b/yt_dlp/extractor/twitter.py
@@ -13,8 +13,10 @@ from ..compat import (
from ..utils import (
dict_get,
ExtractorError,
+ format_field,
float_or_none,
int_or_none,
+ traverse_obj,
try_get,
strip_or_none,
unified_timestamp,
@@ -55,7 +57,7 @@ class TwitterBaseIE(InfoExtractor):
def _extract_formats_from_vmap_url(self, vmap_url, video_id):
vmap_url = url_or_none(vmap_url)
if not vmap_url:
- return []
+ return [], {}
vmap_data = self._download_xml(vmap_url, video_id)
formats = []
subtitles = {}
@@ -468,7 +470,7 @@ class TwitterIE(TwitterBaseIE):
'uploader': uploader,
'timestamp': unified_timestamp(status.get('created_at')),
'uploader_id': uploader_id,
- 'uploader_url': 'https://twitter.com/' + uploader_id if uploader_id else None,
+ 'uploader_url': format_field(uploader_id, template='https://twitter.com/%s'),
'like_count': int_or_none(status.get('favorite_count')),
'repost_count': int_or_none(status.get('retweet_count')),
'comment_count': int_or_none(status.get('reply_count')),
@@ -508,7 +510,7 @@ class TwitterIE(TwitterBaseIE):
'duration': float_or_none(video_info.get('duration_millis'), 1000),
})
- media = try_get(status, lambda x: x['extended_entities']['media'][0])
+ media = traverse_obj(status, ((None, 'quoted_status'), 'extended_entities', 'media', 0), get_all=False)
if media and media.get('type') != 'photo':
extract_from_video_info(media)
else:
diff --git a/yt_dlp/extractor/veoh.py b/yt_dlp/extractor/veoh.py
index 1c44c145c..d9afb5617 100644
--- a/yt_dlp/extractor/veoh.py
+++ b/yt_dlp/extractor/veoh.py
@@ -5,21 +5,30 @@ from ..utils import (
int_or_none,
parse_duration,
qualities,
+ remove_start,
+ try_get,
)
class VeohIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?veoh\.com/(?:watch|embed|iphone/#_Watch)/(?P<id>(?:v|e|yapi-)[\da-zA-Z]+)'
+ _VALID_URL = r'https?://(?:www\.)?veoh\.com/(?:watch|videos|embed|iphone/#_Watch)/(?P<id>(?:v|e|yapi-)[\da-zA-Z]+)'
_TESTS = [{
'url': 'http://www.veoh.com/watch/v56314296nk7Zdmz3',
- 'md5': '9e7ecc0fd8bbee7a69fe38953aeebd30',
+ 'md5': '620e68e6a3cff80086df3348426c9ca3',
'info_dict': {
'id': 'v56314296nk7Zdmz3',
'ext': 'mp4',
'title': 'Straight Backs Are Stronger',
+ 'description': 'md5:203f976279939a6dc664d4001e13f5f4',
+ 'thumbnail': 're:https://fcache\\.veoh\\.com/file/f/th56314296\\.jpg(\\?.*)?',
'uploader': 'LUMOback',
- 'description': 'At LUMOback, we believe straight backs are stronger. The LUMOback Posture & Movement Sensor: It gently vibrates when you slouch, inspiring improved posture and mobility. Use the app to track your data and improve your posture over time. ',
+ 'duration': 46,
+ 'view_count': int,
+ 'average_rating': int,
+ 'comment_count': int,
+ 'age_limit': 0,
+ 'categories': ['technology_and_gaming'],
+ 'tags': ['posture', 'posture', 'sensor', 'back', 'pain', 'wearable', 'tech', 'lumo'],
},
}, {
'url': 'http://www.veoh.com/embed/v56314296nk7Zdmz3',
@@ -51,30 +60,36 @@ class VeohIE(InfoExtractor):
}, {
'url': 'http://www.veoh.com/watch/e152215AJxZktGS',
'only_matching': True,
- }]
-
- def _extract_video(self, source):
- return {
- 'id': source.get('videoId'),
- 'title': source.get('title'),
- 'description': source.get('description'),
- 'thumbnail': source.get('highResImage') or source.get('medResImage'),
- 'uploader': source.get('username'),
- 'duration': int_or_none(source.get('length')),
- 'view_count': int_or_none(source.get('views')),
- 'age_limit': 18 if source.get('isMature') == 'true' or source.get('isSexy') == 'true' else 0,
- 'formats': self._extract_formats(source),
+ }, {
+ 'url': 'https://www.veoh.com/videos/v16374379WA437rMH',
+ 'md5': 'cceb73f3909063d64f4b93d4defca1b3',
+ 'info_dict': {
+ 'id': 'v16374379WA437rMH',
+ 'ext': 'mp4',
+ 'title': 'Phantasmagoria 2, pt. 1-3',
+ 'description': 'Phantasmagoria: a Puzzle of Flesh',
+ 'thumbnail': 're:https://fcache\\.veoh\\.com/file/f/th16374379\\.jpg(\\?.*)?',
+ 'uploader': 'davidspackage',
+ 'duration': 968,
+ 'view_count': int,
+ 'average_rating': int,
+ 'comment_count': int,
+ 'age_limit': 18,
+ 'categories': ['technology_and_gaming', 'gaming'],
+ 'tags': ['puzzle', 'of', 'flesh'],
}
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
- video = self._download_json(
+ metadata = self._download_json(
'https://www.veoh.com/watch/getVideo/' + video_id,
- video_id)['video']
+ video_id)
+ video = metadata['video']
title = video['title']
thumbnail_url = None
- q = qualities(['HQ', 'Regular'])
+ q = qualities(['Regular', 'HQ'])
formats = []
for f_id, f_url in video.get('src', {}).items():
if not f_url:
@@ -89,6 +104,12 @@ class VeohIE(InfoExtractor):
})
self._sort_formats(formats)
+ categories = metadata.get('categoryPath')
+ if not categories:
+ category = try_get(video, lambda x: remove_start(x['category'].strip(), 'category_'))
+ categories = [category] if category else None
+ tags = video.get('tags')
+
return {
'id': video_id,
'title': title,
@@ -100,4 +121,7 @@ class VeohIE(InfoExtractor):
'formats': formats,
'average_rating': int_or_none(video.get('rating')),
'comment_count': int_or_none(video.get('numOfComments')),
+ 'age_limit': 18 if video.get('contentRatingId') == 2 else 0,
+ 'categories': categories,
+ 'tags': tags.split(', ') if tags else None,
}
diff --git a/yt_dlp/extractor/vidio.py b/yt_dlp/extractor/vidio.py
index 571448bf2..e99dbdefa 100644
--- a/yt_dlp/extractor/vidio.py
+++ b/yt_dlp/extractor/vidio.py
@@ -6,6 +6,7 @@ from .common import InfoExtractor
from ..utils import (
clean_html,
ExtractorError,
+ format_field,
get_element_by_class,
int_or_none,
parse_iso8601,
@@ -160,7 +161,7 @@ class VidioIE(VidioBaseIE):
'uploader': user.get('name'),
'timestamp': parse_iso8601(video.get('created_at')),
'uploader_id': username,
- 'uploader_url': 'https://www.vidio.com/@' + username if username else None,
+ 'uploader_url': format_field(username, template='https://www.vidio.com/@%s'),
'channel': channel.get('name'),
'channel_id': str_or_none(channel.get('id')),
'view_count': get_count('view_count'),
@@ -291,5 +292,5 @@ class VidioLiveIE(VidioBaseIE):
'uploader': user.get('name'),
'timestamp': parse_iso8601(stream_meta.get('start_time')),
'uploader_id': username,
- 'uploader_url': 'https://www.vidio.com/@' + username if username else None,
+ 'uploader_url': format_field(username, template='https://www.vidio.com/@%s'),
}
diff --git a/yt_dlp/extractor/vidlii.py b/yt_dlp/extractor/vidlii.py
index ce7487ec1..a63919ff2 100644
--- a/yt_dlp/extractor/vidlii.py
+++ b/yt_dlp/extractor/vidlii.py
@@ -6,6 +6,7 @@ import re
from .common import InfoExtractor
from ..utils import (
HEADRequest,
+ format_field,
float_or_none,
get_element_by_id,
int_or_none,
@@ -102,7 +103,7 @@ class VidLiiIE(InfoExtractor):
uploader = self._search_regex(
r'<div[^>]+class=["\']wt_person[^>]+>\s*<a[^>]+\bhref=["\']/user/[^>]+>([^<]+)',
webpage, 'uploader', fatal=False)
- uploader_url = 'https://www.vidlii.com/user/%s' % uploader if uploader else None
+ uploader_url = format_field(uploader, template='https://www.vidlii.com/user/%s')
upload_date = unified_strdate(self._html_search_meta(
'datePublished', webpage, default=None) or self._search_regex(
diff --git a/yt_dlp/extractor/viki.py b/yt_dlp/extractor/viki.py
index 6a3c5532d..19b09121c 100644
--- a/yt_dlp/extractor/viki.py
+++ b/yt_dlp/extractor/viki.py
@@ -19,7 +19,7 @@ class VikiBaseIE(InfoExtractor):
_VALID_URL_BASE = r'https?://(?:www\.)?viki\.(?:com|net|mx|jp|fr)/'
_API_URL_TEMPLATE = 'https://api.viki.io%s'
- _DEVICE_ID = '86085977d' # used for android api
+ _DEVICE_ID = '112395910d'
_APP = '100005a'
_APP_VERSION = '6.11.3'
_APP_SECRET = 'd96704b180208dbb2efa30fe44c48bd8690441af9f567ba8fd710a72badc85198f7472'
@@ -253,7 +253,7 @@ class VikiIE(VikiBaseIE):
} for thumbnail_id, thumbnail in (video.get('images') or {}).items() if thumbnail.get('url')]
resp = self._call_api(
- 'playback_streams/%s.json?drms=dt1,dt2&device_id=%s' % (video_id, self._DEVICE_ID),
+ 'playback_streams/%s.json?drms=dt3&device_id=%s' % (video_id, self._DEVICE_ID),
video_id, 'Downloading video streams JSON')['main'][0]
stream_id = try_get(resp, lambda x: x['properties']['track']['stream_id'])
@@ -264,10 +264,13 @@ class VikiIE(VikiBaseIE):
} for ext in ('srt', 'vtt')]) for lang in (video.get('subtitle_completions') or {}).keys())
mpd_url = resp['url']
- # 1080p is hidden in another mpd which can be found in the current manifest content
+ # 720p is hidden in another MPD which can be found in the current manifest content
mpd_content = self._download_webpage(mpd_url, video_id, note='Downloading initial MPD manifest')
mpd_url = self._search_regex(
r'(?mi)<BaseURL>(http.+.mpd)', mpd_content, 'new manifest', default=mpd_url)
+ if 'mpdhd_high' not in mpd_url:
+ # Modify the URL to get 1080p
+ mpd_url = mpd_url.replace('mpdhd', 'mpdhd_high')
formats = self._extract_mpd_formats(mpd_url, video_id)
self._sort_formats(formats)
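The manifest switch above is a plain string substitution on the MPD URL; roughly (URL invented):

    mpd_url = 'https://manifests.viki.io/mpdhd/manifest.mpd'  # hypothetical
    if 'mpdhd_high' not in mpd_url:
        mpd_url = mpd_url.replace('mpdhd', 'mpdhd_high')
    assert mpd_url == 'https://manifests.viki.io/mpdhd_high/manifest.mpd'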
diff --git a/yt_dlp/extractor/vimm.py b/yt_dlp/extractor/vimm.py
new file mode 100644
index 000000000..060b92ba6
--- /dev/null
+++ b/yt_dlp/extractor/vimm.py
@@ -0,0 +1,69 @@
+# coding: utf-8
+from .common import InfoExtractor
+
+
+class VimmIE(InfoExtractor):
+ IE_NAME = 'Vimm:stream'
+ _VALID_URL = r'https?://(?:www\.)?vimm\.tv/(?:c/)?(?P<id>[0-9a-z-]+)$'
+ _TESTS = [{
+ 'url': 'https://www.vimm.tv/c/calimeatwagon',
+ 'info_dict': {
+ 'id': 'calimeatwagon',
+ 'ext': 'mp4',
+ 'title': 're:^calimeatwagon [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+ 'live_status': 'is_live',
+ },
+ 'skip': 'Live',
+ }, {
+ 'url': 'https://www.vimm.tv/octaafradio',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ channel_id = self._match_id(url)
+
+ formats, subs = self._extract_m3u8_formats_and_subtitles(
+ f'https://www.vimm.tv/hls/{channel_id}.m3u8', channel_id, 'mp4', m3u8_id='hls', live=True)
+ self._sort_formats(formats)
+
+ return {
+ 'id': channel_id,
+ 'title': channel_id,
+ 'is_live': True,
+ 'formats': formats,
+ 'subtitles': subs,
+ }
+
+
+class VimmRecordingIE(InfoExtractor):
+ IE_NAME = 'Vimm:recording'
+ _VALID_URL = r'https?://(?:www\.)?vimm\.tv/c/(?P<channel_id>[0-9a-z-]+)\?v=(?P<video_id>[0-9A-Za-z]+)'
+ _TESTS = [{
+ 'url': 'https://www.vimm.tv/c/kaldewei?v=2JZsrPTFxsSz',
+ 'md5': '15122ee95baa32a548e4a3e120b598f1',
+ 'info_dict': {
+ 'id': '2JZsrPTFxsSz',
+ 'ext': 'mp4',
+ 'title': 'VIMM - [DE/GER] Kaldewei Live - In Farbe und Bunt',
+ 'uploader_id': 'kaldewei',
+ },
+ }]
+
+ def _real_extract(self, url):
+ channel_id, video_id = self._match_valid_url(url).groups()
+
+ webpage = self._download_webpage(url, video_id)
+ title = self._og_search_title(webpage)
+
+ formats, subs = self._extract_m3u8_formats_and_subtitles(
+ f'https://d211qfrkztakg3.cloudfront.net/{channel_id}/{video_id}/index.m3u8', video_id, 'mp4', m3u8_id='hls', live=False)
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'is_live': False,
+ 'uploader_id': channel_id,
+ 'formats': formats,
+ 'subtitles': subs,
+ }
diff --git a/yt_dlp/extractor/vine.py b/yt_dlp/extractor/vine.py
index 07fce0daa..e59b1037b 100644
--- a/yt_dlp/extractor/vine.py
+++ b/yt_dlp/extractor/vine.py
@@ -6,6 +6,7 @@ from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
determine_ext,
+ format_field,
int_or_none,
unified_timestamp,
)
@@ -92,7 +93,7 @@ class VineIE(InfoExtractor):
username = data.get('username')
- alt_title = 'Vine by %s' % username if username else None
+ alt_title = format_field(username, template='Vine by %s')
return {
'id': video_id,
diff --git a/yt_dlp/extractor/viu.py b/yt_dlp/extractor/viu.py
index 1b34c5296..b633df95d 100644
--- a/yt_dlp/extractor/viu.py
+++ b/yt_dlp/extractor/viu.py
@@ -287,8 +287,7 @@ class ViuOTTIE(InfoExtractor):
raise ExtractorError('This video is not available in your region.', expected=True)
series_id = video_data.get('series_id')
- if not self.get_param('noplaylist') and not idata.get('force_noplaylist'):
- self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % series_id)
+ if self._yes_playlist(series_id, video_id, idata):
series = product_data.get('series', {})
product = series.get('product')
if product:
@@ -308,9 +307,6 @@ class ViuOTTIE(InfoExtractor):
return self.playlist_result(entries, series_id, series.get('name'), series.get('description'))
- if self.get_param('noplaylist'):
- self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
-
duration_limit = False
query = {
'ccs_product_id': video_data['ccs_product_id'],
diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py
index 5cdb1542d..fab16780f 100644
--- a/yt_dlp/extractor/vk.py
+++ b/yt_dlp/extractor/vk.py
@@ -2,7 +2,6 @@
from __future__ import unicode_literals
import collections
-import functools
import re
from .common import InfoExtractor
@@ -12,7 +11,6 @@ from ..utils import (
ExtractorError,
get_element_by_class,
int_or_none,
- OnDemandPagedList,
orderedSet,
str_or_none,
str_to_int,
@@ -87,10 +85,10 @@ class VKIE(VKBaseIE):
)
ext\.php\?(?P<embed_query>.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+).*)|
(?:
- (?:(?:m|new)\.)?vk\.com/(?:.+?\?.*?z=)?video|
+ (?:(?:m|new)\.)?vk\.com/(?:.+?\?.*?z=)?(?:video|clip)|
(?:www\.)?daxab.com/embed/
)
- (?P<videoid>-?\d+_\d+)(?:.*\blist=(?P<list_id>[\da-f]+))?
+ (?P<videoid>-?\d+_\d+)(?:.*\blist=(?P<list_id>([\da-f]+)|(ln-[\da-zA-Z]+)))?
)
'''
_TESTS = [
@@ -182,6 +180,17 @@ class VKIE(VKBaseIE):
'skip': 'Removed',
},
{
+ 'url': 'https://vk.com/video-93049196_456239755?list=ln-cBjJ7S4jYYx3ADnmDT',
+ 'info_dict': {
+ 'id': '-93049196_456239755',
+ 'ext': 'mp4',
+ 'title': '8 серия (озвучка)',
+ 'duration': 8383,
+ 'upload_date': '20211222',
+ 'view_count': int,
+ },
+ },
+ {
# video (removed?) only available with list id
'url': 'https://vk.com/video30481095_171201961?list=8764ae2d21f14088d4',
'md5': '091287af5402239a1051c37ec7b92913',
@@ -298,6 +307,10 @@ class VKIE(VKBaseIE):
# The video is not available in your region.
'url': 'https://vk.com/video-51812607_171445436',
'only_matching': True,
+ },
+ {
+ 'url': 'https://vk.com/clip30014565_456240946',
+ 'only_matching': True,
}]
@staticmethod
@@ -496,63 +509,59 @@ class VKIE(VKBaseIE):
class VKUserVideosIE(VKBaseIE):
IE_NAME = 'vk:uservideos'
IE_DESC = "VK - User's Videos"
- _VALID_URL = r'https?://(?:(?:m|new)\.)?vk\.com/videos(?P<id>-?[0-9]+)(?!\?.*\bz=video)(?:[/?#&](?:.*?\bsection=(?P<section>\w+))?|$)'
+ _VALID_URL = r'https?://(?:(?:m|new)\.)?vk\.com/video/@(?P<id>[^?$#/&]+)(?!\?.*\bz=video)(?:[/?#&](?:.*?\bsection=(?P<section>\w+))?|$)'
_TEMPLATE_URL = 'https://vk.com/videos'
_TESTS = [{
- 'url': 'https://vk.com/videos-767561',
+ 'url': 'https://vk.com/video/@mobidevices',
'info_dict': {
- 'id': '-767561_all',
+ 'id': '-17892518_all',
},
- 'playlist_mincount': 1150,
+ 'playlist_mincount': 1355,
}, {
- 'url': 'https://vk.com/videos-767561?section=uploaded',
+ 'url': 'https://vk.com/video/@mobidevices?section=uploaded',
'info_dict': {
- 'id': '-767561_uploaded',
+ 'id': '-17892518_uploaded',
},
- 'playlist_mincount': 425,
- }, {
- 'url': 'http://vk.com/videos205387401',
- 'only_matching': True,
- }, {
- 'url': 'http://vk.com/videos-77521',
- 'only_matching': True,
- }, {
- 'url': 'http://vk.com/videos-97664626?section=all',
- 'only_matching': True,
- }, {
- 'url': 'http://m.vk.com/videos205387401',
- 'only_matching': True,
- }, {
- 'url': 'http://new.vk.com/videos205387401',
- 'only_matching': True,
+ 'playlist_mincount': 182,
}]
- _PAGE_SIZE = 1000
_VIDEO = collections.namedtuple('Video', ['owner_id', 'id'])
- def _fetch_page(self, page_id, section, page):
- l = self._download_payload('al_video', page_id, {
+ def _entries(self, page_id, section):
+ video_list_json = self._download_payload('al_video', page_id, {
'act': 'load_videos_silent',
- 'offset': page * self._PAGE_SIZE,
+ 'offset': 0,
'oid': page_id,
'section': section,
- })[0][section]['list']
-
- for video in l:
- v = self._VIDEO._make(video[:2])
- video_id = '%d_%d' % (v.owner_id, v.id)
- yield self.url_result(
- 'http://vk.com/video' + video_id, VKIE.ie_key(), video_id)
+ })[0][section]
+ count = video_list_json['count']
+ total = video_list_json['total']
+ video_list = video_list_json['list']
+
+ while True:
+ for video in video_list:
+ v = self._VIDEO._make(video[:2])
+ video_id = '%d_%d' % (v.owner_id, v.id)
+ yield self.url_result(
+ 'http://vk.com/video' + video_id, VKIE.ie_key(), video_id)
+ if count >= total:
+ break
+ video_list_json = self._download_payload('al_video', page_id, {
+ 'act': 'load_videos_silent',
+ 'offset': count,
+ 'oid': page_id,
+ 'section': section,
+ })[0][section]
+ count += video_list_json['count']
+ video_list = video_list_json['list']
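The rewritten loop is straightforward offset pagination: keep requesting with offset=count until count reaches total. The generic pattern, with fetch_page standing in for _download_payload (a sketch, not the actual API):

    def paginate(fetch_page):
        """fetch_page(offset) -> {'count': items_returned, 'total': int, 'list': [...]}"""
        page = fetch_page(0)
        count, total = page['count'], page['total']
        while True:
            yield from page['list']
            if count >= total:
                break
            page = fetch_page(count)
            count += page['count']

    pages = {0: {'count': 2, 'total': 3, 'list': ['a', 'b']},
             2: {'count': 1, 'total': 3, 'list': ['c']}}  # hypothetical payloads
    assert list(paginate(pages.__getitem__)) == ['a', 'b', 'c']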
def _real_extract(self, url):
- page_id, section = self._match_valid_url(url).groups()
+ u_id, section = self._match_valid_url(url).groups()
+ webpage = self._download_webpage(url, u_id)
+ page_id = self._search_regex(r'data-owner-id\s?=\s?"([^"]+)"', webpage, 'page_id')
if not section:
section = 'all'
- entries = OnDemandPagedList(
- functools.partial(self._fetch_page, page_id, section),
- self._PAGE_SIZE)
-
- return self.playlist_result(entries, '%s_%s' % (page_id, section))
+ return self.playlist_result(self._entries(page_id, section), '%s_%s' % (page_id, section))
class VKWallPostIE(VKBaseIE):
diff --git a/yt_dlp/extractor/vlive.py b/yt_dlp/extractor/vlive.py
index 547bdd323..74dc349d5 100644
--- a/yt_dlp/extractor/vlive.py
+++ b/yt_dlp/extractor/vlive.py
@@ -146,30 +146,24 @@ class VLiveIE(VLiveBaseIE):
'post/v1.0/officialVideoPost-%s', video_id,
'author{nickname},channel{channelCode,channelName},officialVideo{commentCount,exposeStatus,likeCount,playCount,playTime,status,title,type,vodId},playlist{playlistSeq,totalCount,name}')
- playlist = post.get('playlist')
- if not playlist or self.get_param('noplaylist'):
- if playlist:
- self.to_screen(
- 'Downloading just video %s because of --no-playlist'
- % video_id)
-
+ playlist_id = str_or_none(try_get(post, lambda x: x['playlist']['playlistSeq']))
+ if not self._yes_playlist(playlist_id, video_id):
video = post['officialVideo']
return self._get_vlive_info(post, video, video_id)
- else:
- playlist_name = playlist.get('name')
- playlist_id = str_or_none(playlist.get('playlistSeq'))
- playlist_count = str_or_none(playlist.get('totalCount'))
- playlist = self._call_api(
- 'playlist/v1.0/playlist-%s/posts', playlist_id, 'data', {'limit': playlist_count})
+ playlist_name = str_or_none(try_get(post, lambda x: x['playlist']['name']))
+ playlist_count = str_or_none(try_get(post, lambda x: x['playlist']['totalCount']))
- entries = []
- for video_data in playlist['data']:
- video = video_data.get('officialVideo')
- video_id = str_or_none(video.get('videoSeq'))
- entries.append(self._get_vlive_info(video_data, video, video_id))
+ playlist = self._call_api(
+ 'playlist/v1.0/playlist-%s/posts', playlist_id, 'data', {'limit': playlist_count})
+
+ entries = []
+ for video_data in playlist['data']:
+ video = video_data.get('officialVideo')
+ video_id = str_or_none(video.get('videoSeq'))
+ entries.append(self._get_vlive_info(video_data, video, video_id))
- return self.playlist_result(entries, playlist_id, playlist_name)
+ return self.playlist_result(entries, playlist_id, playlist_name)
def _get_vlive_info(self, post, video, video_id):
def get_common_fields():
diff --git a/yt_dlp/extractor/xvideos.py b/yt_dlp/extractor/xvideos.py
index ab07f01af..d5261b6ab 100644
--- a/yt_dlp/extractor/xvideos.py
+++ b/yt_dlp/extractor/xvideos.py
@@ -25,14 +25,27 @@ class XVideosIE(InfoExtractor):
(?P<id>[0-9]+)
'''
_TESTS = [{
- 'url': 'http://www.xvideos.com/video4588838/biker_takes_his_girl',
+ 'url': 'https://www.xvideos.com/video4588838/motorcycle_guy_cucks_influencer_steals_his_gf',
'md5': '14cea69fcb84db54293b1e971466c2e1',
'info_dict': {
'id': '4588838',
'ext': 'mp4',
- 'title': 'Biker Takes his Girl',
+ 'title': 'Motorcycle Guy Cucks Influencer, Steals his GF',
'duration': 108,
'age_limit': 18,
+ 'thumbnail': r're:^https://img-hw.xvideos-cdn.com/.+\.jpg',
+ }
+ }, {
+ # Broken HLS formats
+ 'url': 'https://www.xvideos.com/video65982001/what_s_her_name',
+ 'md5': 'b82d7d7ef7d65a84b1fa6965f81f95a5',
+ 'info_dict': {
+ 'id': '65982001',
+ 'ext': 'mp4',
+ 'title': 'what\'s her name?',
+ 'duration': 120,
+ 'age_limit': 18,
+ 'thumbnail': r're:^https://img-hw.xvideos-cdn.com/.+\.jpg',
}
}, {
'url': 'https://flashservice.xvideos.com/embedframe/4588838',
@@ -126,9 +139,11 @@ class XVideosIE(InfoExtractor):
r'setVideo([^(]+)\((["\'])(http.+?)\2\)', webpage):
format_id = kind.lower()
if format_id == 'hls':
- formats.extend(self._extract_m3u8_formats(
+ hls_formats = self._extract_m3u8_formats(
format_url, video_id, 'mp4',
- entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
+ entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)
+ self._check_formats(hls_formats, video_id)
+ formats.extend(hls_formats)
elif format_id in ('urllow', 'urlhigh'):
formats.append({
'url': format_url,
diff --git a/yt_dlp/extractor/yahoo.py b/yt_dlp/extractor/yahoo.py
index 313e596f5..6cf3b1de2 100644
--- a/yt_dlp/extractor/yahoo.py
+++ b/yt_dlp/extractor/yahoo.py
@@ -414,11 +414,14 @@ class YahooGyaOIE(InfoExtractor):
IE_NAME = 'yahoo:gyao'
_VALID_URL = r'https?://(?:gyao\.yahoo\.co\.jp/(?:p|title(?:/[^/]+)?)|streaming\.yahoo\.co\.jp/p/y)/(?P<id>\d+/v\d+|[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
_TESTS = [{
- 'url': 'https://gyao.yahoo.co.jp/p/00449/v03102/',
+ 'url': 'https://gyao.yahoo.co.jp/title/%E3%82%BF%E3%82%A4%E3%83%A0%E3%83%9C%E3%82%AB%E3%83%B3%E3%82%B7%E3%83%AA%E3%83%BC%E3%82%BA%20%E3%83%A4%E3%83%83%E3%82%BF%E3%83%BC%E3%83%9E%E3%83%B3/5f60ceb3-6e5e-40ef-ba40-d68b598d067f',
'info_dict': {
- 'id': '00449:v03102',
+ 'id': '5f60ceb3-6e5e-40ef-ba40-d68b598d067f',
},
- 'playlist_count': 2,
+ 'playlist_mincount': 80,
+ }, {
+ 'url': 'https://gyao.yahoo.co.jp/p/00449/v03102/',
+ 'only_matching': True,
}, {
'url': 'https://streaming.yahoo.co.jp/p/y/01034/v00133/',
'only_matching': True,
@@ -430,19 +433,30 @@ class YahooGyaOIE(InfoExtractor):
'only_matching': True,
}]
+ def _entries(self, program_id):
+ page = 1
+ while True:
+ playlist = self._download_json(
+ f'https://gyao.yahoo.co.jp/api/programs/{program_id}/videos?page={page}', program_id,
+ note=f'Downloading JSON metadata page {page}')
+ if not playlist:
+ break
+ for video in playlist['videos']:
+ video_id = video.get('id')
+ if not video_id:
+ continue
+ if video.get('streamingAvailability') == 'notYet':
+ continue
+ yield self.url_result(
+ 'https://gyao.yahoo.co.jp/player/%s/' % video_id.replace(':', '/'),
+ YahooGyaOPlayerIE.ie_key(), video_id)
+ if playlist.get('ended'):
+ break
+ page += 1
+
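Unlike the VK offset loop earlier, this generator walks page numbers until the API sets 'ended'; stripped to its essentials (endpoint and field names as assumed from the code above):

    def gyao_video_ids(download_json, program_id):
        page = 1
        while True:
            playlist = download_json(
                f'https://gyao.yahoo.co.jp/api/programs/{program_id}/videos?page={page}')
            if not playlist:
                return
            for video in playlist['videos']:
                if video.get('id') and video.get('streamingAvailability') != 'notYet':
                    yield video['id']
            if playlist.get('ended'):
                return
            page += 1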
def _real_extract(self, url):
program_id = self._match_id(url).replace('/', ':')
- videos = self._download_json(
- 'https://gyao.yahoo.co.jp/api/programs/%s/videos' % program_id, program_id)['videos']
- entries = []
- for video in videos:
- video_id = video.get('id')
- if not video_id:
- continue
- entries.append(self.url_result(
- 'https://gyao.yahoo.co.jp/player/%s/' % video_id.replace(':', '/'),
- YahooGyaOPlayerIE.ie_key(), video_id))
- return self.playlist_result(entries, program_id)
+ return self.playlist_result(self._entries(program_id), program_id)
class YahooJapanNewsIE(InfoExtractor):
diff --git a/yt_dlp/extractor/yandexvideo.py b/yt_dlp/extractor/yandexvideo.py
index 67095f2fd..a101af67e 100644
--- a/yt_dlp/extractor/yandexvideo.py
+++ b/yt_dlp/extractor/yandexvideo.py
@@ -11,6 +11,7 @@ from ..utils import (
int_or_none,
try_get,
url_or_none,
+ lowercase_escape,
)
@@ -148,6 +149,45 @@ class YandexVideoIE(InfoExtractor):
}
+class YandexVideoPreviewIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?yandex\.ru/video/preview(?:/?\?.*?filmId=|/)(?P<id>\d+)'
+ _TESTS = [{ # Odnoklassniki
+ 'url': 'https://yandex.ru/video/preview/?filmId=10682852472978372885&text=summer',
+ 'info_dict': {
+ 'id': '1352565459459',
+ 'ext': 'mp4',
+ 'like_count': int,
+ 'upload_date': '20191202',
+ 'age_limit': 0,
+ 'duration': 196,
+ 'thumbnail': 'https://i.mycdn.me/videoPreview?id=544866765315&type=37&idx=13&tkn=TY5qjLYZHxpmcnK8U2LgzYkgmaU&fn=external_8',
+ 'uploader_id': '481054701571',
+ 'title': 'LOFT - summer, summer, summer HD',
+ 'manifest_stream_number': 0,
+ 'uploader': 'АРТЁМ КУДРОВ',
+ },
+ }, { # youtube
+ 'url': 'https://yandex.ru/video/preview/?filmId=4479424425337895262&source=main_redirect&text=видео&utm_source=main_stripe_big',
+ 'only_matching': True,
+ }, { # YandexVideo
+ 'url': 'https://yandex.ru/video/preview/5275069442094787341',
+ 'only_matching': True,
+ }, { # youtube
+ 'url': 'https://yandex.ru/video/preview/?filmId=16658118429797832897&from=tabbar&p=1&text=%D0%BF%D1%80%D0%BE%D1%81%D0%BC%D0%BE%D1%82%D1%80+%D1%84%D1%80%D0%B0%D0%B3%D0%BC%D0%B5%D0%BD%D1%82%D0%B0+%D0%BC%D0%B0%D0%BB%D0%B5%D0%BD%D1%8C%D0%BA%D0%B8%D0%B9+%D0%BF%D1%80%D0%B8%D0%BD%D1%86+%D0%BC%D1%8B+%D0%B2+%D0%BE%D1%82%D0%B2%D0%B5%D1%82%D0%B5+%D0%B7%D0%B0+%D1%82%D0%B5%D1%85+%D0%BA%D0%BE%D0%B3%D0%BE+%D0%BF%D1%80%D0%B8%D1%80%D1%83%D1%87%D0%B8%D0%BB%D0%B8',
+ 'only_matching': True,
+ }, { # Odnoklassniki
+ 'url': 'https://yandex.ru/video/preview/?text=Francis%20Lai%20-%20Le%20Bon%20Et%20Les%20MC)chants&path=wizard&parent-reqid=1643208087979310-1481782809207673478-sas3-0931-2f9-sas-l7-balancer-8080-BAL-9380&wiz_type=vital&filmId=12508152936505397283',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ data_raw = self._search_regex(r'window\.Ya\.__inline_params__\s*=\s*JSON\.parse\(\'([^"]+?\\u0022video\\u0022:[^"]+?})\'\);', webpage, 'data_raw')
+ data_json = self._parse_json(data_raw, video_id, transform_source=lowercase_escape)
+ return self.url_result(data_json['video']['url'])
+
+
class ZenYandexIE(InfoExtractor):
_VALID_URL = r'https?://zen\.yandex\.ru(?:/video)?/(media|watch)/(?:(?:id/[^/]+/|[^/]+/)(?:[a-z0-9-]+)-)?(?P<id>[a-z0-9-]+)'
_TESTS = [{
diff --git a/yt_dlp/extractor/younow.py b/yt_dlp/extractor/younow.py
index 128faa30d..583aea38d 100644
--- a/yt_dlp/extractor/younow.py
+++ b/yt_dlp/extractor/younow.py
@@ -7,6 +7,7 @@ from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
ExtractorError,
+ format_field,
int_or_none,
try_get,
)
@@ -93,7 +94,7 @@ def _extract_moment(item, fatal=True):
uploader = try_get(item, lambda x: x['owner']['name'], compat_str)
uploader_id = try_get(item, lambda x: x['owner']['userId'])
- uploader_url = 'https://www.younow.com/%s' % uploader if uploader else None
+ uploader_url = format_field(uploader, template='https://www.younow.com/%s')
entry = {
'extractor_key': 'YouNowMoment',
diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py
index 852fbd78e..61804e2af 100644
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -42,6 +42,7 @@ from ..utils import (
int_or_none,
is_html,
join_nonempty,
+ js_to_json,
mimetype2ext,
network_exceptions,
NO_DEFAULT,
@@ -62,6 +63,7 @@ from ..utils import (
try_get,
unescapeHTML,
unified_strdate,
+ unified_timestamp,
unsmuggle_url,
update_url_query,
url_or_none,
@@ -81,7 +83,7 @@ INNERTUBE_CLIENTS = {
'INNERTUBE_CONTEXT': {
'client': {
'clientName': 'WEB',
- 'clientVersion': '2.20210622.10.00',
+ 'clientVersion': '2.20211221.00.00',
}
},
'INNERTUBE_CONTEXT_CLIENT_NAME': 1
@@ -91,7 +93,7 @@ INNERTUBE_CLIENTS = {
'INNERTUBE_CONTEXT': {
'client': {
'clientName': 'WEB_EMBEDDED_PLAYER',
- 'clientVersion': '1.20210620.0.1',
+ 'clientVersion': '1.20211215.00.01',
},
},
'INNERTUBE_CONTEXT_CLIENT_NAME': 56
@@ -102,96 +104,96 @@ INNERTUBE_CLIENTS = {
'INNERTUBE_CONTEXT': {
'client': {
'clientName': 'WEB_REMIX',
- 'clientVersion': '1.20210621.00.00',
+ 'clientVersion': '1.20211213.00.00',
}
},
'INNERTUBE_CONTEXT_CLIENT_NAME': 67,
},
'web_creator': {
- 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
+ 'INNERTUBE_API_KEY': 'AIzaSyBUPetSUmoZL-OhlxA7wSac5XinrygCqMo',
'INNERTUBE_CONTEXT': {
'client': {
'clientName': 'WEB_CREATOR',
- 'clientVersion': '1.20210621.00.00',
+ 'clientVersion': '1.20211220.02.00',
}
},
'INNERTUBE_CONTEXT_CLIENT_NAME': 62,
},
'android': {
- 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
+ 'INNERTUBE_API_KEY': 'AIzaSyA8eiZmM1FaDVjRy-df2KTyQ_vz_yYM39w',
'INNERTUBE_CONTEXT': {
'client': {
'clientName': 'ANDROID',
- 'clientVersion': '16.20',
+ 'clientVersion': '16.49',
}
},
'INNERTUBE_CONTEXT_CLIENT_NAME': 3,
'REQUIRE_JS_PLAYER': False
},
'android_embedded': {
- 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
+ 'INNERTUBE_API_KEY': 'AIzaSyCjc_pVEDi4qsv5MtC2dMXzpIaDoRFLsxw',
'INNERTUBE_CONTEXT': {
'client': {
'clientName': 'ANDROID_EMBEDDED_PLAYER',
- 'clientVersion': '16.20',
+ 'clientVersion': '16.49',
},
},
'INNERTUBE_CONTEXT_CLIENT_NAME': 55,
'REQUIRE_JS_PLAYER': False
},
'android_music': {
- 'INNERTUBE_API_KEY': 'AIzaSyC9XL3ZjWddXya6X74dJoCTL-WEYFDNX30',
- 'INNERTUBE_HOST': 'music.youtube.com',
+ 'INNERTUBE_API_KEY': 'AIzaSyAOghZGza2MQSZkY_zfZ370N-PUdXEo8AI',
'INNERTUBE_CONTEXT': {
'client': {
'clientName': 'ANDROID_MUSIC',
- 'clientVersion': '4.32',
+ 'clientVersion': '4.57',
}
},
'INNERTUBE_CONTEXT_CLIENT_NAME': 21,
'REQUIRE_JS_PLAYER': False
},
'android_creator': {
+ 'INNERTUBE_API_KEY': 'AIzaSyD_qjV8zaaUMehtLkrKFgVeSX_Iqbtyws8',
'INNERTUBE_CONTEXT': {
'client': {
'clientName': 'ANDROID_CREATOR',
- 'clientVersion': '21.24.100',
+ 'clientVersion': '21.47',
},
},
'INNERTUBE_CONTEXT_CLIENT_NAME': 14,
'REQUIRE_JS_PLAYER': False
},
- # ios has HLS live streams
- # See: https://github.com/TeamNewPipe/NewPipeExtractor/issues/680
+ # iOS clients have HLS live streams. Setting device model to get 60fps formats.
+ # See: https://github.com/TeamNewPipe/NewPipeExtractor/issues/680#issuecomment-1002724558
'ios': {
- 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
+ 'INNERTUBE_API_KEY': 'AIzaSyB-63vPrdThhKuerbB2N_l7Kwwcxj6yUAc',
'INNERTUBE_CONTEXT': {
'client': {
'clientName': 'IOS',
- 'clientVersion': '16.20',
+ 'clientVersion': '16.46',
+ 'deviceModel': 'iPhone14,3',
}
},
'INNERTUBE_CONTEXT_CLIENT_NAME': 5,
'REQUIRE_JS_PLAYER': False
},
'ios_embedded': {
- 'INNERTUBE_API_KEY': 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8',
'INNERTUBE_CONTEXT': {
'client': {
'clientName': 'IOS_MESSAGES_EXTENSION',
- 'clientVersion': '16.20',
+ 'clientVersion': '16.46',
+ 'deviceModel': 'iPhone14,3',
},
},
'INNERTUBE_CONTEXT_CLIENT_NAME': 66,
'REQUIRE_JS_PLAYER': False
},
'ios_music': {
- 'INNERTUBE_API_KEY': 'AIzaSyDK3iBpDP9nHVTk2qL73FLJICfOC3c51Og',
- 'INNERTUBE_HOST': 'music.youtube.com',
+ 'INNERTUBE_API_KEY': 'AIzaSyBAETezhkwP0ZWA02RsqT1zu78Fpt0bC_s',
'INNERTUBE_CONTEXT': {
'client': {
'clientName': 'IOS_MUSIC',
- 'clientVersion': '4.32',
+ 'clientVersion': '4.57',
},
},
'INNERTUBE_CONTEXT_CLIENT_NAME': 26,
@@ -201,7 +203,7 @@ INNERTUBE_CLIENTS = {
'INNERTUBE_CONTEXT': {
'client': {
'clientName': 'IOS_CREATOR',
- 'clientVersion': '21.24.100',
+ 'clientVersion': '21.47',
},
},
'INNERTUBE_CONTEXT_CLIENT_NAME': 15,
@@ -210,15 +212,15 @@ INNERTUBE_CLIENTS = {
# mweb has 'ultralow' formats
# See: https://github.com/yt-dlp/yt-dlp/pull/557
'mweb': {
- 'INNERTUBE_API_KEY': 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8',
+ 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
'INNERTUBE_CONTEXT': {
'client': {
'clientName': 'MWEB',
- 'clientVersion': '2.20210721.07.00',
+ 'clientVersion': '2.20211221.01.00',
}
},
'INNERTUBE_CONTEXT_CLIENT_NAME': 2
- },
+ }
}
@@ -256,7 +258,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
_RESERVED_NAMES = (
r'channel|c|user|playlist|watch|w|v|embed|e|watch_popup|clip|'
- r'shorts|movies|results|shared|hashtag|trending|feed|feeds|'
+ r'shorts|movies|results|search|shared|hashtag|trending|explore|feed|feeds|'
r'browse|oembed|get_video_info|iframe_api|s/player|'
r'storefront|oops|index|account|reporthistory|t/terms|about|upload|signin|logout')
@@ -276,6 +278,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
r'(?:www\.)?invidious\.zee\.li',
r'(?:www\.)?invidious\.ethibox\.fr',
r'(?:www\.)?invidious\.3o7z6yfxhbw7n3za4rss6l434kmv55cgw2vuziwuigpwegswvwzqipyd\.onion',
+ r'(?:www\.)?osbivz6guyeahrwp2lnwyjk2xos342h4ocsxyqrlaopqjuhwn2djiiyd\.onion',
+ r'(?:www\.)?u2cvlit75owumwpy4dj2hsmvkq7nvrclkpht7xgyye2pyoxhpmclkrad\.onion',
# youtube-dl invidious instances list
r'(?:(?:www|no)\.)?invidiou\.sh',
r'(?:(?:www|fi)\.)?invidious\.snopyta\.org',
@@ -370,7 +374,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
pref = dict(compat_urlparse.parse_qsl(pref_cookie.value))
except ValueError:
self.report_warning('Failed to parse user PREF cookie' + bug_reports_message())
- pref.update({'hl': 'en'})
+ pref.update({'hl': 'en', 'tz': 'UTC'})
self._set_cookie('.youtube.com', name='PREF', value=compat_urllib_parse_urlencode(pref))
def _real_initialize(self):
@@ -409,8 +413,9 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
def _extract_context(self, ytcfg=None, default_client='web'):
context = get_first(
(ytcfg, self._get_default_ytcfg(default_client)), 'INNERTUBE_CONTEXT', expected_type=dict)
- # Enforce language for extraction
- traverse_obj(context, 'client', expected_type=dict, default={})['hl'] = 'en'
+ # Enforce language and tz for extraction
+ client_context = traverse_obj(context, 'client', expected_type=dict, default={})
+ client_context.update({'hl': 'en', 'timeZone': 'UTC', 'utcOffsetMinutes': 0})
return context
_SAPISID = None
@@ -511,7 +516,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
Appears to be used to track session state
"""
return get_first(
- args, (('VISITOR_DATA', ('INNERTUBE_CONTEXT', 'client', 'visitorData'), ('responseContext', 'visitorData'))),
+ args, [('VISITOR_DATA', ('INNERTUBE_CONTEXT', 'client', 'visitorData'), ('responseContext', 'visitorData'))],
expected_type=str)
@property
@@ -667,6 +672,14 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
if text:
return text
+ def _get_count(self, data, *path_list):
+ count_text = self._get_text(data, *path_list) or ''
+ count = parse_count(count_text)
+ if count is None:
+ count = str_to_int(
+ self._search_regex(r'^([\d,]+)', re.sub(r'\s', '', count_text), 'count', default=None))
+ return count
+
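# Sketch of the fallback above (input strings assumed): abbreviated counts like
# '1.2M views' resolve via parse_count, while a localized form such as
# '1 234 567 views' has its whitespace stripped by the re.sub and the leading
# digits recovered by the regex when parse_count returns None.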
@staticmethod
def _extract_thumbnails(data, *path_list):
"""
@@ -695,12 +708,15 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
def extract_relative_time(relative_time_text):
"""
Extracts a relative time from a string and converts it to a dt object
- e.g. 'streamed 6 days ago', '5 seconds ago (edited)'
+ e.g. 'streamed 6 days ago', '5 seconds ago (edited)', 'updated today'
"""
- mobj = re.search(r'(?P<time>\d+)\s*(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?\s*ago', relative_time_text)
+ mobj = re.search(r'(?P<start>today|yesterday|now)|(?P<time>\d+)\s*(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?\s*ago', relative_time_text)
if mobj:
+ start = mobj.group('start')
+ if start:
+ return datetime_from_str(start)
try:
- return datetime_from_str('now-%s%s' % (mobj.group('time'), mobj.group('unit')), precision='auto')
+ return datetime_from_str('now-%s%s' % (mobj.group('time'), mobj.group('unit')))
except ValueError:
return None
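# Examples for the pattern above (input strings assumed):
#   'streamed 6 days ago'    -> datetime_from_str('now-6day')
#   '5 seconds ago (edited)' -> datetime_from_str('now-5second')
#   'updated today'          -> datetime_from_str('today')
#   anything unmatched       -> None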
@@ -710,6 +726,14 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
timestamp = None
if isinstance(dt, datetime.datetime):
timestamp = calendar.timegm(dt.timetuple())
+
+ if timestamp is None:
+ timestamp = (
+ unified_timestamp(text) or unified_timestamp(
+ self._search_regex(
+ (r'(?:.+|^)(?:live|premieres|ed|ing)(?:\s*on)?\s*(.+\d)', r'\w+[\s,\.-]*\w+[\s,\.-]+20\d{2}'),
+ text.lower(), 'time text', default=None)))
+
if text and timestamp is None:
self.report_warning('Cannot parse localized time text' + bug_reports_message(), only_once=True)
return timestamp, text
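# Sketch of the fallback chain above (phrasings assumed): an absolute date in
# the text short-circuits via unified_timestamp; otherwise the regexes pull the
# date out of strings like 'Premiered Nov 9, 2021' or 'Streamed live on
# Oct 1, 2020' and retry; only fully unparsable text reaches the warning.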
@@ -737,13 +761,15 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
note='%s%s' % (note, ' (retry #%d)' % count if count else ''))
except ExtractorError as e:
if isinstance(e.cause, network_exceptions):
- if isinstance(e.cause, compat_HTTPError) and not is_html(e.cause.read(512)):
- e.cause.seek(0)
- yt_error = try_get(
- self._parse_json(e.cause.read().decode(), item_id, fatal=False),
- lambda x: x['error']['message'], compat_str)
- if yt_error:
- self._report_alerts([('ERROR', yt_error)], fatal=False)
+ if isinstance(e.cause, compat_HTTPError):
+ first_bytes = e.cause.read(512)
+ if not is_html(first_bytes):
+ yt_error = try_get(
+ self._parse_json(
+ self._webpage_read_content(e.cause, None, item_id, prefix=first_bytes) or '{}', item_id, fatal=False),
+ lambda x: x['error']['message'], compat_str)
+ if yt_error:
+ self._report_alerts([('ERROR', yt_error)], fatal=False)
# Downloading a page may result in an intermittent 5xx HTTP error
# Sometimes a 404 is also received. See: https://github.com/ytdl-org/youtube-dl/issues/28289
# We also want to catch all other network exceptions, since errors in later pages can be troublesome
@@ -794,10 +820,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
description = self._get_text(renderer, 'descriptionSnippet')
duration = parse_duration(self._get_text(
renderer, 'lengthText', ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'text')))
- view_count_text = self._get_text(renderer, 'viewCountText') or ''
- view_count = str_to_int(self._search_regex(
- r'^([\d,]+)', re.sub(r'\s', '', view_count_text),
- 'view count', default=None))
+ view_count = self._get_count(renderer, 'viewCountText')
uploader = self._get_text(renderer, 'ownerText', 'shortBylineText')
channel_id = traverse_obj(
@@ -821,7 +844,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
'uploader': uploader,
'channel_id': channel_id,
'thumbnails': thumbnails,
- 'upload_date': strftime_or_none(timestamp, '%Y%m%d'),
+ # 'upload_date': strftime_or_none(timestamp, '%Y%m%d'),
'live_status': ('is_upcoming' if scheduled_timestamp is not None
else 'was_live' if 'streamed' in time_text.lower()
else 'is_live' if overlay_style is not None and overlay_style == 'LIVE' or 'live now' in badges
@@ -846,7 +869,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
youtube\.googleapis\.com)/ # the various hostnames, with wildcard subdomains
(?:.*?\#/)? # handle anchor (#/) redirect urls
(?: # the various things that can precede the ID:
- (?:(?:v|embed|e|shorts)/(?!videoseries)) # v/ or embed/ or e/ or shorts/
+ (?:(?:v|embed|e|shorts)/(?!videoseries|live_stream)) # v/ or embed/ or e/ or shorts/
|(?: # or the v= param in all its forms
(?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
(?:\?|\#!?) # the params delimiter ? or # or #!
@@ -1007,7 +1030,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'duration': 10,
'view_count': int,
'like_count': int,
- # 'dislike_count': int,
'availability': 'public',
'playable_in_embed': True,
'thumbnail': 'https://i.ytimg.com/vi/BaW_jenozKc/maxresdefault.jpg',
@@ -1015,6 +1037,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'age_limit': 0,
'start_time': 1,
'end_time': 9,
+ 'channel_follower_count': int
}
},
{
@@ -1043,14 +1066,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader': 'Philipp Hagemeister',
'uploader_id': 'phihag',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
+ 'channel': 'Philipp Hagemeister',
+ 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
+ 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
'upload_date': '20121002',
- 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
+ 'description': 'md5:8fb536f4877b8a7455c2ec23794dbc22',
'categories': ['Science & Technology'],
'tags': ['youtube-dl'],
'duration': 10,
'view_count': int,
'like_count': int,
- 'dislike_count': int,
+ 'availability': 'public',
+ 'playable_in_embed': True,
+ 'thumbnail': 'https://i.ytimg.com/vi/BaW_jenozKc/maxresdefault.jpg',
+ 'live_status': 'not_live',
+ 'age_limit': 0,
+ 'channel_follower_count': int
},
'params': {
'skip_download': True,
@@ -1088,6 +1119,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_id': 'AfrojackVEVO',
'upload_date': '20131011',
'abr': 129.495,
+ 'like_count': int,
+ 'channel_id': 'UChuZAo1RKL85gev3Eal9_zg',
+ 'playable_in_embed': True,
+ 'channel_url': 'https://www.youtube.com/channel/UChuZAo1RKL85gev3Eal9_zg',
+ 'view_count': int,
+ 'track': 'The Spark',
+ 'live_status': 'not_live',
+ 'thumbnail': 'https://i.ytimg.com/vi_webp/IB3lcPjvWLA/maxresdefault.webp',
+ 'channel': 'Afrojack',
+ 'uploader_url': 'http://www.youtube.com/user/AfrojackVEVO',
+ 'tags': 'count:19',
+ 'availability': 'public',
+ 'categories': ['Music'],
+ 'age_limit': 0,
+ 'alt_title': 'The Spark',
+ 'channel_follower_count': int
},
'params': {
'youtube_include_dash_manifest': True,
@@ -1109,6 +1156,18 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
'upload_date': '20140605',
'age_limit': 18,
+ 'categories': ['Gaming'],
+ 'thumbnail': 'https://i.ytimg.com/vi_webp/HtVdAasjOgU/maxresdefault.webp',
+ 'availability': 'needs_auth',
+ 'channel_url': 'https://www.youtube.com/channel/UCzybXLxv08IApdjdN0mJhEg',
+ 'like_count': int,
+ 'channel': 'The Witcher',
+ 'live_status': 'not_live',
+ 'tags': 'count:17',
+ 'channel_id': 'UCzybXLxv08IApdjdN0mJhEg',
+ 'playable_in_embed': True,
+ 'view_count': int,
+ 'channel_follower_count': int
},
},
{
@@ -1123,6 +1182,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_id': 'FlyingKitty900',
'uploader': 'FlyingKitty',
'age_limit': 18,
+ 'availability': 'needs_auth',
+ 'channel_id': 'UCYQT13AtrJC0gsM1far_zJg',
+ 'uploader_url': 'http://www.youtube.com/user/FlyingKitty900',
+ 'channel': 'FlyingKitty',
+ 'channel_url': 'https://www.youtube.com/channel/UCYQT13AtrJC0gsM1far_zJg',
+ 'view_count': int,
+ 'categories': ['Entertainment'],
+ 'live_status': 'not_live',
+ 'tags': ['Flyingkitty', 'godzilla 2'],
+ 'thumbnail': 'https://i.ytimg.com/vi/HsUATh_Nc2U/maxresdefault.jpg',
+ 'like_count': int,
+ 'duration': 177,
+ 'playable_in_embed': True,
+ 'channel_follower_count': int
},
},
{
@@ -1137,6 +1210,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader': 'Projekt Melody',
'description': 'md5:17eccca93a786d51bc67646756894066',
'age_limit': 18,
+ 'like_count': int,
+ 'availability': 'needs_auth',
+ 'uploader_url': 'http://www.youtube.com/channel/UC1yoRdFoFJaCY-AGfD9W0wQ',
+ 'channel_id': 'UC1yoRdFoFJaCY-AGfD9W0wQ',
+ 'view_count': int,
+ 'thumbnail': 'https://i.ytimg.com/vi_webp/Tq92D6wQ1mg/sddefault.webp',
+ 'channel': 'Projekt Melody',
+ 'live_status': 'not_live',
+ 'tags': ['mmd', 'dance', 'mikumikudance', 'kpop', 'vtuber'],
+ 'playable_in_embed': True,
+ 'categories': ['Entertainment'],
+ 'duration': 106,
+ 'channel_url': 'https://www.youtube.com/channel/UC1yoRdFoFJaCY-AGfD9W0wQ',
+ 'channel_follower_count': int
},
},
{
@@ -1150,6 +1237,25 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_id': 'st3in234',
'description': 'Fan Video. Music & Lyrics by OOMPH!.',
'upload_date': '20130730',
+ 'track': 'Such mich find mich',
+ 'age_limit': 0,
+ 'tags': ['oomph', 'such mich find mich', 'lyrics', 'german industrial', 'musica industrial'],
+ 'like_count': int,
+ 'playable_in_embed': False,
+ 'creator': 'OOMPH!',
+ 'thumbnail': 'https://i.ytimg.com/vi/MeJVWBSsPAY/sddefault.jpg',
+ 'view_count': int,
+ 'alt_title': 'Such mich find mich',
+ 'duration': 210,
+ 'channel': 'Herr Lurik',
+ 'channel_id': 'UCdR3RSDPqub28LjZx0v9-aA',
+ 'categories': ['Music'],
+ 'availability': 'public',
+ 'uploader_url': 'http://www.youtube.com/user/st3in234',
+ 'channel_url': 'https://www.youtube.com/channel/UCdR3RSDPqub28LjZx0v9-aA',
+ 'live_status': 'not_live',
+ 'artist': 'OOMPH!',
+ 'channel_follower_count': int
},
},
{
@@ -1173,6 +1279,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader': 'deadmau5',
'title': 'Deadmau5 - Some Chords (HD)',
'alt_title': 'Some Chords',
+ 'availability': 'public',
+ 'tags': 'count:14',
+ 'channel_id': 'UCYEK6xds6eo-3tr4xRdflmQ',
+ 'view_count': int,
+ 'live_status': 'not_live',
+ 'channel': 'deadmau5',
+ 'thumbnail': 'https://i.ytimg.com/vi_webp/__2ABJjxzNo/maxresdefault.webp',
+ 'like_count': int,
+ 'track': 'Some Chords',
+ 'artist': 'deadmau5',
+ 'playable_in_embed': True,
+ 'age_limit': 0,
+ 'channel_url': 'https://www.youtube.com/channel/UCYEK6xds6eo-3tr4xRdflmQ',
+ 'categories': ['Music'],
+ 'album': 'Some Chords',
+ 'channel_follower_count': int
},
'expected_warnings': [
'DASH manifest missing',
@@ -1191,6 +1313,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
'uploader': 'Olympics',
'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
+ 'like_count': int,
+ 'release_timestamp': 1343767800,
+ 'playable_in_embed': True,
+ 'categories': ['Sports'],
+ 'release_date': '20120731',
+ 'channel': 'Olympics',
+ 'tags': ['Hockey', '2012-07-31', '31 July 2012', 'Riverbank Arena', 'Session', 'Olympics', 'Olympic Games', 'London 2012', '2012 Summer Olympics', 'Summer Games'],
+ 'channel_id': 'UCTl3QQTvqHFjurroKxexy2Q',
+ 'thumbnail': 'https://i.ytimg.com/vi/lqQg6PlCWgI/maxresdefault.jpg',
+ 'age_limit': 0,
+ 'availability': 'public',
+ 'live_status': 'was_live',
+ 'view_count': int,
+ 'channel_url': 'https://www.youtube.com/channel/UCTl3QQTvqHFjurroKxexy2Q',
+ 'channel_follower_count': int
},
'params': {
'skip_download': 'requires avconv',
@@ -1210,6 +1347,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
'uploader': '孫ᄋᄅ',
'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
+ 'playable_in_embed': True,
+ 'channel': '孫ᄋᄅ',
+ 'age_limit': 0,
+ 'tags': 'count:11',
+ 'channel_url': 'https://www.youtube.com/channel/UCS-xxCmRaA6BFdmgDPA_BIw',
+ 'channel_id': 'UCS-xxCmRaA6BFdmgDPA_BIw',
+ 'thumbnail': 'https://i.ytimg.com/vi/_b-2C3KPAM0/maxresdefault.jpg',
+ 'view_count': int,
+ 'categories': ['People & Blogs'],
+ 'like_count': int,
+ 'live_status': 'not_live',
+ 'availability': 'unlisted',
+ 'channel_follower_count': int
},
},
# url_encoded_fmt_stream_map is empty string
@@ -1366,6 +1516,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'track': 'Dark Walk',
'artist': 'Todd Haberman;\nDaniel Law Heath and Aaron Kaplan',
'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
+ 'thumbnail': 'https://i.ytimg.com/vi_webp/lsguqyKfVQg/maxresdefault.webp',
+ 'categories': ['Film & Animation'],
+ 'view_count': int,
+ 'live_status': 'not_live',
+ 'channel_url': 'https://www.youtube.com/channel/UCTSRgz5jylBvFt_S7wnsqLQ',
+ 'channel_id': 'UCTSRgz5jylBvFt_S7wnsqLQ',
+ 'tags': 'count:13',
+ 'availability': 'public',
+ 'channel': 'IronSoulElf',
+ 'playable_in_embed': True,
+ 'like_count': int,
+ 'age_limit': 0,
+ 'channel_follower_count': int
},
'params': {
'skip_download': True,
@@ -1412,6 +1575,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
'uploader': 'The Berkman Klein Center for Internet & Society',
'license': 'Creative Commons Attribution license (reuse allowed)',
+ 'channel_id': 'UCuLGmD72gJDBwmLw06X58SA',
+ 'channel_url': 'https://www.youtube.com/channel/UCuLGmD72gJDBwmLw06X58SA',
+ 'like_count': int,
+ 'age_limit': 0,
+ 'tags': ['Copyright (Legal Subject)', 'Law (Industry)', 'William W. Fisher (Author)'],
+ 'channel': 'The Berkman Klein Center for Internet & Society',
+ 'availability': 'public',
+ 'view_count': int,
+ 'categories': ['Education'],
+ 'thumbnail': 'https://i.ytimg.com/vi_webp/M4gD1WSo5mA/maxresdefault.webp',
+ 'live_status': 'not_live',
+ 'playable_in_embed': True,
+ 'channel_follower_count': int
},
'params': {
'skip_download': True,
@@ -1431,6 +1607,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
'license': 'Creative Commons Attribution license (reuse allowed)',
+ 'playable_in_embed': True,
+ 'tags': 'count:12',
+ 'like_count': int,
+ 'channel_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
+ 'age_limit': 0,
+ 'availability': 'public',
+ 'categories': ['News & Politics'],
+ 'channel': 'Bernie Sanders',
+ 'thumbnail': 'https://i.ytimg.com/vi_webp/eQcmzGIKrzg/maxresdefault.webp',
+ 'view_count': int,
+ 'live_status': 'not_live',
+ 'channel_url': 'https://www.youtube.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
+ 'channel_follower_count': int
},
'params': {
'skip_download': True,
@@ -1480,6 +1669,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'series': 'Mind Field',
'season_number': 1,
'episode_number': 1,
+ 'thumbnail': 'https://i.ytimg.com/vi_webp/iqKdEhx-dD4/maxresdefault.webp',
+ 'tags': 'count:12',
+ 'view_count': int,
+ 'availability': 'public',
+ 'age_limit': 0,
+ 'channel': 'Vsauce',
+ 'episode': 'Episode 1',
+ 'categories': ['Entertainment'],
+ 'season': 'Season 1',
+ 'channel_id': 'UC6nSFpj9HTCZ5t-N3Rm3-HA',
+ 'channel_url': 'https://www.youtube.com/channel/UC6nSFpj9HTCZ5t-N3Rm3-HA',
+ 'like_count': int,
+ 'playable_in_embed': True,
+ 'live_status': 'not_live',
+ 'channel_follower_count': int
},
'params': {
'skip_download': True,
@@ -1573,6 +1777,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'album': 'it\'s too much love to know my dear',
'release_date': '20190313',
'release_year': 2019,
+ 'alt_title': 'Voyeur Girl',
+ 'view_count': int,
+ 'uploader_url': 'http://www.youtube.com/channel/UC-pWHpBjdGG69N9mM2auIAA',
+ 'playable_in_embed': True,
+ 'like_count': int,
+ 'categories': ['Music'],
+ 'channel_url': 'https://www.youtube.com/channel/UC-pWHpBjdGG69N9mM2auIAA',
+ 'channel': 'Stephen',
+ 'availability': 'public',
+ 'creator': 'Stephen',
+ 'duration': 169,
+ 'thumbnail': 'https://i.ytimg.com/vi_webp/MgNrAu2pzNs/maxresdefault.webp',
+ 'age_limit': 0,
+ 'channel_id': 'UC-pWHpBjdGG69N9mM2auIAA',
+ 'tags': 'count:11',
+ 'live_status': 'not_live',
+ 'channel_follower_count': int
},
'params': {
'skip_download': True,
@@ -1614,6 +1835,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'upload_date': '20170613',
'uploader_id': 'ElevageOrVert',
'uploader': 'ElevageOrVert',
+ 'view_count': int,
+ 'thumbnail': 'https://i.ytimg.com/vi_webp/x41yOUIvK2k/maxresdefault.webp',
+ 'uploader_url': 'http://www.youtube.com/user/ElevageOrVert',
+ 'like_count': int,
+ 'channel_id': 'UCo03ZQPBW5U4UC3regpt1nw',
+ 'tags': [],
+ 'channel_url': 'https://www.youtube.com/channel/UCo03ZQPBW5U4UC3regpt1nw',
+ 'availability': 'public',
+ 'age_limit': 0,
+ 'categories': ['Pets & Animals'],
+ 'duration': 7,
+ 'playable_in_embed': True,
+ 'live_status': 'not_live',
+ 'channel': 'ElevageOrVert',
+ 'channel_follower_count': int
},
'params': {
'skip_download': True,
@@ -1633,6 +1869,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'upload_date': '20130831',
'uploader_id': 'kudvenkat',
'uploader': 'kudvenkat',
+ 'channel_id': 'UCCTVrRB5KpIiK6V2GGVsR1Q',
+ 'like_count': int,
+ 'uploader_url': 'http://www.youtube.com/user/kudvenkat',
+ 'channel_url': 'https://www.youtube.com/channel/UCCTVrRB5KpIiK6V2GGVsR1Q',
+ 'live_status': 'not_live',
+ 'categories': ['Education'],
+ 'availability': 'public',
+ 'thumbnail': 'https://i.ytimg.com/vi/CHqg6qOn4no/sddefault.jpg',
+ 'tags': 'count:12',
+ 'playable_in_embed': True,
+ 'age_limit': 0,
+ 'view_count': int,
+ 'duration': 522,
+ 'channel': 'kudvenkat',
+ 'channel_follower_count': int
},
'params': {
'skip_download': True,
@@ -1662,8 +1913,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'artist': 'The Cinematic Orchestra',
'track': 'Burn Out',
'album': 'Every Day',
- 'release_data': None,
- 'release_year': None,
+ 'like_count': int,
+ 'live_status': 'not_live',
+ 'alt_title': 'Burn Out',
+ 'duration': 614,
+ 'age_limit': 0,
+ 'view_count': int,
+ 'channel_url': 'https://www.youtube.com/channel/UCIzsJBIyo8hhpFm1NK0uLgw',
+ 'creator': 'The Cinematic Orchestra',
+ 'channel': 'The Cinematic Orchestra',
+ 'tags': ['The Cinematic Orchestra', 'Every Day', 'Burn Out'],
+ 'channel_id': 'UCIzsJBIyo8hhpFm1NK0uLgw',
+ 'availability': 'public',
+ 'thumbnail': 'https://i.ytimg.com/vi/OtqTfy26tG0/maxresdefault.jpg',
+ 'categories': ['Music'],
+ 'playable_in_embed': True,
+ 'channel_follower_count': int
},
'params': {
'skip_download': True,
@@ -1682,10 +1947,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'ext': 'mp4',
'title': 'San Diego teen commits suicide after bullying over embarrassing video',
'channel_id': 'UC-SJ6nODDmufqBzPBwCvYvQ',
- 'uploader': 'CBS This Morning',
+ 'uploader': 'CBS Mornings',
'uploader_id': 'CBSThisMorning',
'upload_date': '20140716',
- 'description': 'md5:acde3a73d3f133fc97e837a9f76b53b7'
+ 'description': 'md5:acde3a73d3f133fc97e837a9f76b53b7',
+ 'duration': 170,
+ 'categories': ['News & Politics'],
+ 'uploader_url': 'http://www.youtube.com/user/CBSThisMorning',
+ 'view_count': int,
+ 'channel': 'CBS Mornings',
+ 'tags': ['suicide', 'bullying', 'video', 'cbs', 'news'],
+ 'thumbnail': 'https://i.ytimg.com/vi/SZJvDhaSDnc/hqdefault.jpg',
+ 'age_limit': 18,
+ 'availability': 'needs_auth',
+ 'channel_url': 'https://www.youtube.com/channel/UC-SJ6nODDmufqBzPBwCvYvQ',
+ 'like_count': int,
+ 'live_status': 'not_live',
+ 'playable_in_embed': True,
+ 'channel_follower_count': int
}
},
{
@@ -1700,6 +1979,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader': 'Walk around Japan',
'uploader_id': 'UC3o_t8PzBmXf5S9b7GLx1Mw',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UC3o_t8PzBmXf5S9b7GLx1Mw',
+ 'duration': 1456,
+ 'categories': ['Travel & Events'],
+ 'channel_id': 'UC3o_t8PzBmXf5S9b7GLx1Mw',
+ 'view_count': int,
+ 'channel': 'Walk around Japan',
+ 'tags': ['Ueno Tokyo', 'Okachimachi Tokyo', 'Ameyoko Street', 'Tokyo attraction', 'Travel in Tokyo'],
+ 'thumbnail': 'https://i.ytimg.com/vi_webp/cBvYw8_A0vQ/hqdefault.webp',
+ 'age_limit': 0,
+ 'availability': 'public',
+ 'channel_url': 'https://www.youtube.com/channel/UC3o_t8PzBmXf5S9b7GLx1Mw',
+ 'live_status': 'not_live',
+ 'playable_in_embed': True,
+ 'channel_follower_count': int
},
'params': {
'skip_download': True,
@@ -1728,7 +2020,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader': 'colinfurze',
'uploader_id': 'colinfurze',
'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCp68_FLety0O-n9QU6phsgw',
- 'description': 'md5:b5096f56af7ccd7a555c84db81738b22'
+ 'description': 'md5:5d5991195d599b56cd0c4148907eec50',
+ 'duration': 596,
+ 'categories': ['Entertainment'],
+ 'uploader_url': 'http://www.youtube.com/user/colinfurze',
+ 'view_count': int,
+ 'channel': 'colinfurze',
+ 'tags': ['Colin', 'furze', 'Terry', 'tunnel', 'underground', 'bunker'],
+ 'thumbnail': 'https://i.ytimg.com/vi/YOelRv7fMxY/maxresdefault.jpg',
+ 'age_limit': 0,
+ 'availability': 'public',
+ 'like_count': int,
+ 'live_status': 'not_live',
+ 'playable_in_embed': True,
+ 'channel_follower_count': int
},
'params': {
'format': '17', # 3gp format available on android
@@ -1758,6 +2063,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'description': 'md5:89cd86034bdb5466cd87c6ba206cd2bc',
'upload_date': '20140324',
'uploader': 'SciShow',
+ 'like_count': int,
+ 'channel_id': 'UCZYTClx2T1of7BRZ86-8fow',
+ 'channel_url': 'https://www.youtube.com/channel/UCZYTClx2T1of7BRZ86-8fow',
+ 'view_count': int,
+ 'thumbnail': 'https://i.ytimg.com/vi/5KLPxDtMqe8/maxresdefault.jpg',
+ 'playable_in_embed': True,
+ 'tags': 'count:12',
+ 'uploader_url': 'http://www.youtube.com/user/scishow',
+ 'availability': 'public',
+ 'channel': 'SciShow',
+ 'live_status': 'not_live',
+ 'duration': 248,
+ 'categories': ['Education'],
+ 'age_limit': 0,
+ 'channel_follower_count': int
}, 'params': {'format': 'mhtml', 'skip_download': True}
}
]
@@ -2101,9 +2421,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
raise ExtractorError(traceback.format_exc(), cause=e, video_id=video_id)
def _extract_n_function_name(self, jscode):
- return self._search_regex(
- (r'\.get\("n"\)\)&&\(b=(?P<nfunc>[a-zA-Z0-9$]{3})\([a-zA-Z0-9]\)',),
- jscode, 'Initial JS player n function name', group='nfunc')
+ nfunc, idx = self._search_regex(
+ r'\.get\("n"\)\)&&\(b=(?P<nfunc>[a-zA-Z0-9$]{3})(?:\[(?P<idx>\d+)\])?\([a-zA-Z0-9]\)',
+ jscode, 'Initial JS player n function name', group=('nfunc', 'idx'))
+ if not idx:
+ return nfunc
+ return json.loads(js_to_json(self._search_regex(
+ rf'var {nfunc}\s*=\s*(\[.+?\]);', jscode,
+ f'Initial JS player n function list ({nfunc}.{idx})')))[int(idx)]
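# Illustrative aside (identifiers invented): newer players route the
# n-parameter through a one-element array, e.g.
#   var Iha=[qha]; ... a.get("n"))&&(b=Iha[0](b) ...
# so the regex captures nfunc='Iha', idx='0', and the js_to_json/json.loads
# lookup above resolves the actual function name 'qha'.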
def _extract_n_function(self, video_id, player_url):
player_id = self._extract_player_info(player_url)
@@ -2317,8 +2642,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
_continuation = None
for content in contents:
comments_header_renderer = traverse_obj(content, 'commentsHeaderRenderer')
- expected_comment_count = parse_count(self._get_text(
- comments_header_renderer, 'countText', 'commentsCount', max_runs=1))
+ expected_comment_count = self._get_count(
+ comments_header_renderer, 'countText', 'commentsCount')
if expected_comment_count:
tracker['est_total'] = expected_comment_count
@@ -2539,7 +2864,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
}.get(client)
if not url:
return {}
- webpage = self._download_webpage(url, video_id, fatal=False, note=f'Downloading {client} config')
+ webpage = self._download_webpage(url, video_id, fatal=False, note='Downloading %s config' % client.replace('_', ' ').strip())
return self.extract_ytcfg(video_id, webpage) or {}
def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg):
@@ -2765,9 +3090,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def _extract_storyboard(self, player_responses, duration):
spec = get_first(
player_responses, ('storyboards', 'playerStoryboardSpecRenderer', 'spec'), default='').split('|')[::-1]
- if not spec:
+ base_url = url_or_none(urljoin('https://i.ytimg.com/', spec.pop() or None))
+ if not base_url:
return
- base_url = spec.pop()
L = len(spec) - 1
for i, args in enumerate(spec):
args = args.split('#')
@@ -3014,7 +3339,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_id': self._search_regex(r'/(?:channel|user)/([^/?&#]+)', owner_profile_url, 'uploader id') if owner_profile_url else None,
'uploader_url': owner_profile_url,
'channel_id': channel_id,
- 'channel_url': f'https://www.youtube.com/channel/{channel_id}' if channel_id else None,
+ 'channel_url': format_field(channel_id, template='https://www.youtube.com/channel/%s'),
'duration': duration,
'view_count': int_or_none(
get_first((video_details, microformats), (..., 'viewCount'))
@@ -3194,7 +3519,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
})
vsir = content.get('videoSecondaryInfoRenderer')
if vsir:
- info['channel'] = self._get_text(vsir, ('owner', 'videoOwnerRenderer', 'title'))
+ vor = traverse_obj(vsir, ('owner', 'videoOwnerRenderer'))
+ info.update({
+ 'channel': self._get_text(vor, 'title'),
+ 'channel_follower_count': self._get_count(vor, 'subscriberCountText')})
+
rows = try_get(
vsir,
lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'],
@@ -3272,6 +3601,26 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
+ @staticmethod
+ def passthrough_smuggled_data(func):
+ def _smuggle(entries, smuggled_data):
+ for entry in entries:
+ # TODO: Convert URL to music.youtube instead.
+ # Do we need to pass through any other smuggled_data?
+ entry['url'] = smuggle_url(entry['url'], smuggled_data)
+ yield entry
+
+ @functools.wraps(func)
+ def wrapper(self, url):
+ url, smuggled_data = unsmuggle_url(url, {})
+ if self.is_music_url(url):
+ smuggled_data['is_music_url'] = True
+ info_dict = func(self, url, smuggled_data)
+ if smuggled_data and info_dict.get('entries'):
+ info_dict['entries'] = _smuggle(info_dict['entries'], smuggled_data)
+ return info_dict
+ return wrapper
+
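# Usage sketch for the decorator above -- the wrapped method gains a
# smuggled_data argument, exactly as YoutubeTabIE applies it further down:
#
#     @YoutubeTabBaseInfoExtractor.passthrough_smuggled_data
#     def _real_extract(self, url, smuggled_data):
#         ...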
def _extract_channel_id(self, webpage):
channel_id = self._html_search_meta(
'channelId', webpage, 'channel id', default=None)
@@ -3339,6 +3688,24 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
ep_url, ie=ie.ie_key(), video_id=ie._match_id(ep_url), video_title=title)
break
+ def _music_responsive_list_entry(self, renderer):
+ video_id = traverse_obj(renderer, ('playlistItemData', 'videoId'))
+ if video_id:
+ return self.url_result(f'https://music.youtube.com/watch?v={video_id}',
+ ie=YoutubeIE.ie_key(), video_id=video_id)
+ playlist_id = traverse_obj(renderer, ('navigationEndpoint', 'watchEndpoint', 'playlistId'))
+ if playlist_id:
+ video_id = traverse_obj(renderer, ('navigationEndpoint', 'watchEndpoint', 'videoId'))
+ if video_id:
+ return self.url_result(f'https://music.youtube.com/watch?v={video_id}&list={playlist_id}',
+ ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
+ return self.url_result(f'https://music.youtube.com/playlist?list={playlist_id}',
+ ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
+ browse_id = traverse_obj(renderer, ('navigationEndpoint', 'browseEndpoint', 'browseId'))
+ if browse_id:
+ return self.url_result(f'https://music.youtube.com/browse/{browse_id}',
+ ie=YoutubeTabIE.ie_key(), video_id=browse_id)
+
def _shelf_entries_from_content(self, shelf_renderer):
content = shelf_renderer.get('content')
if not isinstance(content, dict):
@@ -3460,7 +3827,9 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
for content in contents:
if not isinstance(content, dict):
continue
- is_renderer = try_get(content, lambda x: x['itemSectionRenderer'], dict)
+ is_renderer = traverse_obj(
+ content, 'itemSectionRenderer', 'musicShelfRenderer', 'musicShelfContinuation',
+ expected_type=dict)
if not is_renderer:
renderer = content.get('richItemRenderer')
if renderer:
@@ -3477,6 +3846,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
'playlistVideoListRenderer': self._playlist_entries,
'gridRenderer': self._grid_entries,
'shelfRenderer': lambda x: self._shelf_entries(x),
+ 'musicResponsiveListItemRenderer': lambda x: [self._music_responsive_list_entry(x)],
'backstagePostThreadRenderer': self._post_thread_entries,
'videoRenderer': lambda x: [self._video_entry(x)],
'playlistRenderer': lambda x: self._grid_entries({'items': [{'playlistRenderer': x}]}),
@@ -3603,6 +3973,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
tags = []
selected_tab = self._extract_selected_tab(tabs)
+ primary_sidebar_renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer')
renderer = try_get(
data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
if renderer:
@@ -3619,20 +3990,48 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
playlist_id = channel_id
tags = renderer.get('keywords', '').split()
- thumbnails = (
- self._extract_thumbnails(renderer, 'avatar')
- or self._extract_thumbnails(
- self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer'),
- ('thumbnailRenderer', 'playlistVideoThumbnailRenderer', 'thumbnail')))
+ # We can get the uncropped banner/avatar by replacing the crop params with '=s0'
+ # See: https://github.com/yt-dlp/yt-dlp/issues/2237#issuecomment-1013694714
+ def _get_uncropped(url):
+ return url_or_none((url or '').split('=')[0] + '=s0')
+
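# e.g. (URL shape assumed from the linked issue):
#   _get_uncropped('https://yt3.ggpht.com/xyz=s88-c-k-c0x00ffffff-no-rj')
#       -> 'https://yt3.ggpht.com/xyz=s0'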
+ avatar_thumbnails = self._extract_thumbnails(renderer, 'avatar')
+ if avatar_thumbnails:
+ uncropped_avatar = _get_uncropped(avatar_thumbnails[0]['url'])
+ if uncropped_avatar:
+ avatar_thumbnails.append({
+ 'url': uncropped_avatar,
+ 'id': 'avatar_uncropped',
+ 'preference': 1
+ })
+
+ channel_banners = self._extract_thumbnails(
+ data, ('header', ..., ['banner', 'mobileBanner', 'tvBanner']))
+ for banner in channel_banners:
+ banner['preference'] = -10
+
+ if channel_banners:
+ uncropped_banner = _get_uncropped(channel_banners[0]['url'])
+ if uncropped_banner:
+ channel_banners.append({
+ 'url': uncropped_banner,
+ 'id': 'banner_uncropped',
+ 'preference': -5
+ })
+
+ primary_thumbnails = self._extract_thumbnails(
+ primary_sidebar_renderer, ('thumbnailRenderer', 'playlistVideoThumbnailRenderer', 'thumbnail'))
if playlist_id is None:
playlist_id = item_id
+
+ playlist_stats = traverse_obj(primary_sidebar_renderer, 'stats')
+ last_updated_unix, _ = self._extract_time_text(playlist_stats, 2)
if title is None:
- title = (
- try_get(data, lambda x: x['header']['hashtagHeaderRenderer']['hashtag']['simpleText'])
- or playlist_id)
+ title = self._get_text(data, ('header', 'hashtagHeaderRenderer', 'hashtag')) or playlist_id
title += format_field(selected_tab, 'title', ' - %s')
title += format_field(selected_tab, 'expandedText', ' - %s')
+
metadata = {
'playlist_id': playlist_id,
'playlist_title': title,
@@ -3640,12 +4039,14 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
'uploader': channel_name,
'uploader_id': channel_id,
'uploader_url': channel_url,
- 'thumbnails': thumbnails,
+ 'thumbnails': primary_thumbnails + avatar_thumbnails + channel_banners,
'tags': tags,
+ 'view_count': self._get_count(playlist_stats, 1),
+ 'availability': self._extract_availability(data),
+ 'modified_date': strftime_or_none(last_updated_unix, '%Y%m%d'),
+ 'playlist_count': self._get_count(playlist_stats, 0),
+ 'channel_follower_count': self._get_count(data, ('header', ..., 'subscriberCountText')),
}
- availability = self._extract_availability(data)
- if availability:
- metadata['availability'] = availability
if not channel_id:
metadata.update(self._extract_uploader(data))
metadata.update({
@@ -3870,33 +4271,32 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
raise ExtractorError(err_note, expected=True)
self.report_warning(err_note, item_id)
- @staticmethod
- def _smuggle_data(entries, data):
- for entry in entries:
- if data:
- entry['url'] = smuggle_url(entry['url'], data)
- yield entry
-
_SEARCH_PARAMS = None
- def _search_results(self, query, params=NO_DEFAULT):
+ def _search_results(self, query, params=NO_DEFAULT, default_client='web'):
data = {'query': query}
if params is NO_DEFAULT:
params = self._SEARCH_PARAMS
if params:
data['params'] = params
+
+ content_keys = (
+ ('contents', 'twoColumnSearchResultsRenderer', 'primaryContents', 'sectionListRenderer', 'contents'),
+ ('onResponseReceivedCommands', 0, 'appendContinuationItemsAction', 'continuationItems'),
+ # ytmusic search
+ ('contents', 'tabbedSearchResultsRenderer', 'tabs', 0, 'tabRenderer', 'content', 'sectionListRenderer', 'contents'),
+ ('continuationContents', ),
+ )
+ check_get_keys = tuple(set(keys[0] for keys in content_keys))
+
continuation_list = [None]
for page_num in itertools.count(1):
data.update(continuation_list[0] or {})
search = self._extract_response(
item_id='query "%s" page %s' % (query, page_num), ep='search', query=data,
- check_get_keys=('contents', 'onResponseReceivedCommands'))
- slr_contents = try_get(
- search,
- (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
- lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
- list)
- yield from self._extract_entries({'contents': slr_contents}, continuation_list)
+ default_client=default_client, check_get_keys=check_get_keys)
+ slr_contents = traverse_obj(search, *content_keys)
+ yield from self._extract_entries({'contents': list(variadic(slr_contents))}, continuation_list)
if not continuation_list[0]:
break
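# Sketch (response shapes assumed from the paths above): traverse_obj returns
# the first of content_keys that resolves, so web search pages, continuation
# responses and ytmusic tabbed responses all reduce to one contents value, and
# variadic() tuple-wraps a lone dict so _extract_entries always gets a sequence.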
@@ -3931,10 +4331,16 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'playlist_mincount': 94,
'info_dict': {
'id': 'UCqj7Cz7revf5maW9g5pgNcg',
- 'title': 'Игорь Клейнер - Playlists',
+ 'title': 'Igor Kleiner - Playlists',
'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
- 'uploader': 'Игорь Клейнер',
+ 'uploader': 'Igor Kleiner',
'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg',
+ 'channel': 'Igor Kleiner',
+ 'channel_id': 'UCqj7Cz7revf5maW9g5pgNcg',
+ 'tags': ['"критическое', 'мышление"', '"наука', 'просто"', 'математика', '"анализ', 'данных"'],
+ 'channel_url': 'https://www.youtube.com/channel/UCqj7Cz7revf5maW9g5pgNcg',
+ 'uploader_url': 'https://www.youtube.com/channel/UCqj7Cz7revf5maW9g5pgNcg',
+ 'channel_follower_count': int
},
}, {
'note': 'playlists, multipage, different order',
@@ -3942,10 +4348,16 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'playlist_mincount': 94,
'info_dict': {
'id': 'UCqj7Cz7revf5maW9g5pgNcg',
- 'title': 'Игорь Клейнер - Playlists',
+ 'title': 'Igor Kleiner - Playlists',
'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg',
- 'uploader': 'Игорь Клейнер',
+ 'uploader': 'Igor Kleiner',
+ 'uploader_url': 'https://www.youtube.com/channel/UCqj7Cz7revf5maW9g5pgNcg',
+ 'tags': ['"критическое', 'мышление"', '"наука', 'просто"', 'математика', '"анализ', 'данных"'],
+ 'channel_id': 'UCqj7Cz7revf5maW9g5pgNcg',
+ 'channel': 'Igor Kleiner',
+ 'channel_url': 'https://www.youtube.com/channel/UCqj7Cz7revf5maW9g5pgNcg',
+ 'channel_follower_count': int
},
}, {
'note': 'playlists, series',
@@ -3957,6 +4369,12 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'description': 'md5:e1384e8a133307dd10edee76e875d62f',
'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw',
'uploader': '3Blue1Brown',
+ 'channel_url': 'https://www.youtube.com/channel/UCYO_jab_esuFRV4b17AJtAw',
+ 'uploader_url': 'https://www.youtube.com/channel/UCYO_jab_esuFRV4b17AJtAw',
+ 'channel': '3Blue1Brown',
+ 'channel_id': 'UCYO_jab_esuFRV4b17AJtAw',
+ 'tags': ['Mathematics'],
+ 'channel_follower_count': int
},
}, {
'note': 'playlists, singlepage',
@@ -3968,6 +4386,12 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'description': 'md5:609399d937ea957b0f53cbffb747a14c',
'uploader': 'ThirstForScience',
'uploader_id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
+ 'uploader_url': 'https://www.youtube.com/channel/UCAEtajcuhQ6an9WEzY9LEMQ',
+ 'channel_url': 'https://www.youtube.com/channel/UCAEtajcuhQ6an9WEzY9LEMQ',
+ 'channel_id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
+ 'tags': 'count:13',
+ 'channel': 'ThirstForScience',
+ 'channel_follower_count': int
}
}, {
'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
@@ -3980,6 +4404,14 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'uploader': 'Sergey M.',
'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
'title': 'youtube-dl public playlist',
+ 'description': '',
+ 'tags': [],
+ 'view_count': int,
+ 'modified_date': '20201130',
+ 'channel': 'Sergey M.',
+ 'channel_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
+ 'uploader_url': 'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
+ 'channel_url': 'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
},
'playlist_count': 1,
}, {
@@ -3990,6 +4422,13 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'uploader': 'Sergey M.',
'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
'title': 'youtube-dl empty playlist',
+ 'tags': [],
+ 'channel': 'Sergey M.',
+ 'description': '',
+ 'modified_date': '20160902',
+ 'channel_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
+ 'channel_url': 'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
+ 'uploader_url': 'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
},
'playlist_count': 0,
}, {
@@ -4001,6 +4440,12 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
'uploader': 'lex will',
'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'channel': 'lex will',
+ 'tags': ['bible', 'history', 'prophesy'],
+ 'uploader_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'channel_follower_count': int
},
'playlist_mincount': 2,
}, {
@@ -4012,6 +4457,12 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
'uploader': 'lex will',
'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'tags': ['bible', 'history', 'prophesy'],
+ 'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'uploader_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'channel': 'lex will',
+ 'channel_follower_count': int
},
'playlist_mincount': 975,
}, {
@@ -4023,6 +4474,12 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
'uploader': 'lex will',
'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'uploader_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'channel': 'lex will',
+ 'tags': ['bible', 'history', 'prophesy'],
+ 'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'channel_follower_count': int
},
'playlist_mincount': 199,
}, {
@@ -4034,6 +4491,12 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
'uploader': 'lex will',
'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'uploader_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'channel': 'lex will',
+ 'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'tags': ['bible', 'history', 'prophesy'],
+ 'channel_follower_count': int
},
'playlist_mincount': 17,
}, {
@@ -4045,6 +4508,12 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
'uploader': 'lex will',
'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'uploader_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'channel': 'lex will',
+ 'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'tags': ['bible', 'history', 'prophesy'],
+ 'channel_follower_count': int
},
'playlist_mincount': 18,
}, {
@@ -4056,6 +4525,12 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
'uploader': 'lex will',
'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'uploader_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'channel': 'lex will',
+ 'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'tags': ['bible', 'history', 'prophesy'],
+ 'channel_follower_count': int
},
'playlist_mincount': 12,
}, {
@@ -4068,6 +4543,12 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'description': 'md5:e1384e8a133307dd10edee76e875d62f',
'uploader': '3Blue1Brown',
'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw',
+ 'channel_url': 'https://www.youtube.com/channel/UCYO_jab_esuFRV4b17AJtAw',
+ 'uploader_url': 'https://www.youtube.com/channel/UCYO_jab_esuFRV4b17AJtAw',
+ 'tags': ['Mathematics'],
+ 'channel': '3Blue1Brown',
+ 'channel_id': 'UCYO_jab_esuFRV4b17AJtAw',
+ 'channel_follower_count': int
},
}, {
'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA',
@@ -4087,6 +4568,13 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'uploader': 'Christiaan008',
'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg',
'description': 'md5:a14dc1a8ef8307a9807fe136a0660268',
+ 'tags': [],
+ 'uploader_url': 'https://www.youtube.com/c/ChRiStIaAn008',
+ 'view_count': int,
+ 'modified_date': '20150605',
+ 'channel_id': 'UCEPzS1rYsrkqzSLNp76nrcg',
+ 'channel_url': 'https://www.youtube.com/c/ChRiStIaAn008',
+ 'channel': 'Christiaan008',
},
'playlist_count': 96,
}, {
@@ -4097,8 +4585,17 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
'uploader': 'Cauchemar',
'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q',
+ 'channel_url': 'https://www.youtube.com/c/Cauchemar89',
+ 'tags': [],
+ 'modified_date': r're:\d{8}',
+ 'channel': 'Cauchemar',
+ 'uploader_url': 'https://www.youtube.com/c/Cauchemar89',
+ 'view_count': int,
+ 'description': '',
+ 'channel_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q',
},
'playlist_mincount': 1123,
+ 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'],
}, {
'note': 'even larger playlist, 8832 videos',
'url': 'http://www.youtube.com/user/NASAgovVideo/videos',
@@ -4111,6 +4608,14 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
'uploader': 'Interstellar Movie',
'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA',
+ 'uploader_url': 'https://www.youtube.com/c/InterstellarMovie',
+ 'tags': [],
+ 'view_count': int,
+ 'channel_id': 'UCXw-G3eDE9trcvY2sBMM_aA',
+ 'channel_url': 'https://www.youtube.com/c/InterstellarMovie',
+ 'channel': 'Interstellar Movie',
+ 'description': '',
+ 'modified_date': r're:\d{8}',
},
'playlist_mincount': 21,
}, {
@@ -4121,8 +4626,17 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'id': 'UUTYLiWFZy8xtPwxFwX9rV7Q',
'uploader': 'Phim Siêu Nhân Nhật Bản',
'uploader_id': 'UCTYLiWFZy8xtPwxFwX9rV7Q',
+ 'view_count': int,
+ 'channel': 'Phim Siêu Nhân Nhật Bản',
+ 'tags': [],
+ 'uploader_url': 'https://www.youtube.com/channel/UCTYLiWFZy8xtPwxFwX9rV7Q',
+ 'description': '',
+ 'channel_url': 'https://www.youtube.com/channel/UCTYLiWFZy8xtPwxFwX9rV7Q',
+ 'channel_id': 'UCTYLiWFZy8xtPwxFwX9rV7Q',
+ 'modified_date': r're:\d{8}',
},
'playlist_mincount': 200,
+ 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'],
}, {
'note': 'Playlist with unavailable videos in page 7',
'url': 'https://www.youtube.com/playlist?list=UU8l9frL61Yl5KFOl87nIm2w',
@@ -4131,8 +4645,17 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'id': 'UU8l9frL61Yl5KFOl87nIm2w',
'uploader': 'BlankTV',
'uploader_id': 'UC8l9frL61Yl5KFOl87nIm2w',
+ 'channel': 'BlankTV',
+ 'channel_url': 'https://www.youtube.com/c/blanktv',
+ 'channel_id': 'UC8l9frL61Yl5KFOl87nIm2w',
+ 'view_count': int,
+ 'tags': [],
+ 'uploader_url': 'https://www.youtube.com/c/blanktv',
+ 'modified_date': r're:\d{8}',
+ 'description': '',
},
'playlist_mincount': 1000,
+ 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'],
}, {
'note': 'https://github.com/ytdl-org/youtube-dl/issues/21844',
'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
@@ -4142,6 +4665,12 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA',
'uploader': 'Computerphile',
'description': 'md5:7f567c574d13d3f8c0954d9ffee4e487',
+ 'uploader_url': 'https://www.youtube.com/user/Computerphile',
+ 'tags': [],
+ 'view_count': int,
+ 'channel_id': 'UC9-y-6csu5WGm29I7JiwpnA',
+ 'channel_url': 'https://www.youtube.com/user/Computerphile',
+ 'channel': 'Computerphile',
},
'playlist_mincount': 11,
}, {
@@ -4164,7 +4693,6 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'tags': list,
'view_count': int,
'like_count': int,
- 'dislike_count': int,
},
'params': {
'skip_download': True,
@@ -4180,23 +4708,34 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
}, {
'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live',
'info_dict': {
- 'id': '3yImotZU3tw', # This will keep changing
+ 'id': 'GgL890LIznQ', # This will keep changing
'ext': 'mp4',
- 'title': compat_str,
+ 'title': str,
'uploader': 'Sky News',
'uploader_id': 'skynews',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews',
'upload_date': r're:\d{8}',
- 'description': compat_str,
+ 'description': str,
'categories': ['News & Politics'],
'tags': list,
'like_count': int,
- 'dislike_count': int,
+ 'release_timestamp': 1642502819,
+ 'channel': 'Sky News',
+ 'channel_id': 'UCoMdktPbSTixAyNGwb-UYkQ',
+ 'age_limit': 0,
+ 'view_count': int,
+ 'thumbnail': 'https://i.ytimg.com/vi/GgL890LIznQ/maxresdefault_live.jpg',
+ 'playable_in_embed': True,
+ 'release_date': '20220118',
+ 'availability': 'public',
+ 'live_status': 'is_live',
+ 'channel_url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ',
+ 'channel_follower_count': int
},
'params': {
'skip_download': True,
},
- 'expected_warnings': ['Downloading just video ', 'Ignoring subtitle tracks found in '],
+ 'expected_warnings': ['Ignoring subtitle tracks found in '],
}, {
'url': 'https://www.youtube.com/user/TheYoungTurks/live',
'info_dict': {
@@ -4212,7 +4751,6 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'categories': ['News & Politics'],
'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
'like_count': int,
- 'dislike_count': int,
},
'params': {
'skip_download': True,
@@ -4268,6 +4806,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'info_dict': {
'id': 'cctv9',
'title': '#cctv9',
+ 'tags': [],
},
'playlist_mincount': 350,
}, {
@@ -4290,8 +4829,16 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'description': 'Providing you with copyright free / safe music for gaming, live streaming, studying and more!',
'uploader_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg',
'title': 'NCS Releases',
+ 'uploader_url': 'https://www.youtube.com/c/NoCopyrightSounds',
+ 'channel_url': 'https://www.youtube.com/c/NoCopyrightSounds',
+ 'modified_date': r're:\d{8}',
+ 'view_count': int,
+ 'channel_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg',
+ 'tags': [],
+ 'channel': 'NoCopyrightSounds',
},
'playlist_mincount': 166,
+ 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'],
}, {
'note': 'Topic, should redirect to playlist?list=UU...',
'url': 'https://music.youtube.com/browse/UC9ALqqC4aIeG5iDs7i90Bfw',
@@ -4300,10 +4847,19 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'uploader_id': 'UC9ALqqC4aIeG5iDs7i90Bfw',
'title': 'Uploads from Royalty Free Music - Topic',
'uploader': 'Royalty Free Music - Topic',
+ 'tags': [],
+ 'channel_id': 'UC9ALqqC4aIeG5iDs7i90Bfw',
+ 'channel': 'Royalty Free Music - Topic',
+ 'view_count': int,
+ 'channel_url': 'https://www.youtube.com/channel/UC9ALqqC4aIeG5iDs7i90Bfw',
+ 'modified_date': r're:\d{8}',
+ 'uploader_url': 'https://www.youtube.com/channel/UC9ALqqC4aIeG5iDs7i90Bfw',
+ 'description': '',
},
'expected_warnings': [
- 'A channel/user page was given',
'The URL does not have a videos tab',
+ r'[Uu]navailable videos (are|will be) hidden',
],
'playlist_mincount': 101,
}, {
@@ -4312,11 +4868,10 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'info_dict': {
'id': 'UCtFRv9O2AHqOZjjynzrv-xg',
'title': 'UCtFRv9O2AHqOZjjynzrv-xg',
+ 'tags': [],
},
'expected_warnings': [
- 'A channel/user page was given',
- 'The URL does not have a videos tab',
- 'Falling back to channel URL',
+ 'the playlist redirect gave error',
],
'playlist_mincount': 9,
}, {
@@ -4325,6 +4880,11 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'info_dict': {
'id': 'OLAK5uy_l1m0thk3g31NmIIz_vMIbWtyv7eZixlH0',
'title': 'Album - Royalty Free Music Library V2 (50 Songs)',
+ 'tags': [],
+ 'view_count': int,
+ 'description': '',
+ 'availability': 'unlisted',
+ 'modified_date': r're:\d{8}',
},
'playlist_count': 50,
}, {
@@ -4335,7 +4895,15 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'uploader': 'colethedj',
'id': 'PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf',
'title': 'yt-dlp unlisted playlist test',
- 'availability': 'unlisted'
+ 'availability': 'unlisted',
+ 'tags': [],
+ 'modified_date': '20211208',
+ 'channel': 'colethedj',
+ 'view_count': int,
+ 'description': '',
+ 'uploader_url': 'https://www.youtube.com/channel/UC9zHu_mHU96r19o-wV5Qs1Q',
+ 'channel_id': 'UC9zHu_mHU96r19o-wV5Qs1Q',
+ 'channel_url': 'https://www.youtube.com/channel/UC9zHu_mHU96r19o-wV5Qs1Q',
},
'playlist_count': 1,
}, {
@@ -4344,6 +4912,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'info_dict': {
'id': 'recommended',
'title': 'recommended',
+ 'tags': [],
},
'playlist_mincount': 50,
'params': {
@@ -4359,6 +4928,12 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'description': 'md5:d083b7c2f0c67ee7a6c74c3e9b4243fa',
'uploader': 'Cody\'sLab',
'uploader_id': 'UCu6mSoMNzHQiBIOCkHUa2Aw',
+ 'channel': 'Cody\'sLab',
+ 'channel_id': 'UCu6mSoMNzHQiBIOCkHUa2Aw',
+ 'tags': [],
+ 'channel_url': 'https://www.youtube.com/channel/UCu6mSoMNzHQiBIOCkHUa2Aw',
+ 'uploader_url': 'https://www.youtube.com/channel/UCu6mSoMNzHQiBIOCkHUa2Aw',
+ 'channel_follower_count': int
},
'playlist_mincount': 650,
'params': {
@@ -4373,10 +4948,18 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'uploader_id': 'UC9ALqqC4aIeG5iDs7i90Bfw',
'title': 'Uploads from Royalty Free Music - Topic',
'uploader': 'Royalty Free Music - Topic',
+ 'modified_date': r're:\d{8}',
+ 'channel_id': 'UC9ALqqC4aIeG5iDs7i90Bfw',
+ 'description': '',
+ 'channel_url': 'https://www.youtube.com/channel/UC9ALqqC4aIeG5iDs7i90Bfw',
+ 'tags': [],
+ 'channel': 'Royalty Free Music - Topic',
+ 'view_count': int,
+ 'uploader_url': 'https://www.youtube.com/channel/UC9ALqqC4aIeG5iDs7i90Bfw',
},
'expected_warnings': [
- 'A channel/user page was given',
- 'The URL does not have a videos tab',
+ 'does not have a videos tab',
+ r'[Uu]navailable videos (are|will be) hidden',
],
'playlist_mincount': 101,
'params': {
@@ -4390,18 +4973,10 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
return False if YoutubeIE.suitable(url) else super(
YoutubeTabIE, cls).suitable(url)
- def _real_extract(self, url):
- url, smuggled_data = unsmuggle_url(url, {})
- if self.is_music_url(url):
- smuggled_data['is_music_url'] = True
- info_dict = self.__real_extract(url, smuggled_data)
- if info_dict.get('entries'):
- info_dict['entries'] = self._smuggle_data(info_dict['entries'], smuggled_data)
- return info_dict
-
- _URL_RE = re.compile(rf'(?P<pre>{_VALID_URL})(?(channel_type)(?P<tab>/\w+))?(?P<post>.*)$')
+ _URL_RE = re.compile(rf'(?P<pre>{_VALID_URL})(?(not_channel)|(?P<tab>/\w+))?(?P<post>.*)$')
- def __real_extract(self, url, smuggled_data):
+ @YoutubeTabBaseInfoExtractor.passthrough_smuggled_data
+ def _real_extract(self, url, smuggled_data):
item_id = self._match_id(url)
url = compat_urlparse.urlunparse(
compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com'))
@@ -4431,6 +5006,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
elif mobj['channel_type'] == 'browse': # Youtube music /browse/ should be changed to /channel/
pre = f'https://www.youtube.com/channel/{item_id}'
+ original_tab_name = tab
if is_channel and not tab and 'no-youtube-channel-redirect' not in compat_opts:
# Home URLs should redirect to /videos/
redirect_warning = ('A channel/user page was given. All the channel\'s videos will be downloaded. '
@@ -4465,29 +5041,35 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
tabs = traverse_obj(data, ('contents', 'twoColumnBrowseResultsRenderer', 'tabs'), expected_type=list)
if tabs:
selected_tab = self._extract_selected_tab(tabs)
- tab_name = selected_tab.get('title', '')
+ selected_tab_name = selected_tab.get('title', '').lower()
+ if selected_tab_name == 'home':
+ selected_tab_name = 'featured'
+ requested_tab_name = mobj['tab'][1:]
if 'no-youtube-channel-redirect' not in compat_opts:
- if mobj['tab'] == '/live':
+ if requested_tab_name == 'live':
# Live tab should have redirected to the video
raise ExtractorError('The channel is not currently live', expected=True)
- if mobj['tab'] == '/videos' and tab_name.lower() != mobj['tab'][1:]:
- redirect_warning = f'The URL does not have a {mobj["tab"][1:]} tab'
- if not mobj['not_channel'] and item_id[:2] == 'UC':
- # Topic channels don't have /videos. Use the equivalent playlist instead
- pl_id = f'UU{item_id[2:]}'
- pl_url = f'https://www.youtube.com/playlist?list={pl_id}'
- try:
- data, ytcfg = self._extract_data(pl_url, pl_id, ytcfg=ytcfg, fatal=True)
- except ExtractorError:
- redirect_warning += ' and the playlist redirect gave error'
- else:
- item_id, url, tab_name = pl_id, pl_url, mobj['tab'][1:]
- redirect_warning += f'. Redirecting to playlist {pl_id} instead'
- if tab_name.lower() != mobj['tab'][1:]:
- redirect_warning += f'. {tab_name} tab is being downloaded instead'
+ if requested_tab_name not in ('', selected_tab_name):
+ redirect_warning = f'The channel does not have a {requested_tab_name} tab'
+ if not original_tab_name:
+ if item_id[:2] == 'UC':
+ # Topic channels don't have /videos. Use the equivalent playlist instead
+ pl_id = f'UU{item_id[2:]}'
+ pl_url = f'https://www.youtube.com/playlist?list={pl_id}'
+ try:
+ data, ytcfg = self._extract_data(pl_url, pl_id, ytcfg=ytcfg, fatal=True, webpage_fatal=True)
+ except ExtractorError:
+ redirect_warning += ' and the playlist redirect gave error'
+ else:
+ item_id, url, selected_tab_name = pl_id, pl_url, requested_tab_name
+ redirect_warning += f'. Redirecting to playlist {pl_id} instead'
+ if selected_tab_name and selected_tab_name != requested_tab_name:
+ redirect_warning += f'. {selected_tab_name} tab is being downloaded instead'
+ else:
+ raise ExtractorError(redirect_warning, expected=True)
if redirect_warning:
- self.report_warning(redirect_warning)
+ self.to_screen(redirect_warning)
self.write_debug(f'Final URL: {url}')
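# Sketch of the topic-channel fallback above: a channel id maps to its uploads
# playlist by swapping the 'UC' prefix for 'UU', e.g.
#   UC9ALqqC4aIeG5iDs7i90Bfw -> https://www.youtube.com/playlist?list=UU9ALqqC4aIeG5iDs7i90Bfw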
# YouTube sometimes provides a button to reload playlist with unavailable videos.
@@ -4538,9 +5120,16 @@ class YoutubePlaylistIE(InfoExtractor):
'info_dict': {
'title': '[OLD]Team Fortress 2 (Class-based LP)',
'id': 'PLBB231211A4F62143',
- 'uploader': 'Wickydoo',
+ 'uploader': 'Wickman',
'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q',
'description': 'md5:8fa6f52abb47a9552002fa3ddfc57fc2',
+ 'view_count': int,
+ 'uploader_url': 'https://www.youtube.com/user/Wickydoo',
+ 'modified_date': r're:\d{8}',
+ 'channel_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q',
+ 'channel': 'Wickman',
+ 'tags': [],
+ 'channel_url': 'https://www.youtube.com/user/Wickydoo',
},
'playlist_mincount': 29,
}, {
@@ -4560,7 +5149,16 @@ class YoutubePlaylistIE(InfoExtractor):
'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
'uploader': 'milan',
'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
- }
+ 'description': '',
+ 'channel_url': 'https://www.youtube.com/channel/UCEI1-PVPcYXjB73Hfelbmaw',
+ 'tags': [],
+ 'modified_date': '20140919',
+ 'view_count': int,
+ 'channel': 'milan',
+ 'channel_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
+ 'uploader_url': 'https://www.youtube.com/channel/UCEI1-PVPcYXjB73Hfelbmaw',
+ },
+ 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'],
}, {
'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
'playlist_mincount': 654,
@@ -4570,7 +5168,15 @@ class YoutubePlaylistIE(InfoExtractor):
'uploader': 'LBK',
'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA',
'description': 'md5:da521864744d60a198e3a88af4db0d9d',
- }
+ 'channel': 'LBK',
+ 'view_count': int,
+ 'channel_url': 'https://www.youtube.com/c/愛低音的國王',
+ 'tags': [],
+ 'uploader_url': 'https://www.youtube.com/c/愛低音的國王',
+ 'channel_id': 'UC21nz3_MesPLqtDqwdvnoxA',
+ 'modified_date': r're:\d{8}',
+ },
+ 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'],
}, {
'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
'only_matching': True,
@@ -4618,7 +5224,16 @@ class YoutubeYtBeIE(InfoExtractor):
'categories': ['Nonprofits & Activism'],
'tags': list,
'like_count': int,
- 'dislike_count': int,
+ 'age_limit': 0,
+ 'playable_in_embed': True,
+ 'thumbnail': 'https://i.ytimg.com/vi_webp/yeWKywCrFtk/maxresdefault.webp',
+ 'channel': 'Backus-Page House Museum',
+ 'channel_id': 'UCEfMCQ9bs3tjvjy1s451zaw',
+ 'live_status': 'not_live',
+ 'view_count': int,
+ 'channel_url': 'https://www.youtube.com/channel/UCEfMCQ9bs3tjvjy1s451zaw',
+ 'availability': 'public',
+ 'duration': 59,
},
'params': {
'noplaylist': True,
@@ -4641,8 +5256,24 @@ class YoutubeYtBeIE(InfoExtractor):
}), ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
+class YoutubeLivestreamEmbedIE(InfoExtractor):
+ IE_DESC = 'YouTube livestream embeds'
+ _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/embed/live_stream/?\?(?:[^#]+&)?channel=(?P<id>[^&#]+)'
+ _TESTS = [{
+ 'url': 'https://www.youtube.com/embed/live_stream?channel=UC2_KI6RB__jGdlnK6dvFEZA',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ channel_id = self._match_id(url)
+ return self.url_result(
+ f'https://www.youtube.com/channel/{channel_id}/live',
+ ie=YoutubeTabIE.ie_key(), video_id=channel_id)
+
+
class YoutubeYtUserIE(InfoExtractor):
IE_DESC = 'YouTube user videos; "ytuser:" prefix'
+ IE_NAME = 'youtube:user'
_VALID_URL = r'ytuser:(?P<id>.+)'
_TESTS = [{
'url': 'ytuser:phihag',
@@ -4680,7 +5311,14 @@ class YoutubeSearchIE(YoutubeTabBaseInfoExtractor, SearchInfoExtractor):
IE_NAME = 'youtube:search'
_SEARCH_KEY = 'ytsearch'
_SEARCH_PARAMS = 'EgIQAQ%3D%3D' # Videos only
- _TESTS = []
+ _TESTS = [{
+ 'url': 'ytsearch5:youtube-dl test video',
+ 'playlist_count': 5,
+ 'info_dict': {
+ 'id': 'youtube-dl test video',
+ 'title': 'youtube-dl test video',
+ }
+ }]
class YoutubeSearchDateIE(YoutubeTabBaseInfoExtractor, SearchInfoExtractor):
@@ -4688,12 +5326,20 @@ class YoutubeSearchDateIE(YoutubeTabBaseInfoExtractor, SearchInfoExtractor):
_SEARCH_KEY = 'ytsearchdate'
IE_DESC = 'YouTube search, newest videos first'
_SEARCH_PARAMS = 'CAISAhAB' # Videos only, sorted by date
+ _TESTS = [{
+ 'url': 'ytsearchdate5:youtube-dl test video',
+ 'playlist_count': 5,
+ 'info_dict': {
+ 'id': 'youtube-dl test video',
+ 'title': 'youtube-dl test video',
+ }
+ }]
class YoutubeSearchURLIE(YoutubeTabBaseInfoExtractor):
IE_DESC = 'YouTube search URLs with sorting and filter support'
IE_NAME = YoutubeSearchIE.IE_NAME + '_url'
- _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?:[^&]+)(?:[&]|$)'
+ _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:results|search)\?([^#]+&)?(?:search_query|q)=(?:[^&]+)(?:[&#]|$)'
_TESTS = [{
'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
'playlist_mincount': 5,
@@ -4720,7 +5366,60 @@ class YoutubeSearchURLIE(YoutubeTabBaseInfoExtractor):
return self.playlist_result(self._search_results(query, qs.get('sp', (None,))[0]), query, query)
-class YoutubeFeedsInfoExtractor(YoutubeTabIE):
+class YoutubeMusicSearchURLIE(YoutubeTabBaseInfoExtractor):
+ IE_DESC = 'YouTube music search URLs with selectable sections (Eg: #songs)'
+ IE_NAME = 'youtube:music:search_url'
+ _VALID_URL = r'https?://music\.youtube\.com/search\?([^#]+&)?(?:search_query|q)=(?:[^&]+)(?:[&#]|$)'
+ _TESTS = [{
+ 'url': 'https://music.youtube.com/search?q=royalty+free+music',
+ 'playlist_count': 16,
+ 'info_dict': {
+ 'id': 'royalty free music',
+ 'title': 'royalty free music',
+ }
+ }, {
+ 'url': 'https://music.youtube.com/search?q=royalty+free+music&sp=EgWKAQIIAWoKEAoQAxAEEAkQBQ%3D%3D',
+ 'playlist_mincount': 30,
+ 'info_dict': {
+ 'id': 'royalty free music - songs',
+ 'title': 'royalty free music - songs',
+ },
+ 'params': {'extract_flat': 'in_playlist'}
+ }, {
+ 'url': 'https://music.youtube.com/search?q=royalty+free+music#community+playlists',
+ 'playlist_mincount': 30,
+ 'info_dict': {
+ 'id': 'royalty free music - community playlists',
+ 'title': 'royalty free music - community playlists',
+ },
+ 'params': {'extract_flat': 'in_playlist'}
+ }]
+
+ _SECTIONS = {
+ 'albums': 'EgWKAQIYAWoKEAoQAxAEEAkQBQ==',
+ 'artists': 'EgWKAQIgAWoKEAoQAxAEEAkQBQ==',
+ 'community playlists': 'EgeKAQQoAEABagoQChADEAQQCRAF',
+ 'featured playlists': 'EgeKAQQoADgBagwQAxAJEAQQDhAKEAU==',
+ 'songs': 'EgWKAQIIAWoKEAoQAxAEEAkQBQ==',
+ 'videos': 'EgWKAQIQAWoKEAoQAxAEEAkQBQ==',
+ }
+
+ def _real_extract(self, url):
+ qs = parse_qs(url)
+ query = (qs.get('search_query') or qs.get('q'))[0]
+ params = qs.get('sp', (None,))[0]
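+ # Prefer an explicit "sp" search parameter; otherwise map the URL fragment (e.g. "#songs") to a known section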
+ if params:
+ section = next((k for k, v in self._SECTIONS.items() if v == params), params)
+ else:
+ section = compat_urllib_parse_unquote_plus((url.split('#') + [''])[1]).lower()
+ params = self._SECTIONS.get(section)
+ if not params:
+ section = None
+ title = join_nonempty(query, section, delim=' - ')
+ return self.playlist_result(self._search_results(query, params, default_client='web_music'), title, title)
+
+
+class YoutubeFeedsInfoExtractor(InfoExtractor):
"""
Base class for feed extractors
Subclasses must define the _FEED_NAME property.
@@ -4734,8 +5433,7 @@ class YoutubeFeedsInfoExtractor(YoutubeTabIE):
def _real_extract(self, url):
return self.url_result(
- 'https://www.youtube.com/feed/%s' % self._FEED_NAME,
- ie=YoutubeTabIE.ie_key())
+ f'https://www.youtube.com/feed/{self._FEED_NAME}', ie=YoutubeTabIE.ie_key())
class YoutubeWatchLaterIE(InfoExtractor):
diff --git a/yt_dlp/extractor/zdf.py b/yt_dlp/extractor/zdf.py
index 6f7f801e1..0aa5184f7 100644
--- a/yt_dlp/extractor/zdf.py
+++ b/yt_dlp/extractor/zdf.py
@@ -136,18 +136,18 @@ class ZDFBaseIE(InfoExtractor):
class ZDFIE(ZDFBaseIE):
_VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)\.html'
_TESTS = [{
- # Same as https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html
- 'url': 'https://www.zdf.de/dokumentation/ab-18/10-wochen-sommer-102.html',
- 'md5': '0aff3e7bc72c8813f5e0fae333316a1d',
+ 'url': 'https://www.zdf.de/nachrichten/heute-journal/heute-journal-vom-30-12-2021-100.html',
'info_dict': {
- 'id': '141007_ab18_10wochensommer_film',
+ 'id': '211230_sendung_hjo',
'ext': 'mp4',
- 'title': 'Ab 18! - 10 Wochen Sommer',
- 'description': 'md5:8253f41dc99ce2c3ff892dac2d65fe26',
- 'duration': 2660,
- 'timestamp': 1608604200,
- 'upload_date': '20201222',
- },
+ 'description': 'md5:47dff85977bde9fb8cba9e9c9b929839',
+ 'duration': 1890.0,
+ 'upload_date': '20211230',
+ 'chapters': list,
+ 'thumbnail': 'md5:e65f459f741be5455c952cd820eb188e',
+ 'title': 'heute journal vom 30.12.2021',
+ 'timestamp': 1640897100,
+ }
}, {
'url': 'https://www.zdf.de/dokumentation/terra-x/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100.html',
'info_dict': {
@@ -158,6 +158,7 @@ class ZDFIE(ZDFBaseIE):
'duration': 2615,
'timestamp': 1465021200,
'upload_date': '20160604',
+ 'thumbnail': 'https://www.zdf.de/assets/mauve-im-labor-100~768x432?cb=1464909117806',
},
}, {
'url': 'https://www.zdf.de/funk/druck-11790/funk-alles-ist-verzaubert-102.html',
@@ -169,7 +170,8 @@ class ZDFIE(ZDFBaseIE):
'description': 'Die Neue an der Schule verdreht Ismail den Kopf.',
'title': 'Alles ist verzaubert',
'timestamp': 1635520560,
- 'upload_date': '20211029'
+ 'upload_date': '20211029',
+ 'thumbnail': 'https://www.zdf.de/assets/teaser-funk-alles-ist-verzaubert-100~1920x1080?cb=1636466431799',
},
}, {
# Same as https://www.phoenix.de/sendungen/dokumentationen/gesten-der-maechtigen-i-a-89468.html?ref=suche
@@ -196,6 +198,10 @@ class ZDFIE(ZDFBaseIE):
# Same as https://www.phoenix.de/sendungen/ereignisse/corona-nachgehakt/wohin-fuehrt-der-protest-in-der-pandemie-a-2050630.html
'url': 'https://www.zdf.de/politik/phoenix-sendungen/wohin-fuehrt-der-protest-in-der-pandemie-100.html',
'only_matching': True
+ }, {
+ # Same as https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html
+ 'url': 'https://www.zdf.de/dokumentation/ab-18/10-wochen-sommer-102.html',
+ 'only_matching': True
}]
def _extract_entry(self, url, player, content, video_id):
@@ -234,12 +240,21 @@ class ZDFIE(ZDFBaseIE):
})
thumbnails.append(thumbnail)
+ chapter_marks = t.get('streamAnchorTag') or []
+ chapter_marks.append({'anchorOffset': int_or_none(t.get('duration'))})
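+ # Pair each anchor with the next one; the sentinel appended above makes the last chapter end at the video duration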
+ chapters = [{
+ 'start_time': chap.get('anchorOffset'),
+ 'end_time': next_chap.get('anchorOffset'),
+ 'title': chap.get('anchorLabel')
+ } for chap, next_chap in zip(chapter_marks, chapter_marks[1:])]
+
return merge_dicts(info, {
'title': title,
'description': content.get('leadParagraph') or content.get('teasertext'),
'duration': int_or_none(t.get('duration')),
'timestamp': unified_timestamp(content.get('editorialDate')),
'thumbnails': thumbnails,
+ 'chapters': chapters or None
})
def _extract_regular(self, url, player, video_id):
diff --git a/yt_dlp/extractor/zee5.py b/yt_dlp/extractor/zee5.py
index a3a705bdd..55c225d85 100644
--- a/yt_dlp/extractor/zee5.py
+++ b/yt_dlp/extractor/zee5.py
@@ -88,6 +88,7 @@ class Zee5IE(InfoExtractor):
_USER_TOKEN = None
_LOGIN_HINT = 'Use "--username <mobile_number>" to login using otp or "--username token" and "--password <user_token>" to login using user token.'
_NETRC_MACHINE = 'zee5'
+ _GEO_COUNTRIES = ['IN']
def _login(self):
username, password = self._get_login_info()
@@ -110,6 +111,8 @@ class Zee5IE(InfoExtractor):
raise ExtractorError(otp_request_json['message'], expected=True)
elif username.lower() == 'token' and len(password) > 1198:
self._USER_TOKEN = password
+ else:
+ raise ExtractorError(self._LOGIN_HINT, expected=True)
def _real_initialize(self):
self._login()
diff --git a/yt_dlp/extractor/zhihu.py b/yt_dlp/extractor/zhihu.py
index d1ed55be3..278a9438e 100644
--- a/yt_dlp/extractor/zhihu.py
+++ b/yt_dlp/extractor/zhihu.py
@@ -2,7 +2,7 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import float_or_none, int_or_none
+from ..utils import format_field, float_or_none, int_or_none
class ZhihuIE(InfoExtractor):
@@ -61,7 +61,7 @@ class ZhihuIE(InfoExtractor):
'uploader': author.get('name'),
'timestamp': int_or_none(zvideo.get('published_at')),
'uploader_id': author.get('id'),
- 'uploader_url': 'https://www.zhihu.com/people/' + url_token if url_token else None,
+ 'uploader_url': format_field(url_token, template='https://www.zhihu.com/people/%s'),
'duration': float_or_none(video.get('duration')),
'view_count': int_or_none(zvideo.get('play_count')),
'like_count': int_or_none(zvideo.get('liked_count')),
diff --git a/yt_dlp/options.py b/yt_dlp/options.py
index 971c51515..0086c3619 100644
--- a/yt_dlp/options.py
+++ b/yt_dlp/options.py
@@ -13,10 +13,11 @@ from .compat import (
compat_shlex_split,
)
from .utils import (
+ Config,
expand_path,
get_executable_path,
OUTTMPL_TYPES,
- preferredencoding,
+ POSTPROCESS_WHEN,
remove_end,
write_string,
)
@@ -34,39 +35,16 @@ from .postprocessor import (
from .postprocessor.modify_chapters import DEFAULT_SPONSORBLOCK_CHAPTER_TITLE
-def _hide_login_info(opts):
- PRIVATE_OPTS = set(['-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'])
- eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
-
- def _scrub_eq(o):
- m = eqre.match(o)
- if m:
- return m.group('key') + '=PRIVATE'
- else:
- return o
-
- opts = list(map(_scrub_eq, opts))
- for idx, opt in enumerate(opts):
- if opt in PRIVATE_OPTS and idx + 1 < len(opts):
- opts[idx + 1] = 'PRIVATE'
- return opts
+def parseOpts(overrideArguments=None, ignore_config_files='if_override'):
+ parser = create_parser()
+ root = Config(parser)
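+ # Every argument source (override/command-line args and config files) is layered through this single Config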
-
-def parseOpts(overrideArguments=None):
- def _readOptions(filename_bytes, default=[]):
- try:
- optionf = open(filename_bytes)
- except IOError:
- return default # silently skip if file is not present
- try:
- # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
- contents = optionf.read()
- if sys.version_info < (3,):
- contents = contents.decode(preferredencoding())
- res = compat_shlex_split(contents, comments=True)
- finally:
- optionf.close()
- return res
+ if ignore_config_files == 'if_override':
+ ignore_config_files = overrideArguments is not None
+ if overrideArguments:
+ root.append_config(overrideArguments, label='Override')
+ else:
+ root.append_config(sys.argv[1:], label='Command-line')
def _readUserConf(package_name, default=[]):
# .config
@@ -74,7 +52,7 @@ def parseOpts(overrideArguments=None):
userConfFile = os.path.join(xdg_config_home, package_name, 'config')
if not os.path.isfile(userConfFile):
userConfFile = os.path.join(xdg_config_home, '%s.conf' % package_name)
- userConf = _readOptions(userConfFile, default=None)
+ userConf = Config.read_file(userConfFile, default=None)
if userConf is not None:
return userConf, userConfFile
@@ -82,24 +60,64 @@ def parseOpts(overrideArguments=None):
appdata_dir = compat_getenv('appdata')
if appdata_dir:
userConfFile = os.path.join(appdata_dir, package_name, 'config')
- userConf = _readOptions(userConfFile, default=None)
+ userConf = Config.read_file(userConfFile, default=None)
if userConf is None:
userConfFile += '.txt'
- userConf = _readOptions(userConfFile, default=None)
+ userConf = Config.read_file(userConfFile, default=None)
if userConf is not None:
return userConf, userConfFile
# home
userConfFile = os.path.join(compat_expanduser('~'), '%s.conf' % package_name)
- userConf = _readOptions(userConfFile, default=None)
+ userConf = Config.read_file(userConfFile, default=None)
if userConf is None:
userConfFile += '.txt'
- userConf = _readOptions(userConfFile, default=None)
+ userConf = Config.read_file(userConfFile, default=None)
if userConf is not None:
return userConf, userConfFile
return default, None
+ def add_config(label, path, user=False):
+ """ Adds config and returns whether to continue """
+ if root.parse_args()[0].ignoreconfig:
+ return False
+ # Multiple package names can be given here
+ # Eg: ('yt-dlp', 'youtube-dlc', 'youtube-dl') will look for
+ # the configuration file of any of these three packages
+ for package in ('yt-dlp',):
+ if user:
+ args, current_path = _readUserConf(package, default=None)
+ else:
+ current_path = os.path.join(path, '%s.conf' % package)
+ args = Config.read_file(current_path, default=None)
+ if args is not None:
+ root.append_config(args, current_path, label=label)
+ return True
+ return True
+
+ def load_configs():
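+ # Each step yields False once --ignore-config takes effect; all() consumes lazily, so later configs are never read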
+ yield not ignore_config_files
+ yield add_config('Portable', get_executable_path())
+ yield add_config('Home', expand_path(root.parse_args()[0].paths.get('home', '')).strip())
+ yield add_config('User', None, user=True)
+ yield add_config('System', '/etc')
+
+ if all(load_configs()):
+ # If ignoreconfig is found inside the system configuration file,
+ # the user configuration is removed
+ if root.parse_args()[0].ignoreconfig:
+ user_conf = next((i for i, conf in enumerate(root.configs) if conf.label == 'User'), None)
+ if user_conf is not None:
+ root.configs.pop(user_conf)
+
+ opts, args = root.parse_args()
+ if opts.verbose:
+ write_string(f'\n{root}'.replace('\n| ', '\n[debug] ')[1:] + '\n')
+ return parser, opts, args
+
+
+def create_parser():
def _format_option_string(option):
''' ('-o', '--option') -> -o, --option METAVAR'''
@@ -119,7 +137,7 @@ def parseOpts(overrideArguments=None):
def _list_from_options_callback(option, opt_str, value, parser, append=True, delim=',', process=str.strip):
# append can be True, False or -1 (prepend)
- current = getattr(parser.values, option.dest) if append else []
+ current = list(getattr(parser.values, option.dest)) if append else []
value = list(filter(None, [process(value)] if delim is None else map(process, value.split(delim))))
setattr(
parser.values, option.dest,
@@ -128,7 +146,7 @@ def parseOpts(overrideArguments=None):
def _set_from_options_callback(
option, opt_str, value, parser, delim=',', allowed_values=None, aliases={},
process=lambda x: x.lower().strip()):
- current = getattr(parser.values, option.dest)
+ current = set(getattr(parser.values, option.dest))
values = [process(value)] if delim is None else list(map(process, value.split(delim)[::-1]))
while values:
actual_val = val = values.pop()
@@ -152,14 +170,19 @@ def parseOpts(overrideArguments=None):
def _dict_from_options_callback(
option, opt_str, value, parser,
allowed_keys=r'[\w-]+', delimiter=':', default_key=None, process=None, multiple_keys=True,
- process_key=str.lower):
+ process_key=str.lower, append=False):
- out_dict = getattr(parser.values, option.dest)
+ out_dict = dict(getattr(parser.values, option.dest))
+ multiple_args = not isinstance(value, str)
if multiple_keys:
allowed_keys = r'(%s)(,(%s))*' % (allowed_keys, allowed_keys)
- mobj = re.match(r'(?i)(?P<keys>%s)%s(?P<val>.*)$' % (allowed_keys, delimiter), value)
+ mobj = re.match(
+ r'(?i)(?P<keys>%s)%s(?P<val>.*)$' % (allowed_keys, delimiter),
+ value[0] if multiple_args else value)
if mobj is not None:
keys, val = mobj.group('keys').split(','), mobj.group('val')
+ if multiple_args:
+ val = [val, *value[1:]]
elif default_key is not None:
keys, val = [default_key], value
else:
@@ -171,7 +194,8 @@ def parseOpts(overrideArguments=None):
except Exception as err:
raise optparse.OptionValueError(f'wrong {opt_str} formatting; {err}')
for key in keys:
- out_dict[key] = val
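+ # With append=True, repeated options accumulate their values in a list instead of overwriting the key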
+ out_dict[key] = out_dict.get(key, []) + [val] if append else val
+ setattr(parser.values, option.dest, out_dict)
# No need to wrap help messages if we're on a wide console
columns = compat_get_terminal_size().columns
@@ -238,14 +262,21 @@ def parseOpts(overrideArguments=None):
'--ignore-config', '--no-config',
action='store_true', dest='ignoreconfig',
help=(
- 'Disable loading any configuration files except the one provided by --config-location. '
- 'When given inside a configuration file, no further configuration files are loaded. '
- 'Additionally, (for backward compatibility) if this option is found inside the '
- 'system configuration file, the user configuration is not loaded'))
+ 'Don\'t load any more configuration files except those given by --config-locations. '
+ 'For backward compatibility, if this option is found inside the system configuration file, the user configuration is not loaded. '
+ '(Alias: --no-config)'))
general.add_option(
- '--config-location',
- dest='config_location', metavar='PATH',
- help='Location of the main configuration file; either the path to the config or its containing directory')
+ '--no-config-locations',
+ action='store_const', dest='config_locations', const=[],
+ help=(
+ 'Do not load any custom configuration files (default). When given inside a '
+ 'configuration file, ignore all previous --config-locations defined in the current file'))
+ general.add_option(
+ '--config-locations',
+ dest='config_locations', metavar='PATH', action='append',
+ help=(
+ 'Location of the main configuration file; either the path to the config or its containing directory. '
+ 'Can be used multiple times and inside other configuration files'))
general.add_option(
'--flat-playlist',
action='store_const', dest='extract_flat', const='in_playlist', default=False,
@@ -257,7 +288,7 @@ def parseOpts(overrideArguments=None):
general.add_option(
'--live-from-start',
action='store_true', dest='live_from_start',
- help='Download livestreams from the start. Currently only supported for YouTube')
+ help='Download livestreams from the start. Currently only supported for YouTube (Experimental)')
general.add_option(
'--no-live-from-start',
action='store_false', dest='live_from_start',
@@ -310,7 +341,7 @@ def parseOpts(overrideArguments=None):
help=(
'Use the specified HTTP/HTTPS/SOCKS proxy. To enable '
'SOCKS proxy, specify a proper scheme. For example '
- 'socks5://127.0.0.1:1080/. Pass in an empty string (--proxy "") '
+ 'socks5://user:pass@127.0.0.1:1080/. Pass in an empty string (--proxy "") '
'for direct connection'))
network.add_option(
'--socket-timeout',
@@ -346,10 +377,10 @@ def parseOpts(overrideArguments=None):
geo.add_option(
'--geo-bypass',
action='store_true', dest='geo_bypass', default=True,
- help='Bypass geographic restriction via faking X-Forwarded-For HTTP header')
+ help='Bypass geographic restriction via faking X-Forwarded-For HTTP header (default)')
geo.add_option(
'--no-geo-bypass',
- action='store_false', dest='geo_bypass', default=True,
+ action='store_false', dest='geo_bypass',
help='Do not bypass geographic restriction via faking X-Forwarded-For HTTP header')
geo.add_option(
'--geo-bypass-country', metavar='CODE',
@@ -782,7 +813,7 @@ def parseOpts(overrideArguments=None):
metavar='NAME:ARGS', dest='external_downloader_args', default={}, type='str',
action='callback', callback=_dict_from_options_callback,
callback_kwargs={
- 'allowed_keys': r'ffmpeg_[io]\d*|%s' % '|'.join(list_external_downloaders()),
+ 'allowed_keys': r'ffmpeg_[io]\d*|%s' % '|'.join(map(re.escape, list_external_downloaders())),
'default_key': 'default',
'process': compat_shlex_split
}, help=(
@@ -798,6 +829,10 @@ def parseOpts(overrideArguments=None):
dest='encoding', metavar='ENCODING',
help='Force the specified encoding (experimental)')
workarounds.add_option(
+ '--legacy-server-connect',
+ action='store_true', dest='legacy_server_connect', default=False,
+ help='Explicitly allow HTTPS connection to servers that do not support RFC 5746 secure renegotiation')
+ workarounds.add_option(
'--no-check-certificates',
action='store_true', dest='no_check_certificate', default=False,
help='Suppress HTTPS certificate validation')
@@ -878,10 +913,29 @@ def parseOpts(overrideArguments=None):
help='Do not download the video but write all related files (Alias: --no-download)')
verbosity.add_option(
'-O', '--print',
- metavar='TEMPLATE', action='append', dest='forceprint',
- help=(
- 'Quiet, but print the given fields for each video. Simulate unless --no-simulate is used. '
- 'Either a field name or same syntax as the output template can be used'))
+ metavar='[WHEN:]TEMPLATE', dest='forceprint', default={}, type='str',
+ action='callback', callback=_dict_from_options_callback,
+ callback_kwargs={
+ 'allowed_keys': 'video|' + '|'.join(map(re.escape, POSTPROCESS_WHEN)),
+ 'default_key': 'video',
+ 'multiple_keys': False,
+ 'append': True,
+ }, help=(
+ 'Field name or output template to print to screen, optionally prefixed with when to print it, separated by a ":". '
+ 'Supported values of "WHEN" are the same as that of --use-postprocessor, and "video" (default). '
+ 'Implies --quiet and --simulate (unless --no-simulate is used). This option can be used multiple times'))
+ verbosity.add_option(
+ '--print-to-file',
+ metavar='[WHEN:]TEMPLATE FILE', dest='print_to_file', default={}, type='str', nargs=2,
+ action='callback', callback=_dict_from_options_callback,
+ callback_kwargs={
+ 'allowed_keys': 'video|' + '|'.join(map(re.escape, POSTPROCESS_WHEN)),
+ 'default_key': 'video',
+ 'multiple_keys': False,
+ 'append': True,
+ }, help=(
+ 'Append the given template to the file. The values of WHEN and TEMPLATE are the same as those of --print. '
+ 'FILE uses the same syntax as the output template. This option can be used multiple times'))
verbosity.add_option(
'-g', '--get-url',
action='store_true', dest='geturl', default=False,
@@ -1014,7 +1068,7 @@ def parseOpts(overrideArguments=None):
metavar='[TYPES:]PATH', dest='paths', default={}, type='str',
action='callback', callback=_dict_from_options_callback,
callback_kwargs={
- 'allowed_keys': 'home|temp|%s' % '|'.join(OUTTMPL_TYPES.keys()),
+ 'allowed_keys': 'home|temp|%s' % '|'.join(map(re.escape, OUTTMPL_TYPES.keys())),
'default_key': 'home'
}, help=(
'The paths where the files should be downloaded. '
@@ -1029,7 +1083,7 @@ def parseOpts(overrideArguments=None):
metavar='[TYPES:]TEMPLATE', dest='outtmpl', default={}, type='str',
action='callback', callback=_dict_from_options_callback,
callback_kwargs={
- 'allowed_keys': '|'.join(OUTTMPL_TYPES.keys()),
+ 'allowed_keys': '|'.join(map(re.escape, OUTTMPL_TYPES.keys())),
'default_key': 'default'
}, help='Output filename template; see "OUTPUT TEMPLATE" for details')
filesystem.add_option(
@@ -1266,7 +1320,8 @@ def parseOpts(overrideArguments=None):
metavar='NAME:ARGS', dest='postprocessor_args', default={}, type='str',
action='callback', callback=_dict_from_options_callback,
callback_kwargs={
- 'allowed_keys': r'\w+(?:\+\w+)?', 'default_key': 'default-compat',
+ 'allowed_keys': r'\w+(?:\+\w+)?',
+ 'default_key': 'default-compat',
'process': compat_shlex_split,
'multiple_keys': False
}, help=(
@@ -1360,6 +1415,16 @@ def parseOpts(overrideArguments=None):
action='store_true', dest='xattrs', default=False,
help='Write metadata to the video file\'s xattrs (using dublin core and xdg standards)')
postproc.add_option(
+ '--concat-playlist',
+ metavar='POLICY', dest='concat_playlist', default='multi_video',
+ choices=('never', 'always', 'multi_video'),
+ help=(
+ 'Concatenate videos in a playlist. One of "never", "always", or '
+ '"multi_video" (default; only when the videos form a single show). '
+ 'All the video files must have the same codecs and number of streams to be concatenated. '
+ 'The "pl_video:" prefix can be used with "--paths" and "--output" to '
+ 'set the output filename for the concatenated files. See "OUTPUT TEMPLATE" for details'))
+ postproc.add_option(
'--fixup',
metavar='POLICY', dest='fixup', default=None,
choices=('never', 'ignore', 'warn', 'detect_or_warn', 'force'),
@@ -1381,29 +1446,33 @@ def parseOpts(overrideArguments=None):
dest='ffmpeg_location',
help='Location of the ffmpeg binary; either the path to the binary or its containing directory')
postproc.add_option(
- '--exec', metavar='CMD',
- action='append', dest='exec_cmd',
- help=(
- 'Execute a command on the file after downloading and post-processing. '
+ '--exec',
+ metavar='[WHEN:]CMD', dest='exec_cmd', default={}, type='str',
+ action='callback', callback=_dict_from_options_callback,
+ callback_kwargs={
+ 'allowed_keys': '|'.join(map(re.escape, POSTPROCESS_WHEN)),
+ 'default_key': 'after_move',
+ 'multiple_keys': False,
+ 'append': True,
+ }, help=(
+ 'Execute a command, optionally prefixed with when to execute it (after_move if unspecified), separated by a ":". '
+ 'Supported values of "WHEN" are the same as that of --use-postprocessor. '
'Same syntax as the output template can be used to pass any field as arguments to the command. '
- 'An additional field "filepath" that contains the final path of the downloaded file is also available. '
- 'If no fields are passed, %(filepath)q is appended to the end of the command. '
+ 'After download, an additional field "filepath" that contains the final path of the downloaded file '
+ 'is also available, and if no fields are passed, %(filepath)q is appended to the end of the command. '
'This option can be used multiple times'))
postproc.add_option(
'--no-exec',
- action='store_const', dest='exec_cmd', const=[],
+ action='store_const', dest='exec_cmd', const={},
help='Remove any previously defined --exec')
postproc.add_option(
'--exec-before-download', metavar='CMD',
action='append', dest='exec_before_dl_cmd',
- help=(
- 'Execute a command before the actual download. '
- 'The syntax is the same as --exec but "filepath" is not available. '
- 'This option can be used multiple times'))
+ help=optparse.SUPPRESS_HELP)
postproc.add_option(
'--no-exec-before-download',
action='store_const', dest='exec_before_dl_cmd', const=[],
- help='Remove any previously defined --exec-before-download')
+ help=optparse.SUPPRESS_HELP)
postproc.add_option(
'--convert-subs', '--convert-sub', '--convert-subtitles',
metavar='FORMAT', dest='convertsubtitles', default=None,
@@ -1465,8 +1534,10 @@ def parseOpts(overrideArguments=None):
'ARGS are a semicolon ";" delimited list of NAME=VALUE. '
'The "when" argument determines when the postprocessor is invoked. '
'It can be one of "pre_process" (after extraction), '
- '"before_dl" (before video download), "post_process" (after video download; default) '
- 'or "after_move" (after moving file to their final locations). '
+ '"before_dl" (before video download), "post_process" (after video download; default), '
+ '"after_move" (after moving file to their final locations), '
+ '"after_video" (after downloading and processing all formats of a video), '
+ 'or "playlist" (end of playlist). '
'This option can be used multiple times to add different postprocessors'))
sponsorblock = optparse.OptionGroup(parser, 'SponsorBlock Options', description=(
@@ -1568,7 +1639,8 @@ def parseOpts(overrideArguments=None):
'--no-hls-split-discontinuity',
dest='hls_split_discontinuity', action='store_false',
help='Do not split HLS playlists to different formats at discontinuities such as ad breaks (default)')
- _extractor_arg_parser = lambda key, vals='': (key.strip().lower().replace('-', '_'), [val.strip() for val in vals.split(',')])
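+ # Split only on unescaped commas, so individual values may contain a literal "," escaped as "\,"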
+ _extractor_arg_parser = lambda key, vals='': (key.strip().lower().replace('-', '_'), [
+ val.replace(r'\,', ',').strip() for val in re.split(r'(?<!\\),', vals)])
extractor.add_option(
'--extractor-args',
metavar='KEY:ARGS', dest='extractor_args', default={}, type='str',
@@ -1614,75 +1686,11 @@ def parseOpts(overrideArguments=None):
parser.add_option_group(sponsorblock)
parser.add_option_group(extractor)
- if overrideArguments is not None:
- opts, args = parser.parse_args(overrideArguments)
- if opts.verbose:
- write_string('[debug] Override config: ' + repr(overrideArguments) + '\n')
- else:
- def compat_conf(conf):
- if sys.version_info < (3,):
- return [a.decode(preferredencoding(), 'replace') for a in conf]
- return conf
-
- configs = {
- 'command-line': compat_conf(sys.argv[1:]),
- 'custom': [], 'home': [], 'portable': [], 'user': [], 'system': []}
- paths = {'command-line': False}
-
- def read_options(name, path, user=False):
- ''' loads config files and returns ignoreconfig '''
- # Multiple package names can be given here
- # Eg: ('yt-dlp', 'youtube-dlc', 'youtube-dl') will look for
- # the configuration file of any of these three packages
- for package in ('yt-dlp',):
- if user:
- config, current_path = _readUserConf(package, default=None)
- else:
- current_path = os.path.join(path, '%s.conf' % package)
- config = _readOptions(current_path, default=None)
- if config is not None:
- current_path = os.path.realpath(current_path)
- if current_path in paths.values():
- return False
- configs[name], paths[name] = config, current_path
- return parser.parse_args(config)[0].ignoreconfig
- return False
-
- def get_configs():
- opts, _ = parser.parse_args(configs['command-line'])
- if opts.config_location is not None:
- location = compat_expanduser(opts.config_location)
- if os.path.isdir(location):
- location = os.path.join(location, 'yt-dlp.conf')
- if not os.path.exists(location):
- parser.error('config-location %s does not exist.' % location)
- config = _readOptions(location, default=None)
- if config:
- configs['custom'], paths['custom'] = config, location
+ return parser
- if opts.ignoreconfig:
- return
- if parser.parse_args(configs['custom'])[0].ignoreconfig:
- return
- if read_options('portable', get_executable_path()):
- return
- opts, _ = parser.parse_args(configs['portable'] + configs['custom'] + configs['command-line'])
- if read_options('home', expand_path(opts.paths.get('home', '')).strip()):
- return
- if read_options('system', '/etc'):
- return
- if read_options('user', None, user=True):
- configs['system'], paths['system'] = [], None
- get_configs()
- argv = configs['system'] + configs['user'] + configs['home'] + configs['portable'] + configs['custom'] + configs['command-line']
- opts, args = parser.parse_args(argv)
- if opts.verbose:
- for label in ('Command-line', 'Custom', 'Portable', 'Home', 'User', 'System'):
- key = label.lower()
- if paths.get(key):
- write_string(f'[debug] {label} config file: {paths[key]}\n')
- if paths.get(key) is not None:
- write_string(f'[debug] {label} config: {_hide_login_info(configs[key])!r}\n')
-
- return parser, opts, args
+def _hide_login_info(opts):
+ write_string(
+ 'DeprecationWarning: "yt_dlp.options._hide_login_info" is deprecated and may be removed in a future version. '
+ 'Use "yt_dlp.utils.Config.hide_login_info" instead\n')
+ return Config.hide_login_info(opts)
diff --git a/yt_dlp/postprocessor/__init__.py b/yt_dlp/postprocessor/__init__.py
index 7f8adb368..e47631eb6 100644
--- a/yt_dlp/postprocessor/__init__.py
+++ b/yt_dlp/postprocessor/__init__.py
@@ -7,6 +7,8 @@ from .embedthumbnail import EmbedThumbnailPP
from .exec import ExecPP, ExecAfterDownloadPP
from .ffmpeg import (
FFmpegPostProcessor,
+ FFmpegCopyStreamPP,
+ FFmpegConcatPP,
FFmpegEmbedSubtitlePP,
FFmpegExtractAudioPP,
FFmpegFixupDuplicateMoovPP,
diff --git a/yt_dlp/postprocessor/embedthumbnail.py b/yt_dlp/postprocessor/embedthumbnail.py
index e199a1cdd..84ab54f44 100644
--- a/yt_dlp/postprocessor/embedthumbnail.py
+++ b/yt_dlp/postprocessor/embedthumbnail.py
@@ -108,7 +108,7 @@ class EmbedThumbnailPP(FFmpegPostProcessor):
self.run_ffmpeg_multiple_files([filename, thumbnail_filename], temp_filename, options)
elif info['ext'] in ['mkv', 'mka']:
- options = ['-c', 'copy', '-map', '0', '-dn']
+ options = list(self.stream_copy_opts())
mimetype = 'image/%s' % ('png' if thumbnail_ext == 'png' else 'jpeg')
old_stream, new_stream = self.get_stream_number(
@@ -184,7 +184,7 @@ class EmbedThumbnailPP(FFmpegPostProcessor):
if not success:
success = True
try:
- options = ['-c', 'copy', '-map', '0', '-dn', '-map', '1']
+ options = [*self.stream_copy_opts(), '-map', '1']
old_stream, new_stream = self.get_stream_number(
filename, ('disposition', 'attached_pic'), 1)
diff --git a/yt_dlp/postprocessor/exec.py b/yt_dlp/postprocessor/exec.py
index 28a7c3d70..63f4d23f2 100644
--- a/yt_dlp/postprocessor/exec.py
+++ b/yt_dlp/postprocessor/exec.py
@@ -22,11 +22,13 @@ class ExecPP(PostProcessor):
if tmpl_dict: # if there are no replacements, tmpl_dict = {}
return self._downloader.escape_outtmpl(tmpl) % tmpl_dict
- # If no replacements are found, replace {} for backard compatibility
- if '{}' not in cmd:
- cmd += ' {}'
- return cmd.replace('{}', compat_shlex_quote(
- info.get('filepath') or info['_filename']))
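+ # Playlist results carry no filepath; only substitute/append {} when a file path actually exists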
+ filepath = info.get('filepath', info.get('_filename'))
+ # If video, and no replacements are found, replace {} for backward compatibility
+ if filepath:
+ if '{}' not in cmd:
+ cmd += ' {}'
+ cmd = cmd.replace('{}', compat_shlex_quote(filepath))
+ return cmd
def run(self, info):
for tmpl in self.exec_cmd:
diff --git a/yt_dlp/postprocessor/ffmpeg.py b/yt_dlp/postprocessor/ffmpeg.py
index 96b48ded5..42e9d12a7 100644
--- a/yt_dlp/postprocessor/ffmpeg.py
+++ b/yt_dlp/postprocessor/ffmpeg.py
@@ -1,5 +1,6 @@
from __future__ import unicode_literals
+import collections
import io
import itertools
import os
@@ -12,6 +13,7 @@ from .common import AudioConversionError, PostProcessor
from ..compat import compat_str
from ..utils import (
+ determine_ext,
dfxp2srt,
encodeArgument,
encodeFilename,
@@ -190,6 +192,18 @@ class FFmpegPostProcessor(PostProcessor):
def probe_executable(self):
return self._paths[self.probe_basename]
+ @staticmethod
+ def stream_copy_opts(copy=True, *, ext=None):
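+ # Generator of shared ffmpeg arguments, typically consumed via list(...) or [*...] unpacking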
+ yield from ('-map', '0')
+ # Don't copy Apple TV chapters track, bin_data
+ # See https://github.com/yt-dlp/yt-dlp/issues/2, #19042, #19024, https://trac.ffmpeg.org/ticket/6016
+ yield from ('-dn', '-ignore_unknown')
+ if copy:
+ yield from ('-c', 'copy')
+ # For some reason, '-c copy -map 0' is not enough to copy subtitles
+ if ext in ('mp4', 'mov'):
+ yield from ('-c:s', 'mov_text')
+
def get_audio_codec(self, path):
if not self.probe_available and not self.available:
raise PostProcessingError('ffprobe and ffmpeg not found. Please install or provide the path using --ffmpeg-location')
@@ -351,8 +365,9 @@ class FFmpegPostProcessor(PostProcessor):
timestamps = timestamps[1:]
keyframe_file = prepend_extension(filename, 'keyframes.temp')
self.to_screen(f'Re-encoding "{filename}" with appropriate keyframes')
- self.run_ffmpeg(filename, keyframe_file, ['-force_key_frames', ','.join(
- f'{t:.6f}' for t in timestamps)])
+ self.run_ffmpeg(filename, keyframe_file, [
+ *self.stream_copy_opts(False, ext=determine_ext(filename)),
+ '-force_key_frames', ','.join(f'{t:.6f}' for t in timestamps)])
return keyframe_file
def concat_files(self, in_files, out_file, concat_opts=None):
@@ -367,10 +382,7 @@ class FFmpegPostProcessor(PostProcessor):
with open(concat_file, 'wt', encoding='utf-8') as f:
f.writelines(self._concat_spec(in_files, concat_opts))
- out_flags = ['-c', 'copy']
- if out_file.rpartition('.')[-1] in ('mp4', 'mov'):
- # For some reason, '-c copy' is not enough to copy subtitles
- out_flags.extend(['-c:s', 'mov_text'])
+ out_flags = list(self.stream_copy_opts(ext=determine_ext(out_file)))
try:
self.real_run_ffmpeg(
@@ -556,7 +568,7 @@ class FFmpegVideoConvertorPP(FFmpegPostProcessor):
else f'already is in target format {source_ext}' if source_ext == target_ext
else None)
if _skip_msg:
- self.to_screen(f'Not {self._ACTION} media file {filename!r}; {_skip_msg}')
+ self.to_screen(f'Not {self._ACTION} media file "{filename}"; {_skip_msg}')
return [], info
outpath = replace_extension(filename, target_ext, source_ext)
@@ -573,7 +585,7 @@ class FFmpegVideoRemuxerPP(FFmpegVideoConvertorPP):
@staticmethod
def _options(target_ext):
- return ['-c', 'copy', '-map', '0', '-dn']
+ return FFmpegPostProcessor.stream_copy_opts()
class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
@@ -633,16 +645,11 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
input_files = [filename] + sub_filenames
opts = [
- '-c', 'copy', '-map', '0', '-dn',
+ *self.stream_copy_opts(ext=info['ext']),
# Don't copy the existing subtitles, we may be running the
# postprocessor a second time
'-map', '-0:s',
- # Don't copy Apple TV chapters track, bin_data (see #19042, #19024,
- # https://trac.ffmpeg.org/ticket/6016)
- '-map', '-0:d',
]
- if info['ext'] == 'mp4':
- opts += ['-c:s', 'mov_text']
for i, (lang, name) in enumerate(zip(sub_langs, sub_names)):
opts.extend(['-map', '%d:0' % (i + 1)])
lang_code = ISO639Utils.short2long(lang) or lang
@@ -670,11 +677,10 @@ class FFmpegMetadataPP(FFmpegPostProcessor):
@staticmethod
def _options(target_ext):
- yield from ('-map', '0', '-dn')
- if target_ext == 'm4a':
+ audio_only = target_ext == 'm4a'
+ yield from FFmpegPostProcessor.stream_copy_opts(not audio_only)
+ if audio_only:
yield from ('-vn', '-acodec', 'copy')
- else:
- yield from ('-c', 'copy')
@PostProcessor._restrict_to(images=False)
def run(self, info):
@@ -728,15 +734,15 @@ class FFmpegMetadataPP(FFmpegPostProcessor):
yield ('-map_metadata', '1')
def _get_metadata_opts(self, info):
- metadata = {}
- meta_prefix = 'meta_'
+ meta_prefix = 'meta'
+ metadata = collections.defaultdict(dict)
def add(meta_list, info_list=None):
value = next((
- str(info[key]) for key in [meta_prefix] + list(variadic(info_list or meta_list))
+ str(info[key]) for key in [f'{meta_prefix}_'] + list(variadic(info_list or meta_list))
if info.get(key) is not None), None)
if value not in ('', None):
- metadata.update({meta_f: value for meta_f in variadic(meta_list)})
+ metadata['common'].update({meta_f: value for meta_f in variadic(meta_list)})
# See [1-4] for some info on media metadata/metadata supported
# by ffmpeg.
@@ -760,22 +766,26 @@ class FFmpegMetadataPP(FFmpegPostProcessor):
add('episode_sort', 'episode_number')
if 'embed-metadata' in self.get_param('compat_opts', []):
add('comment', 'description')
- metadata.pop('synopsis', None)
+ metadata['common'].pop('synopsis', None)
+ meta_regex = rf'{re.escape(meta_prefix)}(?P<i>\d+)?_(?P<key>.+)'
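+ # "meta_<key>" entries set common metadata, while "meta<n>_<key>" targets only output stream <n>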
for key, value in info.items():
- if value is not None and key != meta_prefix and key.startswith(meta_prefix):
- metadata[key[len(meta_prefix):]] = value
+ mobj = re.fullmatch(meta_regex, key)
+ if value is not None and mobj:
+ metadata[mobj.group('i') or 'common'][mobj.group('key')] = value
- for name, value in metadata.items():
+ for name, value in metadata['common'].items():
yield ('-metadata', f'{name}={value}')
stream_idx = 0
for fmt in info.get('requested_formats') or []:
stream_count = 2 if 'none' not in (fmt.get('vcodec'), fmt.get('acodec')) else 1
- if fmt.get('language'):
- lang = ISO639Utils.short2long(fmt['language']) or fmt['language']
- for i in range(stream_count):
- yield ('-metadata:s:%d' % (stream_idx + i), 'language=%s' % lang)
+ lang = ISO639Utils.short2long(fmt.get('language') or '') or fmt.get('language')
+ for i in range(stream_idx, stream_idx + stream_count):
+ if lang:
+ metadata[str(i)].setdefault('language', lang)
+ for name, value in metadata[str(i)].items():
+ yield (f'-metadata:s:{i}', f'{name}={value}')
stream_idx += stream_count
def _get_infojson_opts(self, info, infofn):
@@ -854,7 +864,7 @@ class FFmpegFixupStretchedPP(FFmpegFixupPostProcessor):
stretched_ratio = info.get('stretched_ratio')
if stretched_ratio not in (None, 1):
self._fixup('Fixing aspect ratio', info['filepath'], [
- '-c', 'copy', '-map', '0', '-dn', '-aspect', '%f' % stretched_ratio])
+ *self.stream_copy_opts(), '-aspect', '%f' % stretched_ratio])
return [], info
@@ -862,8 +872,7 @@ class FFmpegFixupM4aPP(FFmpegFixupPostProcessor):
@PostProcessor._restrict_to(images=False, video=False)
def run(self, info):
if info.get('container') == 'm4a_dash':
- self._fixup('Correcting container', info['filepath'], [
- '-c', 'copy', '-map', '0', '-dn', '-f', 'mp4'])
+ self._fixup('Correcting container', info['filepath'], [*self.stream_copy_opts(), '-f', 'mp4'])
return [], info
@@ -883,7 +892,7 @@ class FFmpegFixupM3u8PP(FFmpegFixupPostProcessor):
def run(self, info):
if all(self._needs_fixup(info)):
self._fixup('Fixing MPEG-TS in MP4 container', info['filepath'], [
- '-c', 'copy', '-map', '0', '-dn', '-f', 'mp4', '-bsf:a', 'aac_adtstoasc'])
+ *self.stream_copy_opts(), '-f', 'mp4', '-bsf:a', 'aac_adtstoasc'])
return [], info
@@ -904,24 +913,24 @@ class FFmpegFixupTimestampPP(FFmpegFixupPostProcessor):
opts = ['-vf', 'setpts=PTS-STARTPTS']
else:
opts = ['-c', 'copy', '-bsf', 'setts=ts=TS-STARTPTS']
- self._fixup('Fixing frame timestamp', info['filepath'], opts + ['-map', '0', '-dn', '-ss', self.trim])
+ self._fixup('Fixing frame timestamp', info['filepath'], opts + [*self.stream_copy_opts(False), '-ss', self.trim])
return [], info
-class FFmpegCopyStreamPostProcessor(FFmpegFixupPostProcessor):
+class FFmpegCopyStreamPP(FFmpegFixupPostProcessor):
MESSAGE = 'Copying stream'
@PostProcessor._restrict_to(images=False)
def run(self, info):
- self._fixup(self.MESSAGE, info['filepath'], ['-c', 'copy', '-map', '0', '-dn'])
+ self._fixup(self.MESSAGE, info['filepath'], self.stream_copy_opts())
return [], info
-class FFmpegFixupDurationPP(FFmpegCopyStreamPostProcessor):
+class FFmpegFixupDurationPP(FFmpegCopyStreamPP):
MESSAGE = 'Fixing video duration'
-class FFmpegFixupDuplicateMoovPP(FFmpegCopyStreamPostProcessor):
+class FFmpegFixupDuplicateMoovPP(FFmpegCopyStreamPP):
MESSAGE = 'Fixing duplicate MOOV atoms'
@@ -1041,7 +1050,7 @@ class FFmpegSplitChaptersPP(FFmpegPostProcessor):
self.to_screen('Splitting video by chapters; %d chapters found' % len(chapters))
for idx, chapter in enumerate(chapters):
destination, opts = self._ffmpeg_args_for_chapter(idx + 1, chapter, info)
- self.real_run_ffmpeg([(in_file, opts)], [(destination, ['-c', 'copy'])])
+ self.real_run_ffmpeg([(in_file, opts)], [(destination, self.stream_copy_opts())])
if in_file != info['filepath']:
os.remove(in_file)
return [], info
@@ -1114,3 +1123,52 @@ class FFmpegThumbnailsConvertorPP(FFmpegPostProcessor):
if not has_thumbnail:
self.to_screen('There aren\'t any thumbnails to convert')
return files_to_delete, info
+
+
+class FFmpegConcatPP(FFmpegPostProcessor):
+ def __init__(self, downloader, only_multi_video=False):
+ self._only_multi_video = only_multi_video
+ super().__init__(downloader)
+
+ def concat_files(self, in_files, out_file):
+ if len(in_files) == 1:
+ if os.path.realpath(in_files[0]) != os.path.realpath(out_file):
+ self.to_screen(f'Moving "{in_files[0]}" to "{out_file}"')
+ os.replace(in_files[0], out_file)
+ return []
+
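+ # Probe each file's ordered list of stream codecs; refuse to concat-copy when the lists differ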
+ codecs = [traverse_obj(self.get_metadata_object(file), ('streams', ..., 'codec_name')) for file in in_files]
+ if len(set(map(tuple, codecs))) > 1:
+ raise PostProcessingError(
+ 'The files have different streams/codecs and cannot be concatenated. '
+ 'Either select different formats or --recode-video them to a common format')
+
+ self.to_screen(f'Concatenating {len(in_files)} files; Destination: {out_file}')
+ super().concat_files(in_files, out_file)
+ return in_files
+
+ @PostProcessor._restrict_to(images=False)
+ def run(self, info):
+ entries = info.get('entries') or []
+ if (self.get_param('skip_download') or not any(entries)
+ or self._only_multi_video and info['_type'] != 'multi_video'):
+ return [], info
+ elif any(len(entry) > 1 for entry in traverse_obj(entries, (..., 'requested_downloads')) or []):
+ raise PostProcessingError('Concatenation is not supported when downloading multiple separate formats')
+
+ in_files = traverse_obj(entries, (..., 'requested_downloads', 0, 'filepath'))
+ if len(in_files) < len(entries):
+ raise PostProcessingError('Aborting concatenation because some downloads failed')
+
+ ie_copy = self._downloader._playlist_infodict(info)
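+ # Use the shared extension when every entry agrees; otherwise fall back to mkv, which accepts nearly any codec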
+ exts = traverse_obj(entries, (..., 'requested_downloads', 0, 'ext'), (..., 'ext'))
+ ie_copy['ext'] = exts[0] if len(set(exts)) == 1 else 'mkv'
+ out_file = self._downloader.prepare_filename(ie_copy, 'pl_video')
+
+ files_to_delete = self.concat_files(in_files, out_file)
+
+ info['requested_downloads'] = [{
+ 'filepath': out_file,
+ 'ext': ie_copy['ext'],
+ }]
+ return files_to_delete, info
diff --git a/yt_dlp/postprocessor/metadataparser.py b/yt_dlp/postprocessor/metadataparser.py
index 646659e75..5452b92d8 100644
--- a/yt_dlp/postprocessor/metadataparser.py
+++ b/yt_dlp/postprocessor/metadataparser.py
@@ -66,7 +66,7 @@ class MetadataParserPP(PostProcessor):
self.write_debug(f'Searching for {out_re.pattern!r} in {template!r}')
match = out_re.search(data_to_parse)
if match is None:
- self.report_warning(f'Could not interpret {inp!r} as {out!r}')
+ self.to_screen(f'Could not interpret {inp!r} as {out!r}')
return
for attribute, value in match.groupdict().items():
info[attribute] = value
@@ -80,7 +80,7 @@ class MetadataParserPP(PostProcessor):
def f(info):
val = info.get(field)
if val is None:
- self.report_warning(f'Video does not have a {field}')
+ self.to_screen(f'Video does not have a {field}')
return
elif not isinstance(val, str):
self.report_warning(f'Cannot replace in field {field} since it is a {type(val).__name__}')
diff --git a/yt_dlp/postprocessor/modify_chapters.py b/yt_dlp/postprocessor/modify_chapters.py
index 91433c364..435a144e2 100644
--- a/yt_dlp/postprocessor/modify_chapters.py
+++ b/yt_dlp/postprocessor/modify_chapters.py
@@ -57,6 +57,7 @@ class ModifyChaptersPP(FFmpegPostProcessor):
self.write_debug('Expected and actual durations mismatch')
concat_opts = self._make_concat_opts(cuts, real_duration)
+ self.write_debug('Concat spec = %s' % ', '.join(f'{c.get("inpoint", 0.0)}-{c.get("outpoint", "inf")}' for c in concat_opts))
def remove_chapters(file, is_sub):
return file, self.remove_chapters(file, cuts, concat_opts, self._force_keyframes and not is_sub)
@@ -332,6 +333,6 @@ class ModifyChaptersPP(FFmpegPostProcessor):
continue
opts[-1]['outpoint'] = f'{s["start_time"]:.6f}'
# Do not create 0 duration chunk at the end.
- if s['end_time'] != duration:
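+ # end_time may slightly exceed the probed duration; such chapters already run to the end of the file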
+ if s['end_time'] < duration:
opts.append({'inpoint': f'{s["end_time"]:.6f}'})
return opts
diff --git a/yt_dlp/version.py b/yt_dlp/version.py
index 7b5732595..01e1b2345 100644
--- a/yt_dlp/version.py
+++ b/yt_dlp/version.py
@@ -1,5 +1,5 @@
# Autogenerated by devscripts/update-version.py
-__version__ = '2021.12.27'
+__version__ = '2022.02.04'
-RELEASE_GIT_HEAD = '6223f67a8'
+RELEASE_GIT_HEAD = 'c1653e9ef'