author     Jesus <heckyel@riseup.net>  2023-09-04 01:59:36 +0800
committer  Jesus <heckyel@riseup.net>  2023-09-04 01:59:36 +0800
commit     b3013540b41d1eb77c4803c5fca46f8d75b40fc1 (patch)
tree       97735cb0c49f3a2b0f276e1cd90817833d590d69
parent     eaeeef9c1d1bedb76fea953c332ef84d53bffe2c (diff)
download   hypervideo-b3013540b41d1eb77c4803c5fca46f8d75b40fc1.tar.lz
           hypervideo-b3013540b41d1eb77c4803c5fca46f8d75b40fc1.tar.xz
           hypervideo-b3013540b41d1eb77c4803c5fca46f8d75b40fc1.zip
update from upstream
-rw-r--r--  AUTHORS | 111
-rw-r--r--  CONTRIBUTING.md | 89
-rw-r--r--  CONTRIBUTORS | 116
-rw-r--r--  Changelog.md | 711
-rw-r--r--  Makefile | 7
-rw-r--r--  README.md | 315
-rw-r--r--  completions/zsh/_hypervideo | 2
-rw-r--r--  devscripts/changelog_override.json | 73
-rw-r--r--  devscripts/changelog_override.schema.json | 96
-rw-r--r--  devscripts/cli_to_api.py | 48
-rw-r--r--  devscripts/lazy_load_template.py | 1
-rw-r--r--  devscripts/make_changelog.py | 510
-rw-r--r--  devscripts/make_lazy_extractors.py | 4
-rw-r--r--  devscripts/make_readme.py | 22
-rw-r--r--  devscripts/utils.py | 13
-rw-r--r--  hypervideo_dl/YoutubeDL.py | 908
-rw-r--r--  hypervideo_dl/__init__.py | 170
-rw-r--r--  hypervideo_dl/__pyinstaller/__init__.py | 5
-rw-r--r--  hypervideo_dl/__pyinstaller/hook-yt_dlp.py | 32
-rw-r--r--  hypervideo_dl/aes.py | 8
-rw-r--r--  hypervideo_dl/cache.py | 16
-rw-r--r--  hypervideo_dl/casefold.py | 5
-rw-r--r--  hypervideo_dl/compat/__init__.py | 19
-rw-r--r--  hypervideo_dl/compat/_deprecated.py | 9
-rw-r--r--  hypervideo_dl/compat/_legacy.py | 37
-rw-r--r--  hypervideo_dl/compat/compat_utils.py | 111
-rw-r--r--  hypervideo_dl/compat/types.py | 13
-rw-r--r--  hypervideo_dl/compat/urllib/__init__.py | 10
-rw-r--r--  hypervideo_dl/compat/urllib/request.py | 40
-rw-r--r--  hypervideo_dl/cookies.py | 383
-rw-r--r--  hypervideo_dl/dependencies/Cryptodome.py | 38
-rw-r--r--  hypervideo_dl/dependencies/__init__.py | 83
-rw-r--r--  hypervideo_dl/downloader/__init__.py | 3
-rw-r--r--  hypervideo_dl/downloader/common.py | 44
-rw-r--r--  hypervideo_dl/downloader/external.py | 186
-rw-r--r--  hypervideo_dl/downloader/f4m.py | 8
-rw-r--r--  hypervideo_dl/downloader/fragment.py | 73
-rw-r--r--  hypervideo_dl/downloader/hls.py | 81
-rw-r--r--  hypervideo_dl/downloader/http.py | 81
-rw-r--r--  hypervideo_dl/downloader/ism.py | 4
-rw-r--r--  hypervideo_dl/downloader/niconico.py | 98
-rw-r--r--  hypervideo_dl/downloader/youtube_live_chat.py | 10
-rw-r--r--  hypervideo_dl/extractor/_extractors.py | 282
-rw-r--r--  hypervideo_dl/extractor/abc.py | 15
-rw-r--r--  hypervideo_dl/extractor/abematv.py | 114
-rw-r--r--  hypervideo_dl/extractor/acast.py | 34
-rw-r--r--  hypervideo_dl/extractor/adn.py | 18
-rw-r--r--  hypervideo_dl/extractor/adobepass.py | 24
-rw-r--r--  hypervideo_dl/extractor/adultswim.py | 6
-rw-r--r--  hypervideo_dl/extractor/aenetworks.py | 15
-rw-r--r--  hypervideo_dl/extractor/aeonco.py | 52
-rw-r--r--  hypervideo_dl/extractor/afreecatv.py | 103
-rw-r--r--  hypervideo_dl/extractor/airtv.py | 96
-rw-r--r--  hypervideo_dl/extractor/aitube.py | 60
-rw-r--r--  hypervideo_dl/extractor/amazon.py | 116
-rw-r--r--  hypervideo_dl/extractor/amazonminitv.py | 3
-rw-r--r--  hypervideo_dl/extractor/americastestkitchen.py | 78
-rw-r--r--  hypervideo_dl/extractor/amp.py | 9
-rw-r--r--  hypervideo_dl/extractor/anchorfm.py | 98
-rw-r--r--  hypervideo_dl/extractor/antenna.py | 143
-rw-r--r--  hypervideo_dl/extractor/anvato.py | 10
-rw-r--r--  hypervideo_dl/extractor/archiveorg.py | 244
-rw-r--r--  hypervideo_dl/extractor/ard.py | 63
-rw-r--r--  hypervideo_dl/extractor/arte.py | 24
-rw-r--r--  hypervideo_dl/extractor/atresplayer.py | 6
-rw-r--r--  hypervideo_dl/extractor/bandcamp.py | 48
-rw-r--r--  hypervideo_dl/extractor/bbc.py | 14
-rw-r--r--  hypervideo_dl/extractor/beatbump.py | 101
-rw-r--r--  hypervideo_dl/extractor/bfmtv.py | 19
-rw-r--r--  hypervideo_dl/extractor/bibeltv.py | 202
-rw-r--r--  hypervideo_dl/extractor/bilibili.py | 571
-rw-r--r--  hypervideo_dl/extractor/bitchute.py | 7
-rw-r--r--  hypervideo_dl/extractor/blerp.py | 167
-rw-r--r--  hypervideo_dl/extractor/boxcast.py | 102
-rw-r--r--  hypervideo_dl/extractor/brainpop.py | 318
-rw-r--r--  hypervideo_dl/extractor/bravotv.py | 236
-rw-r--r--  hypervideo_dl/extractor/brightcove.py | 12
-rw-r--r--  hypervideo_dl/extractor/callin.py | 55
-rw-r--r--  hypervideo_dl/extractor/camfm.py | 85
-rw-r--r--  hypervideo_dl/extractor/cammodels.py | 39
-rw-r--r--  hypervideo_dl/extractor/canalplus.py | 2
-rw-r--r--  hypervideo_dl/extractor/cbc.py | 192
-rw-r--r--  hypervideo_dl/extractor/cbs.py | 113
-rw-r--r--  hypervideo_dl/extractor/cbsnews.py | 380
-rw-r--r--  hypervideo_dl/extractor/cda.py | 47
-rw-r--r--  hypervideo_dl/extractor/ceskatelevize.py | 30
-rw-r--r--  hypervideo_dl/extractor/chilloutzone.py | 128
-rw-r--r--  hypervideo_dl/extractor/cinetecamilano.py | 4
-rw-r--r--  hypervideo_dl/extractor/ciscowebex.py | 32
-rw-r--r--  hypervideo_dl/extractor/clipchamp.py | 61
-rw-r--r--  hypervideo_dl/extractor/clyp.py | 43
-rw-r--r--  hypervideo_dl/extractor/comedycentral.py | 5
-rw-r--r--  hypervideo_dl/extractor/common.py | 392
-rw-r--r--  hypervideo_dl/extractor/crackle.py | 4
-rw-r--r--  hypervideo_dl/extractor/crtvg.py | 34
-rw-r--r--  hypervideo_dl/extractor/crunchyroll.py | 693
-rw-r--r--  hypervideo_dl/extractor/cultureunplugged.py | 6
-rw-r--r--  hypervideo_dl/extractor/curiositystream.py | 8
-rw-r--r--  hypervideo_dl/extractor/dacast.py | 158
-rw-r--r--  hypervideo_dl/extractor/daftsex.py | 27
-rw-r--r--  hypervideo_dl/extractor/dailymotion.py | 6
-rw-r--r--  hypervideo_dl/extractor/digitalconcerthall.py | 27
-rw-r--r--  hypervideo_dl/extractor/discogs.py | 35
-rw-r--r--  hypervideo_dl/extractor/discovery.py | 8
-rw-r--r--  hypervideo_dl/extractor/dlf.py | 192
-rw-r--r--  hypervideo_dl/extractor/douyutv.py | 55
-rw-r--r--  hypervideo_dl/extractor/dplay.py | 66
-rw-r--r--  hypervideo_dl/extractor/dropbox.py | 42
-rw-r--r--  hypervideo_dl/extractor/dropout.py | 54
-rw-r--r--  hypervideo_dl/extractor/drtv.py | 166
-rw-r--r--  hypervideo_dl/extractor/dumpert.py | 49
-rw-r--r--  hypervideo_dl/extractor/eagleplatform.py | 6
-rw-r--r--  hypervideo_dl/extractor/ebay.py | 36
-rw-r--r--  hypervideo_dl/extractor/eitb.py | 10
-rw-r--r--  hypervideo_dl/extractor/elevensports.py | 59
-rw-r--r--  hypervideo_dl/extractor/embedly.py | 99
-rw-r--r--  hypervideo_dl/extractor/eporner.py | 2
-rw-r--r--  hypervideo_dl/extractor/espn.py | 13
-rw-r--r--  hypervideo_dl/extractor/ettutv.py | 60
-rw-r--r--  hypervideo_dl/extractor/europa.py | 84
-rw-r--r--  hypervideo_dl/extractor/eurosport.py | 28
-rw-r--r--  hypervideo_dl/extractor/extractors.py | 6
-rw-r--r--  hypervideo_dl/extractor/facebook.py | 114
-rw-r--r--  hypervideo_dl/extractor/fc2.py | 6
-rw-r--r--  hypervideo_dl/extractor/fifa.py | 23
-rw-r--r--  hypervideo_dl/extractor/filmon.py | 14
-rw-r--r--  hypervideo_dl/extractor/fox.py | 16
-rw-r--r--  hypervideo_dl/extractor/foxnews.py | 77
-rw-r--r--  hypervideo_dl/extractor/foxsports.py | 57
-rw-r--r--  hypervideo_dl/extractor/freesound.py | 1
-rw-r--r--  hypervideo_dl/extractor/fujitv.py | 2
-rw-r--r--  hypervideo_dl/extractor/funimation.py | 8
-rw-r--r--  hypervideo_dl/extractor/funker530.py | 79
-rw-r--r--  hypervideo_dl/extractor/gamejolt.py | 2
-rw-r--r--  hypervideo_dl/extractor/gdcvault.py | 15
-rw-r--r--  hypervideo_dl/extractor/generic.py | 411
-rw-r--r--  hypervideo_dl/extractor/genius.py | 34
-rw-r--r--  hypervideo_dl/extractor/globalplayer.py | 254
-rw-r--r--  hypervideo_dl/extractor/globo.py | 2
-rw-r--r--  hypervideo_dl/extractor/gmanetwork.py | 83
-rw-r--r--  hypervideo_dl/extractor/googledrive.py | 37
-rw-r--r--  hypervideo_dl/extractor/goplay.py | 6
-rw-r--r--  hypervideo_dl/extractor/gronkh.py | 14
-rw-r--r--  hypervideo_dl/extractor/hidive.py | 51
-rw-r--r--  hypervideo_dl/extractor/hketv.py | 2
-rw-r--r--  hypervideo_dl/extractor/hollywoodreporter.py | 72
-rw-r--r--  hypervideo_dl/extractor/hotnewhiphop.py | 14
-rw-r--r--  hypervideo_dl/extractor/hotstar.py | 138
-rw-r--r--  hypervideo_dl/extractor/hrefli.py | 15
-rw-r--r--  hypervideo_dl/extractor/hrti.py | 10
-rw-r--r--  hypervideo_dl/extractor/hungama.py | 109
-rw-r--r--  hypervideo_dl/extractor/huya.py | 9
-rw-r--r--  hypervideo_dl/extractor/hypergryph.py | 32
-rw-r--r--  hypervideo_dl/extractor/idolplus.py | 115
-rw-r--r--  hypervideo_dl/extractor/ign.py | 334
-rw-r--r--  hypervideo_dl/extractor/imggaming.py | 6
-rw-r--r--  hypervideo_dl/extractor/instagram.py | 6
-rw-r--r--  hypervideo_dl/extractor/iprima.py | 41
-rw-r--r--  hypervideo_dl/extractor/iqiyi.py | 35
-rw-r--r--  hypervideo_dl/extractor/ivi.py | 24
-rw-r--r--  hypervideo_dl/extractor/iwara.py | 413
-rw-r--r--  hypervideo_dl/extractor/joj.py | 26
-rw-r--r--  hypervideo_dl/extractor/jstream.py | 73
-rw-r--r--  hypervideo_dl/extractor/jwplatform.py | 37
-rw-r--r--  hypervideo_dl/extractor/kakao.py | 6
-rw-r--r--  hypervideo_dl/extractor/kankanews.py | 48
-rw-r--r--  hypervideo_dl/extractor/kick.py | 126
-rw-r--r--  hypervideo_dl/extractor/kommunetv.py | 31
-rw-r--r--  hypervideo_dl/extractor/kuwo.py | 2
-rw-r--r--  hypervideo_dl/extractor/la7.py | 61
-rw-r--r--  hypervideo_dl/extractor/lastfm.py | 43
-rw-r--r--  hypervideo_dl/extractor/lbry.py | 140
-rw-r--r--  hypervideo_dl/extractor/lecturio.py | 2
-rw-r--r--  hypervideo_dl/extractor/lefigaro.py | 135
-rw-r--r--  hypervideo_dl/extractor/lego.py | 4
-rw-r--r--  hypervideo_dl/extractor/limelight.py | 6
-rw-r--r--  hypervideo_dl/extractor/linuxacademy.py | 20
-rw-r--r--  hypervideo_dl/extractor/litv.py | 4
-rw-r--r--  hypervideo_dl/extractor/livestream.py | 97
-rw-r--r--  hypervideo_dl/extractor/lumni.py | 24
-rw-r--r--  hypervideo_dl/extractor/magellantv.py | 50
-rw-r--r--  hypervideo_dl/extractor/mailru.py | 8
-rw-r--r--  hypervideo_dl/extractor/medaltv.py | 23
-rw-r--r--  hypervideo_dl/extractor/mediaite.py | 18
-rw-r--r--  hypervideo_dl/extractor/mediaset.py | 205
-rw-r--r--  hypervideo_dl/extractor/mediasite.py | 2
-rw-r--r--  hypervideo_dl/extractor/mediastream.py | 208
-rw-r--r--  hypervideo_dl/extractor/megatvcom.py | 6
-rw-r--r--  hypervideo_dl/extractor/mgtv.py | 67
-rw-r--r--  hypervideo_dl/extractor/minds.py | 2
-rw-r--r--  hypervideo_dl/extractor/miomio.py | 10
-rw-r--r--  hypervideo_dl/extractor/mixch.py | 10
-rw-r--r--  hypervideo_dl/extractor/motherless.py | 223
-rw-r--r--  hypervideo_dl/extractor/moviepilot.py | 53
-rw-r--r--  hypervideo_dl/extractor/mtv.py | 11
-rw-r--r--  hypervideo_dl/extractor/museai.py | 112
-rw-r--r--  hypervideo_dl/extractor/myvideoge.py | 68
-rw-r--r--  hypervideo_dl/extractor/mzaalo.py | 95
-rw-r--r--  hypervideo_dl/extractor/naver.py | 35
-rw-r--r--  hypervideo_dl/extractor/nbc.py | 288
-rw-r--r--  hypervideo_dl/extractor/nebula.py | 155
-rw-r--r--  hypervideo_dl/extractor/nekohacker.py | 217
-rw-r--r--  hypervideo_dl/extractor/neteasemusic.py | 6
-rw-r--r--  hypervideo_dl/extractor/netverse.py | 115
-rw-r--r--  hypervideo_dl/extractor/nfl.py | 148
-rw-r--r--  hypervideo_dl/extractor/nhk.py | 249
-rw-r--r--  hypervideo_dl/extractor/niconico.py | 268
-rw-r--r--  hypervideo_dl/extractor/ninenow.py | 2
-rw-r--r--  hypervideo_dl/extractor/nitter.py | 124
-rw-r--r--  hypervideo_dl/extractor/njpwworld.py | 2
-rw-r--r--  hypervideo_dl/extractor/noice.py | 116
-rw-r--r--  hypervideo_dl/extractor/noodlemagazine.py | 31
-rw-r--r--  hypervideo_dl/extractor/nosnl.py | 34
-rw-r--r--  hypervideo_dl/extractor/nosvideo.py | 6
-rw-r--r--  hypervideo_dl/extractor/nowness.py | 8
-rw-r--r--  hypervideo_dl/extractor/npo.py | 314
-rw-r--r--  hypervideo_dl/extractor/nrk.py | 5
-rw-r--r--  hypervideo_dl/extractor/ntvru.py | 13
-rw-r--r--  hypervideo_dl/extractor/nubilesporn.py | 99
-rw-r--r--  hypervideo_dl/extractor/nzonscreen.py | 93
-rw-r--r--  hypervideo_dl/extractor/odkmedia.py | 105
-rw-r--r--  hypervideo_dl/extractor/odnoklassniki.py | 85
-rw-r--r--  hypervideo_dl/extractor/oneplace.py | 43
-rw-r--r--  hypervideo_dl/extractor/opencast.py | 41
-rw-r--r--  hypervideo_dl/extractor/orf.py | 2
-rw-r--r--  hypervideo_dl/extractor/owncloud.py | 80
-rw-r--r--  hypervideo_dl/extractor/packtpub.py | 11
-rw-r--r--  hypervideo_dl/extractor/panopto.py | 4
-rw-r--r--  hypervideo_dl/extractor/parler.py | 94
-rw-r--r--  hypervideo_dl/extractor/patreon.py | 18
-rw-r--r--  hypervideo_dl/extractor/pbs.py | 59
-rw-r--r--  hypervideo_dl/extractor/peekvids.py | 190
-rw-r--r--  hypervideo_dl/extractor/peloton.py | 12
-rw-r--r--  hypervideo_dl/extractor/pgatour.py | 47
-rw-r--r--  hypervideo_dl/extractor/piapro.py | 21
-rw-r--r--  hypervideo_dl/extractor/picarto.py | 56
-rw-r--r--  hypervideo_dl/extractor/piksel.py | 16
-rw-r--r--  hypervideo_dl/extractor/pinterest.py | 153
-rw-r--r--  hypervideo_dl/extractor/pladform.py | 2
-rw-r--r--  hypervideo_dl/extractor/platzi.py | 2
-rw-r--r--  hypervideo_dl/extractor/playplustv.py | 14
-rw-r--r--  hypervideo_dl/extractor/playsuisse.py | 88
-rw-r--r--  hypervideo_dl/extractor/plutotv.py | 13
-rw-r--r--  hypervideo_dl/extractor/polskieradio.py | 399
-rw-r--r--  hypervideo_dl/extractor/porn91.py | 89
-rw-r--r--  hypervideo_dl/extractor/pornez.py | 63
-rw-r--r--  hypervideo_dl/extractor/pornhub.py | 23
-rw-r--r--  hypervideo_dl/extractor/pr0gramm.py | 97
-rw-r--r--  hypervideo_dl/extractor/prankcast.py | 6
-rw-r--r--  hypervideo_dl/extractor/puhutv.py | 8
-rw-r--r--  hypervideo_dl/extractor/qdance.py | 150
-rw-r--r--  hypervideo_dl/extractor/radiko.py | 31
-rw-r--r--  hypervideo_dl/extractor/radiocanada.py | 6
-rw-r--r--  hypervideo_dl/extractor/rai.py | 565
-rw-r--r--  hypervideo_dl/extractor/rbgtum.py | 93
-rw-r--r--  hypervideo_dl/extractor/rcs.py | 402
-rw-r--r--  hypervideo_dl/extractor/rcti.py | 4
-rw-r--r--  hypervideo_dl/extractor/recurbate.py | 42
-rw-r--r--  hypervideo_dl/extractor/redbulltv.py | 6
-rw-r--r--  hypervideo_dl/extractor/reddit.py | 141
-rw-r--r--  hypervideo_dl/extractor/redgifs.py | 4
-rw-r--r--  hypervideo_dl/extractor/regiotv.py | 10
-rw-r--r--  hypervideo_dl/extractor/rheinmaintv.py | 94
-rw-r--r--  hypervideo_dl/extractor/rokfin.py | 54
-rw-r--r--  hypervideo_dl/extractor/roosterteeth.py | 10
-rw-r--r--  hypervideo_dl/extractor/rottentomatoes.py | 80
-rw-r--r--  hypervideo_dl/extractor/rozhlas.py | 296
-rw-r--r--  hypervideo_dl/extractor/rte.py | 6
-rw-r--r--  hypervideo_dl/extractor/rts.py | 4
-rw-r--r--  hypervideo_dl/extractor/rtvcplay.py | 285
-rw-r--r--  hypervideo_dl/extractor/rumble.py | 168
-rw-r--r--  hypervideo_dl/extractor/rutube.py | 63
-rw-r--r--  hypervideo_dl/extractor/s4c.py | 62
-rw-r--r--  hypervideo_dl/extractor/safari.py | 6
-rw-r--r--  hypervideo_dl/extractor/sbs.py | 109
-rw-r--r--  hypervideo_dl/extractor/scrippsnetworks.py | 1
-rw-r--r--  hypervideo_dl/extractor/senalcolombia.py | 31
-rw-r--r--  hypervideo_dl/extractor/servus.py | 169
-rw-r--r--  hypervideo_dl/extractor/sevenplus.py | 10
-rw-r--r--  hypervideo_dl/extractor/shahid.py | 8
-rw-r--r--  hypervideo_dl/extractor/shemaroome.py | 5
-rw-r--r--  hypervideo_dl/extractor/sibnet.py | 17
-rw-r--r--  hypervideo_dl/extractor/sina.py | 10
-rw-r--r--  hypervideo_dl/extractor/sixplay.py | 2
-rw-r--r--  hypervideo_dl/extractor/slideslive.py | 566
-rw-r--r--  hypervideo_dl/extractor/sonyliv.py | 16
-rw-r--r--  hypervideo_dl/extractor/soundcloud.py | 38
-rw-r--r--  hypervideo_dl/extractor/spankbang.py | 7
-rw-r--r--  hypervideo_dl/extractor/sportdeutschland.py | 191
-rw-r--r--  hypervideo_dl/extractor/stacommu.py | 148
-rw-r--r--  hypervideo_dl/extractor/stageplus.py | 515
-rw-r--r--  hypervideo_dl/extractor/stripchat.py | 16
-rw-r--r--  hypervideo_dl/extractor/stv.py | 2
-rw-r--r--  hypervideo_dl/extractor/substack.py | 8
-rw-r--r--  hypervideo_dl/extractor/sverigesradio.py | 62
-rw-r--r--  hypervideo_dl/extractor/svt.py | 61
-rw-r--r--  hypervideo_dl/extractor/tagesschau.py | 58
-rw-r--r--  hypervideo_dl/extractor/tbsjp.py | 152
-rw-r--r--  hypervideo_dl/extractor/teachable.py | 2
-rw-r--r--  hypervideo_dl/extractor/teamcoco.py | 337
-rw-r--r--  hypervideo_dl/extractor/telecaribe.py | 91
-rw-r--r--  hypervideo_dl/extractor/telemundo.py | 9
-rw-r--r--  hypervideo_dl/extractor/tempo.py | 119
-rw-r--r--  hypervideo_dl/extractor/tencent.py | 102
-rw-r--r--  hypervideo_dl/extractor/tennistv.py | 2
-rw-r--r--  hypervideo_dl/extractor/tenplay.py | 9
-rw-r--r--  hypervideo_dl/extractor/testurl.py | 19
-rw-r--r--  hypervideo_dl/extractor/tf1.py | 19
-rw-r--r--  hypervideo_dl/extractor/tfo.py | 8
-rw-r--r--  hypervideo_dl/extractor/theplatform.py | 30
-rw-r--r--  hypervideo_dl/extractor/thesun.py | 13
-rw-r--r--  hypervideo_dl/extractor/thisoldhouse.py | 4
-rw-r--r--  hypervideo_dl/extractor/thisvid.py | 226
-rw-r--r--  hypervideo_dl/extractor/threeqsdn.py | 4
-rw-r--r--  hypervideo_dl/extractor/tiktok.py | 526
-rw-r--r--  hypervideo_dl/extractor/tnaflix.py | 27
-rw-r--r--  hypervideo_dl/extractor/toutv.py | 6
-rw-r--r--  hypervideo_dl/extractor/triller.py | 315
-rw-r--r--  hypervideo_dl/extractor/trtcocuk.py | 48
-rw-r--r--  hypervideo_dl/extractor/trueid.py | 6
-rw-r--r--  hypervideo_dl/extractor/tubetugraz.py | 27
-rw-r--r--  hypervideo_dl/extractor/tubitv.py | 8
-rw-r--r--  hypervideo_dl/extractor/tumblr.py | 2
-rw-r--r--  hypervideo_dl/extractor/tunein.py | 280
-rw-r--r--  hypervideo_dl/extractor/tv2.py | 10
-rw-r--r--  hypervideo_dl/extractor/tv4.py | 77
-rw-r--r--  hypervideo_dl/extractor/tvp.py | 130
-rw-r--r--  hypervideo_dl/extractor/tvplay.py | 223
-rw-r--r--  hypervideo_dl/extractor/tvplayer.py | 10
-rw-r--r--  hypervideo_dl/extractor/twitcasting.py | 38
-rw-r--r--  hypervideo_dl/extractor/twitch.py | 81
-rw-r--r--  hypervideo_dl/extractor/twitter.py | 720
-rw-r--r--  hypervideo_dl/extractor/txxx.py | 418
-rw-r--r--  hypervideo_dl/extractor/udemy.py | 35
-rw-r--r--  hypervideo_dl/extractor/unsupported.py | 34
-rw-r--r--  hypervideo_dl/extractor/uplynk.py | 80
-rw-r--r--  hypervideo_dl/extractor/urplay.py | 53
-rw-r--r--  hypervideo_dl/extractor/vevo.py | 10
-rw-r--r--  hypervideo_dl/extractor/vice.py | 10
-rw-r--r--  hypervideo_dl/extractor/videa.py | 2
-rw-r--r--  hypervideo_dl/extractor/videocampus_sachsen.py | 4
-rw-r--r--  hypervideo_dl/extractor/videoken.py | 336
-rw-r--r--  hypervideo_dl/extractor/vidlii.py | 3
-rw-r--r--  hypervideo_dl/extractor/viewlift.py | 6
-rw-r--r--  hypervideo_dl/extractor/viidea.py | 6
-rw-r--r--  hypervideo_dl/extractor/vimeo.py | 102
-rw-r--r--  hypervideo_dl/extractor/viu.py | 148
-rw-r--r--  hypervideo_dl/extractor/vk.py | 331
-rw-r--r--  hypervideo_dl/extractor/vocaroo.py | 63
-rw-r--r--  hypervideo_dl/extractor/vodlocker.py | 12
-rw-r--r--  hypervideo_dl/extractor/volejtv.py | 40
-rw-r--r--  hypervideo_dl/extractor/voot.py | 183
-rw-r--r--  hypervideo_dl/extractor/vrt.py | 415
-rw-r--r--  hypervideo_dl/extractor/vrv.py | 9
-rw-r--r--  hypervideo_dl/extractor/vshare.py | 2
-rw-r--r--  hypervideo_dl/extractor/vzaar.py | 2
-rw-r--r--  hypervideo_dl/extractor/wat.py | 14
-rw-r--r--  hypervideo_dl/extractor/webcamerapl.py | 44
-rw-r--r--  hypervideo_dl/extractor/weibo.py | 2
-rw-r--r--  hypervideo_dl/extractor/weverse.py | 608
-rw-r--r--  hypervideo_dl/extractor/wevidi.py | 108
-rw-r--r--  hypervideo_dl/extractor/weyyak.py | 86
-rw-r--r--  hypervideo_dl/extractor/whyp.py | 50
-rw-r--r--  hypervideo_dl/extractor/wimbledon.py | 61
-rw-r--r--  hypervideo_dl/extractor/wistia.py | 45
-rw-r--r--  hypervideo_dl/extractor/wrestleuniverse.py | 307
-rw-r--r--  hypervideo_dl/extractor/wykop.py | 268
-rw-r--r--  hypervideo_dl/extractor/xanimu.py | 51
-rw-r--r--  hypervideo_dl/extractor/xhamster.py | 10
-rw-r--r--  hypervideo_dl/extractor/ximalaya.py | 13
-rw-r--r--  hypervideo_dl/extractor/xtube.py | 4
-rw-r--r--  hypervideo_dl/extractor/xvideos.py | 21
-rw-r--r--  hypervideo_dl/extractor/yahoo.py | 117
-rw-r--r--  hypervideo_dl/extractor/yandexvideo.py | 4
-rw-r--r--  hypervideo_dl/extractor/yappy.py | 127
-rw-r--r--  hypervideo_dl/extractor/yesjapan.py | 9
-rw-r--r--  hypervideo_dl/extractor/yle_areena.py | 127
-rw-r--r--  hypervideo_dl/extractor/youku.py | 80
-rw-r--r--  hypervideo_dl/extractor/youporn.py | 35
-rw-r--r--  hypervideo_dl/extractor/youtube.py | 1697
-rw-r--r--  hypervideo_dl/extractor/zaiko.py | 130
-rw-r--r--  hypervideo_dl/extractor/zattoo.py | 5
-rw-r--r--  hypervideo_dl/extractor/zdf.py | 31
-rw-r--r--  hypervideo_dl/extractor/zee5.py | 34
-rw-r--r--  hypervideo_dl/extractor/zingmp3.py | 101
-rw-r--r--  hypervideo_dl/extractor/zoom.py | 108
-rw-r--r--  hypervideo_dl/extractor/zype.py | 6
-rw-r--r--  hypervideo_dl/jsinterp.py | 62
-rw-r--r--  hypervideo_dl/networking/__init__.py | 13
-rw-r--r--  hypervideo_dl/networking/_helper.py | 208
-rw-r--r--  hypervideo_dl/networking/_urllib.py | 454
-rw-r--r--  hypervideo_dl/networking/common.py | 564
-rw-r--r--  hypervideo_dl/networking/exceptions.py | 217
-rw-r--r--  hypervideo_dl/options.py | 271
-rw-r--r--  hypervideo_dl/plugins.py | 173
-rw-r--r--  hypervideo_dl/postprocessor/__init__.py | 5
-rw-r--r--  hypervideo_dl/postprocessor/common.py | 13
-rw-r--r--  hypervideo_dl/postprocessor/embedthumbnail.py | 6
-rw-r--r--  hypervideo_dl/postprocessor/ffmpeg.py | 29
-rw-r--r--  hypervideo_dl/postprocessor/metadataparser.py | 4
-rw-r--r--  hypervideo_dl/postprocessor/modify_chapters.py | 1
-rw-r--r--  hypervideo_dl/utils/__init__.py | 10
-rw-r--r--  hypervideo_dl/utils/_deprecated.py | 39
-rw-r--r--  hypervideo_dl/utils/_legacy.py | 242
-rw-r--r--  hypervideo_dl/utils/_utils.py | 5484
-rw-r--r--  hypervideo_dl/utils/networking.py | 163
-rw-r--r--  hypervideo_dl/utils/traversal.py | 254
-rw-r--r--  hypervideo_dl/version.py | 6
-rw-r--r--  setup.cfg | 4
-rw-r--r--  setup.py | 31
-rw-r--r--  test/conftest.py | 21
-rw-r--r--  test/helper.py | 4
-rw-r--r--  test/test_InfoExtractor.py | 128
-rw-r--r--  test/test_YoutubeDL.py | 168
-rw-r--r--  test/test_YoutubeDLCookieJar.py | 24
-rw-r--r--  test/test_aes.py | 6
-rw-r--r--  test/test_age_restriction.py | 19
-rw-r--r--  test/test_compat.py | 9
-rw-r--r--  test/test_config.py | 227
-rw-r--r--  test/test_cookies.py | 18
-rwxr-xr-x  test/test_download.py | 9
-rw-r--r--  test/test_downloader_external.py | 139
-rw-r--r--  test/test_downloader_http.py | 12
-rw-r--r--  test/test_networking.py | 1439
-rw-r--r--  test/test_networking_utils.py | 282
-rw-r--r--  test/test_plugins.py | 73
-rw-r--r--  test/test_socks.py | 521
-rw-r--r--  test/test_utils.py | 363
-rw-r--r--  test/testdata/yt_dlp_plugins/extractor/_ignore.py | 5
-rw-r--r--  test/testdata/yt_dlp_plugins/extractor/ignore.py | 12
-rw-r--r--  test/testdata/yt_dlp_plugins/extractor/normal.py | 9
-rw-r--r--  test/testdata/yt_dlp_plugins/postprocessor/normal.py | 5
-rw-r--r--  test/testdata/zipped_plugins/yt_dlp_plugins/extractor/zipped.py | 5
-rw-r--r--  test/testdata/zipped_plugins/yt_dlp_plugins/postprocessor/zipped.py | 5
434 files changed, 39163 insertions, 8062 deletions
diff --git a/AUTHORS b/AUTHORS
index 8dafe32..374a1b1 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -16,6 +16,7 @@ Aaron Brager
Aaron Lipinski
Aaron Wojnowski
Aaron Zeng
+Aaruni Kaushik
Abdullah Ibn Fulan
Abhishek Kedia
Abubukker Chaudhary
@@ -41,7 +42,10 @@ Aleri Kaisattera
Ales Jirasek
Alessandro Ghedini
Alex
+Alex Berg
+Alex Ionescu
Alex Karabanov
+Alex Klapheke
Alex Merkel
Alex Monk
Alex Seiler
@@ -60,7 +64,9 @@ Ali Sherief
Allan Daemon
Allan Zhou
Alpesh Valia
+Aman Salwan
Amaury Gauthier
+Amirreza Aflakparast
Amish Bhadeshia
Anand Babu Periasamy
Anant Murmu
@@ -126,7 +132,9 @@ Ben Rog-Wilhelm
Ben Welsh
Benedikt Wildenhain
Benjamin Congdon
+Benjamin Ryan
Bepis
+Berkan Teber
Bernhard M. Wiedemann
Bjorn Heesakkers
BlahGeek
@@ -143,6 +151,7 @@ Bricio
Bruno Guerreiro
BunnyHelp
Burve
+ByteDream
CHJ85
CXwudi
Camillo Dell'mour
@@ -155,6 +164,7 @@ Charlie Le
ChillingPepper
Ching Yi, Chan
Chirantan Ekbote
+Chris Caruso
Chris Gavin
Chris Hranj
Chris Lamb
@@ -162,6 +172,7 @@ Christian Albrecht
Christian Paul
Christian Pointner
Christoph Döpmann
+Christoph Flathmann
Christoph Moench-Tegeder
Christopher Krooss
Christopher Neugebauer
@@ -175,14 +186,17 @@ Conner
Corey Farwell
Corey Nicholson
Cory Hall
+CoryTibbettsDev
Costy Petrisor
CplPwnies
Craig Markwardt
CrankDatSouljaBoy
CrypticSignal
CyberJacob
+Cyberes
Cyril Roelandt
Cássio Ávila
+D0LLYNH0
DEvmIb
DaMightyZombie
Daan van Vugt
@@ -197,7 +211,9 @@ Daniel Bolton
Daniel Höpfl
Daniel Lindholm
Daniel Peukert
+Daniel Rich
Daniel Twardowski
+Daniel Vogt
Daniel.Zeng
Danko Alexeyev
Dankryn
@@ -205,6 +221,7 @@ Dao Hoang Son
Dario Guarascio
DarkZeros
DarkstaIkers
+DataGhost
Dave
Dave Loyall
Dave Vasilevsky
@@ -220,9 +237,12 @@ David Powell
David Rabinowitz
David Skrundz
David Triendl
+David Turner
David Wagner
+Davin Kevin
Deer-Spangle
Delon
+Denis
DepFA
Derek Land
DesweR
@@ -266,6 +286,7 @@ Erik
Erik Johnson
Erwin de Haan
Evan Spensley
+Eveldee
FND
Fabi019
Fabian Stahl
@@ -280,16 +301,22 @@ Filip B
Filip Hedman
Filippo Valsorda
Finn Petersen
+Finn R. Gärtner
FireDart
FliegendeWurst
+Florian Albrechtskirchinger
FooBarQuaxx
Founder Fang
Francesco Frassinelli
Francois du Toit
+Franklin Lee
Frans de Jonge
François Charlier
François Revol
Frederic Bournival
+Frederik Nordahl Jul Sabroe
+Friedrich Rehren
+GD-Slime
GDR!
Gabriel Schubiner
Gaetan Gilbert
@@ -310,11 +337,13 @@ Giedrius Statkevičius
Gilles Pietri
Gino Lisignoli
Giovanni Visentini
+Giulio Muscarello
Giuseppe Fabiano
Gjorgji Jankovski
Glenn Slayden
Gorfiend
Grabien
+Greg Sadetsky
GreyAlien502
Grom PE
Grzegorz P
@@ -349,8 +378,10 @@ Itachi
Itay Brandes
Iulian Onofrei
Ivan Kozik
+Ivan Skodje
J
J.D. Purcell
+JC-Chung
JChris246
Jacek Nowacki
Jack Danger Canty
@@ -373,6 +404,7 @@ Jan Schär
Janez Troha
Jason Normore
Jason Terk
+Jasper Rebane
Jay
Jeff Buchbinder
Jeff Crouse
@@ -383,11 +415,13 @@ Jelle van der Waa
Jens Rutschmann
Jens Timmerman
Jens Wille
+Jeong, Heon
Jeremie J. Jarosh
Jeroen Jacobs
Jertzukka
Jesse
Jesse de Zwart
+Jesus
Jesús
Jia Rong Yee
JianxinLi
@@ -412,6 +446,7 @@ Johny Mo Swag
Joost Verdoorn
Joram Schrijver
Jordan Weatherby
+Jorge
Joseph Frazier
Joseph Spiros
Josh Soref
@@ -452,10 +487,12 @@ KiberInfinity
Kid
Kieran O'Reilly
Kitten King
+Kurt Bestor
Kyle
Kyle Anthony Williams
Kyu Yeun Kim
LE
+LXYan2333
Laneone
LangerJan
Lapinot
@@ -464,6 +501,7 @@ Lauren Liberda
Laurent Raufaste
Leonardo Amaral
Leonardo Taccari
+LeoniePhiline
Leslie P. Polzer
Lesmiscore
Li4ick
@@ -474,6 +512,7 @@ Locke
Logan B
Logan Fleur
Lovius
+LowSuggestion912
Luc Ritchie
Luca Cherubin
Luca Steeb
@@ -490,6 +529,7 @@ MAA
MMM
MRWITEK
Magnus Kolstad
+Mahmoud Abdel-Fattah
Malte Kiefer
Mamay Alexander
Mantas Mikulėnas
@@ -499,6 +539,7 @@ Marcin Cieślak
Marco Fantauzzo
Marco Ferragina
Marco Schuster
+Marek Hudik
Marek Rusinowski
Marenga
Marian Sigler
@@ -513,6 +554,7 @@ Martin Trigaux
Martin Weinelt
Marvin Ewald
Marwen Dallel
+Master
Matej Dujava
Mathias Rav
Mats
@@ -524,6 +566,7 @@ Matthew Rayfield
Matthieu Muffato
Mattias Harrysson
Mattias Wadman
+Matumo
Matěj Cepl
Max
Max Mehl
@@ -553,12 +596,15 @@ Misael Aguayo
Mister Hat
Mitsukarenai
MobiDotS
+Mohamed Al Mehairbi
Mohamedh Fazal
Mohammad Khaled AbouElSherbini
Mohammad Teimori Pabandi
Mohammed Yaseen Mowzer
+Mohit Tokas
Moises Lima
Moritz Patelscheck
+Mozi
MrDoritos
MrOctopus
MrRawes
@@ -567,12 +613,17 @@ Muratcan Simsek
N1k145
NRTICN
Naglis Jonaitis
+Nam Vu
Namnamseo
Nathan Rossi
+Nathan Touzé
Nehal Patel
NeroBurner
+Neurognostic
Nevar Angelo
+Nicholas Defranco
Nick Daniels
+Nicolai Dagestad
Nicolas Kaiser
Nicolas SAPA
Nicolas Évrard
@@ -585,6 +636,8 @@ Nitish Kumar
Noah
NotFound
OHaiiBuzzle
+OIRNOIR
+OMEGA_RAZER
Odd Stråbø
OhMyBahGosh
Ole Ernst
@@ -592,6 +645,8 @@ Oleg Prutz
Oli Allen
Oliver Freyermuth
Olivier Bilodeau
+Omar Atef
+OndrejBakan
Ondřej Bárta
Ondřej Caletka
Ori Avtalion
@@ -599,6 +654,7 @@ Orn
Osama Khalid
Oskar Cieslik
Oskar Jauch
+OverlordQ
P-reducible
PB
PC
@@ -652,22 +708,26 @@ Quan Hua
Quentin Rameau
RPing
Rafal Borczuch
+Rajeshwaran
Ralf Haring
Random User
Raphael Michel
Rasmus Rendal
Rastislav Barlik
Ray Douglass
+RedDeffender
Remita Amine
Reto Kromer
Reventl0v
RexYuan
+RfadnjdExt
RiCON
Ricardo
Ricardo Constantino
Ricardo Garcia
Richard Clamp
Richard Gibson
+RjY
Rob
Rob van Bekkum
Robert Geislinger
@@ -718,8 +778,10 @@ Shaun Walbridge
Shaya G
Shreyas Minocha
Shrimadhav U K
+Siddhartha Sahu
Sidney de Koning
Silvan Mosberger
+Simon
Simon Morgan
Simon Sawicki
Simon W. Jackson
@@ -739,8 +801,10 @@ Stefan Lobbenmeier
Stefan Pöschel
Stefan-Gabriel Muscalu
Steffan Donal
+Stel Abrego
Stephan
Stephen Stair
+Steve
Steven Gosseling
Steven Maude
Sukhbir Singh
@@ -791,24 +855,29 @@ Toni Viemerö
TotalCaesar659
Trevor Nelson
Tristan Waddington
+TxI5
Tyler Szabo
Unit 193
Unknown
Urgau
Varun
Vasyl' Vavrychuk
+Venkata Krishna S
Vid
VietTPham
Vignesh Venkat
Vijay Singh
Viktor Szakats
Viren Rajput
+Vita
Vitaliy Syrchikov
Vitaly Khabarov
+Vladislav
Vobe
Vrihub
Vukkk
Vítor Galvão
+Văn Anh
Wandang
Wang Jun Tham
WassimAttar
@@ -821,6 +890,7 @@ Witold Baryluk
WolfganP
Xaver Hellauer
Xiao Di Guan
+Xiao Han
Xie Yanbo
Xu Cheng
Xuan Hu (Sean)
@@ -839,8 +909,10 @@ Zach Bruggeman
Zack Fernandes
Zenon Mousmoulas
Zhong Jianxin
+Zhong Lufan
Zhymabek Roman
Zirro
+Zprokkel
aarubui
adamanldo
aegamesi
@@ -864,10 +936,12 @@ aviperes
axelerometer
aystroganov@gmail.com
azeem
+barsnick
bashonly
bastik
bato3
beefchop
+bepvte
bitraid
biwubo
blissland
@@ -877,6 +951,7 @@ bpfoley
bsun0000
bubbleguuum
bzc6p
+c-basalt
ca-za
cant-think-of-a-name
cantandwont
@@ -950,11 +1025,13 @@ fluks
fnord
foghawk
forDream
+foreignBlade
frenchy1983
ftk
funniray
gam2046
gamer191
+garret
gcmalloc
gdzx
geauxlo
@@ -970,6 +1047,7 @@ h-collector
ha shao
hakatashi
haobinliang
+hasezoey
hassaanaliw
hcwhan
hdclark
@@ -977,6 +1055,7 @@ hedii
helb
hh0rva1h
hmlinaric
+hoaluvn
hojel
hrimfaxi
hseg
@@ -986,6 +1065,7 @@ huohuarong
hurda
i6t
ian
+ifan-t
igv
inondle
insaneracist
@@ -1005,12 +1085,14 @@ jfogelman
jhwgh1968
jjatria
jnozsc
+jo-nike
joehillen
jomo
josanabr
julien
jxu
k3ns1n
+kangalio
kaspi
kayb94
kaz-us
@@ -1028,12 +1110,14 @@ knapior
kr4ssi
krichbanana
kurumigi
-lauren
+lauren n. liberda
lazypete365
light94
lightmare
linhua55
+linsui
lkho
+lkw123
llamasblade
llyyr
logon84
@@ -1060,6 +1144,7 @@ mehq
mexican porn commits
midas02
migbac
+milkknife
minusf
miseran
mjdubell
@@ -1070,10 +1155,13 @@ motophil
mpeter50
mrBliss
mrkrossxdx
+mrscrapy
mrtnmtth
mtilbury
+mushbite
mutantmonkey
mzbaulhaque
+mzhou
nawl
nemunaire
net
@@ -1086,6 +1174,7 @@ nikhil
nixxo
nmeum
nmrugg
+nnoboa
nomevi
nosoop
nto
@@ -1101,10 +1190,12 @@ opusforlife2
oteng
ouwou
ovitei
+oxamun
ozburo
pachacamac
panatexxa
patrickslin
+permunkle
peugeot
pgaig
phaer
@@ -1117,11 +1208,15 @@ pingtux
piplongrun
pishposhmcgee
plroman
+pmitchell86
+puc9
pukkandan
pulpe
pyed
pypy
+qbnu
quinlander
+qulaz
quyleanh
raleeper
rand-net
@@ -1131,10 +1226,12 @@ reddraggone9
reiv
remis
renalid
+rexlambert22
rhhayward
rhsmachine
rigstot
riking
+ringus1
rmanola
robbie
robin
@@ -1154,6 +1251,7 @@ sceext
schn0sch
schnusch
scil
+sepro
sh!zeeg
shirt
shirt-dev
@@ -1162,6 +1260,7 @@ sichuan-pepper
siddharth
siikamiika
skacurt
+skbeh
slangangular
slocum
smed79
@@ -1205,16 +1304,24 @@ tlsssl
tobi1805
tom
toniz4
+toomyzoom
+trainman261
trasssh
troywith77
+truedread
tsantala
tsia
u-spec-png
+unbeatable-101
+urectanc
user
utlasidyo
v-delta
+vampirefrog
venth
+vidiot720
vijayanand nandam
+viktor-enzell
vkorablin
vobe
vordep
@@ -1240,6 +1347,8 @@ zackmark29
zcanfly
zejn
zenerdi0de
+zhgwn
+zhong-yiyu
zootedb0t
zouhair
zraktvor
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 0ed1eb4..372587b 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -42,7 +42,9 @@ Before reporting any issue, type `doas pacman -Sy hypervideo`. This should repor
### Is the issue already documented?
-Make sure that someone has not already opened the issue you're trying to open. Search at the top of the window or browse the [GitHub Issues](https://github.com/ytdl-org/youtube-dl/search?type=Issues) of this repository. If there is an issue, feel free to write something along the lines of "This affects me as well, with version 2015.01.01. Here is some more information on the issue: ...". While some issues may be old, a new post into them often spurs rapid activity.
+Make sure that someone has not already opened the issue you're trying to open. Search at the top of the window or browse the [GitHub Issues](https://github.com/yt-dlp/yt-dlp/search?type=Issues) of this repository. If there is an issue, subscribe to it to be notified when there is any progress. Unless you have something useful to add to the conversation, please refrain from commenting.
+
+It is also helpful to check whether the issue has already been documented in the [youtube-dl issue tracker](https://github.com/ytdl-org/youtube-dl/issues). If similar issues have already been reported in youtube-dl (but not in our issue tracker), links to them can be included in your issue report here.
### Why are existing options not enough?
@@ -66,6 +68,28 @@ Only post features that you (or an incapacitated friend you can personally talk
### Is your question about hypervideo?
+Some bug reports are completely unrelated to hypervideo and relate to a different, or even the reporter's own, application. Please make sure that you are actually using hypervideo. If you are using a UI for hypervideo, report the bug to the maintainer of the actual application providing the UI. In general, if you are unable to provide the verbose log, you should not be opening the issue here.
+
+If the issue is with `youtube-dl` (the upstream project from which hypervideo is derived) and not with hypervideo itself, the issue should be raised in the youtube-dl project.
+
+### Are you willing to share account details if needed?
+
+The maintainers and potential contributors of the project often do not have an account for the website you are asking support for, so any developer interested in solving your issue may ask you for account details. It is at your personal discretion whether you are willing to share the account in order for the developer to try and solve your issue. However, if you are unwilling or unable to provide details, they obviously cannot work on the issue, and it cannot be solved unless some developer who both has an account and is willing/able to contribute decides to solve it.
+
+By sharing an account with anyone, you agree to bear all risks associated with it. The maintainers and yt-dlp can't be held responsible for any misuse of the credentials.
+
+While these steps won't necessarily ensure that no misuse of the account takes place, they are still good practices to follow.
+
+- Look for people with the `Member` (maintainers of the project) or `Contributor` (people who have previously contributed code) tag on their messages.
+- Change the password before sharing the account to something random (use [this](https://passwordsgenerator.net/) if you don't have a random password generator).
+- Change the password after receiving the account back.
+
+### Is the website primarily used for piracy?
+
+We follow [youtube-dl's policy](https://github.com/ytdl-org/youtube-dl#can-you-add-support-for-this-anime-video-site-or-site-which-shows-current-movies-for-free) of not supporting services that are primarily used for infringing copyright. Additionally, it has been decided not to support porn sites that specialize in fakes. We also cannot support any service that serves only [DRM protected content](https://en.wikipedia.org/wiki/Digital_rights_management).
+
+
+
It may sound strange, but some bug reports we receive are completely unrelated to hypervideo and relate to a different, or even the reporter's own, application. Please make sure that you are actually using hypervideo. If you are using a UI for hypervideo, report the bug to the maintainer of the actual application providing the UI. On the other hand, if your UI for hypervideo fails in some way you believe is related to hypervideo, by all means, go ahead and report the bug.
# DEVELOPER INSTRUCTIONS
@@ -74,7 +98,7 @@ Most users do not need to build hypervideo and can [download the builds](https:/
To run hypervideo as a developer, you don't need to build anything either. Simply execute
- python -m youtube_dl
+ python -m hypervideo_dl
To run the test, simply invoke your favorite test runner, or execute a test file directly; any of the following work:
@@ -108,7 +132,7 @@ After you have ensured this site is distributing its content legally, you can fo
cd hypervideo
git checkout -b yourextractor
-4. Start with this simple template and save it to `youtube_dl/extractor/yourextractor.py`:
+4. Start with this simple template and save it to `hypervideo_dl/extractor/yourextractor.py`:
```python
# coding: utf-8
@@ -147,21 +171,21 @@ After you have ensured this site is distributing its content legally, you can fo
'title': title,
'description': self._og_search_description(webpage),
'uploader': self._search_regex(r'<div[^>]+id="uploader"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False),
- # TODO more properties (see youtube_dl/extractor/common.py)
+ # TODO more properties (see hypervideo_dl/extractor/common.py)
}
```
-5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/extractors.py).
+5. Add an import in [`hypervideo_dl/extractor/extractors.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/extractors.py).
6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries (a sketch of such a list is shown after these steps). The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note that tests with an `only_matching` key in the test's dict are not counted in.
-7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/ytdl-org/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303). Add tests and code for as many as you want.
+7. Have a look at [`hypervideo_dl/extractor/common.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/ytdl-org/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303). Add tests and code for as many as you want.
8. Make sure your code follows [hypervideo coding conventions](#hypervideo-coding-conventions) and check the code with [flake8](https://flake8.pycqa.org/en/latest/index.html#quickstart):
- $ flake8 youtube_dl/extractor/yourextractor.py
+ $ flake8 hypervideo_dl/extractor/yourextractor.py
9. Make sure your code works under all [Python](https://www.python.org/) versions claimed to be supported by hypervideo, namely 2.6, 2.7, and 3.2+.
10. When the tests pass, [add](https://git-scm.com/docs/git-add) the new files and [commit](https://git-scm.com/docs/git-commit) them and [push](https://git-scm.com/docs/git-push) the result, like this:
- $ git add youtube_dl/extractor/extractors.py
- $ git add youtube_dl/extractor/yourextractor.py
+ $ git add hypervideo_dl/extractor/extractors.py
+ $ git add hypervideo_dl/extractor/yourextractor.py
$ git commit -m '[yourextractor] Add new extractor'
$ git push origin yourextractor
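
As mentioned in step 6, here is a minimal sketch of what a multi-test `_TESTS` list might look like; every URL, ID and field value below is illustrative rather than taken from a real site:

```python
_TESTS = [{
    'url': 'https://yourextractor.com/watch/42',
    'md5': 'TODO: md5 sum of the first 10241 bytes of the video file',
    'info_dict': {
        'id': '42',
        'ext': 'mp4',
        'title': 'Video title goes here',
        'thumbnail': r're:^https?://.*\.jpg$',
    },
}, {
    # URLs that only need to be recognized by _VALID_URL,
    # without actually downloading anything, are marked with 'only_matching'
    'url': 'https://yourextractor.com/embed/42',
    'only_matching': True,
}]
```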
@@ -173,7 +197,8 @@ In any case, thank you very much for your contributions!
This section introduces guidelines for writing idiomatic, robust and future-proof extractor code.
-Extractors are very fragile by nature since they depend on the layout of the source data provided by 3rd party media hosters out of your control and this layout tends to change. As an extractor implementer your task is not only to write code that will extract media links and metadata correctly but also to minimize dependency on the source's layout and even to make the code foresee potential future changes and be ready for that. This is important because it will allow the extractor not to break on minor layout changes thus keeping old hypervideo versions working. Even though this breakage issue is easily fixed by emitting a new version of hypervideo with a fix incorporated, all the previous versions become broken in all repositories and distros' packages that may not be so prompt in fetching the update from us. Needless to say, some non rolling release distros may never receive an update at all.
+Extractors are very fragile by nature since they depend on the layout of the source data provided by 3rd-party media hosters outside your control, and this layout tends to change. As an extractor implementer, your task is not only to write code that extracts media links and metadata correctly, but also to minimize dependency on the source's layout and even to anticipate potential future changes. This is important because it allows the extractor to survive minor layout changes, thus keeping old yt-dlp versions working. Even though such breakage is easily fixed by releasing a new version of yt-dlp, a release can take some time, during which the extractor remains broken.
+
### Mandatory and optional metafields
@@ -239,6 +264,46 @@ description = self._search_regex(
On failure this code will silently continue the extraction with `description` set to `None`. That is useful for metafields that may or may not be present.
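
The hunk above shows only the first line of that call; as a minimal sketch (with an illustrative regex, not the actual code elided by the hunk), such a non-fatal lookup looks like:

```python
# fatal=False makes _search_regex return None instead of raising
# an error when the pattern does not match
description = self._search_regex(
    r'<meta[^>]+name="description"[^>]+content="([^"]+)"',
    webpage, 'description', fatal=False)
```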
+
+Another thing to remember is not to try to iterate over `None`.
+
+Say you extracted a list of thumbnails into `thumbnail_data` and want to iterate over them:
+
+```python
+thumbnail_data = data.get('thumbnails') or []
+thumbnails = [{
+ 'url': item['url'],
+ 'height': item.get('h'),
+} for item in thumbnail_data if item.get('url')] # correct
+```
+
+and not like:
+
+```python
+thumbnail_data = data.get('thumbnails')
+thumbnails = [{
+ 'url': item['url'],
+ 'height': item.get('h'),
+} for item in thumbnail_data] # incorrect
+```
+
+In this case, `thumbnail_data` will be `None` if the field was not found, and this will cause the loop `for item in thumbnail_data` to raise a fatal error. Using `or []` avoids this error and leaves `thumbnails` as an empty list instead.
+
+Alternatively, this can be further simplified by using `traverse_obj`:
+
+```python
+thumbnails = [{
+ 'url': item['url'],
+ 'height': item.get('h'),
+} for item in traverse_obj(data, ('thumbnails', lambda _, v: v['url']))]
+```
+
+or, even better,
+
+```python
+thumbnails = traverse_obj(data, ('thumbnails', ..., {'url': 'url', 'height': 'h'}))
+```
+
### Provide fallbacks
When extracting metadata, try to do so from multiple sources. For example, if `title` is present in several places, try extracting it from at least some of them. This makes the extractor more future-proof in case some of the sources become unavailable.
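
For instance, a fallback chain for `title` might look like the following sketch, assuming `data` holds parsed JSON metadata and `webpage` the downloaded HTML (the key names and regex are illustrative):

```python
title = (
    traverse_obj(data, ('video', 'title'))           # primary source: JSON metadata
    or self._og_search_title(webpage, default=None)  # fallback: Open Graph tags
    or self._html_search_regex(                      # last resort: raw HTML
        r'<h1[^>]*>([^<]+)</h1>', webpage, 'title', fatal=False))
```

If the site stops embedding the JSON, the Open Graph and HTML fallbacks keep older versions of the extractor working until a proper fix lands.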
@@ -407,7 +472,7 @@ Incorrect:
### Use convenience conversion and parsing functions
-Wrap all extracted numeric data into safe functions from [`youtube_dl/utils.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/utils.py): `int_or_none`, `float_or_none`. Use them for string to number conversions as well.
+Wrap all extracted numeric data into safe functions from [`hypervideo_dl/utils.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/utils.py): `int_or_none`, `float_or_none`. Use them for string to number conversions as well.
Use `url_or_none` for safe URL processing.
@@ -415,7 +480,7 @@ Use `try_get` for safe metadata extraction from parsed JSON.
Use `unified_strdate` for uniform `upload_date` or any `YYYYMMDD` meta field extraction, `unified_timestamp` for uniform `timestamp` extraction, `parse_filesize` for `filesize` extraction, `parse_count` for count meta fields extraction, `parse_resolution`, `parse_duration` for `duration` extraction, `parse_age_limit` for `age_limit` extraction.
-Explore [`youtube_dl/utils.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/utils.py) for more useful convenience functions.
+Explore [`hypervideo_dl/utils.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/utils.py) for more useful convenience functions.
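
A short sketch of how these helpers are typically combined; `data` and its keys are illustrative:

```python
from hypervideo_dl.utils import (
    int_or_none,
    parse_duration,
    unified_strdate,
    url_or_none,
)

info = {
    'view_count': int_or_none(data.get('views')),           # '1234' -> 1234; None stays None
    'duration': parse_duration(data.get('length')),         # '1:02:03' -> 3723 (seconds)
    'upload_date': unified_strdate(data.get('published')),  # '2023-07-06' -> '20230706'
    'thumbnail': url_or_none(data.get('thumb')),            # invalid or relative URLs -> None
}
```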
#### More examples
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
index f2a1368..6b9b9f4 100644
--- a/CONTRIBUTORS
+++ b/CONTRIBUTORS
@@ -2,7 +2,8 @@ pukkandan (owner)
shirt-dev (collaborator)
coletdjnz/colethedj (collaborator)
Ashish0804 (collaborator)
-nao20010128nao/Lesmiscore (collaborator)
+bashonly (collaborator)
+Grub4K (collaborator)
h-h-h-h
pauldubois98
nixxo
@@ -295,7 +296,6 @@ Mehavoid
winterbird-code
yashkc2025
aldoridhoni
-bashonly
jacobtruman
masta79
palewire
@@ -319,7 +319,6 @@ columndeeply
DoubleCouponDay
Fabi019
GautamMKGarg
-Grub4K
itachi-19
jeroenj
josanabr
@@ -357,3 +356,114 @@ SG5
the-marenga
tkgmomosheep
vitkhab
+glensc
+synthpop123
+tntmod54321
+milkknife
+Bnyro
+CapacitorSet
+stelcodes
+skbeh
+muddi900
+digitall
+chengzhicn
+mexus
+JChris246
+redraskal
+Spicadox
+barsnick
+docbender
+KurtBestor
+Chrissi2812
+FrederikNS
+gschizas
+JC-Chung
+mzhou
+OndrejBakan
+ab4cbef
+aionescu
+amra
+ByteDream
+carusocr
+chexxor
+felixonmars
+FrankZ85
+FriedrichRehren
+gregsadetsky
+LeoniePhiline
+LowSuggestion912
+Matumo
+OIRNOIR
+OMEGARAZER
+oxamun
+pmitchell86
+qbnu
+qulaz
+rebane2001
+road-master
+rohieb
+sdht0
+seproDev
+Hill-98
+LXYan2333
+mushbite
+venkata-krishnas
+7vlad7
+alexklapheke
+arobase-che
+bepvte
+bergoid
+blmarket
+brandon-dacrib
+c-basalt
+CoryTibbettsDev
+Cyberes
+D0LLYNH0
+danog
+DataGhost
+falbrechtskirchinger
+foreignBlade
+garret1317
+hasezoey
+hoaluvn
+ItzMaxTV
+ivanskodje
+jo-nike
+kangalio
+linsui
+makew0rld
+menschel
+mikf
+mrscrapy
+NDagestad
+Neurognostic
+NextFire
+nick-cd
+permunkle
+pzhlkj6612
+ringus1
+rjy
+Schmoaaaaah
+sjthespian
+theperfectpunk
+toomyzoom
+truedread
+TxI5
+unbeatable-101
+vampirefrog
+vidiot720
+viktor-enzell
+zhgwn
+barthelmannk
+berkanteber
+OverlordQ
+rexlambert22
+Ti4eeT4e
+AmanSal1
+bbilly1
+meliber
+nnoboa
+rdamas
+RfadnjdExt
+urectanc
+nao20010128nao/Lesmiscore
diff --git a/Changelog.md b/Changelog.md
index 1a39d29..9073814 100644
--- a/Changelog.md
+++ b/Changelog.md
@@ -1,15 +1,712 @@
# Changelog
<!--
-# Instuctions for creating release
-
-* Run `make doc`
-* Update Changelog.md and CONTRIBUTORS
-* Change "Based on ytdl" version in Readme.md if needed
-* Commit as `Release <version>` and push to master
-* Dispatch the workflow https://github.com/yt-dlp/yt-dlp/actions/workflows/build.yml on master
+# To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master
-->
+### 2023.07.06
+
+#### Important changes
+- Security: [[CVE-2023-35934](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-35934)] Fix [Cookie leak](https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-v8mc-9377-rwjj)
+ - `--add-header Cookie:` is deprecated and auto-scoped to input URL domains
+ - Cookies are scoped when passed to external downloaders
+ - Add `cookies` field to info.json and deprecate `http_headers.Cookie`
+
+#### Core changes
+- [Allow extractors to mark formats as potentially DRM](https://github.com/yt-dlp/yt-dlp/commit/bc344cd456380999c1ee74554dfd432a38f32ec7) ([#7396](https://github.com/yt-dlp/yt-dlp/issues/7396)) by [pukkandan](https://github.com/pukkandan)
+- [Bugfix for b4e0d75848e9447cee2cd3646ce54d4744a7ff56](https://github.com/yt-dlp/yt-dlp/commit/e59e20744eb32ce4b6ea0dece7c673be8376a710) by [pukkandan](https://github.com/pukkandan)
+- [Change how `Cookie` headers are handled](https://github.com/yt-dlp/yt-dlp/commit/3121512228487c9c690d3d39bfd2579addf96e07) by [Grub4K](https://github.com/Grub4K)
+- [Prevent `Cookie` leaks on HTTP redirect](https://github.com/yt-dlp/yt-dlp/commit/f8b4bcc0a791274223723488bfbfc23ea3276641) by [coletdjnz](https://github.com/coletdjnz)
+- **formats**: [Fix best fallback for storyboards](https://github.com/yt-dlp/yt-dlp/commit/906c0bdcd8974340d619e99ccd613c163eb0d0c2) by [pukkandan](https://github.com/pukkandan)
+- **outtmpl**: [Pad `playlist_index` etc even when with internal formatting](https://github.com/yt-dlp/yt-dlp/commit/47bcd437247152e0af5b3ebc5592db7bb66855c2) by [pukkandan](https://github.com/pukkandan)
+- **utils**: clean_podcast_url: [Handle protocol in redirect URL](https://github.com/yt-dlp/yt-dlp/commit/91302ed349f34dc26cc1d661bb45a4b71f4417f7) by [pukkandan](https://github.com/pukkandan)
+
+#### Extractor changes
+- **abc**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/8f05fbae2a79ce0713077ccc68b354e63216bf20) ([#7434](https://github.com/yt-dlp/yt-dlp/issues/7434)) by [meliber](https://github.com/meliber)
+- **AdultSwim**: [Extract subtitles from m3u8](https://github.com/yt-dlp/yt-dlp/commit/5e16cf92eb496b7c1541a6b1d727cb87542984db) ([#7421](https://github.com/yt-dlp/yt-dlp/issues/7421)) by [nnoboa](https://github.com/nnoboa)
+- **crunchyroll**: music: [Fix `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/5b4b92769afcc398475e481bfa839f1158902fe9) ([#7439](https://github.com/yt-dlp/yt-dlp/issues/7439)) by [AmanSal1](https://github.com/AmanSal1), [rdamas](https://github.com/rdamas)
+- **Douyin**: [Fix extraction from webpage](https://github.com/yt-dlp/yt-dlp/commit/a2be9781fbf4d7e4db245c277ca2ecc41cf3a7b2) by [bashonly](https://github.com/bashonly)
+- **googledrive**: [Fix source format extraction](https://github.com/yt-dlp/yt-dlp/commit/3b7f5300c577fef40464d46d4e4037a69d51fe82) ([#7395](https://github.com/yt-dlp/yt-dlp/issues/7395)) by [RfadnjdExt](https://github.com/RfadnjdExt)
+- **kick**: [Fix `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/ef8509c300ea50da86aea447eb214d3d6f6db6bb) by [bashonly](https://github.com/bashonly)
+- **qdance**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/f0a1ff118145b6449982ba401f9a9f656ecd8062) ([#7420](https://github.com/yt-dlp/yt-dlp/issues/7420)) by [bashonly](https://github.com/bashonly)
+- **sbs**: [Python 3.7 compat](https://github.com/yt-dlp/yt-dlp/commit/f393bbe724b1fc6c7f754a5da507e807b2b40ad2) by [pukkandan](https://github.com/pukkandan)
+- **stacommu**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/af1fd12f675220df6793fc019dff320bc76e8080) ([#7432](https://github.com/yt-dlp/yt-dlp/issues/7432)) by [urectanc](https://github.com/urectanc)
+- **twitter**
+ - [Fix unauthenticated extraction](https://github.com/yt-dlp/yt-dlp/commit/49296437a8e5fa91dacb5446e51ab588474c85d3) ([#7476](https://github.com/yt-dlp/yt-dlp/issues/7476)) by [bashonly](https://github.com/bashonly)
+ - spaces: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/1cffd621cb371f1563563cfb2fe37d137e8a7bee) ([#7512](https://github.com/yt-dlp/yt-dlp/issues/7512)) by [bashonly](https://github.com/bashonly)
+- **vidlii**: [Handle relative URLs](https://github.com/yt-dlp/yt-dlp/commit/ad8902f616ad2541f9b9626738f1393fad89a64c) by [pukkandan](https://github.com/pukkandan)
+- **vk**: VKPlay, VKPlayLive: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/8776349ef6b1f644584a92dfa00a05208a48edc4) ([#7358](https://github.com/yt-dlp/yt-dlp/issues/7358)) by [c-basalt](https://github.com/c-basalt)
+- **youtube**
+ - [Add extractor-arg `formats`](https://github.com/yt-dlp/yt-dlp/commit/58786a10f212bd63f9ad1d0b4d9e4d31c3b385e2) by [pukkandan](https://github.com/pukkandan)
+ - [Avoid false DRM detection](https://github.com/yt-dlp/yt-dlp/commit/94ed638a437fc766699d440e978982e24ce6a30a) ([#7396](https://github.com/yt-dlp/yt-dlp/issues/7396)) by [pukkandan](https://github.com/pukkandan)
+ - [Fix comments' `is_favorited`](https://github.com/yt-dlp/yt-dlp/commit/89bed013741a776506f60380b7fd89d27d0710b4) ([#7390](https://github.com/yt-dlp/yt-dlp/issues/7390)) by [bbilly1](https://github.com/bbilly1)
+ - [Ignore incomplete data for comment threads by default](https://github.com/yt-dlp/yt-dlp/commit/4dc4d8473c085900edc841c87c20041233d25b1f) ([#7475](https://github.com/yt-dlp/yt-dlp/issues/7475)) by [coletdjnz](https://github.com/coletdjnz)
+ - [Process `post_live` over 2 hours](https://github.com/yt-dlp/yt-dlp/commit/d949c10c45bfc359bdacd52e6a180169b8128958) by [pukkandan](https://github.com/pukkandan)
+ - stories: [Remove](https://github.com/yt-dlp/yt-dlp/commit/90db9a3c00ca80492c6a58c542e4cbf4c2710866) ([#7459](https://github.com/yt-dlp/yt-dlp/issues/7459)) by [pukkandan](https://github.com/pukkandan)
+ - tab: [Support shorts-only playlists](https://github.com/yt-dlp/yt-dlp/commit/fcbc9ed760be6e3455bbadfaf277b4504b06f068) ([#7425](https://github.com/yt-dlp/yt-dlp/issues/7425)) by [coletdjnz](https://github.com/coletdjnz)
+
+#### Downloader changes
+- **aria2c**: [Add `--no-conf`](https://github.com/yt-dlp/yt-dlp/commit/8a8af356e3bba98a7f7d333aff0777d5d92130c8) by [pukkandan](https://github.com/pukkandan)
+- **external**: [Scope cookies](https://github.com/yt-dlp/yt-dlp/commit/1ceb657bdd254ad961489e5060f2ccc7d556b729) by [bashonly](https://github.com/bashonly), [coletdjnz](https://github.com/coletdjnz)
+- **http**: [Avoid infinite loop when no data is received](https://github.com/yt-dlp/yt-dlp/commit/662ef1e910b72e57957f06589925b2332ba52821) by [pukkandan](https://github.com/pukkandan)
+
+#### Misc. changes
+- [Add CodeQL workflow](https://github.com/yt-dlp/yt-dlp/commit/6355b5f1e1e8e7f4ef866d71d51e03baf0e82f17) ([#7497](https://github.com/yt-dlp/yt-dlp/issues/7497)) by [jorgectf](https://github.com/jorgectf)
+- **cleanup**: Miscellaneous: [337734d](https://github.com/yt-dlp/yt-dlp/commit/337734d4a8a6500bc65434843db346b5cbd05e81) by [pukkandan](https://github.com/pukkandan)
+- **docs**: [Minor fixes](https://github.com/yt-dlp/yt-dlp/commit/b532a3481046e1eabb6232ee8196fb696c356ff6) by [pukkandan](https://github.com/pukkandan)
+- **make_changelog**: [Skip reverted commits](https://github.com/yt-dlp/yt-dlp/commit/fa44802809d189fca0f4782263d48d6533384503) by [pukkandan](https://github.com/pukkandan)
+
+### 2023.06.22
+
+#### Core changes
+- [Fix bug in db3ad8a67661d7b234a6954d9c6a4a9b1749f5eb](https://github.com/yt-dlp/yt-dlp/commit/d7cd97e8d8d42b500fea9abb2aa4ac9b0f98b2ad) by [pukkandan](https://github.com/pukkandan)
+- [Improve `--download-sections`](https://github.com/yt-dlp/yt-dlp/commit/b4e0d75848e9447cee2cd3646ce54d4744a7ff56) by [pukkandan](https://github.com/pukkandan)
+ - Support negative time-ranges
+ - Add `*from-url` to obey time-ranges in URL
+- [Indicate `filesize` approximated from `tbr` better](https://github.com/yt-dlp/yt-dlp/commit/0dff8e4d1e6e9fb938f4256ea9af7d81f42fd54f) by [pukkandan](https://github.com/pukkandan)
+
+#### Extractor changes
+- [Support multiple `_VALID_URL`s](https://github.com/yt-dlp/yt-dlp/commit/5fd8367496b42c7b900b896a0d5460561a2859de) ([#5812](https://github.com/yt-dlp/yt-dlp/issues/5812)) by [nixxo](https://github.com/nixxo)
+- **dplay**: GlobalCyclingNetworkPlus: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/774aa09dd6aa61ced9ec818d1f67e53414d22762) ([#7360](https://github.com/yt-dlp/yt-dlp/issues/7360)) by [bashonly](https://github.com/bashonly)
+- **dropout**: [Fix season extraction](https://github.com/yt-dlp/yt-dlp/commit/db22142f6f817ff673d417b4b78e8db497bf8ab3) ([#7304](https://github.com/yt-dlp/yt-dlp/issues/7304)) by [OverlordQ](https://github.com/OverlordQ)
+- **motherless**: [Add gallery support, fix groups](https://github.com/yt-dlp/yt-dlp/commit/f2ff0f6f1914b82d4a51681a72cc0828115dcb4a) ([#7211](https://github.com/yt-dlp/yt-dlp/issues/7211)) by [rexlambert22](https://github.com/rexlambert22), [Ti4eeT4e](https://github.com/Ti4eeT4e)
+- **nebula**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/3f756c8c4095b942cf49788eb0862ceaf57847f2) ([#7156](https://github.com/yt-dlp/yt-dlp/issues/7156)) by [Lamieur](https://github.com/Lamieur), [rohieb](https://github.com/rohieb)
+- **rheinmaintv**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/98cb1eda7a4cf67c96078980dbd63e6c06ad7f7c) ([#7311](https://github.com/yt-dlp/yt-dlp/issues/7311)) by [barthelmannk](https://github.com/barthelmannk)
+- **youtube**
+ - [Add `ios` to default clients used](https://github.com/yt-dlp/yt-dlp/commit/1e75d97db21152acc764b30a688e516f04b8a142) by [pukkandan](https://github.com/pukkandan)
+ - IOS is affected neither by 403 nor by nsig, so it helps mitigate them preemptively
+ - IOS also has higher bit-rate 'premium' formats though they are not labeled as such
+ - [Improve description parsing performance](https://github.com/yt-dlp/yt-dlp/commit/71dc18fa29263a1ff0472c23d81bfc8dd4422d48) ([#7315](https://github.com/yt-dlp/yt-dlp/issues/7315)) by [berkanteber](https://github.com/berkanteber), [pukkandan](https://github.com/pukkandan)
+ - [Improve nsig function name extraction](https://github.com/yt-dlp/yt-dlp/commit/cd810afe2ac5567c822b7424800fc470ef2d0045) by [pukkandan](https://github.com/pukkandan)
+ - [Workaround 403 for android formats](https://github.com/yt-dlp/yt-dlp/commit/81ca451480051d7ce1a31c017e005358345a9149) by [pukkandan](https://github.com/pukkandan)
+
+#### Misc. changes
+- [Revert "Add automatic duplicate issue detection"](https://github.com/yt-dlp/yt-dlp/commit/a4486bfc1dc7057efca9dd3fe70d7fa25c56f700) by [pukkandan](https://github.com/pukkandan)
+- **cleanup**
+ - Miscellaneous
+ - [7f9c6a6](https://github.com/yt-dlp/yt-dlp/commit/7f9c6a63b16e145495479e9f666f5b9e2ee69e2f) by [bashonly](https://github.com/bashonly)
+ - [812cdfa](https://github.com/yt-dlp/yt-dlp/commit/812cdfa06c33a40e73a8e04b3e6f42c084666a43) by [pukkandan](https://github.com/pukkandan)
+
+### 2023.06.21
+
+#### Important changes
+- YouTube: Improved throttling and signature fixes
+
+#### Core changes
+- [Add `--compat-option playlist-match-filter`](https://github.com/yt-dlp/yt-dlp/commit/93b39cdbd9dcf351bfa0c4ee252805b4617fdca9) by [pukkandan](https://github.com/pukkandan)
+- [Add `--no-quiet`](https://github.com/yt-dlp/yt-dlp/commit/d669772c65e8630162fd6555d0a578b246591921) by [pukkandan](https://github.com/pukkandan)
+- [Add option `--color`](https://github.com/yt-dlp/yt-dlp/commit/8417f26b8a819cd7ffcd4e000ca3e45033e670fb) ([#6904](https://github.com/yt-dlp/yt-dlp/issues/6904)) by [Grub4K](https://github.com/Grub4K)
+- [Add option `--netrc-cmd`](https://github.com/yt-dlp/yt-dlp/commit/db3ad8a67661d7b234a6954d9c6a4a9b1749f5eb) ([#6682](https://github.com/yt-dlp/yt-dlp/issues/6682)) by [NDagestad](https://github.com/NDagestad), [pukkandan](https://github.com/pukkandan)
+- [Add option `--xff`](https://github.com/yt-dlp/yt-dlp/commit/c16644642b08e2bf4130a6c5fa01395d8718c990) by [pukkandan](https://github.com/pukkandan)
+- [Auto-select default format in `-f-`](https://github.com/yt-dlp/yt-dlp/commit/372a0f3b9dadd1e52234b498aa4c7040ef868c7d) ([#7101](https://github.com/yt-dlp/yt-dlp/issues/7101)) by [ivanskodje](https://github.com/ivanskodje), [pukkandan](https://github.com/pukkandan)
+- [Deprecate internal `Youtubedl-no-compression` header](https://github.com/yt-dlp/yt-dlp/commit/955c89584b66fcd0fcfab3e611f1edeb1ca63886) ([#6876](https://github.com/yt-dlp/yt-dlp/issues/6876)) by [coletdjnz](https://github.com/coletdjnz)
+- [Do not translate newlines in `--print-to-file`](https://github.com/yt-dlp/yt-dlp/commit/9874e82b5a61582169300bea561b3e8899ad1ef7) by [pukkandan](https://github.com/pukkandan)
+- [Ensure pre-processor errors do not block `--print`](https://github.com/yt-dlp/yt-dlp/commit/f005a35aa7e4f67a0c603a946c0dd714c151b2d6) by [pukkandan](https://github.com/pukkandan) (With fixes in [17ba434](https://github.com/yt-dlp/yt-dlp/commit/17ba4343cf99701692a7f4798fd42b50f644faba))
+- [Fix `filepath` being copied to underlying format dict](https://github.com/yt-dlp/yt-dlp/commit/84078a8b38f403495d00b46654c8750774d821de) by [pukkandan](https://github.com/pukkandan)
+- [Improve HTTP redirect handling](https://github.com/yt-dlp/yt-dlp/commit/08916a49c777cb6e000eec092881eb93ec22076c) ([#7094](https://github.com/yt-dlp/yt-dlp/issues/7094)) by [coletdjnz](https://github.com/coletdjnz)
+- [Populate `filename` and `urls` fields at all stages of `--print`](https://github.com/yt-dlp/yt-dlp/commit/170605840ea9d5ad75da6576485ea7d125b428ee) by [pukkandan](https://github.com/pukkandan) (With fixes in [b5f61b6](https://github.com/yt-dlp/yt-dlp/commit/b5f61b69d4561b81fc98c226b176f0c15493e688))
+- [Relaxed validation for numeric format filters](https://github.com/yt-dlp/yt-dlp/commit/c3f624ef0a5d7a6ae1c5ffeb243087e9fc7d79dc) by [pukkandan](https://github.com/pukkandan)
+- [Support decoding multiple content encodings](https://github.com/yt-dlp/yt-dlp/commit/daafbf49b3482edae4d70dd37070be99742a926e) ([#7142](https://github.com/yt-dlp/yt-dlp/issues/7142)) by [coletdjnz](https://github.com/coletdjnz)
+- [Support loading info.json with a list at its root](https://github.com/yt-dlp/yt-dlp/commit/ab1de9cb1e39cf421c2b7dc6756c6ff1955bb313) by [pukkandan](https://github.com/pukkandan)
+- [Work around erroneous urllib Windows proxy parsing](https://github.com/yt-dlp/yt-dlp/commit/3f66b6fe50f8d5b545712f8b19d5ae62f5373980) ([#7092](https://github.com/yt-dlp/yt-dlp/issues/7092)) by [coletdjnz](https://github.com/coletdjnz)
+- **cookies**
+ - [Defer extraction of v11 key from keyring](https://github.com/yt-dlp/yt-dlp/commit/9b7a48abd1b187eae1e3f6c9839c47d43ccec00b) by [Grub4K](https://github.com/Grub4K)
+ - [Move `YoutubeDLCookieJar` to cookies module](https://github.com/yt-dlp/yt-dlp/commit/b87e01c123fd560b6a674ce00f45a9459d82d98a) ([#7091](https://github.com/yt-dlp/yt-dlp/issues/7091)) by [coletdjnz](https://github.com/coletdjnz)
+ - [Support custom Safari cookies path](https://github.com/yt-dlp/yt-dlp/commit/a58182b75a05fe0a10c5e94a536711d3ade19c20) ([#6783](https://github.com/yt-dlp/yt-dlp/issues/6783)) by [NextFire](https://github.com/NextFire)
+ - [Update for chromium changes](https://github.com/yt-dlp/yt-dlp/commit/b38d4c941d1993ab27e4c0f8e024e23c2ec0f8f8) ([#6897](https://github.com/yt-dlp/yt-dlp/issues/6897)) by [mbway](https://github.com/mbway)
+- **Cryptodome**: [Fix `__bool__`](https://github.com/yt-dlp/yt-dlp/commit/98ac902c4979e4529b166e873473bef42baa2e3e) by [pukkandan](https://github.com/pukkandan)
+- **jsinterp**
+ - [Do not compile regex](https://github.com/yt-dlp/yt-dlp/commit/7aeda6cc9e73ada0b0a0b6a6748c66bef63a20a8) by [pukkandan](https://github.com/pukkandan)
+ - [Fix division](https://github.com/yt-dlp/yt-dlp/commit/b4a252fba81f53631c07ca40ce7583f5d19a8a36) ([#7279](https://github.com/yt-dlp/yt-dlp/issues/7279)) by [bashonly](https://github.com/bashonly)
+ - [Fix global object extraction](https://github.com/yt-dlp/yt-dlp/commit/01aba2519a0884ef17d5f85608dbd2a455577147) by [pukkandan](https://github.com/pukkandan)
+ - [Handle `NaN` in bitwise operators](https://github.com/yt-dlp/yt-dlp/commit/1d7656184c6b8aa46b29149893894b3c24f1df00) by [pukkandan](https://github.com/pukkandan)
+ - [Handle negative numbers better](https://github.com/yt-dlp/yt-dlp/commit/7cf51f21916292cd80bdeceb37489f5322f166dd) by [pukkandan](https://github.com/pukkandan)
+- **outtmpl**
+ - [Allow `\n` in replacements and default](https://github.com/yt-dlp/yt-dlp/commit/78fde6e3398ff11e5d383a66b28664badeab5180) by [pukkandan](https://github.com/pukkandan)
+ - [Fix some minor bugs](https://github.com/yt-dlp/yt-dlp/commit/ebe1b4e34f43c3acad30e4bcb8484681a030c114) by [pukkandan](https://github.com/pukkandan) (With fixes in [1619ab3](https://github.com/yt-dlp/yt-dlp/commit/1619ab3e67d8dc4f86fc7ed292c79345bc0d91a0))
+ - [Support `str.format` syntax inside replacements](https://github.com/yt-dlp/yt-dlp/commit/ec9311c41b111110bc52cfbd6ea682c6fb23f77a) by [pukkandan](https://github.com/pukkandan)
+- **update**
+ - [Better error handling](https://github.com/yt-dlp/yt-dlp/commit/d2e84d5eb01c66fc5304e8566348d65a7be24ed7) by [pukkandan](https://github.com/pukkandan)
+ - [Do not restart into versions without `--update-to`](https://github.com/yt-dlp/yt-dlp/commit/02948a17d903f544363bb20b51a6d8baed7bba08) by [pukkandan](https://github.com/pukkandan)
+ - [Implement `--update-to` repo](https://github.com/yt-dlp/yt-dlp/commit/665472a7de3880578c0b7b3f95c71570c056368e) by [Grub4K](https://github.com/Grub4K), [pukkandan](https://github.com/pukkandan)
+- **upstream**
+ - [Merged with youtube-dl 07af47](https://github.com/yt-dlp/yt-dlp/commit/42f2d40b475db66486a4b4fe5b56751a640db5db) by [pukkandan](https://github.com/pukkandan)
+ - [Merged with youtube-dl d1c6c5](https://github.com/yt-dlp/yt-dlp/commit/4823ec9f461512daa1b8ab362893bb86a6320b26) by [pukkandan](https://github.com/pukkandan) (With fixes in [edbe5b5](https://github.com/yt-dlp/yt-dlp/commit/edbe5b589dd0860a67b4e03f58db3cd2539d91c2) by [bashonly](https://github.com/bashonly))
+- **utils**
+ - `FormatSorter`: [Improve `size` and `br`](https://github.com/yt-dlp/yt-dlp/commit/eedda5252c05327748dede204a8fccafa0288118) by [pukkandan](https://github.com/pukkandan), [u-spec-png](https://github.com/u-spec-png)
+ - `js_to_json`: [Implement template strings](https://github.com/yt-dlp/yt-dlp/commit/0898c5c8ccadfc404472456a7a7751b72afebadd) ([#6623](https://github.com/yt-dlp/yt-dlp/issues/6623)) by [Grub4K](https://github.com/Grub4K)
+ - `locked_file`: [Fix for virtiofs](https://github.com/yt-dlp/yt-dlp/commit/45998b3e371b819ce0dbe50da703809a048cc2fe) ([#6840](https://github.com/yt-dlp/yt-dlp/issues/6840)) by [brandon-dacrib](https://github.com/brandon-dacrib)
+ - `strftime_or_none`: [Handle negative timestamps](https://github.com/yt-dlp/yt-dlp/commit/a35af4306d24c56c6358f89cdf204860d1cd62b4) by [dirkf](https://github.com/dirkf), [pukkandan](https://github.com/pukkandan)
+ - `traverse_obj` (a usage sketch follows this list)
+ - [Allow iterables in traversal](https://github.com/yt-dlp/yt-dlp/commit/21b5ec86c2c37d10c5bb97edd7051d3aac16bb3e) ([#6902](https://github.com/yt-dlp/yt-dlp/issues/6902)) by [Grub4K](https://github.com/Grub4K)
+ - [More fixes](https://github.com/yt-dlp/yt-dlp/commit/b079c26f0af8085bccdadc72c61c8164ca5ab0f8) ([#6959](https://github.com/yt-dlp/yt-dlp/issues/6959)) by [Grub4K](https://github.com/Grub4K)
+ - `write_string`: [Fix noconsole behavior](https://github.com/yt-dlp/yt-dlp/commit/3b479100df02e20dd949e046003ae96ddbfced57) by [Grub4K](https://github.com/Grub4K)
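+
+A minimal usage sketch of `traverse_obj` path traversal from `yt_dlp.utils`; the sample data is invented for illustration:
+
+```python
+from yt_dlp.utils import traverse_obj
+
+data = {"entries": [{"id": "a", "stats": {"views": 10}}, {"id": "b"}]}
+
+# `...` branches over every list element, collecting each "id"
+print(traverse_obj(data, ("entries", ..., "id")))            # ['a', 'b']
+# Missing keys yield the default (None) instead of raising
+print(traverse_obj(data, ("entries", 1, "stats", "views")))  # None
+```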
+
+#### Extractor changes
+- [Do not exit early for unsuitable `url_result`](https://github.com/yt-dlp/yt-dlp/commit/baa922b5c74b10e3b86ff5e6cf6529b3aae8efab) by [pukkandan](https://github.com/pukkandan)
+- [Do not warn for invalid chapter data in description](https://github.com/yt-dlp/yt-dlp/commit/84ffeb7d5e72e3829319ba7720a8480fc4c7503b) by [pukkandan](https://github.com/pukkandan)
+- [Extract more metadata from ISM](https://github.com/yt-dlp/yt-dlp/commit/f68434cc74cfd3db01b266476a2eac8329fbb267) by [pukkandan](https://github.com/pukkandan)
+- **abematv**: [Add fallback for title and description extraction and extract more metadata](https://github.com/yt-dlp/yt-dlp/commit/c449c0655d7c8549e6e1389c26b628053b253d39) ([#6994](https://github.com/yt-dlp/yt-dlp/issues/6994)) by [Lesmiscore](https://github.com/Lesmiscore)
+- **acast**: [Support embeds](https://github.com/yt-dlp/yt-dlp/commit/c91ac833ea99b00506e470a44cf930e4e23378c9) ([#7212](https://github.com/yt-dlp/yt-dlp/issues/7212)) by [pabs3](https://github.com/pabs3)
+- **adobepass**: [Handle `Charter_Direct` MSO as `Spectrum`](https://github.com/yt-dlp/yt-dlp/commit/ea0570820336a0fe9c3b530d1b0d1e59313274f4) ([#6824](https://github.com/yt-dlp/yt-dlp/issues/6824)) by [bashonly](https://github.com/bashonly)
+- **aeonco**: [Support YouTube embeds](https://github.com/yt-dlp/yt-dlp/commit/ed81b74802b4247ee8d9dc0ef87eb52baefede1c) ([#6591](https://github.com/yt-dlp/yt-dlp/issues/6591)) by [alexklapheke](https://github.com/alexklapheke)
+- **afreecatv**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/fdd69db38924c38194ef236b26325d66ac815c88) ([#6283](https://github.com/yt-dlp/yt-dlp/issues/6283)) by [blmarket](https://github.com/blmarket)
+- **ARDBetaMediathek**: [Add thumbnail](https://github.com/yt-dlp/yt-dlp/commit/f78eb41e1c0f1dcdb10317358a26bf541dc7ee15) ([#6890](https://github.com/yt-dlp/yt-dlp/issues/6890)) by [StefanLobbenmeier](https://github.com/StefanLobbenmeier)
+- **bibeltv**: [Fix extraction, support live streams and series](https://github.com/yt-dlp/yt-dlp/commit/4ad58667c102bd82a7c4cca8aa395ec1682e3b4c) ([#6505](https://github.com/yt-dlp/yt-dlp/issues/6505)) by [flashdagger](https://github.com/flashdagger)
+- **bilibili**
+ - [Support festival videos](https://github.com/yt-dlp/yt-dlp/commit/ab29e47029e2f5b48abbbab78e82faf7cf6e9506) ([#6547](https://github.com/yt-dlp/yt-dlp/issues/6547)) by [qbnu](https://github.com/qbnu)
+ - SpaceVideo: [Extract signature](https://github.com/yt-dlp/yt-dlp/commit/6f10cdcf7eeaeae5b75e0a4428cd649c156a2d83) ([#7149](https://github.com/yt-dlp/yt-dlp/issues/7149)) by [elyse0](https://github.com/elyse0)
+- **biliIntl**: [Add comment extraction](https://github.com/yt-dlp/yt-dlp/commit/b093c38cc9f26b59a8504211d792f053142c847d) ([#6079](https://github.com/yt-dlp/yt-dlp/issues/6079)) by [HobbyistDev](https://github.com/HobbyistDev)
+- **bitchute**: [Add more fallback subdomains](https://github.com/yt-dlp/yt-dlp/commit/0c4e0fbcade0fc92d14c2a6d63e360fe067f6192) ([#6907](https://github.com/yt-dlp/yt-dlp/issues/6907)) by [Neurognostic](https://github.com/Neurognostic)
+- **booyah**: [Remove extractor](https://github.com/yt-dlp/yt-dlp/commit/f7f7a877bf8e87fd4eb0ad2494ad948ca7691114) by [pukkandan](https://github.com/pukkandan)
+- **BrainPOP**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/979568f26ece80bca72b48f0dd57d676e431059a) ([#6106](https://github.com/yt-dlp/yt-dlp/issues/6106)) by [MinePlayersPE](https://github.com/MinePlayersPE)
+- **bravotv**
+ - [Detect DRM](https://github.com/yt-dlp/yt-dlp/commit/1fe5bf240e6ade487d18079a62aa36bcc440a27a) ([#7171](https://github.com/yt-dlp/yt-dlp/issues/7171)) by [bashonly](https://github.com/bashonly)
+ - [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/06966cb8966b9aa4f60ab9c44c182a057d4ca3a3) ([#6568](https://github.com/yt-dlp/yt-dlp/issues/6568)) by [bashonly](https://github.com/bashonly)
+- **camfm**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/4cbfa570a1b9bd65b0f48770693377e8d842dcb0) ([#7083](https://github.com/yt-dlp/yt-dlp/issues/7083)) by [garret1317](https://github.com/garret1317)
+- **cbc**
+ - [Fix live extractor, playlist `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/7a7b1376fbce0067cf37566bb47131bc0022638d) ([#6625](https://github.com/yt-dlp/yt-dlp/issues/6625)) by [makew0rld](https://github.com/makew0rld)
+ - [Ignore 426 from API](https://github.com/yt-dlp/yt-dlp/commit/4afb208cf07b59291ae3b0c4efc83945ee5b8812) ([#6781](https://github.com/yt-dlp/yt-dlp/issues/6781)) by [jo-nike](https://github.com/jo-nike)
+ - gem: [Update `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/871c907454693940cb56906ed9ea49fcb7154829) ([#6499](https://github.com/yt-dlp/yt-dlp/issues/6499)) by [makeworld-the-better-one](https://github.com/makeworld-the-better-one)
+- **cbs**: [Add `ParamountPressExpress` extractor](https://github.com/yt-dlp/yt-dlp/commit/44369c9afa996e14e9f466754481d878811b5b4a) ([#6604](https://github.com/yt-dlp/yt-dlp/issues/6604)) by [bashonly](https://github.com/bashonly)
+- **cbsnews**: [Overhaul extractors](https://github.com/yt-dlp/yt-dlp/commit/f6e43d6fa9804c24525e1fed0a87782754dab7ed) ([#6681](https://github.com/yt-dlp/yt-dlp/issues/6681)) by [bashonly](https://github.com/bashonly)
+- **chilloutzone**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/6f4fc5660f40f3458882a8f51601eae4af7be609) ([#6445](https://github.com/yt-dlp/yt-dlp/issues/6445)) by [bashonly](https://github.com/bashonly)
+- **clipchamp**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/2f07c4c1da4361af213e5791279b9d152d2e4ce3) ([#6978](https://github.com/yt-dlp/yt-dlp/issues/6978)) by [bashonly](https://github.com/bashonly)
+- **comedycentral**: [Add support for movies](https://github.com/yt-dlp/yt-dlp/commit/66468bbf49562ff82670cbbd456c5e8448a6df34) ([#7108](https://github.com/yt-dlp/yt-dlp/issues/7108)) by [sqrtNOT](https://github.com/sqrtNOT)
+- **crtvg**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/26c517b29c8727e47948d6fff749d5297f0efb60) ([#7168](https://github.com/yt-dlp/yt-dlp/issues/7168)) by [ItzMaxTV](https://github.com/ItzMaxTV)
+- **crunchyroll**: [Rework with support for movies, music and artists](https://github.com/yt-dlp/yt-dlp/commit/032de83ea9ff2f4977d9c71a93bbc1775597b762) ([#6237](https://github.com/yt-dlp/yt-dlp/issues/6237)) by [Grub4K](https://github.com/Grub4K)
+- **dacast**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/c25cac2f8e5fbac2737a426d7778fd2f0efc5381) ([#6896](https://github.com/yt-dlp/yt-dlp/issues/6896)) by [bashonly](https://github.com/bashonly)
+- **daftsex**: [Update domain and embed player URL](https://github.com/yt-dlp/yt-dlp/commit/fc5a7f9b27d2a89b1f3ca7d33a95301c21d832cd) ([#5966](https://github.com/yt-dlp/yt-dlp/issues/5966)) by [JChris246](https://github.com/JChris246)
+- **DigitalConcertHall**: [Support films](https://github.com/yt-dlp/yt-dlp/commit/55ed4ff73487feb3177b037dfc2ea527e777da3e) ([#7202](https://github.com/yt-dlp/yt-dlp/issues/7202)) by [ItzMaxTV](https://github.com/ItzMaxTV)
+- **discogs**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/6daaf21092888beff11b807cd46f832f1f9c46a0) ([#6624](https://github.com/yt-dlp/yt-dlp/issues/6624)) by [rjy](https://github.com/rjy)
+- **dlf**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/b423b6a48e0b19260bc95ab7d72d2138d7f124dc) ([#6697](https://github.com/yt-dlp/yt-dlp/issues/6697)) by [nick-cd](https://github.com/nick-cd)
+- **drtv**: [Fix radio page extraction](https://github.com/yt-dlp/yt-dlp/commit/9a06b7b1891b48cebbe275652ae8025a36d97d97) ([#6552](https://github.com/yt-dlp/yt-dlp/issues/6552)) by [viktor-enzell](https://github.com/viktor-enzell)
+- **Dumpert**: [Fix m3u8 and support new URL pattern](https://github.com/yt-dlp/yt-dlp/commit/f8ae441501596733e2b967430471643a1d7cacb8) ([#6091](https://github.com/yt-dlp/yt-dlp/issues/6091)) by [DataGhost](https://github.com/DataGhost), [pukkandan](https://github.com/pukkandan)
+- **elevensports**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/ecfe47973f6603b5367fe2cc3c65274627d94516) ([#7172](https://github.com/yt-dlp/yt-dlp/issues/7172)) by [ItzMaxTV](https://github.com/ItzMaxTV)
+- **ettutv**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/83465fc4100a2fb2c188898fbc2f3021f6a9b4dd) ([#6579](https://github.com/yt-dlp/yt-dlp/issues/6579)) by [elyse0](https://github.com/elyse0)
+- **europarl**: [Rewrite extractor](https://github.com/yt-dlp/yt-dlp/commit/03789976d301eaed3e957dbc041573098f6af059) ([#7114](https://github.com/yt-dlp/yt-dlp/issues/7114)) by [HobbyistDev](https://github.com/HobbyistDev)
+- **eurosport**: [Improve `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/45e87ea106ad37b2a002663fa30ee41ce97b16cd) ([#7076](https://github.com/yt-dlp/yt-dlp/issues/7076)) by [HobbyistDev](https://github.com/HobbyistDev)
+- **facebook**: [Fix metadata extraction](https://github.com/yt-dlp/yt-dlp/commit/3b52a606881e6adadc33444abdeacce562b79330) ([#6856](https://github.com/yt-dlp/yt-dlp/issues/6856)) by [ringus1](https://github.com/ringus1)
+- **foxnews**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/97d60ad8cd6c99f01e463a9acfce8693aff2a609) ([#7222](https://github.com/yt-dlp/yt-dlp/issues/7222)) by [bashonly](https://github.com/bashonly)
+- **funker530**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/cab94a0cd8b6d3fffed5a6faff030274adbed182) ([#7291](https://github.com/yt-dlp/yt-dlp/issues/7291)) by [Cyberes](https://github.com/Cyberes)
+- **generic**
+ - [Accept values for `fragment_query`, `variant_query`](https://github.com/yt-dlp/yt-dlp/commit/5cc0a8fd2e9fec50026fb92170b57993af939e4a) ([#6600](https://github.com/yt-dlp/yt-dlp/issues/6600)) by [bashonly](https://github.com/bashonly) (With fixes in [9bfe0d1](https://github.com/yt-dlp/yt-dlp/commit/9bfe0d15bd7dbdc6b0e6378fa9f5e2e289b2373b))
+ - [Add extractor-args `hls_key`, `variant_query`](https://github.com/yt-dlp/yt-dlp/commit/c2e0fc40a73dd85ab3920f977f579d475e66ef59) ([#6567](https://github.com/yt-dlp/yt-dlp/issues/6567)) by [bashonly](https://github.com/bashonly)
+ - [Attempt to detect live HLS](https://github.com/yt-dlp/yt-dlp/commit/93e7c6995e07dafb9dcc06c0d06acf6c5bdfecc5) ([#6775](https://github.com/yt-dlp/yt-dlp/issues/6775)) by [bashonly](https://github.com/bashonly)
+- **genius**: [Add support for articles](https://github.com/yt-dlp/yt-dlp/commit/460da07439718d9af1e3661da2a23e05a913a2e6) ([#6474](https://github.com/yt-dlp/yt-dlp/issues/6474)) by [bashonly](https://github.com/bashonly)
+- **globalplayer**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/30647668a92a0ca5cd108776804baac0996bd9f7) ([#6903](https://github.com/yt-dlp/yt-dlp/issues/6903)) by [garret1317](https://github.com/garret1317)
+- **gmanetwork**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/2d97d154fe4fb84fe2ed3a4e1ed5819e89b71e88) ([#5945](https://github.com/yt-dlp/yt-dlp/issues/5945)) by [HobbyistDev](https://github.com/HobbyistDev)
+- **gronkh**: [Extract duration and chapters](https://github.com/yt-dlp/yt-dlp/commit/9c92b803fa24e48543ce969468d5404376e315b7) ([#6817](https://github.com/yt-dlp/yt-dlp/issues/6817)) by [satan1st](https://github.com/satan1st)
+- **hentaistigma**: [Remove extractor](https://github.com/yt-dlp/yt-dlp/commit/04f8018a0544736a18494bc3899d06b05b78fae6) by [pukkandan](https://github.com/pukkandan)
+- **hidive**: [Fix login](https://github.com/yt-dlp/yt-dlp/commit/e6ab678e36c40ded0aae305bbb866cdab554d417) by [pukkandan](https://github.com/pukkandan)
+- **hollywoodreporter**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/6bdb64e2a2a6d504d8ce1dc830fbfb8a7f199c63) ([#6614](https://github.com/yt-dlp/yt-dlp/issues/6614)) by [bashonly](https://github.com/bashonly)
+- **hotstar**: [Support `/shows/` URLs](https://github.com/yt-dlp/yt-dlp/commit/7f8ddebbb51c9fd4a347306332a718ba41b371b8) ([#7225](https://github.com/yt-dlp/yt-dlp/issues/7225)) by [bashonly](https://github.com/bashonly)
+- **hrefli**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/7e35526d5b970a034b9d76215ee3e4bd7631edcd) ([#6762](https://github.com/yt-dlp/yt-dlp/issues/6762)) by [selfisekai](https://github.com/selfisekai)
+- **idolplus**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/5c14b213679ed4401288bdc86ae696932e219222) ([#6732](https://github.com/yt-dlp/yt-dlp/issues/6732)) by [ping](https://github.com/ping)
+- **iq**: [Set more language codes](https://github.com/yt-dlp/yt-dlp/commit/2d5cae9636714ff922d28c548c349d5f2b48f317) ([#6476](https://github.com/yt-dlp/yt-dlp/issues/6476)) by [D0LLYNH0](https://github.com/D0LLYNH0)
+- **iwara**
+ - [Accept old URLs](https://github.com/yt-dlp/yt-dlp/commit/ab92d8651c48d247dfb7d3f0a824cc986e47c7ed) by [Lesmiscore](https://github.com/Lesmiscore)
+ - [Fix authentication](https://github.com/yt-dlp/yt-dlp/commit/0a5d7c39e17bb9bd50c9db42bcad40eb82d7f784) ([#7137](https://github.com/yt-dlp/yt-dlp/issues/7137)) by [toomyzoom](https://github.com/toomyzoom)
+ - [Fix format sorting](https://github.com/yt-dlp/yt-dlp/commit/56793f74c36899742d7abd52afb0deca97d469e1) ([#6651](https://github.com/yt-dlp/yt-dlp/issues/6651)) by [hasezoey](https://github.com/hasezoey)
+ - [Fix typo](https://github.com/yt-dlp/yt-dlp/commit/d1483ec693c79f0b4ddf493870bcb840aca4da08) by [Lesmiscore](https://github.com/Lesmiscore)
+ - [Implement login](https://github.com/yt-dlp/yt-dlp/commit/21b9413cf7dd4830b2ece57af21589dd4538fc52) ([#6721](https://github.com/yt-dlp/yt-dlp/issues/6721)) by [toomyzoom](https://github.com/toomyzoom)
+ - [Overhaul extractors](https://github.com/yt-dlp/yt-dlp/commit/c14af7a741931b364bab3d9546c0f4359f318f8c) ([#6557](https://github.com/yt-dlp/yt-dlp/issues/6557)) by [Lesmiscore](https://github.com/Lesmiscore)
+ - [Report private videos](https://github.com/yt-dlp/yt-dlp/commit/95a383be1b6fb00c92ee3fb091732c4f6009acb6) ([#6641](https://github.com/yt-dlp/yt-dlp/issues/6641)) by [Lesmiscore](https://github.com/Lesmiscore)
+- **JStream**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/3459d3c5af3b2572ed51e8ecfda6c11022a838c6) ([#6252](https://github.com/yt-dlp/yt-dlp/issues/6252)) by [Lesmiscore](https://github.com/Lesmiscore)
+- **jwplatform**: [Update `_extract_embed_urls`](https://github.com/yt-dlp/yt-dlp/commit/cf9fd52fabe71d6e7c30d3ea525029ffa561fc9c) ([#6383](https://github.com/yt-dlp/yt-dlp/issues/6383)) by [carusocr](https://github.com/carusocr)
+- **kick**: [Make initial request non-fatal](https://github.com/yt-dlp/yt-dlp/commit/0a6918a4a1431960181d8c50e0bbbcb0afbaff9a) by [bashonly](https://github.com/bashonly)
+- **LastFM**: [Rewrite playlist extraction](https://github.com/yt-dlp/yt-dlp/commit/026435714cb7c39613a0d7d2acd15d3823b78d94) ([#6379](https://github.com/yt-dlp/yt-dlp/issues/6379)) by [hatienl0i261299](https://github.com/hatienl0i261299), [pukkandan](https://github.com/pukkandan)
+- **lbry**: [Extract original quality formats](https://github.com/yt-dlp/yt-dlp/commit/44c0d66442b568d9e1359e669d8b029b08a77fa7) ([#7257](https://github.com/yt-dlp/yt-dlp/issues/7257)) by [bashonly](https://github.com/bashonly)
+- **line**: [Remove extractors](https://github.com/yt-dlp/yt-dlp/commit/faa0332ed69e070cf3bd31390589a596e962f392) ([#6734](https://github.com/yt-dlp/yt-dlp/issues/6734)) by [sian1468](https://github.com/sian1468)
+- **livestream**: [Support videos with account id](https://github.com/yt-dlp/yt-dlp/commit/bfdf144c7e5d7a93fbfa9d8e65598c72bf2b542a) ([#6324](https://github.com/yt-dlp/yt-dlp/issues/6324)) by [theperfectpunk](https://github.com/theperfectpunk)
+- **medaltv**: [Fix clips](https://github.com/yt-dlp/yt-dlp/commit/1e3c2b6ec28d7ab5e31341fa93c47b65be4fbff4) ([#6502](https://github.com/yt-dlp/yt-dlp/issues/6502)) by [xenova](https://github.com/xenova)
+- **mediastream**: [Improve `WinSports` and embed extraction](https://github.com/yt-dlp/yt-dlp/commit/03025b6e105139d01cd415ddc51fd692957fd2ba) ([#6426](https://github.com/yt-dlp/yt-dlp/issues/6426)) by [bashonly](https://github.com/bashonly)
+- **mgtv**: [Fix formats extraction](https://github.com/yt-dlp/yt-dlp/commit/59d9fe08312bbb76ee26238d207a8ca35410a48d) ([#7234](https://github.com/yt-dlp/yt-dlp/issues/7234)) by [bashonly](https://github.com/bashonly)
+- **Mzaalo**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/dc3c44f349ba85af320e706e2a27ad81a78b1c6e) ([#7163](https://github.com/yt-dlp/yt-dlp/issues/7163)) by [ItzMaxTV](https://github.com/ItzMaxTV)
+- **nbc**: [Fix `NBCStations` direct mp4 formats](https://github.com/yt-dlp/yt-dlp/commit/9be0fe1fd967f62cbf3c60bd14e1021a70abc147) ([#6637](https://github.com/yt-dlp/yt-dlp/issues/6637)) by [bashonly](https://github.com/bashonly)
+- **nebula**: [Add `beta.nebula.tv`](https://github.com/yt-dlp/yt-dlp/commit/cbfe2e5cbe0f4649a91e323a82b8f5f774f36662) ([#6516](https://github.com/yt-dlp/yt-dlp/issues/6516)) by [unbeatable-101](https://github.com/unbeatable-101)
+- **nekohacker**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/489f51279d00318018478fd7461eddbe3b45297e) ([#7003](https://github.com/yt-dlp/yt-dlp/issues/7003)) by [hasezoey](https://github.com/hasezoey)
+- **nhk**
+ - [Add `NhkRadiru` extractor](https://github.com/yt-dlp/yt-dlp/commit/8f0be90ecb3b8d862397177bb226f17b245ef933) ([#6819](https://github.com/yt-dlp/yt-dlp/issues/6819)) by [garret1317](https://github.com/garret1317)
+ - [Fix API extraction](https://github.com/yt-dlp/yt-dlp/commit/f41b949a2ef646fbc36375febbe3f0c19d742c0f) ([#7180](https://github.com/yt-dlp/yt-dlp/issues/7180)) by [menschel](https://github.com/menschel), [sjthespian](https://github.com/sjthespian)
+ - `NhkRadiruLive`: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/81c8b9bdd9841b72cbfc1bbff9dab5fb4aa038b0) ([#7332](https://github.com/yt-dlp/yt-dlp/issues/7332)) by [garret1317](https://github.com/garret1317)
+- **niconico**
+ - [Download comments from the new endpoint](https://github.com/yt-dlp/yt-dlp/commit/52ecc33e221f7de7eb6fed6c22489f0c5fdd2c6d) ([#6773](https://github.com/yt-dlp/yt-dlp/issues/6773)) by [Lesmiscore](https://github.com/Lesmiscore)
+ - live: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/f8f9250fe280d37f0988646cd5cc0072f4d33a6d) ([#5764](https://github.com/yt-dlp/yt-dlp/issues/5764)) by [Lesmiscore](https://github.com/Lesmiscore)
+ - series: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/c86e433c35fe5da6cb29f3539eef97497f84ed38) ([#6898](https://github.com/yt-dlp/yt-dlp/issues/6898)) by [sqrtNOT](https://github.com/sqrtNOT)
+- **nubilesporn**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/d4e6ef40772e0560a8ed33b844ef7549e86837be) ([#6231](https://github.com/yt-dlp/yt-dlp/issues/6231)) by [permunkle](https://github.com/permunkle)
+- **odnoklassniki**: [Fix formats extraction](https://github.com/yt-dlp/yt-dlp/commit/1a2eb5bda51d8b7a78a65acebf72a0dcf9da196b) ([#7217](https://github.com/yt-dlp/yt-dlp/issues/7217)) by [bashonly](https://github.com/bashonly)
+- **opencast**
+ - [Add ltitools to `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/3588be59cee429a0ab5c4ceb2f162298bb44147d) ([#6371](https://github.com/yt-dlp/yt-dlp/issues/6371)) by [C0D3D3V](https://github.com/C0D3D3V)
+ - [Fix format bug](https://github.com/yt-dlp/yt-dlp/commit/89dbf0848370deaa55af88c3593a2a264124caf5) ([#6512](https://github.com/yt-dlp/yt-dlp/issues/6512)) by [C0D3D3V](https://github.com/C0D3D3V)
+- **owncloud**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/c6d4b82a8b8bce59b1c9ce5e6d349ea428dac0a7) ([#6533](https://github.com/yt-dlp/yt-dlp/issues/6533)) by [C0D3D3V](https://github.com/C0D3D3V)
+- **Parler**: [Rewrite extractor](https://github.com/yt-dlp/yt-dlp/commit/80ea6d3dea8483cddd39fc89b5ee1fc06670c33c) ([#6446](https://github.com/yt-dlp/yt-dlp/issues/6446)) by [JChris246](https://github.com/JChris246)
+- **pgatour**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/3ae182ad89e1427ff7b1684d6a44ff93fa857a0c) ([#6613](https://github.com/yt-dlp/yt-dlp/issues/6613)) by [bashonly](https://github.com/bashonly)
+- **playsuisse**: [Support new URL format](https://github.com/yt-dlp/yt-dlp/commit/94627c5dde12a72766bdba36e056916c29c40ed1) ([#6528](https://github.com/yt-dlp/yt-dlp/issues/6528)) by [sbor23](https://github.com/sbor23)
+- **polskieradio**: [Improve extractors](https://github.com/yt-dlp/yt-dlp/commit/738c90a463257634455ada3e5c18b714c531dede) ([#5948](https://github.com/yt-dlp/yt-dlp/issues/5948)) by [selfisekai](https://github.com/selfisekai)
+- **pornez**: [Support new URL formats](https://github.com/yt-dlp/yt-dlp/commit/cbdf9408e6f1e35e98fd6477b3d6902df5b8a47f) ([#6792](https://github.com/yt-dlp/yt-dlp/issues/6792)) by [zhgwn](https://github.com/zhgwn)
+- **pornhub**: [Set access cookies to fix extraction](https://github.com/yt-dlp/yt-dlp/commit/62beefa818c75c20b6941389bb197051554a5d41) ([#6685](https://github.com/yt-dlp/yt-dlp/issues/6685)) by [arobase-che](https://github.com/arobase-che), [Schmoaaaaah](https://github.com/Schmoaaaaah)
+- **rai**: [Rewrite extractors](https://github.com/yt-dlp/yt-dlp/commit/c6d3f81a4077aaf9cffc6aa2d0dec92f38e74bb0) ([#5940](https://github.com/yt-dlp/yt-dlp/issues/5940)) by [danog](https://github.com/danog), [nixxo](https://github.com/nixxo)
+- **recurbate**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/c2502cfed91415c7ccfff925fd3404d230046484) ([#6297](https://github.com/yt-dlp/yt-dlp/issues/6297)) by [mrscrapy](https://github.com/mrscrapy)
+- **reddit**
+ - [Add login support](https://github.com/yt-dlp/yt-dlp/commit/4d9280c9c853733534dda60486fa949bcca36c9e) ([#6950](https://github.com/yt-dlp/yt-dlp/issues/6950)) by [bashonly](https://github.com/bashonly)
+ - [Support cookies and short URLs](https://github.com/yt-dlp/yt-dlp/commit/7a6f6f24592a8065376f11a58e44878807732cf6) ([#6825](https://github.com/yt-dlp/yt-dlp/issues/6825)) by [bashonly](https://github.com/bashonly)
+- **rokfin**: [Reconstruct manifest URL](https://github.com/yt-dlp/yt-dlp/commit/7a6c8a0807941dd24fbf0d6172e811884f98e027) ([#6507](https://github.com/yt-dlp/yt-dlp/issues/6507)) by [vampirefrog](https://github.com/vampirefrog)
+- **rottentomatoes**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/2d306c03d6f2697fcbabb7da35aa62cc078359d3) ([#6844](https://github.com/yt-dlp/yt-dlp/issues/6844)) by [JChris246](https://github.com/JChris246)
+- **rozhlas**
+ - [Extract manifest formats](https://github.com/yt-dlp/yt-dlp/commit/e4cf7741f9302b3faa092962f2895b55cb3d89bb) ([#6590](https://github.com/yt-dlp/yt-dlp/issues/6590)) by [bashonly](https://github.com/bashonly)
+ - `MujRozhlas`: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/c2b801fea59628d5c873e06a0727fbf2051bbd1f) ([#7129](https://github.com/yt-dlp/yt-dlp/issues/7129)) by [stanoarn](https://github.com/stanoarn)
+- **rtvc**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/9b30cd3dfce83c2f0201b28a7a3ef44ab9722664) ([#6578](https://github.com/yt-dlp/yt-dlp/issues/6578)) by [elyse0](https://github.com/elyse0)
+- **rumble**
+ - [Detect timeline format](https://github.com/yt-dlp/yt-dlp/commit/78bc1868ff3352108ab2911033d1ac67a55f151e) by [pukkandan](https://github.com/pukkandan)
+ - [Fix videos without quality selection](https://github.com/yt-dlp/yt-dlp/commit/6994afc030d2a786d8032075ed71a14d7eac5a4f) by [pukkandan](https://github.com/pukkandan)
+- **sbs**: [Overhaul extractor for new API](https://github.com/yt-dlp/yt-dlp/commit/6a765f135ccb654861336ea27a2c1c24ea8e286f) ([#6839](https://github.com/yt-dlp/yt-dlp/issues/6839)) by [bashonly](https://github.com/bashonly), [dirkf](https://github.com/dirkf), [vidiot720](https://github.com/vidiot720)
+- **shemaroome**: [Pass `stream_key` header to downloader](https://github.com/yt-dlp/yt-dlp/commit/7bc92517463f5766e9d9b92c3823b5cf403c0e3d) ([#7224](https://github.com/yt-dlp/yt-dlp/issues/7224)) by [bashonly](https://github.com/bashonly)
+- **sonyliv**: [Fix login with token](https://github.com/yt-dlp/yt-dlp/commit/4815d35c191e7d375b94492a6486dd2ba43a8954) ([#7223](https://github.com/yt-dlp/yt-dlp/issues/7223)) by [bashonly](https://github.com/bashonly)
+- **stageplus**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/e5265dc6517478e589ee3c1ff0cb19bdf4e35ce1) ([#6838](https://github.com/yt-dlp/yt-dlp/issues/6838)) by [bashonly](https://github.com/bashonly)
+- **stripchat**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/f9213f8a2d7ba46b912afe1dd3ce6bb700a33d72) ([#7306](https://github.com/yt-dlp/yt-dlp/issues/7306)) by [foreignBlade](https://github.com/foreignBlade)
+- **substack**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/12037d8b0a578fcc78a5c8f98964e48ee6060e25) ([#7218](https://github.com/yt-dlp/yt-dlp/issues/7218)) by [bashonly](https://github.com/bashonly)
+- **sverigesradio**: [Support slug URLs](https://github.com/yt-dlp/yt-dlp/commit/5ee9a7d6e18ceea956e831994cf11c423979354f) ([#7220](https://github.com/yt-dlp/yt-dlp/issues/7220)) by [bashonly](https://github.com/bashonly)
+- **tagesschau**: [Fix single audio URLs](https://github.com/yt-dlp/yt-dlp/commit/af7585c824a1e405bd8afa46d87b4be322edc93c) ([#6626](https://github.com/yt-dlp/yt-dlp/issues/6626)) by [flashdagger](https://github.com/flashdagger)
+- **teamcoco**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/c459d45dd4d417fb80a52e1a04e607776a44baa4) ([#6437](https://github.com/yt-dlp/yt-dlp/issues/6437)) by [bashonly](https://github.com/bashonly)
+- **telecaribe**: [Expand livestream support](https://github.com/yt-dlp/yt-dlp/commit/69b2f838d3d3e37dc17367ef64d978db1bea45cf) ([#6601](https://github.com/yt-dlp/yt-dlp/issues/6601)) by [bashonly](https://github.com/bashonly)
+- **tencent**: [Fix fatal metadata extraction](https://github.com/yt-dlp/yt-dlp/commit/971d901d129403e875a04dd92109507a03fbc070) ([#7219](https://github.com/yt-dlp/yt-dlp/issues/7219)) by [bashonly](https://github.com/bashonly)
+- **thesun**: [Update `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/0181b9a1b31db3fde943f7cd3fe9662f23bff292) ([#6522](https://github.com/yt-dlp/yt-dlp/issues/6522)) by [hatienl0i261299](https://github.com/hatienl0i261299)
+- **tiktok**
+ - [Extract 1080p adaptive formats](https://github.com/yt-dlp/yt-dlp/commit/c2a1bdb00931969193f2a31ea27b9c66a07aaec2) ([#7228](https://github.com/yt-dlp/yt-dlp/issues/7228)) by [bashonly](https://github.com/bashonly)
+ - [Fix and improve metadata extraction](https://github.com/yt-dlp/yt-dlp/commit/925936908a3c3ee0e508621db14696b9f6a8b563) ([#6777](https://github.com/yt-dlp/yt-dlp/issues/6777)) by [bashonly](https://github.com/bashonly)
+ - [Fix mp3 formats](https://github.com/yt-dlp/yt-dlp/commit/8ceb07e870424c219dced8f4348729553f05c5cc) ([#6615](https://github.com/yt-dlp/yt-dlp/issues/6615)) by [bashonly](https://github.com/bashonly)
+ - [Fix resolution extraction](https://github.com/yt-dlp/yt-dlp/commit/ab6057ec80aa75db6303b8206916d00c376c622c) ([#7237](https://github.com/yt-dlp/yt-dlp/issues/7237)) by [puc9](https://github.com/puc9)
+ - [Improve `TikTokLive` extractor](https://github.com/yt-dlp/yt-dlp/commit/216bcb66d7dce0762767d751dad10650cb57da9d) ([#6520](https://github.com/yt-dlp/yt-dlp/issues/6520)) by [bashonly](https://github.com/bashonly)
+- **triller**: [Support short URLs, detect removed videos](https://github.com/yt-dlp/yt-dlp/commit/33b737bedf8383c0d00d4e1d06a5273dcdfdb756) ([#6636](https://github.com/yt-dlp/yt-dlp/issues/6636)) by [bashonly](https://github.com/bashonly)
+- **tv4**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/125ffaa1737dd04716f2f6fbb0595ad3eb7a4b1c) ([#5649](https://github.com/yt-dlp/yt-dlp/issues/5649)) by [dirkf](https://github.com/dirkf), [TxI5](https://github.com/TxI5)
+- **tvp**: [Use new API](https://github.com/yt-dlp/yt-dlp/commit/0c7ce146e4d2a84e656d78f6857952bfd25ab389) ([#6989](https://github.com/yt-dlp/yt-dlp/issues/6989)) by [selfisekai](https://github.com/selfisekai)
+- **tvplay**: [Remove outdated domains](https://github.com/yt-dlp/yt-dlp/commit/937264419f9bf375d5656785ae6e53282587c15d) ([#7106](https://github.com/yt-dlp/yt-dlp/issues/7106)) by [ivanskodje](https://github.com/ivanskodje)
+- **twitch**
+ - [Extract original size thumbnail](https://github.com/yt-dlp/yt-dlp/commit/80b732b7a9585b2a61e456dc0d2d014a439cbaee) ([#6629](https://github.com/yt-dlp/yt-dlp/issues/6629)) by [JC-Chung](https://github.com/JC-Chung)
+ - [Fix `is_live`](https://github.com/yt-dlp/yt-dlp/commit/0551511b45f7847f40e4314aa9e624e80d086539) ([#6500](https://github.com/yt-dlp/yt-dlp/issues/6500)) by [elyse0](https://github.com/elyse0)
+ - [Support mobile clips](https://github.com/yt-dlp/yt-dlp/commit/02312c03cf53eb1da24c9ad022ee79af26060733) ([#6699](https://github.com/yt-dlp/yt-dlp/issues/6699)) by [bepvte](https://github.com/bepvte)
+ - [Update `_CLIENT_ID` and add extractor-arg](https://github.com/yt-dlp/yt-dlp/commit/01231feb142e80828985aabdec04ac608e3d43e2) ([#7200](https://github.com/yt-dlp/yt-dlp/issues/7200)) by [bashonly](https://github.com/bashonly)
+ - vod: [Support links from schedule tab](https://github.com/yt-dlp/yt-dlp/commit/dbce5afa6bb61f6272ade613f2e9a3d66b88c7ea) ([#7071](https://github.com/yt-dlp/yt-dlp/issues/7071)) by [falbrechtskirchinger](https://github.com/falbrechtskirchinger)
+- **twitter**
+ - [Add login support](https://github.com/yt-dlp/yt-dlp/commit/d1795f4a6af99c976c9d3ea2dabe5cf4f8965d3c) ([#7258](https://github.com/yt-dlp/yt-dlp/issues/7258)) by [bashonly](https://github.com/bashonly)
+ - [Default to GraphQL, handle auth errors](https://github.com/yt-dlp/yt-dlp/commit/147e62fc584c3ea6fdb09bb7a47905df68553a22) ([#6957](https://github.com/yt-dlp/yt-dlp/issues/6957)) by [bashonly](https://github.com/bashonly)
+ - spaces: [Add `release_timestamp`](https://github.com/yt-dlp/yt-dlp/commit/1c16d9df5330819cc79ad588b24aa5b72765c168) ([#7186](https://github.com/yt-dlp/yt-dlp/issues/7186)) by [CeruleanSky](https://github.com/CeruleanSky)
+- **urplay**: [Extract all subtitles](https://github.com/yt-dlp/yt-dlp/commit/7bcd4813215ac98daa4949af2ffc677c78307a38) ([#7309](https://github.com/yt-dlp/yt-dlp/issues/7309)) by [hoaluvn](https://github.com/hoaluvn)
+- **voot**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/4f7b11cc1c1cebf598107e00cd7295588ed484da) ([#7227](https://github.com/yt-dlp/yt-dlp/issues/7227)) by [bashonly](https://github.com/bashonly)
+- **vrt**: [Overhaul extractors](https://github.com/yt-dlp/yt-dlp/commit/1a7dcca378e80a387923ee05c250d8ba122441c6) ([#6244](https://github.com/yt-dlp/yt-dlp/issues/6244)) by [bashonly](https://github.com/bashonly), [bergoid](https://github.com/bergoid), [jeroenj](https://github.com/jeroenj)
+- **weverse**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/b844a3f8b16500663e7ab6c6ec061cc9b30f71ac) ([#6711](https://github.com/yt-dlp/yt-dlp/issues/6711)) by [bashonly](https://github.com/bashonly) (With fixes in [fd5d93f](https://github.com/yt-dlp/yt-dlp/commit/fd5d93f7040f9776fd541f4e4079dad7d3b3fb4f))
+- **wevidi**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/1ea15603d852971ed7d92f4de12808b27b3d9370) ([#6868](https://github.com/yt-dlp/yt-dlp/issues/6868)) by [truedread](https://github.com/truedread)
+- **weyyak**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/6dc00acf0f1f1107a626c21befd1691403e6aeeb) ([#7124](https://github.com/yt-dlp/yt-dlp/issues/7124)) by [ItzMaxTV](https://github.com/ItzMaxTV)
+- **whyp**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/2c566ed14101673c651c08c306c30fa5b4010b85) ([#6803](https://github.com/yt-dlp/yt-dlp/issues/6803)) by [CoryTibbettsDev](https://github.com/CoryTibbettsDev)
+- **wrestleuniverse**
+ - [Fix cookies support](https://github.com/yt-dlp/yt-dlp/commit/c8561c6d03f025268d6d3972abeb47987c8d7cbb) by [bashonly](https://github.com/bashonly)
+ - [Fix extraction, add login](https://github.com/yt-dlp/yt-dlp/commit/ef8fb7f029b816dfc95600727d84400591a3b5c5) ([#6982](https://github.com/yt-dlp/yt-dlp/issues/6982)) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K)
+- **wykop**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/aed945e1b9b7d3af2a907e1a12e6508cc81d6a20) ([#6140](https://github.com/yt-dlp/yt-dlp/issues/6140)) by [selfisekai](https://github.com/selfisekai)
+- **ximalaya**: [Sort playlist entries](https://github.com/yt-dlp/yt-dlp/commit/8790ea7b2536332777bce68590386b1aa935fac7) ([#7292](https://github.com/yt-dlp/yt-dlp/issues/7292)) by [linsui](https://github.com/linsui)
+- **YahooGyaOIE, YahooGyaOPlayerIE**: [Delete extractors due to website closure](https://github.com/yt-dlp/yt-dlp/commit/68be95bd0ca3f76aa63c9812935bd826b3a42e53) ([#6218](https://github.com/yt-dlp/yt-dlp/issues/6218)) by [Lesmiscore](https://github.com/Lesmiscore)
+- **yappy**: YappyProfile: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/6f69101dc912690338d32e2aab085c32e44eba3f) ([#7346](https://github.com/yt-dlp/yt-dlp/issues/7346)) by [7vlad7](https://github.com/7vlad7)
+- **youku**: [Improve error message](https://github.com/yt-dlp/yt-dlp/commit/ef0848abd425dfda6db62baa8d72897eefb0007f) ([#6690](https://github.com/yt-dlp/yt-dlp/issues/6690)) by [carusocr](https://github.com/carusocr)
+- **youporn**: [Extract m3u8 formats](https://github.com/yt-dlp/yt-dlp/commit/ddae33754ae1f32dd9c64cf895c47d20f6b5f336) by [pukkandan](https://github.com/pukkandan)
+- **youtube**
+ - [Add client name to `format_note` when `-v`](https://github.com/yt-dlp/yt-dlp/commit/c795c39f27244cbce846067891827e4847036441) ([#6254](https://github.com/yt-dlp/yt-dlp/issues/6254)) by [Lesmiscore](https://github.com/Lesmiscore), [pukkandan](https://github.com/pukkandan)
+ - [Add extractor-arg `include_duplicate_formats`](https://github.com/yt-dlp/yt-dlp/commit/86cb922118b236306310a72657f70426c20e28bb) by [pukkandan](https://github.com/pukkandan)
+ - [Bypass throttling for `-f17`](https://github.com/yt-dlp/yt-dlp/commit/c9abebb851e6188cb34b9eb744c1863dd46af919) by [pukkandan](https://github.com/pukkandan)
+ - [Construct fragment list lazily](https://github.com/yt-dlp/yt-dlp/commit/2a23d92d9ec44a0168079e38bcf3d383e5c4c7bb) by [pukkandan](https://github.com/pukkandan) (With fixes in [e389d17](https://github.com/yt-dlp/yt-dlp/commit/e389d172b6f42e4f332ae679dc48543fb7b9b61d))
+ - [Define strict uploader metadata mapping](https://github.com/yt-dlp/yt-dlp/commit/7666b93604b97e9ada981c6b04ccf5605dd1bd44) ([#6384](https://github.com/yt-dlp/yt-dlp/issues/6384)) by [coletdjnz](https://github.com/coletdjnz)
+ - [Determine audio language using automatic captions](https://github.com/yt-dlp/yt-dlp/commit/ff9b0e071ffae5543cc309e6f9e647ac51e5846e) by [pukkandan](https://github.com/pukkandan)
+ - [Extract `channel_is_verified`](https://github.com/yt-dlp/yt-dlp/commit/8213ce28a485e200f6a7e1af1434a987c8e702bd) ([#7213](https://github.com/yt-dlp/yt-dlp/issues/7213)) by [coletdjnz](https://github.com/coletdjnz)
+ - [Extract `heatmap` data](https://github.com/yt-dlp/yt-dlp/commit/5caf30dbc34f10b0be60676fece635b5c59f0d72) ([#7100](https://github.com/yt-dlp/yt-dlp/issues/7100)) by [tntmod54321](https://github.com/tntmod54321)
+ - [Extract more metadata for comments](https://github.com/yt-dlp/yt-dlp/commit/c35448b7b14113b35c4415dbfbf488c4731f006f) ([#7179](https://github.com/yt-dlp/yt-dlp/issues/7179)) by [coletdjnz](https://github.com/coletdjnz)
+ - [Extract uploader metadata for feed/playlist items](https://github.com/yt-dlp/yt-dlp/commit/93e12ed76ef49252dc6869b59d21d0777e5e11af) by [coletdjnz](https://github.com/coletdjnz)
+ - [Fix comment loop detection for pinned comments](https://github.com/yt-dlp/yt-dlp/commit/141a8dff98874a426d7fbe772e0a8421bb42656f) ([#6714](https://github.com/yt-dlp/yt-dlp/issues/6714)) by [coletdjnz](https://github.com/coletdjnz)
+ - [Fix continuation loop with no comments](https://github.com/yt-dlp/yt-dlp/commit/18f8fba7c89a87f99cc3313a1795848867e84fff) ([#7148](https://github.com/yt-dlp/yt-dlp/issues/7148)) by [coletdjnz](https://github.com/coletdjnz)
+ - [Fix parsing `comment_count`](https://github.com/yt-dlp/yt-dlp/commit/071670cbeaa01ddf2cc20a95ae6da25f8f086431) ([#6523](https://github.com/yt-dlp/yt-dlp/issues/6523)) by [nick-cd](https://github.com/nick-cd)
+ - [Handle incomplete initial data from watch page](https://github.com/yt-dlp/yt-dlp/commit/607510b9f2f67bfe7d33d74031a5c1fe22a24862) ([#6510](https://github.com/yt-dlp/yt-dlp/issues/6510)) by [coletdjnz](https://github.com/coletdjnz)
+ - [Ignore wrong fps of some formats](https://github.com/yt-dlp/yt-dlp/commit/97afb093d4cbe5df889145afa5f9ede4535e93e4) by [pukkandan](https://github.com/pukkandan)
+ - [Misc cleanup](https://github.com/yt-dlp/yt-dlp/commit/14a14335b280766fbf5a469ae26836d6c1fe450a) by [coletdjnz](https://github.com/coletdjnz)
+ - [Prioritize premium formats](https://github.com/yt-dlp/yt-dlp/commit/51a07b0dca4c079d58311c19b6d1c097c24bb021) by [pukkandan](https://github.com/pukkandan)
+ - [Revert default formats to `https`](https://github.com/yt-dlp/yt-dlp/commit/c6786ff3baaf72a5baa4d56d34058e54cbcf8ceb) by [pukkandan](https://github.com/pukkandan)
+ - [Support podcasts and releases tabs](https://github.com/yt-dlp/yt-dlp/commit/447afb9eaa65bc677e3245c83e53a8e69c174a3c) by [coletdjnz](https://github.com/coletdjnz)
+ - [Support shorter relative time format](https://github.com/yt-dlp/yt-dlp/commit/2fb35f6004c7625f0dd493da4a5abf0690f7777c) ([#7191](https://github.com/yt-dlp/yt-dlp/issues/7191)) by [coletdjnz](https://github.com/coletdjnz)
+ - music_search_url: [Extract title](https://github.com/yt-dlp/yt-dlp/commit/69a40e4a7f6caa5662527ebd2f3c4e8aa02857a2) ([#7102](https://github.com/yt-dlp/yt-dlp/issues/7102)) by [kangalio](https://github.com/kangalio)
+- **zaiko**
+ - [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/345b4c0aedd9d19898ce00d5cef35fe0d277a052) ([#7254](https://github.com/yt-dlp/yt-dlp/issues/7254)) by [c-basalt](https://github.com/c-basalt)
+ - ZaikoETicket: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/5cc09c004bd5edbbada9b041c08a720cadc4f4df) ([#7347](https://github.com/yt-dlp/yt-dlp/issues/7347)) by [pzhlkj6612](https://github.com/pzhlkj6612)
+- **zdf**: [Fix formats extraction](https://github.com/yt-dlp/yt-dlp/commit/ee0ed0338df328cd986f97315c8162b5a151476d) by [bashonly](https://github.com/bashonly)
+- **zee5**: [Fix extraction of new content](https://github.com/yt-dlp/yt-dlp/commit/9d7fde89a40360396f0baa2ee8bf507f92108b32) ([#7280](https://github.com/yt-dlp/yt-dlp/issues/7280)) by [bashonly](https://github.com/bashonly)
+- **zingmp3**: [Fix and improve extractors](https://github.com/yt-dlp/yt-dlp/commit/17d7ca84ea723c20668bd9bfa938be7ea0e64f6b) ([#6367](https://github.com/yt-dlp/yt-dlp/issues/6367)) by [hatienl0i261299](https://github.com/hatienl0i261299)
+- **zoom**
+ - [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/79c77e85b70ae3b9942d5a88c14d021a9bd24222) ([#6741](https://github.com/yt-dlp/yt-dlp/issues/6741)) by [shreyasminocha](https://github.com/shreyasminocha)
+ - [Fix share URL extraction](https://github.com/yt-dlp/yt-dlp/commit/90c1f5120694105496a6ad9e3ecfc6c25de6cae1) ([#6789](https://github.com/yt-dlp/yt-dlp/issues/6789)) by [bashonly](https://github.com/bashonly)
+
+#### Downloader changes
+- **curl**: [Fix progress reporting](https://github.com/yt-dlp/yt-dlp/commit/66aeaac9aa30b5959069ba84e53a5508232deb38) by [pukkandan](https://github.com/pukkandan)
+- **fragment**: [Do not sleep between fragments](https://github.com/yt-dlp/yt-dlp/commit/424f3bf03305088df6e01d62f7311be8601ad3f4) by [pukkandan](https://github.com/pukkandan)
+
+#### Postprocessor changes
+- [Fix chapters if duration is not extracted](https://github.com/yt-dlp/yt-dlp/commit/01ddec7e661bf90dc4c34e6924eb9d7629886cef) ([#6037](https://github.com/yt-dlp/yt-dlp/issues/6037)) by [bashonly](https://github.com/bashonly)
+- [Print newline for `--progress-template`](https://github.com/yt-dlp/yt-dlp/commit/13ff78095372fd98900a32572cf817994c07ccb5) by [pukkandan](https://github.com/pukkandan)
+- **EmbedThumbnail, FFmpegMetadata**: [Fix error on attaching thumbnails and info json for mkv/mka](https://github.com/yt-dlp/yt-dlp/commit/0f0875ed555514f32522a0f30554fb08825d5124) ([#6647](https://github.com/yt-dlp/yt-dlp/issues/6647)) by [Lesmiscore](https://github.com/Lesmiscore)
+- **FFmpegFixupM3u8PP**: [Check audio codec before fixup](https://github.com/yt-dlp/yt-dlp/commit/3f7e2bd80e3c5d8a1682f20a1b245fcd974f295d) ([#6778](https://github.com/yt-dlp/yt-dlp/issues/6778)) by [bashonly](https://github.com/bashonly)
+- **FixupDuplicateMoov**: [Fix bug in triggering](https://github.com/yt-dlp/yt-dlp/commit/26010b5cec50193b98ad7845d1d77450f9f14c2b) by [pukkandan](https://github.com/pukkandan)
+
+#### Misc. changes
+- [Add automatic duplicate issue detection](https://github.com/yt-dlp/yt-dlp/commit/15b2d3db1d40b0437fca79d8874d392aa54b3cdd) by [pukkandan](https://github.com/pukkandan)
+- **build**
+ - [Fix macOS target](https://github.com/yt-dlp/yt-dlp/commit/44a79958f0b596ee71e1eb25f158610aada29d1b) by [Grub4K](https://github.com/Grub4K)
+ - [Implement build verification using `--update-to`](https://github.com/yt-dlp/yt-dlp/commit/b73193c99aa23b135732408a5fcf655c68d731c6) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K)
+ - [Pin `pyinstaller` version for macOS](https://github.com/yt-dlp/yt-dlp/commit/427a8fafbb0e18c28d0ed7960be838d7b26b88d3) by [pukkandan](https://github.com/pukkandan)
+ - [Various build workflow improvements](https://github.com/yt-dlp/yt-dlp/commit/c4efa0aefec8daef1de62fd1693f13edf3c8b03c) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K)
+- **cleanup**
+ - Miscellaneous
+ - [6f2287c](https://github.com/yt-dlp/yt-dlp/commit/6f2287cb18cbfb27518f068d868fa9390fee78ad) by [pukkandan](https://github.com/pukkandan)
+ - [ad54c91](https://github.com/yt-dlp/yt-dlp/commit/ad54c9130e793ce433bf9da334fa80df9f3aee58) by [freezboltz](https://github.com/freezboltz), [mikf](https://github.com/mikf), [pukkandan](https://github.com/pukkandan)
+- **cleanup, utils**: [Split into submodules](https://github.com/yt-dlp/yt-dlp/commit/69bec6730ec9d724bcedeab199d9d684d61423ba) ([#7090](https://github.com/yt-dlp/yt-dlp/issues/7090)) by [coletdjnz](https://github.com/coletdjnz), [pukkandan](https://github.com/pukkandan)
+- **cli_to_api**: [Add script](https://github.com/yt-dlp/yt-dlp/commit/46f1370e9af6f8af8762f67e27e5acb8f0c48a47) by [pukkandan](https://github.com/pukkandan) (illustrated in the sketch below)
+- **devscripts**: `make_changelog`: [Various improvements](https://github.com/yt-dlp/yt-dlp/commit/23c39a4beadee382060bb47fdaa21316ca707d38) by [Grub4K](https://github.com/Grub4K)
+- **docs**: [Misc improvements](https://github.com/yt-dlp/yt-dlp/commit/c8bc203fbf3bb09914e53f0833eed622ab7edbb9) by [pukkandan](https://github.com/pukkandan)
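+
+A hypothetical illustration of the idea behind `cli_to_api` (not the devscripts implementation itself): CLI flags map onto `YoutubeDL` option keys, so a command line can be reproduced through the Python API. The option keys below are assumed equivalents and the URL is a placeholder:
+
+```python
+import yt_dlp
+
+# CLI:  -f "bv*+ba" --no-playlist <URL>
+ydl_opts = {"format": "bv*+ba", "noplaylist": True}
+with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+    ydl.download(["https://example.com/watch/placeholder"])
+```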
+
+### 2023.03.04
+
+#### Extractor changes
+- **bilibili**: [Fix for downloading wrong subtitles](https://github.com/yt-dlp/yt-dlp/commit/8a83baaf218ab89e6e7faa76b7c7be3a2ec19e3a) ([#6358](https://github.com/yt-dlp/yt-dlp/issues/6358)) by [LXYan2333](https://github.com/LXYan2333)
+- **ESPNcricinfo**: [Handle new URL pattern](https://github.com/yt-dlp/yt-dlp/commit/640c934823fc2d1ec77ec932566078014058635f) ([#6321](https://github.com/yt-dlp/yt-dlp/issues/6321)) by [venkata-krishnas](https://github.com/venkata-krishnas)
+- **lefigaro**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/eb8fd6d044e8926532772b72be0645c6b8ecb3aa) ([#6309](https://github.com/yt-dlp/yt-dlp/issues/6309)) by [elyse0](https://github.com/elyse0)
+- **lumni**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/1f8489cccbdc6e96027ef527b88717458f0900e8) ([#6302](https://github.com/yt-dlp/yt-dlp/issues/6302)) by [carusocr](https://github.com/carusocr)
+- **Prankcast**: [Fix tags](https://github.com/yt-dlp/yt-dlp/commit/ed4cc4ea793314c50ae3f82e98248c1de1c25694) ([#6316](https://github.com/yt-dlp/yt-dlp/issues/6316)) by [columndeeply](https://github.com/columndeeply)
+- **rutube**: [Extract chapters from description](https://github.com/yt-dlp/yt-dlp/commit/22ccd5420b3eb0782776071f12cccd1fedaa1fd0) ([#6345](https://github.com/yt-dlp/yt-dlp/issues/6345)) by [mushbite](https://github.com/mushbite)
+- **SportDeutschland**: [Rewrite extractor](https://github.com/yt-dlp/yt-dlp/commit/45db357289b4e1eec09093c8bc5446520378f426) by [pukkandan](https://github.com/pukkandan)
+- **telecaribe**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/b40471282286bd2b09c485bf79afd271d229272c) ([#6311](https://github.com/yt-dlp/yt-dlp/issues/6311)) by [elyse0](https://github.com/elyse0)
+- **tubetugraz**: [Support `--twofactor` (#6424)](https://github.com/yt-dlp/yt-dlp/commit/f44cb4e77bb9be8be291d02ab6f79dc0b4c0d4a1) ([#6427](https://github.com/yt-dlp/yt-dlp/issues/6427)) by [Ferdi265](https://github.com/Ferdi265)
+- **tunein**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/46580ced56c90b559885aded6aa8f46f20a9cdce) ([#6310](https://github.com/yt-dlp/yt-dlp/issues/6310)) by [elyse0](https://github.com/elyse0)
+- **twitch**: [Update for GraphQL API changes](https://github.com/yt-dlp/yt-dlp/commit/4a6272c6d1bff89969b67cd22b26ebe6d7e72279) ([#6318](https://github.com/yt-dlp/yt-dlp/issues/6318)) by [elyse0](https://github.com/elyse0)
+- **twitter**: [Fix retweet extraction](https://github.com/yt-dlp/yt-dlp/commit/cf605226521e99c89fc8dff26a319025810e63a0) ([#6422](https://github.com/yt-dlp/yt-dlp/issues/6422)) by [selfisekai](https://github.com/selfisekai)
+- **xvideos**: quickies: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/283a0b5bc511f3b350eead4488158f50c20ec526) ([#6414](https://github.com/yt-dlp/yt-dlp/issues/6414)) by [Yakabuff](https://github.com/Yakabuff)
+
+#### Misc. changes
+- **build**
+ - [Fix publishing to PyPI and homebrew](https://github.com/yt-dlp/yt-dlp/commit/55676fe498345a389a2539d8baaba958d6d61c3e) by [bashonly](https://github.com/bashonly)
+ - [Only archive if `vars.ARCHIVE_REPO` is set](https://github.com/yt-dlp/yt-dlp/commit/08ff6d59f97b5f5f0128f6bf6fbef56fd836cc52) by [Grub4K](https://github.com/Grub4K)
+- **cleanup**: Miscellaneous: [392389b](https://github.com/yt-dlp/yt-dlp/commit/392389b7df7b818f794b231f14dc396d4875fbad) by [pukkandan](https://github.com/pukkandan)
+- **devscripts**: `make_changelog`: [Stop at `Release ...` commit](https://github.com/yt-dlp/yt-dlp/commit/7accdd9845fe7ce9d0aa5a9d16faaa489c1294eb) by [pukkandan](https://github.com/pukkandan)
+
+### 2023.03.03
+
+#### Important changes
+- **A new release type has been added!**
+ - [`nightly`](https://github.com/yt-dlp/yt-dlp/releases/tag/nightly) builds will be made after each push, containing the latest fixes (but also possibly bugs).
+ - When using `--update`/`-U`, a release binary will only update to its current channel (either `stable` or `nightly`).
+ - The `--update-to` option has been added, allowing the user more control over program upgrades (or downgrades).
+ - `--update-to` can change the release channel (`stable`, `nightly`) and also upgrade or downgrade to specific tags.
+ - **Usage**: `--update-to CHANNEL`, `--update-to TAG`, `--update-to CHANNEL@TAG`
+- **YouTube throttling fixes!**
+
+#### Core changes
+- [Add option `--break-match-filters`](https://github.com/yt-dlp/yt-dlp/commit/fe2ce85aff0aa03735fc0152bb8cb9c3d4ef0753) by [pukkandan](https://github.com/pukkandan)
+- [Fix `--break-on-existing` with `--lazy-playlist`](https://github.com/yt-dlp/yt-dlp/commit/d21056f4cf0a1623daa107f9181074f5725ac436) by [pukkandan](https://github.com/pukkandan)
+- **dependencies**: [Simplify `Cryptodome`](https://github.com/yt-dlp/yt-dlp/commit/65f6e807804d2af5e00f2aecd72bfc43af19324a) by [pukkandan](https://github.com/pukkandan)
+- **jsinterp**: [Handle `Date` at epoch 0](https://github.com/yt-dlp/yt-dlp/commit/9acf1ee25f7ad3920ede574a9de95b8c18626af4) by [pukkandan](https://github.com/pukkandan)
+- **plugins**: [Don't look in `.egg` directories](https://github.com/yt-dlp/yt-dlp/commit/b059188383eee4fa336ef728dda3ff4bb7335625) by [pukkandan](https://github.com/pukkandan)
+- **update**: [Add option `--update-to`, including to nightly](https://github.com/yt-dlp/yt-dlp/commit/77df20f14cc9ed41dfe3a1fe2d77fd27f5365a94) ([#6220](https://github.com/yt-dlp/yt-dlp/issues/6220)) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K), [pukkandan](https://github.com/pukkandan)
+- **utils**
+ - `LenientJSONDecoder`: [Parse unclosed objects](https://github.com/yt-dlp/yt-dlp/commit/cc09083636ce21e58ff74f45eac2dbda507462b0) by [pukkandan](https://github.com/pukkandan)
+ - `Popen`: [Shim undocumented `text_mode` property](https://github.com/yt-dlp/yt-dlp/commit/da8e2912b165005f76779a115a071cd6132ceedf) by [Grub4K](https://github.com/Grub4K)
+
+#### Extractor changes
+- [Fix DRM detection in m3u8](https://github.com/yt-dlp/yt-dlp/commit/43a3eaf96393b712d60cbcf5c6cb1e90ed7f42f5) by [pukkandan](https://github.com/pukkandan)
+- **generic**
+ - [Detect manifest links via extension](https://github.com/yt-dlp/yt-dlp/commit/b38cae49e6f4849c8ee2a774bdc3c1c647ae5f0e) by [bashonly](https://github.com/bashonly)
+ - [Handle basic-auth when checking redirects](https://github.com/yt-dlp/yt-dlp/commit/8e9fe43cd393e69fa49b3d842aa3180c1d105b8f) by [pukkandan](https://github.com/pukkandan)
+- **GoogleDrive**: [Fix some audio](https://github.com/yt-dlp/yt-dlp/commit/4d248e29d20d983ededab0b03d4fe69dff9eb4ed) by [pukkandan](https://github.com/pukkandan)
+- **iprima**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/9fddc12ab022a31754e0eaa358fc4e1dfa974587) ([#6291](https://github.com/yt-dlp/yt-dlp/issues/6291)) by [std-move](https://github.com/std-move)
+- **mediastream**: [Improve WinSports support](https://github.com/yt-dlp/yt-dlp/commit/2d5a8c5db2bd4ff1c2e45e00cd890a10f8ffca9e) ([#6401](https://github.com/yt-dlp/yt-dlp/issues/6401)) by [bashonly](https://github.com/bashonly)
+- **ntvru**: [Extract HLS and DASH formats](https://github.com/yt-dlp/yt-dlp/commit/77d6d136468d0c23c8e79bc937898747804f585a) ([#6403](https://github.com/yt-dlp/yt-dlp/issues/6403)) by [bashonly](https://github.com/bashonly)
+- **tencent**: [Add more formats and info](https://github.com/yt-dlp/yt-dlp/commit/18d295c9e0f95adc179eef345b7af64d6372db78) ([#5950](https://github.com/yt-dlp/yt-dlp/issues/5950)) by [Hill-98](https://github.com/Hill-98)
+- **yle_areena**: [Extract non-Kaltura videos](https://github.com/yt-dlp/yt-dlp/commit/40d77d89027cd0e0ce31d22aec81db3e1d433900) ([#6402](https://github.com/yt-dlp/yt-dlp/issues/6402)) by [bashonly](https://github.com/bashonly)
+- **youtube**
+ - [Construct dash formats with `range` query](https://github.com/yt-dlp/yt-dlp/commit/5038f6d713303e0967d002216e7a88652401c22a) by [pukkandan](https://github.com/pukkandan) (With fixes in [f34804b](https://github.com/yt-dlp/yt-dlp/commit/f34804b2f920f62a6e893a14a9e2a2144b14dd23) by [bashonly](https://github.com/bashonly), [coletdjnz](https://github.com/coletdjnz))
+ - [Detect and break on looping comments](https://github.com/yt-dlp/yt-dlp/commit/7f51861b1820c37b157a239b1fe30628d907c034) ([#6301](https://github.com/yt-dlp/yt-dlp/issues/6301)) by [coletdjnz](https://github.com/coletdjnz)
+ - [Extract channel `view_count` when `/about` tab is passed](https://github.com/yt-dlp/yt-dlp/commit/31e183557fcd1b937582f9429f29207c1261f501) by [pukkandan](https://github.com/pukkandan)
+
+#### Misc. changes
+- build
+ - [Add `cffi` as a dependency for `hypervideo_dl_linux`](https://github.com/yt-dlp/yt-dlp/commit/776d1c3f0c9b00399896dd2e40e78e9a43218109) by [bashonly](https://github.com/bashonly)
+ - [Automated builds and nightly releases](https://github.com/yt-dlp/yt-dlp/commit/29cb20bd563c02671b31dd840139e93dd37150a1) ([#6220](https://github.com/yt-dlp/yt-dlp/issues/6220)) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K) (With fixes in [bfc861a](https://github.com/yt-dlp/yt-dlp/commit/bfc861a91ee65c9b0ac169754f512e052c6827cf) by [pukkandan](https://github.com/pukkandan))
+ - [Sign SHA files and release public key](https://github.com/yt-dlp/yt-dlp/commit/12647e03d417feaa9ea6a458bea5ebd747494a53) by [Grub4K](https://github.com/Grub4K)
+- cleanup
+ - [Fix `Changelog`](https://github.com/yt-dlp/yt-dlp/commit/17ca19ab60a6a13eb8a629c51442b5248b0d8394) by [pukkandan](https://github.com/pukkandan)
+ - jsinterp: [Give functions names to help debugging](https://github.com/yt-dlp/yt-dlp/commit/b2e0343ba0fc5d8702e90f6ba2b71358e2677e0b) by [pukkandan](https://github.com/pukkandan)
+ - Miscellaneous: [4815bbf](https://github.com/yt-dlp/yt-dlp/commit/4815bbfc41cf641e4a0650289dbff968cb3bde76), [5b28cef](https://github.com/yt-dlp/yt-dlp/commit/5b28cef72db3b531680d89c121631c73ae05354f) by [pukkandan](https://github.com/pukkandan)
+- devscripts
+ - [Script to generate changelog](https://github.com/yt-dlp/yt-dlp/commit/d400e261cf029a3f20d364113b14de973be75404) ([#6220](https://github.com/yt-dlp/yt-dlp/issues/6220)) by [Grub4K](https://github.com/Grub4K) (With fixes in [9344964](https://github.com/yt-dlp/yt-dlp/commit/93449642815a6973a4b09b289982ca7e1f961b5f))
+
+### 2023.02.17
+
+* Merge youtube-dl: Up to [commit/2dd6c6e](https://github.com/ytdl-org/youtube-dl/commit/2dd6c6e)
+* Fix `--concat-playlist`
+* Imply `--no-progress` when `--print`
+* Improve default subtitle language selection by [sdht0](https://github.com/sdht0)
+* Make `title` completely non-fatal
+* Sanitize formats before sorting by [pukkandan](https://github.com/pukkandan)
+* Support module level `__bool__` and `property`
+* [dependencies] Standardize `Cryptodome` imports
+* [hls] Allow extractors to provide AES key by [Grub4K](https://github.com/Grub4K), [bashonly](https://github.com/bashonly)
+* [ExtractAudio] Handle outtmpl without ext by [carusocr](https://github.com/carusocr)
+* [extractor/common] Fix `_search_nuxt_data` by [LowSuggestion912](https://github.com/LowSuggestion912)
+* [extractor/generic] Avoid catastrophic backtracking in KVS regex by [bashonly](https://github.com/bashonly)
+* [jsinterp] Support `if` statements
+* [plugins] Fix zip search paths
+* [utils] `traverse_obj`: Various improvements by [Grub4K](https://github.com/Grub4K)
+* [utils] `traverse_obj`: Fix more bugs
+* [utils] `traverse_obj`: Fix several behavioral problems by [Grub4K](https://github.com/Grub4K)
+* [utils] Don't use Content-length with encoding by [felixonmars](https://github.com/felixonmars)
+* [utils] Fix `time_seconds` to use the provided TZ by [Grub4K](https://github.com/Grub4K), [Lesmiscore](https://github.com/Lesmiscore)
+* [utils] Fix race condition in `make_dir` by [aionescu](https://github.com/aionescu)
+* [utils] Use local kernel32 for file locking on Windows by [Grub4K](https://github.com/Grub4K)
+* [compat_utils] Improve `passthrough_module`
+* [compat_utils] Simplify `EnhancedModule`
+* [build] Update pyinstaller
+* [pyinst] Fix for pyinstaller 5.8
+* [devscripts] Provide `pyinstaller` hooks
+* [devscripts/pyinstaller] Analyze sub-modules of `Cryptodome`
+* [cleanup] Misc fixes and cleanup
+* [extractor/anchorfm] Add episode extractor by [HobbyistDev](https://github.com/HobbyistDev), [bashonly](https://github.com/bashonly)
+* [extractor/boxcast] Add extractor by [HobbyistDev](https://github.com/HobbyistDev)
+* [extractor/ebay] Add extractor by [JChris246](https://github.com/JChris246)
+* [extractor/hypergryph] Add extractor by [HobbyistDev](https://github.com/HobbyistDev), [bashonly](https://github.com/bashonly)
+* [extractor/NZOnScreen] Add extractor by [gregsadetsky](https://github.com/gregsadetsky), [pukkandan](https://github.com/pukkandan)
+* [extractor/rozhlas] Add extractor RozhlasVltavaIE by [amra](https://github.com/amra)
+* [extractor/tempo] Add IVXPlayer extractor by [HobbyistDev](https://github.com/HobbyistDev)
+* [extractor/txxx] Add extractors by [chio0hai](https://github.com/chio0hai)
+* [extractor/vocaroo] Add extractor by [SuperSonicHub1](https://github.com/SuperSonicHub1), [qbnu](https://github.com/qbnu)
+* [extractor/wrestleuniverse] Add extractors by [Grub4K](https://github.com/Grub4K), [bashonly](https://github.com/bashonly)
+* [extractor/yappy] Add extractor by [HobbyistDev](https://github.com/HobbyistDev), [dirkf](https://github.com/dirkf)
+* [extractor/youtube] **Fix `uploader_id` extraction** by [bashonly](https://github.com/bashonly)
+* [extractor/youtube] Add hyperpipe instances by [Generator](https://github.com/Generator)
+* [extractor/youtube] Handle `consent.youtube`
+* [extractor/youtube] Support `/live/` URL
+* [extractor/youtube] Update invidious and piped instances by [rohieb](https://github.com/rohieb)
+* [extractor/91porn] Fix title and comment extraction by [pmitchell86](https://github.com/pmitchell86)
+* [extractor/AbemaTV] Cache user token whenever appropriate by [Lesmiscore](https://github.com/Lesmiscore)
+* [extractor/bfmtv] Support `rmc` prefix by [carusocr](https://github.com/carusocr)
+* [extractor/biliintl] Add intro and ending chapters by [HobbyistDev](https://github.com/HobbyistDev)
+* [extractor/clyp] Support `wav` by [qulaz](https://github.com/qulaz)
+* [extractor/crunchyroll] Add intro chapter by [ByteDream](https://github.com/ByteDream)
+* [extractor/crunchyroll] Better message for premium videos
+* [extractor/crunchyroll] Fix incorrect premium-only error by [Grub4K](https://github.com/Grub4K)
+* [extractor/DouyuTV] Use new API by [hatienl0i261299](https://github.com/hatienl0i261299)
+* [extractor/embedly] Embedded links may be for other extractors
+* [extractor/freesound] Workaround invalid URL in webpage by [rebane2001](https://github.com/rebane2001)
+* [extractor/GoPlay] Use new API by [jeroenj](https://github.com/jeroenj)
+* [extractor/Hidive] Fix subtitles and age-restriction by [chexxor](https://github.com/chexxor)
+* [extractor/huya] Support HD streams by [felixonmars](https://github.com/felixonmars)
+* [extractor/moviepilot] Fix extractor by [panatexxa](https://github.com/panatexxa)
+* [extractor/nbc] Fix `NBC` and `NBCStations` extractors by [bashonly](https://github.com/bashonly)
+* [extractor/nbc] Fix XML parsing by [bashonly](https://github.com/bashonly)
+* [extractor/nebula] Remove broken cookie support by [hheimbuerger](https://github.com/hheimbuerger)
+* [extractor/nfl] Add `NFLPlus` extractors by [bashonly](https://github.com/bashonly)
+* [extractor/niconico] Add support for like history by [Matumo](https://github.com/Matumo), [pukkandan](https://github.com/pukkandan)
+* [extractor/nitter] Update instance list by [OIRNOIR](https://github.com/OIRNOIR)
+* [extractor/npo] Fix extractor and add HD support by [seproDev](https://github.com/seproDev)
+* [extractor/odkmedia] Add `OnDemandChinaEpisodeIE` by [HobbyistDev](https://github.com/HobbyistDev), [pukkandan](https://github.com/pukkandan)
+* [extractor/pornez] Handle relative URLs in iframe by [JChris246](https://github.com/JChris246)
+* [extractor/radiko] Fix format sorting for Time Free by [road-master](https://github.com/road-master)
+* [extractor/rcs] Fix extractors by [nixxo](https://github.com/nixxo), [pukkandan](https://github.com/pukkandan)
+* [extractor/reddit] Support user posts by [OMEGARAZER](https://github.com/OMEGARAZER)
+* [extractor/rumble] Fix format sorting by [pukkandan](https://github.com/pukkandan)
+* [extractor/servus] Rewrite extractor by [Ashish0804](https://github.com/Ashish0804), [FrankZ85](https://github.com/FrankZ85), [StefanLobbenmeier](https://github.com/StefanLobbenmeier)
+* [extractor/slideslive] Fix slides and chapters/duration by [bashonly](https://github.com/bashonly)
+* [extractor/SportDeutschland] Fix extractor by [FriedrichRehren](https://github.com/FriedrichRehren)
+* [extractor/Stripchat] Fix extractor by [JChris246](https://github.com/JChris246), [bashonly](https://github.com/bashonly)
+* [extractor/tnaflix] Fix extractor by [bashonly](https://github.com/bashonly), [oxamun](https://github.com/oxamun)
+* [extractor/tvp] Support `stream.tvp.pl` by [selfisekai](https://github.com/selfisekai)
+* [extractor/twitter] Fix `--no-playlist` and add media `view_count` when using GraphQL by [Grub4K](https://github.com/Grub4K)
+* [extractor/twitter] Fix graphql extraction on some tweets by [selfisekai](https://github.com/selfisekai)
+* [extractor/vimeo] Fix `playerConfig` extraction by [LeoniePhiline](https://github.com/LeoniePhiline), [bashonly](https://github.com/bashonly)
+* [extractor/viu] Add `ViuOTTIndonesiaIE` extractor by [HobbyistDev](https://github.com/HobbyistDev)
+* [extractor/vk] Fix playlists for new API by [the-marenga](https://github.com/the-marenga)
+* [extractor/vlive] Replace with `VLiveWebArchiveIE` by [seproDev](https://github.com/seproDev)
+* [extractor/ximalaya] Update album `_VALID_URL` by [carusocr](https://github.com/carusocr)
+* [extractor/zdf] Use android API endpoint for UHD downloads by [seproDev](https://github.com/seproDev)
+* [extractor/drtv] Fix bug in [ab4cbef](https://github.com/yt-dlp/yt-dlp/commit/ab4cbef) by [bashonly](https://github.com/bashonly)
+
+
+### 2023.01.06
+
+* Fix config locations by [Grub4K](https://github.com/Grub4K), [coletdjnz](https://github.com/coletdjnz), [pukkandan](https://github.com/pukkandan)
+* [downloader/aria2c] Disable native progress
+* [utils] `mimetype2ext`: `weba` is not standard
+* [utils] `windows_enable_vt_mode`: Better error handling
+* [build] Add minimal `pyproject.toml`
+* [update] Fix updater file removal on windows by [Grub4K](https://github.com/Grub4K)
+* [cleanup] Misc fixes and cleanup
+* [extractor/aitube] Add extractor by [HobbyistDev](https://github.com/HobbyistDev)
+* [extractor/drtv] Add series extractors by [FrederikNS](https://github.com/FrederikNS)
+* [extractor/volejtv] Add extractor by [HobbyistDev](https://github.com/HobbyistDev)
+* [extractor/xanimu] Add extractor by [JChris246](https://github.com/JChris246)
+* [extractor/youtube] Retry manifest refresh for live-from-start by [mzhou](https://github.com/mzhou)
+* [extractor/biliintl] Add `/media` to `VALID_URL` by [HobbyistDev](https://github.com/HobbyistDev)
+* [extractor/biliIntl] Add fallback to `video_data` by [HobbyistDev](https://github.com/HobbyistDev)
+* [extractor/crunchyroll:show] Add `language` to entries by [Chrissi2812](https://github.com/Chrissi2812)
+* [extractor/joj] Fix extractor by [OndrejBakan](https://github.com/OndrejBakan), [pukkandan](https://github.com/pukkandan)
+* [extractor/nbc] Update graphql query by [jacobtruman](https://github.com/jacobtruman)
+* [extractor/reddit] Add subreddit as `channel_id` by [gschizas](https://github.com/gschizas)
+* [extractor/tiktok] Add `TikTokLive` extractor by [JC-Chung](https://github.com/JC-Chung)
+
+### 2023.01.02
+
+* **Improve plugin architecture** by [Grub4K](https://github.com/Grub4K), [coletdjnz](https://github.com/coletdjnz), [flashdagger](https://github.com/flashdagger), [pukkandan](https://github.com/pukkandan)
+ * Plugins can be loaded in any distribution of yt-dlp (binary, pip, source, etc.) and can be distributed and installed as packages. See [the readme](https://github.com/yt-dlp/yt-dlp/tree/05997b6e98e638d97d409c65bb5eb86da68f3b64#plugins) for more information
+* Add `--compat-options 2021,2022`
+    * This allows devs to change defaults and make other potentially breaking changes more easily. If you need everything to work exactly as-is, put `--compat-options 2022` in your config to guard against future compat changes.
+* [downloader/aria2c] Native progress for aria2c via RPC by [Lesmiscore](https://github.com/Lesmiscore), [pukkandan](https://github.com/pukkandan)
+* Merge youtube-dl: Up to [commit/195f22f](https://github.com/ytdl-org/youtube-dl/commit/195f22f6) by [Grub4K](https://github.com/Grub4K), [pukkandan](https://github.com/pukkandan)
+* Add pre-processor stage `video`
+* Let `--parse/replace-in-metadata` run at any post-processing stage
+* Add `--enable-file-urls` by [coletdjnz](https://github.com/coletdjnz)
+* Add new field `aspect_ratio`
+* Add `ac4` to known codecs
+* Add `weba` to known extensions
+* [FFmpegVideoConvertor] Add `gif` to `--recode-video`
+* Add message when there are no subtitles/thumbnails
+* Deprioritize HEVC-over-FLV formats by [Lesmiscore](https://github.com/Lesmiscore)
+* Make early reject of `--match-filter` stricter
+* Fix `--cookies-from-browser` CLI parsing
+* Fix `original_url` in playlists
+* Fix bug in writing playlist info-json
+* Fix bugs in `PlaylistEntries`
+* [downloader/ffmpeg] Fix headers for video+audio formats by [Grub4K](https://github.com/Grub4K), [bashonly](https://github.com/bashonly)
+* [extractor] Add a way to distinguish IEs that return only videos
+* [extractor] Implement universal format sorting and deprecate `_sort_formats`
+* [extractor] Let `_extract_format` functions obey `--ignore-no-formats`
+* [extractor/generic] Add `fragment_query` extractor arg for DASH and HLS by [bashonly](https://github.com/bashonly), [pukkandan](https://github.com/pukkandan)
+* [extractor/generic] Decode unicode-escaped embed URLs by [bashonly](https://github.com/bashonly)
+* [extractor/generic] Don't report redirect to https
+* [extractor/generic] Fix JSON LD manifest extraction by [bashonly](https://github.com/bashonly), [pukkandan](https://github.com/pukkandan)
+* [extractor/generic] Use `Accept-Encoding: identity` for initial request by [coletdjnz](https://github.com/coletdjnz)
+* [FormatSort] Add `mov` to `vext`
+* [jsinterp] Escape regex that looks like nested set
+* [webvtt] Handle premature EOF by [flashdagger](https://github.com/flashdagger)
+* [utils] `classproperty`: Add cache support
+* [utils] `get_exe_version`: Detect broken executables by [dirkf](https://github.com/dirkf), [pukkandan](https://github.com/pukkandan)
+* [utils] `js_to_json`: Fix bug in [f55523c](https://github.com/yt-dlp/yt-dlp/commit/f55523c) by [ChillingPepper](https://github.com/ChillingPepper), [pukkandan](https://github.com/pukkandan)
+* [utils] Make `ExtractorError` mutable
+* [utils] Move `FileDownloader.parse_bytes` into utils
+* [utils] Move format sorting code into `utils`
+* [utils] `windows_enable_vt_mode`: Proper implementation by [Grub4K](https://github.com/Grub4K)
+* [update] Workaround [#5632](https://github.com/yt-dlp/yt-dlp/issues/5632)
+* [docs] Improvements
+* [cleanup] Misc fixes and cleanup
+* [cleanup] Use `random.choices` by [freezboltz](https://github.com/freezboltz)
+* [extractor/airtv] Add extractor by [HobbyistDev](https://github.com/HobbyistDev)
+* [extractor/amazonminitv] Add extractors by [GautamMKGarg](https://github.com/GautamMKGarg), [nyuszika7h](https://github.com/nyuszika7h)
+* [extractor/beatbump] Add extractors by [Bobscorn](https://github.com/Bobscorn), [pukkandan](https://github.com/pukkandan)
+* [extractor/europarl] Add EuroParlWebstream extractor by [HobbyistDev](https://github.com/HobbyistDev)
+* [extractor/kanal2] Add extractor by [bashonly](https://github.com/bashonly), [glensc](https://github.com/glensc), [pukkandan](https://github.com/pukkandan)
+* [extractor/kankanews] Add extractor by [synthpop123](https://github.com/synthpop123)
+* [extractor/kick] Add extractor by [bashonly](https://github.com/bashonly)
+* [extractor/mediastream] Add extractor by [HobbyistDev](https://github.com/HobbyistDev), [elyse0](https://github.com/elyse0)
+* [extractor/noice] Add NoicePodcast extractor by [HobbyistDev](https://github.com/HobbyistDev)
+* [extractor/oneplace] Add OnePlacePodcast extractor by [HobbyistDev](https://github.com/HobbyistDev)
+* [extractor/rumble] Add RumbleIE extractor by [flashdagger](https://github.com/flashdagger)
+* [extractor/screencastify] Add extractor by [bashonly](https://github.com/bashonly)
+* [extractor/trtcocuk] Add extractor by [HobbyistDev](https://github.com/HobbyistDev)
+* [extractor/Veoh] Add user extractor by [tntmod54321](https://github.com/tntmod54321)
+* [extractor/videoken] Add extractors by [bashonly](https://github.com/bashonly)
+* [extractor/webcamerapl] Add extractor by [milkknife](https://github.com/milkknife)
+* [extractor/amazon] Add `AmazonReviews` extractor by [bashonly](https://github.com/bashonly)
+* [extractor/netverse] Add `NetverseSearch` extractor by [HobbyistDev](https://github.com/HobbyistDev)
+* [extractor/vimeo] Add `VimeoProIE` by [bashonly](https://github.com/bashonly), [pukkandan](https://github.com/pukkandan)
+* [extractor/xiami] Remove extractors by [synthpop123](https://github.com/synthpop123)
+* [extractor/youtube] Add `piped.video` by [Bnyro](https://github.com/Bnyro)
+* [extractor/youtube] Consider language in format de-duplication
+* [extractor/youtube] Extract DRC formats
+* [extractor/youtube] Fix `ytuser:`
+* [extractor/youtube] Fix bug in handling of music URLs
+* [extractor/youtube] Subtitles cannot be translated to `und`
+* [extractor/youtube:tab] Extract metadata from channel items by [coletdjnz](https://github.com/coletdjnz)
+* [extractor/ARD] Add vtt subtitles by [CapacitorSet](https://github.com/CapacitorSet)
+* [extractor/ArteTV] Extract chapters by [bashonly](https://github.com/bashonly), [iw0nderhow](https://github.com/iw0nderhow)
+* [extractor/bandcamp] Add `album_artist` by [stelcodes](https://github.com/stelcodes)
+* [extractor/bilibili] Fix `--no-playlist` for anthology
+* [extractor/bilibili] Improve `_VALID_URL` by [skbeh](https://github.com/skbeh)
+* [extractor/biliintl:series] Make partial download of series faster
+* [extractor/BiliLive] Fix extractor
+* [extractor/brightcove] Add `BrightcoveNewBaseIE` and fix embed extraction
+* [extractor/cda] Support premium and misc improvements by [selfisekai](https://github.com/selfisekai)
+* [extractor/ciscowebex] Support password-protected videos by [damianoamatruda](https://github.com/damianoamatruda)
+* [extractor/curiositystream] Fix auth by [mnn](https://github.com/mnn)
+* [extractor/embedly] Handle vimeo embeds
+* [extractor/fifa] Fix Preplay extraction by [dirkf](https://github.com/dirkf)
+* [extractor/foxsports] Fix extractor by [bashonly](https://github.com/bashonly)
+* [extractor/gronkh] Fix `_VALID_URL` by [muddi900](https://github.com/muddi900)
+* [extractor/hotstar] Improve format metadata
+* [extractor/iqiyi] Fix `Iq` JS regex by [bashonly](https://github.com/bashonly)
+* [extractor/la7] Improve extractor by [nixxo](https://github.com/nixxo)
+* [extractor/mediaset] Better embed detection and error messages by [nixxo](https://github.com/nixxo)
+* [extractor/mixch] Support `--wait-for-video`
+* [extractor/naver] Improve `_VALID_URL` for `NaverNowIE` by [bashonly](https://github.com/bashonly)
+* [extractor/naver] Treat fan subtitles as separate language
+* [extractor/netverse] Extract comments by [HobbyistDev](https://github.com/HobbyistDev)
+* [extractor/nosnl] Add support for /video by [HobbyistDev](https://github.com/HobbyistDev)
+* [extractor/odnoklassniki] Extract subtitles by [bashonly](https://github.com/bashonly)
+* [extractor/pinterest] Fix extractor by [bashonly](https://github.com/bashonly)
+* [extractor/plutotv] Fix videos with non-zero start by [digitall](https://github.com/digitall)
+* [extractor/polskieradio] Adapt to next.js redesigns by [selfisekai](https://github.com/selfisekai)
+* [extractor/reddit] Add vcodec to fallback format by [chengzhicn](https://github.com/chengzhicn)
+* [extractor/reddit] Extract crossposted media by [bashonly](https://github.com/bashonly)
+* [extractor/reddit] Extract video embeds in text posts by [bashonly](https://github.com/bashonly)
+* [extractor/rutube] Support private videos by [mexus](https://github.com/mexus)
+* [extractor/sibnet] Separate from VKIE
+* [extractor/slideslive] Fix extractor by [Grub4K](https://github.com/Grub4K), [bashonly](https://github.com/bashonly)
+* [extractor/slideslive] Support embeds and slides by [Grub4K](https://github.com/Grub4K), [bashonly](https://github.com/bashonly), [pukkandan](https://github.com/pukkandan)
+* [extractor/soundcloud] Support user permalink by [nosoop](https://github.com/nosoop)
+* [extractor/spankbang] Fix extractor by [JChris246](https://github.com/JChris246)
+* [extractor/stv] Detect DRM
+* [extractor/swearnet] Fix description bug
+* [extractor/tencent] Fix geo-restricted video by [elyse0](https://github.com/elyse0)
+* [extractor/tiktok] Fix subs, `DouyinIE`, improve `_VALID_URL` by [bashonly](https://github.com/bashonly)
+* [extractor/tiktok] Update `_VALID_URL`, add `api_hostname` arg by [bashonly](https://github.com/bashonly)
+* [extractor/tiktok] Update API hostname by [redraskal](https://github.com/redraskal)
+* [extractor/twitcasting] Fix videos with password by [Spicadox](https://github.com/Spicadox), [bashonly](https://github.com/bashonly)
+* [extractor/twitter] Heed `--no-playlist` for multi-video tweets by [Grub4K](https://github.com/Grub4K), [bashonly](https://github.com/bashonly)
+* [extractor/twitter] Refresh guest token when expired by [Grub4K](https://github.com/Grub4K), [bashonly](https://github.com/bashonly)
+* [extractor/twitter:spaces] Add `Referer` to m3u8 by [nixxo](https://github.com/nixxo)
+* [extractor/udemy] Fix lectures that have no URL and detect DRM
+* [extractor/unsupported] Add more URLs
+* [extractor/urplay] Support for audio-only formats by [barsnick](https://github.com/barsnick)
+* [extractor/wistia] Improve extension detection by [Grub4K](https://github.com/Grub4K), [bashonly](https://github.com/bashonly), [pukkandan](https://github.com/pukkandan)
+* [extractor/yle_areena] Support restricted videos by [docbender](https://github.com/docbender)
+* [extractor/youku] Fix extractor by [KurtBestor](https://github.com/KurtBestor)
+* [extractor/youporn] Fix metadata by [marieell](https://github.com/marieell)
+* [extractor/redgifs] Fix bug in [8c188d5](https://github.com/yt-dlp/yt-dlp/commit/8c188d5d09177ed213a05c900d3523867c5897fd)
+
### 2022.11.11
diff --git a/Makefile b/Makefile
index a395a0e..3e0f3d8 100644
--- a/Makefile
+++ b/Makefile
@@ -13,9 +13,10 @@ pypi-files: AUTHORS Changelog.md LICENSE README.md completions devscripts/* test
.PHONY: all clean install test tar pypi-files completions ot offlinetest codetest
clean-test:
- rm -rf *.3gp *.annotations.xml *.ape *.avi *.description *.dump *.flac *.flv *.frag *.frag.aria2 *.frag.urls \
- *.info.json *.jpeg *.jpg *.live_chat.json *.m4a *.m4v *.mkv *.mp3 *.mp4 *.ogg *.opus *.part* *.png *.sbv *.srt \
- *.swf *.swp *.ttml *.vtt *.wav *.webm *.webp *.ytdl test/testdata/player-*.js
+ rm -rf test/testdata/sigs/player-*.js tmp/ *.annotations.xml *.aria2 *.description *.dump *.frag *.frag.urls \
+ *.frag.aria2 *.frag.urls *.info.json *.live_chat.json *.meta *.part* *.tmp *.temp *.unknown_video *.ytdl \
+ *.3gp *.ape *.ass *.avi *.desktop *.f4v *.flac *.flv *.gif *.jpeg *.jpg *.m4a *.m4v *.mhtml *.mkv *.mov *.mp3 \
+ *.mp4 *.mpga *.oga *.ogg *.opus *.png *.sbv *.srt *.swf *.swp *.tt *.ttml *.url *.vtt *.wav *.webloc *.webm *.webp *.ytdl
clean-dist:
rm -rf MANIFEST build/ dist/ .coverage cover/ hypervideo.tar.gz completions/ hypervideo_dl/extractor/lazy_extractors.py *.spec CONTRIBUTING.md.tmp hypervideo hypervideo.exe hypervideo_dl.egg-info/ AUTHORS .mailmap
clean-cache:
diff --git a/README.md b/README.md
index 3548389..ffa2289 100644
--- a/README.md
+++ b/README.md
@@ -29,7 +29,7 @@ hypervideo - A fork of yt-dlp without nonfree parts
* [Extractor Options](#extractor-options)
* [CONFIGURATION](#configuration)
* [Configuration file encoding](#configuration-file-encoding)
- * [Authentication with .netrc file](#authentication-with-netrc-file)
+ * [Authentication with netrc](#authentication-with-netrc)
* [Notes about environment variables](#notes-about-environment-variables)
* [OUTPUT TEMPLATE](#output-template)
* [Output template examples](#output-template-examples)
@@ -41,7 +41,9 @@ hypervideo - A fork of yt-dlp without nonfree parts
* [Modifying metadata examples](#modifying-metadata-examples)
* [EXTRACTOR ARGUMENTS](#extractor-arguments)
* [PLUGINS](#plugins)
-* [EMBEDDING HYPERVIDEO](#embedding-hypervideo)
+ * [Installing Plugins](#installing-plugins)
+ * [Developing Plugins](#developing-plugins)
+* [EMBEDDING HYPERVIDEO](#embedding-hypervideo)
* [Embedding examples](#embedding-examples)
* [DEPRECATED OPTIONS](#deprecated-options)
* [CONTRIBUTING](CONTRIBUTING.md#contributing-to-yt-dlp)
@@ -54,16 +56,16 @@ hypervideo - A fork of yt-dlp without nonfree parts
# NEW FEATURES
-* Merged with **youtube-dl v2021.12.17+ [commit/de39d12](https://github.com/ytdl-org/youtube-dl/commit/de39d128)** <!--([exceptions](https://github.com/yt-dlp/yt-dlp/issues/21))--> and **youtube-dlc v2020.11.11-3+ [commit/f9401f2](https://github.com/blackjack4494/yt-dlc/commit/f9401f2a91987068139c5f757b12fc711d4c0cee)**: You get all the features and patches of [youtube-dlc](https://github.com/blackjack4494/yt-dlc) in addition to the latest [youtube-dl](https://github.com/ytdl-org/youtube-dl)
+* Forked from [**yt-dlc@f9401f2**](https://github.com/blackjack4494/yt-dlc/commit/f9401f2a91987068139c5f757b12fc711d4c0cee) and merged with [**youtube-dl@42f2d4**](https://github.com/ytdl-org/youtube-dl/commit/07af47960f3bb262ead02490ce65c8c45c01741e) ([exceptions](https://github.com/yt-dlp/yt-dlp/issues/21))
* **[SponsorBlock Integration](#sponsorblock-options)**: You can mark/remove sponsor sections in YouTube videos by utilizing the [SponsorBlock](https://sponsor.ajay.app) API
* **[Format Sorting](#sorting-formats)**: The default format sorting options have been changed so that higher resolution and better codecs will be now preferred instead of simply using larger bitrate. Furthermore, you can now specify the sort order using `-S`. This allows for much easier format selection than what is possible by simply using `--format` ([examples](#format-selection-examples))
-* **Merged with animelover1984/youtube-dl**: You get most of the features and improvements from [animelover1984/youtube-dl](https://github.com/animelover1984/youtube-dl) including `--write-comments`, `BiliBiliSearch`, `BilibiliChannel`, Embedding thumbnail in mp4/ogg/opus, playlist infojson etc. Note that the NicoNico livestreams are not available. See [#31](https://github.com/yt-dlp/yt-dlp/pull/31) for details.
+* **Merged with animelover1984/youtube-dl**: You get most of the features and improvements from [animelover1984/youtube-dl](https://github.com/animelover1984/youtube-dl) including `--write-comments`, `BiliBiliSearch`, `BilibiliChannel`, Embedding thumbnail in mp4/ogg/opus, playlist infojson etc. Note that NicoNico livestreams are not available. See [#31](https://github.com/yt-dlp/yt-dlp/pull/31) for details.
* **YouTube improvements**:
- * Supports Clips, Stories (`ytstories:<channel UCID>`), Search (including filters)**\***, YouTube Music Search, Channel-specific search, Search prefixes (`ytsearch:`, `ytsearchdate:`)**\***, Mixes, YouTube Music Albums/Channels ([except self-uploaded music](https://github.com/yt-dlp/yt-dlp/issues/723)), and Feeds (`:ytfav`, `:ytwatchlater`, `:ytsubs`, `:ythistory`, `:ytrec`, `:ytnotif`)
+ * Supports Clips, Stories (`ytstories:<channel UCID>`), Search (including filters)**\***, YouTube Music Search, Channel-specific search, Search prefixes (`ytsearch:`, `ytsearchdate:`)**\***, Mixes, and Feeds (`:ytfav`, `:ytwatchlater`, `:ytsubs`, `:ythistory`, `:ytrec`, `:ytnotif`)
* Fix for [n-sig based throttling](https://github.com/ytdl-org/youtube-dl/issues/29326) **\***
* Supports some (but not all) age-gated content without cookies
* Download livestreams from the start using `--live-from-start` (*experimental*)
@@ -92,12 +94,16 @@ hypervideo - A fork of yt-dlp without nonfree parts
* **Output template improvements**: Output templates can now have date-time formatting, numeric offsets, object traversal etc. See [output template](#output-template) for details. Even more advanced operations can also be done with the help of `--parse-metadata` and `--replace-in-metadata`
-* **Other new options**: Many new options have been added such as `--alias`, `--print`, `--concat-playlist`, `--wait-for-video`, `--retry-sleep`, `--sleep-requests`, `--convert-thumbnails`, `--force-download-archive`, `--force-overwrites`, `--break-on-reject` etc
+* **Other new options**: Many new options have been added such as `--alias`, `--print`, `--concat-playlist`, `--wait-for-video`, `--retry-sleep`, `--sleep-requests`, `--convert-thumbnails`, `--force-download-archive`, `--force-overwrites`, `--break-match-filters` etc
* **Improvements**: Regex and other operators in `--format`/`--match-filter`, multiple `--postprocessor-args` and `--downloader-args`, faster archive checking, more [format selection options](#format-selection), merge multi-video/audio, multiple `--config-locations`, `--exec` at different stages, etc
* **Plugins**: Extractors and PostProcessors can be loaded from an external file. See [plugins](#plugins) for details
+* **Self updater**: The releases can be updated using `yt-dlp -U`, and downgraded using `--update-to` if required
+
+* **Nightly builds**: [Automated nightly builds](#update-channels) can be used with `--update-to nightly`
+
See [changelog](Changelog.md) or [commits](https://github.com/yt-dlp/yt-dlp/commits) for the full list of changes
Features marked with a **\*** have been back-ported to youtube-dl
@@ -106,6 +112,7 @@ Features marked with a **\*** have been back-ported to youtube-dl
Some of hypervideo's default options are different from those of youtube-dl and youtube-dlc:
+* yt-dlp supports only [Python 3.7+](## "Windows 7"), and *may* remove support for more versions as they [become EOL](https://devguide.python.org/versions/#python-release-cycle); while [youtube-dl still supports Python 2.6+ and 3.2+](https://github.com/ytdl-org/youtube-dl/issues/30568#issue-1118238743)
* The options `--auto-number` (`-A`), `--title` (`-t`) and `--literal` (`-l`), no longer work. See [removed options](#Removed) for details
* `avconv` is not supported as an alternative to `ffmpeg`
* hypervideo stores config files in slightly different locations to youtube-dl. See [CONFIGURATION](#configuration) for a list of correct locations
@@ -125,16 +132,20 @@ Some of hypervideo's default options are different from that of youtube-dl and y
* The upload dates extracted from YouTube are in UTC [when available](https://github.com/yt-dlp/yt-dlp/blob/89e4d86171c7b7c997c77d4714542e0383bf0db0/hypervideo_dl/extractor/youtube.py#L3898-L3900). Use `--compat-options no-youtube-prefer-utc-upload-date` to prefer the non-UTC upload date.
* If `ffmpeg` is used as the downloader, the downloading and merging of formats happen in a single step when possible. Use `--compat-options no-direct-merge` to revert this
* Thumbnail embedding in `mp4` is done with mutagen if possible. Use `--compat-options embed-thumbnail-atomicparsley` to force the use of AtomicParsley instead
-* Some private fields such as filenames are removed by default from the infojson. Use `--no-clean-infojson` or `--compat-options no-clean-infojson` to revert this
+* Some internal metadata such as filenames are removed by default from the infojson. Use `--no-clean-infojson` or `--compat-options no-clean-infojson` to revert this
* When `--embed-subs` and `--write-subs` are used together, the subtitles are written to disk and also embedded in the media file. You can use just `--embed-subs` to embed the subs and automatically delete the separate file. See [#630 (comment)](https://github.com/yt-dlp/yt-dlp/issues/630#issuecomment-893659460) for more info. `--compat-options no-keep-subs` can be used to revert this
* `certifi` will be used for SSL root certificates, if installed. If you want to use system certificates (e.g. self-signed), use `--compat-options no-certifi`
-* hypervideo's sanitization of invalid characters in filenames is different/smarter than in youtube-dl. You can use `--compat-options filename-sanitization` to revert to youtube-dl's behavior
+* yt-dlp's sanitization of invalid characters in filenames is different/smarter than in youtube-dl. You can use `--compat-options filename-sanitization` to revert to youtube-dl's behavior
+* yt-dlp tries to parse the external downloader outputs into the standard progress output if possible (Currently implemented: [~~aria2c~~](https://github.com/yt-dlp/yt-dlp/issues/5931)). You can use `--compat-options no-external-downloader-progress` to get the downloader output as-is
+* yt-dlp versions between 2021.09.01 and 2023.01.02 applied `--match-filter` to nested playlists. This was an unintentional side-effect of [8f18ac](https://github.com/yt-dlp/yt-dlp/commit/8f18aca8717bb0dd49054555af8d386e5eda3a88) and is fixed in [d7b460](https://github.com/yt-dlp/yt-dlp/commit/d7b460d0e5fc710950582baed2e3fc616ed98a80). Use `--compat-options playlist-match-filter` to revert this
For ease of use, a few more compat options are available:
* `--compat-options all`: Use all compat options (Do NOT use)
-* `--compat-options youtube-dl`: Same as `--compat-options all,-multistreams`
-* `--compat-options youtube-dlc`: Same as `--compat-options all,-no-live-chat,-no-youtube-channel-redirect`
+* `--compat-options youtube-dl`: Same as `--compat-options all,-multistreams,-playlist-match-filter`
+* `--compat-options youtube-dlc`: Same as `--compat-options all,-no-live-chat,-no-youtube-channel-redirect,-playlist-match-filter`
+* `--compat-options 2021`: Same as `--compat-options 2022,no-certifi,filename-sanitization,no-youtube-prefer-utc-upload-date`
+* `--compat-options 2022`: Same as `--compat-options playlist-match-filter,no-external-downloader-progress`. Use this to enable all future compat options
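+
+For example, to pin today's behavior against future changes in defaults, the option can be placed in a configuration file (a minimal sketch; lines starting with `#` are comments):
+```
+# keep 2022 behavior even after future compat options are added
+--compat-options 2022
+```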
# INSTALLATION
@@ -200,7 +211,7 @@ On some systems, you may need to use `py` or `python` instead of `python3`.
`pyinst.py` accepts any arguments that can be passed to `pyinstaller`, such as `--onefile/-F` or `--onedir/-D`, which is further [documented here](https://pyinstaller.org/en/stable/usage.html#what-to-generate).
-Note that pyinstaller with versions below 4.4 [do not support](https://github.com/pyinstaller/pyinstaller#requirements-and-tested-platforms) Python installed from the Windows store without using a virtual environment.
+**Note**: Pyinstaller versions below 4.4 [do not support](https://github.com/pyinstaller/pyinstaller#requirements-and-tested-platforms) Python installed from the Windows store without using a virtual environment.
**Important**: Running `pyinstaller` directly **without** using `pyinst.py` is **not** officially supported. This may or may not work correctly.
@@ -226,7 +237,10 @@ If you wish to build it anyway, install Python and py2exe, and then simply run `
* **`devscripts/set-variant.py variant [-M update_message]`** - Set the build variant of the executable
* **`devscripts/make_lazy_extractors.py`** - Create lazy extractors. Running this before building the binaries (any variant) will improve their startup performance. Set the environment variable `YTDLP_NO_LAZY_EXTRACTORS=1` if you wish to forcefully disable lazy extractor loading.
-You can also fork the project on GitHub and run your fork's [build workflow](.github/workflows/build.yml) to automatically build a full release
+Note: See their `--help` for more info.
+
+### Forking the project
+If you fork the project on GitHub, you can run your fork's [build workflow](.github/workflows/build.yml) to automatically build the selected version(s) as artifacts. Alternatively, you can run the [release workflow](.github/workflows/release.yml) or enable the [nightly workflow](.github/workflows/release-nightly.yml) to create full (pre-)releases.
# USAGE AND OPTIONS
@@ -285,7 +299,8 @@ You can also fork the project on GitHub and run your fork's [build workflow](.gi
configuration files
--flat-playlist Do not extract the videos of a playlist,
only list them
- --no-flat-playlist Extract the videos of a playlist
+ --no-flat-playlist Fully extract the videos of a playlist
+ (default)
--live-from-start Download livestreams from the start.
Currently only supported for YouTube
(Experimental)
@@ -297,8 +312,12 @@ You can also fork the project on GitHub and run your fork's [build workflow](.gi
--no-wait-for-video Do not wait for scheduled streams (default)
--mark-watched Mark videos watched (even with --simulate)
--no-mark-watched Do not mark videos watched (default)
- --no-colors Do not emit color codes in output (Alias:
- --no-colours)
+ --color [STREAM:]POLICY Whether to emit color codes in output,
+ optionally prefixed by the STREAM (stdout or
+ stderr) to apply the setting to. Can be one
+ of "always", "auto" (default), "never", or
+ "no_color" (use non color terminal
+ sequences). Can be used multiple times
--compat-options OPTS Options that can help keep compatibility
with youtube-dl or youtube-dlc
configurations by reverting some of the
@@ -330,6 +349,8 @@ You can also fork the project on GitHub and run your fork's [build workflow](.gi
--source-address IP Client-side IP address to bind to
-4, --force-ipv4 Make all connections via IPv4
-6, --force-ipv6 Make all connections via IPv6
+ --enable-file-urls Enable file:// URLs. This is disabled by
+ default for security reasons.
## Geo-restriction:
--geo-verification-proxy URL Use this proxy to verify the IP address for
@@ -337,34 +358,31 @@ You can also fork the project on GitHub and run your fork's [build workflow](.gi
specified by --proxy (or none, if the option
is not present) is used for the actual
downloading
- --geo-bypass Bypass geographic restriction via faking
- X-Forwarded-For HTTP header (default)
- --no-geo-bypass Do not bypass geographic restriction via
- faking X-Forwarded-For HTTP header
- --geo-bypass-country CODE Force bypass geographic restriction with
- explicitly provided two-letter ISO 3166-2
- country code
- --geo-bypass-ip-block IP_BLOCK Force bypass geographic restriction with
- explicitly provided IP block in CIDR notation
+ --xff VALUE How to fake X-Forwarded-For HTTP header to
+ try bypassing geographic restriction. One of
+ "default" (only when known to be useful),
+ "never", an IP block in CIDR notation, or a
+ two-letter ISO 3166-2 country code
## Video Selection:
- -I, --playlist-items ITEM_SPEC Comma separated playlist_index of the videos
+ -I, --playlist-items ITEM_SPEC Comma separated playlist_index of the items
to download. You can specify a range using
"[START]:[STOP][:STEP]". For backward
compatibility, START-STOP is also supported.
Use negative indices to count from the right
and negative STEP to download in reverse
order. E.g. "-I 1:3,7,-5::2" used on a
- playlist of size 15 will download the videos
+ playlist of size 15 will download the items
at index 1,2,3,7,11,13,15
--min-filesize SIZE Abort download if filesize is smaller than
SIZE, e.g. 50k or 44.6M
- --max-filesize SIZE Abort download if filesize if larger than
+ --max-filesize SIZE Abort download if filesize is larger than
SIZE, e.g. 50k or 44.6M
--date DATE Download only videos uploaded on this date.
The date can be "YYYYMMDD" or in the format
- [now|today|yesterday][-N[day|week|month|year
- ]]. E.g. --date today-2weeks
+ [now|today|yesterday][-N[day|week|month|year]].
+ E.g. "--date today-2weeks" downloads only
+ videos uploaded on the same day two weeks ago
--datebefore DATE Download only videos uploaded on or before
this date. The date formats accepted are the
same as --date
@@ -391,7 +409,10 @@ You can also fork the project on GitHub and run your fork's [build workflow](.gi
dogs" (caseless). Use "--match-filter -" to
interactively ask whether to download each
video
- --no-match-filter Do not use generic video filter (default)
+ --no-match-filters Do not use any --match-filter (default)
+ --break-match-filters FILTER Same as "--match-filters" but stops the
+ download process when a video is rejected
+ --no-break-match-filters Do not use any --break-match-filters (default)
--no-playlist Download only the video, if the URL refers
to a video and a playlist
--yes-playlist Download the playlist, if the URL refers to
@@ -405,11 +426,9 @@ You can also fork the project on GitHub and run your fork's [build workflow](.gi
--max-downloads NUMBER Abort after downloading NUMBER files
--break-on-existing Stop the download process when encountering
a file that is in the archive
- --break-on-reject Stop the download process when encountering
- a file that has been filtered out
- --break-per-input --break-on-existing, --break-on-reject,
- --max-downloads, and autonumber resets per
- input URL
+ --break-per-input Alters --max-downloads, --break-on-existing,
+ --break-match-filters, and autonumber to
+ reset per input URL
--no-break-per-input --break-on-existing and similar options
terminate the entire download queue
--skip-playlist-after-errors N Number of allowed failures until the rest of
@@ -441,8 +460,8 @@ You can also fork the project on GitHub and run your fork's [build workflow](.gi
linear=1::2 --retry-sleep fragment:exp=1:20
--skip-unavailable-fragments Skip unavailable fragments for DASH,
hlsnative and ISM downloads (default)
- (Alias: --no-abort-on-unavailable-fragment)
- --abort-on-unavailable-fragment
+ (Alias: --no-abort-on-unavailable-fragments)
+ --abort-on-unavailable-fragments
Abort download if a fragment is unavailable
(Alias: --no-skip-unavailable-fragments)
--keep-fragments Keep downloaded fragments on disk after
@@ -477,12 +496,14 @@ You can also fork the project on GitHub and run your fork's [build workflow](.gi
--no-hls-use-mpegts Do not use the mpegts container for HLS
videos. This is default when not downloading
live streams
- --download-sections REGEX Download only chapters whose title matches
- the given regular expression. Time ranges
- prefixed by a "*" can also be used in place
- of chapters to download the specified range.
- Needs ffmpeg. This option can be used
- multiple times to download multiple
+ --download-sections REGEX Download only chapters that match the
+ regular expression. A "*" prefix denotes
+ time-range instead of chapter. Negative
+ timestamps are calculated from the end.
+ "*from-url" can be used to download between
+ the "start_time" and "end_time" extracted
+ from the URL. Needs ffmpeg. This option can
+ be used multiple times to download multiple
sections, e.g. --download-sections
"*10:15-inf" --download-sections "intro"
--downloader [PROTO:]NAME Name or path of the external downloader to
@@ -566,9 +587,8 @@ You can also fork the project on GitHub and run your fork's [build workflow](.gi
--write-description etc. (default)
--no-write-playlist-metafiles Do not write playlist metadata when using
--write-info-json, --write-description etc.
- --clean-info-json Remove some private fields such as filenames
- from the infojson. Note that it could still
- contain some personal information (default)
+ --clean-info-json Remove some internal metadata such as
+ filenames from the infojson (default)
--no-clean-info-json Write all fields to the infojson
--write-comments Retrieve video comments to be placed in the
infojson. The comments are fetched even
@@ -596,7 +616,7 @@ You can also fork the project on GitHub and run your fork's [build workflow](.gi
By default, all containers of the most
recently accessed profile are used.
Currently supported keyrings are: basictext,
- gnomekeyring, kwallet
+ gnomekeyring, kwallet, kwallet5, kwallet6
--no-cookies-from-browser Do not load cookies from browser (default)
--cache-dir DIR Location in the filesystem where hypervideo
can store some downloaded information (such
@@ -624,6 +644,7 @@ You can also fork the project on GitHub and run your fork's [build workflow](.gi
## Verbosity and Simulation Options:
-q, --quiet Activate quiet mode. If used with --verbose,
print the log to stderr
+ --no-quiet Deactivate quiet mode. (Default)
--no-warnings Ignore warnings
-s, --simulate Do not download the video and do not write
anything to disk
@@ -641,7 +662,7 @@ You can also fork the project on GitHub and run your fork's [build workflow](.gi
screen, optionally prefixed with when to
print it, separated by a ":". Supported
values of "WHEN" are the same as that of
- --use-postprocessor, and "video" (default).
+ --use-postprocessor (default: video).
Implies --quiet. Implies --simulate unless
--no-simulate or later stages of WHEN are
used. This option can be used multiple times
@@ -694,7 +715,7 @@ You can also fork the project on GitHub and run your fork's [build workflow](.gi
--prefer-insecure Use an unencrypted connection to retrieve
information about the video (Currently
supported only for YouTube)
- --add-header FIELD:VALUE Specify a custom HTTP header and its value,
+ --add-headers FIELD:VALUE Specify a custom HTTP header and its value,
separated by a colon ":". You can use this
option multiple times
--bidi-workaround Work around terminals that lack
@@ -776,6 +797,8 @@ You can also fork the project on GitHub and run your fork's [build workflow](.gi
--netrc-location PATH Location of .netrc authentication data;
either the path or its containing directory.
Defaults to ~/.netrc
+ --netrc-cmd NETRC_CMD Command to execute to get the credentials
+ for an extractor.
--video-password PASSWORD Video password (vimeo, youku)
--ap-mso MSO Adobe Pass multiple-system operator (TV
provider) identifier, use --ap-list-mso for
@@ -810,11 +833,11 @@ You can also fork the project on GitHub and run your fork's [build workflow](.gi
specific bitrate like 128K (default 5)
--remux-video FORMAT Remux the video into another container if
necessary (currently supported: avi, flv,
- mkv, mov, mp4, webm, aac, aiff, alac, flac,
- m4a, mka, mp3, ogg, opus, vorbis, wav). If
- target container does not support the
- video/audio codec, remuxing will fail. You
- can specify multiple rules; e.g.
+ gif, mkv, mov, mp4, webm, aac, aiff, alac,
+ flac, m4a, mka, mp3, ogg, opus, vorbis,
+ wav). If target container does not support
+ the video/audio codec, remuxing will fail.
+ You can specify multiple rules; e.g.
"aac>m4a/mov>mp4/mkv" will remux aac to m4a,
mov to mp4 and anything else to mkv
--recode-video FORMAT Re-encode the video into another format if
@@ -869,13 +892,18 @@ You can also fork the project on GitHub and run your fork's [build workflow](.gi
mkv/mka video files
--no-embed-info-json Do not embed the infojson as an attachment
to the video file
- --parse-metadata FROM:TO Parse additional metadata like title/artist
+ --parse-metadata [WHEN:]FROM:TO
+ Parse additional metadata like title/artist
from other fields; see "MODIFYING METADATA"
- for details
- --replace-in-metadata FIELDS REGEX REPLACE
+ for details. Supported values of "WHEN" are
+ the same as that of --use-postprocessor
+ (default: pre_process)
+ --replace-in-metadata [WHEN:]FIELDS REGEX REPLACE
Replace text in a metadata field using the
given regex. This option can be used
- multiple times
+ multiple times. Supported values of "WHEN"
+ are the same as that of --use-postprocessor
+ (default: pre_process)
--xattrs Write metadata to the video file's xattrs
(using dublin core and xdg standards)
--concat-playlist POLICY Concatenate videos in a playlist. One of
@@ -896,16 +924,13 @@ You can also fork the project on GitHub and run your fork's [build workflow](.gi
--ffmpeg-location PATH Location of the ffmpeg binary; either the
path to the binary or its containing directory
--exec [WHEN:]CMD Execute a command, optionally prefixed with
- when to execute it (after_move if
- unspecified), separated by a ":". Supported
- values of "WHEN" are the same as that of
- --use-postprocessor. Same syntax as the
- output template can be used to pass any
- field as arguments to the command. After
- download, an additional field "filepath"
- that contains the final path of the
- downloaded file is also available, and if no
- fields are passed, %(filepath)q is appended
+ when to execute it, separated by a ":".
+ Supported values of "WHEN" are the same as
+ that of --use-postprocessor (default:
+ after_move). Same syntax as the output
+ template can be used to pass any field as
+ arguments to the command. If no fields are
+ passed, %(filepath,_filename|)q is appended
to the end of the command. This option can
be used multiple times
--no-exec Remove any previously defined --exec
@@ -945,19 +970,21 @@ You can also fork the project on GitHub and run your fork's [build workflow](.gi
postprocessor is invoked. It can be one of
"pre_process" (after video extraction),
"after_filter" (after video passes filter),
- "before_dl" (before each video download),
- "post_process" (after each video download;
- default), "after_move" (after moving video
- file to it's final locations), "after_video"
- (after downloading and processing all
- formats of a video), or "playlist" (at end
- of playlist). This option can be used
- multiple times to add different postprocessors
+ "video" (after --format; before
+ --print/--output), "before_dl" (before each
+ video download), "post_process" (after each
+ video download; default), "after_move"
+ (after moving the video file to its final
+ location), "after_video" (after downloading
+ and processing all formats of a video), or
+ "playlist" (at end of playlist). This option
+ can be used multiple times to add different
+ postprocessors
## SponsorBlock Options:
Make chapter entries for, or remove various segments (sponsor,
introductions, etc.) from downloaded YouTube videos using the
- SponsorBlock API (https://sponsor.ajay.app)
+ [SponsorBlock API](https://sponsor.ajay.app)
--sponsorblock-mark CATS SponsorBlock categories to create chapters
for, separated by commas. Available
@@ -1047,7 +1074,7 @@ E.g. with the following configuration file hypervideo will always extract the au
-o ~/YouTube/%(title)s.%(ext)s
```
-Note that options in configuration file are just the same options aka switches used in regular command line calls; thus there **must be no whitespace** after `-` or `--`, e.g. `-o` or `--proxy` but not `- o` or `-- proxy`. They must also be quoted when necessary as-if it were a UNIX shell.
+**Note**: Options in a configuration file are just the same options, aka switches, used in regular command line calls; thus there **must be no whitespace** after `-` or `--`, e.g. `-o` or `--proxy` but not `- o` or `-- proxy`. They must also be quoted when necessary, as if it were a UNIX shell.
You can use `--ignore-config` if you want to disable all configuration files for a particular hypervideo run. If `--ignore-config` is found inside any configuration file, no further configuration will be loaded. For example, having the option in the portable configuration file prevents loading of home, user, and system configurations. Additionally, (for backward compatibility) if `--ignore-config` is found inside the system configuration file, the user configuration is not loaded.
@@ -1057,7 +1084,7 @@ The configuration files are decoded according to the UTF BOM if present, and in
If you want your file to be decoded differently, add `# coding: ENCODING` to the beginning of the file (e.g. `# coding: shift-jis`). There must be no characters before that, even spaces or BOM.
-### Authentication with `.netrc` file
+### Authentication with netrc
You may also want to configure automatic credentials storage for extractors that support authentication (by providing login and password with `--username` and `--password`) in order not to pass credentials as command line arguments on every hypervideo execution and prevent tracking plain text passwords in the shell command history. You can achieve this using a [`.netrc` file](https://stackoverflow.com/tags/.netrc/info) on a per-extractor basis. For that you will need to create a `.netrc` file in `--netrc-location` and restrict permissions to read/write by only you:
```
@@ -1077,6 +1104,14 @@ To activate authentication with the `.netrc` file you should pass `--netrc` to h
The default location of the .netrc file is `~` (see below).
+As an alternative to using the `.netrc` file, which has the disadvantage of keeping your passwords in a plain text file, you can configure a custom shell command to provide the credentials for an extractor. This is done by providing the `--netrc-cmd` parameter; the command must output the credentials in the netrc format and return `0` on success, while any other value is treated as an error. `{}` in the command is replaced by the name of the extractor, making it possible to select the credentials for the right extractor.
+
+E.g. to use an encrypted `.netrc` file stored as `.authinfo.gpg`:
+```
+yt-dlp --netrc-cmd 'gpg --decrypt ~/.authinfo.gpg' https://www.youtube.com/watch?v=BaW_jenozKc
+```
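+
+To illustrate the `{}` substitution described above, a hypothetical setup using the standard `pass` password manager, assuming each extractor has an entry under `hypervideo/` that stores a netrc-format line:
+```
+yt-dlp --netrc-cmd 'pass show hypervideo/{}' https://www.youtube.com/watch?v=BaW_jenozKc
+```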
+
+
### Notes about environment variables
* Environment variables are normally specified as `${VARIABLE}`/`$VARIABLE` on UNIX and `%VARIABLE%` on Windows; but are always shown as `${VARIABLE}` in this documentation
* hypervideo also allows using UNIX-style variables on Windows for path-like options; e.g. `--output`, `--config-location`
@@ -1106,7 +1141,7 @@ The field names themselves (the part inside the parenthesis) can also have some
1. **Alternatives**: Alternate fields can be specified separated with a `,`. E.g. `%(release_date>%Y,upload_date>%Y|Unknown)s`
-1. **Replacement**: A replacement value can be specified using a `&` separator. If the field is *not* empty, this replacement value will be used instead of the actual field content. This is done after alternate fields are considered; thus the replacement is used if *any* of the alternative fields is *not* empty.
+1. **Replacement**: A replacement value can be specified using a `&` separator according to the [`str.format` mini-language](https://docs.python.org/3/library/string.html#format-specification-mini-language). If the field is *not* empty, this replacement value will be used instead of the actual field content. This is done after alternate fields are considered; thus the replacement is used if *any* of the alternative fields is *not* empty. E.g. `%(chapters&has chapters|no chapters)s`, `%(title&TITLE={:>20}|NO TITLE)s`
1. **Default**: A literal default value can be specified for when the field is empty using a `|` separator. This overrides `--output-na-placeholder`. E.g. `%(uploader|Unknown)s`
@@ -1121,9 +1156,9 @@ To summarize, the general syntax for a field is:
Additionally, you can set different output templates for the various metadata files separately from the general output template by specifying the type of file followed by the template separated by a colon `:`. The different file types supported are `subtitle`, `thumbnail`, `description`, `annotation` (deprecated), `infojson`, `link`, `pl_thumbnail`, `pl_description`, `pl_infojson`, `chapter`, `pl_video`. E.g. `-o "%(title)s.%(ext)s" -o "thumbnail:%(title)s\%(title)s.%(ext)s"` will put the thumbnails in a folder with the same name as the video. If any of the templates is empty, that type of file will not be written. E.g. `--write-thumbnail -o "thumbnail:"` will write thumbnails only for playlists and not for video.
-<a id="outtmpl-postprocess-note"></a>
+<a id="outtmpl-postprocess-note"/>
-Note: Due to post-processing (i.e. merging etc.), the actual output filename might differ. Use `--print after_move:filepath` to get the name after all post-processing is complete.
+**Note**: Due to post-processing (i.e. merging etc.), the actual output filename might differ. Use `--print after_move:filepath` to get the name after all post-processing is complete.
The available fields are:
@@ -1147,6 +1182,7 @@ The available fields are:
- `channel` (string): Full name of the channel the video is uploaded on
- `channel_id` (string): Id of the channel
- `channel_follower_count` (numeric): Number of followers of the channel
+ - `channel_is_verified` (boolean): Whether the channel is verified on the platform
- `location` (string): Physical location where the video was filmed
- `duration` (numeric): Length of the video in seconds
- `duration_string` (string): Length of the video (HH:mm:ss)
@@ -1168,7 +1204,7 @@ The available fields are:
- `extractor` (string): Name of the extractor
- `extractor_key` (string): Key name of the extractor
- `epoch` (numeric): Unix epoch of when the information extraction was completed
- - `autonumber` (numeric): Number that will be increased with each download, starting at `--autonumber-start`
+ - `autonumber` (numeric): Number that will be increased with each download, starting at `--autonumber-start`, padded with leading zeros to 5 digits
- `video_autonumber` (numeric): Number that will be increased with each video
- `n_entries` (numeric): Total number of extracted items in the playlist
- `playlist_id` (string): Identifier of the playlist that contains the video
@@ -1231,7 +1267,6 @@ Available only when used in `--print`:
- `subtitles_table` (table): The subtitle format table as printed by `--list-subs`
- `automatic_captions_table` (table): The automatic subtitle format table as printed by `--list-subs`
-
Available only in `--sponsorblock-chapter-title`:
- `start_time` (numeric): Start time of the chapter in seconds
@@ -1244,7 +1279,7 @@ Available only in `--sponsorblock-chapter-title`:
Each aforementioned sequence when referenced in an output template will be replaced by the actual value corresponding to the sequence name. E.g. for `-o %(title)s-%(id)s.%(ext)s` and an mp4 video with title `hypervideo test video` and id `BaW_jenozKc`, this will result in a `hypervideo test video-BaW_jenozKc.mp4` file created in the current directory.
-Note that some of the sequences are not guaranteed to be present since they depend on the metadata obtained by a particular extractor. Such sequences will be replaced with placeholder value provided with `--output-na-placeholder` (`NA` by default).
+**Note**: Some of the sequences are not guaranteed to be present since they depend on the metadata obtained by a particular extractor. Such sequences will be replaced with the placeholder value provided with `--output-na-placeholder` (`NA` by default).
**Tip**: Look at the `-j` output to identify which fields are available for the particular URL
@@ -1351,7 +1386,7 @@ Unless `--video-multistreams` is used, all formats with a video stream except th
## Filtering Formats
-You can also filter the video formats by putting a condition in brackets, as in `-f "best[height=720]"` (or `-f "[filesize>10M]"`).
+You can also filter the video formats by putting a condition in brackets, as in `-f "best[height=720]"` (or `-f "[filesize>10M]"` since filters without a selector are interpreted as `best`).
The following numeric meta fields can be used with comparisons `<`, `<=`, `>`, `>=`, `=` (equals), `!=` (not equals):
@@ -1385,9 +1420,9 @@ Also filtering work for comparisons `=` (equals), `^=` (starts with), `$=` (ends
Any string comparison may be prefixed with negation `!` in order to produce an opposite comparison, e.g. `!*=` (does not contain). The comparand of a string comparison needs to be quoted with either double or single quotes if it contains spaces or special characters other than `._-`.
-Note that none of the aforementioned meta fields are guaranteed to be present since this solely depends on the metadata obtained by particular extractor, i.e. the metadata offered by the website. Any other field made available by the extractor can also be used for filtering.
+**Note**: None of the aforementioned meta fields are guaranteed to be present since this solely depends on the metadata obtained by a particular extractor, i.e. the metadata offered by the website. Any other field made available by the extractor can also be used for filtering.
-Formats for which the value is not known are excluded unless you put a question mark (`?`) after the operator. You can combine format filters, so `-f "[height<=?720][tbr>500]"` selects up to 720p videos (or videos where the height is not known) with a bitrate of at least 500 KBit/s. You can also use the filters with `all` to download all formats that satisfy the filter, e.g. `-f "all[vcodec=none]"` selects all audio-only formats.
+Formats for which the value is not known are excluded unless you put a question mark (`?`) after the operator. You can combine format filters, so `-f "bv[height<=?720][tbr>500]"` selects up to 720p videos (or videos where the height is not known) with a bitrate of at least 500 KBit/s. You can also use the filters with `all` to download all formats that satisfy the filter, e.g. `-f "all[vcodec=none]"` selects all audio-only formats.
Format selectors can also be grouped using parentheses; e.g. `-f "(mp4,webm)[height<480]"` will download the best pre-merged mp4 and webm formats with a height lower than 480.
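
For instance, a minimal illustrative command combining filters, merging and a fallback (the URL is a placeholder):
```
hypervideo -f "bv[height<=?720][tbr>500]+ba/b[height<=720]" URL
```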
@@ -1405,13 +1440,13 @@ The available fields are:
- `source`: The preference of the source
- `proto`: Protocol used for download (`https`/`ftps` > `http`/`ftp` > `m3u8_native`/`m3u8` > `http_dash_segments`> `websocket_frag` > `mms`/`rtsp` > `f4f`/`f4m`)
- `vcodec`: Video Codec (`av01` > `vp9.2` > `vp9` > `h265` > `h264` > `vp8` > `h263` > `theora` > other)
- - `acodec`: Audio Codec (`flac`/`alac` > `wav`/`aiff` > `opus` > `vorbis` > `aac` > `mp4a` > `mp3` > `eac3` > `ac3` > `dts` > other)
+ - `acodec`: Audio Codec (`flac`/`alac` > `wav`/`aiff` > `opus` > `vorbis` > `aac` > `mp4a` > `mp3` > `ac4` > `eac3` > `ac3` > `dts` > other)
- `codec`: Equivalent to `vcodec,acodec`
- `vext`: Video Extension (`mp4` > `mov` > `webm` > `flv` > other). If `--prefer-free-formats` is used, `webm` is preferred.
- `aext`: Audio Extension (`m4a` > `aac` > `mp3` > `ogg` > `opus` > `webm` > other). If `--prefer-free-formats` is used, the order changes to `ogg` > `opus` > `webm` > `mp3` > `m4a` > `aac`
- `ext`: Equivalent to `vext,aext`
- `filesize`: Exact filesize, if known in advance
- - `fs_approx`: Approximate filesize calculated from the manifests
+ - `fs_approx`: Approximate filesize
- `size`: Exact filesize if available, otherwise approximate filesize
- `height`: Height of video
- `width`: Width of video
@@ -1422,7 +1457,7 @@ The available fields are:
- `tbr`: Total average bitrate in KBit/s
- `vbr`: Average video bitrate in KBit/s
- `abr`: Average audio bitrate in KBit/s
- - `br`: Equivalent to using `tbr,vbr,abr`
+ - `br`: Average bitrate in KBit/s, `tbr`/`vbr`/`abr`
- `asr`: Audio sample rate in Hz
**Deprecation warning**: Many of these fields have (currently undocumented) aliases, that may be removed in a future version. It is recommended to use only the documented field names.
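
For instance, a minimal illustrative sort using these fields (the URL is a placeholder):
```
hypervideo -S "height:1080,fps" URL
```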
@@ -1572,7 +1607,7 @@ Note that these options preserve their relative order, allowing replacements to
This option also has a few special uses:
-* You can download an additional URL based on the metadata of the currently downloaded video. To do this, set the field `additional_urls` to the URL that you want to download. E.g. `--parse-metadata "description:(?P<additional_urls>https?://www\.vimeo\.com/\d+)` will download the first vimeo video found in the description
+* You can download an additional URL based on the metadata of the currently downloaded video. To do this, set the field `additional_urls` to the URL that you want to download. E.g. `--parse-metadata "description:(?P<additional_urls>https?://www\.vimeo\.com/\d+)"` will download the first vimeo video found in the description
* You can use this to change the metadata that is embedded in the media file. To do this, set the value of the corresponding field with a `meta_` prefix. For example, any value you set to `meta_description` field will be added to the `description` field in the file - you can use this to set a different "description" and "synopsis". To modify the metadata of individual streams, use the `meta<n>_` prefix (e.g. `meta1_language`). Any value set to the `meta_` field will overwrite all default values.
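
For instance, a hedged example that copies the description into the embedded `comment` field (the URL is a placeholder):
```
hypervideo --parse-metadata "description:(?P<meta_comment>.+)" --embed-metadata URL
```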
@@ -1635,17 +1670,20 @@ $ hypervideo --replace-in-metadata "title,uploader" "[ _]" "-"
Some extractors accept additional arguments which can be passed using `--extractor-args KEY:ARGS`. `ARGS` is a `;` (semicolon) separated string of `ARG=VAL1,VAL2`. E.g. `--extractor-args "youtube:player-client=android_embedded,web;include_live_dash" --extractor-args "funimation:version=uncut"`
+Note: In CLI, `ARG` can use `-` instead of `_`; e.g. `youtube:player-client` becomes `youtube:player_client`
+
The following extractors use this feature:
#### youtube
-* `lang`: Language code to prefer translated metadata of this language (case-sensitive). By default, the video primary language metadata is preferred, with a fallback to `en` translated. See [youtube.py](https://github.com/yt-dlp/yt-dlp/blob/c26f9b991a0681fd3ea548d535919cec1fbbd430/hypervideo_dl/extractor/youtube.py#L381-L390) for list of supported content language codes
+* `lang`: Prefer translated metadata (`title`, `description` etc) of this language code (case-sensitive). By default, the video primary language metadata is preferred, with a fallback to `en` translated. See [youtube.py](https://github.com/yt-dlp/yt-dlp/blob/c26f9b991a0681fd3ea548d535919cec1fbbd430/hypervideo_dl/extractor/youtube.py#L381-L390) for list of supported content language codes
* `skip`: One or more of `hls`, `dash` or `translated_subs` to skip extraction of the m3u8 manifests, dash manifests and [auto-translated subtitles](https://github.com/yt-dlp/yt-dlp/issues/4090#issuecomment-1158102032) respectively
-* `player_client`: Clients to extract video data from. The main clients are `web`, `android` and `ios` with variants `_music`, `_embedded`, `_embedscreen`, `_creator` (e.g. `web_embedded`); and `mweb` and `tv_embedded` (agegate bypass) with no variants. By default, `android,web` is used, but `tv_embedded` and `creator` variants are added as required for age-gated videos. Similarly, the music variants are added for `music.youtube.com` urls. You can use `all` to use all the clients, and `default` for the default clients.
+* `player_client`: Clients to extract video data from. The main clients are `web`, `android` and `ios` with variants `_music`, `_embedded`, `_embedscreen`, `_creator` (e.g. `web_embedded`); and `mweb` and `tv_embedded` (agegate bypass) with no variants. By default, `ios,android,web` is used, but `tv_embedded` and `creator` variants are added as required for age-gated videos. Similarly, the music variants are added for `music.youtube.com` urls. You can use `all` to use all the clients, and `default` for the default clients.
* `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause some issues. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) for more details
+* `player_params`: YouTube player parameters to use for player requests. Will overwrite any default ones set by yt-dlp.
* `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side)
* `max_comments`: Limit the amount of comments to gather. Comma-separated list of integers representing `max-comments,max-parents,max-replies,max-replies-per-thread`. Default is `all,all,all,all`
* E.g. `all,all,1000,10` will get a maximum of 1000 replies total, with up to 10 replies per thread. `1000,all,100` will get a maximum of 1000 comments, with a maximum of 100 replies total
-* `include_incomplete_formats`: Extract formats that cannot be downloaded completely (live dash and post-live m3u8)
+* `formats`: Change the types of formats to return. `dashy` (convert HTTP to DASH), `duplicate` (identical content but different URLs or protocol; includes `dashy`), `incomplete` (cannot be downloaded completely - live dash and post-live m3u8)
* `innertube_host`: Innertube API host to use for all API requests; e.g. `studio.youtube.com`, `youtubei.googleapis.com`. Note that cookies exported from one subdomain will not work on others
* `innertube_key`: Innertube API key to use for all API requests
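
For instance, a minimal illustrative combination of these arguments (the URL is a placeholder):
```
hypervideo --extractor-args "youtube:player_client=ios;lang=en" URL
```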
@@ -1654,7 +1692,10 @@ The following extractors use this feature:
* `approximate_date`: Extract approximate `upload_date` and `timestamp` in flat-playlist. This may cause date-based filters to be slightly off
#### generic
-* `fragment_query`: Passthrough any query in mpd/m3u8 manifest URLs to their fragments. Does not apply to ffmpeg
+* `fragment_query`: Passthrough any query in mpd/m3u8 manifest URLs to their fragments if no value is provided, or else apply the query string given as `fragment_query=VALUE`. Does not apply to ffmpeg
+* `variant_query`: Passthrough the master m3u8 URL query to its variant playlist URLs if no value is provided, or else apply the query string given as `variant_query=VALUE`
+* `hls_key`: An HLS AES-128 key URI *or* key (as hex), and optionally the IV (as hex), in the form of `(URI|KEY)[,IV]`; e.g. `generic:hls_key=ABCDEF1234567980,0xFEDCBA0987654321`. Passing any of these values will force usage of the native HLS downloader and override the corresponding values found in the m3u8 playlist
+* `is_live`: Bypass live HLS detection and manually set `live_status` - a value of `false` will set `not_live`, any other value (or no value) will set `is_live`
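+
+For instance, a minimal illustrative use of these arguments (the URL is a placeholder):
+```
+hypervideo --extractor-args "generic:fragment_query;is_live=false" URL
+```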
#### funimation
* `language`: Audio languages to extract, e.g. `funimation:language=english,japanese`
@@ -1682,6 +1723,7 @@ The following extractors use this feature:
* `dr`: dynamic range to ignore - one or more of `sdr`, `hdr10`, `dv`
#### tiktok
+* `api_hostname`: Hostname to use for mobile API requests, e.g. `api-h2.tiktokv.com`
* `app_version`: App version to call mobile APIs with - should be set along with `manifest_app_version`, e.g. `20.2.1`
* `manifest_app_version`: Numeric app version to call mobile APIs with, e.g. `221`
@@ -1689,9 +1731,18 @@ The following extractors use this feature:
* `tab`: Which tab to download - one of `new`, `top`, `videos`, `podcasts`, `streams`, `stacks`
#### twitter
-* `force_graphql`: Force usage of the GraphQL API. By default it will only be used if login cookies are provided
+* `legacy_api`: Force usage of the legacy Twitter API instead of the GraphQL API for tweet extraction. Has no effect if login cookies are passed
+
+#### stacommu, wrestleuniverse
+* `device_id`: UUID value assigned by the website and used to enforce device limits for paid livestream content. Can be found in browser local storage
+
+#### twitch
+* `client_id`: Client ID value to be sent with GraphQL requests, e.g. `twitch:client_id=kimne78kx3ncx6brgo4mv6wki5h1ko`
-NOTE: These options may be changed/removed in the future without concern for backward compatibility
+#### nhkradirulive (NHK らじる★らじる LIVE)
+* `area`: Which regional variation to extract. Valid areas are: `sapporo`, `sendai`, `tokyo`, `nagoya`, `osaka`, `hiroshima`, `matsuyama`, `fukuoka`. Defaults to `tokyo`
+
+**Note**: These options may be changed/removed in the future without concern for backward compatibility
<!-- MANPAGE: MOVE "INSTALLATION" SECTION HERE -->
@@ -1700,17 +1751,69 @@ NOTE: These options may be changed/removed in the future without concern for bac
Plugins are loaded from `<root-dir>/ytdlp_plugins/<type>/__init__.py`; where `<root-dir>` is the directory of the binary (`<root-dir>/hypervideo`), or the root directory of the module if you are running directly from source-code (`<root dir>/hypervideo_dl/__main__.py`). Plugins are currently not supported for the `pip` version
-Plugins can be of `<type>`s `extractor` or `postprocessor`. Extractor plugins do not need to be enabled from the CLI and are automatically invoked when the input URL is suitable for it. Postprocessor plugins can be invoked using `--use-postprocessor NAME`.
+Plugins can be of `<type>`s `extractor` or `postprocessor`.
+- Extractor plugins do not need to be enabled from the CLI and are automatically invoked when the input URL is suitable for it.
+- Extractor plugins take priority over builtin extractors.
+- Postprocessor plugins can be invoked using `--use-postprocessor NAME`.
+
-See [ytdlp_plugins](ytdlp_plugins) for example plugins.
+Plugins are loaded from the namespace packages `hypervideo_dl_plugins.extractor` and `hypervideo_dl_plugins.postprocessor`.
-Note that **all** plugins are imported even if not invoked, and that **there are no checks** performed on plugin code. Use plugins at your own risk and only if you trust the code
+In other words, the file structure on the disk looks something like:
-If you are a plugin author, add [ytdlp-plugins](https://github.com/topics/ytdlp-plugins) as a topic to your repository for discoverability
+ hypervideo_dl_plugins/
+ extractor/
+ myplugin.py
+ postprocessor/
+ myplugin.py
+
+yt-dlp looks for these `hypervideo_dl_plugins` namespace folders in many locations (see below) and loads in plugins from **all** of them.
See the [wiki for some known plugins](https://github.com/yt-dlp/yt-dlp/wiki/Plugins)
+## Installing Plugins
+
+Plugins can be installed using various methods and locations.
+
+1. **Configuration directories**:
+ Plugin packages (containing a `hypervideo_dl_plugins` namespace folder) can be dropped into the following standard [configuration locations](#configuration):
+ * **User Plugins**
+ * `${XDG_CONFIG_HOME}/yt-dlp/plugins/<package name>/hypervideo_dl_plugins/` (recommended on Linux/macOS)
+ * `${XDG_CONFIG_HOME}/yt-dlp-plugins/<package name>/hypervideo_dl_plugins/`
+ * `${APPDATA}/yt-dlp/plugins/<package name>/hypervideo_dl_plugins/` (recommended on Windows)
+ * `${APPDATA}/yt-dlp-plugins/<package name>/hypervideo_dl_plugins/`
+ * `~/.yt-dlp/plugins/<package name>/hypervideo_dl_plugins/`
+ * `~/yt-dlp-plugins/<package name>/hypervideo_dl_plugins/`
+ * **System Plugins**
+ * `/etc/yt-dlp/plugins/<package name>/hypervideo_dl_plugins/`
+ * `/etc/yt-dlp-plugins/<package name>/hypervideo_dl_plugins/`
+2. **Executable location**: Plugin packages can similarly be installed in a `yt-dlp-plugins` directory under the executable location (recommended for portable installations):
+  * Binary: where the binary is `<root-dir>/yt-dlp.exe`, plugins are loaded from `<root-dir>/yt-dlp-plugins/<package name>/hypervideo_dl_plugins/`
+  * Source: where the entry point is `<root-dir>/hypervideo_dl/__main__.py`, plugins are loaded from `<root-dir>/yt-dlp-plugins/<package name>/hypervideo_dl_plugins/`
+
+3. **pip and other locations in `PYTHONPATH`**
+ * Plugin packages can be installed and managed using `pip`. See [yt-dlp-sample-plugins](https://github.com/yt-dlp/yt-dlp-sample-plugins) for an example.
+  * Note: plugin files across different plugin packages installed with pip must have unique filenames.
+  * Any path in `PYTHONPATH` is searched for the `hypervideo_dl_plugins` namespace folder.
+  * Note: This does not apply to Pyinstaller/py2exe builds.
+
+
+`.zip`, `.egg` and `.whl` archives containing a `hypervideo_dl_plugins` namespace folder in their root are also supported as plugin packages.
+* e.g. `${XDG_CONFIG_HOME}/yt-dlp/plugins/mypluginpkg.zip` where `mypluginpkg.zip` contains `hypervideo_dl_plugins/<type>/myplugin.py`
+
+Run yt-dlp with `--verbose` to check if the plugin has been loaded.
+
+## Developing Plugins
+
+See the [yt-dlp-sample-plugins](https://github.com/yt-dlp/yt-dlp-sample-plugins) repo for a template plugin package and the [Plugin Development](https://github.com/yt-dlp/yt-dlp/wiki/Plugin-Development) section of the wiki for a plugin development guide.
+
+All public classes with a name ending in `IE`/`PP` are imported from each file for extractors and postprocessors respectively. This respects underscore prefix (e.g. `_MyBasePluginIE` is private) and `__all__`. Modules can similarly be excluded by prefixing the module name with an underscore (e.g. `_myplugin.py`).
+
+To replace an existing extractor with a subclass of one, set the `plugin_name` class keyword argument (e.g. `class MyPluginIE(ABuiltInIE, plugin_name='myplugin')` will replace `ABuiltInIE` with `MyPluginIE`). Since the extractor replaces the parent, you should exclude the subclass extractor from being imported separately by making it private using one of the methods described above.
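+
+As a minimal sketch (the builtin import path and the plugin file name are illustrative assumptions):
+
+```python
+# hypervideo_dl_plugins/extractor/_myplugin.py - underscore prefix keeps it from separate import
+from hypervideo_dl.extractor.youtube import YoutubeIE
+
+
+class MyPluginIE(YoutubeIE, plugin_name='myplugin'):
+    def _real_extract(self, url):
+        # Hypothetical hook: log, then defer to the replaced builtin extractor
+        self.to_screen('URL handled by the plugin override')
+        return super()._real_extract(url)
+```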
+
+If you are a plugin author, add [yt-dlp-plugins](https://github.com/topics/yt-dlp-plugins) as a topic to your repository for discoverability.
+
+See the [Developer Instructions](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#developer-instructions) on how to write and test an extractor.
# EMBEDDING HYPERVIDEO
@@ -1872,7 +1975,7 @@ with hypervideo_dl.YoutubeDL() as ydl:
```python
import hypervideo_dl
-URL = ['https://www.youtube.com/watch?v=BaW_jenozKc']
+URLS = ['https://www.youtube.com/watch?v=BaW_jenozKc']
def format_selector(ctx):
""" Select the best video and the best audio that won't result in an mkv.
@@ -1938,12 +2041,14 @@ While these options are redundant, they are still expected to be used due to the
--reject-title REGEX --match-filter "title !~= (?i)REGEX"
--min-views COUNT --match-filter "view_count >=? COUNT"
--max-views COUNT --match-filter "view_count <=? COUNT"
+    --break-on-reject                    Use --break-match-filters
--user-agent UA --add-header "User-Agent:UA"
--referer URL --add-header "Referer:URL"
--playlist-start NUMBER -I NUMBER:
--playlist-end NUMBER -I :NUMBER
--playlist-reverse -I ::-1
--no-playlist-reverse Default
+ --no-colors --color no_color
#### Not recommended
@@ -1967,6 +2072,10 @@ While these options still work, their use is not recommended since there are oth
--youtube-skip-hls-manifest --extractor-args "youtube:skip=hls" (Alias: --no-youtube-include-hls-manifest)
--youtube-include-dash-manifest Default (Alias: --no-youtube-skip-dash-manifest)
--youtube-include-hls-manifest Default (Alias: --no-youtube-skip-hls-manifest)
+ --geo-bypass --xff "default"
+ --no-geo-bypass --xff "never"
+ --geo-bypass-country CODE --xff CODE
+ --geo-bypass-ip-block IP_BLOCK --xff IP_BLOCK
#### Developer options
diff --git a/completions/zsh/_hypervideo b/completions/zsh/_hypervideo
index f31f234..b0068a9 100644
--- a/completions/zsh/_hypervideo
+++ b/completions/zsh/_hypervideo
@@ -21,7 +21,7 @@ __hypervideo_dl() {
elif [[ ${prev} == "--recode-video" ]]; then
_arguments '*: :(mp4 flv ogg webm mkv)'
else
- _arguments '*: :(--help --version --ignore-errors --no-abort-on-error --abort-on-error --dump-user-agent --list-extractors --extractor-descriptions --use-extractors --force-generic-extractor --default-search --ignore-config --no-config-locations --config-locations --flat-playlist --no-flat-playlist --live-from-start --no-live-from-start --wait-for-video --no-wait-for-video --mark-watched --no-mark-watched --no-colors --compat-options --alias --proxy --socket-timeout --source-address --force-ipv4 --force-ipv6 --geo-verification-proxy --cn-verification-proxy --geo-bypass --no-geo-bypass --geo-bypass-country --geo-bypass-ip-block --playlist-start --playlist-end --playlist-items --match-title --reject-title --min-filesize --max-filesize --date --datebefore --dateafter --min-views --max-views --match-filters --no-match-filter --no-playlist --yes-playlist --age-limit --download-archive --no-download-archive --max-downloads --break-on-existing --break-on-reject --break-per-input --no-break-per-input --skip-playlist-after-errors --include-ads --no-include-ads --concurrent-fragments --limit-rate --throttled-rate --retries --file-access-retries --fragment-retries --retry-sleep --skip-unavailable-fragments --abort-on-unavailable-fragment --keep-fragments --no-keep-fragments --buffer-size --resize-buffer --no-resize-buffer --http-chunk-size --test --playlist-reverse --no-playlist-reverse --playlist-random --lazy-playlist --no-lazy-playlist --xattr-set-filesize --hls-prefer-native --hls-prefer-ffmpeg --hls-use-mpegts --no-hls-use-mpegts --download-sections --downloader --downloader-args --batch-file --no-batch-file --id --paths --output --output-na-placeholder --autonumber-size --autonumber-start --restrict-filenames --no-restrict-filenames --windows-filenames --no-windows-filenames --trim-filenames --no-overwrites --force-overwrites --no-force-overwrites --continue --no-continue --part --no-part --mtime --no-mtime --write-description --no-write-description --write-info-json --no-write-info-json --write-annotations --no-write-annotations --write-playlist-metafiles --no-write-playlist-metafiles --clean-info-json --no-clean-info-json --write-comments --no-write-comments --load-info-json --cookies --no-cookies --cookies-from-browser --no-cookies-from-browser --cache-dir --no-cache-dir --rm-cache-dir --write-thumbnail --no-write-thumbnail --write-all-thumbnails --list-thumbnails --write-link --write-url-link --write-webloc-link --write-desktop-link --quiet --no-warnings --simulate --no-simulate --ignore-no-formats-error --no-ignore-no-formats-error --skip-download --print --print-to-file --get-url --get-title --get-id --get-thumbnail --get-description --get-duration --get-filename --get-format --dump-json --dump-single-json --print-json --force-write-archive --newline --no-progress --progress --console-title --progress-template --verbose --dump-pages --write-pages --load-pages --youtube-print-sig-code --print-traffic --call-home --no-call-home --encoding --legacy-server-connect --no-check-certificates --prefer-insecure --user-agent --referer --add-header --bidi-workaround --sleep-requests --sleep-interval --max-sleep-interval --sleep-subtitles --format --format-sort --format-sort-force --no-format-sort-force --video-multistreams --no-video-multistreams --audio-multistreams --no-audio-multistreams --all-formats --prefer-free-formats --no-prefer-free-formats --check-formats --check-all-formats --no-check-formats --list-formats --list-formats-as-table --list-formats-old --merge-output-format 
--allow-unplayable-formats --no-allow-unplayable-formats --write-subs --no-write-subs --write-auto-subs --no-write-auto-subs --all-subs --list-subs --sub-format --sub-langs --username --password --twofactor --netrc --netrc-location --video-password --ap-mso --ap-username --ap-password --ap-list-mso --client-certificate --client-certificate-key --client-certificate-password --extract-audio --audio-format --audio-quality --remux-video --recode-video --postprocessor-args --keep-video --no-keep-video --post-overwrites --no-post-overwrites --embed-subs --no-embed-subs --embed-thumbnail --no-embed-thumbnail --embed-metadata --no-embed-metadata --embed-chapters --no-embed-chapters --embed-info-json --no-embed-info-json --metadata-from-title --parse-metadata --replace-in-metadata --xattrs --concat-playlist --fixup --prefer-avconv --prefer-ffmpeg --ffmpeg-location --exec --no-exec --exec-before-download --no-exec-before-download --convert-subs --convert-thumbnails --split-chapters --no-split-chapters --remove-chapters --no-remove-chapters --force-keyframes-at-cuts --no-force-keyframes-at-cuts --use-postprocessor --sponsorblock-mark --sponsorblock-remove --sponsorblock-chapter-title --no-sponsorblock --sponsorblock-api --sponskrub --no-sponskrub --sponskrub-cut --no-sponskrub-cut --sponskrub-force --no-sponskrub-force --sponskrub-location --sponskrub-args --extractor-retries --allow-dynamic-mpd --ignore-dynamic-mpd --hls-split-discontinuity --no-hls-split-discontinuity --extractor-args --youtube-include-dash-manifest --youtube-skip-dash-manifest --youtube-include-hls-manifest --youtube-skip-hls-manifest)'
+ _arguments '*: :(--help --version --ignore-errors --no-abort-on-error --abort-on-error --dump-user-agent --list-extractors --extractor-descriptions --use-extractors --force-generic-extractor --default-search --ignore-config --no-config-locations --config-locations --flat-playlist --no-flat-playlist --live-from-start --no-live-from-start --wait-for-video --no-wait-for-video --mark-watched --no-mark-watched --no-colors --color --compat-options --alias --proxy --socket-timeout --source-address --force-ipv4 --force-ipv6 --enable-file-urls --geo-verification-proxy --cn-verification-proxy --xff --geo-bypass --no-geo-bypass --geo-bypass-country --geo-bypass-ip-block --playlist-start --playlist-end --playlist-items --match-title --reject-title --min-filesize --max-filesize --date --datebefore --dateafter --min-views --max-views --match-filters --no-match-filters --break-match-filters --no-break-match-filters --no-playlist --yes-playlist --age-limit --download-archive --no-download-archive --max-downloads --break-on-existing --break-on-reject --break-per-input --no-break-per-input --skip-playlist-after-errors --include-ads --no-include-ads --concurrent-fragments --limit-rate --throttled-rate --retries --file-access-retries --fragment-retries --retry-sleep --skip-unavailable-fragments --abort-on-unavailable-fragments --keep-fragments --no-keep-fragments --buffer-size --resize-buffer --no-resize-buffer --http-chunk-size --test --playlist-reverse --no-playlist-reverse --playlist-random --lazy-playlist --no-lazy-playlist --xattr-set-filesize --hls-prefer-native --hls-prefer-ffmpeg --hls-use-mpegts --no-hls-use-mpegts --download-sections --downloader --downloader-args --batch-file --no-batch-file --id --paths --output --output-na-placeholder --autonumber-size --autonumber-start --restrict-filenames --no-restrict-filenames --windows-filenames --no-windows-filenames --trim-filenames --no-overwrites --force-overwrites --no-force-overwrites --continue --no-continue --part --no-part --mtime --no-mtime --write-description --no-write-description --write-info-json --no-write-info-json --write-annotations --no-write-annotations --write-playlist-metafiles --no-write-playlist-metafiles --clean-info-json --no-clean-info-json --write-comments --no-write-comments --load-info-json --cookies --no-cookies --cookies-from-browser --no-cookies-from-browser --cache-dir --no-cache-dir --rm-cache-dir --write-thumbnail --no-write-thumbnail --write-all-thumbnails --list-thumbnails --write-link --write-url-link --write-webloc-link --write-desktop-link --quiet --no-quiet --no-warnings --simulate --no-simulate --ignore-no-formats-error --no-ignore-no-formats-error --skip-download --print --print-to-file --get-url --get-title --get-id --get-thumbnail --get-description --get-duration --get-filename --get-format --dump-json --dump-single-json --print-json --force-write-archive --newline --no-progress --progress --console-title --progress-template --verbose --dump-pages --write-pages --load-pages --youtube-print-sig-code --print-traffic --call-home --no-call-home --encoding --legacy-server-connect --no-check-certificates --prefer-insecure --user-agent --referer --add-headers --bidi-workaround --sleep-requests --sleep-interval --max-sleep-interval --sleep-subtitles --format --format-sort --format-sort-force --no-format-sort-force --video-multistreams --no-video-multistreams --audio-multistreams --no-audio-multistreams --all-formats --prefer-free-formats --no-prefer-free-formats --check-formats --check-all-formats --no-check-formats 
--list-formats --list-formats-as-table --list-formats-old --merge-output-format --allow-unplayable-formats --no-allow-unplayable-formats --write-subs --no-write-subs --write-auto-subs --no-write-auto-subs --all-subs --list-subs --sub-format --sub-langs --username --password --twofactor --netrc --netrc-location --netrc-cmd --video-password --ap-mso --ap-username --ap-password --ap-list-mso --client-certificate --client-certificate-key --client-certificate-password --extract-audio --audio-format --audio-quality --remux-video --recode-video --postprocessor-args --keep-video --no-keep-video --post-overwrites --no-post-overwrites --embed-subs --no-embed-subs --embed-thumbnail --no-embed-thumbnail --embed-metadata --no-embed-metadata --embed-chapters --no-embed-chapters --embed-info-json --no-embed-info-json --metadata-from-title --parse-metadata --replace-in-metadata --xattrs --concat-playlist --fixup --prefer-avconv --prefer-ffmpeg --ffmpeg-location --exec --no-exec --exec-before-download --no-exec-before-download --convert-subs --convert-thumbnails --split-chapters --no-split-chapters --remove-chapters --no-remove-chapters --force-keyframes-at-cuts --no-force-keyframes-at-cuts --use-postprocessor --sponsorblock-mark --sponsorblock-remove --sponsorblock-chapter-title --no-sponsorblock --sponsorblock-api --sponskrub --no-sponskrub --sponskrub-cut --no-sponskrub-cut --sponskrub-force --no-sponskrub-force --sponskrub-location --sponskrub-args --extractor-retries --allow-dynamic-mpd --ignore-dynamic-mpd --hls-split-discontinuity --no-hls-split-discontinuity --extractor-args --youtube-include-dash-manifest --youtube-skip-dash-manifest --youtube-include-hls-manifest --youtube-skip-hls-manifest)'
fi
;;
esac
diff --git a/devscripts/changelog_override.json b/devscripts/changelog_override.json
new file mode 100644
index 0000000..d03db3f
--- /dev/null
+++ b/devscripts/changelog_override.json
@@ -0,0 +1,73 @@
+[
+ {
+ "action": "add",
+ "when": "29cb20bd563c02671b31dd840139e93dd37150a1",
+ "short": "[priority] **A new release type has been added!**\n * [`nightly`](https://github.com/yt-dlp/yt-dlp/releases/tag/nightly) builds will be made after each push, containing the latest fixes (but also possibly bugs).\n * When using `--update`/`-U`, a release binary will only update to its current channel (either `stable` or `nightly`).\n * The `--update-to` option has been added allowing the user more control over program upgrades (or downgrades).\n * `--update-to` can change the release channel (`stable`, `nightly`) and also upgrade or downgrade to specific tags.\n * **Usage**: `--update-to CHANNEL`, `--update-to TAG`, `--update-to CHANNEL@TAG`"
+ },
+ {
+ "action": "add",
+ "when": "5038f6d713303e0967d002216e7a88652401c22a",
+ "short": "[priority] **YouTube throttling fixes!**"
+ },
+ {
+ "action": "remove",
+ "when": "2e023649ea4e11151545a34dc1360c114981a236"
+ },
+ {
+ "action": "add",
+ "when": "01aba2519a0884ef17d5f85608dbd2a455577147",
+ "short": "[priority] YouTube: Improved throttling and signature fixes"
+ },
+ {
+ "action": "change",
+ "when": "c86e433c35fe5da6cb29f3539eef97497f84ed38",
+ "short": "[extractor/niconico:series] Fix extraction (#6898)",
+ "authors": ["sqrtNOT"]
+ },
+ {
+ "action": "change",
+ "when": "69a40e4a7f6caa5662527ebd2f3c4e8aa02857a2",
+ "short": "[extractor/youtube:music_search_url] Extract title (#7102)",
+ "authors": ["kangalio"]
+ },
+ {
+ "action": "change",
+ "when": "8417f26b8a819cd7ffcd4e000ca3e45033e670fb",
+ "short": "Add option `--color` (#6904)",
+ "authors": ["Grub4K"]
+ },
+ {
+ "action": "change",
+ "when": "b4e0d75848e9447cee2cd3646ce54d4744a7ff56",
+ "short": "Improve `--download-sections`\n - Support negative time-ranges\n - Add `*from-url` to obey time-ranges in URL",
+ "authors": ["pukkandan"]
+ },
+ {
+ "action": "change",
+ "when": "1e75d97db21152acc764b30a688e516f04b8a142",
+ "short": "[extractor/youtube] Add `ios` to default clients used\n - IOS is affected neither by 403 nor by nsig so helps mitigate them preemptively\n - IOS also has higher bit-rate 'premium' formats though they are not labeled as such",
+ "authors": ["pukkandan"]
+ },
+ {
+ "action": "change",
+ "when": "f2ff0f6f1914b82d4a51681a72cc0828115dcb4a",
+ "short": "[extractor/motherless] Add gallery support, fix groups (#7211)",
+ "authors": ["rexlambert22", "Ti4eeT4e"]
+ },
+ {
+ "action": "change",
+ "when": "a4486bfc1dc7057efca9dd3fe70d7fa25c56f700",
+ "short": "[misc] Revert \"Add automatic duplicate issue detection\"",
+ "authors": ["pukkandan"]
+ },
+ {
+ "action": "add",
+ "when": "1ceb657bdd254ad961489e5060f2ccc7d556b729",
+ "short": "[priority] Security: [[CVE-2023-35934](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-35934)] Fix [Cookie leak](https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-v8mc-9377-rwjj)\n - `--add-header Cookie:` is deprecated and auto-scoped to input URL domains\n - Cookies are scoped when passed to external downloaders\n - Add `cookies` field to info.json and deprecate `http_headers.Cookie`"
+ },
+ {
+ "action": "change",
+ "when": "b03fa7834579a01cc5fba48c0e73488a16683d48",
+ "short": "[ie/twitter] Revert 92315c03774cfabb3a921884326beb4b981f786b"
+ }
+]
diff --git a/devscripts/changelog_override.schema.json b/devscripts/changelog_override.schema.json
new file mode 100644
index 0000000..9bd747b
--- /dev/null
+++ b/devscripts/changelog_override.schema.json
@@ -0,0 +1,96 @@
+{
+ "$schema": "http://json-schema.org/draft/2020-12/schema",
+ "type": "array",
+ "uniqueItems": true,
+ "items": {
+ "type": "object",
+ "oneOf": [
+ {
+ "type": "object",
+ "properties": {
+ "action": {
+ "enum": [
+ "add"
+ ]
+ },
+ "when": {
+ "type": "string",
+ "pattern": "^([0-9a-f]{40}|\\d{4}\\.\\d{2}\\.\\d{2})$"
+ },
+ "hash": {
+ "type": "string",
+ "pattern": "^[0-9a-f]{40}$"
+ },
+ "short": {
+ "type": "string"
+ },
+ "authors": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ },
+ "required": [
+ "action",
+ "short"
+ ]
+ },
+ {
+ "type": "object",
+ "properties": {
+ "action": {
+ "enum": [
+ "remove"
+ ]
+ },
+ "when": {
+ "type": "string",
+ "pattern": "^([0-9a-f]{40}|\\d{4}\\.\\d{2}\\.\\d{2})$"
+ },
+ "hash": {
+ "type": "string",
+ "pattern": "^[0-9a-f]{40}$"
+ }
+ },
+ "required": [
+ "action",
+ "hash"
+ ]
+ },
+ {
+ "type": "object",
+ "properties": {
+ "action": {
+ "enum": [
+ "change"
+ ]
+ },
+ "when": {
+ "type": "string",
+ "pattern": "^([0-9a-f]{40}|\\d{4}\\.\\d{2}\\.\\d{2})$"
+ },
+ "hash": {
+ "type": "string",
+ "pattern": "^[0-9a-f]{40}$"
+ },
+ "short": {
+ "type": "string"
+ },
+ "authors": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ },
+ "required": [
+ "action",
+ "hash",
+ "short",
+ "authors"
+ ]
+ }
+ ]
+ }
+}
diff --git a/devscripts/cli_to_api.py b/devscripts/cli_to_api.py
new file mode 100644
index 0000000..563fa9e
--- /dev/null
+++ b/devscripts/cli_to_api.py
@@ -0,0 +1,48 @@
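+# Convert CLI options to their API (ydl_opts) equivalents by diffing
+# the parsed options against the parser defaults
+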
+# Allow direct execution
+import os
+import sys
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import hypervideo_dl
+import hypervideo_dl.options
+
+create_parser = hypervideo_dl.options.create_parser
+
+
+def parse_patched_options(opts):
+ patched_parser = create_parser()
+ patched_parser.defaults.update({
+ 'ignoreerrors': False,
+ 'retries': 0,
+ 'fragment_retries': 0,
+ 'extract_flat': False,
+ 'concat_playlist': 'never',
+ })
+ hypervideo_dl.options.create_parser = lambda: patched_parser
+ try:
+ return hypervideo_dl.parse_options(opts)
+ finally:
+ hypervideo_dl.options.create_parser = create_parser
+
+
+default_opts = parse_patched_options([]).ydl_opts
+
+
+def cli_to_api(opts, cli_defaults=False):
+ opts = (hypervideo_dl.parse_options if cli_defaults else parse_patched_options)(opts).ydl_opts
+
+ diff = {k: v for k, v in opts.items() if default_opts[k] != v}
+ if 'postprocessors' in diff:
+ diff['postprocessors'] = [pp for pp in diff['postprocessors']
+ if pp not in default_opts['postprocessors']]
+ return diff
+
+
+if __name__ == '__main__':
+ from pprint import pprint
+
+ print('\nThe arguments passed translate to:\n')
+ pprint(cli_to_api(sys.argv[1:]))
+ print('\nCombining these with the CLI defaults gives:\n')
+ pprint(cli_to_api(sys.argv[1:], True))
diff --git a/devscripts/lazy_load_template.py b/devscripts/lazy_load_template.py
index c8815e0..6f52165 100644
--- a/devscripts/lazy_load_template.py
+++ b/devscripts/lazy_load_template.py
@@ -6,6 +6,7 @@ from ..utils import (
age_restricted,
bug_reports_message,
classproperty,
+ variadic,
write_string,
)
diff --git a/devscripts/make_changelog.py b/devscripts/make_changelog.py
new file mode 100644
index 0000000..1206fd9
--- /dev/null
+++ b/devscripts/make_changelog.py
@@ -0,0 +1,510 @@
+from __future__ import annotations
+
+# Allow direct execution
+import os
+import sys
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import enum
+import itertools
+import json
+import logging
+import re
+from collections import defaultdict
+from dataclasses import dataclass
+from functools import lru_cache
+from pathlib import Path
+
+from devscripts.utils import read_file, run_process, write_file
+
+BASE_URL = 'https://github.com'
+LOCATION_PATH = Path(__file__).parent
+HASH_LENGTH = 7
+
+logger = logging.getLogger(__name__)
+
+
+class CommitGroup(enum.Enum):
+ PRIORITY = 'Important'
+ CORE = 'Core'
+ EXTRACTOR = 'Extractor'
+ DOWNLOADER = 'Downloader'
+ POSTPROCESSOR = 'Postprocessor'
+ MISC = 'Misc.'
+
+ @classmethod
+ @property
+ def ignorable_prefixes(cls):
+ return ('core', 'downloader', 'extractor', 'misc', 'postprocessor', 'upstream')
+
+ @classmethod
+ @lru_cache
+ def commit_lookup(cls):
+ return {
+ name: group
+ for group, names in {
+ cls.PRIORITY: {'priority'},
+ cls.CORE: {
+ 'aes',
+ 'cache',
+ 'compat_utils',
+ 'compat',
+ 'cookies',
+ 'core',
+ 'dependencies',
+ 'formats',
+ 'jsinterp',
+ 'networking',
+ 'outtmpl',
+ 'plugins',
+ 'update',
+ 'upstream',
+ 'utils',
+ },
+ cls.MISC: {
+ 'build',
+ 'cleanup',
+ 'devscripts',
+ 'docs',
+ 'misc',
+ 'test',
+ },
+ cls.EXTRACTOR: {'extractor', 'ie'},
+ cls.DOWNLOADER: {'downloader', 'fd'},
+ cls.POSTPROCESSOR: {'postprocessor', 'pp'},
+ }.items()
+ for name in names
+ }
+
+ @classmethod
+ def get(cls, value):
+ result = cls.commit_lookup().get(value)
+ if result:
+ logger.debug(f'Mapped {value!r} => {result.name}')
+ return result
+
+
+@dataclass
+class Commit:
+ hash: str | None
+ short: str
+ authors: list[str]
+
+ def __str__(self):
+ result = f'{self.short!r}'
+
+ if self.hash:
+ result += f' ({self.hash[:HASH_LENGTH]})'
+
+ if self.authors:
+ authors = ', '.join(self.authors)
+ result += f' by {authors}'
+
+ return result
+
+
+@dataclass
+class CommitInfo:
+ details: str | None
+ sub_details: tuple[str, ...]
+ message: str
+ issues: list[str]
+ commit: Commit
+ fixes: list[Commit]
+
+ def key(self):
+ return ((self.details or '').lower(), self.sub_details, self.message)
+
+
+def unique(items):
+ return sorted({item.strip().lower(): item for item in items if item}.values())
+
+
+class Changelog:
+ MISC_RE = re.compile(r'(?:^|\b)(?:lint(?:ing)?|misc|format(?:ting)?|fixes)(?:\b|$)', re.IGNORECASE)
+ ALWAYS_SHOWN = (CommitGroup.PRIORITY,)
+
+ def __init__(self, groups, repo, collapsible=False):
+ self._groups = groups
+ self._repo = repo
+ self._collapsible = collapsible
+
+ def __str__(self):
+ return '\n'.join(self._format_groups(self._groups)).replace('\t', ' ')
+
+ def _format_groups(self, groups):
+ first = True
+ for item in CommitGroup:
+ if self._collapsible and item not in self.ALWAYS_SHOWN and first:
+ first = False
+ yield '\n<details><summary><h3>Changelog</h3></summary>\n'
+
+ group = groups[item]
+ if group:
+ yield self.format_module(item.value, group)
+
+ if self._collapsible:
+ yield '\n</details>'
+
+ def format_module(self, name, group):
+ result = f'\n#### {name} changes\n' if name else '\n'
+ return result + '\n'.join(self._format_group(group))
+
+ def _format_group(self, group):
+ sorted_group = sorted(group, key=CommitInfo.key)
+ detail_groups = itertools.groupby(sorted_group, lambda item: (item.details or '').lower())
+ for _, items in detail_groups:
+ items = list(items)
+ details = items[0].details
+
+ if details == 'cleanup':
+ items = self._prepare_cleanup_misc_items(items)
+
+ prefix = '-'
+ if details:
+ if len(items) == 1:
+ prefix = f'- **{details}**:'
+ else:
+ yield f'- **{details}**'
+ prefix = '\t-'
+
+ sub_detail_groups = itertools.groupby(items, lambda item: tuple(map(str.lower, item.sub_details)))
+ for sub_details, entries in sub_detail_groups:
+ if not sub_details:
+ for entry in entries:
+ yield f'{prefix} {self.format_single_change(entry)}'
+ continue
+
+ entries = list(entries)
+ sub_prefix = f'{prefix} {", ".join(entries[0].sub_details)}'
+ if len(entries) == 1:
+ yield f'{sub_prefix}: {self.format_single_change(entries[0])}'
+ continue
+
+ yield sub_prefix
+ for entry in entries:
+ yield f'\t{prefix} {self.format_single_change(entry)}'
+
+ def _prepare_cleanup_misc_items(self, items):
+ cleanup_misc_items = defaultdict(list)
+ sorted_items = []
+ for item in items:
+ if self.MISC_RE.search(item.message):
+ cleanup_misc_items[tuple(item.commit.authors)].append(item)
+ else:
+ sorted_items.append(item)
+
+ for commit_infos in cleanup_misc_items.values():
+ sorted_items.append(CommitInfo(
+ 'cleanup', ('Miscellaneous',), ', '.join(
+ self._format_message_link(None, info.commit.hash).strip()
+ for info in sorted(commit_infos, key=lambda item: item.commit.hash or '')),
+ [], Commit(None, '', commit_infos[0].commit.authors), []))
+
+ return sorted_items
+
+ def format_single_change(self, info):
+ message = self._format_message_link(info.message, info.commit.hash)
+ if info.issues:
+ message = message.replace('\n', f' ({self._format_issues(info.issues)})\n', 1)
+
+ if info.commit.authors:
+ message = message.replace('\n', f' by {self._format_authors(info.commit.authors)}\n', 1)
+
+ if info.fixes:
+ fix_message = ', '.join(f'{self._format_message_link(None, fix.hash)}' for fix in info.fixes)
+
+ authors = sorted({author for fix in info.fixes for author in fix.authors}, key=str.casefold)
+ if authors != info.commit.authors:
+ fix_message = f'{fix_message} by {self._format_authors(authors)}'
+
+ message = message.replace('\n', f' (With fixes in {fix_message})\n', 1)
+
+ return message[:-1]
+
+ def _format_message_link(self, message, hash):
+ assert message or hash, 'Improperly defined commit message or override'
+ message = message if message else hash[:HASH_LENGTH]
+ if not hash:
+ return f'{message}\n'
+ return f'[{message}\n'.replace('\n', f']({self.repo_url}/commit/{hash})\n', 1)
+
+ def _format_issues(self, issues):
+ return ', '.join(f'[#{issue}]({self.repo_url}/issues/{issue})' for issue in issues)
+
+ @staticmethod
+ def _format_authors(authors):
+ return ', '.join(f'[{author}]({BASE_URL}/{author})' for author in authors)
+
+ @property
+ def repo_url(self):
+ return f'{BASE_URL}/{self._repo}'
+
+
+class CommitRange:
+ COMMAND = 'git'
+ COMMIT_SEPARATOR = '-----'
+
+ AUTHOR_INDICATOR_RE = re.compile(r'Authored by:? ', re.IGNORECASE)
+ MESSAGE_RE = re.compile(r'''
+ (?:\[(?P<prefix>[^\]]+)\]\ )?
+ (?:(?P<sub_details>`?[^:`]+`?): )?
+ (?P<message>.+?)
+ (?:\ \((?P<issues>\#\d+(?:,\ \#\d+)*)\))?
+ ''', re.VERBOSE | re.DOTALL)
+ EXTRACTOR_INDICATOR_RE = re.compile(r'(?:Fix|Add)\s+Extractors?', re.IGNORECASE)
+ REVERT_RE = re.compile(r'(?:\[[^\]]+\]\s+)?(?i:Revert)\s+([\da-f]{40})')
+ FIXES_RE = re.compile(r'(?i:Fix(?:es)?(?:\s+bugs?)?(?:\s+in|\s+for)?|Revert)\s+([\da-f]{40})')
+ UPSTREAM_MERGE_RE = re.compile(r'Update to ytdl-commit-([\da-f]+)')
+
+ def __init__(self, start, end, default_author=None):
+ self._start, self._end = start, end
+ self._commits, self._fixes = self._get_commits_and_fixes(default_author)
+ self._commits_added = []
+
+ def __iter__(self):
+ return iter(itertools.chain(self._commits.values(), self._commits_added))
+
+ def __len__(self):
+ return len(self._commits) + len(self._commits_added)
+
+ def __contains__(self, commit):
+ if isinstance(commit, Commit):
+ if not commit.hash:
+ return False
+ commit = commit.hash
+
+ return commit in self._commits
+
+ def _get_commits_and_fixes(self, default_author):
+ result = run_process(
+ self.COMMAND, 'log', f'--format=%H%n%s%n%b%n{self.COMMIT_SEPARATOR}',
+ f'{self._start}..{self._end}' if self._start else self._end).stdout
+
+ commits, reverts = {}, {}
+ fixes = defaultdict(list)
+ lines = iter(result.splitlines(False))
+ for i, commit_hash in enumerate(lines):
+ short = next(lines)
+ skip = short.startswith('Release ') or short == '[version] update'
+
+ authors = [default_author] if default_author else []
+ for line in iter(lambda: next(lines), self.COMMIT_SEPARATOR):
+ match = self.AUTHOR_INDICATOR_RE.match(line)
+ if match:
+ authors = sorted(map(str.strip, line[match.end():].split(',')), key=str.casefold)
+
+ commit = Commit(commit_hash, short, authors)
+ if skip and (self._start or not i):
+ logger.debug(f'Skipped commit: {commit}')
+ continue
+ elif skip:
+ logger.debug(f'Reached Release commit, breaking: {commit}')
+ break
+
+ revert_match = self.REVERT_RE.fullmatch(commit.short)
+ if revert_match:
+ reverts[revert_match.group(1)] = commit
+ continue
+
+ fix_match = self.FIXES_RE.search(commit.short)
+ if fix_match:
+ commitish = fix_match.group(1)
+ fixes[commitish].append(commit)
+
+ commits[commit.hash] = commit
+
+ for commitish, revert_commit in reverts.items():
+ reverted = commits.pop(commitish, None)
+ if reverted:
+                logger.debug(f'{revert_commit} fully reverted {reverted}')
+ else:
+ commits[revert_commit.hash] = revert_commit
+
+ for commitish, fix_commits in fixes.items():
+ if commitish in commits:
+ hashes = ', '.join(commit.hash[:HASH_LENGTH] for commit in fix_commits)
+ logger.info(f'Found fix(es) for {commitish[:HASH_LENGTH]}: {hashes}')
+ for fix_commit in fix_commits:
+ del commits[fix_commit.hash]
+ else:
+ logger.debug(f'Commit with fixes not in changes: {commitish[:HASH_LENGTH]}')
+
+ return commits, fixes
+
+ def apply_overrides(self, overrides):
+ for override in overrides:
+ when = override.get('when')
+ if when and when not in self and when != self._start:
+ logger.debug(f'Ignored {when!r}, not in commits {self._start!r}')
+ continue
+
+ override_hash = override.get('hash') or when
+ if override['action'] == 'add':
+ commit = Commit(override.get('hash'), override['short'], override.get('authors') or [])
+ logger.info(f'ADD {commit}')
+ self._commits_added.append(commit)
+
+ elif override['action'] == 'remove':
+ if override_hash in self._commits:
+ logger.info(f'REMOVE {self._commits[override_hash]}')
+ del self._commits[override_hash]
+
+ elif override['action'] == 'change':
+ if override_hash not in self._commits:
+ continue
+ commit = Commit(override_hash, override['short'], override.get('authors') or [])
+ logger.info(f'CHANGE {self._commits[commit.hash]} -> {commit}')
+ self._commits[commit.hash] = commit
+
+ self._commits = {key: value for key, value in reversed(self._commits.items())}
+
+ def groups(self):
+ group_dict = defaultdict(list)
+ for commit in self:
+ upstream_re = self.UPSTREAM_MERGE_RE.search(commit.short)
+ if upstream_re:
+ commit.short = f'[core/upstream] Merged with youtube-dl {upstream_re.group(1)}'
+
+ match = self.MESSAGE_RE.fullmatch(commit.short)
+ if not match:
+ logger.error(f'Error parsing short commit message: {commit.short!r}')
+ continue
+
+ prefix, sub_details_alt, message, issues = match.groups()
+ issues = [issue.strip()[1:] for issue in issues.split(',')] if issues else []
+
+ if prefix:
+ groups, details, sub_details = zip(*map(self.details_from_prefix, prefix.split(',')))
+ group = next(iter(filter(None, groups)), None)
+ details = ', '.join(unique(details))
+ sub_details = list(itertools.chain.from_iterable(sub_details))
+ else:
+ group = CommitGroup.CORE
+ details = None
+ sub_details = []
+
+ if sub_details_alt:
+ sub_details.append(sub_details_alt)
+ sub_details = tuple(unique(sub_details))
+
+ if not group:
+ if self.EXTRACTOR_INDICATOR_RE.search(commit.short):
+ group = CommitGroup.EXTRACTOR
+ else:
+ group = CommitGroup.POSTPROCESSOR
+ logger.warning(f'Failed to map {commit.short!r}, selected {group.name.lower()}')
+
+ commit_info = CommitInfo(
+ details, sub_details, message.strip(),
+ issues, commit, self._fixes[commit.hash])
+
+ logger.debug(f'Resolved {commit.short!r} to {commit_info!r}')
+ group_dict[group].append(commit_info)
+
+ return group_dict
+
+ @staticmethod
+ def details_from_prefix(prefix):
+ if not prefix:
+ return CommitGroup.CORE, None, ()
+
+ prefix, _, details = prefix.partition('/')
+ prefix = prefix.strip()
+ details = details.strip()
+
+ group = CommitGroup.get(prefix.lower())
+ if group is CommitGroup.PRIORITY:
+ prefix, _, details = details.partition('/')
+
+ if not details and prefix and prefix not in CommitGroup.ignorable_prefixes:
+ logger.debug(f'Replaced details with {prefix!r}')
+ details = prefix or None
+
+ if details == 'common':
+ details = None
+
+ if details:
+ details, *sub_details = details.split(':')
+ else:
+ sub_details = []
+
+ return group, details, sub_details
+
+
+def get_new_contributors(contributors_path, commits):
+ contributors = set()
+ if contributors_path.exists():
+ for line in read_file(contributors_path).splitlines():
+ author, _, _ = line.strip().partition(' (')
+ authors = author.split('/')
+ contributors.update(map(str.casefold, authors))
+
+ new_contributors = set()
+ for commit in commits:
+ for author in commit.authors:
+ author_folded = author.casefold()
+ if author_folded not in contributors:
+ contributors.add(author_folded)
+ new_contributors.add(author)
+
+ return sorted(new_contributors, key=str.casefold)
+
+
+if __name__ == '__main__':
+ import argparse
+
+ parser = argparse.ArgumentParser(
+ description='Create a changelog markdown from a git commit range')
+ parser.add_argument(
+ 'commitish', default='HEAD', nargs='?',
+ help='The commitish to create the range from (default: %(default)s)')
+ parser.add_argument(
+ '-v', '--verbosity', action='count', default=0,
+ help='increase verbosity (can be used twice)')
+ parser.add_argument(
+ '-c', '--contributors', action='store_true',
+ help='update CONTRIBUTORS file (default: %(default)s)')
+ parser.add_argument(
+ '--contributors-path', type=Path, default=LOCATION_PATH.parent / 'CONTRIBUTORS',
+ help='path to the CONTRIBUTORS file')
+ parser.add_argument(
+ '--no-override', action='store_true',
+ help='skip override json in commit generation (default: %(default)s)')
+ parser.add_argument(
+ '--override-path', type=Path, default=LOCATION_PATH / 'changelog_override.json',
+ help='path to the changelog_override.json file')
+ parser.add_argument(
+ '--default-author', default='pukkandan',
+        help='the author to use without an author indicator (default: %(default)s)')
+ parser.add_argument(
+ '--repo', default='hypervideo/hypervideo',
+ help='the github repository to use for the operations (default: %(default)s)')
+ parser.add_argument(
+ '--collapsible', action='store_true',
+ help='make changelog collapsible (default: %(default)s)')
+ args = parser.parse_args()
+
+ logging.basicConfig(
+ datefmt='%Y-%m-%d %H-%M-%S', format='{asctime} | {levelname:<8} | {message}',
+ level=logging.WARNING - 10 * args.verbosity, style='{', stream=sys.stderr)
+
+ commits = CommitRange(None, args.commitish, args.default_author)
+
+ if not args.no_override:
+ if args.override_path.exists():
+ overrides = json.loads(read_file(args.override_path))
+ commits.apply_overrides(overrides)
+ else:
+ logger.warning(f'File {args.override_path.as_posix()} does not exist')
+
+ logger.info(f'Loaded {len(commits)} commits')
+
+ new_contributors = get_new_contributors(args.contributors_path, commits)
+ if new_contributors:
+ if args.contributors:
+ write_file(args.contributors_path, '\n'.join(new_contributors) + '\n', mode='a')
+ logger.info(f'New contributors: {", ".join(new_contributors)}')
+
+ print(Changelog(commits.groups(), args.repo, args.collapsible))
diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py
index 69e1758..bc4b5ac 100644
--- a/devscripts/make_lazy_extractors.py
+++ b/devscripts/make_lazy_extractors.py
@@ -40,8 +40,12 @@ def main():
_ALL_CLASSES = get_all_ies() # Must be before import
+ import hypervideo_dl.plugins
from hypervideo_dl.extractor.common import InfoExtractor, SearchInfoExtractor
+ # Filter out plugins
+ _ALL_CLASSES = [cls for cls in _ALL_CLASSES if not cls.__module__.startswith(f'{hypervideo_dl.plugins.PACKAGE_NAME}.')]
+
DummyInfoExtractor = type('InfoExtractor', (InfoExtractor,), {'IE_NAME': NO_ATTR})
module_src = '\n'.join((
MODULE_TEMPLATE,
diff --git a/devscripts/make_readme.py b/devscripts/make_readme.py
index 6adfca0..525349f 100644
--- a/devscripts/make_readme.py
+++ b/devscripts/make_readme.py
@@ -45,33 +45,43 @@ switch_col_width = len(re.search(r'(?m)^\s{5,}', options).group())
delim = f'\n{" " * switch_col_width}'
PATCHES = (
- ( # Standardize update message
+ ( # Standardize `--update` message
r'(?m)^( -U, --update\s+).+(\n \s.+)*$',
r'\1Update this program to the latest version',
),
- ( # Headings
+ ( # Headings
r'(?m)^ (\w.+\n)( (?=\w))?',
r'## \1'
),
- ( # Do not split URLs
+ ( # Fixup `--date` formatting
+ rf'(?m)( --date DATE.+({delim}[^\[]+)*)\[.+({delim}.+)*$',
+ (rf'\1[now|today|yesterday][-N[day|week|month|year]].{delim}'
+ f'E.g. "--date today-2weeks" downloads only{delim}'
+ 'videos uploaded on the same day two weeks ago'),
+ ),
+ ( # Do not split URLs
rf'({delim[:-1]})? (?P<label>\[\S+\] )?(?P<url>https?({delim})?:({delim})?/({delim})?/(({delim})?\S+)+)\s',
lambda mobj: ''.join((delim, mobj.group('label') or '', re.sub(r'\s+', '', mobj.group('url')), '\n'))
),
- ( # Do not split "words"
+ ( # Do not split "words"
rf'(?m)({delim}\S+)+$',
lambda mobj: ''.join((delim, mobj.group(0).replace(delim, '')))
),
- ( # Allow overshooting last line
+ ( # Allow overshooting last line
rf'(?m)^(?P<prev>.+)${delim}(?P<current>.+)$(?!{delim})',
lambda mobj: (mobj.group().replace(delim, ' ')
if len(mobj.group()) - len(delim) + 1 <= max_width + ALLOWED_OVERSHOOT
else mobj.group())
),
- ( # Avoid newline when a space is available b/w switch and description
+ ( # Avoid newline when a space is available b/w switch and description
DISABLE_PATCH, # This creates issues with prepare_manpage
r'(?m)^(\s{4}-.{%d})(%s)' % (switch_col_width - 6, delim),
r'\1 '
),
+ ( # Replace brackets with a Markdown link
+ r'SponsorBlock API \((http.+)\)',
+ r'[SponsorBlock API](\1)'
+ ),
)
readme = read_file(README_FILE)
diff --git a/devscripts/utils.py b/devscripts/utils.py
index 3f67e62..66b0905 100644
--- a/devscripts/utils.py
+++ b/devscripts/utils.py
@@ -1,5 +1,6 @@
import argparse
import functools
+import subprocess
def read_file(fname):
@@ -12,8 +13,8 @@ def write_file(fname, content, mode='w'):
return f.write(content)
-# Get the version without importing the package
def read_version(fname='hypervideo_dl/version.py'):
+ """Get the version without importing the package"""
exec(compile(read_file(fname), fname, 'exec'))
return locals()['__version__']
@@ -33,3 +34,13 @@ def get_filename_args(has_infile=False, default_outfile=None):
def compose_functions(*functions):
return lambda x: functools.reduce(lambda y, f: f(y), functions, x)
+
+
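+# Run an external command with safe defaults: check=True, captured output, and UTF-8 text decoding with errors replaced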
+def run_process(*args, **kwargs):
+ kwargs.setdefault('text', True)
+ kwargs.setdefault('check', True)
+ kwargs.setdefault('capture_output', True)
+ if kwargs['text']:
+ kwargs.setdefault('encoding', 'utf-8')
+ kwargs.setdefault('errors', 'replace')
+ return subprocess.run(args, **kwargs)
diff --git a/hypervideo_dl/YoutubeDL.py b/hypervideo_dl/YoutubeDL.py
index 012c3b8..8d33187 100644
--- a/hypervideo_dl/YoutubeDL.py
+++ b/hypervideo_dl/YoutubeDL.py
@@ -1,9 +1,10 @@
import collections
import contextlib
+import copy
import datetime
import errno
import fileinput
-import functools
+import http.cookiejar
import io
import itertools
import json
@@ -13,6 +14,7 @@ import os
import random
import re
import shutil
+import string
import subprocess
import sys
import tempfile
@@ -20,19 +22,29 @@ import time
import tokenize
import traceback
import unicodedata
-import urllib.request
-from string import ascii_letters
from .cache import Cache
-from .compat import compat_os_name, compat_shlex_quote
-from .cookies import load_cookies
+from .compat import functools, urllib # isort: split
+from .compat import compat_os_name, compat_shlex_quote, urllib_req_to_req
+from .cookies import LenientSimpleCookie, load_cookies
from .downloader import FFmpegFD, get_suitable_downloader, shorten_protocol_name
from .downloader.rtmp import rtmpdump_version
from .extractor import gen_extractor_classes, get_info_extractor
from .extractor.common import UnsupportedURLIE
from .extractor.openload import PhantomJSwrapper
from .minicurses import format_text
-from .postprocessor import _PLUGIN_CLASSES as plugin_postprocessors
+from .networking import HEADRequest, Request, RequestDirector
+from .networking.common import _REQUEST_HANDLERS, _RH_PREFERENCES
+from .networking.exceptions import (
+ HTTPError,
+ NoSupportingHandlers,
+ RequestError,
+ SSLError,
+ _CompatHTTPError,
+ network_exceptions,
+)
+from .plugins import directories as plugin_directories
+from .postprocessor import _PLUGIN_CLASSES as plugin_pps
from .postprocessor import (
EmbedThumbnailPP,
FFmpegFixupDuplicateMoovPP,
@@ -67,13 +79,11 @@ from .utils import (
ExtractorError,
FormatSorter,
GeoRestrictedError,
- HEADRequest,
ISO3166Utils,
LazyList,
MaxDownloadsReached,
Namespace,
PagedList,
- PerRequestProxyHandler,
PlaylistEntries,
Popen,
PostProcessingError,
@@ -82,9 +92,6 @@ from .utils import (
SameFileError,
UnavailableVideoError,
UserNotLive,
- YoutubeDLCookieProcessor,
- YoutubeDLHandler,
- YoutubeDLRedirectHandler,
age_restricted,
args_to_str,
bug_reports_message,
@@ -97,6 +104,7 @@ from .utils import (
error_to_compat_str,
escapeHTML,
expand_path,
+ extract_basic_auth,
filter_dict,
float_or_none,
format_bytes,
@@ -112,24 +120,18 @@ from .utils import (
locked_file,
make_archive_id,
make_dir,
- make_HTTPS_handler,
- merge_headers,
- network_exceptions,
number_of_digits,
orderedSet,
orderedSet_from_options,
parse_filesize,
preferredencoding,
prepend_extension,
- register_socks_protocols,
remove_terminal_sequences,
render_table,
replace_extension,
sanitize_filename,
sanitize_path,
sanitize_url,
- sanitized_Request,
- std_headers,
str_or_none,
strftime_or_none,
subtitles_filename,
@@ -147,7 +149,14 @@ from .utils import (
write_json_file,
write_string,
)
-from .version import RELEASE_GIT_HEAD, VARIANT, __version__
+from .utils._utils import _YDLLogger
+from .utils.networking import (
+ HTTPHeaderDict,
+ clean_headers,
+ clean_proxies,
+ std_headers,
+)
+from .version import CHANNEL, RELEASE_GIT_HEAD, VARIANT, __version__
if compat_os_name == 'nt':
import ctypes
@@ -187,6 +196,8 @@ class YoutubeDL:
ap_username: Multiple-system operator account username.
ap_password: Multiple-system operator account password.
usenetrc: Use netrc for authentication instead.
+ netrc_location: Location of the netrc file. Defaults to ~/.netrc.
+ netrc_cmd: Use a shell command to get credentials.
verbose: Print additional info to stdout.
quiet: Do not print messages to stdout.
no_warnings: Do not print out anything for warnings.
@@ -243,8 +254,6 @@ class YoutubeDL:
overwrites: Overwrite all video and metadata files if True,
overwrite only non-video files if None
and don't overwrite any file if False
- For compatibility with youtube-dl,
- "nooverwrites" may also be used instead
playlist_items: Specific indices of playlist to download.
playlistrandom: Download playlist items in random order.
lazy_playlist: Process playlist entries as they are received.
@@ -255,7 +264,7 @@ class YoutubeDL:
consoletitle: Display progress in console window's titlebar.
writedescription: Write the video description to a .description file
writeinfojson: Write the video description to a .info.json file
- clean_infojson: Remove private fields from the infojson
+ clean_infojson: Remove internal metadata from the infojson
getcomments: Extract video comments. This will not be written to disk
unless writeinfojson is also given
writeannotations: Write the video annotations to a .annotations.xml file
@@ -277,7 +286,7 @@ class YoutubeDL:
subtitles. The language can be prefixed with a "-" to
exclude it from the requested languages, e.g. ['all', '-live_chat']
keepvideo: Keep the video file after post-processing
- daterange: A DateRange object, download only if the upload_date is in the range.
+ daterange: A utils.DateRange object, download only if the upload_date is in the range.
skip_download: Skip the actual download of the video file
cachedir: Location of the cache files in the filesystem.
False to disable filesystem cache.
@@ -297,8 +306,6 @@ class YoutubeDL:
Videos already present in the file are not downloaded again.
break_on_existing: Stop the download process after attempting to download a
file that is in the archive.
- break_on_reject: Stop the download process when encountering a video that
- has been filtered out.
break_per_url: Whether break_on_reject and break_on_existing
should act on each input URL as opposed to for the entire queue
cookiefile: File name or text stream from where cookies should be read and dumped to
@@ -315,6 +322,7 @@ class YoutubeDL:
If not provided and the key is encrypted, hypervideo will ask interactively
prefer_insecure: Use HTTP instead of HTTPS to retrieve information.
(Only supported by some extractors)
+ enable_file_urls: Enable file:// URLs. This is disabled by default for security reasons.
http_headers: A dictionary of custom headers to be used for all requests
proxy: URL of the proxy server to use
geo_verification_proxy: URL of the proxy to use for IP address verification
@@ -327,13 +335,13 @@ class YoutubeDL:
'auto' for elaborate guessing
encoding: Use this encoding instead of the system-specified.
extract_flat: Whether to resolve and process url_results further
- * False: Always process (default)
+ * False: Always process. Default for API
* True: Never process
* 'in_playlist': Do not process inside playlist/multi_video
* 'discard': Always process, but don't return the result
from inside playlist/multi_video
* 'discard_in_playlist': Same as "discard", but only for
- playlists (not multi_video)
+ playlists (not multi_video). Default for CLI
wait_for_video: If given, wait for scheduled streams to become available.
The value should be a tuple containing the range
(min_secs, max_secs) to wait between retries
@@ -410,8 +418,15 @@ class YoutubeDL:
- If it returns None, the video is downloaded.
- If it returns utils.NO_DEFAULT, the user is interactively
asked whether to download the video.
+ - Raise utils.DownloadCancelled(msg) to abort remaining
+ downloads when a video is rejected.
match_filter_func in utils.py is one example for this.
- no_color: Do not emit color codes in output.
+ color: A dictionary with output stream names as keys
+ and their respective color policy as values.
+ Can also just be a single color policy,
+ in which case it applies to all outputs.
+ Valid stream names are 'stdout' and 'stderr'.
+ Valid color policies are one of 'always', 'auto', 'no_color' or 'never'.
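+ E.g. {'stdout': 'no_color', 'stderr': 'auto'}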
geo_bypass: Bypass geographic restriction via faking X-Forwarded-For
HTTP header
geo_bypass_country:
@@ -468,7 +483,7 @@ class YoutubeDL:
can also be used
The following options are used by the extractors:
- extractor_retries: Number of times to retry for known errors
+ extractor_retries: Number of times to retry for known errors (default: 3)
dynamic_mpd: Whether to process dynamic DASH manifests (default: True)
hls_split_discontinuity: Split HLS playlists to different formats at
discontinuities such as ad breaks (default: False)
@@ -479,6 +494,9 @@ class YoutubeDL:
The following options are deprecated and may be removed in the future:
+ break_on_reject: Stop the download process when encountering a video that
+ has been filtered out.
+ - `raise DownloadCancelled(msg)` in match_filter instead
force_generic_extractor: Force downloader to use the generic extractor
- Use allowed_extractors = ['generic', 'default']
playliststart: - Use playlist_items
@@ -530,6 +548,8 @@ class YoutubeDL:
data will be downloaded and processed by extractor.
You can reduce network I/O by disabling it if you don't
care about HLS. (only for youtube)
+ no_color: Same as `color='no_color'`
+ no_overwrites: Same as `overwrites=False`
"""
_NUMERIC_FIELDS = {
@@ -549,8 +569,8 @@ class YoutubeDL:
'width', 'height', 'aspect_ratio', 'resolution', 'dynamic_range', 'tbr', 'abr', 'acodec', 'asr', 'audio_channels',
'vbr', 'fps', 'vcodec', 'container', 'filesize', 'filesize_approx', 'rows', 'columns',
'player_url', 'protocol', 'fragment_base_url', 'fragments', 'is_from_start',
- 'preference', 'language', 'language_preference', 'quality', 'source_preference',
- 'http_headers', 'stretched_ratio', 'no_resume', 'has_drm', 'downloader_options',
+ 'preference', 'language', 'language_preference', 'quality', 'source_preference', 'cookies',
+ 'http_headers', 'stretched_ratio', 'no_resume', 'has_drm', 'extra_param_to_segment_url', 'hls_aes', 'downloader_options',
'page_url', 'app', 'play_path', 'tc_url', 'flash_version', 'rtmp_live', 'rtmp_conn', 'rtmp_protocol', 'rtmp_real_time'
}
_format_selection_exts = {
@@ -581,8 +601,8 @@ class YoutubeDL:
self._playlist_level = 0
self._playlist_urls = set()
self.cache = Cache(self)
+ self.__header_cookies = []
- windows_enable_vt_mode()
stdout = sys.stderr if self.params.get('logtostderr') else sys.stdout
self._out_files = Namespace(
out=stdout,
@@ -591,9 +611,31 @@ class YoutubeDL:
console=None if compat_os_name == 'nt' else next(
filter(supports_terminal_sequences, (sys.stderr, sys.stdout)), None)
)
+
+ try:
+ windows_enable_vt_mode()
+ except Exception as e:
+ self.write_debug(f'Failed to enable VT mode: {e}')
+
+ if self.params.get('no_color'):
+ if self.params.get('color') is not None:
+ self.params.setdefault('_warnings', []).append(
+ 'Overwriting params from "color" with "no_color"')
+ self.params['color'] = 'no_color'
+
+ term_allow_color = os.environ.get('TERM', '').lower() != 'dumb'
+
+ def process_color_policy(stream):
+ stream_name = {sys.stdout: 'stdout', sys.stderr: 'stderr'}[stream]
+ policy = traverse_obj(self.params, ('color', (stream_name, None), {str}), get_all=False)
+ if policy in ('auto', None):
+ return term_allow_color and supports_terminal_sequences(stream)
+ assert policy in ('always', 'never', 'no_color'), policy
+ return {'always': True, 'never': False}.get(policy, policy)
+
self._allow_colors = Namespace(**{
- type_: not self.params.get('no_color') and supports_terminal_sequences(stream)
- for type_, stream in self._out_files.items_ if type_ != 'console'
+ name: process_color_policy(stream)
+ for name, stream in self._out_files.items_ if name != 'console'
})
# The code is left like this to be reused for future deprecations
@@ -605,7 +647,7 @@ class YoutubeDL:
'\n You will no longer receive updates on this version')
if current_version < MIN_SUPPORTED:
msg = 'Python version %d.%d is no longer supported'
- self.deprecation_warning(
+ self.deprecated_feature(
f'{msg}! Please update to Python %d.%d or above' % (*current_version, *MIN_RECOMMENDED))
if self.params.get('allow_unplayable_formats'):
@@ -636,6 +678,11 @@ class YoutubeDL:
raise
self.params['compat_opts'] = set(self.params.get('compat_opts', ()))
+ self.params['http_headers'] = HTTPHeaderDict(std_headers, self.params.get('http_headers'))
+ self._load_cookies(self.params['http_headers'].get('Cookie')) # compat
+ self.params['http_headers'].pop('Cookie', None)
+ self._request_director = self.build_request_director(_REQUEST_HANDLERS.values(), _RH_PREFERENCES)
+
if auto_init and auto_init != 'no_verbose_header':
self.print_debug_header()
@@ -706,9 +753,6 @@ class YoutubeDL:
else self.params['format'] if callable(self.params['format'])
else self.build_format_selector(self.params['format']))
- # Set http_headers defaults according to std_headers
- self.params['http_headers'] = merge_headers(std_headers, self.params.get('http_headers', {}))
-
hooks = {
'post_hooks': self.add_post_hook,
'progress_hooks': self.add_progress_hook,
@@ -725,9 +769,6 @@ class YoutubeDL:
get_postprocessor(pp_def.pop('key'))(self, **pp_def),
when=when)
- self._setup_opener()
- register_socks_protocols()
-
def preload_download_archive(fn):
"""Preload the archive, if any is specified"""
archive = set()
@@ -903,11 +944,17 @@ class YoutubeDL:
self.save_console_title()
return self
+ def save_cookies(self):
+ if self.params.get('cookiefile') is not None:
+ self.cookiejar.save()
+
def __exit__(self, *args):
self.restore_console_title()
+ self.close()
- if self.params.get('cookiefile') is not None:
- self.cookiejar.save(ignore_discard=True, ignore_expires=True)
+ def close(self):
+ self.save_cookies()
+ self._request_director.close()
def trouble(self, message=None, tb=None, is_error=True):
"""Determine action to take when a download problem appears.
@@ -950,6 +997,7 @@ class YoutubeDL:
ID='green',
DELIM='blue',
ERROR='red',
+ BAD_FORMAT='light red',
WARNING='yellow',
SUPPRESS='light black',
)
@@ -963,7 +1011,7 @@ class YoutubeDL:
text = text.encode(encoding, 'ignore').decode(encoding)
if fallback is not None and text != original_text:
text = fallback
- return format_text(text, f) if allow_colors else text if fallback is None else fallback
+ return format_text(text, f) if allow_colors is True else text if fallback is None else fallback
def _format_out(self, *args, **kwargs):
return self._format_text(self._out_files.out, self._allow_colors.out, *args, **kwargs)
@@ -1066,7 +1114,7 @@ class YoutubeDL:
# correspondingly that is not what we want since we need to keep
# '%%' intact for template dict substitution step. Working around
# with boundary-alike separator hack.
- sep = ''.join([random.choice(ascii_letters) for _ in range(32)])
+ sep = ''.join(random.choices(string.ascii_letters, k=32))
outtmpl = outtmpl.replace('%%', f'%{sep}%').replace('$$', f'${sep}$')
# outtmpl should be expand_path'ed before template dict substitution
@@ -1144,7 +1192,7 @@ class YoutubeDL:
}
MATH_FIELD_RE = rf'(?:{FIELD_RE}|-?{NUMBER_RE})'
MATH_OPERATORS_RE = r'(?:%s)' % '|'.join(map(re.escape, MATH_FUNCTIONS.keys()))
- INTERNAL_FORMAT_RE = re.compile(rf'''(?x)
+ INTERNAL_FORMAT_RE = re.compile(rf'''(?xs)
(?P<negate>-)?
(?P<fields>{FIELD_RE})
(?P<maths>(?:{MATH_OPERATORS_RE}{MATH_FIELD_RE})*)
@@ -1225,32 +1273,45 @@ class YoutubeDL:
return list(obj)
return repr(obj)
+ class _ReplacementFormatter(string.Formatter):
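+ # Formatter for the "&replacement" syntax: digit fields like {0} resolve to the matched field's value; anything else is rejected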
+ def get_field(self, field_name, args, kwargs):
+ if field_name.isdigit():
+ return args[0], -1
+ raise ValueError('Unsupported field')
+
+ replacement_formatter = _ReplacementFormatter()
+
def create_key(outer_mobj):
if not outer_mobj.group('has_key'):
return outer_mobj.group(0)
key = outer_mobj.group('key')
mobj = re.match(INTERNAL_FORMAT_RE, key)
- initial_field = mobj.group('fields') if mobj else ''
- value, replacement, default = None, None, na
+ value, replacement, default, last_field = None, None, na, ''
while mobj:
mobj = mobj.groupdict()
default = mobj['default'] if mobj['default'] is not None else default
value = get_value(mobj)
- replacement = mobj['replacement']
+ last_field, replacement = mobj['fields'], mobj['replacement']
if value is None and mobj['alternate']:
mobj = re.match(INTERNAL_FORMAT_RE, mobj['remaining'][1:])
else:
break
- fmt = outer_mobj.group('format')
- if fmt == 's' and value is not None and key in field_size_compat_map.keys():
- fmt = f'0{field_size_compat_map[key]:d}d'
+ if None not in (value, replacement):
+ try:
+ value = replacement_formatter.format(replacement, value)
+ except ValueError:
+ value, default = None, na
- value = default if value is None else value if replacement is None else replacement
+ fmt = outer_mobj.group('format')
+ if fmt == 's' and last_field in field_size_compat_map.keys() and isinstance(value, int):
+ fmt = f'0{field_size_compat_map[last_field]:d}d'
flags = outer_mobj.group('conversion') or ''
str_fmt = f'{fmt[:-1]}s'
- if fmt[-1] == 'l': # list
+ if value is None:
+ value, fmt = default, 's'
+ elif fmt[-1] == 'l': # list
delim = '\n' if '#' in flags else ', '
value, fmt = delim.join(map(str, variadic(value, allowed_types=(str, bytes)))), str_fmt
elif fmt[-1] == 'j': # json
@@ -1275,24 +1336,26 @@ class YoutubeDL:
value = format_decimal_suffix(value, f'%{num_fmt}f%s' if num_fmt else '%d%s',
factor=1024 if '#' in flags else 1000)
elif fmt[-1] == 'S': # filename sanitization
- value, fmt = filename_sanitizer(initial_field, value, restricted='#' in flags), str_fmt
+ value, fmt = filename_sanitizer(last_field, value, restricted='#' in flags), str_fmt
elif fmt[-1] == 'c':
if value:
value = str(value)[0]
else:
fmt = str_fmt
- elif fmt[-1] not in 'rs': # numeric
+ elif fmt[-1] not in 'rsa': # numeric
value = float_or_none(value)
if value is None:
value, fmt = default, 's'
if sanitize:
+ # If value is an object, sanitize might convert it to a string
+ # So we convert it to repr first
if fmt[-1] == 'r':
- # If value is an object, sanitize might convert it to a string
- # So we convert it to repr first
value, fmt = repr(value), str_fmt
- if fmt[-1] in 'csr':
- value = sanitizer(initial_field, value)
+ elif fmt[-1] == 'a':
+ value, fmt = ascii(value), str_fmt
+ if fmt[-1] in 'csra':
+ value = sanitizer(last_field, value)
key = '%s\0%s' % (key.replace('%', '%\0'), outer_mobj.group('format'))
TMPL_DICT[key] = value
@@ -1357,7 +1420,7 @@ class YoutubeDL:
def _match_entry(self, info_dict, incomplete=False, silent=False):
"""Returns None if the file should be downloaded"""
- _type = info_dict.get('_type', 'video')
+ _type = 'video' if 'playlist-match-filter' in self.params['compat_opts'] else info_dict.get('_type', 'video')
assert incomplete or _type == 'video', 'Only video result can be considered complete'
video_title = info_dict.get('title', info_dict.get('id', 'entry'))
@@ -1398,31 +1461,47 @@ class YoutubeDL:
return 'Skipping "%s" because it is age restricted' % video_title
match_filter = self.params.get('match_filter')
- if match_filter is not None:
+ if match_filter is None:
+ return None
+
+ cancelled = None
+ try:
try:
ret = match_filter(info_dict, incomplete=incomplete)
except TypeError:
# For backward compatibility
ret = None if incomplete else match_filter(info_dict)
- if ret is NO_DEFAULT:
- while True:
- filename = self._format_screen(self.prepare_filename(info_dict), self.Styles.FILENAME)
- reply = input(self._format_screen(
- f'Download "{filename}"? (Y/n): ', self.Styles.EMPHASIS)).lower().strip()
- if reply in {'y', ''}:
- return None
- elif reply == 'n':
- return f'Skipping {video_title}'
- elif ret is not None:
- return ret
- return None
+ except DownloadCancelled as err:
+ if err.msg is not NO_DEFAULT:
+ raise
+ ret, cancelled = err.msg, err
+
+ if ret is NO_DEFAULT:
+ while True:
+ filename = self._format_screen(self.prepare_filename(info_dict), self.Styles.FILENAME)
+ reply = input(self._format_screen(
+ f'Download "{filename}"? (Y/n): ', self.Styles.EMPHASIS)).lower().strip()
+ if reply in {'y', ''}:
+ return None
+ elif reply == 'n':
+ if cancelled:
+ raise type(cancelled)(f'Skipping {video_title}')
+ return f'Skipping {video_title}'
+ return ret
if self.in_download_archive(info_dict):
- reason = '%s has already been recorded in the archive' % video_title
+ reason = ''.join((
+ format_field(info_dict, 'id', f'{self._format_screen("%s", self.Styles.ID)}: '),
+ format_field(info_dict, 'title', f'{self._format_screen("%s", self.Styles.EMPHASIS)} '),
+ 'has already been recorded in the archive'))
break_opt, break_err = 'break_on_existing', ExistingVideoReached
else:
- reason = check_filter()
- break_opt, break_err = 'break_on_reject', RejectedVideoReached
+ try:
+ reason = check_filter()
+ except DownloadCancelled as e:
+ reason, break_opt, break_err = e.msg, 'match_filter', type(e)
+ else:
+ break_opt, break_err = 'break_on_reject', RejectedVideoReached
if reason is not None:
if not silent:
self.to_screen('[download] ' + reason)
@@ -1475,7 +1554,8 @@ class YoutubeDL:
temp_id = ie.get_temp_id(url)
if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': key}):
- self.to_screen(f'[{key}] {temp_id}: has already been recorded in the archive')
+ self.to_screen(f'[download] {self._format_screen(temp_id, self.Styles.ID)}: '
+ 'has already been recorded in the archive')
if self.params.get('break_on_existing', False):
raise ExistingVideoReached()
break
@@ -1563,8 +1643,67 @@ class YoutubeDL:
self.to_screen('')
raise
+ def _load_cookies(self, data, *, autoscope=True):
+ """Loads cookies from a `Cookie` header
+
+ This tries to work around the security vulnerability of passing cookies to every domain.
+ See: https://github.com/hypervideo/hypervideo/security/advisories/GHSA-v8mc-9377-rwjj
+
+ @param data The Cookie header as string to load the cookies from
+ @param autoscope If `False`, scope cookies using Set-Cookie syntax and error for cookies without domains
+ If `True`, save cookies for later to be stored in the jar with a limited scope
+ If a URL, save cookies in the jar with the domain of the URL
+ """
+ for cookie in LenientSimpleCookie(data).values():
+ if autoscope and any(cookie.values()):
+ raise ValueError('Invalid syntax in Cookie Header')
+
+ domain = cookie.get('domain') or ''
+ expiry = cookie.get('expires')
+ if expiry == '': # 0 is valid
+ expiry = None
+ prepared_cookie = http.cookiejar.Cookie(
+ cookie.get('version') or 0, cookie.key, cookie.value, None, False,
+ domain, True, True, cookie.get('path') or '', bool(cookie.get('path')),
+ cookie.get('secure') or False, expiry, False, None, None, {})
+
+ if domain:
+ self.cookiejar.set_cookie(prepared_cookie)
+ elif autoscope is True:
+ self.deprecated_feature(
+ 'Passing cookies as a header is a potential security risk; '
+ 'they will be scoped to the domain of the downloaded URLs. '
+ 'Please consider loading cookies from a file or browser instead.')
+ self.__header_cookies.append(prepared_cookie)
+ elif autoscope:
+ self.report_warning(
+ 'The extractor result contains an unscoped cookie as an HTTP header. '
+ f'If you are using hypervideo with an input URL{bug_reports_message(before=",")}',
+ only_once=True)
+ self._apply_header_cookies(autoscope, [prepared_cookie])
+ else:
+ self.report_error('Unscoped cookies are not allowed; please specify some sort of scoping',
+ tb=False, is_error=False)
+
+ def _apply_header_cookies(self, url, cookies=None):
+ """Applies stray header cookies to the provided url
+
+ This loads header cookies and scopes them to the domain provided in `url`.
+ While this is not ideal, it helps reduce the risk of them being sent
+ to an unintended destination while mostly maintaining compatibility.
+ """
+ parsed = urllib.parse.urlparse(url)
+ if not parsed.hostname:
+ return
+
+ for cookie in map(copy.copy, cookies or self.__header_cookies):
+ cookie.domain = f'.{parsed.hostname}'
+ self.cookiejar.set_cookie(cookie)
+
@_handle_extraction_exceptions
def __extract_info(self, url, ie, download, extra_info, process):
+ self._apply_header_cookies(url)
+
try:
ie_result = ie.extract(url)
except UserNotLive as e:
@@ -1624,8 +1763,8 @@ class YoutubeDL:
if result_type in ('url', 'url_transparent'):
ie_result['url'] = sanitize_url(
ie_result['url'], scheme='http' if self.params.get('prefer_insecure') else 'https')
- if ie_result.get('original_url'):
- extra_info.setdefault('original_url', ie_result['original_url'])
+ if ie_result.get('original_url') and not extra_info.get('original_url'):
+ extra_info = {'original_url': ie_result['original_url'], **extra_info}
extract_flat = self.params.get('extract_flat', False)
if ((extract_flat == 'in_playlist' and 'playlist' in extra_info)
@@ -1638,7 +1777,7 @@ class YoutubeDL:
self.add_extra_info(info_copy, extra_info)
info_copy, _ = self.pre_process(info_copy)
self._fill_common_fields(info_copy, False)
- self.__forced_printings(info_copy, self.prepare_filename(info_copy), incomplete=True)
+ self.__forced_printings(info_copy)
self._raise_pending_errors(info_copy)
if self.params.get('force_write_download_archive', False):
self.record_download_archive(info_copy)
@@ -1768,7 +1907,7 @@ class YoutubeDL:
return {
**info,
'playlist_index': 0,
- '__last_playlist_index': max(ie_result['requested_entries'] or (0, 0)),
+ '__last_playlist_index': max(ie_result.get('requested_entries') or (0, 0)),
'extractor': ie_result['extractor'],
'extractor_key': ie_result['extractor_key'],
}
@@ -1842,7 +1981,7 @@ class YoutubeDL:
continue
entry['__x_forwarded_for_ip'] = ie_result.get('__x_forwarded_for_ip')
- if not lazy and 'playlist-index' in self.params.get('compat_opts', []):
+ if not lazy and 'playlist-index' in self.params['compat_opts']:
playlist_index = ie_result['requested_entries'][i]
entry_copy = collections.ChainMap(entry, {
@@ -1860,11 +1999,10 @@ class YoutubeDL:
self.to_screen('[download] Downloading item %s of %s' % (
self._format_screen(i + 1, self.Styles.ID), self._format_screen(n_entries, self.Styles.EMPHASIS)))
- extra.update({
+ entry_result = self.__process_iterable_entry(entry, download, collections.ChainMap({
'playlist_index': playlist_index,
'playlist_autonumber': i + 1,
- })
- entry_result = self.__process_iterable_entry(entry, download, extra)
+ }, extra))
if not entry_result:
failures += 1
if failures >= max_failures:
@@ -1908,7 +2046,7 @@ class YoutubeDL:
'!=': operator.ne,
}
operator_rex = re.compile(r'''(?x)\s*
- (?P<key>width|height|tbr|abr|vbr|asr|filesize|filesize_approx|fps)\s*
+ (?P<key>[\w.-]+)\s*
(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
(?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)\s*
''' % '|'.join(map(re.escape, OPERATORS.keys())))
@@ -2025,90 +2163,88 @@ class YoutubeDL:
allow_multiple_streams = {'audio': self.params.get('allow_multiple_audio_streams', False),
'video': self.params.get('allow_multiple_video_streams', False)}
- check_formats = self.params.get('check_formats') == 'selected'
-
def _parse_filter(tokens):
filter_parts = []
- for type, string, start, _, _ in tokens:
- if type == tokenize.OP and string == ']':
+ for type, string_, start, _, _ in tokens:
+ if type == tokenize.OP and string_ == ']':
return ''.join(filter_parts)
else:
- filter_parts.append(string)
+ filter_parts.append(string_)
def _remove_unused_ops(tokens):
# Remove operators that we don't use and join them with the surrounding strings.
# E.g. 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
ALLOWED_OPS = ('/', '+', ',', '(', ')')
last_string, last_start, last_end, last_line = None, None, None, None
- for type, string, start, end, line in tokens:
- if type == tokenize.OP and string == '[':
+ for type, string_, start, end, line in tokens:
+ if type == tokenize.OP and string_ == '[':
if last_string:
yield tokenize.NAME, last_string, last_start, last_end, last_line
last_string = None
- yield type, string, start, end, line
+ yield type, string_, start, end, line
# everything inside brackets will be handled by _parse_filter
- for type, string, start, end, line in tokens:
- yield type, string, start, end, line
- if type == tokenize.OP and string == ']':
+ for type, string_, start, end, line in tokens:
+ yield type, string_, start, end, line
+ if type == tokenize.OP and string_ == ']':
break
- elif type == tokenize.OP and string in ALLOWED_OPS:
+ elif type == tokenize.OP and string_ in ALLOWED_OPS:
if last_string:
yield tokenize.NAME, last_string, last_start, last_end, last_line
last_string = None
- yield type, string, start, end, line
+ yield type, string_, start, end, line
elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
if not last_string:
- last_string = string
+ last_string = string_
last_start = start
last_end = end
else:
- last_string += string
+ last_string += string_
if last_string:
yield tokenize.NAME, last_string, last_start, last_end, last_line
def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
selectors = []
current_selector = None
- for type, string, start, _, _ in tokens:
+ for type, string_, start, _, _ in tokens:
# ENCODING is only defined in python 3.x
if type == getattr(tokenize, 'ENCODING', None):
continue
elif type in [tokenize.NAME, tokenize.NUMBER]:
- current_selector = FormatSelector(SINGLE, string, [])
+ current_selector = FormatSelector(SINGLE, string_, [])
elif type == tokenize.OP:
- if string == ')':
+ if string_ == ')':
if not inside_group:
# ')' will be handled by the parentheses group
tokens.restore_last_token()
break
- elif inside_merge and string in ['/', ',']:
+ elif inside_merge and string_ in ['/', ',']:
tokens.restore_last_token()
break
- elif inside_choice and string == ',':
+ elif inside_choice and string_ == ',':
tokens.restore_last_token()
break
- elif string == ',':
+ elif string_ == ',':
if not current_selector:
raise syntax_error('"," must follow a format selector', start)
selectors.append(current_selector)
current_selector = None
- elif string == '/':
+ elif string_ == '/':
if not current_selector:
raise syntax_error('"/" must follow a format selector', start)
first_choice = current_selector
second_choice = _parse_format_selection(tokens, inside_choice=True)
current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
- elif string == '[':
+ elif string_ == '[':
if not current_selector:
current_selector = FormatSelector(SINGLE, 'best', [])
format_filter = _parse_filter(tokens)
current_selector.filters.append(format_filter)
- elif string == '(':
+ elif string_ == '(':
if current_selector:
raise syntax_error('Unexpected "("', start)
group = _parse_format_selection(tokens, inside_group=True)
current_selector = FormatSelector(GROUP, group, [])
- elif string == '+':
+ elif string_ == '+':
if not current_selector:
raise syntax_error('Unexpected "+"', start)
selector_1 = current_selector
@@ -2117,7 +2253,7 @@ class YoutubeDL:
raise syntax_error('Expected a selector', start)
current_selector = FormatSelector(MERGE, (selector_1, selector_2), [])
else:
- raise syntax_error(f'Operator not recognized: "{string}"', start)
+ raise syntax_error(f'Operator not recognized: "{string_}"', start)
elif type == tokenize.ENDMARKER:
break
if current_selector:
@@ -2199,10 +2335,19 @@ class YoutubeDL:
return new_dict
def _check_formats(formats):
- if not check_formats:
+ if self.params.get('check_formats') == 'selected':
+ yield from self._check_formats(formats)
+ return
+ elif (self.params.get('check_formats') is not None
+ or self.params.get('allow_unplayable_formats')):
yield from formats
return
- yield from self._check_formats(formats)
+
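+ # Default behaviour: probe only the formats that are flagged as possibly DRM-protected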
+ for f in formats:
+ if f.get('has_drm'):
+ yield from self._check_formats([f])
+ else:
+ yield f
def _build_selector_function(selector):
if isinstance(selector, list): # ,
@@ -2341,12 +2486,34 @@ class YoutubeDL:
parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
return _build_selector_function(parsed_selector)
- def _calc_headers(self, info_dict):
- res = merge_headers(self.params['http_headers'], info_dict.get('http_headers') or {})
-
- cookies = self._calc_cookies(info_dict['url'])
+ def _calc_headers(self, info_dict, load_cookies=False):
+ res = HTTPHeaderDict(self.params['http_headers'], info_dict.get('http_headers'))
+ clean_headers(res)
+
+ if load_cookies: # For --load-info-json
+ self._load_cookies(res.get('Cookie'), autoscope=info_dict['url']) # compat
+ self._load_cookies(info_dict.get('cookies'), autoscope=False)
+ # The `Cookie` header is removed to prevent leaks and unscoped cookies.
+ # See: https://github.com/hypervideo/hypervideo/security/advisories/GHSA-v8mc-9377-rwjj
+ res.pop('Cookie', None)
+ cookies = self.cookiejar.get_cookies_for_url(info_dict['url'])
if cookies:
- res['Cookie'] = cookies
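+ # Encode the matching cookies in Set-Cookie-like syntax so their scope (Domain/Path) is preserved in the infojson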
+ encoder = LenientSimpleCookie()
+ values = []
+ for cookie in cookies:
+ _, value = encoder.value_encode(cookie.value)
+ values.append(f'{cookie.name}={value}')
+ if cookie.domain:
+ values.append(f'Domain={cookie.domain}')
+ if cookie.path:
+ values.append(f'Path={cookie.path}')
+ if cookie.secure:
+ values.append('Secure')
+ if cookie.expires:
+ values.append(f'Expires={cookie.expires}')
+ if cookie.version:
+ values.append(f'Version={cookie.version}')
+ info_dict['cookies'] = '; '.join(values)
if 'X-Forwarded-For' not in res:
x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip')
@@ -2356,9 +2523,8 @@ class YoutubeDL:
return res
def _calc_cookies(self, url):
- pr = sanitized_Request(url)
- self.cookiejar.add_cookie_header(pr)
- return pr.get_header('Cookie')
+ self.deprecation_warning('"YoutubeDL._calc_cookies" is deprecated and may be removed in a future version')
+ return self.cookiejar.get_cookie_header(url)
def _sort_thumbnails(self, thumbnails):
thumbnails.sort(key=lambda t: (
@@ -2403,11 +2569,7 @@ class YoutubeDL:
def _fill_common_fields(self, info_dict, final=True):
# TODO: move sanitization here
if final:
- title = info_dict.get('title', NO_DEFAULT)
- if title is NO_DEFAULT:
- raise ExtractorError('Missing "title" field in extractor result',
- video_id=info_dict['id'], ie=info_dict['extractor'])
- info_dict['fulltitle'] = title
+ title = info_dict['fulltitle'] = info_dict.get('title')
if not title:
if title == '':
self.write_debug('Extractor gave empty title. Creating a generic title')
@@ -2462,15 +2624,8 @@ class YoutubeDL:
def sort_formats(self, info_dict):
formats = self._get_formats(info_dict)
- if not formats:
- return
- # Backward compatibility with InfoExtractor._sort_formats
- field_preference = formats[0].pop('__sort_fields', None)
- if field_preference:
- info_dict['_format_sort_fields'] = field_preference
-
formats.sort(key=FormatSorter(
- self, info_dict.get('_format_sort_fields', [])).calculate_preference)
+ self, info_dict.get('_format_sort_fields') or []).calculate_preference)
def process_video_result(self, info_dict, download=True):
assert info_dict.get('_type', 'video') == 'video'
@@ -2557,13 +2712,17 @@ class YoutubeDL:
info_dict['requested_subtitles'] = self.process_subtitles(
info_dict['id'], subtitles, automatic_captions)
- self.sort_formats(info_dict)
formats = self._get_formats(info_dict)
- # or None ensures --clean-infojson removes it
- info_dict['_has_drm'] = any(f.get('has_drm') for f in formats) or None
+ # Backward compatibility with InfoExtractor._sort_formats
+ field_preference = (formats or [{}])[0].pop('__sort_fields', None)
+ if field_preference:
+ info_dict['_format_sort_fields'] = field_preference
+
+ info_dict['_has_drm'] = any( # or None ensures --clean-infojson removes it
+ f.get('has_drm') and f['has_drm'] != 'maybe' for f in formats) or None
if not self.params.get('allow_unplayable_formats'):
- formats = [f for f in formats if not f.get('has_drm')]
+ formats = [f for f in formats if not f.get('has_drm') or f['has_drm'] == 'maybe']
if formats and all(f.get('acodec') == f.get('vcodec') == 'none' for f in formats):
self.report_warning(
@@ -2597,22 +2756,49 @@ class YoutubeDL:
if not formats:
self.raise_no_formats(info_dict)
- formats_dict = {}
-
- # We check that all the formats have the format and format_id fields
- for i, format in enumerate(formats):
+ for format in formats:
sanitize_string_field(format, 'format_id')
sanitize_numeric_fields(format)
format['url'] = sanitize_url(format['url'])
+ if format.get('ext') is None:
+ format['ext'] = determine_ext(format['url']).lower()
+ if format.get('protocol') is None:
+ format['protocol'] = determine_protocol(format)
+ if format.get('resolution') is None:
+ format['resolution'] = self.format_resolution(format, default=None)
+ if format.get('dynamic_range') is None and format.get('vcodec') != 'none':
+ format['dynamic_range'] = 'SDR'
+ if format.get('aspect_ratio') is None:
+ format['aspect_ratio'] = try_call(lambda: round(format['width'] / format['height'], 2))
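+ # tbr is in KBit/s, so the size is approximated as duration * tbr * (1024 / 8) bytes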
+ if (not format.get('manifest_url') # For fragmented formats, "tbr" is often max bitrate and not average
+ and info_dict.get('duration') and format.get('tbr')
+ and not format.get('filesize') and not format.get('filesize_approx')):
+ format['filesize_approx'] = int(info_dict['duration'] * format['tbr'] * (1024 / 8))
+ format['http_headers'] = self._calc_headers(collections.ChainMap(format, info_dict), load_cookies=True)
+
+ # Safeguard against old/insecure infojson when using --load-info-json
+ if info_dict.get('http_headers'):
+ info_dict['http_headers'] = HTTPHeaderDict(info_dict['http_headers'])
+ info_dict['http_headers'].pop('Cookie', None)
+
+ # This is copied to http_headers by the above _calc_headers and can now be removed
+ if '__x_forwarded_for_ip' in info_dict:
+ del info_dict['__x_forwarded_for_ip']
+
+ self.sort_formats({
+ 'formats': formats,
+ '_format_sort_fields': info_dict.get('_format_sort_fields')
+ })
+
+ # Sanitize and group by format_id
+ formats_dict = {}
+ for i, format in enumerate(formats):
if not format.get('format_id'):
format['format_id'] = str(i)
else:
# Sanitize format_id from characters used in format selector expression
format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id'])
- format_id = format['format_id']
- if format_id not in formats_dict:
- formats_dict[format_id] = []
- formats_dict[format_id].append(format)
+ formats_dict.setdefault(format['format_id'], []).append(format)
# Make sure all formats have unique format_id
common_exts = set(itertools.chain(*self._format_selection_exts.values()))
@@ -2621,40 +2807,17 @@ class YoutubeDL:
for i, format in enumerate(ambiguous_formats):
if ambigious_id:
format['format_id'] = '%s-%d' % (format_id, i)
- if format.get('ext') is None:
- format['ext'] = determine_ext(format['url']).lower()
# Ensure there is no conflict between id and ext in format selection
# See https://github.com/hypervideo/hypervideo/issues/1282
if format['format_id'] != format['ext'] and format['format_id'] in common_exts:
format['format_id'] = 'f%s' % format['format_id']
- for i, format in enumerate(formats):
- if format.get('format') is None:
- format['format'] = '{id} - {res}{note}'.format(
- id=format['format_id'],
- res=self.format_resolution(format),
- note=format_field(format, 'format_note', ' (%s)'),
- )
- if format.get('protocol') is None:
- format['protocol'] = determine_protocol(format)
- if format.get('resolution') is None:
- format['resolution'] = self.format_resolution(format, default=None)
- if format.get('dynamic_range') is None and format.get('vcodec') != 'none':
- format['dynamic_range'] = 'SDR'
- if format.get('aspect_ratio') is None:
- format['aspect_ratio'] = try_call(lambda: round(format['width'] / format['height'], 2))
- if (info_dict.get('duration') and format.get('tbr')
- and not format.get('filesize') and not format.get('filesize_approx')):
- format['filesize_approx'] = int(info_dict['duration'] * format['tbr'] * (1024 / 8))
-
- # Add HTTP headers, so that external programs can use them from the
- # json output
- full_format_info = info_dict.copy()
- full_format_info.update(format)
- format['http_headers'] = self._calc_headers(full_format_info)
- # Remove private housekeeping stuff
- if '__x_forwarded_for_ip' in info_dict:
- del info_dict['__x_forwarded_for_ip']
+ if format.get('format') is None:
+ format['format'] = '{id} - {res}{note}'.format(
+ id=format['format_id'],
+ res=self.format_resolution(format),
+ note=format_field(format, 'format_note', ' (%s)'),
+ )
if self.params.get('check_formats') is True:
formats = LazyList(self._check_formats(formats[::-1]), reverse=True)
@@ -2690,33 +2853,31 @@ class YoutubeDL:
self.list_formats(info_dict)
if list_only:
# Without this printing, -F --print-json will not work
- self.__forced_printings(info_dict, self.prepare_filename(info_dict), incomplete=True)
+ self.__forced_printings(info_dict)
return info_dict
format_selector = self.format_selector
- if format_selector is None:
- req_format = self._default_format_spec(info_dict, download=download)
- self.write_debug('Default format spec: %s' % req_format)
- format_selector = self.build_format_selector(req_format)
-
while True:
if interactive_format_selection:
- req_format = input(
- self._format_screen('\nEnter format selector: ', self.Styles.EMPHASIS))
+ req_format = input(self._format_screen('\nEnter format selector ', self.Styles.EMPHASIS)
+ + '(Press ENTER for default, or Ctrl+C to quit)'
+ + self._format_screen(': ', self.Styles.EMPHASIS))
try:
- format_selector = self.build_format_selector(req_format)
+ format_selector = self.build_format_selector(req_format) if req_format else None
except SyntaxError as err:
self.report_error(err, tb=False, is_error=False)
continue
+ if format_selector is None:
+ req_format = self._default_format_spec(info_dict, download=download)
+ self.write_debug(f'Default format spec: {req_format}')
+ format_selector = self.build_format_selector(req_format)
+
formats_to_download = list(format_selector({
'formats': formats,
'has_merged_format': any('none' not in (f.get('acodec'), f.get('vcodec')) for f in formats),
- 'incomplete_formats': (
- # All formats are video-only or
- all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats)
- # all formats are audio-only
- or all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats)),
+ 'incomplete_formats': (all(f.get('vcodec') == 'none' for f in formats) # No formats with video
+ or all(f.get('acodec') == 'none' for f in formats)), # OR, No formats with audio
}))
if interactive_format_selection and not formats_to_download:
self.report_error('Requested format is not available', tb=False, is_error=False)
@@ -2751,11 +2912,13 @@ class YoutubeDL:
new_info.update(fmt)
offset, duration = info_dict.get('section_start') or 0, info_dict.get('duration') or float('inf')
end_time = offset + min(chapter.get('end_time', duration), duration)
+ # duration may not be accurate. So allow deviations <1sec
+ if end_time == float('inf') or end_time > offset + duration + 1:
+ end_time = None
if chapter or offset:
new_info.update({
'section_start': offset + chapter.get('start_time', 0),
- # duration may not be accurate. So allow deviations <1sec
- 'section_end': end_time if end_time <= offset + duration + 1 else None,
+ 'section_end': end_time,
'section_title': chapter.get('title'),
'section_number': chapter.get('index'),
})
@@ -2811,10 +2974,14 @@ class YoutubeDL:
self.params.get('subtitleslangs'), {'all': all_sub_langs}, use_regex=True)
except re.error as e:
raise ValueError(f'Wrong regex for subtitlelangs: {e.pattern}')
- elif normal_sub_langs:
- requested_langs = ['en'] if 'en' in normal_sub_langs else normal_sub_langs[:1]
else:
- requested_langs = ['en'] if 'en' in all_sub_langs else all_sub_langs[:1]
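+ # Pick the first match: 'en' or an 'en*' variant if available, otherwise any available language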
+ requested_langs = LazyList(itertools.chain(
+ ['en'] if 'en' in normal_sub_langs else [],
+ filter(lambda f: f.startswith('en'), normal_sub_langs),
+ ['en'] if 'en' in all_sub_langs else [],
+ filter(lambda f: f.startswith('en'), all_sub_langs),
+ normal_sub_langs, all_sub_langs,
+ ))[:1]
if requested_langs:
self.to_screen(f'[info] {video_id}: Downloading subtitles: {", ".join(requested_langs)}')
@@ -2846,6 +3013,12 @@ class YoutubeDL:
if info_dict is None:
return
info_copy = info_dict.copy()
+ info_copy.setdefault('filename', self.prepare_filename(info_dict))
+ if info_dict.get('requested_formats') is not None:
+ # For RTMP URLs, also include the playpath
+ info_copy['urls'] = '\n'.join(f['url'] + f.get('play_path', '') for f in info_dict['requested_formats'])
+ elif info_dict.get('url'):
+ info_copy['urls'] = info_dict['url'] + info_dict.get('play_path', '')
info_copy['formats_table'] = self.render_formats_table(info_dict)
info_copy['thumbnails_table'] = self.render_thumbnails_table(info_dict)
info_copy['subtitles_table'] = self.render_subtitles_table(info_dict.get('id'), info_dict.get('subtitles'))
@@ -2858,7 +3031,7 @@ class YoutubeDL:
fmt = '%({})s'
if tmpl.startswith('{'):
- tmpl = f'.{tmpl}'
+ tmpl, fmt = f'.{tmpl}', '%({})j'
if tmpl.endswith('='):
tmpl, fmt = tmpl[:-1], '{0} = %({0})#j'
return '\n'.join(map(fmt.format, [tmpl] if mobj.group('dict') else tmpl.split(',')))
@@ -2871,46 +3044,36 @@ class YoutubeDL:
tmpl = format_tmpl(tmpl)
self.to_screen(f'[info] Writing {tmpl!r} to: {filename}')
if self._ensure_dir_exists(filename):
- with open(filename, 'a', encoding='utf-8') as f:
- f.write(self.evaluate_outtmpl(tmpl, info_copy) + '\n')
+ with open(filename, 'a', encoding='utf-8', newline='') as f:
+ f.write(self.evaluate_outtmpl(tmpl, info_copy) + os.linesep)
- def __forced_printings(self, info_dict, filename, incomplete):
- def print_mandatory(field, actual_field=None):
- if actual_field is None:
- actual_field = field
- if (self.params.get('force%s' % field, False)
- and (not incomplete or info_dict.get(actual_field) is not None)):
- self.to_stdout(info_dict[actual_field])
-
- def print_optional(field):
- if (self.params.get('force%s' % field, False)
- and info_dict.get(field) is not None):
- self.to_stdout(info_dict[field])
-
- info_dict = info_dict.copy()
- if filename is not None:
- info_dict['filename'] = filename
- if info_dict.get('requested_formats') is not None:
- # For RTMP URLs, also include the playpath
- info_dict['urls'] = '\n'.join(f['url'] + f.get('play_path', '') for f in info_dict['requested_formats'])
- elif info_dict.get('url'):
- info_dict['urls'] = info_dict['url'] + info_dict.get('play_path', '')
+ return info_copy
+ def __forced_printings(self, info_dict, filename=None, incomplete=True):
if (self.params.get('forcejson')
or self.params['forceprint'].get('video')
or self.params['print_to_file'].get('video')):
self.post_extract(info_dict)
- self._forceprint('video', info_dict)
-
- print_mandatory('title')
- print_mandatory('id')
- print_mandatory('url', 'urls')
- print_optional('thumbnail')
- print_optional('description')
- print_optional('filename')
- if self.params.get('forceduration') and info_dict.get('duration') is not None:
- self.to_stdout(formatSeconds(info_dict['duration']))
- print_mandatory('format')
+ if filename:
+ info_dict['filename'] = filename
+ info_copy = self._forceprint('video', info_dict)
+
+ def print_field(field, actual_field=None, optional=False):
+ if actual_field is None:
+ actual_field = field
+ if self.params.get(f'force{field}') and (
+ info_copy.get(field) is not None or (not optional and not incomplete)):
+ self.to_stdout(info_copy[actual_field])
+
+ print_field('title')
+ print_field('id')
+ print_field('url', 'urls')
+ print_field('thumbnail', optional=True)
+ print_field('description', optional=True)
+ print_field('filename')
+ if self.params.get('forceduration') and info_copy.get('duration') is not None:
+ self.to_stdout(formatSeconds(info_copy['duration']))
+ print_field('format')
if self.params.get('forcejson'):
self.to_stdout(json.dumps(self.sanitize_info(info_dict)))
@@ -2975,6 +3138,16 @@ class YoutubeDL:
# Does nothing under normal operation - for backward compatibility of process_info
self.post_extract(info_dict)
+
+ def replace_info_dict(new_info):
+ nonlocal info_dict
+ if new_info == info_dict:
+ return
+ info_dict.clear()
+ info_dict.update(new_info)
+
+ new_info, _ = self.pre_process(info_dict, 'video')
+ replace_info_dict(new_info)
self._num_downloads += 1
# info_dict['_filename'] needs to be set for backward compatibility
@@ -3088,13 +3261,6 @@ class YoutubeDL:
for link_type, should_write in write_links.items()):
return
- def replace_info_dict(new_info):
- nonlocal info_dict
- if new_info == info_dict:
- return
- info_dict.clear()
- info_dict.update(new_info)
-
new_info, files_to_move = self.pre_process(info_dict, 'before_dl', files_to_move)
replace_info_dict(new_info)
@@ -3121,7 +3287,7 @@ class YoutubeDL:
fd, success = None, True
if info_dict.get('protocol') or info_dict.get('url'):
fd = get_suitable_downloader(info_dict, self.params, to_stdout=temp_filename == '-')
- if fd is not FFmpegFD and (
+ if fd != FFmpegFD and 'no-direct-merge' not in self.params['compat_opts'] and (
info_dict.get('section_start') or info_dict.get('section_end')):
msg = ('This format cannot be partially downloaded' if FFmpegFD.available()
else 'You have requested downloading the video partially, but ffmpeg is not installed')
@@ -3129,7 +3295,6 @@ class YoutubeDL:
return
if info_dict.get('requested_formats') is not None:
- requested_formats = info_dict['requested_formats']
old_ext = info_dict['ext']
if self.params.get('merge_output_format') is None:
if (info_dict['ext'] == 'webm'
@@ -3156,19 +3321,22 @@ class YoutubeDL:
full_filename = correct_ext(full_filename)
temp_filename = correct_ext(temp_filename)
dl_filename = existing_video_file(full_filename, temp_filename)
+
info_dict['__real_download'] = False
+ # NOTE: Copy so that original format dicts are not modified
+ info_dict['requested_formats'] = list(map(dict, info_dict['requested_formats']))
merger = FFmpegMergerPP(self)
downloaded = []
if dl_filename is not None:
self.report_file_already_downloaded(dl_filename)
elif fd:
- for f in requested_formats if fd != FFmpegFD else []:
+ for f in info_dict['requested_formats'] if fd != FFmpegFD else []:
f['filepath'] = fname = prepend_extension(
correct_ext(temp_filename, info_dict['ext']),
'f%s' % f['format_id'], info_dict['ext'])
downloaded.append(fname)
- info_dict['url'] = '\n'.join(f['url'] for f in requested_formats)
+ info_dict['url'] = '\n'.join(f['url'] for f in info_dict['requested_formats'])
success, real_download = self.dl(temp_filename, info_dict)
info_dict['__real_download'] = real_download
else:
@@ -3192,7 +3360,7 @@ class YoutubeDL:
f'You have requested downloading multiple formats to stdout {reason}. '
'The formats will be streamed one after the other')
fname = temp_filename
- for f in requested_formats:
+ for f in info_dict['requested_formats']:
new_info = dict(info_dict)
del new_info['requested_formats']
new_info.update(f)
@@ -3283,14 +3451,15 @@ class YoutubeDL:
) for pp in self._pps['post_process'])
if not postprocessed_by_ffmpeg:
- ffmpeg_fixup(ext == 'm4a' and info_dict.get('container') == 'm4a_dash',
+ ffmpeg_fixup(fd != FFmpegFD and ext == 'm4a'
+ and info_dict.get('container') == 'm4a_dash',
'writing DASH m4a. Only some players support this container',
FFmpegFixupM4aPP)
ffmpeg_fixup(downloader == 'hlsnative' and not self.params.get('hls_use_mpegts')
or info_dict.get('is_live') and self.params.get('hls_use_mpegts') is None,
'Possible MPEG-TS in MP4 container or malformed AAC timestamps',
FFmpegFixupM3u8PP)
- ffmpeg_fixup(info_dict.get('is_live') and downloader == 'DashSegmentsFD',
+ ffmpeg_fixup(info_dict.get('is_live') and downloader == 'dashsegments',
'Possible duplicate MOOV atoms', FFmpegFixupDuplicateMoovPP)
ffmpeg_fixup(downloader == 'web_socket_fragment', 'Malformed timestamps detected', FFmpegFixupTimestampPP)
@@ -3354,18 +3523,19 @@ class YoutubeDL:
[info_filename], mode='r',
openhook=fileinput.hook_encoded('utf-8'))) as f:
# FileInput doesn't have a read method, we can't call json.load
- info = self.sanitize_info(json.loads('\n'.join(f)), self.params.get('clean_infojson', True))
- try:
- self.__download_wrapper(self.process_ie_result)(info, download=True)
- except (DownloadError, EntryNotInPlaylist, ReExtractInfo) as e:
- if not isinstance(e, EntryNotInPlaylist):
- self.to_stderr('\r')
- webpage_url = info.get('webpage_url')
- if webpage_url is not None:
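+ # The info file may contain either a single info dict or a list of them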
+ infos = [self.sanitize_info(info, self.params.get('clean_infojson', True))
+ for info in variadic(json.loads('\n'.join(f)))]
+ for info in infos:
+ try:
+ self.__download_wrapper(self.process_ie_result)(info, download=True)
+ except (DownloadError, EntryNotInPlaylist, ReExtractInfo) as e:
+ if not isinstance(e, EntryNotInPlaylist):
+ self.to_stderr('\r')
+ webpage_url = info.get('webpage_url')
+ if webpage_url is None:
+ raise
self.report_warning(f'The info failed to download: {e}; trying with URL {webpage_url}')
- return self.download([webpage_url])
- else:
- raise
+ self.download([webpage_url])
return self._download_retcode
@staticmethod
@@ -3385,7 +3555,8 @@ class YoutubeDL:
if remove_private_keys:
reject = lambda k, v: v is None or k.startswith('__') or k in {
'requested_downloads', 'requested_formats', 'requested_subtitles', 'requested_entries',
- 'entries', 'filepath', '_filename', 'infojson_filename', 'original_url', 'playlist_autonumber',
+ 'entries', 'filepath', '_filename', 'filename', 'infojson_filename', 'original_url',
+ 'playlist_autonumber', '_format_sort_fields',
}
else:
reject = lambda k, v: False
@@ -3455,7 +3626,8 @@ class YoutubeDL:
return infodict
def run_all_pps(self, key, info, *, additional_pps=None):
- self._forceprint(key, info)
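+ # Forced printing for 'video' is now handled by __forced_printings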
+ if key != 'video':
+ self._forceprint(key, info)
for pp in (additional_pps or []) + self._pps[key]:
info = self.run_pp(pp, info)
return info
@@ -3623,7 +3795,7 @@ class YoutubeDL:
def simplified_codec(f, field):
assert field in ('acodec', 'vcodec')
- codec = f.get(field, 'unknown')
+ codec = f.get(field)
if not codec:
return 'unknown'
elif codec != 'none':
@@ -3645,8 +3817,11 @@ class YoutubeDL:
format_field(f, 'fps', '\t%d', func=round),
format_field(f, 'dynamic_range', '%s', ignore=(None, 'SDR')).replace('HDR', ''),
format_field(f, 'audio_channels', '\t%s'),
- delim,
- format_field(f, 'filesize', ' \t%s', func=format_bytes) + format_field(f, 'filesize_approx', '~\t%s', func=format_bytes),
+ delim, (
+ format_field(f, 'filesize', ' \t%s', func=format_bytes)
+ or format_field(f, 'filesize_approx', '≈\t%s', func=format_bytes)
+ or format_field(try_call(lambda: format_bytes(int(info_dict['duration'] * f['tbr'] * (1024 / 8)))),
+ None, self._format_out('~\t%s', self.Styles.SUPPRESS))),
format_field(f, 'tbr', '\t%dk', func=round),
shorten_protocol_name(f.get('protocol', '')),
delim,
@@ -3655,13 +3830,13 @@ class YoutubeDL:
simplified_codec(f, 'acodec'),
format_field(f, 'abr', '\t%dk', func=round),
format_field(f, 'asr', '\t%s', func=format_decimal_suffix),
- join_nonempty(
- self._format_out('UNSUPPORTED', 'light red') if f.get('ext') in ('f4f', 'f4m') else None,
- format_field(f, 'language', '[%s]'),
- join_nonempty(format_field(f, 'format_note'),
- format_field(f, 'container', ignore=(None, f.get('ext'))),
- delim=', '),
- delim=' '),
+ join_nonempty(format_field(f, 'language', '[%s]'), join_nonempty(
+ self._format_out('UNSUPPORTED', self.Styles.BAD_FORMAT) if f.get('ext') in ('f4f', 'f4m') else None,
+ (self._format_out('Maybe DRM', self.Styles.WARNING) if f.get('has_drm') == 'maybe'
+ else self._format_out('DRM', self.Styles.BAD_FORMAT) if f.get('has_drm') else None),
+ format_field(f, 'format_note'),
+ format_field(f, 'container', ignore=(None, f.get('ext'))),
+ delim=', '), delim=' '),
] for f in formats if f.get('preference') is None or f['preference'] >= -1000]
header_line = self._list_format_headers(
'ID', 'EXT', 'RESOLUTION', '\tFPS', 'HDR', 'CH', delim, '\tFILESIZE', '\tTBR', 'PROTO',
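When neither filesize nor filesize_approx is known, the format table now estimates the size from duration and total bitrate (tbr, in Kbit/s) and renders it suppressed with a '~' prefix. The arithmetic, as a standalone sketch with illustrative values:

    def estimate_filesize(duration_s, tbr_kbps):
        # Kbit/s -> bytes: 1024 bits per Kbit, 8 bits per byte
        return int(duration_s * tbr_kbps * (1024 / 8))

    print(estimate_filesize(600, 2500))  # 192000000 bytes (~183 MiB) for 10 min at 2500k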
@@ -3710,12 +3885,6 @@ class YoutubeDL:
def list_subtitles(self, video_id, subtitles, name='subtitles'):
self.__list_table(video_id, name, self.render_subtitles_table, video_id, subtitles)
- def urlopen(self, req):
- """ Start an HTTP download """
- if isinstance(req, str):
- req = sanitized_Request(req)
- return self._opener.open(req, timeout=self._socket_timeout)
-
def print_debug_header(self):
if not self.params.get('verbose'):
return
@@ -3724,13 +3893,21 @@ class YoutubeDL:
# These imports can be slow. So import them only as needed
from .extractor.extractors import _LAZY_LOADER
- from .extractor.extractors import _PLUGIN_CLASSES as plugin_extractors
+ from .extractor.extractors import (
+ _PLUGIN_CLASSES as plugin_ies,
+ _PLUGIN_OVERRIDES as plugin_ie_overrides
+ )
def get_encoding(stream):
ret = str(getattr(stream, 'encoding', 'missing (%s)' % type(stream).__name__))
+ additional_info = []
+ if os.environ.get('TERM', '').lower() == 'dumb':
+ additional_info.append('dumb')
if not supports_terminal_sequences(stream):
from .utils import WINDOWS_VT_MODE # Must be imported locally
- ret += ' (No VT)' if WINDOWS_VT_MODE is False else ' (No ANSI)'
+ additional_info.append('No VT' if WINDOWS_VT_MODE is False else 'No ANSI')
+ if additional_info:
+ ret = f'{ret} ({",".join(additional_info)})'
return ret
encoding_str = 'Encodings: locale %s, fs %s, pref %s, %s' % (
@@ -3753,12 +3930,13 @@ class YoutubeDL:
source = detect_variant()
if VARIANT not in (None, 'pip'):
source += '*'
+ klass = type(self)
write_debug(join_nonempty(
f'{"hypervideo" if REPOSITORY == "hypervideo/hypervideo" else REPOSITORY} version',
- __version__,
- f'[{RELEASE_GIT_HEAD}]' if RELEASE_GIT_HEAD else '',
+ f'{CHANNEL}@{__version__}',
+ f'[{RELEASE_GIT_HEAD[:9]}]' if RELEASE_GIT_HEAD else '',
'' if source == 'unknown' else f'({source})',
- '' if _IN_CLI else 'API',
+ '' if _IN_CLI else 'API' if klass == YoutubeDL else f'API:{self.__module__}.{klass.__qualname__}',
delim=' '))
if not _IN_CLI:
@@ -3769,10 +3947,6 @@ class YoutubeDL:
write_debug('Lazy loading extractors is forcibly disabled')
else:
write_debug('Lazy loading extractors is disabled')
- if plugin_extractors or plugin_postprocessors:
- write_debug('Plugins: %s' % [
- '%s%s' % (klass.__name__, '' if klass.__name__ == name else f' as {name}')
- for name, klass in itertools.chain(plugin_extractors.items(), plugin_postprocessors.items())])
if self.params['compat_opts']:
write_debug('Compatibility options: %s' % ', '.join(self.params['compat_opts']))
@@ -3799,12 +3973,22 @@ class YoutubeDL:
join_nonempty(*get_package_info(m)) for m in available_dependencies.values()
})) or 'none'))
- self._setup_opener()
- proxy_map = {}
- for handler in self._opener.handlers:
- if hasattr(handler, 'proxies'):
- proxy_map.update(handler.proxies)
- write_debug(f'Proxy map: {proxy_map}')
+ write_debug(f'Proxy map: {self.proxies}')
+ # write_debug(f'Request Handlers: {", ".join(rh.RH_NAME for rh in self._request_director.handlers.values())}')
+ for plugin_type, plugins in {'Extractor': plugin_ies, 'Post-Processor': plugin_pps}.items():
+ display_list = ['%s%s' % (
+ klass.__name__, '' if klass.__name__ == name else f' as {name}')
+ for name, klass in plugins.items()]
+ if plugin_type == 'Extractor':
+ display_list.extend(f'{plugins[-1].IE_NAME.partition("+")[2]} ({parent.__name__})'
+ for parent, plugins in plugin_ie_overrides.items())
+ if not display_list:
+ continue
+ write_debug(f'{plugin_type} Plugins: {", ".join(sorted(display_list))}')
+
+ plugin_dirs = plugin_directories()
+ if plugin_dirs:
+ write_debug(f'Plugin directories: {plugin_dirs}')
# Not implemented
if False and self.params.get('call_home'):
@@ -3818,55 +4002,110 @@ class YoutubeDL:
'See https://yt-dl.org/update if you need help updating.' %
latest_version)
- def _setup_opener(self):
- if hasattr(self, '_opener'):
- return
- timeout_val = self.params.get('socket_timeout')
- self._socket_timeout = 20 if timeout_val is None else float(timeout_val)
-
- opts_cookiesfrombrowser = self.params.get('cookiesfrombrowser')
- opts_cookiefile = self.params.get('cookiefile')
+ @functools.cached_property
+ def proxies(self):
+ """Global proxy configuration"""
opts_proxy = self.params.get('proxy')
-
- self.cookiejar = load_cookies(opts_cookiefile, opts_cookiesfrombrowser, self)
-
- cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)
if opts_proxy is not None:
if opts_proxy == '':
- proxies = {}
- else:
- proxies = {'http': opts_proxy, 'https': opts_proxy}
+ opts_proxy = '__noproxy__'
+ proxies = {'all': opts_proxy}
else:
proxies = urllib.request.getproxies()
- # Set HTTPS proxy to HTTP one if given (https://github.com/ytdl-org/youtube-dl/issues/805)
+ # compat. Set HTTPS_PROXY to __noproxy__ to revert
if 'http' in proxies and 'https' not in proxies:
proxies['https'] = proxies['http']
- proxy_handler = PerRequestProxyHandler(proxies)
- debuglevel = 1 if self.params.get('debug_printtraffic') else 0
- https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
- ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
- redirect_handler = YoutubeDLRedirectHandler()
- data_handler = urllib.request.DataHandler()
+ return proxies
+
+ @functools.cached_property
+ def cookiejar(self):
+ """Global cookiejar instance"""
+ return load_cookies(
+ self.params.get('cookiefile'), self.params.get('cookiesfrombrowser'), self)
- # When passing our own FileHandler instance, build_opener won't add the
- # default FileHandler and allows us to disable the file protocol, which
- # can be used for malicious purposes (see
- # https://github.com/ytdl-org/youtube-dl/issues/8227)
- file_handler = urllib.request.FileHandler()
+ @property
+ def _opener(self):
+ """
+ Get a urllib OpenerDirector from the Urllib handler (deprecated).
+ """
+ self.deprecation_warning('YoutubeDL._opener is deprecated, use YoutubeDL.urlopen()')
+ handler = self._request_director.handlers['Urllib']
+ return handler._get_instance(cookiejar=self.cookiejar, proxies=self.proxies)
+
+ def urlopen(self, req):
+ """ Start an HTTP download """
+ if isinstance(req, str):
+ req = Request(req)
+ elif isinstance(req, urllib.request.Request):
+ self.deprecation_warning(
+ 'Passing a urllib.request.Request object to YoutubeDL.urlopen() is deprecated. '
+ 'Use hypervideo_dl.networking.common.Request instead.')
+ req = urllib_req_to_req(req)
+ assert isinstance(req, Request)
- def file_open(*args, **kwargs):
- raise urllib.error.URLError('file:// scheme is explicitly disabled in hypervideo for security reasons')
- file_handler.file_open = file_open
+ # compat: Assume user:pass url params are basic auth
+ url, basic_auth_header = extract_basic_auth(req.url)
+ if basic_auth_header:
+ req.headers['Authorization'] = basic_auth_header
+ req.url = sanitize_url(url)
- opener = urllib.request.build_opener(
- proxy_handler, https_handler, cookie_processor, ydlh, redirect_handler, data_handler, file_handler)
+ clean_proxies(proxies=req.proxies, headers=req.headers)
+ clean_headers(req.headers)
- # Delete the default user-agent header, which would otherwise apply in
- # cases where our custom HTTP handler doesn't come into play
- # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
- opener.addheaders = []
- self._opener = opener
+ try:
+ return self._request_director.send(req)
+ except NoSupportingHandlers as e:
+ for ue in e.unsupported_errors:
+ if not (ue.handler and ue.msg):
+ continue
+ if ue.handler.RH_KEY == 'Urllib' and 'unsupported url scheme: "file"' in ue.msg.lower():
+ raise RequestError(
+ 'file:// URLs are disabled by default in hypervideo for security reasons. '
+ 'Use --enable-file-urls to enable at your own risk.', cause=ue) from ue
+ raise
+ except SSLError as e:
+ if 'UNSAFE_LEGACY_RENEGOTIATION_DISABLED' in str(e):
+ raise RequestError('UNSAFE_LEGACY_RENEGOTIATION_DISABLED: Try using --legacy-server-connect', cause=e) from e
+ elif 'SSLV3_ALERT_HANDSHAKE_FAILURE' in str(e):
+ raise RequestError(
+ 'SSLV3_ALERT_HANDSHAKE_FAILURE: The server may not support the current cipher list. '
+ 'Try using --legacy-server-connect', cause=e) from e
+ raise
+ except HTTPError as e: # TODO: Remove in a future release
+ raise _CompatHTTPError(e) from e
+
+ def build_request_director(self, handlers, preferences=None):
+ logger = _YDLLogger(self)
+ headers = self.params['http_headers'].copy()
+ proxies = self.proxies.copy()
+ clean_headers(headers)
+ clean_proxies(proxies, headers)
+
+ director = RequestDirector(logger=logger, verbose=self.params.get('debug_printtraffic'))
+ for handler in handlers:
+ director.add_handler(handler(
+ logger=logger,
+ headers=headers,
+ cookiejar=self.cookiejar,
+ proxies=proxies,
+ prefer_system_certs='no-certifi' in self.params['compat_opts'],
+ verify=not self.params.get('nocheckcertificate'),
+ **traverse_obj(self.params, {
+ 'verbose': 'debug_printtraffic',
+ 'source_address': 'source_address',
+ 'timeout': 'socket_timeout',
+ 'legacy_ssl_support': 'legacyserverconnect',
+ 'enable_file_urls': 'enable_file_urls',
+ 'client_cert': {
+ 'client_certificate': 'client_certificate',
+ 'client_certificate_key': 'client_certificate_key',
+ 'client_certificate_password': 'client_certificate_password',
+ },
+ }),
+ ))
+ director.preferences.update(preferences or [])
+ return director
def encode(self, s):
if isinstance(s, bytes):
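The replacement urlopen routes everything through the RequestDirector assembled in build_request_director above. A minimal sketch of calling it from the API (assuming the yt-dlp-style hypervideo_dl.networking.Request; the URL and header are placeholders):

    from hypervideo_dl import YoutubeDL
    from hypervideo_dl.networking import Request

    with YoutubeDL() as ydl:
        # Plain strings are wrapped in Request automatically;
        # urllib.request.Request still works but triggers a deprecation warning
        resp = ydl.urlopen(Request('https://example.com', headers={'X-Demo': '1'}))
        body = resp.read()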
@@ -3919,7 +4158,7 @@ class YoutubeDL:
elif not self.params.get('overwrites', True) and os.path.exists(descfn):
self.to_screen(f'[info] {label.title()} description is already present')
elif ie_result.get('description') is None:
- self.report_warning(f'There\'s no {label} description to write')
+ self.to_screen(f'[info] There\'s no {label} description to write')
return False
else:
try:
@@ -3935,15 +4174,18 @@ class YoutubeDL:
''' Write subtitles to file and return list of (sub_filename, final_sub_filename); or None if error'''
ret = []
subtitles = info_dict.get('requested_subtitles')
- if not subtitles or not (self.params.get('writesubtitles') or self.params.get('writeautomaticsub')):
+ if not (self.params.get('writesubtitles') or self.params.get('writeautomaticsub')):
# subtitle download errors are already reported as troubles by the relevant IE,
# so this silently continues when used with an IE that lacks subtitle support
return ret
-
+ elif not subtitles:
+ self.to_screen('[info] There are no subtitles for the requested languages')
+ return ret
sub_filename_base = self.prepare_filename(info_dict, 'subtitle')
if not sub_filename_base:
self.to_screen('[info] Skipping writing video subtitles')
return ret
+
for sub_lang, sub_info in subtitles.items():
sub_format = sub_info['ext']
sub_filename = subtitles_filename(filename, sub_lang, sub_format, info_dict.get('ext'))
@@ -3990,6 +4232,9 @@ class YoutubeDL:
thumbnails, ret = [], []
if write_all or self.params.get('writethumbnail', False):
thumbnails = info_dict.get('thumbnails') or []
+ if not thumbnails:
+ self.to_screen(f'[info] There are no {label} thumbnails to download')
+ return ret
multiple = write_all and len(thumbnails) > 1
if thumb_filename_base is None:
@@ -4013,15 +4258,18 @@ class YoutubeDL:
else:
self.to_screen(f'[info] Downloading {thumb_display_id} ...')
try:
- uf = self.urlopen(sanitized_Request(t['url'], headers=t.get('http_headers', {})))
+ uf = self.urlopen(Request(t['url'], headers=t.get('http_headers', {})))
self.to_screen(f'[info] Writing {thumb_display_id} to: {thumb_filename}')
with open(encodeFilename(thumb_filename), 'wb') as thumbf:
shutil.copyfileobj(uf, thumbf)
ret.append((thumb_filename, thumb_filename_final))
t['filepath'] = thumb_filename
except network_exceptions as err:
+ if isinstance(err, HTTPError) and err.status == 404:
+ self.to_screen(f'[info] {thumb_display_id.title()} does not exist')
+ else:
+ self.report_warning(f'Unable to download {thumb_display_id}: {err}')
thumbnails.pop(idx)
- self.report_warning(f'Unable to download {thumb_display_id}: {err}')
if ret and not write_all:
break
return ret
diff --git a/hypervideo_dl/__init__.py b/hypervideo_dl/__init__.py
index 8ac1c0c..60b012f 100644
--- a/hypervideo_dl/__init__.py
+++ b/hypervideo_dl/__init__.py
@@ -9,6 +9,7 @@ import optparse
import os
import re
import sys
+import traceback
from .compat import compat_shlex_quote, workaround_optparse_bug9161
from .cookies import SUPPORTED_BROWSERS, SUPPORTED_KEYRINGS
@@ -49,11 +50,11 @@ from .utils import (
read_stdin,
render_table,
setproctitle,
- std_headers,
traverse_obj,
variadic,
write_string,
)
+from .utils.networking import std_headers
from .YoutubeDL import YoutubeDL
@@ -82,17 +83,20 @@ def get_urls(urls, batchfile, verbose):
def print_extractor_information(opts, urls):
- # Importing GenericIE is currently slow since it imports other extractors
- # TODO: Move this back to module level after generalization of embed detection
- from .extractor.generic import GenericIE
-
out = ''
if opts.list_extractors:
- for ie in list_extractors(opts.age_limit):
- write_string(ie.IE_NAME + (' (CURRENTLY BROKEN)' if not ie.working() else '') + '\n', out=sys.stdout)
- matchedUrls = [url for url in urls if ie.suitable(url)]
- for mu in matchedUrls:
- write_string(' ' + mu + '\n', out=sys.stdout)
+ # Importing GenericIE is currently slow since it imports YoutubeIE
+ from .extractor.generic import GenericIE
+
+ urls = dict.fromkeys(urls, False)
+ for ie in list_extractor_classes(opts.age_limit):
+ out += ie.IE_NAME + (' (CURRENTLY BROKEN)' if not ie.working() else '') + '\n'
+ if ie == GenericIE:
+ matched_urls = [url for url, matched in urls.items() if not matched]
+ else:
+ matched_urls = tuple(filter(ie.suitable, urls.keys()))
+ urls.update(dict.fromkeys(matched_urls, True))
+ out += ''.join(f' {url}\n' for url in matched_urls)
elif opts.list_extractor_descriptions:
for ie in list_extractors(opts.age_limit):
if not ie.working():
@@ -180,8 +184,8 @@ def validate_options(opts):
raise ValueError(f'{max_name} "{max_val}" must be greater than or equal to {min_name} "{min_val}"')
# Usernames and passwords
- validate(not opts.usenetrc or (opts.username is None and opts.password is None),
- '.netrc', msg='using {name} conflicts with giving username/password')
+ validate(sum(map(bool, (opts.usenetrc, opts.netrc_cmd, opts.username))) <= 1, '.netrc',
+ msg='{name}, netrc command and username/password are mutually exclusive options')
validate(opts.password is None or opts.username is not None, 'account username', msg='{name} missing')
validate(opts.ap_password is None or opts.ap_username is not None,
'TV Provider account username', msg='{name} missing')
@@ -309,34 +313,60 @@ def validate_options(opts):
if outtmpl_default == '':
opts.skip_download = None
del opts.outtmpl['default']
- if outtmpl_default and not os.path.splitext(outtmpl_default)[1] and opts.extractaudio:
- raise ValueError(
- 'Cannot download a video and extract audio into the same file! '
- f'Use "{outtmpl_default}.%(ext)s" instead of "{outtmpl_default}" as the output template')
- def parse_chapters(name, value):
- chapters, ranges = [], []
+ def parse_chapters(name, value, advanced=False):
+ parse_timestamp = lambda x: float('inf') if x in ('inf', 'infinite') else parse_duration(x)
+ TIMESTAMP_RE = r'''(?x)(?:
+ (?P<start_sign>-?)(?P<start>[^-]+)
+ )?\s*-\s*(?:
+ (?P<end_sign>-?)(?P<end>[^-]+)
+ )?'''
+
+ chapters, ranges, from_url = [], [], False
for regex in value or []:
- if regex.startswith('*'):
- for range in regex[1:].split(','):
- dur = tuple(map(parse_duration, range.strip().split('-')))
- if len(dur) == 2 and all(t is not None for t in dur):
- ranges.append(dur)
- else:
- raise ValueError(f'invalid {name} time range "{regex}". Must be of the form *start-end')
+ if advanced and regex == '*from-url':
+ from_url = True
+ continue
+ elif not regex.startswith('*'):
+ try:
+ chapters.append(re.compile(regex))
+ except re.error as err:
+ raise ValueError(f'invalid {name} regex "{regex}" - {err}')
continue
- try:
- chapters.append(re.compile(regex))
- except re.error as err:
- raise ValueError(f'invalid {name} regex "{regex}" - {err}')
- return chapters, ranges
- opts.remove_chapters, opts.remove_ranges = parse_chapters('--remove-chapters', opts.remove_chapters)
- opts.download_ranges = download_range_func(*parse_chapters('--download-sections', opts.download_ranges))
+ for range_ in map(str.strip, regex[1:].split(',')):
+ mobj = range_ != '-' and re.fullmatch(TIMESTAMP_RE, range_)
+ dur = mobj and [parse_timestamp(mobj.group('start') or '0'), parse_timestamp(mobj.group('end') or 'inf')]
+ signs = mobj and (mobj.group('start_sign'), mobj.group('end_sign'))
+
+ err = None
+ if None in (dur or [None]):
+ err = 'Must be of the form "*start-end"'
+ elif not advanced and any(signs):
+ err = 'Negative timestamps are not allowed'
+ else:
+ dur[0] *= -1 if signs[0] else 1
+ dur[1] *= -1 if signs[1] else 1
+ if dur[1] == float('-inf'):
+ err = '"-inf" is not a valid end'
+ if err:
+ raise ValueError(f'invalid {name} time range "{regex}". {err}')
+ ranges.append(dur)
+
+ return chapters, ranges, from_url
+
+ opts.remove_chapters, opts.remove_ranges, _ = parse_chapters('--remove-chapters', opts.remove_chapters)
+ opts.download_ranges = download_range_func(*parse_chapters('--download-sections', opts.download_ranges, True))
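For reference, the extended grammar above accepts open-ended and negative (from-the-end) ranges. A few illustrative inputs and how they parse under these rules (values follow parse_duration semantics):

    *1:00-2:00    -> [60.0, 120.0]
    *10:00-       -> [600.0, inf]    (a missing end defaults to 'inf')
    *-30-inf      -> [-30.0, inf]    (last 30 seconds; signs are only allowed for --download-sections)
    *from-url     -> take the ranges from the URL itself (only for --download-sections)
    intro         -> compiled as a chapter-title regex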
# Cookies from browser
if opts.cookiesfrombrowser:
- mobj = re.match(r'(?P<name>[^+:]+)(\s*\+\s*(?P<keyring>[^:]+))?(\s*:(?P<profile>.+))?', opts.cookiesfrombrowser)
+ container = None
+ mobj = re.fullmatch(r'''(?x)
+ (?P<name>[^+:]+)
+ (?:\s*\+\s*(?P<keyring>[^:]+))?
+ (?:\s*:\s*(?!:)(?P<profile>.+?))?
+ (?:\s*::\s*(?P<container>.+))?
+ ''', opts.cookiesfrombrowser)
if mobj is None:
raise ValueError(f'invalid cookies from browser arguments: {opts.cookiesfrombrowser}')
browser_name, keyring, profile = mobj.group('name', 'keyring', 'profile')
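For reference, a few specs the new fullmatch pattern accepts (illustrative; browser names come from SUPPORTED_BROWSERS):

    firefox                      -> name only
    chrome+gnomekeyring          -> name + keyring
    firefox:default-release      -> name + profile
    firefox::Personal            -> name + container (the (?!:) lookahead keeps profile empty)
    firefox:Profile 1::Personal  -> name + profile + container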
@@ -370,10 +400,12 @@ def validate_options(opts):
raise ValueError(f'{cmd} is invalid; {err}')
yield action
- parse_metadata = opts.parse_metadata or []
if opts.metafromtitle is not None:
- parse_metadata.append('title:%s' % opts.metafromtitle)
- opts.parse_metadata = list(itertools.chain(*map(metadataparser_actions, parse_metadata)))
+ opts.parse_metadata.setdefault('pre_process', []).append('title:%s' % opts.metafromtitle)
+ opts.parse_metadata = {
+ k: list(itertools.chain(*map(metadataparser_actions, v)))
+ for k, v in opts.parse_metadata.items()
+ }
# Other options
if opts.playlist_items is not None:
@@ -382,14 +414,19 @@ def validate_options(opts):
except Exception as err:
raise ValueError(f'Invalid playlist-items {opts.playlist_items!r}: {err}')
- geo_bypass_code = opts.geo_bypass_ip_block or opts.geo_bypass_country
- if geo_bypass_code is not None:
+ opts.geo_bypass_country, opts.geo_bypass_ip_block = None, None
+ if opts.geo_bypass.lower() not in ('default', 'never'):
try:
- GeoUtils.random_ipv4(geo_bypass_code)
+ GeoUtils.random_ipv4(opts.geo_bypass)
except Exception:
- raise ValueError('unsupported geo-bypass country or ip-block')
+ raise ValueError(f'Unsupported --xff "{opts.geo_bypass}"')
+ if len(opts.geo_bypass) == 2:
+ opts.geo_bypass_country = opts.geo_bypass
+ else:
+ opts.geo_bypass_ip_block = opts.geo_bypass
+ opts.geo_bypass = opts.geo_bypass.lower() != 'never'
- opts.match_filter = match_filter_func(opts.match_filter)
+ opts.match_filter = match_filter_func(opts.match_filter, opts.breaking_match_filter)
if opts.download_archive is not None:
opts.download_archive = expand_path(opts.download_archive)
@@ -413,6 +450,10 @@ def validate_options(opts):
elif ed and proto == 'default':
default_downloader = ed.get_basename()
+ for policy in opts.color.values():
+ if policy not in ('always', 'auto', 'no_color', 'never'):
+ raise ValueError(f'"{policy}" is not a valid color policy')
+
warnings, deprecation_warnings = [], []
# Common mistake: -f best
@@ -540,11 +581,11 @@ def validate_options(opts):
def get_postprocessors(opts):
yield from opts.add_postprocessors
- if opts.parse_metadata:
+ for when, actions in opts.parse_metadata.items():
yield {
'key': 'MetadataParser',
- 'actions': opts.parse_metadata,
- 'when': 'pre_process'
+ 'actions': actions,
+ 'when': when
}
sponsorblock_query = opts.sponsorblock_mark | opts.sponsorblock_remove
if sponsorblock_query:
@@ -677,9 +718,13 @@ def parse_options(argv=None):
postprocessors = list(get_postprocessors(opts))
- any_getting = (any(opts.forceprint.values()) or opts.dumpjson or opts.dump_single_json
- or opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail
- or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration)
+ print_only = bool(opts.forceprint) and all(k not in opts.forceprint for k in POSTPROCESS_WHEN[3:])
+ any_getting = any(getattr(opts, k) for k in (
+ 'dumpjson', 'dump_single_json', 'getdescription', 'getduration', 'getfilename',
+ 'getformat', 'getid', 'getthumbnail', 'gettitle', 'geturl'
+ ))
+ if opts.quiet is None:
+ opts.quiet = any_getting or opts.print_json or bool(opts.forceprint)
any_printing = opts.print_json
@@ -692,6 +737,7 @@ def parse_options(argv=None):
return parser, opts, urls, {
'usenetrc': opts.usenetrc,
'netrc_location': opts.netrc_location,
+ 'netrc_cmd': opts.netrc_cmd,
'username': opts.username,
'password': opts.password,
'twofactor': opts.twofactor,
@@ -699,7 +745,10 @@ def parse_options(argv=None):
'ap_mso': opts.ap_mso,
'ap_username': opts.ap_username,
'ap_password': opts.ap_password,
- 'quiet': (opts.quiet or any_getting or any_printing),
+ 'client_certificate': opts.client_certificate,
+ 'client_certificate_key': opts.client_certificate_key,
+ 'client_certificate_password': opts.client_certificate_password,
+ 'quiet': opts.quiet,
'no_warnings': opts.no_warnings,
'forceurl': opts.geturl,
'forcetitle': opts.gettitle,
@@ -810,6 +859,7 @@ def parse_options(argv=None):
'legacyserverconnect': opts.legacy_server_connect,
'nocheckcertificate': opts.no_check_certificate,
'prefer_insecure': opts.prefer_insecure,
+ 'enable_file_urls': opts.enable_file_urls,
'http_headers': opts.headers,
'proxy': opts.proxy,
'socket_timeout': opts.socket_timeout,
@@ -844,7 +894,7 @@ def parse_options(argv=None):
'playlist_items': opts.playlist_items,
'xattr_set_filesize': opts.xattr_set_filesize,
'match_filter': opts.match_filter,
- 'no_color': opts.no_color,
+ 'color': opts.color,
'ffmpeg_location': opts.ffmpeg_location,
'hls_prefer_native': opts.hls_prefer_native,
'hls_use_mpegts': opts.hls_use_mpegts,
@@ -890,14 +940,18 @@ def _real_main(argv=None):
if opts.rm_cachedir:
ydl.cache.remove()
- updater = Updater(ydl)
- if opts.update_self and updater.update() and actual_use:
- if updater.cmd:
- return updater.restart()
- # This code is reachable only for zip variant in py < 3.10
- # It makes sense to exit here, but the old behavior is to continue
- ydl.report_warning('Restart hypervideo to use the updated version')
- # return 100, 'ERROR: The program must exit for the update to complete'
+ try:
+ updater = Updater(ydl, opts.update_self)
+ if opts.update_self and updater.update() and actual_use:
+ if updater.cmd:
+ return updater.restart()
+ # This code is reachable only for zip variant in py < 3.10
+ # It makes sense to exit here, but the old behavior is to continue
+ ydl.report_warning('Restart hypervideo to use the updated version')
+ # return 100, 'ERROR: The program must exit for the update to complete'
+ except Exception:
+ traceback.print_exc()
+ ydl._download_retcode = 100
if not actual_use:
if pre_process:
@@ -911,6 +965,8 @@ def _real_main(argv=None):
parser.destroy()
try:
if opts.load_info_filename is not None:
+ if all_urls:
+ ydl.report_warning('URLs are ignored due to --load-info-json')
return ydl.download_with_info_file(expand_path(opts.load_info_filename))
else:
return ydl.download(all_urls)
diff --git a/hypervideo_dl/__pyinstaller/__init__.py b/hypervideo_dl/__pyinstaller/__init__.py
new file mode 100644
index 0000000..1c52aad
--- /dev/null
+++ b/hypervideo_dl/__pyinstaller/__init__.py
@@ -0,0 +1,5 @@
+import os
+
+
+def get_hook_dirs():
+ return [os.path.dirname(__file__)]
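PyInstaller discovers this hook directory through the standard 'pyinstaller40' entry point; a sketch of the setup.py wiring this assumes (not part of this diff):

    from setuptools import setup

    setup(
        name='hypervideo',
        entry_points={
            'pyinstaller40': ['hook-dirs = hypervideo_dl.__pyinstaller:get_hook_dirs'],
        },
    )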
diff --git a/hypervideo_dl/__pyinstaller/hook-yt_dlp.py b/hypervideo_dl/__pyinstaller/hook-yt_dlp.py
new file mode 100644
index 0000000..67e1291
--- /dev/null
+++ b/hypervideo_dl/__pyinstaller/hook-yt_dlp.py
@@ -0,0 +1,32 @@
+import sys
+
+from PyInstaller.utils.hooks import collect_submodules
+
+
+def pycryptodome_module():
+ try:
+ import Cryptodome # noqa: F401
+ except ImportError:
+ try:
+ import Crypto # noqa: F401
+ print('WARNING: Using Crypto since Cryptodome is not available. '
+ 'Install with: pip install pycryptodomex', file=sys.stderr)
+ return 'Crypto'
+ except ImportError:
+ pass
+ return 'Cryptodome'
+
+
+def get_hidden_imports():
+ yield from ('hypervideo_dl.compat._legacy', 'hypervideo_dl.compat._deprecated')
+ yield from ('hypervideo_dl.utils._legacy', 'hypervideo_dl.utils._deprecated')
+ yield pycryptodome_module()
+ yield from collect_submodules('websockets')
+ # These are auto-detected, but explicitly add them just in case
+ yield from ('mutagen', 'brotli', 'certifi')
+
+
+hiddenimports = list(get_hidden_imports())
+print(f'Adding imports: {hiddenimports}')
+
+excludedimports = ['youtube_dl', 'youtube_dlc', 'test', 'ytdlp_plugins', 'devscripts']
diff --git a/hypervideo_dl/aes.py b/hypervideo_dl/aes.py
index 60ce99c..b3a383c 100644
--- a/hypervideo_dl/aes.py
+++ b/hypervideo_dl/aes.py
@@ -2,17 +2,17 @@ import base64
from math import ceil
from .compat import compat_ord
-from .dependencies import Cryptodome_AES
+from .dependencies import Cryptodome
from .utils import bytes_to_intlist, intlist_to_bytes
-if Cryptodome_AES:
+if Cryptodome.AES:
def aes_cbc_decrypt_bytes(data, key, iv):
""" Decrypt bytes with AES-CBC using pycryptodome """
- return Cryptodome_AES.new(key, Cryptodome_AES.MODE_CBC, iv).decrypt(data)
+ return Cryptodome.AES.new(key, Cryptodome.AES.MODE_CBC, iv).decrypt(data)
def aes_gcm_decrypt_and_verify_bytes(data, key, tag, nonce):
""" Decrypt bytes with AES-GCM using pycryptodome """
- return Cryptodome_AES.new(key, Cryptodome_AES.MODE_GCM, nonce).decrypt_and_verify(data, tag)
+ return Cryptodome.AES.new(key, Cryptodome.AES.MODE_GCM, nonce).decrypt_and_verify(data, tag)
else:
def aes_cbc_decrypt_bytes(data, key, iv):
diff --git a/hypervideo_dl/cache.py b/hypervideo_dl/cache.py
index 2e9c1ef..fa72814 100644
--- a/hypervideo_dl/cache.py
+++ b/hypervideo_dl/cache.py
@@ -1,10 +1,10 @@
import contextlib
-import errno
import json
import os
import re
import shutil
import traceback
+import urllib.parse
from .utils import expand_path, traverse_obj, version_tuple, write_json_file
from .version import __version__
@@ -22,11 +22,9 @@ class Cache:
return expand_path(res)
def _get_cache_fn(self, section, key, dtype):
- assert re.match(r'^[a-zA-Z0-9_.-]+$', section), \
- 'invalid section %r' % section
- assert re.match(r'^[a-zA-Z0-9_.-]+$', key), 'invalid key %r' % key
- return os.path.join(
- self._get_root_dir(), section, f'{key}.{dtype}')
+ assert re.match(r'^[\w.-]+$', section), f'invalid section {section!r}'
+ key = urllib.parse.quote(key, safe='').replace('%', ',') # encode non-ascii characters
+ return os.path.join(self._get_root_dir(), section, f'{key}.{dtype}')
@property
def enabled(self):
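The new key handling percent-encodes arbitrary cache keys and then swaps '%' for ',' so the resulting filename stays filesystem-safe. A worked example with a hypothetical key:

    import urllib.parse

    key = 'js_日本/v1'
    print(urllib.parse.quote(key, safe='').replace('%', ','))
    # -> js_,E6,97,A5,E6,9C,AC,2Fv1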
@@ -40,11 +38,7 @@ class Cache:
fn = self._get_cache_fn(section, key, dtype)
try:
- try:
- os.makedirs(os.path.dirname(fn))
- except OSError as ose:
- if ose.errno != errno.EEXIST:
- raise
+ os.makedirs(os.path.dirname(fn), exist_ok=True)
self._ydl.write_debug(f'Saving {section}.{key} to cache')
write_json_file({'hypervideo_version': __version__, 'data': data}, fn)
except Exception:
diff --git a/hypervideo_dl/casefold.py b/hypervideo_dl/casefold.py
new file mode 100644
index 0000000..41a53e5
--- /dev/null
+++ b/hypervideo_dl/casefold.py
@@ -0,0 +1,5 @@
+import warnings
+
+warnings.warn(DeprecationWarning(f'{__name__} is deprecated'))
+
+casefold = str.casefold
diff --git a/hypervideo_dl/compat/__init__.py b/hypervideo_dl/compat/__init__.py
index 2f2621b..445178d 100644
--- a/hypervideo_dl/compat/__init__.py
+++ b/hypervideo_dl/compat/__init__.py
@@ -1,14 +1,11 @@
import os
import sys
-import warnings
import xml.etree.ElementTree as etree
-from ._deprecated import * # noqa: F401, F403
from .compat_utils import passthrough_module
-# XXX: Implement this the same way as other DeprecationWarnings without circular import
-passthrough_module(__name__, '._legacy', callback=lambda attr: warnings.warn(
- DeprecationWarning(f'{__name__}.{attr} is deprecated'), stacklevel=3))
+passthrough_module(__name__, '._deprecated')
+del passthrough_module
# HTMLParseError has been deprecated in Python 3.3 and removed in
@@ -72,7 +69,11 @@ else:
compat_expanduser = os.path.expanduser
-# NB: Add modules that are imported dynamically here so that PyInstaller can find them
-# See https://github.com/pyinstaller/pyinstaller-hooks-contrib/issues/438
-if False:
- from . import _legacy # noqa: F401
+def urllib_req_to_req(urllib_request):
+ """Convert urllib Request to a networking Request"""
+ from ..networking import Request
+ from ..utils.networking import HTTPHeaderDict
+ return Request(
+ urllib_request.get_full_url(), data=urllib_request.data, method=urllib_request.get_method(),
+ headers=HTTPHeaderDict(urllib_request.headers, urllib_request.unredirected_hdrs),
+ extensions={'timeout': urllib_request.timeout} if hasattr(urllib_request, 'timeout') else None)
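A sketch of round-tripping a legacy request through this helper (module paths as in this diff; the URL and header are placeholders):

    import urllib.request
    from hypervideo_dl.compat import urllib_req_to_req

    legacy = urllib.request.Request('https://example.com', headers={'X-Demo': '1'})
    req = urllib_req_to_req(legacy)
    # data, method, headers and timeout (if set) carry over
    print(req.url, req.method)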
diff --git a/hypervideo_dl/compat/_deprecated.py b/hypervideo_dl/compat/_deprecated.py
index 342f1f8..607bae9 100644
--- a/hypervideo_dl/compat/_deprecated.py
+++ b/hypervideo_dl/compat/_deprecated.py
@@ -1,4 +1,12 @@
"""Deprecated - New code should avoid these"""
+import warnings
+
+from .compat_utils import passthrough_module
+
+# XXX: Implement this the same way as other DeprecationWarnings without circular import
+passthrough_module(__name__, '.._legacy', callback=lambda attr: warnings.warn(
+ DeprecationWarning(f'{__name__}.{attr} is deprecated'), stacklevel=6))
+del passthrough_module
import base64
import urllib.error
@@ -8,7 +16,6 @@ compat_str = str
compat_b64decode = base64.b64decode
-compat_HTTPError = urllib.error.HTTPError
compat_urlparse = urllib.parse
compat_parse_qs = urllib.parse.parse_qs
compat_urllib_parse_unquote = urllib.parse.unquote
diff --git a/hypervideo_dl/compat/_legacy.py b/hypervideo_dl/compat/_legacy.py
index d19333d..90ccf0f 100644
--- a/hypervideo_dl/compat/_legacy.py
+++ b/hypervideo_dl/compat/_legacy.py
@@ -1,5 +1,6 @@
""" Do not use! """
+import base64
import collections
import ctypes
import getpass
@@ -15,12 +16,12 @@ import shlex
import shutil
import socket
import struct
+import subprocess
import tokenize
import urllib.error
import urllib.parse
import urllib.request
import xml.etree.ElementTree as etree
-from subprocess import DEVNULL
# isort: split
import asyncio # noqa: F401
@@ -29,10 +30,11 @@ from asyncio import run as compat_asyncio_run # noqa: F401
from re import Pattern as compat_Pattern # noqa: F401
from re import match as compat_Match # noqa: F401
+from . import compat_expanduser, compat_HTMLParseError, compat_realpath
from .compat_utils import passthrough_module
-from ..dependencies import Cryptodome_AES as compat_pycrypto_AES # noqa: F401
from ..dependencies import brotli as compat_brotli # noqa: F401
from ..dependencies import websockets as compat_websockets # noqa: F401
+from ..dependencies.Cryptodome import AES as compat_pycrypto_AES # noqa: F401
passthrough_module(__name__, '...utils', ('WINDOWS_VT_MODE', 'windows_enable_vt_mode'))
@@ -47,41 +49,48 @@ def compat_setenv(key, value, env=os.environ):
env[key] = value
+compat_base64_b64decode = base64.b64decode
compat_basestring = str
compat_casefold = str.casefold
compat_chr = chr
compat_collections_abc = collections.abc
-compat_cookiejar = http.cookiejar
-compat_cookiejar_Cookie = http.cookiejar.Cookie
-compat_cookies = http.cookies
-compat_cookies_SimpleCookie = http.cookies.SimpleCookie
-compat_etree_Element = etree.Element
-compat_etree_register_namespace = etree.register_namespace
+compat_cookiejar = compat_http_cookiejar = http.cookiejar
+compat_cookiejar_Cookie = compat_http_cookiejar_Cookie = http.cookiejar.Cookie
+compat_cookies = compat_http_cookies = http.cookies
+compat_cookies_SimpleCookie = compat_http_cookies_SimpleCookie = http.cookies.SimpleCookie
+compat_etree_Element = compat_xml_etree_ElementTree_Element = etree.Element
+compat_etree_register_namespace = compat_xml_etree_register_namespace = etree.register_namespace
compat_filter = filter
compat_get_terminal_size = shutil.get_terminal_size
compat_getenv = os.getenv
-compat_getpass = getpass.getpass
+compat_getpass = compat_getpass_getpass = getpass.getpass
compat_html_entities = html.entities
compat_html_entities_html5 = html.entities.html5
-compat_HTMLParser = html.parser.HTMLParser
+compat_html_parser_HTMLParseError = compat_HTMLParseError
+compat_HTMLParser = compat_html_parser_HTMLParser = html.parser.HTMLParser
compat_http_client = http.client
compat_http_server = http.server
+compat_HTTPError = urllib.error.HTTPError
compat_input = input
compat_integer_types = (int, )
compat_itertools_count = itertools.count
compat_kwargs = lambda kwargs: kwargs
compat_map = map
compat_numeric_types = (int, float, complex)
+compat_os_path_expanduser = compat_expanduser
+compat_os_path_realpath = compat_realpath
compat_print = print
compat_shlex_split = shlex.split
compat_socket_create_connection = socket.create_connection
compat_Struct = struct.Struct
compat_struct_pack = struct.pack
compat_struct_unpack = struct.unpack
-compat_subprocess_get_DEVNULL = lambda: DEVNULL
+compat_subprocess_get_DEVNULL = lambda: subprocess.DEVNULL
compat_tokenize_tokenize = tokenize.tokenize
compat_urllib_error = urllib.error
+compat_urllib_HTTPError = urllib.error.HTTPError
compat_urllib_parse = urllib.parse
+compat_urllib_parse_parse_qs = urllib.parse.parse_qs
compat_urllib_parse_quote = urllib.parse.quote
compat_urllib_parse_quote_plus = urllib.parse.quote_plus
compat_urllib_parse_unquote_plus = urllib.parse.unquote_plus
@@ -90,8 +99,10 @@ compat_urllib_parse_urlunparse = urllib.parse.urlunparse
compat_urllib_request = urllib.request
compat_urllib_request_DataHandler = urllib.request.DataHandler
compat_urllib_response = urllib.response
-compat_urlretrieve = urllib.request.urlretrieve
-compat_xml_parse_error = etree.ParseError
+compat_urlretrieve = compat_urllib_request_urlretrieve = urllib.request.urlretrieve
+compat_xml_parse_error = compat_xml_etree_ElementTree_ParseError = etree.ParseError
compat_xpath = lambda xpath: xpath
compat_zip = zip
workaround_optparse_bug9161 = lambda: None
+
+legacy = []
diff --git a/hypervideo_dl/compat/compat_utils.py b/hypervideo_dl/compat/compat_utils.py
index 1bf6566..8e94125 100644
--- a/hypervideo_dl/compat/compat_utils.py
+++ b/hypervideo_dl/compat/compat_utils.py
@@ -1,5 +1,6 @@
import collections
import contextlib
+import functools
import importlib
import sys
import types
@@ -10,61 +11,73 @@ _Package = collections.namedtuple('Package', ('name', 'version'))
def get_package_info(module):
- parent = module.__name__.split('.')[0]
- parent_module = None
- with contextlib.suppress(ImportError):
- parent_module = importlib.import_module(parent)
-
- for attr in ('__version__', 'version_string', 'version'):
- version = getattr(parent_module, attr, None)
- if version is not None:
- break
- return _Package(getattr(module, '_hypervideo_dl__identifier', parent), str(version))
+ return _Package(
+ name=getattr(module, '_hypervideo_dl__identifier', module.__name__),
+ version=str(next(filter(None, (
+ getattr(module, attr, None)
+ for attr in ('__version__', 'version_string', 'version')
+ )), None)))
def _is_package(module):
- try:
- module.__getattribute__('__path__')
- except AttributeError:
- return False
- return True
-
-
-def passthrough_module(parent, child, allowed_attributes=None, *, callback=lambda _: None):
- parent_module = importlib.import_module(parent)
- child_module = None # Import child module only as needed
-
- class PassthroughModule(types.ModuleType):
- def __getattr__(self, attr):
- if _is_package(parent_module):
- with contextlib.suppress(ImportError):
- return importlib.import_module(f'.{attr}', parent)
-
- ret = self.__from_child(attr)
- if ret is _NO_ATTRIBUTE:
- raise AttributeError(f'module {parent} has no attribute {attr}')
- callback(attr)
- return ret
-
- def __from_child(self, attr):
- if allowed_attributes is None:
- if attr.startswith('__') and attr.endswith('__'):
- return _NO_ATTRIBUTE
- elif attr not in allowed_attributes:
+ return '__path__' in vars(module)
+
+
+def _is_dunder(name):
+ return name.startswith('__') and name.endswith('__')
+
+
+class EnhancedModule(types.ModuleType):
+ def __bool__(self):
+ return vars(self).get('__bool__', lambda: True)()
+
+ def __getattribute__(self, attr):
+ try:
+ ret = super().__getattribute__(attr)
+ except AttributeError:
+ if _is_dunder(attr):
+ raise
+ getter = getattr(self, '__getattr__', None)
+ if not getter:
+ raise
+ ret = getter(attr)
+ return ret.fget() if isinstance(ret, property) else ret
+
+
+def passthrough_module(parent, child, allowed_attributes=(..., ), *, callback=lambda _: None):
+ """Passthrough parent module into a child module, creating the parent if necessary"""
+ def __getattr__(attr):
+ if _is_package(parent):
+ with contextlib.suppress(ModuleNotFoundError):
+ return importlib.import_module(f'.{attr}', parent.__name__)
+
+ ret = from_child(attr)
+ if ret is _NO_ATTRIBUTE:
+ raise AttributeError(f'module {parent.__name__} has no attribute {attr}')
+ callback(attr)
+ return ret
+
+ @functools.lru_cache(maxsize=None)
+ def from_child(attr):
+ nonlocal child
+ if attr not in allowed_attributes:
+ if ... not in allowed_attributes or _is_dunder(attr):
return _NO_ATTRIBUTE
- nonlocal child_module
- child_module = child_module or importlib.import_module(child, parent)
+ if isinstance(child, str):
+ child = importlib.import_module(child, parent.__name__)
- with contextlib.suppress(AttributeError):
- return getattr(child_module, attr)
+ if _is_package(child):
+ with contextlib.suppress(ImportError):
+ return passthrough_module(f'{parent.__name__}.{attr}',
+ importlib.import_module(f'.{attr}', child.__name__))
- if _is_package(child_module):
- with contextlib.suppress(ImportError):
- return importlib.import_module(f'.{attr}', child)
+ with contextlib.suppress(AttributeError):
+ return getattr(child, attr)
- return _NO_ATTRIBUTE
+ return _NO_ATTRIBUTE
- # Python 3.6 does not have module level __getattr__
- # https://peps.python.org/pep-0562/
- sys.modules[parent].__class__ = PassthroughModule
+ parent = sys.modules.get(parent, types.ModuleType(parent))
+ parent.__class__ = EnhancedModule
+ parent.__getattr__ = __getattr__
+ return parent
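For context, this is how the shim modules in this diff consume the new helper: adopt (or create) the parent module, then lazily resolve attributes from the child, optionally firing a deprecation callback. A condensed sketch mirroring _deprecated.py above:

    import warnings
    from hypervideo_dl.compat.compat_utils import passthrough_module

    passthrough_module(__name__, '._legacy', callback=lambda attr: warnings.warn(
        DeprecationWarning(f'{__name__}.{attr} is deprecated'), stacklevel=6))
    del passthrough_module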
diff --git a/hypervideo_dl/compat/types.py b/hypervideo_dl/compat/types.py
new file mode 100644
index 0000000..4aa3b0e
--- /dev/null
+++ b/hypervideo_dl/compat/types.py
@@ -0,0 +1,13 @@
+# flake8: noqa: F405
+from types import * # noqa: F403
+
+from .compat_utils import passthrough_module
+
+passthrough_module(__name__, 'types')
+del passthrough_module
+
+try:
+ # NB: pypy has builtin NoneType, so checking NameError won't work
+ from types import NoneType # >= 3.10
+except ImportError:
+ NoneType = type(None)
diff --git a/hypervideo_dl/compat/urllib/__init__.py b/hypervideo_dl/compat/urllib/__init__.py
new file mode 100644
index 0000000..b27cc61
--- /dev/null
+++ b/hypervideo_dl/compat/urllib/__init__.py
@@ -0,0 +1,10 @@
+# flake8: noqa: F405
+from urllib import * # noqa: F403
+
+del request
+from . import request # noqa: F401
+
+from ..compat_utils import passthrough_module
+
+passthrough_module(__name__, 'urllib')
+del passthrough_module
diff --git a/hypervideo_dl/compat/urllib/request.py b/hypervideo_dl/compat/urllib/request.py
new file mode 100644
index 0000000..ff63b2f
--- /dev/null
+++ b/hypervideo_dl/compat/urllib/request.py
@@ -0,0 +1,40 @@
+# flake8: noqa: F405
+from urllib.request import * # noqa: F403
+
+from ..compat_utils import passthrough_module
+
+passthrough_module(__name__, 'urllib.request')
+del passthrough_module
+
+
+from .. import compat_os_name
+
+if compat_os_name == 'nt':
+ # On older python versions, proxies are extracted from Windows registry erroneously. [1]
+ # If the https proxy in the registry does not have a scheme, urllib will incorrectly add https:// to it. [2]
+ # It is unlikely that the user has actually set it to be https, so we should be fine to safely downgrade
+ # it to http on these older python versions to avoid issues
+ # This also applies for ftp proxy type, as ftp:// proxy scheme is not supported.
+ # 1: https://github.com/python/cpython/issues/86793
+ # 2: https://github.com/python/cpython/blob/51f1ae5ceb0673316c4e4b0175384e892e33cc6e/Lib/urllib/request.py#L2683-L2698
+ import sys
+ from urllib.request import getproxies_environment, getproxies_registry
+
+ def getproxies_registry_patched():
+ proxies = getproxies_registry()
+ if (
+ sys.version_info >= (3, 10, 5) # https://docs.python.org/3.10/whatsnew/changelog.html#python-3-10-5-final
+ or (3, 9, 13) <= sys.version_info < (3, 10) # https://docs.python.org/3.9/whatsnew/changelog.html#python-3-9-13-final
+ ):
+ return proxies
+
+ for scheme in ('https', 'ftp'):
+ if scheme in proxies and proxies[scheme].startswith(f'{scheme}://'):
+ proxies[scheme] = 'http' + proxies[scheme][len(scheme):]
+
+ return proxies
+
+ def getproxies():
+ return getproxies_environment() or getproxies_registry_patched()
+
+del compat_os_name
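A worked example of the downgrade applied above (illustrative registry values):

    proxies = {'https': 'https://127.0.0.1:8080', 'ftp': 'ftp://127.0.0.1:2121'}
    for scheme in ('https', 'ftp'):
        if scheme in proxies and proxies[scheme].startswith(f'{scheme}://'):
            proxies[scheme] = 'http' + proxies[scheme][len(scheme):]
    print(proxies)
    # {'https': 'http://127.0.0.1:8080', 'ftp': 'http://127.0.0.1:2121'}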
diff --git a/hypervideo_dl/cookies.py b/hypervideo_dl/cookies.py
index 97457a1..97c742b 100644
--- a/hypervideo_dl/cookies.py
+++ b/hypervideo_dl/cookies.py
@@ -1,7 +1,9 @@
import base64
+import collections
import contextlib
import http.cookiejar
import http.cookies
+import io
import json
import os
import re
@@ -11,6 +13,7 @@ import subprocess
import sys
import tempfile
import time
+import urllib.request
from datetime import datetime, timedelta, timezone
from enum import Enum, auto
from hashlib import pbkdf2_hmac
@@ -20,6 +23,7 @@ from .aes import (
aes_gcm_decrypt_and_verify_bytes,
unpad_pkcs7,
)
+from .compat import functools
from .dependencies import (
_SECRETSTORAGE_UNAVAILABLE_REASON,
secretstorage,
@@ -28,36 +32,24 @@ from .dependencies import (
from .minicurses import MultilinePrinter, QuietMultilinePrinter
from .utils import (
Popen,
- YoutubeDLCookieJar,
error_to_str,
expand_path,
is_path_like,
+ sanitize_url,
+ str_or_none,
try_call,
+ write_string,
)
+from .utils._utils import _YDLLogger
+from .utils.networking import normalize_url
CHROMIUM_BASED_BROWSERS = {'brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi'}
SUPPORTED_BROWSERS = CHROMIUM_BASED_BROWSERS | {'firefox', 'safari'}
-class YDLLogger:
- def __init__(self, ydl=None):
- self._ydl = ydl
-
- def debug(self, message):
- if self._ydl:
- self._ydl.write_debug(message)
-
- def info(self, message):
- if self._ydl:
- self._ydl.to_screen(f'[Cookies] {message}')
-
- def warning(self, message, only_once=False):
- if self._ydl:
- self._ydl.report_warning(message, only_once)
-
- def error(self, message):
- if self._ydl:
- self._ydl.report_error(message)
+class YDLLogger(_YDLLogger):
+ def warning(self, message, only_once=False): # compat
+ return super().warning(message, once=only_once)
class ProgressBar(MultilinePrinter):
_DELAY, _timer = 0.1, 0
@@ -105,7 +97,7 @@ def load_cookies(cookie_file, browser_specification, ydl):
jar = YoutubeDLCookieJar(cookie_file)
if not is_filename or os.access(cookie_file, os.R_OK):
- jar.load(ignore_discard=True, ignore_expires=True)
+ jar.load()
cookie_jars.append(jar)
return _merge_cookie_jars(cookie_jars)
@@ -146,7 +138,7 @@ def _extract_firefox_cookies(profile, container, logger):
containers_path = os.path.join(os.path.dirname(cookie_database_path), 'containers.json')
if not os.path.isfile(containers_path) or not os.access(containers_path, os.R_OK):
raise FileNotFoundError(f'could not read containers.json in {search_root}')
- with open(containers_path) as containers:
+ with open(containers_path, encoding='utf8') as containers:
identities = json.load(containers).get('identities', [])
container_id = next((context.get('userContextId') for context in identities if container in (
context.get('name'),
@@ -346,7 +338,9 @@ class ChromeCookieDecryptor:
Linux:
- cookies are either v10 or v11
- v10: AES-CBC encrypted with a fixed key
+ - also attempts empty password if decryption fails
- v11: AES-CBC encrypted with an OS protected key (keyring)
+ - also attempts empty password if decryption fails
- v11 keys can be stored in various places depending on the active desktop environment [2]
Mac:
@@ -361,7 +355,7 @@ class ChromeCookieDecryptor:
Sources:
- [1] https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/
- - [2] https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/key_storage_linux.cc
+ - [2] https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/key_storage_linux.cc
- KeyStorageLinux::CreateService
"""
@@ -383,32 +377,49 @@ class LinuxChromeCookieDecryptor(ChromeCookieDecryptor):
def __init__(self, browser_keyring_name, logger, *, keyring=None):
self._logger = logger
self._v10_key = self.derive_key(b'peanuts')
- password = _get_linux_keyring_password(browser_keyring_name, keyring, logger)
- self._v11_key = None if password is None else self.derive_key(password)
+ self._empty_key = self.derive_key(b'')
self._cookie_counts = {'v10': 0, 'v11': 0, 'other': 0}
+ self._browser_keyring_name = browser_keyring_name
+ self._keyring = keyring
+
+ @functools.cached_property
+ def _v11_key(self):
+ password = _get_linux_keyring_password(self._browser_keyring_name, self._keyring, self._logger)
+ return None if password is None else self.derive_key(password)
@staticmethod
def derive_key(password):
# values from
- # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_linux.cc
+ # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/os_crypt_linux.cc
return pbkdf2_sha1(password, salt=b'saltysalt', iterations=1, key_length=16)
def decrypt(self, encrypted_value):
+ """
+
+ following the same approach as the fix in [1]: if cookies fail to decrypt then attempt to decrypt
+ with an empty password. The failure detection is not the same as what chromium uses so the
+ results won't be perfect
+
+ References:
+ - [1] https://chromium.googlesource.com/chromium/src/+/bbd54702284caca1f92d656fdcadf2ccca6f4165%5E%21/
+ - a bugfix to try an empty password as a fallback
+ """
version = encrypted_value[:3]
ciphertext = encrypted_value[3:]
if version == b'v10':
self._cookie_counts['v10'] += 1
- return _decrypt_aes_cbc(ciphertext, self._v10_key, self._logger)
+ return _decrypt_aes_cbc_multi(ciphertext, (self._v10_key, self._empty_key), self._logger)
elif version == b'v11':
self._cookie_counts['v11'] += 1
if self._v11_key is None:
self._logger.warning('cannot decrypt v11 cookies: no key found', only_once=True)
return None
- return _decrypt_aes_cbc(ciphertext, self._v11_key, self._logger)
+ return _decrypt_aes_cbc_multi(ciphertext, (self._v11_key, self._empty_key), self._logger)
else:
+ self._logger.warning(f'unknown cookie version: "{version}"', only_once=True)
self._cookie_counts['other'] += 1
return None
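Both the fixed v10 key and the empty-password fallback key come from the same PBKDF2 parameters Chromium uses on Linux; a standalone sketch of the derivation:

    from hashlib import pbkdf2_hmac

    def derive_key(password):
        # salt 'saltysalt', 1 iteration, 16-byte key (per os_crypt_linux.cc)
        return pbkdf2_hmac('sha1', password, b'saltysalt', 1, 16)

    v10_key = derive_key(b'peanuts')   # fixed key for v10 cookies
    empty_key = derive_key(b'')        # fallback tried when decryption fails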
@@ -423,7 +434,7 @@ class MacChromeCookieDecryptor(ChromeCookieDecryptor):
@staticmethod
def derive_key(password):
# values from
- # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_mac.mm
+ # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/os_crypt_mac.mm
return pbkdf2_sha1(password, salt=b'saltysalt', iterations=1003, key_length=16)
def decrypt(self, encrypted_value):
@@ -436,12 +447,12 @@ class MacChromeCookieDecryptor(ChromeCookieDecryptor):
self._logger.warning('cannot decrypt v10 cookies: no key found', only_once=True)
return None
- return _decrypt_aes_cbc(ciphertext, self._v10_key, self._logger)
+ return _decrypt_aes_cbc_multi(ciphertext, (self._v10_key,), self._logger)
else:
self._cookie_counts['other'] += 1
# other prefixes are considered 'old data' which were stored as plaintext
- # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_mac.mm
+ # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/os_crypt_mac.mm
return encrypted_value
@@ -461,7 +472,7 @@ class WindowsChromeCookieDecryptor(ChromeCookieDecryptor):
self._logger.warning('cannot decrypt v10 cookies: no key found', only_once=True)
return None
- # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_win.cc
+ # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/os_crypt_win.cc
# kNonceLength
nonce_length = 96 // 8
# boringssl
@@ -478,23 +489,27 @@ class WindowsChromeCookieDecryptor(ChromeCookieDecryptor):
else:
self._cookie_counts['other'] += 1
# any other prefix means the data is DPAPI encrypted
- # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_win.cc
+ # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/os_crypt_win.cc
return _decrypt_windows_dpapi(encrypted_value, self._logger).decode()
def _extract_safari_cookies(profile, logger):
- if profile is not None:
- logger.error('safari does not support profiles')
if sys.platform != 'darwin':
raise ValueError(f'unsupported platform: {sys.platform}')
- cookies_path = os.path.expanduser('~/Library/Cookies/Cookies.binarycookies')
+ if profile:
+ cookies_path = os.path.expanduser(profile)
+ if not os.path.isfile(cookies_path):
+ raise FileNotFoundError('custom safari cookies database not found')
+
+ else:
+ cookies_path = os.path.expanduser('~/Library/Cookies/Cookies.binarycookies')
- if not os.path.isfile(cookies_path):
- logger.debug('Trying secondary cookie location')
- cookies_path = os.path.expanduser('~/Library/Containers/com.apple.Safari/Data/Library/Cookies/Cookies.binarycookies')
if not os.path.isfile(cookies_path):
- raise FileNotFoundError('could not find safari cookies database')
+ logger.debug('Trying secondary cookie location')
+ cookies_path = os.path.expanduser('~/Library/Containers/com.apple.Safari/Data/Library/Cookies/Cookies.binarycookies')
+ if not os.path.isfile(cookies_path):
+ raise FileNotFoundError('could not find safari cookies database')
with open(cookies_path, 'rb') as f:
cookies_data = f.read()
@@ -657,19 +672,27 @@ class _LinuxDesktopEnvironment(Enum):
"""
OTHER = auto()
CINNAMON = auto()
+ DEEPIN = auto()
GNOME = auto()
- KDE = auto()
+ KDE3 = auto()
+ KDE4 = auto()
+ KDE5 = auto()
+ KDE6 = auto()
PANTHEON = auto()
+ UKUI = auto()
UNITY = auto()
XFCE = auto()
+ LXQT = auto()
class _LinuxKeyring(Enum):
"""
- https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/key_storage_util_linux.h
+ https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/key_storage_util_linux.h
SelectedLinuxBackend
"""
- KWALLET = auto()
+ KWALLET = auto() # KDE4
+ KWALLET5 = auto()
+ KWALLET6 = auto()
GNOMEKEYRING = auto()
BASICTEXT = auto()
@@ -677,7 +700,7 @@ class _LinuxKeyring(Enum):
SUPPORTED_KEYRINGS = _LinuxKeyring.__members__.keys()
-def _get_linux_desktop_environment(env):
+def _get_linux_desktop_environment(env, logger):
"""
https://chromium.googlesource.com/chromium/src/+/refs/heads/main/base/nix/xdg_util.cc
GetDesktopEnvironment
@@ -692,51 +715,97 @@ def _get_linux_desktop_environment(env):
return _LinuxDesktopEnvironment.GNOME
else:
return _LinuxDesktopEnvironment.UNITY
+ elif xdg_current_desktop == 'Deepin':
+ return _LinuxDesktopEnvironment.DEEPIN
elif xdg_current_desktop == 'GNOME':
return _LinuxDesktopEnvironment.GNOME
elif xdg_current_desktop == 'X-Cinnamon':
return _LinuxDesktopEnvironment.CINNAMON
elif xdg_current_desktop == 'KDE':
- return _LinuxDesktopEnvironment.KDE
+ kde_version = env.get('KDE_SESSION_VERSION', None)
+ if kde_version == '5':
+ return _LinuxDesktopEnvironment.KDE5
+ elif kde_version == '6':
+ return _LinuxDesktopEnvironment.KDE6
+ elif kde_version == '4':
+ return _LinuxDesktopEnvironment.KDE4
+ else:
+ logger.info(f'unknown KDE version: "{kde_version}". Assuming KDE4')
+ return _LinuxDesktopEnvironment.KDE4
elif xdg_current_desktop == 'Pantheon':
return _LinuxDesktopEnvironment.PANTHEON
elif xdg_current_desktop == 'XFCE':
return _LinuxDesktopEnvironment.XFCE
+ elif xdg_current_desktop == 'UKUI':
+ return _LinuxDesktopEnvironment.UKUI
+ elif xdg_current_desktop == 'LXQt':
+ return _LinuxDesktopEnvironment.LXQT
+ else:
+ logger.info(f'XDG_CURRENT_DESKTOP is set to an unknown value: "{xdg_current_desktop}"')
+
elif desktop_session is not None:
- if desktop_session in ('mate', 'gnome'):
+ if desktop_session == 'deepin':
+ return _LinuxDesktopEnvironment.DEEPIN
+ elif desktop_session in ('mate', 'gnome'):
return _LinuxDesktopEnvironment.GNOME
- elif 'kde' in desktop_session:
- return _LinuxDesktopEnvironment.KDE
- elif 'xfce' in desktop_session:
+ elif desktop_session in ('kde4', 'kde-plasma'):
+ return _LinuxDesktopEnvironment.KDE4
+ elif desktop_session == 'kde':
+ if 'KDE_SESSION_VERSION' in env:
+ return _LinuxDesktopEnvironment.KDE4
+ else:
+ return _LinuxDesktopEnvironment.KDE3
+ elif 'xfce' in desktop_session or desktop_session == 'xubuntu':
return _LinuxDesktopEnvironment.XFCE
+ elif desktop_session == 'ukui':
+ return _LinuxDesktopEnvironment.UKUI
+ else:
+ logger.info(f'DESKTOP_SESSION is set to an unknown value: "{desktop_session}"')
+
else:
if 'GNOME_DESKTOP_SESSION_ID' in env:
return _LinuxDesktopEnvironment.GNOME
elif 'KDE_FULL_SESSION' in env:
- return _LinuxDesktopEnvironment.KDE
+ if 'KDE_SESSION_VERSION' in env:
+ return _LinuxDesktopEnvironment.KDE4
+ else:
+ return _LinuxDesktopEnvironment.KDE3
return _LinuxDesktopEnvironment.OTHER
def _choose_linux_keyring(logger):
"""
- https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/key_storage_util_linux.cc
- SelectBackend
+ SelectBackend in [1]
+
+ There is currently support for forcing chromium to use BASIC_TEXT by creating a file called
+ `Disable Local Encryption` [1] in the user data dir. The function to write this file (`WriteBackendUse()` [1])
+ does not appear to be called anywhere other than in tests, so the user would have to create this file manually
+ and so would be aware enough to tell hypervideo to use the BASIC_TEXT keyring.
+
+ References:
+ - [1] https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/key_storage_util_linux.cc
"""
- desktop_environment = _get_linux_desktop_environment(os.environ)
+ desktop_environment = _get_linux_desktop_environment(os.environ, logger)
logger.debug(f'detected desktop environment: {desktop_environment.name}')
- if desktop_environment == _LinuxDesktopEnvironment.KDE:
+ if desktop_environment == _LinuxDesktopEnvironment.KDE4:
linux_keyring = _LinuxKeyring.KWALLET
- elif desktop_environment == _LinuxDesktopEnvironment.OTHER:
+ elif desktop_environment == _LinuxDesktopEnvironment.KDE5:
+ linux_keyring = _LinuxKeyring.KWALLET5
+ elif desktop_environment == _LinuxDesktopEnvironment.KDE6:
+ linux_keyring = _LinuxKeyring.KWALLET6
+ elif desktop_environment in (
+ _LinuxDesktopEnvironment.KDE3, _LinuxDesktopEnvironment.LXQT, _LinuxDesktopEnvironment.OTHER
+ ):
linux_keyring = _LinuxKeyring.BASICTEXT
else:
linux_keyring = _LinuxKeyring.GNOMEKEYRING
return linux_keyring
-def _get_kwallet_network_wallet(logger):
+def _get_kwallet_network_wallet(keyring, logger):
""" The name of the wallet used to store network passwords.
- https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/kwallet_dbus.cc
+ https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/kwallet_dbus.cc
KWalletDBus::NetworkWallet
which does a dbus call to the following function:
https://api.kde.org/frameworks/kwallet/html/classKWallet_1_1Wallet.html
@@ -744,10 +813,22 @@ def _get_kwallet_network_wallet(logger):
"""
default_wallet = 'kdewallet'
try:
+ if keyring == _LinuxKeyring.KWALLET:
+ service_name = 'org.kde.kwalletd'
+ wallet_path = '/modules/kwalletd'
+ elif keyring == _LinuxKeyring.KWALLET5:
+ service_name = 'org.kde.kwalletd5'
+ wallet_path = '/modules/kwalletd5'
+ elif keyring == _LinuxKeyring.KWALLET6:
+ service_name = 'org.kde.kwalletd6'
+ wallet_path = '/modules/kwalletd6'
+ else:
+ raise ValueError(keyring)
+
stdout, _, returncode = Popen.run([
'dbus-send', '--session', '--print-reply=literal',
- '--dest=org.kde.kwalletd5',
- '/modules/kwalletd5',
+ f'--dest={service_name}',
+ wallet_path,
'org.kde.KWallet.networkWallet'
], text=True, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
@@ -762,8 +843,8 @@ def _get_kwallet_network_wallet(logger):
return default_wallet
-def _get_kwallet_password(browser_keyring_name, logger):
- logger.debug('using kwallet-query to obtain password from kwallet')
+def _get_kwallet_password(browser_keyring_name, keyring, logger):
+ logger.debug(f'using kwallet-query to obtain password from {keyring.name}')
if shutil.which('kwallet-query') is None:
logger.error('kwallet-query command not found. KWallet and kwallet-query '
@@ -771,7 +852,7 @@ def _get_kwallet_password(browser_keyring_name, logger):
'included in the kwallet package for your distribution')
return b''
- network_wallet = _get_kwallet_network_wallet(logger)
+ network_wallet = _get_kwallet_network_wallet(keyring, logger)
try:
stdout, _, returncode = Popen.run([
@@ -793,8 +874,9 @@ def _get_kwallet_password(browser_keyring_name, logger):
# checks hasEntry. To verify this:
# dbus-monitor "interface='org.kde.KWallet'" "type=method_return"
# while starting chrome.
- # this may be a bug as the intended behaviour is to generate a random password and store
- # it, but that doesn't matter here.
+ # this was identified as a bug later and fixed in
+ # https://chromium.googlesource.com/chromium/src/+/bbd54702284caca1f92d656fdcadf2ccca6f4165%5E%21/#F0
+ # https://chromium.googlesource.com/chromium/src/+/5463af3c39d7f5b6d11db7fbd51e38cc1974d764
return b''
else:
logger.debug('password found')
@@ -832,8 +914,8 @@ def _get_linux_keyring_password(browser_keyring_name, keyring, logger):
keyring = _LinuxKeyring[keyring] if keyring else _choose_linux_keyring(logger)
logger.debug(f'Chosen keyring: {keyring.name}')
- if keyring == _LinuxKeyring.KWALLET:
- return _get_kwallet_password(browser_keyring_name, logger)
+ if keyring in (_LinuxKeyring.KWALLET, _LinuxKeyring.KWALLET5, _LinuxKeyring.KWALLET6):
+ return _get_kwallet_password(browser_keyring_name, keyring, logger)
elif keyring == _LinuxKeyring.GNOMEKEYRING:
return _get_gnome_keyring_password(browser_keyring_name, logger)
elif keyring == _LinuxKeyring.BASICTEXT:
@@ -861,6 +943,10 @@ def _get_mac_keyring_password(browser_keyring_name, logger):
def _get_windows_v10_key(browser_root, logger):
+ """
+ References:
+ - [1] https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/os_crypt_win.cc
+ """
path = _find_most_recently_used_file(browser_root, 'Local State', logger)
if path is None:
logger.error('could not find local state file')
@@ -869,11 +955,13 @@ def _get_windows_v10_key(browser_root, logger):
with open(path, encoding='utf8') as f:
data = json.load(f)
try:
+ # kOsCryptEncryptedKeyPrefName in [1]
base64_key = data['os_crypt']['encrypted_key']
except KeyError:
logger.error('no encrypted key in Local State')
return None
encrypted_key = base64.b64decode(base64_key)
+ # kDPAPIKeyPrefix in [1]
prefix = b'DPAPI'
if not encrypted_key.startswith(prefix):
logger.error('invalid key')
@@ -885,13 +973,15 @@ def pbkdf2_sha1(password, salt, iterations, key_length):
return pbkdf2_hmac('sha1', password, salt, iterations, key_length)
-def _decrypt_aes_cbc(ciphertext, key, logger, initialization_vector=b' ' * 16):
- plaintext = unpad_pkcs7(aes_cbc_decrypt_bytes(ciphertext, key, initialization_vector))
- try:
- return plaintext.decode()
- except UnicodeDecodeError:
- logger.warning('failed to decrypt cookie (AES-CBC) because UTF-8 decoding failed. Possibly the key is wrong?', only_once=True)
- return None
+def _decrypt_aes_cbc_multi(ciphertext, keys, logger, initialization_vector=b' ' * 16):
+ for key in keys:
+ plaintext = unpad_pkcs7(aes_cbc_decrypt_bytes(ciphertext, key, initialization_vector))
+ try:
+ return plaintext.decode()
+ except UnicodeDecodeError:
+ pass
+ logger.warning('failed to decrypt cookie (AES-CBC) because UTF-8 decoding failed. Possibly the key is wrong?', only_once=True)
+ return None
def _decrypt_aes_gcm(ciphertext, key, nonce, authentication_tag, logger):
@@ -1085,3 +1175,150 @@ class LenientSimpleCookie(http.cookies.SimpleCookie):
else:
morsel = None
+
+
+class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
+ """
+ See [1] for cookie file format.
+
+ 1. https://curl.haxx.se/docs/http-cookies.html
+ """
+ _HTTPONLY_PREFIX = '#HttpOnly_'
+ _ENTRY_LEN = 7
+ _HEADER = '''# Netscape HTTP Cookie File
+# This file is generated by hypervideo. Do not edit.
+
+'''
+ _CookieFileEntry = collections.namedtuple(
+ 'CookieFileEntry',
+ ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))
+
+ def __init__(self, filename=None, *args, **kwargs):
+ super().__init__(None, *args, **kwargs)
+ if is_path_like(filename):
+ filename = os.fspath(filename)
+ self.filename = filename
+
+ @staticmethod
+ def _true_or_false(cndn):
+ return 'TRUE' if cndn else 'FALSE'
+
+ @contextlib.contextmanager
+ def open(self, file, *, write=False):
+ if is_path_like(file):
+ with open(file, 'w' if write else 'r', encoding='utf-8') as f:
+ yield f
+ else:
+ if write:
+ file.truncate(0)
+ yield file
+
+ def _really_save(self, f, ignore_discard, ignore_expires):
+ now = time.time()
+ for cookie in self:
+ if (not ignore_discard and cookie.discard
+ or not ignore_expires and cookie.is_expired(now)):
+ continue
+ name, value = cookie.name, cookie.value
+ if value is None:
+ # cookies.txt regards 'Set-Cookie: foo' as a cookie
+ # with no name, whereas http.cookiejar regards it as a
+ # cookie with no value.
+ name, value = '', name
+ f.write('%s\n' % '\t'.join((
+ cookie.domain,
+ self._true_or_false(cookie.domain.startswith('.')),
+ cookie.path,
+ self._true_or_false(cookie.secure),
+ str_or_none(cookie.expires, default=''),
+ name, value
+ )))
+
+ def save(self, filename=None, ignore_discard=True, ignore_expires=True):
+ """
+ Save cookies to a file.
+ Code is taken from CPython 3.6
+ https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """
+
+ if filename is None:
+ if self.filename is not None:
+ filename = self.filename
+ else:
+ raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)
+
+ # Store session cookies with `expires` set to 0 instead of an empty string
+ for cookie in self:
+ if cookie.expires is None:
+ cookie.expires = 0
+
+ with self.open(filename, write=True) as f:
+ f.write(self._HEADER)
+ self._really_save(f, ignore_discard, ignore_expires)
+
+ def load(self, filename=None, ignore_discard=True, ignore_expires=True):
+ """Load cookies from a file."""
+ if filename is None:
+ if self.filename is not None:
+ filename = self.filename
+ else:
+ raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)
+
+ def prepare_line(line):
+ if line.startswith(self._HTTPONLY_PREFIX):
+ line = line[len(self._HTTPONLY_PREFIX):]
+ # comments and empty lines are fine
+ if line.startswith('#') or not line.strip():
+ return line
+ cookie_list = line.split('\t')
+ if len(cookie_list) != self._ENTRY_LEN:
+ raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
+ cookie = self._CookieFileEntry(*cookie_list)
+ if cookie.expires_at and not cookie.expires_at.isdigit():
+ raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
+ return line
+
+ cf = io.StringIO()
+ with self.open(filename) as f:
+ for line in f:
+ try:
+ cf.write(prepare_line(line))
+ except http.cookiejar.LoadError as e:
+ if f'{line.strip()} '[0] in '[{"':
+ raise http.cookiejar.LoadError(
+ 'Cookies file must be Netscape formatted, not JSON. See '
+ 'https://github.com/hypervideo/hypervideo/wiki/FAQ#how-do-i-pass-cookies-to-hypervideo')
+ write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
+ continue
+ cf.seek(0)
+ self._really_load(cf, filename, ignore_discard, ignore_expires)
+        # Session cookies are denoted by either the `expires` field set to
+        # an empty string or 0. MozillaCookieJar only recognizes the former
+        # (see [1]), so we need to force the latter to be recognized as
+        # session cookies on our own.
+        # Session cookies may be important for cookie-based authentication,
+        # e.g. when a user does not check the 'Remember me' box while logging
+        # in on a site, some important cookies are stored as session cookies,
+        # so failing to recognize them results in a failed login.
+ # 1. https://bugs.python.org/issue17164
+ for cookie in self:
+ # Treat `expires=0` cookies as session cookies
+ if cookie.expires == 0:
+ cookie.expires = None
+ cookie.discard = True
+
+ def get_cookie_header(self, url):
+ """Generate a Cookie HTTP header for a given url"""
+ cookie_req = urllib.request.Request(normalize_url(sanitize_url(url)))
+ self.add_cookie_header(cookie_req)
+ return cookie_req.get_header('Cookie')
+
+ def get_cookies_for_url(self, url):
+ """Generate a list of Cookie objects for a given url"""
+ # Policy `_now` attribute must be set before calling `_cookies_for_request`
+ # Ref: https://github.com/python/cpython/blob/3.7/Lib/http/cookiejar.py#L1360
+ self._policy._now = self._now = int(time.time())
+ return self._cookies_for_request(urllib.request.Request(normalize_url(sanitize_url(url))))
+
+ def clear(self, *args, **kwargs):
+ with contextlib.suppress(KeyError):
+ return super().clear(*args, **kwargs)
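
As a rough usage sketch (not part of the patch, and assuming `hypervideo_dl.cookies` exposes the class added above), the new jar can be exercised like this:

    # Minimal sketch: load a Netscape-format cookies file and build a
    # Cookie header for a URL. 'cookies.txt' is a placeholder path.
    from hypervideo_dl.cookies import YoutubeDLCookieJar

    jar = YoutubeDLCookieJar('cookies.txt')
    jar.load()  # raises http.cookiejar.LoadError for JSON/invalid input
    print(jar.get_cookie_header('https://example.com/video'))  # None if no match
    jar.save()  # session cookies are written with `expires` set to 0
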
diff --git a/hypervideo_dl/dependencies/Cryptodome.py b/hypervideo_dl/dependencies/Cryptodome.py
new file mode 100644
index 0000000..592827c
--- /dev/null
+++ b/hypervideo_dl/dependencies/Cryptodome.py
@@ -0,0 +1,38 @@
+from ..compat.compat_utils import passthrough_module
+
+try:
+ import Cryptodome as _parent
+except ImportError:
+ try:
+ import Crypto as _parent
+ except (ImportError, SyntaxError): # Old Crypto gives SyntaxError in newer Python
+ _parent = passthrough_module(__name__, 'no_Cryptodome')
+ __bool__ = lambda: False
+
+del passthrough_module
+
+__version__ = ''
+AES = PKCS1_v1_5 = Blowfish = PKCS1_OAEP = SHA1 = CMAC = RSA = None
+try:
+ if _parent.__name__ == 'Cryptodome':
+ from Cryptodome import __version__
+ from Cryptodome.Cipher import AES, PKCS1_OAEP, Blowfish, PKCS1_v1_5
+ from Cryptodome.Hash import CMAC, SHA1
+ from Cryptodome.PublicKey import RSA
+ elif _parent.__name__ == 'Crypto':
+ from Crypto import __version__
+ from Crypto.Cipher import AES, PKCS1_OAEP, Blowfish, PKCS1_v1_5 # noqa: F401
+ from Crypto.Hash import CMAC, SHA1 # noqa: F401
+ from Crypto.PublicKey import RSA # noqa: F401
+except ImportError:
+ __version__ = f'broken {__version__}'.strip()
+
+
+_hypervideo_dl__identifier = _parent.__name__
+if AES and _hypervideo_dl__identifier == 'Crypto':
+ try:
+ # In pycrypto, mode defaults to ECB. See:
+ # https://www.pycryptodome.org/en/latest/src/vs_pycrypto.html#:~:text=not%20have%20ECB%20as%20default%20mode
+ AES.new(b'abcdefghijklmnop')
+ except TypeError:
+ _hypervideo_dl__identifier = 'pycrypto'
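
A short consumption sketch (an assumption, not in the patch): callers are expected to feature-check the resolved module rather than import PyCryptodome directly.

    from hypervideo_dl.dependencies import Cryptodome

    if Cryptodome.AES:  # None when neither Cryptodome nor Crypto could be imported
        cipher = Cryptodome.AES.new(b'0123456789abcdef', Cryptodome.AES.MODE_ECB)
        print(Cryptodome.__version__)
    else:
        print('AES decryption unavailable; install pycryptodomex')
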
diff --git a/hypervideo_dl/dependencies/__init__.py b/hypervideo_dl/dependencies/__init__.py
new file mode 100644
index 0000000..126dd88
--- /dev/null
+++ b/hypervideo_dl/dependencies/__init__.py
@@ -0,0 +1,83 @@
+# flake8: noqa: F401
+"""Imports all optional dependencies for the project.
+An attribute "_hypervideo_dl__identifier" may be inserted into the module if it uses an ambiguous namespace"""
+
+try:
+ import brotlicffi as brotli
+except ImportError:
+ try:
+ import brotli
+ except ImportError:
+ brotli = None
+
+
+try:
+ import certifi
+except ImportError:
+ certifi = None
+else:
+ from os.path import exists as _path_exists
+
+    # The certificate may not be bundled in the executable
+ if not _path_exists(certifi.where()):
+ certifi = None
+
+
+try:
+ import mutagen
+except ImportError:
+ mutagen = None
+
+
+secretstorage = None
+try:
+ import secretstorage
+ _SECRETSTORAGE_UNAVAILABLE_REASON = None
+except ImportError:
+ _SECRETSTORAGE_UNAVAILABLE_REASON = (
+ 'as the `secretstorage` module is not installed. '
+ 'Please install by running `python3 -m pip install secretstorage`')
+except Exception as _err:
+ _SECRETSTORAGE_UNAVAILABLE_REASON = f'as the `secretstorage` module could not be initialized. {_err}'
+
+
+try:
+ import sqlite3
+except ImportError:
+    # although sqlite3 is part of the standard library, it is possible to compile Python without
+    # sqlite support. See: https://github.com/hypervideo/hypervideo/issues/544
+ sqlite3 = None
+
+
+try:
+ import websockets
+except (ImportError, SyntaxError):
+ # websockets 3.10 on python 3.6 causes SyntaxError
+ # See https://github.com/hypervideo/hypervideo/issues/2633
+ websockets = None
+
+
+try:
+ import xattr # xattr or pyxattr
+except ImportError:
+ xattr = None
+else:
+ if hasattr(xattr, 'set'): # pyxattr
+ xattr._hypervideo_dl__identifier = 'pyxattr'
+
+
+from . import Cryptodome
+
+all_dependencies = {k: v for k, v in globals().items() if not k.startswith('_')}
+available_dependencies = {k: v for k, v in all_dependencies.items() if v}
+
+
+# Deprecated
+Cryptodome_AES = Cryptodome.AES
+
+
+__all__ = [
+ 'all_dependencies',
+ 'available_dependencies',
+ *all_dependencies.keys(),
+]
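
A diagnostics sketch (hypothetical, not in the patch) showing how the two registries built above can be consumed; this mirrors the kind of listing that verbose output is assembled from:

    from hypervideo_dl import dependencies

    for name, module in dependencies.all_dependencies.items():
        print(f'{name}: {"available" if module else "missing"}')
    print('usable:', ', '.join(dependencies.available_dependencies))
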
diff --git a/hypervideo_dl/downloader/__init__.py b/hypervideo_dl/downloader/__init__.py
index c34dbce..51a9f28 100644
--- a/hypervideo_dl/downloader/__init__.py
+++ b/hypervideo_dl/downloader/__init__.py
@@ -30,7 +30,7 @@ from .hls import HlsFD
from .http import HttpFD
from .ism import IsmFD
from .mhtml import MhtmlFD
-from .niconico import NiconicoDmcFD
+from .niconico import NiconicoDmcFD, NiconicoLiveFD
from .rtmp import RtmpFD
from .rtsp import RtspFD
from .websocket import WebSocketFragmentFD
@@ -50,6 +50,7 @@ PROTOCOL_MAP = {
'ism': IsmFD,
'mhtml': MhtmlFD,
'niconico_dmc': NiconicoDmcFD,
+ 'niconico_live': NiconicoLiveFD,
'fc2_live': FC2LiveFD,
'websocket_frag': WebSocketFragmentFD,
'youtube_live_chat': YoutubeLiveChatFD,
diff --git a/hypervideo_dl/downloader/common.py b/hypervideo_dl/downloader/common.py
index 72d4822..93c46cf 100644
--- a/hypervideo_dl/downloader/common.py
+++ b/hypervideo_dl/downloader/common.py
@@ -20,6 +20,7 @@ from ..utils import (
RetryManager,
classproperty,
decodeArgument,
+ deprecation_warning,
encodeFilename,
format_bytes,
join_nonempty,
@@ -48,10 +49,10 @@ class FileDownloader:
verbose: Print additional info to stdout.
quiet: Do not print messages to stdout.
ratelimit: Download speed limit, in bytes/sec.
- continuedl: Attempt to continue downloads if possible
throttledratelimit: Assume the download is being throttled below this speed (bytes/sec)
- retries: Number of times to retry for HTTP error 5xx
- file_access_retries: Number of times to retry on file access error
+ retries: Number of times to retry for expected network errors.
+ Default is 0 for API, but 10 for CLI
+ file_access_retries: Number of times to retry on file access error (default: 3)
buffersize: Size of download buffer in bytes.
noresizebuffer: Do not automatically resize the download buffer.
continuedl: Try to continue downloads if possible.
@@ -137,17 +138,21 @@ class FileDownloader:
def format_percent(percent):
return ' N/A%' if percent is None else f'{percent:>5.1f}%'
- @staticmethod
- def calc_eta(start, now, total, current):
+ @classmethod
+ def calc_eta(cls, start_or_rate, now_or_remaining, total=NO_DEFAULT, current=NO_DEFAULT):
+ if total is NO_DEFAULT:
+ rate, remaining = start_or_rate, now_or_remaining
+ if None in (rate, remaining):
+ return None
+ return int(float(remaining) / rate)
+
+ start, now = start_or_rate, now_or_remaining
if total is None:
return None
if now is None:
now = time.time()
- dif = now - start
- if current == 0 or dif < 0.001: # One millisecond
- return None
- rate = float(current) / dif
- return int((float(total) - float(current)) / rate)
+ rate = cls.calc_speed(start, now, current)
+ return rate and int((float(total) - float(current)) / rate)
@staticmethod
def calc_speed(start, now, bytes):
@@ -165,6 +170,12 @@ class FileDownloader:
return 'inf' if retries == float('inf') else int(retries)
@staticmethod
+ def filesize_or_none(unencoded_filename):
+ if os.path.isfile(unencoded_filename):
+ return os.path.getsize(unencoded_filename)
+ return 0
+
+ @staticmethod
def best_block_size(elapsed_time, bytes):
new_min = max(bytes / 2.0, 1.0)
new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
@@ -180,7 +191,9 @@ class FileDownloader:
@staticmethod
def parse_bytes(bytestr):
"""Parse a string indicating a byte quantity into an integer."""
- parse_bytes(bytestr)
+ deprecation_warning('hypervideo_dl.FileDownloader.parse_bytes is deprecated and '
+ 'may be removed in the future. Use hypervideo_dl.utils.parse_bytes instead')
+ return parse_bytes(bytestr)
def slow_down(self, start_time, now, byte_counter):
"""Sleep if the download speed is over the rate limit."""
@@ -222,7 +235,7 @@ class FileDownloader:
sleep_func=fd.params.get('retry_sleep_functions', {}).get('file_access'))
def wrapper(self, func, *args, **kwargs):
- for retry in RetryManager(self.params.get('file_access_retries'), error_callback, fd=self):
+ for retry in RetryManager(self.params.get('file_access_retries', 3), error_callback, fd=self):
try:
return func(self, *args, **kwargs)
except OSError as err:
@@ -242,7 +255,8 @@ class FileDownloader:
@wrap_file_access('remove')
def try_remove(self, filename):
- os.remove(filename)
+ if os.path.isfile(filename):
+ os.remove(filename)
@wrap_file_access('rename')
def try_rename(self, old_filename, new_filename):
@@ -282,7 +296,8 @@ class FileDownloader:
self._multiline = BreaklineStatusPrinter(self.ydl._out_files.out, lines)
else:
self._multiline = MultilinePrinter(self.ydl._out_files.out, lines, not self.params.get('quiet'))
- self._multiline.allow_colors = self._multiline._HAVE_FULLCAP and not self.params.get('no_color')
+ self._multiline.allow_colors = self.ydl._allow_colors.out and self.ydl._allow_colors.out != 'no_color'
+ self._multiline._HAVE_FULLCAP = self.ydl._allow_colors.out
def _finish_multiline_status(self):
self._multiline.end()
@@ -404,7 +419,6 @@ class FileDownloader:
"""Download to a filename using the info from info_dict
Return True on success and False otherwise
"""
-
nooverwrites_and_exists = (
not self.params.get('overwrites', True)
and os.path.exists(encodeFilename(filename))
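
The reworked `calc_eta` above is now a classmethod with two call shapes. A sketch with made-up numbers (assuming the import path shown in the diff header):

    from hypervideo_dl.downloader.common import FileDownloader

    # New 2-argument form: (rate in B/s, remaining bytes)
    print(FileDownloader.calc_eta(1024, 10240))            # -> 10
    # Original 4-argument form: (start, now, total, current)
    print(FileDownloader.calc_eta(0.0, 5.0, 10240, 5120))  # -> 5
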
diff --git a/hypervideo_dl/downloader/external.py b/hypervideo_dl/downloader/external.py
index 75257a7..c751541 100644
--- a/hypervideo_dl/downloader/external.py
+++ b/hypervideo_dl/downloader/external.py
@@ -1,12 +1,16 @@
import enum
-import os.path
+import json
+import os
import re
import subprocess
import sys
+import tempfile
import time
+import uuid
from .fragment import FragmentFD
from ..compat import functools
+from ..networking import Request
from ..postprocessor.ffmpeg import EXT_TO_OUT_FORMATS, FFmpegPostProcessor
from ..utils import (
Popen,
@@ -20,7 +24,7 @@ from ..utils import (
determine_ext,
encodeArgument,
encodeFilename,
- handle_youtubedl_headers,
+ find_available_port,
remove_end,
traverse_obj,
)
@@ -39,6 +43,7 @@ class ExternalFD(FragmentFD):
def real_download(self, filename, info_dict):
self.report_destination(filename)
tmpfilename = self.temp_name(filename)
+ self._cookies_tempfile = None
try:
started = time.time()
@@ -51,6 +56,9 @@ class ExternalFD(FragmentFD):
# should take place
retval = 0
self.to_screen('[%s] Interrupted by user' % self.get_basename())
+ finally:
+ if self._cookies_tempfile:
+ self.try_remove(self._cookies_tempfile)
if retval == 0:
status = {
@@ -60,7 +68,6 @@ class ExternalFD(FragmentFD):
}
if filename != '-':
fsize = os.path.getsize(encodeFilename(tmpfilename))
- self.to_screen(f'\r[{self.get_basename()}] Downloaded {fsize} bytes')
self.try_rename(tmpfilename, filename)
status.update({
'downloaded_bytes': fsize,
@@ -101,6 +108,7 @@ class ExternalFD(FragmentFD):
return all((
not info_dict.get('to_stdout') or Features.TO_STDOUT in cls.SUPPORTED_FEATURES,
'+' not in info_dict['protocol'] or Features.MULTIPLE_FORMATS in cls.SUPPORTED_FEATURES,
+ not traverse_obj(info_dict, ('hls_aes', ...), 'extra_param_to_segment_url'),
all(proto in cls.SUPPORTED_PROTOCOLS for proto in info_dict['protocol'].split('+')),
))
@@ -122,6 +130,16 @@ class ExternalFD(FragmentFD):
self.get_basename(), self.params.get('external_downloader_args'), self.EXE_NAME,
keys, *args, **kwargs)
+ def _write_cookies(self):
+ if not self.ydl.cookiejar.filename:
+ tmp_cookies = tempfile.NamedTemporaryFile(suffix='.cookies', delete=False)
+ tmp_cookies.close()
+ self._cookies_tempfile = tmp_cookies.name
+ self.to_screen(f'[download] Writing temporary cookies file to "{self._cookies_tempfile}"')
+ # real_download resets _cookies_tempfile; if it's None then save() will write to cookiejar.filename
+ self.ydl.cookiejar.save(self._cookies_tempfile)
+ return self.ydl.cookiejar.filename or self._cookies_tempfile
+
def _call_downloader(self, tmpfilename, info_dict):
""" Either overwrite this or implement _make_cmd """
cmd = [encodeArgument(a) for a in self._make_cmd(tmpfilename, info_dict)]
@@ -129,8 +147,7 @@ class ExternalFD(FragmentFD):
self._debug_cmd(cmd)
if 'fragments' not in info_dict:
- _, stderr, returncode = Popen.run(
- cmd, text=True, stderr=subprocess.PIPE if self._CAPTURE_STDERR else None)
+ _, stderr, returncode = self._call_process(cmd, info_dict)
if returncode and stderr:
self.to_stderr(stderr)
return returncode
@@ -140,7 +157,7 @@ class ExternalFD(FragmentFD):
retry_manager = RetryManager(self.params.get('fragment_retries'), self.report_retry,
frag_index=None, fatal=not skip_unavailable_fragments)
for retry in retry_manager:
- _, stderr, returncode = Popen.run(cmd, text=True, stderr=subprocess.PIPE)
+ _, stderr, returncode = self._call_process(cmd, info_dict)
if not returncode:
break
# TODO: Decide whether to retry based on error code
@@ -172,6 +189,9 @@ class ExternalFD(FragmentFD):
self.try_remove(encodeFilename('%s.frag.urls' % tmpfilename))
return 0
+ def _call_process(self, cmd, info_dict):
+ return Popen.run(cmd, text=True, stderr=subprocess.PIPE if self._CAPTURE_STDERR else None)
+
class CurlFD(ExternalFD):
AVAILABLE_OPT = '-V'
@@ -179,6 +199,9 @@ class CurlFD(ExternalFD):
def _make_cmd(self, tmpfilename, info_dict):
cmd = [self.exe, '--location', '-o', tmpfilename, '--compressed']
+ cookie_header = self.ydl.cookiejar.get_cookie_header(info_dict['url'])
+ if cookie_header:
+ cmd += ['--cookie', cookie_header]
if info_dict.get('http_headers') is not None:
for key, val in info_dict['http_headers'].items():
cmd += ['--header', f'{key}: {val}']
@@ -209,6 +232,9 @@ class AxelFD(ExternalFD):
if info_dict.get('http_headers') is not None:
for key, val in info_dict['http_headers'].items():
cmd += ['-H', f'{key}: {val}']
+ cookie_header = self.ydl.cookiejar.get_cookie_header(info_dict['url'])
+ if cookie_header:
+ cmd += ['-H', f'Cookie: {cookie_header}', '--max-redirect=0']
cmd += self._configuration_args()
cmd += ['--', info_dict['url']]
return cmd
@@ -218,7 +244,9 @@ class WgetFD(ExternalFD):
AVAILABLE_OPT = '--version'
def _make_cmd(self, tmpfilename, info_dict):
- cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies', '--compression=auto']
+ cmd = [self.exe, '-O', tmpfilename, '-nv', '--compression=auto']
+ if self.ydl.cookiejar.get_cookie_header(info_dict['url']):
+ cmd += ['--load-cookies', self._write_cookies()]
if info_dict.get('http_headers') is not None:
for key, val in info_dict['http_headers'].items():
cmd += ['--header', f'{key}: {val}']
@@ -256,8 +284,17 @@ class Aria2cFD(ExternalFD):
def _aria2c_filename(fn):
return fn if os.path.isabs(fn) else f'.{os.path.sep}{fn}'
+ def _call_downloader(self, tmpfilename, info_dict):
+ # FIXME: Disabled due to https://github.com/hypervideo/hypervideo/issues/5931
+ if False and 'no-external-downloader-progress' not in self.params.get('compat_opts', []):
+ info_dict['__rpc'] = {
+ 'port': find_available_port() or 19190,
+ 'secret': str(uuid.uuid4()),
+ }
+ return super()._call_downloader(tmpfilename, info_dict)
+
def _make_cmd(self, tmpfilename, info_dict):
- cmd = [self.exe, '-c',
+ cmd = [self.exe, '-c', '--no-conf',
'--console-log-level=warn', '--summary-interval=0', '--download-result=hide',
'--http-accept-gzip=true', '--file-allocation=none', '-x16', '-j16', '-s16']
if 'fragments' in info_dict:
@@ -265,6 +302,8 @@ class Aria2cFD(ExternalFD):
else:
cmd += ['--min-split-size', '1M']
+ if self.ydl.cookiejar.get_cookie_header(info_dict['url']):
+ cmd += [f'--load-cookies={self._write_cookies()}']
if info_dict.get('http_headers') is not None:
for key, val in info_dict['http_headers'].items():
cmd += ['--header', f'{key}: {val}']
@@ -276,6 +315,12 @@ class Aria2cFD(ExternalFD):
cmd += self._bool_option('--show-console-readout', 'noprogress', 'false', 'true', '=')
cmd += self._configuration_args()
+ if '__rpc' in info_dict:
+ cmd += [
+ '--enable-rpc',
+ f'--rpc-listen-port={info_dict["__rpc"]["port"]}',
+ f'--rpc-secret={info_dict["__rpc"]["secret"]}']
+
# aria2c strips out spaces from the beginning/end of filenames and paths.
# We work around this issue by adding a "./" to the beginning of the
# filename and relative path, and adding a "/" at the end of the path.
@@ -304,6 +349,87 @@ class Aria2cFD(ExternalFD):
cmd += ['--', info_dict['url']]
return cmd
+ def aria2c_rpc(self, rpc_port, rpc_secret, method, params=()):
+        # Does not actually need to be a UUID, just unique
+ sanitycheck = str(uuid.uuid4())
+ d = json.dumps({
+ 'jsonrpc': '2.0',
+ 'id': sanitycheck,
+ 'method': method,
+ 'params': [f'token:{rpc_secret}', *params],
+ }).encode('utf-8')
+ request = Request(
+ f'http://localhost:{rpc_port}/jsonrpc',
+ data=d, headers={
+ 'Content-Type': 'application/json',
+ 'Content-Length': f'{len(d)}',
+ }, proxies={'all': None})
+ with self.ydl.urlopen(request) as r:
+ resp = json.load(r)
+ assert resp.get('id') == sanitycheck, 'Something went wrong with RPC server'
+ return resp['result']
+
+ def _call_process(self, cmd, info_dict):
+ if '__rpc' not in info_dict:
+ return super()._call_process(cmd, info_dict)
+
+ send_rpc = functools.partial(self.aria2c_rpc, info_dict['__rpc']['port'], info_dict['__rpc']['secret'])
+ started = time.time()
+
+ fragmented = 'fragments' in info_dict
+ frag_count = len(info_dict['fragments']) if fragmented else 1
+ status = {
+ 'filename': info_dict.get('_filename'),
+ 'status': 'downloading',
+ 'elapsed': 0,
+ 'downloaded_bytes': 0,
+ 'fragment_count': frag_count if fragmented else None,
+ 'fragment_index': 0 if fragmented else None,
+ }
+ self._hook_progress(status, info_dict)
+
+ def get_stat(key, *obj, average=False):
+ val = tuple(filter(None, map(float, traverse_obj(obj, (..., ..., key))))) or [0]
+ return sum(val) / (len(val) if average else 1)
+
+ with Popen(cmd, text=True, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) as p:
+            # Add a small sleep so that the RPC client can receive the response;
+            # otherwise the connection stalls indefinitely
+ time.sleep(0.2)
+ retval = p.poll()
+ while retval is None:
+ # We don't use tellStatus as we won't know the GID without reading stdout
+ # Ref: https://aria2.github.io/manual/en/html/aria2c.html#aria2.tellActive
+ active = send_rpc('aria2.tellActive')
+ completed = send_rpc('aria2.tellStopped', [0, frag_count])
+
+ downloaded = get_stat('totalLength', completed) + get_stat('completedLength', active)
+ speed = get_stat('downloadSpeed', active)
+ total = frag_count * get_stat('totalLength', active, completed, average=True)
+ if total < downloaded:
+ total = None
+
+ status.update({
+ 'downloaded_bytes': int(downloaded),
+ 'speed': speed,
+ 'total_bytes': None if fragmented else total,
+ 'total_bytes_estimate': total,
+                'eta': None if total is None else (total - downloaded) / (speed or 1),
+ 'fragment_index': min(frag_count, len(completed) + 1) if fragmented else None,
+ 'elapsed': time.time() - started
+ })
+ self._hook_progress(status, info_dict)
+
+ if not active and len(completed) >= frag_count:
+ send_rpc('aria2.shutdown')
+ retval = p.wait()
+ break
+
+ time.sleep(0.1)
+ retval = p.poll()
+
+ return '', p.stderr.read(), retval
+
class HttpieFD(ExternalFD):
AVAILABLE_OPT = '--version'
@@ -315,6 +441,14 @@ class HttpieFD(ExternalFD):
if info_dict.get('http_headers') is not None:
for key, val in info_dict['http_headers'].items():
cmd += [f'{key}:{val}']
+
+ # httpie 3.1.0+ removes the Cookie header on redirect, so this should be safe for now. [1]
+ # If we ever need cookie handling for redirects, we can export the cookiejar into a session. [2]
+ # 1: https://github.com/httpie/httpie/security/advisories/GHSA-9w4w-cpc8-h2fq
+ # 2: https://httpie.io/docs/cli/sessions
+ cookie_header = self.ydl.cookiejar.get_cookie_header(info_dict['url'])
+ if cookie_header:
+ cmd += [f'Cookie:{cookie_header}']
return cmd
@@ -342,7 +476,6 @@ class FFmpegFD(ExternalFD):
and cls.can_download(info_dict))
def _call_downloader(self, tmpfilename, info_dict):
- urls = [f['url'] for f in info_dict.get('requested_formats', [])] or [info_dict['url']]
ffpp = FFmpegPostProcessor(downloader=self)
if not ffpp.available:
self.report_error('m3u8 download detected but ffmpeg could not be found. Please install')
@@ -372,16 +505,6 @@ class FFmpegFD(ExternalFD):
# http://trac.ffmpeg.org/ticket/6125#comment:10
args += ['-seekable', '1' if seekable else '0']
- http_headers = None
- if info_dict.get('http_headers'):
- youtubedl_headers = handle_youtubedl_headers(info_dict['http_headers'])
- http_headers = [
- # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv:
- # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header.
- '-headers',
- ''.join(f'{key}: {val}\r\n' for key, val in youtubedl_headers.items())
- ]
-
env = None
proxy = self.params.get('proxy')
if proxy:
@@ -434,21 +557,31 @@ class FFmpegFD(ExternalFD):
start_time, end_time = info_dict.get('section_start') or 0, info_dict.get('section_end')
- for i, url in enumerate(urls):
- if http_headers is not None and re.match(r'^https?://', url):
- args += http_headers
+ selected_formats = info_dict.get('requested_formats') or [info_dict]
+ for i, fmt in enumerate(selected_formats):
+ is_http = re.match(r'^https?://', fmt['url'])
+ cookies = self.ydl.cookiejar.get_cookies_for_url(fmt['url']) if is_http else []
+ if cookies:
+ args.extend(['-cookies', ''.join(
+ f'{cookie.name}={cookie.value}; path={cookie.path}; domain={cookie.domain};\r\n'
+ for cookie in cookies)])
+ if fmt.get('http_headers') and is_http:
+ # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv:
+ # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header.
+ args.extend(['-headers', ''.join(f'{key}: {val}\r\n' for key, val in fmt['http_headers'].items())])
+
if start_time:
args += ['-ss', str(start_time)]
if end_time:
args += ['-t', str(end_time - start_time)]
- args += self._configuration_args((f'_i{i + 1}', '_i')) + ['-i', url]
+ args += self._configuration_args((f'_i{i + 1}', '_i')) + ['-i', fmt['url']]
if not (start_time or end_time) or not self.params.get('force_keyframes_at_cuts'):
args += ['-c', 'copy']
if info_dict.get('requested_formats') or protocol == 'http_dash_segments':
- for (i, fmt) in enumerate(info_dict.get('requested_formats') or [info_dict]):
+ for i, fmt in enumerate(selected_formats):
stream_number = fmt.get('manifest_stream_number', 0)
args.extend(['-map', f'{i}:{stream_number}'])
@@ -488,8 +621,9 @@ class FFmpegFD(ExternalFD):
args.append(encodeFilename(ffpp._ffmpeg_filename_argument(tmpfilename), True))
self._debug_cmd(args)
+ piped = any(fmt['url'] in ('-', 'pipe:') for fmt in selected_formats)
with Popen(args, stdin=subprocess.PIPE, env=env) as proc:
- if url in ('-', 'pipe:'):
+ if piped:
self.on_process_started(proc, proc.stdin)
try:
retval = proc.wait()
@@ -499,7 +633,7 @@ class FFmpegFD(ExternalFD):
# produces a file that is playable (this is mostly useful for live
# streams). Note that Windows is not affected and produces playable
# files (see https://github.com/ytdl-org/youtube-dl/issues/8300).
- if isinstance(e, KeyboardInterrupt) and sys.platform != 'win32' and url not in ('-', 'pipe:'):
+ if isinstance(e, KeyboardInterrupt) and sys.platform != 'win32' and not piped:
proc.communicate_or_kill(b'q')
else:
proc.kill(timeout=None)
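
The `aria2c_rpc` helper above polls aria2c over JSON-RPC with a `token:`-prefixed secret. A standalone sketch of the same call pattern using only the standard library (port and secret are placeholders; aria2c must be running with `--enable-rpc --rpc-secret=...`):

    import json
    import urllib.request
    import uuid

    def aria2c_rpc(port, secret, method, params=()):
        check = str(uuid.uuid4())  # request id, echoed back by the server
        payload = json.dumps({
            'jsonrpc': '2.0',
            'id': check,
            'method': method,
            'params': [f'token:{secret}', *params],
        }).encode()
        request = urllib.request.Request(
            f'http://localhost:{port}/jsonrpc', data=payload,
            headers={'Content-Type': 'application/json'})
        with urllib.request.urlopen(request) as response:
            resp = json.load(response)
        assert resp.get('id') == check, 'mismatched JSON-RPC response'
        return resp['result']

    # e.g. aria2c_rpc(6800, 'SECRET', 'aria2.tellActive')
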
diff --git a/hypervideo_dl/downloader/f4m.py b/hypervideo_dl/downloader/f4m.py
index 306f921..28cbba0 100644
--- a/hypervideo_dl/downloader/f4m.py
+++ b/hypervideo_dl/downloader/f4m.py
@@ -3,11 +3,11 @@ import io
import itertools
import struct
import time
-import urllib.error
import urllib.parse
from .fragment import FragmentFD
from ..compat import compat_etree_fromstring
+from ..networking.exceptions import HTTPError
from ..utils import fix_xml_ampersands, xpath_text
@@ -312,7 +312,7 @@ class F4mFD(FragmentFD):
self.to_screen('[%s] Downloading f4m manifest' % self.FD_NAME)
urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url))
- man_url = urlh.geturl()
+ man_url = urlh.url
# Some manifests may be malformed, e.g. prosiebensat1 generated manifests
# (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244
# and https://github.com/ytdl-org/youtube-dl/issues/7823)
@@ -407,8 +407,8 @@ class F4mFD(FragmentFD):
if box_type == b'mdat':
self._append_fragment(ctx, box_data)
break
- except urllib.error.HTTPError as err:
- if live and (err.code == 404 or err.code == 410):
+ except HTTPError as err:
+ if live and (err.status == 404 or err.status == 410):
# We didn't keep up with the live window. Continue
# with the next available fragment.
msg = 'Fragment %d unavailable' % frag_i
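
This file follows the migration that recurs throughout this patch: `urllib.error.HTTPError`/`err.code` is replaced by the in-tree `..networking.exceptions.HTTPError`/`err.status`. A sketch of the new catch pattern (assuming the networking layer mirrors yt-dlp's):

    from hypervideo_dl.networking.exceptions import HTTPError

    def fetch_fragment(ydl, url):
        try:
            return ydl.urlopen(url).read()
        except HTTPError as err:
            if err.status in (404, 410):  # fell out of the live window
                return None  # caller moves on to the next fragment
            raise
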
diff --git a/hypervideo_dl/downloader/fragment.py b/hypervideo_dl/downloader/fragment.py
index e61bd0e..fa97923 100644
--- a/hypervideo_dl/downloader/fragment.py
+++ b/hypervideo_dl/downloader/fragment.py
@@ -1,24 +1,19 @@
import concurrent.futures
import contextlib
-import http.client
import json
import math
import os
import struct
import time
-import urllib.error
from .common import FileDownloader
from .http import HttpFD
from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7
from ..compat import compat_os_name
-from ..utils import (
- DownloadError,
- RetryManager,
- encodeFilename,
- sanitized_Request,
- traverse_obj,
-)
+from ..networking import Request
+from ..networking.exceptions import HTTPError, IncompleteRead
+from ..utils import DownloadError, RetryManager, encodeFilename, traverse_obj
+from ..utils.networking import HTTPHeaderDict
class HttpQuietDownloader(HttpFD):
@@ -34,8 +29,8 @@ class FragmentFD(FileDownloader):
Available options:
- fragment_retries: Number of times to retry a fragment for HTTP error (DASH
- and hlsnative only)
+ fragment_retries: Number of times to retry a fragment for HTTP error
+ (DASH and hlsnative only). Default is 0 for API, but 10 for CLI
skip_unavailable_fragments:
Skip unavailable fragments (DASH and hlsnative only)
keep_fragments: Keep downloaded fragments on disk after downloading is
@@ -75,7 +70,7 @@ class FragmentFD(FileDownloader):
def _prepare_url(self, info_dict, url):
headers = info_dict.get('http_headers')
- return sanitized_Request(url, None, headers) if headers else url
+ return Request(url, None, headers) if headers else url
def _prepare_and_start_frag_download(self, ctx, info_dict):
self._prepare_frag_download(ctx)
@@ -121,6 +116,11 @@ class FragmentFD(FileDownloader):
'request_data': request_data,
'ctx_id': ctx.get('ctx_id'),
}
+ frag_resume_len = 0
+ if ctx['dl'].params.get('continuedl', True):
+ frag_resume_len = self.filesize_or_none(self.temp_name(fragment_filename))
+ fragment_info_dict['frag_resume_len'] = ctx['frag_resume_len'] = frag_resume_len
+
success, _ = ctx['dl'].download(fragment_filename, fragment_info_dict)
if not success:
return False
@@ -155,9 +155,7 @@ class FragmentFD(FileDownloader):
del ctx['fragment_filename_sanitized']
def _prepare_frag_download(self, ctx):
- if 'live' not in ctx:
- ctx['live'] = False
- if not ctx['live']:
+ if not ctx.setdefault('live', False):
total_frags_str = '%d' % ctx['total_frags']
ad_frags = ctx.get('ad_frags', 0)
if ad_frags:
@@ -170,15 +168,17 @@ class FragmentFD(FileDownloader):
**self.params,
'noprogress': True,
'test': False,
+ 'sleep_interval': 0,
+ 'max_sleep_interval': 0,
+ 'sleep_interval_subtitles': 0,
})
tmpfilename = self.temp_name(ctx['filename'])
open_mode = 'wb'
- resume_len = 0
# Establish possible resume length
- if os.path.isfile(encodeFilename(tmpfilename)):
+ resume_len = self.filesize_or_none(tmpfilename)
+ if resume_len > 0:
open_mode = 'ab'
- resume_len = os.path.getsize(encodeFilename(tmpfilename))
# Should be initialized before ytdl file check
ctx.update({
@@ -187,7 +187,9 @@ class FragmentFD(FileDownloader):
})
if self.__do_ytdl_file(ctx):
- if os.path.isfile(encodeFilename(self.ytdl_filename(ctx['filename']))):
+ ytdl_file_exists = os.path.isfile(encodeFilename(self.ytdl_filename(ctx['filename'])))
+ continuedl = self.params.get('continuedl', True)
+ if continuedl and ytdl_file_exists:
self._read_ytdl_file(ctx)
is_corrupt = ctx.get('ytdl_corrupt') is True
is_inconsistent = ctx['fragment_index'] > 0 and resume_len == 0
@@ -201,7 +203,12 @@ class FragmentFD(FileDownloader):
if 'ytdl_corrupt' in ctx:
del ctx['ytdl_corrupt']
self._write_ytdl_file(ctx)
+
else:
+ if not continuedl:
+ if ytdl_file_exists:
+ self._read_ytdl_file(ctx)
+ ctx['fragment_index'] = resume_len = 0
self._write_ytdl_file(ctx)
assert ctx['fragment_index'] == 0
@@ -274,12 +281,10 @@ class FragmentFD(FileDownloader):
else:
frag_downloaded_bytes = s['downloaded_bytes']
state['downloaded_bytes'] += frag_downloaded_bytes - ctx['prev_frag_downloaded_bytes']
- if not ctx['live']:
- state['eta'] = self.calc_eta(
- start, time_now, estimated_size - resume_len,
- state['downloaded_bytes'] - resume_len)
ctx['speed'] = state['speed'] = self.calc_speed(
- ctx['fragment_started'], time_now, frag_downloaded_bytes)
+ ctx['fragment_started'], time_now, frag_downloaded_bytes - ctx.get('frag_resume_len', 0))
+ if not ctx['live']:
+ state['eta'] = self.calc_eta(state['speed'], estimated_size - state['downloaded_bytes'])
ctx['prev_frag_downloaded_bytes'] = frag_downloaded_bytes
self._hook_progress(state, info_dict)
@@ -290,14 +295,12 @@ class FragmentFD(FileDownloader):
def _finish_frag_download(self, ctx, info_dict):
ctx['dest_stream'].close()
if self.__do_ytdl_file(ctx):
- ytdl_filename = encodeFilename(self.ytdl_filename(ctx['filename']))
- if os.path.isfile(ytdl_filename):
- self.try_remove(ytdl_filename)
+ self.try_remove(self.ytdl_filename(ctx['filename']))
elapsed = time.time() - ctx['started']
to_file = ctx['tmpfilename'] != '-'
if to_file:
- downloaded_bytes = os.path.getsize(encodeFilename(ctx['tmpfilename']))
+ downloaded_bytes = self.filesize_or_none(ctx['tmpfilename'])
else:
downloaded_bytes = ctx['complete_frags_downloaded_bytes']
@@ -360,7 +363,8 @@ class FragmentFD(FileDownloader):
if not decrypt_info or decrypt_info['METHOD'] != 'AES-128':
return frag_content
iv = decrypt_info.get('IV') or struct.pack('>8xq', fragment['media_sequence'])
- decrypt_info['KEY'] = decrypt_info.get('KEY') or _get_key(info_dict.get('_decryption_key_url') or decrypt_info['URI'])
+ decrypt_info['KEY'] = (decrypt_info.get('KEY')
+ or _get_key(traverse_obj(info_dict, ('hls_aes', 'uri')) or decrypt_info['URI']))
# Don't decrypt the content in tests since the data is explicitly truncated and it's not to a valid block
# size (see https://github.com/ytdl-org/youtube-dl/pull/27660). Tests only care that the correct data downloaded,
# not what it decrypts to.
@@ -382,7 +386,7 @@ class FragmentFD(FileDownloader):
max_workers = self.params.get('concurrent_fragment_downloads', 1)
if max_progress > 1:
self._prepare_multiline_status(max_progress)
- is_live = any(traverse_obj(args, (..., 2, 'is_live'), default=[]))
+ is_live = any(traverse_obj(args, (..., 2, 'is_live')))
def thread_func(idx, ctx, fragments, info_dict, tpe):
ctx['max_progress'] = max_progress
@@ -448,7 +452,7 @@ class FragmentFD(FileDownloader):
frag_index = ctx['fragment_index'] = fragment['frag_index']
ctx['last_error'] = None
- headers = info_dict.get('http_headers', {}).copy()
+ headers = HTTPHeaderDict(info_dict.get('http_headers'))
byte_range = fragment.get('byte_range')
if byte_range:
headers['Range'] = 'bytes=%d-%d' % (byte_range['start'], byte_range['end'] - 1)
@@ -465,9 +469,10 @@ class FragmentFD(FileDownloader):
for retry in RetryManager(self.params.get('fragment_retries'), error_callback):
try:
ctx['fragment_count'] = fragment.get('fragment_count')
- if not self._download_fragment(ctx, fragment['url'], info_dict, headers):
+ if not self._download_fragment(
+ ctx, fragment['url'], info_dict, headers, info_dict.get('request_data')):
return
- except (urllib.error.HTTPError, http.client.IncompleteRead) as err:
+ except (HTTPError, IncompleteRead) as err:
retry.error = err
continue
except DownloadError: # has own retry settings
@@ -495,7 +500,7 @@ class FragmentFD(FileDownloader):
download_fragment(fragment, ctx_copy)
return fragment, fragment['frag_index'], ctx_copy.get('fragment_filename_sanitized')
- self.report_warning('The download speed shown is only of one thread. This is a known issue and patches are welcome')
+ self.report_warning('The download speed shown is only of one thread. This is a known issue')
with tpe or concurrent.futures.ThreadPoolExecutor(max_workers) as pool:
try:
for fragment, frag_index, frag_filename in pool.map(_download_fragment, fragments):
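
Fragment headers are now merged through `HTTPHeaderDict` instead of `dict.copy()`/`update()`. A sketch of the semantics this relies on (an assumption based on yt-dlp's implementation: case-insensitive keys, later arguments win, `None` arguments are ignored):

    from hypervideo_dl.utils.networking import HTTPHeaderDict

    base = {'User-Agent': 'hypervideo', 'accept': '*/*'}
    headers = HTTPHeaderDict(base, {'Range': 'bytes=0-1023'}, None)
    headers['ACCEPT'] = 'video/mp4'  # overrides 'accept' case-insensitively
    print(dict(headers))
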
diff --git a/hypervideo_dl/downloader/hls.py b/hypervideo_dl/downloader/hls.py
index 4520edc..2ea9a1e 100644
--- a/hypervideo_dl/downloader/hls.py
+++ b/hypervideo_dl/downloader/hls.py
@@ -7,8 +7,15 @@ from . import get_suitable_downloader
from .external import FFmpegFD
from .fragment import FragmentFD
from .. import webvtt
-from ..dependencies import Cryptodome_AES
-from ..utils import bug_reports_message, parse_m3u8_attributes, update_url_query
+from ..dependencies import Cryptodome
+from ..utils import (
+ bug_reports_message,
+ parse_m3u8_attributes,
+ remove_start,
+ traverse_obj,
+ update_url_query,
+ urljoin,
+)
class HlsFD(FragmentFD):
@@ -21,7 +28,16 @@ class HlsFD(FragmentFD):
FD_NAME = 'hlsnative'
@staticmethod
- def can_download(manifest, info_dict, allow_unplayable_formats=False):
+ def _has_drm(manifest): # TODO: https://github.com/hypervideo/hypervideo/pull/5039
+ return bool(re.search('|'.join((
+ r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay
+ r'#EXT-X-(?:SESSION-)?KEY:.*?KEYFORMAT="com\.apple\.streamingkeydelivery"', # Apple FairPlay
+ r'#EXT-X-(?:SESSION-)?KEY:.*?KEYFORMAT="com\.microsoft\.playready"', # Microsoft PlayReady
+ r'#EXT-X-FAXS-CM:', # Adobe Flash Access
+ )), manifest))
+
+ @classmethod
+ def can_download(cls, manifest, info_dict, allow_unplayable_formats=False):
UNSUPPORTED_FEATURES = [
# r'#EXT-X-BYTERANGE', # playlists composed of byte ranges of media files [2]
@@ -43,13 +59,15 @@ class HlsFD(FragmentFD):
]
if not allow_unplayable_formats:
UNSUPPORTED_FEATURES += [
- r'#EXT-X-KEY:METHOD=(?!NONE|AES-128)', # encrypted streams [1]
+ r'#EXT-X-KEY:METHOD=(?!NONE|AES-128)', # encrypted streams [1], but not necessarily DRM
]
def check_results():
yield not info_dict.get('is_live')
for feature in UNSUPPORTED_FEATURES:
yield not re.search(feature, manifest)
+ if not allow_unplayable_formats:
+ yield not cls._has_drm(manifest)
return all(check_results())
def real_download(self, filename, info_dict):
@@ -57,13 +75,13 @@ class HlsFD(FragmentFD):
self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME)
urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url))
- man_url = urlh.geturl()
+ man_url = urlh.url
s = urlh.read().decode('utf-8', 'ignore')
can_download, message = self.can_download(s, info_dict, self.params.get('allow_unplayable_formats')), None
if can_download:
has_ffmpeg = FFmpegFD.available()
- no_crypto = not Cryptodome_AES and '#EXT-X-KEY:METHOD=AES-128' in s
+ no_crypto = not Cryptodome.AES and '#EXT-X-KEY:METHOD=AES-128' in s
if no_crypto and has_ffmpeg:
can_download, message = False, 'The stream has AES-128 encryption and pycryptodome is not available'
elif no_crypto:
@@ -74,14 +92,13 @@ class HlsFD(FragmentFD):
message = ('Live HLS streams are not supported by the native downloader. If this is a livestream, '
f'please {install_ffmpeg}add "--downloader ffmpeg --hls-use-mpegts" to your command')
if not can_download:
- has_drm = re.search('|'.join([
- r'#EXT-X-FAXS-CM:', # Adobe Flash Access
- r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay
- ]), s)
- if has_drm and not self.params.get('allow_unplayable_formats'):
- self.report_error(
- 'This video is DRM protected; Try selecting another format with --format or '
- 'add --check-formats to automatically fallback to the next best format')
+ if self._has_drm(s) and not self.params.get('allow_unplayable_formats'):
+ if info_dict.get('has_drm') and self.params.get('test'):
+ self.to_screen(f'[{self.FD_NAME}] This format is DRM protected', skip_eol=True)
+ else:
+ self.report_error(
+ 'This format is DRM protected; Try selecting another format with --format or '
+ 'add --check-formats to automatically fallback to the next best format', tb=False)
return False
message = message or 'Unsupported features have been detected'
fd = FFmpegFD(self.ydl, self.params)
@@ -150,6 +167,13 @@ class HlsFD(FragmentFD):
i = 0
media_sequence = 0
decrypt_info = {'METHOD': 'NONE'}
+ external_aes_key = traverse_obj(info_dict, ('hls_aes', 'key'))
+ if external_aes_key:
+ external_aes_key = binascii.unhexlify(remove_start(external_aes_key, '0x'))
+ assert len(external_aes_key) in (16, 24, 32), 'Invalid length for HLS AES-128 key'
+ external_aes_iv = traverse_obj(info_dict, ('hls_aes', 'iv'))
+ if external_aes_iv:
+ external_aes_iv = binascii.unhexlify(remove_start(external_aes_iv, '0x').zfill(32))
byte_range = {}
discontinuity_count = 0
frag_index = 0
@@ -165,10 +189,7 @@ class HlsFD(FragmentFD):
frag_index += 1
if frag_index <= ctx['fragment_index']:
continue
- frag_url = (
- line
- if re.match(r'^https?://', line)
- else urllib.parse.urljoin(man_url, line))
+ frag_url = urljoin(man_url, line)
if extra_query:
frag_url = update_url_query(frag_url, extra_query)
@@ -190,10 +211,7 @@ class HlsFD(FragmentFD):
return False
frag_index += 1
map_info = parse_m3u8_attributes(line[11:])
- frag_url = (
- map_info.get('URI')
- if re.match(r'^https?://', map_info.get('URI'))
- else urllib.parse.urljoin(man_url, map_info.get('URI')))
+ frag_url = urljoin(man_url, map_info.get('URI'))
if extra_query:
frag_url = update_url_query(frag_url, extra_query)
@@ -218,15 +236,18 @@ class HlsFD(FragmentFD):
decrypt_url = decrypt_info.get('URI')
decrypt_info = parse_m3u8_attributes(line[11:])
if decrypt_info['METHOD'] == 'AES-128':
- if 'IV' in decrypt_info:
+ if external_aes_iv:
+ decrypt_info['IV'] = external_aes_iv
+ elif 'IV' in decrypt_info:
decrypt_info['IV'] = binascii.unhexlify(decrypt_info['IV'][2:].zfill(32))
- if not re.match(r'^https?://', decrypt_info['URI']):
- decrypt_info['URI'] = urllib.parse.urljoin(
- man_url, decrypt_info['URI'])
- if extra_query:
- decrypt_info['URI'] = update_url_query(decrypt_info['URI'], extra_query)
- if decrypt_url != decrypt_info['URI']:
- decrypt_info['KEY'] = None
+ if external_aes_key:
+ decrypt_info['KEY'] = external_aes_key
+ else:
+ decrypt_info['URI'] = urljoin(man_url, decrypt_info['URI'])
+ if extra_query:
+ decrypt_info['URI'] = update_url_query(decrypt_info['URI'], extra_query)
+ if decrypt_url != decrypt_info['URI']:
+ decrypt_info['KEY'] = None
elif line.startswith('#EXT-X-MEDIA-SEQUENCE'):
media_sequence = int(line[22:])
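
The out-of-band HLS AES material accepted above (`info_dict['hls_aes']`) is normalized from hex strings. A sketch of that normalization with made-up values:

    import binascii

    from hypervideo_dl.utils import remove_start

    key = binascii.unhexlify(remove_start('0x00112233445566778899aabbccddeeff', '0x'))
    iv = binascii.unhexlify(remove_start('0x2a', '0x').zfill(32))  # left-pad to 16 bytes
    assert len(key) in (16, 24, 32) and len(iv) == 16
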
diff --git a/hypervideo_dl/downloader/http.py b/hypervideo_dl/downloader/http.py
index 95c870e..65579be 100644
--- a/hypervideo_dl/downloader/http.py
+++ b/hypervideo_dl/downloader/http.py
@@ -1,12 +1,14 @@
-import http.client
import os
import random
-import socket
-import ssl
import time
-import urllib.error
from .common import FileDownloader
+from ..networking import Request
+from ..networking.exceptions import (
+ CertificateVerifyError,
+ HTTPError,
+ TransportError,
+)
from ..utils import (
ContentTooShortError,
RetryManager,
@@ -16,18 +18,10 @@ from ..utils import (
encodeFilename,
int_or_none,
parse_http_range,
- sanitized_Request,
try_call,
write_xattr,
)
-
-RESPONSE_READ_EXCEPTIONS = (
- TimeoutError,
- socket.timeout, # compat: py < 3.10
- ConnectionError,
- ssl.SSLError,
- http.client.HTTPException
-)
+from ..utils.networking import HTTPHeaderDict
class HttpFD(FileDownloader):
@@ -45,11 +39,8 @@ class HttpFD(FileDownloader):
ctx.tmpfilename = self.temp_name(filename)
ctx.stream = None
- # Do not include the Accept-Encoding header
- headers = {'Youtubedl-no-compression': 'True'}
- add_headers = info_dict.get('http_headers')
- if add_headers:
- headers.update(add_headers)
+ # Disable compression
+ headers = HTTPHeaderDict({'Accept-Encoding': 'identity'}, info_dict.get('http_headers'))
is_test = self.params.get('test', False)
chunk_size = self._TEST_FILE_SIZE if is_test else (
@@ -120,10 +111,10 @@ class HttpFD(FileDownloader):
if try_call(lambda: range_end >= ctx.content_len):
range_end = ctx.content_len - 1
- request = sanitized_Request(url, request_data, headers)
+ request = Request(url, request_data, headers)
has_range = range_start is not None
if has_range:
- request.add_header('Range', f'bytes={int(range_start)}-{int_or_none(range_end) or ""}')
+ request.headers['Range'] = f'bytes={int(range_start)}-{int_or_none(range_end) or ""}'
# Establish connection
try:
ctx.data = self.ydl.urlopen(request)
@@ -150,20 +141,21 @@ class HttpFD(FileDownloader):
# Content-Range is either not present or invalid. Assuming remote webserver is
# trying to send the whole file, resume is not possible, so wiping the local file
# and performing entire redownload
- self.report_unable_to_resume()
+ elif range_start > 0:
+ self.report_unable_to_resume()
ctx.resume_len = 0
ctx.open_mode = 'wb'
- ctx.data_len = ctx.content_len = int_or_none(ctx.data.info().get('Content-length', None))
- except urllib.error.HTTPError as err:
- if err.code == 416:
+ ctx.data_len = ctx.content_len = int_or_none(ctx.data.headers.get('Content-length', None))
+ except HTTPError as err:
+ if err.status == 416:
# Unable to resume (requested range not satisfiable)
try:
# Open the connection again without the range header
ctx.data = self.ydl.urlopen(
- sanitized_Request(url, request_data, headers))
- content_length = ctx.data.info()['Content-Length']
- except urllib.error.HTTPError as err:
- if err.code < 500 or err.code >= 600:
+ Request(url, request_data, headers))
+ content_length = ctx.data.headers['Content-Length']
+ except HTTPError as err:
+ if err.status < 500 or err.status >= 600:
raise
else:
# Examine the reported length
@@ -191,17 +183,13 @@ class HttpFD(FileDownloader):
ctx.resume_len = 0
ctx.open_mode = 'wb'
return
- elif err.code < 500 or err.code >= 600:
+ elif err.status < 500 or err.status >= 600:
# Unexpected HTTP error
raise
raise RetryDownload(err)
- except urllib.error.URLError as err:
- if isinstance(err.reason, ssl.CertificateError):
- raise
- raise RetryDownload(err)
- # In urllib.request.AbstractHTTPHandler, the response is partially read on request.
- # Any errors that occur during this will not be wrapped by URLError
- except RESPONSE_READ_EXCEPTIONS as err:
+ except CertificateVerifyError:
+ raise
+ except TransportError as err:
raise RetryDownload(err)
def close_stream():
@@ -211,7 +199,12 @@ class HttpFD(FileDownloader):
ctx.stream = None
def download():
- data_len = ctx.data.info().get('Content-length', None)
+ data_len = ctx.data.headers.get('Content-length')
+
+ if ctx.data.headers.get('Content-encoding'):
+                # A Content-Encoding header is present; Content-Length is no longer reliable,
+                # since we do automatic decompression. (See: https://github.com/hypervideo/hypervideo/pull/6176)
+ data_len = None
# Range HTTP header may be ignored/unsupported by a webserver
# (e.g. extractor/scivee.py, extractor/bambuser.py).
@@ -252,7 +245,7 @@ class HttpFD(FileDownloader):
try:
# Download and write
data_block = ctx.data.read(block_size if not is_test else min(block_size, data_len - byte_counter))
- except RESPONSE_READ_EXCEPTIONS as err:
+ except TransportError as err:
retry(err)
byte_counter += len(data_block)
@@ -333,15 +326,15 @@ class HttpFD(FileDownloader):
elif speed:
ctx.throttle_start = None
- if not is_test and ctx.chunk_size and ctx.content_len is not None and byte_counter < ctx.content_len:
- ctx.resume_len = byte_counter
- # ctx.block_size = block_size
- raise NextFragment()
-
if ctx.stream is None:
self.to_stderr('\n')
self.report_error('Did not get any data blocks')
return False
+
+ if not is_test and ctx.chunk_size and ctx.content_len is not None and byte_counter < ctx.content_len:
+ ctx.resume_len = byte_counter
+ raise NextFragment()
+
if ctx.tmpfilename != '-':
ctx.stream.close()
@@ -353,7 +346,7 @@ class HttpFD(FileDownloader):
# Update file modification time
if self.params.get('updatetime', True):
- info_dict['filetime'] = self.try_utime(ctx.filename, ctx.data.info().get('last-modified', None))
+ info_dict['filetime'] = self.try_utime(ctx.filename, ctx.data.headers.get('last-modified', None))
self._hook_progress({
'downloaded_bytes': byte_counter,
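
Two request-building changes in this file are worth a sketch (assuming `Request` accepts `(url, data, headers)` positionally, as used above): compression is now disabled with a real `Accept-Encoding: identity` header instead of the legacy `Youtubedl-no-compression` pseudo-header, and the `Range` resume header is set via the mutable `headers` mapping:

    from hypervideo_dl.networking import Request
    from hypervideo_dl.utils.networking import HTTPHeaderDict

    headers = HTTPHeaderDict({'Accept-Encoding': 'identity'}, {'Referer': 'https://example.com/'})
    request = Request('https://example.com/video.mp4', None, headers)
    request.headers['Range'] = 'bytes=1048576-'  # resume from the first MiB
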
diff --git a/hypervideo_dl/downloader/ism.py b/hypervideo_dl/downloader/ism.py
index a157a8a..dd688f5 100644
--- a/hypervideo_dl/downloader/ism.py
+++ b/hypervideo_dl/downloader/ism.py
@@ -2,9 +2,9 @@ import binascii
import io
import struct
import time
-import urllib.error
from .fragment import FragmentFD
+from ..networking.exceptions import HTTPError
from ..utils import RetryManager
u8 = struct.Struct('>B')
@@ -271,7 +271,7 @@ class IsmFD(FragmentFD):
write_piff_header(ctx['dest_stream'], info_dict['_download_params'])
extra_state['ism_track_written'] = True
self._append_fragment(ctx, frag_content)
- except urllib.error.HTTPError as err:
+ except HTTPError as err:
retry.error = err
continue
diff --git a/hypervideo_dl/downloader/niconico.py b/hypervideo_dl/downloader/niconico.py
index 77ed39e..5720f6e 100644
--- a/hypervideo_dl/downloader/niconico.py
+++ b/hypervideo_dl/downloader/niconico.py
@@ -1,8 +1,12 @@
+import json
import threading
+import time
from . import get_suitable_downloader
from .common import FileDownloader
-from ..utils import sanitized_Request
+from .external import FFmpegFD
+from ..networking import Request
+from ..utils import DownloadError, WebSocketsWrapper, str_or_none, try_get
class NiconicoDmcFD(FileDownloader):
@@ -24,7 +28,7 @@ class NiconicoDmcFD(FileDownloader):
heartbeat_data = heartbeat_info_dict['data'].encode()
heartbeat_interval = heartbeat_info_dict.get('interval', 30)
- request = sanitized_Request(heartbeat_url, heartbeat_data)
+ request = Request(heartbeat_url, heartbeat_data)
def heartbeat():
try:
@@ -50,3 +54,93 @@ class NiconicoDmcFD(FileDownloader):
timer[0].cancel()
download_complete = True
return success
+
+
+class NiconicoLiveFD(FileDownloader):
+ """ Downloads niconico live without being stopped """
+
+ def real_download(self, filename, info_dict):
+ video_id = info_dict['video_id']
+ ws_url = info_dict['url']
+ ws_extractor = info_dict['ws']
+ ws_origin_host = info_dict['origin']
+ cookies = info_dict.get('cookies')
+ live_quality = info_dict.get('live_quality', 'high')
+ live_latency = info_dict.get('live_latency', 'high')
+ dl = FFmpegFD(self.ydl, self.params or {})
+
+ new_info_dict = info_dict.copy()
+ new_info_dict.update({
+ 'protocol': 'm3u8',
+ })
+
+ def communicate_ws(reconnect):
+ if reconnect:
+ ws = WebSocketsWrapper(ws_url, {
+ 'Cookies': str_or_none(cookies) or '',
+ 'Origin': f'https://{ws_origin_host}',
+ 'Accept': '*/*',
+ 'User-Agent': self.params['http_headers']['User-Agent'],
+ })
+ if self.ydl.params.get('verbose', False):
+ self.to_screen('[debug] Sending startWatching request')
+ ws.send(json.dumps({
+ 'type': 'startWatching',
+ 'data': {
+ 'stream': {
+ 'quality': live_quality,
+ 'protocol': 'hls+fmp4',
+ 'latency': live_latency,
+ 'chasePlay': False
+ },
+ 'room': {
+ 'protocol': 'webSocket',
+ 'commentable': True
+ },
+ 'reconnect': True,
+ }
+ }))
+ else:
+ ws = ws_extractor
+ with ws:
+ while True:
+ recv = ws.recv()
+ if not recv:
+ continue
+ data = json.loads(recv)
+ if not data or not isinstance(data, dict):
+ continue
+ if data.get('type') == 'ping':
+ # pong back
+ ws.send(r'{"type":"pong"}')
+ ws.send(r'{"type":"keepSeat"}')
+ elif data.get('type') == 'disconnect':
+ self.write_debug(data)
+ return True
+ elif data.get('type') == 'error':
+ self.write_debug(data)
+ message = try_get(data, lambda x: x['body']['code'], str) or recv
+                        raise DownloadError(message)
+ elif self.ydl.params.get('verbose', False):
+ if len(recv) > 100:
+ recv = recv[:100] + '...'
+ self.to_screen('[debug] Server said: %s' % recv)
+
+ def ws_main():
+ reconnect = False
+ while True:
+ try:
+ ret = communicate_ws(reconnect)
+ if ret is True:
+ return
+ except BaseException as e:
+                    self.to_screen('[%s] %s: Connection error occurred, reconnecting after 10 seconds: %s' % ('niconico:live', video_id, str_or_none(e)))
+ time.sleep(10)
+ continue
+ finally:
+ reconnect = True
+
+ thread = threading.Thread(target=ws_main, daemon=True)
+ thread.start()
+
+ return dl.download(filename, new_info_dict)
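
NiconicoLiveFD splits the work in two: the media itself is fetched by FFmpegFD over the m3u8 protocol, while a daemon thread keeps the websocket session alive so the server does not cut the stream. A stripped-down sketch of that keepalive loop, assuming only an object with blocking send()/recv(); the message types are the ones communicate_ws() handles above:

    import json

    def keepalive_loop(ws):
        # Answer server pings (and re-claim the seat) until the server
        # signals a disconnect; errors propagate to a caller that reconnects.
        while True:
            msg = ws.recv()
            if not msg:
                continue
            data = json.loads(msg)
            if data.get('type') == 'ping':
                ws.send(json.dumps({'type': 'pong'}))
                ws.send(json.dumps({'type': 'keepSeat'}))
            elif data.get('type') == 'disconnect':
                return
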
diff --git a/hypervideo_dl/downloader/youtube_live_chat.py b/hypervideo_dl/downloader/youtube_live_chat.py
index dfd290a..b610490 100644
--- a/hypervideo_dl/downloader/youtube_live_chat.py
+++ b/hypervideo_dl/downloader/youtube_live_chat.py
@@ -1,8 +1,8 @@
import json
import time
-import urllib.error
from .fragment import FragmentFD
+from ..networking.exceptions import HTTPError
from ..utils import (
RegexNotFoundError,
RetryManager,
@@ -10,6 +10,7 @@ from ..utils import (
int_or_none,
try_get,
)
+from ..utils.networking import HTTPHeaderDict
class YoutubeLiveChatFD(FragmentFD):
@@ -37,10 +38,7 @@ class YoutubeLiveChatFD(FragmentFD):
start_time = int(time.time() * 1000)
def dl_fragment(url, data=None, headers=None):
- http_headers = info_dict.get('http_headers', {})
- if headers:
- http_headers = http_headers.copy()
- http_headers.update(headers)
+ http_headers = HTTPHeaderDict(info_dict.get('http_headers'), headers)
return self._download_fragment(ctx, url, info_dict, http_headers, data)
def parse_actions_replay(live_chat_continuation):
@@ -129,7 +127,7 @@ class YoutubeLiveChatFD(FragmentFD):
or frag_index == 1 and try_refresh_replay_beginning
or parse_actions_replay)
return (True, *func(live_chat_continuation))
- except urllib.error.HTTPError as err:
+ except HTTPError as err:
retry.error = err
continue
return False, None, None, None
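
The dl_fragment() rewrite above folds four lines of copy-and-update into one HTTPHeaderDict(...) call: the class accepts several mappings, merges them left to right, and ignores None arguments, so a missing info_dict['http_headers'] and missing per-call headers both work without special-casing. A naive stand-in for just that merge behaviour (the real class in ..utils.networking is also case-insensitive on lookup):

    class NaiveHeaderDict(dict):
        # Later mappings override earlier ones; None arguments are skipped.
        # Title-casing keys gives a crude form of case-insensitivity.
        def __init__(self, *dicts):
            super().__init__()
            for d in dicts:
                if d:
                    self.update({k.title(): v for k, v in d.items()})

    merged = NaiveHeaderDict({'user-agent': 'base'}, None, {'User-Agent': 'override'})
    assert merged == {'User-Agent': 'override'}
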
diff --git a/hypervideo_dl/extractor/_extractors.py b/hypervideo_dl/extractor/_extractors.py
index 2fe15f6..f11554b 100644
--- a/hypervideo_dl/extractor/_extractors.py
+++ b/hypervideo_dl/extractor/_extractors.py
@@ -15,13 +15,13 @@ from .youtube import ( # Youtube is moved to the top to improve performance
YoutubeSearchURLIE,
YoutubeMusicSearchURLIE,
YoutubeSubscriptionsIE,
- YoutubeStoriesIE,
YoutubeTruncatedIDIE,
YoutubeTruncatedURLIE,
YoutubeYtBeIE,
YoutubeYtUserIE,
YoutubeWatchLaterIE,
- YoutubeShortsAudioPivotIE
+ YoutubeShortsAudioPivotIE,
+ YoutubeConsentRedirectIE,
)
from .abc import (
@@ -78,6 +78,8 @@ from .agora import (
WyborczaVideoIE,
)
from .airmozilla import AirMozillaIE
+from .airtv import AirTVIE
+from .aitube import AitubeKZVideoIE
from .aljazeera import AlJazeeraIE
from .alphaporno import AlphaPornoIE
from .amara import AmaraIE
@@ -86,7 +88,10 @@ from .alura import (
AluraCourseIE
)
from .amcnetworks import AMCNetworksIE
-from .amazon import AmazonStoreIE
+from .amazon import (
+ AmazonStoreIE,
+ AmazonReviewsIE,
+)
from .amazonminitv import (
AmazonMiniTVIE,
AmazonMiniTVSeasonIE,
@@ -96,6 +101,7 @@ from .americastestkitchen import (
AmericasTestKitchenIE,
AmericasTestKitchenSeasonIE,
)
+from .anchorfm import AnchorFMEpisodeIE
from .angel import AngelIE
from .anvato import AnvatoIE
from .aol import AolIE
@@ -116,6 +122,7 @@ from .applepodcasts import ApplePodcastsIE
from .archiveorg import (
ArchiveOrgIE,
YoutubeWebArchiveIE,
+ VLiveWebArchiveIE,
)
from .arcpublishing import ArcPublishingIE
from .arkena import ArkenaIE
@@ -183,6 +190,10 @@ from .bbc import (
from .beeg import BeegIE
from .behindkink import BehindKinkIE
from .bellmedia import BellMediaIE
+from .beatbump import (
+ BeatBumpVideoIE,
+ BeatBumpPlaylistIE,
+)
from .beatport import BeatportIE
from .berufetv import BerufeTVIE
from .bet import BetIE
@@ -192,13 +203,18 @@ from .bfmtv import (
BFMTVLiveIE,
BFMTVArticleIE,
)
-from .bibeltv import BibelTVIE
+from .bibeltv import (
+ BibelTVLiveIE,
+ BibelTVSeriesIE,
+ BibelTVVideoIE,
+)
from .bigflix import BigflixIE
from .bigo import BigoIE
from .bild import BildIE
from .bilibili import (
BiliBiliIE,
BiliBiliBangumiIE,
+ BiliBiliBangumiSeasonIE,
BiliBiliBangumiMediaIE,
BiliBiliSearchIE,
BilibiliCategoryIE,
@@ -227,19 +243,28 @@ from .bleacherreport import (
BleacherReportIE,
BleacherReportCMSIE,
)
+from .blerp import BlerpIE
from .blogger import BloggerIE
from .bloomberg import BloombergIE
from .bokecc import BokeCCIE
from .bongacams import BongaCamsIE
from .bostonglobe import BostonGlobeIE
from .box import BoxIE
-from .booyah import BooyahClipsIE
+from .boxcast import BoxCastVideoIE
from .bpb import BpbIE
from .br import (
BRIE,
BRMediathekIE,
)
from .bravotv import BravoTVIE
+from .brainpop import (
+ BrainPOPIE,
+ BrainPOPJrIE,
+ BrainPOPELLIE,
+ BrainPOPEspIE,
+ BrainPOPFrIE,
+ BrainPOPIlIE,
+)
from .breakcom import BreakIE
from .breitbart import BreitBartIE
from .brightcove import (
@@ -259,6 +284,10 @@ from .camdemy import (
CamdemyIE,
CamdemyFolderIE
)
+from .camfm import (
+ CamFMEpisodeIE,
+ CamFMShowIE
+)
from .cammodels import CamModelsIE
from .camsoda import CamsodaIE
from .camtasia import CamtasiaEmbedIE
@@ -266,12 +295,6 @@ from .camwithher import CamWithHerIE
from .canalalpha import CanalAlphaIE
from .canalplus import CanalplusIE
from .canalc2 import Canalc2IE
-from .canvas import (
- CanvasIE,
- CanvasEenIE,
- VrtNUIE,
- DagelijkseKostIE,
-)
from .carambatv import (
CarambaTVIE,
CarambaTVPageIE,
@@ -280,19 +303,23 @@ from .cartoonnetwork import CartoonNetworkIE
from .cbc import (
CBCIE,
CBCPlayerIE,
+ CBCPlayerPlaylistIE,
CBCGemIE,
CBCGemPlaylistIE,
CBCGemLiveIE,
)
-from .cbs import CBSIE
-from .cbslocal import (
- CBSLocalIE,
- CBSLocalArticleIE,
+from .cbs import (
+ CBSIE,
+ ParamountPressExpressIE,
)
from .cbsinteractive import CBSInteractiveIE
from .cbsnews import (
CBSNewsEmbedIE,
CBSNewsIE,
+ CBSLocalIE,
+ CBSLocalArticleIE,
+ CBSLocalLiveIE,
+ CBSNewsLiveIE,
CBSNewsLiveVideoIE,
)
from .cbssports import (
@@ -331,6 +358,7 @@ from .ciscolive import (
)
from .ciscowebex import CiscoWebexIE
from .cjsw import CJSWIE
+from .clipchamp import ClipchampIE
from .cliphunter import CliphunterIE
from .clippit import ClippitIE
from .cliprs import ClipRsIE
@@ -378,9 +406,12 @@ from .crowdbunker import (
CrowdBunkerIE,
CrowdBunkerChannelIE,
)
+from .crtvg import CrtvgIE
from .crunchyroll import (
CrunchyrollBetaIE,
CrunchyrollBetaShowIE,
+ CrunchyrollMusicIE,
+ CrunchyrollArtistIE,
)
from .cspan import CSpanIE, CSpanCongressIE
from .ctsnews import CtsNewsIE
@@ -397,6 +428,10 @@ from .cybrary import (
CybraryIE,
CybraryCourseIE
)
+from .dacast import (
+ DacastVODIE,
+ DacastPlaylistIE,
+)
from .daftsex import DaftsexIE
from .dailymail import DailyMailIE
from .dailymotion import (
@@ -427,6 +462,10 @@ from .deezer import (
)
from .democracynow import DemocracynowIE
from .detik import DetikEmbedIE
+from .dlf import (
+ DLFIE,
+ DLFCorpusIE,
+)
from .dfb import DFBIE
from .dhm import DHMIE
from .digg import DiggIE
@@ -459,6 +498,7 @@ from .dplay import (
DiscoveryPlusItalyIE,
DiscoveryPlusItalyShowIE,
DiscoveryPlusIndiaShowIE,
+ GlobalCyclingNetworkPlusIE,
)
from .dreisat import DreiSatIE
from .drbonanza import DRBonanzaIE
@@ -466,6 +506,8 @@ from .drtuber import DrTuberIE
from .drtv import (
DRTVIE,
DRTVLiveIE,
+ DRTVSeasonIE,
+ DRTVSeriesIE,
)
from .dtube import DTubeIE
from .dvtv import DVTVIE
@@ -480,6 +522,7 @@ from .deuxm import (
DeuxMNewsIE
)
from .digitalconcerthall import DigitalConcertHallIE
+from .discogs import DiscogsReleasePlaylistIE
from .discovery import DiscoveryIE
from .disney import DisneyIE
from .dispeak import DigitallySpeakingIE
@@ -494,6 +537,7 @@ from .dw import (
)
from .eagleplatform import EaglePlatformIE, ClipYouEmbedIE
from .ebaumsworld import EbaumsWorldIE
+from .ebay import EbayIE
from .echomsk import EchoMskIE
from .egghead import (
EggheadCourseIE,
@@ -503,6 +547,7 @@ from .ehow import EHowIE
from .eighttracks import EightTracksIE
from .einthusan import EinthusanIE
from .eitb import EitbIE
+from .elevensports import ElevenSportsIE
from .ellentube import (
EllenTubeIE,
EllenTubeVideoIE,
@@ -536,7 +581,8 @@ from .espn import (
ESPNCricInfoIE,
)
from .esri import EsriVideoIE
-from .europa import EuropaIE
+from .ettutv import EttuTvIE
+from .europa import EuropaIE, EuroParlWebstreamIE
from .europeantour import EuropeanTourIE
from .eurosport import EurosportIE
from .euscreen import EUScreenIE
@@ -622,6 +668,7 @@ from .funimation import (
FunimationShowIE,
)
from .funk import FunkIE
+from .funker530 import Funker530IE
from .fusion import FusionIE
from .fuyintv import FuyinTVIE
from .gab import (
@@ -657,10 +704,18 @@ from .gfycat import GfycatIE
from .giantbomb import GiantBombIE
from .giga import GigaIE
from .glide import GlideIE
+from .globalplayer import (
+ GlobalPlayerLiveIE,
+ GlobalPlayerLivePlaylistIE,
+ GlobalPlayerAudioIE,
+ GlobalPlayerAudioEpisodeIE,
+ GlobalPlayerVideoIE
+)
from .globo import (
GloboIE,
GloboArticleIE,
)
+from .gmanetwork import GMANetworkVideoIE
from .go import GoIE
from .godtube import GodTubeIE
from .gofile import GofileIE
@@ -692,13 +747,16 @@ from .hearthisat import HearThisAtIE
from .heise import HeiseIE
from .hellporno import HellPornoIE
from .helsinki import HelsinkiIE
-from .hentaistigma import HentaiStigmaIE
from .hgtv import HGTVComShowIE
from .hketv import HKETVIE
from .hidive import HiDiveIE
from .historicfilms import HistoricFilmsIE
from .hitbox import HitboxIE, HitboxLiveIE
from .hitrecord import HitRecordIE
+from .hollywoodreporter import (
+ HollywoodReporterIE,
+ HollywoodReporterPlaylistIE,
+)
from .holodex import HolodexIE
from .hotnewhiphop import HotNewHipHopIE
from .hotstar import (
@@ -710,6 +768,7 @@ from .hotstar import (
)
from .howcast import HowcastIE
from .howstuffworks import HowStuffWorksIE
+from .hrefli import HrefLiRedirectIE
from .hrfensehen import HRFernsehenIE
from .hrti import (
HRTiIE,
@@ -732,12 +791,14 @@ from .hungama import (
HungamaAlbumPlaylistIE,
)
from .hypem import HypemIE
+from .hypergryph import MonsterSirenHypergryphMusicIE
from .hytale import HytaleIE
from .icareus import IcareusIE
from .ichinanalive import (
IchinanaLiveIE,
IchinanaLiveClipIE,
)
+from .idolplus import IdolPlusIE
from .ign import (
IGNIE,
IGNVideoIE,
@@ -822,23 +883,29 @@ from .japandiet import (
from .jeuxvideo import JeuxVideoIE
from .jove import JoveIE
from .joj import JojIE
+from .jstream import JStreamIE
from .jwplatform import JWPlatformIE
from .kakao import KakaoIE
from .kaltura import KalturaIE
from .kanal2 import Kanal2IE
+from .kankanews import KankaNewsIE
from .karaoketv import KaraoketvIE
from .karrierevideos import KarriereVideosIE
from .keezmovies import KeezMoviesIE
from .kelbyone import KelbyOneIE
-from .ketnet import KetnetIE
from .khanacademy import (
KhanAcademyIE,
KhanAcademyUnitIE,
)
+from .kick import (
+ KickIE,
+ KickVODIE,
+)
from .kicker import KickerIE
from .kickstarter import KickStarterIE
from .kinja import KinjaEmbedIE
from .kinopoisk import KinoPoiskIE
+from .kommunetv import KommunetvIE
from .kompas import KompasVideoIE
from .konserthusetplay import KonserthusetPlayIE
from .koo import KooIE
@@ -890,6 +957,10 @@ from .leeco import (
LePlaylistIE,
LetvCloudIE,
)
+from .lefigaro import (
+ LeFigaroVideoEmbedIE,
+ LeFigaroVideoSectionIE,
+)
from .lego import LEGOIE
from .lemonde import LemondeIE
from .lenta import LentaIE
@@ -908,10 +979,6 @@ from .limelight import (
LimelightChannelIE,
LimelightChannelListIE,
)
-from .line import (
- LineLiveIE,
- LineLiveChannelIE,
-)
from .linkedin import (
LinkedInIE,
LinkedInLearningIE,
@@ -938,11 +1005,15 @@ from .lrt import (
LRTVODIE,
LRTStreamIE
)
+from .lumni import (
+ LumniIE
+)
from .lynda import (
LyndaIE,
LyndaCourseIE
)
from .m6 import M6IE
+from .magellantv import MagellanTVIE
from .magentamusik360 import MagentaMusik360IE
from .mailru import (
MailRuIE,
@@ -982,6 +1053,10 @@ from .mediasite import (
MediasiteCatalogIE,
MediasiteNamedCatalogIE,
)
+from .mediastream import (
+ MediaStreamIE,
+ WinSportsVideoIE,
+)
from .mediaworksnz import MediaWorksNZVODIE
from .medici import MediciIE
from .megaphone import MegaphoneIE
@@ -1047,7 +1122,8 @@ from .mojvideo import MojvideoIE
from .morningstar import MorningstarIE
from .motherless import (
MotherlessIE,
- MotherlessGroupIE
+ MotherlessGroupIE,
+ MotherlessGalleryIE,
)
from .motorsport import MotorsportIE
from .movieclips import MovieClipsIE
@@ -1067,6 +1143,7 @@ from .mtv import (
)
from .muenchentv import MuenchenTVIE
from .murrtube import MurrtubeIE, MurrtubeUserIE
+from .museai import MuseAIIE
from .musescore import MuseScoreIE
from .musicdex import (
MusicdexSongIE,
@@ -1088,6 +1165,7 @@ from .myvi import (
)
from .myvideoge import MyVideoGeIE
from .myvidster import MyVidsterIE
+from .mzaalo import MzaaloIE
from .n1 import (
N1InfoAssetIE,
N1InfoIIE,
@@ -1136,6 +1214,7 @@ from .nebula import (
NebulaSubscriptionsIE,
NebulaChannelIE,
)
+from .nekohacker import NekoHackerIE
from .nerdcubed import NerdCubedFeedIE
from .netzkino import NetzkinoIE
from .neteasemusic import (
@@ -1150,6 +1229,7 @@ from .neteasemusic import (
from .netverse import (
NetverseIE,
NetversePlaylistIE,
+ NetverseSearchIE,
)
from .newgrounds import (
NewgroundsIE,
@@ -1174,6 +1254,8 @@ from .nfhsnetwork import NFHSNetworkIE
from .nfl import (
NFLIE,
NFLArticleIE,
+ NFLPlusEpisodeIE,
+ NFLPlusReplayIE,
)
from .nhk import (
NhkVodIE,
@@ -1181,6 +1263,9 @@ from .nhk import (
NhkForSchoolBangumiIE,
NhkForSchoolSubjectIE,
NhkForSchoolProgramListIE,
+ NhkRadioNewsPageIE,
+ NhkRadiruIE,
+ NhkRadiruLiveIE,
)
from .nhl import NHLIE
from .nick import (
@@ -1200,6 +1285,7 @@ from .niconico import (
NicovideoSearchIE,
NicovideoSearchURLIE,
NicovideoTagURLIE,
+ NiconicoLiveIE,
)
from .ninecninemedia import (
NineCNineMediaIE,
@@ -1211,6 +1297,7 @@ from .nintendo import NintendoIE
from .nitter import NitterIE
from .njpwworld import NJPWWorldIE
from .nobelprize import NobelPrizeIE
+from .noice import NoicePodcastIE
from .nonktube import NonkTubeIE
from .noodlemagazine import NoodleMagazineIE
from .noovo import NoovoIE
@@ -1256,6 +1343,7 @@ from .nrl import NRLTVIE
from .ntvcojp import NTVCoJpCUIE
from .ntvde import NTVDeIE
from .ntvru import NTVRuIE
+from .nubilesporn import NubilesPornIE
from .nytimes import (
NYTimesIE,
NYTimesArticleIE,
@@ -1263,8 +1351,10 @@ from .nytimes import (
)
from .nuvid import NuvidIE
from .nzherald import NZHeraldIE
+from .nzonscreen import NZOnScreenIE
from .nzz import NZZIE
from .odatv import OdaTVIE
+from .odkmedia import OnDemandChinaEpisodeIE
from .odnoklassniki import OdnoklassnikiIE
from .oftv import (
OfTVIE,
@@ -1276,6 +1366,7 @@ from .on24 import On24IE
from .ondemandkorea import OnDemandKoreaIE
from .onefootball import OneFootballIE
from .onenewsnz import OneNewsNZIE
+from .oneplace import OnePlacePodcastIE
from .onet import (
OnetIE,
OnetChannelIE,
@@ -1304,6 +1395,7 @@ from .orf import (
ORFIPTVIE,
)
from .outsidetv import OutsideTVIE
+from .owncloud import OwnCloudIE
from .packtpub import (
PacktPubIE,
PacktPubCourseIE,
@@ -1329,7 +1421,7 @@ from .patreon import (
PatreonIE,
PatreonCampaignIE
)
-from .pbs import PBSIE
+from .pbs import PBSIE, PBSKidsIE
from .pearvideo import PearVideoIE
from .peekvids import PeekVidsIE, PlayVidsIE
from .peertube import (
@@ -1347,6 +1439,7 @@ from .periscope import (
PeriscopeIE,
PeriscopeUserIE,
)
+from .pgatour import PGATourIE
from .philharmoniedeparis import PhilharmonieDeParisIE
from .phoenix import PhoenixIE
from .photobucket import PhotobucketIE
@@ -1398,11 +1491,12 @@ from .pokergo import (
from .polsatgo import PolsatGoIE
from .polskieradio import (
PolskieRadioIE,
+ PolskieRadioLegacyIE,
+ PolskieRadioAuditionIE,
PolskieRadioCategoryIE,
PolskieRadioPlayerIE,
PolskieRadioPodcastIE,
PolskieRadioPodcastListIE,
- PolskieRadioRadioKierowcowIE,
)
from .popcorntimes import PopcorntimesIE
from .popcorntv import PopcornTVIE
@@ -1425,6 +1519,7 @@ from .puhutv import (
PuhuTVIE,
PuhuTVSerieIE,
)
+from .pr0gramm import Pr0grammStaticIE, Pr0grammIE
from .prankcast import PrankCastIE
from .premiershiprugby import PremiershipRugbyIE
from .presstv import PressTVIE
@@ -1439,6 +1534,7 @@ from .prx import (
)
from .puls4 import Puls4IE
from .pyvideo import PyvideoIE
+from .qdance import QDanceIE
from .qingting import QingTingIE
from .qqmusic import (
QQMusicIE,
@@ -1471,6 +1567,8 @@ from .radlive import (
RadLiveSeasonIE,
)
from .rai import (
+ RaiIE,
+ RaiCulturaIE,
RaiPlayIE,
RaiPlayLiveIE,
RaiPlayPlaylistIE,
@@ -1479,13 +1577,16 @@ from .rai import (
RaiPlaySoundPlaylistIE,
RaiNewsIE,
RaiSudtirolIE,
- RaiIE,
)
from .raywenderlich import (
RayWenderlichIE,
RayWenderlichCourseIE,
)
from .rbmaradio import RBMARadioIE
+from .rbgtum import (
+ RbgTumIE,
+ RbgTumCourseIE,
+)
from .rcs import (
RCSIE,
RCSEmbedsIE,
@@ -1497,6 +1598,7 @@ from .rcti import (
RCTIPlusTVIE,
)
from .rds import RDSIE
+from .recurbate import RecurbateIE
from .redbee import ParliamentLiveUKIE, RTBFIE
from .redbulltv import (
RedBullTVIE,
@@ -1519,6 +1621,7 @@ from .rentv import (
from .restudy import RestudyIE
from .reuters import ReutersIE
from .reverbnation import ReverbNationIE
+from .rheinmaintv import RheinMainTVIE
from .rice import RICEIE
from .rmcdecouverte import RMCDecouverteIE
from .rockstargames import RockstarGamesIE
@@ -1530,7 +1633,11 @@ from .rokfin import (
)
from .roosterteeth import RoosterTeethIE, RoosterTeethSeriesIE
from .rottentomatoes import RottenTomatoesIE
-from .rozhlas import RozhlasIE
+from .rozhlas import (
+ RozhlasIE,
+ RozhlasVltavaIE,
+ MujRozhlasIE,
+)
from .rte import RteIE, RteRadioIE
from .rtlnl import (
RtlNlIE,
@@ -1553,6 +1660,11 @@ from .rtnews import (
from .rtp import RTPIE
from .rtrfm import RTRFMIE
from .rts import RTSIE
+from .rtvcplay import (
+ RTVCPlayIE,
+ RTVCPlayEmbedIE,
+ RTVCKalturaIE,
+)
from .rtve import (
RTVEALaCartaIE,
RTVEAudioIE,
@@ -1567,6 +1679,7 @@ from .ruhd import RUHDIE
from .rule34video import Rule34VideoIE
from .rumble import (
RumbleEmbedIE,
+ RumbleIE,
RumbleChannelIE,
)
from .rutube import (
@@ -1586,8 +1699,8 @@ from .megatvcom import (
MegaTVComIE,
MegaTVComEmbedIE,
)
-from .ant1newsgr import (
- Ant1NewsGrWatchIE,
+from .antenna import (
+ AntennaGrWatchIE,
Ant1NewsGrArticleIE,
Ant1NewsGrEmbedIE,
)
@@ -1597,6 +1710,7 @@ from .ruv import (
RuvIE,
RuvSpilaIE
)
+from .s4c import S4CIE
from .safari import (
SafariIE,
SafariApiIE,
@@ -1621,6 +1735,7 @@ from .scte import (
)
from .scrolller import ScrolllerIE
from .seeker import SeekerIE
+from .senalcolombia import SenalColombiaLiveIE
from .senategov import SenateISVPIE, SenateGovIE
from .sendtonews import SendtoNewsIE
from .servus import ServusIE
@@ -1639,6 +1754,7 @@ from .shared import (
VivoIE,
)
from .sharevideos import ShareVideosEmbedIE
+from .sibnet import SibnetEmbedIE
from .shemaroome import ShemarooMeIE
from .showroomlive import ShowRoomLiveIE
from .simplecast import (
@@ -1686,6 +1802,7 @@ from .soundcloud import (
SoundcloudSetIE,
SoundcloudRelatedIE,
SoundcloudUserIE,
+ SoundcloudUserPermalinkIE,
SoundcloudTrackStationIE,
SoundcloudPlaylistIE,
SoundcloudSearchIE,
@@ -1716,6 +1833,7 @@ from .spike import (
BellatorIE,
ParamountNetworkIE,
)
+from .stageplus import StagePlusVODConcertIE
from .startrek import StarTrekIE
from .stitcher import (
StitcherIE,
@@ -1741,6 +1859,10 @@ from .srgssr import (
SRGSSRPlayIE,
)
from .srmediathek import SRMediathekIE
+from .stacommu import (
+ StacommuLiveIE,
+ StacommuVODIE,
+)
from .stanfordoc import StanfordOpenClassroomIE
from .startv import StarTVIE
from .steam import (
@@ -1753,7 +1875,6 @@ from .storyfire import (
StoryFireSeriesIE,
)
from .streamable import StreamableIE
-from .streamanity import StreamanityIE
from .streamcloud import StreamcloudIE
from .streamcz import StreamCZIE
from .streamff import StreamFFIE
@@ -1781,6 +1902,11 @@ from .sztvhu import SztvHuIE
from .tagesschau import TagesschauIE
from .tass import TassIE
from .tbs import TBSIE
+from .tbsjp import (
+ TBSJPEpisodeIE,
+ TBSJPProgramIE,
+ TBSJPPlaylistIE,
+)
from .tdslifeway import TDSLifewayIE
from .teachable import (
TeachableIE,
@@ -1791,7 +1917,10 @@ from .teachertube import (
TeacherTubeUserIE,
)
from .teachingchannel import TeachingChannelIE
-from .teamcoco import TeamcocoIE
+from .teamcoco import (
+ TeamcocoIE,
+ ConanClassicIE,
+)
from .teamtreehouse import TeamTreeHouseIE
from .techtalks import TechTalksIE
from .ted import (
@@ -1803,6 +1932,7 @@ from .ted import (
from .tele5 import Tele5IE
from .tele13 import Tele13IE
from .telebruxelles import TeleBruxellesIE
+from .telecaribe import TelecaribePlayIE
from .telecinco import TelecincoIE
from .telegraaf import TelegraafIE
from .telegram import TelegramEmbedIE
@@ -1817,7 +1947,7 @@ from .telequebec import (
)
from .teletask import TeleTaskIE
from .telewebion import TelewebionIE
-from .tempo import TempoIE
+from .tempo import TempoIE, IVXPlayerIE
from .tencent import (
IflixEpisodeIE,
IflixSeriesIE,
@@ -1847,6 +1977,11 @@ from .theweatherchannel import TheWeatherChannelIE
from .thisamericanlife import ThisAmericanLifeIE
from .thisav import ThisAVIE
from .thisoldhouse import ThisOldHouseIE
+from .thisvid import (
+ ThisVidIE,
+ ThisVidMemberIE,
+ ThisVidPlaylistIE,
+)
from .threespeak import (
ThreeSpeakIE,
ThreeSpeakUserIE,
@@ -1859,6 +1994,7 @@ from .tiktok import (
TikTokEffectIE,
TikTokTagIE,
TikTokVMIE,
+ TikTokLiveIE,
DouyinIE,
)
from .tinypic import TinyPicIE
@@ -1888,6 +2024,7 @@ from .traileraddict import TrailerAddictIE
from .triller import (
TrillerIE,
TrillerUserIE,
+ TrillerShortIE,
)
from .trilulilu import TriluliluIE
from .trovo import (
@@ -1896,6 +2033,7 @@ from .trovo import (
TrovoChannelVodIE,
TrovoChannelClipIE,
)
+from .trtcocuk import TrtCocukVideoIE
from .trueid import TrueIDIE
from .trunews import TruNewsIE
from .truth import TruthIE
@@ -1908,10 +2046,9 @@ from .tubitv import (
)
from .tumblr import TumblrIE
from .tunein import (
- TuneInClipIE,
TuneInStationIE,
- TuneInProgramIE,
- TuneInTopicIE,
+ TuneInPodcastIE,
+ TuneInPodcastEpisodeIE,
TuneInShortenerIE,
)
from .tunepk import TunePkIE
@@ -1979,7 +2116,6 @@ from .tvp import (
)
from .tvplay import (
TVPlayIE,
- ViafreeIE,
TVPlayHomeIE,
)
from .tvplayer import TVPlayerIE
@@ -2009,6 +2145,10 @@ from .twitter import (
TwitterSpacesIE,
TwitterShortenerIE,
)
+from .txxx import (
+ TxxxIE,
+ PornTopIE,
+)
from .udemy import (
UdemyIE,
UdemyCourseIE
@@ -2079,6 +2219,13 @@ from .videocampus_sachsen import (
)
from .videodetective import VideoDetectiveIE
from .videofyme import VideofyMeIE
+from .videoken import (
+ VideoKenIE,
+ VideoKenPlayerIE,
+ VideoKenPlaylistIE,
+ VideoKenCategoryIE,
+ VideoKenTopicIE,
+)
from .videomore import (
VideomoreIE,
VideomoreVideoIE,
@@ -2127,17 +2274,16 @@ from .viu import (
ViuIE,
ViuPlaylistIE,
ViuOTTIE,
+ ViuOTTIndonesiaIE,
)
from .vk import (
VKIE,
VKUserVideosIE,
VKWallPostIE,
+ VKPlayIE,
+ VKPlayLiveIE,
)
-from .vlive import (
- VLiveIE,
- VLivePostIE,
- VLiveChannelIE,
-)
+from .vocaroo import VocarooIE
from .vodlocker import VodlockerIE
from .vodpl import VODPlIE
from .vodplatform import VODPlatformIE
@@ -2146,6 +2292,7 @@ from .voicy import (
VoicyIE,
VoicyChannelIE,
)
+from .volejtv import VolejTVIE
from .voot import (
VootIE,
VootSeriesIE,
@@ -2154,7 +2301,12 @@ from .voxmedia import (
VoxMediaVolumeIE,
VoxMediaIE,
)
-from .vrt import VRTIE
+from .vrt import (
+ VRTIE,
+ VrtNUIE,
+ KetnetIE,
+ DagelijkseKostIE,
+)
from .vrak import VrakIE
from .vrv import (
VRVIE,
@@ -2191,6 +2343,7 @@ from .wdr import (
WDRElefantIE,
WDRMobileIE,
)
+from .webcamerapl import WebcameraplIE
from .webcaster import (
WebcasterIE,
WebcasterFeedIE,
@@ -2204,8 +2357,20 @@ from .weibo import (
WeiboMobileIE
)
from .weiqitv import WeiqiTVIE
+from .weverse import (
+ WeverseIE,
+ WeverseMediaIE,
+ WeverseMomentIE,
+ WeverseLiveTabIE,
+ WeverseMediaTabIE,
+ WeverseLiveIE,
+)
+from .wevidi import WeVidiIE
+from .weyyak import WeyyakIE
+from .whyp import WhypIE
from .wikimedia import WikimediaIE
from .willow import WillowIE
+from .wimbledon import WimbledonIE
from .wimtv import WimTVIE
from .whowatch import WhoWatchIE
from .wistia import (
@@ -2222,11 +2387,22 @@ from .wppilot import (
WPPilotIE,
WPPilotChannelsIE,
)
+from .wrestleuniverse import (
+ WrestleUniverseVODIE,
+ WrestleUniversePPVIE,
+)
from .wsj import (
WSJIE,
WSJArticleIE,
)
from .wwe import WWEIE
+from .wykop import (
+ WykopDigIE,
+ WykopDigCommentIE,
+ WykopPostIE,
+ WykopPostCommentIE,
+)
+from .xanimu import XanimuIE
from .xbef import XBefIE
from .xboxclips import XboxClipsIE
from .xfileshare import XFileShareIE
@@ -2235,12 +2411,6 @@ from .xhamster import (
XHamsterEmbedIE,
XHamsterUserIE,
)
-from .xiami import (
- XiamiSongIE,
- XiamiAlbumIE,
- XiamiArtistIE,
- XiamiCollectionIE
-)
from .ximalaya import (
XimalayaIE,
XimalayaAlbumIE
@@ -2251,13 +2421,14 @@ from .xnxx import XNXXIE
from .xstream import XstreamIE
from .xtube import XTubeUserIE, XTubeIE
from .xuite import XuiteIE
-from .xvideos import XVideosIE
+from .xvideos import (
+ XVideosIE,
+ XVideosQuickiesIE
+)
from .xxxymovies import XXXYMoviesIE
from .yahoo import (
YahooIE,
YahooSearchIE,
- YahooGyaOPlayerIE,
- YahooGyaOIE,
YahooJapanNewsIE,
)
from .yandexdisk import YandexDiskIE
@@ -2275,6 +2446,10 @@ from .yandexvideo import (
ZenYandexChannelIE,
)
from .yapfiles import YapFilesIE
+from .yappy import (
+ YappyIE,
+ YappyProfileIE,
+)
from .yesjapan import YesJapanIE
from .yinyuetai import YinYueTaiIE
from .yle_areena import YleAreenaIE
@@ -2292,6 +2467,10 @@ from .younow import (
from .youporn import YouPornIE
from .yourporn import YourPornIE
from .yourupload import YourUploadIE
+from .zaiko import (
+ ZaikoIE,
+ ZaikoETicketIE,
+)
from .zapiks import ZapiksIE
from .zattoo import (
BBVTVIE,
@@ -2349,6 +2528,7 @@ from .zingmp3 import (
ZingMp3WeekChartIE,
ZingMp3ChartMusicVideoIE,
ZingMp3UserIE,
+ ZingMp3HubIE,
)
from .zoom import ZoomIE
from .zype import ZypeIE
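
For all the import churn above, the registration mechanism stays trivial: _extractors.py is the registry, and any class imported here whose name ends in IE becomes an available extractor (devscripts/make_lazy_extractors.py generates the lazy variant from this same file). Roughly, and only as a sketch, the consuming side amounts to:

    # The real collection lives in extractor/extractors.py and additionally
    # keeps GenericIE last as the catch-all fallback.
    _ALL_CLASSES = [
        klass for name, klass in globals().items()
        if name.endswith('IE') and name != 'GenericIE'
    ]
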
diff --git a/hypervideo_dl/extractor/abc.py b/hypervideo_dl/extractor/abc.py
index 0ca76b8..f56133e 100644
--- a/hypervideo_dl/extractor/abc.py
+++ b/hypervideo_dl/extractor/abc.py
@@ -12,6 +12,7 @@ from ..utils import (
int_or_none,
parse_iso8601,
str_or_none,
+ traverse_obj,
try_get,
unescapeHTML,
update_url_query,
@@ -85,6 +86,15 @@ class ABCIE(InfoExtractor):
'uploader': 'Behind the News',
'uploader_id': 'behindthenews',
}
+ }, {
+ 'url': 'https://www.abc.net.au/news/2023-06-25/wagner-boss-orders-troops-back-to-bases-to-avoid-bloodshed/102520540',
+ 'info_dict': {
+ 'id': '102520540',
+ 'title': 'Wagner Group retreating from Russia, leader Prigozhin to move to Belarus',
+ 'ext': 'mp4',
+ 'description': 'Wagner troops leave Rostov-on-Don and\xa0Yevgeny Prigozhin will move to Belarus under a deal brokered by Belarusian President Alexander Lukashenko to end the mutiny.',
+ 'thumbnail': 'https://live-production.wcms.abc-cdn.net.au/0c170f5b57f0105c432f366c0e8e267b?impolicy=wcms_crop_resize&cropH=2813&cropW=5000&xPos=0&yPos=249&width=862&height=485',
+ }
}]
def _real_extract(self, url):
@@ -107,7 +117,7 @@ class ABCIE(InfoExtractor):
video = True
if mobj is None:
- mobj = re.search(r'(?P<type>)"sources": (?P<json_data>\[[^\]]+\]),', webpage)
+ mobj = re.search(r'(?P<type>)"(?:sources|files|renditions)":\s*(?P<json_data>\[[^\]]+\])', webpage)
if mobj is None:
mobj = re.search(
r'inline(?P<type>Video|Audio|YouTube)Data\.push\((?P<json_data>[^)]+)\);',
@@ -121,7 +131,8 @@ class ABCIE(InfoExtractor):
urls_info = self._parse_json(
mobj.group('json_data'), video_id, transform_source=js_to_json)
youtube = mobj.group('type') == 'YouTube'
- video = mobj.group('type') == 'Video' or urls_info[0]['contentType'] == 'video/mp4'
+ video = mobj.group('type') == 'Video' or traverse_obj(
+ urls_info, (0, ('contentType', 'MIMEType')), get_all=False) == 'video/mp4'
if not isinstance(urls_info, list):
urls_info = [urls_info]
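
The new video/mp4 check leans on traverse_obj's branching: the path (0, ('contentType', 'MIMEType')) looks at the first entry and tries both key spellings, and get_all=False returns the first value found rather than a list of all matches. A hand-rolled equivalent of this one call, for illustration only (first_of is an invented name):

    def first_of(items, index, keys):
        # Equivalent of traverse_obj(items, (index, keys), get_all=False)
        # for the narrow case of one index and alternative dict keys.
        try:
            item = items[index]
        except (IndexError, KeyError, TypeError):
            return None
        for key in keys:
            if isinstance(item, dict) and key in item:
                return item[key]
        return None

    urls_info = [{'MIMEType': 'video/mp4', 'url': 'https://example.com/clip.mp4'}]
    assert first_of(urls_info, 0, ('contentType', 'MIMEType')) == 'video/mp4'
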
diff --git a/hypervideo_dl/extractor/abematv.py b/hypervideo_dl/extractor/abematv.py
index 80046af..8f962ba 100644
--- a/hypervideo_dl/extractor/abematv.py
+++ b/hypervideo_dl/extractor/abematv.py
@@ -22,80 +22,23 @@ from ..utils import (
int_or_none,
intlist_to_bytes,
OnDemandPagedList,
- request_to_url,
time_seconds,
traverse_obj,
update_url_query,
)
-# NOTE: network handler related code is temporary thing until network stack overhaul PRs are merged (#2861/#2862)
-
-def add_opener(ydl, handler):
- ''' Add a handler for opening URLs, like _download_webpage '''
- # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L426
- # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L605
- assert isinstance(ydl._opener, urllib.request.OpenerDirector)
- ydl._opener.add_handler(handler)
-
-
-def remove_opener(ydl, handler):
- '''
- Remove handler(s) for opening URLs
- @param handler Either handler object itself or handler type.
- Specifying handler type will remove all handler which isinstance returns True.
- '''
+def add_opener(ydl, handler): # FIXME: Create proper API in .networking
+ """Add a handler for opening URLs, like _download_webpage"""
# https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L426
# https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L605
- opener = ydl._opener
- assert isinstance(ydl._opener, urllib.request.OpenerDirector)
- if isinstance(handler, (type, tuple)):
- find_cp = lambda x: isinstance(x, handler)
- else:
- find_cp = lambda x: x is handler
-
- removed = []
- for meth in dir(handler):
- if meth in ["redirect_request", "do_open", "proxy_open"]:
- # oops, coincidental match
- continue
-
- i = meth.find("_")
- protocol = meth[:i]
- condition = meth[i + 1:]
-
- if condition.startswith("error"):
- j = condition.find("_") + i + 1
- kind = meth[j + 1:]
- try:
- kind = int(kind)
- except ValueError:
- pass
- lookup = opener.handle_error.get(protocol, {})
- opener.handle_error[protocol] = lookup
- elif condition == "open":
- kind = protocol
- lookup = opener.handle_open
- elif condition == "response":
- kind = protocol
- lookup = opener.process_response
- elif condition == "request":
- kind = protocol
- lookup = opener.process_request
- else:
- continue
-
- handlers = lookup.setdefault(kind, [])
- if handlers:
- handlers[:] = [x for x in handlers if not find_cp(x)]
-
- removed.append(x for x in handlers if find_cp(x))
-
- if removed:
- for x in opener.handlers:
- if find_cp(x):
- x.add_parent(None)
- opener.handlers[:] = [x for x in opener.handlers if not find_cp(x)]
+ rh = ydl._request_director.handlers['Urllib']
+ if 'abematv-license' in rh._SUPPORTED_URL_SCHEMES:
+ return
+ opener = rh._get_instance(cookiejar=ydl.cookiejar, proxies=ydl.proxies)
+ assert isinstance(opener, urllib.request.OpenerDirector)
+ opener.add_handler(handler)
+ rh._SUPPORTED_URL_SCHEMES = (*rh._SUPPORTED_URL_SCHEMES, 'abematv-license')
class AbemaLicenseHandler(urllib.request.BaseHandler):
@@ -137,11 +80,11 @@ class AbemaLicenseHandler(urllib.request.BaseHandler):
return intlist_to_bytes(aes_ecb_decrypt(encvideokey, enckey))
def abematv_license_open(self, url):
- url = request_to_url(url)
+ url = url.get_full_url() if isinstance(url, urllib.request.Request) else url
ticket = urllib.parse.urlparse(url).netloc
response_data = self._get_videokey_from_ticket(ticket)
return urllib.response.addinfourl(io.BytesIO(response_data), headers={
- 'Content-Length': len(response_data),
+ 'Content-Length': str(len(response_data)),
}, url=url, code=200)
@@ -156,7 +99,7 @@ class AbemaTVBaseIE(InfoExtractor):
def _generate_aks(cls, deviceid):
deviceid = deviceid.encode('utf-8')
# add 1 hour and then drop minute and secs
- ts_1hour = int((time_seconds(hours=9) // 3600 + 1) * 3600)
+ ts_1hour = int((time_seconds() // 3600 + 1) * 3600)
time_struct = time.gmtime(ts_1hour)
ts_1hour_str = str(ts_1hour).encode('utf-8')
@@ -190,6 +133,16 @@ class AbemaTVBaseIE(InfoExtractor):
if self._USERTOKEN:
return self._USERTOKEN
+ username, _ = self._get_login_info()
+ AbemaTVBaseIE._USERTOKEN = username and self.cache.load(self._NETRC_MACHINE, username)
+ if AbemaTVBaseIE._USERTOKEN:
+ # try authentication with locally stored token
+ try:
+ self._get_media_token(True)
+ return
+ except ExtractorError as e:
+ self.report_warning(f'Failed to login with cached user token; obtaining a fresh one ({e})')
+
AbemaTVBaseIE._DEVICE_ID = str(uuid.uuid4())
aks = self._generate_aks(self._DEVICE_ID)
user_data = self._download_json(
@@ -203,10 +156,7 @@ class AbemaTVBaseIE(InfoExtractor):
})
AbemaTVBaseIE._USERTOKEN = user_data['token']
- # don't allow adding it 2 times or more, though it's guarded
- remove_opener(self._downloader, AbemaLicenseHandler)
add_opener(self._downloader, AbemaLicenseHandler(self))
-
return self._USERTOKEN
def _get_media_token(self, invalidate=False, to_show=True):
@@ -300,6 +250,11 @@ class AbemaTVIE(AbemaTVBaseIE):
_TIMETABLE = None
def _perform_login(self, username, password):
+ self._get_device_token()
+ if self.cache.load(self._NETRC_MACHINE, username) and self._get_media_token():
+ self.write_debug('Skipping logging in')
+ return
+
        if '@' in username:  # don't strictly check if it's an email address or not
ep, method = 'user/email', 'email'
else:
@@ -319,6 +274,7 @@ class AbemaTVIE(AbemaTVBaseIE):
AbemaTVBaseIE._USERTOKEN = login_response['token']
self._get_media_token(True)
+ self.cache.store(self._NETRC_MACHINE, username, AbemaTVBaseIE._USERTOKEN)
def _real_extract(self, url):
# starting download using infojson from this extractor is undefined behavior,
@@ -416,10 +372,20 @@ class AbemaTVIE(AbemaTVBaseIE):
f'https://api.abema.io/v1/video/programs/{video_id}', video_id,
note='Checking playability',
headers=headers)
- ondemand_types = traverse_obj(api_response, ('terms', ..., 'onDemandType'), default=[])
+ ondemand_types = traverse_obj(api_response, ('terms', ..., 'onDemandType'))
if 3 not in ondemand_types:
# cannot acquire decryption key for these streams
self.report_warning('This is a premium-only stream')
+ info.update(traverse_obj(api_response, {
+ 'series': ('series', 'title'),
+ 'season': ('season', 'title'),
+ 'season_number': ('season', 'sequence'),
+ 'episode_number': ('episode', 'number'),
+ }))
+ if not title:
+ title = traverse_obj(api_response, ('episode', 'title'))
+ if not description:
+ description = traverse_obj(api_response, ('episode', 'content'))
m3u8_url = f'https://vod-abematv.akamaized.net/program/{video_id}/playlist.m3u8'
elif video_type == 'slots':
@@ -489,7 +455,7 @@ class AbemaTVTitleIE(AbemaTVBaseIE):
})
yield from (
self.url_result(f'https://abema.tv/video/episode/{x}')
- for x in traverse_obj(programs, ('programs', ..., 'id'), default=[]))
+ for x in traverse_obj(programs, ('programs', ..., 'id')))
def _entries(self, playlist_id, series_version):
return OnDemandPagedList(
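
The rewritten add_opener() reaches into the Urllib request handler for its OpenerDirector, registers AbemaLicenseHandler on it, and teaches the handler about the extra URL scheme, so abematv-license:// URLs resolve to locally derived key material instead of hitting the network. The underlying mechanism is plain stdlib: a BaseHandler whose <scheme>_open method returns an addinfourl. A toy version with an invented keysrv scheme:

    import io
    import urllib.request
    import urllib.response

    class KeyHandler(urllib.request.BaseHandler):
        # urllib dispatches keysrv:// requests here because of the method name.
        def keysrv_open(self, req):
            data = b'key-material-for-' + req.full_url.split('://', 1)[1].encode()
            return urllib.response.addinfourl(
                io.BytesIO(data), headers={'Content-Length': str(len(data))},
                url=req.full_url, code=200)

    opener = urllib.request.build_opener(KeyHandler())
    assert opener.open('keysrv://ticket123').read() == b'key-material-for-ticket123'
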
diff --git a/hypervideo_dl/extractor/acast.py b/hypervideo_dl/extractor/acast.py
index f2f828f..427d04c 100644
--- a/hypervideo_dl/extractor/acast.py
+++ b/hypervideo_dl/extractor/acast.py
@@ -40,28 +40,33 @@ class ACastBaseIE(InfoExtractor):
class ACastIE(ACastBaseIE):
IE_NAME = 'acast'
- _VALID_URL = r'''(?x)
+ _VALID_URL = r'''(?x:
https?://
(?:
(?:(?:embed|www)\.)?acast\.com/|
play\.acast\.com/s/
)
- (?P<channel>[^/]+)/(?P<id>[^/#?]+)
- '''
+ (?P<channel>[^/]+)/(?P<id>[^/#?"]+)
+ )'''
+ _EMBED_REGEX = [rf'(?x)<iframe[^>]+\bsrc=[\'"](?P<url>{_VALID_URL})']
_TESTS = [{
'url': 'https://www.acast.com/sparpodcast/2.raggarmordet-rosterurdetforflutna',
- 'md5': 'f5598f3ad1e4776fed12ec1407153e4b',
'info_dict': {
'id': '2a92b283-1a75-4ad8-8396-499c641de0d9',
'ext': 'mp3',
'title': '2. Raggarmordet - Röster ur det förflutna',
- 'description': 'md5:a992ae67f4d98f1c0141598f7bebbf67',
+ 'description': 'md5:013959207e05011ad14a222cf22278cc',
'timestamp': 1477346700,
'upload_date': '20161024',
'duration': 2766,
- 'creator': 'Anton Berg & Martin Johnson',
+ 'creator': 'Third Ear Studio',
'series': 'Spår',
'episode': '2. Raggarmordet - Röster ur det förflutna',
+ 'thumbnail': 'https://assets.pippa.io/shows/616ebe1886d7b1398620b943/616ebe33c7e6e70013cae7da.jpg',
+ 'episode_number': 2,
+ 'display_id': '2.raggarmordet-rosterurdetforflutna',
+ 'season_number': 4,
+ 'season': 'Season 4',
}
}, {
'url': 'http://embed.acast.com/adambuxton/ep.12-adam-joeschristmaspodcast2015',
@@ -73,6 +78,23 @@ class ACastIE(ACastBaseIE):
'url': 'https://play.acast.com/s/sparpodcast/2a92b283-1a75-4ad8-8396-499c641de0d9',
'only_matching': True,
}]
+ _WEBPAGE_TESTS = [{
+ 'url': 'https://ausi.anu.edu.au/news/democracy-sausage-episode-can-labor-be-long-form-government',
+ 'info_dict': {
+ 'id': '646c68fb21fbf20011e9c651',
+ 'ext': 'mp3',
+ 'creator': 'The Australian National University',
+ 'display_id': 'can-labor-be-a-long-form-government',
+ 'duration': 2618,
+ 'thumbnail': 'https://assets.pippa.io/shows/6113e8578b4903809f16f7e5/1684821529295-515b9520db9ce53275b995eb302f941c.jpeg',
+ 'title': 'Can Labor be a long-form government?',
+ 'episode': 'Can Labor be a long-form government?',
+ 'upload_date': '20230523',
+ 'series': 'Democracy Sausage with Mark Kenny',
+ 'timestamp': 1684826362,
+ 'description': 'md5:feabe1fc5004c78ee59c84a46bf4ba16',
+ }
+ }]
def _real_extract(self, url):
channel, display_id = self._match_valid_url(url).groups()
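
acast is one of several extractors in this patch gaining _EMBED_REGEX. The class's own _VALID_URL, now wrapped in a non-capturing (?x:...) group precisely so it can be embedded, is interpolated into an iframe pattern whose named url group is what the generic extractor hands back. A minimal demonstration of the composition, with a simplified stand-in for the real _VALID_URL:

    import re

    STAND_IN = r'https?://play\.acast\.com/s/[^/]+/[^/#?"]+'
    EMBED_REGEX = rf'<iframe[^>]+\bsrc=[\'"](?P<url>{STAND_IN})'

    html = '<iframe src="https://play.acast.com/s/someshow/some-episode"></iframe>'
    match = re.search(EMBED_REGEX, html)
    assert match.group('url') == 'https://play.acast.com/s/someshow/some-episode'
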
diff --git a/hypervideo_dl/extractor/adn.py b/hypervideo_dl/extractor/adn.py
index e0c18c8..b59dbc8 100644
--- a/hypervideo_dl/extractor/adn.py
+++ b/hypervideo_dl/extractor/adn.py
@@ -6,10 +6,8 @@ import random
from .common import InfoExtractor
from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7
-from ..compat import (
- compat_HTTPError,
- compat_b64decode,
-)
+from ..compat import compat_b64decode
+from ..networking.exceptions import HTTPError
from ..utils import (
ass_subtitles_timecode,
bytes_to_intlist,
@@ -142,9 +140,9 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text'''
self._HEADERS = {'authorization': 'Bearer ' + access_token}
except ExtractorError as e:
message = None
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 401:
resp = self._parse_json(
- e.cause.read().decode(), None, fatal=False) or {}
+ e.cause.response.read().decode(), None, fatal=False) or {}
message = resp.get('message') or resp.get('code')
self.report_warning(message or self._LOGIN_ERR_MESSAGE)
@@ -168,7 +166,7 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text'''
}, data=b'')['token']
links_url = try_get(options, lambda x: x['video']['url']) or (video_base_url + 'link')
- self._K = ''.join([random.choice('0123456789abcdef') for _ in range(16)])
+ self._K = ''.join(random.choices('0123456789abcdef', k=16))
message = bytes_to_intlist(json.dumps({
'k': self._K,
't': token,
@@ -195,14 +193,14 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text'''
})
break
except ExtractorError as e:
- if not isinstance(e.cause, compat_HTTPError):
+ if not isinstance(e.cause, HTTPError):
raise e
- if e.cause.code == 401:
+ if e.cause.status == 401:
# This usually goes away with a different random pkcs1pad, so retry
continue
- error = self._parse_json(e.cause.read(), video_id)
+ error = self._parse_json(e.cause.response.read(), video_id)
message = error.get('message')
if e.cause.code == 403 and error.get('code') == 'player-bad-geolocation-country':
self.raise_geo_restricted(msg=message)
diff --git a/hypervideo_dl/extractor/adobepass.py b/hypervideo_dl/extractor/adobepass.py
index e5944f7..5eed0ca 100644
--- a/hypervideo_dl/extractor/adobepass.py
+++ b/hypervideo_dl/extractor/adobepass.py
@@ -2,11 +2,11 @@ import getpass
import json
import re
import time
-import urllib.error
import xml.etree.ElementTree as etree
from .common import InfoExtractor
from ..compat import compat_urlparse
+from ..networking.exceptions import HTTPError
from ..utils import (
NO_DEFAULT,
ExtractorError,
@@ -1394,7 +1394,7 @@ class AdobePassIE(InfoExtractor): # XXX: Conventionally, base classes should en
form_page, urlh = form_page_res
post_url = self._html_search_regex(r'<form[^>]+action=(["\'])(?P<url>.+?)\1', form_page, 'post url', group='url')
if not re.match(r'https?://', post_url):
- post_url = compat_urlparse.urljoin(urlh.geturl(), post_url)
+ post_url = compat_urlparse.urljoin(urlh.url, post_url)
form_data = self._hidden_inputs(form_page)
form_data.update(data)
return self._download_webpage_handle(
@@ -1473,7 +1473,7 @@ class AdobePassIE(InfoExtractor): # XXX: Conventionally, base classes should en
elif 'automatically signed in with' in provider_redirect_page:
# Seems like comcast is rolling up new way of automatically signing customers
oauth_redirect_url = self._html_search_regex(
- r'continue:\s*"(https://oauth.xfinity.com/oauth/authorize\?.+)"', provider_redirect_page,
+ r'continue:\s*"(https://oauth\.xfinity\.com/oauth/authorize\?.+)"', provider_redirect_page,
'oauth redirect (signed)')
# Just need to process the request. No useful data comes back
self._download_webpage(oauth_redirect_url, video_id, 'Confirming auto login')
@@ -1573,7 +1573,7 @@ class AdobePassIE(InfoExtractor): # XXX: Conventionally, base classes should en
}), headers={
'Content-Type': 'application/x-www-form-urlencoded'
})
- elif mso_id == 'Spectrum':
+ elif mso_id in ('Spectrum', 'Charter_Direct'):
            # Spectrum's login form is dynamically loaded via JS so we need to hardcode the flow
# as a one-off implementation.
provider_redirect_page, urlh = provider_redirect_page_res
@@ -1619,7 +1619,7 @@ class AdobePassIE(InfoExtractor): # XXX: Conventionally, base classes should en
hidden_data['history'] = 1
provider_login_page_res = self._download_webpage_handle(
- urlh.geturl(), video_id, 'Sending first bookend',
+ urlh.url, video_id, 'Sending first bookend',
query=hidden_data)
provider_association_redirect, urlh = post_form(
@@ -1629,7 +1629,7 @@ class AdobePassIE(InfoExtractor): # XXX: Conventionally, base classes should en
})
provider_refresh_redirect_url = extract_redirect_url(
- provider_association_redirect, url=urlh.geturl())
+ provider_association_redirect, url=urlh.url)
last_bookend_page, urlh = self._download_webpage_handle(
provider_refresh_redirect_url, video_id,
@@ -1638,7 +1638,7 @@ class AdobePassIE(InfoExtractor): # XXX: Conventionally, base classes should en
hidden_data['history'] = 3
mvpd_confirm_page_res = self._download_webpage_handle(
- urlh.geturl(), video_id, 'Sending final bookend',
+ urlh.url, video_id, 'Sending final bookend',
query=hidden_data)
post_form(mvpd_confirm_page_res, 'Confirming Login')
@@ -1652,7 +1652,7 @@ class AdobePassIE(InfoExtractor): # XXX: Conventionally, base classes should en
hidden_data['history_val'] = 1
provider_login_redirect_page_res = self._download_webpage_handle(
- urlh.geturl(), video_id, 'Sending First Bookend',
+ urlh.url, video_id, 'Sending First Bookend',
query=hidden_data)
provider_login_redirect_page, urlh = provider_login_redirect_page_res
@@ -1680,7 +1680,7 @@ class AdobePassIE(InfoExtractor): # XXX: Conventionally, base classes should en
})
provider_refresh_redirect_url = extract_redirect_url(
- provider_association_redirect, url=urlh.geturl())
+ provider_association_redirect, url=urlh.url)
last_bookend_page, urlh = self._download_webpage_handle(
provider_refresh_redirect_url, video_id,
@@ -1690,7 +1690,7 @@ class AdobePassIE(InfoExtractor): # XXX: Conventionally, base classes should en
hidden_data['history_val'] = 3
mvpd_confirm_page_res = self._download_webpage_handle(
- urlh.geturl(), video_id, 'Sending Final Bookend',
+ urlh.url, video_id, 'Sending Final Bookend',
query=hidden_data)
post_form(mvpd_confirm_page_res, 'Confirming Login')
@@ -1699,7 +1699,7 @@ class AdobePassIE(InfoExtractor): # XXX: Conventionally, base classes should en
# based redirect that should be followed.
provider_redirect_page, urlh = provider_redirect_page_res
provider_refresh_redirect_url = extract_redirect_url(
- provider_redirect_page, url=urlh.geturl())
+ provider_redirect_page, url=urlh.url)
if provider_refresh_redirect_url:
provider_redirect_page_res = self._download_webpage_handle(
provider_refresh_redirect_url, video_id,
@@ -1724,7 +1724,7 @@ class AdobePassIE(InfoExtractor): # XXX: Conventionally, base classes should en
'requestor_id': requestor_id,
}), headers=mvpd_headers)
except ExtractorError as e:
- if not mso_id and isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 401:
+ if not mso_id and isinstance(e.cause, HTTPError) and e.cause.status == 401:
raise_mvpd_required()
raise
if '<pendingLogout' in session:
diff --git a/hypervideo_dl/extractor/adultswim.py b/hypervideo_dl/extractor/adultswim.py
index bd29eb4..daaedde 100644
--- a/hypervideo_dl/extractor/adultswim.py
+++ b/hypervideo_dl/extractor/adultswim.py
@@ -170,8 +170,10 @@ class AdultSwimIE(TurnerBaseIE):
continue
ext = determine_ext(asset_url, mimetype2ext(asset.get('mime_type')))
if ext == 'm3u8':
- info['formats'].extend(self._extract_m3u8_formats(
- asset_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(
+ asset_url, video_id, 'mp4', m3u8_id='hls', fatal=False)
+ info['formats'].extend(fmts)
+ self._merge_subtitles(subs, target=info['subtitles'])
elif ext == 'f4m':
continue
# info['formats'].extend(self._extract_f4m_formats(
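
The adultswim hunk applies a pattern used throughout this patch: call the _and_subtitles variant so HLS subtitle tracks are not silently dropped, then fold them into the info dict. In miniature, the per-language fold that _merge_subtitles performs is roughly this (the real helper also deduplicates tracks by URL):

    def merge_subtitles(source, target):
        # Subtitle dicts map language code -> list of track dicts; merging
        # extends the per-language track lists of the target in place.
        for lang, tracks in source.items():
            target.setdefault(lang, []).extend(tracks)

    subs = {'en': [{'url': 'https://example.com/a.vtt'}]}
    merge_subtitles({'en': [{'url': 'https://example.com/b.vtt'}]}, subs)
    assert len(subs['en']) == 2
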
diff --git a/hypervideo_dl/extractor/aenetworks.py b/hypervideo_dl/extractor/aenetworks.py
index d7c4010..f049a0f 100644
--- a/hypervideo_dl/extractor/aenetworks.py
+++ b/hypervideo_dl/extractor/aenetworks.py
@@ -3,6 +3,8 @@ from ..utils import (
ExtractorError,
GeoRestrictedError,
int_or_none,
+ remove_start,
+ traverse_obj,
update_url_query,
urlencode_postdata,
)
@@ -72,7 +74,14 @@ class AENetworksBaseIE(ThePlatformIE): # XXX: Do not subclass from concrete IE
requestor_id, brand = self._DOMAIN_MAP[domain]
result = self._download_json(
'https://feeds.video.aetnd.com/api/v2/%s/videos' % brand,
- filter_value, query={'filter[%s]' % filter_key: filter_value})['results'][0]
+ filter_value, query={'filter[%s]' % filter_key: filter_value})
+ result = traverse_obj(
+ result, ('results',
+ lambda k, v: k == 0 and v[filter_key] == filter_value),
+ get_all=False)
+ if not result:
+ raise ExtractorError('Show not found in A&E feed (too new?)', expected=True,
+ video_id=remove_start(filter_value, '/'))
title = result['title']
video_id = result['id']
media_url = result['publicUrl']
@@ -123,7 +132,7 @@ class AENetworksIE(AENetworksBaseIE):
'skip_download': True,
},
'add_ie': ['ThePlatform'],
- 'skip': 'This video is only available for users of participating TV providers.',
+ 'skip': 'Geo-restricted - This content is not available in your location.'
}, {
'url': 'http://www.aetv.com/shows/duck-dynasty/season-9/episode-1',
'info_dict': {
@@ -140,6 +149,7 @@ class AENetworksIE(AENetworksBaseIE):
'skip_download': True,
},
'add_ie': ['ThePlatform'],
+ 'skip': 'This video is only available for users of participating TV providers.',
}, {
'url': 'http://www.fyi.tv/shows/tiny-house-nation/season-1/episode-8',
'only_matching': True
@@ -303,6 +313,7 @@ class HistoryTopicIE(AENetworksBaseIE):
class HistoryPlayerIE(AENetworksBaseIE):
IE_NAME = 'history:player'
_VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:history|biography)\.com)/player/(?P<id>\d+)'
+ _TESTS = []
def _real_extract(self, url):
domain, video_id = self._match_valid_url(url).groups()
diff --git a/hypervideo_dl/extractor/aeonco.py b/hypervideo_dl/extractor/aeonco.py
index 4655862..390eae3 100644
--- a/hypervideo_dl/extractor/aeonco.py
+++ b/hypervideo_dl/extractor/aeonco.py
@@ -1,5 +1,6 @@
from .common import InfoExtractor
from .vimeo import VimeoIE
+from ..utils import ExtractorError, traverse_obj, url_or_none
class AeonCoIE(InfoExtractor):
@@ -19,22 +20,55 @@ class AeonCoIE(InfoExtractor):
}
}, {
'url': 'https://aeon.co/videos/dazzling-timelapse-shows-how-microbes-spoil-our-food-and-sometimes-enrich-it',
- 'md5': '4e5f3dad9dbda0dbfa2da41a851e631e',
+ 'md5': '03582d795382e49f2fd0b427b55de409',
'info_dict': {
- 'id': '728595228',
+ 'id': '759576926',
'ext': 'mp4',
'title': 'Wrought',
- 'thumbnail': 'https://i.vimeocdn.com/video/1484618528-c91452611f9a4e4497735a533da60d45b2fe472deb0c880f0afaab0cd2efb22a-d_1280',
- 'uploader': 'Biofilm Productions',
- 'uploader_id': 'user140352216',
- 'uploader_url': 'https://vimeo.com/user140352216',
+ 'thumbnail': 'https://i.vimeocdn.com/video/1525599692-84614af88e446612f49ca966cf8f80eab2c73376bedd80555741c521c26f9a3e-d_1280',
+ 'uploader': 'Aeon Video',
+ 'uploader_id': 'aeonvideo',
+ 'uploader_url': 'https://vimeo.com/aeonvideo',
'duration': 1344
}
+ }, {
+ 'url': 'https://aeon.co/videos/chew-over-the-prisoners-dilemma-and-see-if-you-can-find-the-rational-path-out',
+ 'md5': '1cfda0bf3ae24df17d00f2c0cb6cc21b',
+ 'info_dict': {
+ 'id': 'emyi4z-O0ls',
+ 'ext': 'mp4',
+ 'title': 'How to outsmart the Prisoner’s Dilemma - Lucas Husted',
+ 'thumbnail': 'https://i.ytimg.com/vi_webp/emyi4z-O0ls/maxresdefault.webp',
+ 'uploader': 'TED-Ed',
+ 'uploader_id': '@TEDEd',
+ 'uploader_url': 'https://www.youtube.com/@TEDEd',
+ 'duration': 344,
+ 'upload_date': '20200827',
+ 'channel_id': 'UCsooa4yRKGN_zEE8iknghZA',
+ 'playable_in_embed': True,
+ 'description': 'md5:c0959524f08cb60f96fd010f3dfb17f3',
+ 'categories': ['Education'],
+ 'like_count': int,
+ 'channel': 'TED-Ed',
+ 'chapters': 'count:7',
+ 'channel_url': 'https://www.youtube.com/channel/UCsooa4yRKGN_zEE8iknghZA',
+ 'tags': 'count:26',
+ 'availability': 'public',
+ 'channel_follower_count': int,
+ 'view_count': int,
+ 'age_limit': 0,
+ 'live_status': 'not_live',
+ 'comment_count': int,
+ },
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- vimeo_id = self._search_regex(r'hosterId":\s*"(?P<id>[0-9]+)', webpage, 'vimeo id')
- vimeo_url = VimeoIE._smuggle_referrer(f'https://player.vimeo.com/video/{vimeo_id}', 'https://aeon.co')
- return self.url_result(vimeo_url, VimeoIE)
+ embed_url = traverse_obj(self._yield_json_ld(webpage, video_id), (
+ lambda _, v: v['@type'] == 'VideoObject', 'embedUrl', {url_or_none}), get_all=False)
+ if not embed_url:
+ raise ExtractorError('No embed URL found in webpage')
+ if 'player.vimeo.com' in embed_url:
+ embed_url = VimeoIE._smuggle_referrer(embed_url, 'https://aeon.co/')
+ return self.url_result(embed_url)
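
The regex scrape of hosterId gives way to the page's JSON-LD, filtered for a VideoObject, which also lets the extractor hand non-Vimeo embeds (the new TED-Ed test above) straight to url_result. A simplified version of that probe, without traverse_obj and looser than _yield_json_ld:

    import json
    import re

    def find_embed_url(webpage):
        # Scan ld+json blocks for a VideoObject and return its embedUrl.
        for mobj in re.finditer(
                r'<script[^>]+type=["\']application/ld\+json["\'][^>]*>(.+?)</script>',
                webpage, re.DOTALL):
            try:
                data = json.loads(mobj.group(1))
            except ValueError:
                continue
            for obj in data if isinstance(data, list) else [data]:
                if isinstance(obj, dict) and obj.get('@type') == 'VideoObject':
                    return obj.get('embedUrl')
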
diff --git a/hypervideo_dl/extractor/afreecatv.py b/hypervideo_dl/extractor/afreecatv.py
index 9276fe7..3d26d9c 100644
--- a/hypervideo_dl/extractor/afreecatv.py
+++ b/hypervideo_dl/extractor/afreecatv.py
@@ -77,59 +77,6 @@ class AfreecaTVIE(InfoExtractor):
}],
'skip': 'Video is gone',
}, {
- 'url': 'http://vod.afreecatv.com/PLAYER/STATION/18650793',
- 'info_dict': {
- 'id': '18650793',
- 'ext': 'mp4',
- 'title': '오늘은 다르다! 쏘님의 우월한 위아래~ 댄스리액션!',
- 'thumbnail': r're:^https?://.*\.jpg$',
- 'uploader': '윈아디',
- 'uploader_id': 'badkids',
- 'duration': 107,
- },
- 'params': {
- 'skip_download': True,
- },
- }, {
- 'url': 'http://vod.afreecatv.com/PLAYER/STATION/10481652',
- 'info_dict': {
- 'id': '10481652',
- 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'",
- 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
- 'uploader': 'dailyapril',
- 'uploader_id': 'dailyapril',
- 'duration': 6492,
- },
- 'playlist_count': 2,
- 'playlist': [{
- 'md5': 'd8b7c174568da61d774ef0203159bf97',
- 'info_dict': {
- 'id': '20160502_c4c62b9d_174361386_1',
- 'ext': 'mp4',
- 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!' (part 1)",
- 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
- 'uploader': 'dailyapril',
- 'uploader_id': 'dailyapril',
- 'upload_date': '20160502',
- 'duration': 3601,
- },
- }, {
- 'md5': '58f2ce7f6044e34439ab2d50612ab02b',
- 'info_dict': {
- 'id': '20160502_39e739bb_174361386_2',
- 'ext': 'mp4',
- 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!' (part 2)",
- 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
- 'uploader': 'dailyapril',
- 'uploader_id': 'dailyapril',
- 'upload_date': '20160502',
- 'duration': 2891,
- },
- }],
- 'params': {
- 'skip_download': True,
- },
- }, {
# non standard key
'url': 'http://vod.afreecatv.com/PLAYER/STATION/20515605',
'info_dict': {
@@ -146,8 +93,8 @@ class AfreecaTVIE(InfoExtractor):
'skip_download': True,
},
}, {
- # PARTIAL_ADULT
- 'url': 'http://vod.afreecatv.com/PLAYER/STATION/32028439',
+ # adult content
+ 'url': 'https://vod.afreecatv.com/player/97267690',
'info_dict': {
'id': '20180327_27901457_202289533_1',
'ext': 'mp4',
@@ -161,16 +108,25 @@ class AfreecaTVIE(InfoExtractor):
'params': {
'skip_download': True,
},
- 'expected_warnings': ['adult content'],
+ 'skip': 'The VOD does not exist',
}, {
'url': 'http://www.afreecatv.com/player/Player.swf?szType=szBjId=djleegoon&nStationNo=11273158&nBbsNo=13161095&nTitleNo=36327652',
'only_matching': True,
}, {
- 'url': 'http://vod.afreecatv.com/PLAYER/STATION/15055030',
- 'only_matching': True,
- }, {
- 'url': 'http://vod.afreecatv.com/player/15055030',
- 'only_matching': True,
+ 'url': 'https://vod.afreecatv.com/player/96753363',
+ 'info_dict': {
+ 'id': '20230108_9FF5BEE1_244432674_1',
+ 'ext': 'mp4',
+ 'uploader_id': 'rlantnghks',
+ 'uploader': '페이즈으',
+ 'duration': 10840,
+ 'thumbnail': 'http://videoimg.afreecatv.com/php/SnapshotLoad.php?rowKey=20230108_9FF5BEE1_244432674_1_r',
+ 'upload_date': '20230108',
+ 'title': '젠지 페이즈',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
}]
@staticmethod
@@ -223,26 +179,21 @@ class AfreecaTVIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
-
- if re.search(r'alert\(["\']This video has been deleted', webpage):
- raise ExtractorError(
- 'Video %s has been deleted' % video_id, expected=True)
-
- station_id = self._search_regex(
- r'nStationNo\s*=\s*(\d+)', webpage, 'station')
- bbs_id = self._search_regex(
- r'nBbsNo\s*=\s*(\d+)', webpage, 'bbs')
- video_id = self._search_regex(
- r'nTitleNo\s*=\s*(\d+)', webpage, 'title', default=video_id)
-
partial_view = False
adult_view = False
for _ in range(2):
+ data = self._download_json(
+ 'https://api.m.afreecatv.com/station/video/a/view',
+ video_id, headers={'Referer': url}, data=urlencode_postdata({
+ 'nTitleNo': video_id,
+ 'nApiLevel': 10,
+ }))['data']
+ if traverse_obj(data, ('code', {int})) == -6221:
+ raise ExtractorError('The VOD does not exist', expected=True)
query = {
'nTitleNo': video_id,
- 'nStationNo': station_id,
- 'nBbsNo': bbs_id,
+ 'nStationNo': data['station_no'],
+ 'nBbsNo': data['bbs_no'],
}
if partial_view:
query['partialView'] = 'SKIP_ADULT'
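
The afreecatv rewrite drops the webpage scrape (station and bbs numbers pulled out with regexes) in favour of the mobile JSON API, posting the title number with urlencode_postdata. To a close approximation, that helper is just:

    from urllib.parse import urlencode

    def urlencode_postdata(fields):
        # Form-encode a mapping and return bytes usable as a POST body.
        return urlencode(fields).encode('ascii')
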
diff --git a/hypervideo_dl/extractor/airtv.py b/hypervideo_dl/extractor/airtv.py
new file mode 100644
index 0000000..0b73a96
--- /dev/null
+++ b/hypervideo_dl/extractor/airtv.py
@@ -0,0 +1,96 @@
+from .common import InfoExtractor
+from .youtube import YoutubeIE
+from ..utils import (
+ determine_ext,
+ int_or_none,
+ mimetype2ext,
+ parse_iso8601,
+ traverse_obj
+)
+
+
+class AirTVIE(InfoExtractor):
+ _VALID_URL = r'https?://www\.air\.tv/watch\?v=(?P<id>\w+)'
+ _TESTS = [{
+ # without youtube_id
+ 'url': 'https://www.air.tv/watch?v=W87jcWleSn2hXZN47zJZsQ',
+ 'info_dict': {
+ 'id': 'W87jcWleSn2hXZN47zJZsQ',
+ 'ext': 'mp4',
+ 'release_date': '20221003',
+ 'release_timestamp': 1664792603,
+ 'channel_id': 'vgfManQlRQKgoFQ8i8peFQ',
+ 'title': 'md5:c12d49ed367c3dadaa67659aff43494c',
+ 'upload_date': '20221003',
+ 'duration': 151,
+ 'view_count': int,
+ 'thumbnail': 'https://cdn-sp-gcs.air.tv/videos/W/8/W87jcWleSn2hXZN47zJZsQ/b13fc56464f47d9d62a36d110b9b5a72-4096x2160_9.jpg',
+ 'timestamp': 1664792603,
+ }
+ }, {
+ # with youtube_id
+ 'url': 'https://www.air.tv/watch?v=sv57EC8tRXG6h8dNXFUU1Q',
+ 'info_dict': {
+ 'id': '2ZTqmpee-bQ',
+ 'ext': 'mp4',
+ 'comment_count': int,
+ 'tags': 'count:11',
+ 'channel_follower_count': int,
+ 'like_count': int,
+ 'uploader': 'Newsflare',
+ 'thumbnail': 'https://i.ytimg.com/vi_webp/2ZTqmpee-bQ/maxresdefault.webp',
+ 'availability': 'public',
+ 'title': 'Geese Chase Alligator Across Golf Course',
+ 'uploader_id': 'NewsflareBreaking',
+ 'channel_url': 'https://www.youtube.com/channel/UCzSSoloGEz10HALUAbYhngQ',
+ 'description': 'md5:99b21d9cea59330149efbd9706e208f5',
+ 'age_limit': 0,
+ 'channel_id': 'UCzSSoloGEz10HALUAbYhngQ',
+ 'uploader_url': 'http://www.youtube.com/user/NewsflareBreaking',
+ 'view_count': int,
+ 'categories': ['News & Politics'],
+ 'live_status': 'not_live',
+ 'playable_in_embed': True,
+ 'channel': 'Newsflare',
+ 'duration': 37,
+ 'upload_date': '20180511',
+ }
+ }]
+
+ def _get_formats_and_subtitle(self, json_data, video_id):
+ formats, subtitles = [], {}
+ for source in traverse_obj(json_data, 'sources', 'sources_desktop', ...):
+ ext = determine_ext(source.get('src'), mimetype2ext(source.get('type')))
+ if ext == 'm3u8':
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(source.get('src'), video_id)
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=subtitles)
+ else:
+ formats.append({'url': source.get('src'), 'ext': ext})
+ return formats, subtitles
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+
+ nextjs_json = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['initialState']['videos'][display_id]
+ if nextjs_json.get('youtube_id'):
+ return self.url_result(
+ f'https://www.youtube.com/watch?v={nextjs_json.get("youtube_id")}', YoutubeIE)
+
+ formats, subtitles = self._get_formats_and_subtitle(nextjs_json, display_id)
+ return {
+ 'id': display_id,
+ 'title': nextjs_json.get('title') or self._html_search_meta('og:title', webpage),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'description': nextjs_json.get('description') or None,
+ 'duration': int_or_none(nextjs_json.get('duration')),
+ 'thumbnails': [
+ {'url': thumbnail}
+ for thumbnail in traverse_obj(nextjs_json, ('default_thumbnails', ...))],
+ 'channel_id': traverse_obj(nextjs_json, 'channel', 'channel_slug'),
+ 'timestamp': parse_iso8601(nextjs_json.get('created')),
+ 'release_timestamp': parse_iso8601(nextjs_json.get('published')),
+ 'view_count': int_or_none(nextjs_json.get('views')),
+ }
diff --git a/hypervideo_dl/extractor/aitube.py b/hypervideo_dl/extractor/aitube.py
new file mode 100644
index 0000000..89a6450
--- /dev/null
+++ b/hypervideo_dl/extractor/aitube.py
@@ -0,0 +1,60 @@
+from .common import InfoExtractor
+from ..utils import int_or_none, merge_dicts
+
+
+class AitubeKZVideoIE(InfoExtractor):
+ _VALID_URL = r'https?://aitube\.kz/(?:video|embed/)\?(?:[^\?]+)?id=(?P<id>[\w-]+)'
+ _TESTS = [{
+ # id parameter as the first query parameter
+ 'url': 'https://aitube.kz/video?id=9291d29b-c038-49a1-ad42-3da2051d353c&playlistId=d55b1f5f-ef2a-4f23-b646-2a86275b86b7&season=1',
+ 'info_dict': {
+ 'id': '9291d29b-c038-49a1-ad42-3da2051d353c',
+ 'ext': 'mp4',
+ 'duration': 2174.0,
+ 'channel_id': '94962f73-013b-432c-8853-1bd78ca860fe',
+ 'like_count': int,
+ 'channel': 'ASTANA TV',
+ 'comment_count': int,
+ 'view_count': int,
+ 'description': 'Смотреть любимые сериалы и видео, поделиться видео и сериалами с друзьями и близкими',
+ 'thumbnail': 'https://cdn.static02.aitube.kz/kz.aitudala.aitube.staticaccess/files/ddf2a2ff-bee3-409b-b5f2-2a8202bba75b',
+ 'upload_date': '20221102',
+ 'timestamp': 1667370519,
+ 'title': 'Ангел хранитель 1 серия',
+ 'channel_follower_count': int,
+ }
+ }, {
+ # embed url
+ 'url': 'https://aitube.kz/embed/?id=9291d29b-c038-49a1-ad42-3da2051d353c',
+ 'only_matching': True,
+ }, {
+ # id parameter is not the first query parameter
+ 'url': 'https://aitube.kz/video?season=1&id=9291d29b-c038-49a1-ad42-3da2051d353c&playlistId=d55b1f5f-ef2a-4f23-b646-2a86275b86b7',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ nextjs_data = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['videoInfo']
+ json_ld_data = self._search_json_ld(webpage, video_id)
+
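+ # the static-access CDN appears to expose an HLS manifest at a stable per-video path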
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(
+ f'https://api-http.aitube.kz/kz.aitudala.aitube.staticaccess/video/{video_id}/video', video_id)
+
+ return merge_dicts({
+ 'id': video_id,
+ 'title': nextjs_data.get('title') or self._html_search_meta(['name', 'og:title'], webpage),
+ 'description': nextjs_data.get('description'),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'view_count': (nextjs_data.get('viewCount')
+ or int_or_none(self._html_search_meta('ya:ovs:views_total', webpage))),
+ 'like_count': nextjs_data.get('likeCount'),
+ 'channel': nextjs_data.get('channelTitle'),
+ 'channel_id': nextjs_data.get('channelId'),
+ 'thumbnail': nextjs_data.get('coverUrl'),
+ 'comment_count': nextjs_data.get('commentCount'),
+ 'channel_follower_count': int_or_none(nextjs_data.get('channelSubscriberCount')),
+ }, json_ld_data)
diff --git a/hypervideo_dl/extractor/amazon.py b/hypervideo_dl/extractor/amazon.py
index 4d31706..a03f983 100644
--- a/hypervideo_dl/extractor/amazon.py
+++ b/hypervideo_dl/extractor/amazon.py
@@ -1,5 +1,17 @@
+import re
+
from .common import InfoExtractor
-from ..utils import ExtractorError, int_or_none
+from ..utils import (
+ ExtractorError,
+ clean_html,
+ float_or_none,
+ get_element_by_attribute,
+ get_element_by_class,
+ int_or_none,
+ js_to_json,
+ traverse_obj,
+ url_or_none,
+)
class AmazonStoreIE(InfoExtractor):
@@ -9,7 +21,7 @@ class AmazonStoreIE(InfoExtractor):
'url': 'https://www.amazon.co.uk/dp/B098XNCHLD/',
'info_dict': {
'id': 'B098XNCHLD',
- 'title': 'md5:dae240564cbb2642170c02f7f0d7e472',
+ 'title': str,
},
'playlist_mincount': 1,
'playlist': [{
@@ -20,28 +32,32 @@ class AmazonStoreIE(InfoExtractor):
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 34,
},
- }]
+ }],
+ 'expected_warnings': ['Unable to extract data'],
}, {
'url': 'https://www.amazon.in/Sony-WH-1000XM4-Cancelling-Headphones-Bluetooth/dp/B0863TXGM3',
'info_dict': {
'id': 'B0863TXGM3',
- 'title': 'md5:d1d3352428f8f015706c84b31e132169',
+ 'title': str,
},
'playlist_mincount': 4,
+ 'expected_warnings': ['Unable to extract data'],
}, {
'url': 'https://www.amazon.com/dp/B0845NXCXF/',
'info_dict': {
'id': 'B0845NXCXF',
- 'title': 'md5:f3fa12779bf62ddb6a6ec86a360a858e',
+ 'title': str,
},
'playlist_mincount': 1,
+ 'expected_warnings': ['Unable to extract data'],
}, {
'url': 'https://www.amazon.es/Samsung-Smartphone-s-AMOLED-Quad-c%C3%A1mara-espa%C3%B1ola/dp/B08WX337PQ',
'info_dict': {
'id': 'B08WX337PQ',
- 'title': 'md5:f3fa12779bf62ddb6a6ec86a360a858e',
+ 'title': str,
},
'playlist_mincount': 1,
+ 'expected_warnings': ['Unable to extract data'],
}]
def _real_extract(self, url):
@@ -52,7 +68,7 @@ class AmazonStoreIE(InfoExtractor):
try:
data_json = self._search_json(
r'var\s?obj\s?=\s?jQuery\.parseJSON\(\'', webpage, 'data', id,
- transform_source=lambda x: x.replace(R'\\u', R'\u'))
+ transform_source=js_to_json)
except ExtractorError as e:
retry.error = e
@@ -66,3 +82,89 @@ class AmazonStoreIE(InfoExtractor):
'width': int_or_none(video.get('videoWidth')),
} for video in (data_json.get('videos') or []) if video.get('isVideo') and video.get('url')]
return self.playlist_result(entries, playlist_id=id, playlist_title=data_json.get('title'))
+
+
+class AmazonReviewsIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?amazon\.(?:[a-z]{2,3})(?:\.[a-z]{2})?/gp/customer-reviews/(?P<id>[^/&#$?]+)'
+ _TESTS = [{
+ 'url': 'https://www.amazon.com/gp/customer-reviews/R10VE9VUSY19L3/ref=cm_cr_arp_d_rvw_ttl',
+ 'info_dict': {
+ 'id': 'R10VE9VUSY19L3',
+ 'ext': 'mp4',
+ 'title': 'Get squad #Suspicious',
+ 'description': 'md5:7012695052f440a1e064e402d87e0afb',
+ 'uploader': 'Kimberly Cronkright',
+ 'average_rating': 1.0,
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ },
+ 'expected_warnings': ['Review body was not found in webpage'],
+ }, {
+ 'url': 'https://www.amazon.com/gp/customer-reviews/R10VE9VUSY19L3/ref=cm_cr_arp_d_rvw_ttl?language=es_US',
+ 'info_dict': {
+ 'id': 'R10VE9VUSY19L3',
+ 'ext': 'mp4',
+ 'title': 'Get squad #Suspicious',
+ 'description': 'md5:7012695052f440a1e064e402d87e0afb',
+ 'uploader': 'Kimberly Cronkright',
+ 'average_rating': 1.0,
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ },
+ 'expected_warnings': ['Review body was not found in webpage'],
+ }, {
+ 'url': 'https://www.amazon.in/gp/customer-reviews/RV1CO8JN5VGXV/',
+ 'info_dict': {
+ 'id': 'RV1CO8JN5VGXV',
+ 'ext': 'mp4',
+ 'title': 'Not sure about its durability',
+ 'description': 'md5:1a252c106357f0a3109ebf37d2e87494',
+ 'uploader': 'Shoaib Gulzar',
+ 'average_rating': 2.0,
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ },
+ 'expected_warnings': ['Review body was not found in webpage'],
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
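+ # the review body is intermittently missing from the served page, so retry until it appears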
+ for retry in self.RetryManager():
+ webpage = self._download_webpage(url, video_id)
+ review_body = get_element_by_attribute('data-hook', 'review-body', webpage)
+ if not review_body:
+ retry.error = ExtractorError('Review body was not found in webpage', expected=True)
+
+ formats, subtitles = [], {}
+
+ manifest_url = self._search_regex(
+ r'data-video-url="([^"]+)"', review_body, 'm3u8 url', default=None)
+ if url_or_none(manifest_url):
+ fmts, subtitles = self._extract_m3u8_formats_and_subtitles(
+ manifest_url, video_id, 'mp4', fatal=False)
+ formats.extend(fmts)
+
+ video_url = self._search_regex(
+ r'<input[^>]+\bvalue="([^"]+)"[^>]+\bclass="video-url"', review_body, 'mp4 url', default=None)
+ if url_or_none(video_url):
+ formats.append({
+ 'url': video_url,
+ 'ext': 'mp4',
+ 'format_id': 'http-mp4',
+ })
+
+ if not formats:
+ self.raise_no_formats('No video found for this customer review', expected=True)
+
+ return {
+ 'id': video_id,
+ 'title': (clean_html(get_element_by_attribute('data-hook', 'review-title', webpage))
+ or self._html_extract_title(webpage)),
+ 'description': clean_html(traverse_obj(re.findall(
+ r'<span(?:\s+class="cr-original-review-content")?>(.+?)</span>', review_body), -1)),
+ 'uploader': clean_html(get_element_by_class('a-profile-name', webpage)),
+ 'average_rating': float_or_none(clean_html(get_element_by_attribute(
+ 'data-hook', 'review-star-rating', webpage) or '').partition(' ')[0]),
+ 'thumbnail': self._search_regex(
+ r'data-thumbnail-url="([^"]+)"', review_body, 'thumbnail', default=None),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
diff --git a/hypervideo_dl/extractor/amazonminitv.py b/hypervideo_dl/extractor/amazonminitv.py
index 7309968..b57d985 100644
--- a/hypervideo_dl/extractor/amazonminitv.py
+++ b/hypervideo_dl/extractor/amazonminitv.py
@@ -191,7 +191,7 @@ query content($sessionIdToken: String!, $deviceLocale: String, $contentId: ID!,
class AmazonMiniTVSeasonIE(AmazonMiniTVBaseIE):
IE_NAME = 'amazonminitv:season'
_VALID_URL = r'amazonminitv:season:(?:amzn1\.dv\.gti\.)?(?P<id>[a-f0-9-]+)'
- IE_DESC = 'Amazon MiniTV Series, "minitv:season:" prefix'
+ IE_DESC = 'Amazon MiniTV Season, "minitv:season:" prefix'
_TESTS = [{
'url': 'amazonminitv:season:amzn1.dv.gti.0aa996eb-6a1b-4886-a342-387fbd2f1db0',
'playlist_mincount': 6,
@@ -250,6 +250,7 @@ query getEpisodes($sessionIdToken: String!, $clientId: String, $episodeOrSeasonI
class AmazonMiniTVSeriesIE(AmazonMiniTVBaseIE):
IE_NAME = 'amazonminitv:series'
_VALID_URL = r'amazonminitv:series:(?:amzn1\.dv\.gti\.)?(?P<id>[a-f0-9-]+)'
+ IE_DESC = 'Amazon MiniTV Series, "minitv:series:" prefix'
_TESTS = [{
'url': 'amazonminitv:series:amzn1.dv.gti.56521d46-b040-4fd5-872e-3e70476a04b0',
'playlist_mincount': 3,
diff --git a/hypervideo_dl/extractor/americastestkitchen.py b/hypervideo_dl/extractor/americastestkitchen.py
index abda55d..e889458 100644
--- a/hypervideo_dl/extractor/americastestkitchen.py
+++ b/hypervideo_dl/extractor/americastestkitchen.py
@@ -11,7 +11,7 @@ from ..utils import (
class AmericasTestKitchenIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?americastestkitchen\.com/(?:cooks(?:country|illustrated)/)?(?P<resource_type>episode|videos)/(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www\.)?(?:americastestkitchen|cooks(?:country|illustrated))\.com/(?:cooks(?:country|illustrated)/)?(?P<resource_type>episode|videos)/(?P<id>\d+)'
_TESTS = [{
'url': 'https://www.americastestkitchen.com/episode/582-weeknight-japanese-suppers',
'md5': 'b861c3e365ac38ad319cfd509c30577f',
@@ -72,6 +72,12 @@ class AmericasTestKitchenIE(InfoExtractor):
}, {
'url': 'https://www.americastestkitchen.com/cooksillustrated/videos/4478-beef-wellington',
'only_matching': True,
+ }, {
+ 'url': 'https://www.cookscountry.com/episode/564-when-only-chocolate-will-do',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.cooksillustrated.com/videos/4478-beef-wellington',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -100,7 +106,7 @@ class AmericasTestKitchenIE(InfoExtractor):
class AmericasTestKitchenSeasonIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?americastestkitchen\.com(?P<show>/cookscountry)?/episodes/browse/season_(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www\.)?(?P<show>americastestkitchen|(?P<cooks>cooks(?:country|illustrated)))\.com(?:(?:/(?P<show2>cooks(?:country|illustrated)))?(?:/?$|(?<!ated)(?<!ated\.com)/episodes/browse/season_(?P<season>\d+)))'
_TESTS = [{
# ATK Season
'url': 'https://www.americastestkitchen.com/episodes/browse/season_1',
@@ -117,29 +123,73 @@ class AmericasTestKitchenSeasonIE(InfoExtractor):
'title': 'Season 12',
},
'playlist_count': 13,
+ }, {
+ # America's Test Kitchen Series
+ 'url': 'https://www.americastestkitchen.com/',
+ 'info_dict': {
+ 'id': 'americastestkitchen',
+ 'title': 'America\'s Test Kitchen',
+ },
+ 'playlist_count': 558,
+ }, {
+ # Cooks Country Series
+ 'url': 'https://www.americastestkitchen.com/cookscountry',
+ 'info_dict': {
+ 'id': 'cookscountry',
+ 'title': 'Cook\'s Country',
+ },
+ 'playlist_count': 199,
+ }, {
+ 'url': 'https://www.americastestkitchen.com/cookscountry/',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.cookscountry.com/episodes/browse/season_12',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.cookscountry.com',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.americastestkitchen.com/cooksillustrated/',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.cooksillustrated.com',
+ 'only_matching': True,
}]
def _real_extract(self, url):
- show_path, season_number = self._match_valid_url(url).group('show', 'id')
- season_number = int(season_number)
+ season_number, show1, show = self._match_valid_url(url).group('season', 'show', 'show2')
+ show_path = ('/' + show) if show else ''
+ show = show or show1
+ season_number = int_or_none(season_number)
+
+ slug, title = {
+ 'americastestkitchen': ('atk', 'America\'s Test Kitchen'),
+ 'cookscountry': ('cco', 'Cook\'s Country'),
+ 'cooksillustrated': ('cio', 'Cook\'s Illustrated'),
+ }[show]
- slug = 'cco' if show_path == '/cookscountry' else 'atk'
+ facet_filters = [
+ 'search_document_klass:episode',
+ 'search_show_slug:' + slug,
+ ]
- season = 'Season %d' % season_number
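+ # with no season in the URL, list every episode of the show; otherwise restrict the Algolia query to that season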
+ if season_number:
+ playlist_id = 'season_%d' % season_number
+ playlist_title = 'Season %d' % season_number
+ facet_filters.append('search_season_list:' + playlist_title)
+ else:
+ playlist_id = show
+ playlist_title = title
season_search = self._download_json(
'https://y1fnzxui30-dsn.algolia.net/1/indexes/everest_search_%s_season_desc_production' % slug,
- season, headers={
+ playlist_id, headers={
'Origin': 'https://www.americastestkitchen.com',
'X-Algolia-API-Key': '8d504d0099ed27c1b73708d22871d805',
'X-Algolia-Application-Id': 'Y1FNZXUI30',
}, query={
- 'facetFilters': json.dumps([
- 'search_season_list:' + season,
- 'search_document_klass:episode',
- 'search_show_slug:' + slug,
- ]),
- 'attributesToRetrieve': 'description,search_%s_episode_number,search_document_date,search_url,title' % slug,
+ 'facetFilters': json.dumps(facet_filters),
+ 'attributesToRetrieve': 'description,search_%s_episode_number,search_document_date,search_url,title,search_atk_episode_season' % slug,
'attributesToHighlight': '',
'hitsPerPage': 1000,
})
@@ -162,4 +212,4 @@ class AmericasTestKitchenSeasonIE(InfoExtractor):
}
return self.playlist_result(
- entries(), 'season_%d' % season_number, season)
+ entries(), playlist_id, playlist_title)
diff --git a/hypervideo_dl/extractor/amp.py b/hypervideo_dl/extractor/amp.py
index b0cbd77..0d259c5 100644
--- a/hypervideo_dl/extractor/amp.py
+++ b/hypervideo_dl/extractor/amp.py
@@ -5,6 +5,7 @@ from ..utils import (
int_or_none,
mimetype2ext,
parse_iso8601,
+ strip_jsonp,
unified_timestamp,
url_or_none,
)
@@ -15,7 +16,7 @@ class AMPIE(InfoExtractor): # XXX: Conventionally, base classes should end with
def _extract_feed_info(self, url):
feed = self._download_json(
url, None, 'Downloading Akamai AMP feed',
- 'Unable to download Akamai AMP feed')
+ 'Unable to download Akamai AMP feed', transform_source=strip_jsonp)
item = feed.get('channel', {}).get('item')
if not item:
raise ExtractorError('%s said: %s' % (self.IE_NAME, feed['error']))
@@ -73,8 +74,10 @@ class AMPIE(InfoExtractor): # XXX: Conventionally, base classes should end with
media_url + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124',
video_id, f4m_id='hds', fatal=False))
elif ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
- media_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(
+ media_url, video_id, 'mp4', m3u8_id='hls', fatal=False)
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=subtitles)
else:
formats.append({
'format_id': media_data.get('media-category', {}).get('@attributes', {}).get('label'),
diff --git a/hypervideo_dl/extractor/anchorfm.py b/hypervideo_dl/extractor/anchorfm.py
new file mode 100644
index 0000000..52f2ad0
--- /dev/null
+++ b/hypervideo_dl/extractor/anchorfm.py
@@ -0,0 +1,98 @@
+from .common import InfoExtractor
+from ..utils import (
+ clean_html,
+ float_or_none,
+ int_or_none,
+ str_or_none,
+ traverse_obj,
+ unified_timestamp
+)
+
+
+class AnchorFMEpisodeIE(InfoExtractor):
+ _VALID_URL = r'https?://anchor\.fm/(?P<channel_name>\w+)/(?:embed/)?episodes/[\w-]+-(?P<episode_id>\w+)'
+ _EMBED_REGEX = [rf'<iframe[^>]+\bsrc=[\'"](?P<url>{_VALID_URL})']
+ _TESTS = [{
+ 'url': 'https://anchor.fm/lovelyti/episodes/Chrisean-Rock-takes-to-twitter-to-announce-shes-pregnant--Blueface-denies-he-is-the-father-e1tpt3d',
+ 'info_dict': {
+ 'id': 'e1tpt3d',
+ 'ext': 'mp3',
+ 'title': ' Chrisean Rock takes to twitter to announce she\'s pregnant, Blueface denies he is the father!',
+ 'description': 'md5:207d167de3e28ceb4ddc1ebf5a30044c',
+ 'thumbnail': 'https://s3-us-west-2.amazonaws.com/anchor-generated-image-bank/production/podcast_uploaded_nologo/1034827/1034827-1658438968460-5f3bfdf3601e8.jpg',
+ 'duration': 624.718,
+ 'uploader': 'Lovelyti ',
+ 'uploader_id': '991541',
+ 'channel': 'lovelyti',
+ 'modified_date': '20230121',
+ 'modified_timestamp': 1674285178,
+ 'release_date': '20230121',
+ 'release_timestamp': 1674285179,
+ 'episode_id': 'e1tpt3d',
+ }
+ }, {
+ # embed url
+ 'url': 'https://anchor.fm/apakatatempo/embed/episodes/S2E75-Perang-Bintang-di-Balik-Kasus-Ferdy-Sambo-dan-Ismail-Bolong-e1shjqd',
+ 'info_dict': {
+ 'id': 'e1shjqd',
+ 'ext': 'mp3',
+ 'title': 'S2E75 Perang Bintang di Balik Kasus Ferdy Sambo dan Ismail Bolong',
+ 'description': 'md5:9e95ad9293bf00178bf8d33e9cb92c41',
+ 'duration': 1042.008,
+ 'thumbnail': 'https://s3-us-west-2.amazonaws.com/anchor-generated-image-bank/production/podcast_uploaded_episode400/2627805/2627805-1671590688729-4db3882ac9e4b.jpg',
+ 'release_date': '20221221',
+ 'release_timestamp': 1671595916,
+ 'modified_date': '20221221',
+ 'modified_timestamp': 1671590834,
+ 'channel': 'apakatatempo',
+ 'uploader': 'Podcast Tempo',
+ 'uploader_id': '2585461',
+ 'season': 'Season 2',
+ 'season_number': 2,
+ 'episode_id': 'e1shjqd',
+ }
+ }]
+
+ _WEBPAGE_TESTS = [{
+ 'url': 'https://podcast.tempo.co/podcast/192/perang-bintang-di-balik-kasus-ferdy-sambo-dan-ismail-bolong',
+ 'info_dict': {
+ 'id': 'e1shjqd',
+ 'ext': 'mp3',
+ 'release_date': '20221221',
+ 'duration': 1042.008,
+ 'season': 'Season 2',
+ 'modified_timestamp': 1671590834,
+ 'uploader_id': '2585461',
+ 'modified_date': '20221221',
+ 'description': 'md5:9e95ad9293bf00178bf8d33e9cb92c41',
+ 'season_number': 2,
+ 'title': 'S2E75 Perang Bintang di Balik Kasus Ferdy Sambo dan Ismail Bolong',
+ 'release_timestamp': 1671595916,
+ 'episode_id': 'e1shjqd',
+ 'thumbnail': 'https://s3-us-west-2.amazonaws.com/anchor-generated-image-bank/production/podcast_uploaded_episode400/2627805/2627805-1671590688729-4db3882ac9e4b.jpg',
+ 'uploader': 'Podcast Tempo',
+ 'channel': 'apakatatempo',
+ }
+ }]
+
+ def _real_extract(self, url):
+ channel_name, episode_id = self._match_valid_url(url).group('channel_name', 'episode_id')
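+ # a single call to the v3 episodes endpoint returns episode, creator and audio-enclosure metadata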
+ api_data = self._download_json(f'https://anchor.fm/api/v3/episodes/{episode_id}', episode_id)
+
+ return {
+ 'id': episode_id,
+ 'title': traverse_obj(api_data, ('episode', 'title')),
+ 'url': traverse_obj(api_data, ('episode', 'episodeEnclosureUrl'), ('episodeAudios', 0, 'url')),
+ 'ext': 'mp3',
+ 'vcodec': 'none',
+ 'thumbnail': traverse_obj(api_data, ('episode', 'episodeImage')),
+ 'description': clean_html(traverse_obj(api_data, ('episode', ('description', 'descriptionPreview')), get_all=False)),
+ 'duration': float_or_none(traverse_obj(api_data, ('episode', 'duration')), 1000),
+ 'modified_timestamp': unified_timestamp(traverse_obj(api_data, ('episode', 'modified'))),
+ 'release_timestamp': int_or_none(traverse_obj(api_data, ('episode', 'publishOnUnixTimestamp'))),
+ 'episode_id': episode_id,
+ 'uploader': traverse_obj(api_data, ('creator', 'name')),
+ 'uploader_id': str_or_none(traverse_obj(api_data, ('creator', 'userId'))),
+ 'season_number': int_or_none(traverse_obj(api_data, ('episode', 'podcastSeasonNumber'))),
+ 'channel': channel_name or traverse_obj(api_data, ('creator', 'vanitySlug')),
+ }
diff --git a/hypervideo_dl/extractor/antenna.py b/hypervideo_dl/extractor/antenna.py
new file mode 100644
index 0000000..c78717a
--- /dev/null
+++ b/hypervideo_dl/extractor/antenna.py
@@ -0,0 +1,143 @@
+import urllib.parse
+
+from .common import InfoExtractor
+from ..networking import HEADRequest
+from ..utils import (
+ ExtractorError,
+ determine_ext,
+ make_archive_id,
+ scale_thumbnails_to_max_format_width,
+)
+
+
+class AntennaBaseIE(InfoExtractor):
+ def _download_and_extract_api_data(self, video_id, netloc, cid=None):
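+ # the same player API backs both antenna.gr and ant1news.gr; the query cid falls back to the video id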
+ info = self._download_json(f'{self.http_scheme()}//{netloc}{self._API_PATH}',
+ video_id, query={'cid': cid or video_id})
+ if not info.get('url'):
+ raise ExtractorError(f'No source found for {video_id}')
+
+ ext = determine_ext(info['url'])
+ if ext == 'm3u8':
+ formats, subs = self._extract_m3u8_formats_and_subtitles(info['url'], video_id, 'mp4')
+ else:
+ formats, subs = [{'url': info['url'], 'format_id': ext}], {}
+
+ thumbnails = scale_thumbnails_to_max_format_width(
+ formats, [{'url': info['thumb']}], r'(?<=/imgHandler/)\d+') if info.get('thumb') else []
+ return {
+ 'id': video_id,
+ 'title': info.get('title'),
+ 'thumbnails': thumbnails,
+ 'formats': formats,
+ 'subtitles': subs,
+ }
+
+
+class AntennaGrWatchIE(AntennaBaseIE):
+ IE_NAME = 'antenna:watch'
+ IE_DESC = 'antenna.gr and ant1news.gr videos'
+ _VALID_URL = r'https?://(?P<netloc>(?:www\.)?(?:antenna|ant1news)\.gr)/watch/(?P<id>\d+)/'
+ _API_PATH = '/templates/data/player'
+
+ _TESTS = [{
+ 'url': 'https://www.ant1news.gr/watch/1506168/ant1-news-09112021-stis-18-45',
+ 'md5': 'c472d9dd7cd233c63aff2ea42201cda6',
+ 'info_dict': {
+ 'id': '1506168',
+ 'ext': 'mp4',
+ 'title': 'md5:0ad00fa66ecf8aa233d26ab0dba7514a',
+ 'description': 'md5:18665af715a6dcfeac1d6153a44f16b0',
+ 'thumbnail': r're:https://ant1media\.azureedge\.net/imgHandler/\d+/26d46bf6-8158-4f02-b197-7096c714b2de\.jpg',
+ },
+ }, {
+ 'url': 'https://www.antenna.gr/watch/1643812/oi-prodotes-epeisodio-01',
+ 'md5': '8f6f7dd3b1dba4d835ba990e25f31243',
+ 'info_dict': {
+ 'id': '1643812',
+ 'ext': 'mp4',
+ 'format_id': 'mp4',
+ 'title': 'ΟΙ ΠΡΟΔΟΤΕΣ – ΕΠΕΙΣΟΔΙΟ 01',
+ 'thumbnail': r're:https://ant1media\.azureedge\.net/imgHandler/\d+/b3d63096-e72d-43c4-87a0-00d4363d242f\.jpg',
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id, netloc = self._match_valid_url(url).group('id', 'netloc')
+ webpage = self._download_webpage(url, video_id)
+ info = self._download_and_extract_api_data(video_id, netloc)
+ info['description'] = self._og_search_description(webpage, default=None)
+ info['_old_archive_ids'] = [make_archive_id('Ant1NewsGrWatch', video_id)]
+ return info
+
+
+class Ant1NewsGrArticleIE(AntennaBaseIE):
+ IE_NAME = 'ant1newsgr:article'
+ IE_DESC = 'ant1news.gr articles'
+ _VALID_URL = r'https?://(?:www\.)?ant1news\.gr/[^/]+/article/(?P<id>\d+)/'
+
+ _TESTS = [{
+ 'url': 'https://www.ant1news.gr/afieromata/article/549468/o-tzeims-mpont-sta-meteora-oi-apeiles-kai-o-xesikomos-ton-kalogeron',
+ 'md5': '294f18331bb516539d72d85a82887dcc',
+ 'info_dict': {
+ 'id': '_xvg/m_cmbatw=',
+ 'ext': 'mp4',
+ 'title': 'md5:a93e8ecf2e4073bfdffcb38f59945411',
+ 'timestamp': 1603092840,
+ 'upload_date': '20201019',
+ 'thumbnail': 'https://ant1media.azureedge.net/imgHandler/640/756206d2-d640-40e2-b201-3555abdfc0db.jpg',
+ },
+ }, {
+ 'url': 'https://ant1news.gr/Society/article/620286/symmoria-anilikon-dikigoros-thymaton-ithelan-na-toys-apoteleiosoyn',
+ 'info_dict': {
+ 'id': '620286',
+ 'title': 'md5:91fe569e952e4d146485740ae927662b',
+ },
+ 'playlist_mincount': 2,
+ 'params': {
+ 'skip_download': True,
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ info = self._search_json_ld(webpage, video_id, expected_type='NewsArticle')
+ embed_urls = list(Ant1NewsGrEmbedIE._extract_embed_urls(url, webpage))
+ if not embed_urls:
+ raise ExtractorError('no videos found for %s' % video_id, expected=True)
+ return self.playlist_from_matches(
+ embed_urls, video_id, info.get('title'), ie=Ant1NewsGrEmbedIE.ie_key(),
+ video_kwargs={'url_transparent': True, 'timestamp': info.get('timestamp')})
+
+
+class Ant1NewsGrEmbedIE(AntennaBaseIE):
+ IE_NAME = 'ant1newsgr:embed'
+ IE_DESC = 'ant1news.gr embedded videos'
+ _BASE_PLAYER_URL_RE = r'(?:https?:)?//(?:[a-zA-Z0-9\-]+\.)?(?:antenna|ant1news)\.gr/templates/pages/player'
+ _VALID_URL = rf'{_BASE_PLAYER_URL_RE}\?([^#]+&)?cid=(?P<id>[^#&]+)'
+ _EMBED_REGEX = [rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>{_BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+)(?P=_q1)']
+ _API_PATH = '/news/templates/data/jsonPlayer'
+
+ _TESTS = [{
+ 'url': 'https://www.antenna.gr/templates/pages/player?cid=3f_li_c_az_jw_y_u=&w=670&h=377',
+ 'md5': 'dfc58c3a11a5a9aad2ba316ed447def3',
+ 'info_dict': {
+ 'id': '3f_li_c_az_jw_y_u=',
+ 'ext': 'mp4',
+ 'title': 'md5:a30c93332455f53e1e84ae0724f0adf7',
+ 'thumbnail': 'https://ant1media.azureedge.net/imgHandler/640/bbe31201-3f09-4a4e-87f5-8ad2159fffe2.jpg',
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
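+ # a HEAD request follows any redirects to the canonical player URL, whose query string carries the effective cid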
+ canonical_url = self._request_webpage(
+ HEADRequest(url), video_id,
+ note='Resolve canonical player URL',
+ errnote='Could not resolve canonical player URL').url
+ _, netloc, _, _, query, _ = urllib.parse.urlparse(canonical_url)
+ cid = urllib.parse.parse_qs(query)['cid'][0]
+
+ return self._download_and_extract_api_data(video_id, netloc, cid=cid)
diff --git a/hypervideo_dl/extractor/anvato.py b/hypervideo_dl/extractor/anvato.py
index 79bfe41..0df5033 100644
--- a/hypervideo_dl/extractor/anvato.py
+++ b/hypervideo_dl/extractor/anvato.py
@@ -336,7 +336,7 @@ class AnvatoIE(InfoExtractor):
elif media_format == 'm3u8-variant' or ext == 'm3u8':
# For some videos the initial m3u8 URL returns JSON instead
manifest_json = self._download_json(
- video_url, video_id, note='Downloading manifest JSON', errnote=False)
+ video_url, video_id, note='Downloading manifest JSON', fatal=False)
if manifest_json:
video_url = manifest_json.get('master_m3u8')
if not video_url:
@@ -392,14 +392,6 @@ class AnvatoIE(InfoExtractor):
url = smuggle_url(url, {'token': anvplayer_data['token']})
yield cls.url_result(url, AnvatoIE, video_id)
- def _extract_anvato_videos(self, webpage, video_id):
- anvplayer_data = self._parse_json(
- self._html_search_regex(
- self._ANVP_RE, webpage, 'Anvato player data', group='anvp'),
- video_id)
- return self._get_anvato_videos(
- anvplayer_data['accessKey'], anvplayer_data['video'], 'default') # cbslocal token = 'default'
-
def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {})
self._initialize_geo_bypass({
diff --git a/hypervideo_dl/extractor/archiveorg.py b/hypervideo_dl/extractor/archiveorg.py
index 90dda9f..2541cd6 100644
--- a/hypervideo_dl/extractor/archiveorg.py
+++ b/hypervideo_dl/extractor/archiveorg.py
@@ -3,12 +3,14 @@ import re
import urllib.parse
from .common import InfoExtractor
+from .naver import NaverBaseIE
from .youtube import YoutubeBaseInfoExtractor, YoutubeIE
-from ..compat import compat_HTTPError, compat_urllib_parse_unquote
+from ..compat import compat_urllib_parse_unquote
+from ..networking import HEADRequest
+from ..networking.exceptions import HTTPError
from ..utils import (
KNOWN_EXTENSIONS,
ExtractorError,
- HEADRequest,
bug_reports_message,
clean_html,
dict_get,
@@ -897,7 +899,7 @@ class YoutubeWebArchiveIE(InfoExtractor):
video_id, note='Fetching archived video file url', expected_status=True)
except ExtractorError as e:
# HTTP Error 404 is expected if the video is not saved.
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 404:
self.raise_no_formats(
'The requested video is not archived, indexed, or there is an issue with web.archive.org (try again later)', expected=True)
else:
@@ -924,7 +926,7 @@ class YoutubeWebArchiveIE(InfoExtractor):
info['thumbnails'] = self._extract_thumbnails(video_id)
if urlh:
- url = compat_urllib_parse_unquote(urlh.geturl())
+ url = compat_urllib_parse_unquote(urlh.url)
video_file_url_qs = parse_qs(url)
# Attempt to recover any ext & format info from playback url & response headers
format = {'url': url, 'filesize': int_or_none(urlh.headers.get('x-archive-orig-content-length'))}
@@ -945,3 +947,237 @@ class YoutubeWebArchiveIE(InfoExtractor):
if not info.get('title'):
info['title'] = video_id
return info
+
+
+class VLiveWebArchiveIE(InfoExtractor):
+ IE_NAME = 'web.archive:vlive'
+ IE_DESC = 'web.archive.org saved vlive videos'
+ _VALID_URL = r'''(?x)
+ (?:https?://)?web\.archive\.org/
+ (?:web/)?(?:(?P<date>[0-9]{14})?[0-9A-Za-z_*]*/)? # /web and the version index is optional
+ (?:https?(?::|%3[Aa])//)?(?:
+ (?:(?:www|m)\.)?vlive\.tv(?::(?:80|443))?/(?:video|embed)/(?P<id>[0-9]+) # VLive URL
+ )
+ '''
+ _TESTS = [{
+ 'url': 'https://web.archive.org/web/20221221144331/http://www.vlive.tv/video/1326',
+ 'md5': 'cc7314812855ce56de70a06a27314983',
+ 'info_dict': {
+ 'id': '1326',
+ 'ext': 'mp4',
+ 'title': "Girl's Day's Broadcast",
+ 'creator': "Girl's Day",
+ 'view_count': int,
+ 'uploader_id': 'muploader_a',
+ 'uploader_url': None,
+ 'uploader': None,
+ 'upload_date': '20150817',
+ 'thumbnail': r're:^https?://.*\.(?:jpg|png)$',
+ 'timestamp': 1439816449,
+ 'like_count': int,
+ 'channel': 'Girl\'s Day',
+ 'channel_id': 'FDF27',
+ 'comment_count': int,
+ 'release_timestamp': 1439818140,
+ 'release_date': '20150817',
+ 'duration': 1014,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://web.archive.org/web/20221221182103/http://www.vlive.tv/video/16937',
+ 'info_dict': {
+ 'id': '16937',
+ 'ext': 'mp4',
+ 'title': '첸백시 걍방',
+ 'creator': 'EXO',
+ 'view_count': int,
+ 'subtitles': 'mincount:12',
+ 'uploader_id': 'muploader_j',
+ 'uploader_url': 'http://vlive.tv',
+ 'uploader': None,
+ 'upload_date': '20161112',
+ 'thumbnail': r're:^https?://.*\.(?:jpg|png)$',
+ 'timestamp': 1478923074,
+ 'like_count': int,
+ 'channel': 'EXO',
+ 'channel_id': 'F94BD',
+ 'comment_count': int,
+ 'release_timestamp': 1478924280,
+ 'release_date': '20161112',
+ 'duration': 906,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://web.archive.org/web/20221127190050/http://www.vlive.tv/video/101870',
+ 'info_dict': {
+ 'id': '101870',
+ 'ext': 'mp4',
+ 'title': '[ⓓ xV] “레벨이들 매력에 반해? 안 반해?” 움직이는 HD 포토 (레드벨벳:Red Velvet)',
+ 'creator': 'Dispatch',
+ 'view_count': int,
+ 'subtitles': 'mincount:6',
+ 'uploader_id': 'V__FRA08071',
+ 'uploader_url': 'http://vlive.tv',
+ 'uploader': None,
+ 'upload_date': '20181130',
+ 'thumbnail': r're:^https?://.*\.(?:jpg|png)$',
+ 'timestamp': 1543601327,
+ 'like_count': int,
+ 'channel': 'Dispatch',
+ 'channel_id': 'C796F3',
+ 'comment_count': int,
+ 'release_timestamp': 1543601040,
+ 'release_date': '20181130',
+ 'duration': 279,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }]
+
+ # The wayback machine has special timestamp and "mode" values:
+ # timestamp:
+ # 1 = the first capture
+ # 2 = the last capture
+ # mode:
+ # id_ = Identity - perform no alterations of the original resource, return it as it was archived.
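+ # e.g. https://web.archive.org/web/2id_/https://www.vlive.tv/video/1326 returns the latest capture of that page unaltered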
+ _WAYBACK_BASE_URL = 'https://web.archive.org/web/2id_/'
+
+ def _download_archived_page(self, url, video_id, *, timestamp='2', **kwargs):
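+ # wayback responses can fail transiently, so retry; a hard 404 means the page was never archived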
+ for retry in self.RetryManager():
+ try:
+ return self._download_webpage(f'https://web.archive.org/web/{timestamp}id_/{url}', video_id, **kwargs)
+ except ExtractorError as e:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 404:
+ raise ExtractorError('Page was not archived', expected=True)
+ retry.error = e
+ continue
+
+ def _download_archived_json(self, url, video_id, **kwargs):
+ page = self._download_archived_page(url, video_id, **kwargs)
+ if not page:
+ raise ExtractorError('Page was not archived', expected=True)
+ else:
+ return self._parse_json(page, video_id)
+
+ def _extract_formats_from_m3u8(self, m3u8_url, params, video_id):
+ m3u8_doc = self._download_archived_page(m3u8_url, video_id, note='Downloading m3u8', query=params, fatal=False)
+ if not m3u8_doc:
+ return
+
+ # Rewrite the m3u8 so that segment URLs point at the archive domain
+ m3u8_doc = m3u8_doc.splitlines()
+ url_base = m3u8_url.rsplit('/', 1)[0]
+ first_segment = None
+ for i, line in enumerate(m3u8_doc):
+ if not line.startswith('#'):
+ m3u8_doc[i] = f'{self._WAYBACK_BASE_URL}{url_base}/{line}?{urllib.parse.urlencode(params)}'
+ first_segment = first_segment or m3u8_doc[i]
+
+ # Segments may not have been archived. See https://web.archive.org/web/20221127190050/http://www.vlive.tv/video/101870
+ urlh = self._request_webpage(HEADRequest(first_segment), video_id, errnote=False,
+ fatal=False, note='Check first segment availability')
+ if urlh:
+ formats, subtitles = self._parse_m3u8_formats_and_subtitles('\n'.join(m3u8_doc), ext='mp4', video_id=video_id)
+ if subtitles:
+ self._report_ignoring_subs('m3u8')
+ return formats
+
+ # Closely follows the logic of the ArchiveTeam grab script
+ # See: https://github.com/ArchiveTeam/vlive-grab/blob/master/vlive.lua
+ def _real_extract(self, url):
+ video_id, url_date = self._match_valid_url(url).group('id', 'date')
+
+ webpage = self._download_archived_page(f'https://www.vlive.tv/video/{video_id}', video_id, timestamp=url_date)
+
+ player_info = self._search_json(r'__PRELOADED_STATE__\s*=', webpage, 'player info', video_id)
+ user_country = traverse_obj(player_info, ('common', 'userCountry'))
+
+ main_script_url = self._search_regex(r'<script\s+src="([^"]+/js/main\.[^"]+\.js)"', webpage, 'main script url')
+ main_script = self._download_archived_page(main_script_url, video_id, note='Downloading main script')
+ app_id = self._search_regex(r'appId\s*=\s*"([^"]+)"', main_script, 'app id')
+
+ inkey = self._download_archived_json(
+ f'https://www.vlive.tv/globalv-web/vam-web/video/v1.0/vod/{video_id}/inkey', video_id, note='Fetching inkey', query={
+ 'appId': app_id,
+ 'platformType': 'PC',
+ 'gcc': user_country,
+ 'locale': 'en_US',
+ }, fatal=False)
+
+ vod_id = traverse_obj(player_info, ('postDetail', 'post', 'officialVideo', 'vodId'))
+
+ vod_data = self._download_archived_json(
+ f'https://apis.naver.com/rmcnmv/rmcnmv/vod/play/v2.0/{vod_id}', video_id, note='Fetching vod data', query={
+ 'key': inkey.get('inkey'),
+ 'pid': 'rmcPlayer_16692457559726800', # partly Unix time, partly random; fixed value used by the ArchiveTeam project
+ 'sid': '2024',
+ 'ver': '2.0',
+ 'devt': 'html5_pc',
+ 'doct': 'json',
+ 'ptc': 'https',
+ 'sptc': 'https',
+ 'cpt': 'vtt',
+ 'ctls': '%7B%22visible%22%3A%7B%22fullscreen%22%3Atrue%2C%22logo%22%3Afalse%2C%22playbackRate%22%3Afalse%2C%22scrap%22%3Afalse%2C%22playCount%22%3Atrue%2C%22commentCount%22%3Atrue%2C%22title%22%3Atrue%2C%22writer%22%3Atrue%2C%22expand%22%3Afalse%2C%22subtitles%22%3Atrue%2C%22thumbnails%22%3Atrue%2C%22quality%22%3Atrue%2C%22setting%22%3Atrue%2C%22script%22%3Afalse%2C%22logoDimmed%22%3Atrue%2C%22badge%22%3Atrue%2C%22seekingTime%22%3Atrue%2C%22muted%22%3Atrue%2C%22muteButton%22%3Afalse%2C%22viewerNotice%22%3Afalse%2C%22linkCount%22%3Afalse%2C%22createTime%22%3Afalse%2C%22thumbnail%22%3Atrue%7D%2C%22clicked%22%3A%7B%22expand%22%3Afalse%2C%22subtitles%22%3Afalse%7D%7D',
+ 'pv': '4.26.9',
+ 'dr': '1920x1080',
+ 'cpl': 'en_US',
+ 'lc': 'en_US',
+ 'adi': '%5B%7B%22type%22%3A%22pre%22%2C%22exposure%22%3Afalse%2C%22replayExposure%22%3Afalse%7D%5D',
+ 'adu': '%2F',
+ 'videoId': vod_id,
+ 'cc': user_country,
+ })
+
+ formats = []
+
+ streams = traverse_obj(vod_data, ('streams', ...))
+ if len(streams) > 1:
+ self.report_warning('Multiple streams found. Only the first stream will be downloaded.')
+ stream = streams[0]
+
+ max_stream = max(
+ stream.get('videos') or [],
+ key=lambda v: traverse_obj(v, ('bitrate', 'video'), default=0), default=None)
+ if max_stream is not None:
+ params = {arg.get('name'): arg.get('value') for arg in stream.get('keys', []) if arg.get('type') == 'param'}
+ formats = self._extract_formats_from_m3u8(max_stream.get('source'), params, video_id) or []
+
+ # For parts of the project MP4 files were archived
+ max_video = max(
+ traverse_obj(vod_data, ('videos', 'list', ...)),
+ key=lambda v: traverse_obj(v, ('bitrate', 'video'), default=0), default=None)
+ if max_video is not None:
+ video_url = self._WAYBACK_BASE_URL + max_video.get('source')
+ urlh = self._request_webpage(HEADRequest(video_url), video_id, errnote=False,
+ fatal=False, note='Check video availability')
+ if urlh:
+ formats.append({'url': video_url})
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ **traverse_obj(player_info, ('postDetail', 'post', {
+ 'title': ('officialVideo', 'title', {str}),
+ 'creator': ('author', 'nickname', {str}),
+ 'channel': ('channel', 'channelName', {str}),
+ 'channel_id': ('channel', 'channelCode', {str}),
+ 'duration': ('officialVideo', 'playTime', {int_or_none}),
+ 'view_count': ('officialVideo', 'playCount', {int_or_none}),
+ 'like_count': ('officialVideo', 'likeCount', {int_or_none}),
+ 'comment_count': ('officialVideo', 'commentCount', {int_or_none}),
+ 'timestamp': ('officialVideo', 'createdAt', {lambda x: int_or_none(x, scale=1000)}),
+ 'release_timestamp': ('officialVideo', 'willStartAt', {lambda x: int_or_none(x, scale=1000)}),
+ })),
+ **traverse_obj(vod_data, ('meta', {
+ 'uploader_id': ('user', 'id', {str}),
+ 'uploader': ('user', 'name', {str}),
+ 'uploader_url': ('user', 'url', {url_or_none}),
+ 'thumbnail': ('cover', 'source', {url_or_none}),
+ }), expected_type=lambda x: x or None),
+ **NaverBaseIE.process_subtitles(vod_data, lambda x: [self._WAYBACK_BASE_URL + x]),
+ }
diff --git a/hypervideo_dl/extractor/ard.py b/hypervideo_dl/extractor/ard.py
index 0a8a874..ca1faa7 100644
--- a/hypervideo_dl/extractor/ard.py
+++ b/hypervideo_dl/extractor/ard.py
@@ -13,6 +13,7 @@ from ..utils import (
try_get,
unified_strdate,
unified_timestamp,
+ update_url,
update_url_query,
url_or_none,
xpath_text,
@@ -46,6 +47,9 @@ class ARDMediathekBaseIE(InfoExtractor):
subtitles['de'] = [{
'ext': 'ttml',
'url': subtitle_url,
+ }, {
+ 'ext': 'vtt',
+ 'url': subtitle_url.replace('/ebutt/', '/webvtt/') + '.vtt',
}]
return {
@@ -286,16 +290,16 @@ class ARDMediathekIE(ARDMediathekBaseIE):
class ARDIE(InfoExtractor):
_VALID_URL = r'(?P<mainurl>https?://(?:www\.)?daserste\.de/(?:[^/?#&]+/)+(?P<id>[^/?#&]+))\.html'
_TESTS = [{
- # available till 7.01.2022
- 'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-die-woche-video100.html',
- 'md5': '867d8aa39eeaf6d76407c5ad1bb0d4c1',
+ # available till 7.12.2023
+ 'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-video-424.html',
+ 'md5': 'a438f671e87a7eba04000336a119ccc4',
'info_dict': {
- 'id': 'maischberger-die-woche-video100',
- 'display_id': 'maischberger-die-woche-video100',
+ 'id': 'maischberger-video-424',
+ 'display_id': 'maischberger-video-424',
'ext': 'mp4',
- 'duration': 3687.0,
- 'title': 'maischberger. die woche vom 7. Januar 2021',
- 'upload_date': '20210107',
+ 'duration': 4452.0,
+ 'title': 'maischberger am 07.12.2022',
+ 'upload_date': '20221207',
'thumbnail': r're:^https?://.*\.jpg$',
},
}, {
@@ -405,6 +409,23 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
(?(playlist)/(?P<season>\d+)?/?(?:[?#]|$))'''
_TESTS = [{
+ 'url': 'https://www.ardmediathek.de/video/filme-im-mdr/wolfsland-die-traurigen-schwestern/mdr-fernsehen/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy8xZGY0ZGJmZS00ZWQwLTRmMGItYjhhYy0wOGQ4ZmYxNjVhZDI',
+ 'md5': '3fd5fead7a370a819341129c8d713136',
+ 'info_dict': {
+ 'display_id': 'filme-im-mdr/wolfsland-die-traurigen-schwestern/mdr-fernsehen',
+ 'id': '12172961',
+ 'title': 'Wolfsland - Die traurigen Schwestern',
+ 'description': r're:^Als der Polizeiobermeister Raaben',
+ 'duration': 5241,
+ 'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:efa186f7b0054957',
+ 'timestamp': 1670710500,
+ 'upload_date': '20221210',
+ 'ext': 'mp4',
+ 'age_limit': 12,
+ 'episode': 'Wolfsland - Die traurigen Schwestern',
+ 'series': 'Filme im MDR'
+ },
+ }, {
'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/',
'md5': 'a1dc75a39c61601b980648f7c9f9f71d',
'info_dict': {
@@ -421,7 +442,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
'skip': 'Error',
}, {
'url': 'https://www.ardmediathek.de/video/tagesschau-oder-tagesschau-20-00-uhr/das-erste/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhZ2Vzc2NoYXUvZmM4ZDUxMjgtOTE0ZC00Y2MzLTgzNzAtNDZkNGNiZWJkOTll',
- 'md5': 'f1837e563323b8a642a8ddeff0131f51',
+ 'md5': '1e73ded21cb79bac065117e80c81dc88',
'info_dict': {
'id': '10049223',
'ext': 'mp4',
@@ -429,13 +450,11 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
'timestamp': 1636398000,
'description': 'md5:39578c7b96c9fe50afdf5674ad985e6b',
'upload_date': '20211108',
- },
- }, {
- 'url': 'https://www.ardmediathek.de/sendung/beforeigners/beforeigners/staffel-1/Y3JpZDovL2Rhc2Vyc3RlLmRlL2JlZm9yZWlnbmVycw/1',
- 'playlist_count': 6,
- 'info_dict': {
- 'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL2JlZm9yZWlnbmVycw',
- 'title': 'beforeigners/beforeigners/staffel-1',
+ 'display_id': 'tagesschau-oder-tagesschau-20-00-uhr/das-erste',
+ 'duration': 915,
+ 'episode': 'tagesschau, 20:00 Uhr',
+ 'series': 'tagesschau',
+ 'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:fbb21142783b0a49',
},
}, {
'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
@@ -599,6 +618,9 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
show {
title
}
+ image {
+ src
+ }
synopsis
title
tracking {
@@ -637,6 +659,15 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
'description': description,
'timestamp': unified_timestamp(player_page.get('broadcastedOn')),
'series': try_get(player_page, lambda x: x['show']['title']),
+ 'thumbnail': (media_collection.get('_previewImage')
+ or try_get(player_page, lambda x: update_url(x['image']['src'], query=None, fragment=None))
+ or self.get_thumbnail_from_html(display_id, url)),
})
info.update(self._ARD_extract_episode_info(info['title']))
return info
+
+ def get_thumbnail_from_html(self, display_id, url):
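+ # last-resort thumbnail lookup: re-fetch the page and read the Open Graph / thumbnailUrl meta tags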
+ webpage = self._download_webpage(url, display_id, fatal=False) or ''
+ return (
+ self._og_search_thumbnail(webpage, default=None)
+ or self._html_search_meta('thumbnailUrl', webpage, default=None))
diff --git a/hypervideo_dl/extractor/arte.py b/hypervideo_dl/extractor/arte.py
index 54e4d2d..e3cc5af 100644
--- a/hypervideo_dl/extractor/arte.py
+++ b/hypervideo_dl/extractor/arte.py
@@ -65,6 +65,21 @@ class ArteTVIE(ArteTVBaseIE):
}, {
'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE',
'only_matching': True,
+ }, {
+ 'url': 'https://www.arte.tv/de/videos/110203-006-A/zaz/',
+ 'info_dict': {
+ 'id': '110203-006-A',
+ 'chapters': 'count:16',
+ 'description': 'md5:cf592f1df52fe52007e3f8eac813c084',
+ 'alt_title': 'Zaz',
+ 'title': 'Baloise Session 2022',
+ 'timestamp': 1668445200,
+ 'duration': 4054,
+ 'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/ubQjmVCGyRx3hmBuZEK9QZ/940x530',
+ 'upload_date': '20221114',
+ 'ext': 'mp4',
+ },
+ 'expected_warnings': ['geo restricted']
}]
_GEO_BYPASS = True
@@ -180,10 +195,6 @@ class ArteTVIE(ArteTVBaseIE):
else:
self.report_warning(f'Skipping stream with unknown protocol {stream["protocol"]}')
- # TODO: chapters from stream['segments']?
- # The JS also looks for chapters in config['data']['attributes']['chapters'],
- # but I am yet to find a video having those
-
formats.extend(secondary_formats)
self._remove_duplicate_formats(formats)
@@ -205,6 +216,11 @@ class ArteTVIE(ArteTVBaseIE):
{'url': image['url'], 'id': image.get('caption')}
for image in metadata.get('images') or [] if url_or_none(image.get('url'))
],
+ # TODO: chapters may also be in stream['segments']?
+ 'chapters': traverse_obj(config, ('data', 'attributes', 'chapters', 'elements', ..., {
+ 'start_time': 'startTime',
+ 'title': 'title',
+ })) or None,
}
diff --git a/hypervideo_dl/extractor/atresplayer.py b/hypervideo_dl/extractor/atresplayer.py
index a20e7f9..3a44e52 100644
--- a/hypervideo_dl/extractor/atresplayer.py
+++ b/hypervideo_dl/extractor/atresplayer.py
@@ -1,5 +1,5 @@
from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
int_or_none,
@@ -34,8 +34,8 @@ class AtresPlayerIE(InfoExtractor):
_API_BASE = 'https://api.atresplayer.com/'
def _handle_error(self, e, code):
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == code:
- error = self._parse_json(e.cause.read(), None)
+ if isinstance(e.cause, HTTPError) and e.cause.status == code:
+ error = self._parse_json(e.cause.response.read(), None)
if error.get('error') == 'required_registered':
self.raise_login_required()
raise ExtractorError(error['error_description'], expected=True)
diff --git a/hypervideo_dl/extractor/bandcamp.py b/hypervideo_dl/extractor/bandcamp.py
index de81e0d..e89b3a6 100644
--- a/hypervideo_dl/extractor/bandcamp.py
+++ b/hypervideo_dl/extractor/bandcamp.py
@@ -29,11 +29,18 @@ class BandcampIE(InfoExtractor):
'info_dict': {
'id': '1812978515',
'ext': 'mp3',
- 'title': "youtube-dl \"'/\\ä↭ - youtube-dl \"'/\\ä↭ - youtube-dl test song \"'/\\ä↭",
+ 'title': 'youtube-dl "\'/\\ä↭ - youtube-dl "\'/\\ä↭ - youtube-dl test song "\'/\\ä↭',
'duration': 9.8485,
- 'uploader': 'youtube-dl "\'/\\ä↭',
+ 'uploader': 'youtube-dl "\'/\\ä↭',
'upload_date': '20121129',
'timestamp': 1354224127,
+ 'track': 'youtube-dl "\'/\\ä↭ - youtube-dl test song "\'/\\ä↭',
+ 'album_artist': 'youtube-dl "\'/\\ä↭',
+ 'track_id': '1812978515',
+ 'artist': 'youtube-dl "\'/\\ä↭',
+ 'uploader_url': 'https://youtube-dl.bandcamp.com',
+ 'uploader_id': 'youtube-dl',
+ 'thumbnail': 'https://f4.bcbits.com/img/a3216802731_5.jpg',
},
'_skip': 'There is a limit of 200 free downloads / month for the test song'
}, {
@@ -41,7 +48,8 @@ class BandcampIE(InfoExtractor):
'url': 'http://benprunty.bandcamp.com/track/lanius-battle',
'info_dict': {
'id': '2650410135',
- 'ext': 'aiff',
+ 'ext': 'm4a',
+ 'acodec': r're:[fa]lac',
'title': 'Ben Prunty - Lanius (Battle)',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'Ben Prunty',
@@ -54,7 +62,10 @@ class BandcampIE(InfoExtractor):
'track_number': 1,
'track_id': '2650410135',
'artist': 'Ben Prunty',
+ 'album_artist': 'Ben Prunty',
'album': 'FTL: Advanced Edition Soundtrack',
+ 'uploader_url': 'https://benprunty.bandcamp.com',
+ 'uploader_id': 'benprunty',
},
}, {
# no free download, mp3 128
@@ -75,7 +86,34 @@ class BandcampIE(InfoExtractor):
'track_number': 5,
'track_id': '2584466013',
'artist': 'Mastodon',
+ 'album_artist': 'Mastodon',
'album': 'Call of the Mastodon',
+ 'uploader_url': 'https://relapsealumni.bandcamp.com',
+ 'uploader_id': 'relapsealumni',
+ },
+ }, {
+ # track from compilation album (artist/album_artist difference)
+ 'url': 'https://diskotopia.bandcamp.com/track/safehouse',
+ 'md5': '19c5337bca1428afa54129f86a2f6a69',
+ 'info_dict': {
+ 'id': '1978174799',
+ 'ext': 'mp3',
+ 'title': 'submerse - submerse - Safehouse',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'uploader': 'submerse',
+ 'timestamp': 1480779297,
+ 'upload_date': '20161203',
+ 'release_timestamp': 1481068800,
+ 'release_date': '20161207',
+ 'duration': 154.066,
+ 'track': 'submerse - Safehouse',
+ 'track_number': 3,
+ 'track_id': '1978174799',
+ 'artist': 'submerse',
+ 'album_artist': 'Diskotopia',
+ 'album': 'DSK F/W 2016-2017 Free Compilation',
+ 'uploader_url': 'https://diskotopia.bandcamp.com',
+ 'uploader_id': 'diskotopia',
},
}]
@@ -121,6 +159,9 @@ class BandcampIE(InfoExtractor):
embed = self._extract_data_attr(webpage, title, 'embed', False)
current = tralbum.get('current') or {}
artist = embed.get('artist') or current.get('artist') or tralbum.get('artist')
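+ # the album artist does not appear to be exposed in the embed/tralbum data, so scrape it from the album-header markup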
+ album_artist = self._html_search_regex(
+ r'<h3 class="albumTitle">[\S\s]*?by\s*<span>\s*<a href="[^>]+">\s*([^>]+?)\s*</a>',
+ webpage, 'album artist', fatal=False)
timestamp = unified_timestamp(
current.get('publish_date') or tralbum.get('album_publish_date'))
@@ -205,6 +246,7 @@ class BandcampIE(InfoExtractor):
'track_id': track_id,
'artist': artist,
'album': embed.get('album_title'),
+ 'album_artist': album_artist,
'formats': formats,
}
diff --git a/hypervideo_dl/extractor/bbc.py b/hypervideo_dl/extractor/bbc.py
index 9d28e70..a55cdef 100644
--- a/hypervideo_dl/extractor/bbc.py
+++ b/hypervideo_dl/extractor/bbc.py
@@ -2,11 +2,11 @@ import functools
import itertools
import json
import re
-import urllib.error
import xml.etree.ElementTree
from .common import InfoExtractor
-from ..compat import compat_HTTPError, compat_str, compat_urlparse
+from ..compat import compat_str, compat_urlparse
+from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
OnDemandPagedList,
@@ -277,7 +277,7 @@ class BBCCoUkIE(InfoExtractor):
post_url, None, 'Logging in', data=urlencode_postdata(login_form),
headers={'Referer': self._LOGIN_URL})
- if self._LOGIN_URL in urlh.geturl():
+ if self._LOGIN_URL in urlh.url:
error = clean_html(get_element_by_class('form-message', response))
if error:
raise ExtractorError(
@@ -388,8 +388,8 @@ class BBCCoUkIE(InfoExtractor):
href, programme_id, ext='mp4', entry_protocol='m3u8_native',
m3u8_id=format_id, fatal=False)
except ExtractorError as e:
- if not (isinstance(e.exc_info[1], urllib.error.HTTPError)
- and e.exc_info[1].code in (403, 404)):
+ if not (isinstance(e.exc_info[1], HTTPError)
+ and e.exc_info[1].status in (403, 404)):
raise
fmts = []
formats.extend(fmts)
@@ -472,7 +472,7 @@ class BBCCoUkIE(InfoExtractor):
return programme_id, title, description, duration, formats, subtitles
except ExtractorError as ee:
- if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
+ if not (isinstance(ee.cause, HTTPError) and ee.cause.status == 404):
raise
# fallback to legacy playlist
@@ -983,7 +983,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
# Some playlist URL may fail with 500, at the same time
# the other one may work fine (e.g.
# http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 500:
continue
raise
if entry:
diff --git a/hypervideo_dl/extractor/beatbump.py b/hypervideo_dl/extractor/beatbump.py
new file mode 100644
index 0000000..0f40ebe
--- /dev/null
+++ b/hypervideo_dl/extractor/beatbump.py
@@ -0,0 +1,101 @@
+from .common import InfoExtractor
+from .youtube import YoutubeIE, YoutubeTabIE
+
+
+class BeatBumpVideoIE(InfoExtractor):
+ _VALID_URL = r'https://beatbump\.ml/listen\?id=(?P<id>[\w-]+)'
+ _TESTS = [{
+ 'url': 'https://beatbump.ml/listen?id=MgNrAu2pzNs',
+ 'md5': '5ff3fff41d3935b9810a9731e485fe66',
+ 'info_dict': {
+ 'id': 'MgNrAu2pzNs',
+ 'ext': 'mp4',
+ 'uploader_url': 'http://www.youtube.com/channel/UC-pWHpBjdGG69N9mM2auIAA',
+ 'artist': 'Stephen',
+ 'thumbnail': 'https://i.ytimg.com/vi_webp/MgNrAu2pzNs/maxresdefault.webp',
+ 'channel_url': 'https://www.youtube.com/channel/UC-pWHpBjdGG69N9mM2auIAA',
+ 'upload_date': '20190312',
+ 'categories': ['Music'],
+ 'playable_in_embed': True,
+ 'duration': 169,
+ 'like_count': int,
+ 'alt_title': 'Voyeur Girl',
+ 'view_count': int,
+ 'track': 'Voyeur Girl',
+ 'uploader': 'Stephen - Topic',
+ 'title': 'Voyeur Girl',
+ 'channel_follower_count': int,
+ 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
+ 'age_limit': 0,
+ 'availability': 'public',
+ 'live_status': 'not_live',
+ 'album': 'it\'s too much love to know my dear',
+ 'channel': 'Stephen',
+ 'comment_count': int,
+ 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
+ 'tags': 'count:11',
+ 'creator': 'Stephen',
+ 'channel_id': 'UC-pWHpBjdGG69N9mM2auIAA',
+ }
+ }]
+
+ def _real_extract(self, url):
+ id_ = self._match_id(url)
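+ # BeatBump is an alternative front-end for YouTube Music, so delegate extraction to the YouTube extractor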
+ return self.url_result(f'https://music.youtube.com/watch?v={id_}', YoutubeIE, id_)
+
+
+class BeatBumpPlaylistIE(InfoExtractor):
+ _VALID_URL = r'https://beatbump\.ml/(?:release\?id=|artist/|playlist/)(?P<id>[\w-]+)'
+ _TESTS = [{
+ 'url': 'https://beatbump.ml/release?id=MPREb_gTAcphH99wE',
+ 'playlist_count': 50,
+ 'info_dict': {
+ 'id': 'OLAK5uy_l1m0thk3g31NmIIz_vMIbWtyv7eZixlH0',
+ 'availability': 'unlisted',
+ 'view_count': int,
+ 'title': 'Album - Royalty Free Music Library V2 (50 Songs)',
+ 'description': '',
+ 'tags': [],
+ 'modified_date': '20221223',
+ }
+ }, {
+ 'url': 'https://beatbump.ml/artist/UC_aEa8K-EOJ3D6gOs7HcyNg',
+ 'playlist_mincount': 1,
+ 'params': {'flatplaylist': True},
+ 'info_dict': {
+ 'id': 'UC_aEa8K-EOJ3D6gOs7HcyNg',
+ 'uploader_url': 'https://www.youtube.com/channel/UC_aEa8K-EOJ3D6gOs7HcyNg',
+ 'channel_url': 'https://www.youtube.com/channel/UC_aEa8K-EOJ3D6gOs7HcyNg',
+ 'uploader_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg',
+ 'channel_follower_count': int,
+ 'title': 'NoCopyrightSounds - Videos',
+ 'uploader': 'NoCopyrightSounds',
+ 'description': 'md5:cd4fd53d81d363d05eee6c1b478b491a',
+ 'channel': 'NoCopyrightSounds',
+ 'tags': 'count:12',
+ 'channel_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg',
+ },
+ }, {
+ 'url': 'https://beatbump.ml/playlist/VLPLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq',
+ 'playlist_mincount': 1,
+ 'params': {'flatplaylist': True},
+ 'info_dict': {
+ 'id': 'PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq',
+ 'uploader_url': 'https://www.youtube.com/@NoCopyrightSounds',
+ 'description': 'Providing you with copyright free / safe music for gaming, live streaming, studying and more!',
+ 'view_count': int,
+ 'channel_url': 'https://www.youtube.com/@NoCopyrightSounds',
+ 'uploader_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg',
+ 'title': 'NCS : All Releases 💿',
+ 'uploader': 'NoCopyrightSounds',
+ 'availability': 'public',
+ 'channel': 'NoCopyrightSounds',
+ 'tags': [],
+ 'modified_date': '20221225',
+ 'channel_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg',
+ }
+ }]
+
+ def _real_extract(self, url):
+ id_ = self._match_id(url)
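+ # releases, artists and playlists all correspond to YouTube Music browse pages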
+ return self.url_result(f'https://music.youtube.com/browse/{id_}', YoutubeTabIE, id_)
diff --git a/hypervideo_dl/extractor/bfmtv.py b/hypervideo_dl/extractor/bfmtv.py
index d86d283..a7be0e6 100644
--- a/hypervideo_dl/extractor/bfmtv.py
+++ b/hypervideo_dl/extractor/bfmtv.py
@@ -5,7 +5,7 @@ from ..utils import extract_attributes
class BFMTVBaseIE(InfoExtractor):
- _VALID_URL_BASE = r'https?://(?:www\.)?bfmtv\.com/'
+ _VALID_URL_BASE = r'https?://(?:www\.|rmc\.)?bfmtv\.com/'
_VALID_URL_TMPL = _VALID_URL_BASE + r'(?:[^/]+/)*[^/?&#]+_%s[A-Z]-(?P<id>\d{12})\.html'
_VIDEO_BLOCK_REGEX = r'(<div[^>]+class="video_block"[^>]*>)'
BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s'
@@ -31,6 +31,9 @@ class BFMTVIE(BFMTVBaseIE):
'uploader_id': '876450610001',
'upload_date': '20201002',
'timestamp': 1601629620,
+ 'duration': 44.757,
+ 'tags': ['bfmactu', 'politique'],
+ 'thumbnail': 'https://cf-images.eu-west-1.prod.boltdns.net/v1/static/876450610001/5041f4c1-bc48-4af8-a256-1b8300ad8ef0/cf2f9114-e8e2-4494-82b4-ab794ea4bc7d/1920x1080/match/image.jpg',
},
}]
@@ -81,6 +84,20 @@ class BFMTVArticleIE(BFMTVBaseIE):
}, {
'url': 'https://www.bfmtv.com/sante/covid-19-oui-le-vaccin-de-pfizer-distribue-en-france-a-bien-ete-teste-sur-des-personnes-agees_AN-202101060275.html',
'only_matching': True,
+ }, {
+ 'url': 'https://rmc.bfmtv.com/actualites/societe/transports/ce-n-est-plus-tout-rentable-le-bioethanol-e85-depasse-1eu-le-litre-des-automobilistes-regrettent_AV-202301100268.html',
+ 'info_dict': {
+ 'id': '6318445464112',
+ 'ext': 'mp4',
+ 'title': 'Le plein de bioéthanol fait de plus en plus mal à la pompe',
+ 'description': None,
+ 'uploader_id': '876630703001',
+ 'upload_date': '20230110',
+ 'timestamp': 1673341692,
+ 'duration': 109.269,
+ 'tags': ['rmc', 'show', 'apolline de malherbe', 'info', 'talk', 'matinale', 'radio'],
+ 'thumbnail': 'https://cf-images.eu-west-1.prod.boltdns.net/v1/static/876630703001/5bef74b8-9d5e-4480-a21f-60c2e2480c46/96c88b74-f9db-45e1-8040-e199c5da216c/1920x1080/match/image.jpg'
+ }
}]
def _real_extract(self, url):
diff --git a/hypervideo_dl/extractor/bibeltv.py b/hypervideo_dl/extractor/bibeltv.py
index fd20aad..34464da 100644
--- a/hypervideo_dl/extractor/bibeltv.py
+++ b/hypervideo_dl/extractor/bibeltv.py
@@ -1,27 +1,197 @@
+from functools import partial
+
from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ clean_html,
+ determine_ext,
+ format_field,
+ int_or_none,
+ js_to_json,
+ orderedSet,
+ parse_iso8601,
+ traverse_obj,
+ url_or_none,
+)
+
+
+class BibelTVBaseIE(InfoExtractor):
+ _GEO_COUNTRIES = ['AT', 'CH', 'DE']
+ _GEO_BYPASS = False
+
+ API_URL = 'https://www.bibeltv.de/mediathek/api'
+ AUTH_TOKEN = 'j88bRXY8DsEqJ9xmTdWhrByVi5Hm'
+
+ def _extract_formats_and_subtitles(self, data, crn_id, *, is_live=False):
+ formats = []
+ subtitles = {}
+ for media_url in traverse_obj(data, (..., 'src', {url_or_none})):
+ media_ext = determine_ext(media_url)
+ if media_ext == 'm3u8':
+ m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
+ media_url, crn_id, live=is_live)
+ formats.extend(m3u8_formats)
+ subtitles.update(m3u8_subs)
+ elif media_ext == 'mpd':
+ mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles(media_url, crn_id)
+ formats.extend(mpd_formats)
+ subtitles.update(mpd_subs)
+ elif media_ext == 'mp4':
+ formats.append({'url': media_url})
+ else:
+ self.report_warning(f'Unknown format {media_ext!r}')
+
+ return formats, subtitles
+
+ @staticmethod
+ def _extract_base_info(data):
+ return {
+ 'id': data['crn'],
+ **traverse_obj(data, {
+ 'title': 'title',
+ 'description': 'description',
+ 'duration': ('duration', {partial(int_or_none, scale=1000)}),
+ 'timestamp': ('schedulingStart', {parse_iso8601}),
+ 'season_number': 'seasonNumber',
+ 'episode_number': 'episodeNumber',
+ 'view_count': 'viewCount',
+ 'like_count': 'likeCount',
+ }),
+ 'thumbnails': orderedSet(traverse_obj(data, ('images', ..., {
+ 'url': ('url', {url_or_none}),
+ }))),
+ }
+
+ def _extract_url_info(self, data):
+ return {
+ '_type': 'url',
+ 'url': format_field(data, 'slug', 'https://www.bibeltv.de/mediathek/videos/%s'),
+ **self._extract_base_info(data),
+ }
+
+ def _extract_video_info(self, data):
+ crn_id = data['crn']
+ if data.get('drm'):
+ self.report_drm(crn_id)
+
+ json_data = self._download_json(
+ format_field(data, 'id', f'{self.API_URL}/video/%s'), crn_id,
+ headers={'Authorization': self.AUTH_TOKEN}, fatal=False,
+ errnote='No formats available') or {}
+
+ formats, subtitles = self._extract_formats_and_subtitles(
+ traverse_obj(json_data, ('video', 'videoUrls', ...)), crn_id)
+
+ return {
+ '_type': 'video',
+ **self._extract_base_info(data),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
+
+
+class BibelTVVideoIE(BibelTVBaseIE):
+ IE_DESC = 'BibelTV single video'
+ _VALID_URL = r'https?://(?:www\.)?bibeltv\.de/mediathek/videos/(?P<id>\d+)[\w-]+'
+ IE_NAME = 'bibeltv:video'
-class BibelTVIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?bibeltv\.de/mediathek/videos/(?:crn/)?(?P<id>\d+)'
_TESTS = [{
- 'url': 'https://www.bibeltv.de/mediathek/videos/329703-sprachkurs-in-malaiisch',
- 'md5': '252f908192d611de038b8504b08bf97f',
+ 'url': 'https://www.bibeltv.de/mediathek/videos/344436-alte-wege',
+ 'md5': 'ec1c07efe54353780512e8a4103b612e',
'info_dict': {
- 'id': 'ref:329703',
+ 'id': '344436',
'ext': 'mp4',
- 'title': 'Sprachkurs in Malaiisch',
- 'description': 'md5:3e9f197d29ee164714e67351cf737dfe',
- 'timestamp': 1608316701,
- 'uploader_id': '5840105145001',
- 'upload_date': '20201218',
- }
+ 'title': 'Alte Wege',
+ 'description': 'md5:2f4eb7294c9797a47b8fd13cccca22e9',
+ 'timestamp': 1677877071,
+ 'duration': 150.0,
+ 'upload_date': '20230303',
+ 'thumbnail': r're:https://bibeltv\.imgix\.net/[\w-]+\.jpg',
+ 'episode': 'Episode 1',
+ 'episode_number': 1,
+ 'view_count': int,
+ 'like_count': int,
+ },
+ 'params': {
+ 'format': '6',
+ },
+ }]
+
+ def _real_extract(self, url):
+ crn_id = self._match_id(url)
+ video_data = traverse_obj(
+ self._search_nextjs_data(self._download_webpage(url, crn_id), crn_id),
+ ('props', 'pageProps', 'videoPageData', 'videos', 0, {dict}))
+ if not video_data:
+ raise ExtractorError('Missing video data.')
+
+ return self._extract_video_info(video_data)
+
+
+class BibelTVSeriesIE(BibelTVBaseIE):
+ IE_DESC = 'BibelTV series playlist'
+ _VALID_URL = r'https?://(?:www\.)?bibeltv\.de/mediathek/serien/(?P<id>\d+)[\w-]+'
+ IE_NAME = 'bibeltv:series'
+
+ _TESTS = [{
+ 'url': 'https://www.bibeltv.de/mediathek/serien/333485-ein-wunder-fuer-jeden-tag',
+ 'playlist_mincount': 400,
+ 'info_dict': {
+ 'id': '333485',
+ 'title': 'Ein Wunder für jeden Tag',
+ 'description': 'Tägliche Kurzandacht mit Déborah Rosenkranz.',
+ },
+ }]
+
+ def _real_extract(self, url):
+ crn_id = self._match_id(url)
+ webpage = self._download_webpage(url, crn_id)
+ nextjs_data = self._search_nextjs_data(webpage, crn_id)
+ series_data = traverse_obj(nextjs_data, ('props', 'pageProps', 'seriePageData', {dict}))
+ if not series_data:
+ raise ExtractorError('Missing series data.')
+
+ return self.playlist_result(
+ traverse_obj(series_data, ('videos', ..., {dict}, {self._extract_url_info})),
+ crn_id, series_data.get('title'), clean_html(series_data.get('description')))
+
+
+class BibelTVLiveIE(BibelTVBaseIE):
+ IE_DESC = 'BibelTV live program'
+ _VALID_URL = r'https?://(?:www\.)?bibeltv\.de/livestreams/(?P<id>[\w-]+)'
+ IE_NAME = 'bibeltv:live'
+
+ _TESTS = [{
+ 'url': 'https://www.bibeltv.de/livestreams/bibeltv/',
+ 'info_dict': {
+ 'id': 'bibeltv',
+ 'ext': 'mp4',
+ 'title': 're:Bibel TV',
+ 'live_status': 'is_live',
+ 'thumbnail': 'https://streampreview.bibeltv.de/bibeltv.webp',
+ },
+ 'params': {'skip_download': 'm3u8'},
}, {
- 'url': 'https://www.bibeltv.de/mediathek/videos/crn/326374',
+ 'url': 'https://www.bibeltv.de/livestreams/impuls/',
'only_matching': True,
}]
- BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5840105145001/default_default/index.html?videoId=ref:%s'
def _real_extract(self, url):
- crn_id = self._match_id(url)
- return self.url_result(
- self.BRIGHTCOVE_URL_TEMPLATE % crn_id, 'BrightcoveNew')
+ stream_id = self._match_id(url)
+ webpage = self._download_webpage(url, stream_id)
+ stream_data = self._search_json(
+ r'\\"video\\":', webpage, 'bibeltvData', stream_id,
+ transform_source=lambda jstring: js_to_json(jstring.replace('\\"', '"')))
+
+ formats, subtitles = self._extract_formats_and_subtitles(
+ traverse_obj(stream_data, ('src', ...)), stream_id, is_live=True)
+
+ return {
+ 'id': stream_id,
+ 'title': stream_data.get('title'),
+ 'thumbnail': stream_data.get('poster'),
+ 'is_live': True,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
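
Note: BibelTVLiveIE above pulls the player config out of a backslash-escaped JSON string embedded in the page, which is why it unescapes `\"` before running `js_to_json`. A minimal sketch of that unescape-then-parse step on a made-up payload (plain `json.loads` suffices here because the sample is valid JSON; the extractor additionally applies `js_to_json` for JS-style values):

    import json

    escaped = r'{\"title\":\"Bibel TV\",\"poster\":\"https://streampreview.bibeltv.de/bibeltv.webp\"}'
    # undo the backslash-escaping the page applies to the embedded JSON
    data = json.loads(escaped.replace('\\"', '"'))
    assert data['title'] == 'Bibel TV'
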
diff --git a/hypervideo_dl/extractor/bilibili.py b/hypervideo_dl/extractor/bilibili.py
index bc04241..cb7ab2a 100644
--- a/hypervideo_dl/extractor/bilibili.py
+++ b/hypervideo_dl/extractor/bilibili.py
@@ -1,11 +1,14 @@
import base64
import functools
+import hashlib
import itertools
import math
-import urllib.error
+import time
import urllib.parse
from .common import InfoExtractor, SearchInfoExtractor
+from ..dependencies import Cryptodome
+from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
GeoRestrictedError,
@@ -15,14 +18,20 @@ from ..utils import (
float_or_none,
format_field,
int_or_none,
+ join_nonempty,
make_archive_id,
+ merge_dicts,
mimetype2ext,
parse_count,
parse_qs,
qualities,
+ smuggle_url,
srt_subtitles_timecode,
str_or_none,
traverse_obj,
+ try_call,
+ unified_timestamp,
+ unsmuggle_url,
url_or_none,
urlencode_postdata,
)
@@ -77,7 +86,7 @@ class BilibiliBaseIE(InfoExtractor):
f'{line["content"]}\n\n')
return srt_data
- def _get_subtitles(self, video_id, initial_state, cid):
+ def _get_subtitles(self, video_id, aid, cid):
subtitles = {
'danmaku': [{
'ext': 'xml',
@@ -85,7 +94,8 @@ class BilibiliBaseIE(InfoExtractor):
}]
}
- for s in traverse_obj(initial_state, ('videoData', 'subtitle', 'list')) or []:
+ video_info_json = self._download_json(f'https://api.bilibili.com/x/player/v2?aid={aid}&cid={cid}', video_id)
+ for s in traverse_obj(video_info_json, ('data', 'subtitle', 'subtitles', ...)):
subtitles.setdefault(s['lan'], []).append({
'ext': 'srt',
'data': self.json2srt(self._download_json(s['subtitle_url'], video_id))
@@ -126,9 +136,20 @@ class BilibiliBaseIE(InfoExtractor):
for children in map(self._get_all_children, traverse_obj(reply, ('replies', ...))):
yield from children
+ def _get_episodes_from_season(self, ss_id, url):
+ season_info = self._download_json(
+ 'https://api.bilibili.com/pgc/web/season/section', ss_id,
+ note='Downloading season info', query={'season_id': ss_id},
+ headers={'Referer': url, **self.geo_verification_headers()})
+
+ for entry in traverse_obj(season_info, (
+ 'result', 'main_section', 'episodes',
+ lambda _, v: url_or_none(v['share_url']) and v['id'])):
+ yield self.url_result(entry['share_url'], BiliBiliBangumiIE, f'ep{entry["id"]}')
+
class BiliBiliIE(BilibiliBaseIE):
- _VALID_URL = r'https?://www\.bilibili\.com/video/[aAbB][vV](?P<id>[^/?#&]+)'
+ _VALID_URL = r'https?://www\.bilibili\.com/(?:video/|festival/\w+\?(?:[^#]*&)?bvid=)[aAbB][vV](?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://www.bilibili.com/video/BV13x41117TL',
@@ -276,19 +297,60 @@ class BiliBiliIE(BilibiliBaseIE):
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
},
'params': {'skip_download': True},
+ }, {
+ 'note': 'video redirects to festival page',
+ 'url': 'https://www.bilibili.com/video/BV1wP4y1P72h',
+ 'info_dict': {
+ 'id': 'BV1wP4y1P72h',
+ 'ext': 'mp4',
+ 'title': '牛虎年相交之际,一首传统民族打击乐《牛斗虎》祝大家新春快乐,虎年大吉!【bilibili音乐虎闹新春】',
+ 'timestamp': 1643947497,
+ 'upload_date': '20220204',
+ 'description': 'md5:8681a0d4d2c06b4ae27e59c8080a7fe6',
+ 'uploader': '叨叨冯聊音乐',
+ 'duration': 246.719,
+ 'uploader_id': '528182630',
+ 'view_count': int,
+ 'like_count': int,
+ 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
+ },
+ 'params': {'skip_download': True},
+ }, {
+ 'note': 'newer festival video',
+ 'url': 'https://www.bilibili.com/festival/2023honkaiimpact3gala?bvid=BV1ay4y1d77f',
+ 'info_dict': {
+ 'id': 'BV1ay4y1d77f',
+ 'ext': 'mp4',
+ 'title': '【崩坏3新春剧场】为特别的你送上祝福!',
+ 'timestamp': 1674273600,
+ 'upload_date': '20230121',
+ 'description': 'md5:58af66d15c6a0122dc30c8adfd828dd8',
+ 'uploader': '果蝇轰',
+ 'duration': 1111.722,
+ 'uploader_id': '8469526',
+ 'view_count': int,
+ 'like_count': int,
+ 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
+ },
+ 'params': {'skip_download': True},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', video_id)
- play_info = self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id)['data']
- video_data = initial_state['videoData']
+ is_festival = 'videoData' not in initial_state
+ if is_festival:
+ video_data = initial_state['videoInfo']
+ else:
+ play_info = self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id)['data']
+ video_data = initial_state['videoData']
+
video_id, title = video_data['bvid'], video_data.get('title')
# Bilibili anthologies are similar to playlists but all videos share the same video ID as the anthology itself.
- page_list_json = traverse_obj(
+ page_list_json = not is_festival and traverse_obj(
self._download_json(
'https://api.bilibili.com/x/player/pagelist', video_id,
fatal=False, query={'bvid': video_id, 'jsonp': 'jsonp'},
@@ -303,106 +365,143 @@ class BiliBiliIE(BilibiliBaseIE):
getter=lambda entry: f'https://www.bilibili.com/video/{video_id}?p={entry["page"]}')
if is_anthology:
- title += f' p{part_id:02d} {traverse_obj(page_list_json, ((part_id or 1) - 1, "part")) or ""}'
+ part_id = part_id or 1
+ title += f' p{part_id:02d} {traverse_obj(page_list_json, (part_id - 1, "part")) or ""}'
aid = video_data.get('aid')
old_video_id = format_field(aid, None, f'%s_part{part_id or 1}')
cid = traverse_obj(video_data, ('pages', part_id - 1, 'cid')) if part_id else video_data.get('cid')
+ festival_info = {}
+ if is_festival:
+ play_info = self._download_json(
+ 'https://api.bilibili.com/x/player/playurl', video_id,
+ query={'bvid': video_id, 'cid': cid, 'fnval': 4048},
+ note='Extracting festival video formats')['data']
+
+ festival_info = traverse_obj(initial_state, {
+ 'uploader': ('videoInfo', 'upName'),
+ 'uploader_id': ('videoInfo', 'upMid', {str_or_none}),
+ 'like_count': ('videoStatus', 'like', {int_or_none}),
+ 'thumbnail': ('sectionEpisodes', lambda _, v: v['bvid'] == video_id, 'cover'),
+ }, get_all=False)
+
return {
+ **traverse_obj(initial_state, {
+ 'uploader': ('upData', 'name'),
+ 'uploader_id': ('upData', 'mid', {str_or_none}),
+ 'like_count': ('videoData', 'stat', 'like', {int_or_none}),
+ 'tags': ('tags', ..., 'tag_name'),
+ 'thumbnail': ('videoData', 'pic', {url_or_none}),
+ }),
+ **festival_info,
+ **traverse_obj(video_data, {
+ 'description': 'desc',
+ 'timestamp': ('pubdate', {int_or_none}),
+ 'view_count': (('viewCount', ('stat', 'view')), {int_or_none}),
+ 'comment_count': ('stat', 'reply', {int_or_none}),
+ }, get_all=False),
'id': f'{video_id}{format_field(part_id, None, "_p%d")}',
'formats': self.extract_formats(play_info),
'_old_archive_ids': [make_archive_id(self, old_video_id)] if old_video_id else None,
'title': title,
- 'description': traverse_obj(initial_state, ('videoData', 'desc')),
- 'view_count': traverse_obj(initial_state, ('videoData', 'stat', 'view')),
- 'uploader': traverse_obj(initial_state, ('upData', 'name')),
- 'uploader_id': traverse_obj(initial_state, ('upData', 'mid')),
- 'like_count': traverse_obj(initial_state, ('videoData', 'stat', 'like')),
- 'comment_count': traverse_obj(initial_state, ('videoData', 'stat', 'reply')),
- 'tags': traverse_obj(initial_state, ('tags', ..., 'tag_name')),
- 'thumbnail': traverse_obj(initial_state, ('videoData', 'pic')),
- 'timestamp': traverse_obj(initial_state, ('videoData', 'pubdate')),
'duration': float_or_none(play_info.get('timelength'), scale=1000),
'chapters': self._get_chapters(aid, cid),
- 'subtitles': self.extract_subtitles(video_id, initial_state, cid),
+ 'subtitles': self.extract_subtitles(video_id, aid, cid),
'__post_extractor': self.extract_comments(aid),
'http_headers': {'Referer': url},
}
class BiliBiliBangumiIE(BilibiliBaseIE):
- _VALID_URL = r'(?x)https?://www\.bilibili\.com/bangumi/play/(?P<id>(?:ss|ep)\d+)'
+ _VALID_URL = r'https?://(?:www\.)?bilibili\.com/bangumi/play/(?P<id>ep\d+)'
_TESTS = [{
- 'url': 'https://www.bilibili.com/bangumi/play/ss897',
+ 'url': 'https://www.bilibili.com/bangumi/play/ep267851',
'info_dict': {
- 'id': 'ss897',
+ 'id': '267851',
'ext': 'mp4',
- 'series': '神的记事本',
- 'season': '神的记事本',
- 'season_id': 897,
+ 'series': '鬼灭之刃',
+ 'series_id': '4358',
+ 'season': '鬼灭之刃',
+ 'season_id': '26801',
'season_number': 1,
- 'episode': '你与旅行包',
- 'episode_number': 2,
- 'title': '神的记事本:第2话 你与旅行包',
- 'duration': 1428.487,
- 'timestamp': 1310809380,
- 'upload_date': '20110716',
- 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
+ 'episode': '残酷',
+ 'episode_id': '267851',
+ 'episode_number': 1,
+ 'title': '1 残酷',
+ 'duration': 1425.256,
+ 'timestamp': 1554566400,
+ 'upload_date': '20190406',
+ 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$'
},
- }, {
- 'url': 'https://www.bilibili.com/bangumi/play/ep508406',
- 'only_matching': True,
+        'skip': 'According to the copyright owner\'s request, you may only watch the video after you are a premium member.'
}]
def _real_extract(self, url):
video_id = self._match_id(url)
+ episode_id = video_id[2:]
webpage = self._download_webpage(url, video_id)
if '您所在的地区无法观看本片' in webpage:
raise GeoRestrictedError('This video is restricted')
- elif ('开通大会员观看' in webpage and '__playinfo__' not in webpage
- or '正在观看预览,大会员免费看全片' in webpage):
+ elif '正在观看预览,大会员免费看全片' in webpage:
self.raise_login_required('This video is for premium members only')
- play_info = self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id)['data']
+ headers = {'Referer': url, **self.geo_verification_headers()}
+ play_info = self._download_json(
+ 'https://api.bilibili.com/pgc/player/web/v2/playurl', video_id,
+ 'Extracting episode', query={'fnval': '4048', 'ep_id': episode_id},
+ headers=headers)
+ premium_only = play_info.get('code') == -10403
+ play_info = traverse_obj(play_info, ('result', 'video_info', {dict})) or {}
+
formats = self.extract_formats(play_info)
- if (not formats and '成为大会员抢先看' in webpage
- and play_info.get('durl') and not play_info.get('dash')):
+ if not formats and (premium_only or '成为大会员抢先看' in webpage or '开通大会员观看' in webpage):
self.raise_login_required('This video is for premium members only')
- initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', video_id)
+ bangumi_info = self._download_json(
+ 'https://api.bilibili.com/pgc/view/web/season', video_id, 'Get episode details',
+ query={'ep_id': episode_id}, headers=headers)['result']
- season_id = traverse_obj(initial_state, ('mediaInfo', 'season_id'))
+ episode_number, episode_info = next((
+ (idx, ep) for idx, ep in enumerate(traverse_obj(
+ bangumi_info, ('episodes', ..., {dict})), 1)
+ if str_or_none(ep.get('id')) == episode_id), (1, {}))
+
+ season_id = bangumi_info.get('season_id')
season_number = season_id and next((
idx + 1 for idx, e in enumerate(
- traverse_obj(initial_state, ('mediaInfo', 'seasons', ...)))
+ traverse_obj(bangumi_info, ('seasons', ...)))
if e.get('season_id') == season_id
), None)
+ aid = episode_info.get('aid')
+
return {
'id': video_id,
'formats': formats,
- 'title': traverse_obj(initial_state, 'h1Title'),
- 'episode': traverse_obj(initial_state, ('epInfo', 'long_title')),
- 'episode_number': int_or_none(traverse_obj(initial_state, ('epInfo', 'title'))),
- 'series': traverse_obj(initial_state, ('mediaInfo', 'series')),
- 'season': traverse_obj(initial_state, ('mediaInfo', 'season_title')),
- 'season_id': season_id,
+ **traverse_obj(bangumi_info, {
+ 'series': ('series', 'series_title', {str}),
+ 'series_id': ('series', 'series_id', {str_or_none}),
+ 'thumbnail': ('square_cover', {url_or_none}),
+ }),
+ 'title': join_nonempty('title', 'long_title', delim=' ', from_dict=episode_info),
+ 'episode': episode_info.get('long_title'),
+ 'episode_id': episode_id,
+ 'episode_number': int_or_none(episode_info.get('title')) or episode_number,
+ 'season_id': str_or_none(season_id),
'season_number': season_number,
- 'thumbnail': traverse_obj(initial_state, ('epInfo', 'cover')),
- 'timestamp': traverse_obj(initial_state, ('epInfo', 'pub_time')),
+ 'timestamp': int_or_none(episode_info.get('pub_time')),
'duration': float_or_none(play_info.get('timelength'), scale=1000),
- 'subtitles': self.extract_subtitles(
- video_id, initial_state, traverse_obj(initial_state, ('epInfo', 'cid'))),
- '__post_extractor': self.extract_comments(traverse_obj(initial_state, ('epInfo', 'aid'))),
- 'http_headers': {'Referer': url, **self.geo_verification_headers()},
+ 'subtitles': self.extract_subtitles(video_id, aid, episode_info.get('cid')),
+ '__post_extractor': self.extract_comments(aid),
+ 'http_headers': headers,
}
-class BiliBiliBangumiMediaIE(InfoExtractor):
+class BiliBiliBangumiMediaIE(BilibiliBaseIE):
_VALID_URL = r'https?://www\.bilibili\.com/bangumi/media/md(?P<id>\d+)'
_TESTS = [{
'url': 'https://www.bilibili.com/bangumi/media/md24097891',
@@ -415,16 +514,26 @@ class BiliBiliBangumiMediaIE(InfoExtractor):
def _real_extract(self, url):
media_id = self._match_id(url)
webpage = self._download_webpage(url, media_id)
+ ss_id = self._search_json(
+ r'window\.__INITIAL_STATE__\s*=', webpage, 'initial_state', media_id)['mediaInfo']['season_id']
+
+ return self.playlist_result(self._get_episodes_from_season(ss_id, url), media_id)
+
+
+class BiliBiliBangumiSeasonIE(BilibiliBaseIE):
+ _VALID_URL = r'(?x)https?://www\.bilibili\.com/bangumi/play/ss(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://www.bilibili.com/bangumi/play/ss26801',
+ 'info_dict': {
+ 'id': '26801'
+ },
+ 'playlist_mincount': 26
+ }]
- initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial_state', media_id)
- episode_list = self._download_json(
- 'https://api.bilibili.com/pgc/web/season/section', media_id,
- query={'season_id': initial_state['mediaInfo']['season_id']},
- note='Downloading season info')['result']['main_section']['episodes']
+ def _real_extract(self, url):
+ ss_id = self._match_id(url)
- return self.playlist_result((
- self.url_result(entry['share_url'], BiliBiliBangumiIE, entry['aid'])
- for entry in episode_list), media_id)
+ return self.playlist_result(self._get_episodes_from_season(ss_id, url), ss_id)
class BilibiliSpaceBaseIE(InfoExtractor):
@@ -447,21 +556,65 @@ class BilibiliSpaceVideoIE(BilibiliSpaceBaseIE):
'id': '3985676',
},
'playlist_mincount': 178,
+ }, {
+ 'url': 'https://space.bilibili.com/313580179/video',
+ 'info_dict': {
+ 'id': '313580179',
+ },
+ 'playlist_mincount': 92,
}]
+ def _extract_signature(self, playlist_id):
+ session_data = self._download_json('https://api.bilibili.com/x/web-interface/nav', playlist_id, fatal=False)
+
+ key_from_url = lambda x: x[x.rfind('/') + 1:].split('.')[0]
+ img_key = traverse_obj(
+ session_data, ('data', 'wbi_img', 'img_url', {key_from_url})) or '34478ba821254d9d93542680e3b86100'
+ sub_key = traverse_obj(
+ session_data, ('data', 'wbi_img', 'sub_url', {key_from_url})) or '7e16a90d190a4355a78fd00b32a38de6'
+
+ session_key = img_key + sub_key
+
+ signature_values = []
+ for position in (
+ 46, 47, 18, 2, 53, 8, 23, 32, 15, 50, 10, 31, 58, 3, 45, 35, 27, 43, 5, 49, 33, 9, 42, 19, 29, 28, 14, 39,
+ 12, 38, 41, 13, 37, 48, 7, 16, 24, 55, 40, 61, 26, 17, 0, 1, 60, 51, 30, 4, 22, 25, 54, 21, 56, 59, 6, 63,
+ 57, 62, 11, 36, 20, 34, 44, 52
+ ):
+ char_at_position = try_call(lambda: session_key[position])
+ if char_at_position:
+ signature_values.append(char_at_position)
+
+ return ''.join(signature_values)[:32]
+
def _real_extract(self, url):
playlist_id, is_video_url = self._match_valid_url(url).group('id', 'video')
if not is_video_url:
self.to_screen('A channel URL was given. Only the channel\'s videos will be downloaded. '
'To download audios, add a "/audio" to the URL')
+ signature = self._extract_signature(playlist_id)
+
def fetch_page(page_idx):
+ query = {
+ 'keyword': '',
+ 'mid': playlist_id,
+ 'order': 'pubdate',
+ 'order_avoided': 'true',
+ 'platform': 'web',
+ 'pn': page_idx + 1,
+ 'ps': 30,
+ 'tid': 0,
+ 'web_location': 1550101,
+ 'wts': int(time.time()),
+ }
+ query['w_rid'] = hashlib.md5(f'{urllib.parse.urlencode(query)}{signature}'.encode()).hexdigest()
+
try:
- response = self._download_json('https://api.bilibili.com/x/space/arc/search',
- playlist_id, note=f'Downloading page {page_idx}',
- query={'mid': playlist_id, 'pn': page_idx + 1, 'jsonp': 'jsonp'})
+ response = self._download_json('https://api.bilibili.com/x/space/wbi/arc/search',
+ playlist_id, note=f'Downloading page {page_idx}', query=query)
except ExtractorError as e:
- if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 412:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 412:
raise ExtractorError(
'Request is blocked by server (412), please add cookies, wait and try later.', expected=True)
raise
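
Note: the signed request in `fetch_page` above follows Bilibili's WBI scheme: the basenames of the session `img_url` and `sub_url` are concatenated, characters are picked by a fixed permutation, the first 32 become the mixin key, and `w_rid` is the MD5 of the URL-encoded query plus that key. A self-contained sketch of just the signing step, mirroring the code above (it assumes the query dict is already in the key order the API expects):

    import hashlib
    import time
    import urllib.parse

    MIXIN_TABLE = (
        46, 47, 18, 2, 53, 8, 23, 32, 15, 50, 10, 31, 58, 3, 45, 35, 27, 43, 5, 49, 33, 9, 42, 19, 29, 28, 14, 39,
        12, 38, 41, 13, 37, 48, 7, 16, 24, 55, 40, 61, 26, 17, 0, 1, 60, 51, 30, 4, 22, 25, 54, 21, 56, 59, 6, 63,
        57, 62, 11, 36, 20, 34, 44, 52)

    def sign_wbi(query, img_key, sub_key):
        session_key = img_key + sub_key
        # permute, then keep the first 32 characters as the mixin key
        mixin_key = ''.join(session_key[pos] for pos in MIXIN_TABLE if pos < len(session_key))[:32]
        query = {**query, 'wts': int(time.time())}
        query['w_rid'] = hashlib.md5(f'{urllib.parse.urlencode(query)}{mixin_key}'.encode()).hexdigest()
        return query
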
@@ -489,9 +642,9 @@ class BilibiliSpaceVideoIE(BilibiliSpaceBaseIE):
class BilibiliSpaceAudioIE(BilibiliSpaceBaseIE):
_VALID_URL = r'https?://space\.bilibili\.com/(?P<id>\d+)/audio'
_TESTS = [{
- 'url': 'https://space.bilibili.com/3985676/audio',
+ 'url': 'https://space.bilibili.com/313580179/audio',
'info_dict': {
- 'id': '3985676',
+ 'id': '313580179',
},
'playlist_mincount': 1,
}]
@@ -880,35 +1033,24 @@ class BiliIntlBaseIE(InfoExtractor):
return formats
- def _extract_video_info(self, video_data, *, ep_id=None, aid=None):
+ def _parse_video_metadata(self, video_data):
return {
- 'id': ep_id or aid,
'title': video_data.get('title_display') or video_data.get('title'),
'thumbnail': video_data.get('cover'),
'episode_number': int_or_none(self._search_regex(
r'^E(\d+)(?:$| - )', video_data.get('title_display') or '', 'episode number', default=None)),
- 'formats': self._get_formats(ep_id=ep_id, aid=aid),
- 'subtitles': self._get_subtitles(ep_id=ep_id, aid=aid),
- 'extractor_key': BiliIntlIE.ie_key(),
}
def _perform_login(self, username, password):
- try:
- from Cryptodome.PublicKey import RSA
- from Cryptodome.Cipher import PKCS1_v1_5
- except ImportError:
- try:
- from Crypto.PublicKey import RSA
- from Crypto.Cipher import PKCS1_v1_5
- except ImportError:
- raise ExtractorError('pycryptodomex not found. Please install', expected=True)
+ if not Cryptodome.RSA:
+ raise ExtractorError('pycryptodomex not found. Please install', expected=True)
key_data = self._download_json(
'https://passport.bilibili.tv/x/intl/passport-login/web/key?lang=en-US', None,
note='Downloading login key', errnote='Unable to download login key')['data']
- public_key = RSA.importKey(key_data['key'])
- password_hash = PKCS1_v1_5.new(public_key).encrypt((key_data['hash'] + password).encode('utf-8'))
+ public_key = Cryptodome.RSA.importKey(key_data['key'])
+ password_hash = Cryptodome.PKCS1_v1_5.new(public_key).encrypt((key_data['hash'] + password).encode('utf-8'))
login_post = self._download_json(
'https://passport.bilibili.tv/x/intl/passport-login/web/login/password?lang=en-US', None, data=urlencode_postdata({
'username': username,
@@ -935,6 +1077,23 @@ class BiliIntlIE(BiliIntlBaseIE):
'title': 'E2 - The First Night',
'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$',
'episode_number': 2,
+ 'upload_date': '20201009',
+ 'episode': 'Episode 2',
+ 'timestamp': 1602259500,
+ 'description': 'md5:297b5a17155eb645e14a14b385ab547e',
+ 'chapters': [{
+ 'start_time': 0,
+ 'end_time': 76.242,
+ 'title': '<Untitled Chapter 1>'
+ }, {
+ 'start_time': 76.242,
+ 'end_time': 161.161,
+ 'title': 'Intro'
+ }, {
+ 'start_time': 1325.742,
+ 'end_time': 1403.903,
+ 'title': 'Outro'
+ }],
}
}, {
# Non-Bstation page
@@ -945,6 +1104,23 @@ class BiliIntlIE(BiliIntlBaseIE):
'title': 'E3 - Who?',
'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$',
'episode_number': 3,
+ 'description': 'md5:e1a775e71a35c43f141484715470ad09',
+ 'episode': 'Episode 3',
+ 'upload_date': '20211219',
+ 'timestamp': 1639928700,
+ 'chapters': [{
+ 'start_time': 0,
+ 'end_time': 88.0,
+ 'title': '<Untitled Chapter 1>'
+ }, {
+ 'start_time': 88.0,
+ 'end_time': 156.0,
+ 'title': 'Intro'
+ }, {
+ 'start_time': 1173.0,
+ 'end_time': 1259.535,
+ 'title': 'Outro'
+ }],
}
}, {
# Subtitle with empty content
@@ -958,6 +1134,78 @@ class BiliIntlIE(BiliIntlBaseIE):
},
'skip': 'According to the copyright owner\'s request, you may only watch the video after you log in.'
}, {
+ 'url': 'https://www.bilibili.tv/en/video/2041863208',
+ 'info_dict': {
+ 'id': '2041863208',
+ 'ext': 'mp4',
+ 'timestamp': 1670874843,
+ 'description': 'Scheduled for April 2023.\nStudio: ufotable',
+ 'thumbnail': r're:https?://pic[-\.]bstarstatic.+/ugc/.+\.jpg$',
+ 'upload_date': '20221212',
+ 'title': 'Kimetsu no Yaiba Season 3 Official Trailer - Bstation',
+ },
+ }, {
+ # episode comment extraction
+ 'url': 'https://www.bilibili.tv/en/play/34580/340317',
+ 'info_dict': {
+ 'id': '340317',
+ 'ext': 'mp4',
+ 'timestamp': 1604057820,
+ 'upload_date': '20201030',
+ 'episode_number': 5,
+ 'title': 'E5 - My Own Steel',
+ 'description': 'md5:2b17ab10aebb33e3c2a54da9e8e487e2',
+ 'thumbnail': r're:https?://pic\.bstarstatic\.com/ogv/.+\.png$',
+ 'episode': 'Episode 5',
+ 'comment_count': int,
+ 'chapters': [{
+ 'start_time': 0,
+ 'end_time': 61.0,
+ 'title': '<Untitled Chapter 1>'
+ }, {
+ 'start_time': 61.0,
+ 'end_time': 134.0,
+ 'title': 'Intro'
+ }, {
+ 'start_time': 1290.0,
+ 'end_time': 1379.0,
+ 'title': 'Outro'
+ }],
+ },
+ 'params': {
+ 'getcomments': True
+ }
+ }, {
+ # user generated content comment extraction
+ 'url': 'https://www.bilibili.tv/en/video/2045730385',
+ 'info_dict': {
+ 'id': '2045730385',
+ 'ext': 'mp4',
+ 'description': 'md5:693b6f3967fb4e7e7764ea817857c33a',
+ 'timestamp': 1667891924,
+ 'upload_date': '20221108',
+ 'title': 'That Time I Got Reincarnated as a Slime: Scarlet Bond - Official Trailer 3| AnimeStan - Bstation',
+ 'comment_count': int,
+ 'thumbnail': 'https://pic.bstarstatic.com/ugc/f6c363659efd2eabe5683fbb906b1582.jpg',
+ },
+ 'params': {
+ 'getcomments': True
+ }
+ }, {
+ # episode id without intro and outro
+ 'url': 'https://www.bilibili.tv/en/play/1048837/11246489',
+ 'info_dict': {
+ 'id': '11246489',
+ 'ext': 'mp4',
+ 'title': 'E1 - Operation \'Strix\' <Owl>',
+ 'description': 'md5:b4434eb1a9a97ad2bccb779514b89f17',
+ 'timestamp': 1649516400,
+ 'thumbnail': 'https://pic.bstarstatic.com/ogv/62cb1de23ada17fb70fbe7bdd6ff29c29da02a64.png',
+ 'episode': 'Episode 1',
+ 'episode_number': 1,
+ 'upload_date': '20220409',
+ },
+ }, {
'url': 'https://www.biliintl.com/en/play/34613/341736',
'only_matching': True,
}, {
@@ -974,36 +1222,139 @@ class BiliIntlIE(BiliIntlBaseIE):
'only_matching': True,
}]
- def _real_extract(self, url):
- season_id, ep_id, aid = self._match_valid_url(url).group('season_id', 'ep_id', 'aid')
- video_id = ep_id or aid
+ def _make_url(video_id, series_id=None):
+ if series_id:
+ return f'https://www.bilibili.tv/en/play/{series_id}/{video_id}'
+ return f'https://www.bilibili.tv/en/video/{video_id}'
+
+ def _extract_video_metadata(self, url, video_id, season_id):
+ url, smuggled_data = unsmuggle_url(url, {})
+ if smuggled_data.get('title'):
+ return smuggled_data
+
webpage = self._download_webpage(url, video_id)
# Bstation layout
initial_data = (
self._search_json(r'window\.__INITIAL_(?:DATA|STATE)__\s*=', webpage, 'preload state', video_id, default={})
or self._search_nuxt_data(webpage, video_id, '__initialState', fatal=False, traverse=None))
video_data = traverse_obj(
- initial_data, ('OgvVideo', 'epDetail'), ('UgcVideo', 'videoData'), ('ugc', 'archive'), expected_type=dict)
+ initial_data, ('OgvVideo', 'epDetail'), ('UgcVideo', 'videoData'), ('ugc', 'archive'), expected_type=dict) or {}
if season_id and not video_data:
# Non-Bstation layout, read through episode list
season_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={season_id}&platform=web', video_id)
- video_data = traverse_obj(season_json,
- ('sections', ..., 'episodes', lambda _, v: str(v['episode_id']) == ep_id),
- expected_type=dict, get_all=False)
- return self._extract_video_info(video_data or {}, ep_id=ep_id, aid=aid)
+ video_data = traverse_obj(season_json, (
+ 'sections', ..., 'episodes', lambda _, v: str(v['episode_id']) == video_id
+ ), expected_type=dict, get_all=False)
+
+        # XXX: webpage metadata may not be accurate; it is only used to avoid a crash when video_data is not found
+ return merge_dicts(
+ self._parse_video_metadata(video_data), self._search_json_ld(webpage, video_id, fatal=False), {
+ 'title': self._html_search_meta('og:title', webpage),
+ 'description': self._html_search_meta('og:description', webpage)
+ })
+
+ def _get_comments_reply(self, root_id, next_id=0, display_id=None):
+ comment_api_raw_data = self._download_json(
+ 'https://api.bilibili.tv/reply/web/detail', display_id,
+            note=f'Downloading replies to comment {root_id} - {next_id}',
+ query={
+ 'platform': 'web',
+                'ps': 20,  # replies per page (default: 3)
+ 'root': root_id,
+ 'next': next_id,
+ })
+
+ for replies in traverse_obj(comment_api_raw_data, ('data', 'replies', ...)):
+ yield {
+ 'author': traverse_obj(replies, ('member', 'name')),
+ 'author_id': traverse_obj(replies, ('member', 'mid')),
+ 'author_thumbnail': traverse_obj(replies, ('member', 'face')),
+ 'text': traverse_obj(replies, ('content', 'message')),
+ 'id': replies.get('rpid'),
+ 'like_count': int_or_none(replies.get('like_count')),
+ 'parent': replies.get('parent'),
+ 'timestamp': unified_timestamp(replies.get('ctime_text'))
+ }
+
+ if not traverse_obj(comment_api_raw_data, ('data', 'cursor', 'is_end')):
+ yield from self._get_comments_reply(
+ root_id, comment_api_raw_data['data']['cursor']['next'], display_id)
+
+ def _get_comments(self, video_id, ep_id):
+ for i in itertools.count(0):
+ comment_api_raw_data = self._download_json(
+ 'https://api.bilibili.tv/reply/web/root', video_id,
+ note=f'Downloading comment page {i + 1}',
+ query={
+ 'platform': 'web',
+ 'pn': i, # page number
+                'ps': 20,  # comments per page (default: 20)
+ 'oid': video_id,
+ 'type': 3 if ep_id else 1, # 1: user generated content, 3: series content
+ 'sort_type': 1, # 1: best, 2: recent
+ })
+
+ for replies in traverse_obj(comment_api_raw_data, ('data', 'replies', ...)):
+ yield {
+ 'author': traverse_obj(replies, ('member', 'name')),
+ 'author_id': traverse_obj(replies, ('member', 'mid')),
+ 'author_thumbnail': traverse_obj(replies, ('member', 'face')),
+ 'text': traverse_obj(replies, ('content', 'message')),
+ 'id': replies.get('rpid'),
+ 'like_count': int_or_none(replies.get('like_count')),
+ 'timestamp': unified_timestamp(replies.get('ctime_text')),
+ 'author_is_uploader': bool(traverse_obj(replies, ('member', 'type'))),
+ }
+ if replies.get('count'):
+ yield from self._get_comments_reply(replies.get('rpid'), display_id=video_id)
+
+ if traverse_obj(comment_api_raw_data, ('data', 'cursor', 'is_end')):
+ break
+
+ def _real_extract(self, url):
+ season_id, ep_id, aid = self._match_valid_url(url).group('season_id', 'ep_id', 'aid')
+ video_id = ep_id or aid
+ chapters = None
+
+ if ep_id:
+ intro_ending_json = self._call_api(
+ f'/web/v2/ogv/play/episode?episode_id={ep_id}&platform=web',
+ video_id, fatal=False) or {}
+ if intro_ending_json.get('skip'):
+                # FIXME: start and end times seem to be off by a few seconds even though they are correct per ogv.*.js
+ # ref: https://p.bstarstatic.com/fe-static/bstar-web-new/assets/ogv.2b147442.js
+ chapters = [{
+ 'start_time': float_or_none(traverse_obj(intro_ending_json, ('skip', 'opening_start_time')), 1000),
+ 'end_time': float_or_none(traverse_obj(intro_ending_json, ('skip', 'opening_end_time')), 1000),
+ 'title': 'Intro'
+ }, {
+ 'start_time': float_or_none(traverse_obj(intro_ending_json, ('skip', 'ending_start_time')), 1000),
+ 'end_time': float_or_none(traverse_obj(intro_ending_json, ('skip', 'ending_end_time')), 1000),
+ 'title': 'Outro'
+ }]
+
+ return {
+ 'id': video_id,
+ **self._extract_video_metadata(url, video_id, season_id),
+ 'formats': self._get_formats(ep_id=ep_id, aid=aid),
+ 'subtitles': self.extract_subtitles(ep_id=ep_id, aid=aid),
+ 'chapters': chapters,
+ '__post_extractor': self.extract_comments(video_id, ep_id)
+ }
class BiliIntlSeriesIE(BiliIntlBaseIE):
- _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-zA-Z]{2}/)?play/(?P<id>\d+)/?(?:[?#]|$)'
+ IE_NAME = 'biliIntl:series'
+ _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-zA-Z]{2}/)?(?:play|media)/(?P<id>\d+)/?(?:[?#]|$)'
_TESTS = [{
'url': 'https://www.bilibili.tv/en/play/34613',
'playlist_mincount': 15,
'info_dict': {
'id': '34613',
- 'title': 'Fly Me to the Moon',
- 'description': 'md5:a861ee1c4dc0acfad85f557cc42ac627',
- 'categories': ['Romance', 'Comedy', 'Slice of life'],
+ 'title': 'TONIKAWA: Over the Moon For You',
+ 'description': 'md5:297b5a17155eb645e14a14b385ab547e',
+ 'categories': ['Slice of life', 'Comedy', 'Romance'],
'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$',
'view_count': int,
},
@@ -1011,6 +1362,17 @@ class BiliIntlSeriesIE(BiliIntlBaseIE):
'skip_download': True,
},
}, {
+ 'url': 'https://www.bilibili.tv/en/media/1048837',
+ 'info_dict': {
+ 'id': '1048837',
+ 'title': 'SPY×FAMILY',
+ 'description': 'md5:b4434eb1a9a97ad2bccb779514b89f17',
+ 'categories': ['Adventure', 'Action', 'Comedy'],
+ 'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.jpg$',
+ 'view_count': int,
+ },
+ 'playlist_mincount': 25,
+ }, {
'url': 'https://www.biliintl.com/en/play/34613',
'only_matching': True,
}, {
@@ -1020,9 +1382,12 @@ class BiliIntlSeriesIE(BiliIntlBaseIE):
def _entries(self, series_id):
series_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={series_id}&platform=web', series_id)
- for episode in traverse_obj(series_json, ('sections', ..., 'episodes', ...), expected_type=dict, default=[]):
- episode_id = str(episode.get('episode_id'))
- yield self._extract_video_info(episode, ep_id=episode_id)
+ for episode in traverse_obj(series_json, ('sections', ..., 'episodes', ...), expected_type=dict):
+ episode_id = str(episode['episode_id'])
+ yield self.url_result(smuggle_url(
+ BiliIntlIE._make_url(episode_id, series_id),
+ self._parse_video_metadata(episode)
+ ), BiliIntlIE, episode_id)
def _real_extract(self, url):
series_id = self._match_id(url)
@@ -1034,7 +1399,7 @@ class BiliIntlSeriesIE(BiliIntlBaseIE):
class BiliLiveIE(InfoExtractor):
- _VALID_URL = r'https?://live.bilibili.com/(?P<id>\d+)'
+ _VALID_URL = r'https?://live.bilibili.com/(?:blanc/)?(?P<id>\d+)'
_TESTS = [{
'url': 'https://live.bilibili.com/196',
@@ -1050,6 +1415,9 @@ class BiliLiveIE(InfoExtractor):
}, {
'url': 'https://live.bilibili.com/196?broadcast_type=0&is_room_feed=1?spm_id_from=333.999.space_home.strengthen_live_card.click',
'only_matching': True
+ }, {
+ 'url': 'https://live.bilibili.com/blanc/196',
+ 'only_matching': True
}]
_FORMATS = {
@@ -1111,6 +1479,7 @@ class BiliLiveIE(InfoExtractor):
'thumbnail': room_data.get('user_cover'),
'timestamp': stream_data.get('live_time'),
'formats': formats,
+ 'is_live': True,
'http_headers': {
'Referer': url,
},
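
Note: the intro/outro chapters that BiliIntlIE now builds come from the episode `skip` payload, whose boundaries are in milliseconds; `float_or_none(..., 1000)` converts them to seconds. Plugging in the values implied by the E3 test above:

    skip = {'opening_start_time': 88000, 'opening_end_time': 156000,
            'ending_start_time': 1173000, 'ending_end_time': 1259535}
    chapters = [{'start_time': skip['opening_start_time'] / 1000,
                 'end_time': skip['opening_end_time'] / 1000, 'title': 'Intro'},
                {'start_time': skip['ending_start_time'] / 1000,
                 'end_time': skip['ending_end_time'] / 1000, 'title': 'Outro'}]
    assert chapters[1]['end_time'] == 1259.535
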
diff --git a/hypervideo_dl/extractor/bitchute.py b/hypervideo_dl/extractor/bitchute.py
index 10e7b0b..0805b8b 100644
--- a/hypervideo_dl/extractor/bitchute.py
+++ b/hypervideo_dl/extractor/bitchute.py
@@ -2,9 +2,9 @@ import functools
import re
from .common import InfoExtractor
+from ..networking import HEADRequest
from ..utils import (
ExtractorError,
- HEADRequest,
OnDemandPagedList,
clean_html,
get_element_by_class,
@@ -77,7 +77,10 @@ class BitChuteIE(InfoExtractor):
def _check_format(self, video_url, video_id):
urls = orderedSet(
re.sub(r'(^https?://)(seed\d+)(?=\.bitchute\.com)', fr'\g<1>{host}', video_url)
- for host in (r'\g<2>', 'seed150', 'seed151', 'seed152', 'seed153'))
+ for host in (r'\g<2>', 'seed122', 'seed125', 'seed126', 'seed128',
+ 'seed132', 'seed150', 'seed151', 'seed152', 'seed153',
+ 'seed167', 'seed171', 'seed177', 'seed305', 'seed307',
+ 'seedp29xb', 'zb10-7gsop1v78'))
for url in urls:
try:
response = self._request_webpage(
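
Note: `_check_format` above round-robins the CDN hostname across the expanded seed list; the substitution that generates each candidate URL works like this (the input URL is made up):

    import re

    video_url = 'https://seed104.bitchute.com/abc/video.mp4'
    for host in (r'\g<2>', 'seed150', 'zb10-7gsop1v78'):
        # \g<2> keeps the original seed host; the other values replace it
        print(re.sub(r'(^https?://)(seed\d+)(?=\.bitchute\.com)', fr'\g<1>{host}', video_url))
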
diff --git a/hypervideo_dl/extractor/blerp.py b/hypervideo_dl/extractor/blerp.py
new file mode 100644
index 0000000..4631ad2
--- /dev/null
+++ b/hypervideo_dl/extractor/blerp.py
@@ -0,0 +1,167 @@
+import json
+
+from .common import InfoExtractor
+from ..utils import strip_or_none, traverse_obj
+
+
+class BlerpIE(InfoExtractor):
+ IE_NAME = 'blerp'
+ _VALID_URL = r'https?://(?:www\.)?blerp\.com/soundbites/(?P<id>[0-9a-zA-Z]+)'
+ _TESTS = [{
+ 'url': 'https://blerp.com/soundbites/6320fe8745636cb4dd677a5a',
+ 'info_dict': {
+ 'id': '6320fe8745636cb4dd677a5a',
+ 'title': 'Samsung Galaxy S8 Over the Horizon Ringtone 2016',
+ 'uploader': 'luminousaj',
+ 'uploader_id': '5fb81e51aa66ae000c395478',
+ 'ext': 'mp3',
+ 'tags': ['samsung', 'galaxy', 's8', 'over the horizon', '2016', 'ringtone'],
+ }
+ }, {
+ 'url': 'https://blerp.com/soundbites/5bc94ef4796001000498429f',
+ 'info_dict': {
+ 'id': '5bc94ef4796001000498429f',
+ 'title': 'Yee',
+ 'uploader': '179617322678353920',
+ 'uploader_id': '5ba99cf71386730004552c42',
+ 'ext': 'mp3',
+ 'tags': ['YEE', 'YEET', 'wo ha haah catchy tune yee', 'yee']
+ }
+ }]
+
+    _GRAPHQL_OPERATIONNAME = 'webBitePageGetBite'
+ _GRAPHQL_QUERY = (
+ '''query webBitePageGetBite($_id: MongoID!) {
+ web {
+ biteById(_id: $_id) {
+ ...bitePageFrag
+ __typename
+ }
+ __typename
+ }
+ }
+
+ fragment bitePageFrag on Bite {
+ _id
+ title
+ userKeywords
+ keywords
+ color
+ visibility
+ isPremium
+ owned
+ price
+ extraReview
+ isAudioExists
+ image {
+ filename
+ original {
+ url
+ __typename
+ }
+ __typename
+ }
+ userReactions {
+ _id
+ reactions
+ createdAt
+ __typename
+ }
+ topReactions
+ totalSaveCount
+ saved
+ blerpLibraryType
+ license
+ licenseMetaData
+ playCount
+ totalShareCount
+ totalFavoriteCount
+ totalAddedToBoardCount
+ userCategory
+ userAudioQuality
+ audioCreationState
+ transcription
+ userTranscription
+ description
+ createdAt
+ updatedAt
+ author
+ listingType
+ ownerObject {
+ _id
+ username
+ profileImage {
+ filename
+ original {
+ url
+ __typename
+ }
+ __typename
+ }
+ __typename
+ }
+ transcription
+ favorited
+ visibility
+ isCurated
+ sourceUrl
+ audienceRating
+ strictAudienceRating
+ ownerId
+ reportObject {
+ reportedContentStatus
+ __typename
+ }
+ giphy {
+ mp4
+ gif
+ __typename
+ }
+ audio {
+ filename
+ original {
+ url
+ __typename
+ }
+ mp3 {
+ url
+ __typename
+ }
+ __typename
+ }
+ __typename
+ }
+
+ ''')
+
+ def _real_extract(self, url):
+ audio_id = self._match_id(url)
+
+ data = {
+ 'operationName': self._GRAPHQL_OPERATIONNAME,
+ 'query': self._GRAPHQL_QUERY,
+ 'variables': {
+ '_id': audio_id
+ }
+ }
+
+ headers = {
+ 'Content-Type': 'application/json'
+ }
+
+ json_result = self._download_json('https://api.blerp.com/graphql',
+ audio_id, data=json.dumps(data).encode('utf-8'), headers=headers)
+
+ bite_json = json_result['data']['web']['biteById']
+
+ info_dict = {
+ 'id': bite_json['_id'],
+ 'url': bite_json['audio']['mp3']['url'],
+ 'title': bite_json['title'],
+ 'uploader': traverse_obj(bite_json, ('ownerObject', 'username'), expected_type=strip_or_none),
+ 'uploader_id': traverse_obj(bite_json, ('ownerObject', '_id'), expected_type=strip_or_none),
+ 'ext': 'mp3',
+ 'tags': list(filter(None, map(strip_or_none, (traverse_obj(bite_json, 'userKeywords', expected_type=list) or []))) or None)
+ }
+
+ return info_dict
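
Note: outside the extractor framework, the GraphQL call BlerpIE makes can be reproduced roughly as below. The trimmed selection set is an assumption (a GraphQL server should accept any valid subset of the full query above); the id comes from the first test:

    import json
    import urllib.request

    QUERY = '''query webBitePageGetBite($_id: MongoID!) {
      web { biteById(_id: $_id) { _id title audio { mp3 { url } } } }
    }'''
    payload = {
        'operationName': 'webBitePageGetBite',
        'query': QUERY,
        'variables': {'_id': '6320fe8745636cb4dd677a5a'},
    }
    req = urllib.request.Request(
        'https://api.blerp.com/graphql',
        data=json.dumps(payload).encode(),
        headers={'Content-Type': 'application/json'})
    with urllib.request.urlopen(req) as resp:
        bite = json.load(resp)['data']['web']['biteById']
    print(bite['audio']['mp3']['url'])
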
diff --git a/hypervideo_dl/extractor/boxcast.py b/hypervideo_dl/extractor/boxcast.py
new file mode 100644
index 0000000..51f9eb7
--- /dev/null
+++ b/hypervideo_dl/extractor/boxcast.py
@@ -0,0 +1,102 @@
+from .common import InfoExtractor
+from ..utils import (
+ js_to_json,
+ traverse_obj,
+ unified_timestamp
+)
+
+
+class BoxCastVideoIE(InfoExtractor):
+ _VALID_URL = r'''(?x)
+ https?://boxcast\.tv/(?:
+ view-embed/|
+ channel/\w+\?(?:[^#]+&)?b=|
+ video-portal/(?:\w+/){2}
+ )(?P<id>[\w-]+)'''
+ _EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>https?://boxcast\.tv/view-embed/[\w-]+)']
+ _TESTS = [{
+ 'url': 'https://boxcast.tv/view-embed/in-the-midst-of-darkness-light-prevails-an-interdisciplinary-symposium-ozmq5eclj50ujl4bmpwx',
+ 'info_dict': {
+ 'id': 'da1eqqgkacngd5djlqld',
+ 'ext': 'mp4',
+ 'thumbnail': r're:https?://uploads\.boxcast\.com/(?:[\w+-]+/){3}.+\.png$',
+ 'title': 'In the Midst of Darkness Light Prevails: An Interdisciplinary Symposium',
+ 'release_timestamp': 1670686812,
+ 'release_date': '20221210',
+ 'uploader_id': 're8w0v8hohhvpqtbskpe',
+ 'uploader': 'Children\'s Health Defense',
+ }
+ }, {
+ 'url': 'https://boxcast.tv/video-portal/vctwevwntun3o0ikq7af/rvyblnn0fxbfjx5nwxhl/otbpltj2kzkveo2qz3ad',
+ 'info_dict': {
+ 'id': 'otbpltj2kzkveo2qz3ad',
+ 'ext': 'mp4',
+ 'uploader_id': 'vctwevwntun3o0ikq7af',
+ 'uploader': 'Legacy Christian Church',
+ 'title': 'The Quest | 1: Beginner\'s Bay | Jamie Schools',
+ 'thumbnail': r're:https?://uploads.boxcast.com/(?:[\w-]+/){3}.+\.jpg'
+ }
+ }, {
+ 'url': 'https://boxcast.tv/channel/z03fqwaeaby5lnaawox2?b=ssihlw5gvfij2by8tkev',
+ 'info_dict': {
+ 'id': 'ssihlw5gvfij2by8tkev',
+ 'ext': 'mp4',
+ 'thumbnail': r're:https?://uploads.boxcast.com/(?:[\w-]+/){3}.+\.jpg$',
+ 'release_date': '20230101',
+ 'uploader_id': 'ds25vaazhlu4ygcvffid',
+ 'release_timestamp': 1672543201,
+ 'uploader': 'Lighthouse Ministries International - Beltsville, Maryland',
+ 'description': 'md5:ac23e3d01b0b0be592e8f7fe0ec3a340',
+ 'title': 'New Year\'s Eve CROSSOVER Service at LHMI | December 31, 2022',
+ }
+ }]
+ _WEBPAGE_TESTS = [{
+ 'url': 'https://childrenshealthdefense.eu/live-stream/',
+ 'info_dict': {
+ 'id': 'da1eqqgkacngd5djlqld',
+ 'ext': 'mp4',
+ 'thumbnail': r're:https?://uploads\.boxcast\.com/(?:[\w+-]+/){3}.+\.png$',
+ 'title': 'In the Midst of Darkness Light Prevails: An Interdisciplinary Symposium',
+ 'release_timestamp': 1670686812,
+ 'release_date': '20221210',
+ 'uploader_id': 're8w0v8hohhvpqtbskpe',
+ 'uploader': 'Children\'s Health Defense',
+ }
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ webpage_json_data = self._search_json(
+ r'var\s*BOXCAST_PRELOAD\s*=', webpage, 'broadcast data', display_id,
+ transform_source=js_to_json, default={})
+
+ # Ref: https://support.boxcast.com/en/articles/4235158-build-a-custom-viewer-experience-with-boxcast-api
+ broadcast_json_data = (
+ traverse_obj(webpage_json_data, ('broadcast', 'data'))
+ or self._download_json(f'https://api.boxcast.com/broadcasts/{display_id}', display_id))
+ view_json_data = (
+ traverse_obj(webpage_json_data, ('view', 'data'))
+ or self._download_json(f'https://api.boxcast.com/broadcasts/{display_id}/view',
+ display_id, fatal=False) or {})
+
+ formats, subtitles = [], {}
+ if view_json_data.get('status') == 'recorded':
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(
+ view_json_data['playlist'], display_id)
+
+ return {
+ 'id': str(broadcast_json_data['id']),
+ 'title': (broadcast_json_data.get('name')
+ or self._html_search_meta(['og:title', 'twitter:title'], webpage)),
+ 'description': (broadcast_json_data.get('description')
+ or self._html_search_meta(['og:description', 'twitter:description'], webpage)
+ or None),
+ 'thumbnail': (broadcast_json_data.get('preview')
+ or self._html_search_meta(['og:image', 'twitter:image'], webpage)),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'release_timestamp': unified_timestamp(broadcast_json_data.get('streamed_at')),
+ 'uploader': broadcast_json_data.get('account_name'),
+ 'uploader_id': broadcast_json_data.get('account_id'),
+ }
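
Note: BoxCastVideoIE prefers the inlined `BOXCAST_PRELOAD` object and only falls back to the public API when a piece is missing, and formats are extracted only once the broadcast status is 'recorded'. A sketch of that resolution order, with a hypothetical `fetch_json` helper standing in for `_download_json`:

    def resolve_boxcast(preload, display_id, fetch_json):
        # inlined page data wins; the public API is the fallback
        broadcast = ((preload.get('broadcast') or {}).get('data')
                     or fetch_json(f'https://api.boxcast.com/broadcasts/{display_id}'))
        view = ((preload.get('view') or {}).get('data')
                or fetch_json(f'https://api.boxcast.com/broadcasts/{display_id}/view') or {})
        return broadcast, view
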
diff --git a/hypervideo_dl/extractor/brainpop.py b/hypervideo_dl/extractor/brainpop.py
new file mode 100644
index 0000000..1200437
--- /dev/null
+++ b/hypervideo_dl/extractor/brainpop.py
@@ -0,0 +1,318 @@
+import json
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ classproperty,
+ int_or_none,
+ traverse_obj,
+ urljoin
+)
+
+
+class BrainPOPBaseIE(InfoExtractor):
+ _NETRC_MACHINE = 'brainpop'
+ _ORIGIN = '' # So that _VALID_URL doesn't crash
+ _LOGIN_ERRORS = {
+ 1502: 'The username and password you entered did not match.', # LOGIN_FAILED
+ 1503: 'Payment method is expired.', # LOGIN_FAILED_ACCOUNT_NOT_ACTIVE
+ 1506: 'Your BrainPOP plan has expired.', # LOGIN_FAILED_ACCOUNT_EXPIRED
+ 1507: 'Terms not accepted.', # LOGIN_FAILED_TERMS_NOT_ACCEPTED
+ 1508: 'Account not activated.', # LOGIN_FAILED_SUBSCRIPTION_NOT_ACTIVE
+ 1512: 'The maximum number of devices permitted are logged in with your account right now.', # LOGIN_FAILED_LOGIN_LIMIT_REACHED
+ 1513: 'You are trying to access your account from outside of its allowed IP range.', # LOGIN_FAILED_INVALID_IP
+ 1514: 'Individual accounts are not included in your plan. Try again with your shared username and password.', # LOGIN_FAILED_MBP_DISABLED
+ 1515: 'Account not activated.', # LOGIN_FAILED_TEACHER_NOT_ACTIVE
+ 1523: 'That username and password won\'t work on this BrainPOP site.', # LOGIN_FAILED_NO_ACCESS
+ 1524: 'You\'ll need to join a class before you can login.', # LOGIN_FAILED_STUDENT_NO_PERIOD
+ 1526: 'Your account is locked. Reset your password, or ask a teacher or administrator for help.', # LOGIN_FAILED_ACCOUNT_LOCKED
+ }
+
+ @classproperty
+ def _VALID_URL(cls):
+ root = re.escape(cls._ORIGIN).replace(r'https:', r'https?:').replace(r'www\.', r'(?:www\.)?')
+ return rf'{root}/(?P<slug>[^/]+/[^/]+/(?P<id>[^/?#&]+))'
+
+ def _assemble_formats(self, slug, format_id, display_id, token='', extra_fields={}):
+        formats = self._extract_m3u8_formats(
+ f'{urljoin(self._HLS_URL, slug)}.m3u8?{token}',
+ display_id, 'mp4', m3u8_id=f'{format_id}-hls', fatal=False)
+ formats.append({
+ 'format_id': format_id,
+ 'url': f'{urljoin(self._VIDEO_URL, slug)}?{token}',
+ })
+ for f in formats:
+ f.update(extra_fields)
+ return formats
+
+ def _extract_adaptive_formats(self, data, token, display_id, key_format='%s', extra_fields={}):
+ formats = []
+ additional_key_formats = {
+ '%s': {},
+ 'ad_%s': {
+ 'format_note': 'Audio description',
+ 'source_preference': -2
+ }
+ }
+ for additional_key_format, additional_key_fields in additional_key_formats.items():
+ for key_quality, key_index in enumerate(('high', 'low')):
+ full_key_index = additional_key_format % (key_format % key_index)
+ if data.get(full_key_index):
+ formats.extend(self._assemble_formats(data[full_key_index], full_key_index, display_id, token, {
+ 'quality': -1 - key_quality,
+ **additional_key_fields,
+ **extra_fields
+ }))
+ return formats
+
+ def _perform_login(self, username, password):
+ login_res = self._download_json(
+ 'https://api.brainpop.com/api/login', None,
+ data=json.dumps({'username': username, 'password': password}).encode(),
+ headers={
+ 'Content-Type': 'application/json',
+ 'Referer': self._ORIGIN
+ }, note='Logging in', errnote='Unable to log in', expected_status=400)
+ status_code = int_or_none(login_res['status_code'])
+ if status_code != 1505:
+            self.report_warning('Unable to login: {}'.format(
+                self._LOGIN_ERRORS.get(status_code) or login_res.get('message')
+                or f'Got status code {status_code}'))
+
+
+class BrainPOPIE(BrainPOPBaseIE):
+ _ORIGIN = 'https://www.brainpop.com'
+ _VIDEO_URL = 'https://svideos.brainpop.com'
+ _HLS_URL = 'https://hls.brainpop.com'
+ _CDN_URL = 'https://cdn.brainpop.com'
+ _TESTS = [{
+ 'url': 'https://www.brainpop.com/health/conflictresolution/martinlutherkingjr/movie?ref=null',
+ 'md5': '3ead374233ae74c7f1b0029a01c972f0',
+ 'info_dict': {
+ 'id': '1f3259fa457292b4',
+ 'ext': 'mp4',
+ 'title': 'Martin Luther King, Jr.',
+ 'display_id': 'martinlutherkingjr',
+ 'description': 'md5:f403dbb2bf3ccc7cf4c59d9e43e3c349',
+ },
+ }, {
+ 'url': 'https://www.brainpop.com/science/space/bigbang/',
+ 'md5': '9a1ff0e77444dd9e437354eb669c87ec',
+ 'info_dict': {
+ 'id': 'acae52cd48c99acf',
+ 'ext': 'mp4',
+ 'title': 'Big Bang',
+ 'display_id': 'bigbang',
+ 'description': 'md5:3e53b766b0f116f631b13f4cae185d38',
+ },
+ 'skip': 'Requires login',
+ }]
+
+ def _real_extract(self, url):
+ slug, display_id = self._match_valid_url(url).group('slug', 'id')
+ movie_data = self._download_json(
+ f'https://api.brainpop.com/api/content/published/bp/en/{slug}/movie?full=1', display_id,
+ 'Downloading movie data JSON', 'Unable to download movie data')['data']
+ topic_data = traverse_obj(self._download_json(
+ f'https://api.brainpop.com/api/content/published/bp/en/{slug}?full=1', display_id,
+ 'Downloading topic data JSON', 'Unable to download topic data', fatal=False),
+ ('data', 'topic'), expected_type=dict) or movie_data['topic']
+
+ if not traverse_obj(movie_data, ('access', 'allow')):
+ reason = traverse_obj(movie_data, ('access', 'reason'))
+ if 'logged' in reason:
+ self.raise_login_required(reason, metadata_available=True)
+ else:
+ self.raise_no_formats(reason, video_id=display_id)
+ movie_feature = movie_data['feature']
+ movie_feature_data = movie_feature['data']
+
+ formats, subtitles = [], {}
+ formats.extend(self._extract_adaptive_formats(movie_feature_data, movie_feature_data.get('token', ''), display_id, '%s_v2', {
+ 'language': movie_feature.get('language') or 'en',
+ 'language_preference': 10
+ }))
+ for lang, localized_feature in traverse_obj(movie_feature, 'localization', default={}, expected_type=dict).items():
+ formats.extend(self._extract_adaptive_formats(localized_feature, localized_feature.get('token', ''), display_id, '%s_v2', {
+ 'language': lang,
+ 'language_preference': -10
+ }))
+
+ # TODO: Do localization fields also have subtitles?
+ for name, url in movie_feature_data.items():
+ lang = self._search_regex(
+ r'^subtitles_(?P<lang>\w+)$', name, 'subtitle metadata', default=None)
+ if lang and url:
+ subtitles.setdefault(lang, []).append({
+ 'url': urljoin(self._CDN_URL, url)
+ })
+
+ return {
+ 'id': topic_data['topic_id'],
+ 'display_id': display_id,
+ 'title': topic_data.get('name'),
+ 'description': topic_data.get('synopsis'),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
+
+
+class BrainPOPLegacyBaseIE(BrainPOPBaseIE):
+ def _parse_js_topic_data(self, topic_data, display_id, token):
+ movie_data = topic_data['movies']
+ # TODO: Are there non-burned subtitles?
+ formats = self._extract_adaptive_formats(movie_data, token, display_id)
+
+ return {
+ 'id': topic_data['EntryID'],
+ 'display_id': display_id,
+ 'title': topic_data.get('name'),
+ 'alt_title': topic_data.get('title'),
+ 'description': topic_data.get('synopsis'),
+ 'formats': formats,
+ }
+
+ def _real_extract(self, url):
+ slug, display_id = self._match_valid_url(url).group('slug', 'id')
+ webpage = self._download_webpage(url, display_id)
+ topic_data = self._search_json(
+ r'var\s+content\s*=\s*', webpage, 'content data',
+ display_id, end_pattern=';')['category']['unit']['topic']
+ token = self._search_regex(r'ec_token\s*:\s*[\'"]([^\'"]+)', webpage, 'video token')
+ return self._parse_js_topic_data(topic_data, display_id, token)
+
+
+class BrainPOPJrIE(BrainPOPLegacyBaseIE):
+ _ORIGIN = 'https://jr.brainpop.com'
+ _VIDEO_URL = 'https://svideos-jr.brainpop.com'
+ _HLS_URL = 'https://hls-jr.brainpop.com'
+ _CDN_URL = 'https://cdn-jr.brainpop.com'
+ _TESTS = [{
+ 'url': 'https://jr.brainpop.com/health/feelingsandsel/emotions/',
+ 'md5': '04e0561bb21770f305a0ce6cf0d869ab',
+ 'info_dict': {
+ 'id': '347',
+ 'ext': 'mp4',
+ 'title': 'Emotions',
+ 'display_id': 'emotions',
+ },
+ }, {
+ 'url': 'https://jr.brainpop.com/science/habitats/arctichabitats/',
+ 'md5': 'b0ed063bbd1910df00220ee29340f5d6',
+ 'info_dict': {
+ 'id': '29',
+ 'ext': 'mp4',
+ 'title': 'Arctic Habitats',
+ 'display_id': 'arctichabitats',
+ },
+ 'skip': 'Requires login',
+ }]
+
+
+class BrainPOPELLIE(BrainPOPLegacyBaseIE):
+ _ORIGIN = 'https://ell.brainpop.com'
+ _VIDEO_URL = 'https://svideos-esl.brainpop.com'
+ _HLS_URL = 'https://hls-esl.brainpop.com'
+ _CDN_URL = 'https://cdn-esl.brainpop.com'
+ _TESTS = [{
+ 'url': 'https://ell.brainpop.com/level1/unit1/lesson1/',
+ 'md5': 'a2012700cfb774acb7ad2e8834eed0d0',
+ 'info_dict': {
+ 'id': '1',
+ 'ext': 'mp4',
+ 'title': 'Lesson 1',
+ 'display_id': 'lesson1',
+ 'alt_title': 'Personal Pronouns',
+ },
+ }, {
+ 'url': 'https://ell.brainpop.com/level3/unit6/lesson5/',
+ 'md5': 'be19c8292c87b24aacfb5fda2f3f8363',
+ 'info_dict': {
+ 'id': '101',
+ 'ext': 'mp4',
+ 'title': 'Lesson 5',
+ 'display_id': 'lesson5',
+ 'alt_title': 'Review: Unit 6',
+ },
+ 'skip': 'Requires login',
+ }]
+
+
+class BrainPOPEspIE(BrainPOPLegacyBaseIE):
+ IE_DESC = 'BrainPOP Español'
+ _ORIGIN = 'https://esp.brainpop.com'
+ _VIDEO_URL = 'https://svideos.brainpop.com'
+ _HLS_URL = 'https://hls.brainpop.com'
+ _CDN_URL = 'https://cdn.brainpop.com/mx'
+ _TESTS = [{
+ 'url': 'https://esp.brainpop.com/ciencia/la_diversidad_de_la_vida/ecosistemas/',
+ 'md5': 'cb3f062db2b3c5240ddfcfde7108f8c9',
+ 'info_dict': {
+ 'id': '3893',
+ 'ext': 'mp4',
+ 'title': 'Ecosistemas',
+ 'display_id': 'ecosistemas',
+ 'description': 'md5:80fc55b07e241f8c8f2aa8d74deaf3c3',
+ },
+ }, {
+ 'url': 'https://esp.brainpop.com/espanol/la_escritura/emily_dickinson/',
+ 'md5': '98c1b9559e0e33777209c425cda7dac4',
+ 'info_dict': {
+ 'id': '7146',
+ 'ext': 'mp4',
+ 'title': 'Emily Dickinson',
+ 'display_id': 'emily_dickinson',
+ 'description': 'md5:2795ad87b1d239c9711c1e92ab5a978b',
+ },
+ 'skip': 'Requires login',
+ }]
+
+
+class BrainPOPFrIE(BrainPOPLegacyBaseIE):
+ IE_DESC = 'BrainPOP Français'
+ _ORIGIN = 'https://fr.brainpop.com'
+ _VIDEO_URL = 'https://svideos.brainpop.com'
+ _HLS_URL = 'https://hls.brainpop.com'
+ _CDN_URL = 'https://cdn.brainpop.com/fr'
+ _TESTS = [{
+ 'url': 'https://fr.brainpop.com/sciencesdelaterre/energie/sourcesdenergie/',
+ 'md5': '97e7f48af8af93f8a2be11709f239371',
+ 'info_dict': {
+ 'id': '1651',
+ 'ext': 'mp4',
+ 'title': 'Sources d\'énergie',
+ 'display_id': 'sourcesdenergie',
+ 'description': 'md5:7eece350f019a21ef9f64d4088b2d857',
+ },
+ }, {
+ 'url': 'https://fr.brainpop.com/francais/ecrire/plagiat/',
+ 'md5': '0cf2b4f89804d0dd4a360a51310d445a',
+ 'info_dict': {
+ 'id': '5803',
+ 'ext': 'mp4',
+ 'title': 'Plagiat',
+ 'display_id': 'plagiat',
+ 'description': 'md5:4496d87127ace28e8b1eda116e77cd2b',
+ },
+ 'skip': 'Requires login',
+ }]
+
+
+class BrainPOPIlIE(BrainPOPLegacyBaseIE):
+ IE_DESC = 'BrainPOP Hebrew'
+ _ORIGIN = 'https://il.brainpop.com'
+ _VIDEO_URL = 'https://svideos.brainpop.com'
+ _HLS_URL = 'https://hls.brainpop.com'
+ _CDN_URL = 'https://cdn.brainpop.com/he'
+ _TESTS = [{
+ 'url': 'https://il.brainpop.com/category_9/subcategory_150/subjects_3782/',
+ 'md5': '9e4ea9dc60ecd385a6e5ca12ccf31641',
+ 'info_dict': {
+ 'id': '3782',
+ 'ext': 'mp4',
+ 'title': 'md5:e993632fcda0545d9205602ec314ad67',
+ 'display_id': 'subjects_3782',
+ 'description': 'md5:4cc084a8012beb01f037724423a4d4ed',
+ },
+ }]
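
Note: `_extract_adaptive_formats` above probes four dict keys per call, crossing the quality template with the audio-description prefix. With the modern site's `key_format='%s_v2'`:

    key_format = '%s_v2'
    probed = [template % (key_format % quality)
              for template in ('%s', 'ad_%s')
              for quality in ('high', 'low')]
    assert probed == ['high_v2', 'low_v2', 'ad_high_v2', 'ad_low_v2']

The legacy sites use the default '%s' template, yielding high, low, ad_high and ad_low instead.
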
diff --git a/hypervideo_dl/extractor/bravotv.py b/hypervideo_dl/extractor/bravotv.py
index d489584..419fe8c 100644
--- a/hypervideo_dl/extractor/bravotv.py
+++ b/hypervideo_dl/extractor/bravotv.py
@@ -1,117 +1,189 @@
-import re
-
from .adobepass import AdobePassIE
+from ..networking import HEADRequest
from ..utils import (
- smuggle_url,
- update_url_query,
- int_or_none,
+ extract_attributes,
float_or_none,
- try_get,
- dict_get,
+ get_element_html_by_class,
+ int_or_none,
+ merge_dicts,
+ parse_age_limit,
+ remove_end,
+ str_or_none,
+ traverse_obj,
+ unescapeHTML,
+ unified_timestamp,
+ update_url_query,
+ url_or_none,
)
class BravoTVIE(AdobePassIE):
- _VALID_URL = r'https?://(?:www\.)?(?P<req_id>bravotv|oxygen)\.com/(?:[^/]+/)+(?P<id>[^/?#]+)'
+ _VALID_URL = r'https?://(?:www\.)?(?P<site>bravotv|oxygen)\.com/(?:[^/]+/)+(?P<id>[^/?#]+)'
_TESTS = [{
'url': 'https://www.bravotv.com/top-chef/season-16/episode-15/videos/the-top-chef-season-16-winner-is',
- 'md5': 'e34684cfea2a96cd2ee1ef3a60909de9',
'info_dict': {
- 'id': 'epL0pmK1kQlT',
+ 'id': '3923059',
'ext': 'mp4',
'title': 'The Top Chef Season 16 Winner Is...',
'description': 'Find out who takes the title of Top Chef!',
- 'uploader': 'NBCU-BRAV',
'upload_date': '20190314',
'timestamp': 1552591860,
'season_number': 16,
'episode_number': 15,
'series': 'Top Chef',
'episode': 'The Top Chef Season 16 Winner Is...',
- 'duration': 190.0,
- }
+ 'duration': 190.357,
+ 'season': 'Season 16',
+ 'thumbnail': r're:^https://.+\.jpg',
+ },
+ 'params': {'skip_download': 'm3u8'},
}, {
- 'url': 'http://www.bravotv.com/below-deck/season-3/ep-14-reunion-part-1',
- 'only_matching': True,
+ 'url': 'https://www.bravotv.com/top-chef/season-20/episode-1/london-calling',
+ 'info_dict': {
+ 'id': '9000234570',
+ 'ext': 'mp4',
+ 'title': 'London Calling',
+ 'description': 'md5:5af95a8cbac1856bd10e7562f86bb759',
+ 'upload_date': '20230310',
+ 'timestamp': 1678410000,
+ 'season_number': 20,
+ 'episode_number': 1,
+ 'series': 'Top Chef',
+ 'episode': 'London Calling',
+ 'duration': 3266.03,
+ 'season': 'Season 20',
+ 'chapters': 'count:7',
+ 'thumbnail': r're:^https://.+\.jpg',
+ 'age_limit': 14,
+ },
+ 'params': {'skip_download': 'm3u8'},
+ 'skip': 'This video requires AdobePass MSO credentials',
+ }, {
+ 'url': 'https://www.oxygen.com/in-ice-cold-blood/season-1/closing-night',
+ 'info_dict': {
+ 'id': '3692045',
+ 'ext': 'mp4',
+ 'title': 'Closing Night',
+ 'description': 'md5:3170065c5c2f19548d72a4cbc254af63',
+ 'upload_date': '20180401',
+ 'timestamp': 1522623600,
+ 'season_number': 1,
+ 'episode_number': 1,
+ 'series': 'In Ice Cold Blood',
+ 'episode': 'Closing Night',
+ 'duration': 2629.051,
+ 'season': 'Season 1',
+ 'chapters': 'count:6',
+ 'thumbnail': r're:^https://.+\.jpg',
+ 'age_limit': 14,
+ },
+ 'params': {'skip_download': 'm3u8'},
+ 'skip': 'This video requires AdobePass MSO credentials',
}, {
'url': 'https://www.oxygen.com/in-ice-cold-blood/season-2/episode-16/videos/handling-the-horwitz-house-after-the-murder-season-2',
+ 'info_dict': {
+ 'id': '3974019',
+ 'ext': 'mp4',
+ 'title': '\'Handling The Horwitz House After The Murder (Season 2, Episode 16)',
+ 'description': 'md5:f9d638dd6946a1c1c0533a9c6100eae5',
+ 'upload_date': '20190617',
+ 'timestamp': 1560790800,
+ 'season_number': 2,
+ 'episode_number': 16,
+ 'series': 'In Ice Cold Blood',
+ 'episode': '\'Handling The Horwitz House After The Murder (Season 2, Episode 16)',
+ 'duration': 68.235,
+ 'season': 'Season 2',
+ 'thumbnail': r're:^https://.+\.jpg',
+ 'age_limit': 14,
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }, {
+ 'url': 'https://www.bravotv.com/below-deck/season-3/ep-14-reunion-part-1',
'only_matching': True,
}]
def _real_extract(self, url):
- site, display_id = self._match_valid_url(url).groups()
+ site, display_id = self._match_valid_url(url).group('site', 'id')
webpage = self._download_webpage(url, display_id)
- settings = self._parse_json(self._search_regex(
- r'<script[^>]+data-drupal-selector="drupal-settings-json"[^>]*>({.+?})</script>', webpage, 'drupal settings'),
- display_id)
- info = {}
+ settings = self._search_json(
+ r'<script[^>]+data-drupal-selector="drupal-settings-json"[^>]*>', webpage, 'settings', display_id)
+ tve = extract_attributes(get_element_html_by_class('tve-video-deck-app', webpage) or '')
query = {
- 'mbr': 'true',
+ 'manifest': 'm3u',
+ 'formats': 'm3u,mpeg4',
}
- account_pid, release_pid = [None] * 2
- tve = settings.get('ls_tve')
+
if tve:
- query['manifest'] = 'm3u'
- mobj = re.search(r'<[^>]+id="pdk-player"[^>]+data-url=["\']?(?:https?:)?//player\.theplatform\.com/p/([^/]+)/(?:[^/]+/)*select/([^?#&"\']+)', webpage)
- if mobj:
- account_pid, tp_path = mobj.groups()
- release_pid = tp_path.strip('/').split('/')[-1]
- else:
- account_pid = 'HNK2IC'
- tp_path = release_pid = tve['release_pid']
- if tve.get('entitlement') == 'auth':
- adobe_pass = settings.get('tve_adobe_auth', {})
- if site == 'bravotv':
- site = 'bravo'
+ account_pid = tve.get('data-mpx-media-account-pid') or 'HNK2IC'
+ account_id = tve['data-mpx-media-account-id']
+ metadata = self._parse_json(
+ tve.get('data-normalized-video', ''), display_id, fatal=False, transform_source=unescapeHTML)
+ video_id = tve.get('data-guid') or metadata['guid']
+ if tve.get('data-entitlement') == 'auth':
+ auth = traverse_obj(settings, ('tve_adobe_auth', {dict})) or {}
+ site = remove_end(site, 'tv')
+ release_pid = tve['data-release-pid']
resource = self._get_mvpd_resource(
- adobe_pass.get('adobePassResourceId') or site,
- tve['title'], release_pid, tve.get('rating'))
- query['auth'] = self._extract_mvpd_auth(
- url, release_pid,
- adobe_pass.get('adobePassRequestorId') or site, resource)
- else:
- shared_playlist = settings['ls_playlist']
- account_pid = shared_playlist['account_pid']
- metadata = shared_playlist['video_metadata'][shared_playlist['default_clip']]
- tp_path = release_pid = metadata.get('release_pid')
- if not release_pid:
- release_pid = metadata['guid']
- tp_path = 'media/guid/2140479951/' + release_pid
- info.update({
- 'title': metadata['title'],
- 'description': metadata.get('description'),
- 'season_number': int_or_none(metadata.get('season_num')),
- 'episode_number': int_or_none(metadata.get('episode_num')),
- })
- query['switch'] = 'progressive'
+ tve.get('data-adobe-pass-resource-id') or auth.get('adobePassResourceId') or site,
+ tve['data-title'], release_pid, tve.get('data-rating'))
+ query.update({
+ 'switch': 'HLSServiceSecure',
+ 'auth': self._extract_mvpd_auth(
+ url, release_pid, auth.get('adobePassRequestorId') or site, resource),
+ })
- tp_url = 'http://link.theplatform.com/s/%s/%s' % (account_pid, tp_path)
+ else:
+ ls_playlist = traverse_obj(settings, ('ls_playlist', ..., {dict}), get_all=False) or {}
+ account_pid = ls_playlist.get('mpxMediaAccountPid') or 'PHSl-B'
+ account_id = ls_playlist['mpxMediaAccountId']
+ video_id = ls_playlist['defaultGuid']
+ metadata = traverse_obj(
+ ls_playlist, ('videos', lambda _, v: v['guid'] == video_id, {dict}), get_all=False)
+ tp_url = f'https://link.theplatform.com/s/{account_pid}/media/guid/{account_id}/{video_id}'
tp_metadata = self._download_json(
- update_url_query(tp_url, {'format': 'preview'}),
- display_id, fatal=False)
- if tp_metadata:
- info.update({
- 'title': tp_metadata.get('title'),
- 'description': tp_metadata.get('description'),
- 'duration': float_or_none(tp_metadata.get('duration'), 1000),
- 'season_number': int_or_none(
- dict_get(tp_metadata, ('pl1$seasonNumber', 'nbcu$seasonNumber'))),
- 'episode_number': int_or_none(
- dict_get(tp_metadata, ('pl1$episodeNumber', 'nbcu$episodeNumber'))),
- # For some reason the series is sometimes wrapped into a single element array.
- 'series': try_get(
- dict_get(tp_metadata, ('pl1$show', 'nbcu$show')),
- lambda x: x[0] if isinstance(x, list) else x,
- expected_type=str),
- 'episode': dict_get(
- tp_metadata, ('pl1$episodeName', 'nbcu$episodeName', 'title')),
- })
+ update_url_query(tp_url, {'format': 'preview'}), video_id, fatal=False)
+
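+        # theplatform reports durations and timestamps in milliseconds; convert to seconds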
+ seconds_or_none = lambda x: float_or_none(x, 1000)
+ chapters = traverse_obj(tp_metadata, ('chapters', ..., {
+ 'start_time': ('startTime', {seconds_or_none}),
+ 'end_time': ('endTime', {seconds_or_none}),
+ }))
+        # short videos may carry a single pointless chapter spanning the entire duration; prune it
+ if len(chapters) == 1 and not traverse_obj(chapters, (0, 'end_time')):
+ chapters = None
- info.update({
- '_type': 'url_transparent',
- 'id': release_pid,
- 'url': smuggle_url(update_url_query(tp_url, query), {'force_smil_url': True}),
- 'ie_key': 'ThePlatform',
- })
- return info
+ m3u8_url = self._request_webpage(HEADRequest(
+ update_url_query(f'{tp_url}/stream.m3u8', query)), video_id, 'Checking m3u8 URL').url
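+        # 'mpeg_cenc' in the resolved manifest URL indicates MPEG Common Encryption (DRM)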
+ if 'mpeg_cenc' in m3u8_url:
+ self.report_drm(video_id)
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4', m3u8_id='hls')
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'chapters': chapters,
+ **merge_dicts(traverse_obj(tp_metadata, {
+ 'title': 'title',
+ 'description': 'description',
+ 'duration': ('duration', {seconds_or_none}),
+ 'timestamp': ('pubDate', {seconds_or_none}),
+ 'season_number': (('pl1$seasonNumber', 'nbcu$seasonNumber'), {int_or_none}),
+ 'episode_number': (('pl1$episodeNumber', 'nbcu$episodeNumber'), {int_or_none}),
+ 'series': (('pl1$show', 'nbcu$show'), (None, ...), {str}),
+ 'episode': (('title', 'pl1$episodeNumber', 'nbcu$episodeNumber'), {str_or_none}),
+ 'age_limit': ('ratings', ..., 'rating', {parse_age_limit}),
+ }, get_all=False), traverse_obj(metadata, {
+ 'title': 'title',
+ 'description': 'description',
+ 'duration': ('durationInSeconds', {int_or_none}),
+ 'timestamp': ('airDate', {unified_timestamp}),
+ 'thumbnail': ('thumbnailUrl', {url_or_none}),
+ 'season_number': ('seasonNumber', {int_or_none}),
+ 'episode_number': ('episodeNumber', {int_or_none}),
+ 'episode': 'episodeTitle',
+ 'series': 'show',
+ }))
+ }
diff --git a/hypervideo_dl/extractor/brightcove.py b/hypervideo_dl/extractor/brightcove.py
index 2b7ddca..61b1841 100644
--- a/hypervideo_dl/extractor/brightcove.py
+++ b/hypervideo_dl/extractor/brightcove.py
@@ -7,10 +7,10 @@ from .adobepass import AdobePassIE
from .common import InfoExtractor
from ..compat import (
compat_etree_fromstring,
- compat_HTTPError,
compat_parse_qs,
compat_urlparse,
)
+from ..networking.exceptions import HTTPError
from ..utils import (
clean_html,
dict_get,
@@ -575,6 +575,7 @@ class BrightcoveNewBaseIE(AdobePassIE):
self.raise_no_formats(
error.get('message') or error.get('error_subcode') or error['error_code'], expected=True)
+        headers.pop('Authorization', None)  # otherwise HTTP formats fail with error 400
for f in formats:
f.setdefault('http_headers', {}).update(headers)
@@ -895,8 +896,9 @@ class BrightcoveNewIE(BrightcoveNewBaseIE):
store_pk(policy_key)
return policy_key
- api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/%ss/%s' % (account_id, content_type, video_id)
- headers = {}
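+        # a smuggled bearer token routes the request through the authenticated edge-auth endpoint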
+ token = smuggled_data.get('token')
+ api_url = f'https://{"edge-auth" if token else "edge"}.api.brightcove.com/playback/v1/accounts/{account_id}/{content_type}s/{video_id}'
+ headers = {'Authorization': f'Bearer {token}'} if token else {}
referrer = smuggled_data.get('referrer') # XXX: notice the spelling/case of the key
if referrer:
headers.update({
@@ -913,8 +915,8 @@ class BrightcoveNewIE(BrightcoveNewBaseIE):
json_data = self._download_json(api_url, video_id, headers=headers)
break
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 403):
- json_data = self._parse_json(e.cause.read().decode(), video_id)[0]
+ if isinstance(e.cause, HTTPError) and e.cause.status in (401, 403):
+ json_data = self._parse_json(e.cause.response.read().decode(), video_id)[0]
message = json_data.get('message') or json_data['error_code']
if json_data.get('error_subcode') == 'CLIENT_GEO':
self.raise_geo_restricted(msg=message)
diff --git a/hypervideo_dl/extractor/callin.py b/hypervideo_dl/extractor/callin.py
index e966876..c77179c 100644
--- a/hypervideo_dl/extractor/callin.py
+++ b/hypervideo_dl/extractor/callin.py
@@ -1,9 +1,5 @@
from .common import InfoExtractor
-from ..utils import (
- traverse_obj,
- float_or_none,
- int_or_none
-)
+from ..utils import float_or_none, int_or_none, make_archive_id, traverse_obj
class CallinIE(InfoExtractor):
@@ -35,6 +31,54 @@ class CallinIE(InfoExtractor):
'episode_number': 1,
'episode_id': '218b979630a35ead12c6fd096f2996c56c37e4d0dc1f6dc0feada32dcf7b31cd'
}
+ }, {
+ 'url': 'https://www.callin.com/episode/fcc-commissioner-brendan-carr-on-elons-PrumRdSQJW',
+ 'md5': '14ede27ee2c957b7e4db93140fc0745c',
+ 'info_dict': {
+ 'id': 'c3dab47f237bf953d180d3f243477a84302798be0e0b29bc9ade6d60a69f04f5',
+ 'ext': 'ts',
+ 'title': 'FCC Commissioner Brendan Carr on Elon’s Starlink',
+ 'description': 'Or, why the government doesn’t like SpaceX',
+ 'channel': 'The Pull Request',
+ 'channel_url': 'https://callin.com/show/the-pull-request-ucnDJmEKAa',
+ 'duration': 3182.472,
+ 'series_id': '7e9c23156e4aecfdcaef46bfb2ed7ca268509622ec006c0f0f25d90e34496638',
+ 'uploader_url': 'http://thepullrequest.com',
+ 'upload_date': '20220902',
+ 'episode': 'FCC Commissioner Brendan Carr on Elon’s Starlink',
+ 'display_id': 'fcc-commissioner-brendan-carr-on-elons-PrumRdSQJW',
+ 'series': 'The Pull Request',
+ 'channel_id': '7e9c23156e4aecfdcaef46bfb2ed7ca268509622ec006c0f0f25d90e34496638',
+ 'view_count': int,
+ 'uploader': 'Antonio García Martínez',
+ 'thumbnail': 'https://d1z76fhpoqkd01.cloudfront.net/shows/legacy/1ade9142625344045dc17cf523469ced1d93610762f4c886d06aa190a2f979e8.png',
+ 'episode_id': 'c3dab47f237bf953d180d3f243477a84302798be0e0b29bc9ade6d60a69f04f5',
+ 'timestamp': 1662100688.005,
+ }
+ }, {
+ 'url': 'https://www.callin.com/episode/episode-81-elites-melt-down-over-student-debt-lzxMidUnjA',
+ 'md5': '16f704ddbf82a27e3930533b12062f07',
+ 'info_dict': {
+ 'id': '8d06f869798f93a7814e380bceabea72d501417e620180416ff6bd510596e83c',
+ 'ext': 'ts',
+ 'title': 'Episode 81- Elites MELT DOWN over Student Debt Victory? Rumble in NYC?',
+ 'description': 'Let’s talk todays episode about the primary election shake up in NYC and the elites melting down over student debt cancelation.',
+ 'channel': 'The DEBRIEF With Briahna Joy Gray',
+ 'channel_url': 'https://callin.com/show/the-debrief-with-briahna-joy-gray-siiFDzGegm',
+ 'duration': 10043.16,
+ 'series_id': '61cea58444465fd26674069703bd8322993bc9e5b4f1a6d0872690554a046ff7',
+ 'uploader_url': 'http://patreon.com/badfaithpodcast',
+ 'upload_date': '20220826',
+ 'episode': 'Episode 81- Elites MELT DOWN over Student Debt Victory? Rumble in NYC?',
+ 'display_id': 'episode-',
+ 'series': 'The DEBRIEF With Briahna Joy Gray',
+ 'channel_id': '61cea58444465fd26674069703bd8322993bc9e5b4f1a6d0872690554a046ff7',
+ 'view_count': int,
+ 'uploader': 'Briahna Gray',
+ 'thumbnail': 'https://d1z76fhpoqkd01.cloudfront.net/shows/legacy/461ea0d86172cb6aff7d6c80fd49259cf5e64bdf737a4650f8bc24cf392ca218.png',
+ 'episode_id': '8d06f869798f93a7814e380bceabea72d501417e620180416ff6bd510596e83c',
+ 'timestamp': 1661476708.282,
+ }
}]
def try_get_user_name(self, d):
@@ -86,6 +130,7 @@ class CallinIE(InfoExtractor):
return {
'id': id,
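+            # keep download-archive compatibility with the old short ID (the trailing slug segment)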
+ '_old_archive_ids': [make_archive_id(self, display_id.rsplit('-', 1)[-1])],
'display_id': display_id,
'title': title,
'formats': formats,
diff --git a/hypervideo_dl/extractor/camfm.py b/hypervideo_dl/extractor/camfm.py
new file mode 100644
index 0000000..a9850f4
--- /dev/null
+++ b/hypervideo_dl/extractor/camfm.py
@@ -0,0 +1,85 @@
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ clean_html,
+ get_element_by_class,
+ get_elements_by_class,
+ join_nonempty,
+ traverse_obj,
+ unified_timestamp,
+ urljoin,
+)
+
+
+class CamFMShowIE(InfoExtractor):
+ _VALID_URL = r'https://(?:www\.)?camfm\.co\.uk/shows/(?P<id>[^/]+)'
+ _TESTS = [{
+ 'playlist_mincount': 5,
+ 'url': 'https://camfm.co.uk/shows/soul-mining/',
+ 'info_dict': {
+ 'id': 'soul-mining',
+ 'thumbnail': 'md5:6a873091f92c936f23bdcce80f75e66a',
+ 'title': 'Soul Mining',
+ 'description': 'Telling the stories of jazz, funk and soul from all corners of the world.',
+ },
+ }]
+
+ def _real_extract(self, url):
+ show_id = self._match_id(url)
+ page = self._download_webpage(url, show_id)
+
+ return {
+ '_type': 'playlist',
+ 'id': show_id,
+ 'entries': [self.url_result(urljoin('https://camfm.co.uk', i), CamFMEpisodeIE)
+ for i in re.findall(r"javascript:popup\('(/player/[^']+)', 'listen'", page)],
+ 'thumbnail': urljoin('https://camfm.co.uk', self._search_regex(
+ r'<img[^>]+class="thumb-expand"[^>]+src="([^"]+)"', page, 'thumbnail', fatal=False)),
+ 'title': self._html_search_regex('<h1>([^<]+)</h1>', page, 'title', fatal=False),
+ 'description': clean_html(get_element_by_class('small-12 medium-8 cell', page))
+ }
+
+
+class CamFMEpisodeIE(InfoExtractor):
+ _VALID_URL = r'https://(?:www\.)?camfm\.co\.uk/player/(?P<id>[^/]+)'
+ _TESTS = [{
+ 'url': 'https://camfm.co.uk/player/43336',
+        'skip': 'Episode will expire eventually; the exact expiry time is unknown',
+ 'info_dict': {
+ 'id': '43336',
+ 'title': 'AITAA: Am I the Agony Aunt? - 19:00 Tue 16/05/2023',
+ 'ext': 'mp3',
+ 'upload_date': '20230516',
+ 'description': 'md5:f165144f94927c0f1bfa2ee6e6ab7bbf',
+ 'timestamp': 1684263600,
+ 'series': 'AITAA: Am I the Agony Aunt?',
+ 'thumbnail': 'md5:5980a831360d0744c3764551be3d09c1',
+ 'categories': ['Entertainment'],
+ }
+ }]
+
+ def _real_extract(self, url):
+ episode_id = self._match_id(url)
+ page = self._download_webpage(url, episode_id)
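+        # media src attributes are relative; resolve them against the audio host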
+ audios = self._parse_html5_media_entries('https://audio.camfm.co.uk', page, episode_id)
+
+ caption = get_element_by_class('caption', page)
+ series = clean_html(re.sub(r'<span[^<]+<[^<]+>', '', caption))
+
+ card_section = get_element_by_class('card-section', page)
+ date = self._html_search_regex('>Aired at ([^<]+)<', card_section, 'air date', fatal=False)
+
+ return {
+ 'id': episode_id,
+ 'title': join_nonempty(series, date, delim=' - '),
+ 'formats': traverse_obj(audios, (..., 'formats', ...)),
+            'timestamp': unified_timestamp(date),  # XXX: Does not account for UK daylight saving time
+ 'series': series,
+ 'description': clean_html(re.sub(r'<b>[^<]+</b><br[^>]+/>', '', card_section)),
+ 'thumbnail': urljoin('https://camfm.co.uk', self._search_regex(
+ r'<div[^>]+class="cover-art"[^>]+style="[^"]+url\(\'([^\']+)',
+ page, 'thumbnail', fatal=False)),
+ 'categories': get_elements_by_class('label', caption),
+ 'was_live': True,
+ }
diff --git a/hypervideo_dl/extractor/cammodels.py b/hypervideo_dl/extractor/cammodels.py
index 0509057..135b315 100644
--- a/hypervideo_dl/extractor/cammodels.py
+++ b/hypervideo_dl/extractor/cammodels.py
@@ -1,9 +1,5 @@
from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
- int_or_none,
- url_or_none,
-)
+from ..utils import int_or_none, url_or_none
class CamModelsIE(InfoExtractor):
@@ -17,32 +13,11 @@ class CamModelsIE(InfoExtractor):
def _real_extract(self, url):
user_id = self._match_id(url)
- webpage = self._download_webpage(
- url, user_id, headers=self.geo_verification_headers())
-
- manifest_root = self._html_search_regex(
- r'manifestUrlRoot=([^&\']+)', webpage, 'manifest', default=None)
-
- if not manifest_root:
- ERRORS = (
- ("I'm offline, but let's stay connected", 'This user is currently offline'),
- ('in a private show', 'This user is in a private show'),
- ('is currently performing LIVE', 'This model is currently performing live'),
- )
- for pattern, message in ERRORS:
- if pattern in webpage:
- error = message
- expected = True
- break
- else:
- error = 'Unable to find manifest URL root'
- expected = False
- raise ExtractorError(error, expected=expected)
-
manifest = self._download_json(
- '%s%s.json' % (manifest_root, user_id), user_id)
+ 'https://manifest-server.naiadsystems.com/live/s:%s.json' % user_id, user_id)
formats = []
+ thumbnails = []
for format_id, format_dict in manifest['formats'].items():
if not isinstance(format_dict, dict):
continue
@@ -82,12 +57,20 @@ class CamModelsIE(InfoExtractor):
'quality': -10,
})
else:
+ if format_id == 'jpeg':
+ thumbnails.append({
+ 'url': f['url'],
+ 'width': f['width'],
+ 'height': f['height'],
+ 'format_id': f['format_id'],
+ })
continue
formats.append(f)
return {
'id': user_id,
'title': user_id,
+ 'thumbnails': thumbnails,
'is_live': True,
'formats': formats,
'age_limit': 18
diff --git a/hypervideo_dl/extractor/canalplus.py b/hypervideo_dl/extractor/canalplus.py
index b7e2f9d..3ff5c3f 100644
--- a/hypervideo_dl/extractor/canalplus.py
+++ b/hypervideo_dl/extractor/canalplus.py
@@ -64,7 +64,7 @@ class CanalplusIE(InfoExtractor):
# response = self._request_webpage(
# HEADRequest(fmt_url), video_id,
# 'Checking if the video is georestricted')
- # if '/blocage' in response.geturl():
+ # if '/blocage' in response.url:
# raise ExtractorError(
# 'The video is not available in your country',
# expected=True)
diff --git a/hypervideo_dl/extractor/cbc.py b/hypervideo_dl/extractor/cbc.py
index a9f6cd2..e66071f 100644
--- a/hypervideo_dl/extractor/cbc.py
+++ b/hypervideo_dl/extractor/cbc.py
@@ -2,20 +2,23 @@ import re
import json
import base64
import time
+import urllib.parse
from .common import InfoExtractor
from ..compat import (
compat_str,
)
from ..utils import (
+ ExtractorError,
int_or_none,
join_nonempty,
js_to_json,
orderedSet,
+ parse_iso8601,
smuggle_url,
strip_or_none,
+ traverse_obj,
try_get,
- ExtractorError,
)
@@ -159,7 +162,7 @@ class CBCPlayerIE(InfoExtractor):
'upload_date': '20160210',
'uploader': 'CBCC-NEW',
},
- 'skip': 'Geo-restricted to Canada',
+ 'skip': 'Geo-restricted to Canada and no longer available',
}, {
# Redirected from http://www.cbc.ca/player/AudioMobile/All%20in%20a%20Weekend%20Montreal/ID/2657632011/
'url': 'http://www.cbc.ca/player/play/2657631896',
@@ -172,6 +175,9 @@ class CBCPlayerIE(InfoExtractor):
'timestamp': 1425704400,
'upload_date': '20150307',
'uploader': 'CBCC-NEW',
+ 'thumbnail': 'http://thumbnails.cbc.ca/maven_legacy/thumbnails/sonali-karnick-220.jpg',
+ 'chapters': [],
+ 'duration': 494.811,
},
}, {
'url': 'http://www.cbc.ca/player/play/2164402062',
@@ -184,6 +190,28 @@ class CBCPlayerIE(InfoExtractor):
'timestamp': 1320410746,
'upload_date': '20111104',
'uploader': 'CBCC-NEW',
+ 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/277/67/cancer_852x480_2164412612.jpg',
+ 'chapters': [],
+ 'duration': 186.867,
+ },
+ }, {
+ # Has subtitles
+        # These broadcasts expire after ~1 month; a new test URL can be found at:
+ # https://www.cbc.ca/player/news/TV%20Shows/The%20National/Latest%20Broadcast
+ 'url': 'http://www.cbc.ca/player/play/2249992771553',
+ 'md5': '2f2fb675dd4f0f8a5bb7588d1b13bacd',
+ 'info_dict': {
+ 'id': '2249992771553',
+ 'ext': 'mp4',
+ 'title': 'The National | Women’s soccer pay, Florida seawater, Swift quake',
+ 'description': 'md5:adba28011a56cfa47a080ff198dad27a',
+ 'timestamp': 1690596000,
+ 'duration': 2716.333,
+ 'subtitles': {'eng': [{'ext': 'vtt', 'protocol': 'm3u8_native'}]},
+ 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/481/326/thumbnail.jpeg',
+ 'uploader': 'CBCC-NEW',
+ 'chapters': 'count:5',
+ 'upload_date': '20230729',
},
}]
@@ -197,12 +225,45 @@ class CBCPlayerIE(InfoExtractor):
'force_smil_url': True
}),
'id': video_id,
+            '_format_sort_fields': ('res', 'proto')  # Prioritize direct HTTP formats over HLS
+ }
+
+
+class CBCPlayerPlaylistIE(InfoExtractor):
+ IE_NAME = 'cbc.ca:player:playlist'
+ _VALID_URL = r'https?://(?:www\.)?cbc\.ca/(?:player/)(?!play/)(?P<id>[^?#]+)'
+ _TESTS = [{
+ 'url': 'https://www.cbc.ca/player/news/TV%20Shows/The%20National/Latest%20Broadcast',
+ 'playlist_mincount': 25,
+ 'info_dict': {
+ 'id': 'news/tv shows/the national/latest broadcast',
+ }
+ }, {
+ 'url': 'https://www.cbc.ca/player/news/Canada/North',
+ 'playlist_mincount': 25,
+ 'info_dict': {
+ 'id': 'news/canada/north',
}
+ }]
+
+ def _real_extract(self, url):
+ playlist_id = urllib.parse.unquote(self._match_id(url)).lower()
+ webpage = self._download_webpage(url, playlist_id)
+ json_content = self._search_json(
+ r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', playlist_id)
+
+ def entries():
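+            # category keys are matched case-insensitively against the URL path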
+ for video_id in traverse_obj(json_content, (
+ 'video', 'clipsByCategory', lambda k, _: k.lower() == playlist_id, 'items', ..., 'id'
+ )):
+ yield self.url_result(f'https://www.cbc.ca/player/play/{video_id}', CBCPlayerIE)
+
+ return self.playlist_result(entries(), playlist_id)
class CBCGemIE(InfoExtractor):
IE_NAME = 'gem.cbc.ca'
- _VALID_URL = r'https?://gem\.cbc\.ca/media/(?P<id>[0-9a-z-]+/s[0-9]+[a-z][0-9]+)'
+ _VALID_URL = r'https?://gem\.cbc\.ca/(?:media/)?(?P<id>[0-9a-z-]+/s[0-9]+[a-z][0-9]+)'
_TESTS = [{
# This is a normal, public, TV show video
'url': 'https://gem.cbc.ca/media/schitts-creek/s06e01',
@@ -245,6 +306,9 @@ class CBCGemIE(InfoExtractor):
},
'params': {'format': 'bv'},
'skip': 'Geo-restricted to Canada',
+ }, {
+ 'url': 'https://gem.cbc.ca/nadiyas-family-favourites/s01e01',
+ 'only_matching': True,
}]
_GEO_COUNTRIES = ['CA']
@@ -346,7 +410,9 @@ class CBCGemIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- video_info = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/assets/' + video_id, video_id)
+ video_info = self._download_json(
+ f'https://services.radio-canada.ca/ott/cbc-api/v2/assets/{video_id}',
+ video_id, expected_status=426)
email, password = self._get_login_info()
if email and password:
@@ -401,7 +467,7 @@ class CBCGemIE(InfoExtractor):
class CBCGemPlaylistIE(InfoExtractor):
IE_NAME = 'gem.cbc.ca:playlist'
- _VALID_URL = r'https?://gem\.cbc\.ca/media/(?P<id>(?P<show>[0-9a-z-]+)/s(?P<season>[0-9]+))/?(?:[?#]|$)'
+ _VALID_URL = r'https?://gem\.cbc\.ca/(?:media/)?(?P<id>(?P<show>[0-9a-z-]+)/s(?P<season>[0-9]+))/?(?:[?#]|$)'
_TESTS = [{
# TV show playlist, all public videos
'url': 'https://gem.cbc.ca/media/schitts-creek/s06',
@@ -411,6 +477,9 @@ class CBCGemPlaylistIE(InfoExtractor):
'title': 'Season 6',
'description': 'md5:6a92104a56cbeb5818cc47884d4326a2',
},
+ }, {
+ 'url': 'https://gem.cbc.ca/schitts-creek/s06',
+ 'only_matching': True,
}]
_API_BASE = 'https://services.radio-canada.ca/ott/cbc-api/v2/shows/'
@@ -418,7 +487,7 @@ class CBCGemPlaylistIE(InfoExtractor):
match = self._match_valid_url(url)
season_id = match.group('id')
show = match.group('show')
- show_info = self._download_json(self._API_BASE + show, season_id)
+ show_info = self._download_json(self._API_BASE + show, season_id, expected_status=426)
season = int(match.group('season'))
season_info = next((s for s in show_info['seasons'] if s.get('season') == season), None)
@@ -470,49 +539,90 @@ class CBCGemPlaylistIE(InfoExtractor):
class CBCGemLiveIE(InfoExtractor):
IE_NAME = 'gem.cbc.ca:live'
- _VALID_URL = r'https?://gem\.cbc\.ca/live/(?P<id>\d+)'
- _TEST = {
- 'url': 'https://gem.cbc.ca/live/920604739687',
- 'info_dict': {
- 'title': 'Ottawa',
- 'description': 'The live TV channel and local programming from Ottawa',
- 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/CBC_OTT_VMS/Live_Channel_Static_Images/Ottawa_2880x1620.jpg',
- 'is_live': True,
- 'id': 'AyqZwxRqh8EH',
- 'ext': 'mp4',
- 'timestamp': 1492106160,
- 'upload_date': '20170413',
- 'uploader': 'CBCC-NEW',
+ _VALID_URL = r'https?://gem\.cbc\.ca/live(?:-event)?/(?P<id>\d+)'
+ _TESTS = [
+ {
+ 'url': 'https://gem.cbc.ca/live/920604739687',
+ 'info_dict': {
+ 'title': 'Ottawa',
+ 'description': 'The live TV channel and local programming from Ottawa',
+ 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/CBC_OTT_VMS/Live_Channel_Static_Images/Ottawa_2880x1620.jpg',
+ 'is_live': True,
+ 'id': 'AyqZwxRqh8EH',
+ 'ext': 'mp4',
+ 'timestamp': 1492106160,
+ 'upload_date': '20170413',
+ 'uploader': 'CBCC-NEW',
+ },
+ 'skip': 'Live might have ended',
},
- 'skip': 'Live might have ended',
- }
-
- # It's unclear where the chars at the end come from, but they appear to be
- # constant. Might need updating in the future.
- # There are two URLs, some livestreams are in one, and some
- # in the other. The JSON schema is the same for both.
- _API_URLS = ['https://tpfeed.cbc.ca/f/ExhSPC/t_t3UKJR6MAT', 'https://tpfeed.cbc.ca/f/ExhSPC/FNiv9xQx_BnT']
+ {
+ 'url': 'https://gem.cbc.ca/live/44',
+ 'info_dict': {
+ 'id': '44',
+ 'ext': 'mp4',
+ 'is_live': True,
+ 'title': r're:^Ottawa [0-9\-: ]+',
+ 'description': 'The live TV channel and local programming from Ottawa',
+ 'live_status': 'is_live',
+ 'thumbnail': r're:https://images.gem.cbc.ca/v1/cbc-gem/live/.*'
+ },
+ 'params': {'skip_download': True},
+ 'skip': 'Live might have ended',
+ },
+ {
+ 'url': 'https://gem.cbc.ca/live-event/10835',
+ 'info_dict': {
+ 'id': '10835',
+ 'ext': 'mp4',
+ 'is_live': True,
+ 'title': r're:^The National \| Biden’s trip wraps up, Paltrow testifies, Bird flu [0-9\-: ]+',
+ 'description': 'March 24, 2023 | President Biden’s Ottawa visit ends with big pledges from both countries. Plus, Gwyneth Paltrow testifies in her ski collision trial.',
+ 'live_status': 'is_live',
+ 'thumbnail': r're:https://images.gem.cbc.ca/v1/cbc-gem/live/.*',
+ 'timestamp': 1679706000,
+ 'upload_date': '20230325',
+ },
+ 'params': {'skip_download': True},
+ 'skip': 'Live might have ended',
+ }
+ ]
def _real_extract(self, url):
video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ video_info = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['data']
- for api_url in self._API_URLS:
- video_info = next((
- stream for stream in self._download_json(api_url, video_id)['entries']
- if stream.get('guid') == video_id), None)
- if video_info:
- break
- else:
+        # The page embeds one of two metadata JSON layouts
+ if not video_info.get('formattedIdMedia'):
+ video_info = traverse_obj(
+ video_info, (('freeTv', ('streams', ...)), 'items', lambda _, v: v['key'] == video_id, {dict}),
+ get_all=False, default={})
+
+ video_stream_id = video_info.get('formattedIdMedia')
+ if not video_stream_id:
raise ExtractorError('Couldn\'t find video metadata, maybe this livestream is now offline', expected=True)
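+        # resolve the actual HLS manifest through Radio-Canada's media validation API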
+ stream_data = self._download_json(
+ 'https://services.radio-canada.ca/media/validation/v2/', video_id, query={
+ 'appCode': 'mpx',
+ 'connectionType': 'hd',
+ 'deviceType': 'ipad',
+ 'idMedia': video_stream_id,
+ 'multibitrate': 'true',
+ 'output': 'json',
+ 'tech': 'hls',
+ 'manifestType': 'desktop',
+ })
+
return {
- '_type': 'url_transparent',
- 'ie_key': 'ThePlatform',
- 'url': video_info['content'][0]['url'],
'id': video_id,
- 'title': video_info.get('title'),
- 'description': video_info.get('description'),
- 'tags': try_get(video_info, lambda x: x['keywords'].split(', ')),
- 'thumbnail': video_info.get('cbc$staticImage'),
+ 'formats': self._extract_m3u8_formats(stream_data['url'], video_id, 'mp4', live=True),
'is_live': True,
+ **traverse_obj(video_info, {
+ 'title': 'title',
+ 'description': 'description',
+ 'thumbnail': ('images', 'card', 'url'),
+ 'timestamp': ('airDate', {parse_iso8601}),
+ })
}
diff --git a/hypervideo_dl/extractor/cbs.py b/hypervideo_dl/extractor/cbs.py
index 9aacd50..1c0dbde 100644
--- a/hypervideo_dl/extractor/cbs.py
+++ b/hypervideo_dl/extractor/cbs.py
@@ -1,8 +1,14 @@
+from .brightcove import BrightcoveNewIE
+from .common import InfoExtractor
from .theplatform import ThePlatformFeedIE
+from .youtube import YoutubeIE
from ..utils import (
ExtractorError,
+ extract_attributes,
+ get_element_html_by_id,
int_or_none,
find_xpath_attr,
+ smuggle_url,
xpath_element,
xpath_text,
update_url_query,
@@ -162,3 +168,110 @@ class CBSIE(CBSBaseIE):
'duration': int_or_none(xpath_text(video_data, 'videoLength'), 1000),
'thumbnail': url_or_none(xpath_text(video_data, 'previewImageURL')),
})
+
+
+class ParamountPressExpressIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?paramountpressexpress\.com(?:/[\w-]+)+/(?P<yt>yt-)?video/?\?watch=(?P<id>[\w-]+)'
+ _TESTS = [{
+ 'url': 'https://www.paramountpressexpress.com/cbs-entertainment/shows/survivor/video/?watch=pnzew7e2hx',
+ 'md5': '56631dbcadaab980d1fc47cb7b76cba4',
+ 'info_dict': {
+ 'id': '6322981580112',
+ 'ext': 'mp4',
+ 'title': 'I’m Felicia',
+ 'description': 'md5:88fad93f8eede1c9c8f390239e4c6290',
+ 'uploader_id': '6055873637001',
+ 'upload_date': '20230320',
+ 'timestamp': 1679334960,
+ 'duration': 49.557,
+ 'thumbnail': r're:^https://.+\.jpg',
+ 'tags': [],
+ },
+ }, {
+ 'url': 'https://www.paramountpressexpress.com/cbs-entertainment/video/?watch=2s5eh8kppc',
+ 'md5': 'edcb03e3210b88a3e56c05aa863e0e5b',
+ 'info_dict': {
+ 'id': '6323036027112',
+ 'ext': 'mp4',
+ 'title': '‘Y&R’ Set Visit: Jerry O’Connell Quizzes Cast on Pre-Love Scene Rituals and More',
+ 'description': 'md5:b929867a357aac5544b783d834c78383',
+ 'uploader_id': '6055873637001',
+ 'upload_date': '20230321',
+ 'timestamp': 1679430180,
+ 'duration': 132.032,
+ 'thumbnail': r're:^https://.+\.jpg',
+ 'tags': [],
+ },
+ }, {
+ 'url': 'https://www.paramountpressexpress.com/paramount-plus/yt-video/?watch=OX9wJWOcqck',
+ 'info_dict': {
+ 'id': 'OX9wJWOcqck',
+ 'ext': 'mp4',
+ 'title': 'Rugrats | Season 2 Official Trailer | Paramount+',
+ 'description': 'md5:1f7e26f5625a9f0d6564d9ad97a9f7de',
+ 'uploader': 'Paramount Plus',
+ 'uploader_id': '@paramountplus',
+ 'uploader_url': 'http://www.youtube.com/@paramountplus',
+ 'channel': 'Paramount Plus',
+ 'channel_id': 'UCrRttZIypNTA1Mrfwo745Sg',
+ 'channel_url': 'https://www.youtube.com/channel/UCrRttZIypNTA1Mrfwo745Sg',
+ 'upload_date': '20230316',
+ 'duration': 88,
+ 'age_limit': 0,
+ 'availability': 'public',
+ 'live_status': 'not_live',
+ 'playable_in_embed': True,
+ 'view_count': int,
+ 'like_count': int,
+ 'channel_follower_count': int,
+ 'thumbnail': 'https://i.ytimg.com/vi/OX9wJWOcqck/maxresdefault.jpg',
+ 'categories': ['Entertainment'],
+ 'tags': ['Rugrats'],
+ },
+ }, {
+ 'url': 'https://www.paramountpressexpress.com/showtime/yt-video/?watch=_ljssSoDLkw',
+ 'info_dict': {
+ 'id': '_ljssSoDLkw',
+ 'ext': 'mp4',
+ 'title': 'Lavell Crawford: THEE Lavell Crawford Comedy Special Official Trailer | SHOWTIME',
+ 'description': 'md5:39581bcc3fd810209b642609f448af70',
+ 'uploader': 'SHOWTIME',
+ 'uploader_id': '@Showtime',
+ 'uploader_url': 'http://www.youtube.com/@Showtime',
+ 'channel': 'SHOWTIME',
+ 'channel_id': 'UCtwMWJr2BFPkuJTnSvCESSQ',
+ 'channel_url': 'https://www.youtube.com/channel/UCtwMWJr2BFPkuJTnSvCESSQ',
+ 'upload_date': '20230209',
+ 'duration': 49,
+ 'age_limit': 0,
+ 'availability': 'public',
+ 'live_status': 'not_live',
+ 'playable_in_embed': True,
+ 'view_count': int,
+ 'like_count': int,
+ 'comment_count': int,
+ 'channel_follower_count': int,
+ 'thumbnail': 'https://i.ytimg.com/vi_webp/_ljssSoDLkw/maxresdefault.webp',
+ 'categories': ['People & Blogs'],
+ 'tags': 'count:27',
+ },
+ }]
+
+ def _real_extract(self, url):
+ display_id, is_youtube = self._match_valid_url(url).group('id', 'yt')
+ if is_youtube:
+ return self.url_result(display_id, YoutubeIE)
+
+ webpage = self._download_webpage(url, display_id)
+ video_id = self._search_regex(
+ r'\bvideo_id\s*=\s*["\'](\d+)["\']\s*,', webpage, 'Brightcove ID')
+ token = self._search_regex(r'\btoken\s*=\s*["\']([\w.-]+)["\']', webpage, 'token')
+
+ player = extract_attributes(get_element_html_by_id('vcbrightcoveplayer', webpage) or '')
+ account_id = player.get('data-account') or '6055873637001'
+ player_id = player.get('data-player') or 'OtLKgXlO9F'
+ embed = player.get('data-embed') or 'default'
+
+ return self.url_result(smuggle_url(
+ f'https://players.brightcove.net/{account_id}/{player_id}_{embed}/index.html?videoId={video_id}',
+ {'token': token}), BrightcoveNewIE)
diff --git a/hypervideo_dl/extractor/cbsnews.py b/hypervideo_dl/extractor/cbsnews.py
index 16edf3a..5a8ebb8 100644
--- a/hypervideo_dl/extractor/cbsnews.py
+++ b/hypervideo_dl/extractor/cbsnews.py
@@ -1,36 +1,153 @@
+import base64
import re
+import urllib.error
+import urllib.parse
import zlib
+from .anvato import AnvatoIE
from .common import InfoExtractor
-from .cbs import CBSIE
-from ..compat import (
- compat_b64decode,
- compat_urllib_parse_unquote,
-)
+from .paramountplus import ParamountPlusIE
+from ..networking import HEADRequest
from ..utils import (
+ ExtractorError,
+ UserNotLive,
+ determine_ext,
+ float_or_none,
+ format_field,
+ int_or_none,
+ make_archive_id,
+ mimetype2ext,
parse_duration,
+ smuggle_url,
+ traverse_obj,
+ url_or_none,
)
-class CBSNewsEmbedIE(CBSIE): # XXX: Do not subclass from concrete IE
+class CBSNewsBaseIE(InfoExtractor):
+ _LOCALES = {
+ 'atlanta': None,
+ 'baltimore': 'BAL',
+ 'boston': 'BOS',
+ 'chicago': 'CHI',
+ 'colorado': 'DEN',
+ 'detroit': 'DET',
+ 'losangeles': 'LA',
+ 'miami': 'MIA',
+ 'minnesota': 'MIN',
+ 'newyork': 'NY',
+ 'philadelphia': 'PHI',
+ 'pittsburgh': 'PIT',
+ 'sacramento': 'SAC',
+ 'sanfrancisco': 'SF',
+ 'texas': 'DAL',
+ }
+ _LOCALE_RE = '|'.join(map(re.escape, _LOCALES))
+ _ANVACK = '5VD6Eyd6djewbCmNwBFnsJj17YAvGRwl'
+
+ def _get_item(self, webpage, display_id):
+ return traverse_obj(self._search_json(
+ r'CBSNEWS\.defaultPayload\s*=', webpage, 'payload', display_id,
+ default={}), ('items', 0, {dict})) or {}
+
+ def _get_video_url(self, item):
+ return traverse_obj(item, 'video', 'video2', expected_type=url_or_none)
+
+ def _extract_playlist(self, webpage, playlist_id):
+ entries = [self.url_result(embed_url, CBSNewsEmbedIE) for embed_url in re.findall(
+ r'<iframe[^>]+data-src="(https?://(?:www\.)?cbsnews\.com/embed/video/[^#]*#[^"]+)"', webpage)]
+ if entries:
+ return self.playlist_result(
+ entries, playlist_id, self._html_search_meta(['og:title', 'twitter:title'], webpage),
+ self._html_search_meta(['og:description', 'twitter:description', 'description'], webpage))
+
+ def _extract_video(self, item, video_url, video_id):
+ if mimetype2ext(item.get('format'), default=determine_ext(video_url)) == 'mp4':
+ formats = [{'url': video_url, 'ext': 'mp4'}]
+
+ else:
+ manifest = self._download_webpage(video_url, video_id, note='Downloading m3u8 information')
+
+ anvato_id = self._search_regex(r'anvato-(\d+)', manifest, 'Anvato ID', default=None)
+ # Prefer Anvato if available; cbsnews.com m3u8 formats are re-encoded from Anvato source
+ if anvato_id:
+ return self.url_result(
+ smuggle_url(f'anvato:{self._ANVACK}:{anvato_id}', {'token': 'default'}),
+ AnvatoIE, url_transparent=True, _old_archive_ids=[make_archive_id(self, anvato_id)])
+
+ formats, _ = self._parse_m3u8_formats_and_subtitles(
+ manifest, video_url, 'mp4', m3u8_id='hls', video_id=video_id)
+
+ def get_subtitles(subs_url):
+ return {
+ 'en': [{
+ 'url': subs_url,
+ 'ext': 'dfxp', # TTAF1
+ }],
+ } if url_or_none(subs_url) else None
+
+ episode_meta = traverse_obj(item, {
+ 'season_number': ('season', {int_or_none}),
+ 'episode_number': ('episode', {int_or_none}),
+ }) if item.get('isFullEpisode') else {}
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ **traverse_obj(item, {
+ 'title': (None, ('fulltitle', 'title')),
+ 'description': 'dek',
+ 'timestamp': ('timestamp', {lambda x: float_or_none(x, 1000)}),
+ 'duration': ('duration', {float_or_none}),
+ 'subtitles': ('captions', {get_subtitles}),
+ 'thumbnail': ('images', ('hd', 'sd'), {url_or_none}),
+ 'is_live': ('type', {lambda x: x == 'live'}),
+ }, get_all=False),
+ **episode_meta,
+ }
+
+
+class CBSNewsEmbedIE(CBSNewsBaseIE):
IE_NAME = 'cbsnews:embed'
_VALID_URL = r'https?://(?:www\.)?cbsnews\.com/embed/video[^#]*#(?P<id>.+)'
_TESTS = [{
'url': 'https://www.cbsnews.com/embed/video/?v=1.c9b5b61492913d6660db0b2f03579ef25e86307a#1Vb7b9s2EP5XBAHbT6Gt98PAMKTJ0se6LVjWYWtdGBR1stlIpEBSTtwi%2F%2FvuJNkNhmHdGxgM2NL57vjd6zt%2B8PngdN%2Fyg79qeGvhzN%2FLGrS%2F%2BuBLB531V28%2B%2BO7Qg7%2Fy97r2z3xZ42NW8yLhDbA0S0KWlHnIijwKWJBHZZnHBa8Cgbpdf%2F89NM9Hi9fXifhpr8sr%2FlP848tn%2BTdXycX25zh4cdX%2FvHl6PmmPqnWQv9w8Ed%2B9GjYRim07bFEqdG%2BZVHuwTm65A7bVRrYtR5lAyMox7pigF6W4k%2By91mjspGsJ%2BwVae4%2BsvdnaO1p73HkXs%2FVisUDTGm7R8IcdnOROeq%2B19qT1amhA1VJtPenoTUgrtfKc9m7Rq8dP7nnjwOB7wg7ADdNt7VX64DWAWlKhPtmDEq22g4GF99x6Dk9E8OSsankHXqPNKDxC%2FdK7MLKTircTDgsI3mmj4OBdSq64dy7fd1x577RU1rt4cvMtOaulFYOd%2FLewRWvDO9lIgXFpZSnkZmjbv5SxKTPoQXClFbpsf%2Fhbbpzs0IB3vb8KkyzJQ%2BywOAgCrMpgRrz%2BKk4fvb7kFbR4XJCu0gAdtNO7woCwZTu%2BBUs9bam%2Fds71drVerpeisgrubLjAB4nnOSkWQnfr5W6o1ku5Xpr1MgrCbL0M0vUyDtfLLK15WiYp47xKWSLyjFVpwVmVJSLIoCjSOFkv3W7oKsVliwZJcB9nwXpZ5GEQQwY8jNKqKCBrgjTLeFxgdCIpazojDgnRtn43J6kG7nZ6cAbxh0EeFFk4%2B1u867cY5u4344n%2FxXjCqAjucdTHgLKojNKmSfO8KRsOFY%2FzKEYCKEJBzv90QA9nfm9gL%2BHulaFqUkz9ULUYxl62B3U%2FRVNLA8IhggaPycOoBuwOCESciDQVSSUgiOMsROB%2FhKfwCKOzEk%2B4k6rWd4uuT%2FwTDz7K7t3d3WLO8ISD95jSPQbayBacthbz86XVgxHwhex5zawzgDOmtp%2F3GPcXn0VXHdSS029%2Fj99UC%2FwJUvyKQ%2FzKyixIEVlYJOn4RxxuaH43Ty9fbJ5OObykHH435XAzJTHeOF4hhEUXD8URe%2FQ%2FBT%2BMpf8d5GN02Ox%2FfiGsl7TA7POu1xZ5%2BbTzcAVKMe48mqcC21hkacVEVScM26liVVBnrKkC4CLKyzAvHu0lhEaTKMFwI3a4SN9MsrfYzdBLq2vkwRD1gVviLT8kY9h2CHH6Y%2Bix6609weFtey4ESp60WtyeWMy%2BsmBuhsoKIyuoT%2Bq2R%2FrW5qi3g%2FvzS2j40DoixDP8%2BKP0yUdpXJ4l6Vla%2Bg9vce%2BC4yM5YlUcbA%2F0jLKdpmTwvsdN5z88nAIe08%2F0HgxeG1iv%2B6Hlhjh7uiW0SDzYNI92L401uha3JKYk268UVRzdOzNQvAaJqoXzAc80dAV440NZ1WVVAAMRYQ2KrGJFmDUsq8saWSnjvIj8t78y%2FRa3JRnbHVfyFpfwoDiGpPgjzekyUiKNlU3OMlwuLMmzgvEojllYVE2Z1HhImvsnk%2BuhusTEoB21PAtSFodeFK3iYhXEH9WOG2%2FkOE833sfeG%2Ff5cfHtEFNXgYes0%2FXj7aGivUgJ9XpusCtoNcNYVVnJVrrDo0OmJAutHCpuZul4W9lLcfy7BnuLPT02%2ByXsCTk%2B9zhzswIN04YueNSK%2BPtM0jS88QdLqSLJDTLsuGZJNolm2yO0PXh3UPnz9Ix5bfIAqxPjvETQsDCEiPG4QbqNyhBZISxybLnZYCrW5H3Axp690%2F0BJdXtDZ5ITuM4xj3f4oUHGzc5JeJmZKpp%2FjwKh4wMV%2FV1yx3emLoR0MwbG4K%2F%2BZgVep3PnzXGDHZ6a3i%2Fk%2BJrONDN13%2Bnq6tBTYk4o7cLGhBtqCC4KwacGHpEVuoH5JNro%2FE6JfE6d5RydbiR76k%2BW5wioDHBIjw1euhHjUGRB0y5A97KoaPx6MlL%2BwgboUVtUFRI%2FLemgTpdtF59ii7pab08kuPcfWzs0l%2FRI5takWnFpka0zOgWRtYcuf9aIxZMxlwr6IiGpsb6j2DQUXPl%2FimXI599Ev7fWjoPD78A',
- 'only_matching': True,
+ 'info_dict': {
+ 'id': '6ZP4cXvo9FaX3VLH7MF4CgY30JFpY_GA',
+ 'ext': 'mp4',
+ 'title': 'Cops investigate gorilla incident at Cincinnati Zoo',
+ 'description': 'md5:fee7441ab8aaeb3c693482394738102b',
+ 'duration': 350,
+ 'timestamp': 1464719713,
+ 'upload_date': '20160531',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ },
+ 'params': {'skip_download': 'm3u8'},
}]
def _real_extract(self, url):
- item = self._parse_json(zlib.decompress(compat_b64decode(
- compat_urllib_parse_unquote(self._match_id(url))),
- -zlib.MAX_WBITS).decode('utf-8'), None)['video']['items'][0]
- return self._extract_video_info(item['mpxRefId'], 'cbsnews')
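+        # the URL fragment is percent-encoded, base64-encoded, raw-deflate-compressed JSON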
+ item = traverse_obj(self._parse_json(zlib.decompress(base64.b64decode(
+ urllib.parse.unquote(self._match_id(url))),
+ -zlib.MAX_WBITS).decode(), None), ('video', 'items', 0, {dict})) or {}
+ video_id = item['mpxRefId']
+ video_url = self._get_video_url(item)
+ if not video_url:
+            # Old embeds redirect users to ParamountPlus, but most of those links 404
+ pplus_url = f'https://www.paramountplus.com/shows/video/{video_id}'
+ try:
+ self._request_webpage(HEADRequest(pplus_url), video_id)
+ return self.url_result(pplus_url, ParamountPlusIE)
+ except ExtractorError:
+ self.raise_no_formats('This video is no longer available', True, video_id)
-class CBSNewsIE(CBSIE): # XXX: Do not subclass from concrete IE
+ return self._extract_video(item, video_url, video_id)
+
+
+class CBSNewsIE(CBSNewsBaseIE):
IE_NAME = 'cbsnews'
IE_DESC = 'CBS News'
- _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/(?:news|video)/(?P<id>[\da-z_-]+)'
+ _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/(?:news|video)/(?P<id>[\w-]+)'
_TESTS = [
{
@@ -47,10 +164,7 @@ class CBSNewsIE(CBSIE): # XXX: Do not subclass from concrete IE
'timestamp': 1476046464,
'upload_date': '20161009',
},
- 'params': {
- # rtmp download
- 'skip_download': True,
- },
+ 'skip': 'This video is no longer available',
},
{
'url': 'https://www.cbsnews.com/video/fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack/',
@@ -61,48 +175,234 @@ class CBSNewsIE(CBSIE): # XXX: Do not subclass from concrete IE
'description': 'md5:4a6983e480542d8b333a947bfc64ddc7',
'upload_date': '20140404',
'timestamp': 1396650660,
- 'uploader': 'CBSI-NEW',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 205,
'subtitles': {
'en': [{
- 'ext': 'ttml',
+ 'ext': 'dfxp',
}],
},
},
'params': {
- # m3u8 download
- 'skip_download': True,
+ 'skip_download': 'm3u8',
},
},
{
# 48 hours
'url': 'http://www.cbsnews.com/news/maria-ridulph-murder-will-the-nations-oldest-cold-case-to-go-to-trial-ever-get-solved/',
'info_dict': {
+ 'id': 'maria-ridulph-murder-will-the-nations-oldest-cold-case-to-go-to-trial-ever-get-solved',
'title': 'Cold as Ice',
'description': 'Can a childhood memory solve the 1957 murder of 7-year-old Maria Ridulph?',
},
'playlist_mincount': 7,
},
+ {
+ 'url': 'https://www.cbsnews.com/video/032823-cbs-evening-news/',
+ 'info_dict': {
+ 'id': '_2wuO7hD9LwtyM_TwSnVwnKp6kxlcXgE',
+ 'ext': 'mp4',
+ 'title': 'CBS Evening News, March 28, 2023',
+ 'description': 'md5:db20615aae54adc1d55a1fd69dc75d13',
+ 'duration': 1189,
+ 'timestamp': 1680042600,
+ 'upload_date': '20230328',
+ 'season': 'Season 2023',
+ 'season_number': 2023,
+ 'episode': 'Episode 83',
+ 'episode_number': 83,
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
+ },
]
def _real_extract(self, url):
display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+
+ playlist = self._extract_playlist(webpage, display_id)
+ if playlist:
+ return playlist
+ item = self._get_item(webpage, display_id)
+ video_id = item.get('mpxRefId') or display_id
+ video_url = self._get_video_url(item)
+ if not video_url:
+ self.raise_no_formats('No video content was found', expected=True, video_id=video_id)
+
+ return self._extract_video(item, video_url, video_id)
+
+
+class CBSLocalBaseIE(CBSNewsBaseIE):
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
- entries = []
- for embed_url in re.findall(r'<iframe[^>]+data-src="(https?://(?:www\.)?cbsnews\.com/embed/video/[^#]*#[^"]+)"', webpage):
- entries.append(self.url_result(embed_url, CBSNewsEmbedIE.ie_key()))
- if entries:
- return self.playlist_result(
- entries, playlist_title=self._html_search_meta(['og:title', 'twitter:title'], webpage),
- playlist_description=self._html_search_meta(['og:description', 'twitter:description', 'description'], webpage))
+ item = self._get_item(webpage, display_id)
+ video_id = item.get('mpxRefId') or display_id
+ anvato_id = None
+ video_url = self._get_video_url(item)
+
+ if not video_url:
+ anv_params = self._search_regex(
+ r'<iframe[^>]+\bdata-src="https?://w3\.mp\.lura\.live/player/prod/v3/anvload\.html\?key=([^"]+)"',
+ webpage, 'Anvato URL', default=None)
+
+ if not anv_params:
+ playlist = self._extract_playlist(webpage, display_id)
+ if playlist:
+ return playlist
+ self.raise_no_formats('No video content was found', expected=True, video_id=video_id)
+
+ anv_data = self._parse_json(base64.urlsafe_b64decode(f'{anv_params}===').decode(), video_id)
+ anvato_id = anv_data['v']
+ return self.url_result(
+ smuggle_url(f'anvato:{anv_data.get("anvack") or self._ANVACK}:{anvato_id}', {
+ 'token': anv_data.get('token') or 'default',
+ }), AnvatoIE, url_transparent=True, _old_archive_ids=[make_archive_id(self, anvato_id)])
+
+ return self._extract_video(item, video_url, video_id)
+
- item = self._parse_json(self._html_search_regex(
- r'CBSNEWS\.defaultPayload\s*=\s*({.+})',
- webpage, 'video JSON info'), display_id)['items'][0]
- return self._extract_video_info(item['mpxRefId'], 'cbsnews')
+class CBSLocalIE(CBSLocalBaseIE):
+ _VALID_URL = rf'https?://(?:www\.)?cbsnews\.com/(?:{CBSNewsBaseIE._LOCALE_RE})/(?:live/)?video/(?P<id>[\w-]+)'
+ _TESTS = [{
+ # Anvato video via defaultPayload JSON
+ 'url': 'https://www.cbsnews.com/newyork/video/1st-cannabis-dispensary-opens-in-queens/',
+ 'info_dict': {
+ 'id': '6376747',
+ 'ext': 'mp4',
+ 'title': '1st cannabis dispensary opens in Queens',
+ 'description': 'The dispensary is women-owned and located in Jamaica.',
+ 'uploader': 'CBS',
+ 'duration': 20,
+ 'timestamp': 1680193657,
+ 'upload_date': '20230330',
+ 'categories': ['Stations\\Spoken Word\\WCBSTV', 'Content\\Google', 'Content\\News', 'Content\\News\\Local News'],
+ 'tags': 'count:11',
+ 'thumbnail': 're:^https?://.*',
+ '_old_archive_ids': ['cbslocal 6376747'],
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }, {
+ # cbsnews.com video via defaultPayload JSON
+ 'url': 'https://www.cbsnews.com/newyork/live/video/20230330171655-the-city-is-sounding-the-alarm-on-dangerous-social-media-challenges/',
+ 'info_dict': {
+ 'id': 'sJqfw7YvgSC6ant2zVmzt3y1jYKoL5J3',
+ 'ext': 'mp4',
+ 'title': 'the city is sounding the alarm on dangerous social media challenges',
+ 'description': 'md5:8eccc9b1b73be5138a52e9c4350d2cd6',
+ 'thumbnail': 'https://images-cbsn.cbsnews.com/prod/2023/03/30/story_22509622_1680196925.jpg',
+ 'duration': 41.0,
+ 'timestamp': 1680196615,
+ 'upload_date': '20230330',
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }]
+
+
+class CBSLocalArticleIE(CBSLocalBaseIE):
+ _VALID_URL = rf'https?://(?:www\.)?cbsnews\.com/(?:{CBSNewsBaseIE._LOCALE_RE})/news/(?P<id>[\w-]+)'
+ _TESTS = [{
+ # Anvato video via iframe embed
+ 'url': 'https://www.cbsnews.com/newyork/news/mta-station-agents-leaving-their-booths-to-provide-more-direct-customer-service/',
+ 'playlist_count': 2,
+ 'info_dict': {
+ 'id': 'mta-station-agents-leaving-their-booths-to-provide-more-direct-customer-service',
+ 'title': 'MTA station agents begin leaving their booths to provide more direct customer service',
+ 'description': 'The more than 2,200 agents will provide face-to-face customer service to passengers.',
+ },
+ }, {
+ 'url': 'https://www.cbsnews.com/losangeles/news/safety-advocates-say-fatal-car-seat-failures-are-public-health-crisis/',
+ 'md5': 'f0ee3081e3843f575fccef901199b212',
+ 'info_dict': {
+ 'id': '3401037',
+ 'ext': 'mp4',
+ 'title': 'Safety Advocates Say Fatal Car Seat Failures Are \'Public Health Crisis\'',
+ 'thumbnail': 're:^https?://.*',
+ 'timestamp': 1463440500,
+ 'upload_date': '20160516',
+ },
+ 'skip': 'Video has been removed',
+ }]
+
+
+class CBSNewsLiveBaseIE(CBSNewsBaseIE):
+ def _get_id(self, url):
+ raise NotImplementedError('This method must be implemented by subclasses')
+
+ def _real_extract(self, url):
+ video_id = self._get_id(url)
+ if not video_id:
+ raise ExtractorError('Livestream is not available', expected=True)
+
+ data = traverse_obj(self._download_json(
+ 'https://feeds-cbsn.cbsnews.com/2.0/rundown/', video_id, query={
+ 'partner': 'cbsnsite',
+ 'edition': video_id,
+ 'type': 'live',
+ }), ('navigation', 'data', 0, {dict}))
+
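+        # prefer the DAI (dynamic ad insertion) stream, falling back to the plain stream URL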
+ video_url = traverse_obj(data, (('videoUrlDAI', ('videoUrl', 'base')), {url_or_none}), get_all=False)
+ if not video_url:
+ raise UserNotLive(video_id=video_id)
+
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_url, video_id, 'mp4', m3u8_id='hls')
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'is_live': True,
+ **traverse_obj(data, {
+ 'title': 'headline',
+ 'description': 'rundown_slug',
+ 'thumbnail': ('images', 'thumbnail_url_hd', {url_or_none}),
+ }),
+ }
+
+
+class CBSLocalLiveIE(CBSNewsLiveBaseIE):
+ _VALID_URL = rf'https?://(?:www\.)?cbsnews\.com/(?P<id>{CBSNewsBaseIE._LOCALE_RE})/live/?(?:[?#]|$)'
+ _TESTS = [{
+ 'url': 'https://www.cbsnews.com/losangeles/live/',
+ 'info_dict': {
+ 'id': 'CBSN-LA',
+ 'ext': 'mp4',
+ 'title': str,
+ 'description': r're:KCBS/CBSN_LA.CRISPIN.\w+.RUNDOWN \w+ \w+',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'live_status': 'is_live',
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }]
+
+ def _get_id(self, url):
+ return format_field(self._LOCALES, self._match_id(url), 'CBSN-%s')
+
+
+class CBSNewsLiveIE(CBSNewsLiveBaseIE):
+ IE_NAME = 'cbsnews:live'
+ IE_DESC = 'CBS News Livestream'
+ _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/live/?(?:[?#]|$)'
+ _TESTS = [{
+ 'url': 'https://www.cbsnews.com/live/',
+ 'info_dict': {
+ 'id': 'CBSN-US',
+ 'ext': 'mp4',
+ 'title': str,
+ 'description': r're:\w+ \w+ CRISPIN RUNDOWN',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'live_status': 'is_live',
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }]
+
+ def _get_id(self, url):
+ return 'CBSN-US'
class CBSNewsLiveVideoIE(InfoExtractor):
@@ -111,7 +411,7 @@ class CBSNewsLiveVideoIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?cbsnews\.com/live/video/(?P<id>[^/?#]+)'
# Live videos get deleted soon. See http://www.cbsnews.com/live/ for the latest examples
- _TEST = {
+ _TESTS = [{
'url': 'http://www.cbsnews.com/live/video/clinton-sanders-prepare-to-face-off-in-nh/',
'info_dict': {
'id': 'clinton-sanders-prepare-to-face-off-in-nh',
@@ -120,7 +420,7 @@ class CBSNewsLiveVideoIE(InfoExtractor):
'duration': 334,
},
'skip': 'Video gone',
- }
+ }]
def _real_extract(self, url):
display_id = self._match_id(url)
@@ -131,13 +431,13 @@ class CBSNewsLiveVideoIE(InfoExtractor):
'dvr_slug': display_id,
})
- formats = self._extract_akamai_formats(video_info['url'], display_id)
-
return {
'id': display_id,
'display_id': display_id,
- 'title': video_info['headline'],
- 'thumbnail': video_info.get('thumbnail_url_hd') or video_info.get('thumbnail_url_sd'),
- 'duration': parse_duration(video_info.get('segmentDur')),
- 'formats': formats,
+ 'formats': self._extract_akamai_formats(video_info['url'], display_id),
+ **traverse_obj(video_info, {
+ 'title': 'headline',
+ 'thumbnail': ('thumbnail_url_hd', {url_or_none}),
+ 'duration': ('segmentDur', {parse_duration}),
+ }),
}
diff --git a/hypervideo_dl/extractor/cda.py b/hypervideo_dl/extractor/cda.py
index d1212e6..1157114 100644
--- a/hypervideo_dl/extractor/cda.py
+++ b/hypervideo_dl/extractor/cda.py
@@ -4,6 +4,7 @@ import datetime
import hashlib
import hmac
import json
+import random
import re
from .common import InfoExtractor
@@ -27,11 +28,10 @@ class CDAIE(InfoExtractor):
_VALID_URL = r'https?://(?:(?:www\.)?cda\.pl/video|ebd\.cda\.pl/[0-9]+x[0-9]+)/(?P<id>[0-9a-z]+)'
_NETRC_MACHINE = 'cdapl'
- _BASE_URL = 'http://www.cda.pl/'
+ _BASE_URL = 'https://www.cda.pl'
_BASE_API_URL = 'https://api.cda.pl'
_API_HEADERS = {
'Accept': 'application/vnd.cda.public+json',
- 'User-Agent': 'pl.cda 1.0 (version 1.2.88 build 15306; Android 9; Xiaomi Redmi 3S)',
}
# hardcoded in the app
_LOGIN_REQUEST_AUTH = 'Basic YzU3YzBlZDUtYTIzOC00MWQwLWI2NjQtNmZmMWMxY2Y2YzVlOklBTm95QlhRRVR6U09MV1hnV3MwMW0xT2VyNWJNZzV4clRNTXhpNGZJUGVGZ0lWUlo5UGVYTDhtUGZaR1U1U3Q'
@@ -101,6 +101,38 @@ class CDAIE(InfoExtractor):
}, **kwargs)
def _perform_login(self, username, password):
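+        # build a randomized but plausible User-Agent mimicking the official Android app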
+ app_version = random.choice((
+ '1.2.88 build 15306',
+ '1.2.174 build 18469',
+ ))
+ android_version = random.randrange(8, 14)
+ phone_model = random.choice((
+ # x-kom.pl top selling Android smartphones, as of 2022-12-26
+ # https://www.x-kom.pl/g-4/c/1590-smartfony-i-telefony.html?f201-system-operacyjny=61322-android
+ 'ASUS ZenFone 8',
+ 'Motorola edge 20 5G',
+ 'Motorola edge 30 neo 5G',
+ 'Motorola moto g22',
+ 'OnePlus Nord 2T 5G',
+ 'Samsung Galaxy A32 SM‑A325F',
+ 'Samsung Galaxy M13',
+ 'Samsung Galaxy S20 FE 5G',
+ 'Xiaomi 11T',
+ 'Xiaomi POCO M4 Pro',
+ 'Xiaomi Redmi 10',
+ 'Xiaomi Redmi 10C',
+ 'Xiaomi Redmi 9C NFC',
+ 'Xiaomi Redmi Note 10 Pro',
+ 'Xiaomi Redmi Note 11 Pro',
+ 'Xiaomi Redmi Note 11',
+ 'Xiaomi Redmi Note 11S 5G',
+ 'Xiaomi Redmi Note 11S',
+ 'realme 10',
+ 'realme 9 Pro+',
+ 'vivo Y33s',
+ ))
+ self._API_HEADERS['User-Agent'] = f'pl.cda 1.0 (version {app_version}; Android {android_version}; {phone_model})'
+
cached_bearer = self.cache.load(self._BEARER_CACHE, username) or {}
if cached_bearer.get('valid_until', 0) > datetime.datetime.now().timestamp() + 5:
self._API_HEADERS['Authorization'] = f'Bearer {cached_bearer["token"]}'
@@ -138,9 +170,6 @@ class CDAIE(InfoExtractor):
meta = self._download_json(
f'{self._BASE_API_URL}/video/{video_id}', video_id, headers=self._API_HEADERS)['video']
- if meta.get('premium') and not meta.get('premium_free'):
- self.report_drm(video_id)
-
uploader = traverse_obj(meta, 'author', 'login')
formats = [{
@@ -151,6 +180,10 @@ class CDAIE(InfoExtractor):
'filesize': quality.get('length'),
} for quality in meta['qualities'] if quality.get('file')]
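+        # premium-only videos expose an empty free-quality list; surface a clear subscription error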
+ if meta.get('premium') and not meta.get('premium_free') and not formats:
+ raise ExtractorError(
+ 'Video requires CDA Premium - subscription needed', expected=True)
+
return {
'id': video_id,
'title': meta.get('title'),
@@ -167,10 +200,10 @@ class CDAIE(InfoExtractor):
def _web_extract(self, video_id, url):
self._set_cookie('cda.pl', 'cda.player', 'html5')
webpage = self._download_webpage(
- self._BASE_URL + '/video/' + video_id, video_id)
+ f'{self._BASE_URL}/video/{video_id}/vfilm', video_id)
if 'Ten film jest dostępny dla użytkowników premium' in webpage:
- raise ExtractorError('This video is only available for premium users.', expected=True)
+ self.raise_login_required('This video is only available for premium users')
if re.search(r'niedostępn[ey] w(?:&nbsp;|\s+)Twoim kraju\s*<', webpage):
self.raise_geo_restricted()
diff --git a/hypervideo_dl/extractor/ceskatelevize.py b/hypervideo_dl/extractor/ceskatelevize.py
index be2b0bb..8390160 100644
--- a/hypervideo_dl/extractor/ceskatelevize.py
+++ b/hypervideo_dl/extractor/ceskatelevize.py
@@ -1,20 +1,20 @@
import re
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse_unquote,
- compat_urllib_parse_urlparse,
-)
+from ..compat import compat_urllib_parse_unquote, compat_urllib_parse_urlparse
+from ..networking import Request
from ..utils import (
ExtractorError,
float_or_none,
- sanitized_Request,
str_or_none,
traverse_obj,
urlencode_postdata,
- USER_AGENTS,
)
+USER_AGENTS = {
+ 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
+}
+
class CeskaTelevizeIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/(?:ivysilani|porady|zive)/(?:[^/?#&]+/)*(?P<id>[^/#?]+)'
@@ -97,7 +97,7 @@ class CeskaTelevizeIE(InfoExtractor):
def _real_extract(self, url):
playlist_id = self._match_id(url)
webpage, urlh = self._download_webpage_handle(url, playlist_id)
- parsed_url = compat_urllib_parse_urlparse(urlh.geturl())
+ parsed_url = compat_urllib_parse_urlparse(urlh.url)
site_name = self._og_search_property('site_name', webpage, fatal=False, default='Česká televize')
playlist_title = self._og_search_title(webpage, default=None)
if site_name and playlist_title:
@@ -163,16 +163,16 @@ class CeskaTelevizeIE(InfoExtractor):
entries = []
for user_agent in (None, USER_AGENTS['Safari']):
- req = sanitized_Request(
+ req = Request(
'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist/',
data=urlencode_postdata(data))
- req.add_header('Content-type', 'application/x-www-form-urlencoded')
- req.add_header('x-addr', '127.0.0.1')
- req.add_header('X-Requested-With', 'XMLHttpRequest')
+ req.headers['Content-type'] = 'application/x-www-form-urlencoded'
+ req.headers['x-addr'] = '127.0.0.1'
+ req.headers['X-Requested-With'] = 'XMLHttpRequest'
if user_agent:
- req.add_header('User-Agent', user_agent)
- req.add_header('Referer', url)
+ req.headers['User-Agent'] = user_agent
+ req.headers['Referer'] = url
playlistpage = self._download_json(req, playlist_id, fatal=False)
@@ -183,8 +183,8 @@ class CeskaTelevizeIE(InfoExtractor):
if playlist_url == 'error_region':
raise ExtractorError(NOT_AVAILABLE_STRING, expected=True)
- req = sanitized_Request(compat_urllib_parse_unquote(playlist_url))
- req.add_header('Referer', url)
+ req = Request(compat_urllib_parse_unquote(playlist_url))
+ req.headers['Referer'] = url
playlist = self._download_json(req, playlist_id, fatal=False)
if not playlist:
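
[Editor's note] The hunks above migrate this extractor from the urllib-based sanitized_Request (headers set via add_header()) to the new networking.Request, whose headers attribute is an ordinary mapping. A hedged sketch of the new style, assuming only what the diff itself shows about the Request class:

from hypervideo_dl.networking import Request
from hypervideo_dl.utils import urlencode_postdata

req = Request(
    'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist/',
    data=urlencode_postdata({'playlist': '[]'}))  # placeholder payload
# Headers are now plain dict assignments instead of req.add_header(...)
req.headers['Content-type'] = 'application/x-www-form-urlencoded'
req.headers['x-addr'] = '127.0.0.1'
req.headers['X-Requested-With'] = 'XMLHttpRequest'
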
diff --git a/hypervideo_dl/extractor/chilloutzone.py b/hypervideo_dl/extractor/chilloutzone.py
index 1a2f77c..ac4252f 100644
--- a/hypervideo_dl/extractor/chilloutzone.py
+++ b/hypervideo_dl/extractor/chilloutzone.py
@@ -1,93 +1,123 @@
-import json
+import base64
from .common import InfoExtractor
-from .youtube import YoutubeIE
-from ..compat import compat_b64decode
from ..utils import (
clean_html,
- ExtractorError
+ int_or_none,
+ traverse_obj,
)
class ChilloutzoneIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?chilloutzone\.net/video/(?P<id>[\w|-]+)\.html'
+ _VALID_URL = r'https?://(?:www\.)?chilloutzone\.net/video/(?P<id>[\w-]+)\.html'
_TESTS = [{
- 'url': 'http://www.chilloutzone.net/video/enemene-meck-alle-katzen-weg.html',
+ 'url': 'https://www.chilloutzone.net/video/enemene-meck-alle-katzen-weg.html',
'md5': 'a76f3457e813ea0037e5244f509e66d1',
'info_dict': {
'id': 'enemene-meck-alle-katzen-weg',
'ext': 'mp4',
'title': 'Enemene Meck - Alle Katzen weg',
'description': 'Ist das der Umkehrschluss des Niesenden Panda-Babys?',
+ 'duration': 24,
},
}, {
'note': 'Video hosted at YouTube',
- 'url': 'http://www.chilloutzone.net/video/eine-sekunde-bevor.html',
+ 'url': 'https://www.chilloutzone.net/video/eine-sekunde-bevor.html',
'info_dict': {
'id': '1YVQaAgHyRU',
'ext': 'mp4',
'title': '16 Photos Taken 1 Second Before Disaster',
'description': 'md5:58a8fcf6a459fe0a08f54140f0ad1814',
'uploader': 'BuzzFeedVideo',
- 'uploader_id': 'BuzzFeedVideo',
+ 'uploader_id': '@BuzzFeedVideo',
'upload_date': '20131105',
+ 'availability': 'public',
+ 'thumbnail': 'https://i.ytimg.com/vi/1YVQaAgHyRU/maxresdefault.jpg',
+ 'tags': 'count:41',
+ 'like_count': int,
+ 'playable_in_embed': True,
+ 'channel_url': 'https://www.youtube.com/channel/UCpko_-a4wgz2u_DgDgd9fqA',
+ 'chapters': 'count:6',
+ 'live_status': 'not_live',
+ 'view_count': int,
+ 'categories': ['Entertainment'],
+ 'age_limit': 0,
+ 'channel_id': 'UCpko_-a4wgz2u_DgDgd9fqA',
+ 'duration': 100,
+ 'uploader_url': 'http://www.youtube.com/@BuzzFeedVideo',
+ 'channel_follower_count': int,
+ 'channel': 'BuzzFeedVideo',
},
}, {
- 'note': 'Video hosted at Vimeo',
- 'url': 'http://www.chilloutzone.net/video/icon-blending.html',
- 'md5': '2645c678b8dc4fefcc0e1b60db18dac1',
+ 'url': 'https://www.chilloutzone.net/video/icon-blending.html',
+ 'md5': '2f9d6850ec567b24f0f4fa143b9aa2f9',
'info_dict': {
- 'id': '85523671',
+ 'id': 'LLNkHpSjBfc',
'ext': 'mp4',
- 'title': 'The Sunday Times - Icons',
- 'description': 're:(?s)^Watch the making of - makingoficons.com.{300,}',
- 'uploader': 'Us',
- 'uploader_id': 'usfilms',
- 'upload_date': '20140131'
+ 'title': 'The Sunday Times Making of Icons',
+ 'description': 'md5:b9259fcf63a1669e42001e5db677f02a',
+ 'uploader': 'MadFoxUA',
+ 'uploader_id': '@MadFoxUA',
+ 'upload_date': '20140204',
+ 'channel_id': 'UCSZa9Y6-Vl7c11kWMcbAfCw',
+ 'channel_url': 'https://www.youtube.com/channel/UCSZa9Y6-Vl7c11kWMcbAfCw',
+ 'comment_count': int,
+ 'uploader_url': 'http://www.youtube.com/@MadFoxUA',
+ 'duration': 66,
+ 'live_status': 'not_live',
+ 'channel_follower_count': int,
+ 'playable_in_embed': True,
+ 'view_count': int,
+ 'like_count': int,
+ 'thumbnail': 'https://i.ytimg.com/vi/LLNkHpSjBfc/maxresdefault.jpg',
+ 'categories': ['Comedy'],
+ 'availability': 'public',
+ 'tags': [],
+ 'channel': 'MadFoxUA',
+ 'age_limit': 0,
+ },
+ }, {
+ 'url': 'https://www.chilloutzone.net/video/ordentlich-abgeschuettelt.html',
+ 'info_dict': {
+ 'id': 'ordentlich-abgeschuettelt',
+ 'ext': 'mp4',
+ 'title': 'Ordentlich abgeschüttelt',
+ 'description': 'md5:d41541966b75d3d1e8ea77a94ea0d329',
+ 'duration': 18,
},
}]
def _real_extract(self, url):
- mobj = self._match_valid_url(url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
+ b64_data = self._html_search_regex(
+ r'var cozVidData\s*=\s*"([^"]+)"', webpage, 'video data')
+ info = self._parse_json(base64.b64decode(b64_data).decode(), video_id)
- base64_video_info = self._html_search_regex(
- r'var cozVidData = "(.+?)";', webpage, 'video data')
- decoded_video_info = compat_b64decode(base64_video_info).decode('utf-8')
- video_info_dict = json.loads(decoded_video_info)
-
- # get video information from dict
- video_url = video_info_dict['mediaUrl']
- description = clean_html(video_info_dict.get('description'))
- title = video_info_dict['title']
- native_platform = video_info_dict['nativePlatform']
- native_video_id = video_info_dict['nativeVideoId']
- source_priority = video_info_dict['sourcePriority']
-
- # If nativePlatform is None a fallback mechanism is used (i.e. youtube embed)
- if native_platform is None:
- youtube_url = YoutubeIE._extract_url(webpage)
- if youtube_url:
- return self.url_result(youtube_url, ie=YoutubeIE.ie_key())
+ video_url = info.get('mediaUrl')
+ native_platform = info.get('nativePlatform')
- # Non Fallback: Decide to use native source (e.g. youtube or vimeo) or
- # the own CDN
- if source_priority == 'native':
+ if native_platform and info.get('sourcePriority') == 'native':
+ native_video_id = info['nativeVideoId']
if native_platform == 'youtube':
- return self.url_result(native_video_id, ie='Youtube')
- if native_platform == 'vimeo':
- return self.url_result(
- 'http://vimeo.com/' + native_video_id, ie='Vimeo')
+ return self.url_result(native_video_id, 'Youtube')
+ elif native_platform == 'vimeo':
+ return self.url_result(f'https://vimeo.com/{native_video_id}', 'Vimeo')
- if not video_url:
- raise ExtractorError('No video found')
+ elif not video_url:
+ # Possibly a standard youtube embed?
+ # TODO: Investigate if site still does this (there are no tests for it)
+ return self.url_result(url, 'Generic')
return {
'id': video_id,
'url': video_url,
'ext': 'mp4',
- 'title': title,
- 'description': description,
+ **traverse_obj(info, {
+ 'title': 'title',
+ 'description': ('description', {clean_html}),
+ 'duration': ('videoLength', {int_or_none}),
+ 'width': ('videoWidth', {int_or_none}),
+ 'height': ('videoHeight', {int_or_none}),
+ }),
}
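
[Editor's note] The rewritten extractor's core is just base64 plus JSON: the page embeds the metadata as var cozVidData = "<base64>". A standalone sketch with a fabricated stand-in page:

import base64
import json
import re

payload = base64.b64encode(json.dumps(
    {'title': 'demo', 'videoLength': '24'}).encode()).decode()
page = f'var cozVidData = "{payload}"'  # fabricated stand-in for the webpage

b64_data = re.search(r'var cozVidData\s*=\s*"([^"]+)"', page).group(1)
info = json.loads(base64.b64decode(b64_data).decode())
print(info['title'], int(info['videoLength']))  # -> demo 24
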
diff --git a/hypervideo_dl/extractor/cinetecamilano.py b/hypervideo_dl/extractor/cinetecamilano.py
index 5e770eb..9cffa11 100644
--- a/hypervideo_dl/extractor/cinetecamilano.py
+++ b/hypervideo_dl/extractor/cinetecamilano.py
@@ -1,6 +1,6 @@
import json
-import urllib.error
from .common import InfoExtractor
+from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
float_or_none,
@@ -40,7 +40,7 @@ class CinetecaMilanoIE(InfoExtractor):
'Authorization': try_get(self._get_cookies('https://www.cinetecamilano.it'), lambda x: f'Bearer {x["cnt-token"].value}') or ''
})
except ExtractorError as e:
- if ((isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 500)
+ if ((isinstance(e.cause, HTTPError) and e.cause.status == 500)
or isinstance(e.cause, json.JSONDecodeError)):
self.raise_login_required(method='cookies')
raise
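
[Editor's note] This one-line change follows a pattern repeated throughout the section (see also crackle.py and crunchyroll.py below): the networking layer's HTTPError exposes .status where urllib's HTTPError exposed .code. A hedged sketch of the updated check, with the import path taken from the diff:

from hypervideo_dl.networking.exceptions import HTTPError
from hypervideo_dl.utils import ExtractorError

def is_server_error(exc: ExtractorError) -> bool:
    # e.cause.code (urllib) becomes e.cause.status (networking layer)
    return isinstance(exc.cause, HTTPError) and exc.cause.status == 500
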
diff --git a/hypervideo_dl/extractor/ciscowebex.py b/hypervideo_dl/extractor/ciscowebex.py
index 44595d8..85585df 100644
--- a/hypervideo_dl/extractor/ciscowebex.py
+++ b/hypervideo_dl/extractor/ciscowebex.py
@@ -1,5 +1,6 @@
from .common import InfoExtractor
from ..utils import (
+ ExtractorError,
int_or_none,
try_get,
unified_timestamp,
@@ -32,17 +33,36 @@ class CiscoWebexIE(InfoExtractor):
if rcid:
webpage = self._download_webpage(url, None, note='Getting video ID')
url = self._search_regex(self._VALID_URL, webpage, 'redirection url', group='url')
- url = self._request_webpage(url, None, note='Resolving final URL').geturl()
+ url = self._request_webpage(url, None, note='Resolving final URL').url
mobj = self._match_valid_url(url)
subdomain = mobj.group('subdomain')
siteurl = mobj.group('siteurl_1') or mobj.group('siteurl_2')
video_id = mobj.group('id')
- stream = self._download_json(
+ password = self.get_param('videopassword')
+
+ headers = {'Accept': 'application/json'}
+ if password:
+ headers['accessPwd'] = password
+
+ stream, urlh = self._download_json_handle(
'https://%s.webex.com/webappng/api/v1/recordings/%s/stream' % (subdomain, video_id),
- video_id, fatal=False, query={'siteurl': siteurl})
- if not stream:
- self.raise_login_required(method='cookies')
+ video_id, headers=headers, query={'siteurl': siteurl}, expected_status=(403, 429))
+
+ if urlh.status == 403:
+ if stream['code'] == 53004:
+ self.raise_login_required()
+ if stream['code'] == 53005:
+ if password:
+ raise ExtractorError('Wrong password', expected=True)
+ raise ExtractorError(
+ 'This video is protected by a password, use the --video-password option', expected=True)
+ raise ExtractorError(f'{self.IE_NAME} said: {stream["code"]} - {stream["message"]}', expected=True)
+
+ if urlh.status == 429:
+ self.raise_login_required(
+ f'{self.IE_NAME} asks you to solve a CAPTCHA. Solve CAPTCHA in browser and',
+ method='cookies')
video_id = stream.get('recordUUID') or video_id
@@ -78,7 +98,7 @@ class CiscoWebexIE(InfoExtractor):
'title': stream['recordName'],
'description': stream.get('description'),
'uploader': stream.get('ownerDisplayName'),
- 'uploader_id': stream.get('ownerUserName') or stream.get('ownerId'), # mail or id
+ 'uploader_id': stream.get('ownerUserName') or stream.get('ownerId'),
'timestamp': unified_timestamp(stream.get('createTime')),
'duration': int_or_none(stream.get('duration'), 1000),
'webpage_url': 'https://%s.webex.com/recordingservice/sites/%s/recording/playback/%s' % (subdomain, siteurl, video_id),
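
[Editor's note] The Webex change above also shows the expected_status idiom: let _download_json_handle return the body even on 403/429, then branch on the response status instead of catching exceptions. A hedged, incomplete sketch (method names as in the diff; this is not the real extractor):

from hypervideo_dl.extractor.common import InfoExtractor

class WebexSketchIE(InfoExtractor):  # illustrative subclass only
    def _fetch_stream(self, subdomain, video_id, siteurl):
        stream, urlh = self._download_json_handle(
            f'https://{subdomain}.webex.com/webappng/api/v1/recordings/{video_id}/stream',
            video_id, query={'siteurl': siteurl}, expected_status=(403, 429))
        if urlh.status == 403:   # password/login errors arrive as JSON bodies
            self.raise_login_required()
        if urlh.status == 429:   # CAPTCHA wall
            self.raise_login_required(method='cookies')
        return stream
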
diff --git a/hypervideo_dl/extractor/clipchamp.py b/hypervideo_dl/extractor/clipchamp.py
new file mode 100644
index 0000000..a8bdf7e
--- /dev/null
+++ b/hypervideo_dl/extractor/clipchamp.py
@@ -0,0 +1,61 @@
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ traverse_obj,
+ unified_timestamp,
+ url_or_none,
+)
+
+
+class ClipchampIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?clipchamp\.com/watch/(?P<id>[\w-]+)'
+ _TESTS = [{
+ 'url': 'https://clipchamp.com/watch/gRXZ4ZhdDaU',
+ 'info_dict': {
+ 'id': 'gRXZ4ZhdDaU',
+ 'ext': 'mp4',
+ 'title': 'Untitled video',
+ 'uploader': 'Alexander Schwartz',
+ 'timestamp': 1680805580,
+ 'upload_date': '20230406',
+ 'thumbnail': r're:^https?://.+\.jpg',
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }]
+
+ _STREAM_URL_TMPL = 'https://%s.cloudflarestream.com/%s/manifest/video.%s'
+ _STREAM_URL_QUERY = {'parentOrigin': 'https://clipchamp.com'}
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ data = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['video']
+
+ storage_location = data.get('storage_location')
+ if storage_location != 'cf_stream':
+ raise ExtractorError(f'Unsupported clip storage location "{storage_location}"')
+
+ path = data['download_url']
+ iframe = self._download_webpage(
+ f'https://iframe.cloudflarestream.com/{path}', video_id, 'Downloading player iframe')
+ subdomain = self._search_regex(
+ r'\bcustomer-domain-prefix=["\']([\w-]+)["\']', iframe,
+ 'subdomain', fatal=False) or 'customer-2ut9yn3y6fta1yxe'
+
+ formats = self._extract_mpd_formats(
+ self._STREAM_URL_TMPL % (subdomain, path, 'mpd'), video_id,
+ query=self._STREAM_URL_QUERY, fatal=False, mpd_id='dash')
+ formats.extend(self._extract_m3u8_formats(
+ self._STREAM_URL_TMPL % (subdomain, path, 'm3u8'), video_id, 'mp4',
+ query=self._STREAM_URL_QUERY, fatal=False, m3u8_id='hls'))
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'uploader': ' '.join(traverse_obj(data, ('creator', ('first_name', 'last_name'), {str}))) or None,
+ **traverse_obj(data, {
+ 'title': ('project', 'project_name', {str}),
+ 'timestamp': ('created_at', {unified_timestamp}),
+ 'thumbnail': ('thumbnail_url', {url_or_none}),
+ }),
+ }
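
[Editor's note] The new Clipchamp extractor resolves formats by templating Cloudflare Stream manifest URLs around a customer subdomain scraped from the player iframe. A tiny sketch with made-up values:

STREAM_URL_TMPL = 'https://%s.cloudflarestream.com/%s/manifest/video.%s'

subdomain = 'customer-2ut9yn3y6fta1yxe'   # fallback value from the diff
path = 'example-download-url-token'       # stands in for data['download_url']
for ext in ('mpd', 'm3u8'):               # DASH first, then HLS, as above
    print(STREAM_URL_TMPL % (subdomain, path, ext))
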
diff --git a/hypervideo_dl/extractor/clyp.py b/hypervideo_dl/extractor/clyp.py
index 0aaf73d..273d002 100644
--- a/hypervideo_dl/extractor/clyp.py
+++ b/hypervideo_dl/extractor/clyp.py
@@ -9,22 +9,22 @@ from ..utils import (
class ClypIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?clyp\.it/(?P<id>[a-z0-9]+)'
_TESTS = [{
- 'url': 'https://clyp.it/ojz2wfah',
- 'md5': '1d4961036c41247ecfdcc439c0cddcbb',
+ 'url': 'https://clyp.it/iynkjk4b',
+ 'md5': '4bc6371c65210e7b372097fce4d92441',
'info_dict': {
- 'id': 'ojz2wfah',
- 'ext': 'mp3',
- 'title': 'Krisson80 - bits wip wip',
- 'description': '#Krisson80BitsWipWip #chiptune\n#wip',
- 'duration': 263.21,
- 'timestamp': 1443515251,
- 'upload_date': '20150929',
+ 'id': 'iynkjk4b',
+ 'ext': 'ogg',
+ 'title': 'research',
+ 'description': '#Research',
+ 'duration': 51.278,
+ 'timestamp': 1435524981,
+ 'upload_date': '20150628',
},
}, {
'url': 'https://clyp.it/b04p1odi?token=b0078e077e15835845c528a44417719d',
'info_dict': {
'id': 'b04p1odi',
- 'ext': 'mp3',
+ 'ext': 'ogg',
'title': 'GJ! (Reward Edit)',
'description': 'Metal Resistance (THE ONE edition)',
'duration': 177.789,
@@ -34,6 +34,17 @@ class ClypIE(InfoExtractor):
'params': {
'skip_download': True,
},
+ }, {
+ 'url': 'https://clyp.it/v42214lc',
+ 'md5': '4aca4dfc3236fb6d6ddc4ea08314f33f',
+ 'info_dict': {
+ 'id': 'v42214lc',
+ 'ext': 'wav',
+ 'title': 'i dont wanna go (old version)',
+ 'duration': 113.528,
+ 'timestamp': 1607348505,
+ 'upload_date': '20201207',
+ },
}]
def _real_extract(self, url):
@@ -59,8 +70,20 @@ class ClypIE(InfoExtractor):
'url': format_url,
'format_id': format_id,
'vcodec': 'none',
+ 'acodec': ext.lower(),
})
+ page = self._download_webpage(url, video_id=audio_id)
+ wav_url = self._html_search_regex(
+ r'var\s*wavStreamUrl\s*=\s*["\'](?P<url>https?://[^\'"]+)', page, 'url', default=None)
+ if wav_url:
+ formats.append({
+ 'url': wav_url,
+ 'format_id': 'wavStreamUrl',
+ 'vcodec': 'none',
+ 'acodec': 'wav',
+ })
+
title = metadata['Title']
description = metadata.get('Description')
duration = float_or_none(metadata.get('Duration'))
diff --git a/hypervideo_dl/extractor/comedycentral.py b/hypervideo_dl/extractor/comedycentral.py
index 05fc9f2..27d295b 100644
--- a/hypervideo_dl/extractor/comedycentral.py
+++ b/hypervideo_dl/extractor/comedycentral.py
@@ -2,7 +2,7 @@ from .mtv import MTVServicesInfoExtractor
class ComedyCentralIE(MTVServicesInfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?cc\.com/(?:episodes|video(?:-clips)?|collection-playlist)/(?P<id>[0-9a-z]{6})'
+ _VALID_URL = r'https?://(?:www\.)?cc\.com/(?:episodes|video(?:-clips)?|collection-playlist|movies)/(?P<id>[0-9a-z]{6})'
_FEED_URL = 'http://comedycentral.com/feeds/mrss/'
_TESTS = [{
@@ -25,6 +25,9 @@ class ComedyCentralIE(MTVServicesInfoExtractor):
}, {
'url': 'https://www.cc.com/collection-playlist/cosnej/stand-up-specials/t6vtjb',
'only_matching': True,
+ }, {
+ 'url': 'https://www.cc.com/movies/tkp406/a-cluesterfuenke-christmas',
+ 'only_matching': True,
}]
diff --git a/hypervideo_dl/extractor/common.py b/hypervideo_dl/extractor/common.py
index 4b56307..5a561a2 100644
--- a/hypervideo_dl/extractor/common.py
+++ b/hypervideo_dl/extractor/common.py
@@ -13,6 +13,7 @@ import netrc
import os
import random
import re
+import subprocess
import sys
import time
import types
@@ -21,9 +22,21 @@ import urllib.request
import xml.etree.ElementTree
from ..compat import functools # isort: split
-from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name
+from ..compat import (
+ compat_etree_fromstring,
+ compat_expanduser,
+ compat_os_name,
+ urllib_req_to_req,
+)
from ..cookies import LenientSimpleCookie
from ..downloader.f4m import get_base_url, remove_encrypted_media
+from ..downloader.hls import HlsFD
+from ..networking import HEADRequest, Request
+from ..networking.exceptions import (
+ HTTPError,
+ IncompleteRead,
+ network_exceptions,
+)
from ..utils import (
IDENTITY,
JSON_LD_RE,
@@ -33,6 +46,7 @@ from ..utils import (
GeoRestrictedError,
GeoUtils,
LenientJSONDecoder,
+ Popen,
RegexNotFoundError,
RetryManager,
UnsupportedError,
@@ -55,7 +69,7 @@ from ..utils import (
join_nonempty,
js_to_json,
mimetype2ext,
- network_exceptions,
+ netrc_from_content,
orderedSet,
parse_bitrate,
parse_codecs,
@@ -65,21 +79,20 @@ from ..utils import (
parse_resolution,
sanitize_filename,
sanitize_url,
- sanitized_Request,
smuggle_url,
str_or_none,
str_to_int,
strip_or_none,
traverse_obj,
+ truncate_string,
try_call,
try_get,
unescapeHTML,
unified_strdate,
unified_timestamp,
- update_Request,
- update_url_query,
url_basename,
url_or_none,
+ urlhandle_detect_ext,
urljoin,
variadic,
xpath_element,
@@ -129,6 +142,7 @@ class InfoExtractor:
is parsed from a string (in case of
fragmented media)
for MSS - URL of the ISM manifest.
+ * request_data Data to send in POST request to the URL
* manifest_url
The URL of the manifest file in case of
fragmented media:
@@ -216,7 +230,19 @@ class InfoExtractor:
width : height ratio as float.
* no_resume The server does not support resuming the
(HTTP or RTMP) download. Boolean.
- * has_drm The format has DRM and cannot be downloaded. Boolean
+ * has_drm True if the format has DRM and cannot be downloaded.
+ 'maybe' if the format may have DRM and has to be tested before download.
+ * extra_param_to_segment_url A query string to append to each
+ fragment's URL, or to update each existing query string
+ with. Only applied by the native HLS/DASH downloaders.
+ * hls_aes A dictionary of HLS AES-128 decryption information
+ used by the native HLS downloader to override the
+ values in the media playlist when an '#EXT-X-KEY' tag
+ is present in the playlist:
+ * uri The URI from which the key will be downloaded
+ * key The key (as hex) used to decrypt fragments.
+ If `key` is given, any key URI will be ignored
+ * iv The IV (as hex) used to decrypt fragments
* downloader_options A dictionary of downloader options
(For internal use only)
* http_chunk_size Chunk size for HTTP downloads
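
[Editor's note] An illustrative format dict exercising the new per-format fields documented above (all values are made up); extra_param_to_segment_url and hls_aes are honored only by the native HLS/DASH downloaders:

fmt = {
    'url': 'https://example.com/media.m3u8',
    'has_drm': 'maybe',                    # new tri-state: True / False / 'maybe'
    'extra_param_to_segment_url': 'token=abc123',
    'hls_aes': {
        'uri': 'https://example.com/aes-key',       # where to fetch the key
        'key': '00112233445566778899aabbccddeeff',  # hex; if set, the URI is ignored
        'iv': '0123456789abcdef0123456789abcdef',   # hex IV
    },
}
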
@@ -271,6 +297,7 @@ class InfoExtractor:
channel_id: Id of the channel.
channel_url: Full URL to a channel webpage.
channel_follower_count: Number of followers of the channel.
+ channel_is_verified: Whether the channel is verified on the platform.
location: Physical location where the video was filmed.
subtitles: The available subtitles as a dictionary in the format
{tag: subformats}. "tag" is usually a language code, and
@@ -299,6 +326,11 @@ class InfoExtractor:
* "author" - human-readable name of the comment author
* "author_id" - user ID of the comment author
* "author_thumbnail" - The thumbnail of the comment author
+ * "author_url" - The url to the comment author's page
+ * "author_is_verified" - Whether the author is verified
+ on the platform
+ * "author_is_uploader" - Whether the comment is made by
+ the video uploader
* "id" - Comment ID
* "html" - Comment as HTML
* "text" - Plain text of the comment
@@ -310,8 +342,8 @@ class InfoExtractor:
* "dislike_count" - Number of negative ratings of the comment
* "is_favorited" - Whether the comment is marked as
favorite by the video uploader
- * "author_is_uploader" - Whether the comment is made by
- the video uploader
+ * "is_pinned" - Whether the comment is pinned to
+ the top of the comments
age_limit: Age restriction for the video, as an integer (years)
webpage_url: The URL to the video webpage, if given to hypervideo it
should allow to get the same result again. (It will be set
@@ -335,6 +367,10 @@ class InfoExtractor:
* "start_time" - The start time of the chapter in seconds
* "end_time" - The end time of the chapter in seconds
* "title" (optional, string)
+ heatmap: A list of dictionaries, with the following entries:
+ * "start_time" - The start time of the data point in seconds
+ * "end_time" - The end time of the data point in seconds
+ * "value" - The normalized value of the data point (float between 0 and 1)
playable_in_embed: Whether this video is allowed to play in embedded
players on other sites. Can be True (=always allowed),
False (=never allowed), None (=unknown), or a string
@@ -446,8 +482,8 @@ class InfoExtractor:
Subclasses of this should also be added to the list of extractors and
- should define a _VALID_URL regexp and, re-define the _real_extract() and
- (optionally) _real_initialize() methods.
+ should define _VALID_URL as a regexp or a Sequence of regexps, and
+ re-define the _real_extract() and (optionally) _real_initialize() methods.
Subclasses may also override suitable() if necessary, but ensure the function
signature is preserved and that this function imports everything it needs
@@ -510,7 +546,7 @@ class InfoExtractor:
_EMBED_REGEX = []
def _login_hint(self, method=NO_DEFAULT, netrc=None):
- password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
+ password_hint = f'--username and --password, --netrc-cmd, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
return {
None: '',
'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
@@ -537,8 +573,8 @@ class InfoExtractor:
# we have cached the regexp for *this* class, whereas getattr would also
# match the superclass
if '_VALID_URL_RE' not in cls.__dict__:
- cls._VALID_URL_RE = re.compile(cls._VALID_URL)
- return cls._VALID_URL_RE.match(url)
+ cls._VALID_URL_RE = tuple(map(re.compile, variadic(cls._VALID_URL)))
+ return next(filter(None, (regex.match(url) for regex in cls._VALID_URL_RE)), None)
@classmethod
def suitable(cls, url):
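
[Editor's note] With _VALID_URL now allowed to be a sequence of patterns, the rewritten _match_valid_url compiles them all and returns the first match. A standalone sketch of the same logic with hypothetical URLs:

import re

VALID_URLS = (
    r'https?://(?:www\.)?example\.com/watch/(?P<id>\w+)',
    r'https?://(?:www\.)?example\.org/v/(?P<id>\w+)',
)
COMPILED = tuple(map(re.compile, VALID_URLS))

def match_valid_url(url):
    # First matching pattern wins, as in the rewritten classmethod above
    return next(filter(None, (rx.match(url) for rx in COMPILED)), None)

print(match_valid_url('https://example.org/v/abc123').group('id'))  # -> abc123
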
@@ -674,7 +710,8 @@ class InfoExtractor:
for _ in range(2):
try:
self.initialize()
- self.write_debug('Extracting URL: %s' % url)
+ self.to_screen('Extracting URL: %s' % (
+ url if self.get_param('verbose') else truncate_string(url, 100, 20)))
ie_result = self._real_extract(url)
if ie_result is None:
return None
@@ -692,11 +729,11 @@ class InfoExtractor:
except UnsupportedError:
raise
except ExtractorError as e:
- e.video_id = e.video_id or self.get_temp_id(url),
+ e.video_id = e.video_id or self.get_temp_id(url)
e.ie = e.ie or self.IE_NAME,
e.traceback = e.traceback or sys.exc_info()[2]
raise
- except http.client.IncompleteRead as e:
+ except IncompleteRead as e:
raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
except (KeyError, StopIteration) as e:
raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
@@ -755,20 +792,25 @@ class InfoExtractor:
@staticmethod
def __can_accept_status_code(err, expected_status):
- assert isinstance(err, urllib.error.HTTPError)
+ assert isinstance(err, HTTPError)
if expected_status is None:
return False
elif callable(expected_status):
- return expected_status(err.code) is True
+ return expected_status(err.status) is True
else:
- return err.code in variadic(expected_status)
+ return err.status in variadic(expected_status)
def _create_request(self, url_or_request, data=None, headers=None, query=None):
if isinstance(url_or_request, urllib.request.Request):
- return update_Request(url_or_request, data=data, headers=headers, query=query)
- if query:
- url_or_request = update_url_query(url_or_request, query)
- return sanitized_Request(url_or_request, data, headers or {})
+ self._downloader.deprecation_warning(
+ 'Passing a urllib.request.Request to _create_request() is deprecated. '
+ 'Use hypervideo_dl.networking.common.Request instead.')
+ url_or_request = urllib_req_to_req(url_or_request)
+ elif not isinstance(url_or_request, Request):
+ url_or_request = Request(url_or_request)
+
+ url_or_request.update(data=data, headers=headers, query=query)
+ return url_or_request
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
"""
@@ -804,14 +846,9 @@ class InfoExtractor:
try:
return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
except network_exceptions as err:
- if isinstance(err, urllib.error.HTTPError):
+ if isinstance(err, HTTPError):
if self.__can_accept_status_code(err, expected_status):
- # Retain reference to error to prevent file object from
- # being closed before it can be read. Works around the
- # effects of <https://bugs.python.org/issue15002>
- # introduced in Python 3.4.1.
- err.fp._error = err
- return err.fp
+ return err.response
if errnote is False:
return False
@@ -943,11 +980,11 @@ class InfoExtractor:
if prefix is not None:
webpage_bytes = prefix + webpage_bytes
if self.get_param('dump_intermediate_pages', False):
- self.to_screen('Dumping request to ' + urlh.geturl())
+ self.to_screen('Dumping request to ' + urlh.url)
dump = base64.b64encode(webpage_bytes).decode('ascii')
self._downloader.to_screen(dump)
if self.get_param('write_pages'):
- filename = self._request_dump_filename(urlh.geturl(), video_id)
+ filename = self._request_dump_filename(urlh.url, video_id)
self.to_screen(f'Saving request to {filename}')
with open(filename, 'wb') as outf:
outf.write(webpage_bytes)
@@ -1005,7 +1042,7 @@ class InfoExtractor:
fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
if self.get_param('load_pages'):
url_or_request = self._create_request(url_or_request, data, headers, query)
- filename = self._request_dump_filename(url_or_request.full_url, video_id)
+ filename = self._request_dump_filename(url_or_request.url, video_id)
self.to_screen(f'Loading request from {filename}')
try:
with open(filename, 'rb') as dumpf:
@@ -1079,7 +1116,7 @@ class InfoExtractor:
while True:
try:
return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
- except http.client.IncompleteRead as e:
+ except IncompleteRead as e:
try_count += 1
if try_count >= tries:
raise e
@@ -1260,51 +1297,53 @@ class InfoExtractor:
Like _search_regex, but strips HTML tags and unescapes entities.
"""
res = self._search_regex(pattern, string, name, default, fatal, flags, group)
- if res:
- return clean_html(res).strip()
- else:
- return res
+ if isinstance(res, tuple):
+ return tuple(map(clean_html, res))
+ return clean_html(res)
def _get_netrc_login_info(self, netrc_machine=None):
- username = None
- password = None
netrc_machine = netrc_machine or self._NETRC_MACHINE
- if self.get_param('usenetrc', False):
- try:
- netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
- if os.path.isdir(netrc_file):
- netrc_file = os.path.join(netrc_file, '.netrc')
- info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
- if info is not None:
- username = info[0]
- password = info[2]
- else:
- raise netrc.NetrcParseError(
- 'No authenticators for %s' % netrc_machine)
- except (OSError, netrc.NetrcParseError) as err:
- self.report_warning(
- 'parsing .netrc: %s' % error_to_compat_str(err))
+ cmd = self.get_param('netrc_cmd')
+ if cmd:
+ cmd = cmd.replace('{}', netrc_machine)
+ self.to_screen(f'Executing command: {cmd}')
+ stdout, _, ret = Popen.run(cmd, text=True, shell=True, stdout=subprocess.PIPE)
+ if ret != 0:
+ raise OSError(f'Command returned error code {ret}')
+ info = netrc_from_content(stdout).authenticators(netrc_machine)
+
+ elif self.get_param('usenetrc', False):
+ netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
+ if os.path.isdir(netrc_file):
+ netrc_file = os.path.join(netrc_file, '.netrc')
+ info = netrc.netrc(netrc_file).authenticators(netrc_machine)
- return username, password
+ else:
+ return None, None
+ if not info:
+ raise netrc.NetrcParseError(f'No authenticators for {netrc_machine}')
+ return info[0], info[2]
def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
"""
Get the login info as (username, password)
First look for the manually specified credentials using username_option
and password_option as keys in params dictionary. If no such credentials
- available look in the netrc file using the netrc_machine or _NETRC_MACHINE
- value.
+ are available, try the netrc_cmd if it is defined or look in the
+ netrc file using the netrc_machine or _NETRC_MACHINE value.
If there's no info available, return (None, None)
"""
- # Attempt to use provided username and password or .netrc data
username = self.get_param(username_option)
if username is not None:
password = self.get_param(password_option)
else:
- username, password = self._get_netrc_login_info(netrc_machine)
-
+ try:
+ username, password = self._get_netrc_login_info(netrc_machine)
+ except (OSError, netrc.NetrcParseError) as err:
+ self.report_warning(f'Failed to parse .netrc: {err}')
+ return None, None
return username, password
def _get_tfa_info(self, note='two-factor verification code'):
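
[Editor's note] The new --netrc-cmd path above runs a user-supplied command (with '{}' replaced by the machine name) and parses its stdout as netrc content via the netrc_from_content helper. A standalone approximation using only the stdlib (POSIX shell assumed; the temp file stands in for netrc_from_content):

import netrc
import subprocess
import tempfile

machine = 'example.com'
cmd = "echo 'machine {} login alice password s3cret'"  # stand-in for --netrc-cmd
stdout = subprocess.run(cmd.replace('{}', machine), shell=True, text=True,
                        capture_output=True, check=True).stdout

with tempfile.NamedTemporaryFile('w', suffix='.netrc', delete=False) as f:
    f.write(stdout)
info = netrc.netrc(f.name).authenticators(machine)
print(info[0], info[2])  # -> alice s3cret
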
@@ -1324,7 +1363,7 @@ class InfoExtractor:
# Helper functions for extracting OpenGraph info
@staticmethod
def _og_regexes(prop):
- content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
+ content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))'
property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
% {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
template = r'<meta[^>]+?%s[^>]+?%s'
@@ -1394,10 +1433,16 @@ class InfoExtractor:
# And then there are the jokers who advertise that they use RTA, but actually don't.
AGE_LIMIT_MARKERS = [
r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
+ r'>[^<]*you acknowledge you are at least (\d+) years old',
+ r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b',
]
- if any(re.search(marker, html) for marker in AGE_LIMIT_MARKERS):
- return 18
- return 0
+
+ age_limit = 0
+ for marker in AGE_LIMIT_MARKERS:
+ mobj = re.search(marker, html)
+ if mobj:
+ age_limit = max(age_limit, int(traverse_obj(mobj, 1, default=18)))
+ return age_limit
def _media_rating_search(self, html):
# See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
@@ -1650,11 +1695,8 @@ class InfoExtractor:
if js is None:
return {}
- args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
-
- for key, val in args.items():
- if val in ('undefined', 'void 0'):
- args[key] = 'null'
+ args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json(
+ f'[{arg_vals}]', video_id, transform_source=js_to_json, fatal=fatal) or ())))
ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
return traverse_obj(ret, traverse) or {}
@@ -1757,6 +1799,9 @@ class InfoExtractor:
def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
transform_source=lambda s: fix_xml_ampersands(s).strip(),
fatal=True, m3u8_id=None, data=None, headers={}, query={}):
+ if self.get_param('ignore_no_formats_error'):
+ fatal = False
+
res = self._download_xml_handle(
manifest_url, video_id, 'Downloading f4m manifest',
'Unable to download f4m manifest',
@@ -1768,7 +1813,7 @@ class InfoExtractor:
return []
manifest, urlh = res
- manifest_url = urlh.geturl()
+ manifest_url = urlh.url
return self._parse_f4m_formats(
manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
@@ -1906,6 +1951,17 @@ class InfoExtractor:
errnote=None, fatal=True, live=False, data=None, headers={},
query={}):
+ if self.get_param('ignore_no_formats_error'):
+ fatal = False
+
+ if not m3u8_url:
+ if errnote is not False:
+ errnote = errnote or 'Failed to obtain m3u8 URL'
+ if fatal:
+ raise ExtractorError(errnote, video_id=video_id)
+ self.report_warning(f'{errnote}{bug_reports_message()}')
+ return [], {}
+
res = self._download_webpage_handle(
m3u8_url, video_id,
note='Downloading m3u8 information' if note is None else note,
@@ -1916,7 +1972,7 @@ class InfoExtractor:
return [], {}
m3u8_doc, urlh = res
- m3u8_url = urlh.geturl()
+ m3u8_url = urlh.url
return self._parse_m3u8_formats_and_subtitles(
m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
@@ -1930,11 +1986,7 @@ class InfoExtractor:
errnote=None, fatal=True, data=None, headers={}, query={},
video_id=None):
formats, subtitles = [], {}
-
- has_drm = re.search('|'.join([
- r'#EXT-X-FAXS-CM:', # Adobe Flash Access
- r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay
- ]), m3u8_doc)
+ has_drm = HlsFD._has_drm(m3u8_doc)
def format_url(url):
return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)
@@ -2032,6 +2084,7 @@ class InfoExtractor:
'protocol': entry_protocol,
'preference': preference,
'quality': quality,
+ 'has_drm': has_drm,
'vcodec': 'none' if media_type == 'AUDIO' else None,
} for idx in _extract_m3u8_playlist_indices(manifest_url))
@@ -2091,6 +2144,7 @@ class InfoExtractor:
'protocol': entry_protocol,
'preference': preference,
'quality': quality,
+ 'has_drm': has_drm,
}
resolution = last_stream_inf.get('RESOLUTION')
if resolution:
@@ -2157,13 +2211,23 @@ class InfoExtractor:
return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
- if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
+ if '#EXT-X-ENDLIST' not in m3u8_vod:
return None
return int(sum(
float(line[len('#EXTINF:'):].split(',')[0])
for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
+ def _extract_mpd_vod_duration(
+ self, mpd_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
+
+ mpd_doc = self._download_xml(
+ mpd_url, video_id,
+ note='Downloading MPD VOD manifest' if note is None else note,
+ errnote='Failed to download VOD manifest' if errnote is None else errnote,
+ fatal=False, data=data, headers=headers, query=query) or {}
+ return int_or_none(parse_duration(mpd_doc.get('mediaPresentationDuration')))
+
@staticmethod
def _xpath_ns(path, namespace=None):
if not namespace:
@@ -2177,22 +2241,17 @@ class InfoExtractor:
return '/'.join(out)
def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
+ if self.get_param('ignore_no_formats_error'):
+ fatal = False
+
res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
if res is False:
assert not fatal
return [], {}
-
smil, urlh = res
- smil_url = urlh.geturl()
- namespace = self._parse_smil_namespace(smil)
-
- fmts = self._parse_smil_formats(
- smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
- subs = self._parse_smil_subtitles(
- smil, namespace=namespace)
-
- return fmts, subs
+ return self._parse_smil_formats_and_subtitles(smil, urlh.url, video_id, f4m_params=f4m_params,
+ namespace=self._parse_smil_namespace(smil))
def _extract_smil_formats(self, *args, **kwargs):
fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
@@ -2206,7 +2265,7 @@ class InfoExtractor:
return {}
smil, urlh = res
- smil_url = urlh.geturl()
+ smil_url = urlh.url
return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
@@ -2218,9 +2277,8 @@ class InfoExtractor:
def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
namespace = self._parse_smil_namespace(smil)
- formats = self._parse_smil_formats(
+ formats, subtitles = self._parse_smil_formats_and_subtitles(
smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
- subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
video_id = os.path.splitext(url_basename(smil_url))[0]
title = None
@@ -2259,7 +2317,14 @@ class InfoExtractor:
return self._search_regex(
r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
- def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
+ def _parse_smil_formats(self, *args, **kwargs):
+ fmts, subs = self._parse_smil_formats_and_subtitles(*args, **kwargs)
+ if subs:
+ self._report_ignoring_subs('SMIL')
+ return fmts
+
+ def _parse_smil_formats_and_subtitles(
+ self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
base = smil_url
for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
b = meta.get('base') or meta.get('httpBase')
@@ -2267,7 +2332,7 @@ class InfoExtractor:
base = b
break
- formats = []
+ formats, subtitles = [], {}
rtmp_count = 0
http_count = 0
m3u8_count = 0
@@ -2287,7 +2352,8 @@ class InfoExtractor:
height = int_or_none(medium.get('height'))
proto = medium.get('proto')
ext = medium.get('ext')
- src_ext = determine_ext(src)
+ src_ext = determine_ext(src, default_ext=None) or ext or urlhandle_detect_ext(
+ self._request_webpage(HEADRequest(src), video_id, note='Requesting extension info', fatal=False))
streamer = medium.get('streamer') or base
if proto == 'rtmp' or streamer.startswith('rtmp'):
@@ -2314,8 +2380,9 @@ class InfoExtractor:
src_url = src_url.strip()
if proto == 'm3u8' or src_ext == 'm3u8':
- m3u8_formats = self._extract_m3u8_formats(
+ m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
+ self._merge_subtitles(m3u8_subs, target=subtitles)
if len(m3u8_formats) == 1:
m3u8_count += 1
m3u8_formats[0].update({
@@ -2336,11 +2403,15 @@ class InfoExtractor:
f4m_url += urllib.parse.urlencode(f4m_params)
formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
elif src_ext == 'mpd':
- formats.extend(self._extract_mpd_formats(
- src_url, video_id, mpd_id='dash', fatal=False))
+ mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles(
+ src_url, video_id, mpd_id='dash', fatal=False)
+ formats.extend(mpd_formats)
+ self._merge_subtitles(mpd_subs, target=subtitles)
elif re.search(r'\.ism/[Mm]anifest', src_url):
- formats.extend(self._extract_ism_formats(
- src_url, video_id, ism_id='mss', fatal=False))
+ ism_formats, ism_subs = self._extract_ism_formats_and_subtitles(
+ src_url, video_id, ism_id='mss', fatal=False)
+ formats.extend(ism_formats)
+ self._merge_subtitles(ism_subs, target=subtitles)
elif src_url.startswith('http') and self._is_valid_url(src, video_id):
http_count += 1
formats.append({
@@ -2371,7 +2442,10 @@ class InfoExtractor:
'format_note': 'SMIL storyboards',
})
- return formats
+ smil_subs = self._parse_smil_subtitles(smil, namespace=namespace)
+ self._merge_subtitles(smil_subs, target=subtitles)
+
+ return formats, subtitles
def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
urls = []
@@ -2397,7 +2471,7 @@ class InfoExtractor:
return []
xspf, urlh = res
- xspf_url = urlh.geturl()
+ xspf_url = urlh.url
return self._parse_xspf(
xspf, playlist_id, xspf_url=xspf_url,
@@ -2452,6 +2526,10 @@ class InfoExtractor:
def _extract_mpd_formats_and_subtitles(
self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
fatal=True, data=None, headers={}, query={}):
+
+ if self.get_param('ignore_no_formats_error'):
+ fatal = False
+
res = self._download_xml_handle(
mpd_url, video_id,
note='Downloading MPD manifest' if note is None else note,
@@ -2464,7 +2542,7 @@ class InfoExtractor:
return [], {}
# We could have been redirected to a new url when we retrieved our mpd file.
- mpd_url = urlh.geturl()
+ mpd_url = urlh.url
mpd_base_url = base_url(mpd_url)
return self._parse_mpd_formats_and_subtitles(
@@ -2821,6 +2899,9 @@ class InfoExtractor:
return fmts
def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
+ if self.get_param('ignore_no_formats_error'):
+ fatal = False
+
res = self._download_xml_handle(
ism_url, video_id,
note='Downloading ISM manifest' if note is None else note,
@@ -2832,7 +2913,7 @@ class InfoExtractor:
if ism_doc is None:
return [], {}
- return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
+ return self._parse_ism_formats_and_subtitles(ism_doc, urlh.url, ism_id)
def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
"""
@@ -2928,6 +3009,8 @@ class InfoExtractor:
'protocol': 'ism',
'fragments': fragments,
'has_drm': ism_doc.find('Protection') is not None,
+ 'language': stream_language,
+ 'audio_channels': int_or_none(track.get('Channels')),
'_download_params': {
'stream_type': stream_type,
'duration': duration,
@@ -3190,7 +3273,7 @@ class InfoExtractor:
def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
mobj = re.search(
- r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
+ r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''',
webpage)
if mobj:
try:
@@ -3211,19 +3294,20 @@ class InfoExtractor:
def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
- # JWPlayer backward compatibility: flattened playlists
- # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
- if 'playlist' not in jwplayer_data:
- jwplayer_data = {'playlist': [jwplayer_data]}
-
entries = []
+ if not isinstance(jwplayer_data, dict):
+ return entries
- # JWPlayer backward compatibility: single playlist item
+ playlist_items = jwplayer_data.get('playlist')
+ # JWPlayer backward compatibility: single playlist item/flattened playlists
# https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
- if not isinstance(jwplayer_data['playlist'], list):
- jwplayer_data['playlist'] = [jwplayer_data['playlist']]
+ # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
+ if not isinstance(playlist_items, list):
+ playlist_items = (playlist_items or jwplayer_data, )
- for video_data in jwplayer_data['playlist']:
+ for video_data in playlist_items:
+ if not isinstance(video_data, dict):
+ continue
# JWPlayer backward compatibility: flattened sources
# https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
if 'sources' not in video_data:
@@ -3261,6 +3345,13 @@ class InfoExtractor:
'timestamp': int_or_none(video_data.get('pubdate')),
'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
'subtitles': subtitles,
+ 'alt_title': clean_html(video_data.get('subtitle')), # attributes used e.g. by Tele5 ...
+ 'genre': clean_html(video_data.get('genre')),
+ 'channel': clean_html(dict_get(video_data, ('category', 'channel'))),
+ 'season_number': int_or_none(video_data.get('season')),
+ 'episode_number': int_or_none(video_data.get('episode')),
+ 'release_year': int_or_none(video_data.get('releasedate')),
+ 'age_limit': int_or_none(video_data.get('age_restriction')),
}
# https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
@@ -3278,7 +3369,7 @@ class InfoExtractor:
def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
- urls = []
+ urls = set()
formats = []
for source in jwplayer_sources_data:
if not isinstance(source, dict):
@@ -3287,14 +3378,14 @@ class InfoExtractor:
base_url, self._proto_relative_url(source.get('file')))
if not source_url or source_url in urls:
continue
- urls.append(source_url)
+ urls.add(source_url)
source_type = source.get('type') or ''
ext = mimetype2ext(source_type) or determine_ext(source_url)
- if source_type == 'hls' or ext == 'm3u8':
+ if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
formats.extend(self._extract_m3u8_formats(
source_url, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id=m3u8_id, fatal=False))
- elif source_type == 'dash' or ext == 'mpd':
+ elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url:
formats.extend(self._extract_mpd_formats(
source_url, video_id, mpd_id=mpd_id, fatal=False))
elif ext == 'smil':
@@ -3309,13 +3400,12 @@ class InfoExtractor:
'ext': ext,
})
else:
+ format_id = str_or_none(source.get('label'))
height = int_or_none(source.get('height'))
- if height is None:
+ if height is None and format_id:
# Often no height is provided but there is a label in
# format like "1080p", "720p SD", or 1080.
- height = int_or_none(self._search_regex(
- r'^(\d{3,4})[pP]?(?:\b|$)', str(source.get('label') or ''),
- 'height', default=None))
+ height = parse_resolution(format_id).get('height')
a_format = {
'url': source_url,
'width': int_or_none(source.get('width')),
@@ -3323,6 +3413,7 @@ class InfoExtractor:
'tbr': int_or_none(source.get('bitrate'), scale=1000),
'filesize': int_or_none(source.get('filesize')),
'ext': ext,
+ 'format_id': format_id
}
if source_url.startswith('rtmp'):
a_format['ext'] = 'flv'
@@ -3375,7 +3466,7 @@ class InfoExtractor:
def _get_cookies(self, url):
""" Return a http.cookies.SimpleCookie with the cookies for the url """
- return LenientSimpleCookie(self._downloader._calc_cookies(url))
+ return LenientSimpleCookie(self._downloader.cookiejar.get_cookie_header(url))
def _apply_first_set_cookie_header(self, url_handle, cookie):
"""
@@ -3416,13 +3507,17 @@ class InfoExtractor:
continue
t['name'] = cls.ie_key()
yield t
+ if getattr(cls, '__wrapped__', None):
+ yield from cls.__wrapped__.get_testcases(include_onlymatching)
@classmethod
def get_webpage_testcases(cls):
tests = vars(cls).get('_WEBPAGE_TESTS', [])
for t in tests:
t['name'] = cls.ie_key()
- return tests
+ yield t
+ if getattr(cls, '__wrapped__', None):
+ yield from cls.__wrapped__.get_webpage_testcases()
@classproperty(cache=True)
def age_limit(cls):
@@ -3446,8 +3541,8 @@ class InfoExtractor:
@classmethod
def is_single_video(cls, url):
"""Returns whether the URL is of a single video, None if unknown"""
- assert cls.suitable(url), 'The URL must be suitable for the extractor'
- return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE)
+ if cls.suitable(url):
+ return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE)
@classmethod
def is_suitable(cls, age_limit):
@@ -3460,7 +3555,7 @@ class InfoExtractor:
desc = ''
if cls._NETRC_MACHINE:
if markdown:
- desc += f' [<abbr title="netrc machine"><em>{cls._NETRC_MACHINE}</em></abbr>]'
+ desc += f' [*{cls._NETRC_MACHINE}*](## "netrc machine")'
else:
desc += f' [{cls._NETRC_MACHINE}]'
if cls.IE_DESC is False:
@@ -3468,7 +3563,7 @@ class InfoExtractor:
elif cls.IE_DESC:
desc += f' {cls.IE_DESC}'
if cls.SEARCH_KEY:
- desc += f'; "{cls.SEARCH_KEY}:" prefix'
+ desc += f'{";" if cls.IE_DESC else ""} "{cls.SEARCH_KEY}:" prefix'
if search_examples:
_COUNTS = ('', '5', '10', 'all')
desc += f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
@@ -3582,6 +3677,42 @@ class InfoExtractor:
or urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
or default)
+ def _extract_chapters_helper(self, chapter_list, start_function, title_function, duration, strict=True):
+ if not duration:
+ return
+ chapter_list = [{
+ 'start_time': start_function(chapter),
+ 'title': title_function(chapter),
+ } for chapter in chapter_list or []]
+ if strict:
+ warn = self.report_warning
+ else:
+ warn = self.write_debug
+ chapter_list.sort(key=lambda c: c['start_time'] or 0)
+
+ chapters = [{'start_time': 0}]
+ for idx, chapter in enumerate(chapter_list):
+ if chapter['start_time'] is None:
+ warn(f'Incomplete chapter {idx}')
+ elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
+ chapters.append(chapter)
+ elif chapter not in chapters:
+ issue = (f'{chapter["start_time"]} > {duration}' if chapter['start_time'] > duration
+ else f'{chapter["start_time"]} < {chapters[-1]["start_time"]}')
+ warn(f'Invalid start time ({issue}) for chapter "{chapter["title"]}"')
+ return chapters[1:]
+
+ def _extract_chapters_from_description(self, description, duration):
+ duration_re = r'(?:\d+:)?\d{1,2}:\d{2}'
+ sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'
+ return self._extract_chapters_helper(
+ re.findall(sep_re % (duration_re, r'.+?'), description or ''),
+ start_function=lambda x: parse_duration(x[0]), title_function=lambda x: x[1],
+ duration=duration, strict=False) or self._extract_chapters_helper(
+ re.findall(sep_re % (r'.+?', duration_re), description or ''),
+ start_function=lambda x: parse_duration(x[1]), title_function=lambda x: x[0],
+ duration=duration, strict=False)
+
@staticmethod
def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
all_known = all(map(
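
[Editor's note] The new description-chapter scanning above looks for "MM:SS title" (or "title MM:SS") lines and validates monotonically increasing start times against the duration. A standalone sketch of the timestamp-first case, reusing the same separator regex shape:

import re

DURATION_RE = r'(?:\d+:)?\d{1,2}:\d{2}'

def parse_ts(ts):
    h, m, s = ([0, 0] + [int(p) for p in ts.split(':')])[-3:]
    return h * 3600 + m * 60 + s

description = '0:00 Intro\n1:05 Main topic\n12:30 Outro'
chapters = [
    {'start_time': parse_ts(m.group(1)), 'title': m.group(2)}
    for m in re.finditer(rf'(?m)^\s*({DURATION_RE})\b\W*\s(.+?)\s*$', description)]
print(chapters)  # start times 0, 65, 750
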
@@ -3684,10 +3815,12 @@ class InfoExtractor:
if plugin_name:
mro = inspect.getmro(cls)
super_class = cls.__wrapped__ = mro[mro.index(cls) + 1]
- cls.IE_NAME, cls.ie_key = f'{super_class.IE_NAME}+{plugin_name}', super_class.ie_key
+ cls.PLUGIN_NAME, cls.ie_key = plugin_name, super_class.ie_key
+ cls.IE_NAME = f'{super_class.IE_NAME}+{plugin_name}'
while getattr(super_class, '__wrapped__', None):
super_class = super_class.__wrapped__
setattr(sys.modules[super_class.__module__], super_class.__name__, cls)
+ _PLUGIN_OVERRIDES[super_class].append(cls)
return super().__init_subclass__(**kwargs)
@@ -3744,3 +3877,6 @@ class UnsupportedURLIE(InfoExtractor):
def _real_extract(self, url):
raise UnsupportedError(url)
+
+
+_PLUGIN_OVERRIDES = collections.defaultdict(list)
diff --git a/hypervideo_dl/extractor/crackle.py b/hypervideo_dl/extractor/crackle.py
index 4610015..1ef90b5 100644
--- a/hypervideo_dl/extractor/crackle.py
+++ b/hypervideo_dl/extractor/crackle.py
@@ -4,7 +4,7 @@ import re
import time
from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..networking.exceptions import HTTPError
from ..utils import (
determine_ext,
float_or_none,
@@ -113,7 +113,7 @@ class CrackleIE(InfoExtractor):
errnote='Unable to download media JSON')
except ExtractorError as e:
# 401 means geo restriction, trying next country
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 401:
continue
raise
diff --git a/hypervideo_dl/extractor/crtvg.py b/hypervideo_dl/extractor/crtvg.py
new file mode 100644
index 0000000..1aa8d77
--- /dev/null
+++ b/hypervideo_dl/extractor/crtvg.py
@@ -0,0 +1,34 @@
+from .common import InfoExtractor
+from ..utils import remove_end
+
+
+class CrtvgIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?crtvg\.es/tvg/a-carta/[^/#?]+-(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://www.crtvg.es/tvg/a-carta/os-caimans-do-tea-5839623',
+ 'md5': 'c0958d9ff90e4503a75544358758921d',
+ 'info_dict': {
+ 'id': '5839623',
+ 'title': 'Os caimáns do Tea',
+ 'ext': 'mp4',
+ 'description': 'md5:f71cfba21ae564f0a6f415b31de1f842',
+ 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+ },
+ 'params': {'skip_download': 'm3u8'}
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ video_url = self._search_regex(r'var\s+url\s*=\s*["\']([^"\']+)', webpage, 'video url')
+ formats = self._extract_m3u8_formats(video_url + '/playlist.m3u8', video_id, fatal=False)
+ formats.extend(self._extract_mpd_formats(video_url + '/manifest.mpd', video_id, fatal=False))
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'title': remove_end(self._html_search_meta(
+ ['og:title', 'twitter:title'], webpage, 'title', default=None), ' | CRTVG'),
+ 'description': self._html_search_meta('description', webpage, 'description', default=None),
+ 'thumbnail': self._html_search_meta(['og:image', 'twitter:image'], webpage, 'thumbnail', default=None),
+ }
diff --git a/hypervideo_dl/extractor/crunchyroll.py b/hypervideo_dl/extractor/crunchyroll.py
index d226050..241da11 100644
--- a/hypervideo_dl/extractor/crunchyroll.py
+++ b/hypervideo_dl/extractor/crunchyroll.py
@@ -1,27 +1,53 @@
import base64
-import urllib.parse
from .common import InfoExtractor
+from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
float_or_none,
format_field,
+ int_or_none,
join_nonempty,
+ parse_age_limit,
+ parse_count,
parse_iso8601,
qualities,
+ remove_start,
+ time_seconds,
traverse_obj,
- try_get,
+ url_or_none,
+ urlencode_postdata,
)
class CrunchyrollBaseIE(InfoExtractor):
- _LOGIN_URL = 'https://www.crunchyroll.com/welcome/login'
+ _BASE_URL = 'https://www.crunchyroll.com'
_API_BASE = 'https://api.crunchyroll.com'
_NETRC_MACHINE = 'crunchyroll'
- params = None
+ _AUTH_HEADERS = None
+ _API_ENDPOINT = None
+ _BASIC_AUTH = None
+ _CLIENT_ID = ('cr_web', 'noaihdevm_6iyg0a8l0q')
+ _LOCALE_LOOKUP = {
+ 'ar': 'ar-SA',
+ 'de': 'de-DE',
+ '': 'en-US',
+ 'es': 'es-419',
+ 'es-es': 'es-ES',
+ 'fr': 'fr-FR',
+ 'it': 'it-IT',
+ 'pt-br': 'pt-BR',
+ 'pt-pt': 'pt-PT',
+ 'ru': 'ru-RU',
+ 'hi': 'hi-IN',
+ }
+
+ @property
+ def is_logged_in(self):
+ return bool(self._get_cookies(self._BASE_URL).get('etp_rt'))
def _perform_login(self, username, password):
- if self._get_cookies(self._LOGIN_URL).get('etp_rt'):
+ if self.is_logged_in:
return
upsell_response = self._download_json(
@@ -31,7 +57,7 @@ class CrunchyrollBaseIE(InfoExtractor):
'device_id': 'whatvalueshouldbeforweb',
'device_type': 'com.crunchyroll.static',
'access_token': 'giKq5eY27ny3cqz',
- 'referer': self._LOGIN_URL
+ 'referer': f'{self._BASE_URL}/welcome/login'
})
if upsell_response['code'] != 'ok':
raise ExtractorError('Could not get session id')
@@ -39,66 +65,164 @@ class CrunchyrollBaseIE(InfoExtractor):
login_response = self._download_json(
f'{self._API_BASE}/login.1.json', None, 'Logging in',
- data=urllib.parse.urlencode({
+ data=urlencode_postdata({
'account': username,
'password': password,
'session_id': session_id
- }).encode('ascii'))
+ }))
if login_response['code'] != 'ok':
raise ExtractorError('Login failed. Server message: %s' % login_response['message'], expected=True)
- if not self._get_cookies(self._LOGIN_URL).get('etp_rt'):
+ if not self.is_logged_in:
raise ExtractorError('Login succeeded but did not set etp_rt cookie')
- def _get_embedded_json(self, webpage, display_id):
- initial_state = self._parse_json(self._search_regex(
- r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'initial state'), display_id)
- app_config = self._parse_json(self._search_regex(
- r'__APP_CONFIG__\s*=\s*({.+?})\s*;', webpage, 'app config'), display_id)
- return initial_state, app_config
-
- def _get_params(self, lang):
- if not CrunchyrollBaseIE.params:
- if self._get_cookies(f'https://www.crunchyroll.com/{lang}').get('etp_rt'):
- grant_type, key = 'etp_rt_cookie', 'accountAuthClientId'
- else:
- grant_type, key = 'client_id', 'anonClientId'
+ def _update_auth(self):
+ if CrunchyrollBaseIE._AUTH_HEADERS and CrunchyrollBaseIE._AUTH_REFRESH > time_seconds():
+ return
- initial_state, app_config = self._get_embedded_json(self._download_webpage(
- f'https://www.crunchyroll.com/{lang}', None, note='Retrieving main page'), None)
- api_domain = app_config['cxApiParams']['apiDomain'].replace('beta.crunchyroll.com', 'www.crunchyroll.com')
+ if not CrunchyrollBaseIE._BASIC_AUTH:
+ cx_api_param = self._CLIENT_ID[self.is_logged_in]
+ self.write_debug(f'Using cxApiParam={cx_api_param}')
+ CrunchyrollBaseIE._BASIC_AUTH = 'Basic ' + base64.b64encode(f'{cx_api_param}:'.encode()).decode()
+ grant_type = 'etp_rt_cookie' if self.is_logged_in else 'client_id'
+ try:
auth_response = self._download_json(
- f'{api_domain}/auth/v1/token', None, note=f'Authenticating with grant_type={grant_type}',
- headers={
- 'Authorization': 'Basic ' + str(base64.b64encode(('%s:' % app_config['cxApiParams'][key]).encode('ascii')), 'ascii')
- }, data=f'grant_type={grant_type}'.encode('ascii'))
- policy_response = self._download_json(
- f'{api_domain}/index/v2', None, note='Retrieving signed policy',
- headers={
- 'Authorization': auth_response['token_type'] + ' ' + auth_response['access_token']
- })
- cms = policy_response.get('cms_web')
- bucket = cms['bucket']
- params = {
- 'Policy': cms['policy'],
- 'Signature': cms['signature'],
- 'Key-Pair-Id': cms['key_pair_id']
+ f'{self._BASE_URL}/auth/v1/token', None, note=f'Authenticating with grant_type={grant_type}',
+ headers={'Authorization': CrunchyrollBaseIE._BASIC_AUTH}, data=f'grant_type={grant_type}'.encode())
+ except ExtractorError as error:
+ if isinstance(error.cause, HTTPError) and error.cause.status == 403:
+ raise ExtractorError(
+ 'Request blocked by Cloudflare; navigate to Crunchyroll in your browser, '
+ 'then pass the fresh cookies (with --cookies-from-browser or --cookies) '
+ 'and your browser\'s User-Agent (with --user-agent)', expected=True)
+ raise
+
+ CrunchyrollBaseIE._AUTH_HEADERS = {'Authorization': auth_response['token_type'] + ' ' + auth_response['access_token']}
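+        # Schedule a refresh 10 seconds before the token expires; assume a 300s lifetime if absent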
+ CrunchyrollBaseIE._AUTH_REFRESH = time_seconds(seconds=traverse_obj(auth_response, ('expires_in', {float_or_none}), default=300) - 10)
+
+ def _locale_from_language(self, language):
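+        # A locale passed via --extractor-args "crunchyrollbeta:metadata=<locale>" takes precedence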
+ config_locale = self._configuration_arg('metadata', ie_key=CrunchyrollBetaIE, casesense=True)
+ return config_locale[0] if config_locale else self._LOCALE_LOOKUP.get(language)
+
+ def _call_base_api(self, endpoint, internal_id, lang, note=None, query={}):
+ self._update_auth()
+
+ if not endpoint.startswith('/'):
+ endpoint = f'/{endpoint}'
+
+ query = query.copy()
+ locale = self._locale_from_language(lang)
+ if locale:
+ query['locale'] = locale
+
+ return self._download_json(
+ f'{self._BASE_URL}{endpoint}', internal_id, note or f'Calling API: {endpoint}',
+ headers=CrunchyrollBaseIE._AUTH_HEADERS, query=query)
+
+ def _call_api(self, path, internal_id, lang, note='api', query={}):
+ if not path.startswith(f'/content/v2/{self._API_ENDPOINT}/'):
+ path = f'/content/v2/{self._API_ENDPOINT}/{path}'
+
+ try:
+ result = self._call_base_api(
+ path, internal_id, lang, f'Downloading {note} JSON ({self._API_ENDPOINT})', query=query)
+ except ExtractorError as error:
+ if isinstance(error.cause, HTTPError) and error.cause.status == 404:
+ return None
+ raise
+
+ if not result:
+ raise ExtractorError(f'Unexpected response when downloading {note} JSON')
+ return result
+
+ def _extract_formats(self, stream_response, display_id=None):
+ requested_formats = self._configuration_arg('format') or ['adaptive_hls']
+ available_formats = {}
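+        # Keyed by hardsub locale, so a later stream for the same locale replaces an earlier one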
+ for stream_type, streams in traverse_obj(
+ stream_response, (('streams', ('data', 0)), {dict.items}, ...)):
+ if stream_type not in requested_formats:
+ continue
+ for stream in traverse_obj(streams, lambda _, v: v['url']):
+ hardsub_lang = stream.get('hardsub_locale') or ''
+ format_id = join_nonempty(stream_type, format_field(stream, 'hardsub_locale', 'hardsub-%s'))
+ available_formats[hardsub_lang] = (stream_type, format_id, hardsub_lang, stream['url'])
+
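+        # The empty string stands for "no hardsub"; the user-facing value 'none' maps onto it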
+ requested_hardsubs = [('' if val == 'none' else val) for val in (self._configuration_arg('hardsub') or ['none'])]
+ if '' in available_formats and 'all' not in requested_hardsubs:
+ full_format_langs = set(requested_hardsubs)
+ self.to_screen(
+ 'To get all formats of a hardsub language, use '
+ '"--extractor-args crunchyrollbeta:hardsub=<language_code or all>". '
+ 'See https://github.com/hypervideo/hypervideo#crunchyrollbeta-crunchyroll for more info',
+ only_once=True)
+ else:
+ full_format_langs = set(map(str.lower, available_formats))
+
+ audio_locale = traverse_obj(stream_response, ((None, 'meta'), 'audio_locale'), get_all=False)
+ hardsub_preference = qualities(requested_hardsubs[::-1])
+ formats = []
+ for stream_type, format_id, hardsub_lang, stream_url in available_formats.values():
+ if stream_type.endswith('hls'):
+ if hardsub_lang.lower() in full_format_langs:
+ adaptive_formats = self._extract_m3u8_formats(
+ stream_url, display_id, 'mp4', m3u8_id=format_id,
+ fatal=False, note=f'Downloading {format_id} HLS manifest')
+ else:
+ adaptive_formats = (self._m3u8_meta_format(stream_url, ext='mp4', m3u8_id=format_id),)
+ elif stream_type.endswith('dash'):
+ adaptive_formats = self._extract_mpd_formats(
+ stream_url, display_id, mpd_id=format_id,
+ fatal=False, note=f'Downloading {format_id} MPD manifest')
+ else:
+ self.report_warning(f'Encountered unknown stream_type: {stream_type!r}', display_id, only_once=True)
+ continue
+ for f in adaptive_formats:
+ if f.get('acodec') != 'none':
+ f['language'] = audio_locale
+ f['quality'] = hardsub_preference(hardsub_lang.lower())
+ formats.extend(adaptive_formats)
+
+ return formats
+
+ def _extract_subtitles(self, data):
+ subtitles = {}
+
+ for locale, subtitle in traverse_obj(data, ((None, 'meta'), 'subtitles', {dict.items}, ...)):
+ subtitles[locale] = [traverse_obj(subtitle, {'url': 'url', 'ext': 'format'})]
+
+ return subtitles
+
+
+class CrunchyrollCmsBaseIE(CrunchyrollBaseIE):
+ _API_ENDPOINT = 'cms'
+ _CMS_EXPIRY = None
+
+ def _call_cms_api_signed(self, path, internal_id, lang, note='api'):
+ if not CrunchyrollCmsBaseIE._CMS_EXPIRY or CrunchyrollCmsBaseIE._CMS_EXPIRY <= time_seconds():
+ response = self._call_base_api('index/v2', None, lang, 'Retrieving signed policy')['cms_web']
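+            # CloudFront-style signed query parameters, passed along on every signed CMS call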
+ CrunchyrollCmsBaseIE._CMS_QUERY = {
+ 'Policy': response['policy'],
+ 'Signature': response['signature'],
+ 'Key-Pair-Id': response['key_pair_id'],
}
- locale = traverse_obj(initial_state, ('localization', 'locale'))
- if locale:
- params['locale'] = locale
- CrunchyrollBaseIE.params = (api_domain, bucket, params)
- return CrunchyrollBaseIE.params
+ CrunchyrollCmsBaseIE._CMS_BUCKET = response['bucket']
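+            # Renew the signed policy 10 seconds before its stated expiry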
+ CrunchyrollCmsBaseIE._CMS_EXPIRY = parse_iso8601(response['expires']) - 10
+
+ if not path.startswith('/cms/v2'):
+ path = f'/cms/v2{CrunchyrollCmsBaseIE._CMS_BUCKET}/{path}'
+
+ return self._call_base_api(
+ path, internal_id, lang, f'Downloading {note} JSON (signed cms)', query=CrunchyrollCmsBaseIE._CMS_QUERY)
-class CrunchyrollBetaIE(CrunchyrollBaseIE):
+class CrunchyrollBetaIE(CrunchyrollCmsBaseIE):
IE_NAME = 'crunchyroll'
_VALID_URL = r'''(?x)
- https?://(?:beta|www)\.crunchyroll\.com/
- (?P<lang>(?:\w{2}(?:-\w{2})?/)?)
- watch/(?P<id>\w+)
- (?:/(?P<display_id>[\w-]+))?/?(?:[?#]|$)'''
+ https?://(?:beta\.|www\.)?crunchyroll\.com/
+ (?:(?P<lang>\w{2}(?:-\w{2})?)/)?
+ watch/(?!concert|musicvideo)(?P<id>\w+)'''
_TESTS = [{
+ # Premium only
'url': 'https://www.crunchyroll.com/watch/GY2P1Q98Y/to-the-future',
'info_dict': {
'id': 'GY2P1Q98Y',
@@ -115,10 +239,15 @@ class CrunchyrollBetaIE(CrunchyrollBaseIE):
'season_number': 1,
'episode': 'To the Future',
'episode_number': 73,
- 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg$',
+ 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$',
+ 'chapters': 'count:2',
+ 'age_limit': 14,
+ 'like_count': int,
+ 'dislike_count': int,
},
'params': {'skip_download': 'm3u8', 'format': 'all[format_id~=hardsub]'},
}, {
+ # Premium only
'url': 'https://www.crunchyroll.com/watch/GYE5WKQGR',
'info_dict': {
'id': 'GYE5WKQGR',
@@ -126,7 +255,7 @@ class CrunchyrollBetaIE(CrunchyrollBaseIE):
'duration': 366.459,
'timestamp': 1476788400,
'description': 'md5:74b67283ffddd75f6e224ca7dc031e76',
- 'title': 'SHELTER Episode – Porter Robinson presents Shelter the Animation',
+ 'title': 'SHELTER – Porter Robinson presents Shelter the Animation',
'upload_date': '20161018',
'series': 'SHELTER',
'series_id': 'GYGG09WWY',
@@ -135,121 +264,206 @@ class CrunchyrollBetaIE(CrunchyrollBaseIE):
'season_number': 1,
'episode': 'Porter Robinson presents Shelter the Animation',
'episode_number': 0,
- 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg$',
+ 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$',
+ 'age_limit': 14,
+ 'like_count': int,
+ 'dislike_count': int,
},
'params': {'skip_download': True},
- 'skip': 'Video is Premium only',
}, {
- 'url': 'https://www.crunchyroll.com/watch/GY2P1Q98Y',
+ 'url': 'https://www.crunchyroll.com/watch/GJWU2VKK3/cherry-blossom-meeting-and-a-coming-blizzard',
+ 'info_dict': {
+ 'id': 'GJWU2VKK3',
+ 'ext': 'mp4',
+ 'duration': 1420.054,
+ 'description': 'md5:2d1c67c0ec6ae514d9c30b0b99a625cd',
+ 'title': 'The Ice Guy and His Cool Female Colleague Episode 1 – Cherry Blossom Meeting and a Coming Blizzard',
+ 'series': 'The Ice Guy and His Cool Female Colleague',
+ 'series_id': 'GW4HM75NP',
+ 'season': 'The Ice Guy and His Cool Female Colleague',
+ 'season_id': 'GY9PC21VE',
+ 'season_number': 1,
+ 'episode': 'Cherry Blossom Meeting and a Coming Blizzard',
+ 'episode_number': 1,
+ 'chapters': 'count:2',
+ 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$',
+ 'timestamp': 1672839000,
+ 'upload_date': '20230104',
+ 'age_limit': 14,
+ 'like_count': int,
+ 'dislike_count': int,
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }, {
+ 'url': 'https://www.crunchyroll.com/watch/GM8F313NQ',
+ 'info_dict': {
+ 'id': 'GM8F313NQ',
+ 'ext': 'mp4',
+ 'title': 'Garakowa -Restore the World-',
+ 'description': 'md5:8d2f8b6b9dd77d87810882e7d2ee5608',
+ 'duration': 3996.104,
+ 'age_limit': 13,
+ 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$',
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }, {
+ 'url': 'https://www.crunchyroll.com/watch/G62PEZ2E6',
+ 'info_dict': {
+ 'id': 'G62PEZ2E6',
+ 'description': 'md5:8d2f8b6b9dd77d87810882e7d2ee5608',
+ 'age_limit': 13,
+ 'duration': 65.138,
+ 'title': 'Garakowa -Restore the World-',
+ },
+ 'playlist_mincount': 5,
+ }, {
+ 'url': 'https://www.crunchyroll.com/de/watch/GY2P1Q98Y',
'only_matching': True,
}, {
'url': 'https://beta.crunchyroll.com/pt-br/watch/G8WUN8VKP/the-ruler-of-conspiracy',
'only_matching': True,
}]
+    # We want to support lazy playlist filtering, and movie listings cannot be inside a playlist
+ _RETURN_TYPE = 'video'
def _real_extract(self, url):
- lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id')
- api_domain, bucket, params = self._get_params(lang)
+ lang, internal_id = self._match_valid_url(url).group('lang', 'id')
- episode_response = self._download_json(
- f'{api_domain}/cms/v2{bucket}/episodes/{internal_id}', display_id,
- note='Retrieving episode metadata', query=params)
- if episode_response.get('is_premium_only') and not episode_response.get('playback'):
- raise ExtractorError('This video is for premium members only.', expected=True)
+        # We need to use an unsigned API call to allow the ratings query string
+ response = traverse_obj(self._call_api(
+ f'objects/{internal_id}', internal_id, lang, 'object info', {'ratings': 'true'}), ('data', 0, {dict}))
+ if not response:
+ raise ExtractorError(f'No video with id {internal_id} could be found (possibly region locked?)', expected=True)
- stream_response = self._download_json(
- f'{api_domain}{episode_response["__links__"]["streams"]["href"]}', display_id,
- note='Retrieving stream info', query=params)
- get_streams = lambda name: (traverse_obj(stream_response, name) or {}).items()
+ object_type = response.get('type')
+ if object_type == 'episode':
+ result = self._transform_episode_response(response)
- requested_hardsubs = [('' if val == 'none' else val) for val in (self._configuration_arg('hardsub') or ['none'])]
- hardsub_preference = qualities(requested_hardsubs[::-1])
- requested_formats = self._configuration_arg('format') or ['adaptive_hls']
+ elif object_type == 'movie':
+ result = self._transform_movie_response(response)
- available_formats = {}
- for stream_type, streams in get_streams('streams'):
- if stream_type not in requested_formats:
- continue
- for stream in streams.values():
- if not stream.get('url'):
- continue
- hardsub_lang = stream.get('hardsub_locale') or ''
- format_id = join_nonempty(stream_type, format_field(stream, 'hardsub_locale', 'hardsub-%s'))
- available_formats[hardsub_lang] = (stream_type, format_id, hardsub_lang, stream['url'])
+ elif object_type == 'movie_listing':
+ first_movie_id = traverse_obj(response, ('movie_listing_metadata', 'first_movie_id'))
+ if not self._yes_playlist(internal_id, first_movie_id):
+ return self.url_result(f'{self._BASE_URL}/{lang}watch/{first_movie_id}', CrunchyrollBetaIE, first_movie_id)
+
+ def entries():
+ movies = self._call_api(f'movie_listings/{internal_id}/movies', internal_id, lang, 'movie list')
+ for movie_response in traverse_obj(movies, ('data', ...)):
+ yield self.url_result(
+ f'{self._BASE_URL}/{lang}watch/{movie_response["id"]}',
+ CrunchyrollBetaIE, **self._transform_movie_response(movie_response))
+
+ return self.playlist_result(entries(), **self._transform_movie_response(response))
- if '' in available_formats and 'all' not in requested_hardsubs:
- full_format_langs = set(requested_hardsubs)
- self.to_screen(
- 'To get all formats of a hardsub language, use '
- '"--extractor-args crunchyrollbeta:hardsub=<language_code or all>". '
- 'See https://github.com/hypervideo/hypervideo#crunchyrollbeta for more info',
- only_once=True)
else:
- full_format_langs = set(map(str.lower, available_formats))
+ raise ExtractorError(f'Unknown object type {object_type}')
- formats = []
- for stream_type, format_id, hardsub_lang, stream_url in available_formats.values():
- if stream_type.endswith('hls'):
- if hardsub_lang.lower() in full_format_langs:
- adaptive_formats = self._extract_m3u8_formats(
- stream_url, display_id, 'mp4', m3u8_id=format_id,
- fatal=False, note=f'Downloading {format_id} HLS manifest')
- else:
- adaptive_formats = (self._m3u8_meta_format(stream_url, ext='mp4', m3u8_id=format_id),)
- elif stream_type.endswith('dash'):
- adaptive_formats = self._extract_mpd_formats(
- stream_url, display_id, mpd_id=format_id,
- fatal=False, note=f'Downloading {format_id} MPD manifest')
- else:
- self.report_warning(f'Encountered unknown stream_type: {stream_type!r}', display_id, only_once=True)
- continue
- for f in adaptive_formats:
- if f.get('acodec') != 'none':
- f['language'] = stream_response.get('audio_locale')
- f['quality'] = hardsub_preference(hardsub_lang.lower())
- formats.extend(adaptive_formats)
+ # There might be multiple audio languages for one object (`<object>_metadata.versions`),
+        # so we need to get the id from `streams_link` instead, or we don't know which language to choose
+ streams_link = response.get('streams_link')
+ if not streams_link and traverse_obj(response, (f'{object_type}_metadata', 'is_premium_only')):
+ message = f'This {object_type} is for premium members only'
+ if self.is_logged_in:
+ raise ExtractorError(message, expected=True)
+ self.raise_login_required(message)
+
+        # We need to go from the unsigned to the signed API to avoid getting soft-banned
+ stream_response = self._call_cms_api_signed(remove_start(
+ streams_link, '/content/v2/cms/'), internal_id, lang, 'stream info')
+ result['formats'] = self._extract_formats(stream_response, internal_id)
+ result['subtitles'] = self._extract_subtitles(stream_response)
+
+ # if no intro chapter is available, a 403 without usable data is returned
+ intro_chapter = self._download_json(
+ f'https://static.crunchyroll.com/datalab-intro-v2/{internal_id}.json',
+ internal_id, note='Downloading chapter info', fatal=False, errnote=False)
+ if isinstance(intro_chapter, dict):
+ result['chapters'] = [{
+ 'title': 'Intro',
+ 'start_time': float_or_none(intro_chapter.get('startTime')),
+ 'end_time': float_or_none(intro_chapter.get('endTime')),
+ }]
+
+ def calculate_count(item):
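+            # e.g. displayed='1.1' with unit='K' yields parse_count('1.1K') == 1100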
+ return parse_count(''.join((item['displayed'], item.get('unit') or '')))
+
+ result.update(traverse_obj(response, ('rating', {
+ 'like_count': ('up', {calculate_count}),
+ 'dislike_count': ('down', {calculate_count}),
+ })))
+
+ return result
+ @staticmethod
+ def _transform_episode_response(data):
+ metadata = traverse_obj(data, (('episode_metadata', None), {dict}), get_all=False) or {}
return {
- 'id': internal_id,
- 'title': '%s Episode %s – %s' % (
- episode_response.get('season_title'), episode_response.get('episode'), episode_response.get('title')),
- 'description': try_get(episode_response, lambda x: x['description'].replace(r'\r\n', '\n')),
- 'duration': float_or_none(episode_response.get('duration_ms'), 1000),
- 'timestamp': parse_iso8601(episode_response.get('upload_date')),
- 'series': episode_response.get('series_title'),
- 'series_id': episode_response.get('series_id'),
- 'season': episode_response.get('season_title'),
- 'season_id': episode_response.get('season_id'),
- 'season_number': episode_response.get('season_number'),
- 'episode': episode_response.get('title'),
- 'episode_number': episode_response.get('sequence_number'),
- 'formats': formats,
- 'thumbnails': [{
- 'url': thumb.get('source'),
- 'width': thumb.get('width'),
- 'height': thumb.get('height'),
- } for thumb in traverse_obj(episode_response, ('images', 'thumbnail', ..., ...)) or []],
- 'subtitles': {
- lang: [{
- 'url': subtitle_data.get('url'),
- 'ext': subtitle_data.get('format')
- }] for lang, subtitle_data in get_streams('subtitles')
- },
+ 'id': data['id'],
+ 'title': ' \u2013 '.join((
+ ('%s%s' % (
+ format_field(metadata, 'season_title'),
+ format_field(metadata, 'episode', ' Episode %s'))),
+ format_field(data, 'title'))),
+ **traverse_obj(data, {
+ 'episode': ('title', {str}),
+ 'description': ('description', {str}, {lambda x: x.replace(r'\r\n', '\n')}),
+ 'thumbnails': ('images', 'thumbnail', ..., ..., {
+ 'url': ('source', {url_or_none}),
+ 'width': ('width', {int_or_none}),
+ 'height': ('height', {int_or_none}),
+ }),
+ }),
+ **traverse_obj(metadata, {
+ 'duration': ('duration_ms', {lambda x: float_or_none(x, 1000)}),
+ 'timestamp': ('upload_date', {parse_iso8601}),
+ 'series': ('series_title', {str}),
+ 'series_id': ('series_id', {str}),
+ 'season': ('season_title', {str}),
+ 'season_id': ('season_id', {str}),
+ 'season_number': ('season_number', ({int}, {float_or_none})),
+ 'episode_number': ('sequence_number', ({int}, {float_or_none})),
+ 'age_limit': ('maturity_ratings', -1, {parse_age_limit}),
+ 'language': ('audio_locale', {str}),
+ }, get_all=False),
+ }
+
+ @staticmethod
+ def _transform_movie_response(data):
+ metadata = traverse_obj(data, (('movie_metadata', 'movie_listing_metadata', None), {dict}), get_all=False) or {}
+ return {
+ 'id': data['id'],
+ **traverse_obj(data, {
+ 'title': ('title', {str}),
+ 'description': ('description', {str}, {lambda x: x.replace(r'\r\n', '\n')}),
+ 'thumbnails': ('images', 'thumbnail', ..., ..., {
+ 'url': ('source', {url_or_none}),
+ 'width': ('width', {int_or_none}),
+ 'height': ('height', {int_or_none}),
+ }),
+ }),
+ **traverse_obj(metadata, {
+ 'duration': ('duration_ms', {lambda x: float_or_none(x, 1000)}),
+ 'age_limit': ('maturity_ratings', -1, {parse_age_limit}),
+ }),
}
-class CrunchyrollBetaShowIE(CrunchyrollBaseIE):
+class CrunchyrollBetaShowIE(CrunchyrollCmsBaseIE):
IE_NAME = 'crunchyroll:playlist'
_VALID_URL = r'''(?x)
- https?://(?:beta|www)\.crunchyroll\.com/
+ https?://(?:beta\.|www\.)?crunchyroll\.com/
(?P<lang>(?:\w{2}(?:-\w{2})?/)?)
- series/(?P<id>\w+)
- (?:/(?P<display_id>[\w-]+))?/?(?:[?#]|$)'''
+ series/(?P<id>\w+)'''
_TESTS = [{
'url': 'https://www.crunchyroll.com/series/GY19NQ2QR/Girl-Friend-BETA',
'info_dict': {
'id': 'GY19NQ2QR',
'title': 'Girl Friend BETA',
+ 'description': 'md5:99c1b22ee30a74b536a8277ced8eb750',
+ # XXX: `thumbnail` does not get set from `thumbnails` in playlist
+ # 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$',
+ 'age_limit': 14,
},
'playlist_mincount': 10,
}, {
@@ -258,40 +472,179 @@ class CrunchyrollBetaShowIE(CrunchyrollBaseIE):
}]
def _real_extract(self, url):
- lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id')
- api_domain, bucket, params = self._get_params(lang)
+ lang, internal_id = self._match_valid_url(url).group('lang', 'id')
+
+ def entries():
+ seasons_response = self._call_cms_api_signed(f'seasons?series_id={internal_id}', internal_id, lang, 'seasons')
+ for season in traverse_obj(seasons_response, ('items', ..., {dict})):
+ episodes_response = self._call_cms_api_signed(
+ f'episodes?season_id={season["id"]}', season["id"], lang, 'episode list')
+ for episode_response in traverse_obj(episodes_response, ('items', ..., {dict})):
+ yield self.url_result(
+ f'{self._BASE_URL}/{lang}watch/{episode_response["id"]}',
+ CrunchyrollBetaIE, **CrunchyrollBetaIE._transform_episode_response(episode_response))
- series_response = self._download_json(
- f'{api_domain}/cms/v2{bucket}/series/{internal_id}', display_id,
- note='Retrieving series metadata', query=params)
+ return self.playlist_result(
+ entries(), internal_id,
+ **traverse_obj(self._call_api(f'series/{internal_id}', internal_id, lang, 'series'), ('data', 0, {
+ 'title': ('title', {str}),
+ 'description': ('description', {lambda x: x.replace(r'\r\n', '\n')}),
+ 'age_limit': ('maturity_ratings', -1, {parse_age_limit}),
+ 'thumbnails': ('images', ..., ..., ..., {
+ 'url': ('source', {url_or_none}),
+ 'width': ('width', {int_or_none}),
+ 'height': ('height', {int_or_none}),
+ })
+ })))
- seasons_response = self._download_json(
- f'{api_domain}/cms/v2{bucket}/seasons?series_id={internal_id}', display_id,
- note='Retrieving season list', query=params)
+
+class CrunchyrollMusicIE(CrunchyrollBaseIE):
+ IE_NAME = 'crunchyroll:music'
+ _VALID_URL = r'''(?x)
+ https?://(?:www\.)?crunchyroll\.com/
+ (?P<lang>(?:\w{2}(?:-\w{2})?/)?)
+ watch/(?P<type>concert|musicvideo)/(?P<id>\w+)'''
+ _TESTS = [{
+ 'url': 'https://www.crunchyroll.com/de/watch/musicvideo/MV5B02C79',
+ 'info_dict': {
+ 'ext': 'mp4',
+ 'id': 'MV5B02C79',
+ 'display_id': 'egaono-hana',
+ 'title': 'Egaono Hana',
+ 'track': 'Egaono Hana',
+ 'artist': 'Goose house',
+ 'thumbnail': r're:(?i)^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$',
+ 'genre': ['J-Pop'],
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }, {
+ 'url': 'https://www.crunchyroll.com/watch/musicvideo/MV88BB7F2C',
+ 'info_dict': {
+ 'ext': 'mp4',
+ 'id': 'MV88BB7F2C',
+ 'display_id': 'crossing-field',
+ 'title': 'Crossing Field',
+ 'track': 'Crossing Field',
+ 'artist': 'LiSA',
+ 'thumbnail': r're:(?i)^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$',
+ 'genre': ['Anime'],
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }, {
+ 'url': 'https://www.crunchyroll.com/watch/concert/MC2E2AC135',
+ 'info_dict': {
+ 'ext': 'mp4',
+ 'id': 'MC2E2AC135',
+ 'display_id': 'live-is-smile-always-364joker-at-yokohama-arena',
+ 'title': 'LiVE is Smile Always-364+JOKER- at YOKOHAMA ARENA',
+ 'track': 'LiVE is Smile Always-364+JOKER- at YOKOHAMA ARENA',
+ 'artist': 'LiSA',
+ 'thumbnail': r're:(?i)^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$',
+ 'description': 'md5:747444e7e6300907b7a43f0a0503072e',
+ 'genre': ['J-Pop'],
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }, {
+ 'url': 'https://www.crunchyroll.com/de/watch/musicvideo/MV5B02C79/egaono-hana',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.crunchyroll.com/watch/concert/MC2E2AC135/live-is-smile-always-364joker-at-yokohama-arena',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.crunchyroll.com/watch/musicvideo/MV88BB7F2C/crossing-field',
+ 'only_matching': True,
+ }]
+ _API_ENDPOINT = 'music'
+
+ def _real_extract(self, url):
+ lang, internal_id, object_type = self._match_valid_url(url).group('lang', 'id', 'type')
+ path, name = {
+ 'concert': ('concerts', 'concert info'),
+ 'musicvideo': ('music_videos', 'music video info'),
+ }[object_type]
+ response = traverse_obj(self._call_api(f'{path}/{internal_id}', internal_id, lang, name), ('data', 0, {dict}))
+ if not response:
+ raise ExtractorError(f'No video with id {internal_id} could be found (possibly region locked?)', expected=True)
+
+ streams_link = response.get('streams_link')
+ if not streams_link and response.get('isPremiumOnly'):
+ message = f'This {response.get("type") or "media"} is for premium members only'
+ if self.is_logged_in:
+ raise ExtractorError(message, expected=True)
+ self.raise_login_required(message)
+
+ result = self._transform_music_response(response)
+ stream_response = self._call_api(streams_link, internal_id, lang, 'stream info')
+ result['formats'] = self._extract_formats(stream_response, internal_id)
+
+ return result
+
+ @staticmethod
+ def _transform_music_response(data):
+ return {
+ 'id': data['id'],
+ **traverse_obj(data, {
+ 'display_id': 'slug',
+ 'title': 'title',
+ 'track': 'title',
+ 'artist': ('artist', 'name'),
+ 'description': ('description', {str}, {lambda x: x.replace(r'\r\n', '\n') or None}),
+ 'thumbnails': ('images', ..., ..., {
+ 'url': ('source', {url_or_none}),
+ 'width': ('width', {int_or_none}),
+ 'height': ('height', {int_or_none}),
+ }),
+ 'genre': ('genres', ..., 'displayValue'),
+ 'age_limit': ('maturity_ratings', -1, {parse_age_limit}),
+ }),
+ }
+
+
+class CrunchyrollArtistIE(CrunchyrollBaseIE):
+ IE_NAME = 'crunchyroll:artist'
+ _VALID_URL = r'''(?x)
+ https?://(?:www\.)?crunchyroll\.com/
+ (?P<lang>(?:\w{2}(?:-\w{2})?/)?)
+ artist/(?P<id>\w{10})'''
+ _TESTS = [{
+ 'url': 'https://www.crunchyroll.com/artist/MA179CB50D',
+ 'info_dict': {
+ 'id': 'MA179CB50D',
+ 'title': 'LiSA',
+ 'genre': ['J-Pop', 'Anime', 'Rock'],
+ 'description': 'md5:16d87de61a55c3f7d6c454b73285938e',
+ },
+ 'playlist_mincount': 83,
+ }, {
+ 'url': 'https://www.crunchyroll.com/artist/MA179CB50D/lisa',
+ 'only_matching': True,
+ }]
+ _API_ENDPOINT = 'music'
+
+ def _real_extract(self, url):
+ lang, internal_id = self._match_valid_url(url).group('lang', 'id')
+ response = traverse_obj(self._call_api(
+ f'artists/{internal_id}', internal_id, lang, 'artist info'), ('data', 0))
def entries():
- for season in seasons_response['items']:
- episodes_response = self._download_json(
- f'{api_domain}/cms/v2{bucket}/episodes?season_id={season["id"]}', display_id,
- note=f'Retrieving episode list for {season.get("slug_title")}', query=params)
- for episode in episodes_response['items']:
- episode_id = episode['id']
- episode_display_id = episode['slug_title']
- yield {
- '_type': 'url',
- 'url': f'https://www.crunchyroll.com/{lang}watch/{episode_id}/{episode_display_id}',
- 'ie_key': CrunchyrollBetaIE.ie_key(),
- 'id': episode_id,
- 'title': '%s Episode %s – %s' % (episode.get('season_title'), episode.get('episode'), episode.get('title')),
- 'description': try_get(episode, lambda x: x['description'].replace(r'\r\n', '\n')),
- 'duration': float_or_none(episode.get('duration_ms'), 1000),
- 'series': episode.get('series_title'),
- 'series_id': episode.get('series_id'),
- 'season': episode.get('season_title'),
- 'season_id': episode.get('season_id'),
- 'season_number': episode.get('season_number'),
- 'episode': episode.get('title'),
- 'episode_number': episode.get('sequence_number')
- }
-
- return self.playlist_result(entries(), internal_id, series_response.get('title'))
+ for attribute, path in [('concerts', 'concert'), ('videos', 'musicvideo')]:
+ for internal_id in traverse_obj(response, (attribute, ...)):
+ yield self.url_result(f'{self._BASE_URL}/watch/{path}/{internal_id}', CrunchyrollMusicIE, internal_id)
+
+ return self.playlist_result(entries(), **self._transform_artist_response(response))
+
+ @staticmethod
+ def _transform_artist_response(data):
+ return {
+ 'id': data['id'],
+ **traverse_obj(data, {
+ 'title': 'name',
+ 'description': ('description', {str}, {lambda x: x.replace(r'\r\n', '\n')}),
+ 'thumbnails': ('images', ..., ..., {
+ 'url': ('source', {url_or_none}),
+ 'width': ('width', {int_or_none}),
+ 'height': ('height', {int_or_none}),
+ }),
+ 'genre': ('genres', ..., 'displayValue'),
+ }),
+ }
diff --git a/hypervideo_dl/extractor/cultureunplugged.py b/hypervideo_dl/extractor/cultureunplugged.py
index 2fb2280..9c8509f 100644
--- a/hypervideo_dl/extractor/cultureunplugged.py
+++ b/hypervideo_dl/extractor/cultureunplugged.py
@@ -1,10 +1,8 @@
import time
from .common import InfoExtractor
-from ..utils import (
- int_or_none,
- HEADRequest,
-)
+from ..networking import HEADRequest
+from ..utils import int_or_none
class CultureUnpluggedIE(InfoExtractor):
diff --git a/hypervideo_dl/extractor/curiositystream.py b/hypervideo_dl/extractor/curiositystream.py
index 26cf24f..941cf4e 100644
--- a/hypervideo_dl/extractor/curiositystream.py
+++ b/hypervideo_dl/extractor/curiositystream.py
@@ -1,4 +1,5 @@
import re
+import urllib.parse
from .common import InfoExtractor
from ..compat import compat_str
@@ -23,7 +24,7 @@ class CuriosityStreamBaseIE(InfoExtractor):
auth_cookie = self._get_cookies('https://curiositystream.com').get('auth_token')
if auth_cookie:
self.write_debug('Obtained auth_token cookie')
- self._auth_token = auth_cookie.value
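+            # the cookie value is percent-encoded, so unquote it before use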
+ self._auth_token = urllib.parse.unquote(auth_cookie.value)
if self._auth_token:
headers['X-Auth-Token'] = self._auth_token
result = self._download_json(
@@ -54,8 +55,11 @@ class CuriosityStreamIE(CuriosityStreamBaseIE):
'description': 'Vint Cerf, Google\'s Chief Internet Evangelist, describes how he and Bob Kahn created the internet.',
'channel': 'Curiosity Stream',
'categories': ['Technology', 'Interview'],
- 'average_rating': 96.79,
+ 'average_rating': float,
'series_id': '2',
+ 'thumbnail': r're:https://img.curiositystream.com/.+\.jpg',
+ 'tags': [],
+ 'duration': 158
},
'params': {
# m3u8 download
diff --git a/hypervideo_dl/extractor/dacast.py b/hypervideo_dl/extractor/dacast.py
new file mode 100644
index 0000000..4e81aa4
--- /dev/null
+++ b/hypervideo_dl/extractor/dacast.py
@@ -0,0 +1,158 @@
+import hashlib
+import re
+import time
+
+from .common import InfoExtractor
+from ..networking.exceptions import HTTPError
+from ..utils import (
+ ExtractorError,
+ classproperty,
+ float_or_none,
+ traverse_obj,
+ url_or_none,
+)
+
+
+class DacastBaseIE(InfoExtractor):
+ _URL_TYPE = None
+
+ @classproperty
+ def _VALID_URL(cls):
+ return fr'https?://iframe\.dacast\.com/{cls._URL_TYPE}/(?P<user_id>[\w-]+)/(?P<id>[\w-]+)'
+
+ @classproperty
+ def _EMBED_REGEX(cls):
+ return [rf'<iframe[^>]+\bsrc=["\'](?P<url>{cls._VALID_URL})']
+
+ _API_INFO_URL = 'https://playback.dacast.com/content/info'
+
+ @classmethod
+ def _get_url_from_id(cls, content_id):
+ user_id, media_id = content_id.split(f'-{cls._URL_TYPE}-')
+ return f'https://iframe.dacast.com/{cls._URL_TYPE}/{user_id}/{media_id}'
+
+ @classmethod
+ def _extract_embed_urls(cls, url, webpage):
+ yield from super()._extract_embed_urls(url, webpage)
+ for content_id in re.findall(
+ rf'<script[^>]+\bsrc=["\']https://player\.dacast\.com/js/player\.js\?contentId=([\w-]+-{cls._URL_TYPE}-[\w-]+)["\']', webpage):
+ yield cls._get_url_from_id(content_id)
+
+
+class DacastVODIE(DacastBaseIE):
+ _URL_TYPE = 'vod'
+ _TESTS = [{
+ 'url': 'https://iframe.dacast.com/vod/acae82153ef4d7a7344ae4eaa86af534/1c6143e3-5a06-371d-8695-19b96ea49090',
+ 'info_dict': {
+ 'id': '1c6143e3-5a06-371d-8695-19b96ea49090',
+ 'ext': 'mp4',
+ 'uploader_id': 'acae82153ef4d7a7344ae4eaa86af534',
+ 'title': '2_4||Adnexal mass characterisation: O-RADS US and MRI||N. Bharwani, London/UK',
+ 'thumbnail': 'https://universe-files.dacast.com/26137208-5858-65c1-5e9a-9d6b6bd2b6c2',
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }]
+ _WEBPAGE_TESTS = [{
+ 'url': 'https://www.dacast.com/support/knowledgebase/how-can-i-embed-a-video-on-my-website/',
+ 'info_dict': {
+ 'id': 'b6674869-f08a-23c5-1d7b-81f5309e1a90',
+ 'ext': 'mp4',
+ 'title': '4-HowToEmbedVideo.mp4',
+ 'uploader_id': '3b67c4a9-3886-4eb1-d0eb-39b23b14bef3',
+ 'thumbnail': 'https://universe-files.dacast.com/d26ab48f-a52a-8783-c42e-a90290ba06b6.png',
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }, {
+ 'url': 'https://gist.githubusercontent.com/bashonly/4ad249ef2910346fbdf3809b220f11ee/raw/87349778d4af1a80b1fcc3beb9c88108de5858f5/dacast_embeds.html',
+ 'info_dict': {
+ 'id': 'e7df418e-a83b-7a7f-7b5e-1a667981e8fa',
+ 'ext': 'mp4',
+ 'title': 'Evening Service 2-5-23',
+ 'uploader_id': '943bb1ab3c03695ba85330d92d6d226e',
+ 'thumbnail': 'https://universe-files.dacast.com/337472b3-e92c-2ea4-7eb7-5700da477f67',
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }]
+
+ def _real_extract(self, url):
+ user_id, video_id = self._match_valid_url(url).group('user_id', 'id')
+ query = {'contentId': f'{user_id}-vod-{video_id}', 'provider': 'universe'}
+ info = self._download_json(self._API_INFO_URL, video_id, query=query, fatal=False)
+ access = self._download_json(
+ 'https://playback.dacast.com/content/access', video_id,
+ note='Downloading access JSON', query=query, expected_status=403)
+
+ error = access.get('error')
+ if error in ('Broadcaster has been blocked', 'Content is offline'):
+ raise ExtractorError(error, expected=True)
+ elif error:
+ raise ExtractorError(f'Dacast API says "{error}"')
+
+ hls_url = access['hls']
+ hls_aes = {}
+
+ if 'DRM_EXT' in hls_url:
+ self.report_drm(video_id)
+ elif '/uspaes/' in hls_url:
+ # From https://player.dacast.com/js/player.js
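+            # i.e. sha1("<10413792000 - ts><ts><static key>") as a hex digest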
+ ts = int(time.time())
+ signature = hashlib.sha1(
+                f'{10413792000 - ts}{ts}YfaKtquEEpDeusCKbvYszIEZnWmBcSvw'.encode()).digest().hex()
+ hls_aes['uri'] = f'https://keys.dacast.com/uspaes/{video_id}.key?s={signature}&ts={ts}'
+
+ for retry in self.RetryManager():
+ try:
+ formats = self._extract_m3u8_formats(hls_url, video_id, 'mp4', m3u8_id='hls')
+ except ExtractorError as e:
+ # CDN will randomly respond with 403
+ if isinstance(e.cause, HTTPError) and e.cause.status == 403:
+ retry.error = e
+ continue
+ raise
+
+ return {
+ 'id': video_id,
+ 'uploader_id': user_id,
+ 'formats': formats,
+ 'hls_aes': hls_aes or None,
+ **traverse_obj(info, ('contentInfo', {
+ 'title': 'title',
+ 'duration': ('duration', {float_or_none}),
+ 'thumbnail': ('thumbnailUrl', {url_or_none}),
+ })),
+ }
+
+
+class DacastPlaylistIE(DacastBaseIE):
+ _URL_TYPE = 'playlist'
+ _TESTS = [{
+ 'url': 'https://iframe.dacast.com/playlist/943bb1ab3c03695ba85330d92d6d226e/b632eb053cac17a9c9a02bcfc827f2d8',
+ 'playlist_mincount': 28,
+ 'info_dict': {
+ 'id': 'b632eb053cac17a9c9a02bcfc827f2d8',
+ 'title': 'Archive Sermons',
+ },
+ }]
+ _WEBPAGE_TESTS = [{
+ 'url': 'https://gist.githubusercontent.com/bashonly/7efb606f49f3c6e07ea0327de5a661d1/raw/05a16eac830245ea301fb0a585023bec71e6093c/dacast_playlist_embed.html',
+ 'playlist_mincount': 28,
+ 'info_dict': {
+ 'id': 'b632eb053cac17a9c9a02bcfc827f2d8',
+ 'title': 'Archive Sermons',
+ },
+ }]
+
+ def _real_extract(self, url):
+ user_id, playlist_id = self._match_valid_url(url).group('user_id', 'id')
+ info = self._download_json(
+ self._API_INFO_URL, playlist_id, note='Downloading playlist JSON', query={
+ 'contentId': f'{user_id}-playlist-{playlist_id}',
+ 'provider': 'universe',
+ })['contentInfo']
+
+ def entries(info):
+ for video in traverse_obj(info, ('features', 'playlist', 'contents', lambda _, v: v['id'])):
+ yield self.url_result(
+ DacastVODIE._get_url_from_id(video['id']), DacastVODIE, video['id'], video.get('title'))
+
+ return self.playlist_result(entries(info), playlist_id, info.get('title'))
diff --git a/hypervideo_dl/extractor/daftsex.py b/hypervideo_dl/extractor/daftsex.py
index 551d5e3..92510c7 100644
--- a/hypervideo_dl/extractor/daftsex.py
+++ b/hypervideo_dl/extractor/daftsex.py
@@ -1,6 +1,7 @@
from .common import InfoExtractor
from ..compat import compat_b64decode
from ..utils import (
+ ExtractorError,
int_or_none,
js_to_json,
parse_count,
@@ -12,21 +13,24 @@ from ..utils import (
class DaftsexIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?daftsex\.com/watch/(?P<id>-?\d+_\d+)'
+ _VALID_URL = r'https?://(?:www\.)?daft\.sex/watch/(?P<id>-?\d+_\d+)'
_TESTS = [{
- 'url': 'https://daftsex.com/watch/-35370899_456246186',
- 'md5': 'd95135e6cea2d905bea20dbe82cda64a',
+ 'url': 'https://daft.sex/watch/-35370899_456246186',
+ 'md5': '64c04ef7b4c7b04b308f3b0c78efe7cd',
'info_dict': {
'id': '-35370899_456246186',
'ext': 'mp4',
'title': 'just relaxing',
- 'description': 'just relaxing - Watch video Watch video in high quality',
+ 'description': 'just relaxing – Watch video Watch video in high quality',
'upload_date': '20201113',
'timestamp': 1605261911,
- 'thumbnail': r're:https://[^/]+/impf/-43BuMDIawmBGr3GLcZ93CYwWf2PBv_tVWoS1A/dnu41DnARU4\.jpg\?size=800x450&quality=96&keep_aspect_ratio=1&background=000000&sign=6af2c26ff4a45e55334189301c867384&type=video_thumb',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'age_limit': 18,
+ 'duration': 15.0,
+ 'view_count': int
},
}, {
- 'url': 'https://daftsex.com/watch/-156601359_456242791',
+ 'url': 'https://daft.sex/watch/-156601359_456242791',
'info_dict': {
'id': '-156601359_456242791',
'ext': 'mp4',
@@ -36,6 +40,7 @@ class DaftsexIE(InfoExtractor):
'timestamp': 1600250735,
'thumbnail': 'https://psv153-1.crazycloud.ru/videos/-156601359/456242791/thumb.jpg?extra=i3D32KaBbBFf9TqDRMAVmQ',
},
+ 'skip': 'deleted / private'
}]
def _real_extract(self, url):
@@ -60,7 +65,7 @@ class DaftsexIE(InfoExtractor):
webpage, 'player color', fatal=False) or ''
embed_page = self._download_webpage(
- 'https://daxab.com/player/%s?color=%s' % (player_hash, player_color),
+ 'https://dxb.to/player/%s?color=%s' % (player_hash, player_color),
video_id, headers={'Referer': url})
video_params = self._parse_json(
self._search_regex(
@@ -94,15 +99,19 @@ class DaftsexIE(InfoExtractor):
'age_limit': 18,
}
- item = self._download_json(
+ items = self._download_json(
f'{server_domain}/method/video.get/{video_id}', video_id,
headers={'Referer': url}, query={
'token': video_params['video']['access_token'],
'videos': video_id,
'ckey': video_params['c_key'],
'credentials': video_params['video']['credentials'],
- })['response']['items'][0]
+ })['response']['items']
+
+ if not items:
+ raise ExtractorError('Video is not available', video_id=video_id, expected=True)
+ item = items[0]
formats = []
for f_id, f_url in item.get('files', {}).items():
if f_id == 'external':
diff --git a/hypervideo_dl/extractor/dailymotion.py b/hypervideo_dl/extractor/dailymotion.py
index 2a44718..21263d4 100644
--- a/hypervideo_dl/extractor/dailymotion.py
+++ b/hypervideo_dl/extractor/dailymotion.py
@@ -3,7 +3,7 @@ import json
import re
from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
OnDemandPagedList,
@@ -68,9 +68,9 @@ class DailymotionBaseInfoExtractor(InfoExtractor):
None, 'Downloading Access Token',
data=urlencode_postdata(data))['access_token']
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 400:
raise ExtractorError(self._parse_json(
- e.cause.read().decode(), xid)['error_description'], expected=True)
+ e.cause.response.read().decode(), xid)['error_description'], expected=True)
raise
self._set_dailymotion_cookie('access_token' if username else 'client_token', token)
self._HEADERS['Authorization'] = 'Bearer ' + token
diff --git a/hypervideo_dl/extractor/digitalconcerthall.py b/hypervideo_dl/extractor/digitalconcerthall.py
index 3461e36..c11cd79 100644
--- a/hypervideo_dl/extractor/digitalconcerthall.py
+++ b/hypervideo_dl/extractor/digitalconcerthall.py
@@ -11,7 +11,7 @@ from ..utils import (
class DigitalConcertHallIE(InfoExtractor):
IE_DESC = 'DigitalConcertHall extractor'
- _VALID_URL = r'https?://(?:www\.)?digitalconcerthall\.com/(?P<language>[a-z]+)/concert/(?P<id>[0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?digitalconcerthall\.com/(?P<language>[a-z]+)/(?P<type>film|concert)/(?P<id>[0-9]+)'
_OAUTH_URL = 'https://api.digitalconcerthall.com/v2/oauth2/token'
_ACCESS_TOKEN = None
_NETRC_MACHINE = 'digitalconcerthall'
@@ -40,6 +40,19 @@ class DigitalConcertHallIE(InfoExtractor):
},
'params': {'skip_download': 'm3u8'},
'playlist_count': 3,
+ }, {
+ 'url': 'https://www.digitalconcerthall.com/en/film/388',
+ 'info_dict': {
+ 'id': '388',
+ 'ext': 'mp4',
+ 'title': 'The Berliner Philharmoniker and Frank Peter Zimmermann',
+ 'description': 'md5:cfe25a7044fa4be13743e5089b5b5eb2',
+ 'thumbnail': r're:^https?://images.digitalconcerthall.com/cms/thumbnails.*\.jpg$',
+ 'upload_date': '20220714',
+ 'timestamp': 1657785600,
+ 'album_artist': 'Frank Peter Zimmermann / Benedikt von Bernstorff / Jakob von Bernstorff',
+ },
+ 'params': {'skip_download': 'm3u8'},
}]
def _perform_login(self, username, password):
@@ -75,7 +88,7 @@ class DigitalConcertHallIE(InfoExtractor):
if not self._ACCESS_TOKEN:
self.raise_login_required(method='password')
- def _entries(self, items, language, **kwargs):
+ def _entries(self, items, language, type_, **kwargs):
for item in items:
video_id = item['id']
stream_info = self._download_json(
@@ -103,11 +116,11 @@ class DigitalConcertHallIE(InfoExtractor):
'start_time': chapter.get('time'),
'end_time': try_get(chapter, lambda x: x['time'] + x['duration']),
'title': chapter.get('text'),
- } for chapter in item['cuepoints']] if item.get('cuepoints') else None,
+ } for chapter in item['cuepoints']] if item.get('cuepoints') and type_ == 'concert' else None,
}
def _real_extract(self, url):
- language, video_id = self._match_valid_url(url).group('language', 'id')
+ language, type_, video_id = self._match_valid_url(url).group('language', 'type', 'id')
if not language:
language = 'en'
@@ -120,18 +133,18 @@ class DigitalConcertHallIE(InfoExtractor):
}]
vid_info = self._download_json(
- f'https://api.digitalconcerthall.com/v2/concert/{video_id}', video_id, headers={
+ f'https://api.digitalconcerthall.com/v2/{type_}/{video_id}', video_id, headers={
'Accept': 'application/json',
'Accept-Language': language
})
album_artist = ' / '.join(traverse_obj(vid_info, ('_links', 'artist', ..., 'name')) or '')
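+        # A film is a single video, while a concert embeds one entry per piece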
+ videos = [vid_info] if type_ == 'film' else traverse_obj(vid_info, ('_embedded', ..., ...))
return {
'_type': 'playlist',
'id': video_id,
'title': vid_info.get('title'),
- 'entries': self._entries(traverse_obj(vid_info, ('_embedded', ..., ...)), language,
- thumbnails=thumbnails, album_artist=album_artist),
+ 'entries': self._entries(videos, language, thumbnails=thumbnails, album_artist=album_artist, type_=type_),
'thumbnails': thumbnails,
'album_artist': album_artist,
}
diff --git a/hypervideo_dl/extractor/discogs.py b/hypervideo_dl/extractor/discogs.py
new file mode 100644
index 0000000..048c622
--- /dev/null
+++ b/hypervideo_dl/extractor/discogs.py
@@ -0,0 +1,35 @@
+from .common import InfoExtractor
+from .youtube import YoutubeIE
+from ..utils import traverse_obj
+
+
+class DiscogsReleasePlaylistIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?discogs\.com/(?P<type>release|master)/(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://www.discogs.com/release/1-The-Persuader-Stockholm',
+ 'info_dict': {
+ 'id': 'release1',
+ 'title': 'Stockholm',
+ },
+ 'playlist_mincount': 7,
+ }, {
+ 'url': 'https://www.discogs.com/master/113-Vince-Watson-Moments-In-Time',
+ 'info_dict': {
+ 'id': 'master113',
+ 'title': 'Moments In Time',
+ },
+ 'playlist_mincount': 53,
+ }]
+
+ def _real_extract(self, url):
+ playlist_id, playlist_type = self._match_valid_url(url).group('id', 'type')
+
+ display_id = f'{playlist_type}{playlist_id}'
+ response = self._download_json(
+ f'https://api.discogs.com/{playlist_type}s/{playlist_id}', display_id)
+
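+        # Discogs links out to external videos (mostly YouTube); keep only those YoutubeIE can handle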
+ entries = [
+ self.url_result(video['uri'], YoutubeIE, video_title=video.get('title'))
+ for video in traverse_obj(response, ('videos', lambda _, v: YoutubeIE.suitable(v['uri'])))]
+
+ return self.playlist_result(entries, display_id, response.get('title'))
diff --git a/hypervideo_dl/extractor/discovery.py b/hypervideo_dl/extractor/discovery.py
index fd3fc8f..75b4643 100644
--- a/hypervideo_dl/extractor/discovery.py
+++ b/hypervideo_dl/extractor/discovery.py
@@ -3,8 +3,8 @@ import string
from .discoverygo import DiscoveryGoBaseIE
from ..compat import compat_urllib_parse_unquote
+from ..networking.exceptions import HTTPError
from ..utils import ExtractorError
-from ..compat import compat_HTTPError
class DiscoveryIE(DiscoveryGoBaseIE):
@@ -78,7 +78,7 @@ class DiscoveryIE(DiscoveryGoBaseIE):
'Downloading token JSON metadata', query={
'authRel': 'authorization',
'client_id': '3020a40c2356a645b4b4',
- 'nonce': ''.join([random.choice(string.ascii_letters) for _ in range(32)]),
+ 'nonce': ''.join(random.choices(string.ascii_letters, k=32)),
'redirectUri': 'https://www.discovery.com/',
})['access_token']
@@ -100,9 +100,9 @@ class DiscoveryIE(DiscoveryGoBaseIE):
self._API_BASE_URL + 'streaming/video/' + video_id,
display_id, 'Downloading streaming JSON metadata', headers=headers)
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 403):
+ if isinstance(e.cause, HTTPError) and e.cause.status in (401, 403):
e_description = self._parse_json(
- e.cause.read().decode(), display_id)['description']
+ e.cause.response.read().decode(), display_id)['description']
if 'resource not available for country' in e_description:
self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
if 'Authorized Networks' in e_description:
diff --git a/hypervideo_dl/extractor/dlf.py b/hypervideo_dl/extractor/dlf.py
new file mode 100644
index 0000000..88a4149
--- /dev/null
+++ b/hypervideo_dl/extractor/dlf.py
@@ -0,0 +1,192 @@
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ extract_attributes,
+ int_or_none,
+ traverse_obj,
+ url_or_none,
+)
+
+
+class DLFBaseIE(InfoExtractor):
+ _VALID_URL_BASE = r'https?://(?:www\.)?deutschlandfunk\.de/'
+ _BUTTON_REGEX = r'(<button[^>]+alt="Anhören"[^>]+data-audio-diraid[^>]*>)'
+
+ def _parse_button_attrs(self, button, audio_id=None):
+ attrs = extract_attributes(button)
+ audio_id = audio_id or attrs['data-audio-diraid']
+
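+        # Try the download-URL attributes in order of preference; the first valid URL wins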
+ url = traverse_obj(
+ attrs, 'data-audio-download-src', 'data-audio', 'data-audioreference',
+ 'data-audio-src', expected_type=url_or_none)
+ ext = determine_ext(url)
+
+ return {
+ 'id': audio_id,
+ 'extractor_key': DLFIE.ie_key(),
+ 'extractor': DLFIE.IE_NAME,
+ **traverse_obj(attrs, {
+ 'title': (('data-audiotitle', 'data-audio-title', 'data-audio-download-tracking-title'), {str}),
+ 'duration': (('data-audioduration', 'data-audio-duration'), {int_or_none}),
+ 'thumbnail': ('data-audioimage', {url_or_none}),
+ 'uploader': 'data-audio-producer',
+ 'series': 'data-audio-series',
+ 'channel': 'data-audio-origin-site-name',
+ 'webpage_url': ('data-audio-download-tracking-path', {url_or_none}),
+ }, get_all=False),
+ 'formats': (self._extract_m3u8_formats(url, audio_id, fatal=False)
+ if ext == 'm3u8' else [{'url': url, 'ext': ext, 'vcodec': 'none'}])
+ }
+
+
+class DLFIE(DLFBaseIE):
+ IE_NAME = 'dlf'
+ _VALID_URL = DLFBaseIE._VALID_URL_BASE + r'[\w-]+-dlf-(?P<id>[\da-f]{8})-100\.html'
+ _TESTS = [
+ # Audio as an HLS stream
+ {
+ 'url': 'https://www.deutschlandfunk.de/tanz-der-saiteninstrumente-das-wild-strings-trio-aus-slowenien-dlf-03a3eb19-100.html',
+ 'info_dict': {
+ 'id': '03a3eb19',
+ 'title': r're:Tanz der Saiteninstrumente [-/] Das Wild Strings Trio aus Slowenien',
+ 'ext': 'm4a',
+ 'duration': 3298,
+ 'thumbnail': 'https://assets.deutschlandfunk.de/FALLBACK-IMAGE-AUDIO/512x512.png?t=1603714364673',
+ 'uploader': 'Deutschlandfunk',
+ 'series': 'On Stage',
+ 'channel': 'deutschlandfunk'
+ },
+ 'params': {
+ 'skip_download': 'm3u8'
+ },
+ 'skip': 'This webpage no longer exists'
+ }, {
+ 'url': 'https://www.deutschlandfunk.de/russische-athleten-kehren-zurueck-auf-die-sportbuehne-ein-gefaehrlicher-tueroeffner-dlf-d9cc1856-100.html',
+ 'info_dict': {
+ 'id': 'd9cc1856',
+ 'title': 'Russische Athleten kehren zurück auf die Sportbühne: Ein gefährlicher Türöffner',
+ 'ext': 'mp3',
+ 'duration': 291,
+ 'thumbnail': 'https://assets.deutschlandfunk.de/FALLBACK-IMAGE-AUDIO/512x512.png?t=1603714364673',
+ 'uploader': 'Deutschlandfunk',
+ 'series': 'Kommentare und Themen der Woche',
+ 'channel': 'deutschlandfunk'
+ }
+ },
+ ]
+
+ def _real_extract(self, url):
+ audio_id = self._match_id(url)
+ webpage = self._download_webpage(url, audio_id)
+
+ return self._parse_button_attrs(
+ self._search_regex(self._BUTTON_REGEX, webpage, 'button'), audio_id)
+
+
+class DLFCorpusIE(DLFBaseIE):
+ IE_NAME = 'dlf:corpus'
+ IE_DESC = 'DLF Multi-feed Archives'
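+    # The negative lookahead skips single-audio pages, which DLFIE above handles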
+ _VALID_URL = DLFBaseIE._VALID_URL_BASE + r'(?P<id>(?![\w-]+-dlf-[\da-f]{8})[\w-]+-\d+)\.html'
+ _TESTS = [
+ # Recorded news broadcast with referrals to related broadcasts
+ {
+ 'url': 'https://www.deutschlandfunk.de/fechten-russland-belarus-ukraine-protest-100.html',
+ 'info_dict': {
+ 'id': 'fechten-russland-belarus-ukraine-protest-100',
+ 'title': r're:Wiederzulassung als neutrale Athleten [-/] Was die Rückkehr russischer und belarussischer Sportler beim Fechten bedeutet',
+ 'description': 'md5:91340aab29c71aa7518ad5be13d1e8ad'
+ },
+ 'playlist_mincount': 5,
+ 'playlist': [{
+ 'info_dict': {
+ 'id': '1fc5d64a',
+ 'title': r're:Wiederzulassung als neutrale Athleten [-/] Was die Rückkehr russischer und belarussischer Sportler beim Fechten bedeutet',
+ 'ext': 'mp3',
+ 'duration': 252,
+ 'thumbnail': 'https://assets.deutschlandfunk.de/aad16241-6b76-4a09-958b-96d0ee1d6f57/512x512.jpg?t=1679480020313',
+ 'uploader': 'Deutschlandfunk',
+ 'series': 'Sport',
+ 'channel': 'deutschlandfunk'
+ }
+ }, {
+ 'info_dict': {
+ 'id': '2ada145f',
+ 'title': r're:(?:Sportpolitik / )?Fechtverband votiert für Rückkehr russischer Athleten',
+ 'ext': 'mp3',
+ 'duration': 336,
+ 'thumbnail': 'https://assets.deutschlandfunk.de/FILE_93982766f7317df30409b8a184ac044a/512x512.jpg?t=1678547581005',
+ 'uploader': 'Deutschlandfunk',
+ 'series': 'Deutschlandfunk Nova',
+ 'channel': 'deutschlandfunk-nova'
+ }
+ }, {
+ 'info_dict': {
+ 'id': '5e55e8c9',
+ 'title': r're:Wiederzulassung von Russland und Belarus [-/] "Herumlavieren" des Fechter-Bundes sorgt für Unverständnis',
+ 'ext': 'mp3',
+ 'duration': 187,
+ 'thumbnail': 'https://assets.deutschlandfunk.de/a595989d-1ed1-4a2e-8370-b64d7f11d757/512x512.jpg?t=1679173825412',
+ 'uploader': 'Deutschlandfunk',
+ 'series': 'Sport am Samstag',
+ 'channel': 'deutschlandfunk'
+ }
+ }, {
+ 'info_dict': {
+ 'id': '47e1a096',
+ 'title': r're:Rückkehr Russlands im Fechten [-/] "Fassungslos, dass es einfach so passiert ist"',
+ 'ext': 'mp3',
+ 'duration': 602,
+ 'thumbnail': 'https://assets.deutschlandfunk.de/da4c494a-21cc-48b4-9cc7-40e09fd442c2/512x512.jpg?t=1678562155770',
+ 'uploader': 'Deutschlandfunk',
+ 'series': 'Sport am Samstag',
+ 'channel': 'deutschlandfunk'
+ }
+ }, {
+ 'info_dict': {
+ 'id': '5e55e8c9',
+ 'title': r're:Wiederzulassung von Russland und Belarus [-/] "Herumlavieren" des Fechter-Bundes sorgt für Unverständnis',
+ 'ext': 'mp3',
+ 'duration': 187,
+ 'thumbnail': 'https://assets.deutschlandfunk.de/a595989d-1ed1-4a2e-8370-b64d7f11d757/512x512.jpg?t=1679173825412',
+ 'uploader': 'Deutschlandfunk',
+ 'series': 'Sport am Samstag',
+ 'channel': 'deutschlandfunk'
+ }
+ }]
+ },
+ # Podcast feed with tag buttons, playlist count fluctuates
+ {
+ 'url': 'https://www.deutschlandfunk.de/kommentare-und-themen-der-woche-100.html',
+ 'info_dict': {
+ 'id': 'kommentare-und-themen-der-woche-100',
+ 'title': 'Meinung - Kommentare und Themen der Woche',
+ 'description': 'md5:2901bbd65cd2d45e116d399a099ce5d5',
+ },
+ 'playlist_mincount': 10,
+ },
+ # Podcast feed with no description
+ {
+ 'url': 'https://www.deutschlandfunk.de/podcast-tolle-idee-100.html',
+ 'info_dict': {
+ 'id': 'podcast-tolle-idee-100',
+ 'title': 'Wissenschaftspodcast - Tolle Idee! - Was wurde daraus?',
+ },
+ 'playlist_mincount': 11,
+ },
+ ]
+
+ def _real_extract(self, url):
+ playlist_id = self._match_id(url)
+ webpage = self._download_webpage(url, playlist_id)
+
+ return {
+ '_type': 'playlist',
+ 'id': playlist_id,
+ 'description': self._html_search_meta(
+ ['description', 'og:description', 'twitter:description'], webpage, default=None),
+ 'title': self._html_search_meta(
+ ['og:title', 'twitter:title'], webpage, default=None),
+ 'entries': map(self._parse_button_attrs, re.findall(self._BUTTON_REGEX, webpage)),
+ }
diff --git a/hypervideo_dl/extractor/douyutv.py b/hypervideo_dl/extractor/douyutv.py
index 477f468..fa40844 100644
--- a/hypervideo_dl/extractor/douyutv.py
+++ b/hypervideo_dl/extractor/douyutv.py
@@ -1,6 +1,7 @@
import time
import hashlib
import re
+import urllib.parse
from .common import InfoExtractor
from ..utils import (
@@ -13,7 +14,7 @@ from ..utils import (
class DouyuTVIE(InfoExtractor):
IE_DESC = '斗鱼'
- _VALID_URL = r'https?://(?:www\.)?douyu(?:tv)?\.com/(?:[^/]+/)*(?P<id>[A-Za-z0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?douyu(?:tv)?\.com/(topic/\w+\?rid=|(?:[^/]+/))*(?P<id>[A-Za-z0-9]+)'
_TESTS = [{
'url': 'http://www.douyutv.com/iseven',
'info_dict': {
@@ -22,7 +23,7 @@ class DouyuTVIE(InfoExtractor):
'ext': 'flv',
'title': 're:^清晨醒脑!根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'description': r're:.*m7show@163\.com.*',
- 'thumbnail': r're:^https?://.*\.jpg$',
+ 'thumbnail': r're:^https?://.*\.png',
'uploader': '7师傅',
'is_live': True,
},
@@ -37,7 +38,7 @@ class DouyuTVIE(InfoExtractor):
'ext': 'flv',
'title': 're:^小漠从零单排记!——CSOL2躲猫猫 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'description': 'md5:746a2f7a253966a06755a912f0acc0d2',
- 'thumbnail': r're:^https?://.*\.jpg$',
+ 'thumbnail': r're:^https?://.*\.png',
'uploader': 'douyu小漠',
'is_live': True,
},
@@ -53,7 +54,7 @@ class DouyuTVIE(InfoExtractor):
'ext': 'flv',
'title': 're:^清晨醒脑!根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'description': r're:.*m7show@163\.com.*',
- 'thumbnail': r're:^https?://.*\.jpg$',
+ 'thumbnail': r're:^https?://.*\.png',
'uploader': '7师傅',
'is_live': True,
},
@@ -61,6 +62,21 @@ class DouyuTVIE(InfoExtractor):
'skip_download': True,
},
}, {
+ 'url': 'https://www.douyu.com/topic/ydxc?rid=6560603',
+ 'info_dict': {
+ 'id': '6560603',
+ 'display_id': '6560603',
+ 'ext': 'flv',
+ 'title': 're:^阿余:新年快乐恭喜发财! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+ 'description': 're:.*直播时间.*',
+ 'thumbnail': r're:^https?://.*\.png',
+ 'uploader': '阿涛皎月Carry',
+ 'live_status': 'is_live',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
'url': 'http://www.douyu.com/xiaocang',
'only_matching': True,
}, {
@@ -79,28 +95,24 @@ class DouyuTVIE(InfoExtractor):
room_id = self._html_search_regex(
r'"room_id\\?"\s*:\s*(\d+),', page, 'room id')
- # Grab metadata from mobile API
+ # Grab metadata from API
+ params = {
+ 'aid': 'wp',
+ 'client_sys': 'wp',
+ 'time': int(time.time()),
+ }
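+        # 'auth' is the md5 of "room/<id>?<urlencoded params>" followed by a static salt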
+ params['auth'] = hashlib.md5(
+ f'room/{video_id}?{urllib.parse.urlencode(params)}zNzMV1y4EMxOHS6I5WKm'.encode()).hexdigest()
room = self._download_json(
- 'http://m.douyu.com/html5/live?roomId=%s' % room_id, video_id,
- note='Downloading room info')['data']
+ f'http://www.douyutv.com/api/v1/room/{room_id}', video_id,
+ note='Downloading room info', query=params)['data']
# 1 = live, 2 = offline
if room.get('show_status') == '2':
raise ExtractorError('Live stream is offline', expected=True)
- # Grab the URL from PC client API
- # The m3u8 url from mobile API requires re-authentication every 5 minutes
- tt = int(time.time())
- signContent = 'lapi/live/thirdPart/getPlay/%s?aid=pcclient&rate=0&time=%d9TUk5fjjUjg9qIMH3sdnh' % (room_id, tt)
- sign = hashlib.md5(signContent.encode('ascii')).hexdigest()
- video_url = self._download_json(
- 'http://coapi.douyucdn.cn/lapi/live/thirdPart/getPlay/' + room_id,
- video_id, note='Downloading video URL info',
- query={'rate': 0}, headers={
- 'auth': sign,
- 'time': str(tt),
- 'aid': 'pcclient'
- })['data']['live_url']
+ video_url = urljoin('https://hls3-akm.douyucdn.cn/', self._search_regex(r'(live/.*)', room['hls_url'], 'URL'))
+ formats, subs = self._extract_m3u8_formats_and_subtitles(video_url, room_id)
title = unescapeHTML(room['room_name'])
description = room.get('show_details')
@@ -110,12 +122,13 @@ class DouyuTVIE(InfoExtractor):
return {
'id': room_id,
'display_id': video_id,
- 'url': video_url,
'title': title,
'description': description,
'thumbnail': thumbnail,
'uploader': uploader,
'is_live': True,
+ 'subtitles': subs,
+ 'formats': formats,
}
diff --git a/hypervideo_dl/extractor/dplay.py b/hypervideo_dl/extractor/dplay.py
index 8eb4d8f..363b4be 100644
--- a/hypervideo_dl/extractor/dplay.py
+++ b/hypervideo_dl/extractor/dplay.py
@@ -2,7 +2,7 @@ import json
import uuid
from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..networking.exceptions import HTTPError
from ..utils import (
determine_ext,
ExtractorError,
@@ -39,7 +39,7 @@ class DPlayBaseIE(InfoExtractor):
return f'Bearer {token}'
def _process_errors(self, e, geo_countries):
- info = self._parse_json(e.cause.read().decode('utf-8'), None)
+ info = self._parse_json(e.cause.response.read().decode('utf-8'), None)
error = info['errors'][0]
error_code = error.get('code')
if error_code == 'access.denied.geoblocked':
@@ -65,6 +65,7 @@ class DPlayBaseIE(InfoExtractor):
return streaming_list
def _get_disco_api_info(self, url, display_id, disco_host, realm, country, domain=''):
+ country = self.get_param('geo_bypass_country') or country
geo_countries = [country.upper()]
self._initialize_geo_bypass({
'countries': geo_countries,
@@ -86,7 +87,7 @@ class DPlayBaseIE(InfoExtractor):
'include': 'images,primaryChannel,show,tags'
})
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 400:
self._process_errors(e, geo_countries)
raise
video_id = video['data']['id']
@@ -98,7 +99,7 @@ class DPlayBaseIE(InfoExtractor):
streaming = self._download_video_playback_info(
disco_base, video_id, headers)
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 403:
self._process_errors(e, geo_countries)
raise
for format_dict in streaming:
@@ -745,7 +746,7 @@ class MotorTrendIE(DiscoveryPlusBaseIE):
class MotorTrendOnDemandIE(DiscoveryPlusBaseIE):
- _VALID_URL = r'https?://(?:www\.)?motortrendondemand\.com/detail' + DPlayBaseIE._PATH_REGEX
+ _VALID_URL = r'https?://(?:www\.)?motortrend(?:ondemand\.com|\.com/plus)/detail' + DPlayBaseIE._PATH_REGEX
_TESTS = [{
'url': 'https://www.motortrendondemand.com/detail/wheelstanding-dump-truck-stubby-bobs-comeback/37699/784',
'info_dict': {
@@ -766,6 +767,25 @@ class MotorTrendOnDemandIE(DiscoveryPlusBaseIE):
'upload_date': '20140101',
'tags': [],
},
+ }, {
+ 'url': 'https://www.motortrend.com/plus/detail/roadworthy-rescues-teaser-trailer/4922860/',
+ 'info_dict': {
+ 'id': '4922860',
+ 'ext': 'mp4',
+ 'title': 'Roadworthy Rescues | Teaser Trailer',
+ 'description': 'Derek Bieri helps Freiburger and Finnegan with their \'68 big-block Dart.',
+ 'display_id': 'roadworthy-rescues-teaser-trailer/4922860',
+ 'creator': 'Originals',
+ 'series': 'Roadworthy Rescues',
+ 'thumbnail': r're:^https?://.+\.jpe?g$',
+ 'upload_date': '20220907',
+ 'timestamp': 1662523200,
+ 'duration': 1066.356,
+ 'tags': [],
+ },
+ }, {
+ 'url': 'https://www.motortrend.com/plus/detail/ugly-duckling/2450033/12439',
+ 'only_matching': True,
}]
_PRODUCT = 'MTOD'
@@ -1001,3 +1021,39 @@ class DiscoveryPlusIndiaShowIE(DiscoveryPlusShowBaseIE):
_SHOW_STR = 'show'
_INDEX = 4
_VIDEO_IE = DiscoveryPlusIndiaIE
+
+
+class GlobalCyclingNetworkPlusIE(DiscoveryPlusBaseIE):
+ _VALID_URL = r'https?://plus\.globalcyclingnetwork\.com/watch/(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://plus.globalcyclingnetwork.com/watch/1397691',
+ 'info_dict': {
+ 'id': '1397691',
+ 'ext': 'mp4',
+ 'title': 'The Athertons: Mountain Biking\'s Fastest Family',
+ 'description': 'md5:75a81937fcd8b989eec6083a709cd837',
+ 'thumbnail': 'https://us1-prod-images.disco-api.com/2021/03/04/eb9e3026-4849-3001-8281-9356466f0557.png',
+ 'series': 'gcn',
+ 'creator': 'Gcn',
+ 'upload_date': '20210309',
+ 'timestamp': 1615248000,
+ 'duration': 2531.0,
+ 'tags': [],
+ },
+ 'skip': 'Subscription required',
+ 'params': {'skip_download': 'm3u8'},
+ }]
+
+ _PRODUCT = 'web'
+ _DISCO_API_PARAMS = {
+ 'disco_host': 'disco-api-prod.globalcyclingnetwork.com',
+ 'realm': 'gcn',
+ 'country': 'us',
+ }
+
+ def _update_disco_api_headers(self, headers, disco_base, display_id, realm):
+ headers.update({
+ 'x-disco-params': f'realm={realm}',
+ 'x-disco-client': f'WEB:UNKNOWN:{self._PRODUCT}:27.3.2',
+ 'Authorization': self._get_auth(disco_base, display_id, realm),
+ })
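
The dplay changes (and the matching edits in eagleplatform, filmon, fox and others below) migrate error handling from compat_HTTPError to the new networking exceptions: the status code moves from `.code` to `.status` and the response body is read via `.response.read()` instead of `.read()`. A hypothetical shim illustrating just the attribute mapping, not part of hypervideo itself:

class LegacyHTTPErrorShim:
    def __init__(self, http_error):
        self._err = http_error

    @property
    def code(self):
        return self._err.status  # formerly e.cause.code

    def read(self):
        return self._err.response.read()  # formerly e.cause.read()
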
diff --git a/hypervideo_dl/extractor/dropbox.py b/hypervideo_dl/extractor/dropbox.py
index 214b309..bc2efce 100644
--- a/hypervideo_dl/extractor/dropbox.py
+++ b/hypervideo_dl/extractor/dropbox.py
@@ -1,3 +1,4 @@
+import base64
import os.path
import re
@@ -5,14 +6,13 @@ from .common import InfoExtractor
from ..compat import compat_urllib_parse_unquote
from ..utils import (
ExtractorError,
- traverse_obj,
- try_get,
+ update_url_query,
url_basename,
)
class DropboxIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?dropbox[.]com/sh?/(?P<id>[a-zA-Z0-9]{15})/.*'
+ _VALID_URL = r'https?://(?:www\.)?dropbox\.com/(?:(?:e/)?scl/fi|sh?)/(?P<id>\w+)'
_TESTS = [
{
'url': 'https://www.dropbox.com/s/nelirfsxnmcfbfh/youtube-dl%20test%20video%20%27%C3%A4%22BaW_jenozKc.mp4?dl=0',
@@ -22,7 +22,16 @@ class DropboxIE(InfoExtractor):
'title': 'youtube-dl test video \'ä"BaW_jenozKc'
}
}, {
- 'url': 'https://www.dropbox.com/sh/662glsejgzoj9sr/AAByil3FGH9KFNZ13e08eSa1a/Pregame%20Ceremony%20Program%20PA%2020140518.m4v',
+ 'url': 'https://www.dropbox.com/s/nelirfsxnmcfbfh',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.dropbox.com/sh/2mgpiuq7kv8nqdf/AABy-fW4dkydT4GmWi2mdOUDa?dl=0&preview=Drone+Shot.mp4',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.dropbox.com/scl/fi/r2kd2skcy5ylbbta5y1pz/DJI_0003.MP4?dl=0&rlkey=wcdgqangn7t3lnmmv6li9mu9h',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.dropbox.com/e/scl/fi/r2kd2skcy5ylbbta5y1pz/DJI_0003.MP4?dl=0&rlkey=wcdgqangn7t3lnmmv6li9mu9h',
'only_matching': True,
},
]
@@ -53,16 +62,25 @@ class DropboxIE(InfoExtractor):
else:
raise ExtractorError('Password protected video, use --video-password <password>', expected=True)
- info_json = self._search_json(r'InitReact\.mountComponent\(.*?,', webpage, 'mountComponent', video_id,
- contains_pattern=r'{.+?"preview".+?}', end_pattern=r'\)')['props']
- transcode_url = traverse_obj(info_json, ((None, 'preview'), 'file', 'preview', 'content', 'transcode_url'), get_all=False)
- formats, subtitles = self._extract_m3u8_formats_and_subtitles(transcode_url, video_id)
+ formats, subtitles, has_anonymous_download = [], {}, False
+ for encoded in reversed(re.findall(r'registerStreamedPrefetch\s*\(\s*"[\w/+=]+"\s*,\s*"([\w/+=]+)"', webpage)):
+ decoded = base64.b64decode(encoded).decode('utf-8', 'ignore')
+ transcode_url = self._search_regex(
+ r'\n.(https://[^\x03\x08\x12\n]+\.m3u8)', decoded, 'transcode url', default=None)
+ if not transcode_url:
+ continue
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(transcode_url, video_id, 'mp4')
+ has_anonymous_download = self._search_regex(r'(anonymous:\tanonymous)', decoded, 'anonymous', default=False)
+ break
# if downloads are enabled, we can get the original file
- if 'anonymous' in (try_get(info_json, lambda x: x['sharePermission']['canDownloadRoles']) or []):
- video_url = re.sub(r'[?&]dl=0', '', url)
- video_url += ('?' if '?' not in video_url else '&') + 'dl=1'
- formats.append({'url': video_url, 'format_id': 'original', 'format_note': 'Original', 'quality': 1})
+ if has_anonymous_download:
+ formats.append({
+ 'url': update_url_query(url, {'dl': '1'}),
+ 'format_id': 'original',
+ 'format_note': 'Original',
+ 'quality': 1
+ })
return {
'id': video_id,
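
Dropbox no longer exposes the transcode URL in a mountComponent JSON blob; it now sits inside base64-encoded registerStreamedPrefetch payloads, scanned newest-first. A standalone sketch of that discovery loop, with the regexes copied from the diff and the function name ours:

import base64
import re

def find_transcode_url(webpage):
    # Decode each prefetch payload (newest first) and pull the first HLS
    # manifest URL out of it, mirroring the loop in the diff.
    for encoded in reversed(re.findall(
            r'registerStreamedPrefetch\s*\(\s*"[\w/+=]+"\s*,\s*"([\w/+=]+)"', webpage)):
        decoded = base64.b64decode(encoded).decode('utf-8', 'ignore')
        mobj = re.search(r'\n.(https://[^\x03\x08\x12\n]+\.m3u8)', decoded)
        if mobj:
            return mobj.group(1)
    return None
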
diff --git a/hypervideo_dl/extractor/dropout.py b/hypervideo_dl/extractor/dropout.py
index e280b1c..80ae6c1 100644
--- a/hypervideo_dl/extractor/dropout.py
+++ b/hypervideo_dl/extractor/dropout.py
@@ -1,13 +1,17 @@
+import functools
+
from .common import InfoExtractor
from .vimeo import VHXEmbedIE
from ..utils import (
ExtractorError,
+ OnDemandPagedList,
clean_html,
+ extract_attributes,
get_element_by_class,
get_element_by_id,
- get_elements_by_class,
+ get_elements_html_by_class,
int_or_none,
- join_nonempty,
+ traverse_obj,
unified_strdate,
urlencode_postdata,
)
@@ -162,12 +166,13 @@ class DropoutIE(InfoExtractor):
class DropoutSeasonIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?dropout\.tv/(?P<id>[^\/$&?#]+)(?:/?$|/season:[0-9]+/?$)'
+ _PAGE_SIZE = 24
+ _VALID_URL = r'https?://(?:www\.)?dropout\.tv/(?P<id>[^\/$&?#]+)(?:/?$|/season:(?P<season>[0-9]+)/?$)'
_TESTS = [
{
'url': 'https://www.dropout.tv/dimension-20-fantasy-high/season:1',
'note': 'Multi-season series with the season in the url',
- 'playlist_count': 17,
+ 'playlist_count': 24,
'info_dict': {
'id': 'dimension-20-fantasy-high-season-1',
'title': 'Dimension 20 Fantasy High - Season 1'
@@ -176,7 +181,7 @@ class DropoutSeasonIE(InfoExtractor):
{
'url': 'https://www.dropout.tv/dimension-20-fantasy-high',
'note': 'Multi-season series with the season not in the url',
- 'playlist_count': 17,
+ 'playlist_count': 24,
'info_dict': {
'id': 'dimension-20-fantasy-high-season-1',
'title': 'Dimension 20 Fantasy High - Season 1'
@@ -190,29 +195,30 @@ class DropoutSeasonIE(InfoExtractor):
'id': 'dimension-20-shriek-week-season-1',
'title': 'Dimension 20 Shriek Week - Season 1'
}
+ },
+ {
+ 'url': 'https://www.dropout.tv/breaking-news-no-laugh-newsroom/season:3',
+ 'note': 'Multi-season series with season in the url that requires pagination',
+ 'playlist_count': 25,
+ 'info_dict': {
+ 'id': 'breaking-news-no-laugh-newsroom-season-3',
+ 'title': 'Breaking News No Laugh Newsroom - Season 3'
+ }
}
]
+ def _fetch_page(self, url, season_id, page):
+ page += 1
+ webpage = self._download_webpage(
+ f'{url}?page={page}', season_id, note=f'Downloading page {page}', expected_status={400})
+ yield from [self.url_result(item_url, DropoutIE) for item_url in traverse_obj(
+ get_elements_html_by_class('browse-item-link', webpage), (..., {extract_attributes}, 'href'))]
+
def _real_extract(self, url):
season_id = self._match_id(url)
+ season_num = self._match_valid_url(url).group('season') or 1
season_title = season_id.replace('-', ' ').title()
- webpage = self._download_webpage(url, season_id)
-
- entries = [
- self.url_result(
- url=self._search_regex(r'<a href=["\'](.+?)["\'] class=["\']browse-item-link["\']',
- item, 'item_url'),
- ie=DropoutIE.ie_key()
- ) for item in get_elements_by_class('js-collection-item', webpage)
- ]
-
- seasons = (get_element_by_class('select-dropdown-wrapper', webpage) or '').strip().replace('\n', '')
- current_season = self._search_regex(r'<option[^>]+selected>([^<]+)</option>',
- seasons, 'current_season', default='').strip()
- return {
- '_type': 'playlist',
- 'id': join_nonempty(season_id, current_season.lower().replace(' ', '-')),
- 'title': join_nonempty(season_title, current_season, delim=' - '),
- 'entries': entries
- }
+ return self.playlist_result(
+ OnDemandPagedList(functools.partial(self._fetch_page, url, season_id), self._PAGE_SIZE),
+ f'{season_id}-season-{season_num}', f'{season_title} - Season {season_num}')
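
The season extractor swaps its one-shot page scrape for lazy pagination: OnDemandPagedList repeatedly invokes a partial of _fetch_page with a 0-based page index. A toy sketch of that wiring, under made-up names and URLs:

import functools

PAGE_SIZE = 24  # matches the extractor's _PAGE_SIZE

def fetch_page(url, season_id, page):
    # Stand-in for _fetch_page: the pager passes 0-based indices, the site
    # expects 1-based ?page= values, hence the +1 in the diff.
    page += 1
    print(f'would fetch {url}?page={page} for {season_id}')
    return []

pager = functools.partial(fetch_page, 'https://example.test/show/season:3', 'demo-season')
pager(0)  # the paged list calls this lazily, one page at a time
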
diff --git a/hypervideo_dl/extractor/drtv.py b/hypervideo_dl/extractor/drtv.py
index 128f439..6c381aa 100644
--- a/hypervideo_dl/extractor/drtv.py
+++ b/hypervideo_dl/extractor/drtv.py
@@ -2,28 +2,29 @@ import binascii
import hashlib
import re
-
from .common import InfoExtractor
from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7
from ..compat import compat_urllib_parse_unquote
from ..utils import (
ExtractorError,
- int_or_none,
float_or_none,
+ int_or_none,
mimetype2ext,
str_or_none,
- try_get,
+ traverse_obj,
unified_timestamp,
update_url_query,
url_or_none,
)
+SERIES_API = 'https://production-cdn.dr-massive.com/api/page?device=web_browser&item_detail_expand=all&lang=da&max_list_prefetch=3&path=%s'
+
class DRTVIE(InfoExtractor):
_VALID_URL = r'''(?x)
https?://
(?:
- (?:www\.)?dr\.dk/(?:tv/se|nyheder|(?:radio|lyd)(?:/ondemand)?)/(?:[^/]+/)*|
+ (?:www\.)?dr\.dk/(?:tv/se|nyheder|(?P<radio>radio|lyd)(?:/ondemand)?)/(?:[^/]+/)*|
(?:www\.)?(?:dr\.dk|dr-massive\.com)/drtv/(?:se|episode|program)/
)
(?P<id>[\da-z_-]+)
@@ -78,7 +79,7 @@ class DRTVIE(InfoExtractor):
'description': 'md5:8c66dcbc1669bbc6f873879880f37f2a',
'timestamp': 1546628400,
'upload_date': '20190104',
- 'duration': 3504.618,
+ 'duration': 3504.619,
'formats': 'mincount:20',
'release_year': 2017,
'season_id': 'urn:dr:mu:bundle:5afc03ad6187a4065ca5fd35',
@@ -99,14 +100,16 @@ class DRTVIE(InfoExtractor):
'ext': 'mp4',
'title': 'Bonderøven 2019 (1:8)',
'description': 'md5:b6dcfe9b6f0bea6703e9a0092739a5bd',
- 'timestamp': 1603188600,
- 'upload_date': '20201020',
+ 'timestamp': 1654856100,
+ 'upload_date': '20220610',
'duration': 2576.6,
'season': 'Bonderøven 2019',
'season_id': 'urn:dr:mu:bundle:5c201667a11fa01ca4528ce5',
'release_year': 2019,
'season_number': 2019,
- 'series': 'Frank & Kastaniegaarden'
+ 'series': 'Frank & Kastaniegaarden',
+ 'episode_number': 1,
+ 'episode': 'Episode 1',
},
'params': {
'skip_download': True,
@@ -138,16 +141,32 @@ class DRTVIE(InfoExtractor):
'params': {
'skip_download': True,
},
+ 'skip': 'this video has been removed',
+ }, {
+ 'url': 'https://www.dr.dk/lyd/p4kbh/regionale-nyheder-kh4/regionale-nyheder-2023-03-14-10-30-9',
+ 'info_dict': {
+ 'ext': 'mp4',
+ 'id': '14802310112',
+ 'timestamp': 1678786200,
+ 'duration': 120.043,
+ 'season_id': 'urn:dr:mu:bundle:63a4f7c87140143504b6710f',
+ 'series': 'P4 København regionale nyheder',
+ 'upload_date': '20230314',
+ 'release_year': 0,
+ 'description': 'Hør seneste regionale nyheder fra P4 København.',
+ 'season': 'Regionale nyheder',
+ 'title': 'Regionale nyheder',
+ },
}]
def _real_extract(self, url):
- video_id = self._match_id(url)
+ raw_video_id, is_radio_url = self._match_valid_url(url).group('id', 'radio')
- webpage = self._download_webpage(url, video_id)
+ webpage = self._download_webpage(url, raw_video_id)
if '>Programmet er ikke længere tilgængeligt' in webpage:
raise ExtractorError(
- 'Video %s is not available' % video_id, expected=True)
+ 'Video %s is not available' % raw_video_id, expected=True)
video_id = self._search_regex(
(r'data-(?:material-identifier|episode-slug)="([^"]+)"',
@@ -168,20 +187,27 @@ class DRTVIE(InfoExtractor):
programcard_url = '%s/%s' % (_PROGRAMCARD_BASE, video_id)
else:
programcard_url = _PROGRAMCARD_BASE
- page = self._parse_json(
- self._search_regex(
- r'data\s*=\s*({.+?})\s*(?:;|</script)', webpage,
- 'data'), '1')['cache']['page']
- page = page[list(page.keys())[0]]
- item = try_get(
- page, (lambda x: x['item'], lambda x: x['entries'][0]['item']),
- dict)
- video_id = item['customId'].split(':')[-1]
+ if is_radio_url:
+ video_id = self._search_nextjs_data(
+ webpage, raw_video_id)['props']['pageProps']['episode']['productionNumber']
+ else:
+ json_data = self._search_json(
+ r'window\.__data\s*=', webpage, 'data', raw_video_id)
+ video_id = traverse_obj(json_data, (
+ 'cache', 'page', ..., (None, ('entries', 0)), 'item', 'customId',
+ {lambda x: x.split(':')[-1]}), get_all=False)
+ if not video_id:
+ raise ExtractorError('Unable to extract video id')
query['productionnumber'] = video_id
data = self._download_json(
programcard_url, video_id, 'Downloading video JSON', query=query)
+ supplementary_data = {}
+ if re.search(r'_\d+$', raw_video_id):
+ supplementary_data = self._download_json(
+ SERIES_API % f'/episode/{raw_video_id}', raw_video_id, fatal=False) or {}
+
title = str_or_none(data.get('Title')) or re.sub(
r'\s*\|\s*(?:TV\s*\|\s*DR|DRTV)$', '',
self._og_search_title(webpage))
@@ -262,10 +288,11 @@ class DRTVIE(InfoExtractor):
f['vcodec'] = 'none'
formats.extend(f4m_formats)
elif target == 'HLS':
- formats.extend(self._extract_m3u8_formats(
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(
uri, video_id, 'mp4', entry_protocol='m3u8_native',
- quality=preference, m3u8_id=format_id,
- fatal=False))
+ quality=preference, m3u8_id=format_id, fatal=False)
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=subtitles)
else:
bitrate = link.get('Bitrate')
if bitrate:
@@ -313,8 +340,8 @@ class DRTVIE(InfoExtractor):
'season': str_or_none(data.get('SeasonTitle')),
'season_number': int_or_none(data.get('SeasonNumber')),
'season_id': str_or_none(data.get('SeasonUrn')),
- 'episode': str_or_none(data.get('EpisodeTitle')),
- 'episode_number': int_or_none(data.get('EpisodeNumber')),
+ 'episode': traverse_obj(supplementary_data, ('entries', 0, 'item', 'contextualTitle')) or str_or_none(data.get('EpisodeTitle')),
+ 'episode_number': traverse_obj(supplementary_data, ('entries', 0, 'item', 'episodeNumber')) or int_or_none(data.get('EpisodeNumber')),
'release_year': int_or_none(data.get('ProductionYear')),
}
@@ -372,3 +399,92 @@ class DRTVLiveIE(InfoExtractor):
'formats': formats,
'is_live': True,
}
+
+
+class DRTVSeasonIE(InfoExtractor):
+ IE_NAME = 'drtv:season'
+ _VALID_URL = r'https?://(?:www\.)?(?:dr\.dk|dr-massive\.com)/drtv/saeson/(?P<display_id>[\w-]+)_(?P<id>\d+)'
+ _GEO_COUNTRIES = ['DK']
+ _TESTS = [{
+ 'url': 'https://www.dr.dk/drtv/saeson/frank-and-kastaniegaarden_9008',
+ 'info_dict': {
+ 'id': '9008',
+ 'display_id': 'frank-and-kastaniegaarden',
+ 'title': 'Frank & Kastaniegaarden',
+ 'series': 'Frank & Kastaniegaarden',
+ },
+ 'playlist_mincount': 8
+ }, {
+ 'url': 'https://www.dr.dk/drtv/saeson/frank-and-kastaniegaarden_8761',
+ 'info_dict': {
+ 'id': '8761',
+ 'display_id': 'frank-and-kastaniegaarden',
+ 'title': 'Frank & Kastaniegaarden',
+ 'series': 'Frank & Kastaniegaarden',
+ },
+ 'playlist_mincount': 19
+ }]
+
+ def _real_extract(self, url):
+ display_id, season_id = self._match_valid_url(url).group('display_id', 'id')
+ data = self._download_json(SERIES_API % f'/saeson/{display_id}_{season_id}', display_id)
+
+ entries = [{
+ '_type': 'url',
+ 'url': f'https://www.dr.dk/drtv{episode["path"]}',
+ 'ie_key': DRTVIE.ie_key(),
+ 'title': episode.get('title'),
+ 'episode': episode.get('episodeName'),
+ 'description': episode.get('shortDescription'),
+ 'series': traverse_obj(data, ('entries', 0, 'item', 'title')),
+ 'season_number': traverse_obj(data, ('entries', 0, 'item', 'seasonNumber')),
+ 'episode_number': episode.get('episodeNumber'),
+ } for episode in traverse_obj(data, ('entries', 0, 'item', 'episodes', 'items'))]
+
+ return {
+ '_type': 'playlist',
+ 'id': season_id,
+ 'display_id': display_id,
+ 'title': traverse_obj(data, ('entries', 0, 'item', 'title')),
+ 'series': traverse_obj(data, ('entries', 0, 'item', 'title')),
+ 'entries': entries,
+ 'season_number': traverse_obj(data, ('entries', 0, 'item', 'seasonNumber'))
+ }
+
+
+class DRTVSeriesIE(InfoExtractor):
+ IE_NAME = 'drtv:series'
+ _VALID_URL = r'https?://(?:www\.)?(?:dr\.dk|dr-massive\.com)/drtv/serie/(?P<display_id>[\w-]+)_(?P<id>\d+)'
+ _GEO_COUNTRIES = ['DK']
+ _TESTS = [{
+ 'url': 'https://www.dr.dk/drtv/serie/frank-and-kastaniegaarden_6954',
+ 'info_dict': {
+ 'id': '6954',
+ 'display_id': 'frank-and-kastaniegaarden',
+ 'title': 'Frank & Kastaniegaarden',
+ 'series': 'Frank & Kastaniegaarden',
+ },
+ 'playlist_mincount': 15
+ }]
+
+ def _real_extract(self, url):
+ display_id, series_id = self._match_valid_url(url).group('display_id', 'id')
+ data = self._download_json(SERIES_API % f'/serie/{display_id}_{series_id}', display_id)
+
+ entries = [{
+ '_type': 'url',
+ 'url': f'https://www.dr.dk/drtv{season.get("path")}',
+ 'ie_key': DRTVSeasonIE.ie_key(),
+ 'title': season.get('title'),
+ 'series': traverse_obj(data, ('entries', 0, 'item', 'title')),
+ 'season_number': traverse_obj(data, ('entries', 0, 'item', 'seasonNumber'))
+ } for season in traverse_obj(data, ('entries', 0, 'item', 'show', 'seasons', 'items'))]
+
+ return {
+ '_type': 'playlist',
+ 'id': series_id,
+ 'display_id': display_id,
+ 'title': traverse_obj(data, ('entries', 0, 'item', 'title')),
+ 'series': traverse_obj(data, ('entries', 0, 'item', 'title')),
+ 'entries': entries
+ }
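
The episode supplement, season and series lookups above all hit the same massive-CDN page API, differing only in the site path interpolated into SERIES_API. A small sketch of the expansion, reusing the template from the diff and ids from the tests:

SERIES_API = ('https://production-cdn.dr-massive.com/api/page?device=web_browser'
              '&item_detail_expand=all&lang=da&max_list_prefetch=3&path=%s')

def season_api_url(display_id, season_id):
    return SERIES_API % f'/saeson/{display_id}_{season_id}'

print(season_api_url('frank-and-kastaniegaarden', '9008'))
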
diff --git a/hypervideo_dl/extractor/dumpert.py b/hypervideo_dl/extractor/dumpert.py
index 010c2d0..0cf8426 100644
--- a/hypervideo_dl/extractor/dumpert.py
+++ b/hypervideo_dl/extractor/dumpert.py
@@ -1,12 +1,17 @@
from .common import InfoExtractor
from ..utils import (
+ determine_ext,
int_or_none,
qualities,
)
class DumpertIE(InfoExtractor):
- _VALID_URL = r'(?P<protocol>https?)://(?:(?:www|legacy)\.)?dumpert\.nl/(?:mediabase|embed|item)/(?P<id>[0-9]+[/_][0-9a-zA-Z]+)'
+ _VALID_URL = r'''(?x)
+ (?P<protocol>https?)://(?:(?:www|legacy)\.)?dumpert\.nl(?:
+ /(?:mediabase|embed|item)/|
+ (?:/toppers|/latest|/?)\?selectedId=
+ )(?P<id>[0-9]+[/_][0-9a-zA-Z]+)'''
_TESTS = [{
'url': 'https://www.dumpert.nl/item/6646981_951bc60f',
'md5': '1b9318d7d5054e7dcb9dc7654f21d643',
@@ -16,6 +21,9 @@ class DumpertIE(InfoExtractor):
'title': 'Ik heb nieuws voor je',
'description': 'Niet schrikken hoor',
'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 9,
+ 'view_count': int,
+ 'like_count': int,
}
}, {
'url': 'https://www.dumpert.nl/embed/6675421_dc440fe7',
@@ -26,6 +34,28 @@ class DumpertIE(InfoExtractor):
}, {
'url': 'http://legacy.dumpert.nl/embed/6675421/dc440fe7',
'only_matching': True,
+ }, {
+ 'url': 'https://www.dumpert.nl/item/100031688_b317a185',
+ 'info_dict': {
+ 'id': '100031688/b317a185',
+ 'ext': 'mp4',
+ 'title': 'Epic schijnbeweging',
+ 'description': '<p>Die zag je niet eh</p>',
+ 'thumbnail': r're:^https?://.*\.(?:jpg|png)$',
+ 'duration': 12,
+ 'view_count': int,
+ 'like_count': int,
+ },
+ 'params': {'skip_download': 'm3u8'}
+ }, {
+ 'url': 'https://www.dumpert.nl/toppers?selectedId=100031688_b317a185',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.dumpert.nl/latest?selectedId=100031688_b317a185',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.dumpert.nl/?selectedId=100031688_b317a185',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -36,18 +66,23 @@ class DumpertIE(InfoExtractor):
title = item['title']
media = next(m for m in item['media'] if m.get('mediatype') == 'VIDEO')
- quality = qualities(['flv', 'mobile', 'tablet', '720p'])
+ quality = qualities(['flv', 'mobile', 'tablet', '720p', '1080p'])
formats = []
for variant in media.get('variants', []):
uri = variant.get('uri')
if not uri:
continue
version = variant.get('version')
- formats.append({
- 'url': uri,
- 'format_id': version,
- 'quality': quality(version),
- })
+ preference = quality(version)
+ if determine_ext(uri) == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ uri, video_id, 'mp4', m3u8_id=version, quality=preference))
+ else:
+ formats.append({
+ 'url': uri,
+ 'format_id': version,
+ 'quality': preference,
+ })
thumbnails = []
stills = item.get('stills') or {}
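
Dumpert extends the quality ladder with '1080p' and routes m3u8 variants through the HLS helper; the ladder itself ranks by list position. A simplified stand-in for the qualities() helper, assuming the usual index-based behaviour:

def qualities(order):
    # Earlier entries rank lower; unknown versions rank lowest of all.
    def q(version):
        return order.index(version) if version in order else -1
    return q

quality = qualities(['flv', 'mobile', 'tablet', '720p', '1080p'])
assert quality('1080p') > quality('720p') > quality('flv')
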
diff --git a/hypervideo_dl/extractor/eagleplatform.py b/hypervideo_dl/extractor/eagleplatform.py
index 9ebd24d..739d179 100644
--- a/hypervideo_dl/extractor/eagleplatform.py
+++ b/hypervideo_dl/extractor/eagleplatform.py
@@ -2,7 +2,7 @@ import functools
import re
from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
int_or_none,
@@ -111,8 +111,8 @@ class EaglePlatformIE(InfoExtractor):
response = super(EaglePlatformIE, self)._download_json(
url_or_request, video_id, *args, **kwargs)
except ExtractorError as ee:
- if isinstance(ee.cause, compat_HTTPError):
- response = self._parse_json(ee.cause.read().decode('utf-8'), video_id)
+ if isinstance(ee.cause, HTTPError):
+ response = self._parse_json(ee.cause.response.read().decode('utf-8'), video_id)
self._handle_error(response)
raise
return response
diff --git a/hypervideo_dl/extractor/ebay.py b/hypervideo_dl/extractor/ebay.py
new file mode 100644
index 0000000..d0eb9fc
--- /dev/null
+++ b/hypervideo_dl/extractor/ebay.py
@@ -0,0 +1,36 @@
+from .common import InfoExtractor
+from ..utils import remove_end
+
+
+class EbayIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?ebay\.com/itm/(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://www.ebay.com/itm/194509326719',
+ 'info_dict': {
+ 'id': '194509326719',
+ 'ext': 'mp4',
+ 'title': 'WiFi internal antenna adhesive for wifi 2.4GHz wifi 5 wifi 6 wifi 6E full bands',
+ },
+ 'params': {'skip_download': 'm3u8'}
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ video_json = self._search_json(r'"video":', webpage, 'video json', video_id)
+
+ formats = []
+ for key, url in video_json['playlistMap'].items():
+ if key == 'HLS':
+ formats.extend(self._extract_m3u8_formats(url, video_id, fatal=False))
+ elif key == 'DASH':
+ formats.extend(self._extract_mpd_formats(url, video_id, fatal=False))
+ else:
+ self.report_warning(f'Unsupported format {key}', video_id)
+
+ return {
+ 'id': video_id,
+ 'title': remove_end(self._html_extract_title(webpage), ' | eBay'),
+ 'formats': formats
+ }
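
The new eBay extractor reads a playlistMap keyed by streaming protocol and warns on anything it cannot handle. The shape of that dispatch, with plain dicts standing in for the real manifest helpers:

def collect_formats(playlist_map, warn=print):
    formats = []
    for key, url in playlist_map.items():
        if key == 'HLS':
            formats.append({'format_id': 'hls', 'url': url})
        elif key == 'DASH':
            formats.append({'format_id': 'dash', 'url': url})
        else:
            warn(f'Unsupported format {key}')
    return formats

collect_formats({'HLS': 'https://example.test/master.m3u8', 'FLASH': 'https://example.test/clip.flv'})
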
diff --git a/hypervideo_dl/extractor/eitb.py b/hypervideo_dl/extractor/eitb.py
index bd027da..66afbb6 100644
--- a/hypervideo_dl/extractor/eitb.py
+++ b/hypervideo_dl/extractor/eitb.py
@@ -1,10 +1,6 @@
from .common import InfoExtractor
-from ..utils import (
- float_or_none,
- int_or_none,
- parse_iso8601,
- sanitized_Request,
-)
+from ..networking import Request
+from ..utils import float_or_none, int_or_none, parse_iso8601
class EitbIE(InfoExtractor):
@@ -54,7 +50,7 @@ class EitbIE(InfoExtractor):
hls_url = media.get('HLS_SURL')
if hls_url:
- request = sanitized_Request(
+ request = Request(
'http://mam.eitb.eus/mam/REST/ServiceMultiweb/DomainRestrictedSecurity/TokenAuth/',
headers={'Referer': url})
token_data = self._download_json(
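
eitb is the first of several files here (fc2 and facebook follow) to replace sanitized_Request with the new networking Request class: headers travel in the constructor or through a plain mutable `.headers` mapping rather than via add_header(). A minimal stand-in showing the API shape the diff relies on, not the real implementation:

class Request:
    def __init__(self, url, data=None, headers=None):
        self.url = url
        self.data = data
        self.headers = dict(headers or {})

req = Request('https://example.test/token-auth', headers={'Referer': 'https://example.test/page'})
req.headers['Content-Type'] = 'application/x-www-form-urlencoded'
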
diff --git a/hypervideo_dl/extractor/elevensports.py b/hypervideo_dl/extractor/elevensports.py
new file mode 100644
index 0000000..99c52b3
--- /dev/null
+++ b/hypervideo_dl/extractor/elevensports.py
@@ -0,0 +1,59 @@
+from .common import InfoExtractor
+from ..utils import (
+ parse_iso8601,
+ traverse_obj,
+ url_or_none,
+)
+
+
+class ElevenSportsIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?elevensports\.com/view/event/(?P<id>\w+)'
+ _TESTS = [{
+ 'url': 'https://elevensports.com/view/event/clf46yr3kenn80jgrqsjmwefk',
+ 'md5': 'c0958d9ff90e4503a75544358758921d',
+ 'info_dict': {
+ 'id': 'clf46yr3kenn80jgrqsjmwefk',
+ 'title': 'Cleveland SC vs Lionsbridge FC',
+ 'ext': 'mp4',
+ 'description': 'md5:03b5238d6549f4ea1fddadf69b5e0b58',
+ 'upload_date': '20230323',
+ 'timestamp': 1679612400,
+ 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+ },
+ 'params': {'skip_download': 'm3u8'}
+ }, {
+ 'url': 'https://elevensports.com/view/event/clhpyd53b06160jez74qhgkmf',
+ 'md5': 'c0958d9ff90e4503a75544358758921d',
+ 'info_dict': {
+ 'id': 'clhpyd53b06160jez74qhgkmf',
+ 'title': 'AJNLF vs ARRAF',
+ 'ext': 'mp4',
+ 'description': 'md5:c8c5e75c78f37c6d15cd6c475e43a8c1',
+ 'upload_date': '20230521',
+ 'timestamp': 1684684800,
+ 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+ },
+ 'params': {'skip_download': 'm3u8'}
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ event_id = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['event']['mclsEventId']
+ event_data = self._download_json(
+ f'https://mcls-api.mycujoo.tv/bff/events/v1beta1/{event_id}', video_id,
+ headers={'Authorization': 'Bearer FBVKACGN37JQC5SFA0OVK8KKSIOP153G'})
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(
+ event_data['streams'][0]['full_url'], video_id, 'mp4', m3u8_id='hls')
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ **traverse_obj(event_data, {
+ 'title': ('title', {str}),
+ 'description': ('description', {str}),
+ 'timestamp': ('start_time', {parse_iso8601}),
+ 'thumbnail': ('thumbnail_url', {url_or_none}),
+ }),
+ }
diff --git a/hypervideo_dl/extractor/embedly.py b/hypervideo_dl/extractor/embedly.py
index 483d018..458aaa0 100644
--- a/hypervideo_dl/extractor/embedly.py
+++ b/hypervideo_dl/extractor/embedly.py
@@ -1,24 +1,109 @@
import re
import urllib.parse
+
from .common import InfoExtractor
-from ..compat import compat_urllib_parse_unquote
+from .youtube import YoutubeTabIE
+from ..utils import parse_qs, smuggle_url, traverse_obj
class EmbedlyIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www|cdn\.)?embedly\.com/widgets/media\.html\?(?:[^#]*?&)?url=(?P<id>[^#&]+)'
+ _VALID_URL = r'https?://(?:www|cdn\.)?embedly\.com/widgets/media\.html\?(?:[^#]*?&)?(?:src|url)=(?:[^#&]+)'
_TESTS = [{
'url': 'https://cdn.embedly.com/widgets/media.html?src=http%3A%2F%2Fwww.youtube.com%2Fembed%2Fvideoseries%3Flist%3DUUGLim4T2loE5rwCMdpCIPVg&url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DSU4fj_aEMVw%26list%3DUUGLim4T2loE5rwCMdpCIPVg&image=http%3A%2F%2Fi.ytimg.com%2Fvi%2FSU4fj_aEMVw%2Fhqdefault.jpg&key=8ee8a2e6a8cc47aab1a5ee67f9a178e0&type=text%2Fhtml&schema=youtube&autoplay=1',
+ 'info_dict': {
+ 'id': 'UUGLim4T2loE5rwCMdpCIPVg',
+ 'modified_date': '20221225',
+ 'view_count': int,
+ 'uploader_url': 'https://www.youtube.com/@TraciHinesMusic',
+ 'channel_id': 'UCGLim4T2loE5rwCMdpCIPVg',
+ 'uploader': 'TraciJHines',
+ 'channel_url': 'https://www.youtube.com/@TraciHinesMusic',
+ 'channel': 'TraciJHines',
+ 'availability': 'public',
+ 'uploader_id': 'UCGLim4T2loE5rwCMdpCIPVg',
+ 'description': '',
+ 'tags': [],
+ 'title': 'Uploads from TraciJHines',
+ },
+ 'playlist_mincount': 10,
+ }, {
+ 'url': 'https://cdn.embedly.com/widgets/media.html?src=http%3A%2F%2Fwww.youtube.com%2Fembed%2Fvideoseries%3Flist%3DUUGLim4T2loE5rwCMdpCIPVg&url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DSU4fj_aEMVw%26list%3DUUGLim4T2loE5rwCMdpCIPVg&image=http%3A%2F%2Fi.ytimg.com%2Fvi%2FSU4fj_aEMVw%2Fhqdefault.jpg&key=8ee8a2e6a8cc47aab1a5ee67f9a178e0&type=text%2Fhtml&schema=youtube&autoplay=1',
+ 'params': {'noplaylist': True},
+ 'info_dict': {
+ 'id': 'SU4fj_aEMVw',
+ 'ext': 'mp4',
+ 'title': 'I\'m on Patreon!',
+ 'age_limit': 0,
+ 'categories': ['Entertainment'],
+ 'thumbnail': 'https://i.ytimg.com/vi_webp/SU4fj_aEMVw/maxresdefault.webp',
+ 'live_status': 'not_live',
+ 'playable_in_embed': True,
+ 'channel': 'TraciJHines',
+ 'uploader_id': 'TraciJHines',
+ 'channel_url': 'https://www.youtube.com/channel/UCGLim4T2loE5rwCMdpCIPVg',
+ 'uploader_url': 'http://www.youtube.com/user/TraciJHines',
+ 'upload_date': '20150211',
+ 'duration': 282,
+ 'availability': 'public',
+ 'channel_follower_count': int,
+ 'tags': 'count:39',
+ 'view_count': int,
+ 'comment_count': int,
+ 'channel_id': 'UCGLim4T2loE5rwCMdpCIPVg',
+ 'like_count': int,
+ 'uploader': 'TraciJHines',
+ 'description': 'md5:8af6425f50bd46fbf29f3db0fc3a8364',
+ 'chapters': list,
+
+ },
+ }, {
+ 'url': 'https://cdn.embedly.com/widgets/media.html?src=https://player.vimeo.com/video/1234567?h=abcdefgh',
'only_matching': True,
}]
+ _WEBPAGE_TESTS = [{
+ 'url': 'http://www.permacultureetc.com/2022/12/comment-greffer-facilement-les-arbres-fruitiers.html',
+ 'info_dict': {
+ 'id': 'pfUK_ADTvgY',
+ 'ext': 'mp4',
+ 'title': 'Comment greffer facilement les arbres fruitiers ? (mois par mois)',
+ 'description': 'md5:d3a876995e522f138aabb48e040bfb4c',
+ 'view_count': int,
+ 'upload_date': '20221210',
+ 'comment_count': int,
+ 'live_status': 'not_live',
+ 'channel_id': 'UCsM4_jihNFYe4CtSkXvDR-Q',
+ 'channel_follower_count': int,
+ 'tags': ['permaculture', 'jardinage', 'dekarz', 'autonomie', 'greffe', 'fruitiers', 'arbres', 'jardin forêt', 'forêt comestible', 'damien'],
+ 'playable_in_embed': True,
+ 'uploader': 'permaculture agroécologie etc...',
+ 'channel': 'permaculture agroécologie etc...',
+ 'thumbnail': 'https://i.ytimg.com/vi/pfUK_ADTvgY/sddefault.jpg',
+ 'duration': 1526,
+ 'channel_url': 'https://www.youtube.com/channel/UCsM4_jihNFYe4CtSkXvDR-Q',
+ 'age_limit': 0,
+ 'uploader_id': 'permacultureetc',
+ 'like_count': int,
+ 'uploader_url': 'http://www.youtube.com/user/permacultureetc',
+ 'categories': ['Education'],
+ 'availability': 'public',
+ },
+ }]
+
@classmethod
- def _extract_embed_urls(cls, url, webpage):
- # Bypass suitable check
+ def _extract_from_webpage(cls, url, webpage):
+ # Bypass "ie=cls" and suitable check
for mobj in re.finditer(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage):
- yield mobj.group('url')
+ yield cls.url_result(mobj.group('url'))
for mobj in re.finditer(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage):
- yield urllib.parse.unquote(mobj.group('url'))
+ yield cls.url_result(urllib.parse.unquote(mobj.group('url')))
def _real_extract(self, url):
- return self.url_result(compat_urllib_parse_unquote(self._match_id(url)))
+ qs = parse_qs(url)
+ src = urllib.parse.unquote(traverse_obj(qs, ('url', 0)) or '')
+ if src and YoutubeTabIE.suitable(src):
+ return self.url_result(src, YoutubeTabIE)
+ return self.url_result(smuggle_url(
+ urllib.parse.unquote(traverse_obj(qs, ('src', 0), ('url', 0))),
+ {'http_headers': {'Referer': url}}))
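
Embedly now resolves the real target from the widget's own query string: YouTube playlist URLs are routed to the tab extractor, everything else is smuggled along with a Referer header. A rough standalone sketch of the routing, with a crude stand-in for YoutubeTabIE.suitable():

import urllib.parse

def pick_target(widget_url):
    qs = urllib.parse.parse_qs(urllib.parse.urlparse(widget_url).query)
    src = urllib.parse.unquote((qs.get('url') or [''])[0])
    if src and 'list=' in src:  # crude substitute for YoutubeTabIE.suitable()
        return 'youtube:tab', src
    target = (qs.get('src') or qs.get('url') or [''])[0]
    return 'smuggled', urllib.parse.unquote(target)
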
diff --git a/hypervideo_dl/extractor/eporner.py b/hypervideo_dl/extractor/eporner.py
index a233797..aee2dee 100644
--- a/hypervideo_dl/extractor/eporner.py
+++ b/hypervideo_dl/extractor/eporner.py
@@ -52,7 +52,7 @@ class EpornerIE(InfoExtractor):
webpage, urlh = self._download_webpage_handle(url, display_id)
- video_id = self._match_id(urlh.geturl())
+ video_id = self._match_id(urlh.url)
hash = self._search_regex(
r'hash\s*[:=]\s*["\']([\da-f]{32})', webpage, 'hash')
diff --git a/hypervideo_dl/extractor/espn.py b/hypervideo_dl/extractor/espn.py
index f4b0134..7ed824c 100644
--- a/hypervideo_dl/extractor/espn.py
+++ b/hypervideo_dl/extractor/espn.py
@@ -240,7 +240,7 @@ class FiveThirtyEightIE(InfoExtractor):
class ESPNCricInfoIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?espncricinfo\.com/video/[^#$&?/]+-(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www\.)?espncricinfo\.com/(?:cricket-)?videos?/[^#$&?/]+-(?P<id>\d+)'
_TESTS = [{
'url': 'https://www.espncricinfo.com/video/finch-chasing-comes-with-risks-despite-world-cup-trend-1289135',
'info_dict': {
@@ -252,6 +252,17 @@ class ESPNCricInfoIE(InfoExtractor):
'duration': 96,
},
'params': {'skip_download': True}
+ }, {
+ 'url': 'https://www.espncricinfo.com/cricket-videos/daryl-mitchell-mitchell-santner-is-one-of-the-best-white-ball-spinners-india-vs-new-zealand-1356225',
+ 'info_dict': {
+ 'id': '1356225',
+ 'ext': 'mp4',
+ 'description': '"Santner has done it for a long time for New Zealand - we\'re lucky to have him"',
+ 'upload_date': '20230128',
+ 'title': 'Mitchell: \'Santner is one of the best white-ball spinners at the moment\'',
+ 'duration': 87,
+ },
+ 'params': {'skip_download': 'm3u8'},
}]
def _real_extract(self, url):
diff --git a/hypervideo_dl/extractor/ettutv.py b/hypervideo_dl/extractor/ettutv.py
new file mode 100644
index 0000000..133b525
--- /dev/null
+++ b/hypervideo_dl/extractor/ettutv.py
@@ -0,0 +1,60 @@
+from .common import InfoExtractor
+from ..utils import bool_or_none, traverse_obj, unified_timestamp, url_or_none
+
+
+class EttuTvIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?ettu\.tv/[^?#]+/playerpage/(?P<id>[0-9]+)'
+
+ _TESTS = [{
+ 'url': 'https://www.ettu.tv/en-int/playerpage/1573849',
+ 'md5': '5874b7639a2aa866d1f6c3a4037c7c09',
+ 'info_dict': {
+ 'id': '1573849',
+ 'title': 'Ni Xia Lian - Shao Jieni',
+ 'description': 'ITTF Europe Top 16 Cup',
+ 'timestamp': 1677348600,
+ 'upload_date': '20230225',
+ 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+ 'ext': 'mp4',
+ },
+ }, {
+ 'url': 'https://www.ettu.tv/en-int/playerpage/1573753',
+ 'md5': '1fc094bf96cf2d5ec0f434d3a6dec9aa',
+ 'info_dict': {
+ 'id': '1573753',
+ 'title': 'Qiu Dang - Jorgic Darko',
+ 'description': 'ITTF Europe Top 16 Cup',
+ 'timestamp': 1677423600,
+ 'upload_date': '20230226',
+ 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+ 'ext': 'mp4',
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ player_settings = self._download_json(
+ f'https://www.ettu.tv/api/v3/contents/{video_id}/player-settings', video_id, query={
+ 'language': 'en',
+ 'showTitle': 'true',
+ 'device': 'desktop',
+ })
+
+ stream_response = self._download_json(player_settings['streamAccess'], video_id, data=b'')
+
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(
+ stream_response['data']['stream'], video_id, 'mp4')
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ **traverse_obj(player_settings, {
+ 'title': 'title',
+ 'description': ('metaInformation', 'competition'),
+ 'thumbnail': ('image', {url_or_none}),
+ 'timestamp': ('date', {unified_timestamp}),
+ 'is_live': ('isLivestream', {bool_or_none}),
+ })
+ }
diff --git a/hypervideo_dl/extractor/europa.py b/hypervideo_dl/extractor/europa.py
index c2b4937..f3da95f 100644
--- a/hypervideo_dl/extractor/europa.py
+++ b/hypervideo_dl/extractor/europa.py
@@ -3,8 +3,10 @@ from ..utils import (
int_or_none,
orderedSet,
parse_duration,
+ parse_iso8601,
parse_qs,
qualities,
+ traverse_obj,
unified_strdate,
xpath_text
)
@@ -87,3 +89,85 @@ class EuropaIE(InfoExtractor):
'view_count': view_count,
'formats': formats
}
+
+
+class EuroParlWebstreamIE(InfoExtractor):
+ _VALID_URL = r'''(?x)
+ https?://multimedia\.europarl\.europa\.eu/[^/#?]+/
+ (?:(?!video)[^/#?]+/[\w-]+_)(?P<id>[\w-]+)
+ '''
+ _TESTS = [{
+ 'url': 'https://multimedia.europarl.europa.eu/pl/webstreaming/plenary-session_20220914-0900-PLENARY',
+ 'info_dict': {
+ 'id': '62388b15-d85b-4add-99aa-ba12ccf64f0d',
+ 'ext': 'mp4',
+ 'title': 'Plenary session',
+ 'release_timestamp': 1663139069,
+ 'release_date': '20220914',
+ },
+ 'params': {
+ 'skip_download': True,
+ }
+ }, {
+ # live webstream
+ 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/euroscola_20221115-1000-SPECIAL-EUROSCOLA',
+ 'info_dict': {
+ 'ext': 'mp4',
+ 'id': '510eda7f-ba72-161b-7ee7-0e836cd2e715',
+ 'release_timestamp': 1668502800,
+ 'title': 'Euroscola 2022-11-15 19:21',
+ 'release_date': '20221115',
+ 'live_status': 'is_live',
+ },
+ 'skip': 'not live anymore'
+ }, {
+ 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/committee-on-culture-and-education_20230301-1130-COMMITTEE-CULT',
+ 'info_dict': {
+ 'id': '7355662c-8eac-445e-4bb9-08db14b0ddd7',
+ 'ext': 'mp4',
+ 'release_date': '20230301',
+ 'title': 'Committee on Culture and Education',
+ 'release_timestamp': 1677666641,
+ }
+ }, {
+ # live stream
+ 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/committee-on-environment-public-health-and-food-safety_20230524-0900-COMMITTEE-ENVI',
+ 'info_dict': {
+ 'id': 'e4255f56-10aa-4b3c-6530-08db56d5b0d9',
+ 'ext': 'mp4',
+ 'release_date': '20230524',
+ 'title': r're:Committee on Environment, Public Health and Food Safety \d{4}-\d{2}-\d{2}\s\d{2}:\d{2}',
+ 'release_timestamp': 1684911541,
+ 'live_status': 'is_live',
+ },
+ 'skip': 'Not live anymore'
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+
+ webpage_nextjs = self._search_nextjs_data(webpage, display_id)['props']['pageProps']
+
+ json_info = self._download_json(
+ 'https://acs-api.europarl.connectedviews.eu/api/FullMeeting', display_id,
+ query={
+ 'api-version': 1.0,
+ 'tenantId': 'bae646ca-1fc8-4363-80ba-2c04f06b4968',
+ 'externalReference': display_id
+ })
+
+ formats, subtitles = [], {}
+ for hls_url in traverse_obj(json_info, ((('meetingVideo'), ('meetingVideos', ...)), 'hlsUrl')):
+ fmt, subs = self._extract_m3u8_formats_and_subtitles(hls_url, display_id)
+ formats.extend(fmt)
+ self._merge_subtitles(subs, target=subtitles)
+
+ return {
+ 'id': json_info['id'],
+ 'title': traverse_obj(webpage_nextjs, (('mediaItem', 'title'), ('title', )), get_all=False),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'release_timestamp': parse_iso8601(json_info.get('startDateTime')),
+ 'is_live': traverse_obj(webpage_nextjs, ('mediaItem', 'mediaSubType')) == 'Live'
+ }
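
The traverse_obj path ((('meetingVideo'), ('meetingVideos', ...)), 'hlsUrl') walks two alternative branches: a single video object or a list of them. The same logic spelled out over plain dicts, as a sketch:

def iter_hls_urls(json_info):
    single = json_info.get('meetingVideo')
    candidates = ([single] if single else []) + list(json_info.get('meetingVideos') or [])
    for video in candidates:
        url = video.get('hlsUrl')
        if url:
            yield url

list(iter_hls_urls({'meetingVideos': [{'hlsUrl': 'https://example.test/a.m3u8'}]}))
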
diff --git a/hypervideo_dl/extractor/eurosport.py b/hypervideo_dl/extractor/eurosport.py
index 654e112..6c426bb 100644
--- a/hypervideo_dl/extractor/eurosport.py
+++ b/hypervideo_dl/extractor/eurosport.py
@@ -3,7 +3,7 @@ from ..utils import traverse_obj
class EurosportIE(InfoExtractor):
- _VALID_URL = r'https?://www\.eurosport\.com/\w+/[\w-]+/\d+/[\w-]+_(?P<id>vid\d+)'
+ _VALID_URL = r'https?://www\.eurosport\.com/\w+/(?:[\w-]+/[\d-]+/)?[\w-]+_(?P<id>vid\d+)'
_TESTS = [{
'url': 'https://www.eurosport.com/tennis/roland-garros/2022/highlights-rafael-nadal-brushes-aside-caper-ruud-to-win-record-extending-14th-french-open-title_vid1694147/video.shtml',
'info_dict': {
@@ -44,6 +44,32 @@ class EurosportIE(InfoExtractor):
'description': 'md5:32bbe3a773ac132c57fb1e8cca4b7c71',
'upload_date': '20220727',
}
+ }, {
+ 'url': 'https://www.eurosport.com/football/champions-league/2022-2023/pep-guardiola-emotionally-destroyed-after-manchester-city-win-over-bayern-munich-in-champions-league_vid1896254/video.shtml',
+ 'info_dict': {
+ 'id': '3096477',
+ 'ext': 'mp4',
+ 'title': 'md5:82edc17370124c7a19b3cf518517583b',
+ 'duration': 84.0,
+ 'description': 'md5:b3f44ef7f5b5b95b24a273b163083feb',
+ 'thumbnail': 'https://imgresizer.eurosport.com/unsafe/1280x960/smart/filters:format(jpeg)/origin-imgresizer.eurosport.com/2023/04/12/3682873-74947393-2560-1440.jpg',
+ 'timestamp': 1681292028,
+ 'upload_date': '20230412',
+ 'display_id': 'vid1896254',
+ }
+ }, {
+ 'url': 'https://www.eurosport.com/football/last-year-s-semi-final-pain-was-still-there-pep-guardiola-after-man-city-reach-cl-final_vid1914115/video.shtml',
+ 'info_dict': {
+ 'id': '3149108',
+ 'ext': 'mp4',
+ 'title': '\'Last year\'s semi-final pain was still there\' - Pep Guardiola after Man City reach CL final',
+ 'description': 'md5:89ef142fe0170a66abab77fac2955d8e',
+ 'display_id': 'vid1914115',
+ 'timestamp': 1684403618,
+ 'thumbnail': 'https://imgresizer.eurosport.com/unsafe/1280x960/smart/filters:format(jpeg)/origin-imgresizer.eurosport.com/2023/05/18/3707254-75435008-2560-1440.jpg',
+ 'duration': 105.0,
+ 'upload_date': '20230518',
+ }
}]
_TOKEN = None
diff --git a/hypervideo_dl/extractor/extractors.py b/hypervideo_dl/extractor/extractors.py
index 610e02f..baa69d2 100644
--- a/hypervideo_dl/extractor/extractors.py
+++ b/hypervideo_dl/extractor/extractors.py
@@ -1,10 +1,10 @@
import contextlib
import os
-from ..utils import load_plugins
+from ..plugins import load_plugins
# NB: Must be before other imports so that plugins can be correctly injected
-_PLUGIN_CLASSES = load_plugins('extractor', 'IE', {})
+_PLUGIN_CLASSES = load_plugins('extractor', 'IE')
_LAZY_LOADER = False
if not os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'):
@@ -24,3 +24,5 @@ if not _LAZY_LOADER:
globals().update(_PLUGIN_CLASSES)
_ALL_CLASSES[:0] = _PLUGIN_CLASSES.values()
+
+from .common import _PLUGIN_OVERRIDES # noqa: F401
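
load_plugins now comes from the dedicated plugins module, and the splice at index 0 is what gives plugin extractors priority: extractors are tried in list order, so earlier entries claim URLs first. A toy illustration with strings in place of classes:

_ALL_CLASSES = ['YoutubeIE', 'GenericIE']
_PLUGIN_CLASSES = {'MyOverrideIE': 'MyOverrideIE'}
_ALL_CLASSES[:0] = _PLUGIN_CLASSES.values()
assert _ALL_CLASSES[0] == 'MyOverrideIE'
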
diff --git a/hypervideo_dl/extractor/facebook.py b/hypervideo_dl/extractor/facebook.py
index a58d9c8..021c3cf 100644
--- a/hypervideo_dl/extractor/facebook.py
+++ b/hypervideo_dl/extractor/facebook.py
@@ -8,6 +8,8 @@ from ..compat import (
compat_str,
compat_urllib_parse_unquote,
)
+from ..networking import Request
+from ..networking.exceptions import network_exceptions
from ..utils import (
ExtractorError,
clean_html,
@@ -19,11 +21,10 @@ from ..utils import (
int_or_none,
js_to_json,
merge_dicts,
- network_exceptions,
parse_count,
parse_qs,
qualities,
- sanitized_Request,
+ str_or_none,
traverse_obj,
try_get,
url_or_none,
@@ -90,16 +91,16 @@ class FacebookIE(InfoExtractor):
'info_dict': {
'id': '274175099429670',
'ext': 'mp4',
- 'title': 'Asif Nawab Butt',
- 'description': 'Asif Nawab Butt',
+ 'title': 'Asif',
+ 'description': '',
'uploader': 'Asif Nawab Butt',
'upload_date': '20140506',
'timestamp': 1399398998,
'thumbnail': r're:^https?://.*',
+ 'uploader_id': 'pfbid04scW44U4P9iTyLZAGy8y8W3pR3i2VugvHCimiRudUAVbN3MPp9eXBaYFcgVworZwl',
+ 'duration': 131.03,
+ 'concurrent_view_count': int,
},
- 'expected_warnings': [
- 'title'
- ]
}, {
'note': 'Video with DASH manifest',
'url': 'https://www.facebook.com/video.php?v=957955867617029',
@@ -151,7 +152,7 @@ class FacebookIE(InfoExtractor):
# have 1080P, but only up to 720p in swf params
# data.video.story.attachments[].media
'url': 'https://www.facebook.com/cnn/videos/10155529876156509/',
- 'md5': '3f3798adb2b73423263e59376f1f5eb7',
+ 'md5': 'ca63897a90c9452efee5f8c40d080e25',
'info_dict': {
'id': '10155529876156509',
'ext': 'mp4',
@@ -162,6 +163,9 @@ class FacebookIE(InfoExtractor):
'uploader': 'CNN',
'thumbnail': r're:^https?://.*',
'view_count': int,
+ 'uploader_id': '100059479812265',
+ 'concurrent_view_count': int,
+ 'duration': 44.478,
},
}, {
# bigPipe.onPageletArrive ... onPageletArrive pagelet_group_mall
@@ -170,12 +174,16 @@ class FacebookIE(InfoExtractor):
'info_dict': {
'id': '1417995061575415',
'ext': 'mp4',
- 'title': 'Ukrainian Scientists Worldwide | Довгоочікуване відео',
+ 'title': 'Довгоочікуване відео | By Yaroslav - Facebook',
'description': 'Довгоочікуване відео',
- 'timestamp': 1486648771,
+ 'timestamp': 1486648217,
'upload_date': '20170209',
'uploader': 'Yaroslav Korpan',
- 'uploader_id': '100000948048708',
+ 'uploader_id': 'pfbid029y8j22EwH3ikeqgH3SEP9G3CAi9kmWKgXJJG9s5geV7mo3J2bvURqHCdgucRgAyhl',
+ 'concurrent_view_count': int,
+ 'thumbnail': r're:^https?://.*',
+ 'view_count': int,
+ 'duration': 11736.446,
},
'params': {
'skip_download': True,
@@ -192,9 +200,7 @@ class FacebookIE(InfoExtractor):
'uploader': 'La Guía Del Varón',
'thumbnail': r're:^https?://.*',
},
- 'params': {
- 'skip_download': True,
- },
+ 'skip': 'Requires logging in',
}, {
# data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media
'url': 'https://www.facebook.com/groups/1024490957622648/permalink/1396382447100162/',
@@ -208,9 +214,7 @@ class FacebookIE(InfoExtractor):
'uploader': 'Elisabeth Ahtn',
'uploader_id': '100013949973717',
},
- 'params': {
- 'skip_download': True,
- },
+ 'skip': 'Requires logging in',
}, {
'url': 'https://www.facebook.com/video.php?v=10204634152394104',
'only_matching': True,
@@ -252,7 +256,11 @@ class FacebookIE(InfoExtractor):
'timestamp': 1527084179,
'upload_date': '20180523',
'uploader': 'ESL One Dota 2',
- 'uploader_id': '234218833769558',
+ 'uploader_id': '100066514874195',
+ 'duration': 4524.212,
+ 'view_count': int,
+ 'thumbnail': r're:^https?://.*',
+ 'concurrent_view_count': int,
},
'params': {
'skip_download': True,
@@ -262,8 +270,17 @@ class FacebookIE(InfoExtractor):
'url': 'https://www.facebook.com/100033620354545/videos/106560053808006/',
'info_dict': {
'id': '106560053808006',
+ 'ext': 'mp4',
+ 'title': 'Josef',
+ 'thumbnail': r're:^https?://.*',
+ 'concurrent_view_count': int,
+ 'uploader_id': 'pfbid02gXHbDwxumkaKJQaTGUf3znYfYzTuidGEWawiramNx4YamSj2afwYSRkpcjtHtMRJl',
+ 'timestamp': 1549275572,
+ 'duration': 3.413,
+ 'uploader': 'Josef Novak',
+ 'description': '',
+ 'upload_date': '20190204',
},
- 'playlist_count': 2,
}, {
# data.video.story.attachments[].media
'url': 'https://www.facebook.com/watch/?v=647537299265662',
@@ -276,6 +293,7 @@ class FacebookIE(InfoExtractor):
'id': '10157667649866271',
},
'playlist_count': 3,
+ 'skip': 'Requires logging in',
}, {
# data.nodes[].comet_sections.content.story.attachments[].style_type_renderer.attachment.media
'url': 'https://m.facebook.com/Alliance.Police.Department/posts/4048563708499330',
@@ -319,7 +337,7 @@ class FacebookIE(InfoExtractor):
}
def _perform_login(self, username, password):
- login_page_req = sanitized_Request(self._LOGIN_URL)
+ login_page_req = Request(self._LOGIN_URL)
self._set_cookie('facebook.com', 'locale', 'en_US')
login_page = self._download_webpage(login_page_req, None,
note='Downloading login page',
@@ -340,8 +358,8 @@ class FacebookIE(InfoExtractor):
'timezone': '-60',
'trynum': '1',
}
- request = sanitized_Request(self._LOGIN_URL, urlencode_postdata(login_form))
- request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+ request = Request(self._LOGIN_URL, urlencode_postdata(login_form))
+ request.headers['Content-Type'] = 'application/x-www-form-urlencoded'
try:
login_results = self._download_webpage(request, None,
note='Logging in', errnote='unable to fetch login page')
@@ -367,8 +385,8 @@ class FacebookIE(InfoExtractor):
'h': h,
'name_action_selected': 'dont_save',
}
- check_req = sanitized_Request(self._CHECKPOINT_URL, urlencode_postdata(check_form))
- check_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
+ check_req = Request(self._CHECKPOINT_URL, urlencode_postdata(check_form))
+ check_req.headers['Content-Type'] = 'application/x-www-form-urlencoded'
check_response = self._download_webpage(check_req, None,
note='Confirming login')
if re.search(r'id="checkpointSubmitButton"', check_response) is not None:
@@ -390,7 +408,10 @@ class FacebookIE(InfoExtractor):
k == 'media' and str(v['id']) == video_id and v['__typename'] == 'Video')), expected_type=dict)
title = get_first(media, ('title', 'text'))
description = get_first(media, ('creation_story', 'comet_sections', 'message', 'story', 'message', 'text'))
- uploader_data = get_first(media, 'owner') or get_first(post, ('node', 'actors', ...)) or {}
+ uploader_data = (
+ get_first(media, ('owner', {dict}))
+ or get_first(post, (..., 'video', lambda k, v: k == 'owner' and v['name']))
+ or get_first(post, ('node', 'actors', ..., {dict})) or {})
page_title = title or self._html_search_regex((
r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>(?P<content>[^<]*)</h2>',
@@ -415,16 +436,17 @@ class FacebookIE(InfoExtractor):
# in https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/
if thumbnail and not re.search(r'\.(?:jpg|png)', thumbnail):
thumbnail = None
- view_count = parse_count(self._search_regex(
- r'\bviewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count',
- default=None))
info_dict = {
'description': description,
'uploader': uploader,
'uploader_id': uploader_data.get('id'),
'timestamp': timestamp,
'thumbnail': thumbnail,
- 'view_count': view_count,
+ 'view_count': parse_count(self._search_regex(
+ (r'\bviewCount\s*:\s*["\']([\d,.]+)', r'video_view_count["\']\s*:\s*(\d+)',),
+ webpage, 'view count', default=None)),
+ 'concurrent_view_count': get_first(post, (
+ ('video', (..., ..., 'attachments', ..., 'media')), 'liveViewerCount', {int_or_none})),
}
info_json_ld = self._search_json_ld(webpage, video_id, default={})
@@ -459,7 +481,8 @@ class FacebookIE(InfoExtractor):
dash_manifest = video.get('dash_manifest')
if dash_manifest:
formats.extend(self._parse_mpd_formats(
- compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest))))
+ compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)),
+ mpd_url=video.get('dash_manifest_url')))
def process_formats(info):
# Downloads with browser's User-Agent are rate limited. Working around
@@ -493,6 +516,13 @@ class FacebookIE(InfoExtractor):
entries = []
def parse_graphql_video(video):
+ v_id = video.get('videoId') or video.get('id') or video_id
+ reel_info = traverse_obj(
+ video, ('creation_story', 'short_form_video_context', 'playback_video', {dict}))
+ if reel_info:
+ video = video['creation_story']
+ video['owner'] = traverse_obj(video, ('short_form_video_context', 'video_owner'))
+ video.update(reel_info)
formats = []
q = qualities(['sd', 'hd'])
for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'),
@@ -509,15 +539,15 @@ class FacebookIE(InfoExtractor):
'url': playable_url,
})
extract_dash_manifest(video, formats)
- v_id = video.get('videoId') or video.get('id') or video_id
info = {
'id': v_id,
'formats': formats,
'thumbnail': traverse_obj(
video, ('thumbnailImage', 'uri'), ('preferred_thumbnail', 'image', 'uri')),
- 'uploader_id': try_get(video, lambda x: x['owner']['id']),
- 'timestamp': int_or_none(video.get('publish_time')),
- 'duration': float_or_none(video.get('playable_duration_in_ms'), 1000),
+ 'uploader_id': traverse_obj(video, ('owner', 'id', {str_or_none})),
+ 'timestamp': traverse_obj(video, 'publish_time', 'creation_time', expected_type=int_or_none),
+ 'duration': (float_or_none(video.get('playable_duration_in_ms'), 1000)
+ or float_or_none(video.get('length_in_second'))),
}
process_formats(info)
description = try_get(video, lambda x: x['savable_description']['text'])
@@ -778,18 +808,18 @@ class FacebookReelIE(InfoExtractor):
_TESTS = [{
'url': 'https://www.facebook.com/reel/1195289147628387',
- 'md5': 'c4ff9a7182ff9ff7d6f7a83603bae831',
+ 'md5': 'f13dd37f2633595982db5ed8765474d3',
'info_dict': {
'id': '1195289147628387',
'ext': 'mp4',
- 'title': 'md5:9f5b142921b2dc57004fa13f76005f87',
- 'description': 'md5:24ea7ef062215d295bdde64e778f5474',
- 'uploader': 'Beast Camp Training',
- 'uploader_id': '1738535909799870',
- 'duration': 9.536,
- 'thumbnail': r're:^https?://.*',
+ 'title': 'md5:b05800b5b1ad56c0ca78bd3807b6a61e',
+ 'description': 'md5:22f03309b216ac84720183961441d8db',
+ 'uploader': 'md5:723e6cb3091241160f20b3c5dc282af1',
+ 'uploader_id': '100040874179269',
+ 'duration': 9.579,
+ 'timestamp': 1637502609,
'upload_date': '20211121',
- 'timestamp': 1637502604,
+ 'thumbnail': r're:^https?://.*',
}
}]
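
The Facebook changes teach parse_graphql_video about reels: when a node carries a short_form_video_context, its playback data and owner are promoted onto the creation story before the usual format parsing runs. A sketch of that promotion over plain dicts; the dict shapes are illustrative:

def promote_reel(video):
    reel_info = ((video.get('creation_story') or {})
                 .get('short_form_video_context', {})
                 .get('playback_video'))
    if isinstance(reel_info, dict):
        video = video['creation_story']
        video['owner'] = video['short_form_video_context'].get('video_owner')
        video.update(reel_info)
    return video
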
diff --git a/hypervideo_dl/extractor/fc2.py b/hypervideo_dl/extractor/fc2.py
index dd5e088..ba19b6c 100644
--- a/hypervideo_dl/extractor/fc2.py
+++ b/hypervideo_dl/extractor/fc2.py
@@ -3,11 +3,11 @@ import re
from .common import InfoExtractor
from ..compat import compat_parse_qs
from ..dependencies import websockets
+from ..networking import Request
from ..utils import (
ExtractorError,
WebSocketsWrapper,
js_to_json,
- sanitized_Request,
traverse_obj,
update_url_query,
urlencode_postdata,
@@ -57,7 +57,7 @@ class FC2IE(InfoExtractor):
}
login_data = urlencode_postdata(login_form_strs)
- request = sanitized_Request(
+ request = Request(
'https://secure.id.fc2.com/index.php?mode=login&switch_language=en', login_data)
login_results = self._download_webpage(request, None, note='Logging in', errnote='Unable to log in')
@@ -66,7 +66,7 @@ class FC2IE(InfoExtractor):
return False
# this is also needed
- login_redir = sanitized_Request('http://id.fc2.com/?mode=redirect&login=done')
+ login_redir = Request('http://id.fc2.com/?mode=redirect&login=done')
self._download_webpage(
login_redir, None, note='Login redirect', errnote='Login redirect failed')
diff --git a/hypervideo_dl/extractor/fifa.py b/hypervideo_dl/extractor/fifa.py
index dc00edc..8b4db3a 100644
--- a/hypervideo_dl/extractor/fifa.py
+++ b/hypervideo_dl/extractor/fifa.py
@@ -17,8 +17,10 @@ class FifaIE(InfoExtractor):
'description': 'md5:f4520d0ee80529c8ba4134a7d692ff8b',
'ext': 'mp4',
'categories': ['FIFA Tournaments'],
- 'thumbnail': 'https://digitalhub.fifa.com/transform/fa6f0b3e-a2e9-4cf7-9f32-53c57bcb7360/2006_Final_ITA_FRA',
+ 'thumbnail': 'https://digitalhub.fifa.com/transform/135e2656-3a51-407b-8810-6c34bec5b59b/FMR_2006_Italy_France_Final_Hero',
'duration': 8165,
+ 'release_timestamp': 1152403200,
+ 'release_date': '20060709',
},
'params': {'skip_download': 'm3u8'},
}, {
@@ -54,7 +56,7 @@ class FifaIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
preconnect_link = self._search_regex(
- r'<link[^>]+rel\s*=\s*"preconnect"[^>]+href\s*=\s*"([^"]+)"', webpage, 'Preconnect Link')
+ r'<link\b[^>]+\brel\s*=\s*"preconnect"[^>]+href\s*=\s*"([^"]+)"', webpage, 'Preconnect Link')
video_details = self._download_json(
f'{preconnect_link}/sections/videoDetails/{video_id}', video_id, 'Downloading Video Details', fatal=False)
@@ -62,22 +64,9 @@ class FifaIE(InfoExtractor):
preplay_parameters = self._download_json(
f'{preconnect_link}/videoPlayerData/{video_id}', video_id, 'Downloading Preplay Parameters')['preplayParameters']
- cid = preplay_parameters['contentId']
content_data = self._download_json(
- f'https://content.uplynk.com/preplay/{cid}/multiple.json', video_id, 'Downloading Content Data', query={
- 'v': preplay_parameters['preplayAPIVersion'],
- 'tc': preplay_parameters['tokenCheckAlgorithmVersion'],
- 'rn': preplay_parameters['randomNumber'],
- 'exp': preplay_parameters['tokenExpirationDate'],
- 'ct': preplay_parameters['contentType'],
- 'cid': cid,
- 'mbtracks': preplay_parameters['tracksAssetNumber'],
- 'ad': preplay_parameters['adConfiguration'],
- 'ad.preroll': int(preplay_parameters['adPreroll']),
- 'ad.cmsid': preplay_parameters['adCMSSourceId'],
- 'ad.vid': preplay_parameters['adSourceVideoID'],
- 'sig': preplay_parameters['signature'],
- })
+ 'https://content.uplynk.com/preplay/{contentId}/multiple.json?{queryStr}&sig={signature}'.format(**preplay_parameters),
+ video_id, 'Downloading Content Data')
formats, subtitles = self._extract_m3u8_formats_and_subtitles(content_data['playURL'], video_id)
diff --git a/hypervideo_dl/extractor/filmon.py b/hypervideo_dl/extractor/filmon.py
index 9a93cb9..0cd18f4 100644
--- a/hypervideo_dl/extractor/filmon.py
+++ b/hypervideo_dl/extractor/filmon.py
@@ -1,8 +1,6 @@
from .common import InfoExtractor
-from ..compat import (
- compat_str,
- compat_HTTPError,
-)
+from ..compat import compat_str
+from ..networking.exceptions import HTTPError
from ..utils import (
qualities,
strip_or_none,
@@ -40,8 +38,8 @@ class FilmOnIE(InfoExtractor):
'https://www.filmon.com/api/vod/movie?id=%s' % video_id,
video_id)['response']
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError):
- errmsg = self._parse_json(e.cause.read().decode(), video_id)['reason']
+ if isinstance(e.cause, HTTPError):
+ errmsg = self._parse_json(e.cause.response.read().decode(), video_id)['reason']
raise ExtractorError('%s said: %s' % (self.IE_NAME, errmsg), expected=True)
raise
@@ -124,8 +122,8 @@ class FilmOnChannelIE(InfoExtractor):
channel_data = self._download_json(
'http://www.filmon.com/api-v2/channel/' + channel_id, channel_id)['data']
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError):
- errmsg = self._parse_json(e.cause.read().decode(), channel_id)['message']
+ if isinstance(e.cause, HTTPError):
+ errmsg = self._parse_json(e.cause.response.read().decode(), channel_id)['message']
raise ExtractorError('%s said: %s' % (self.IE_NAME, errmsg), expected=True)
raise
diff --git a/hypervideo_dl/extractor/fox.py b/hypervideo_dl/extractor/fox.py
index 15c0c48..e00e977 100644
--- a/hypervideo_dl/extractor/fox.py
+++ b/hypervideo_dl/extractor/fox.py
@@ -3,10 +3,10 @@ import uuid
from .common import InfoExtractor
from ..compat import (
- compat_HTTPError,
compat_str,
compat_urllib_parse_unquote,
)
+from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
int_or_none,
@@ -20,7 +20,7 @@ from ..utils import (
class FOXIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?fox\.com/watch/(?P<id>[\da-fA-F]+)'
+ _VALID_URL = r'https?://(?:www\.)?fox(?:sports)?\.com/(?:watch|replay)/(?P<id>[\da-fA-F]+)'
_TESTS = [{
# clip
'url': 'https://www.fox.com/watch/4b765a60490325103ea69888fb2bd4e8/',
@@ -50,6 +50,10 @@ class FOXIE(InfoExtractor):
# sports event, geo-restricted
'url': 'https://www.fox.com/watch/b057484dade738d1f373b3e46216fa2c/',
'only_matching': True,
+ }, {
+ # fox sports replay, geo-restricted
+ 'url': 'https://www.foxsports.com/replay/561f3e071347a24e5e877abc56b22e89',
+ 'only_matching': True,
}]
_GEO_BYPASS = False
_HOME_PAGE_URL = 'https://www.fox.com/'
@@ -68,9 +72,9 @@ class FOXIE(InfoExtractor):
'https://api3.fox.com/v2.0/' + path,
video_id, data=data, headers=headers)
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 403:
entitlement_issues = self._parse_json(
- e.cause.read().decode(), video_id)['entitlementIssues']
+ e.cause.response.read().decode(), video_id)['entitlementIssues']
for e in entitlement_issues:
if e.get('errorCode') == 1005:
raise ExtractorError(
@@ -123,8 +127,8 @@ class FOXIE(InfoExtractor):
try:
m3u8_url = self._download_json(release_url, video_id)['playURL']
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
- error = self._parse_json(e.cause.read().decode(), video_id)
+ if isinstance(e.cause, HTTPError) and e.cause.status == 403:
+ error = self._parse_json(e.cause.response.read().decode(), video_id)
if error.get('exception') == 'GeoLocationBlocked':
self.raise_geo_restricted(countries=['US'])
raise ExtractorError(error['description'], expected=True)
diff --git a/hypervideo_dl/extractor/foxnews.py b/hypervideo_dl/extractor/foxnews.py
index 52172aa..6aa6361 100644
--- a/hypervideo_dl/extractor/foxnews.py
+++ b/hypervideo_dl/extractor/foxnews.py
@@ -7,9 +7,38 @@ from .common import InfoExtractor
class FoxNewsIE(AMPIE):
IE_NAME = 'foxnews'
IE_DESC = 'Fox News and Fox Business Video'
- _VALID_URL = r'https?://(?P<host>video\.(?:insider\.)?fox(?:news|business)\.com)/v/(?:video-embed\.html\?video_id=)?(?P<id>\d+)'
+ _VALID_URL = r'https?://video\.(?:insider\.)?fox(?:news|business)\.com/v/(?:video-embed\.html\?video_id=)?(?P<id>\d+)'
_TESTS = [
{
+ 'url': 'https://video.foxnews.com/v/6320653836112',
+ 'info_dict': {
+ 'id': '6320653836112',
+ 'ext': 'mp4',
+ 'title': 'Tucker Carlson joins \'Gutfeld!\' to discuss his new documentary',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 404,
+ 'upload_date': '20230217',
+ 'description': 'md5:858a8a36f59e9ca897d758855bcdfa02',
+ 'timestamp': 1676611344.0,
+ },
+ 'params': {'skip_download': 'm3u8'},
+ },
+ {
+ # From http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words
+ 'url': 'http://video.insider.foxnews.com/v/video-embed.html?video_id=5099377331001&autoplay=true&share_url=http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words&share_title=Student%20Group:%20Saying%20%27Politically%20Correct,%27%20%27Trash%27%20and%20%27Lame%27%20Is%20Offensive&share=true',
+ 'info_dict': {
+ 'id': '5099377331001',
+ 'ext': 'mp4',
+ 'title': '82416_censoring',
+ 'description': '82416_censoring',
+ 'upload_date': '20160826',
+ 'timestamp': 1472169708.0,
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 521,
+ },
+ 'params': {'skip_download': 'm3u8'},
+ },
+ {
'url': 'http://video.foxnews.com/v/3937480/frozen-in-time/#sp=show-clips',
'md5': '32aaded6ba3ef0d1c04e238d01031e5e',
'info_dict': {
@@ -22,6 +51,7 @@ class FoxNewsIE(AMPIE):
'upload_date': '20110503',
'thumbnail': r're:^https?://.*\.jpg$',
},
+ 'skip': '404 page',
},
{
'url': 'http://video.foxnews.com/v/3922535568001/rep-luis-gutierrez-on-if-obamas-immigration-plan-is-legal/#sp=show-clips',
@@ -36,10 +66,7 @@ class FoxNewsIE(AMPIE):
'upload_date': '20141204',
'thumbnail': r're:^https?://.*\.jpg$',
},
- 'params': {
- # m3u8 download
- 'skip_download': True,
- },
+ 'skip': 'm3u8 HTTP error 400 in web browser',
},
{
'url': 'http://video.foxnews.com/v/video-embed.html?video_id=3937480&d=video.foxnews.com',
@@ -49,11 +76,6 @@ class FoxNewsIE(AMPIE):
'url': 'http://video.foxbusiness.com/v/4442309889001',
'only_matching': True,
},
- {
- # From http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words
- 'url': 'http://video.insider.foxnews.com/v/video-embed.html?video_id=5099377331001&autoplay=true&share_url=http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words&share_title=Student%20Group:%20Saying%20%27Politically%20Correct,%27%20%27Trash%27%20and%20%27Lame%27%20Is%20Offensive&share=true',
- 'only_matching': True,
- },
]
@classmethod
@@ -67,10 +89,10 @@ class FoxNewsIE(AMPIE):
yield f'https://video.foxnews.com/v/video-embed.html?video_id={mobj.group("video_id")}'
def _real_extract(self, url):
- host, video_id = self._match_valid_url(url).groups()
+ video_id = self._match_id(url)
info = self._extract_feed_info(
- 'http://%s/v/feed/video/%s.js?template=fox' % (host, video_id))
+ f'https://api.foxnews.com/v3/video-player/{video_id}?callback=uid_{video_id}')
info['id'] = video_id
return info
@@ -78,6 +100,19 @@ class FoxNewsIE(AMPIE):
class FoxNewsVideoIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?foxnews\.com/video/(?P<id>\d+)'
_TESTS = [{
+ 'url': 'https://www.foxnews.com/video/6328632286112',
+ 'info_dict': {
+ 'id': '6328632286112',
+ 'ext': 'mp4',
+ 'title': 'Review: 2023 Toyota Prius Prime',
+ 'duration': 155,
+ 'thumbnail': r're:^https://.+\.jpg$',
+ 'timestamp': 1685720177.0,
+ 'upload_date': '20230602',
+ 'description': 'md5:b69aafb125b41c1402e9744f53d6edc4',
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }, {
'url': 'https://www.foxnews.com/video/6313058664112',
'info_dict': {
'id': '6313058664112',
@@ -89,8 +124,7 @@ class FoxNewsVideoIE(InfoExtractor):
'title': 'Gutfeld! - Thursday, September 29',
'timestamp': 1664527538,
},
- 'expected_warnings': ['Ignoring subtitle tracks'],
- 'params': {'skip_download': 'm3u8'},
+ 'skip': '404 page',
}]
def _real_extract(self, url):
@@ -104,19 +138,22 @@ class FoxNewsArticleIE(InfoExtractor):
_TESTS = [{
# data-video-id
- 'url': 'http://www.foxnews.com/politics/2016/09/08/buzz-about-bud-clinton-camp-denies-claims-wore-earpiece-at-forum.html',
- 'md5': '83d44e1aff1433e7a29a7b537d1700b5',
+ 'url': 'https://www.foxnews.com/politics/2016/09/08/buzz-about-bud-clinton-camp-denies-claims-wore-earpiece-at-forum.html',
+ 'md5': 'd2dd6ce809cedeefa96460e964821437',
'info_dict': {
'id': '5116295019001',
'ext': 'mp4',
'title': 'Trump and Clinton asked to defend positions on Iraq War',
- 'description': 'Veterans react on \'The Kelly File\'',
+ 'description': 'Veterans and Fox News host Dana Perino react on \'The Kelly File\' to NBC\'s presidential forum',
'timestamp': 1473301045,
'upload_date': '20160908',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 426,
},
+ 'params': {'skip_download': 'm3u8'},
}, {
# iframe embed
- 'url': 'http://www.foxnews.com/us/2018/03/09/parkland-survivor-kyle-kashuv-on-meeting-trump-his-app-to-prevent-another-school-shooting.amp.html?__twitter_impression=true',
+ 'url': 'https://www.foxnews.com/us/2018/03/09/parkland-survivor-kyle-kashuv-on-meeting-trump-his-app-to-prevent-another-school-shooting.amp.html?__twitter_impression=true',
'info_dict': {
'id': '5748266721001',
'ext': 'flv',
@@ -127,9 +164,7 @@ class FoxNewsArticleIE(InfoExtractor):
'timestamp': 1520594670,
'upload_date': '20180309',
},
- 'params': {
- 'skip_download': True,
- },
+ 'skip': '404 page',
}, {
'url': 'http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words',
'only_matching': True,
diff --git a/hypervideo_dl/extractor/foxsports.py b/hypervideo_dl/extractor/foxsports.py
index f9d7fe5..8e89ccf 100644
--- a/hypervideo_dl/extractor/foxsports.py
+++ b/hypervideo_dl/extractor/foxsports.py
@@ -1,31 +1,52 @@
from .common import InfoExtractor
+from .uplynk import UplynkPreplayIE
+from ..networking import HEADRequest
+from ..utils import float_or_none, make_archive_id, smuggle_url
class FoxSportsIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?foxsports\.com/(?:[^/]+/)*video/(?P<id>\d+)'
-
- _TEST = {
- 'url': 'http://www.foxsports.com/tennessee/video/432609859715',
- 'md5': 'b49050e955bebe32c301972e4012ac17',
+ _VALID_URL = r'https?://(?:www\.)?foxsports\.com/watch/(?P<id>[\w-]+)'
+ _TESTS = [{
+ 'url': 'https://www.foxsports.com/watch/play-612168c6700004b',
'info_dict': {
- 'id': '432609859715',
+ 'id': 'b72f5bd8658140baa5791bb676433733',
'ext': 'mp4',
- 'title': 'Courtney Lee on going up 2-0 in series vs. Blazers',
- 'description': 'Courtney Lee talks about Memphis being focused.',
- # TODO: fix timestamp
- 'upload_date': '19700101', # '20150423',
- # 'timestamp': 1429761109,
- 'uploader': 'NEWA-FNG-FOXSPORTS',
+ 'display_id': 'play-612168c6700004b',
+ 'title': 'md5:e0c4ecac3a1f25295b4fae22fb5c126a',
+ 'description': 'md5:371bc43609708ae2b9e1a939229762af',
+ 'uploader_id': '06b4a36349624051a9ba52ac3a91d268',
+ 'upload_date': '20221205',
+ 'timestamp': 1670262586,
+ 'duration': 31.7317,
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'extra_param_to_segment_url': str,
},
'params': {
- # m3u8 download
- 'skip_download': True,
+ 'skip_download': 'm3u8',
},
- 'add_ie': ['ThePlatform'],
- }
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ json_ld = self._search_json_ld(webpage, video_id, expected_type='VideoObject', default={})
+ data = self._download_json(
+ f'https://api3.fox.com/v2.0/vodplayer/sportsclip/{video_id}',
+ video_id, note='Downloading API JSON', headers={
+ 'x-api-key': 'cf289e299efdfa39fb6316f259d1de93',
+ })
+ preplay_url = self._request_webpage(
+ HEADRequest(data['url']), video_id, 'Fetching preplay URL').url
- return self.url_result(
- 'https://feed.theplatform.com/f/BKQ29B/foxsports-all?byId=' + video_id, 'ThePlatformFeed')
+ return {
+ '_type': 'url_transparent',
+ 'ie_key': UplynkPreplayIE.ie_key(),
+ 'url': smuggle_url(preplay_url, {'Origin': 'https://www.foxsports.com'}),
+ 'display_id': video_id,
+ 'title': data.get('name') or json_ld.get('title'),
+ 'description': data.get('description') or json_ld.get('description'),
+ 'duration': float_or_none(data.get('durationInSeconds')),
+ 'timestamp': json_ld.get('timestamp'),
+ 'thumbnails': json_ld.get('thumbnails'),
+ '_old_archive_ids': [make_archive_id(self, video_id)],
+ }
diff --git a/hypervideo_dl/extractor/freesound.py b/hypervideo_dl/extractor/freesound.py
index 8b5f227..fcde044 100644
--- a/hypervideo_dl/extractor/freesound.py
+++ b/hypervideo_dl/extractor/freesound.py
@@ -52,6 +52,7 @@ class FreesoundIE(InfoExtractor):
tags_str = get_element_by_class('tags', webpage)
tags = re.findall(r'<a[^>]+>([^<]+)', tags_str) if tags_str else None
+ audio_url = re.sub(r'^https?://freesound\.org(https?://)', r'\1', audio_url)
audio_urls = [audio_url]
LQ_FORMAT = '-lq.mp3'
diff --git a/hypervideo_dl/extractor/fujitv.py b/hypervideo_dl/extractor/fujitv.py
index 668bb27..77e826e 100644
--- a/hypervideo_dl/extractor/fujitv.py
+++ b/hypervideo_dl/extractor/fujitv.py
@@ -1,5 +1,5 @@
-from ..utils import HEADRequest
from .common import InfoExtractor
+from ..networking import HEADRequest
class FujiTVFODPlus7IE(InfoExtractor):
diff --git a/hypervideo_dl/extractor/funimation.py b/hypervideo_dl/extractor/funimation.py
index 18363c1..41de85c 100644
--- a/hypervideo_dl/extractor/funimation.py
+++ b/hypervideo_dl/extractor/funimation.py
@@ -3,7 +3,7 @@ import re
import string
from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
determine_ext,
@@ -46,8 +46,8 @@ class FunimationBaseIE(InfoExtractor):
}))
FunimationBaseIE._TOKEN = data['token']
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
- error = self._parse_json(e.cause.read().decode(), None)['error']
+ if isinstance(e.cause, HTTPError) and e.cause.status == 401:
+ error = self._parse_json(e.cause.response.read().decode(), None)['error']
raise ExtractorError(error, expected=True)
raise
@@ -210,7 +210,7 @@ class FunimationIE(FunimationBaseIE):
page = self._download_json(
'https://www.funimation.com/api/showexperience/%s/' % experience_id,
display_id, headers=headers, expected_status=403, query={
- 'pinst_id': ''.join([random.choice(string.digits + string.ascii_letters) for _ in range(8)]),
+ 'pinst_id': ''.join(random.choices(string.digits + string.ascii_letters, k=8)),
}, note=f'Downloading {format_name} JSON')
sources = page.get('items') or []
if not sources:
diff --git a/hypervideo_dl/extractor/funker530.py b/hypervideo_dl/extractor/funker530.py
new file mode 100644
index 0000000..ba5ab7d
--- /dev/null
+++ b/hypervideo_dl/extractor/funker530.py
@@ -0,0 +1,79 @@
+from .common import InfoExtractor
+from .rumble import RumbleEmbedIE
+from .youtube import YoutubeIE
+from ..utils import ExtractorError, clean_html, get_element_by_class, strip_or_none
+
+
+class Funker530IE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?funker530\.com/video/(?P<id>[^/?#]+)'
+ _TESTS = [{
+ 'url': 'https://funker530.com/video/azov-patrol-caught-in-open-under-automatic-grenade-launcher-fire/',
+ 'md5': '085f50fea27523a388bbc22e123e09c8',
+ 'info_dict': {
+ 'id': 'v2qbmu4',
+ 'ext': 'mp4',
+ 'title': 'Azov Patrol Caught In Open Under Automatic Grenade Launcher Fire',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'uploader': 'Funker530',
+ 'channel': 'Funker530',
+ 'channel_url': 'https://rumble.com/c/c-1199543',
+ 'width': 1280,
+ 'height': 720,
+ 'fps': 25,
+ 'duration': 27,
+ 'upload_date': '20230608',
+ 'timestamp': 1686241321,
+ 'live_status': 'not_live',
+ 'description': 'md5:bea2e1f458095414e04b5ac189c2f980',
+ }
+ }, {
+ 'url': 'https://funker530.com/video/my-friends-joined-the-russians-civdiv/',
+ 'md5': 'a42c2933391210662e93e867d7124b70',
+ 'info_dict': {
+ 'id': 'k-pk4bOvoac',
+ 'ext': 'mp4',
+ 'view_count': int,
+ 'channel': 'Civ Div',
+ 'comment_count': int,
+ 'channel_follower_count': int,
+ 'thumbnail': 'https://i.ytimg.com/vi/k-pk4bOvoac/maxresdefault.jpg',
+ 'uploader_id': '@CivDiv',
+ 'duration': 357,
+ 'channel_url': 'https://www.youtube.com/channel/UCgsCiwJ88up-YyMHo7hL5-A',
+ 'tags': [],
+ 'uploader_url': 'https://www.youtube.com/@CivDiv',
+ 'channel_id': 'UCgsCiwJ88up-YyMHo7hL5-A',
+ 'like_count': int,
+ 'description': 'md5:aef75ec3f59c07a0e39400f609b24429',
+ 'live_status': 'not_live',
+ 'age_limit': 0,
+ 'uploader': 'Civ Div',
+ 'categories': ['People & Blogs'],
+ 'title': 'My “Friends” joined the Russians.',
+ 'availability': 'public',
+ 'upload_date': '20230608',
+ 'playable_in_embed': True,
+ 'heatmap': 'count:100',
+ }
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
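+        # prefer a Rumble mirror when the page embeds one, falling back to a YouTube embed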
+        info = None
+        rumble_url = list(RumbleEmbedIE._extract_embed_urls(url, webpage))
+ if rumble_url:
+ info = {'url': rumble_url[0], 'ie_key': RumbleEmbedIE.ie_key()}
+ else:
+ youtube_url = list(YoutubeIE._extract_embed_urls(url, webpage))
+ if youtube_url:
+ info = {'url': youtube_url[0], 'ie_key': YoutubeIE.ie_key()}
+ if not info:
+ raise ExtractorError('No videos found on webpage', expected=True)
+
+ return {
+ **info,
+ '_type': 'url_transparent',
+ 'description': strip_or_none(self._search_regex(
+ r'(?s)(.+)About the Author', clean_html(get_element_by_class('video-desc-paragraph', webpage)),
+ 'description', default=None))
+ }
diff --git a/hypervideo_dl/extractor/gamejolt.py b/hypervideo_dl/extractor/gamejolt.py
index 440b832..8ec046b 100644
--- a/hypervideo_dl/extractor/gamejolt.py
+++ b/hypervideo_dl/extractor/gamejolt.py
@@ -48,7 +48,7 @@ class GameJoltBaseIE(InfoExtractor):
post_hash_id, note='Downloading comments list page %d' % page)
if not comments_data.get('comments'):
break
- for comment in traverse_obj(comments_data, (('comments', 'childComments'), ...), expected_type=dict, default=[]):
+ for comment in traverse_obj(comments_data, (('comments', 'childComments'), ...), expected_type=dict):
yield {
'id': comment['id'],
'text': self._parse_content_as_text(
diff --git a/hypervideo_dl/extractor/gdcvault.py b/hypervideo_dl/extractor/gdcvault.py
index 2878bbd..4265feb 100644
--- a/hypervideo_dl/extractor/gdcvault.py
+++ b/hypervideo_dl/extractor/gdcvault.py
@@ -2,13 +2,8 @@ import re
from .common import InfoExtractor
from .kaltura import KalturaIE
-from ..utils import (
- HEADRequest,
- remove_start,
- sanitized_Request,
- smuggle_url,
- urlencode_postdata,
-)
+from ..networking import HEADRequest, Request
+from ..utils import remove_start, smuggle_url, urlencode_postdata
class GDCVaultIE(InfoExtractor):
@@ -138,8 +133,8 @@ class GDCVaultIE(InfoExtractor):
'password': password,
}
- request = sanitized_Request(login_url, urlencode_postdata(login_form))
- request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+ request = Request(login_url, urlencode_postdata(login_form))
+ request.headers['Content-Type'] = 'application/x-www-form-urlencoded'
self._download_webpage(request, display_id, 'Logging in')
start_page = self._download_webpage(webpage_url, display_id, 'Getting authenticated video page')
self._download_webpage(logout_url, display_id, 'Logging out')
@@ -163,7 +158,7 @@ class GDCVaultIE(InfoExtractor):
video_url = 'http://www.gdcvault.com' + direct_url
# resolve the url so that we can detect the correct extension
video_url = self._request_webpage(
- HEADRequest(video_url), video_id).geturl()
+ HEADRequest(video_url), video_id).url
return {
'id': video_id,
diff --git a/hypervideo_dl/extractor/generic.py b/hypervideo_dl/extractor/generic.py
index f28a77e..77b6fb3 100644
--- a/hypervideo_dl/extractor/generic.py
+++ b/hypervideo_dl/extractor/generic.py
@@ -14,7 +14,9 @@ from ..utils import (
ExtractorError,
UnsupportedError,
determine_ext,
+ determine_protocol,
dict_get,
+ extract_basic_auth,
format_field,
int_or_none,
is_html,
@@ -31,7 +33,9 @@ from ..utils import (
unescapeHTML,
unified_timestamp,
unsmuggle_url,
+ update_url_query,
url_or_none,
+ urljoin,
variadic,
xpath_attr,
xpath_text,
@@ -864,21 +868,7 @@ class GenericIE(InfoExtractor):
},
},
{
- # JWPlayer config passed as variable
- 'url': 'http://www.txxx.com/videos/3326530/ariele/',
- 'info_dict': {
- 'id': '3326530_hq',
- 'ext': 'mp4',
- 'title': 'ARIELE | Tube Cup',
- 'uploader': 'www.txxx.com',
- 'age_limit': 18,
- },
- 'params': {
- 'skip_download': True,
- }
- },
- {
- # Video.js embed, multiple formats
+ # Youtube embed, formerly: Video.js embed, multiple formats
'url': 'http://ortcam.com/solidworks-урок-6-настройка-чертежа_33f9b7351.html',
'info_dict': {
'id': 'yygqldloqIk',
@@ -905,6 +895,7 @@ class GenericIE(InfoExtractor):
'params': {
'skip_download': True,
},
+ 'skip': '404 Not Found',
},
# rtl.nl embed
{
@@ -1548,19 +1539,6 @@ class GenericIE(InfoExtractor):
'add_ie': ['WashingtonPost'],
},
{
- # Mediaset embed
- 'url': 'http://www.tgcom24.mediaset.it/politica/serracchiani-voglio-vivere-in-una-societa-aperta-reazioni-sproporzionate-_3071354-201702a.shtml',
- 'info_dict': {
- 'id': '720642',
- 'ext': 'mp4',
- 'title': 'Serracchiani: "Voglio vivere in una società aperta, con tutela del patto di fiducia"',
- },
- 'params': {
- 'skip_download': True,
- },
- 'add_ie': ['Mediaset'],
- },
- {
# JOJ.sk embeds
'url': 'https://www.noviny.sk/slovensko/238543-slovenskom-sa-prehnala-vlna-silnych-burok',
'info_dict': {
@@ -1864,11 +1842,6 @@ class GenericIE(InfoExtractor):
'title': 'I AM BIO Podcast | BIO',
},
'playlist_mincount': 52,
- },
- {
- # Sibnet embed (https://help.sibnet.ru/?sibnet_video_embed)
- 'url': 'https://phpbb3.x-tk.ru/bbcode-video-sibnet-t24.html',
- 'only_matching': True,
}, {
# WimTv embed player
'url': 'http://www.msmotor.tv/wearefmi-pt-2-2021/',
@@ -1885,11 +1858,13 @@ class GenericIE(InfoExtractor):
'display_id': 'kelis-4th-of-july',
'ext': 'mp4',
'title': 'Kelis - 4th Of July',
- 'thumbnail': 'https://kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg',
+ 'description': 'Kelis - 4th Of July',
+ 'thumbnail': r're:https://(?:www\.)?kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg',
},
'params': {
'skip_download': True,
},
+ 'expected_warnings': ['Untested major version'],
}, {
# KVS Player
'url': 'https://www.kvs-demo.com/embed/105/',
@@ -1898,35 +1873,12 @@ class GenericIE(InfoExtractor):
'display_id': 'kelis-4th-of-july',
'ext': 'mp4',
'title': 'Kelis - 4th Of July / Embed Player',
- 'thumbnail': 'https://kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg',
+ 'thumbnail': r're:https://(?:www\.)?kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg',
},
'params': {
'skip_download': True,
},
}, {
- # KVS Player
- 'url': 'https://thisvid.com/videos/french-boy-pantsed/',
- 'md5': '3397979512c682f6b85b3b04989df224',
- 'info_dict': {
- 'id': '2400174',
- 'display_id': 'french-boy-pantsed',
- 'ext': 'mp4',
- 'title': 'French Boy Pantsed - ThisVid.com',
- 'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/2400000/2400174/preview.mp4.jpg',
- }
- }, {
- # KVS Player
- 'url': 'https://thisvid.com/embed/2400174/',
- 'md5': '3397979512c682f6b85b3b04989df224',
- 'info_dict': {
- 'id': '2400174',
- 'display_id': 'french-boy-pantsed',
- 'ext': 'mp4',
- 'title': 'French Boy Pantsed - ThisVid.com',
- 'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/2400000/2400174/preview.mp4.jpg',
- }
- }, {
- # KVS Player
'url': 'https://youix.com/video/leningrad-zoj/',
'md5': '94f96ba95706dc3880812b27b7d8a2b8',
'info_dict': {
@@ -1934,8 +1886,8 @@ class GenericIE(InfoExtractor):
'display_id': 'leningrad-zoj',
'ext': 'mp4',
'title': 'Клип: Ленинград - ЗОЖ скачать, смотреть онлайн | Youix.com',
- 'thumbnail': 'https://youix.com/contents/videos_screenshots/18000/18485/preview_480x320_youix_com.mp4.jpg',
- }
+ 'thumbnail': r're:https://youix.com/contents/videos_screenshots/18000/18485/preview(?:_480x320_youix_com.mp4)?\.jpg',
+ },
}, {
# KVS Player
'url': 'https://youix.com/embed/18485',
@@ -1945,19 +1897,20 @@ class GenericIE(InfoExtractor):
'display_id': 'leningrad-zoj',
'ext': 'mp4',
'title': 'Ленинград - ЗОЖ',
- 'thumbnail': 'https://youix.com/contents/videos_screenshots/18000/18485/preview_480x320_youix_com.mp4.jpg',
- }
+ 'thumbnail': r're:https://youix.com/contents/videos_screenshots/18000/18485/preview(?:_480x320_youix_com.mp4)?\.jpg',
+ },
}, {
# KVS Player
'url': 'https://bogmedia.org/videos/21217/40-nochey-40-nights-2016/',
'md5': '94166bdb26b4cb1fb9214319a629fc51',
'info_dict': {
'id': '21217',
- 'display_id': '40-nochey-40-nights-2016',
+ 'display_id': '40-nochey-2016',
'ext': 'mp4',
'title': '40 ночей (2016) - BogMedia.org',
+ 'description': 'md5:4e6d7d622636eb7948275432eb256dc3',
'thumbnail': 'https://bogmedia.org/contents/videos_screenshots/21000/21217/preview_480p.mp4.jpg',
- }
+ },
},
{
# KVS Player (for sites that serve kt_player.js via non-https urls)
@@ -1967,9 +1920,9 @@ class GenericIE(InfoExtractor):
'id': '389508',
'display_id': 'syren-de-mer-onlyfans-05-07-2020have-a-happy-safe-holiday5f014e68a220979bdb8cd-source',
'ext': 'mp4',
- 'title': 'Syren De Mer onlyfans_05-07-2020Have_a_happy_safe_holiday5f014e68a220979bdb8cd_source / Embed плеер',
- 'thumbnail': 'http://www.camhub.world/contents/videos_screenshots/389000/389508/preview.mp4.jpg',
- }
+ 'title': 'Syren De Mer onlyfans_05-07-2020Have_a_happy_safe_holiday5f014e68a220979bdb8cd_source / Embed плеер',
+ 'thumbnail': r're:https?://www\.camhub\.world/contents/videos_screenshots/389000/389508/preview\.mp4\.jpg',
+ },
},
{
# Reddit-hosted video that will redirect and be processed by RedditIE
@@ -2172,7 +2125,79 @@ class GenericIE(InfoExtractor):
'age_limit': 0,
'direct': True,
}
- }
+ },
+ {
+        'note': 'server returns brotli-compressed data by default if `accept-encoding: *` is specified.',
+ 'url': 'https://www.extra.cz/cauky-lidi-70-dil-babis-predstavil-pohadky-prymulanek-nebo-andrejovy-nove-saty-ac867',
+ 'info_dict': {
+ 'id': 'cauky-lidi-70-dil-babis-predstavil-pohadky-prymulanek-nebo-andrejovy-nove-saty-ac867',
+ 'ext': 'mp4',
+ 'title': 'čauky lidi 70 finall',
+ 'description': 'čauky lidi 70 finall',
+ 'thumbnail': 'h',
+ 'upload_date': '20220606',
+ 'timestamp': 1654513791,
+ 'duration': 318.0,
+ 'direct': True,
+ 'age_limit': 0,
+ },
+ },
+ {
+ 'note': 'JW Player embed with unicode-escape sequences in URL',
+ 'url': 'https://www.medici.tv/en/concerts/lahav-shani-mozart-mahler-israel-philharmonic-abu-dhabi-classics',
+ 'info_dict': {
+ 'id': 'm',
+ 'ext': 'mp4',
+ 'title': 'Lahav Shani conducts the Israel Philharmonic\'s first-ever concert in Abu Dhabi',
+ 'description': 'Mahler\'s ',
+ 'uploader': 'www.medici.tv',
+ 'age_limit': 0,
+ 'thumbnail': r're:^https?://.+\.jpg',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'https://shooshtime.com/videos/284002/just-out-of-the-shower-joi/',
+ 'md5': 'e2f0a4c329f7986280b7328e24036d60',
+ 'info_dict': {
+ 'id': '284002',
+ 'display_id': 'just-out-of-the-shower-joi',
+ 'ext': 'mp4',
+ 'title': 'Just Out Of The Shower JOI - Shooshtime',
+ 'thumbnail': 'https://i.shoosh.co/contents/videos_screenshots/284000/284002/preview.mp4.jpg',
+ 'height': 720,
+ 'age_limit': 18,
+ },
+ },
+ {
+ 'note': 'Live HLS direct link',
+ 'url': 'https://d18j67ugtrocuq.cloudfront.net/out/v1/2767aec339144787926bd0322f72c6e9/index.m3u8',
+ 'info_dict': {
+ 'id': 'index',
+ 'title': r're:index',
+ 'ext': 'mp4',
+ 'live_status': 'is_live',
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
+ },
+ {
+ 'note': 'Video.js VOD HLS',
+ 'url': 'https://gist.githubusercontent.com/bashonly/2aae0862c50f4a4b84f220c315767208/raw/e3380d413749dabbe804c9c2d8fd9a45142475c7/videojs_hls_test.html',
+ 'info_dict': {
+ 'id': 'videojs_hls_test',
+ 'title': 'video',
+ 'ext': 'mp4',
+ 'age_limit': 0,
+ 'duration': 1800,
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
+ },
]
def report_following_redirect(self, new_url):
@@ -2189,12 +2214,41 @@ class GenericIE(InfoExtractor):
self._downloader.write_debug(f'Identified {num} {name}{format_field(note, None, "; %s")}')
- def _fragment_query(self, url):
- if self._configuration_arg('fragment_query'):
- query_string = urllib.parse.urlparse(url).query
- if query_string:
- return {'extra_param_to_segment_url': query_string}
- return {}
+ def _extra_manifest_info(self, info, manifest_url):
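+        # `fragment_query` may be a full URL or a bare query string; an empty
+        # value falls back to the manifest URL's own query string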
+ fragment_query = self._configuration_arg('fragment_query', [None], casesense=True)[0]
+ if fragment_query is not None:
+ info['extra_param_to_segment_url'] = (
+ urllib.parse.urlparse(fragment_query).query or fragment_query
+ or urllib.parse.urlparse(manifest_url).query or None)
+
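+        # `hls_key` is either a URI or a hex key, optionally followed by a hex IV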
+ hex_or_none = lambda x: x if re.fullmatch(r'(0x)?[\da-f]+', x, re.IGNORECASE) else None
+ info['hls_aes'] = traverse_obj(self._configuration_arg('hls_key', casesense=True), {
+ 'uri': (0, {url_or_none}), 'key': (0, {hex_or_none}), 'iv': (1, {hex_or_none}),
+ }) or None
+
+ variant_query = self._configuration_arg('variant_query', [None], casesense=True)[0]
+ if variant_query is not None:
+ query = urllib.parse.parse_qs(
+ urllib.parse.urlparse(variant_query).query or variant_query
+ or urllib.parse.urlparse(manifest_url).query)
+ for fmt in self._downloader._get_formats(info):
+ fmt['url'] = update_url_query(fmt['url'], query)
+
+ # Attempt to detect live HLS or set VOD duration
+ m3u8_format = next((f for f in self._downloader._get_formats(info)
+ if determine_protocol(f) == 'm3u8_native'), None)
+ if m3u8_format:
+ is_live = self._configuration_arg('is_live', [None])[0]
+ if is_live is not None:
+ info['live_status'] = 'not_live' if is_live == 'false' else 'is_live'
+ return
+ headers = m3u8_format.get('http_headers') or info.get('http_headers')
+ duration = self._extract_m3u8_vod_duration(
+ m3u8_format['url'], info.get('id'), note='Checking m3u8 live status',
+ errnote='Failed to download m3u8 media playlist', headers=headers)
+ if not duration:
+ info['live_status'] = 'is_live'
+ info['duration'] = info.get('duration') or duration
def _extract_rss(self, url, video_id, doc):
NS_MAP = {
@@ -2238,43 +2292,87 @@ class GenericIE(InfoExtractor):
'entries': entries,
}
- def _kvs_getrealurl(self, video_url, license_code):
+ @classmethod
+ def _kvs_get_real_url(cls, video_url, license_code):
if not video_url.startswith('function/0/'):
return video_url # not obfuscated
- url_path, _, url_query = video_url.partition('?')
- urlparts = url_path.split('/')[2:]
- license = self._kvs_getlicensetoken(license_code)
- newmagic = urlparts[5][:32]
+ parsed = urllib.parse.urlparse(video_url[len('function/0/'):])
+ license = cls._kvs_get_license_token(license_code)
+ urlparts = parsed.path.split('/')
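+        # only the first 32 chars of the hash path segment are scrambled; the tail is kept as-is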
- for o in range(len(newmagic) - 1, -1, -1):
- new = ''
- l = (o + sum(int(n) for n in license[o:])) % 32
+ HASH_LENGTH = 32
+ hash = urlparts[3][:HASH_LENGTH]
+ indices = list(range(HASH_LENGTH))
- for i in range(0, len(newmagic)):
- if i == o:
- new += newmagic[l]
- elif i == l:
- new += newmagic[o]
- else:
- new += newmagic[i]
- newmagic = new
+        # Shuffle the hash in place: swap each index with a destination position derived from the license token
+ accum = 0
+ for src in reversed(range(HASH_LENGTH)):
+ accum += license[src]
+ dest = (src + accum) % HASH_LENGTH
+ indices[src], indices[dest] = indices[dest], indices[src]
+
+ urlparts[3] = ''.join(hash[index] for index in indices) + urlparts[3][HASH_LENGTH:]
+ return urllib.parse.urlunparse(parsed._replace(path='/'.join(urlparts)))
- urlparts[5] = newmagic + urlparts[5][32:]
- return '/'.join(urlparts) + '?' + url_query
+ @staticmethod
+ def _kvs_get_license_token(license):
+ license = license.replace('$', '')
+ license_values = [int(char) for char in license]
- def _kvs_getlicensetoken(self, license):
- modlicense = license.replace('$', '').replace('0', '1')
- center = int(len(modlicense) / 2)
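+        # fold a reduced form of the license back onto its own digits; each reduced digit contributes four output digits mod 10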
+ modlicense = license.replace('0', '1')
+ center = len(modlicense) // 2
fronthalf = int(modlicense[:center + 1])
backhalf = int(modlicense[center:])
+ modlicense = str(4 * abs(fronthalf - backhalf))[:center + 1]
+
+ return [
+ (license_values[index + offset] + current) % 10
+ for index, current in enumerate(map(int, modlicense))
+ for offset in range(4)
+ ]
+
+ def _extract_kvs(self, url, webpage, video_id):
+ flashvars = self._search_json(
+ r'(?s:<script\b[^>]*>.*?var\s+flashvars\s*=)',
+ webpage, 'flashvars', video_id, transform_source=js_to_json)
+
+ # extract the part after the last / as the display_id from the
+ # canonical URL.
+ display_id = self._search_regex(
+ r'(?:<link href="https?://[^"]+/(.+?)/?" rel="canonical"\s*/?>'
+ r'|<link rel="canonical" href="https?://[^"]+/(.+?)/?"\s*/?>)',
+ webpage, 'display_id', fatal=False)
+ title = self._html_search_regex(r'<(?:h1|title)>(?:Video: )?(.+?)</(?:h1|title)>', webpage, 'title')
+
+ thumbnail = flashvars['preview_url']
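+        # preview_url may be protocol-relative; borrow the scheme from the page URL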
+ if thumbnail.startswith('//'):
+ protocol, _, _ = url.partition('/')
+ thumbnail = protocol + thumbnail
+
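+        # KVS exposes formats in flashvars as video_url, video_alt_url, video_alt_url2, ...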
+ url_keys = list(filter(re.compile(r'^video_(?:url|alt_url\d*)$').match, flashvars.keys()))
+ formats = []
+ for key in url_keys:
+ if '/get_file/' not in flashvars[key]:
+ continue
+ format_id = flashvars.get(f'{key}_text', key)
+ formats.append({
+ 'url': urljoin(url, self._kvs_get_real_url(flashvars[key], flashvars['license_code'])),
+ 'format_id': format_id,
+ 'ext': 'mp4',
+ **(parse_resolution(format_id) or parse_resolution(flashvars[key])),
+ 'http_headers': {'Referer': url},
+ })
+ if not formats[-1].get('height'):
+ formats[-1]['quality'] = 1
- modlicense = str(4 * abs(fronthalf - backhalf))
- retval = ''
- for o in range(0, center + 1):
- for i in range(1, 5):
- retval += str((int(license[o + i]) + int(modlicense[o])) % 10)
- return retval
+ return {
+ 'id': flashvars['video_id'],
+ 'display_id': display_id,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'formats': formats,
+ }
def _real_extract(self, url):
if url.startswith('//'):
@@ -2330,13 +2428,12 @@ class GenericIE(InfoExtractor):
# It may probably better to solve this by checking Content-Type for application/octet-stream
# after a HEAD request, but not sure if we can rely on this.
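+        # request identity encoding so the first bytes can be sniffed (e.g. for an M3U playlist) without decompressing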
full_response = self._request_webpage(url, video_id, headers={
- 'Accept-Encoding': '*',
+ 'Accept-Encoding': 'identity',
**smuggled_data.get('http_headers', {})
})
- new_url = full_response.geturl()
- if new_url == urllib.parse.urlparse(url)._replace(scheme='https').geturl():
- url = new_url
- elif url != new_url:
+ new_url = full_response.url
+ url = urllib.parse.urlparse(url)._replace(scheme=urllib.parse.urlparse(new_url).scheme).geturl()
+ if new_url != extract_basic_auth(url)[0]:
self.report_following_redirect(new_url)
if force_videoid:
new_url = smuggle_url(new_url, {'force_videoid': force_videoid})
@@ -2355,14 +2452,13 @@ class GenericIE(InfoExtractor):
self.report_detected('direct video link')
headers = smuggled_data.get('http_headers', {})
format_id = str(m.group('format_id'))
+ ext = determine_ext(url)
subtitles = {}
- if format_id.endswith('mpegurl'):
+ if format_id.endswith('mpegurl') or ext == 'm3u8':
formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4', headers=headers)
- info_dict.update(self._fragment_query(url))
- elif format_id.endswith('mpd') or format_id.endswith('dash+xml'):
+ elif format_id.endswith('mpd') or format_id.endswith('dash+xml') or ext == 'mpd':
formats, subtitles = self._extract_mpd_formats_and_subtitles(url, video_id, headers=headers)
- info_dict.update(self._fragment_query(url))
- elif format_id == 'f4m':
+ elif format_id == 'f4m' or ext == 'f4m':
formats = self._extract_f4m_formats(url, video_id, headers=headers)
else:
formats = [{
@@ -2374,8 +2470,9 @@ class GenericIE(InfoExtractor):
info_dict.update({
'formats': formats,
'subtitles': subtitles,
- 'http_headers': headers,
+ 'http_headers': headers or None,
})
+ self._extra_manifest_info(info_dict, url)
return info_dict
if not self.get_param('test', False) and not is_intentional:
@@ -2388,7 +2485,7 @@ class GenericIE(InfoExtractor):
if first_bytes.startswith(b'#EXTM3U'):
self.report_detected('M3U playlist')
info_dict['formats'], info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4')
- info_dict.update(self._fragment_query(url))
+ self._extra_manifest_info(info_dict, url)
return info_dict
# Maybe it's a direct link to a video?
@@ -2432,14 +2529,14 @@ class GenericIE(InfoExtractor):
return self.playlist_result(
self._parse_xspf(
doc, video_id, xspf_url=url,
- xspf_base_url=full_response.geturl()),
+ xspf_base_url=full_response.url),
video_id)
elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag):
info_dict['formats'], info_dict['subtitles'] = self._parse_mpd_formats_and_subtitles(
doc,
- mpd_base_url=full_response.geturl().rpartition('/')[0],
+ mpd_base_url=full_response.url.rpartition('/')[0],
mpd_url=url)
- info_dict.update(self._fragment_query(url))
+ self._extra_manifest_info(info_dict, url)
self.report_detected('DASH manifest')
return info_dict
elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag):
@@ -2465,7 +2562,7 @@ class GenericIE(InfoExtractor):
self._downloader.write_debug('Looking for embeds')
embeds = list(self._extract_embeds(original_url, webpage, urlh=full_response, info_dict=info_dict))
if len(embeds) == 1:
- return {**info_dict, **embeds[0]}
+ return merge_dicts(embeds[0], info_dict)
elif embeds:
return self.playlist_result(embeds, **info_dict)
raise UnsupportedError(url)
@@ -2475,7 +2572,7 @@ class GenericIE(InfoExtractor):
info_dict = types.MappingProxyType(info_dict) # Prevents accidental mutation
video_id = traverse_obj(info_dict, 'display_id', 'id') or self._generic_id(url)
url, smuggled_data = unsmuggle_url(url, {})
- actual_url = urlh.geturl() if urlh else url
+ actual_url = urlh.url if urlh else url
# Sometimes embedded video player is hidden behind percent encoding
# (e.g. https://github.com/ytdl-org/youtube-dl/issues/2448)
@@ -2528,8 +2625,7 @@ class GenericIE(InfoExtractor):
varname = mobj.group(1)
sources = variadic(self._parse_json(
mobj.group(2), video_id, transform_source=js_to_json, fatal=False) or [])
- formats = []
- subtitles = {}
+ formats, subtitles, src = [], {}, None
for source in sources:
src = source.get('src')
if not src or not isinstance(src, str):
@@ -2552,8 +2648,6 @@ class GenericIE(InfoExtractor):
m3u8_id='hls', fatal=False)
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
- for fmt in formats:
- fmt.update(self._fragment_query(src))
if not formats:
formats.append({
@@ -2569,11 +2663,11 @@ class GenericIE(InfoExtractor):
for sub_match in re.finditer(rf'(?s){re.escape(varname)}' r'\.addRemoteTextTrack\(({.+?})\s*,\s*(?:true|false)\)', webpage):
sub = self._parse_json(
sub_match.group(1), video_id, transform_source=js_to_json, fatal=False) or {}
- src = str_or_none(sub.get('src'))
- if not src:
+ sub_src = str_or_none(sub.get('src'))
+ if not sub_src:
continue
subtitles.setdefault(dict_get(sub, ('language', 'srclang')) or 'und', []).append({
- 'url': urllib.parse.urljoin(url, src),
+ 'url': urllib.parse.urljoin(url, sub_src),
'name': sub.get('label'),
'http_headers': {
'Referer': actual_url,
@@ -2581,7 +2675,21 @@ class GenericIE(InfoExtractor):
})
if formats or subtitles:
self.report_detected('video.js embed')
- return [{'formats': formats, 'subtitles': subtitles}]
+ info_dict = {'formats': formats, 'subtitles': subtitles}
+ if formats:
+ self._extra_manifest_info(info_dict, src)
+ return [info_dict]
+
+ # Look for generic KVS player (before json-ld bc of some urls that break otherwise)
+ found = self._search_regex((
+ r'<script\b[^>]+?\bsrc\s*=\s*(["\'])https?://(?:(?!\1)[^?#])+/kt_player\.js\?v=(?P<ver>\d+(?:\.\d+)+)\1[^>]*>',
+ r'kt_player\s*\(\s*(["\'])(?:(?!\1)[\w\W])+\1\s*,\s*(["\'])https?://(?:(?!\2)[^?#])+/kt_player\.swf\?v=(?P<ver>\d+(?:\.\d+)+)\2\s*,',
+ ), webpage, 'KVS player', group='ver', default=False)
+ if found:
+ self.report_detected('KVS Player')
+ if found.split('.')[0] not in ('4', '5', '6'):
+ self.report_warning(f'Untested major version ({found}) in player engine - download may fail.')
+ return [self._extract_kvs(url, webpage, video_id)]
# Looking for http://schema.org/VideoObject
json_ld = self._search_json_ld(webpage, video_id, default={})
@@ -2626,52 +2734,6 @@ class GenericIE(InfoExtractor):
if found:
self.report_detected('JW Player embed')
if not found:
- # Look for generic KVS player
- found = re.search(r'<script [^>]*?src="https?://.+?/kt_player\.js\?v=(?P<ver>(?P<maj_ver>\d+)(\.\d+)+)".*?>', webpage)
- if found:
- self.report_detected('KWS Player')
- if found.group('maj_ver') not in ['4', '5']:
- self.report_warning('Untested major version (%s) in player engine--Download may fail.' % found.group('ver'))
- flashvars = re.search(r'(?ms)<script.*?>.*?var\s+flashvars\s*=\s*(\{.*?\});.*?</script>', webpage)
- flashvars = self._parse_json(flashvars.group(1), video_id, transform_source=js_to_json)
-
- # extract the part after the last / as the display_id from the
- # canonical URL.
- display_id = self._search_regex(
- r'(?:<link href="https?://[^"]+/(.+?)/?" rel="canonical"\s*/?>'
- r'|<link rel="canonical" href="https?://[^"]+/(.+?)/?"\s*/?>)',
- webpage, 'display_id', fatal=False
- )
- title = self._html_search_regex(r'<(?:h1|title)>(?:Video: )?(.+?)</(?:h1|title)>', webpage, 'title')
-
- thumbnail = flashvars['preview_url']
- if thumbnail.startswith('//'):
- protocol, _, _ = url.partition('/')
- thumbnail = protocol + thumbnail
-
- url_keys = list(filter(re.compile(r'video_url|video_alt_url\d*').fullmatch, flashvars.keys()))
- formats = []
- for key in url_keys:
- if '/get_file/' not in flashvars[key]:
- continue
- format_id = flashvars.get(f'{key}_text', key)
- formats.append({
- 'url': self._kvs_getrealurl(flashvars[key], flashvars['license_code']),
- 'format_id': format_id,
- 'ext': 'mp4',
- **(parse_resolution(format_id) or parse_resolution(flashvars[key]))
- })
- if not formats[-1].get('height'):
- formats[-1]['quality'] = 1
-
- return [{
- 'id': flashvars['video_id'],
- 'display_id': display_id,
- 'title': title,
- 'thumbnail': thumbnail,
- 'formats': formats,
- }]
- if not found:
# Broaden the search a little bit
found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
if found:
@@ -2751,6 +2813,7 @@ class GenericIE(InfoExtractor):
entries = []
for video_url in orderedSet(found):
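+            # some JW Player configs escape URLs as \uXXXX sequences (see the medici.tv test above); undo that first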
+ video_url = video_url.encode().decode('unicode-escape')
video_url = unescapeHTML(video_url)
video_url = video_url.replace('\\/', '/')
video_url = urllib.parse.urljoin(url, video_url)
@@ -2790,10 +2853,10 @@ class GenericIE(InfoExtractor):
return [self._extract_xspf_playlist(video_url, video_id)]
elif ext == 'm3u8':
entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(video_url, video_id, ext='mp4', headers=headers)
- entry_info_dict.update(self._fragment_query(video_url))
+ self._extra_manifest_info(entry_info_dict, video_url)
elif ext == 'mpd':
entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_mpd_formats_and_subtitles(video_url, video_id, headers=headers)
- entry_info_dict.update(self._fragment_query(video_url))
+ self._extra_manifest_info(entry_info_dict, video_url)
elif ext == 'f4m':
entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id, headers=headers)
elif re.search(r'(?i)\.(?:ism|smil)/manifest', video_url) and video_url != url:
diff --git a/hypervideo_dl/extractor/genius.py b/hypervideo_dl/extractor/genius.py
index 62f5a28..57c25e7 100644
--- a/hypervideo_dl/extractor/genius.py
+++ b/hypervideo_dl/extractor/genius.py
@@ -10,7 +10,7 @@ from ..utils import (
class GeniusIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?genius\.com/videos/(?P<id>[^?/#]+)'
+ _VALID_URL = r'https?://(?:www\.)?genius\.com/(?:videos|(?P<article>a))/(?P<id>[^?/#]+)'
_TESTS = [{
'url': 'https://genius.com/videos/Vince-staples-breaks-down-the-meaning-of-when-sparks-fly',
'md5': '64c2ad98cfafcfda23bfa0ad0c512f4c',
@@ -41,19 +41,37 @@ class GeniusIE(InfoExtractor):
'timestamp': 1631209167,
'thumbnail': r're:^https?://.*\.jpg$',
},
+ }, {
+ 'url': 'https://genius.com/a/cordae-anderson-paak-break-down-the-meaning-of-two-tens',
+ 'md5': 'f98a4e03b16b0a2821bd6e52fb3cc9d7',
+ 'info_dict': {
+ 'id': '6321509903112',
+ 'ext': 'mp4',
+ 'title': 'Cordae & Anderson .Paak Breaks Down The Meaning Of “Two Tens”',
+ 'description': 'md5:1255f0e1161d07342ce56a8464ac339d',
+ 'tags': ['song id: 5457554'],
+ 'uploader_id': '4863540648001',
+ 'duration': 361.813,
+ 'upload_date': '20230301',
+ 'timestamp': 1677703908,
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ },
}]
def _real_extract(self, url):
- display_id = self._match_id(url)
+ display_id, is_article = self._match_valid_url(url).group('id', 'article')
webpage = self._download_webpage(url, display_id)
metadata = self._search_json(
- r'<meta content="', webpage, 'metadata', display_id, transform_source=unescapeHTML)
- video_id = traverse_obj(
- metadata, ('video', 'provider_id'),
- ('dfp_kv', lambda _, x: x['name'] == 'brightcove_video_id', 'values', 0), get_all=False)
+ r'<meta content="', webpage, 'metadata', display_id,
+ end_pattern=r'"\s+itemprop="page_data"', transform_source=unescapeHTML)
+ video_id = traverse_obj(metadata, (
+ (('article', 'media', ...), ('video', None)),
+ ('provider_id', ('dfp_kv', lambda _, v: v['name'] == 'brightcove_video_id', 'values', ...))),
+ get_all=False)
if not video_id:
- raise ExtractorError('Brightcove video id not found in webpage')
+            # Not all article pages embed a video, so the error is expected for article URLs
+ raise ExtractorError('Brightcove video ID not found in webpage', expected=bool(is_article))
config = self._search_json(r'var\s*APP_CONFIG\s*=', webpage, 'config', video_id, default={})
account_id = config.get('brightcove_account_id', '4863540648001')
@@ -68,7 +86,7 @@ class GeniusIE(InfoExtractor):
class GeniusLyricsIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?genius\.com/(?P<id>[^?/#]+)-lyrics[?/#]?'
+ _VALID_URL = r'https?://(?:www\.)?genius\.com/(?P<id>[^?/#]+)-lyrics(?:[?/#]|$)'
_TESTS = [{
'url': 'https://genius.com/Lil-baby-heyy-lyrics',
'playlist_mincount': 2,
diff --git a/hypervideo_dl/extractor/globalplayer.py b/hypervideo_dl/extractor/globalplayer.py
new file mode 100644
index 0000000..e0c0d58
--- /dev/null
+++ b/hypervideo_dl/extractor/globalplayer.py
@@ -0,0 +1,254 @@
+from .common import InfoExtractor
+from ..utils import (
+ clean_html,
+ join_nonempty,
+ parse_duration,
+ str_or_none,
+ traverse_obj,
+ unified_strdate,
+ unified_timestamp,
+ urlhandle_detect_ext,
+)
+
+
+class GlobalPlayerBaseIE(InfoExtractor):
+ def _get_page_props(self, url, video_id):
+ webpage = self._download_webpage(url, video_id)
+ return self._search_nextjs_data(webpage, video_id)['props']['pageProps']
+
+ def _request_ext(self, url, video_id):
+ return urlhandle_detect_ext(self._request_webpage( # Server rejects HEAD requests
+ url, video_id, note='Determining source extension'))
+
+ def _extract_audio(self, episode, series):
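+        # merge series-level metadata first so episode-level fields take precedence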
+ return {
+ 'vcodec': 'none',
+ **traverse_obj(series, {
+ 'series': 'title',
+ 'series_id': 'id',
+ 'thumbnail': 'imageUrl',
+ 'uploader': 'itunesAuthor', # podcasts only
+ }),
+ **traverse_obj(episode, {
+ 'id': 'id',
+ 'description': ('description', {clean_html}),
+ 'duration': ('duration', {parse_duration}),
+ 'thumbnail': 'imageUrl',
+ 'url': 'streamUrl',
+ 'timestamp': (('pubDate', 'startDate'), {unified_timestamp}),
+ 'title': 'title',
+ }, get_all=False)
+ }
+
+
+class GlobalPlayerLiveIE(GlobalPlayerBaseIE):
+ _VALID_URL = r'https?://www\.globalplayer\.com/live/(?P<id>\w+)/\w+'
+ _TESTS = [{
+ 'url': 'https://www.globalplayer.com/live/smoothchill/uk/',
+ 'info_dict': {
+ 'id': '2mx1E',
+ 'ext': 'aac',
+ 'display_id': 'smoothchill-uk',
+ 'title': 're:^Smooth Chill.+$',
+ 'thumbnail': 'https://herald.musicradio.com/media/f296ade8-50c9-4f60-911f-924e96873620.png',
+ 'description': 'Music To Chill To',
+ 'live_status': 'is_live',
+ },
+ }, {
+ # national station
+ 'url': 'https://www.globalplayer.com/live/heart/uk/',
+ 'info_dict': {
+ 'id': '2mwx4',
+ 'ext': 'aac',
+ 'description': 'turn up the feel good!',
+ 'thumbnail': 'https://herald.musicradio.com/media/49b9e8cb-15bf-4bf2-8c28-a4850cc6b0f3.png',
+ 'live_status': 'is_live',
+ 'title': 're:^Heart UK.+$',
+ 'display_id': 'heart-uk',
+ },
+ }, {
+ # regional variation
+ 'url': 'https://www.globalplayer.com/live/heart/london/',
+ 'info_dict': {
+ 'id': 'AMqg',
+ 'ext': 'aac',
+ 'thumbnail': 'https://herald.musicradio.com/media/49b9e8cb-15bf-4bf2-8c28-a4850cc6b0f3.png',
+ 'title': 're:^Heart London.+$',
+ 'live_status': 'is_live',
+ 'display_id': 'heart-london',
+ 'description': 'turn up the feel good!',
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ station = self._get_page_props(url, video_id)['station']
+ stream_url = station['streamUrl']
+
+ return {
+ 'id': station['id'],
+ 'display_id': join_nonempty('brandSlug', 'slug', from_dict=station) or station.get('legacyStationPrefix'),
+ 'url': stream_url,
+ 'ext': self._request_ext(stream_url, video_id),
+ 'vcodec': 'none',
+ 'is_live': True,
+ **traverse_obj(station, {
+ 'title': (('name', 'brandName'), {str_or_none}),
+ 'description': 'tagline',
+ 'thumbnail': 'brandLogo',
+ }, get_all=False),
+ }
+
+
+class GlobalPlayerLivePlaylistIE(GlobalPlayerBaseIE):
+ _VALID_URL = r'https?://www\.globalplayer\.com/playlists/(?P<id>\w+)'
+ _TESTS = [{
+ # "live playlist"
+ 'url': 'https://www.globalplayer.com/playlists/8bLk/',
+ 'info_dict': {
+ 'id': '8bLk',
+ 'ext': 'aac',
+ 'live_status': 'is_live',
+ 'description': 'md5:e10f5e10b01a7f2c14ba815509fbb38d',
+ 'thumbnail': 'https://images.globalplayer.com/images/551379?width=450&signature=oMLPZIoi5_dBSHnTMREW0Xg76mA=',
+ 'title': 're:^Classic FM Hall of Fame.+$'
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ station = self._get_page_props(url, video_id)['playlistData']
+ stream_url = station['streamUrl']
+
+ return {
+ 'id': video_id,
+ 'url': stream_url,
+ 'ext': self._request_ext(stream_url, video_id),
+ 'vcodec': 'none',
+ 'is_live': True,
+ **traverse_obj(station, {
+ 'title': 'title',
+ 'description': 'description',
+ 'thumbnail': 'image',
+ }),
+ }
+
+
+class GlobalPlayerAudioIE(GlobalPlayerBaseIE):
+ _VALID_URL = r'https?://www\.globalplayer\.com/(?:(?P<podcast>podcasts)/|catchup/\w+/\w+/)(?P<id>\w+)/?(?:$|[?#])'
+ _TESTS = [{
+ # podcast
+ 'url': 'https://www.globalplayer.com/podcasts/42KuaM/',
+ 'playlist_mincount': 5,
+ 'info_dict': {
+ 'id': '42KuaM',
+ 'title': 'Filthy Ritual',
+ 'thumbnail': 'md5:60286e7d12d795bd1bbc9efc6cee643e',
+ 'categories': ['Society & Culture', 'True Crime'],
+ 'uploader': 'Global',
+ 'description': 'md5:da5b918eac9ae319454a10a563afacf9',
+ },
+ }, {
+ # radio catchup
+ 'url': 'https://www.globalplayer.com/catchup/lbc/uk/46vyD7z/',
+ 'playlist_mincount': 3,
+ 'info_dict': {
+ 'id': '46vyD7z',
+ 'description': 'Nick Ferrari At Breakfast is Leading Britain\'s Conversation.',
+ 'title': 'Nick Ferrari',
+ 'thumbnail': 'md5:4df24d8a226f5b2508efbcc6ae874ebf',
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id, podcast = self._match_valid_url(url).group('id', 'podcast')
+ props = self._get_page_props(url, video_id)
+ series = props['podcastInfo'] if podcast else props['catchupInfo']
+
+ return {
+ '_type': 'playlist',
+ 'id': video_id,
+ 'entries': [self._extract_audio(ep, series) for ep in traverse_obj(
+ series, ('episodes', lambda _, v: v['id'] and v['streamUrl']))],
+ 'categories': traverse_obj(series, ('categories', ..., 'name')) or None,
+ **traverse_obj(series, {
+ 'description': 'description',
+ 'thumbnail': 'imageUrl',
+ 'title': 'title',
+ 'uploader': 'itunesAuthor', # podcasts only
+ }),
+ }
+
+
+class GlobalPlayerAudioEpisodeIE(GlobalPlayerBaseIE):
+ _VALID_URL = r'https?://www\.globalplayer\.com/(?:(?P<podcast>podcasts)|catchup/\w+/\w+)/episodes/(?P<id>\w+)/?(?:$|[?#])'
+ _TESTS = [{
+ # podcast
+ 'url': 'https://www.globalplayer.com/podcasts/episodes/7DrfNnE/',
+ 'info_dict': {
+ 'id': '7DrfNnE',
+ 'ext': 'mp3',
+ 'title': 'Filthy Ritual - Trailer',
+ 'description': 'md5:1f1562fd0f01b4773b590984f94223e0',
+ 'thumbnail': 'md5:60286e7d12d795bd1bbc9efc6cee643e',
+ 'duration': 225.0,
+ 'timestamp': 1681254900,
+ 'series': 'Filthy Ritual',
+ 'series_id': '42KuaM',
+ 'upload_date': '20230411',
+ 'uploader': 'Global',
+ },
+ }, {
+ # radio catchup
+ 'url': 'https://www.globalplayer.com/catchup/lbc/uk/episodes/2zGq26Vcv1fCWhddC4JAwETXWe/',
+ 'info_dict': {
+ 'id': '2zGq26Vcv1fCWhddC4JAwETXWe',
+ 'ext': 'm4a',
+ 'timestamp': 1682056800,
+ 'series': 'Nick Ferrari',
+ 'thumbnail': 'md5:4df24d8a226f5b2508efbcc6ae874ebf',
+ 'upload_date': '20230421',
+ 'series_id': '46vyD7z',
+ 'description': 'Nick Ferrari At Breakfast is Leading Britain\'s Conversation.',
+ 'title': 'Nick Ferrari',
+ 'duration': 10800.0,
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id, podcast = self._match_valid_url(url).group('id', 'podcast')
+ props = self._get_page_props(url, video_id)
+ episode = props['podcastEpisode'] if podcast else props['catchupEpisode']
+
+ return self._extract_audio(
+ episode, traverse_obj(episode, 'podcast', 'show', expected_type=dict) or {})
+
+
+class GlobalPlayerVideoIE(GlobalPlayerBaseIE):
+ _VALID_URL = r'https?://www\.globalplayer\.com/videos/(?P<id>\w+)'
+ _TESTS = [{
+ 'url': 'https://www.globalplayer.com/videos/2JsSZ7Gm2uP/',
+ 'info_dict': {
+ 'id': '2JsSZ7Gm2uP',
+ 'ext': 'mp4',
+ 'description': 'md5:6a9f063c67c42f218e42eee7d0298bfd',
+ 'thumbnail': 'md5:d4498af48e15aae4839ce77b97d39550',
+ 'upload_date': '20230420',
+ 'title': 'Treble Malakai Bayoh sings a sublime Handel aria at Classic FM Live',
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ meta = self._get_page_props(url, video_id)['videoData']
+
+ return {
+ 'id': video_id,
+ **traverse_obj(meta, {
+ 'url': 'url',
+ 'thumbnail': ('image', 'url'),
+ 'title': 'title',
+ 'upload_date': ('publish_date', {unified_strdate}),
+ 'description': 'description',
+ }),
+ }
diff --git a/hypervideo_dl/extractor/globo.py b/hypervideo_dl/extractor/globo.py
index a7be2cb..df98f09 100644
--- a/hypervideo_dl/extractor/globo.py
+++ b/hypervideo_dl/extractor/globo.py
@@ -8,8 +8,8 @@ from .common import InfoExtractor
from ..compat import (
compat_str,
)
+from ..networking import HEADRequest
from ..utils import (
- HEADRequest,
ExtractorError,
float_or_none,
orderedSet,
diff --git a/hypervideo_dl/extractor/gmanetwork.py b/hypervideo_dl/extractor/gmanetwork.py
new file mode 100644
index 0000000..62fff4e
--- /dev/null
+++ b/hypervideo_dl/extractor/gmanetwork.py
@@ -0,0 +1,83 @@
+from .common import InfoExtractor
+from .dailymotion import DailymotionIE
+from .youtube import YoutubeIE
+
+
+class GMANetworkVideoIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www)\.gmanetwork\.com/(?:\w+/){3}(?P<id>\d+)/(?P<display_id>[\w-]+)/video'
+ _TESTS = [{
+ 'url': 'https://www.gmanetwork.com/fullepisodes/home/running_man_philippines/168677/running-man-philippines-catch-the-thief-full-chapter-2/video?section=home',
+ 'info_dict': {
+ 'id': '28BqW0AXPe0',
+ 'ext': 'mp4',
+ 'upload_date': '20220919',
+ 'uploader_url': 'http://www.youtube.com/channel/UChsoPNR5x-wdSO2GrOSIWqQ',
+ 'like_count': int,
+ 'view_count': int,
+ 'uploader': 'YoüLOL',
+ 'channel_id': 'UChsoPNR5x-wdSO2GrOSIWqQ',
+ 'duration': 5313,
+ 'comment_count': int,
+ 'tags': 'count:22',
+ 'uploader_id': 'UChsoPNR5x-wdSO2GrOSIWqQ',
+ 'title': 'Running Man Philippines: Catch the Thief (FULL CHAPTER 2)',
+ 'channel_url': 'https://www.youtube.com/channel/UChsoPNR5x-wdSO2GrOSIWqQ',
+ 'thumbnail': 'https://i.ytimg.com/vi/28BqW0AXPe0/maxresdefault.jpg',
+ 'release_timestamp': 1663594212,
+ 'age_limit': 0,
+ 'channel_follower_count': int,
+ 'categories': ['Entertainment'],
+ 'description': 'md5:811bdcea74f9c48051824e494756e926',
+ 'live_status': 'not_live',
+ 'playable_in_embed': True,
+ 'channel': 'YoüLOL',
+ 'availability': 'public',
+ 'release_date': '20220919',
+ }
+ }, {
+ 'url': 'https://www.gmanetwork.com/fullepisodes/home/more_than_words/87059/more-than-words-full-episode-80/video?section=home',
+ 'info_dict': {
+ 'id': 'yiDOExw2aSA',
+ 'ext': 'mp4',
+ 'live_status': 'not_live',
+ 'channel': 'GMANetwork',
+ 'like_count': int,
+ 'channel_follower_count': int,
+ 'description': 'md5:6d00cd658394fa1a5071200d3ed4be05',
+ 'duration': 1419,
+ 'age_limit': 0,
+ 'comment_count': int,
+ 'upload_date': '20181003',
+ 'thumbnail': 'https://i.ytimg.com/vi_webp/yiDOExw2aSA/maxresdefault.webp',
+ 'availability': 'public',
+ 'playable_in_embed': True,
+ 'channel_id': 'UCKL5hAuzgFQsyrsQKgU0Qng',
+ 'title': 'More Than Words: Full Episode 80 (Finale)',
+ 'uploader_id': 'GMANETWORK',
+ 'categories': ['Entertainment'],
+ 'uploader': 'GMANetwork',
+ 'channel_url': 'https://www.youtube.com/channel/UCKL5hAuzgFQsyrsQKgU0Qng',
+ 'tags': 'count:29',
+ 'view_count': int,
+ 'uploader_url': 'http://www.youtube.com/user/GMANETWORK',
+ }
+ }]
+
+ def _real_extract(self, url):
+ content_id, display_id = self._match_valid_url(url).group('id', 'display_id')
+ webpage = self._download_webpage(url, display_id)
+ # webpage route
+ youtube_id = self._search_regex(
+ r'var\s*YOUTUBE_VIDEO\s*=\s*[\'"]+(?P<yt_id>[\w-]+)', webpage, 'youtube_id', fatal=False)
+ if youtube_id:
+ return self.url_result(youtube_id, YoutubeIE, youtube_id)
+
+ # api call route
+ # more info at https://aphrodite.gmanetwork.com/fullepisodes/assets/fullepisodes/js/dist/fullepisodes_video.js?v=1.1.11
+ network_url = self._search_regex(
+ r'NETWORK_URL\s*=\s*[\'"](?P<url>[^\'"]+)', webpage, 'network_url')
+ json_data = self._download_json(f'{network_url}api/data/content/video/{content_id}', display_id)
+ if json_data.get('video_file'):
+ return self.url_result(json_data['video_file'], YoutubeIE, json_data['video_file'])
+ else:
+ return self.url_result(json_data['dailymotion_file'], DailymotionIE, json_data['dailymotion_file'])
diff --git a/hypervideo_dl/extractor/googledrive.py b/hypervideo_dl/extractor/googledrive.py
index e027ea7..2fdec20 100644
--- a/hypervideo_dl/extractor/googledrive.py
+++ b/hypervideo_dl/extractor/googledrive.py
@@ -3,9 +3,11 @@ import re
from .common import InfoExtractor
from ..compat import compat_parse_qs
from ..utils import (
- determine_ext,
ExtractorError,
+ determine_ext,
+ extract_attributes,
get_element_by_class,
+ get_element_html_by_id,
int_or_none,
lowercase_escape,
try_get,
@@ -34,6 +36,7 @@ class GoogleDriveIE(InfoExtractor):
'ext': 'mp4',
'title': 'Big Buck Bunny.mp4',
'duration': 45,
+ 'thumbnail': 'https://drive.google.com/thumbnail?id=0ByeS4oOUV-49Zzh4R1J6R09zazQ',
}
}, {
# video can't be watched anonymously due to view count limit reached,
@@ -163,15 +166,13 @@ class GoogleDriveIE(InfoExtractor):
video_id = self._match_id(url)
video_info = compat_parse_qs(self._download_webpage(
'https://drive.google.com/get_video_info',
- video_id, query={'docid': video_id}))
+ video_id, 'Downloading video webpage', query={'docid': video_id}))
def get_value(key):
return try_get(video_info, lambda x: x[key][0])
reason = get_value('reason')
title = get_value('title')
- if not title and reason:
- raise ExtractorError(reason, expected=True)
formats = []
fmt_stream_map = (get_value('fmt_stream_map') or '').split(',')
@@ -209,20 +210,25 @@ class GoogleDriveIE(InfoExtractor):
'export': 'download',
})
- def request_source_file(source_url, kind):
+ def request_source_file(source_url, kind, data=None):
return self._request_webpage(
source_url, video_id, note='Requesting %s file' % kind,
- errnote='Unable to request %s file' % kind, fatal=False)
+ errnote='Unable to request %s file' % kind, fatal=False, data=data)
urlh = request_source_file(source_url, 'source')
if urlh:
def add_source_format(urlh):
+ nonlocal title
+ if not title:
+ title = self._search_regex(
+ r'\bfilename="([^"]+)"', urlh.headers.get('Content-Disposition'),
+ 'title', default=None)
formats.append({
# Use redirect URLs as download URLs in order to calculate
# correct cookies in _calc_cookies.
# Using original URLs may result in redirect loop due to
# google.com's cookies mistakenly used for googleusercontent.com
# redirect URLs (see #23919).
- 'url': urlh.geturl(),
+ 'url': urlh.url,
'ext': determine_ext(title, 'mp4').lower(),
'format_id': 'source',
'quality': 1,
@@ -234,14 +240,10 @@ class GoogleDriveIE(InfoExtractor):
urlh, url, video_id, note='Downloading confirmation page',
errnote='Unable to confirm download', fatal=False)
if confirmation_webpage:
- confirm = self._search_regex(
- r'confirm=([^&"\']+)', confirmation_webpage,
- 'confirmation code', default=None)
- if confirm:
- confirmed_source_url = update_url_query(source_url, {
- 'confirm': confirm,
- })
- urlh = request_source_file(confirmed_source_url, 'confirmed source')
+ confirmed_source_url = extract_attributes(
+ get_element_html_by_id('download-form', confirmation_webpage) or '').get('action')
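+                # confirmation is now done by submitting the 'download-form' (POST with an empty body) instead of a confirm= token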
+ if confirmed_source_url:
+ urlh = request_source_file(confirmed_source_url, 'confirmed source', data=b'')
if urlh and urlh.headers.get('Content-Disposition'):
add_source_format(urlh)
else:
@@ -251,7 +253,10 @@ class GoogleDriveIE(InfoExtractor):
or 'unable to extract confirmation code')
if not formats and reason:
- self.raise_no_formats(reason, expected=True)
+ if title:
+ self.raise_no_formats(reason, expected=True)
+ else:
+ raise ExtractorError(reason, expected=True)
hl = get_value('hl')
subtitles_id = None
diff --git a/hypervideo_dl/extractor/goplay.py b/hypervideo_dl/extractor/goplay.py
index 2882b49..960d7d7 100644
--- a/hypervideo_dl/extractor/goplay.py
+++ b/hypervideo_dl/extractor/goplay.py
@@ -76,11 +76,11 @@ class GoPlayIE(InfoExtractor):
}
api = self._download_json(
- f'https://api.viervijfzes.be/content/{video_id}',
- video_id, headers={'Authorization': self._id_token})
+ f'https://api.goplay.be/web/v1/videos/long-form/{video_id}',
+ video_id, headers={'Authorization': 'Bearer %s' % self._id_token})
formats, subs = self._extract_m3u8_formats_and_subtitles(
- api['video']['S'], video_id, ext='mp4', m3u8_id='HLS')
+ api['manifestUrls']['hls'], video_id, ext='mp4', m3u8_id='HLS')
info_dict.update({
'id': video_id,
diff --git a/hypervideo_dl/extractor/gronkh.py b/hypervideo_dl/extractor/gronkh.py
index b9370e3..1ae0a68 100644
--- a/hypervideo_dl/extractor/gronkh.py
+++ b/hypervideo_dl/extractor/gronkh.py
@@ -3,6 +3,7 @@ import functools
from .common import InfoExtractor
from ..utils import (
OnDemandPagedList,
+ float_or_none,
traverse_obj,
unified_strdate,
)
@@ -19,7 +20,9 @@ class GronkhIE(InfoExtractor):
'title': 'H.O.R.D.E. - DAS ZWEiTE ZEiTALTER 🎲 Session 1',
'view_count': int,
'thumbnail': 'https://01.cdn.vod.farm/preview/9e2555d3a23bf4e5c5b7c6b3b70a9d84.jpg',
- 'upload_date': '20221111'
+ 'upload_date': '20221111',
+ 'chapters': 'count:3',
+ 'duration': 31463,
},
'params': {'skip_download': True}
}, {
@@ -30,7 +33,8 @@ class GronkhIE(InfoExtractor):
'title': 'GTV0536, 2021-10-01 - MARTHA IS DEAD #FREiAB1830 !FF7 !horde !archiv',
'view_count': int,
'thumbnail': 'https://01.cdn.vod.farm/preview/6436746cce14e25f751260a692872b9b.jpg',
- 'upload_date': '20211001'
+ 'upload_date': '20211001',
+ 'duration': 32058,
},
'params': {'skip_download': True}
}, {
@@ -56,6 +60,12 @@ class GronkhIE(InfoExtractor):
'upload_date': unified_strdate(data_json.get('created_at')),
'formats': formats,
'subtitles': subtitles,
+ 'duration': float_or_none(data_json.get('source_length')),
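+        # keep only chapters whose 'offset' parses to a number; it becomes each chapter's start_time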
+ 'chapters': traverse_obj(data_json, (
+ 'chapters', lambda _, v: float_or_none(v['offset']) is not None, {
+ 'title': 'title',
+ 'start_time': ('offset', {float_or_none}),
+ })) or None,
}
diff --git a/hypervideo_dl/extractor/hidive.py b/hypervideo_dl/extractor/hidive.py
index 3a53f2c..df6868d 100644
--- a/hypervideo_dl/extractor/hidive.py
+++ b/hypervideo_dl/extractor/hidive.py
@@ -1,5 +1,3 @@
-import re
-
from .common import InfoExtractor
from ..utils import (
ExtractorError,
@@ -39,15 +37,28 @@ class HiDiveIE(InfoExtractor):
form = self._search_regex(
r'(?s)<form[^>]+action="/account/login"[^>]*>(.+?)</form>',
webpage, 'login form', default=None)
- if not form: # logged in
+ if not form:
return
data = self._hidden_inputs(form)
data.update({
'Email': username,
'Password': password,
})
- self._download_webpage(
+ login_webpage = self._download_webpage(
self._LOGIN_URL, None, 'Logging in', data=urlencode_postdata(data))
+        # If the user has multiple profiles on their account, select one. For now, pick the first profile.
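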
+ profile_id = self._search_regex(
+ r'<button [^>]+?data-profile-id="(\w+)"', login_webpage, 'profile id', default=None)
+ if profile_id is None:
+ return # If only one profile, Hidive auto-selects it
+ self._request_webpage(
+ 'https://www.hidive.com/ajax/chooseprofile', None,
+ data=urlencode_postdata({
+ 'profileId': profile_id,
+ 'hash': self._search_regex(
+                    r'<button [^>]+?data-hash="(\w+)"', login_webpage, 'profile id hash'),
+ 'returnUrl': '/dashboard'
+ }))
def _call_api(self, video_id, title, key, data={}, **kwargs):
data = {
@@ -60,26 +71,6 @@ class HiDiveIE(InfoExtractor):
'https://www.hidive.com/play/settings', video_id,
data=urlencode_postdata(data), **kwargs) or {}
- def _extract_subtitles_from_rendition(self, rendition, subtitles, parsed_urls):
- for cc_file in rendition.get('ccFiles', []):
- cc_url = url_or_none(try_get(cc_file, lambda x: x[2]))
- # name is used since we cant distinguish subs with same language code
- cc_lang = try_get(cc_file, (lambda x: x[1].replace(' ', '-').lower(), lambda x: x[0]), str)
- if cc_url not in parsed_urls and cc_lang:
- parsed_urls.add(cc_url)
- subtitles.setdefault(cc_lang, []).append({'url': cc_url})
-
- def _get_subtitles(self, url, video_id, title, key, parsed_urls):
- webpage = self._download_webpage(url, video_id, fatal=False) or ''
- subtitles = {}
- for caption in set(re.findall(r'data-captions=\"([^\"]+)\"', webpage)):
- renditions = self._call_api(
- video_id, title, key, {'Captions': caption}, fatal=False,
- note=f'Downloading {caption} subtitle information').get('renditions') or {}
- for rendition_id, rendition in renditions.items():
- self._extract_subtitles_from_rendition(rendition, subtitles, parsed_urls)
- return subtitles
-
def _real_extract(self, url):
video_id, title, key = self._match_valid_url(url).group('id', 'title', 'key')
settings = self._call_api(video_id, title, key)
@@ -104,10 +95,20 @@ class HiDiveIE(InfoExtractor):
f['format_note'] = f'{version}, {extra}'
formats.extend(frmt)
+ subtitles = {}
+ for rendition_id, rendition in settings['renditions'].items():
+ audio, version, extra = rendition_id.split('_')
+ for cc_file in rendition.get('ccFiles') or []:
+ cc_url = url_or_none(try_get(cc_file, lambda x: x[2]))
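+            # name is used since we can't distinguish subs with the same language code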
+ cc_lang = try_get(cc_file, (lambda x: x[1].replace(' ', '-').lower(), lambda x: x[0]), str)
+ if cc_url not in parsed_urls and cc_lang:
+ parsed_urls.add(cc_url)
+ subtitles.setdefault(cc_lang, []).append({'url': cc_url})
+
return {
'id': video_id,
'title': video_id,
- 'subtitles': self.extract_subtitles(url, video_id, title, key, parsed_urls),
+ 'subtitles': subtitles,
'formats': formats,
'series': title,
'season_number': int_or_none(
diff --git a/hypervideo_dl/extractor/hketv.py b/hypervideo_dl/extractor/hketv.py
index 1087956..e026996 100644
--- a/hypervideo_dl/extractor/hketv.py
+++ b/hypervideo_dl/extractor/hketv.py
@@ -126,7 +126,7 @@ class HKETVIE(InfoExtractor):
# If we ever wanted to provide the final resolved URL that
# does not require cookies, albeit with a shorter lifespan:
# urlh = self._downloader.urlopen(file_url)
- # resolved_url = urlh.geturl()
+ # resolved_url = urlh.url
label = fmt.get('label')
h = self._FORMAT_HEIGHTS.get(label)
w = h * width // height if h and width and height else None
diff --git a/hypervideo_dl/extractor/hollywoodreporter.py b/hypervideo_dl/extractor/hollywoodreporter.py
new file mode 100644
index 0000000..1f7eb89
--- /dev/null
+++ b/hypervideo_dl/extractor/hollywoodreporter.py
@@ -0,0 +1,72 @@
+import functools
+import re
+
+from .common import InfoExtractor
+from .jwplatform import JWPlatformIE
+from ..utils import (
+ ExtractorError,
+ OnDemandPagedList,
+ extract_attributes,
+ get_element_by_class,
+ get_element_html_by_class,
+)
+
+
+class HollywoodReporterIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?hollywoodreporter\.com/video/(?P<id>[\w-]+)'
+ _TESTS = [{
+ 'url': 'https://www.hollywoodreporter.com/video/chris-pine-michelle-rodriguez-dungeons-dragons-cast-directors-on-what-it-took-to-make-film-sxsw-2023/',
+ 'info_dict': {
+ 'id': 'zH4jZaR5',
+ 'ext': 'mp4',
+ 'title': 'md5:a9a1c073770a32f178955997712c4bd9',
+ 'description': 'The cast and directors of \'Dungeons & Dragons: Honor Among Thieves\' talk about their new film.',
+ 'thumbnail': 'https://cdn.jwplayer.com/v2/media/zH4jZaR5/poster.jpg?width=720',
+ 'upload_date': '20230312',
+ 'timestamp': 1678586423,
+ 'duration': 242.0,
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+
+ data = extract_attributes(get_element_html_by_class('vlanding-video-card__link', webpage) or '')
+ video_id = data['data-video-showcase-trigger']
+ showcase_type = data['data-video-showcase-type']
+
+ if showcase_type == 'jwplayer':
+ return self.url_result(f'jwplatform:{video_id}', JWPlatformIE)
+ elif showcase_type == 'youtube':
+ return self.url_result(video_id, 'Youtube')
+ else:
+ raise ExtractorError(f'Unsupported showcase type "{showcase_type}"')
+
+
+class HollywoodReporterPlaylistIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?hollywoodreporter\.com/vcategory/(?P<slug>[\w-]+)-(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://www.hollywoodreporter.com/vcategory/heat-vision-breakdown-57822/',
+ 'playlist_mincount': 109,
+ 'info_dict': {
+ 'id': '57822',
+ 'title': 'heat-vision-breakdown',
+ }
+ }]
+
+ def _fetch_page(self, slug, pl_id, page):
+ page += 1
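+        # OnDemandPagedList passes 0-based page numbers; the site paginates from 1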
+ webpage = self._download_webpage(
+ f'https://www.hollywoodreporter.com/vcategory/{slug}-{pl_id}/page/{page}/',
+ pl_id, note=f'Downloading playlist page {page}')
+ section = get_element_by_class('video-playlist-river', webpage) or ''
+
+ for url in re.findall(r'<a[^>]+href="([^"]+)"[^>]+class="c-title__link', section):
+ yield self.url_result(url, HollywoodReporterIE)
+
+ def _real_extract(self, url):
+ slug, pl_id = self._match_valid_url(url).group('slug', 'id')
+ return self.playlist_result(
+ OnDemandPagedList(functools.partial(self._fetch_page, slug, pl_id), 15), pl_id, slug)
diff --git a/hypervideo_dl/extractor/hotnewhiphop.py b/hypervideo_dl/extractor/hotnewhiphop.py
index f8570cb..3007fbb 100644
--- a/hypervideo_dl/extractor/hotnewhiphop.py
+++ b/hypervideo_dl/extractor/hotnewhiphop.py
@@ -1,11 +1,7 @@
from .common import InfoExtractor
from ..compat import compat_b64decode
-from ..utils import (
- ExtractorError,
- HEADRequest,
- sanitized_Request,
- urlencode_postdata,
-)
+from ..networking import HEADRequest, Request
+from ..utils import ExtractorError, urlencode_postdata
class HotNewHipHopIE(InfoExtractor):
@@ -36,9 +32,9 @@ class HotNewHipHopIE(InfoExtractor):
('mediaType', 's'),
('mediaId', video_id),
])
- r = sanitized_Request(
+ r = Request(
'http://www.hotnewhiphop.com/ajax/media/getActions/', data=reqdata)
- r.add_header('Content-Type', 'application/x-www-form-urlencoded')
+ r.headers['Content-Type'] = 'application/x-www-form-urlencoded'
mkd = self._download_json(
r, video_id, note='Requesting media key',
errnote='Could not download media key')
@@ -50,7 +46,7 @@ class HotNewHipHopIE(InfoExtractor):
req = self._request_webpage(
redirect_req, video_id,
note='Resolving final URL', errnote='Could not resolve final URL')
- video_url = req.geturl()
+ video_url = req.url
if video_url.endswith('.html'):
raise ExtractorError('Redirect failed')
diff --git a/hypervideo_dl/extractor/hotstar.py b/hypervideo_dl/extractor/hotstar.py
index 61eec7b..02183ad 100644
--- a/hypervideo_dl/extractor/hotstar.py
+++ b/hypervideo_dl/extractor/hotstar.py
@@ -6,7 +6,8 @@ import time
import uuid
from .common import InfoExtractor
-from ..compat import compat_HTTPError, compat_str
+from ..compat import compat_str
+from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
determine_ext,
@@ -83,7 +84,7 @@ class HotStarIE(HotStarBaseIE):
_VALID_URL = r'''(?x)
https?://(?:www\.)?hotstar\.com(?:/in)?/(?!in/)
(?:
- (?P<type>movies|sports|episode|(?P<tv>tv))/
+ (?P<type>movies|sports|clips|episode|(?P<tv>tv|shows))/
(?(tv)(?:[^/?#]+/){2}|[^?#]*)
)?
[^/?#]+/
@@ -123,6 +124,70 @@ class HotStarIE(HotStarBaseIE):
'episode_number': 8,
}
}, {
+ 'url': 'https://www.hotstar.com/in/shows/anupama/1260022017/anupama-anuj-share-a-moment/1000282843',
+ 'info_dict': {
+ 'id': '1000282843',
+ 'ext': 'mp4',
+ 'title': 'Anupama, Anuj Share a Moment',
+ 'season': 'Chapter 1',
+ 'description': 'md5:8d74ed2248423b8b06d5c8add4d7a0c0',
+ 'timestamp': 1678149000,
+ 'channel': 'StarPlus',
+ 'series': 'Anupama',
+ 'season_number': 1,
+ 'season_id': 7399,
+ 'upload_date': '20230307',
+ 'episode': 'Anupama, Anuj Share a Moment',
+ 'episode_number': 853,
+ 'duration': 1272,
+ 'channel_id': 3,
+ },
+ 'skip': 'HTTP Error 504: Gateway Time-out', # XXX: Investigate 504 errors on some episodes
+ }, {
+ 'url': 'https://www.hotstar.com/in/shows/kana-kaanum-kaalangal/1260097087/back-to-school/1260097320',
+ 'info_dict': {
+ 'id': '1260097320',
+ 'ext': 'mp4',
+ 'title': 'Back To School',
+ 'season': 'Chapter 1',
+ 'description': 'md5:b0d6a4c8a650681491e7405496fc7e13',
+ 'timestamp': 1650564000,
+ 'channel': 'Hotstar Specials',
+ 'series': 'Kana Kaanum Kaalangal',
+ 'season_number': 1,
+ 'season_id': 9441,
+ 'upload_date': '20220421',
+ 'episode': 'Back To School',
+ 'episode_number': 1,
+ 'duration': 1810,
+ 'channel_id': 54,
+ },
+ }, {
+ 'url': 'https://www.hotstar.com/in/clips/e3-sairat-kahani-pyaar-ki/1000262286',
+ 'info_dict': {
+ 'id': '1000262286',
+ 'ext': 'mp4',
+ 'title': 'E3 - SaiRat, Kahani Pyaar Ki',
+ 'description': 'md5:e3b4b3203bc0c5396fe7d0e4948a6385',
+ 'episode': 'E3 - SaiRat, Kahani Pyaar Ki',
+ 'upload_date': '20210606',
+ 'timestamp': 1622943900,
+ 'duration': 5395,
+ },
+ }, {
+ 'url': 'https://www.hotstar.com/in/movies/premam/1000091195',
+ 'info_dict': {
+ 'id': '1000091195',
+ 'ext': 'mp4',
+ 'title': 'Premam',
+ 'release_year': 2015,
+ 'description': 'md5:d833c654e4187b5e34757eafb5b72d7f',
+ 'timestamp': 1462149000,
+ 'upload_date': '20160502',
+ 'episode': 'Premam',
+ 'duration': 8994,
+ },
+ }, {
'url': 'https://www.hotstar.com/movies/radha-gopalam/1000057157',
'only_matching': True,
}, {
@@ -139,6 +204,8 @@ class HotStarIE(HotStarBaseIE):
'sports': 'match',
'episode': 'episode',
'tv': 'episode',
+ 'shows': 'episode',
+ 'clips': 'content',
None: 'content',
}
@@ -148,6 +215,12 @@ class HotStarIE(HotStarBaseIE):
'dr': 'dynamic_range',
}
+ _TAG_FIELDS = {
+ 'language': 'language',
+ 'acodec': 'audio_codec',
+ 'vcodec': 'video_codec',
+ }
+
@classmethod
def _video_url(cls, video_id, video_type=None, *, slug='ignore_me', root=None):
assert None in (video_type, root)
@@ -160,8 +233,10 @@ class HotStarIE(HotStarBaseIE):
video_type = self._TYPE.get(video_type, video_type)
cookies = self._get_cookies(url) # Cookies before any request
- video_data = self._call_api_v1(f'{video_type}/detail', video_id,
- query={'tas': 10000, 'contentId': video_id})['body']['results']['item']
+ video_data = traverse_obj(
+ self._call_api_v1(
+ f'{video_type}/detail', video_id, fatal=False, query={'tas': 10000, 'contentId': video_id}),
+ ('body', 'results', 'item', {dict})) or {}
if not self.get_param('allow_unplayable_formats') and video_data.get('drmProtected'):
self.report_drm(video_id)
@@ -182,24 +257,22 @@ class HotStarIE(HotStarBaseIE):
for key, prefix in self._IGNORE_MAP.items()
for ignore in self._configuration_arg(key)):
continue
+ tag_dict = dict((t.split(':', 1) + [None])[:2] for t in tags.split(';'))
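+            # tagsCombination is a ';'-separated list of 'key:value' pairs; valueless tags map to None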
format_url = url_or_none(playback_set.get('playbackUrl'))
if not format_url:
continue
format_url = re.sub(r'(?<=//staragvod)(\d)', r'web\1', format_url)
- dr = re.search(r'dynamic_range:(?P<dr>[a-z]+)', playback_set.get('tagsCombination')).group('dr')
ext = determine_ext(format_url)
current_formats, current_subs = [], {}
try:
if 'package:hls' in tags or ext == 'm3u8':
current_formats, current_subs = self._extract_m3u8_formats_and_subtitles(
- format_url, video_id, 'mp4',
- entry_protocol='m3u8_native',
- m3u8_id=f'{dr}-hls', headers=headers)
+ format_url, video_id, ext='mp4', headers=headers)
elif 'package:dash' in tags or ext == 'mpd':
current_formats, current_subs = self._extract_mpd_formats_and_subtitles(
- format_url, video_id, mpd_id=f'{dr}-dash', headers=headers)
+ format_url, video_id, headers=headers)
elif ext == 'f4m':
pass # XXX: produce broken files
else:
@@ -209,24 +282,36 @@ class HotStarIE(HotStarBaseIE):
'height': int_or_none(playback_set.get('height')),
}]
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 403:
geo_restricted = True
continue
- if tags and 'encryption:plain' not in tags:
+ if tag_dict.get('encryption') not in ('plain', None):
for f in current_formats:
f['has_drm'] = True
- if tags and 'language' in tags:
- lang = re.search(r'language:(?P<lang>[a-z]+)', tags).group('lang')
- for f in current_formats:
- if not f.get('langauge'):
- f['language'] = lang
+ for f in current_formats:
+ for k, v in self._TAG_FIELDS.items():
+ if not f.get(k):
+ f[k] = tag_dict.get(v)
+ if f.get('vcodec') != 'none' and not f.get('dynamic_range'):
+ f['dynamic_range'] = tag_dict.get('dynamic_range')
+ if f.get('acodec') != 'none' and not f.get('audio_channels'):
+ f['audio_channels'] = {
+ 'stereo': 2,
+ 'dolby51': 6,
+ }.get(tag_dict.get('audio_channel'))
+ f['format_note'] = join_nonempty(
+ tag_dict.get('ladder'),
+ tag_dict.get('audio_channel') if f.get('acodec') != 'none' else None,
+ f.get('format_note'),
+ delim=', ')
formats.extend(current_formats)
subs = self._merge_subtitles(subs, current_subs)
if not formats and geo_restricted:
self.raise_geo_restricted(countries=['IN'], metadata_available=True)
+ self._remove_duplicate_formats(formats)
for f in formats:
f.setdefault('http_headers', {}).update(headers)
@@ -235,7 +320,8 @@ class HotStarIE(HotStarBaseIE):
'title': video_data.get('title'),
'description': video_data.get('description'),
'duration': int_or_none(video_data.get('duration')),
- 'timestamp': int_or_none(video_data.get('broadcastDate') or video_data.get('startDate')),
+ 'timestamp': int_or_none(traverse_obj(video_data, 'broadcastDate', 'startDate')),
+ 'release_year': int_or_none(video_data.get('year')),
'formats': formats,
'subtitles': subs,
'channel': video_data.get('channelName'),
@@ -288,7 +374,7 @@ class HotStarPrefixIE(InfoExtractor):
class HotStarPlaylistIE(HotStarBaseIE):
IE_NAME = 'hotstar:playlist'
- _VALID_URL = r'https?://(?:www\.)?hotstar\.com(?:/in)?/tv(?:/[^/]+){2}/list/[^/]+/t-(?P<id>\w+)'
+ _VALID_URL = r'https?://(?:www\.)?hotstar\.com(?:/in)?/(?:tv|shows)(?:/[^/]+){2}/list/[^/]+/t-(?P<id>\w+)'
_TESTS = [{
'url': 'https://www.hotstar.com/tv/savdhaan-india/s-26/list/popular-clips/t-3_2_26',
'info_dict': {
@@ -296,6 +382,9 @@ class HotStarPlaylistIE(HotStarBaseIE):
},
'playlist_mincount': 20,
}, {
+ 'url': 'https://www.hotstar.com/shows/savdhaan-india/s-26/list/popular-clips/t-3_2_26',
+ 'only_matching': True,
+ }, {
'url': 'https://www.hotstar.com/tv/savdhaan-india/s-26/list/extras/t-2480',
'only_matching': True,
}, {
@@ -311,7 +400,7 @@ class HotStarPlaylistIE(HotStarBaseIE):
class HotStarSeasonIE(HotStarBaseIE):
IE_NAME = 'hotstar:season'
- _VALID_URL = r'(?P<url>https?://(?:www\.)?hotstar\.com(?:/in)?/tv/[^/]+/\w+)/seasons/[^/]+/ss-(?P<id>\w+)'
+ _VALID_URL = r'(?P<url>https?://(?:www\.)?hotstar\.com(?:/in)?/(?:tv|shows)/[^/]+/\w+)/seasons/[^/]+/ss-(?P<id>\w+)'
_TESTS = [{
'url': 'https://www.hotstar.com/tv/radhakrishn/1260000646/seasons/season-2/ss-8028',
'info_dict': {
@@ -330,6 +419,9 @@ class HotStarSeasonIE(HotStarBaseIE):
'id': '8208',
},
'playlist_mincount': 19,
+ }, {
+ 'url': 'https://www.hotstar.com/in/shows/bigg-boss/14714/seasons/season-4/ss-8208/',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -340,7 +432,7 @@ class HotStarSeasonIE(HotStarBaseIE):
class HotStarSeriesIE(HotStarBaseIE):
IE_NAME = 'hotstar:series'
- _VALID_URL = r'(?P<url>https?://(?:www\.)?hotstar\.com(?:/in)?/tv/[^/]+/(?P<id>\d+))/?(?:[#?]|$)'
+ _VALID_URL = r'(?P<url>https?://(?:www\.)?hotstar\.com(?:/in)?/(?:tv|shows)/[^/]+/(?P<id>\d+))/?(?:[#?]|$)'
_TESTS = [{
'url': 'https://www.hotstar.com/in/tv/radhakrishn/1260000646',
'info_dict': {
@@ -359,6 +451,12 @@ class HotStarSeriesIE(HotStarBaseIE):
'id': '435',
},
'playlist_mincount': 267,
+ }, {
+ 'url': 'https://www.hotstar.com/in/shows/anupama/1260022017/',
+ 'info_dict': {
+ 'id': '1260022017',
+ },
+ 'playlist_mincount': 940,
}]
def _real_extract(self, url):
diff --git a/hypervideo_dl/extractor/hrefli.py b/hypervideo_dl/extractor/hrefli.py
new file mode 100644
index 0000000..77db2ea
--- /dev/null
+++ b/hypervideo_dl/extractor/hrefli.py
@@ -0,0 +1,15 @@
+from .common import InfoExtractor
+
+
+class HrefLiRedirectIE(InfoExtractor):
+ IE_NAME = 'href.li'
+ IE_DESC = False # Do not list
+ _VALID_URL = r'https?://href\.li/\?(?P<url>.+)'
+
+ _TESTS = [{
+ 'url': 'https://href.li/?https://www.reddit.com/r/cats/comments/12bluel/my_cat_helps_me_with_water/?utm_source=share&utm_medium=android_app&utm_name=androidcss&utm_term=1&utm_content=share_button',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ return self.url_result(self._match_valid_url(url).group('url'))
diff --git a/hypervideo_dl/extractor/hrti.py b/hypervideo_dl/extractor/hrti.py
index cfec80d..57b76e4 100644
--- a/hypervideo_dl/extractor/hrti.py
+++ b/hypervideo_dl/extractor/hrti.py
@@ -1,13 +1,13 @@
import json
from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..networking import Request
+from ..networking.exceptions import HTTPError
from ..utils import (
clean_html,
ExtractorError,
int_or_none,
parse_age_limit,
- sanitized_Request,
try_get,
)
@@ -42,7 +42,7 @@ class HRTiBaseIE(InfoExtractor):
'application_version': self._APP_VERSION
}
- req = sanitized_Request(self._API_URL, data=json.dumps(app_data).encode('utf-8'))
+ req = Request(self._API_URL, data=json.dumps(app_data).encode('utf-8'))
req.get_method = lambda: 'PUT'
resources = self._download_json(
@@ -73,8 +73,8 @@ class HRTiBaseIE(InfoExtractor):
self._login_url, None, note='Logging in', errnote='Unable to log in',
data=json.dumps(auth_data).encode('utf-8'))
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 406:
- auth_info = self._parse_json(e.cause.read().encode('utf-8'), None)
+ if isinstance(e.cause, HTTPError) and e.cause.status == 406:
+ auth_info = self._parse_json(e.cause.response.read().encode('utf-8'), None)
else:
raise
diff --git a/hypervideo_dl/extractor/hungama.py b/hypervideo_dl/extractor/hungama.py
index 2e99396..cdec368 100644
--- a/hypervideo_dl/extractor/hungama.py
+++ b/hypervideo_dl/extractor/hungama.py
@@ -1,19 +1,32 @@
-import re
-
from .common import InfoExtractor
from ..utils import (
int_or_none,
+ remove_end,
+ traverse_obj,
try_get,
+ unified_timestamp,
+ url_or_none,
urlencode_postdata,
)
-class HungamaIE(InfoExtractor):
+class HungamaBaseIE(InfoExtractor):
+ def _call_api(self, path, content_id, fatal=False):
+ return traverse_obj(self._download_json(
+ f'https://cpage.api.hungama.com/v2/page/content/{content_id}/{path}/detail',
+ content_id, fatal=fatal, query={
+ 'device': 'web',
+ 'platform': 'a',
+ 'storeId': '1',
+ }), ('data', {dict})) or {}
+
+
+class HungamaIE(HungamaBaseIE):
_VALID_URL = r'''(?x)
https?://
- (?:www\.)?hungama\.com/
+ (?:www\.|un\.)?hungama\.com/
(?:
- (?:video|movie)/[^/]+/|
+ (?:video|movie|short-film)/[^/]+/|
tv-show/(?:[^/]+/){2}\d+/episode/[^/]+/
)
(?P<id>\d+)
@@ -25,13 +38,28 @@ class HungamaIE(InfoExtractor):
'id': '39349649',
'ext': 'mp4',
'title': 'Krishna Chants',
- 'description': 'Watch Krishna Chants video now. You can also watch other latest videos only at Hungama',
+ 'description': ' ',
'upload_date': '20180829',
'duration': 264,
'timestamp': 1535500800,
'view_count': int,
- 'thumbnail': 'https://images.hungama.com/c/1/0dc/2ca/39349649/39349649_700x394.jpg',
- }
+ 'thumbnail': 'https://images1.hungama.com/tr:n-a_169_m/c/1/0dc/2ca/39349649/39349649_350x197.jpg?v=8',
+ 'tags': 'count:6',
+ },
+ }, {
+ 'url': 'https://un.hungama.com/short-film/adira/102524179/',
+ 'md5': '2278463f5dc9db9054d0c02602d44666',
+ 'info_dict': {
+ 'id': '102524179',
+ 'ext': 'mp4',
+ 'title': 'Adira',
+ 'description': 'md5:df20cd4d41eabb33634f06de1025a4b4',
+ 'upload_date': '20230417',
+ 'timestamp': 1681689600,
+ 'view_count': int,
+ 'thumbnail': 'https://images1.hungama.com/tr:n-a_23_m/c/1/197/ac9/102524179/102524179_350x525.jpg?v=1',
+ 'tags': 'count:7',
+ },
}, {
'url': 'https://www.hungama.com/movie/kahaani-2/44129919/',
'only_matching': True,
@@ -51,14 +79,19 @@ class HungamaIE(InfoExtractor):
'c': 'common',
'm': 'get_video_mdn_url',
})
-
formats = self._extract_m3u8_formats(video_json['stream_url'], video_id, ext='mp4', m3u8_id='hls')
-
- json_ld = self._search_json_ld(
- self._download_webpage(url, video_id, fatal=False) or '', video_id, fatal=False)
+ metadata = self._call_api('movie', video_id)
return {
- **json_ld,
+ **traverse_obj(metadata, ('head', 'data', {
+ 'title': ('title', {str}),
+ 'description': ('misc', 'description', {str}),
+                'duration': ('duration', {int}),  # duration in the JSON is incorrect when it is a string
+ 'timestamp': ('releasedate', {unified_timestamp}),
+ 'view_count': ('misc', 'playcount', {int_or_none}),
+ 'thumbnail': ('image', {url_or_none}),
+ 'tags': ('misc', 'keywords', ..., {str}),
+ })),
'id': video_id,
'formats': formats,
'subtitles': {
@@ -71,10 +104,10 @@ class HungamaIE(InfoExtractor):
class HungamaSongIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?hungama\.com/song/[^/]+/(?P<id>\d+)'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.|un\.)?hungama\.com/song/[^/]+/(?P<id>\d+)'
+ _TESTS = [{
'url': 'https://www.hungama.com/song/kitni-haseen-zindagi/2931166/',
- 'md5': 'd4a6a05a394ad0453a9bea3ca00e6024',
+ 'md5': '964f46828e8b250aa35e5fdcfdcac367',
'info_dict': {
'id': '2931166',
'ext': 'mp3',
@@ -83,8 +116,22 @@ class HungamaSongIE(InfoExtractor):
'artist': 'Lucky Ali',
'album': None,
'release_year': 2000,
- }
- }
+ 'thumbnail': 'https://stat2.hungama.ind.in/assets/images/default_images/da-200x200.png',
+ },
+ }, {
+ 'url': 'https://un.hungama.com/song/tum-kya-mile-from-rocky-aur-rani-kii-prem-kahaani/103553672',
+ 'md5': '964f46828e8b250aa35e5fdcfdcac367',
+ 'info_dict': {
+ 'id': '103553672',
+ 'ext': 'mp3',
+ 'title': 'md5:5ebeb1e10771b634ce5f700ce68ae5f4',
+ 'track': 'Tum Kya Mile (From "Rocky Aur Rani Kii Prem Kahaani")',
+ 'artist': 'Pritam Chakraborty, Arijit Singh, Shreya Ghoshal, Amitabh Bhattacharya',
+ 'album': 'Tum Kya Mile (From "Rocky Aur Rani Kii Prem Kahaani")',
+ 'release_year': 2023,
+ 'thumbnail': 'https://images.hungama.com/c/1/7c2/c7b/103553671/103553671_200x200.jpg',
+ },
+ }]
def _real_extract(self, url):
audio_id = self._match_id(url)
@@ -122,8 +169,8 @@ class HungamaSongIE(InfoExtractor):
}
-class HungamaAlbumPlaylistIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?hungama\.com/(?:playlists|album)/[^/]+/(?P<id>\d+)'
+class HungamaAlbumPlaylistIE(HungamaBaseIE):
+ _VALID_URL = r'https?://(?:www\.|un\.)?hungama\.com/(?P<path>playlists|album)/[^/]+/(?P<id>\d+)'
_TESTS = [{
'url': 'https://www.hungama.com/album/bhuj-the-pride-of-india/69481490/',
'playlist_mincount': 7,
@@ -132,16 +179,24 @@ class HungamaAlbumPlaylistIE(InfoExtractor):
},
}, {
'url': 'https://www.hungama.com/playlists/hindi-jan-to-june-2021/123063/',
- 'playlist_mincount': 50,
+ 'playlist_mincount': 33,
'info_dict': {
'id': '123063',
},
+ }, {
+ 'url': 'https://un.hungama.com/album/what-jhumka-%3F-from-rocky-aur-rani-kii-prem-kahaani/103891805/',
+ 'playlist_mincount': 1,
+ 'info_dict': {
+ 'id': '103891805',
+ },
}]
def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
- ptrn = r'<meta[^>]+?property=[\"\']?music:song:url[\"\']?[^>]+?content=[\"\']?([^\"\']+)'
- items = re.findall(ptrn, webpage)
- entries = [self.url_result(item, ie=HungamaSongIE.ie_key()) for item in items]
- return self.playlist_result(entries, video_id)
+ playlist_id, path = self._match_valid_url(url).group('id', 'path')
+ data = self._call_api(remove_end(path, 's'), playlist_id, fatal=True)
+
+ def entries():
+ for song_url in traverse_obj(data, ('body', 'rows', ..., 'data', 'misc', 'share', {url_or_none})):
+ yield self.url_result(song_url, HungamaSongIE)
+
+ return self.playlist_result(entries(), playlist_id)
diff --git a/hypervideo_dl/extractor/huya.py b/hypervideo_dl/extractor/huya.py
index b6e9eec..c4965f9 100644
--- a/hypervideo_dl/extractor/huya.py
+++ b/hypervideo_dl/extractor/huya.py
@@ -1,5 +1,6 @@
import hashlib
import random
+import re
from ..compat import compat_urlparse, compat_b64decode
@@ -37,7 +38,7 @@ class HuyaLiveIE(InfoExtractor):
}]
_RESOLUTION = {
- '蓝光4M': {
+ '蓝光': {
'width': 1920,
'height': 1080,
},
@@ -76,11 +77,15 @@ class HuyaLiveIE(InfoExtractor):
if re_secret:
fm, ss = self.encrypt(params, stream_info, stream_name)
for si in stream_data.get('vMultiStreamInfo'):
+ display_name, bitrate = re.fullmatch(
+ r'(.+?)(?:(\d+)M)?', si.get('sDisplayName')).groups()
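+                # sDisplayName embeds the bitrate, e.g. '蓝光4M' -> ('蓝光', '4'); the name keys _RESOLUTION, the digits set tbr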
rate = si.get('iBitRate')
if rate:
params['ratio'] = rate
else:
params.pop('ratio', None)
+ if bitrate:
+ rate = int(bitrate) * 1000
if re_secret:
params['wsSecret'] = hashlib.md5(
'_'.join([fm, params['u'], stream_name, ss, params['wsTime']]))
@@ -90,7 +95,7 @@ class HuyaLiveIE(InfoExtractor):
'tbr': rate,
'url': update_url_query(f'{stream_url}/{stream_name}.{stream_info.get("sFlvUrlSuffix")}',
query=params),
- **self._RESOLUTION.get(si.get('sDisplayName'), {}),
+ **self._RESOLUTION.get(display_name, {}),
})
return {
diff --git a/hypervideo_dl/extractor/hypergryph.py b/hypervideo_dl/extractor/hypergryph.py
new file mode 100644
index 0000000..9ca6cae
--- /dev/null
+++ b/hypervideo_dl/extractor/hypergryph.py
@@ -0,0 +1,32 @@
+from .common import InfoExtractor
+from ..utils import js_to_json, traverse_obj
+
+
+class MonsterSirenHypergryphMusicIE(InfoExtractor):
+ _VALID_URL = r'https?://monster-siren\.hypergryph\.com/music/(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://monster-siren.hypergryph.com/music/514562',
+ 'info_dict': {
+ 'id': '514562',
+ 'ext': 'wav',
+ 'artist': ['塞壬唱片-MSR'],
+ 'album': 'Flame Shadow',
+ 'title': 'Flame Shadow',
+ }
+ }]
+
+ def _real_extract(self, url):
+ audio_id = self._match_id(url)
+ webpage = self._download_webpage(url, audio_id)
+ json_data = self._search_json(
+ r'window\.g_initialProps\s*=', webpage, 'data', audio_id, transform_source=js_to_json)
+
+ return {
+ 'id': audio_id,
+ 'title': traverse_obj(json_data, ('player', 'songDetail', 'name')),
+ 'url': traverse_obj(json_data, ('player', 'songDetail', 'sourceUrl')),
+ 'ext': 'wav',
+ 'vcodec': 'none',
+ 'artist': traverse_obj(json_data, ('player', 'songDetail', 'artists')),
+ 'album': traverse_obj(json_data, ('musicPlay', 'albumDetail', 'name'))
+ }
diff --git a/hypervideo_dl/extractor/idolplus.py b/hypervideo_dl/extractor/idolplus.py
new file mode 100644
index 0000000..3c905b0
--- /dev/null
+++ b/hypervideo_dl/extractor/idolplus.py
@@ -0,0 +1,115 @@
+from .common import InfoExtractor
+from ..utils import traverse_obj, try_call, url_or_none
+
+
+class IdolPlusIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?idolplus\.com/z[us]/(?:concert/|contents/?\?(?:[^#]+&)?albumId=)(?P<id>\w+)'
+ _TESTS = [{
+ 'url': 'https://idolplus.com/zs/contents?albumId=M012077298PPV00',
+ 'md5': '2ace3f4661c943a2f7e79f0b88cea1e7',
+ 'info_dict': {
+ 'id': 'M012077298PPV00',
+ 'ext': 'mp4',
+ 'title': '[MultiCam] Aegyo on Top of Aegyo (IZ*ONE EATING TRIP)',
+ 'release_date': '20200707',
+ 'formats': 'count:65',
+ },
+ 'params': {'format': '532-KIM_MINJU'},
+ }, {
+ 'url': 'https://idolplus.com/zs/contents?albumId=M01232H058PPV00&catId=E9TX5',
+ 'info_dict': {
+ 'id': 'M01232H058PPV00',
+ 'ext': 'mp4',
+ 'title': 'YENA (CIRCLE CHART MUSIC AWARDS 2022 RED CARPET)',
+ 'release_date': '20230218',
+ 'formats': 'count:5',
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }, {
+ # live stream
+ 'url': 'https://idolplus.com/zu/contents?albumId=M012323174PPV00',
+ 'info_dict': {
+ 'id': 'M012323174PPV00',
+ 'ext': 'mp4',
+ 'title': 'Hanteo Music Awards 2022 DAY2',
+ 'release_date': '20230211',
+ 'formats': 'count:5',
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }, {
+ 'url': 'https://idolplus.com/zs/concert/M012323039PPV00',
+ 'info_dict': {
+ 'id': 'M012323039PPV00',
+ 'ext': 'mp4',
+ 'title': 'CIRCLE CHART MUSIC AWARDS 2022',
+ 'release_date': '20230218',
+ 'formats': 'count:5',
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ data_list = traverse_obj(self._download_json(
+ 'https://idolplus.com/api/zs/viewdata/ruleset/build', video_id,
+ headers={'App_type': 'web', 'Country_Code': 'KR'}, query={
+ 'rulesetId': 'contents',
+ 'albumId': video_id,
+ 'distribute': 'PRD',
+ 'loggedIn': 'false',
+ 'region': 'zs',
+ 'countryGroup': '00010',
+ 'lang': 'en',
+ 'saId': '999999999998',
+ }), ('data', 'viewData', ...))
+
+ player_data = {}
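+        # walk the nested viewData nodes until one of type 'player' is found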
+ while data_list:
+ player_data = data_list.pop()
+ if traverse_obj(player_data, 'type') == 'player':
+ break
+ elif traverse_obj(player_data, ('dataList', ...)):
+ data_list += player_data['dataList']
+
+ formats = self._extract_m3u8_formats(traverse_obj(player_data, (
+ 'vodPlayerList', 'vodProfile', 0, 'vodServer', 0, 'video_url', {url_or_none})), video_id)
+
+ subtitles = {}
+ for caption in traverse_obj(player_data, ('vodPlayerList', 'caption')) or []:
+ subtitles.setdefault(caption.get('lang') or 'und', []).append({
+ 'url': caption.get('smi_url'),
+ 'ext': 'vtt',
+ })
+
+ # Add member multicams as alternative formats
+ if (traverse_obj(player_data, ('detail', 'has_cuesheet')) == 'Y'
+ and traverse_obj(player_data, ('detail', 'is_omni_member')) == 'Y'):
+ cuesheet = traverse_obj(self._download_json(
+ 'https://idolplus.com/gapi/contents/v1.0/content/cuesheet', video_id,
+ 'Downloading JSON metadata for member multicams',
+ headers={'App_type': 'web', 'Country_Code': 'KR'}, query={
+ 'ALBUM_ID': video_id,
+ 'COUNTRY_GRP': '00010',
+ 'LANG': 'en',
+ 'SA_ID': '999999999998',
+ 'COUNTRY_CODE': 'KR',
+ }), ('data', 'cuesheet_item', 0))
+
+ for member in traverse_obj(cuesheet, ('members', ...)):
+ index = try_call(lambda: int(member['omni_view_index']) - 1)
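+                # omni_view_index is 1-based; convert to a 0-based index into omni_view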
+ member_video_url = traverse_obj(cuesheet, ('omni_view', index, 'cdn_url', 0, 'url', {url_or_none}))
+ if not member_video_url:
+ continue
+ member_formats = self._extract_m3u8_formats(
+ member_video_url, video_id, note=f'Downloading m3u8 for multicam {member["name"]}')
+ for mf in member_formats:
+ mf['format_id'] = f'{mf["format_id"]}-{member["name"].replace(" ", "_")}'
+ formats.extend(member_formats)
+
+ return {
+ 'id': video_id,
+ 'title': traverse_obj(player_data, ('detail', 'albumName')),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'release_date': traverse_obj(player_data, ('detail', 'broadcastDate')),
+ }
diff --git a/hypervideo_dl/extractor/ign.py b/hypervideo_dl/extractor/ign.py
index d4797d3..64875f8 100644
--- a/hypervideo_dl/extractor/ign.py
+++ b/hypervideo_dl/extractor/ign.py
@@ -1,17 +1,21 @@
import re
+import urllib.parse
from .common import InfoExtractor
-from ..compat import (
- compat_parse_qs,
- compat_urllib_parse_urlparse,
-)
+from ..compat import compat_parse_qs
+from ..networking.exceptions import HTTPError
from ..utils import (
- HEADRequest,
+ ExtractorError,
determine_ext,
+ error_to_compat_str,
+ extract_attributes,
int_or_none,
+ merge_dicts,
parse_iso8601,
strip_or_none,
- try_get,
+ traverse_obj,
+ url_or_none,
+ urljoin,
)
@@ -20,69 +24,37 @@ class IGNBaseIE(InfoExtractor):
return self._download_json(
'http://apis.ign.com/{0}/v3/{0}s/slug/{1}'.format(self._PAGE_TYPE, slug), slug)
+ def _checked_call_api(self, slug):
+ try:
+ return self._call_api(slug)
+ except ExtractorError as e:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 404:
+ e.cause.args = e.cause.args or [
+ e.cause.response.url, e.cause.status, e.cause.reason]
+ raise ExtractorError(
+ 'Content not found: expired?', cause=e.cause,
+ expected=True)
+ raise
-class IGNIE(IGNBaseIE):
- """
- Extractor for some of the IGN sites, like www.ign.com, es.ign.com de.ign.com.
- Some videos of it.ign.com are also supported
- """
-
- _VALID_URL = r'https?://(?:.+?\.ign|www\.pcmag)\.com/videos/(?:\d{4}/\d{2}/\d{2}/)?(?P<id>[^/?&#]+)'
- IE_NAME = 'ign.com'
- _PAGE_TYPE = 'video'
-
- _TESTS = [{
- 'url': 'http://www.ign.com/videos/2013/06/05/the-last-of-us-review',
- 'md5': 'd2e1586d9987d40fad7867bf96a018ea',
- 'info_dict': {
- 'id': '8f862beef863986b2785559b9e1aa599',
- 'ext': 'mp4',
- 'title': 'The Last of Us Review',
- 'description': 'md5:c8946d4260a4d43a00d5ae8ed998870c',
- 'timestamp': 1370440800,
- 'upload_date': '20130605',
- 'tags': 'count:9',
- }
- }, {
- 'url': 'http://www.pcmag.com/videos/2015/01/06/010615-whats-new-now-is-gogo-snooping-on-your-data',
- 'md5': 'f1581a6fe8c5121be5b807684aeac3f6',
- 'info_dict': {
- 'id': 'ee10d774b508c9b8ec07e763b9125b91',
- 'ext': 'mp4',
- 'title': 'What\'s New Now: Is GoGo Snooping on Your Data?',
- 'description': 'md5:817a20299de610bd56f13175386da6fa',
- 'timestamp': 1420571160,
- 'upload_date': '20150106',
- 'tags': 'count:4',
- }
- }, {
- 'url': 'https://www.ign.com/videos/is-a-resident-evil-4-remake-on-the-way-ign-daily-fix',
- 'only_matching': True,
- }]
-
- def _real_extract(self, url):
- display_id = self._match_id(url)
- video = self._call_api(display_id)
+ def _extract_video_info(self, video, fatal=True):
video_id = video['videoId']
- metadata = video['metadata']
- title = metadata.get('longTitle') or metadata.get('title') or metadata['name']
formats = []
- refs = video.get('refs') or {}
+ refs = traverse_obj(video, 'refs', expected_type=dict) or {}
- m3u8_url = refs.get('m3uUrl')
+ m3u8_url = url_or_none(refs.get('m3uUrl'))
if m3u8_url:
formats.extend(self._extract_m3u8_formats(
m3u8_url, video_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False))
- f4m_url = refs.get('f4mUrl')
+ f4m_url = url_or_none(refs.get('f4mUrl'))
if f4m_url:
formats.extend(self._extract_f4m_formats(
f4m_url, video_id, f4m_id='hds', fatal=False))
for asset in (video.get('assets') or []):
- asset_url = asset.get('url')
+ asset_url = url_or_none(asset.get('url'))
if not asset_url:
continue
formats.append({
@@ -93,7 +65,8 @@ class IGNIE(IGNBaseIE):
'width': int_or_none(asset.get('width')),
})
- mezzanine_url = try_get(video, lambda x: x['system']['mezzanineUrl'])
+ mezzanine_url = traverse_obj(
+ video, ('system', 'mezzanineUrl'), expected_type=url_or_none)
if mezzanine_url:
formats.append({
'ext': determine_ext(mezzanine_url, 'mp4'),
@@ -102,21 +75,16 @@ class IGNIE(IGNBaseIE):
'url': mezzanine_url,
})
- thumbnails = []
- for thumbnail in (video.get('thumbnails') or []):
- thumbnail_url = thumbnail.get('url')
- if not thumbnail_url:
- continue
- thumbnails.append({
- 'url': thumbnail_url,
- })
+ thumbnails = traverse_obj(
+ video, ('thumbnails', ..., {'url': 'url'}), expected_type=url_or_none)
+ tags = traverse_obj(
+ video, ('tags', ..., 'displayName'),
+ expected_type=lambda x: x.strip() or None)
- tags = []
- for tag in (video.get('tags') or []):
- display_name = tag.get('displayName')
- if not display_name:
- continue
- tags.append(display_name)
+ metadata = traverse_obj(video, 'metadata', expected_type=dict) or {}
+ title = traverse_obj(
+ metadata, 'longTitle', 'title', 'name',
+ expected_type=lambda x: x.strip() or None)
return {
'id': video_id,
@@ -124,14 +92,96 @@ class IGNIE(IGNBaseIE):
'description': strip_or_none(metadata.get('description')),
'timestamp': parse_iso8601(metadata.get('publishDate')),
'duration': int_or_none(metadata.get('duration')),
- 'display_id': display_id,
'thumbnails': thumbnails,
'formats': formats,
'tags': tags,
}
-class IGNVideoIE(InfoExtractor):
+class IGNIE(IGNBaseIE):
+ """
+ Extractor for some of the IGN sites, like www.ign.com, es.ign.com de.ign.com.
+ Some videos of it.ign.com are also supported
+ """
+ _VIDEO_PATH_RE = r'/(?:\d{4}/\d{2}/\d{2}/)?(?P<id>.+?)'
+ _PLAYLIST_PATH_RE = r'(?:/?\?(?P<filt>[^&#]+))?'
+ _VALID_URL = (
+ r'https?://(?:.+?\.ign|www\.pcmag)\.com/videos(?:%s)'
+ % '|'.join((_VIDEO_PATH_RE + r'(?:[/?&#]|$)', _PLAYLIST_PATH_RE)))
+ IE_NAME = 'ign.com'
+ _PAGE_TYPE = 'video'
+
+ _TESTS = [{
+ 'url': 'http://www.ign.com/videos/2013/06/05/the-last-of-us-review',
+ 'md5': 'd2e1586d9987d40fad7867bf96a018ea',
+ 'info_dict': {
+ 'id': '8f862beef863986b2785559b9e1aa599',
+ 'ext': 'mp4',
+ 'title': 'The Last of Us Review',
+ 'description': 'md5:c8946d4260a4d43a00d5ae8ed998870c',
+ 'timestamp': 1370440800,
+ 'upload_date': '20130605',
+ 'tags': 'count:9',
+ 'display_id': 'the-last-of-us-review',
+ 'thumbnail': 'https://assets1.ignimgs.com/vid/thumbnails/user/2014/03/26/lastofusreviewmimig2.jpg',
+ 'duration': 440,
+ },
+ 'params': {
+ 'nocheckcertificate': True,
+ },
+ }, {
+ 'url': 'http://www.pcmag.com/videos/2015/01/06/010615-whats-new-now-is-gogo-snooping-on-your-data',
+ 'md5': 'f1581a6fe8c5121be5b807684aeac3f6',
+ 'info_dict': {
+ 'id': 'ee10d774b508c9b8ec07e763b9125b91',
+ 'ext': 'mp4',
+ 'title': 'What\'s New Now: Is GoGo Snooping on Your Data?',
+ 'description': 'md5:817a20299de610bd56f13175386da6fa',
+ 'timestamp': 1420571160,
+ 'upload_date': '20150106',
+ 'tags': 'count:4',
+ },
+ 'skip': '404 Not Found',
+ }, {
+ 'url': 'https://www.ign.com/videos/is-a-resident-evil-4-remake-on-the-way-ign-daily-fix',
+ 'only_matching': True,
+ }]
+
+ @classmethod
+ def _extract_embed_urls(cls, url, webpage):
+ grids = re.findall(
+ r'''(?s)<section\b[^>]+\bclass\s*=\s*['"](?:[\w-]+\s+)*?content-feed-grid(?!\B|-)[^>]+>(.+?)</section[^>]*>''',
+ webpage)
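+        # only anchors inside the first content-feed-grid section are collected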
+ return filter(None,
+ (urljoin(url, m.group('path')) for m in re.finditer(
+ r'''<a\b[^>]+\bhref\s*=\s*('|")(?P<path>/videos%s)\1'''
+ % cls._VIDEO_PATH_RE, grids[0] if grids else '')))
+
+ def _real_extract(self, url):
+ display_id, filt = self._match_valid_url(url).group('id', 'filt')
+ if display_id:
+ return self._extract_video(url, display_id)
+ return self._extract_playlist(url, filt or 'all')
+
+ def _extract_playlist(self, url, display_id):
+ webpage = self._download_webpage(url, display_id)
+
+ return self.playlist_result(
+ (self.url_result(u, self.ie_key())
+ for u in self._extract_embed_urls(url, webpage)),
+ playlist_id=display_id)
+
+ def _extract_video(self, url, display_id):
+ video = self._checked_call_api(display_id)
+
+ info = self._extract_video_info(video)
+
+ return merge_dicts({
+ 'display_id': display_id,
+ }, info)
+
+
+class IGNVideoIE(IGNBaseIE):
_VALID_URL = r'https?://.+?\.ign\.com/(?:[a-z]{2}/)?[^/]+/(?P<id>\d+)/(?:video|trailer)/'
_TESTS = [{
'url': 'http://me.ign.com/en/videos/112203/video/how-hitman-aims-to-be-different-than-every-other-s',
@@ -143,7 +193,16 @@ class IGNVideoIE(InfoExtractor):
'description': 'Taking out assassination targets in Hitman has never been more stylish.',
'timestamp': 1444665600,
'upload_date': '20151012',
- }
+ 'display_id': '112203',
+ 'thumbnail': 'https://sm.ign.com/ign_me/video/h/how-hitman/how-hitman-aims-to-be-different-than-every-other-s_8z14.jpg',
+ 'duration': 298,
+ 'tags': 'count:13',
+ },
+ 'expected_warnings': ['HTTP Error 400: Bad Request'],
}, {
'url': 'http://me.ign.com/ar/angry-birds-2/106533/video/lrd-ldyy-lwl-lfylm-angry-birds',
'only_matching': True,
@@ -163,22 +222,38 @@ class IGNVideoIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- req = HEADRequest(url.rsplit('/', 1)[0] + '/embed')
- url = self._request_webpage(req, video_id).geturl()
+ parsed_url = urllib.parse.urlparse(url)
+ embed_url = urllib.parse.urlunparse(
+ parsed_url._replace(path=parsed_url.path.rsplit('/', 1)[0] + '/embed'))
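+        # request the /embed variant; its redirect or embedded player element points at the actual video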
+
+ webpage, urlh = self._download_webpage_handle(embed_url, video_id)
+ new_url = urlh.url
ign_url = compat_parse_qs(
- compat_urllib_parse_urlparse(url).query).get('url', [None])[0]
+ urllib.parse.urlparse(new_url).query).get('url', [None])[-1]
if ign_url:
return self.url_result(ign_url, IGNIE.ie_key())
- return self.url_result(url)
+ video = self._search_regex(r'(<div\b[^>]+\bdata-video-id\s*=\s*[^>]+>)', webpage, 'video element', fatal=False)
+ if not video:
+ if new_url == url:
+ raise ExtractorError('Redirect loop: ' + url)
+ return self.url_result(new_url)
+ video = extract_attributes(video)
+ video_data = video.get('data-settings') or '{}'
+ video_data = self._parse_json(video_data, video_id)['video']
+ info = self._extract_video_info(video_data)
+
+ return merge_dicts({
+ 'display_id': video_id,
+ }, info)
class IGNArticleIE(IGNBaseIE):
- _VALID_URL = r'https?://.+?\.ign\.com/(?:articles(?:/\d{4}/\d{2}/\d{2})?|(?:[a-z]{2}/)?feature/\d+)/(?P<id>[^/?&#]+)'
+ _VALID_URL = r'https?://.+?\.ign\.com/(?:articles(?:/\d{4}/\d{2}/\d{2})?|(?:[a-z]{2}/)?(?:[\w-]+/)*?feature/\d+)/(?P<id>[^/?&#]+)'
_PAGE_TYPE = 'article'
_TESTS = [{
'url': 'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind',
'info_dict': {
- 'id': '524497489e4e8ff5848ece34',
+ 'id': '72113',
'title': '100 Little Things in GTA 5 That Will Blow Your Mind',
},
'playlist': [
@@ -186,34 +261,43 @@ class IGNArticleIE(IGNBaseIE):
'info_dict': {
'id': '5ebbd138523268b93c9141af17bec937',
'ext': 'mp4',
- 'title': 'GTA 5 Video Review',
+ 'title': 'Grand Theft Auto V Video Review',
'description': 'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.',
'timestamp': 1379339880,
'upload_date': '20130916',
+ 'tags': 'count:12',
+ 'thumbnail': 'https://assets1.ignimgs.com/thumbs/userUploaded/2021/8/16/gta-v-heistsjpg-e94705-1629138553533.jpeg',
+ 'display_id': 'grand-theft-auto-v-video-review',
+ 'duration': 501,
},
},
{
'info_dict': {
'id': '638672ee848ae4ff108df2a296418ee2',
'ext': 'mp4',
- 'title': '26 Twisted Moments from GTA 5 in Slow Motion',
+ 'title': 'GTA 5 In Slow Motion',
'description': 'The twisted beauty of GTA 5 in stunning slow motion.',
'timestamp': 1386878820,
'upload_date': '20131212',
+ 'duration': 202,
+ 'tags': 'count:25',
+ 'display_id': 'gta-5-in-slow-motion',
+ 'thumbnail': 'https://assets1.ignimgs.com/vid/thumbnails/user/2013/11/03/GTA-SLO-MO-1.jpg',
},
},
],
'params': {
- 'playlist_items': '2-3',
'skip_download': True,
},
+ 'expected_warnings': ['Backend fetch failed'],
}, {
'url': 'http://www.ign.com/articles/2014/08/15/rewind-theater-wild-trailer-gamescom-2014?watch',
'info_dict': {
'id': '53ee806780a81ec46e0790f8',
'title': 'Rewind Theater - Wild Trailer Gamescom 2014',
},
- 'playlist_count': 2,
+ 'playlist_count': 1,
+ 'expected_warnings': ['Backend fetch failed'],
}, {
# videoId pattern
'url': 'http://www.ign.com/articles/2017/06/08/new-ducktales-short-donalds-birthday-doesnt-go-as-planned',
@@ -236,18 +320,84 @@ class IGNArticleIE(IGNBaseIE):
'only_matching': True,
}]
+ def _checked_call_api(self, slug):
+ try:
+ return self._call_api(slug)
+ except ExtractorError as e:
+ if isinstance(e.cause, HTTPError):
+ e.cause.args = e.cause.args or [
+ e.cause.response.url, e.cause.status, e.cause.reason]
+ if e.cause.status == 404:
+ raise ExtractorError(
+ 'Content not found: expired?', cause=e.cause,
+ expected=True)
+ elif e.cause.status == 503:
+ self.report_warning(error_to_compat_str(e.cause))
+ return
+ raise
+
def _real_extract(self, url):
display_id = self._match_id(url)
- article = self._call_api(display_id)
+ article = self._checked_call_api(display_id)
+
+ if article:
+            # obsolete?
+ def entries():
+ media_url = traverse_obj(
+ article, ('mediaRelations', 0, 'media', 'metadata', 'url'),
+ expected_type=url_or_none)
+ if media_url:
+ yield self.url_result(media_url, IGNIE.ie_key())
+ for content in (article.get('content') or []):
+ for video_url in re.findall(r'(?:\[(?:ignvideo\s+url|youtube\s+clip_id)|<iframe[^>]+src)="([^"]+)"', content):
+ if url_or_none(video_url):
+ yield self.url_result(video_url)
+
+ return self.playlist_result(
+ entries(), article.get('articleId'),
+ traverse_obj(
+ article, ('metadata', 'headline'),
+ expected_type=lambda x: x.strip() or None))
+
+ webpage = self._download_webpage(url, display_id)
+
+ playlist_id = self._html_search_meta('dable:item_id', webpage, default=None)
+ if playlist_id:
+
+ def entries():
+ for m in re.finditer(
+ r'''(?s)<object\b[^>]+\bclass\s*=\s*("|')ign-videoplayer\1[^>]*>(?P<params>.+?)</object''',
+ webpage):
+ flashvars = self._search_regex(
+ r'''(<param\b[^>]+\bname\s*=\s*("|')flashvars\2[^>]*>)''',
+ m.group('params'), 'flashvars', default='')
+ flashvars = compat_parse_qs(extract_attributes(flashvars).get('value') or '')
+ v_url = url_or_none((flashvars.get('url') or [None])[-1])
+ if v_url:
+ yield self.url_result(v_url)
+ else:
+ playlist_id = self._search_regex(
+ r'''\bdata-post-id\s*=\s*("|')(?P<id>[\da-f]+)\1''',
+ webpage, 'id', group='id', default=None)
+
+ nextjs_data = self._search_nextjs_data(webpage, display_id)
- def entries():
- media_url = try_get(article, lambda x: x['mediaRelations'][0]['media']['metadata']['url'])
- if media_url:
- yield self.url_result(media_url, IGNIE.ie_key())
- for content in (article.get('content') or []):
- for video_url in re.findall(r'(?:\[(?:ignvideo\s+url|youtube\s+clip_id)|<iframe[^>]+src)="([^"]+)"', content):
- yield self.url_result(video_url)
+ def entries():
+ for player in traverse_obj(
+ nextjs_data,
+ ('props', 'apolloState', 'ROOT_QUERY', lambda k, _: k.startswith('videoPlayerProps('), '__ref')):
+                # skip promo links (which may not always be served, e.g. on GH CI servers)
+ if traverse_obj(nextjs_data,
+ ('props', 'apolloState', player.replace('PlayerProps', 'ModernContent')),
+ expected_type=dict):
+ continue
+ video = traverse_obj(nextjs_data, ('props', 'apolloState', player), expected_type=dict) or {}
+ info = self._extract_video_info(video, fatal=False)
+ if info:
+ yield merge_dicts({
+ 'display_id': display_id,
+ }, info)
return self.playlist_result(
- entries(), article.get('articleId'),
- strip_or_none(try_get(article, lambda x: x['metadata']['headline'])))
+ entries(), playlist_id or display_id,
+ re.sub(r'\s+-\s+IGN\s*$', '', self._og_search_title(webpage, default='')) or None)
diff --git a/hypervideo_dl/extractor/imggaming.py b/hypervideo_dl/extractor/imggaming.py
index 8e220fd..a40aa21 100644
--- a/hypervideo_dl/extractor/imggaming.py
+++ b/hypervideo_dl/extractor/imggaming.py
@@ -1,7 +1,7 @@
import json
from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
int_or_none,
@@ -52,9 +52,9 @@ class ImgGamingBaseIE(InfoExtractor):
return self._call_api(
stream_path, media_id)['playerUrlCallback']
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 403:
raise ExtractorError(
- self._parse_json(e.cause.read().decode(), media_id)['messages'][0],
+ self._parse_json(e.cause.response.read().decode(), media_id)['messages'][0],
expected=True)
raise
diff --git a/hypervideo_dl/extractor/instagram.py b/hypervideo_dl/extractor/instagram.py
index 0233513..bfc4b7b 100644
--- a/hypervideo_dl/extractor/instagram.py
+++ b/hypervideo_dl/extractor/instagram.py
@@ -3,9 +3,9 @@ import itertools
import json
import re
import time
-import urllib.error
from .common import InfoExtractor
+from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
decode_base_n,
@@ -442,7 +442,7 @@ class InstagramIE(InstagramBaseIE):
shared_data = self._search_json(
r'window\._sharedData\s*=', webpage, 'shared data', video_id, fatal=False) or {}
- if shared_data and self._LOGIN_URL not in urlh.geturl():
+ if shared_data and self._LOGIN_URL not in urlh.url:
media.update(traverse_obj(
shared_data, ('entry_data', 'PostPage', 0, 'graphql', 'shortcode_media'),
('entry_data', 'PostPage', 0, 'media'), expected_type=dict) or {})
@@ -589,7 +589,7 @@ class InstagramPlaylistBaseIE(InstagramBaseIE):
except ExtractorError as e:
# if it's an error caused by a bad query, and there are
# more GIS templates to try, ignore it and keep trying
- if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 403:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 403:
if gis_tmpl != gis_tmpls[-1]:
continue
raise
diff --git a/hypervideo_dl/extractor/iprima.py b/hypervideo_dl/extractor/iprima.py
index 1818205..6dec151 100644
--- a/hypervideo_dl/extractor/iprima.py
+++ b/hypervideo_dl/extractor/iprima.py
@@ -7,7 +7,8 @@ from ..utils import (
js_to_json,
urlencode_postdata,
ExtractorError,
- parse_qs
+ parse_qs,
+ traverse_obj
)
@@ -15,8 +16,7 @@ class IPrimaIE(InfoExtractor):
_VALID_URL = r'https?://(?!cnn)(?:[^/]+)\.iprima\.cz/(?:[^/]+/)*(?P<id>[^/?#&]+)'
_GEO_BYPASS = False
_NETRC_MACHINE = 'iprima'
- _LOGIN_URL = 'https://auth.iprima.cz/oauth2/login'
- _TOKEN_URL = 'https://auth.iprima.cz/oauth2/token'
+ _AUTH_ROOT = 'https://auth.iprima.cz'
access_token = None
_TESTS = [{
@@ -67,7 +67,7 @@ class IPrimaIE(InfoExtractor):
return
login_page = self._download_webpage(
- self._LOGIN_URL, None, note='Downloading login page',
+ f'{self._AUTH_ROOT}/oauth2/login', None, note='Downloading login page',
errnote='Downloading login page failed')
login_form = self._hidden_inputs(login_page)
@@ -76,11 +76,20 @@ class IPrimaIE(InfoExtractor):
'_email': username,
'_password': password})
- _, login_handle = self._download_webpage_handle(
- self._LOGIN_URL, None, data=urlencode_postdata(login_form),
+ profile_select_html, login_handle = self._download_webpage_handle(
+ f'{self._AUTH_ROOT}/oauth2/login', None, data=urlencode_postdata(login_form),
note='Logging in')
- code = parse_qs(login_handle.geturl()).get('code')[0]
+ # a profile may need to be selected first, even when there is only a single one
+ if '/profile-select' in login_handle.url:
+ profile_id = self._search_regex(
+ r'data-identifier\s*=\s*["\']?(\w+)', profile_select_html, 'profile id')
+
+ login_handle = self._request_webpage(
+ f'{self._AUTH_ROOT}/user/profile-select-perform/{profile_id}', None,
+ query={'continueUrl': '/user/login?redirect_uri=/user/'}, note='Selecting profile')
+
+ code = traverse_obj(login_handle.url, ({parse_qs}, 'code', 0))
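+        # the OAuth authorization code is returned as a query parameter on the redirect URL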
if not code:
raise ExtractorError('Login failed', expected=True)
@@ -89,10 +98,10 @@ class IPrimaIE(InfoExtractor):
'client_id': 'prima_sso',
'grant_type': 'authorization_code',
'code': code,
- 'redirect_uri': 'https://auth.iprima.cz/sso/auth-check'}
+ 'redirect_uri': f'{self._AUTH_ROOT}/sso/auth-check'}
token_data = self._download_json(
- self._TOKEN_URL, None,
+ f'{self._AUTH_ROOT}/oauth2/token', None,
note='Downloading token', errnote='Downloading token failed',
data=urlencode_postdata(token_request_data))
@@ -115,14 +124,22 @@ class IPrimaIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
- title = self._html_search_meta(
+ title = self._html_extract_title(webpage) or self._html_search_meta(
['og:title', 'twitter:title'],
webpage, 'title', default=None)
video_id = self._search_regex((
r'productId\s*=\s*([\'"])(?P<id>p\d+)\1',
- r'pproduct_id\s*=\s*([\'"])(?P<id>p\d+)\1'),
- webpage, 'real id', group='id')
+ r'pproduct_id\s*=\s*([\'"])(?P<id>p\d+)\1',
+ ), webpage, 'real id', group='id', default=None)
+
+ if not video_id:
+ nuxt_data = self._search_nuxt_data(webpage, video_id, traverse='data')
+ video_id = traverse_obj(
+ nuxt_data, (..., 'content', 'additionals', 'videoPlayId', {str}), get_all=False)
+
+ if not video_id:
+ self.raise_no_formats('Unable to extract video ID from webpage')
metadata = self._download_json(
f'https://api.play-backend.iprima.cz/api/v1//products/id-{video_id}/play',
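
A rough standalone sketch of what the traverse_obj(login_handle.url, ({parse_qs}, 'code', 0)) call above does, using only the standard library (the function name and example URL are illustrative):

    import urllib.parse

    def extract_oauth_code(redirect_url):
        # parse the query string of the post-login redirect and take the first 'code' value
        query = urllib.parse.parse_qs(urllib.parse.urlparse(redirect_url).query)
        return query.get('code', [None])[0]

    # extract_oauth_code('https://auth.iprima.cz/sso/auth-check?code=abc123') -> 'abc123'
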
diff --git a/hypervideo_dl/extractor/iqiyi.py b/hypervideo_dl/extractor/iqiyi.py
index c41f6db..94bcad4 100644
--- a/hypervideo_dl/extractor/iqiyi.py
+++ b/hypervideo_dl/extractor/iqiyi.py
@@ -270,12 +270,14 @@ class IqIE(InfoExtractor):
'1': 'zh_CN',
'2': 'zh_TW',
'3': 'en',
- '4': 'kor',
+ '4': 'ko',
+ '5': 'ja',
'18': 'th',
'21': 'my',
'23': 'vi',
'24': 'id',
'26': 'es',
+ '27': 'pt',
'28': 'ar',
}
@@ -355,13 +357,16 @@ class IqIE(InfoExtractor):
if player_js_cache:
return player_js_cache
webpack_js_url = self._proto_relative_url(self._search_regex(
- r'<script src="((?:https?)?//stc.iqiyipic.com/_next/static/chunks/webpack-\w+\.js)"', webpage, 'webpack URL'))
+ r'<script src="((?:https?:)?//stc\.iqiyipic\.com/_next/static/chunks/webpack-\w+\.js)"', webpage, 'webpack URL'))
webpack_js = self._download_webpage(webpack_js_url, video_id, note='Downloading webpack JS', errnote='Unable to download webpack JS')
- webpack_map1, webpack_map2 = [self._parse_json(js_map, video_id, transform_source=js_to_json) for js_map in self._search_regex(
- r'\(({[^}]*})\[\w+\][^\)]*\)\s*\+\s*["\']\.["\']\s*\+\s*({[^}]*})\[\w+\]\+["\']\.js', webpack_js, 'JS locations', group=(1, 2))]
- for module_index in reversed(list(webpack_map2.keys())):
+ webpack_map = self._search_json(
+ r'["\']\s*\+\s*', webpack_js, 'JS locations', video_id,
+ contains_pattern=r'{\s*(?:\d+\s*:\s*["\'][\da-f]+["\']\s*,?\s*)+}',
+ end_pattern=r'\[\w+\]\+["\']\.js', transform_source=js_to_json)
+
+ for module_index in reversed(webpack_map):
module_js = self._download_webpage(
- f'https://stc.iqiyipic.com/_next/static/chunks/{webpack_map1.get(module_index, module_index)}.{webpack_map2[module_index]}.js',
+ f'https://stc.iqiyipic.com/_next/static/chunks/{module_index}.{webpack_map[module_index]}.js',
video_id, note=f'Downloading #{module_index} module JS', errnote='Unable to download module JS', fatal=False) or ''
if 'vms request' in module_js:
self.cache.store('iq', 'player_js', module_js)
@@ -373,11 +378,11 @@ class IqIE(InfoExtractor):
self._extract_vms_player_js(webpage, video_id), 'signature function')
def _update_bid_tags(self, webpage, video_id):
- extracted_bid_tags = self._parse_json(
- self._search_regex(
- r'arguments\[1\][^,]*,\s*function\s*\([^\)]*\)\s*{\s*"use strict";?\s*var \w=({.+}})\s*,\s*\w\s*=\s*{\s*getNewVd',
- self._extract_vms_player_js(webpage, video_id), 'video tags', default=''),
- video_id, transform_source=js_to_json, fatal=False)
+ extracted_bid_tags = self._search_json(
+ r'function\s*\([^)]*\)\s*\{\s*"use strict";?\s*var \w\s*=\s*',
+ self._extract_vms_player_js(webpage, video_id), 'video tags', video_id,
+ contains_pattern=r'{\s*\d+\s*:\s*\{\s*nbid\s*:.+}\s*}',
+ end_pattern=r'\s*,\s*\w\s*=\s*\{\s*getNewVd', fatal=False, transform_source=js_to_json)
if not extracted_bid_tags:
return
self._BID_TAGS = {
@@ -412,7 +417,7 @@ class IqIE(InfoExtractor):
'langCode': self._get_cookie('lang', 'en_us'),
'deviceId': self._get_cookie('QC005', '')
}, fatal=False)
- ut_list = traverse_obj(vip_data, ('data', 'all_vip', ..., 'vipType'), expected_type=str_or_none, default=[])
+ ut_list = traverse_obj(vip_data, ('data', 'all_vip', ..., 'vipType'), expected_type=str_or_none)
else:
ut_list = ['0']
@@ -444,7 +449,7 @@ class IqIE(InfoExtractor):
self.report_warning('This preview video is limited%s' % format_field(preview_time, None, ' to %s seconds'))
# TODO: Extract audio-only formats
- for bid in set(traverse_obj(initial_format_data, ('program', 'video', ..., 'bid'), expected_type=str_or_none, default=[])):
+ for bid in set(traverse_obj(initial_format_data, ('program', 'video', ..., 'bid'), expected_type=str_or_none)):
dash_path = dash_paths.get(bid)
if not dash_path:
self.report_warning(f'Unknown format id: {bid}. It is currently not being extracted')
@@ -455,7 +460,7 @@ class IqIE(InfoExtractor):
fatal=False), 'data', expected_type=dict)
video_format = traverse_obj(format_data, ('program', 'video', lambda _, v: str(v['bid']) == bid),
- expected_type=dict, default=[], get_all=False) or {}
+ expected_type=dict, get_all=False) or {}
extracted_formats = []
if video_format.get('m3u8Url'):
extracted_formats.extend(self._extract_m3u8_formats(
@@ -496,7 +501,7 @@ class IqIE(InfoExtractor):
})
formats.extend(extracted_formats)
- for sub_format in traverse_obj(initial_format_data, ('program', 'stl', ...), expected_type=dict, default=[]):
+ for sub_format in traverse_obj(initial_format_data, ('program', 'stl', ...), expected_type=dict):
lang = self._LID_TAGS.get(str_or_none(sub_format.get('lid')), sub_format.get('_name'))
subtitles.setdefault(lang, []).extend([{
'ext': format_ext,
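
The webpack-map hunk above replaces a brittle two-group regex with _search_json and its contains_pattern/end_pattern hooks. A simplified standalone sketch of the idea, with a hypothetical input; the real helper also balances nested braces instead of relying on a fixed pattern:

    import json
    import re

    def search_json(start_pattern, string, contains_pattern, end_pattern):
        m = re.search(start_pattern + r'\s*(?P<obj>' + contains_pattern + r')\s*' + end_pattern, string)
        if not m:
            return None
        # quote the bare numeric keys so the object literal parses as JSON
        return json.loads(re.sub(r'(\d+)\s*:', r'"\1":', m.group('obj')))

    webpack_js = '"" + {0:"50e2",1:"9fb3"}[e]+".js"'  # hypothetical snippet
    chunks = search_json(
        r'["\']\s*\+\s*', webpack_js,
        r'{\s*(?:\d+\s*:\s*["\'][\da-f]+["\']\s*,?\s*)+}',
        r'\[\w+\]\+["\']\.js')
    # chunks == {'0': '50e2', '1': '9fb3'}
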
diff --git a/hypervideo_dl/extractor/ivi.py b/hypervideo_dl/extractor/ivi.py
index 27a222a..e7ba5f3 100644
--- a/hypervideo_dl/extractor/ivi.py
+++ b/hypervideo_dl/extractor/ivi.py
@@ -2,11 +2,8 @@ import json
import re
from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
- int_or_none,
- qualities,
-)
+from ..dependencies import Cryptodome
+from ..utils import ExtractorError, int_or_none, qualities
class IviIE(InfoExtractor):
@@ -94,18 +91,8 @@ class IviIE(InfoExtractor):
for site in (353, 183):
content_data = (data % site).encode()
if site == 353:
- try:
- from Cryptodome.Cipher import Blowfish
- from Cryptodome.Hash import CMAC
- pycryptodome_found = True
- except ImportError:
- try:
- from Crypto.Cipher import Blowfish
- from Crypto.Hash import CMAC
- pycryptodome_found = True
- except ImportError:
- pycryptodome_found = False
- continue
+ if not Cryptodome.CMAC:
+ continue
timestamp = (self._download_json(
self._LIGHT_URL, video_id,
@@ -118,7 +105,8 @@ class IviIE(InfoExtractor):
query = {
'ts': timestamp,
- 'sign': CMAC.new(self._LIGHT_KEY, timestamp.encode() + content_data, Blowfish).hexdigest(),
+ 'sign': Cryptodome.CMAC.new(self._LIGHT_KEY, timestamp.encode() + content_data,
+ Cryptodome.Blowfish).hexdigest(),
}
else:
query = {}
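
A sketch of the request signing performed above, assuming pycryptodomex is installed (the ..dependencies.Cryptodome wrapper re-exports these primitives, or leaves them as None when the package is missing); the key and body below are placeholders:

    from Cryptodome.Cipher import Blowfish
    from Cryptodome.Hash import CMAC

    LIGHT_KEY = b'\x00' * 16          # placeholder; the real key is IviIE._LIGHT_KEY
    timestamp = '1693000000'          # value returned by the light API
    content_data = b'{"method": "..."}'  # the JSON request body being signed

    # Blowfish-CMAC over timestamp + body, hex-encoded into the 'sign' parameter
    sign = CMAC.new(LIGHT_KEY, timestamp.encode() + content_data, Blowfish).hexdigest()
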
diff --git a/hypervideo_dl/extractor/iwara.py b/hypervideo_dl/extractor/iwara.py
index ec3e59c..91b87e0 100644
--- a/hypervideo_dl/extractor/iwara.py
+++ b/hypervideo_dl/extractor/iwara.py
@@ -1,239 +1,298 @@
-import itertools
-import re
+import functools
import urllib.parse
+import urllib.error
+import hashlib
+import json
+import time
from .common import InfoExtractor
from ..utils import (
+ ExtractorError,
+ OnDemandPagedList,
int_or_none,
+ jwt_decode_hs256,
mimetype2ext,
- remove_end,
- strip_or_none,
- unified_strdate,
- url_or_none,
- urljoin,
+ qualities,
+ traverse_obj,
+ try_call,
+ unified_timestamp,
)
class IwaraBaseIE(InfoExtractor):
- _BASE_REGEX = r'(?P<base_url>https?://(?:www\.|ecchi\.)?iwara\.tv)'
+ _NETRC_MACHINE = 'iwara'
+ _USERTOKEN = None
+ _MEDIATOKEN = None
+
+ def _is_token_expired(self, token, token_type):
+ # User token TTL == ~3 weeks, Media token TTL == ~1 hour
+ if (try_call(lambda: jwt_decode_hs256(token)['exp']) or 0) <= int(time.time() - 120):
+ self.to_screen(f'{token_type} token has expired')
+ return True
+
+ def _get_user_token(self):
+ username, password = self._get_login_info()
+ if not username or not password:
+ return
- def _extract_playlist(self, base_url, webpage):
- for path in re.findall(r'class="title">\s*<a[^<]+href="([^"]+)', webpage):
- yield self.url_result(urljoin(base_url, path))
+ user_token = IwaraBaseIE._USERTOKEN or self.cache.load(self._NETRC_MACHINE, username)
+ if not user_token or self._is_token_expired(user_token, 'User'):
+ response = self._download_json(
+ 'https://api.iwara.tv/user/login', None, note='Logging in',
+ headers={'Content-Type': 'application/json'}, data=json.dumps({
+ 'email': username,
+ 'password': password
+ }).encode(), expected_status=lambda x: True)
+ user_token = traverse_obj(response, ('token', {str}))
+ if not user_token:
+ error = traverse_obj(response, ('message', {str}))
+ if 'invalidLogin' in (error or ''):
+ raise ExtractorError('Invalid login credentials', expected=True)
+ else:
+ raise ExtractorError(f'Iwara API said: {error or "nothing"}')
+
+ self.cache.store(self._NETRC_MACHINE, username, user_token)
+
+ IwaraBaseIE._USERTOKEN = user_token
+
+ def _get_media_token(self):
+ self._get_user_token()
+ if not IwaraBaseIE._USERTOKEN:
+ return # user has not passed credentials
+
+ if not IwaraBaseIE._MEDIATOKEN or self._is_token_expired(IwaraBaseIE._MEDIATOKEN, 'Media'):
+ IwaraBaseIE._MEDIATOKEN = self._download_json(
+ 'https://api.iwara.tv/user/token', None, note='Fetching media token',
+ data=b'', headers={
+ 'Authorization': f'Bearer {IwaraBaseIE._USERTOKEN}',
+ 'Content-Type': 'application/json'
+ })['accessToken']
+
+ return {'Authorization': f'Bearer {IwaraBaseIE._MEDIATOKEN}'}
+
+ def _perform_login(self, username, password):
+ self._get_media_token()
class IwaraIE(IwaraBaseIE):
- _VALID_URL = fr'{IwaraBaseIE._BASE_REGEX}/videos/(?P<id>[a-zA-Z0-9]+)'
+ IE_NAME = 'iwara'
+ _VALID_URL = r'https?://(?:www\.|ecchi\.)?iwara\.tv/videos?/(?P<id>[a-zA-Z0-9]+)'
_TESTS = [{
- 'url': 'http://iwara.tv/videos/amVwUl1EHpAD9RD',
- # md5 is unstable
+ 'url': 'https://www.iwara.tv/video/k2ayoueezfkx6gvq',
'info_dict': {
- 'id': 'amVwUl1EHpAD9RD',
+ 'id': 'k2ayoueezfkx6gvq',
'ext': 'mp4',
- 'title': '【MMD R-18】ガールフレンド carry_me_off',
'age_limit': 18,
- 'thumbnail': 'https://i.iwara.tv/sites/default/files/videos/thumbnails/7951/thumbnail-7951_0001.png',
- 'uploader': 'Reimu丨Action',
- 'upload_date': '20150828',
- 'description': 'md5:1d4905ce48c66c9299c617f08e106e0f',
+ 'title': 'Defeat of Irybelda - アイリベルダの敗北',
+ 'description': 'md5:70278abebe706647a8b4cb04cf23e0d3',
+ 'uploader': 'Inwerwm',
+ 'uploader_id': 'inwerwm',
+ 'tags': 'count:1',
+ 'like_count': 6133,
+ 'view_count': 1050343,
+ 'comment_count': 1,
+ 'timestamp': 1677843869,
+ 'modified_timestamp': 1679056362,
},
+ 'skip': 'this video cannot be played because of migration',
}, {
- 'url': 'http://ecchi.iwara.tv/videos/Vb4yf2yZspkzkBO',
- 'md5': '7e5f1f359cd51a027ba4a7b7710a50f0',
+ 'url': 'https://iwara.tv/video/1ywe1sbkqwumpdxz5/',
+ 'md5': '7645f966f069b8ec9210efd9130c9aad',
'info_dict': {
- 'id': '0B1LvuHnL-sRFNXB1WHNqbGw4SXc',
+ 'id': '1ywe1sbkqwumpdxz5',
'ext': 'mp4',
- 'title': '[3D Hentai] Kyonyu × Genkai × Emaki Shinobi Girls.mp4',
'age_limit': 18,
+ 'title': 'Aponia アポニア SEX Party Tonight 手の脱衣 巨乳 ',
+ 'description': 'md5:3f60016fff22060eef1ef26d430b1f67',
+ 'uploader': 'Lyu ya',
+ 'uploader_id': 'user792540',
+ 'tags': [
+ 'uncategorized'
+ ],
+ 'like_count': int,
+ 'view_count': int,
+ 'comment_count': int,
+ 'timestamp': 1678732213,
+ 'modified_timestamp': int,
+ 'thumbnail': 'https://files.iwara.tv/image/thumbnail/581d12b5-46f4-4f15-beb2-cfe2cde5d13d/thumbnail-00.jpg',
+ 'modified_date': '20230614',
+ 'upload_date': '20230313',
},
- 'add_ie': ['GoogleDrive'],
}, {
- 'url': 'http://www.iwara.tv/videos/nawkaumd6ilezzgq',
- # md5 is unstable
+ 'url': 'https://iwara.tv/video/blggmfno8ghl725bg',
'info_dict': {
- 'id': '6liAP9s2Ojc',
+ 'id': 'blggmfno8ghl725bg',
'ext': 'mp4',
'age_limit': 18,
- 'title': '[MMD] Do It Again Ver.2 [1080p 60FPS] (Motion,Camera,Wav+DL)',
- 'description': 'md5:590c12c0df1443d833fbebe05da8c47a',
- 'upload_date': '20160910',
- 'uploader': 'aMMDsork',
- 'uploader_id': 'UCVOFyOSCyFkXTYYHITtqB7A',
+ 'title': 'お外でおしっこしちゃう猫耳ロリメイド',
+ 'description': 'md5:0342ba9bf6db09edbbb28729657c3611',
+ 'uploader': 'Fe_Kurosabi',
+ 'uploader_id': 'fekurosabi',
+ 'tags': [
+ 'pee'
+ ],
+ 'like_count': int,
+ 'view_count': int,
+ 'comment_count': int,
+ 'timestamp': 1598880567,
+ 'modified_timestamp': int,
+ 'upload_date': '20200831',
+ 'modified_date': '20230605',
+ 'thumbnail': 'https://files.iwara.tv/image/thumbnail/7693e881-d302-42a4-a780-f16d66b5dadd/thumbnail-00.jpg',
+ # 'availability': 'needs_auth',
},
- 'add_ie': ['Youtube'],
}]
+ def _extract_formats(self, video_id, fileurl):
+ up = urllib.parse.urlparse(fileurl)
+ q = urllib.parse.parse_qs(up.query)
+ paths = up.path.rstrip('/').split('/')
+ # https://github.com/hypervideo/hypervideo/issues/6549#issuecomment-1473771047
+ x_version = hashlib.sha1('_'.join((paths[-1], q['expires'][0], '5nFp9kmbNnHdAFhaqMvt')).encode()).hexdigest()
+
+ preference = qualities(['preview', '360', '540', 'Source'])
+
+ files = self._download_json(fileurl, video_id, headers={'X-Version': x_version})
+ for fmt in files:
+ yield traverse_obj(fmt, {
+ 'format_id': 'name',
+ 'url': ('src', ('view', 'download'), {self._proto_relative_url}),
+ 'ext': ('type', {mimetype2ext}),
+ 'quality': ('name', {preference}),
+ 'height': ('name', {int_or_none}),
+ }, get_all=False)
+
def _real_extract(self, url):
video_id = self._match_id(url)
-
- webpage, urlh = self._download_webpage_handle(url, video_id)
-
- hostname = urllib.parse.urlparse(urlh.geturl()).hostname
- # ecchi is 'sexy' in Japanese
- age_limit = 18 if hostname.split('.')[0] == 'ecchi' else 0
-
- video_data = self._download_json('http://www.iwara.tv/api/video/%s' % video_id, video_id)
-
- if not video_data:
- iframe_url = self._html_search_regex(
- r'<iframe[^>]+src=([\'"])(?P<url>[^\'"]+)\1',
- webpage, 'iframe URL', group='url')
- return {
- '_type': 'url_transparent',
- 'url': iframe_url,
- 'age_limit': age_limit,
- }
-
- title = remove_end(self._html_extract_title(webpage), ' | Iwara')
-
- thumbnail = self._html_search_regex(
- r'poster=[\'"]([^\'"]+)', webpage, 'thumbnail', default=None)
-
- uploader = self._html_search_regex(
- r'class="username">([^<]+)', webpage, 'uploader', fatal=False)
-
- upload_date = unified_strdate(self._html_search_regex(
- r'作成日:([^\s]+)', webpage, 'upload_date', fatal=False))
-
- description = strip_or_none(self._search_regex(
- r'<p>(.+?(?=</div))', webpage, 'description', fatal=False,
- flags=re.DOTALL))
-
- formats = []
- for a_format in video_data:
- format_uri = url_or_none(a_format.get('uri'))
- if not format_uri:
- continue
- format_id = a_format.get('resolution')
- height = int_or_none(self._search_regex(
- r'(\d+)p', format_id, 'height', default=None))
- formats.append({
- 'url': self._proto_relative_url(format_uri, 'https:'),
- 'format_id': format_id,
- 'ext': mimetype2ext(a_format.get('mime')) or 'mp4',
- 'height': height,
- 'width': int_or_none(height / 9.0 * 16.0 if height else None),
- 'quality': 1 if format_id == 'Source' else 0,
- })
+ username, _ = self._get_login_info()
+ video_data = self._download_json(
+ f'https://api.iwara.tv/video/{video_id}', video_id,
+ expected_status=lambda x: True, headers=self._get_media_token())
+ errmsg = video_data.get('message')
+ # at this point we can actually get uploaded user info, but do we need it?
+ if errmsg == 'errors.privateVideo':
+ self.raise_login_required('Private video. Login if you have permissions to watch', method='password')
+ elif errmsg == 'errors.notFound' and not username:
+ self.raise_login_required('Video may need login to view', method='password')
+ elif errmsg: # None if success
+ raise ExtractorError(f'Iwara says: {errmsg}')
+
+ if not video_data.get('fileUrl'):
+ if video_data.get('embedUrl'):
+ return self.url_result(video_data.get('embedUrl'))
+ raise ExtractorError('This video is unplayable', expected=True)
return {
'id': video_id,
- 'title': title,
- 'age_limit': age_limit,
- 'formats': formats,
- 'thumbnail': self._proto_relative_url(thumbnail, 'https:'),
- 'uploader': uploader,
- 'upload_date': upload_date,
- 'description': description,
+ 'age_limit': 18 if video_data.get('rating') == 'ecchi' else 0, # ecchi is 'sexy' in Japanese
+ **traverse_obj(video_data, {
+ 'title': 'title',
+ 'description': 'body',
+ 'uploader': ('user', 'name'),
+ 'uploader_id': ('user', 'username'),
+ 'tags': ('tags', ..., 'id'),
+ 'like_count': 'numLikes',
+ 'view_count': 'numViews',
+ 'comment_count': 'numComments',
+ 'timestamp': ('createdAt', {unified_timestamp}),
+ 'modified_timestamp': ('updatedAt', {unified_timestamp}),
+ 'thumbnail': ('file', 'id', {str}, {
+ lambda x: f'https://files.iwara.tv/image/thumbnail/{x}/thumbnail-00.jpg'}),
+ }),
+ 'formats': list(self._extract_formats(video_id, video_data.get('fileUrl'))),
}
-class IwaraPlaylistIE(IwaraBaseIE):
- _VALID_URL = fr'{IwaraBaseIE._BASE_REGEX}/playlist/(?P<id>[^/?#&]+)'
- IE_NAME = 'iwara:playlist'
+class IwaraUserIE(IwaraBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?iwara\.tv/profile/(?P<id>[^/?#&]+)'
+ IE_NAME = 'iwara:user'
+ _PER_PAGE = 32
_TESTS = [{
- 'url': 'https://ecchi.iwara.tv/playlist/best-enf',
+ 'url': 'https://iwara.tv/profile/user792540/videos',
+ 'info_dict': {
+ 'id': 'user792540',
+ 'title': 'Lyu ya',
+ },
+ 'playlist_mincount': 70,
+ }, {
+ 'url': 'https://iwara.tv/profile/theblackbirdcalls/videos',
'info_dict': {
- 'title': 'Best enf',
- 'uploader': 'Jared98112',
- 'id': 'best-enf',
+ 'id': 'theblackbirdcalls',
+ 'title': 'TheBlackbirdCalls',
},
- 'playlist_mincount': 1097,
+ 'playlist_mincount': 723,
+ }, {
+ 'url': 'https://iwara.tv/profile/user792540',
+ 'only_matching': True,
}, {
- # urlencoded
- 'url': 'https://ecchi.iwara.tv/playlist/%E3%83%97%E3%83%AC%E3%82%A4%E3%83%AA%E3%82%B9%E3%83%88-2',
+ 'url': 'https://iwara.tv/profile/theblackbirdcalls',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.iwara.tv/profile/lumymmd',
'info_dict': {
- 'id': 'プレイリスト-2',
- 'title': 'プレイリスト',
- 'uploader': 'mainyu',
+ 'id': 'lumymmd',
+ 'title': 'Lumy MMD',
},
- 'playlist_mincount': 91,
+ 'playlist_mincount': 1,
}]
+ def _entries(self, playlist_id, user_id, page):
+ videos = self._download_json(
+ 'https://api.iwara.tv/videos', playlist_id,
+ note=f'Downloading page {page}',
+ query={
+ 'page': page,
+ 'sort': 'date',
+ 'user': user_id,
+ 'limit': self._PER_PAGE,
+ }, headers=self._get_media_token())
+ for x in traverse_obj(videos, ('results', ..., 'id')):
+ yield self.url_result(f'https://iwara.tv/video/{x}')
+
def _real_extract(self, url):
- playlist_id, base_url = self._match_valid_url(url).group('id', 'base_url')
- playlist_id = urllib.parse.unquote(playlist_id)
- webpage = self._download_webpage(url, playlist_id)
+ playlist_id = self._match_id(url)
+ user_info = self._download_json(
+ f'https://api.iwara.tv/profile/{playlist_id}', playlist_id,
+ note='Requesting user info')
+ user_id = traverse_obj(user_info, ('user', 'id'))
- return {
- '_type': 'playlist',
- 'id': playlist_id,
- 'title': self._html_search_regex(r'class="title"[^>]*>([^<]+)', webpage, 'title', fatal=False),
- 'uploader': self._html_search_regex(r'<h2>([^<]+)', webpage, 'uploader', fatal=False),
- 'entries': self._extract_playlist(base_url, webpage),
- }
+ return self.playlist_result(
+ OnDemandPagedList(
+ functools.partial(self._entries, playlist_id, user_id),
+ self._PER_PAGE),
+ playlist_id, traverse_obj(user_info, ('user', 'name')))
-class IwaraUserIE(IwaraBaseIE):
- _VALID_URL = fr'{IwaraBaseIE._BASE_REGEX}/users/(?P<id>[^/?#&]+)'
- IE_NAME = 'iwara:user'
+class IwaraPlaylistIE(IwaraBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?iwara\.tv/playlist/(?P<id>[0-9a-f-]+)'
+ IE_NAME = 'iwara:playlist'
+ _PER_PAGE = 32
_TESTS = [{
- 'note': 'number of all videos page is just 1 page. less than 40 videos',
- 'url': 'https://ecchi.iwara.tv/users/infinityyukarip',
+ 'url': 'https://iwara.tv/playlist/458e5486-36a4-4ac0-b233-7e9eef01025f',
'info_dict': {
- 'title': 'Uploaded videos from Infinity_YukariP',
- 'id': 'infinityyukarip',
- 'uploader': 'Infinity_YukariP',
- 'uploader_id': 'infinityyukarip',
+ 'id': '458e5486-36a4-4ac0-b233-7e9eef01025f',
},
- 'playlist_mincount': 39,
- }, {
- 'note': 'no even all videos page. probably less than 10 videos',
- 'url': 'https://ecchi.iwara.tv/users/mmd-quintet',
- 'info_dict': {
- 'title': 'Uploaded videos from mmd quintet',
- 'id': 'mmd-quintet',
- 'uploader': 'mmd quintet',
- 'uploader_id': 'mmd-quintet',
- },
- 'playlist_mincount': 6,
- }, {
- 'note': 'has paging. more than 40 videos',
- 'url': 'https://ecchi.iwara.tv/users/theblackbirdcalls',
- 'info_dict': {
- 'title': 'Uploaded videos from TheBlackbirdCalls',
- 'id': 'theblackbirdcalls',
- 'uploader': 'TheBlackbirdCalls',
- 'uploader_id': 'theblackbirdcalls',
- },
- 'playlist_mincount': 420,
- }, {
- 'note': 'foreign chars in URL. there must be foreign characters in URL',
- 'url': 'https://ecchi.iwara.tv/users/ぶた丼',
- 'info_dict': {
- 'title': 'Uploaded videos from ぶた丼',
- 'id': 'ぶた丼',
- 'uploader': 'ぶた丼',
- 'uploader_id': 'ぶた丼',
- },
- 'playlist_mincount': 170,
+ 'playlist_mincount': 3,
}]
- def _entries(self, playlist_id, base_url):
- webpage = self._download_webpage(
- f'{base_url}/users/{playlist_id}', playlist_id)
- videos_url = self._search_regex(r'<a href="(/users/[^/]+/videos)(?:\?[^"]+)?">', webpage, 'all videos url', default=None)
- if not videos_url:
- yield from self._extract_playlist(base_url, webpage)
- return
-
- videos_url = urljoin(base_url, videos_url)
-
- for n in itertools.count(1):
- page = self._download_webpage(
- videos_url, playlist_id, note=f'Downloading playlist page {n}',
- query={'page': str(n - 1)} if n > 1 else {})
- yield from self._extract_playlist(
- base_url, page)
-
- if f'page={n}' not in page:
- break
+ def _entries(self, playlist_id, first_page, page):
+ videos = self._download_json(
+ 'https://api.iwara.tv/videos', playlist_id, f'Downloading page {page}',
+ query={'page': page, 'limit': self._PER_PAGE},
+ headers=self._get_media_token()) if page else first_page
+ for x in traverse_obj(videos, ('results', ..., 'id')):
+ yield self.url_result(f'https://iwara.tv/video/{x}')
def _real_extract(self, url):
- playlist_id, base_url = self._match_valid_url(url).group('id', 'base_url')
- playlist_id = urllib.parse.unquote(playlist_id)
+ playlist_id = self._match_id(url)
+ page_0 = self._download_json(
+ f'https://api.iwara.tv/playlist/{playlist_id}?page=0&limit={self._PER_PAGE}', playlist_id,
+ note='Requesting playlist info', headers=self._get_media_token())
return self.playlist_result(
- self._entries(playlist_id, base_url), playlist_id)
+ OnDemandPagedList(
+ functools.partial(self._entries, playlist_id, page_0),
+ self._PER_PAGE),
+ playlist_id, traverse_obj(page_0, ('title', 'name')))
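
The X-Version header computed in _extract_formats above is a SHA-1 digest over the last path segment of fileUrl, its expires query parameter, and a fixed salt. A standalone sketch; the sample URL is hypothetical:

    import hashlib
    import urllib.parse

    SALT = '5nFp9kmbNnHdAFhaqMvt'  # constant taken from the diff above

    def x_version(file_url):
        parsed = urllib.parse.urlparse(file_url)
        expires = urllib.parse.parse_qs(parsed.query)['expires'][0]
        file_id = parsed.path.rstrip('/').split('/')[-1]
        return hashlib.sha1('_'.join((file_id, expires, SALT)).encode()).hexdigest()

    # x_version('https://files.iwara.tv/file/abcd?expires=1700000000')
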
diff --git a/hypervideo_dl/extractor/joj.py b/hypervideo_dl/extractor/joj.py
index 9b62284..ea46042 100644
--- a/hypervideo_dl/extractor/joj.py
+++ b/hypervideo_dl/extractor/joj.py
@@ -23,10 +23,20 @@ class JojIE(InfoExtractor):
'id': 'a388ec4c-6019-4a4a-9312-b1bee194e932',
'ext': 'mp4',
'title': 'NOVÉ BÝVANIE',
- 'thumbnail': r're:^https?://.*\.jpg$',
+ 'thumbnail': r're:^https?://.*?$',
'duration': 3118,
}
}, {
+ 'url': 'https://media.joj.sk/embed/CSM0Na0l0p1',
+ 'info_dict': {
+ 'id': 'CSM0Na0l0p1',
+ 'ext': 'mp4',
+ 'height': 576,
+ 'title': 'Extrémne rodiny 2 - POKRAČOVANIE (2012/04/09 21:30:00)',
+ 'duration': 3937,
+ 'thumbnail': r're:^https?://.*?$',
+ }
+ }, {
'url': 'https://media.joj.sk/embed/9i1cxv',
'only_matching': True,
}, {
@@ -43,10 +53,10 @@ class JojIE(InfoExtractor):
webpage = self._download_webpage(
'https://media.joj.sk/embed/%s' % video_id, video_id)
- title = self._search_regex(
- (r'videoTitle\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1',
- r'<title>(?P<title>[^<]+)'), webpage, 'title',
- default=None, group='title') or self._og_search_title(webpage)
+ title = (self._search_json(r'videoTitle\s*:', webpage, 'title', video_id,
+ contains_pattern=r'["\'].+["\']', default=None)
+ or self._html_extract_title(webpage, default=None)
+ or self._og_search_title(webpage))
bitrates = self._parse_json(
self._search_regex(
@@ -58,11 +68,13 @@ class JojIE(InfoExtractor):
for format_url in try_get(bitrates, lambda x: x['mp4'], list) or []:
if isinstance(format_url, compat_str):
height = self._search_regex(
- r'(\d+)[pP]\.', format_url, 'height', default=None)
+ r'(\d+)[pP]|(pal)\.', format_url, 'height', default=None)
+ if height == 'pal':
+ height = 576
formats.append({
'url': format_url,
'format_id': format_field(height, None, '%sp'),
- 'height': int(height),
+ 'height': int_or_none(height),
})
if not formats:
playlist = self._download_xml(
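
A standalone sketch of the height normalization the JOJ hunk introduces: the token in the format URL is either '<N>p' or 'pal', and PAL SD material is treated as 576 lines (sample URLs are hypothetical):

    import re

    def height_from_url(format_url):
        m = re.search(r'(\d+)[pP]|(pal)\.', format_url)
        token = next((g for g in m.groups() if g), None) if m else None
        return 576 if token == 'pal' else int(token) if token else None

    # height_from_url('https://example.com/video_540p.mp4') -> 540
    # height_from_url('https://example.com/video_pal.mp4') -> 576
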
diff --git a/hypervideo_dl/extractor/jstream.py b/hypervideo_dl/extractor/jstream.py
new file mode 100644
index 0000000..3e2e627
--- /dev/null
+++ b/hypervideo_dl/extractor/jstream.py
@@ -0,0 +1,73 @@
+import base64
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+ float_or_none,
+ js_to_json,
+ remove_start,
+)
+
+
+class JStreamIE(InfoExtractor):
+ # group "id" only exists for compliance, not directly used in requests
+ # also all components are mandatory
+ _VALID_URL = r'jstream:(?P<host>www\d+):(?P<id>(?P<publisher>[a-z0-9]+):(?P<mid>\d+))'
+
+ _TESTS = [{
+ 'url': 'jstream:www50:eqd638pvwx:752',
+ 'info_dict': {
+ 'id': 'eqd638pvwx:752',
+ 'ext': 'mp4',
+ 'title': '阪神淡路大震災 激震の記録2020年版 解説動画',
+ 'duration': 672,
+ 'thumbnail': r're:https?://eqd638pvwx\.eq\.webcdn\.stream\.ne\.jp/.+\.jpg',
+ },
+ }]
+
+ def _parse_jsonp(self, callback, string, video_id):
+ return self._search_json(rf'\s*{re.escape(callback)}\s*\(', string, callback, video_id)
+
+ def _find_formats(self, video_id, movie_list_hls, host, publisher, subtitles):
+ for value in movie_list_hls:
+ text = value.get('text') or ''
+ if not text.startswith('auto'):
+ continue
+ m3u8_id = remove_start(remove_start(text, 'auto'), '_') or None
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(
+ f'https://{publisher}.eq.webcdn.stream.ne.jp/{host}/{publisher}/jmc_pub/{value.get("url")}', video_id, 'mp4', m3u8_id=m3u8_id)
+ self._merge_subtitles(subs, target=subtitles)
+ yield from fmts
+
+ def _real_extract(self, url):
+ host, publisher, mid, video_id = self._match_valid_url(url).group('host', 'publisher', 'mid', 'id')
+ video_info_jsonp = self._download_webpage(
+ f'https://{publisher}.eq.webcdn.stream.ne.jp/{host}/{publisher}/jmc_pub/eq_meta/v1/{mid}.jsonp',
+ video_id, 'Requesting video info')
+ video_info = self._parse_jsonp('metaDataResult', video_info_jsonp, video_id)['movie']
+ subtitles = {}
+ formats = list(self._find_formats(video_id, video_info.get('movie_list_hls'), host, publisher, subtitles))
+ self._remove_duplicate_formats(formats)
+ return {
+ 'id': video_id,
+ 'title': video_info.get('title'),
+ 'duration': float_or_none(video_info.get('duration')),
+ 'thumbnail': video_info.get('thumbnail_url'),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
+
+ @classmethod
+ def _extract_embed_urls(cls, url, webpage):
+ # check whether the webpage is eligible
+ # https://support.eq.stream.co.jp/hc/ja/articles/115008388147-%E3%83%97%E3%83%AC%E3%82%A4%E3%83%A4%E3%83%BCAPI%E3%81%AE%E3%82%B5%E3%83%B3%E3%83%97%E3%83%AB%E3%82%B3%E3%83%BC%E3%83%89
+ script_tag = re.search(r'<script\s*[^>]+?src="https://ssl-cache\.stream\.ne\.jp/(?P<host>www\d+)/(?P<publisher>[a-z0-9]+)/[^"]+?/if\.js"', webpage)
+ if not script_tag:
+ return
+ host, publisher = script_tag.groups()
+ for m in re.finditer(r'(?s)PlayerFactoryIF\.create\(\s*({[^\}]+?})\s*\)\s*;', webpage):
+ # TODO: using json.loads here because InfoExtractor._parse_json is not a classmethod
+ info = json.loads(js_to_json(m.group(1)))
+ mid = base64.b64decode(info.get('m')).decode()
+ yield f'jstream:{host}:{publisher}:{mid}'
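
A minimal sketch of the JSONP unwrapping _parse_jsonp performs via _search_json: strip the callback wrapper and parse the payload (simplified; the real helper balances braces instead of matching greedily):

    import json
    import re

    def parse_jsonp(callback, text):
        m = re.search(rf'{re.escape(callback)}\s*\(\s*(?P<json>{{.*}})\s*\)', text, re.DOTALL)
        return json.loads(m.group('json')) if m else None

    # parse_jsonp('metaDataResult', 'metaDataResult({"movie": {"title": "t"}})')
    # -> {'movie': {'title': 't'}}
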
diff --git a/hypervideo_dl/extractor/jwplatform.py b/hypervideo_dl/extractor/jwplatform.py
index c949689..bc47aa6 100644
--- a/hypervideo_dl/extractor/jwplatform.py
+++ b/hypervideo_dl/extractor/jwplatform.py
@@ -8,14 +8,16 @@ class JWPlatformIE(InfoExtractor):
_VALID_URL = r'(?:https?://(?:content\.jwplatform|cdn\.jwplayer)\.com/(?:(?:feed|player|thumb|preview|manifest)s|jw6|v2/media)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})'
_TESTS = [{
'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js',
- 'md5': 'fa8899fa601eb7c83a64e9d568bdf325',
+ 'md5': '3aa16e4f6860e6e78b7df5829519aed3',
'info_dict': {
'id': 'nPripu9l',
- 'ext': 'mov',
+ 'ext': 'mp4',
'title': 'Big Buck Bunny Trailer',
'description': 'Big Buck Bunny is a short animated film by the Blender Institute. It is made using free and open source software.',
'upload_date': '20081127',
'timestamp': 1227796140,
+ 'duration': 32.0,
+ 'thumbnail': 'https://cdn.jwplayer.com/v2/media/nPripu9l/poster.jpg?width=720',
}
}, {
'url': 'https://cdn.jwplayer.com/players/nPripu9l-ALJ3XQCI.js',
@@ -37,18 +39,31 @@ class JWPlatformIE(InfoExtractor):
},
}, {
# Player url not surrounded by quotes
- 'url': 'https://www.deutsche-kinemathek.de/en/online/streaming/darling-berlin',
+ 'url': 'https://www.deutsche-kinemathek.de/en/online/streaming/school-trip',
'info_dict': {
- 'id': 'R10NQdhY',
- 'title': 'Playgirl',
+ 'id': 'jUxh5uin',
+ 'title': 'Klassenfahrt',
'ext': 'mp4',
- 'upload_date': '20220624',
- 'thumbnail': 'https://cdn.jwplayer.com/v2/media/R10NQdhY/poster.jpg?width=720',
- 'timestamp': 1656064800,
- 'description': 'BRD 1966, Will Tremper',
- 'duration': 5146.0,
+ 'upload_date': '20230109',
+ 'thumbnail': 'https://cdn.jwplayer.com/v2/media/jUxh5uin/poster.jpg?width=720',
+ 'timestamp': 1673270298,
+ 'description': '',
+ 'duration': 5193.0,
},
'params': {'allowed_extractors': ['generic', 'jwplatform']},
+ }, {
+ # iframe src attribute includes backslash before URL string
+ 'url': 'https://www.elespectador.com/colombia/video-asi-se-evito-la-fuga-de-john-poulos-presunto-feminicida-de-valentina-trespalacios-explicacion',
+ 'info_dict': {
+ 'id': 'QD3gsexj',
+ 'title': 'Así se evitó la fuga de John Poulos, presunto feminicida de Valentina Trespalacios',
+ 'ext': 'mp4',
+ 'upload_date': '20230127',
+ 'thumbnail': 'https://cdn.jwplayer.com/v2/media/QD3gsexj/poster.jpg?width=720',
+ 'timestamp': 1674862986,
+ 'description': 'md5:128fd74591c4e1fc2da598c5cb6f5ce4',
+ 'duration': 263.0,
+ },
}]
@classmethod
@@ -57,7 +72,7 @@ class JWPlatformIE(InfoExtractor):
# <input value=URL> is used by hyland.com
# if we find <iframe>, don't look for <input>
ret = re.findall(
- r'<%s[^>]+?%s=["\']?((?:https?:)?//(?:content\.jwplatform|cdn\.jwplayer)\.com/players/[a-zA-Z0-9]{8})' % (tag, key),
+ r'<%s[^>]+?%s=\\?["\']?((?:https?:)?//(?:content\.jwplatform|cdn\.jwplayer)\.com/players/[a-zA-Z0-9]{8})' % (tag, key),
webpage)
if ret:
return ret
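
The only regex change in this hunk is the optional backslash after src=, for pages that embed the player URL inside a JSON-escaped attribute. A quick standalone check against a hypothetical page fragment:

    import re

    html = r'<iframe src=\"https://cdn.jwplayer.com/players/QD3gsexj-abcdefgh\">'  # hypothetical
    urls = re.findall(
        r'<iframe[^>]+?src=\\?["\']?((?:https?:)?//(?:content\.jwplatform|cdn\.jwplayer)\.com/players/[a-zA-Z0-9]{8})',
        html)
    # urls == ['https://cdn.jwplayer.com/players/QD3gsexj']
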
diff --git a/hypervideo_dl/extractor/kakao.py b/hypervideo_dl/extractor/kakao.py
index 1f0f0a5..43055e8 100644
--- a/hypervideo_dl/extractor/kakao.py
+++ b/hypervideo_dl/extractor/kakao.py
@@ -1,5 +1,5 @@
from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
int_or_none,
@@ -101,8 +101,8 @@ class KakaoIE(InfoExtractor):
cdn_api_base, video_id, query=query,
note='Downloading video URL for profile %s' % profile_name)
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
- resp = self._parse_json(e.cause.read().decode(), video_id)
+ if isinstance(e.cause, HTTPError) and e.cause.status == 403:
+ resp = self._parse_json(e.cause.response.read().decode(), video_id)
if resp.get('code') == 'GeoBlocked':
self.raise_geo_restricted()
raise
diff --git a/hypervideo_dl/extractor/kankanews.py b/hypervideo_dl/extractor/kankanews.py
new file mode 100644
index 0000000..46e239b
--- /dev/null
+++ b/hypervideo_dl/extractor/kankanews.py
@@ -0,0 +1,48 @@
+import time
+import random
+import string
+import hashlib
+import urllib.parse
+
+from .common import InfoExtractor
+
+
+class KankaNewsIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?kankanews\.com/a/\d+\-\d+\-\d+/(?P<id>\d+)\.shtml'
+ _TESTS = [{
+ 'url': 'https://www.kankanews.com/a/2022-11-08/00310276054.shtml?appid=1088227',
+ 'md5': '05e126513c74b1258d657452a6f4eef9',
+ 'info_dict': {
+ 'id': '4485057',
+ 'url': 'http://mediaplay.kksmg.com/2022/11/08/h264_450k_mp4_1a388ad771e0e4cc28b0da44d245054e_ncm.mp4',
+ 'ext': 'mp4',
+ 'title': '视频|第23个中国记者节,我们在进博切蛋糕',
+ 'thumbnail': r're:^https?://.*\.jpg*',
+ }
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ video_id = self._search_regex(r'omsid\s*=\s*"(\d+)"', webpage, 'video id')
+
+ params = {
+ 'nonce': ''.join(random.choices(string.ascii_lowercase + string.digits, k=8)),
+ 'omsid': video_id,
+ 'platform': 'pc',
+ 'timestamp': int(time.time()),
+ 'version': '1.0',
+ }
+ params['sign'] = hashlib.md5((hashlib.md5((
+ urllib.parse.urlencode(params) + '&28c8edde3d61a0411511d3b1866f0636'
+ ).encode()).hexdigest()).encode()).hexdigest()
+
+ meta = self._download_json('https://api-app.kankanews.com/kankan/pc/getvideo',
+ video_id, query=params)['result']['video']
+
+ return {
+ 'id': video_id,
+ 'url': meta['videourl'],
+ 'title': self._search_regex(r'g\.title\s*=\s*"([^"]+)"', webpage, 'title'),
+ 'thumbnail': meta.get('titlepic'),
+ }
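
A standalone sketch of the double-MD5 request signature built in _real_extract above; the parameter values are illustrative:

    import hashlib
    import urllib.parse

    SUFFIX = '&28c8edde3d61a0411511d3b1866f0636'  # fixed salt from the extractor above

    def kankan_sign(params):
        inner = hashlib.md5((urllib.parse.urlencode(params) + SUFFIX).encode()).hexdigest()
        return hashlib.md5(inner.encode()).hexdigest()

    params = {'nonce': 'abc12345', 'omsid': '4485057', 'platform': 'pc',
              'timestamp': 1700000000, 'version': '1.0'}
    params['sign'] = kankan_sign(params)
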
diff --git a/hypervideo_dl/extractor/kick.py b/hypervideo_dl/extractor/kick.py
new file mode 100644
index 0000000..d124372
--- /dev/null
+++ b/hypervideo_dl/extractor/kick.py
@@ -0,0 +1,126 @@
+from .common import InfoExtractor
+from ..networking import HEADRequest
+from ..utils import (
+ UserNotLive,
+ float_or_none,
+ merge_dicts,
+ str_or_none,
+ traverse_obj,
+ unified_timestamp,
+ url_or_none,
+)
+
+
+class KickBaseIE(InfoExtractor):
+ def _real_initialize(self):
+ self._request_webpage(HEADRequest('https://kick.com/'), None, 'Setting up session', fatal=False)
+ xsrf_token = self._get_cookies('https://kick.com/').get('XSRF-TOKEN')
+ if not xsrf_token:
+ self.write_debug('kick.com did not set XSRF-TOKEN cookie')
+ KickBaseIE._API_HEADERS = {
+ 'Authorization': f'Bearer {xsrf_token.value}',
+ 'X-XSRF-TOKEN': xsrf_token.value,
+ } if xsrf_token else {}
+
+ def _call_api(self, path, display_id, note='Downloading API JSON', headers={}, **kwargs):
+ return self._download_json(
+ f'https://kick.com/api/v1/{path}', display_id, note=note,
+ headers=merge_dicts(headers, self._API_HEADERS), **kwargs)
+
+
+class KickIE(KickBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?kick\.com/(?!(?:video|categories|search|auth)(?:[/?#]|$))(?P<id>[\w-]+)'
+ _TESTS = [{
+ 'url': 'https://kick.com/yuppy',
+ 'info_dict': {
+ 'id': '6cde1-kickrp-joe-flemmingskick-info-heremust-knowmust-see21',
+ 'ext': 'mp4',
+ 'title': str,
+ 'description': str,
+ 'channel': 'yuppy',
+ 'channel_id': '33538',
+ 'uploader': 'Yuppy',
+ 'uploader_id': '33793',
+ 'upload_date': str,
+ 'live_status': 'is_live',
+ 'timestamp': int,
+ 'thumbnail': r're:^https?://.*\.jpg',
+ 'categories': list,
+ },
+ 'skip': 'livestream',
+ }, {
+ 'url': 'https://kick.com/kmack710',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ channel = self._match_id(url)
+ response = self._call_api(f'channels/{channel}', channel)
+ if not traverse_obj(response, 'livestream', expected_type=dict):
+ raise UserNotLive(video_id=channel)
+
+ return {
+ 'id': str(traverse_obj(
+ response, ('livestream', ('slug', 'id')), get_all=False, default=channel)),
+ 'formats': self._extract_m3u8_formats(
+ response['playback_url'], channel, 'mp4', live=True),
+ 'title': traverse_obj(
+ response, ('livestream', ('session_title', 'slug')), get_all=False, default=''),
+ 'description': traverse_obj(response, ('user', 'bio')),
+ 'channel': channel,
+ 'channel_id': str_or_none(traverse_obj(response, 'id', ('livestream', 'channel_id'))),
+ 'uploader': traverse_obj(response, 'name', ('user', 'username')),
+ 'uploader_id': str_or_none(traverse_obj(response, 'user_id', ('user', 'id'))),
+ 'is_live': True,
+ 'timestamp': unified_timestamp(traverse_obj(response, ('livestream', 'created_at'))),
+ 'thumbnail': traverse_obj(
+ response, ('livestream', 'thumbnail', 'url'), expected_type=url_or_none),
+ 'categories': traverse_obj(response, ('recent_categories', ..., 'name')),
+ }
+
+
+class KickVODIE(KickBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?kick\.com/video/(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})'
+ _TESTS = [{
+ 'url': 'https://kick.com/video/54244b5e-050a-4df4-a013-b2433dafbe35',
+ 'md5': '73691206a6a49db25c5aa1588e6538fc',
+ 'info_dict': {
+ 'id': '54244b5e-050a-4df4-a013-b2433dafbe35',
+ 'ext': 'mp4',
+ 'title': 'Making 710-carBoosting. Kinda No Pixel inspired. !guilded - !links',
+ 'description': 'md5:a0d3546bf7955d0a8252ffe0fd6f518f',
+ 'channel': 'kmack710',
+ 'channel_id': '16278',
+ 'uploader': 'Kmack710',
+ 'uploader_id': '16412',
+ 'upload_date': '20221206',
+ 'timestamp': 1670318289,
+ 'duration': 40104.0,
+ 'thumbnail': r're:^https?://.*\.jpg',
+ 'categories': ['Grand Theft Auto V'],
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ response = self._call_api(f'video/{video_id}', video_id)
+
+ return {
+ 'id': video_id,
+ 'formats': self._extract_m3u8_formats(response['source'], video_id, 'mp4'),
+ 'title': traverse_obj(
+ response, ('livestream', ('session_title', 'slug')), get_all=False, default=''),
+ 'description': traverse_obj(response, ('livestream', 'channel', 'user', 'bio')),
+ 'channel': traverse_obj(response, ('livestream', 'channel', 'slug')),
+ 'channel_id': str_or_none(traverse_obj(response, ('livestream', 'channel', 'id'))),
+ 'uploader': traverse_obj(response, ('livestream', 'channel', 'user', 'username')),
+ 'uploader_id': str_or_none(traverse_obj(response, ('livestream', 'channel', 'user_id'))),
+ 'timestamp': unified_timestamp(response.get('created_at')),
+ 'duration': float_or_none(traverse_obj(response, ('livestream', 'duration')), scale=1000),
+ 'thumbnail': traverse_obj(
+ response, ('livestream', 'thumbnail'), expected_type=url_or_none),
+ 'categories': traverse_obj(response, ('livestream', 'categories', ..., 'name')),
+ }
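
A hypothetical standalone equivalent of the KickBaseIE session bootstrap, written with the requests library instead of the extractor framework; it only illustrates the XSRF-cookie-to-header flow, and kick.com may additionally demand browser-like headers:

    import requests

    session = requests.Session()
    session.head('https://kick.com/')  # the server sets the XSRF-TOKEN cookie here
    token = session.cookies.get('XSRF-TOKEN')
    headers = {'Authorization': f'Bearer {token}', 'X-XSRF-TOKEN': token} if token else {}
    info = session.get('https://kick.com/api/v1/channels/yuppy', headers=headers).json()
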
diff --git a/hypervideo_dl/extractor/kommunetv.py b/hypervideo_dl/extractor/kommunetv.py
new file mode 100644
index 0000000..e21e556
--- /dev/null
+++ b/hypervideo_dl/extractor/kommunetv.py
@@ -0,0 +1,31 @@
+from .common import InfoExtractor
+from ..utils import update_url
+
+
+class KommunetvIE(InfoExtractor):
+ _VALID_URL = r'https://(?:\w+)\.kommunetv\.no/archive/(?P<id>\w+)'
+ _TEST = {
+ 'url': 'https://oslo.kommunetv.no/archive/921',
+ 'md5': '5f102be308ee759be1e12b63d5da4bbc',
+ 'info_dict': {
+ 'id': '921',
+ 'title': 'Bystyremøte',
+ 'ext': 'mp4'
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ headers = {
+ 'Accept': 'application/json'
+ }
+ data = self._download_json('https://oslo.kommunetv.no/api/streams?streamType=1&id=%s' % video_id, video_id, headers=headers)
+ title = data['stream']['title']
+ file = data['playlist'][0]['playlist'][0]['file']
+ url = update_url(file, query=None, fragment=None)
+ formats = self._extract_m3u8_formats(url, video_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'title': title
+ }
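
What update_url(file, query=None, fragment=None) does here, expressed with the standard library (the input URL is hypothetical):

    import urllib.parse

    file = 'https://oslo.kommunetv.no/stream.m3u8?token=abc#t=0'  # hypothetical
    clean = urllib.parse.urlparse(file)._replace(query='', fragment='').geturl()
    # clean == 'https://oslo.kommunetv.no/stream.m3u8'
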
diff --git a/hypervideo_dl/extractor/kuwo.py b/hypervideo_dl/extractor/kuwo.py
index cfec1c5..e8a061a 100644
--- a/hypervideo_dl/extractor/kuwo.py
+++ b/hypervideo_dl/extractor/kuwo.py
@@ -91,7 +91,7 @@ class KuwoIE(KuwoBaseIE):
webpage, urlh = self._download_webpage_handle(
url, song_id, note='Download song detail info',
errnote='Unable to get song detail info')
- if song_id not in urlh.geturl() or '对不起,该歌曲由于版权问题已被下线,将返回网站首页' in webpage:
+ if song_id not in urlh.url or '对不起,该歌曲由于版权问题已被下线,将返回网站首页' in webpage:
raise ExtractorError('this song has been offline because of copyright issues', expected=True)
song_name = self._html_search_regex(
diff --git a/hypervideo_dl/extractor/la7.py b/hypervideo_dl/extractor/la7.py
index 68dc1d4..a3cd12b 100644
--- a/hypervideo_dl/extractor/la7.py
+++ b/hypervideo_dl/extractor/la7.py
@@ -1,25 +1,19 @@
import re
from .common import InfoExtractor
-from ..utils import (
- determine_ext,
- float_or_none,
- HEADRequest,
- int_or_none,
- parse_duration,
- unified_strdate,
-)
+from ..networking import HEADRequest
+from ..utils import float_or_none, int_or_none, parse_duration, unified_strdate
class LA7IE(InfoExtractor):
IE_NAME = 'la7.it'
- _VALID_URL = r'''(?x)(https?://)?(?:
- (?:www\.)?la7\.it/([^/]+)/(?:rivedila7|video)/|
+ _VALID_URL = r'''(?x)https?://(?:
+ (?:www\.)?la7\.it/([^/]+)/(?:rivedila7|video|news)/|
tg\.la7\.it/repliche-tgla7\?id=
)(?P<id>.+)'''
_TESTS = [{
- # 'src' is a plain URL
+ # single quality video
'url': 'http://www.la7.it/crozza/video/inccool8-02-10-2015-163722',
'md5': '8b613ffc0c4bf9b9e377169fc19c214c',
'info_dict': {
@@ -29,6 +23,20 @@ class LA7IE(InfoExtractor):
'description': 'Benvenuti nell\'incredibile mondo della INC. COOL. 8. dove “INC.” sta per “Incorporated” “COOL” sta per “fashion” ed Eight sta per il gesto atletico',
'thumbnail': 're:^https?://.*',
'upload_date': '20151002',
+ 'formats': 'count:4',
+ },
+ }, {
+ # multiple quality video
+ 'url': 'https://www.la7.it/calcio-femminile/news/il-gol-di-lindsey-thomas-fiorentina-vs-milan-serie-a-calcio-femminile-26-11-2022-461736',
+ 'md5': 'd2370e78f75e8d1238cb3a0db9a2eda3',
+ 'info_dict': {
+ 'id': 'il-gol-di-lindsey-thomas-fiorentina-vs-milan-serie-a-calcio-femminile-26-11-2022-461736',
+ 'ext': 'mp4',
+ 'title': 'Il gol di Lindsey Thomas | Fiorentina vs Milan | Serie A Calcio Femminile',
+ 'description': 'Il gol di Lindsey Thomas | Fiorentina vs Milan | Serie A Calcio Femminile',
+ 'thumbnail': 're:^https?://.*',
+ 'upload_date': '20221126',
+ 'formats': 'count:8',
},
}, {
'url': 'http://www.la7.it/omnibus/rivedila7/omnibus-news-02-07-2016-189077',
@@ -39,7 +47,7 @@ class LA7IE(InfoExtractor):
def _generate_mp4_url(self, quality, m3u8_formats):
for f in m3u8_formats:
if f['vcodec'] != 'none' and quality in f['url']:
- http_url = '%s%s.mp4' % (self._HOST, quality)
+ http_url = f'{self._HOST}{quality}.mp4'
urlh = self._request_webpage(
HEADRequest(http_url), quality,
@@ -58,12 +66,13 @@ class LA7IE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
- if not url.startswith('http'):
- url = '%s//%s' % (self.http_scheme(), url)
+ if re.search(r'(?i)(drmsupport\s*:\s*true)\s*', webpage):
+ self.report_drm(video_id)
- webpage = self._download_webpage(url, video_id)
- video_path = self._search_regex(r'(/content/.*?).mp4', webpage, 'video_path')
+ video_path = self._search_regex(
+ r'(/content/[\w/,]+?)\.mp4(?:\.csmil)?/master\.m3u8', webpage, 'video_path')
formats = self._extract_mpd_formats(
f'{self._HOST}/local/dash/,{video_path}.mp4.urlset/manifest.mpd',
@@ -90,8 +99,7 @@ class LA7IE(InfoExtractor):
class LA7PodcastEpisodeIE(InfoExtractor):
IE_NAME = 'la7.it:pod:episode'
- _VALID_URL = r'''(?x)(https?://)?
- (?:www\.)?la7\.it/[^/]+/podcast/([^/]+-)?(?P<id>\d+)'''
+ _VALID_URL = r'https?://(?:www\.)?la7\.it/[^/]+/podcast/([^/]+-)?(?P<id>\d+)'
_TESTS = [{
'url': 'https://www.la7.it/voicetown/podcast/la-carezza-delle-memoria-di-carlo-verdone-23-03-2021-371497',
@@ -125,14 +133,15 @@ class LA7PodcastEpisodeIE(InfoExtractor):
webpage, 'video_id', group='vid')
media_url = self._search_regex(
- (r'src:\s*([\'"])(?P<url>.+?mp3.+?)\1',
- r'data-podcast=([\'"])(?P<url>.+?mp3.+?)\1'),
+ (r'src\s*:\s*([\'"])(?P<url>\S+?mp3.+?)\1',
+ r'data-podcast\s*=\s*([\'"])(?P<url>\S+?mp3.+?)\1'),
webpage, 'media_url', group='url')
- ext = determine_ext(media_url)
formats = [{
'url': media_url,
- 'format_id': ext,
- 'ext': ext,
+ 'format_id': 'http-mp3',
+ 'ext': 'mp3',
+ 'acodec': 'mp3',
+ 'vcodec': 'none',
}]
title = self._html_search_regex(
@@ -173,7 +182,7 @@ class LA7PodcastEpisodeIE(InfoExtractor):
# and title is the same as the show_title
# add the date to the title
if date and not date_alt and ppn and ppn.lower() == title.lower():
- title += ' del %s' % date
+ title = f'{title} del {date}'
return {
'id': video_id,
'title': title,
@@ -193,7 +202,7 @@ class LA7PodcastEpisodeIE(InfoExtractor):
class LA7PodcastIE(LA7PodcastEpisodeIE): # XXX: Do not subclass from concrete IE
IE_NAME = 'la7.it:podcast'
- _VALID_URL = r'(https?://)?(www\.)?la7\.it/(?P<id>[^/]+)/podcast/?(?:$|[#?])'
+ _VALID_URL = r'https?://(?:www\.)?la7\.it/(?P<id>[^/]+)/podcast/?(?:$|[#?])'
_TESTS = [{
'url': 'https://www.la7.it/propagandalive/podcast',
@@ -201,7 +210,7 @@ class LA7PodcastIE(LA7PodcastEpisodeIE): # XXX: Do not subclass from concrete I
'id': 'propagandalive',
'title': "Propaganda Live",
},
- 'playlist_count': 10,
+ 'playlist_mincount': 10,
}]
def _real_extract(self, url):
diff --git a/hypervideo_dl/extractor/lastfm.py b/hypervideo_dl/extractor/lastfm.py
index f14198c..6710335 100644
--- a/hypervideo_dl/extractor/lastfm.py
+++ b/hypervideo_dl/extractor/lastfm.py
@@ -1,33 +1,24 @@
+import itertools
import re
from .common import InfoExtractor
-from ..utils import int_or_none, format_field
+from ..utils import int_or_none, parse_qs, traverse_obj
class LastFMPlaylistBaseIE(InfoExtractor):
def _entries(self, url, playlist_id):
- webpage = self._download_webpage(url, playlist_id)
- start_page_number = int_or_none(self._search_regex(
- r'\bpage=(\d+)', url, 'page', default=None)) or 1
- last_page_number = int_or_none(self._search_regex(
- r'>(\d+)</a>[^<]*</li>[^<]*<li[^>]+class="pagination-next', webpage, 'last_page', default=None))
-
- for page_number in range(start_page_number, (last_page_number or start_page_number) + 1):
+ single_page = traverse_obj(parse_qs(url), ('page', -1, {int_or_none}))
+ for page in itertools.count(single_page or 1):
webpage = self._download_webpage(
- url, playlist_id,
- note='Downloading page %d%s' % (page_number, format_field(last_page_number, None, ' of %d')),
- query={'page': page_number})
- page_entries = [
- self.url_result(player_url, 'Youtube')
- for player_url in set(re.findall(r'data-youtube-url="([^"]+)"', webpage))
- ]
-
- for e in page_entries:
- yield e
+ url, playlist_id, f'Downloading page {page}', query={'page': page})
+ videos = re.findall(r'data-youtube-url="([^"]+)"', webpage)
+ yield from videos
+ if single_page or not videos:
+ return
def _real_extract(self, url):
playlist_id = self._match_id(url)
- return self.playlist_result(self._entries(url, playlist_id), playlist_id)
+ return self.playlist_from_matches(self._entries(url, playlist_id), playlist_id, ie='Youtube')
class LastFMPlaylistIE(LastFMPlaylistBaseIE):
@@ -37,7 +28,7 @@ class LastFMPlaylistIE(LastFMPlaylistBaseIE):
'info_dict': {
'id': 'Oasis',
},
- 'playlist_count': 11,
+ 'playlist_mincount': 11,
}, {
'url': 'https://www.last.fm/music/Oasis',
'only_matching': True,
@@ -73,6 +64,18 @@ class LastFMUserIE(LastFMPlaylistBaseIE):
'id': '12319471',
},
'playlist_count': 30,
+ }, {
+ 'url': 'https://www.last.fm/user/naamloos1/playlists/12543760',
+ 'info_dict': {
+ 'id': '12543760',
+ },
+ 'playlist_mincount': 80,
+ }, {
+ 'url': 'https://www.last.fm/user/naamloos1/playlists/12543760?page=3',
+ 'info_dict': {
+ 'id': '12543760',
+ },
+ 'playlist_count': 32,
}]
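
The rewritten Last.fm pager either fetches the single page pinned by ?page=N or walks pages until an empty one. A sketch of that control flow, where fetch_page stands in for the webpage download and regex scrape:

    import itertools

    def entries(fetch_page, pinned_page=None):
        for page in itertools.count(pinned_page or 1):
            videos = fetch_page(page)  # list of YouTube URLs scraped from that page
            yield from videos
            if pinned_page or not videos:
                return

    # list(entries(lambda page: ['url1', 'url2'] if page < 3 else []))
    # -> ['url1', 'url2', 'url1', 'url2']
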
diff --git a/hypervideo_dl/extractor/lbry.py b/hypervideo_dl/extractor/lbry.py
index b5def1e..9a9f925 100644
--- a/hypervideo_dl/extractor/lbry.py
+++ b/hypervideo_dl/extractor/lbry.py
@@ -1,18 +1,22 @@
import functools
import json
+import re
+import urllib.parse
from .common import InfoExtractor
-from ..compat import compat_str, compat_urllib_parse_unquote
+from ..networking import HEADRequest
from ..utils import (
ExtractorError,
- HEADRequest,
OnDemandPagedList,
UnsupportedError,
determine_ext,
int_or_none,
mimetype2ext,
parse_qs,
+ traverse_obj,
try_get,
+ url_or_none,
+ urlhandle_detect_ext,
urljoin,
)
@@ -52,38 +56,25 @@ class LBRYBaseIE(InfoExtractor):
'/%s:%s' % (claim_name, claim_id))
def _parse_stream(self, stream, url):
- stream_value = stream.get('value') or {}
- stream_type = stream_value.get('stream_type')
- source = stream_value.get('source') or {}
- media = stream_value.get(stream_type) or {}
- signing_channel = stream.get('signing_channel') or {}
- channel_name = signing_channel.get('name')
- channel_claim_id = signing_channel.get('claim_id')
- channel_url = None
- if channel_name and channel_claim_id:
- channel_url = self._permanent_url(url, channel_name, channel_claim_id)
+ stream_type = traverse_obj(stream, ('value', 'stream_type', {str}))
+
+ info = traverse_obj(stream, {
+ 'title': ('value', 'title', {str}),
+ 'thumbnail': ('value', 'thumbnail', 'url', {url_or_none}),
+ 'description': ('value', 'description', {str}),
+ 'license': ('value', 'license', {str}),
+ 'timestamp': ('timestamp', {int_or_none}),
+ 'release_timestamp': ('value', 'release_time', {int_or_none}),
+ 'tags': ('value', 'tags', ..., {lambda x: x or None}),
+ 'duration': ('value', stream_type, 'duration', {int_or_none}),
+ 'channel': ('signing_channel', 'value', 'title', {str}),
+ 'channel_id': ('signing_channel', 'claim_id', {str}),
+ })
+
+ channel_name = traverse_obj(stream, ('signing_channel', 'name', {str}))
+ if channel_name and info.get('channel_id'):
+ info['channel_url'] = self._permanent_url(url, channel_name, info['channel_id'])
- info = {
- 'thumbnail': try_get(stream_value, lambda x: x['thumbnail']['url'], compat_str),
- 'description': stream_value.get('description'),
- 'license': stream_value.get('license'),
- 'timestamp': int_or_none(stream.get('timestamp')),
- 'release_timestamp': int_or_none(stream_value.get('release_time')),
- 'tags': stream_value.get('tags'),
- 'duration': int_or_none(media.get('duration')),
- 'channel': try_get(signing_channel, lambda x: x['value']['title']),
- 'channel_id': channel_claim_id,
- 'channel_url': channel_url,
- 'ext': determine_ext(source.get('name')) or mimetype2ext(source.get('media_type')),
- 'filesize': int_or_none(source.get('size')),
- }
- if stream_type == 'audio':
- info['vcodec'] = 'none'
- else:
- info.update({
- 'width': int_or_none(media.get('width')),
- 'height': int_or_none(media.get('height')),
- })
return info
@@ -93,7 +84,7 @@ class LBRYIE(LBRYBaseIE):
_TESTS = [{
# Video
'url': 'https://lbry.tv/@Mantega:1/First-day-LBRY:1',
- 'md5': 'fffd15d76062e9a985c22c7c7f2f4805',
+ 'md5': '65bd7ec1f6744ada55da8e4c48a2edf9',
'info_dict': {
'id': '17f983b61f53091fb8ea58a9c56804e4ff8cff4d',
'ext': 'mp4',
@@ -142,9 +133,8 @@ class LBRYIE(LBRYBaseIE):
'license': 'None',
}
}, {
- # HLS
'url': 'https://odysee.com/@gardeningincanada:b/plants-i-will-never-grow-again.-the:e',
- 'md5': '25049011f3c8bc2f8b60ad88a031837e',
+ 'md5': 'c35fac796f62a14274b4dc2addb5d0ba',
'info_dict': {
'id': 'e51671357333fe22ae88aad320bde2f6f96b1410',
'ext': 'mp4',
@@ -187,6 +177,28 @@ class LBRYIE(LBRYBaseIE):
},
'params': {'skip_download': True}
}, {
+ # original quality format w/higher resolution than HLS formats
+ 'url': 'https://odysee.com/@wickedtruths:2/Biotechnological-Invasion-of-Skin-(April-2023):4',
+ 'md5': '305b0b3b369bde1b984961f005b67193',
+ 'info_dict': {
+ 'id': '41fbfe805eb73c8d3012c0c49faa0f563274f634',
+ 'ext': 'mp4',
+ 'title': 'Biotechnological Invasion of Skin (April 2023)',
+ 'description': 'md5:709a2f4c07bd8891cda3a7cc2d6fcf5c',
+ 'channel': 'Wicked Truths',
+ 'channel_id': '23d2bbf856b0ceed5b1d7c5960bcc72da5a20cb0',
+ 'channel_url': 'https://odysee.com/@wickedtruths:23d2bbf856b0ceed5b1d7c5960bcc72da5a20cb0',
+ 'timestamp': 1685790036,
+ 'upload_date': '20230603',
+ 'release_timestamp': 1685617473,
+ 'release_date': '20230601',
+ 'duration': 1063,
+ 'thumbnail': 'https://thumbs.odycdn.com/4e6d39da4df0cfdad45f64e253a15959.webp',
+ 'tags': ['smart skin surveillance', 'biotechnology invasion of skin', 'morgellons'],
+ 'license': 'None',
+ 'protocol': 'https', # test for direct mp4 download
+ },
+ }, {
'url': 'https://odysee.com/@BrodieRobertson:5/apple-is-tracking-everything-you-do-on:e',
'only_matching': True,
}, {
@@ -221,41 +233,65 @@ class LBRYIE(LBRYBaseIE):
display_id = display_id.split('/', 2)[-1].replace('/', ':')
else:
display_id = display_id.replace(':', '#')
- display_id = compat_urllib_parse_unquote(display_id)
+ display_id = urllib.parse.unquote(display_id)
uri = 'lbry://' + display_id
result = self._resolve_url(uri, display_id, 'stream')
headers = {'Referer': 'https://odysee.com/'}
- if result['value'].get('stream_type') in self._SUPPORTED_STREAM_TYPES:
+
+ formats = []
+ stream_type = traverse_obj(result, ('value', 'stream_type', {str}))
+
+ if stream_type in self._SUPPORTED_STREAM_TYPES:
claim_id, is_live = result['claim_id'], False
streaming_url = self._call_api_proxy(
'get', claim_id, {'uri': uri}, 'streaming url')['streaming_url']
+
+ # GET request to v3 API returns original video/audio file if available
+ direct_url = re.sub(r'/api/v\d+/', '/api/v3/', streaming_url)
+ urlh = self._request_webpage(
+ direct_url, display_id, 'Checking for original quality', headers=headers, fatal=False)
+ if urlh and urlhandle_detect_ext(urlh) != 'm3u8':
+ formats.append({
+ 'url': direct_url,
+ 'format_id': 'original',
+ 'quality': 1,
+ **traverse_obj(result, ('value', {
+ 'ext': ('source', (('name', {determine_ext}), ('media_type', {mimetype2ext}))),
+ 'filesize': ('source', 'size', {int_or_none}),
+ 'width': ('video', 'width', {int_or_none}),
+ 'height': ('video', 'height', {int_or_none}),
+ }), get_all=False),
+ 'vcodec': 'none' if stream_type == 'audio' else None,
+ })
+
+ # HEAD request returns redirect response to m3u8 URL if available
final_url = self._request_webpage(
HEADRequest(streaming_url), display_id, headers=headers,
- note='Downloading streaming redirect url info').geturl()
+ note='Downloading streaming redirect url info').url
+
elif result.get('value_type') == 'stream':
claim_id, is_live = result['signing_channel']['claim_id'], True
live_data = self._download_json(
'https://api.odysee.live/livestream/is_live', claim_id,
query={'channel_claim_id': claim_id},
note='Downloading livestream JSON metadata')['data']
- streaming_url = final_url = live_data.get('VideoURL')
+ final_url = live_data.get('VideoURL')
# Upcoming videos may still give VideoURL
if not live_data.get('Live'):
- streaming_url = final_url = None
+ final_url = None
self.raise_no_formats('This stream is not live', True, claim_id)
+
else:
raise UnsupportedError(url)
- info = self._parse_stream(result, url)
if determine_ext(final_url) == 'm3u8':
- info['formats'] = self._extract_m3u8_formats(
- final_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls', live=is_live, headers=headers)
- else:
- info['url'] = streaming_url
+ formats.extend(self._extract_m3u8_formats(
+ final_url, display_id, 'mp4', m3u8_id='hls', live=is_live, headers=headers))
+
return {
- **info,
+ **self._parse_stream(result, url),
'id': claim_id,
- 'title': result['value']['title'],
+ 'formats': formats,
'is_live': is_live,
'http_headers': headers,
}
@@ -299,14 +335,12 @@ class LBRYChannelIE(LBRYBaseIE):
if not (stream_claim_name and stream_claim_id):
continue
- info = self._parse_stream(item, url)
- info.update({
+ yield {
+ **self._parse_stream(item, url),
'_type': 'url',
'id': stream_claim_id,
- 'title': try_get(item, lambda x: x['value']['title']),
'url': self._permanent_url(url, stream_claim_name, stream_claim_id),
- })
- yield info
+ }
def _real_extract(self, url):
display_id = self._match_id(url).replace(':', '#')
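
A sketch of the original-quality probe the LBRY hunk adds: rewrite the streaming URL to the v3 API and keep it as a direct format only when the response is not an HLS playlist. Here this is simplified to a Content-Type check; the extractor itself uses urlhandle_detect_ext on the response handle:

    import re
    import urllib.request

    def probe_original(streaming_url):
        direct_url = re.sub(r'/api/v\d+/', '/api/v3/', streaming_url)
        req = urllib.request.Request(direct_url, method='HEAD')
        with urllib.request.urlopen(req) as resp:
            content_type = resp.headers.get('Content-Type', '')
        # m3u8 playlists are served as application/x-mpegurl; anything else is
        # assumed to be the original video/audio file
        return None if 'mpegurl' in content_type.lower() else direct_url
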
diff --git a/hypervideo_dl/extractor/lecturio.py b/hypervideo_dl/extractor/lecturio.py
index 973764c..bb059d3 100644
--- a/hypervideo_dl/extractor/lecturio.py
+++ b/hypervideo_dl/extractor/lecturio.py
@@ -25,7 +25,7 @@ class LecturioBaseIE(InfoExtractor):
self._LOGIN_URL, None, 'Downloading login popup')
def is_logged(url_handle):
- return self._LOGIN_URL not in url_handle.geturl()
+ return self._LOGIN_URL not in url_handle.url
# Already logged in
if is_logged(urlh):
diff --git a/hypervideo_dl/extractor/lefigaro.py b/hypervideo_dl/extractor/lefigaro.py
new file mode 100644
index 0000000..9465095
--- /dev/null
+++ b/hypervideo_dl/extractor/lefigaro.py
@@ -0,0 +1,135 @@
+import json
+import math
+
+from .common import InfoExtractor
+from ..utils import (
+ InAdvancePagedList,
+ traverse_obj,
+)
+
+
+class LeFigaroVideoEmbedIE(InfoExtractor):
+ _VALID_URL = r'https?://video\.lefigaro\.fr/embed/[^?#]+/(?P<id>[\w-]+)'
+
+ _TESTS = [{
+ 'url': 'https://video.lefigaro.fr/embed/figaro/video/les-francais-ne-veulent-ils-plus-travailler-suivez-en-direct-le-club-le-figaro-idees/',
+ 'md5': 'e94de44cd80818084352fcf8de1ce82c',
+ 'info_dict': {
+ 'id': 'g9j7Eovo',
+ 'title': 'Les Français ne veulent-ils plus travailler ? Retrouvez Le Club Le Figaro Idées',
+ 'description': 'md5:862b8813148ba4bf10763a65a69dfe41',
+ 'upload_date': '20230216',
+ 'timestamp': 1676581615,
+ 'duration': 3076,
+ 'thumbnail': r're:^https?://[^?#]+\.(?:jpeg|jpg)',
+ 'ext': 'mp4',
+ },
+ }, {
+ 'url': 'https://video.lefigaro.fr/embed/figaro/video/intelligence-artificielle-faut-il-sen-mefier/',
+ 'md5': '0b3f10332b812034b3a3eda1ef877c5f',
+ 'info_dict': {
+ 'id': 'LeAgybyc',
+ 'title': 'Intelligence artificielle : faut-il s’en méfier ?',
+ 'description': 'md5:249d136e3e5934a67c8cb704f8abf4d2',
+ 'upload_date': '20230124',
+ 'timestamp': 1674584477,
+ 'duration': 860,
+ 'thumbnail': r're:^https?://[^?#]+\.(?:jpeg|jpg)',
+ 'ext': 'mp4',
+ },
+ }]
+
+ _WEBPAGE_TESTS = [{
+ 'url': 'https://video.lefigaro.fr/figaro/video/suivez-en-direct-le-club-le-figaro-international-avec-philippe-gelie-9/',
+ 'md5': '3972ddf2d5f8b98699f191687258e2f9',
+ 'info_dict': {
+ 'id': 'QChnbPYA',
+ 'title': 'Où en est le couple franco-allemand ? Retrouvez Le Club Le Figaro International',
+ 'description': 'md5:6f47235b7e7c93b366fd8ebfa10572ac',
+ 'upload_date': '20230123',
+ 'timestamp': 1674503575,
+ 'duration': 3153,
+ 'thumbnail': r're:^https?://[^?#]+\.(?:jpeg|jpg)',
+ 'age_limit': 0,
+ 'ext': 'mp4',
+ },
+ }, {
+ 'url': 'https://video.lefigaro.fr/figaro/video/la-philosophe-nathalie-sarthou-lajus-est-linvitee-du-figaro-live/',
+ 'md5': '3ac0a0769546ee6be41ab52caea5d9a9',
+ 'info_dict': {
+ 'id': 'QJzqoNbf',
+ 'title': 'La philosophe Nathalie Sarthou-Lajus est l’invitée du Figaro Live',
+ 'description': 'md5:c586793bb72e726c83aa257f99a8c8c4',
+ 'upload_date': '20230217',
+ 'timestamp': 1676661986,
+ 'duration': 1558,
+ 'thumbnail': r're:^https?://[^?#]+\.(?:jpeg|jpg)',
+ 'age_limit': 0,
+ 'ext': 'mp4',
+ },
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+
+ player_data = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['pageData']['playerData']
+
+ return self.url_result(
+ f'jwplatform:{player_data["videoId"]}', title=player_data.get('title'),
+ description=player_data.get('description'), thumbnail=player_data.get('poster'))
+
+
+class LeFigaroVideoSectionIE(InfoExtractor):
+ _VALID_URL = r'https?://video\.lefigaro\.fr/figaro/(?P<id>[\w-]+)/?(?:[#?]|$)'
+
+ _TESTS = [{
+ 'url': 'https://video.lefigaro.fr/figaro/le-club-le-figaro-idees/',
+ 'info_dict': {
+ 'id': 'le-club-le-figaro-idees',
+ 'title': 'Le Club Le Figaro Idées',
+ },
+ 'playlist_mincount': 14,
+ }, {
+ 'url': 'https://video.lefigaro.fr/figaro/factu/',
+ 'info_dict': {
+ 'id': 'factu',
+ 'title': 'Factu',
+ },
+ 'playlist_mincount': 519,
+ }]
+
+ _PAGE_SIZE = 20
+
+ def _get_api_response(self, display_id, page_num, note=None):
+ return self._download_json(
+ 'https://api-graphql.lefigaro.fr/graphql', display_id, note=note,
+ query={
+ 'id': 'flive-website_UpdateListPage_1fb260f996bca2d78960805ac382544186b3225f5bedb43ad08b9b8abef79af6',
+ 'variables': json.dumps({
+ 'slug': display_id,
+ 'videosLimit': self._PAGE_SIZE,
+ 'sort': 'DESC',
+ 'order': 'PUBLISHED_AT',
+ 'page': page_num,
+ }).encode(),
+ })
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ initial_response = self._get_api_response(display_id, page_num=1)['data']['playlist']
+
+ def page_func(page_num):
+ api_response = self._get_api_response(display_id, page_num + 1, note=f'Downloading page {page_num + 1}')
+
+ return [self.url_result(
+ video['embedUrl'], LeFigaroVideoEmbedIE, **traverse_obj(video, {
+ 'title': 'name',
+ 'description': 'description',
+ 'thumbnail': 'thumbnailUrl',
+ })) for video in api_response['data']['playlist']['jsonLd'][0]['itemListElement']]
+
+ entries = InAdvancePagedList(
+ page_func, math.ceil(initial_response['videoCount'] / self._PAGE_SIZE), self._PAGE_SIZE)
+
+ return self.playlist_result(entries, playlist_id=display_id, playlist_title=initial_response.get('title'))
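
LeFigaroVideoSectionIE knows the total video count from the first API response, so it can use InAdvancePagedList, which takes an explicit page count. A compact sketch of that pagination shape, with fetch_page standing in for _get_api_response:

import math
from hypervideo_dl.utils import InAdvancePagedList

PAGE_SIZE = 20

def paged_entries(fetch_page, video_count):
    def page_func(idx):  # InAdvancePagedList passes 0-based page indices
        return fetch_page(idx + 1)  # the GraphQL API counts pages from 1

    return InAdvancePagedList(
        page_func, math.ceil(video_count / PAGE_SIZE), PAGE_SIZE)
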
diff --git a/hypervideo_dl/extractor/lego.py b/hypervideo_dl/extractor/lego.py
index 811b447..46fc7a9 100644
--- a/hypervideo_dl/extractor/lego.py
+++ b/hypervideo_dl/extractor/lego.py
@@ -1,7 +1,7 @@
import uuid
from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
int_or_none,
@@ -75,7 +75,7 @@ class LEGOIE(InfoExtractor):
'videoId': '%s_%s' % (uuid.UUID(video_id), locale),
}, headers=self.geo_verification_headers())
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 451:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 451:
self.raise_geo_restricted(countries=countries)
raise
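
The compat_HTTPError replacement seen here repeats across most files below: the new HTTPError from hypervideo_dl.networking.exceptions exposes the status code as .status (previously .code) and the response body through .response.read() (previously .read() on the error itself). A hedged sketch of the post-migration handling, with handle_http_451 and ie as illustrative names:

from hypervideo_dl.networking.exceptions import HTTPError

def handle_http_451(ie, e, countries):
    # e is an ExtractorError whose .cause may be a transport-level error
    if isinstance(e.cause, HTTPError) and e.cause.status == 451:
        ie.raise_geo_restricted(countries=countries)
    raise e
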
diff --git a/hypervideo_dl/extractor/limelight.py b/hypervideo_dl/extractor/limelight.py
index e11ec43..4e50f10 100644
--- a/hypervideo_dl/extractor/limelight.py
+++ b/hypervideo_dl/extractor/limelight.py
@@ -1,7 +1,7 @@
import re
from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..networking.exceptions import HTTPError
from ..utils import (
determine_ext,
float_or_none,
@@ -69,8 +69,8 @@ class LimelightBaseIE(InfoExtractor):
item_id, 'Downloading PlaylistService %s JSON' % method,
fatal=fatal, headers=headers)
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
- error = self._parse_json(e.cause.read().decode(), item_id)['detail']['contentAccessPermission']
+ if isinstance(e.cause, HTTPError) and e.cause.status == 403:
+ error = self._parse_json(e.cause.response.read().decode(), item_id)['detail']['contentAccessPermission']
if error == 'CountryDisabled':
self.raise_geo_restricted()
raise ExtractorError(error, expected=True)
diff --git a/hypervideo_dl/extractor/linuxacademy.py b/hypervideo_dl/extractor/linuxacademy.py
index a570248..0b16442 100644
--- a/hypervideo_dl/extractor/linuxacademy.py
+++ b/hypervideo_dl/extractor/linuxacademy.py
@@ -2,11 +2,8 @@ import json
import random
from .common import InfoExtractor
-from ..compat import (
- compat_b64decode,
- compat_HTTPError,
- compat_str,
-)
+from ..compat import compat_b64decode, compat_str
+from ..networking.exceptions import HTTPError
from ..utils import (
clean_html,
ExtractorError,
@@ -75,9 +72,8 @@ class LinuxAcademyIE(InfoExtractor):
def _perform_login(self, username, password):
def random_string():
- return ''.join([
- random.choice('0123456789ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz-._~')
- for _ in range(32)])
+ return ''.join(random.choices(
+ '0123456789ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz-._~', k=32))
webpage, urlh = self._download_webpage_handle(
self._AUTHORIZE_URL, None, 'Downloading authorize page', query={
@@ -108,7 +104,7 @@ class LinuxAcademyIE(InfoExtractor):
'sso': 'true',
})
- login_state_url = urlh.geturl()
+ login_state_url = urlh.url
try:
login_page = self._download_webpage(
@@ -120,8 +116,8 @@ class LinuxAcademyIE(InfoExtractor):
'Referer': login_state_url,
})
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
- error = self._parse_json(e.cause.read(), None)
+ if isinstance(e.cause, HTTPError) and e.cause.status == 401:
+ error = self._parse_json(e.cause.response.read(), None)
message = error.get('description') or error['code']
raise ExtractorError(
'%s said: %s' % (self.IE_NAME, message), expected=True)
@@ -138,7 +134,7 @@ class LinuxAcademyIE(InfoExtractor):
})
access_token = self._search_regex(
- r'access_token=([^=&]+)', urlh.geturl(),
+ r'access_token=([^=&]+)', urlh.url,
'access token', default=None)
if not access_token:
access_token = self._parse_json(
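
The nonce-generator rewrite above is behavior-preserving: random.choices (stdlib, Python 3.6+) draws k samples with replacement in one call, replacing the per-character list comprehension. The alphabet is copied verbatim from the patch:

import random

ALPHABET = '0123456789ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz-._~'

nonce_old = ''.join([random.choice(ALPHABET) for _ in range(32)])
nonce_new = ''.join(random.choices(ALPHABET, k=32))
assert len(nonce_old) == len(nonce_new) == 32
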
diff --git a/hypervideo_dl/extractor/litv.py b/hypervideo_dl/extractor/litv.py
index 31826ac..19b298e 100644
--- a/hypervideo_dl/extractor/litv.py
+++ b/hypervideo_dl/extractor/litv.py
@@ -4,8 +4,8 @@ from .common import InfoExtractor
from ..utils import (
ExtractorError,
int_or_none,
- traverse_obj,
smuggle_url,
+ traverse_obj,
unsmuggle_url,
)
@@ -113,7 +113,7 @@ class LiTVIE(InfoExtractor):
entry_protocol='m3u8_native', m3u8_id='hls')
for a_format in formats:
            # LiTV HLS segments don't like compression
- a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = True
+ a_format.setdefault('http_headers', {})['Accept-Encoding'] = 'identity'
title = program_info['title'] + program_info.get('secondaryMark', '')
description = program_info.get('description')
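
The header swap above replaces an internal pseudo-header with plain HTTP: Accept-Encoding: identity asks the server for an uncompressed body, which is what Youtubedl-no-compression used to signal to the downloader. Applied to a hypothetical format dict:

fmt = {'url': 'https://example.invalid/segment-0001.ts'}
# same shape as the patched line: create http_headers on demand, then ask
# the server not to compress HLS segments
fmt.setdefault('http_headers', {})['Accept-Encoding'] = 'identity'
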
diff --git a/hypervideo_dl/extractor/livestream.py b/hypervideo_dl/extractor/livestream.py
index d883eaf..a05a0fa 100644
--- a/hypervideo_dl/extractor/livestream.py
+++ b/hypervideo_dl/extractor/livestream.py
@@ -1,33 +1,36 @@
-import re
import itertools
+import re
from .common import InfoExtractor
-from ..compat import (
- compat_str,
- compat_urlparse,
-)
+from ..compat import compat_str, compat_urlparse
from ..utils import (
+ determine_ext,
find_xpath_attr,
- xpath_attr,
- xpath_with_ns,
- xpath_text,
- orderedSet,
- update_url_query,
- int_or_none,
float_or_none,
+ int_or_none,
+ orderedSet,
parse_iso8601,
- determine_ext,
+ traverse_obj,
+ update_url_query,
+ xpath_attr,
+ xpath_text,
+ xpath_with_ns,
)
class LivestreamIE(InfoExtractor):
IE_NAME = 'livestream'
- _VALID_URL = r'https?://(?:new\.)?livestream\.com/(?:accounts/(?P<account_id>\d+)|(?P<account_name>[^/]+))/(?:events/(?P<event_id>\d+)|(?P<event_name>[^/]+))(?:/videos/(?P<id>\d+))?'
+ _VALID_URL = r'''(?x)
+ https?://(?:new\.)?livestream\.com/
+ (?:accounts/(?P<account_id>\d+)|(?P<account_name>[^/]+))
+ (?:/events/(?P<event_id>\d+)|/(?P<event_name>[^/]+))?
+ (?:/videos/(?P<id>\d+))?
+ '''
_EMBED_REGEX = [r'<iframe[^>]+src="(?P<url>https?://(?:new\.)?livestream\.com/[^"]+/player[^"]+)"']
_TESTS = [{
'url': 'http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370',
- 'md5': '53274c76ba7754fb0e8d072716f2292b',
+ 'md5': '7876c5f5dc3e711b6b73acce4aac1527',
'info_dict': {
'id': '4719370',
'ext': 'mp4',
@@ -37,22 +40,37 @@ class LivestreamIE(InfoExtractor):
'duration': 5968.0,
'like_count': int,
'view_count': int,
+ 'comment_count': int,
'thumbnail': r're:^http://.*\.jpg$'
}
}, {
- 'url': 'http://new.livestream.com/tedx/cityenglish',
+ 'url': 'https://livestream.com/coheedandcambria/websterhall',
'info_dict': {
- 'title': 'TEDCity2.0 (English)',
- 'id': '2245590',
+ 'id': '1585861',
+ 'title': 'Live From Webster Hall'
},
- 'playlist_mincount': 4,
+ 'playlist_mincount': 1,
}, {
- 'url': 'http://new.livestream.com/chess24/tatasteelchess',
+ 'url': 'https://livestream.com/dayananda/events/7954027',
'info_dict': {
- 'title': 'Tata Steel Chess',
- 'id': '3705884',
+ 'title': 'Live from Mevo',
+ 'id': '7954027',
},
- 'playlist_mincount': 60,
+ 'playlist_mincount': 4,
+ }, {
+ 'url': 'https://livestream.com/accounts/82',
+ 'info_dict': {
+ 'id': '253978',
+ 'view_count': int,
+ 'title': 'trsr',
+ 'comment_count': int,
+ 'like_count': int,
+ 'upload_date': '20120306',
+ 'timestamp': 1331042383,
+ 'thumbnail': 'http://img.new.livestream.com/videos/0000000000000372/cacbeed6-fb68-4b5e-ad9c-e148124e68a9_640x427.jpg',
+ 'duration': 15.332,
+ 'ext': 'mp4'
+ }
}, {
'url': 'https://new.livestream.com/accounts/362/events/3557232/videos/67864563/player?autoPlay=false&height=360&mute=false&width=640',
'only_matching': True,
@@ -62,7 +80,8 @@ class LivestreamIE(InfoExtractor):
}]
_API_URL_TEMPLATE = 'http://livestream.com/api/accounts/%s/events/%s'
- def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
+ def _parse_smil_formats_and_subtitles(
+ self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
base_ele = find_xpath_attr(
smil, self._xpath_ns('.//meta', namespace), 'name', 'httpBase')
base = base_ele.get('content') if base_ele is not None else 'http://livestreamvod-f.akamaihd.net/'
@@ -86,7 +105,7 @@ class LivestreamIE(InfoExtractor):
'tbr': tbr,
                'preference': -1000,  # Strictly inferior to all other formats?
})
- return formats
+ return formats, {}
def _extract_video_info(self, video_data):
video_id = compat_str(video_data['id'])
@@ -179,7 +198,7 @@ class LivestreamIE(InfoExtractor):
'is_live': is_live,
}
- def _extract_event(self, event_data):
+ def _generate_event_playlist(self, event_data):
event_id = compat_str(event_data['id'])
account_id = compat_str(event_data['owner_account_id'])
feed_root_url = self._API_URL_TEMPLATE % (account_id, event_id) + '/feed.json'
@@ -189,7 +208,6 @@ class LivestreamIE(InfoExtractor):
return self._extract_stream_info(stream_info)
last_video = None
- entries = []
for i in itertools.count(1):
if last_video is None:
info_url = feed_root_url
@@ -197,31 +215,38 @@ class LivestreamIE(InfoExtractor):
info_url = '{root}?&id={id}&newer=-1&type=video'.format(
root=feed_root_url, id=last_video)
videos_info = self._download_json(
- info_url, event_id, 'Downloading page {0}'.format(i))['data']
+ info_url, event_id, f'Downloading page {i}')['data']
videos_info = [v['data'] for v in videos_info if v['type'] == 'video']
if not videos_info:
break
for v in videos_info:
v_id = compat_str(v['id'])
- entries.append(self.url_result(
- 'http://livestream.com/accounts/%s/events/%s/videos/%s' % (account_id, event_id, v_id),
- 'Livestream', v_id, v.get('caption')))
+ yield self.url_result(
+ f'http://livestream.com/accounts/{account_id}/events/{event_id}/videos/{v_id}',
+ LivestreamIE, v_id, v.get('caption'))
last_video = videos_info[-1]['id']
- return self.playlist_result(entries, event_id, event_data['full_name'])
def _real_extract(self, url):
mobj = self._match_valid_url(url)
video_id = mobj.group('id')
event = mobj.group('event_id') or mobj.group('event_name')
account = mobj.group('account_id') or mobj.group('account_name')
- api_url = self._API_URL_TEMPLATE % (account, event)
+ api_url = f'http://livestream.com/api/accounts/{account}'
+
if video_id:
video_data = self._download_json(
- api_url + '/videos/%s' % video_id, video_id)
+ f'{api_url}/events/{event}/videos/{video_id}', video_id)
return self._extract_video_info(video_data)
- else:
- event_data = self._download_json(api_url, video_id)
- return self._extract_event(event_data)
+ elif event:
+ event_data = self._download_json(f'{api_url}/events/{event}', None)
+ return self.playlist_result(
+ self._generate_event_playlist(event_data), str(event_data['id']), event_data['full_name'])
+
+ account_data = self._download_json(api_url, None)
+ items = traverse_obj(account_data, (('upcoming_events', 'past_events'), 'data', ...))
+ return self.playlist_result(
+ itertools.chain.from_iterable(map(self._generate_event_playlist, items)),
+ account_data.get('id'), account_data.get('full_name'))
# The original version of Livestream uses a different system
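
The refactor above turns _extract_event (which accumulated an entries list and wrapped it itself) into the lazy generator _generate_event_playlist, so the new account-level branch can flatten several events into one playlist. The flattening step in isolation, with generate_event_playlist and events as stand-ins:

import itertools

def account_entries(generate_event_playlist, events):
    # each call yields url_result dicts lazily; chain.from_iterable merges
    # the per-event generators into a single flat stream of entries
    return itertools.chain.from_iterable(map(generate_event_playlist, events))
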
diff --git a/hypervideo_dl/extractor/lumni.py b/hypervideo_dl/extractor/lumni.py
new file mode 100644
index 0000000..5810da0
--- /dev/null
+++ b/hypervideo_dl/extractor/lumni.py
@@ -0,0 +1,24 @@
+from .common import InfoExtractor
+from .francetv import FranceTVIE
+
+
+class LumniIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?lumni\.fr/video/(?P<id>[\w-]+)'
+ _TESTS = [{
+ 'url': 'https://www.lumni.fr/video/l-homme-et-son-environnement-dans-la-revolution-industrielle',
+ 'md5': '960e8240c4f2c7a20854503a71e52f5e',
+ 'info_dict': {
+ 'id': 'd2b9a4e5-a526-495b-866c-ab72737e3645',
+ 'ext': 'mp4',
+ 'title': "L'homme et son environnement dans la révolution industrielle - L'ère de l'homme",
+ 'thumbnail': 'https://assets.webservices.francetelevisions.fr/v1/assets/images/a7/17/9f/a7179f5f-63a5-4e11-8d4d-012ab942d905.jpg',
+ 'duration': 230,
+ }
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ video_id = self._html_search_regex(
+ r'<div[^>]+data-factoryid\s*=\s*["\']([^"\']+)', webpage, 'video id')
+ return self.url_result(f'francetv:{video_id}', FranceTVIE, video_id)
diff --git a/hypervideo_dl/extractor/magellantv.py b/hypervideo_dl/extractor/magellantv.py
new file mode 100644
index 0000000..0947a45
--- /dev/null
+++ b/hypervideo_dl/extractor/magellantv.py
@@ -0,0 +1,50 @@
+from .common import InfoExtractor
+from ..utils import parse_age_limit, parse_duration, traverse_obj
+
+
+class MagellanTVIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?magellantv\.com/(?:watch|video)/(?P<id>[\w-]+)'
+ _TESTS = [{
+ 'url': 'https://www.magellantv.com/watch/my-dads-on-death-row?type=v',
+ 'info_dict': {
+ 'id': 'my-dads-on-death-row',
+ 'ext': 'mp4',
+ 'title': 'My Dad\'s On Death Row',
+ 'description': 'md5:33ba23b9f0651fc4537ed19b1d5b0d7a',
+ 'duration': 3780.0,
+ 'age_limit': 14,
+ 'tags': ['Justice', 'Reality', 'United States', 'True Crime'],
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }, {
+ 'url': 'https://www.magellantv.com/video/james-bulger-the-new-revelations',
+ 'info_dict': {
+ 'id': 'james-bulger-the-new-revelations',
+ 'ext': 'mp4',
+ 'title': 'James Bulger: The New Revelations',
+ 'description': 'md5:7b97922038bad1d0fe8d0470d8a189f2',
+ 'duration': 2640.0,
+ 'age_limit': 0,
+ 'tags': ['Investigation', 'True Crime', 'Justice', 'Europe'],
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ data = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['reactContext']['video']['detail']
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(data['jwpVideoUrl'], video_id)
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ **traverse_obj(data, {
+ 'title': ('title', {str}),
+ 'description': ('metadata', 'description', {str}),
+ 'duration': ('duration', {parse_duration}),
+ 'age_limit': ('ratingCategory', {parse_age_limit}),
+ 'tags': ('tags', ..., {str}),
+ }),
+ }
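
MagellanTVIE leans on traverse_obj's dict-path form: one call builds the metadata dict, set filters like {str} keep only values of that type, and callables like parse_duration are applied as transforms. A self-contained sketch over hypothetical data:

from hypervideo_dl.utils import parse_duration, traverse_obj

data = {'title': 'Example', 'metadata': {'description': 42}, 'duration': '5:30'}
meta = traverse_obj(data, {
    'title': ('title', {str}),
    'description': ('metadata', 'description', {str}),  # 42 is not a str, so dropped
    'duration': ('duration', {parse_duration}),  # '5:30' -> 330.0 seconds
})
assert meta == {'title': 'Example', 'duration': 330.0}
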
diff --git a/hypervideo_dl/extractor/mailru.py b/hypervideo_dl/extractor/mailru.py
index 387d211..0f0550c 100644
--- a/hypervideo_dl/extractor/mailru.py
+++ b/hypervideo_dl/extractor/mailru.py
@@ -1,6 +1,7 @@
import itertools
import json
import re
+import urllib.parse
from .common import InfoExtractor
from ..compat import compat_urllib_parse_unquote
@@ -140,17 +141,15 @@ class MailRuIE(InfoExtractor):
'http://api.video.mail.ru/videos/%s.json?new=1' % video_id,
video_id, 'Downloading video JSON')
- headers = {}
-
video_key = self._get_cookies('https://my.mail.ru').get('video_key')
- if video_key:
- headers['Cookie'] = 'video_key=%s' % video_key.value
formats = []
for f in video_data['videos']:
video_url = f.get('url')
if not video_url:
continue
+ if video_key:
+ self._set_cookie(urllib.parse.urlparse(video_url).hostname, 'video_key', video_key.value)
format_id = f.get('key')
height = int_or_none(self._search_regex(
r'^(\d+)[pP]$', format_id, 'height', default=None)) if format_id else None
@@ -158,7 +157,6 @@ class MailRuIE(InfoExtractor):
'url': video_url,
'format_id': format_id,
'height': height,
- 'http_headers': headers,
})
meta_data = video_data['meta']
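
The mailru change swaps a hand-built Cookie header for the cookiejar: _set_cookie registers video_key for each format URL's hostname, so every later request to that host sends it automatically and the per-format http_headers entry can go away. In isolation (ie is an illustrative InfoExtractor instance):

import urllib.parse

def attach_video_key(ie, video_url, video_key_value):
    # scope the cookie to the host actually serving the format
    ie._set_cookie(
        urllib.parse.urlparse(video_url).hostname, 'video_key', video_key_value)
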
diff --git a/hypervideo_dl/extractor/medaltv.py b/hypervideo_dl/extractor/medaltv.py
index 82be823..9e57ee2 100644
--- a/hypervideo_dl/extractor/medaltv.py
+++ b/hypervideo_dl/extractor/medaltv.py
@@ -8,12 +8,12 @@ from ..utils import (
float_or_none,
int_or_none,
str_or_none,
- traverse_obj,
+ traverse_obj
)
class MedalTVIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?medal\.tv/(?P<path>games/[^/?#&]+/clips)/(?P<id>[^/?#&]+)'
+ _VALID_URL = r'https?://(?:www\.)?medal\.tv/games/[^/?#&]+/clips/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://medal.tv/games/valorant/clips/jTBFnLKdLy15K',
'md5': '6930f8972914b6b9fdc2bb3918098ba0',
@@ -80,25 +80,14 @@ class MedalTVIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- path = self._match_valid_url(url).group('path')
webpage = self._download_webpage(url, video_id)
- next_data = self._search_json(
- '<script[^>]*__NEXT_DATA__[^>]*>', webpage,
+ hydration_data = self._search_json(
+ r'<script[^>]*>[^<]*\bhydrationData\s*=', webpage,
'next data', video_id, end_pattern='</script>', fatal=False)
- build_id = next_data.get('buildId')
- if not build_id:
- raise ExtractorError(
- 'Could not find build ID.', video_id=video_id)
-
- locale = next_data.get('locale', 'en')
-
- api_response = self._download_json(
- f'https://medal.tv/_next/data/{build_id}/{locale}/{path}/{video_id}.json', video_id)
-
- clip = traverse_obj(api_response, ('pageProps', 'clip')) or {}
+ clip = traverse_obj(hydration_data, ('clips', ...), get_all=False)
if not clip:
raise ExtractorError(
'Could not find video information.', video_id=video_id)
@@ -152,7 +141,7 @@ class MedalTVIE(InfoExtractor):
# Necessary because the id of the author is not known in advance.
# Won't raise an issue if no profile can be found as this is optional.
- author = traverse_obj(api_response, ('pageProps', 'profile')) or {}
+ author = traverse_obj(hydration_data, ('profiles', ...), get_all=False) or {}
author_id = str_or_none(author.get('userId'))
author_url = format_field(author_id, None, 'https://medal.tv/users/%s')
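
medaltv no longer walks Next.js build manifests; it reads the inline hydrationData assignment straight out of the page with _search_json. A stdlib approximation of what that locates (_search_json balances braces properly, while the greedy regex below only works for this single-line demo):

import json
import re

webpage = '<script>var hydrationData = {"clips": {"a": {"id": 1}}};</script>'
mobj = re.search(r'\bhydrationData\s*=\s*(\{.+\})\s*;', webpage)
hydration_data = json.loads(mobj.group(1))
assert hydration_data['clips']['a']['id'] == 1
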
diff --git a/hypervideo_dl/extractor/mediaite.py b/hypervideo_dl/extractor/mediaite.py
index 0f9079b..ab25392 100644
--- a/hypervideo_dl/extractor/mediaite.py
+++ b/hypervideo_dl/extractor/mediaite.py
@@ -81,10 +81,24 @@ class MediaiteIE(InfoExtractor):
'upload_date': '20210930',
},
'params': {'skip_download': True}
+ }, {
+ 'url': 'https://www.mediaite.com/politics/i-cant-read-it-fast-enough-while-defending-trump-larry-kudlow-overwhelmed-by-volume-of-ex-presidents-legal-troubles/',
+ 'info_dict': {
+ 'id': 'E6EhDX5z',
+ 'ext': 'mp4',
+ 'title': 'Fox Business Network - 4:00 PM - 5:00 PM - 1:39:42 pm - 1:42:20 pm',
+ 'description': '',
+ 'thumbnail': 'https://cdn.jwplayer.com/v2/media/E6EhDX5z/poster.jpg?width=720',
+ 'duration': 157,
+ 'timestamp': 1691015535,
+ 'upload_date': '20230802',
+ },
+ 'params': {'skip_download': True}
}]
def _real_extract(self, url):
webpage = self._download_webpage(url, None)
- id = self._search_regex(r'data-video-id\s?=\s?\"([^\"]+)\"', webpage, 'id')
- data_json = self._download_json(f'https://cdn.jwplayer.com/v2/media/{id}', id)
+ video_id = self._search_regex(
+ [r'"https://cdn\.jwplayer\.com/players/(\w+)', r'data-video-id\s*=\s*\"([^\"]+)\"'], webpage, 'id')
+ data_json = self._download_json(f'https://cdn.jwplayer.com/v2/media/{video_id}', video_id)
return self._parse_jwplayer_data(data_json)
diff --git a/hypervideo_dl/extractor/mediaset.py b/hypervideo_dl/extractor/mediaset.py
index 61bdb2a..e3b728d 100644
--- a/hypervideo_dl/extractor/mediaset.py
+++ b/hypervideo_dl/extractor/mediaset.py
@@ -7,7 +7,6 @@ from ..utils import (
GeoRestrictedError,
int_or_none,
OnDemandPagedList,
- parse_qs,
try_get,
urljoin,
update_url_query,
@@ -16,20 +15,25 @@ from ..utils import (
class MediasetIE(ThePlatformBaseIE):
_TP_TLD = 'eu'
- _VALID_URL = r'''(?x)
+ _GUID_RE = r'F[0-9A-Z]{15}'
+ _VALID_URL = rf'''(?x)
(?:
mediaset:|
https?://
(?:\w+\.)+mediaset\.it/
(?:
(?:video|on-demand|movie)/(?:[^/]+/)+[^/]+_|
- player/(?:v\d+/)?index\.html\?.*?\bprogramGuid=
+ player/(?:v\d+/)?index\.html\?\S*?\bprogramGuid=
)
- )(?P<id>[0-9A-Z]{16,})
+ )(?P<id>{_GUID_RE})
'''
+
+ _EMBED_REGEX = [
+ rf'<iframe[^>]+src=[\'"](?P<url>(?:https?:)?//(?:\w+\.)+mediaset\.it/player/(?:v\d+/)?index\.html\?\S*?programGuid={_GUID_RE})[\'"&]'
+ ]
_TESTS = [{
# full episode
- 'url': 'https://www.mediasetplay.mediaset.it/video/mrwronglezionidamore/episodio-1_F310575103000102',
+ 'url': 'https://mediasetinfinity.mediaset.it/video/mrwronglezionidamore/episodio-1_F310575103000102',
'md5': 'a7e75c6384871f322adb781d3bd72c26',
'info_dict': {
'id': 'F310575103000102',
@@ -50,7 +54,7 @@ class MediasetIE(ThePlatformBaseIE):
'chapters': [{'start_time': 0.0, 'end_time': 439.88}, {'start_time': 439.88, 'end_time': 1685.84}, {'start_time': 1685.84, 'end_time': 2682.0}],
},
}, {
- 'url': 'https://www.mediasetplay.mediaset.it/video/matrix/puntata-del-25-maggio_F309013801000501',
+ 'url': 'https://mediasetinfinity.mediaset.it/video/matrix/puntata-del-25-maggio_F309013801000501',
'md5': '1276f966ac423d16ba255ce867de073e',
'info_dict': {
'id': 'F309013801000501',
@@ -71,51 +75,8 @@ class MediasetIE(ThePlatformBaseIE):
'chapters': [{'start_time': 0.0, 'end_time': 3409.08}, {'start_time': 3409.08, 'end_time': 6565.008}],
},
}, {
- 'url': 'https://www.mediasetplay.mediaset.it/video/cameracafe5/episodio-69-pezzo-di-luna_F303843101017801',
- 'md5': 'd1650ac9ff944f185556126a736df148',
- 'info_dict': {
- 'id': 'F303843101017801',
- 'ext': 'mp4',
- 'title': 'Episodio 69 - Pezzo di luna',
- 'description': 'md5:7c32c8ec4118b72588b9412f11353f73',
- 'thumbnail': r're:^https?://.*\.jpg$',
- 'duration': 263.008,
- 'upload_date': '20200902',
- 'series': 'Camera Café 5',
- 'timestamp': 1599064700,
- 'uploader': 'Italia 1',
- 'uploader_id': 'I1',
- 'season': 'Season 5',
- 'episode': 'Episode 178',
- 'season_number': 5,
- 'episode_number': 178,
- 'chapters': [{'start_time': 0.0, 'end_time': 261.88}, {'start_time': 261.88, 'end_time': 263.008}],
- },
- }, {
- 'url': 'https://www.mediasetplay.mediaset.it/video/cameracafe5/episodio-51-tu-chi-sei_F303843107000601',
- 'md5': '567e9ad375b7a27a0e370650f572a1e3',
- 'info_dict': {
- 'id': 'F303843107000601',
- 'ext': 'mp4',
- 'title': 'Episodio 51 - Tu chi sei?',
- 'description': 'md5:42ef006e56824cc31787a547590923f4',
- 'thumbnail': r're:^https?://.*\.jpg$',
- 'duration': 367.021,
- 'upload_date': '20200902',
- 'series': 'Camera Café 5',
- 'timestamp': 1599069817,
- 'uploader': 'Italia 1',
- 'uploader_id': 'I1',
- 'season': 'Season 5',
- 'episode': 'Episode 6',
- 'season_number': 5,
- 'episode_number': 6,
- 'chapters': [{'start_time': 0.0, 'end_time': 358.68}, {'start_time': 358.68, 'end_time': 367.021}],
- },
- }, {
- # movie
- 'url': 'https://www.mediasetplay.mediaset.it/movie/selvaggi/selvaggi_F006474501000101',
- 'md5': '720440187a2ae26af8148eb9e6b901ed',
+ # DRM
+ 'url': 'https://mediasetinfinity.mediaset.it/movie/selvaggi/selvaggi_F006474501000101',
'info_dict': {
'id': 'F006474501000101',
'ext': 'mp4',
@@ -129,75 +90,76 @@ class MediasetIE(ThePlatformBaseIE):
'uploader_id': 'B6',
'chapters': [{'start_time': 0.0, 'end_time': 1938.56}, {'start_time': 1938.56, 'end_time': 5233.01}],
},
+ 'params': {
+ 'ignore_no_formats_error': True,
+ },
+ 'expected_warnings': [
+ 'None of the available releases match the specified AssetType, ProtectionScheme, and/or Format preferences',
+ 'Content behind paywall and DRM',
+ ],
+ 'skip': True,
}, {
- # clip
- 'url': 'https://www.mediasetplay.mediaset.it/video/gogglebox/un-grande-classico-della-commedia-sexy_FAFU000000661680',
+ # old domain
+ 'url': 'https://www.mediasetplay.mediaset.it/video/mrwronglezionidamore/episodio-1_F310575103000102',
'only_matching': True,
}, {
- # iframe simple
+ # iframe
'url': 'https://static3.mediasetplay.mediaset.it/player/index.html?appKey=5ad3966b1de1c4000d5cec48&programGuid=FAFU000000665924&id=665924',
'only_matching': True,
}, {
- # iframe twitter (from http://www.wittytv.it/se-prima-mi-fidavo-zero/)
- 'url': 'https://static3.mediasetplay.mediaset.it/player/index.html?appKey=5ad3966b1de1c4000d5cec48&programGuid=FAFU000000665104&id=665104',
- 'only_matching': True,
- }, {
- # embedUrl (from https://www.wittytv.it/amici/est-ce-que-tu-maimes-gabriele-5-dicembre-copia/)
- 'url': 'https://static3.mediasetplay.mediaset.it/player/v2/index.html?partnerId=wittytv&configId=&programGuid=FD00000000153323&autoplay=true&purl=http://www.wittytv.it/amici/est-ce-que-tu-maimes-gabriele-5-dicembre-copia/',
- 'only_matching': True,
- }, {
'url': 'mediaset:FAFU000000665924',
'only_matching': True,
+ }]
+ _WEBPAGE_TESTS = [{
+ # Mediaset embed
+ 'url': 'http://www.tgcom24.mediaset.it/politica/serracchiani-voglio-vivere-in-una-societa-aperta-reazioni-sproporzionate-_3071354-201702a.shtml',
+ 'info_dict': {
+ 'id': 'FD00000000004929',
+ 'ext': 'mp4',
+ 'title': 'Serracchiani: "Voglio vivere in una società aperta, con tutela del patto di fiducia"',
+ 'duration': 67.013,
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'uploader': 'Mediaset Play',
+ 'uploader_id': 'QY',
+ 'upload_date': '20201005',
+ 'timestamp': 1601866168,
+ 'chapters': [],
+ },
+ 'params': {
+ 'skip_download': True,
+ }
}, {
- 'url': 'https://www.mediasetplay.mediaset.it/video/mediasethaacuoreilfuturo/palmieri-alicudi-lisola-dei-tre-bambini-felici--un-decreto-per-alicudi-e-tutte-le-microscuole_FD00000000102295',
- 'only_matching': True,
- }, {
- 'url': 'https://www.mediasetplay.mediaset.it/video/cherryseason/anticipazioni-degli-episodi-del-23-ottobre_F306837101005C02',
- 'only_matching': True,
- }, {
- 'url': 'https://www.mediasetplay.mediaset.it/video/tg5/ambiente-onda-umana-per-salvare-il-pianeta_F309453601079D01',
- 'only_matching': True,
- }, {
- 'url': 'https://www.mediasetplay.mediaset.it/video/grandefratellovip/benedetta-una-doccia-gelata_F309344401044C135',
- 'only_matching': True,
- }, {
- 'url': 'https://www.mediasetplay.mediaset.it/movie/herculeslaleggendahainizio/hercules-la-leggenda-ha-inizio_F305927501000102',
- 'only_matching': True,
- }, {
- 'url': 'https://mediasetinfinity.mediaset.it/video/braveandbeautiful/episodio-113_F310948005000402',
- 'only_matching': True,
- }, {
- 'url': 'https://static3.mediasetplay.mediaset.it/player/v2/index.html?partnerId=wittytv&configId=&programGuid=FD00000000153323',
- 'only_matching': True,
+ # WittyTV embed
+ 'url': 'https://www.wittytv.it/mauriziocostanzoshow/ultima-puntata-venerdi-25-novembre/',
+ 'info_dict': {
+ 'id': 'F312172801000801',
+ 'ext': 'mp4',
+ 'title': 'Ultima puntata - Venerdì 25 novembre',
+ 'description': 'Una serata all\'insegna della musica e del buonumore ma non priva di spunti di riflessione',
+ 'duration': 6203.01,
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'uploader': 'Canale 5',
+ 'uploader_id': 'C5',
+ 'upload_date': '20221126',
+ 'timestamp': 1669428689,
+ 'chapters': list,
+ 'series': 'Maurizio Costanzo Show',
+ 'season': 'Season 12',
+ 'season_number': 12,
+ 'episode': 'Episode 8',
+ 'episode_number': 8,
+ },
+ 'params': {
+ 'skip_download': True,
+ }
}]
- def _extract_from_webpage(self, url, webpage):
- def _program_guid(qs):
- return qs.get('programGuid', [None])[0]
-
- for mobj in re.finditer(
- r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?video\.mediaset\.it/player/playerIFrame(?:Twitter)?\.shtml.*?)\1',
- webpage):
- embed_url = mobj.group('url')
- embed_qs = parse_qs(embed_url)
- program_guid = _program_guid(embed_qs)
- if program_guid:
- yield self.url_result(embed_url)
- continue
-
- video_id = embed_qs.get('id', [None])[0]
- if not video_id:
- continue
- urlh = self._request_webpage(embed_url, video_id, note='Following embed URL redirect')
- embed_url = urlh.geturl()
- program_guid = _program_guid(parse_qs(embed_url))
- if program_guid:
- yield self.url_result(embed_url)
-
- def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
+ def _parse_smil_formats_and_subtitles(
+ self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
for video in smil.findall(self._xpath_ns('.//video', namespace)):
video.attrib['src'] = re.sub(r'(https?://vod05)t(-mediaset-it\.akamaized\.net/.+?.mpd)\?.+', r'\1\2', video.attrib['src'])
- return super(MediasetIE, self)._parse_smil_formats(smil, smil_url, video_id, namespace, f4m_params, transform_rtmp_url)
+ return super(MediasetIE, self)._parse_smil_formats_and_subtitles(
+ smil, smil_url, video_id, namespace, f4m_params, transform_rtmp_url)
def _check_drm_formats(self, tp_formats, video_id):
has_nondrm, drm_manifest = False, ''
@@ -217,7 +179,7 @@ class MediasetIE(ThePlatformBaseIE):
def _real_extract(self, url):
guid = self._match_id(url)
- tp_path = 'PR1GhC/media/guid/2702976343/' + guid
+ tp_path = f'PR1GhC/media/guid/2702976343/{guid}'
info = self._extract_theplatform_metadata(tp_path, guid)
formats = []
@@ -225,15 +187,17 @@ class MediasetIE(ThePlatformBaseIE):
first_e = geo_e = None
asset_type = 'geoNo:HD,browser,geoIT|geoNo:HD,geoIT|geoNo:SD,browser,geoIT|geoNo:SD,geoIT|geoNo|HD|SD'
# TODO: fixup ISM+none manifest URLs
- for f in ('MPEG4', 'M3U'):
+ for f in ('MPEG4', 'MPEG-DASH', 'M3U'):
try:
tp_formats, tp_subtitles = self._extract_theplatform_smil(
- update_url_query('http://link.theplatform.%s/s/%s' % (self._TP_TLD, tp_path), {
+ update_url_query(f'http://link.theplatform.{self._TP_TLD}/s/{tp_path}', {
'mbr': 'true',
'formats': f,
'assetTypes': asset_type,
- }), guid, 'Downloading %s SMIL data' % (f.split('+')[0]))
+ }), guid, f'Downloading {f.split("+")[0]} SMIL data')
except ExtractorError as e:
+ if e.orig_msg == 'None of the available releases match the specified AssetType, ProtectionScheme, and/or Format preferences':
+ e.orig_msg = 'This video is DRM protected'
if not geo_e and isinstance(e, GeoRestrictedError):
geo_e = e
if not first_e:
@@ -248,7 +212,7 @@ class MediasetIE(ThePlatformBaseIE):
raise geo_e or first_e
feed_data = self._download_json(
- 'https://feed.entertainment.tv.theplatform.eu/f/PR1GhC/mediaset-prod-all-programs-v2/guid/-/' + guid,
+ f'https://feed.entertainment.tv.theplatform.eu/f/PR1GhC/mediaset-prod-all-programs-v2/guid/-/{guid}',
guid, fatal=False)
if feed_data:
publish_info = feed_data.get('mediasetprogram$publishInfo') or {}
@@ -299,23 +263,23 @@ class MediasetShowIE(MediasetIE): # XXX: Do not subclass from concrete IE
'''
_TESTS = [{
# TV Show webpage (general webpage)
- 'url': 'https://www.mediasetplay.mediaset.it/programmi-tv/leiene/leiene_SE000000000061',
+ 'url': 'https://mediasetinfinity.mediaset.it/programmi-tv/leiene/leiene_SE000000000061',
'info_dict': {
'id': '000000000061',
- 'title': 'Le Iene',
+ 'title': 'Le Iene 2022/2023',
},
- 'playlist_mincount': 7,
+ 'playlist_mincount': 6,
}, {
# TV Show webpage (specific season)
- 'url': 'https://www.mediasetplay.mediaset.it/programmi-tv/leiene/leiene_SE000000000061,ST000000002763',
+ 'url': 'https://mediasetinfinity.mediaset.it/programmi-tv/leiene/leiene_SE000000000061,ST000000002763',
'info_dict': {
'id': '000000002763',
- 'title': 'Le Iene',
+ 'title': 'Le Iene 2021/2022',
},
'playlist_mincount': 7,
}, {
# TV Show specific playlist (with multiple pages)
- 'url': 'https://www.mediasetplay.mediaset.it/programmi-tv/leiene/iservizi_SE000000000061,ST000000002763,sb100013375',
+ 'url': 'https://mediasetinfinity.mediaset.it/programmi-tv/leiene/iservizi_SE000000000061,ST000000002763,sb100013375',
'info_dict': {
'id': '100013375',
'title': 'I servizi',
@@ -340,10 +304,9 @@ class MediasetShowIE(MediasetIE): # XXX: Do not subclass from concrete IE
playlist_id, st, sb = self._match_valid_url(url).group('id', 'st', 'sb')
if not sb:
page = self._download_webpage(url, st or playlist_id)
- entries = [self.url_result(urljoin('https://www.mediasetplay.mediaset.it', url))
+ entries = [self.url_result(urljoin('https://mediasetinfinity.mediaset.it', url))
for url in re.findall(r'href="([^<>=]+SE\d{12},ST\d{12},sb\d{9})">[^<]+<', page)]
- title = (self._html_search_regex(r'(?s)<h1[^>]*>(.+?)</h1>', page, 'title', default=None)
- or self._og_search_title(page))
+ title = self._html_extract_title(page).split('|')[0].strip()
return self.playlist_result(entries, st or playlist_id, title)
entries = OnDemandPagedList(
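
A detail worth noting in the mediaset hunk above: the opaque ThePlatform message about unmatched AssetType/ProtectionScheme preferences is rewritten in place into a user-facing DRM notice by mutating e.orig_msg before the error propagates. Sketched standalone, assuming ExtractorError keeps the original message in .orig_msg as the hunk relies on:

from hypervideo_dl.utils import ExtractorError

TP_NO_RELEASE = ('None of the available releases match the specified '
                 'AssetType, ProtectionScheme, and/or Format preferences')

def rewrite_drm_error(e):
    if isinstance(e, ExtractorError) and e.orig_msg == TP_NO_RELEASE:
        e.orig_msg = 'This video is DRM protected'
    return e
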
diff --git a/hypervideo_dl/extractor/mediasite.py b/hypervideo_dl/extractor/mediasite.py
index fe549c4..7ea78ab 100644
--- a/hypervideo_dl/extractor/mediasite.py
+++ b/hypervideo_dl/extractor/mediasite.py
@@ -171,7 +171,7 @@ class MediasiteIE(InfoExtractor):
query = mobj.group('query')
webpage, urlh = self._download_webpage_handle(url, resource_id) # XXX: add UrlReferrer?
- redirect_url = urlh.geturl()
+ redirect_url = urlh.url
# XXX: might have also extracted UrlReferrer and QueryString from the html
service_path = compat_urlparse.urljoin(redirect_url, self._html_search_regex(
diff --git a/hypervideo_dl/extractor/mediastream.py b/hypervideo_dl/extractor/mediastream.py
new file mode 100644
index 0000000..cef769f
--- /dev/null
+++ b/hypervideo_dl/extractor/mediastream.py
@@ -0,0 +1,208 @@
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ clean_html,
+ remove_end,
+ traverse_obj,
+ urljoin,
+)
+
+
+class MediaStreamBaseIE(InfoExtractor):
+ _EMBED_BASE_URL = 'https://mdstrm.com/embed'
+ _BASE_URL_RE = r'https?://mdstrm\.com/(?:embed|live-stream)'
+
+ def _extract_mediastream_urls(self, webpage):
+ yield from traverse_obj(list(self._yield_json_ld(webpage, None)), (
+ lambda _, v: v['@type'] == 'VideoObject', ('embedUrl', 'contentUrl'),
+ {lambda x: x if re.match(rf'{self._BASE_URL_RE}/\w+', x) else None}))
+
+ for mobj in re.finditer(r'<script[^>]+>[^>]*playerMdStream\.mdstreamVideo\(\s*[\'"](?P<video_id>\w+)', webpage):
+ yield f'{self._EMBED_BASE_URL}/{mobj.group("video_id")}'
+
+ yield from re.findall(
+ rf'<iframe[^>]+\bsrc="({self._BASE_URL_RE}/\w+)', webpage)
+
+ for mobj in re.finditer(
+ r'''(?x)
+ <(?:div|ps-mediastream)[^>]+
+ (class="[^"]*MediaStreamVideoPlayer)[^"]*"[^>]+
+ data-video-id="(?P<video_id>\w+)"
+ (?:\s*data-video-type="(?P<video_type>[^"]+))?
+ (?:[^>]*>\s*<div[^>]+\1[^"]*"[^>]+data-mediastream=["\'][^>]+
+ https://mdstrm\.com/(?P<live>live-stream))?
+ ''', webpage):
+
+ video_type = 'live-stream' if mobj.group('video_type') == 'live' or mobj.group('live') else 'embed'
+ yield f'https://mdstrm.com/{video_type}/{mobj.group("video_id")}'
+
+
+class MediaStreamIE(MediaStreamBaseIE):
+ _VALID_URL = MediaStreamBaseIE._BASE_URL_RE + r'/(?P<id>\w+)'
+
+ _TESTS = [{
+ 'url': 'https://mdstrm.com/embed/6318e3f1d1d316083ae48831',
+ 'md5': '97b4f2634b8e8612cc574dfcd504df05',
+ 'info_dict': {
+ 'id': '6318e3f1d1d316083ae48831',
+ 'title': 'Video: Así fue el despido de Thomas Tuchel del Chelsea',
+ 'description': 'md5:358ce1e1396010d50a1ece1be3633c95',
+ 'thumbnail': r're:^https?://[^?#]+6318e3f1d1d316083ae48831',
+ 'ext': 'mp4',
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }]
+
+ _WEBPAGE_TESTS = [{
+ 'url': 'https://www.multimedios.com/video/costa-rica-tv-en-vivo/v2616',
+ 'info_dict': {
+ 'id': '5a7b1e63a8da282c34d65445',
+ 'title': 're:mmtv-costarica',
+ 'description': 'mmtv-costarica',
+ 'thumbnail': 're:^https?://[^?#]+5a7b1e63a8da282c34d65445',
+ 'ext': 'mp4',
+ 'live_status': 'is_live',
+ },
+ 'params': {'skip_download': 'Livestream'},
+ }, {
+ 'url': 'https://www.multimedios.com/television/clases-de-llaves-y-castigos-quien-sabe-mas',
+ 'md5': 'de31f0b1ecc321fb35bf22d58734ea40',
+ 'info_dict': {
+ 'id': '63731bab8ec9b308a2c9ed28',
+ 'title': 'Clases de llaves y castigos ¿Quién sabe más?',
+ 'description': 'md5:1b49aa1ee5a4b32fbd66104b2d629e9d',
+ 'thumbnail': 're:^https?://[^?#]+63731bab8ec9b308a2c9ed28',
+ 'ext': 'mp4',
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }, {
+ 'url': 'https://www.americatv.com.pe/videos/esto-es-guerra/facundo-gonzalez-sufrio-fuerte-golpe-durante-competencia-frente-hugo-garcia-eeg-noticia-139120',
+ 'info_dict': {
+ 'id': '63756df1c638b008a5659dec',
+ 'title': 'Facundo González sufrió fuerte golpe durante competencia frente a Hugo García en EEG',
+ 'description': 'md5:9490c034264afd756eef7b2c3adee69e',
+ 'thumbnail': 're:^https?://[^?#]+63756df1c638b008a5659dec',
+ 'ext': 'mp4',
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }, {
+ 'url': 'https://www.americatv.com.pe/videos/al-fondo-hay-sitio/nuevas-lomas-town-bernardo-mata-se-enfrento-sujeto-luchar-amor-macarena-noticia-139083',
+ 'info_dict': {
+ 'id': '637307669609130f74cd3a6e',
+ 'title': 'Las Nuevas Lomas Town: Bernardo De La Mata se enfrentó a sujeto para luchar por el amor de Macarena',
+ 'description': 'md5:60d71772f1e1496923539ae58aa17124',
+ 'thumbnail': 're:^https?://[^?#]+637307669609130f74cd3a6e',
+ 'ext': 'mp4',
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }]
+
+ def _extract_from_webpage(self, url, webpage):
+ for embed_url in self._extract_mediastream_urls(webpage):
+ yield self.url_result(embed_url, MediaStreamIE, None)
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ if 'Debido a tu ubicación no puedes ver el contenido' in webpage:
+ self.raise_geo_restricted()
+
+ player_config = self._search_json(r'window\.MDSTRM\.OPTIONS\s*=', webpage, 'metadata', video_id)
+
+ formats, subtitles = [], {}
+ for video_format in player_config['src']:
+ if video_format == 'hls':
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(player_config['src'][video_format], video_id)
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=subtitles)
+ elif video_format == 'mpd':
+ fmts, subs = self._extract_mpd_formats_and_subtitles(player_config['src'][video_format], video_id)
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=subtitles)
+ else:
+ formats.append({
+ 'url': player_config['src'][video_format],
+ })
+
+ return {
+ 'id': video_id,
+ 'title': self._og_search_title(webpage) or player_config.get('title'),
+ 'description': self._og_search_description(webpage),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'is_live': player_config.get('type') == 'live',
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ }
+
+
+class WinSportsVideoIE(MediaStreamBaseIE):
+ _VALID_URL = r'https?://www\.winsports\.co/videos/(?P<id>[\w-]+)'
+
+ _TESTS = [{
+ 'url': 'https://www.winsports.co/videos/siempre-castellanos-gran-atajada-del-portero-cardenal-para-evitar-la-caida-de-su-arco-60536',
+ 'info_dict': {
+ 'id': '62dc8357162c4b0821fcfb3c',
+ 'display_id': 'siempre-castellanos-gran-atajada-del-portero-cardenal-para-evitar-la-caida-de-su-arco-60536',
+ 'title': '¡Siempre Castellanos! Gran atajada del portero \'cardenal\' para evitar la caída de su arco',
+ 'description': 'md5:eb811b2b2882bdc59431732c06b905f2',
+ 'thumbnail': r're:^https?://[^?#]+62dc8357162c4b0821fcfb3c',
+ 'ext': 'mp4',
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }, {
+ 'url': 'https://www.winsports.co/videos/observa-aqui-los-goles-del-empate-entre-tolima-y-nacional-60548',
+ 'info_dict': {
+ 'id': '62dcb875ef12a5526790b552',
+ 'display_id': 'observa-aqui-los-goles-del-empate-entre-tolima-y-nacional-60548',
+ 'title': 'Observa aquí los goles del empate entre Tolima y Nacional',
+ 'description': 'md5:b19402ba6e46558b93fd24b873eea9c9',
+ 'thumbnail': r're:^https?://[^?#]+62dcb875ef12a5526790b552',
+ 'ext': 'mp4',
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }, {
+ 'url': 'https://www.winsports.co/videos/equidad-vuelve-defender-su-arco-de-remates-de-junior',
+ 'info_dict': {
+ 'id': '63fa7eca72f1741ad3a4d515',
+ 'display_id': 'equidad-vuelve-defender-su-arco-de-remates-de-junior',
+ 'title': '⚽ Equidad vuelve a defender su arco de remates de Junior',
+ 'description': 'Remate de Sierra',
+ 'thumbnail': r're:^https?://[^?#]+63fa7eca72f1741ad3a4d515',
+ 'ext': 'mp4',
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }, {
+ 'url': 'https://www.winsports.co/videos/bucaramanga-se-quedo-con-el-grito-de-gol-en-la-garganta',
+ 'info_dict': {
+ 'id': '6402adb62bbf3b18d454e1b0',
+ 'display_id': 'bucaramanga-se-quedo-con-el-grito-de-gol-en-la-garganta',
+ 'title': '⚽Bucaramanga se quedó con el grito de gol en la garganta',
+ 'description': 'Gol anulado Bucaramanga',
+ 'thumbnail': r're:^https?://[^?#]+6402adb62bbf3b18d454e1b0',
+ 'ext': 'mp4',
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ data = self._search_json(
+ r'<script\s*[^>]+data-drupal-selector="drupal-settings-json">', webpage, 'data', display_id)
+
+ mediastream_url = urljoin(f'{self._EMBED_BASE_URL}/', (
+ traverse_obj(data, (
+ (('settings', 'mediastream_formatter', ..., 'mediastream_id'), 'url'), {str}), get_all=False)
+ or next(self._extract_mediastream_urls(webpage), None)))
+
+ if not mediastream_url:
+ self.raise_no_formats('No MediaStream embed found in webpage')
+
+ title = clean_html(remove_end(
+ self._search_json_ld(webpage, display_id, expected_type='VideoObject', default={}).get('title')
+ or self._og_search_title(webpage), '| Win Sports'))
+
+ return self.url_result(
+ mediastream_url, MediaStreamIE, display_id, url_transparent=True, display_id=display_id, video_title=title)
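
WinSportsVideoIE hands extraction off with a url_transparent result: MediaStreamIE does the real work, while the fields supplied here (display_id, title) overlay whatever it returns. The shape of that hand-off, with ie as an illustrative extractor instance:

def delegate_to_embed(ie, mediastream_url, display_id, title):
    # 'MediaStream' is the ie_key; url_transparent merges these fields over
    # the delegated extractor's own output instead of replacing it
    return ie.url_result(
        mediastream_url, 'MediaStream', display_id, url_transparent=True,
        display_id=display_id, video_title=title)
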
diff --git a/hypervideo_dl/extractor/megatvcom.py b/hypervideo_dl/extractor/megatvcom.py
index 2f3f11f..93c7e7d 100644
--- a/hypervideo_dl/extractor/megatvcom.py
+++ b/hypervideo_dl/extractor/megatvcom.py
@@ -1,14 +1,14 @@
import re
from .common import InfoExtractor
+from ..networking import HEADRequest
from ..utils import (
+ ExtractorError,
clean_html,
determine_ext,
- ExtractorError,
extract_attributes,
get_element_by_class,
get_element_html_by_id,
- HEADRequest,
parse_qs,
unescapeHTML,
unified_timestamp,
@@ -160,5 +160,5 @@ class MegaTVComEmbedIE(MegaTVComBaseIE):
canonical_url = self._request_webpage(
HEADRequest(canonical_url), video_id,
note='Resolve canonical URL',
- errnote='Could not resolve canonical URL').geturl()
+ errnote='Could not resolve canonical URL').url
return self.url_result(canonical_url, MegaTVComIE.ie_key(), video_id)
diff --git a/hypervideo_dl/extractor/mgtv.py b/hypervideo_dl/extractor/mgtv.py
index edc92b3..31ccf00 100644
--- a/hypervideo_dl/extractor/mgtv.py
+++ b/hypervideo_dl/extractor/mgtv.py
@@ -3,15 +3,15 @@ import time
import uuid
from .common import InfoExtractor
-from ..compat import (
- compat_HTTPError,
- compat_str,
-)
+from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
int_or_none,
+ parse_resolution,
+ traverse_obj,
try_get,
url_or_none,
+ urljoin,
)
@@ -30,16 +30,18 @@ class MGTVIE(InfoExtractor):
'duration': 7461,
'thumbnail': r're:^https?://.*\.jpg$',
},
+ 'params': {'skip_download': 'm3u8'},
}, {
'url': 'https://w.mgtv.com/b/427837/15588271.html',
'info_dict': {
'id': '15588271',
'ext': 'mp4',
- 'title': '春日迟迟再出发 沉浸版',
+ 'title': '春日迟迟再出发 沉浸版第1期:陆莹结婚半年查出肾炎被离婚 吴雅婷把一半票根退给前夫',
'description': 'md5:a7a05a05b1aa87bd50cae619b19bbca6',
'thumbnail': r're:^https?://.+\.jpg',
'duration': 4026,
},
+ 'params': {'skip_download': 'm3u8'},
}, {
'url': 'https://w.mgtv.com/b/333652/7329822.html',
'info_dict': {
@@ -50,6 +52,7 @@ class MGTVIE(InfoExtractor):
'thumbnail': r're:^https?://.+\.jpg',
'duration': 2656,
},
+ 'params': {'skip_download': 'm3u8'},
}, {
'url': 'https://w.mgtv.com/b/427837/15591647.html',
'only_matching': True,
@@ -64,6 +67,13 @@ class MGTVIE(InfoExtractor):
'only_matching': True,
}]
+ _RESOLUTIONS = {
+ '标清': ('480p', '854x480'),
+ '高清': ('540p', '960x540'),
+ '超清': ('720p', '1280x720'),
+ '蓝光': ('1080p', '1920x1080'),
+ }
+
def _real_extract(self, url):
video_id = self._match_id(url)
tk2 = base64.urlsafe_b64encode(
@@ -76,55 +86,60 @@ class MGTVIE(InfoExtractor):
'type': 'pch5'
}, headers=self.geo_verification_headers())['data']
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
- error = self._parse_json(e.cause.read().decode(), None)
+ if isinstance(e.cause, HTTPError) and e.cause.status == 401:
+ error = self._parse_json(e.cause.response.read().decode(), None)
if error.get('code') == 40005:
self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
raise ExtractorError(error['msg'], expected=True)
raise
- info = api_data['info']
- title = info['title'].strip()
+
stream_data = self._download_json(
'https://pcweb.api.mgtv.com/player/getSource', video_id, query={
- 'pm2': api_data['atc']['pm2'],
'tk2': tk2,
+ 'pm2': api_data['atc']['pm2'],
'video_id': video_id,
+ 'type': 'pch5',
'src': 'intelmgtv',
}, headers=self.geo_verification_headers())['data']
- stream_domain = stream_data['stream_domain'][0]
+ stream_domain = traverse_obj(stream_data, ('stream_domain', ..., {url_or_none}), get_all=False)
formats = []
- for idx, stream in enumerate(stream_data['stream']):
- stream_path = stream.get('url')
- if not stream_path:
- continue
- format_data = self._download_json(
- stream_domain + stream_path, video_id,
- note=f'Download video info for format #{idx}')
- format_url = format_data.get('info')
+ for idx, stream in enumerate(traverse_obj(stream_data, ('stream', lambda _, v: v['url']))):
+ stream_name = traverse_obj(stream, 'name', 'standardName', 'barName', expected_type=str)
+ resolution = traverse_obj(
+ self._RESOLUTIONS, (stream_name, 1 if stream.get('scale') == '16:9' else 0))
+ format_url = traverse_obj(self._download_json(
+ urljoin(stream_domain, stream['url']), video_id, fatal=False,
+ note=f'Downloading video info for format {resolution or stream_name}'),
+ ('info', {url_or_none}))
if not format_url:
continue
tbr = int_or_none(stream.get('filebitrate') or self._search_regex(
r'_(\d+)_mp4/', format_url, 'tbr', default=None))
formats.append({
- 'format_id': compat_str(tbr or idx),
- 'url': url_or_none(format_url),
+ 'format_id': str(tbr or idx),
+ 'url': format_url,
'ext': 'mp4',
'tbr': tbr,
+ 'vcodec': stream.get('videoFormat'),
+ 'acodec': stream.get('audioFormat'),
+ **parse_resolution(resolution),
'protocol': 'm3u8_native',
'http_headers': {
'Referer': url,
},
- 'format_note': stream.get('name'),
+ 'format_note': stream_name,
})
return {
'id': video_id,
- 'title': title,
'formats': formats,
- 'description': info.get('desc'),
- 'duration': int_or_none(info.get('duration')),
- 'thumbnail': info.get('thumb'),
+ **traverse_obj(api_data, ('info', {
+ 'title': ('title', {str.strip}),
+ 'description': ('desc', {str}),
+ 'duration': ('duration', {int_or_none}),
+ 'thumbnail': ('thumb', {url_or_none}),
+ })),
'subtitles': self.extract_subtitles(video_id, stream_domain),
}
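
The new _RESOLUTIONS table maps mgtv's Chinese quality labels to a (label, WxH) pair; the code picks index 1 for 16:9 streams and index 0 otherwise, and parse_resolution accepts either form, expanding it into width/height fields. For example:

from hypervideo_dl.utils import parse_resolution

label, wxh = ('1080p', '1920x1080')  # the '蓝光' (Blu-ray) entry from the table above
assert parse_resolution(wxh) == {'width': 1920, 'height': 1080}
assert parse_resolution(label) == {'height': 1080}
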
diff --git a/hypervideo_dl/extractor/minds.py b/hypervideo_dl/extractor/minds.py
index 2fb1792..27a6e38 100644
--- a/hypervideo_dl/extractor/minds.py
+++ b/hypervideo_dl/extractor/minds.py
@@ -106,7 +106,7 @@ class MindsIE(MindsBaseIE):
if poster:
urlh = self._request_webpage(poster, video_id, fatal=False)
if urlh:
- thumbnail = urlh.geturl()
+ thumbnail = urlh.url
return {
'id': video_id,
diff --git a/hypervideo_dl/extractor/miomio.py b/hypervideo_dl/extractor/miomio.py
index a0a041e..8df8cba 100644
--- a/hypervideo_dl/extractor/miomio.py
+++ b/hypervideo_dl/extractor/miomio.py
@@ -2,12 +2,8 @@ import random
from .common import InfoExtractor
from ..compat import compat_urlparse
-from ..utils import (
- xpath_text,
- int_or_none,
- ExtractorError,
- sanitized_Request,
-)
+from ..networking import Request
+from ..utils import ExtractorError, int_or_none, xpath_text
class MioMioIE(InfoExtractor):
@@ -61,7 +57,7 @@ class MioMioIE(InfoExtractor):
'http://www.miomio.tv/mioplayer/mioplayerconfigfiles/xml.php?id=%s&r=%s' % (id, random.randint(100, 999)),
video_id)
- vid_config_request = sanitized_Request(
+ vid_config_request = Request(
'http://www.miomio.tv/mioplayer/mioplayerconfigfiles/sina.php?{0}'.format(xml_config),
headers=http_headers)
diff --git a/hypervideo_dl/extractor/mixch.py b/hypervideo_dl/extractor/mixch.py
index 3f430a7..4be6947 100644
--- a/hypervideo_dl/extractor/mixch.py
+++ b/hypervideo_dl/extractor/mixch.py
@@ -1,8 +1,5 @@
from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
- traverse_obj,
-)
+from ..utils import UserNotLive, traverse_obj
class MixchIE(InfoExtractor):
@@ -33,7 +30,7 @@ class MixchIE(InfoExtractor):
initial_js_state = self._parse_json(self._search_regex(
r'(?m)^\s*window\.__INITIAL_JS_STATE__\s*=\s*(\{.+?\});\s*$', webpage, 'initial JS state'), video_id)
if not initial_js_state.get('liveInfo'):
- raise ExtractorError('Livestream has ended.', expected=True)
+ raise UserNotLive(video_id=video_id)
return {
'id': video_id,
@@ -45,7 +42,8 @@ class MixchIE(InfoExtractor):
'uploader_id': video_id,
'formats': [{
'format_id': 'hls',
- 'url': traverse_obj(initial_js_state, ('liveInfo', 'hls')) or 'https://d1hd0ww6piyb43.cloudfront.net/hls/torte_%s.m3u8' % video_id,
+ 'url': (traverse_obj(initial_js_state, ('liveInfo', 'hls'))
+ or f'https://d1hd0ww6piyb43.cloudfront.net/hls/torte_{video_id}.m3u8'),
'ext': 'mp4',
'protocol': 'm3u8',
}],
diff --git a/hypervideo_dl/extractor/motherless.py b/hypervideo_dl/extractor/motherless.py
index c24ef9b..769b52c 100644
--- a/hypervideo_dl/extractor/motherless.py
+++ b/hypervideo_dl/extractor/motherless.py
@@ -1,32 +1,39 @@
import datetime
import re
+import urllib.parse
from .common import InfoExtractor
-from ..compat import compat_urlparse
from ..utils import (
ExtractorError,
- InAdvancePagedList,
- orderedSet,
+ OnDemandPagedList,
+ remove_end,
str_to_int,
unified_strdate,
)
class MotherlessIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?motherless\.com/(?:g/[a-z0-9_]+/)?(?P<id>[A-Z0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?motherless\.com/(?:g/[a-z0-9_]+/|G[VIG]?[A-F0-9]+/)?(?P<id>[A-F0-9]+)'
_TESTS = [{
- 'url': 'http://motherless.com/AC3FFE1',
- 'md5': '310f62e325a9fafe64f68c0bccb6e75f',
+ 'url': 'http://motherless.com/EE97006',
+ 'md5': 'cb5e7438f7a3c4e886b7bccc1292a3bc',
'info_dict': {
- 'id': 'AC3FFE1',
+ 'id': 'EE97006',
'ext': 'mp4',
- 'title': 'Fucked in the ass while playing PS3',
- 'categories': ['Gaming', 'anal', 'reluctant', 'rough', 'Wife'],
- 'upload_date': '20100913',
- 'uploader_id': 'famouslyfuckedup',
+ 'title': 'Dogging blond Brit getting glazed (comp)',
+ 'categories': ['UK', 'slag', 'whore', 'dogging', 'cunt', 'cumhound', 'big tits', 'Pearl Necklace'],
+ 'upload_date': '20230519',
+ 'uploader_id': 'deathbird',
'thumbnail': r're:https?://.*\.jpg',
'age_limit': 18,
- }
+ 'comment_count': int,
+ 'view_count': int,
+ 'like_count': int,
+ },
+ 'params': {
+ # Incomplete cert chains
+ 'nocheckcertificate': True,
+ },
}, {
'url': 'http://motherless.com/532291B',
'md5': 'bc59a6b47d1f958e61fbd38a4d31b131',
@@ -49,16 +56,36 @@ class MotherlessIE(InfoExtractor):
'id': '633979F',
'ext': 'mp4',
'title': 'Turtlette',
- 'categories': ['superheroine heroine superher'],
+ 'categories': ['superheroine heroine superher'],
'upload_date': '20140827',
'uploader_id': 'shade0230',
'thumbnail': r're:https?://.*\.jpg',
'age_limit': 18,
- }
+ 'like_count': int,
+ 'comment_count': int,
+ 'view_count': int,
+ },
+ 'params': {
+ 'nocheckcertificate': True,
+ },
}, {
- # no keywords
'url': 'http://motherless.com/8B4BBC1',
- 'only_matching': True,
+ 'info_dict': {
+ 'id': '8B4BBC1',
+ 'ext': 'mp4',
+ 'title': 'VIDEO00441.mp4',
+ 'categories': [],
+ 'upload_date': '20160214',
+ 'uploader_id': 'NMWildGirl',
+ 'thumbnail': r're:https?://.*\.jpg',
+ 'age_limit': 18,
+ 'like_count': int,
+ 'comment_count': int,
+ 'view_count': int,
+ },
+ 'params': {
+ 'nocheckcertificate': True,
+ },
}, {
# see https://motherless.com/videos/recent for recent videos with
# uploaded date in "ago" format
@@ -72,9 +99,12 @@ class MotherlessIE(InfoExtractor):
'uploader_id': 'anonymous',
'thumbnail': r're:https?://.*\.jpg',
'age_limit': 18,
+ 'like_count': int,
+ 'comment_count': int,
+ 'view_count': int,
},
'params': {
- 'skip_download': True,
+ 'nocheckcertificate': True,
},
}]
@@ -128,10 +158,8 @@ class MotherlessIE(InfoExtractor):
(r'''<span\b[^>]+\bclass\s*=\s*["']username\b[^>]*>([^<]+)</span>''',
r'''(?s)['"](?:media-meta-member|thumb-member-username)\b[^>]+>\s*<a\b[^>]+\bhref\s*=\s*['"]/m/([^"']+)'''),
webpage, 'uploader_id', fatal=False)
-
- categories = self._html_search_meta('keywords', webpage, default=None)
- if categories:
- categories = [cat.strip() for cat in categories.split(',')]
+ categories = self._html_search_meta('keywords', webpage, default='')
+ categories = [cat.strip() for cat in categories.split(',') if cat.strip()]
return {
'id': video_id,
@@ -148,102 +176,97 @@ class MotherlessIE(InfoExtractor):
}
-class MotherlessGroupIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?motherless\.com/gv?/(?P<id>[a-z0-9_]+)'
+class MotherlessPaginatedIE(InfoExtractor):
+ _PAGE_SIZE = 60
+
+ def _correct_path(self, url, item_id):
+ raise NotImplementedError('This method must be implemented by subclasses')
+
+ def _extract_entries(self, webpage, base):
+ for mobj in re.finditer(r'href="[^"]*(?P<href>/[A-F0-9]+)"\s+title="(?P<title>[^"]+)',
+ webpage):
+ video_url = urllib.parse.urljoin(base, mobj.group('href'))
+ video_id = MotherlessIE.get_temp_id(video_url)
+
+ if video_id:
+ yield self.url_result(video_url, MotherlessIE, video_id, mobj.group('title'))
+
+ def _real_extract(self, url):
+ item_id = self._match_id(url)
+ real_url = self._correct_path(url, item_id)
+ webpage = self._download_webpage(real_url, item_id, 'Downloading page 1')
+
+ def get_page(idx):
+ page = idx + 1
+ current_page = webpage if not idx else self._download_webpage(
+ real_url, item_id, note=f'Downloading page {page}', query={'page': page})
+ yield from self._extract_entries(current_page, real_url)
+
+ return self.playlist_result(
+ OnDemandPagedList(get_page, self._PAGE_SIZE), item_id,
+ remove_end(self._html_extract_title(webpage), ' | MOTHERLESS.COM ™'))
+
+
+class MotherlessGroupIE(MotherlessPaginatedIE):
+ _VALID_URL = r'https?://(?:www\.)?motherless\.com/g[vifm]?/(?P<id>[a-z0-9_]+)/?(?:$|[#?])'
_TESTS = [{
- 'url': 'http://motherless.com/g/movie_scenes',
+ 'url': 'http://motherless.com/gv/movie_scenes',
'info_dict': {
'id': 'movie_scenes',
'title': 'Movie Scenes',
- 'description': 'Hot and sexy scenes from "regular" movies... '
- 'Beautiful actresses fully nude... A looot of '
- 'skin! :)Enjoy!',
},
- 'playlist_mincount': 662,
+ 'playlist_mincount': 540,
}, {
- 'url': 'http://motherless.com/gv/sex_must_be_funny',
+ 'url': 'http://motherless.com/g/sex_must_be_funny',
'info_dict': {
'id': 'sex_must_be_funny',
'title': 'Sex must be funny',
- 'description': 'Sex can be funny. Wide smiles,laugh, games, fun of '
- 'any kind!'
},
- 'playlist_mincount': 0,
- 'expected_warnings': [
- 'This group has no videos.',
- ]
+ 'playlist_count': 0,
}, {
- 'url': 'https://motherless.com/g/beautiful_cock',
+ 'url': 'https://motherless.com/gv/beautiful_cock',
'info_dict': {
'id': 'beautiful_cock',
'title': 'Beautiful Cock',
- 'description': 'Group for lovely cocks yours, mine, a friends anything human',
},
- 'playlist_mincount': 2500,
+ 'playlist_mincount': 2040,
}]
- @classmethod
- def suitable(cls, url):
- return (False if MotherlessIE.suitable(url)
- else super(MotherlessGroupIE, cls).suitable(url))
+ def _correct_path(self, url, item_id):
+ return urllib.parse.urljoin(url, f'/gv/{item_id}')
- def _extract_entries(self, webpage, base):
- entries = []
- for mobj in re.finditer(
- r'href="(?P<href>/[^"]+)"[^>]*>(?:\s*<img[^>]+alt="[^-]+-\s(?P<title>[^"]+)")?',
- webpage):
- video_url = compat_urlparse.urljoin(base, mobj.group('href'))
- if not MotherlessIE.suitable(video_url):
- continue
- video_id = MotherlessIE._match_id(video_url)
- title = mobj.group('title')
- entries.append(self.url_result(
- video_url, ie=MotherlessIE.ie_key(), video_id=video_id,
- video_title=title))
- # Alternative fallback
- if not entries:
- entries = [
- self.url_result(
- compat_urlparse.urljoin(base, '/' + entry_id),
- ie=MotherlessIE.ie_key(), video_id=entry_id)
- for entry_id in orderedSet(re.findall(
- r'data-codename=["\']([A-Z0-9]+)', webpage))]
- return entries
- def _real_extract(self, url):
- group_id = self._match_id(url)
- page_url = compat_urlparse.urljoin(url, '/gv/%s' % group_id)
- webpage = self._download_webpage(page_url, group_id)
- title = self._search_regex(
- r'<title>([\w\s]+\w)\s+-', webpage, 'title', fatal=False)
- description = self._html_search_meta(
- 'description', webpage, fatal=False)
- page_count = str_to_int(self._search_regex(
- r'(\d+)\s*</(?:a|span)>\s*<(?:a|span)[^>]+(?:>\s*NEXT|\brel\s*=\s*["\']?next)\b',
- webpage, 'page_count', default=0))
- if not page_count:
- message = self._search_regex(
- r'''class\s*=\s*['"]error-page\b[^>]*>\s*<p[^>]*>\s*(?P<error_msg>[^<]+)(?<=\S)\s*''',
- webpage, 'error_msg', default=None) or 'This group has no videos.'
- self.report_warning(message, group_id)
- page_count = 1
- PAGE_SIZE = 80
-
- def _get_page(idx):
- if idx > 0:
- webpage = self._download_webpage(
- page_url, group_id, query={'page': idx + 1},
- note='Downloading page %d/%d' % (idx + 1, page_count)
- )
- for entry in self._extract_entries(webpage, url):
- yield entry
-
- playlist = InAdvancePagedList(_get_page, page_count, PAGE_SIZE)
+class MotherlessGalleryIE(MotherlessPaginatedIE):
+ _VALID_URL = r'https?://(?:www\.)?motherless\.com/G[VIG]?(?P<id>[A-F0-9]+)/?(?:$|[#?])'
+ _TESTS = [{
+ 'url': 'https://motherless.com/GV338999F',
+ 'info_dict': {
+ 'id': '338999F',
+ 'title': 'Random',
+ },
+ 'playlist_mincount': 190,
+ }, {
+ 'url': 'https://motherless.com/GVABD6213',
+ 'info_dict': {
+ 'id': 'ABD6213',
+ 'title': 'Cuties',
+ },
+ 'playlist_mincount': 2,
+ }, {
+ 'url': 'https://motherless.com/GVBCF7622',
+ 'info_dict': {
+ 'id': 'BCF7622',
+ 'title': 'Vintage',
+ },
+ 'playlist_count': 0,
+ }, {
+ 'url': 'https://motherless.com/G035DE2F',
+ 'info_dict': {
+ 'id': '035DE2F',
+ 'title': 'General',
+ },
+ 'playlist_mincount': 420,
+ }]
- return {
- '_type': 'playlist',
- 'id': group_id,
- 'title': title,
- 'description': description,
- 'entries': playlist
- }
+ def _correct_path(self, url, item_id):
+ return urllib.parse.urljoin(url, f'/GV{item_id}')
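
A minimal stand-in for the paging contract introduced above: MotherlessPaginatedIE subclasses only supply _correct_path(), and OnDemandPagedList pulls entries page by page through a get_page callback. This is plain Python, not yt-dlp's helper (which is additionally lazy and sliceable); it assumes a zero-based page index and treats a short page as the end of the listing.

    import itertools

    def lazy_pages(fetch_page, page_size):
        # fetch_page(idx) returns the entries for zero-based page idx,
        # mirroring the get_page() callback in the hunk above.
        for idx in itertools.count():
            entries = list(fetch_page(idx))
            yield from entries
            if len(entries) < page_size:  # short page: listing exhausted
                return

    # Toy run: 130 fake IDs served 60 per page, like _PAGE_SIZE above.
    FAKE = ['%04X' % n for n in range(130)]
    assert list(lazy_pages(lambda i: FAKE[i * 60:(i + 1) * 60], 60)) == FAKE
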
diff --git a/hypervideo_dl/extractor/moviepilot.py b/hypervideo_dl/extractor/moviepilot.py
index ca54156..668c098 100644
--- a/hypervideo_dl/extractor/moviepilot.py
+++ b/hypervideo_dl/extractor/moviepilot.py
@@ -1,11 +1,5 @@
from .dailymotion import DailymotionIE
from .common import InfoExtractor
-from ..utils import (
- parse_iso8601,
- try_get,
-)
-
-import re
class MoviepilotIE(InfoExtractor):
@@ -16,21 +10,21 @@ class MoviepilotIE(InfoExtractor):
_TESTS = [{
'url': 'https://www.moviepilot.de/movies/interstellar-2/',
'info_dict': {
- 'id': 'x7xdut5',
+ 'id': 'x7xdpkk',
'display_id': 'interstellar-2',
'ext': 'mp4',
'title': 'Interstellar',
- 'thumbnail': r're:https://\w+\.dmcdn\.net/v/SaXev1VvzitVZMFsR/x720',
- 'timestamp': 1400491705,
- 'description': 'md5:7dfc5c1758e7322a7346934f1f0c489c',
+ 'thumbnail': r're:https://\w+\.dmcdn\.net/v/SaV-q1ZganMw4HVXg/x1080',
+ 'timestamp': 1605010596,
+ 'description': 'md5:0ae9cb452af52610c9ffc60f2fd0474c',
'uploader': 'Moviepilot',
'like_count': int,
'view_count': int,
'uploader_id': 'x6nd9k',
- 'upload_date': '20140519',
- 'duration': 140,
+ 'upload_date': '20201110',
+ 'duration': 97,
'age_limit': 0,
- 'tags': ['Alle Trailer', 'Movie', 'Third Party'],
+ 'tags': ['Alle Trailer', 'Movie', 'Verleih'],
},
}, {
'url': 'https://www.moviepilot.de/movies/interstellar-2/trailer',
@@ -45,14 +39,14 @@ class MoviepilotIE(InfoExtractor):
'display_id': 'queen-slim',
'title': 'Queen & Slim',
'ext': 'mp4',
- 'thumbnail': r're:https://\w+\.dmcdn\.net/v/SbUM71WtomSjVmI_q/x720',
- 'timestamp': 1571838685,
- 'description': 'md5:73058bcd030aa12d991e4280d65fbebe',
+ 'thumbnail': r're:https://\w+\.dmcdn\.net/v/SbUM71ZeG2N975lf2/x1080',
+ 'timestamp': 1605555825,
+ 'description': 'md5:83228bb86f5367dd181447fdc4873989',
'uploader': 'Moviepilot',
'like_count': int,
'view_count': int,
'uploader_id': 'x6nd9k',
- 'upload_date': '20191023',
+ 'upload_date': '20201116',
'duration': 138,
'age_limit': 0,
'tags': ['Movie', 'Verleih', 'Neue Trailer'],
@@ -72,12 +66,12 @@ class MoviepilotIE(InfoExtractor):
'display_id': 'muellers-buero',
'title': 'Müllers Büro',
'ext': 'mp4',
- 'description': 'md5:57501251c05cdc61ca314b7633e0312e',
- 'timestamp': 1287584475,
+ 'description': 'md5:4d23a8f4ca035196cd4523863c4fe5a4',
+ 'timestamp': 1604958457,
'age_limit': 0,
'duration': 82,
- 'upload_date': '20101020',
- 'thumbnail': r're:https://\w+\.dmcdn\.net/v/SaMes1WfAm1d6maq_/x720',
+ 'upload_date': '20201109',
+ 'thumbnail': r're:https://\w+\.dmcdn\.net/v/SaMes1Zg3lxLv9j5u/x1080',
'uploader': 'Moviepilot',
'like_count': int,
'view_count': int,
@@ -91,22 +85,13 @@ class MoviepilotIE(InfoExtractor):
webpage = self._download_webpage(f'https://www.moviepilot.de/movies/{video_id}/trailer', video_id)
- duration = try_get(
- re.match(r'P(?P<hours>\d+)H(?P<mins>\d+)M(?P<secs>\d+)S',
- self._html_search_meta('duration', webpage, fatal=False) or ''),
- lambda mobj: sum(float(x) * y for x, y in zip(mobj.groups(), (3600, 60, 1))))
- # _html_search_meta is not used since we don't want name=description to match
- description = self._html_search_regex(
- '<meta[^>]+itemprop="description"[^>]+content="([^>"]+)"', webpage, 'description', fatal=False)
+ clip = self._search_nextjs_data(webpage, video_id)['props']['initialProps']['pageProps']
return {
'_type': 'url_transparent',
'ie_key': DailymotionIE.ie_key(),
'display_id': video_id,
- 'title': self._og_search_title(webpage),
- 'url': self._html_search_meta('embedURL', webpage),
- 'thumbnail': self._html_search_meta('thumbnailURL', webpage),
- 'description': description,
- 'duration': duration,
- 'timestamp': parse_iso8601(self._html_search_meta('uploadDate', webpage), delimiter=' ')
+ 'title': clip.get('title'),
+ 'url': f'https://www.dailymotion.com/video/{clip["videoRemoteId"]}',
+ 'description': clip.get('summary'),
}
diff --git a/hypervideo_dl/extractor/mtv.py b/hypervideo_dl/extractor/mtv.py
index d91be62..0d700b9 100644
--- a/hypervideo_dl/extractor/mtv.py
+++ b/hypervideo_dl/extractor/mtv.py
@@ -2,16 +2,15 @@ import re
from .common import InfoExtractor
from ..compat import compat_str
+from ..networking import HEADRequest, Request
from ..utils import (
ExtractorError,
+ RegexNotFoundError,
find_xpath_attr,
fix_xml_ampersands,
float_or_none,
- HEADRequest,
int_or_none,
join_nonempty,
- RegexNotFoundError,
- sanitized_Request,
strip_or_none,
timeconvert,
try_get,
@@ -51,15 +50,15 @@ class MTVServicesInfoExtractor(InfoExtractor):
def _extract_mobile_video_formats(self, mtvn_id):
webpage_url = self._MOBILE_TEMPLATE % mtvn_id
- req = sanitized_Request(webpage_url)
+ req = Request(webpage_url)
# Otherwise we get a webpage that would execute some javascript
- req.add_header('User-Agent', 'curl/7')
+ req.headers['User-Agent'] = 'curl/7'
webpage = self._download_webpage(req, mtvn_id,
'Downloading mobile page')
metrics_url = unescapeHTML(self._search_regex(r'<a href="(http://metrics.+?)"', webpage, 'url'))
req = HEADRequest(metrics_url)
response = self._request_webpage(req, mtvn_id, 'Resolving url')
- url = response.geturl()
+ url = response.url
# Transform the url to get the best quality:
url = re.sub(r'.+pxE=mp4', 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=0+_pxK=18639+_pxE=mp4', url, 1)
return [{'url': url, 'ext': 'mp4'}]
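
This hunk is part of the migration from utils.sanitized_Request to the new networking.Request, whose headers behave like a plain mutable mapping, and from response.geturl() to response.url. A side-by-side sketch of the header change (yt-dlp's Request is approximated by a bare class here; only the stdlib object is real):

    import urllib.request

    # Old style: headers attached via add_header(), as with sanitized_Request.
    old = urllib.request.Request('https://example.com/page')
    old.add_header('User-Agent', 'curl/7')

    class Request:  # stand-in for yt_dlp.networking.Request
        def __init__(self, url, headers=None):
            self.url, self.headers = url, dict(headers or {})

    # New style: plain dict assignment, as in the hunk above.
    req = Request('https://example.com/page')
    req.headers['User-Agent'] = 'curl/7'
    assert old.get_header('User-agent') == req.headers['User-Agent']
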
diff --git a/hypervideo_dl/extractor/museai.py b/hypervideo_dl/extractor/museai.py
new file mode 100644
index 0000000..7f66928
--- /dev/null
+++ b/hypervideo_dl/extractor/museai.py
@@ -0,0 +1,112 @@
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ determine_ext,
+ float_or_none,
+ int_or_none,
+ js_to_json,
+ traverse_obj,
+ url_or_none,
+)
+
+
+class MuseAIIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?muse\.ai/(?:v|embed)/(?P<id>\w+)'
+ _TESTS = [{
+ 'url': 'https://muse.ai/embed/YdTWvUW',
+ 'md5': 'f994f9a38be1c3aaf9e37cbd7d76fe7c',
+ 'info_dict': {
+ 'id': 'YdTWvUW',
+ 'ext': 'mp4',
+ 'title': '2023-05-28-Grabien-1941111 (1)',
+ 'description': '',
+ 'uploader': 'Today News Africa',
+ 'uploader_id': 'TodayNewsAfrica',
+ 'upload_date': '20230528',
+ 'timestamp': 1685285044,
+ 'duration': 1291.3,
+ 'view_count': int,
+ 'availability': 'public',
+ },
+ }, {
+ 'url': 'https://muse.ai/v/gQ4gGAA-0756',
+ 'md5': '52dbfc78e865e56dc19a1715badc35e8',
+ 'info_dict': {
+ 'id': 'gQ4gGAA',
+ 'ext': 'mp4',
+ 'title': '0756',
+ 'description': 'md5:0ca1483f9aac423e9a96ad00bb3a0785',
+ 'uploader': 'Aerial.ie',
+ 'uploader_id': 'aerial',
+ 'upload_date': '20210306',
+ 'timestamp': 1615072842,
+ 'duration': 21.4,
+ 'view_count': int,
+ 'availability': 'public',
+ },
+ }]
+ _WEBPAGE_TESTS = [{
+ 'url': 'https://muse.ai/docs',
+ 'playlist_mincount': 4,
+ 'info_dict': {
+ 'id': 'docs',
+ 'title': 'muse.ai | docs',
+ 'description': 'md5:6c0293431481582739c82ee8902687fa',
+ 'age_limit': 0,
+ 'thumbnail': 'https://muse.ai/static/imgs/poster-img-docs.png',
+ },
+ 'params': {'allowed_extractors': ['all', '-html5']},
+ }]
+ _EMBED_REGEX = [r'<iframe[^>]*\bsrc=["\'](?P<url>https://muse\.ai/embed/\w+)']
+
+ @classmethod
+ def _extract_embed_urls(cls, url, webpage):
+ yield from super()._extract_embed_urls(url, webpage)
+ for embed_id in re.findall(r'<script>[^<]*\bMusePlayer\(\{[^}<]*\bvideo:\s*["\'](\w+)["\']', webpage):
+ yield f'https://muse.ai/embed/{embed_id}'
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(f'https://muse.ai/embed/{video_id}', video_id)
+ data = self._search_json(
+ r'player\.setData\(', webpage, 'player data', video_id, transform_source=js_to_json)
+
+ source_url = data['url']
+ if not url_or_none(source_url):
+ raise ExtractorError('Unable to extract video URL')
+
+ formats = [{
+ 'url': source_url,
+ 'format_id': 'source',
+ 'quality': 1,
+ **traverse_obj(data, {
+ 'ext': ('filename', {determine_ext}),
+ 'width': ('width', {int_or_none}),
+ 'height': ('height', {int_or_none}),
+ 'filesize': ('size', {int_or_none}),
+ }),
+ }]
+ if source_url.endswith('/data'):
+ base_url = f'{source_url[:-5]}/videos'
+ formats.extend(self._extract_m3u8_formats(
+ f'{base_url}/hls.m3u8', video_id, m3u8_id='hls', fatal=False))
+ formats.extend(self._extract_mpd_formats(
+ f'{base_url}/dash.mpd', video_id, mpd_id='dash', fatal=False))
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ **traverse_obj(data, {
+ 'title': ('title', {str}),
+ 'description': ('description', {str}),
+ 'duration': ('duration', {float_or_none}),
+ 'timestamp': ('tcreated', {int_or_none}),
+ 'uploader': ('owner_name', {str}),
+ 'uploader_id': ('owner_username', {str}),
+ 'view_count': ('views', {int_or_none}),
+ 'age_limit': ('mature', {lambda x: 18 if x else None}),
+ 'availability': ('visibility', {lambda x: x if x in ('private', 'unlisted') else 'public'}),
+ }),
+ }
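
The new muse.ai extractor keys its adaptive-format probing off the source URL's shape: when it ends in /data, sibling hls.m3u8 and dash.mpd manifests are assumed to live under /videos. That derivation in isolation (the CDN host in the example is invented):

    def manifest_urls(source_url):
        # Mirror the base_url construction in _real_extract() above.
        if not source_url.endswith('/data'):
            return []
        base_url = source_url[:-len('/data')] + '/videos'
        return [f'{base_url}/hls.m3u8', f'{base_url}/dash.mpd']

    assert manifest_urls('https://cdn.example/w/abc123/data') == [
        'https://cdn.example/w/abc123/videos/hls.m3u8',
        'https://cdn.example/w/abc123/videos/dash.mpd',
    ]
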
diff --git a/hypervideo_dl/extractor/myvideoge.py b/hypervideo_dl/extractor/myvideoge.py
index 513d4cb..64cee48 100644
--- a/hypervideo_dl/extractor/myvideoge.py
+++ b/hypervideo_dl/extractor/myvideoge.py
@@ -1,5 +1,16 @@
+import re
+
from .common import InfoExtractor
-from ..utils import js_to_json
+from ..utils import (
+ MONTH_NAMES,
+ clean_html,
+ get_element_by_class,
+ get_element_by_id,
+ int_or_none,
+ js_to_json,
+ qualities,
+ unified_strdate,
+)
class MyVideoGeIE(InfoExtractor):
@@ -11,37 +22,50 @@ class MyVideoGeIE(InfoExtractor):
'id': '3941048',
'ext': 'mp4',
'title': 'The best prikol',
+ 'upload_date': '20200611',
'thumbnail': r're:^https?://.*\.jpg$',
- 'uploader': 'md5:d72addd357b0dd914e704781f7f777d8',
- 'description': 'md5:5c0371f540f5888d603ebfedd46b6df3'
- }
+ 'uploader': 'chixa33',
+ 'description': 'md5:5b067801318e33c2e6eea4ab90b1fdd3',
+ },
}
+ _MONTH_NAMES_KA = ['იანვარი', 'თებერვალი', 'მარტი', 'აპრილი', 'მაისი', 'ივნისი', 'ივლისი', 'აგვისტო', 'სექტემბერი', 'ოქტომბერი', 'ნოემბერი', 'დეკემბერი']
+
+ _quality = staticmethod(qualities(('SD', 'HD')))
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- title = self._html_search_regex(r'<h1[^>]*>([^<]+)</h1>', webpage, 'title')
- description = self._og_search_description(webpage)
- thumbnail = self._html_search_meta(['og:image'], webpage)
- uploader = self._search_regex(r'<a[^>]+class="mv_user_name"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False)
+ title = (
+ self._og_search_title(webpage, default=None)
+ or clean_html(get_element_by_class('my_video_title', webpage))
+ or self._html_search_regex(r'<title\b[^>]*>([^<]+)</title\b', webpage, 'title'))
jwplayer_sources = self._parse_json(
self._search_regex(
- r"(?s)jwplayer\(\"mvplayer\"\).setup\(.*?sources: (.*?])", webpage, 'jwplayer sources'),
- video_id, transform_source=js_to_json)
+ r'''(?s)jwplayer\s*\(\s*['"]mvplayer['"]\s*\)\s*\.\s*setup\s*\(.*?\bsources\s*:\s*(\[.*?])\s*[,});]''', webpage, 'jwplayer sources', fatal=False)
+ or '',
+ video_id, transform_source=js_to_json, fatal=False)
+
+ formats = self._parse_jwplayer_formats(jwplayer_sources or [], video_id)
+ for f in formats or []:
+ f['quality'] = self._quality(f['format_id'])
- def _formats_key(f):
- if f['label'] == 'SD':
- return -1
- elif f['label'] == 'HD':
- return 1
- else:
- return 0
+ description = (
+ self._og_search_description(webpage)
+ or get_element_by_id('long_desc_holder', webpage)
+ or self._html_search_meta('description', webpage))
- jwplayer_sources = sorted(jwplayer_sources, key=_formats_key)
+ uploader = self._search_regex(r'<a[^>]+class="mv_user_name"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False)
- formats = self._parse_jwplayer_formats(jwplayer_sources, video_id)
+ upload_date = get_element_by_class('mv_vid_upl_date', webpage)
+ # as the ka locale may not be present, roll a local date conversion
+ upload_date = (unified_strdate(
+ # translate any ka month to an en one
+ re.sub('|'.join(self._MONTH_NAMES_KA),
+ lambda m: MONTH_NAMES['en'][self._MONTH_NAMES_KA.index(m.group(0))],
+ upload_date, flags=re.I))
+ if upload_date else None)
return {
'id': video_id,
@@ -49,5 +73,9 @@ class MyVideoGeIE(InfoExtractor):
'description': description,
'uploader': uploader,
'formats': formats,
- 'thumbnail': thumbnail
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'upload_date': upload_date,
+ 'view_count': int_or_none(get_element_by_class('mv_vid_views', webpage)),
+ 'like_count': int_or_none(get_element_by_id('likes_count', webpage)),
+ 'dislike_count': int_or_none(get_element_by_id('dislikes_count', webpage)),
}
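
The upload-date handling above avoids depending on a Georgian (ka) system locale by translating month names before unified_strdate(). The substitution in isolation, with the English month names inlined in place of utils.MONTH_NAMES['en'] (the '11 ივნისი 2020' input is an assumed example of the site's date format):

    import re

    MONTH_NAMES_KA = ['იანვარი', 'თებერვალი', 'მარტი', 'აპრილი', 'მაისი', 'ივნისი',
                      'ივლისი', 'აგვისტო', 'სექტემბერი', 'ოქტომბერი', 'ნოემბერი', 'დეკემბერი']
    MONTH_NAMES_EN = ['January', 'February', 'March', 'April', 'May', 'June',
                      'July', 'August', 'September', 'October', 'November', 'December']

    def ka_months_to_en(date_text):
        # One alternation over every Georgian month name; each match is
        # replaced by the English month at the same index.
        return re.sub('|'.join(MONTH_NAMES_KA),
                      lambda m: MONTH_NAMES_EN[MONTH_NAMES_KA.index(m.group(0))],
                      date_text)

    assert ka_months_to_en('11 ივნისი 2020') == '11 June 2020'
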
diff --git a/hypervideo_dl/extractor/mzaalo.py b/hypervideo_dl/extractor/mzaalo.py
new file mode 100644
index 0000000..1996368
--- /dev/null
+++ b/hypervideo_dl/extractor/mzaalo.py
@@ -0,0 +1,95 @@
+from .common import InfoExtractor
+from ..utils import (
+ parse_age_limit,
+ parse_duration,
+ traverse_obj,
+ url_or_none,
+)
+
+
+class MzaaloIE(InfoExtractor):
+ _VALID_URL = r'(?i)https?://(?:www\.)?mzaalo\.com/(?:play|watch)/(?P<type>movie|original|clip)/(?P<id>[a-f0-9-]+)/[\w-]+'
+ _TESTS = [{
+ # Movies
+ 'url': 'https://www.mzaalo.com/play/movie/c0958d9f-f90e-4503-a755-44358758921d/Jamun',
+ 'info_dict': {
+ 'id': 'c0958d9f-f90e-4503-a755-44358758921d',
+ 'title': 'Jamun',
+ 'ext': 'mp4',
+ 'description': 'md5:24fe9ebb9bbe5b36f7b54b90ab1e2f31',
+ 'thumbnails': 'count:15',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 5527.0,
+ 'language': 'hin',
+ 'categories': ['Drama'],
+ 'age_limit': 13,
+ },
+ 'params': {'skip_download': 'm3u8'}
+ }, {
+ # Shows
+ 'url': 'https://www.mzaalo.com/play/original/93d42b2b-f373-4c2d-bca4-997412cb069d/Modi-Season-2-CM-TO-PM/Episode-1:Decision,-Not-Promises',
+ 'info_dict': {
+ 'id': '93d42b2b-f373-4c2d-bca4-997412cb069d',
+ 'title': 'Episode 1:Decision, Not Promises',
+ 'ext': 'mp4',
+ 'description': 'md5:16f76058432a54774fbb2561a1955652',
+ 'thumbnails': 'count:22',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 2040.0,
+ 'language': 'hin',
+ 'categories': ['Drama'],
+ 'age_limit': 13,
+ },
+ 'params': {'skip_download': 'm3u8'}
+ }, {
+ # Streams/Clips
+ 'url': 'https://www.mzaalo.com/play/clip/83cdbcb5-400a-42f1-a1d2-459053cfbda5/Manto-Ki-Kahaaniya',
+ 'info_dict': {
+ 'id': '83cdbcb5-400a-42f1-a1d2-459053cfbda5',
+ 'title': 'Manto Ki Kahaaniya',
+ 'ext': 'mp4',
+ 'description': 'md5:c3c5f1d05f0fd1bfcb05b673d1cc9f2f',
+ 'thumbnails': 'count:3',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 1937.0,
+ 'language': 'hin',
+ },
+ 'params': {'skip_download': 'm3u8'}
+ }, {
+ 'url': 'https://mzaalo.com/watch/MOVIE/389c892d-0b65-4019-bf73-d4edcb1c014f/Chalo-Dilli',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id, type_ = self._match_valid_url(url).group('id', 'type')
+ path = (f'partner/streamurl?&assetId={video_id}&getClipDetails=YES' if type_ == 'clip'
+ else f'api/v2/player/details?assetType={type_.upper()}&assetId={video_id}')
+ data = self._download_json(
+ f'https://production.mzaalo.com/platform/{path}', video_id, headers={
+ 'Ocp-Apim-Subscription-Key': '1d0caac2702049b89a305929fdf4cbae',
+ })['data']
+
+ formats = self._extract_m3u8_formats(data['streamURL'], video_id)
+
+ subtitles = {}
+ for subs_lang, subs_url in traverse_obj(data, ('subtitles', {dict.items}, ...)):
+ if url_or_none(subs_url):
+ subtitles[subs_lang] = [{'url': subs_url, 'ext': 'vtt'}]
+
+ lang = traverse_obj(data, ('language', {str.lower}))
+ for f in formats:
+ f['language'] = lang
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ **traverse_obj(data, {
+ 'title': ('title', {str}),
+ 'description': ('description', {str}),
+ 'duration': ('duration', {parse_duration}),
+ 'age_limit': ('maturity_rating', {parse_age_limit}),
+ 'thumbnails': ('images', ..., {'url': {url_or_none}}),
+ 'categories': ('genre', ..., {str}),
+ }),
+ }
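
The subtitle loop above only keeps entries whose URL survives url_or_none. A flattened equivalent, with a plain scheme check standing in for that helper:

    def collect_subtitles(data):
        subtitles = {}
        for lang, subs_url in (data.get('subtitles') or {}).items():
            if isinstance(subs_url, str) and subs_url.startswith(('http://', 'https://')):
                subtitles.setdefault(lang, []).append({'url': subs_url, 'ext': 'vtt'})
        return subtitles

    assert collect_subtitles({'subtitles': {'en': 'https://cdn.example/en.vtt', 'hi': ''}}) \
        == {'en': [{'url': 'https://cdn.example/en.vtt', 'ext': 'vtt'}]}
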
diff --git a/hypervideo_dl/extractor/naver.py b/hypervideo_dl/extractor/naver.py
index e2e6e97..d79caf5 100644
--- a/hypervideo_dl/extractor/naver.py
+++ b/hypervideo_dl/extractor/naver.py
@@ -21,6 +21,23 @@ from ..utils import (
class NaverBaseIE(InfoExtractor):
_CAPTION_EXT_RE = r'\.(?:ttml|vtt)'
+ @staticmethod # NB: Used in VLiveWebArchiveIE, WeverseIE
+ def process_subtitles(vod_data, process_url):
+ ret = {'subtitles': {}, 'automatic_captions': {}}
+ for caption in traverse_obj(vod_data, ('captions', 'list', ...)):
+ caption_url = caption.get('source')
+ if not caption_url:
+ continue
+ type_ = 'automatic_captions' if caption.get('type') == 'auto' else 'subtitles'
+ lang = caption.get('locale') or join_nonempty('language', 'country', from_dict=caption) or 'und'
+ if caption.get('type') == 'fan':
+ lang += '_fan%d' % next(i for i in itertools.count(1) if f'{lang}_fan{i}' not in ret[type_])
+ ret[type_].setdefault(lang, []).extend({
+ 'url': sub_url,
+ 'name': join_nonempty('label', 'fanName', from_dict=caption, delim=' - '),
+ } for sub_url in process_url(caption_url))
+ return ret
+
def _extract_video_info(self, video_id, vid, key):
video_data = self._download_json(
'http://play.rmcnmv.naver.com/vod/play/v2.0/' + vid,
@@ -79,34 +96,18 @@ class NaverBaseIE(InfoExtractor):
]
return [caption_url]
- automatic_captions = {}
- subtitles = {}
- for caption in get_list('caption'):
- caption_url = caption.get('source')
- if not caption_url:
- continue
- sub_dict = automatic_captions if caption.get('type') == 'auto' else subtitles
- lang = caption.get('locale') or join_nonempty('language', 'country', from_dict=caption) or 'und'
- if caption.get('type') == 'fan':
- lang += '_fan%d' % next(i for i in itertools.count(1) if f'{lang}_fan{i}' not in sub_dict)
- sub_dict.setdefault(lang, []).extend({
- 'url': sub_url,
- 'name': join_nonempty('label', 'fanName', from_dict=caption, delim=' - '),
- } for sub_url in get_subs(caption_url))
-
user = meta.get('user', {})
return {
'id': video_id,
'title': title,
'formats': formats,
- 'subtitles': subtitles,
- 'automatic_captions': automatic_captions,
'thumbnail': try_get(meta, lambda x: x['cover']['source']),
'view_count': int_or_none(meta.get('count')),
'uploader_id': user.get('id'),
'uploader': user.get('name'),
'uploader_url': user.get('url'),
+ **self.process_subtitles(video_data, get_subs),
}
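
process_subtitles() preserves the old fan-subtitle naming: each caption of type 'fan' gets the first free xx_fanN language label so multiple fan tracks never collide. That piece on its own:

    import itertools

    def fan_lang(lang, existing):
        # First N for which f'{lang}_fan{N}' is not already taken.
        return lang + '_fan%d' % next(
            i for i in itertools.count(1) if f'{lang}_fan{i}' not in existing)

    subs = {'ko_fan1': []}
    assert fan_lang('ko', subs) == 'ko_fan2'
    assert fan_lang('en', subs) == 'en_fan1'
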
diff --git a/hypervideo_dl/extractor/nbc.py b/hypervideo_dl/extractor/nbc.py
index 1ea6355..b3c28ab 100644
--- a/hypervideo_dl/extractor/nbc.py
+++ b/hypervideo_dl/extractor/nbc.py
@@ -3,29 +3,34 @@ import json
import re
from .common import InfoExtractor
-from .theplatform import ThePlatformIE
+from .theplatform import ThePlatformIE, default_ns
from .adobepass import AdobePassIE
from ..compat import compat_urllib_parse_unquote
+from ..networking import HEADRequest
from ..utils import (
ExtractorError,
+ RegexNotFoundError,
+ UserNotLive,
+ clean_html,
+ determine_ext,
+ float_or_none,
int_or_none,
+ mimetype2ext,
parse_age_limit,
parse_duration,
- RegexNotFoundError,
+ remove_end,
smuggle_url,
- str_or_none,
traverse_obj,
try_get,
- unified_strdate,
+ unescapeHTML,
unified_timestamp,
update_url_query,
url_basename,
- variadic,
)
class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE
- _VALID_URL = r'https?(?P<permalink>://(?:www\.)?nbc\.com/(?:classic-tv/)?[^/]+/video/[^/]+/(?P<id>n?\d+))'
+ _VALID_URL = r'https?(?P<permalink>://(?:www\.)?nbc\.com/(?:classic-tv/)?[^/]+/video/[^/]+/(?P<id>(?:NBCE|n)?\d+))'
_TESTS = [
{
@@ -38,10 +43,18 @@ class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE
'timestamp': 1424246400,
'upload_date': '20150218',
'uploader': 'NBCU-COM',
+ 'episode': 'Jimmy Fallon Surprises Fans at Ben & Jerry\'s',
+ 'episode_number': 86,
+ 'season': 'Season 2',
+ 'season_number': 2,
+ 'series': 'Tonight Show: Jimmy Fallon',
+ 'duration': 237.0,
+ 'chapters': 'count:1',
+ 'tags': 'count:4',
+ 'thumbnail': r're:https?://.+\.jpg',
},
'params': {
- # m3u8 download
- 'skip_download': True,
+ 'skip_download': 'm3u8',
},
},
{
@@ -55,11 +68,7 @@ class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE
'upload_date': '20141206',
'uploader': 'NBCU-COM',
},
- 'params': {
- # m3u8 download
- 'skip_download': True,
- },
- 'skip': 'Only works from US',
+ 'skip': 'page not found',
},
{
# HLS streams requires the 'hdnea3' cookie
@@ -73,10 +82,58 @@ class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE
'upload_date': '20090315',
'uploader': 'NBCU-COM',
},
+ 'skip': 'page not found',
+ },
+ {
+ # manifest url does not have extension
+ 'url': 'https://www.nbc.com/the-golden-globe-awards/video/oprah-winfrey-receives-cecil-b-de-mille-award-at-the-2018-golden-globes/3646439',
+ 'info_dict': {
+ 'id': '3646439',
+ 'ext': 'mp4',
+ 'title': 'Oprah Winfrey Receives Cecil B. de Mille Award at the 2018 Golden Globes',
+ 'episode': 'Oprah Winfrey Receives Cecil B. de Mille Award at the 2018 Golden Globes',
+ 'episode_number': 1,
+ 'season': 'Season 75',
+ 'season_number': 75,
+ 'series': 'The Golden Globe Awards',
+ 'description': 'Oprah Winfrey receives the Cecil B. de Mille Award at the 75th Annual Golden Globe Awards.',
+ 'uploader': 'NBCU-COM',
+ 'upload_date': '20180107',
+ 'timestamp': 1515312000,
+ 'duration': 570.0,
+ 'tags': 'count:8',
+ 'thumbnail': r're:https?://.+\.jpg',
+ 'chapters': 'count:1',
+ },
'params': {
- 'skip_download': True,
+ 'skip_download': 'm3u8',
+ },
+ },
+ {
+ # new video_id format
+ 'url': 'https://www.nbc.com/quantum-leap/video/bens-first-leap-nbcs-quantum-leap/NBCE125189978',
+ 'info_dict': {
+ 'id': 'NBCE125189978',
+ 'ext': 'mp4',
+ 'title': 'Ben\'s First Leap | NBC\'s Quantum Leap',
+ 'description': 'md5:a82762449b7ec4bb83291a7b355ebf8e',
+ 'uploader': 'NBCU-COM',
+ 'series': 'Quantum Leap',
+ 'season': 'Season 1',
+ 'season_number': 1,
+ 'episode': 'Ben\'s First Leap | NBC\'s Quantum Leap',
+ 'episode_number': 1,
+ 'duration': 170.171,
+ 'chapters': [],
+ 'timestamp': 1663956155,
+ 'upload_date': '20220923',
+ 'tags': 'count:10',
+ 'age_limit': 0,
+ 'thumbnail': r're:https?://.+\.jpg',
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
},
- 'skip': 'Only works from US',
},
{
'url': 'https://www.nbc.com/classic-tv/charles-in-charge/video/charles-in-charge-pilot/n3310',
@@ -136,6 +193,7 @@ class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE
query = {
'mbr': 'true',
'manifest': 'm3u',
+ 'switch': 'HLSServiceSecure',
}
video_id = video_data['mpxGuid']
tp_path = 'NnzsPC/media/guid/%s/%s' % (video_data.get('mpxAccountId') or '2410887629', video_id)
@@ -599,32 +657,54 @@ class NBCStationsIE(InfoExtractor):
_TESTS = [{
'url': 'https://www.nbclosangeles.com/news/local/large-structure-fire-in-downtown-la-prompts-smoke-odor-advisory/2968618/',
- 'md5': '462041d91bd762ef5a38b7d85d6dc18f',
'info_dict': {
'id': '2968618',
'ext': 'mp4',
'title': 'Large Structure Fire in Downtown LA Prompts Smoke Odor Advisory',
- 'description': None,
+ 'description': 'md5:417ed3c2d91fe9d301e6db7b0942f182',
+ 'duration': 112.513,
'timestamp': 1661135892,
- 'upload_date': '20220821',
+ 'upload_date': '20220822',
'uploader': 'NBC 4',
- 'uploader_id': 'KNBC',
+ 'channel_id': 'KNBC',
'channel': 'nbclosangeles',
},
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
}, {
'url': 'https://www.telemundoarizona.com/responde/huracan-complica-reembolso-para-televidente-de-tucson/2247002/',
- 'md5': '0917dcf7885be1023a9220630d415f67',
'info_dict': {
'id': '2247002',
'ext': 'mp4',
- 'title': 'Huracán complica que televidente de Tucson reciba reembolso',
+ 'title': 'Huracán complica que televidente de Tucson reciba reembolso',
'description': 'md5:af298dc73aab74d4fca6abfb12acb6cf',
+ 'duration': 172.406,
'timestamp': 1660886507,
'upload_date': '20220819',
'uploader': 'Telemundo Arizona',
- 'uploader_id': 'KTAZ',
+ 'channel_id': 'KTAZ',
'channel': 'telemundoarizona',
},
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
+ }, {
+ # direct mp4 link
+ 'url': 'https://www.nbcboston.com/weather/video-weather/highs-near-freezing-in-boston-on-wednesday/2961135/',
+ 'md5': '9bf8c41dc7abbb75b1a44f1491a4cc85',
+ 'info_dict': {
+ 'id': '2961135',
+ 'ext': 'mp4',
+ 'title': 'Highs Near Freezing in Boston on Wednesday',
+ 'description': 'md5:3ec486609a926c99f00a3512e6c0e85b',
+ 'duration': 235.669,
+ 'timestamp': 1675268656,
+ 'upload_date': '20230201',
+ 'uploader': '',
+ 'channel_id': 'WBTS',
+ 'channel': 'nbcboston',
+ },
}]
_RESOLUTIONS = {
@@ -640,51 +720,42 @@ class NBCStationsIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
nbc_data = self._search_json(
- r'<script>var\s*nbc\s*=', webpage, 'NBC JSON data', video_id)
+ r'<script>\s*var\s+nbc\s*=', webpage, 'NBC JSON data', video_id)
pdk_acct = nbc_data.get('pdkAcct') or 'Yh1nAC'
fw_ssid = traverse_obj(nbc_data, ('video', 'fwSSID'))
- fw_network_id = traverse_obj(nbc_data, ('video', 'fwNetworkID'), default='382114')
- video_data = self._parse_json(self._html_search_regex(
- r'data-videos="([^"]*)"', webpage, 'video data', default='{}'), video_id)
- video_data = variadic(video_data)[0]
- video_data.update(self._parse_json(self._html_search_regex(
- r'data-meta="([^"]*)"', webpage, 'metadata', default='{}'), video_id))
+ video_data = self._search_json(
+ r'data-videos="\[', webpage, 'video data', video_id, default={}, transform_source=unescapeHTML)
+ video_data.update(self._search_json(
+ r'data-meta="', webpage, 'metadata', video_id, default={}, transform_source=unescapeHTML))
+ if not video_data:
+ raise ExtractorError('No video metadata found in webpage', expected=True)
- formats = []
+ info, formats = {}, []
+ is_live = int_or_none(video_data.get('mpx_is_livestream')) == 1
+ query = {
+ 'formats': 'MPEG-DASH none,M3U none,MPEG-DASH none,MPEG4,MP3',
+ 'format': 'SMIL',
+ 'fwsitesection': fw_ssid,
+ 'fwNetworkID': traverse_obj(nbc_data, ('video', 'fwNetworkID'), default='382114'),
+ 'pprofile': 'ots_desktop_html',
+ 'sensitive': 'false',
+ 'w': '1920',
+ 'h': '1080',
+ 'mode': 'LIVE' if is_live else 'on-demand',
+ 'vpaid': 'script',
+ 'schema': '2.0',
+ 'sdk': 'PDK 6.1.3',
+ }
- if video_data.get('mpx_is_livestream') == '1':
- live = True
- player_id = traverse_obj(
- video_data, 'mpx_m3upid', ('video', 'meta', 'mpx_m3upid'), 'mpx_pid',
- ('video', 'meta', 'mpx_pid'), 'pid_streaming_web_medium')
- query = {
- 'mbr': 'true',
- 'assetTypes': 'LegacyRelease',
- 'fwsitesection': fw_ssid,
- 'fwNetworkID': fw_network_id,
- 'pprofile': 'ots_desktop_html',
- 'sensitive': 'false',
- 'w': '1920',
- 'h': '1080',
- 'rnd': '1660303',
- 'mode': 'LIVE',
- 'format': 'SMIL',
- 'tracking': 'true',
- 'formats': 'M3U+none,MPEG-DASH+none,MPEG4,MP3',
- 'vpaid': 'script',
- 'schema': '2.0',
- 'SDK': 'PDK+6.1.3',
- }
- info = {
- 'title': f'{channel} livestream',
- }
+ if is_live:
+ player_id = traverse_obj(video_data, ((None, ('video', 'meta')), (
+ 'mpx_m3upid', 'mpx_pid', 'pid_streaming_web_medium')), get_all=False)
+ info['title'] = f'{channel} livestream'
else:
- live = False
- player_id = traverse_obj(
- video_data, ('video', 'meta', 'pid_streaming_web_high'), 'pid_streaming_web_high',
- ('video', 'meta', 'mpx_pid'), 'mpx_pid')
+ player_id = traverse_obj(video_data, (
+ (None, ('video', 'meta')), ('pid_streaming_web_high', 'mpx_pid')), get_all=False)
date_string = traverse_obj(video_data, 'date_string', 'date_gmt')
if date_string:
@@ -692,63 +763,70 @@ class NBCStationsIE(InfoExtractor):
r'datetime="([^"]+)"', date_string, 'date string', fatal=False)
else:
date_string = traverse_obj(
- nbc_data, ('dataLayer', 'adobe', 'prop70'), ('dataLayer', 'adobe', 'eVar70'),
- ('dataLayer', 'adobe', 'eVar59'))
+ nbc_data, ('dataLayer', 'adobe', ('prop70', 'eVar70', 'eVar59')), get_all=False)
- video_url = traverse_obj(video_data, ('video', 'meta', 'mp4_url'), 'mp4_url')
+ video_url = traverse_obj(video_data, ((None, ('video', 'meta')), 'mp4_url'), get_all=False)
if video_url:
- height = url_basename(video_url).split('-')[1].split('p')[0]
+ ext = determine_ext(video_url)
+ height = self._search_regex(r'\d+-(\d+)p', url_basename(video_url), 'height', default=None)
formats.append({
'url': video_url,
- 'ext': 'mp4',
+ 'ext': ext,
'width': int_or_none(self._RESOLUTIONS.get(height)),
'height': int_or_none(height),
- 'format_id': f'http-{height}',
+ 'format_id': f'http-{ext}',
})
- query = {
- 'mbr': 'true',
- 'assetTypes': 'LegacyRelease',
- 'fwsitesection': fw_ssid,
- 'fwNetworkID': fw_network_id,
- 'format': 'redirect',
- 'manifest': 'm3u',
- 'Tracking': 'true',
- 'Embedded': 'true',
- 'formats': 'MPEG4',
- }
- info = {
- 'title': video_data.get('title') or traverse_obj(
- nbc_data, ('dataLayer', 'contenttitle'), ('dataLayer', 'title'),
- ('dataLayer', 'adobe', 'prop22'), ('dataLayer', 'id')),
- 'description': traverse_obj(video_data, 'summary', 'excerpt', 'video_hero_text'),
- 'upload_date': str_or_none(unified_strdate(date_string)),
- 'timestamp': int_or_none(unified_timestamp(date_string)),
- }
-
- if not player_id:
- raise ExtractorError(
- 'No video player ID or livestream player ID found in webpage', expected=True)
-
- headers = {'Origin': f'https://www.{channel}.com'}
- manifest, urlh = self._download_webpage_handle(
- f'https://link.theplatform.com/s/{pdk_acct}/{player_id}', video_id,
- headers=headers, query=query, note='Downloading manifest')
- if live:
- manifest_url = self._search_regex(r'<video src="([^"]*)', manifest, 'manifest URL')
- else:
- manifest_url = urlh.geturl()
+ info.update({
+ 'title': video_data.get('title') or traverse_obj(nbc_data, (
+ 'dataLayer', (None, 'adobe'), ('contenttitle', 'title', 'prop22')), get_all=False),
+ 'description':
+ traverse_obj(video_data, 'summary', 'excerpt', 'video_hero_text')
+ or clean_html(traverse_obj(nbc_data, ('dataLayer', 'summary'))),
+ 'timestamp': unified_timestamp(date_string),
+ })
+
+ smil = None
+ if player_id and fw_ssid:
+ smil = self._download_xml(
+ f'https://link.theplatform.com/s/{pdk_acct}/{player_id}', video_id,
+ note='Downloading SMIL data', query=query, fatal=is_live)
+ subtitles = self._parse_smil_subtitles(smil, default_ns) if smil else {}
+ for video in smil.findall(self._xpath_ns('.//video', default_ns)) if smil else []:
+ info['duration'] = float_or_none(remove_end(video.get('dur'), 'ms'), 1000)
+ video_src_url = video.get('src')
+ ext = mimetype2ext(video.get('type'), default=determine_ext(video_src_url))
+ if ext == 'm3u8':
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(
+ video_src_url, video_id, 'mp4', m3u8_id='hls', fatal=is_live,
+ live=is_live, errnote='No HLS formats found')
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=subtitles)
+ elif video_src_url:
+ formats.append({
+ 'url': video_src_url,
+ 'format_id': f'https-{ext}',
+ 'ext': ext,
+ 'width': int_or_none(video.get('width')),
+ 'height': int_or_none(video.get('height')),
+ })
- formats.extend(self._extract_m3u8_formats(
- manifest_url, video_id, 'mp4', headers=headers, m3u8_id='hls',
- fatal=live, live=live, errnote='No HLS formats found'))
+ if not formats:
+ self.raise_no_formats('No video content found in webpage', expected=True)
+ elif is_live:
+ try:
+ self._request_webpage(
+ HEADRequest(formats[0]['url']), video_id, note='Checking live status')
+ except ExtractorError:
+ raise UserNotLive(video_id=channel)
return {
- 'id': str_or_none(video_id),
+ 'id': video_id,
'channel': channel,
- 'uploader': str_or_none(nbc_data.get('on_air_name')),
- 'uploader_id': str_or_none(nbc_data.get('callLetters')),
+ 'channel_id': nbc_data.get('callLetters'),
+ 'uploader': nbc_data.get('on_air_name'),
'formats': formats,
- 'is_live': live,
+ 'subtitles': subtitles,
+ 'is_live': is_live,
**info,
}
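
In the SMIL branch above, <video dur="...ms"> carries milliseconds, hence remove_end(video.get('dur'), 'ms') followed by float_or_none(..., 1000). An equivalent helper, checked against the 112.513-second duration in the first NBCStations test:

    def smil_duration(dur):
        if not dur:
            return None
        if dur.endswith('ms'):
            dur = dur[:-len('ms')]
        try:
            return float(dur) / 1000
        except ValueError:
            return None

    assert smil_duration('112513ms') == 112.513
    assert smil_duration(None) is None
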
diff --git a/hypervideo_dl/extractor/nebula.py b/hypervideo_dl/extractor/nebula.py
index 861fcb1..4f3e691 100644
--- a/hypervideo_dl/extractor/nebula.py
+++ b/hypervideo_dl/extractor/nebula.py
@@ -1,13 +1,11 @@
import itertools
import json
-import time
-import urllib.error
-import urllib.parse
from .common import InfoExtractor
-from ..utils import ExtractorError, parse_iso8601, try_get
+from ..networking.exceptions import HTTPError
+from ..utils import ExtractorError, make_archive_id, parse_iso8601, remove_start
-_BASE_URL_RE = r'https?://(?:www\.)?(?:watchnebula\.com|nebula\.app|nebula\.tv)'
+_BASE_URL_RE = r'https?://(?:www\.|beta\.)?(?:watchnebula\.com|nebula\.app|nebula\.tv)'
class NebulaBaseIE(InfoExtractor):
@@ -15,11 +13,10 @@ class NebulaBaseIE(InfoExtractor):
_nebula_api_token = None
_nebula_bearer_token = None
- _zype_access_token = None
def _perform_nebula_auth(self, username, password):
if not username or not password:
- self.raise_login_required()
+ self.raise_login_required(method='password')
data = json.dumps({'email': username, 'password': password}).encode('utf8')
response = self._download_json(
@@ -33,38 +30,10 @@ class NebulaBaseIE(InfoExtractor):
note='Logging in to Nebula with supplied credentials',
errnote='Authentication failed or rejected')
if not response or not response.get('key'):
- self.raise_login_required()
-
- # save nebula token as cookie
- self._set_cookie(
- 'nebula.app', 'nebula-auth',
- urllib.parse.quote(
- json.dumps({
- "apiToken": response["key"],
- "isLoggingIn": False,
- "isLoggingOut": False,
- }, separators=(",", ":"))),
- expire_time=int(time.time()) + 86400 * 365,
- )
+ self.raise_login_required(method='password')
return response['key']
- def _retrieve_nebula_api_token(self, username=None, password=None):
- """
- Check cookie jar for valid token. Try to authenticate using credentials if no valid token
- can be found in the cookie jar.
- """
- nebula_cookies = self._get_cookies('https://nebula.app')
- nebula_cookie = nebula_cookies.get('nebula-auth')
- if nebula_cookie:
- self.to_screen('Authenticating to Nebula with token from cookie jar')
- nebula_cookie_value = urllib.parse.unquote(nebula_cookie.value)
- nebula_api_token = self._parse_json(nebula_cookie_value, None).get('apiToken')
- if nebula_api_token:
- return nebula_api_token
-
- return self._perform_nebula_auth(username, password)
-
def _call_nebula_api(self, url, video_id=None, method='GET', auth_type='api', note=''):
assert method in ('GET', 'POST',)
assert auth_type in ('api', 'bearer',)
@@ -79,7 +48,7 @@ class NebulaBaseIE(InfoExtractor):
return inner_call()
except ExtractorError as exc:
# if 401 or 403, attempt credential re-auth and retry
- if exc.cause and isinstance(exc.cause, urllib.error.HTTPError) and exc.cause.code in (401, 403):
+ if exc.cause and isinstance(exc.cause, HTTPError) and exc.cause.status in (401, 403):
self.to_screen(f'Reauthenticating to Nebula and retrying, because last {auth_type} call resulted in error {exc.cause.status}')
self._perform_login()
return inner_call()
@@ -95,35 +64,25 @@ class NebulaBaseIE(InfoExtractor):
note='Authorizing to Nebula')
return response['token']
- def _fetch_zype_access_token(self):
- """
- Get a Zype access token, which is required to access video streams -- in our case: to
- generate video URLs.
- """
- user_object = self._call_nebula_api('https://api.watchnebula.com/api/v1/auth/user/', note='Retrieving Zype access token')
-
- access_token = try_get(user_object, lambda x: x['zype_auth_info']['access_token'], str)
- if not access_token:
- if try_get(user_object, lambda x: x['is_subscribed'], bool):
- # TODO: Reimplement the same Zype token polling the Nebula frontend implements
- # see https://github.com/ytdl-org/youtube-dl/pull/24805#issuecomment-749231532
- raise ExtractorError(
- 'Unable to extract Zype access token from Nebula API authentication endpoint. '
- 'Open an arbitrary video in a browser with this account to generate a token',
- expected=True)
- raise ExtractorError('Unable to extract Zype access token from Nebula API authentication endpoint')
- return access_token
+ def _fetch_video_formats(self, slug):
+ stream_info = self._call_nebula_api(f'https://content.api.nebula.app/video/{slug}/stream/',
+ video_id=slug,
+ auth_type='bearer',
+ note='Fetching video stream info')
+ manifest_url = stream_info['manifest']
+ return self._extract_m3u8_formats_and_subtitles(manifest_url, slug, 'mp4')
def _build_video_info(self, episode):
- zype_id = episode['zype_id']
- zype_video_url = f'https://player.zype.com/embed/{zype_id}.html?access_token={self._zype_access_token}'
+ fmts, subs = self._fetch_video_formats(episode['slug'])
channel_slug = episode['channel_slug']
+ channel_title = episode['channel_title']
+ zype_id = episode.get('zype_id')
return {
- 'id': episode['zype_id'],
+ 'id': remove_start(episode['id'], 'video_episode:'),
'display_id': episode['slug'],
- '_type': 'url_transparent',
- 'ie_key': 'Zype',
- 'url': zype_video_url,
+ 'formats': fmts,
+ 'subtitles': subs,
+ 'webpage_url': f'https://nebula.tv/{episode["slug"]}',
'title': episode['title'],
'description': episode['description'],
'timestamp': parse_iso8601(episode['published_at']),
@@ -133,30 +92,32 @@ class NebulaBaseIE(InfoExtractor):
'height': key,
} for key, tn in episode['assets']['thumbnail'].items()],
'duration': episode['duration'],
- 'channel': episode['channel_title'],
+ 'channel': channel_title,
'channel_id': channel_slug,
- 'channel_url': f'https://nebula.app/{channel_slug}',
- 'uploader': episode['channel_title'],
+ 'channel_url': f'https://nebula.tv/{channel_slug}',
+ 'uploader': channel_title,
'uploader_id': channel_slug,
- 'uploader_url': f'https://nebula.app/{channel_slug}',
- 'series': episode['channel_title'],
- 'creator': episode['channel_title'],
+ 'uploader_url': f'https://nebula.tv/{channel_slug}',
+ 'series': channel_title,
+ 'creator': channel_title,
+ 'extractor_key': NebulaIE.ie_key(),
+ 'extractor': NebulaIE.IE_NAME,
+ '_old_archive_ids': [make_archive_id(NebulaIE, zype_id)] if zype_id else None,
}
def _perform_login(self, username=None, password=None):
- self._nebula_api_token = self._retrieve_nebula_api_token(username, password)
+ self._nebula_api_token = self._perform_nebula_auth(username, password)
self._nebula_bearer_token = self._fetch_nebula_bearer_token()
- self._zype_access_token = self._fetch_zype_access_token()
class NebulaIE(NebulaBaseIE):
_VALID_URL = rf'{_BASE_URL_RE}/videos/(?P<id>[-\w]+)'
_TESTS = [
{
- 'url': 'https://nebula.app/videos/that-time-disney-remade-beauty-and-the-beast',
+ 'url': 'https://nebula.tv/videos/that-time-disney-remade-beauty-and-the-beast',
'md5': '14944cfee8c7beeea106320c47560efc',
'info_dict': {
- 'id': '5c271b40b13fd613090034fd',
+ 'id': '84ed544d-4afd-4723-8cd5-2b95261f0abf',
'ext': 'mp4',
'title': 'That Time Disney Remade Beauty and the Beast',
'description': 'Note: this video was originally posted on YouTube with the sponsor read included. We weren’t able to remove it without reducing video quality, so it’s presented here in its original context.',
@@ -167,47 +128,43 @@ class NebulaIE(NebulaBaseIE):
'uploader': 'Lindsay Ellis',
'uploader_id': 'lindsayellis',
'timestamp': 1533009600,
- 'uploader_url': 'https://nebula.app/lindsayellis',
+ 'uploader_url': 'https://nebula.tv/lindsayellis',
'series': 'Lindsay Ellis',
- 'average_rating': int,
'display_id': 'that-time-disney-remade-beauty-and-the-beast',
- 'channel_url': 'https://nebula.app/lindsayellis',
+ 'channel_url': 'https://nebula.tv/lindsayellis',
'creator': 'Lindsay Ellis',
'duration': 2212,
- 'view_count': int,
'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+\.jpeg?.*',
},
},
{
- 'url': 'https://nebula.app/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore',
+ 'url': 'https://nebula.tv/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore',
'md5': 'd05739cf6c38c09322422f696b569c23',
'info_dict': {
- 'id': '5e7e78171aaf320001fbd6be',
+ 'id': '7e623145-1b44-4ca3-aa0b-ed25a247ea34',
'ext': 'mp4',
'title': 'Landing Craft - How The Allies Got Ashore',
'description': r're:^In this episode we explore the unsung heroes of D-Day, the landing craft.',
'upload_date': '20200327',
'timestamp': 1585348140,
- 'channel': 'Real Engineering',
- 'channel_id': 'realengineering',
- 'uploader': 'Real Engineering',
- 'uploader_id': 'realengineering',
- 'view_count': int,
- 'series': 'Real Engineering',
- 'average_rating': int,
+ 'channel': 'Real Engineering — The Logistics of D-Day',
+ 'channel_id': 'd-day',
+ 'uploader': 'Real Engineering — The Logistics of D-Day',
+ 'uploader_id': 'd-day',
+ 'series': 'Real Engineering — The Logistics of D-Day',
'display_id': 'the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore',
- 'creator': 'Real Engineering',
+ 'creator': 'Real Engineering — The Logistics of D-Day',
'duration': 841,
- 'channel_url': 'https://nebula.app/realengineering',
- 'uploader_url': 'https://nebula.app/realengineering',
+ 'channel_url': 'https://nebula.tv/d-day',
+ 'uploader_url': 'https://nebula.tv/d-day',
'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+\.jpeg?.*',
},
},
{
- 'url': 'https://nebula.app/videos/money-episode-1-the-draw',
+ 'url': 'https://nebula.tv/videos/money-episode-1-the-draw',
'md5': 'ebe28a7ad822b9ee172387d860487868',
'info_dict': {
- 'id': '5e779ebdd157bc0001d1c75a',
+ 'id': 'b96c5714-9e2b-4ec3-b3f1-20f6e89cc553',
'ext': 'mp4',
'title': 'Episode 1: The Draw',
'description': r'contains:There’s free money on offer… if the players can all work together.',
@@ -217,14 +174,12 @@ class NebulaIE(NebulaBaseIE):
'channel_id': 'tom-scott-presents-money',
'uploader': 'Tom Scott Presents: Money',
'uploader_id': 'tom-scott-presents-money',
- 'uploader_url': 'https://nebula.app/tom-scott-presents-money',
+ 'uploader_url': 'https://nebula.tv/tom-scott-presents-money',
'duration': 825,
- 'channel_url': 'https://nebula.app/tom-scott-presents-money',
- 'view_count': int,
+ 'channel_url': 'https://nebula.tv/tom-scott-presents-money',
'series': 'Tom Scott Presents: Money',
'display_id': 'money-episode-1-the-draw',
'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+\.jpeg?.*',
- 'average_rating': int,
'creator': 'Tom Scott Presents: Money',
},
},
@@ -232,10 +187,14 @@ class NebulaIE(NebulaBaseIE):
'url': 'https://watchnebula.com/videos/money-episode-1-the-draw',
'only_matching': True,
},
+ {
+ 'url': 'https://beta.nebula.tv/videos/money-episode-1-the-draw',
+ 'only_matching': True,
+ },
]
def _fetch_video_metadata(self, slug):
- return self._call_nebula_api(f'https://content.watchnebula.com/video/{slug}/',
+ return self._call_nebula_api(f'https://content.api.nebula.app/video/{slug}/',
video_id=slug,
auth_type='bearer',
note='Fetching video meta data')
@@ -251,7 +210,7 @@ class NebulaSubscriptionsIE(NebulaBaseIE):
_VALID_URL = rf'{_BASE_URL_RE}/myshows'
_TESTS = [
{
- 'url': 'https://nebula.app/myshows',
+ 'url': 'https://nebula.tv/myshows',
'playlist_mincount': 1,
'info_dict': {
'id': 'myshows',
@@ -279,7 +238,7 @@ class NebulaChannelIE(NebulaBaseIE):
_VALID_URL = rf'{_BASE_URL_RE}/(?!myshows|videos/)(?P<id>[-\w]+)'
_TESTS = [
{
- 'url': 'https://nebula.app/tom-scott-presents-money',
+ 'url': 'https://nebula.tv/tom-scott-presents-money',
'info_dict': {
'id': 'tom-scott-presents-money',
'title': 'Tom Scott Presents: Money',
@@ -287,13 +246,13 @@ class NebulaChannelIE(NebulaBaseIE):
},
'playlist_count': 5,
}, {
- 'url': 'https://nebula.app/lindsayellis',
+ 'url': 'https://nebula.tv/lindsayellis',
'info_dict': {
'id': 'lindsayellis',
'title': 'Lindsay Ellis',
'description': 'Enjoy these hottest of takes on Disney, Transformers, and Musicals.',
},
- 'playlist_mincount': 100,
+ 'playlist_mincount': 2,
},
]
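
The retry logic in _call_nebula_api() now keys off HTTPError.status from the new networking layer. In the extractor the error arrives wrapped as ExtractorError.cause; the sketch below flattens that wrapping to show just the 401/403-triggered re-auth:

    class HTTPError(Exception):  # stand-in for networking.exceptions.HTTPError
        def __init__(self, status):
            self.status = status

    def call_with_reauth(call, reauth):
        try:
            return call()
        except HTTPError as err:
            if err.status in (401, 403):
                reauth()  # corresponds to self._perform_login()
                return call()
            raise

    attempts = []
    def flaky():
        attempts.append(1)
        if len(attempts) == 1:
            raise HTTPError(401)
        return 'ok'

    assert call_with_reauth(flaky, lambda: None) == 'ok'
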
diff --git a/hypervideo_dl/extractor/nekohacker.py b/hypervideo_dl/extractor/nekohacker.py
new file mode 100644
index 0000000..e10ffe9
--- /dev/null
+++ b/hypervideo_dl/extractor/nekohacker.py
@@ -0,0 +1,217 @@
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ determine_ext,
+ extract_attributes,
+ get_element_by_class,
+ get_element_text_and_html_by_tag,
+ parse_duration,
+ traverse_obj,
+ try_call,
+ url_or_none,
+)
+
+
+class NekoHackerIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?nekohacker\.com/(?P<id>(?!free-dl)[\w-]+)'
+ _TESTS = [{
+ 'url': 'https://nekohacker.com/nekoverse/',
+ 'info_dict': {
+ 'id': 'nekoverse',
+ 'title': 'Nekoverse',
+ },
+ 'playlist': [
+ {
+ 'url': 'https://nekohacker.com/wp-content/uploads/2022/11/01-Spaceship.mp3',
+ 'md5': '44223701ebedba0467ebda4cc07fb3aa',
+ 'info_dict': {
+ 'id': '1712',
+ 'ext': 'mp3',
+ 'title': 'Spaceship',
+ 'thumbnail': 'https://nekohacker.com/wp-content/uploads/2022/11/Nekoverse_Artwork-1024x1024.jpg',
+ 'vcodec': 'none',
+ 'acodec': 'mp3',
+ 'release_date': '20221101',
+ 'album': 'Nekoverse',
+ 'artist': 'Neko Hacker',
+ 'track': 'Spaceship',
+ 'track_number': 1,
+ 'duration': 195.0
+ }
+ },
+ {
+ 'url': 'https://nekohacker.com/wp-content/uploads/2022/11/02-City-Runner.mp3',
+ 'md5': '8f853c71719389d32bbbd3f1a87b3f08',
+ 'info_dict': {
+ 'id': '1713',
+ 'ext': 'mp3',
+ 'title': 'City Runner',
+ 'thumbnail': 'https://nekohacker.com/wp-content/uploads/2022/11/Nekoverse_Artwork-1024x1024.jpg',
+ 'vcodec': 'none',
+ 'acodec': 'mp3',
+ 'release_date': '20221101',
+ 'album': 'Nekoverse',
+ 'artist': 'Neko Hacker',
+ 'track': 'City Runner',
+ 'track_number': 2,
+ 'duration': 148.0
+ }
+ },
+ {
+ 'url': 'https://nekohacker.com/wp-content/uploads/2022/11/03-Nature-Talk.mp3',
+ 'md5': '5a8a8ae852720cee4c0ac95c7d1a7450',
+ 'info_dict': {
+ 'id': '1714',
+ 'ext': 'mp3',
+ 'title': 'Nature Talk',
+ 'thumbnail': 'https://nekohacker.com/wp-content/uploads/2022/11/Nekoverse_Artwork-1024x1024.jpg',
+ 'vcodec': 'none',
+ 'acodec': 'mp3',
+ 'release_date': '20221101',
+ 'album': 'Nekoverse',
+ 'artist': 'Neko Hacker',
+ 'track': 'Nature Talk',
+ 'track_number': 3,
+ 'duration': 174.0
+ }
+ },
+ {
+ 'url': 'https://nekohacker.com/wp-content/uploads/2022/11/04-Crystal-World.mp3',
+ 'md5': 'd8e59a48061764e50d92386a294abd50',
+ 'info_dict': {
+ 'id': '1715',
+ 'ext': 'mp3',
+ 'title': 'Crystal World',
+ 'thumbnail': 'https://nekohacker.com/wp-content/uploads/2022/11/Nekoverse_Artwork-1024x1024.jpg',
+ 'vcodec': 'none',
+ 'acodec': 'mp3',
+ 'release_date': '20221101',
+ 'album': 'Nekoverse',
+ 'artist': 'Neko Hacker',
+ 'track': 'Crystal World',
+ 'track_number': 4,
+ 'duration': 199.0
+ }
+ }
+ ]
+ }, {
+ 'url': 'https://nekohacker.com/susume/',
+ 'info_dict': {
+ 'id': 'susume',
+ 'title': '進め!むじなカンパニー',
+ },
+ 'playlist': [
+ {
+ 'url': 'https://nekohacker.com/wp-content/uploads/2021/01/進め!むじなカンパニー-feat.-六科なじむ-CV_-日高里菜-割戶真友-CV_-金元寿子-軽井沢ユキ-CV_-上坂すみれ-出稼ぎガルシア-CV_-金子彩花-.mp3',
+ 'md5': 'fb13f008aa81f26ba48f91fd2d6186ce',
+ 'info_dict': {
+ 'id': '711',
+ 'ext': 'mp3',
+ 'title': 'md5:1a5fcbc96ca3c3265b1c6f9f79f30fd0',
+ 'thumbnail': 'https://nekohacker.com/wp-content/uploads/2021/01/OP表-1024x1024.png',
+ 'vcodec': 'none',
+ 'acodec': 'mp3',
+ 'release_date': '20210115',
+ 'album': '進め!むじなカンパニー',
+ 'artist': 'Neko Hacker',
+ 'track': 'md5:1a5fcbc96ca3c3265b1c6f9f79f30fd0',
+ 'track_number': 1,
+ 'duration': None
+ }
+ },
+ {
+ 'url': 'https://nekohacker.com/wp-content/uploads/2021/01/むじな-de-なじむ-feat.-六科なじむ-CV_-日高里菜-.mp3',
+ 'md5': '028803f70241df512b7764e73396fdd1',
+ 'info_dict': {
+ 'id': '709',
+ 'ext': 'mp3',
+ 'title': 'むじな de なじむ feat. 六科なじむ (CV: 日高里菜 )',
+ 'thumbnail': 'https://nekohacker.com/wp-content/uploads/2021/01/OP表-1024x1024.png',
+ 'vcodec': 'none',
+ 'acodec': 'mp3',
+ 'release_date': '20210115',
+ 'album': '進め!むじなカンパニー',
+ 'artist': 'Neko Hacker',
+ 'track': 'むじな de なじむ feat. 六科なじむ (CV: 日高里菜 )',
+ 'track_number': 2,
+ 'duration': None
+ }
+ },
+ {
+ 'url': 'https://nekohacker.com/wp-content/uploads/2021/01/進め!むじなカンパニー-instrumental.mp3',
+ 'md5': 'adde9e9a16e1da5e602b579c247d0fb9',
+ 'info_dict': {
+ 'id': '710',
+ 'ext': 'mp3',
+ 'title': '進め!むじなカンパニー (instrumental)',
+ 'thumbnail': 'https://nekohacker.com/wp-content/uploads/2021/01/OP表-1024x1024.png',
+ 'vcodec': 'none',
+ 'acodec': 'mp3',
+ 'release_date': '20210115',
+ 'album': '進め!むじなカンパニー',
+ 'artist': 'Neko Hacker',
+ 'track': '進め!むじなカンパニー (instrumental)',
+ 'track_number': 3,
+ 'duration': None
+ }
+ },
+ {
+ 'url': 'https://nekohacker.com/wp-content/uploads/2021/01/むじな-de-なじむ-instrumental.mp3',
+ 'md5': 'ebb0443039cf5f9ff7fd557ed9b23599',
+ 'info_dict': {
+ 'id': '712',
+ 'ext': 'mp3',
+ 'title': 'むじな de なじむ (instrumental)',
+ 'thumbnail': 'https://nekohacker.com/wp-content/uploads/2021/01/OP表-1024x1024.png',
+ 'vcodec': 'none',
+ 'acodec': 'mp3',
+ 'release_date': '20210115',
+ 'album': '進め!むじなカンパニー',
+ 'artist': 'Neko Hacker',
+ 'track': 'むじな de なじむ (instrumental)',
+ 'track_number': 4,
+ 'duration': None
+ }
+ }
+ ]
+ }]
+
+ def _real_extract(self, url):
+ playlist_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, playlist_id)
+ playlist = get_element_by_class('playlist', webpage)
+
+ if not playlist:
+ iframe = try_call(lambda: get_element_text_and_html_by_tag('iframe', webpage)[1]) or ''
+ iframe_src = url_or_none(extract_attributes(iframe).get('src'))
+ if not iframe_src:
+ raise ExtractorError('No playlist or embed found in webpage')
+ elif re.match(r'https?://(?:\w+\.)?spotify\.com/', iframe_src):
+ raise ExtractorError('Spotify embeds are not supported', expected=True)
+ return self.url_result(url, 'Generic')
+
+ entries = []
+ for track_number, track in enumerate(re.findall(r'(<li[^>]+data-audiopath[^>]+>)', playlist), 1):
+ entry = traverse_obj(extract_attributes(track), {
+ 'url': ('data-audiopath', {url_or_none}),
+ 'ext': ('data-audiopath', {determine_ext}),
+ 'id': 'data-trackid',
+ 'title': 'data-tracktitle',
+ 'track': 'data-tracktitle',
+ 'album': 'data-albumtitle',
+ 'duration': ('data-tracktime', {parse_duration}),
+ 'release_date': ('data-releasedate', {lambda x: re.match(r'\d{8}', x.replace('.', ''))}, 0),
+ 'thumbnail': ('data-albumart', {url_or_none}),
+ })
+ entries.append({
+ **entry,
+ 'track_number': track_number,
+ 'artist': 'Neko Hacker',
+ 'vcodec': 'none',
+ 'acodec': 'mp3' if entry['ext'] == 'mp3' else None,
+ })
+
+ return self.playlist_result(entries, playlist_id, traverse_obj(entries, (0, 'album')))
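
Each playlist <li> in the page carries its track metadata as data-* attributes, which the extractor maps through extract_attributes() plus traverse_obj(). A loose stand-in (a naive attribute regex instead of a real HTML parser):

    import re

    def parse_track(li_tag):
        attrs = dict(re.findall(r'(data-[\w-]+)="([^"]*)"', li_tag))
        return {
            'id': attrs.get('data-trackid'),
            'track': attrs.get('data-tracktitle'),
            'url': attrs.get('data-audiopath'),
            'album': attrs.get('data-albumtitle'),
        }

    li = ('<li data-trackid="1712" data-tracktitle="Spaceship" data-albumtitle="Nekoverse" '
          'data-audiopath="https://nekohacker.com/wp-content/uploads/2022/11/01-Spaceship.mp3">')
    assert parse_track(li)['track'] == 'Spaceship'
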
diff --git a/hypervideo_dl/extractor/neteasemusic.py b/hypervideo_dl/extractor/neteasemusic.py
index 5957098..5b7307b 100644
--- a/hypervideo_dl/extractor/neteasemusic.py
+++ b/hypervideo_dl/extractor/neteasemusic.py
@@ -11,6 +11,7 @@ from random import randint
from .common import InfoExtractor
from ..aes import aes_ecb_encrypt, pkcs7_padding
from ..compat import compat_urllib_parse_urlencode
+from ..networking import Request
from ..utils import (
ExtractorError,
bytes_to_intlist,
@@ -18,7 +19,6 @@ from ..utils import (
float_or_none,
int_or_none,
intlist_to_bytes,
- sanitized_Request,
try_get,
)
@@ -146,8 +146,8 @@ class NetEaseMusicBaseIE(InfoExtractor):
return int(round(ms / 1000.0))
def query_api(self, endpoint, video_id, note):
- req = sanitized_Request('%s%s' % (self._API_BASE, endpoint))
- req.add_header('Referer', self._API_BASE)
+ req = Request('%s%s' % (self._API_BASE, endpoint))
+ req.headers['Referer'] = self._API_BASE
return self._download_json(req, video_id, note)
diff --git a/hypervideo_dl/extractor/netverse.py b/hypervideo_dl/extractor/netverse.py
index 719a9da..ef53e15 100644
--- a/hypervideo_dl/extractor/netverse.py
+++ b/hypervideo_dl/extractor/netverse.py
@@ -1,4 +1,6 @@
-from .common import InfoExtractor
+import itertools
+
+from .common import InfoExtractor, SearchInfoExtractor
from .dailymotion import DailymotionIE
from ..utils import smuggle_url, traverse_obj
@@ -16,6 +18,26 @@ class NetverseBaseIE(InfoExtractor):
f'https://api.netverse.id/medias/api/v2/{self._ENDPOINTS[endpoint]}/{slug}/{season_id}',
display_id or slug, query=query)
+ def _get_comments(self, video_id):
+ last_page_number = None
+ for i in itertools.count(1):
+ comment_data = self._download_json(
+ f'https://api.netverse.id/mediadetails/api/v3/videos/comments/{video_id}',
+ video_id, data=b'', fatal=False, query={'page': i},
+ note=f'Downloading JSON comment metadata page {i}') or {}
+ yield from traverse_obj(comment_data, ('response', 'comments', 'data', ..., {
+ 'id': '_id',
+ 'text': 'comment',
+ 'author_id': 'customer_id',
+ 'author': ('customer', 'name'),
+ 'author_thumbnail': ('customer', 'profile_picture'),
+ }))
+
+ if not last_page_number:
+ last_page_number = traverse_obj(comment_data, ('response', 'comments', 'last_page'))
+ if i >= (last_page_number or 0):
+ break
+
class NetverseIE(NetverseBaseIE):
_VALID_URL = r'https?://(?:\w+\.)?netverse\.id/(?P<type>watch|video)/(?P<display_id>[^/?#&]+)'
@@ -28,7 +50,7 @@ class NetverseIE(NetverseBaseIE):
'ext': 'mp4',
'season': 'Season 2016',
'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
- 'thumbnail': r're:https?://s\d+\.dmcdn\.net/v/T7aV31Y0eGRWBbwkK/x1080',
+ 'thumbnail': r're:https?://s\d+\.dmcdn\.net/v/[^/]+/x1080',
'episode_number': 22,
'episode': 'Episode 22',
'uploader_id': 'x2ir3vq',
@@ -51,7 +73,7 @@ class NetverseIE(NetverseBaseIE):
'ext': 'mp4',
'season': 'Season 2',
'description': 'md5:8a74f70812cca267e19ee0635f0af835',
- 'thumbnail': r're:https?://s\d+\.dmcdn\.net/v/Thwuy1YURicFmGu0v/x1080',
+ 'thumbnail': r're:https?://s\d+\.dmcdn\.net/v/[^/]+/x1080',
'episode_number': 2,
'episode': 'Episode 2',
'view_count': int,
@@ -75,7 +97,7 @@ class NetverseIE(NetverseBaseIE):
'title': 'Tetangga Baru',
'season': 'Season 1',
'description': 'md5:23fcf70e97d461d3029d25d59b2ccfb9',
- 'thumbnail': r're:https?://s\d+\.dmcdn\.net/v/T3Ogm1YEnnyjVKAFF/x1080',
+ 'thumbnail': r're:https?://s\d+\.dmcdn\.net/v/[^/]+/x1080',
'episode_number': 1,
'episode': 'Episode 1',
'timestamp': 1624538169,
@@ -96,7 +118,7 @@ class NetverseIE(NetverseBaseIE):
'info_dict': {
'id': 'x887jzz',
'ext': 'mp4',
- 'thumbnail': r're:https?://s\d+\.dmcdn\.net/v/TfuZ_1Y6PboJ5An_s/x1080',
+ 'thumbnail': r're:https?://s\d+\.dmcdn\.net/v/[^/]+/x1080',
'season': 'Season 1',
'episode_number': 1,
'description': 'md5:d4f627b3e7a3f9acdc55f6cdd5ea41d5',
@@ -114,6 +136,60 @@ class NetverseIE(NetverseBaseIE):
'upload_date': '20220225',
},
'skip': 'This video is geo-blocked in some countries'
+ }, {
+ # video with comments
+ 'url': 'https://netverse.id/video/episode-1-season-2016-ok-food',
+ 'info_dict': {
+ 'id': 'k6hetBPiQMljSxxvAy7',
+ 'ext': 'mp4',
+ 'thumbnail': r're:https?://s\d+\.dmcdn\.net/v/[^/]+/x1080',
+ 'display_id': 'episode-1-season-2016-ok-food',
+ 'like_count': int,
+ 'description': '',
+ 'duration': 1471,
+ 'age_limit': 0,
+ 'timestamp': 1642405848,
+ 'episode_number': 1,
+ 'season': 'Season 2016',
+ 'uploader_id': 'x2ir3vq',
+ 'title': 'Episode 1 - Season 2016 - Ok Food',
+ 'upload_date': '20220117',
+ 'tags': [],
+ 'view_count': int,
+ 'episode': 'Episode 1',
+ 'uploader': 'Net Prime',
+ 'comment_count': int,
+ },
+ 'params': {
+ 'getcomments': True
+ }
+ }, {
+ # video with multiple pages of comments
+ 'url': 'https://netverse.id/video/match-island-eps-1-fix',
+ 'info_dict': {
+ 'id': 'x8aznjc',
+ 'ext': 'mp4',
+ 'like_count': int,
+ 'tags': ['Match-Island', 'Pd00111'],
+ 'display_id': 'match-island-eps-1-fix',
+ 'view_count': int,
+ 'episode': 'Episode 1',
+ 'uploader': 'Net Prime',
+ 'duration': 4070,
+ 'timestamp': 1653068165,
+ 'description': 'md5:e9cf3b480ad18e9c33b999e3494f223f',
+ 'age_limit': 0,
+ 'title': 'Welcome To Match Island',
+ 'upload_date': '20220520',
+ 'episode_number': 1,
+ 'thumbnail': r're:https?://s\d+\.dmcdn\.net/v/[^/]+/x1080',
+ 'uploader_id': 'x2ir3vq',
+ 'season': 'Season 1',
+ 'comment_count': int,
+ },
+ 'params': {
+ 'getcomments': True
+ }
}]
def _real_extract(self, url):
@@ -131,6 +207,7 @@ class NetverseIE(NetverseBaseIE):
'thumbnail': traverse_obj(videos, ('program_detail', 'thumbnail_image')),
'description': traverse_obj(videos, ('program_detail', 'description')),
'episode_number': videos.get('episode_order'),
+ '__post_extractor': self.extract_comments(display_id),
}
@@ -174,3 +251,31 @@ class NetversePlaylistIE(NetverseBaseIE):
self.parse_playlist(playlist_data['response'], playlist_id),
traverse_obj(playlist_data, ('response', 'webseries_info', 'slug')),
traverse_obj(playlist_data, ('response', 'webseries_info', 'title')))
+
+
+class NetverseSearchIE(SearchInfoExtractor):
+ _SEARCH_KEY = 'netsearch'
+
+ _TESTS = [{
+ 'url': 'netsearch10:tetangga',
+ 'info_dict': {
+ 'id': 'tetangga',
+ 'title': 'tetangga',
+ },
+ 'playlist_count': 10,
+ }]
+
+ def _search_results(self, query):
+ last_page = None
+ for i in itertools.count(1):
+ search_data = self._download_json(
+ 'https://api.netverse.id/search/elastic/search', query,
+ query={'q': query, 'page': i}, note=f'Downloading page {i}')
+
+ videos = traverse_obj(search_data, ('response', 'data', ...))
+ for video in videos:
+ yield self.url_result(f'https://netverse.id/video/{video["slug"]}', NetverseIE)
+
+ last_page = last_page or traverse_obj(search_data, ('response', 'lastpage'))
+ if not videos or i >= (last_page or 0):
+ break
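Note: the two paginated loops added above (_get_comments and _search_results) share the same pattern — fetch page i, yield its items, and stop once the server-reported last page has been fetched. A self-contained sketch of that pattern, where fetch() is a hypothetical stand-in for the _download_json call:

import itertools

def fetch(page):  # stub for the comments API; the real code passes query={'page': page}
    return {'response': {'comments': {'last_page': 2, 'data': [{'comment': f'page {page}'}]}}}

def iter_comments():
    last_page = None
    for i in itertools.count(1):
        data = fetch(i) or {}
        comments = data.get('response', {}).get('comments', {})
        yield from comments.get('data', [])
        if last_page is None:
            last_page = comments.get('last_page')
        if i >= (last_page or 0):  # stop once the reported last page is reached
            break

print(list(iter_comments()))  # two stub comments, one per page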
diff --git a/hypervideo_dl/extractor/nfl.py b/hypervideo_dl/extractor/nfl.py
index 29c53d5..cc3f449 100644
--- a/hypervideo_dl/extractor/nfl.py
+++ b/hypervideo_dl/extractor/nfl.py
@@ -1,10 +1,18 @@
+import base64
+import json
import re
+import time
+import uuid
+from .anvato import AnvatoIE
from .common import InfoExtractor
from ..utils import (
+ ExtractorError,
clean_html,
determine_ext,
get_element_by_class,
+ traverse_obj,
+ urlencode_postdata,
)
@@ -54,15 +62,14 @@ class NFLBaseIE(InfoExtractor):
)/
'''
_VIDEO_CONFIG_REGEX = r'<script[^>]+id="[^"]*video-config-[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12}[^"]*"[^>]*>\s*({.+});?\s*</script>'
+ _ANVATO_PREFIX = 'anvato:GXvEgwyJeWem8KCYXfeoHWknwP48Mboj:'
def _parse_video_config(self, video_config, display_id):
video_config = self._parse_json(video_config, display_id)
item = video_config['playlist'][0]
mcp_id = item.get('mcpID')
if mcp_id:
- info = self.url_result(
- 'anvato:GXvEgwyJeWem8KCYXfeoHWknwP48Mboj:' + mcp_id,
- 'Anvato', mcp_id)
+ info = self.url_result(f'{self._ANVATO_PREFIX}{mcp_id}', AnvatoIE, mcp_id)
else:
media_id = item.get('id') or item['entityId']
title = item.get('title')
@@ -157,3 +164,138 @@ class NFLArticleIE(NFLBaseIE):
'nfl-c-article__title', webpage)) or self._html_search_meta(
['og:title', 'twitter:title'], webpage)
return self.playlist_result(entries, display_id, title)
+
+
+class NFLPlusReplayIE(NFLBaseIE):
+ IE_NAME = 'nfl.com:plus:replay'
+ _VALID_URL = r'https?://(?:www\.)?nfl\.com/plus/games/[\w-]+/(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://www.nfl.com/plus/games/giants-at-vikings-2022-post-1/1572108',
+ 'info_dict': {
+ 'id': '1572108',
+ 'ext': 'mp4',
+ 'title': 'New York Giants at Minnesota Vikings',
+ 'description': 'New York Giants play the Minnesota Vikings at U.S. Bank Stadium on January 15, 2023',
+ 'uploader': 'NFL',
+ 'upload_date': '20230116',
+ 'timestamp': 1673864520,
+ 'duration': 7157,
+ 'categories': ['Game Highlights'],
+ 'tags': ['Minnesota Vikings', 'New York Giants', 'Minnesota Vikings vs. New York Giants'],
+ 'thumbnail': r're:^https?://.*\.jpg',
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ return self.url_result(f'{self._ANVATO_PREFIX}{video_id}', AnvatoIE, video_id)
+
+
+class NFLPlusEpisodeIE(NFLBaseIE):
+ IE_NAME = 'nfl.com:plus:episode'
+ _VALID_URL = r'https?://(?:www\.)?nfl\.com/plus/episodes/(?P<id>[\w-]+)'
+ _TESTS = [{
+ 'note': 'premium content',
+ 'url': 'https://www.nfl.com/plus/episodes/kurt-s-qb-insider-conference-championships',
+ 'info_dict': {
+ 'id': '1576832',
+ 'ext': 'mp4',
+ 'title': 'Kurt\'s QB Insider: Conference Championships',
+ 'description': 'md5:944f7fab56f7a37430bf8473f5473857',
+ 'uploader': 'NFL',
+ 'upload_date': '20230127',
+ 'timestamp': 1674782760,
+ 'duration': 730,
+ 'categories': ['Analysis'],
+ 'tags': ['Cincinnati Bengals at Kansas City Chiefs (2022-POST-3)'],
+ 'thumbnail': r're:^https?://.*\.jpg',
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }]
+
+ _CLIENT_DATA = {
+ 'clientKey': '4cFUW6DmwJpzT9L7LrG3qRAcABG5s04g',
+ 'clientSecret': 'CZuvCL49d9OwfGsR',
+ 'deviceId': str(uuid.uuid4()),
+ 'deviceInfo': base64.b64encode(json.dumps({
+ 'model': 'desktop',
+ 'version': 'Chrome',
+ 'osName': 'Windows',
+ 'osVersion': '10.0',
+ }, separators=(',', ':')).encode()).decode(),
+ 'networkType': 'other',
+ 'nflClaimGroupsToAdd': [],
+ 'nflClaimGroupsToRemove': [],
+ }
+ _ACCOUNT_INFO = {}
+ _API_KEY = None
+
+ _TOKEN = None
+ _TOKEN_EXPIRY = 0
+
+ def _get_account_info(self, url, video_id):
+ cookies = self._get_cookies('https://www.nfl.com/')
+ login_token = traverse_obj(cookies, (
+ (f'glt_{self._API_KEY}', f'gig_loginToken_{self._API_KEY}',
+ lambda k, _: k.startswith('glt_') or k.startswith('gig_loginToken_')),
+ {lambda x: x.value}), get_all=False)
+ if not login_token:
+ self.raise_login_required()
+
+ account = self._download_json(
+ 'https://auth-id.nfl.com/accounts.getAccountInfo', video_id,
+ note='Downloading account info', data=urlencode_postdata({
+ 'include': 'profile,data',
+ 'lang': 'en',
+ 'APIKey': self._API_KEY,
+ 'sdk': 'js_latest',
+ 'login_token': login_token,
+ 'authMode': 'cookie',
+ 'pageURL': url,
+ 'sdkBuild': traverse_obj(cookies, (
+ 'gig_canary_ver', {lambda x: x.value.partition('-')[0]}), default='13642'),
+ 'format': 'json',
+ }), headers={'Content-Type': 'application/x-www-form-urlencoded'})
+
+ self._ACCOUNT_INFO = traverse_obj(account, {
+ 'signatureTimestamp': 'signatureTimestamp',
+ 'uid': 'UID',
+ 'uidSignature': 'UIDSignature',
+ })
+
+ if len(self._ACCOUNT_INFO) != 3:
+ raise ExtractorError('Failed to retrieve account info with provided cookies', expected=True)
+
+ def _get_auth_token(self, url, video_id):
+ if not self._ACCOUNT_INFO:
+ self._get_account_info(url, video_id)
+
+ token = self._download_json(
+ 'https://api.nfl.com/identity/v3/token%s' % (
+ '/refresh' if self._ACCOUNT_INFO.get('refreshToken') else ''),
+ video_id, headers={'Content-Type': 'application/json'}, note='Downloading access token',
+ data=json.dumps({**self._CLIENT_DATA, **self._ACCOUNT_INFO}, separators=(',', ':')).encode())
+
+ self._TOKEN = token['accessToken']
+ self._TOKEN_EXPIRY = token['expiresIn']
+ self._ACCOUNT_INFO['refreshToken'] = token['refreshToken']
+
+ def _real_extract(self, url):
+ slug = self._match_id(url)
+
+ if not self._API_KEY:
+ webpage = self._download_webpage(url, slug, fatal=False) or ''
+ self._API_KEY = self._search_regex(
+ r'window\.gigyaApiKey=["\'](\w+)["\'];', webpage, 'API key',
+ default='3_Qa8TkWpIB8ESCBT8tY2TukbVKgO5F6BJVc7N1oComdwFzI7H2L9NOWdm11i_BY9f')
+
+ if not self._TOKEN or self._TOKEN_EXPIRY <= int(time.time()):
+ self._get_auth_token(url, slug)
+
+ video_id = self._download_json(
+ f'https://api.nfl.com/content/v1/videos/episodes/{slug}', slug, headers={
+ 'Authorization': f'Bearer {self._TOKEN}',
+ })['mcpPlaybackId']
+
+ return self.url_result(f'{self._ANVATO_PREFIX}{video_id}', AnvatoIE, video_id)
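Note: _real_extract above only refreshes the bearer token once _TOKEN_EXPIRY has passed; since expiresIn is compared directly against time.time(), it appears to be an absolute epoch timestamp. A hedged sketch of that expiry-gated caching, with fetch_token a stand-in for the /identity/v3/token request:

import time

class TokenCache:
    def __init__(self, fetch_token):
        self._fetch = fetch_token
        self._token, self._expiry = None, 0

    def get(self):
        if not self._token or self._expiry <= int(time.time()):
            data = self._fetch()              # e.g. {'accessToken': ..., 'expiresIn': ...}
            self._token = data['accessToken']
            self._expiry = data['expiresIn']  # treated as absolute epoch seconds, as above
        return self._token

cache = TokenCache(lambda: {'accessToken': 'abc', 'expiresIn': int(time.time()) + 3600})
assert cache.get() == cache.get()  # the second call reuses the cached token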
diff --git a/hypervideo_dl/extractor/nhk.py b/hypervideo_dl/extractor/nhk.py
index 59702b2..fbd6a18 100644
--- a/hypervideo_dl/extractor/nhk.py
+++ b/hypervideo_dl/extractor/nhk.py
@@ -2,11 +2,15 @@ import re
from .common import InfoExtractor
from ..utils import (
+ ExtractorError,
+ int_or_none,
+ join_nonempty,
parse_duration,
traverse_obj,
unescapeHTML,
unified_timestamp,
- urljoin
+ url_or_none,
+ urljoin,
)
@@ -66,7 +70,7 @@ class NhkBaseIE(InfoExtractor):
info.update({
'_type': 'url_transparent',
'ie_key': 'Piksel',
- 'url': 'https://player.piksel.com/v/refid/nhkworld/prefid/' + vod_id,
+ 'url': 'https://movie-s.nhk.or.jp/v/refid/nhkworld/prefid/' + vod_id,
'id': vod_id,
})
else:
@@ -93,6 +97,19 @@ class NhkVodIE(NhkBaseIE):
# Content available only for a limited period of time. Visit
# https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples.
_TESTS = [{
+ 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2061601/',
+ 'info_dict': {
+ 'id': 'yd8322ch',
+ 'ext': 'mp4',
+ 'description': 'md5:109c8b05d67a62d0592f2b445d2cd898',
+ 'title': 'GRAND SUMO Highlights - [Recap] May Tournament Day 1 (Opening Day)',
+ 'upload_date': '20230514',
+ 'timestamp': 1684083791,
+ 'series': 'GRAND SUMO Highlights',
+ 'episode': '[Recap] May Tournament Day 1 (Opening Day)',
+ 'thumbnail': 'https://mz-edge.stream.co.jp/thumbs/aid/t1684084443/4028649.jpg?w=1920&h=1080',
+ },
+ }, {
# video clip
'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999011/',
'md5': '7a90abcfe610ec22a6bfe15bd46b30ca',
@@ -103,6 +120,9 @@ class NhkVodIE(NhkBaseIE):
'description': 'md5:5aee4a9f9d81c26281862382103b0ea5',
'timestamp': 1565965194,
'upload_date': '20190816',
+ 'thumbnail': 'https://mz-edge.stream.co.jp/thumbs/aid/t1567086278/3715195.jpg?w=1920&h=1080',
+ 'series': 'Dining with the Chef',
+ 'episode': 'Chef Saito\'s Family recipe: MENCHI-KATSU',
},
}, {
# audio clip
@@ -113,10 +133,7 @@ class NhkVodIE(NhkBaseIE):
'title': "Japan's Top Inventions - Miniature Video Cameras",
'description': 'md5:07ea722bdbbb4936fdd360b6a480c25b',
},
- 'params': {
- # m3u8 download
- 'skip_download': True,
- },
+ 'skip': '404 Not Found',
}, {
'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2015173/',
'only_matching': True,
@@ -132,7 +149,6 @@ class NhkVodIE(NhkBaseIE):
}, {
# video, alphabetic character in ID #29670
'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999a34/',
- 'only_matching': True,
'info_dict': {
'id': 'qfjay6cg',
'ext': 'mp4',
@@ -141,7 +157,8 @@ class NhkVodIE(NhkBaseIE):
'thumbnail': r're:^https?:/(/[a-z0-9.-]+)+\.jpg\?w=1920&h=1080$',
'upload_date': '20210615',
'timestamp': 1623722008,
- }
+ },
+ 'skip': '404 Not Found',
}]
def _real_extract(self, url):
@@ -152,12 +169,19 @@ class NhkVodProgramIE(NhkBaseIE):
_VALID_URL = r'%s/program%s(?P<id>[0-9a-z]+)(?:.+?\btype=(?P<episode_type>clip|(?:radio|tv)Episode))?' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX)
_TESTS = [{
# video program episodes
+ 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/sumo',
+ 'info_dict': {
+ 'id': 'sumo',
+ 'title': 'GRAND SUMO Highlights',
+ },
+ 'playlist_mincount': 12,
+ }, {
'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway',
'info_dict': {
'id': 'japanrailway',
'title': 'Japan Railway Journal',
},
- 'playlist_mincount': 1,
+ 'playlist_mincount': 12,
}, {
# video program clips
'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway/?type=clip',
@@ -334,3 +358,210 @@ class NhkForSchoolProgramListIE(InfoExtractor):
for x in traverse_obj(bangumi_list, ('part', ..., 'part-video-dasid')) or []]
return self.playlist_result(bangumis, program_id, title, description)
+
+
+class NhkRadiruIE(InfoExtractor):
+ _GEO_COUNTRIES = ['JP']
+ IE_DESC = 'NHK らじる (Radiru/Rajiru)'
+ _VALID_URL = r'https?://www\.nhk\.or\.jp/radio/(?:player/ondemand|ondemand/detail)\.html\?p=(?P<site>[\da-zA-Z]+)_(?P<corner>[\da-zA-Z]+)(?:_(?P<headline>[\da-zA-Z]+))?'
+ _TESTS = [{
+ 'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=0449_01_3853544',
+ 'skip': 'Episode expired on 2023-04-16',
+ 'info_dict': {
+ 'channel': 'NHK-FM',
+ 'description': 'md5:94b08bdeadde81a97df4ec882acce3e9',
+ 'ext': 'm4a',
+ 'id': '0449_01_3853544',
+ 'series': 'ジャズ・トゥナイト',
+ 'thumbnail': 'https://www.nhk.or.jp/prog/img/449/g449.jpg',
+ 'timestamp': 1680969600,
+ 'title': 'ジャズ・トゥナイト NEWジャズ特集',
+ 'upload_date': '20230408',
+ 'release_timestamp': 1680962400,
+ 'release_date': '20230408',
+ 'was_live': True,
+ },
+ }, {
+ # playlist, airs every weekday so it should _hopefully_ be okay forever
+ 'url': 'https://www.nhk.or.jp/radio/ondemand/detail.html?p=0458_01',
+ 'info_dict': {
+ 'id': '0458_01',
+ 'title': 'ベストオブクラシック',
+ 'description': '世界中の上質な演奏会をじっくり堪能する本格派クラシック番組。',
+ 'channel': 'NHK-FM',
+ 'thumbnail': 'https://www.nhk.or.jp/prog/img/458/g458.jpg',
+ },
+ 'playlist_mincount': 3,
+ }, {
+ # one with letters in the id
+ 'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F300_06_3738470',
+ 'note': 'Expires on 2024-03-31',
+ 'info_dict': {
+ 'id': 'F300_06_3738470',
+ 'ext': 'm4a',
+ 'title': '有島武郎「一房のぶどう」',
+ 'description': '朗読:川野一宇(ラジオ深夜便アンカー)\r\n\r\n(2016年12月8日放送「ラジオ深夜便『アンカー朗読シリーズ』」より)',
+ 'channel': 'NHKラジオ第1、NHK-FM',
+ 'timestamp': 1635757200,
+ 'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F300/img/corner/box_109_thumbnail.jpg',
+ 'release_date': '20161207',
+ 'series': 'らじる文庫 by ラジオ深夜便 ',
+ 'release_timestamp': 1481126700,
+ 'upload_date': '20211101',
+ }
+ }, {
+ # news
+ 'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F261_01_3855109',
+ 'skip': 'Expires on 2023-04-17',
+ 'info_dict': {
+ 'id': 'F261_01_3855109',
+ 'ext': 'm4a',
+ 'channel': 'NHKラジオ第1',
+ 'timestamp': 1681635900,
+ 'release_date': '20230416',
+ 'series': 'NHKラジオニュース',
+ 'title': '午後6時のNHKニュース',
+ 'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F261/img/RADIONEWS_640.jpg',
+ 'upload_date': '20230416',
+ 'release_timestamp': 1681635600,
+ },
+ }]
+
+ def _extract_episode_info(self, headline, programme_id, series_meta):
+ episode_id = f'{programme_id}_{headline["headline_id"]}'
+ episode = traverse_obj(headline, ('file_list', 0, {dict}))
+
+ return {
+ **series_meta,
+ 'id': episode_id,
+ 'formats': self._extract_m3u8_formats(episode.get('file_name'), episode_id, fatal=False),
+ 'container': 'm4a_dash', # force fixup, AAC-only HLS
+ 'was_live': True,
+ 'series': series_meta.get('title'),
+ 'thumbnail': url_or_none(headline.get('headline_image')) or series_meta.get('thumbnail'),
+ **traverse_obj(episode, {
+ 'title': 'file_title',
+ 'description': 'file_title_sub',
+ 'timestamp': ('open_time', {unified_timestamp}),
+ 'release_timestamp': ('aa_vinfo4', {lambda x: x.split('_')[0]}, {unified_timestamp}),
+ }),
+ }
+
+ def _real_extract(self, url):
+ site_id, corner_id, headline_id = self._match_valid_url(url).group('site', 'corner', 'headline')
+ programme_id = f'{site_id}_{corner_id}'
+
+ if site_id == 'F261':
+ json_url = 'https://www.nhk.or.jp/s-media/news/news-site/list/v1/all.json'
+ else:
+ json_url = f'https://www.nhk.or.jp/radioondemand/json/{site_id}/bangumi_{programme_id}.json'
+
+ meta = self._download_json(json_url, programme_id)['main']
+
+ series_meta = traverse_obj(meta, {
+ 'title': 'program_name',
+ 'channel': 'media_name',
+ 'thumbnail': (('thumbnail_c', 'thumbnail_p'), {url_or_none}),
+ }, get_all=False)
+
+ if headline_id:
+ return self._extract_episode_info(
+ traverse_obj(meta, (
+ 'detail_list', lambda _, v: v['headline_id'] == headline_id), get_all=False),
+ programme_id, series_meta)
+
+ def entries():
+ for headline in traverse_obj(meta, ('detail_list', ..., {dict})):
+ yield self._extract_episode_info(headline, programme_id, series_meta)
+
+ return self.playlist_result(
+ entries(), programme_id, playlist_description=meta.get('site_detail'), **series_meta)
+
+
+class NhkRadioNewsPageIE(InfoExtractor):
+ _VALID_URL = r'https?://www\.nhk\.or\.jp/radionews/?(?:$|[?#])'
+ _TESTS = [{
+ # airs daily, on-the-hour most hours
+ 'url': 'https://www.nhk.or.jp/radionews/',
+ 'playlist_mincount': 5,
+ 'info_dict': {
+ 'id': 'F261_01',
+ 'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F261/img/RADIONEWS_640.jpg',
+ 'description': 'md5:bf2c5b397e44bc7eb26de98d8f15d79d',
+ 'channel': 'NHKラジオ第1',
+ 'title': 'NHKラジオニュース',
+ }
+ }]
+
+ def _real_extract(self, url):
+ return self.url_result('https://www.nhk.or.jp/radio/ondemand/detail.html?p=F261_01', NhkRadiruIE)
+
+
+class NhkRadiruLiveIE(InfoExtractor):
+ _GEO_COUNTRIES = ['JP']
+ _VALID_URL = r'https?://www\.nhk\.or\.jp/radio/player/\?ch=(?P<id>r[12]|fm)'
+ _TESTS = [{
+ # radio 1, no area specified
+ 'url': 'https://www.nhk.or.jp/radio/player/?ch=r1',
+ 'info_dict': {
+ 'id': 'r1-tokyo',
+ 'title': 're:^NHKネットラジオ第1 東京.+$',
+ 'ext': 'm4a',
+ 'thumbnail': 'https://www.nhk.or.jp/common/img/media/r1-200x200.png',
+ 'live_status': 'is_live',
+ },
+ }, {
+ # radio 2, area specified
+ # (the area doesn't actually matter; r2 is national)
+ 'url': 'https://www.nhk.or.jp/radio/player/?ch=r2',
+ 'params': {'extractor_args': {'nhkradirulive': {'area': ['fukuoka']}}},
+ 'info_dict': {
+ 'id': 'r2-fukuoka',
+ 'title': 're:^NHKネットラジオ第2 福岡.+$',
+ 'ext': 'm4a',
+ 'thumbnail': 'https://www.nhk.or.jp/common/img/media/r2-200x200.png',
+ 'live_status': 'is_live',
+ },
+ }, {
+ # fm, area specified
+ 'url': 'https://www.nhk.or.jp/radio/player/?ch=fm',
+ 'params': {'extractor_args': {'nhkradirulive': {'area': ['sapporo']}}},
+ 'info_dict': {
+ 'id': 'fm-sapporo',
+ 'title': 're:^NHKネットラジオFM 札幌.+$',
+ 'ext': 'm4a',
+ 'thumbnail': 'https://www.nhk.or.jp/common/img/media/fm-200x200.png',
+ 'live_status': 'is_live',
+ }
+ }]
+
+ _NOA_STATION_IDS = {'r1': 'n1', 'r2': 'n2', 'fm': 'n3'}
+
+ def _real_extract(self, url):
+ station = self._match_id(url)
+ area = self._configuration_arg('area', ['tokyo'])[0]
+
+ config = self._download_xml(
+ 'https://www.nhk.or.jp/radio/config/config_web.xml', station, 'Downloading area information')
+ data = config.find(f'.//data//area[.="{area}"]/..')
+
+ if not data:
+ raise ExtractorError('Invalid area. Valid areas are: %s' % ', '.join(
+ [i.text for i in config.findall('.//data//area')]), expected=True)
+
+ noa_info = self._download_json(
+ f'https:{config.find(".//url_program_noa").text}'.format(area=data.find('areakey').text),
+ station, note=f'Downloading {area} station metadata')
+ present_info = traverse_obj(noa_info, ('nowonair_list', self._NOA_STATION_IDS.get(station), 'present'))
+
+ return {
+ 'title': ' '.join(traverse_obj(present_info, (('service', 'area',), 'name', {str}))),
+ 'id': join_nonempty(station, area),
+ 'thumbnails': traverse_obj(present_info, ('service', 'images', ..., {
+ 'url': 'url',
+ 'width': ('width', {int_or_none}),
+ 'height': ('height', {int_or_none}),
+ })),
+ 'formats': self._extract_m3u8_formats(data.find(f'{station}hls').text, station),
+ 'is_live': True,
+ }
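Note: the NhkRadiruIE URLs above encode p=<site>_<corner>[_<headline>] — with a headline present, a single episode is extracted; otherwise the whole corner becomes a playlist. A quick standalone check of the pattern against the first test URL:

import re

RADIRU_RE = re.compile(
    r'https?://www\.nhk\.or\.jp/radio/(?:player/ondemand|ondemand/detail)\.html'
    r'\?p=(?P<site>[\da-zA-Z]+)_(?P<corner>[\da-zA-Z]+)(?:_(?P<headline>[\da-zA-Z]+))?')

m = RADIRU_RE.match('https://www.nhk.or.jp/radio/player/ondemand.html?p=0449_01_3853544')
print(m.group('site', 'corner', 'headline'))  # ('0449', '01', '3853544')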
diff --git a/hypervideo_dl/extractor/niconico.py b/hypervideo_dl/extractor/niconico.py
index 2103037..fa2d709 100644
--- a/hypervideo_dl/extractor/niconico.py
+++ b/hypervideo_dl/extractor/niconico.py
@@ -5,13 +5,15 @@ import json
import re
import time
+from urllib.parse import urlparse
+
from .common import InfoExtractor, SearchInfoExtractor
-from ..compat import (
- compat_HTTPError,
-)
+from ..dependencies import websockets
+from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
OnDemandPagedList,
+ WebSocketsWrapper,
bug_reports_message,
clean_html,
float_or_none,
@@ -392,7 +394,7 @@ class NiconicoIE(InfoExtractor):
webpage, handle = self._download_webpage_handle(
'https://www.nicovideo.jp/watch/' + video_id, video_id)
if video_id.startswith('so'):
- video_id = self._match_id(handle.geturl())
+ video_id = self._match_id(handle.url)
api_data = self._parse_json(self._html_search_regex(
'data-api-data="([^"]+)"', webpage,
@@ -403,9 +405,9 @@ class NiconicoIE(InfoExtractor):
'https://www.nicovideo.jp/api/watch/v3/%s?_frontendId=6&_frontendVersion=0&actionTrackId=AAAAAAAAAA_%d' % (video_id, round(time.time() * 1000)), video_id,
note='Downloading API JSON', errnote='Unable to fetch data')['data']
except ExtractorError as e:
- if not isinstance(e.cause, compat_HTTPError):
+ if not isinstance(e.cause, HTTPError):
raise
- webpage = e.cause.read().decode('utf-8', 'replace')
+ webpage = e.cause.response.read().decode('utf-8', 'replace')
error_msg = self._html_search_regex(
r'(?s)<section\s+class="(?:(?:ErrorMessage|WatchExceptionPage-message)\s*)+">(.+?)</section>',
webpage, 'error reason', default=None)
@@ -477,23 +479,32 @@ class NiconicoIE(InfoExtractor):
user_id_str = session_api_data.get('serviceUserId')
thread_ids = traverse_obj(api_data, ('comment', 'threads', lambda _, v: v['isActive']))
- raw_danmaku = self._extract_all_comments(video_id, thread_ids, user_id_str, comment_user_key)
- if not raw_danmaku:
+ legacy_danmaku = self._extract_legacy_comments(video_id, thread_ids, user_id_str, comment_user_key) or []
+
+ new_comments = traverse_obj(api_data, ('comment', 'nvComment'))
+ new_danmaku = self._extract_new_comments(
+ new_comments.get('server'), video_id,
+ new_comments.get('params'), new_comments.get('threadKey'))
+
+ if not legacy_danmaku and not new_danmaku:
self.report_warning(f'Failed to get comments. {bug_reports_message()}')
return
+
return {
'comments': [{
'ext': 'json',
- 'data': json.dumps(raw_danmaku),
+ 'data': json.dumps(legacy_danmaku + new_danmaku),
}],
}
- def _extract_all_comments(self, video_id, threads, user_id, user_key):
+ def _extract_legacy_comments(self, video_id, threads, user_id, user_key):
auth_data = {
'user_id': user_id,
'userkey': user_key,
} if user_id and user_key else {'user_id': ''}
+ api_url = traverse_obj(threads, (..., 'server'), get_all=False)
+
# Request Start
post_data = [{'ping': {'content': 'rs:0'}}]
for i, thread in enumerate(threads):
@@ -532,17 +543,32 @@ class NiconicoIE(InfoExtractor):
# Request Final
post_data.append({'ping': {'content': 'rf:0'}})
- for api_url in self._COMMENT_API_ENDPOINTS:
- comments = self._download_json(
- api_url, video_id, data=json.dumps(post_data).encode(), fatal=False,
- headers={
- 'Referer': 'https://www.nicovideo.jp/watch/%s' % video_id,
- 'Origin': 'https://www.nicovideo.jp',
- 'Content-Type': 'text/plain;charset=UTF-8',
- },
- note='Downloading comments', errnote=f'Failed to access endpoint {api_url}')
- if comments:
- return comments
+ return self._download_json(
+ f'{api_url}/api.json', video_id, data=json.dumps(post_data).encode(), fatal=False,
+ headers={
+ 'Referer': f'https://www.nicovideo.jp/watch/{video_id}',
+ 'Origin': 'https://www.nicovideo.jp',
+ 'Content-Type': 'text/plain;charset=UTF-8',
+ },
+ note='Downloading comments', errnote=f'Failed to access endpoint {api_url}')
+
+ def _extract_new_comments(self, endpoint, video_id, params, thread_key):
+ comments = self._download_json(
+ f'{endpoint}/v1/threads', video_id, data=json.dumps({
+ 'additionals': {},
+ 'params': params,
+ 'threadKey': thread_key,
+ }).encode(), fatal=False,
+ headers={
+ 'Referer': 'https://www.nicovideo.jp/',
+ 'Origin': 'https://www.nicovideo.jp',
+ 'Content-Type': 'text/plain;charset=UTF-8',
+ 'x-client-os-type': 'others',
+ 'x-frontend-id': '6',
+ 'x-frontend-version': '0',
+ },
+ note='Downloading comments (new)', errnote='Failed to download comments (new)')
+ return traverse_obj(comments, ('data', 'threads', ..., 'comments', ...))
class NiconicoPlaylistBaseIE(InfoExtractor):
@@ -636,10 +662,10 @@ class NiconicoPlaylistIE(NiconicoPlaylistBaseIE):
class NiconicoSeriesIE(InfoExtractor):
IE_NAME = 'niconico:series'
- _VALID_URL = r'https?://(?:(?:www\.|sp\.)?nicovideo\.jp|nico\.ms)/series/(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:(?:www\.|sp\.)?nicovideo\.jp(?:/user/\d+)?|nico\.ms)/series/(?P<id>\d+)'
_TESTS = [{
- 'url': 'https://www.nicovideo.jp/series/110226',
+ 'url': 'https://www.nicovideo.jp/user/44113208/series/110226',
'info_dict': {
'id': '110226',
'title': 'ご立派ァ!のシリーズ',
@@ -659,7 +685,7 @@ class NiconicoSeriesIE(InfoExtractor):
def _real_extract(self, url):
list_id = self._match_id(url)
- webpage = self._download_webpage(f'https://www.nicovideo.jp/series/{list_id}', list_id)
+ webpage = self._download_webpage(url, list_id)
title = self._search_regex(
(r'<title>「(.+)(全',
@@ -667,16 +693,15 @@ class NiconicoSeriesIE(InfoExtractor):
webpage, 'title', fatal=False)
if title:
title = unescapeHTML(title)
- playlist = [
- self.url_result(f'https://www.nicovideo.jp/watch/{v_id}', video_id=v_id)
- for v_id in re.findall(r'data-href=[\'"](?:https://www\.nicovideo\.jp)?/watch/([a-z0-9]+)', webpage)]
- return self.playlist_result(playlist, list_id, title)
+ json_data = next(self._yield_json_ld(webpage, None, fatal=False))
+ return self.playlist_from_matches(
+ traverse_obj(json_data, ('itemListElement', ..., 'url')), list_id, title, ie=NiconicoIE)
class NiconicoHistoryIE(NiconicoPlaylistBaseIE):
IE_NAME = 'niconico:history'
- IE_DESC = 'NicoNico user history. Requires cookies.'
- _VALID_URL = r'https?://(?:www\.|sp\.)?nicovideo\.jp/my/history'
+ IE_DESC = 'NicoNico user history or likes. Requires cookies.'
+ _VALID_URL = r'https?://(?:www\.|sp\.)?nicovideo\.jp/my/(?P<id>history(?:/like)?)'
_TESTS = [{
'note': 'PC page, with /video',
@@ -694,23 +719,29 @@ class NiconicoHistoryIE(NiconicoPlaylistBaseIE):
'note': 'mobile page, without /video',
'url': 'https://sp.nicovideo.jp/my/history',
'only_matching': True,
+ }, {
+ 'note': 'PC page',
+ 'url': 'https://www.nicovideo.jp/my/history/like',
+ 'only_matching': True,
+ }, {
+ 'note': 'Mobile page',
+ 'url': 'https://sp.nicovideo.jp/my/history/like',
+ 'only_matching': True,
}]
def _call_api(self, list_id, resource, query):
+ path = 'likes' if list_id == 'history/like' else 'watch/history'
return self._download_json(
- 'https://nvapi.nicovideo.jp/v1/users/me/watch/history', 'history',
- f'Downloading {resource}', query=query,
- headers=self._API_HEADERS)['data']
+ f'https://nvapi.nicovideo.jp/v1/users/me/{path}', list_id,
+ f'Downloading {resource}', query=query, headers=self._API_HEADERS)['data']
def _real_extract(self, url):
- list_id = 'history'
+ list_id = self._match_id(url)
try:
- mylist = self._call_api(list_id, 'list', {
- 'pageSize': 1,
- })
+ mylist = self._call_api(list_id, 'list', {'pageSize': 1})
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
- self.raise_login_required('You have to be logged in to get your watch history')
+ if isinstance(e.cause, HTTPError) and e.cause.status == 401:
+ self.raise_login_required('You have to be logged in to get your history')
raise
return self.playlist_result(self._entries(list_id), list_id, **self._parse_owner(mylist))
@@ -866,3 +897,162 @@ class NiconicoUserIE(InfoExtractor):
def _real_extract(self, url):
list_id = self._match_id(url)
return self.playlist_result(self._entries(list_id), list_id, ie=NiconicoIE.ie_key())
+
+
+class NiconicoLiveIE(InfoExtractor):
+ IE_NAME = 'niconico:live'
+ IE_DESC = 'ニコニコ生放送'
+ _VALID_URL = r'https?://(?:sp\.)?live2?\.nicovideo\.jp/(?:watch|gate)/(?P<id>lv\d+)'
+ _TESTS = [{
+ 'note': 'this test case includes invisible characters for title, pasting them as-is',
+ 'url': 'https://live.nicovideo.jp/watch/lv339533123',
+ 'info_dict': {
+ 'id': 'lv339533123',
+ 'title': '激辛ペヤング食べます‪( ;ᯅ; )‬(歌枠オーディション参加中)',
+ 'view_count': 1526,
+ 'comment_count': 1772,
+ 'description': '初めましてもかって言います❕\nのんびり自由に適当に暮らしてます',
+ 'uploader': 'もか',
+ 'channel': 'ゲストさんのコミュニティ',
+ 'channel_id': 'co5776900',
+ 'channel_url': 'https://com.nicovideo.jp/community/co5776900',
+ 'timestamp': 1670677328,
+ 'is_live': True,
+ },
+ 'skip': 'livestream',
+ }, {
+ 'url': 'https://live2.nicovideo.jp/watch/lv339533123',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://sp.live.nicovideo.jp/watch/lv339533123',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://sp.live2.nicovideo.jp/watch/lv339533123',
+ 'only_matching': True,
+ }]
+
+ _KNOWN_LATENCY = ('high', 'low')
+
+ def _real_extract(self, url):
+ if not websockets:
+ raise ExtractorError('websockets library is not available. Please install it.', expected=True)
+ video_id = self._match_id(url)
+ webpage, urlh = self._download_webpage_handle(f'https://live.nicovideo.jp/watch/{video_id}', video_id)
+
+ embedded_data = self._parse_json(unescapeHTML(self._search_regex(
+ r'<script\s+id="embedded-data"\s*data-props="(.+?)"', webpage, 'embedded data')), video_id)
+
+ ws_url = traverse_obj(embedded_data, ('site', 'relive', 'webSocketUrl'))
+ if not ws_url:
+ raise ExtractorError('The live hasn\'t started yet or has already ended.', expected=True)
+ ws_url = update_url_query(ws_url, {
+ 'frontend_id': traverse_obj(embedded_data, ('site', 'frontendId')) or '9',
+ })
+
+ hostname = remove_start(urlparse(urlh.url).hostname, 'sp.')
+ cookies = try_get(urlh.url, self._downloader._calc_cookies)
+ latency = try_get(self._configuration_arg('latency'), lambda x: x[0])
+ if latency not in self._KNOWN_LATENCY:
+ latency = 'high'
+
+ ws = WebSocketsWrapper(ws_url, {
+ 'Cookies': str_or_none(cookies) or '',
+ 'Origin': f'https://{hostname}',
+ 'Accept': '*/*',
+ 'User-Agent': self.get_param('http_headers')['User-Agent'],
+ })
+
+ self.write_debug('Sending HLS server request')
+ ws.send(json.dumps({
+ 'type': 'startWatching',
+ 'data': {
+ 'stream': {
+ 'quality': 'abr',
+ 'protocol': 'hls+fmp4',
+ 'latency': latency,
+ 'chasePlay': False
+ },
+ 'room': {
+ 'protocol': 'webSocket',
+ 'commentable': True
+ },
+ 'reconnect': False,
+ }
+ }))
+
+ while True:
+ recv = ws.recv()
+ if not recv:
+ continue
+ data = json.loads(recv)
+ if not isinstance(data, dict):
+ continue
+ if data.get('type') == 'stream':
+ m3u8_url = data['data']['uri']
+ qualities = data['data']['availableQualities']
+ break
+ elif data.get('type') == 'disconnect':
+ self.write_debug(recv)
+ raise ExtractorError('Disconnected in the middle of extraction')
+ elif data.get('type') == 'error':
+ self.write_debug(recv)
+ message = traverse_obj(data, ('body', 'code')) or recv
+ raise ExtractorError(message)
+ elif self.get_param('verbose', False):
+ if len(recv) > 100:
+ recv = recv[:100] + '...'
+ self.write_debug('Server said: %s' % recv)
+
+ title = traverse_obj(embedded_data, ('program', 'title')) or self._html_search_meta(
+ ('og:title', 'twitter:title'), webpage, 'live title', fatal=False)
+
+ raw_thumbs = traverse_obj(embedded_data, ('program', 'thumbnail')) or {}
+ thumbnails = []
+ for name, value in raw_thumbs.items():
+ if not isinstance(value, dict):
+ thumbnails.append({
+ 'id': name,
+ 'url': value,
+ **parse_resolution(value, lenient=True),
+ })
+ continue
+
+ for k, img_url in value.items():
+ res = parse_resolution(k, lenient=True) or parse_resolution(img_url, lenient=True)
+ width, height = res.get('width'), res.get('height')
+
+ thumbnails.append({
+ 'id': f'{name}_{width}x{height}',
+ 'url': img_url,
+ **res,
+ })
+
+ formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True)
+ for fmt, q in zip(formats, reversed(qualities[1:])):
+ fmt.update({
+ 'format_id': q,
+ 'protocol': 'niconico_live',
+ 'ws': ws,
+ 'video_id': video_id,
+ 'cookies': cookies,
+ 'live_latency': latency,
+ 'origin': hostname,
+ })
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ **traverse_obj(embedded_data, {
+ 'view_count': ('program', 'statistics', 'watchCount'),
+ 'comment_count': ('program', 'statistics', 'commentCount'),
+ 'uploader': ('program', 'supplier', 'name'),
+ 'channel': ('socialGroup', 'name'),
+ 'channel_id': ('socialGroup', 'id'),
+ 'channel_url': ('socialGroup', 'socialGroupPageUrl'),
+ }),
+ 'description': clean_html(traverse_obj(embedded_data, ('program', 'description'))),
+ 'timestamp': int_or_none(traverse_obj(embedded_data, ('program', 'openTime'))),
+ 'is_live': True,
+ 'thumbnails': thumbnails,
+ 'formats': formats,
+ }
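Note: the receive loop in NiconicoLiveIE above keeps reading frames until the server answers 'startWatching' with a 'stream' message, and aborts on 'disconnect' or 'error'. A reduced sketch of that dispatch, with recv a hypothetical stand-in for ws.recv():

import json

def wait_for_stream(recv):
    while True:
        raw = recv()
        if not raw:
            continue
        data = json.loads(raw)
        if not isinstance(data, dict):
            continue
        if data.get('type') == 'stream':
            return data['data']['uri'], data['data']['availableQualities']
        if data.get('type') in ('disconnect', 'error'):
            raise RuntimeError(f'server refused playback: {raw[:100]}')

frames = iter(['', '{"type":"ping"}',
               '{"type":"stream","data":{"uri":"u","availableQualities":["abr"]}}'])
print(wait_for_stream(lambda: next(frames)))  # ('u', ['abr'])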
diff --git a/hypervideo_dl/extractor/ninenow.py b/hypervideo_dl/extractor/ninenow.py
index b970f8c..c655b75 100644
--- a/hypervideo_dl/extractor/ninenow.py
+++ b/hypervideo_dl/extractor/ninenow.py
@@ -53,7 +53,7 @@ class NineNowIE(InfoExtractor):
'upload_date': '20210421',
},
'expected_warnings': ['Ignoring subtitle tracks'],
- 'params':{
+ 'params': {
'skip_download': True,
}
}]
diff --git a/hypervideo_dl/extractor/nitter.py b/hypervideo_dl/extractor/nitter.py
index 251bf44..5d1ca1f 100644
--- a/hypervideo_dl/extractor/nitter.py
+++ b/hypervideo_dl/extractor/nitter.py
@@ -39,59 +39,99 @@ class NitterIE(InfoExtractor):
)
HTTP_INSTANCES = (
- 'nitter.42l.fr',
- 'nitter.pussthecat.org',
- 'nitter.nixnet.services',
+ 'nitter.lacontrevoie.fr',
'nitter.fdn.fr',
'nitter.1d4.us',
'nitter.kavin.rocks',
'nitter.unixfox.eu',
'nitter.domain.glass',
- 'nitter.eu',
'nitter.namazso.eu',
- 'nitter.actionsack.com',
'birdsite.xanny.family',
- 'nitter.hu',
- 'twitr.gq',
'nitter.moomoo.me',
- 'nittereu.moomoo.me',
- 'bird.from.tf',
+ 'bird.trom.tf',
'nitter.it',
'twitter.censors.us',
- 'twitter.grimneko.de',
- 'nitter.alefvanoon.xyz',
- 'n.hyperborea.cloud',
- 'nitter.ca',
+ 'nitter.grimneko.de',
'twitter.076.ne.jp',
- 'twitter.mstdn.social',
'nitter.fly.dev',
'notabird.site',
'nitter.weiler.rocks',
- 'nitter.silkky.cloud',
'nitter.sethforprivacy.com',
- 'nttr.stream',
'nitter.cutelab.space',
'nitter.nl',
'nitter.mint.lgbt',
'nitter.bus-hit.me',
- 'fuckthesacklers.network',
- 'nitter.govt.land',
- 'nitter.datatunnel.xyz',
'nitter.esmailelbob.xyz',
'tw.artemislena.eu',
- 'de.nttr.stream',
'nitter.winscloud.net',
'nitter.tiekoetter.com',
'nitter.spaceint.fr',
- 'twtr.bch.bar',
- 'nitter.exonip.de',
- 'nitter.mastodon.pro',
- 'nitter.notraxx.ch',
-
-
- # not in the list anymore
- 'nitter.skrep.in',
- 'nitter.snopyta.org',
+ 'nitter.privacy.com.de',
+ 'nitter.poast.org',
+ 'nitter.bird.froth.zone',
+ 'nitter.dcs0.hu',
+ 'twitter.dr460nf1r3.org',
+ 'nitter.garudalinux.org',
+ 'twitter.femboy.hu',
+ 'nitter.cz',
+ 'nitter.privacydev.net',
+ 'nitter.evil.site',
+ 'tweet.lambda.dance',
+ 'nitter.kylrth.com',
+ 'nitter.foss.wtf',
+ 'nitter.priv.pw',
+ 'nitter.tokhmi.xyz',
+ 'nitter.catalyst.sx',
+ 'unofficialbird.com',
+ 'nitter.projectsegfau.lt',
+ 'nitter.eu.projectsegfau.lt',
+ 'singapore.unofficialbird.com',
+ 'canada.unofficialbird.com',
+ 'india.unofficialbird.com',
+ 'nederland.unofficialbird.com',
+ 'uk.unofficialbird.com',
+ 'n.l5.ca',
+ 'nitter.slipfox.xyz',
+ 'nitter.soopy.moe',
+ 'nitter.qwik.space',
+ 'read.whatever.social',
+ 'nitter.rawbit.ninja',
+ 'nt.vern.cc',
+ 'ntr.odyssey346.dev',
+ 'nitter.ir',
+ 'nitter.privacytools.io',
+ 'nitter.sneed.network',
+ 'n.sneed.network',
+ 'nitter.manasiwibi.com',
+ 'nitter.smnz.de',
+ 'nitter.twei.space',
+ 'nitter.inpt.fr',
+ 'nitter.d420.de',
+ 'nitter.caioalonso.com',
+ 'nitter.at',
+ 'nitter.drivet.xyz',
+ 'nitter.pw',
+ 'nitter.nicfab.eu',
+ 'bird.habedieeh.re',
+ 'nitter.hostux.net',
+ 'nitter.adminforge.de',
+ 'nitter.platypush.tech',
+ 'nitter.mask.sh',
+ 'nitter.pufe.org',
+ 'nitter.us.projectsegfau.lt',
+ 'nitter.arcticfoxes.net',
+ 't.com.sb',
+ 'nitter.kling.gg',
+ 'nitter.ktachibana.party',
+ 'nitter.riverside.rocks',
+ 'nitter.girlboss.ceo',
+ 'nitter.lunar.icu',
+ 'twitter.moe.ngo',
+ 'nitter.freedit.eu',
+ 'ntr.frail.duckdns.org',
+ 'nitter.librenode.org',
+ 'n.opnxng.com',
+ 'nitter.plus.st',
)
DEAD_INSTANCES = (
@@ -117,6 +157,32 @@ class NitterIE(InfoExtractor):
'nitter.weaponizedhumiliation.com',
'nitter.vxempire.xyz',
'tweet.lambda.dance',
+ 'nitter.ca',
+ 'nitter.42l.fr',
+ 'nitter.pussthecat.org',
+ 'nitter.nixnet.services',
+ 'nitter.eu',
+ 'nitter.actionsack.com',
+ 'nitter.hu',
+ 'twitr.gq',
+ 'nittereu.moomoo.me',
+ 'bird.from.tf',
+ 'twitter.grimneko.de',
+ 'nitter.alefvanoon.xyz',
+ 'n.hyperborea.cloud',
+ 'twitter.mstdn.social',
+ 'nitter.silkky.cloud',
+ 'nttr.stream',
+ 'fuckthesacklers.network',
+ 'nitter.govt.land',
+ 'nitter.datatunnel.xyz',
+ 'de.nttr.stream',
+ 'twtr.bch.bar',
+ 'nitter.exonip.de',
+ 'nitter.mastodon.pro',
+ 'nitter.notraxx.ch',
+ 'nitter.skrep.in',
+ 'nitter.snopyta.org',
)
INSTANCES = NON_HTTP_INSTANCES + HTTP_INSTANCES + DEAD_INSTANCES
diff --git a/hypervideo_dl/extractor/njpwworld.py b/hypervideo_dl/extractor/njpwworld.py
index 7b8a526..6078381 100644
--- a/hypervideo_dl/extractor/njpwworld.py
+++ b/hypervideo_dl/extractor/njpwworld.py
@@ -51,7 +51,7 @@ class NJPWWorldIE(InfoExtractor):
data=urlencode_postdata({'login_id': username, 'pw': password}),
headers={'Referer': 'https://front.njpwworld.com/auth'})
# /auth/login will return 302 for successful logins
- if urlh.geturl() == self._LOGIN_URL:
+ if urlh.url == self._LOGIN_URL:
self.report_warning('unable to login')
return False
diff --git a/hypervideo_dl/extractor/noice.py b/hypervideo_dl/extractor/noice.py
new file mode 100644
index 0000000..e6e3433
--- /dev/null
+++ b/hypervideo_dl/extractor/noice.py
@@ -0,0 +1,116 @@
+from .common import InfoExtractor
+from ..utils import (
+ clean_html,
+ determine_ext,
+ int_or_none,
+ parse_iso8601,
+ traverse_obj,
+ variadic,
+)
+
+
+class NoicePodcastIE(InfoExtractor):
+ _VALID_URL = r'https?://open\.noice\.id/content/(?P<id>[a-fA-F0-9-]+)'
+ _TESTS = [{
+ 'url': 'https://open.noice.id/content/7694bb04-ff0f-40fa-a60b-5b39f29584b2',
+ 'info_dict': {
+ 'id': '7694bb04-ff0f-40fa-a60b-5b39f29584b2',
+ 'ext': 'm4a',
+ 'season': 'Season 1',
+ 'description': 'md5:58d1274e6857b6fbbecf47075885380d',
+ 'release_date': '20221115',
+ 'timestamp': 1668496642,
+ 'season_number': 1,
+ 'upload_date': '20221115',
+ 'release_timestamp': 1668496642,
+ 'title': 'Eps 1. Belajar dari Wishnutama: Kreatif Bukan Followers! (bersama Wishnutama)',
+ 'modified_date': '20221121',
+ 'categories': ['Bisnis dan Keuangan'],
+ 'duration': 3567,
+ 'modified_timestamp': 1669030647,
+ 'thumbnail': 'https://images.noiceid.cc/catalog/content-1668496302560',
+ 'channel_id': '9dab1024-5b92-4265-ae1c-63da87359832',
+ 'like_count': int,
+ 'channel': 'Noice Space Talks',
+ 'comment_count': int,
+ 'dislike_count': int,
+ 'channel_follower_count': int,
+ }
+ }, {
+ 'url': 'https://open.noice.id/content/222134e4-99f2-456f-b8a2-b8be404bf063',
+ 'info_dict': {
+ 'id': '222134e4-99f2-456f-b8a2-b8be404bf063',
+ 'ext': 'm4a',
+ 'release_timestamp': 1653488220,
+ 'description': 'md5:35074f6190cef52b05dd133bb2ef460e',
+ 'upload_date': '20220525',
+ 'timestamp': 1653460637,
+ 'release_date': '20220525',
+ 'thumbnail': 'https://images.noiceid.cc/catalog/content-1653460337625',
+ 'title': 'Eps 1: Dijodohin Sama Anak Pak RT',
+ 'modified_timestamp': 1669030647,
+ 'season_number': 1,
+ 'modified_date': '20221121',
+ 'categories': ['Cerita dan Drama'],
+ 'duration': 1830,
+ 'season': 'Season 1',
+ 'channel_id': '60193f6b-d24d-4b23-913b-ceed5a731e74',
+ 'dislike_count': int,
+ 'like_count': int,
+ 'comment_count': int,
+ 'channel': 'Dear Jerome',
+ 'channel_follower_count': int,
+ }
+ }]
+
+ def _get_formats_and_subtitles(self, media_url, video_id):
+ formats, subtitles = [], {}
+ for url in variadic(media_url):
+ ext = determine_ext(url)
+ if ext == 'm3u8':
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(url, video_id)
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=subtitles)
+ else:
+ formats.append({
+ 'url': url,
+ 'ext': 'mp3',
+ 'vcodec': 'none',
+ 'acodec': 'mp3',
+ })
+ return formats, subtitles
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+
+ nextjs_data = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['contentDetails']
+
+ media_url_list = traverse_obj(nextjs_data, (('rawContentUrl', 'url'), ))
+ formats, subtitles = self._get_formats_and_subtitles(media_url_list, display_id)
+
+ return {
+ 'id': nextjs_data.get('id') or display_id,
+ 'title': nextjs_data.get('title') or self._html_search_meta('og:title', webpage),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'description': (nextjs_data.get('description') or clean_html(nextjs_data.get('htmlDescription'))
+ or self._html_search_meta(['description', 'og:description'], webpage)),
+ 'thumbnail': nextjs_data.get('image') or self._html_search_meta('og:image', webpage),
+ 'timestamp': parse_iso8601(nextjs_data.get('createdAt')),
+ 'release_timestamp': parse_iso8601(nextjs_data.get('publishedAt')),
+ 'modified_timestamp': parse_iso8601(
+ nextjs_data.get('updatedAt') or self._html_search_meta('og:updated_time', webpage)),
+ 'duration': int_or_none(nextjs_data.get('duration')),
+ 'categories': traverse_obj(nextjs_data, ('genres', ..., 'name')),
+ 'season': nextjs_data.get('seasonName'),
+ 'season_number': int_or_none(nextjs_data.get('seasonNumber')),
+ 'channel': traverse_obj(nextjs_data, ('catalog', 'title')),
+ 'channel_id': traverse_obj(nextjs_data, ('catalog', 'id'), 'catalogId'),
+ **traverse_obj(nextjs_data, ('meta', 'aggregations', {
+ 'like_count': 'likes',
+ 'dislike_count': 'dislikes',
+ 'comment_count': 'comments',
+ 'channel_follower_count': 'followers',
+ }))
+ }
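Note: _get_formats_and_subtitles above relies on variadic so that a lone media URL and a list of URLs go through the same loop. A simplified illustration (not the real utils.variadic, which handles more iterable types):

def variadic(x):
    # simplified: wrap a lone string so callers can always iterate
    return x if isinstance(x, (list, tuple)) else (x,)

for u in variadic('https://example.com/audio.m3u8'):
    print(u)  # a single URL still iterates exactly once
for u in variadic(['a.mp3', 'b.m3u8']):
    print(u)  # lists pass through unchanged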
diff --git a/hypervideo_dl/extractor/noodlemagazine.py b/hypervideo_dl/extractor/noodlemagazine.py
index e620895..1cea0db 100644
--- a/hypervideo_dl/extractor/noodlemagazine.py
+++ b/hypervideo_dl/extractor/noodlemagazine.py
@@ -1,9 +1,14 @@
from .common import InfoExtractor
from ..utils import (
- parse_duration,
+ extract_attributes,
+ get_element_html_by_id,
+ int_or_none,
parse_count,
- unified_strdate
+ parse_duration,
+ unified_strdate,
+ urljoin,
)
+from ..utils.traversal import traverse_obj
class NoodleMagazineIE(InfoExtractor):
@@ -37,15 +42,21 @@ class NoodleMagazineIE(InfoExtractor):
like_count = parse_count(self._html_search_meta('ya:ovs:likes', webpage, default=None))
upload_date = unified_strdate(self._html_search_meta('ya:ovs:upload_date', webpage, default=''))
- key = self._html_search_regex(rf'/{video_id}\?(?:.*&)?m=([^&"\'\s,]+)', webpage, 'key')
- playlist_info = self._download_json(f'https://adult.noodlemagazine.com/playlist/{video_id}?m={key}', video_id)
- thumbnail = self._og_search_property('image', webpage, default=None) or playlist_info.get('image')
+ player_path = extract_attributes(get_element_html_by_id('iplayer', webpage) or '')['src']
+ player_iframe = self._download_webpage(
+ urljoin('https://adult.noodlemagazine.com', player_path), video_id, 'Downloading iframe page')
+ playlist_url = self._search_regex(
+ r'window\.playlistUrl\s*=\s*["\']([^"\']+)["\']', player_iframe, 'playlist url')
+ playlist_info = self._download_json(
+ urljoin('https://adult.noodlemagazine.com', playlist_url), video_id, headers={'Referer': url})
- formats = [{
- 'url': source.get('file'),
- 'quality': source.get('label'),
- 'ext': source.get('type'),
- } for source in playlist_info.get('sources')]
+ thumbnail = self._og_search_property('image', webpage, default=None) or playlist_info.get('image')
+ formats = traverse_obj(playlist_info, ('sources', lambda _, v: v['file'], {
+ 'url': 'file',
+ 'format_id': 'label',
+ 'height': ('label', {int_or_none}),
+ 'ext': 'type',
+ }))
return {
'id': video_id,
diff --git a/hypervideo_dl/extractor/nosnl.py b/hypervideo_dl/extractor/nosnl.py
index eba94c4..cea54c9 100644
--- a/hypervideo_dl/extractor/nosnl.py
+++ b/hypervideo_dl/extractor/nosnl.py
@@ -3,7 +3,7 @@ from ..utils import parse_duration, parse_iso8601, traverse_obj
class NOSNLArticleIE(InfoExtractor):
- _VALID_URL = r'https?://nos\.nl/((?!video)(\w+/)?\w+/)\d+-(?P<display_id>[\w-]+)'
+ _VALID_URL = r'https?://nos\.nl/(?P<type>video|(\w+/)?\w+)/?\d+-(?P<display_id>[\w-]+)'
_TESTS = [
{
# only 1 video
@@ -22,13 +22,14 @@ class NOSNLArticleIE(InfoExtractor):
'info_dict': {
'id': '2440409',
'title': 'Vannacht sliepen weer enkele honderden asielzoekers in Ter Apel buiten',
- 'description': 'Er werd wel geprobeerd om kwetsbare migranten onderdak te bieden, zegt het COA.',
+ 'description': 'md5:72b1e1674d798460e79d78fa37e9f56d',
'tags': ['aanmeldcentrum', 'Centraal Orgaan opvang asielzoekers', 'COA', 'asielzoekers', 'Ter Apel'],
'modified_timestamp': 1660452773,
'modified_date': '20220814',
'upload_date': '20220813',
'thumbnail': 'https://cdn.nos.nl/image/2022/07/18/880346/1024x576a.jpg',
'timestamp': 1660401384,
+ 'categories': ['Regionaal nieuws', 'Binnenland'],
},
'playlist_count': 2,
}, {
@@ -37,20 +38,37 @@ class NOSNLArticleIE(InfoExtractor):
'info_dict': {
'id': '2440789',
'title': 'Wekdienst 16/8: Groningse acties tien jaar na zware aardbeving • Femke Bol in actie op EK atletiek ',
- 'description': 'Nieuws, weer, verkeer: met dit overzicht begin je geïnformeerd aan de dag.',
+ 'description': 'md5:0bd277ed7a44fc15cb12a9d27d8f6641',
'tags': ['wekdienst'],
'modified_date': '20220816',
'modified_timestamp': 1660625449,
'timestamp': 1660625449,
'upload_date': '20220816',
'thumbnail': 'https://cdn.nos.nl/image/2022/08/16/888178/1024x576a.jpg',
+ 'categories': ['Binnenland', 'Buitenland'],
},
'playlist_count': 2,
+ }, {
+ # video url
+ 'url': 'https://nos.nl/video/2452718-xi-en-trudeau-botsen-voor-de-camera-op-g20-top-je-hebt-gelekt',
+ 'info_dict': {
+ 'id': '2452718',
+ 'title': 'Xi en Trudeau botsen voor de camera op G20-top: \'Je hebt gelekt\'',
+ 'modified_date': '20221117',
+ 'description': 'md5:61907dac576f75c11bf8ffffd4a3cc0f',
+ 'tags': ['Xi', 'Trudeau', 'G20', 'indonesié'],
+ 'upload_date': '20221117',
+ 'thumbnail': 'https://cdn.nos.nl/image/2022/11/17/916155/1024x576a.jpg',
+ 'modified_timestamp': 1668663388,
+ 'timestamp': 1668663388,
+ 'categories': ['Buitenland'],
+ },
+ 'playlist_mincount': 1,
}
]
def _entries(self, nextjs_json, display_id):
- for item in nextjs_json['items']:
+ for item in nextjs_json:
if item.get('type') == 'video':
formats, subtitle = self._extract_m3u8_formats_and_subtitles(
traverse_obj(item, ('source', 'url')), display_id, ext='mp4')
@@ -77,13 +95,14 @@ class NOSNLArticleIE(InfoExtractor):
}
def _real_extract(self, url):
- display_id = self._match_valid_url(url).group('display_id')
+ site_type, display_id = self._match_valid_url(url).group('type', 'display_id')
webpage = self._download_webpage(url, display_id)
nextjs_json = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['data']
return {
'_type': 'playlist',
- 'entries': self._entries(nextjs_json, display_id),
+ 'entries': self._entries(
+ [nextjs_json['video']] if site_type == 'video' else nextjs_json['items'], display_id),
'id': str(nextjs_json['id']),
'title': nextjs_json.get('title') or self._html_search_meta(['title', 'og:title', 'twitter:title'], webpage),
'description': (nextjs_json.get('description')
@@ -91,5 +110,6 @@ class NOSNLArticleIE(InfoExtractor):
'tags': nextjs_json.get('keywords'),
'modified_timestamp': parse_iso8601(nextjs_json.get('modifiedAt')),
'thumbnail': nextjs_json.get('shareImageSrc') or self._html_search_meta(['og:image', 'twitter:image'], webpage),
- 'timestamp': parse_iso8601(nextjs_json.get('publishedAt'))
+ 'timestamp': parse_iso8601(nextjs_json.get('publishedAt')),
+ 'categories': traverse_obj(nextjs_json, ('categories', ..., 'label')),
}
diff --git a/hypervideo_dl/extractor/nosvideo.py b/hypervideo_dl/extractor/nosvideo.py
index b6d3ea4..7e9688c 100644
--- a/hypervideo_dl/extractor/nosvideo.py
+++ b/hypervideo_dl/extractor/nosvideo.py
@@ -1,9 +1,9 @@
import re
from .common import InfoExtractor
+from ..networking import Request
from ..utils import (
ExtractorError,
- sanitized_Request,
urlencode_postdata,
xpath_text,
xpath_with_ns,
@@ -36,8 +36,8 @@ class NosVideoIE(InfoExtractor):
'op': 'download1',
'method_free': 'Continue to Video',
}
- req = sanitized_Request(url, urlencode_postdata(fields))
- req.add_header('Content-type', 'application/x-www-form-urlencoded')
+ req = Request(url, urlencode_postdata(fields))
+ req.headers['Content-type'] = 'application/x-www-form-urlencoded'
webpage = self._download_webpage(req, video_id,
'Downloading download page')
if re.search(self._FILE_DELETED_REGEX, webpage) is not None:
diff --git a/hypervideo_dl/extractor/nowness.py b/hypervideo_dl/extractor/nowness.py
index 18bb880..b86b7e2 100644
--- a/hypervideo_dl/extractor/nowness.py
+++ b/hypervideo_dl/extractor/nowness.py
@@ -4,10 +4,8 @@ from .brightcove import (
)
from .common import InfoExtractor
from ..compat import compat_str
-from ..utils import (
- ExtractorError,
- sanitized_Request,
-)
+from ..networking import Request
+from ..utils import ExtractorError
class NownessBaseIE(InfoExtractor):
@@ -40,7 +38,7 @@ class NownessBaseIE(InfoExtractor):
def _api_request(self, url, request_path):
display_id = self._match_id(url)
- request = sanitized_Request(
+ request = Request(
'http://api.nowness.com/api/' + request_path % display_id,
headers={
'X-Nowness-Language': 'zh-cn' if 'cn.nowness.com' in url else 'en-us',
diff --git a/hypervideo_dl/extractor/npo.py b/hypervideo_dl/extractor/npo.py
index f18cb9e..40fee24 100644
--- a/hypervideo_dl/extractor/npo.py
+++ b/hypervideo_dl/extractor/npo.py
@@ -1,36 +1,22 @@
+import random
import re
+import urllib.parse
from .common import InfoExtractor
-from ..compat import (
- compat_HTTPError,
- compat_str,
-)
from ..utils import (
determine_ext,
- ExtractorError,
- fix_xml_ampersands,
int_or_none,
merge_dicts,
orderedSet,
- parse_duration,
- qualities,
str_or_none,
- strip_jsonp,
- unified_strdate,
+ try_call,
unified_timestamp,
url_or_none,
urlencode_postdata,
)
-class NPOBaseIE(InfoExtractor):
- def _get_token(self, video_id):
- return self._download_json(
- 'http://ida.omroep.nl/app.php/auth', video_id,
- note='Downloading token')['token']
-
-
-class NPOIE(NPOBaseIE):
+class NPOIE(InfoExtractor):
IE_NAME = 'npo'
IE_DESC = 'npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl'
_VALID_URL = r'''(?x)
@@ -58,6 +44,7 @@ class NPOIE(NPOBaseIE):
'description': 'Dagelijks tussen tien en elf: nieuws, sport en achtergronden.',
'upload_date': '20140622',
},
+ 'skip': 'Video was removed',
}, {
'url': 'http://www.npo.nl/de-mega-mike-mega-thomas-show/27-02-2009/VARA_101191800',
'md5': 'da50a5787dbfc1603c4ad80f31c5120b',
@@ -69,29 +56,41 @@ class NPOIE(NPOBaseIE):
'upload_date': '20090227',
'duration': 2400,
},
+ 'skip': 'Video was removed',
}, {
'url': 'http://www.npo.nl/tegenlicht/25-02-2013/VPWON_1169289',
- 'md5': 'f8065e4e5a7824068ed3c7e783178f2c',
+ 'md5': '1b279c0547f6b270e014c576415268c5',
'info_dict': {
'id': 'VPWON_1169289',
- 'ext': 'm4v',
- 'title': 'Tegenlicht: Zwart geld. De toekomst komt uit Afrika',
- 'description': 'md5:52cf4eefbc96fffcbdc06d024147abea',
+ 'ext': 'mp4',
+ 'title': 'Zwart geld: de toekomst komt uit Afrika',
+ 'description': 'md5:dffaf3d628a9c36f78ca48d834246261',
'upload_date': '20130225',
'duration': 3000,
+ 'creator': 'NED2',
+ 'series': 'Tegenlicht',
+ 'timestamp': 1361822340,
+ 'thumbnail': 'https://images.npo.nl/tile/1280x720/142854.jpg',
+ 'episode': 'Zwart geld: de toekomst komt uit Afrika',
+ 'episode_number': 18,
},
}, {
'url': 'http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706',
'info_dict': {
'id': 'WO_VPRO_043706',
- 'ext': 'm4v',
+ 'ext': 'mp4',
'title': 'De nieuwe mens - Deel 1',
'description': 'md5:518ae51ba1293ffb80d8d8ce90b74e4b',
'duration': 4680,
+ 'episode': 'De nieuwe mens - Deel 1',
+ 'thumbnail': 'https://images.npo.nl/tile/1280x720/6289.jpg',
+ 'timestamp': 1279716057,
+ 'series': 'De nieuwe mens - Deel 1',
+ 'upload_date': '20100721',
},
'params': {
'skip_download': True,
- }
+ },
}, {
# non asf in streams
'url': 'http://www.npo.nl/hoe-gaat-europa-verder-na-parijs/10-01-2015/WO_NOS_762771',
@@ -102,20 +101,25 @@ class NPOIE(NPOBaseIE):
},
'params': {
'skip_download': True,
- }
+ },
+ 'skip': 'Video was removed',
}, {
'url': 'http://www.ntr.nl/Aap-Poot-Pies/27/detail/Aap-poot-pies/VPWON_1233944#content',
'info_dict': {
'id': 'VPWON_1233944',
- 'ext': 'm4v',
+ 'ext': 'mp4',
'title': 'Aap, poot, pies',
- 'description': 'md5:c9c8005d1869ae65b858e82c01a91fde',
+ 'description': 'md5:4b46b1b9553b4c036a04d2a532a137e6',
'upload_date': '20150508',
'duration': 599,
+ 'episode': 'Aap, poot, pies',
+ 'thumbnail': 'https://images.poms.omroep.nl/image/s1280/c1280x720/608118.jpg',
+ 'timestamp': 1431064200,
+ 'series': 'Aap, poot, pies',
},
'params': {
'skip_download': True,
- }
+ },
}, {
'url': 'http://www.omroepwnl.nl/video/fragment/vandaag-de-dag-verkiezingen__POMS_WNL_853698',
'info_dict': {
@@ -128,7 +132,8 @@ class NPOIE(NPOBaseIE):
},
'params': {
'skip_download': True,
- }
+ },
+ 'skip': 'Video was removed',
}, {
# audio
'url': 'http://www.npo.nl/jouw-stad-rotterdam/29-01-2017/RBX_FUNX_6683215/RBX_FUNX_7601437',
@@ -140,7 +145,8 @@ class NPOIE(NPOBaseIE):
},
'params': {
'skip_download': True,
- }
+ },
+ 'skip': 'Video was removed',
}, {
'url': 'http://www.zapp.nl/de-bzt-show/gemist/KN_1687547',
'only_matching': True,
@@ -169,6 +175,25 @@ class NPOIE(NPOBaseIE):
}, {
'url': 'https://npo.nl/KN_1698996',
'only_matching': True,
+ }, {
+ 'url': 'https://www.npo3.nl/the-genius/21-11-2022/VPWON_1341105',
+ 'info_dict': {
+ 'id': 'VPWON_1341105',
+ 'ext': 'mp4',
+ 'duration': 2658,
+ 'series': 'The Genius',
+ 'description': 'md5:db02f1456939ca63f7c408f858044e94',
+ 'title': 'The Genius',
+ 'timestamp': 1669062000,
+ 'creator': 'NED3',
+ 'episode': 'The Genius',
+ 'thumbnail': 'https://images.npo.nl/tile/1280x720/1827650.jpg',
+ 'episode_number': 8,
+ 'upload_date': '20221121',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
}]
@classmethod
@@ -179,25 +204,32 @@ class NPOIE(NPOBaseIE):
def _real_extract(self, url):
video_id = self._match_id(url)
- return self._get_info(url, video_id) or self._get_old_info(video_id)
-
- def _get_info(self, url, video_id):
- token = self._download_json(
- 'https://www.npostart.nl/api/token', video_id,
- 'Downloading token', headers={
- 'Referer': url,
- 'X-Requested-With': 'XMLHttpRequest',
- })['token']
-
- player = self._download_json(
- 'https://www.npostart.nl/player/%s' % video_id, video_id,
- 'Downloading player JSON', data=urlencode_postdata({
- 'autoplay': 0,
- 'share': 1,
- 'pageUrl': url,
- 'hasAdConsent': 0,
- '_token': token,
- }))
+ if urllib.parse.urlparse(url).netloc in ['www.ntr.nl', 'ntr.nl']:
+ player = self._download_json(
+ f'https://www.ntr.nl/ajax/player/embed/{video_id}', video_id,
+ 'Downloading player JSON', query={
+ 'parameters[elementId]': f'npo{random.randint(0, 999)}',
+ 'parameters[sterReferralUrl]': url,
+ 'parameters[autoplay]': 0,
+ })
+ else:
+ self._request_webpage(
+ 'https://www.npostart.nl/api/token', video_id,
+ 'Downloading token', headers={
+ 'Referer': url,
+ 'X-Requested-With': 'XMLHttpRequest',
+ })
+ player = self._download_json(
+ f'https://www.npostart.nl/player/{video_id}', video_id,
+ 'Downloading player JSON', data=urlencode_postdata({
+ 'autoplay': 0,
+ 'share': 1,
+ 'pageUrl': url,
+ 'hasAdConsent': 0,
+ }), headers={
+ 'x-xsrf-token': try_call(lambda: urllib.parse.unquote(
+ self._get_cookies('https://www.npostart.nl')['XSRF-TOKEN'].value))
+ })
player_token = player['token']
@@ -210,7 +242,7 @@ class NPOIE(NPOBaseIE):
video_id, 'Downloading %s profile JSON' % profile, fatal=False,
query={
'profile': profile,
- 'quality': 'npo',
+ 'quality': 'npoplus',
'tokenId': player_token,
'streamType': 'broadcast',
})
@@ -291,188 +323,8 @@ class NPOIE(NPOBaseIE):
return info
- def _get_old_info(self, video_id):
- metadata = self._download_json(
- 'http://e.omroep.nl/metadata/%s' % video_id,
- video_id,
- # We have to remove the javascript callback
- transform_source=strip_jsonp,
- )
-
- error = metadata.get('error')
- if error:
- raise ExtractorError(error, expected=True)
-
- # For some videos actual video id (prid) is different (e.g. for
- # http://www.omroepwnl.nl/video/fragment/vandaag-de-dag-verkiezingen__POMS_WNL_853698
- # video id is POMS_WNL_853698 but prid is POW_00996502)
- video_id = metadata.get('prid') or video_id
-
- # titel is too generic in some cases so utilize aflevering_titel as well
- # when available (e.g. http://tegenlicht.vpro.nl/afleveringen/2014-2015/access-to-africa.html)
- title = metadata['titel']
- sub_title = metadata.get('aflevering_titel')
- if sub_title and sub_title != title:
- title += ': %s' % sub_title
-
- token = self._get_token(video_id)
-
- formats = []
- urls = set()
-
- def is_legal_url(format_url):
- return format_url and format_url not in urls and re.match(
- r'^(?:https?:)?//', format_url)
-
- QUALITY_LABELS = ('Laag', 'Normaal', 'Hoog')
- QUALITY_FORMATS = ('adaptive', 'wmv_sb', 'h264_sb', 'wmv_bb', 'h264_bb', 'wvc1_std', 'h264_std')
-
- quality_from_label = qualities(QUALITY_LABELS)
- quality_from_format_id = qualities(QUALITY_FORMATS)
- items = self._download_json(
- 'http://ida.omroep.nl/app.php/%s' % video_id, video_id,
- 'Downloading formats JSON', query={
- 'adaptive': 'yes',
- 'token': token,
- })['items'][0]
- for num, item in enumerate(items):
- item_url = item.get('url')
- if not is_legal_url(item_url):
- continue
- urls.add(item_url)
- format_id = self._search_regex(
- r'video/ida/([^/]+)', item_url, 'format id',
- default=None)
-
- item_label = item.get('label')
-
- def add_format_url(format_url):
- width = int_or_none(self._search_regex(
- r'(\d+)[xX]\d+', format_url, 'width', default=None))
- height = int_or_none(self._search_regex(
- r'\d+[xX](\d+)', format_url, 'height', default=None))
- if item_label in QUALITY_LABELS:
- quality = quality_from_label(item_label)
- f_id = item_label
- elif item_label in QUALITY_FORMATS:
- quality = quality_from_format_id(format_id)
- f_id = format_id
- else:
- quality, f_id = [None] * 2
- formats.append({
- 'url': format_url,
- 'format_id': f_id,
- 'width': width,
- 'height': height,
- 'quality': quality,
- })
-
- # Example: http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706
- if item.get('contentType') in ('url', 'audio'):
- add_format_url(item_url)
- continue
-
- try:
- stream_info = self._download_json(
- item_url + '&type=json', video_id,
- 'Downloading %s stream JSON'
- % item_label or item.get('format') or format_id or num)
- except ExtractorError as ee:
- if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404:
- error = (self._parse_json(
- ee.cause.read().decode(), video_id,
- fatal=False) or {}).get('errorstring')
- if error:
- raise ExtractorError(error, expected=True)
- raise
- # Stream URL instead of JSON, example: npo:LI_NL1_4188102
- if isinstance(stream_info, compat_str):
- if not stream_info.startswith('http'):
- continue
- video_url = stream_info
- # JSON
- else:
- video_url = stream_info.get('url')
- if not video_url or 'vodnotavailable.' in video_url or video_url in urls:
- continue
- urls.add(video_url)
- if determine_ext(video_url) == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
- video_url, video_id, ext='mp4',
- entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
- else:
- add_format_url(video_url)
-
- is_live = metadata.get('medium') == 'live'
-
- if not is_live:
- for num, stream in enumerate(metadata.get('streams', [])):
- stream_url = stream.get('url')
- if not is_legal_url(stream_url):
- continue
- urls.add(stream_url)
- # smooth streaming is not supported
- stream_type = stream.get('type', '').lower()
- if stream_type in ['ss', 'ms']:
- continue
- if stream_type == 'hds':
- f4m_formats = self._extract_f4m_formats(
- stream_url, video_id, fatal=False)
- # f4m downloader downloads only piece of live stream
- for f4m_format in f4m_formats:
- f4m_format['preference'] = -5
- formats.extend(f4m_formats)
- elif stream_type == 'hls':
- formats.extend(self._extract_m3u8_formats(
- stream_url, video_id, ext='mp4', fatal=False))
- # Example: http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706
- elif '.asf' in stream_url:
- asx = self._download_xml(
- stream_url, video_id,
- 'Downloading stream %d ASX playlist' % num,
- transform_source=fix_xml_ampersands, fatal=False)
- if not asx:
- continue
- ref = asx.find('./ENTRY/Ref')
- if ref is None:
- continue
- video_url = ref.get('href')
- if not video_url or video_url in urls:
- continue
- urls.add(video_url)
- formats.append({
- 'url': video_url,
- 'ext': stream.get('formaat', 'asf'),
- 'quality': stream.get('kwaliteit'),
- 'preference': -10,
- })
- else:
- formats.append({
- 'url': stream_url,
- 'quality': stream.get('kwaliteit'),
- })
-
- subtitles = {}
- if metadata.get('tt888') == 'ja':
- subtitles['nl'] = [{
- 'ext': 'vtt',
- 'url': 'http://tt888.omroep.nl/tt888/%s' % video_id,
- }]
-
- return {
- 'id': video_id,
- 'title': title,
- 'description': metadata.get('info'),
- 'thumbnail': metadata.get('images', [{'url': None}])[-1]['url'],
- 'upload_date': unified_strdate(metadata.get('gidsdatum')),
- 'duration': parse_duration(metadata.get('tijdsduur')),
- 'formats': formats,
- 'subtitles': subtitles,
- 'is_live': is_live,
- }
-
-class NPOLiveIE(NPOBaseIE):
+class NPOLiveIE(InfoExtractor):
IE_NAME = 'npo.nl:live'
_VALID_URL = r'https?://(?:www\.)?npo(?:start)?\.nl/live(?:/(?P<id>[^/?#&]+))?'
diff --git a/hypervideo_dl/extractor/nrk.py b/hypervideo_dl/extractor/nrk.py
index 88d08e5..384865a 100644
--- a/hypervideo_dl/extractor/nrk.py
+++ b/hypervideo_dl/extractor/nrk.py
@@ -3,7 +3,8 @@ import random
import re
from .common import InfoExtractor
-from ..compat import compat_HTTPError, compat_str
+from ..compat import compat_str
+from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
determine_ext,
@@ -148,7 +149,7 @@ class NRKIE(NRKBaseIE):
try:
return self._call_api(f'playback/{item}/program/{video_id}', video_id, item, query=query)
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 400:
return self._call_api(f'playback/{item}/{video_id}', video_id, item, query=query)
raise
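[Note] This hunk is the migration that recurs throughout this commit: the urllib-based compat_HTTPError (with .code, and an exception object that was read directly) gives way to ..networking.exceptions.HTTPError, which exposes .status and a .response to read from; packtpub, peloton and patreon below get the same treatment. A hedged sketch of the retry-on-400 idiom with the new class — call_api is a stand-in for NRK's _call_api, only HTTPError/ExtractorError are real names from this tree:

    from hypervideo_dl.networking.exceptions import HTTPError
    from hypervideo_dl.utils import ExtractorError

    def call_with_fallback(call_api, item, video_id):
        try:
            return call_api(f'playback/{item}/program/{video_id}')
        except ExtractorError as e:
            # .status replaces the old .code; .response replaces reading e.cause
            if isinstance(e.cause, HTTPError) and e.cause.status == 400:
                return call_api(f'playback/{item}/{video_id}')
            raise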
diff --git a/hypervideo_dl/extractor/ntvru.py b/hypervideo_dl/extractor/ntvru.py
index 8d5877d..91b7724 100644
--- a/hypervideo_dl/extractor/ntvru.py
+++ b/hypervideo_dl/extractor/ntvru.py
@@ -21,6 +21,7 @@ class NTVRuIE(InfoExtractor):
'description': 'Командующий Черноморским флотом провел переговоры в штабе ВМС Украины',
'thumbnail': r're:^http://.*\.jpg',
'duration': 136,
+ 'view_count': int,
},
}, {
'url': 'http://www.ntv.ru/video/novosti/750370/',
@@ -32,6 +33,7 @@ class NTVRuIE(InfoExtractor):
'description': 'Родные пассажиров пропавшего Boeing не верят в трагический исход',
'thumbnail': r're:^http://.*\.jpg',
'duration': 172,
+ 'view_count': int,
},
}, {
'url': 'http://www.ntv.ru/peredacha/segodnya/m23700/o232416',
@@ -43,6 +45,7 @@ class NTVRuIE(InfoExtractor):
'description': '«Сегодня». 21 марта 2014 года. 16:00',
'thumbnail': r're:^http://.*\.jpg',
'duration': 1496,
+ 'view_count': int,
},
}, {
'url': 'https://www.ntv.ru/kino/Koma_film/m70281/o336036/video/',
@@ -54,6 +57,7 @@ class NTVRuIE(InfoExtractor):
'description': 'Остросюжетный фильм «Кома»',
'thumbnail': r're:^http://.*\.jpg',
'duration': 5592,
+ 'view_count': int,
},
}, {
'url': 'http://www.ntv.ru/serial/Delo_vrachey/m31760/o233916/',
@@ -65,6 +69,7 @@ class NTVRuIE(InfoExtractor):
'description': '«Дело врачей»: «Деревце жизни»',
'thumbnail': r're:^http://.*\.jpg',
'duration': 2590,
+ 'view_count': int,
},
}, {
# Schemeless file URL
@@ -115,6 +120,14 @@ class NTVRuIE(InfoExtractor):
'url': file_,
'filesize': int_or_none(xpath_text(video, './%ssize' % format_id)),
})
+ hls_manifest = xpath_text(video, './playback/hls')
+ if hls_manifest:
+ formats.extend(self._extract_m3u8_formats(
+ hls_manifest, video_id, m3u8_id='hls', fatal=False))
+ dash_manifest = xpath_text(video, './playback/dash')
+ if dash_manifest:
+ formats.extend(self._extract_mpd_formats(
+ dash_manifest, video_id, mpd_id='dash', fatal=False))
return {
'id': xpath_text(video, './id'),
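[Note] Besides the progressive files, the NTV XML now carries optional <playback><hls> and <playback><dash> manifest URLs. A self-contained ElementTree check of that lookup; the XML snippet is fabricated to match the xpath_text() paths in the hunk:

    import xml.etree.ElementTree as ET

    def manifest_urls(video_xml):
        video = ET.fromstring(video_xml)
        return {'hls': video.findtext('./playback/hls'),
                'dash': video.findtext('./playback/dash')}

    print(manifest_urls(
        '<video><playback><hls>https://example.test/v.m3u8</hls>'
        '<dash>https://example.test/v.mpd</dash></playback></video>'))
    # -> {'hls': 'https://example.test/v.m3u8', 'dash': 'https://example.test/v.mpd'}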
diff --git a/hypervideo_dl/extractor/nubilesporn.py b/hypervideo_dl/extractor/nubilesporn.py
new file mode 100644
index 0000000..d4f1d9d
--- /dev/null
+++ b/hypervideo_dl/extractor/nubilesporn.py
@@ -0,0 +1,99 @@
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ clean_html,
+ float_or_none,
+ format_field,
+ get_element_by_class,
+ get_element_by_id,
+ get_element_html_by_class,
+ get_elements_by_class,
+ int_or_none,
+ try_call,
+ unified_timestamp,
+ urlencode_postdata,
+)
+
+
+class NubilesPornIE(InfoExtractor):
+ _NETRC_MACHINE = 'nubiles-porn'
+ _VALID_URL = r'''(?x)
+ https://members.nubiles-porn.com/video/watch/(?P<id>\d+)
+ (?:/(?P<display_id>[\w\-]+-s(?P<season>\d+)e(?P<episode>\d+)))?
+ '''
+
+ _TESTS = [{
+ 'url': 'https://members.nubiles-porn.com/video/watch/165320/trying-to-focus-my-one-track-mind-s3e1',
+ 'md5': 'fa7f09da8027c35e4bdf0f94f55eac82',
+ 'info_dict': {
+ 'id': '165320',
+ 'title': 'Trying To Focus My One Track Mind - S3:E1',
+ 'ext': 'mp4',
+ 'display_id': 'trying-to-focus-my-one-track-mind-s3e1',
+ 'thumbnail': 'https://images.nubiles-porn.com/videos/trying_to_focus_my_one_track_mind/samples/cover1280.jpg',
+ 'description': 'md5:81f3d4372e0e39bff5c801da277a5141',
+ 'timestamp': 1676160000,
+ 'upload_date': '20230212',
+ 'channel': 'Younger Mommy',
+ 'channel_id': '64',
+ 'channel_url': 'https://members.nubiles-porn.com/video/website/64',
+ 'like_count': int,
+ 'average_rating': float,
+ 'age_limit': 18,
+ 'categories': ['Big Boobs', 'Big Naturals', 'Blowjob', 'Brunette', 'Cowgirl', 'Girl Orgasm', 'Girl-Boy',
+ 'Glasses', 'Hardcore', 'Milf', 'Shaved Pussy', 'Tattoos', 'YoungerMommy.com'],
+ 'tags': list,
+ 'cast': ['Kenzie Love'],
+ 'availability': 'needs_auth',
+ 'series': 'Younger Mommy',
+ 'series_id': '64',
+ 'season': 'Season 3',
+ 'season_number': 3,
+ 'episode': 'Episode 1',
+ 'episode_number': 1
+ }
+ }]
+
+ def _perform_login(self, username, password):
+ login_webpage = self._download_webpage('https://nubiles-porn.com/login', video_id=None)
+ inputs = self._hidden_inputs(login_webpage)
+ inputs.update({'username': username, 'password': password})
+ self._request_webpage('https://nubiles-porn.com/authentication/login', None, data=urlencode_postdata(inputs))
+
+ def _real_extract(self, url):
+ url_match = self._match_valid_url(url)
+ video_id = url_match.group('id')
+ page = self._download_webpage(url, video_id)
+
+ media_entries = self._parse_html5_media_entries(
+ url, get_element_by_class('watch-page-video-wrapper', page), video_id)[0]
+
+ channel_id, channel_name = self._search_regex(
+ r'/video/website/(?P<id>\d+).+>(?P<name>\w+).com', get_element_html_by_class('site-link', page),
+ 'channel', fatal=False, group=('id', 'name')) or (None, None)
+ channel_name = re.sub(r'([^A-Z]+)([A-Z]+)', r'\1 \2', channel_name)
+
+ return {
+ 'id': video_id,
+ 'title': self._search_regex('<h2>([^<]+)</h2>', page, 'title', fatal=False),
+ 'formats': media_entries.get('formats'),
+ 'display_id': url_match.group('display_id'),
+ 'thumbnail': media_entries.get('thumbnail'),
+ 'description': clean_html(get_element_html_by_class('content-pane-description', page)),
+ 'timestamp': unified_timestamp(get_element_by_class('date', page)),
+ 'channel': channel_name,
+ 'channel_id': channel_id,
+ 'channel_url': format_field(channel_id, None, 'https://members.nubiles-porn.com/video/website/%s'),
+ 'like_count': int_or_none(get_element_by_id('likecount', page)),
+ 'average_rating': float_or_none(get_element_by_class('score', page)),
+ 'age_limit': 18,
+ 'categories': try_call(lambda: list(map(clean_html, get_elements_by_class('btn', get_element_by_class('categories', page))))),
+ 'tags': try_call(lambda: list(map(clean_html, get_elements_by_class('btn', get_elements_by_class('tags', page)[1])))),
+ 'cast': get_elements_by_class('content-pane-performer', page),
+ 'availability': 'needs_auth',
+ 'series': channel_name,
+ 'series_id': channel_id,
+ 'season_number': int_or_none(url_match.group('season')),
+ 'episode_number': int_or_none(url_match.group('episode'))
+ }
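[Note] The login in this new extractor is the usual hidden-form round trip: scrape the hidden inputs (CSRF token and friends) from the login page, merge in the credentials, and POST everything back. A rough stdlib equivalent of _hidden_inputs() + urlencode_postdata(); the 'username'/'password' field names are taken from the code above, the parser is illustrative:

    import urllib.parse
    import urllib.request
    from html.parser import HTMLParser

    class HiddenInputs(HTMLParser):
        def __init__(self):
            super().__init__()
            self.fields = {}

        def handle_starttag(self, tag, attrs):
            a = dict(attrs)
            if tag == 'input' and a.get('type') == 'hidden' and a.get('name'):
                self.fields[a['name']] = a.get('value') or ''

    def build_login_request(login_page_html, username, password):
        parser = HiddenInputs()
        parser.feed(login_page_html)  # collects CSRF tokens and the like
        fields = {**parser.fields, 'username': username, 'password': password}
        return urllib.request.Request(
            'https://nubiles-porn.com/authentication/login',
            data=urllib.parse.urlencode(fields).encode())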
diff --git a/hypervideo_dl/extractor/nzonscreen.py b/hypervideo_dl/extractor/nzonscreen.py
new file mode 100644
index 0000000..6926bc5
--- /dev/null
+++ b/hypervideo_dl/extractor/nzonscreen.py
@@ -0,0 +1,93 @@
+from .common import InfoExtractor
+from ..utils import (
+ float_or_none,
+ int_or_none,
+ remove_end,
+ strip_or_none,
+ traverse_obj,
+ url_or_none,
+)
+
+
+class NZOnScreenIE(InfoExtractor):
+ _VALID_URL = r'^https://www\.nzonscreen\.com/title/(?P<id>[^/?#]+)'
+ _TESTS = [{
+ 'url': 'https://www.nzonscreen.com/title/shoop-shoop-diddy-wop-cumma-cumma-wang-dang-1982',
+ 'info_dict': {
+ 'id': '726ed6585c6bfb30',
+ 'ext': 'mp4',
+ 'format_id': 'hi',
+ 'display_id': 'shoop-shoop-diddy-wop-cumma-cumma-wang-dang-1982',
+ 'title': 'Monte Video - "Shoop Shoop, Diddy Wop"',
+ 'description': 'Monte Video - "Shoop Shoop, Diddy Wop"',
+ 'alt_title': 'Shoop Shoop Diddy Wop Cumma Cumma Wang Dang | Music Video',
+ 'thumbnail': r're:https://www\.nzonscreen\.com/content/images/.+\.jpg',
+ 'duration': 158,
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }, {
+ 'url': 'https://www.nzonscreen.com/title/shes-a-mod-1964?collection=best-of-the-60s',
+ 'info_dict': {
+ 'id': '3dbe709ff03c36f1',
+ 'ext': 'mp4',
+ 'format_id': 'hi',
+ 'display_id': 'shes-a-mod-1964',
+ 'title': 'Ray Columbus - \'She\'s A Mod\'',
+ 'description': 'Ray Columbus - \'She\'s A Mod\'',
+ 'alt_title': 'She\'s a Mod | Music Video',
+ 'thumbnail': r're:https://www\.nzonscreen\.com/content/images/.+\.jpg',
+ 'duration': 130,
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }, {
+ 'url': 'https://www.nzonscreen.com/title/puha-and-pakeha-1968/overview',
+ 'info_dict': {
+ 'id': 'f86342544385ad8a',
+ 'ext': 'mp4',
+ 'format_id': 'hi',
+ 'display_id': 'puha-and-pakeha-1968',
+ 'title': 'Looking At New Zealand - Puha and Pakeha',
+ 'alt_title': 'Looking at New Zealand - \'Pūhā and Pākehā\' | Television',
+ 'description': 'An excerpt from this television programme.',
+ 'duration': 212,
+ 'thumbnail': r're:https://www\.nzonscreen\.com/content/images/.+\.jpg',
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }]
+
+ def _extract_formats(self, playlist):
+ for quality, (id_, url) in enumerate(traverse_obj(
+ playlist, ('h264', {'lo': 'lo_res', 'hi': 'hi_res'}), expected_type=url_or_none).items()):
+ yield {
+ 'url': url,
+ 'format_id': id_,
+ 'ext': 'mp4',
+ 'quality': quality,
+ 'height': int_or_none(playlist.get('height')) if id_ == 'hi' else None,
+ 'width': int_or_none(playlist.get('width')) if id_ == 'hi' else None,
+ 'filesize_approx': float_or_none(traverse_obj(playlist, ('h264', f'{id_}_res_mb')), invscale=1024**2),
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ playlist = self._parse_json(self._html_search_regex(
+ r'data-video-config=\'([^\']+)\'', webpage, 'media data'), video_id)
+
+ return {
+ 'id': playlist['uuid'],
+ 'display_id': video_id,
+ 'title': strip_or_none(playlist.get('label')),
+ 'description': strip_or_none(playlist.get('description')),
+ 'alt_title': strip_or_none(remove_end(
+ self._html_extract_title(webpage, default=None) or self._og_search_title(webpage),
+ ' | NZ On Screen')),
+ 'thumbnail': traverse_obj(playlist, ('thumbnail', 'path')),
+ 'duration': float_or_none(playlist.get('duration')),
+ 'formats': list(self._extract_formats(playlist)),
+ 'http_headers': {
+ 'Referer': 'https://www.nzonscreen.com/',
+ 'Origin': 'https://www.nzonscreen.com/',
+ }
+ }
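[Note] _extract_formats above leans on the dict-path feature of traverse_obj: {'lo': 'lo_res', 'hi': 'hi_res'} renames keys while traversing, and the enumerate() order doubles as the quality ranking. A toy run; the playlist dict is invented sample data:

    from hypervideo_dl.utils import traverse_obj, url_or_none

    playlist = {'h264': {'lo_res': 'https://cdn.test/lo.mp4',
                         'hi_res': 'https://cdn.test/hi.mp4',
                         'hi_res_mb': 12.3}}
    print(traverse_obj(playlist, ('h264', {'lo': 'lo_res', 'hi': 'hi_res'}),
                       expected_type=url_or_none))
    # -> {'lo': 'https://cdn.test/lo.mp4', 'hi': 'https://cdn.test/hi.mp4'}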
diff --git a/hypervideo_dl/extractor/odkmedia.py b/hypervideo_dl/extractor/odkmedia.py
new file mode 100644
index 0000000..b852160
--- /dev/null
+++ b/hypervideo_dl/extractor/odkmedia.py
@@ -0,0 +1,105 @@
+import json
+
+from .common import InfoExtractor
+from ..networking.exceptions import HTTPError
+from ..utils import (
+ ExtractorError,
+ GeoRestrictedError,
+ float_or_none,
+ traverse_obj,
+ try_call
+)
+
+
+class OnDemandChinaEpisodeIE(InfoExtractor):
+ _VALID_URL = r'https?://www\.ondemandchina\.com/\w+/watch/(?P<series>[\w-]+)/(?P<id>ep-(?P<ep>\d+))'
+ _TESTS = [{
+ 'url': 'https://www.ondemandchina.com/en/watch/together-against-covid-19/ep-1',
+ 'info_dict': {
+ 'id': '264394',
+ 'ext': 'mp4',
+ 'duration': 3256.88,
+ 'title': 'EP 1 The Calling',
+ 'alt_title': '第1集 令出如山',
+ 'thumbnail': 'https://d2y2efdi5wgkcl.cloudfront.net/fit-in/256x256/media-io/2020/9/11/image.d9816e81.jpg',
+ 'description': '疫情严峻,党政军民学、东西南北中协同应考',
+ 'tags': ['Social Humanities', 'Documentary', 'Medical', 'Social'],
+ }
+ }]
+
+ _QUERY = '''
+ query Episode($programSlug: String!, $episodeNumber: Int!) {
+ episode(
+ programSlug: $programSlug
+ episodeNumber: $episodeNumber
+ kind: "series"
+ part: null
+ ) {
+ id
+ title
+ titleEn
+ titleKo
+ titleZhHans
+ titleZhHant
+ synopsis
+ synopsisEn
+ synopsisKo
+ synopsisZhHans
+ synopsisZhHant
+ videoDuration
+ images {
+ thumbnail
+ }
+ }
+ }'''
+
+ def _real_extract(self, url):
+ program_slug, display_id, ep_number = self._match_valid_url(url).group('series', 'id', 'ep')
+ webpage = self._download_webpage(url, display_id)
+
+ video_info = self._download_json(
+ 'https://odc-graphql.odkmedia.io/graphql', display_id,
+ headers={'Content-type': 'application/json'},
+ data=json.dumps({
+ 'operationName': 'Episode',
+ 'query': self._QUERY,
+ 'variables': {
+ 'programSlug': program_slug,
+ 'episodeNumber': int(ep_number),
+ },
+ }).encode())['data']['episode']
+
+ try:
+ source_json = self._download_json(
+ f'https://odkmedia.io/odc/api/v2/playback/{video_info["id"]}/', display_id,
+ headers={'Authorization': '', 'service-name': 'odc'})
+ except ExtractorError as e:
+ if isinstance(e.cause, HTTPError):
+ error_data = self._parse_json(e.cause.response.read(), display_id)['detail']
+ raise GeoRestrictedError(error_data)
+ raise  # re-raise non-HTTP causes so source_json is never left unset
+
+ formats, subtitles = [], {}
+ for source in traverse_obj(source_json, ('sources', ...)):
+ if source.get('type') == 'hls':
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(source.get('url'), display_id)
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=subtitles)
+ else:
+ self.report_warning(f'Unsupported format {source.get("type")}', display_id)
+
+ return {
+ 'id': str(video_info['id']),
+ 'duration': float_or_none(video_info.get('videoDuration'), 1000),
+ 'thumbnail': (traverse_obj(video_info, ('images', 'thumbnail'))
+ or self._html_search_meta(['og:image', 'twitter:image'], webpage)),
+ 'title': (traverse_obj(video_info, 'title', 'titleEn')
+ or self._html_search_meta(['og:title', 'twitter:title'], webpage)
+ or self._html_extract_title(webpage)),
+ 'alt_title': traverse_obj(video_info, 'titleKo', 'titleZhHans', 'titleZhHant'),
+ 'description': (traverse_obj(
+ video_info, 'synopsisEn', 'synopsisKo', 'synopsisZhHans', 'synopsisZhHant', 'synopsis')
+ or self._html_search_meta(['og:description', 'twitter:description', 'description'], webpage)),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'tags': try_call(lambda: self._html_search_meta('keywords', webpage).split(', '))
+ }
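[Note] Metadata here comes from a single GraphQL POST rather than page scraping. The request shape, reduced to the stdlib; the endpoint and operation name are from the extractor, the trimmed selection set is an illustrative subset:

    import json
    import urllib.request

    QUERY = '''query Episode($programSlug: String!, $episodeNumber: Int!) {
      episode(programSlug: $programSlug, episodeNumber: $episodeNumber,
              kind: "series", part: null) { id title videoDuration }
    }'''

    def fetch_episode(program_slug, episode_number):
        req = urllib.request.Request(
            'https://odc-graphql.odkmedia.io/graphql',
            data=json.dumps({
                'operationName': 'Episode',
                'query': QUERY,
                'variables': {'programSlug': program_slug,
                              'episodeNumber': int(episode_number)},
            }).encode(),
            headers={'Content-type': 'application/json'})
        with urllib.request.urlopen(req) as resp:
            return json.load(resp)['data']['episode']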
diff --git a/hypervideo_dl/extractor/odnoklassniki.py b/hypervideo_dl/extractor/odnoklassniki.py
index 4f325f0..1be45d8 100644
--- a/hypervideo_dl/extractor/odnoklassniki.py
+++ b/hypervideo_dl/extractor/odnoklassniki.py
@@ -1,3 +1,5 @@
+import urllib.parse
+
from .common import InfoExtractor
from ..compat import (
compat_etree_fromstring,
@@ -5,15 +7,18 @@ from ..compat import (
compat_urllib_parse_unquote,
compat_urllib_parse_urlparse,
)
+from ..networking import HEADRequest
from ..utils import (
ExtractorError,
float_or_none,
int_or_none,
qualities,
smuggle_url,
+ traverse_obj,
unescapeHTML,
unified_strdate,
unsmuggle_url,
+ url_or_none,
urlencode_postdata,
)
@@ -40,7 +45,7 @@ class OdnoklassnikiIE(InfoExtractor):
'ext': 'mp4',
'timestamp': 1545580896,
'view_count': int,
- 'thumbnail': 'https://coub-attachments.akamaized.net/coub_storage/coub/simple/cw_image/c5ac87553bd/608e806a1239c210ab692/1545580913_00026.jpg',
+ 'thumbnail': r're:^https?://.*\.jpg$',
'title': 'Народная забава',
'uploader': 'Nevata',
'upload_date': '20181223',
@@ -64,13 +69,14 @@ class OdnoklassnikiIE(InfoExtractor):
'title': str,
'uploader': str,
},
+ 'skip': 'vk extractor error',
}, {
- # metadata in JSON
+ # metadata in JSON, webm_dash with Firefox UA
'url': 'http://ok.ru/video/20079905452',
- 'md5': '5d2b64756e2af296e3b383a0bc02a6aa',
+ 'md5': '8f477d8931c531374a3e36daec617b2c',
'info_dict': {
'id': '20079905452',
- 'ext': 'mp4',
+ 'ext': 'webm',
'title': 'Культура меняет нас (прекрасный ролик!))',
'thumbnail': str,
'duration': 100,
@@ -80,10 +86,14 @@ class OdnoklassnikiIE(InfoExtractor):
'like_count': int,
'age_limit': 0,
},
+ 'params': {
+ 'format': 'bv[ext=webm]',
+ 'http_headers': {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:102.0) Gecko/20100101 Firefox/102.0'},
+ },
}, {
# metadataUrl
'url': 'http://ok.ru/video/63567059965189-0?fromTime=5',
- 'md5': 'f8c951122516af72e6e6ffdd3c41103b',
+ 'md5': '2bae2f58eefe1b3d26f3926c4a64d2f3',
'info_dict': {
'id': '63567059965189-0',
'ext': 'mp4',
@@ -97,10 +107,11 @@ class OdnoklassnikiIE(InfoExtractor):
'age_limit': 0,
'start_time': 5,
},
+ 'params': {'skip_download': 'm3u8'},
}, {
# YouTube embed (metadataUrl, provider == USER_YOUTUBE)
'url': 'https://ok.ru/video/3952212382174',
- 'md5': '91749d0bd20763a28d083fa335bbd37a',
+ 'md5': '5fb5f83ce16cb212d6bf887282b5da53',
'info_dict': {
'id': '5axVgHHDBvU',
'ext': 'mp4',
@@ -115,7 +126,7 @@ class OdnoklassnikiIE(InfoExtractor):
'live_status': 'not_live',
'view_count': int,
'thumbnail': 'https://i.mycdn.me/i?r=AEHujHvw2RjEbemUCNEorZbxYpb_p_9AcN2FmGik64Krkcmz37YtlY093oAM5-HIEAt7Zi9s0CiBOSDmbngC-I-k&fn=external_8',
- 'uploader_url': 'http://www.youtube.com/user/MrKewlkid94',
+ 'uploader_url': 'https://www.youtube.com/@MrKewlkid94',
'channel_follower_count': int,
'tags': ['youtube-dl', 'youtube playlists', 'download videos', 'download audio'],
'channel_id': 'UCVGtvURtEURYHtJFUegdSug',
@@ -144,7 +155,6 @@ class OdnoklassnikiIE(InfoExtractor):
},
'skip': 'Video has not been found',
}, {
- # TODO: HTTP Error 400: Bad Request, it only works if there's no cookies when downloading
'note': 'Only available in mobile webpage',
'url': 'https://m.ok.ru/video/2361249957145',
'info_dict': {
@@ -152,6 +162,26 @@ class OdnoklassnikiIE(InfoExtractor):
'ext': 'mp4',
'title': 'Быковское крещение',
'duration': 3038.181,
+ 'thumbnail': r're:^https?://i\.mycdn\.me/videoPreview\?.+',
+ },
+ }, {
+ 'note': 'subtitles',
+ 'url': 'https://ok.ru/video/4249587550747',
+ 'info_dict': {
+ 'id': '4249587550747',
+ 'ext': 'mp4',
+ 'title': 'Small Country An African Childhood (2020) (1080p) +subtitle',
+ 'uploader': 'Sunflower Movies',
+ 'uploader_id': '595802161179',
+ 'upload_date': '20220816',
+ 'duration': 6728,
+ 'age_limit': 0,
+ 'thumbnail': r're:^https?://i\.mycdn\.me/videoPreview\?.+',
+ 'like_count': int,
+ 'subtitles': dict,
+ },
+ 'params': {
+ 'skip_download': True,
},
}, {
'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452',
@@ -202,8 +232,15 @@ class OdnoklassnikiIE(InfoExtractor):
'like_count': 0,
'duration': 10444,
},
+ 'skip': 'Site no longer embeds',
}]
+ def _clear_cookies(self, cdn_url):
+ # Direct http downloads will fail if CDN cookies are set
+ # so we need to reset them after each format extraction
+ self.cookiejar.clear(domain='.mycdn.me')
+ self.cookiejar.clear(domain=urllib.parse.urlparse(cdn_url).hostname)
+
@classmethod
def _extract_embed_urls(cls, url, webpage):
for x in super()._extract_embed_urls(url, webpage):
@@ -294,6 +331,16 @@ class OdnoklassnikiIE(InfoExtractor):
like_count = int_or_none(metadata.get('likeCount'))
+ subtitles = {}
+ for sub in traverse_obj(metadata, ('movie', 'subtitleTracks', ...), expected_type=dict):
+ sub_url = sub.get('url')
+ if not sub_url:
+ continue
+ subtitles.setdefault(sub.get('language') or 'en', []).append({
+ 'url': sub_url,
+ 'ext': 'vtt',
+ })
+
info = {
'id': video_id,
'title': title,
@@ -305,6 +352,7 @@ class OdnoklassnikiIE(InfoExtractor):
'like_count': like_count,
'age_limit': age_limit,
'start_time': start_time,
+ 'subtitles': subtitles,
}
# pladform
@@ -331,14 +379,22 @@ class OdnoklassnikiIE(InfoExtractor):
formats = [{
'url': f['url'],
'ext': 'mp4',
- 'format_id': f['name'],
- } for f in metadata['videos']]
+ 'format_id': f.get('name'),
+ } for f in traverse_obj(metadata, ('videos', lambda _, v: url_or_none(v['url'])))]
- m3u8_url = metadata.get('hlsManifestUrl')
+ m3u8_url = traverse_obj(metadata, 'hlsManifestUrl', 'ondemandHls')
if m3u8_url:
formats.extend(self._extract_m3u8_formats(
m3u8_url, video_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False))
+ self._clear_cookies(m3u8_url)
+
+ for mpd_id, mpd_key in [('dash', 'ondemandDash'), ('webm', 'metadataWebmUrl')]:
+ mpd_url = metadata.get(mpd_key)
+ if mpd_url:
+ formats.extend(self._extract_mpd_formats(
+ mpd_url, video_id, mpd_id=mpd_id, fatal=False))
+ self._clear_cookies(mpd_url)
dash_manifest = metadata.get('metadataEmbedded')
if dash_manifest:
@@ -357,6 +413,7 @@ class OdnoklassnikiIE(InfoExtractor):
if m3u8_url:
formats.extend(self._extract_m3u8_formats(
m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
+ self._clear_cookies(m3u8_url)
rtmp_url = metadata.get('rtmpUrl')
if rtmp_url:
formats.append({
@@ -390,6 +447,10 @@ class OdnoklassnikiIE(InfoExtractor):
r'data-video="(.+?)"', webpage, 'json data')
json_data = self._parse_json(unescapeHTML(json_data), video_id) or {}
+ redirect_url = self._request_webpage(HEADRequest(
+ json_data['videoSrc']), video_id, 'Requesting download URL').url
+ self._clear_cookies(redirect_url)
+
return {
'id': video_id,
'title': json_data.get('videoName'),
@@ -397,7 +458,7 @@ class OdnoklassnikiIE(InfoExtractor):
'thumbnail': json_data.get('videoPosterSrc'),
'formats': [{
'format_id': 'mobile',
- 'url': json_data.get('videoSrc'),
+ 'url': redirect_url,
'ext': 'mp4',
}]
}
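[Note] The core of the Odnoklassniki change is cookie hygiene: fetching an HLS/DASH manifest sets CDN cookies that make later direct HTTP downloads fail, so _clear_cookies wipes them after every manifest request (and the mobile path now resolves its redirect with a HEADRequest before doing the same). The equivalent with a plain stdlib CookieJar; the domains mirror _clear_cookies above, the jar itself is illustrative:

    import urllib.parse
    from http.cookiejar import CookieJar

    def clear_cdn_cookies(jar: CookieJar, cdn_url: str):
        # Direct downloads fail while CDN cookies are set, so drop anything
        # scoped to .mycdn.me or to the manifest's own host
        for domain in ('.mycdn.me', urllib.parse.urlparse(cdn_url).hostname):
            try:
                jar.clear(domain=domain)
            except KeyError:  # CookieJar.clear() raises if nothing matched
                pass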
diff --git a/hypervideo_dl/extractor/oneplace.py b/hypervideo_dl/extractor/oneplace.py
new file mode 100644
index 0000000..86337ad
--- /dev/null
+++ b/hypervideo_dl/extractor/oneplace.py
@@ -0,0 +1,43 @@
+from .common import InfoExtractor
+
+
+class OnePlacePodcastIE(InfoExtractor):
+ _VALID_URL = r'https?://www\.oneplace\.com/[\w]+/[^/]+/listen/[\w-]+-(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://www.oneplace.com/ministries/a-daily-walk/listen/living-in-the-last-days-part-2-958461.html',
+ 'info_dict': {
+ 'id': '958461',
+ 'ext': 'mp3',
+ 'title': 'Living in the Last Days Part 2 | A Daily Walk with John Randall',
+ 'description': 'md5:fbb8f1cf21447ac54ecaa2887fc20c6e',
+ }
+ }, {
+ 'url': 'https://www.oneplace.com/ministries/ankerberg-show/listen/ep-3-relying-on-the-constant-companionship-of-the-holy-spirit-part-2-922513.html',
+ 'info_dict': {
+ 'id': '922513',
+ 'ext': 'mp3',
+ 'description': 'md5:8b810b4349aa40a5d033b4536fe428e1',
+ 'title': 'md5:ce10f7d8d5ddcf485ed8905ef109659d',
+ }
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ return {
+ 'id': video_id,
+ 'url': self._search_regex((
+ r'mp3-url\s*=\s*"([^"]+)',
+ r'<div[^>]+id\s*=\s*"player"[^>]+data-media-url\s*=\s*"(?P<media_url>[^"]+)',
+ ), webpage, 'media url'),
+ 'ext': 'mp3',
+ 'vcodec': 'none',
+ 'title': self._html_search_regex((
+ r'<div[^>]class\s*=\s*"details"[^>]+>[^<]<h2[^>]+>(?P<content>[^>]+)>',
+ self._meta_regex('og:title'), self._meta_regex('title'),
+ ), webpage, 'title', group='content', default=None),
+ 'description': self._html_search_regex(
+ r'<div[^>]+class="[^"]+epDesc"[^>]*>\s*(?P<desc>.+?)\s*</div>',
+ webpage, 'description', default=None),
+ }
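[Note] The new OnePlace extractor shows the fallback-pattern idiom: _search_regex and _html_search_regex accept a tuple of patterns and return the first one that matches. A stdlib re equivalent, run against a fabricated player div:

    import re

    def first_match(patterns, text):
        for pattern in patterns:
            if m := re.search(pattern, text):
                return m.group(m.lastindex or 0)
        return None

    page = '<div id="player" data-media-url="https://cdn.test/episode.mp3"></div>'
    print(first_match((
        r'mp3-url\s*=\s*"([^"]+)',
        r'<div[^>]+id\s*=\s*"player"[^>]+data-media-url\s*=\s*"(?P<media_url>[^"]+)',
    ), page))  # -> https://cdn.test/episode.mp3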
diff --git a/hypervideo_dl/extractor/opencast.py b/hypervideo_dl/extractor/opencast.py
index fa46757..1fafd9a 100644
--- a/hypervideo_dl/extractor/opencast.py
+++ b/hypervideo_dl/extractor/opencast.py
@@ -55,9 +55,9 @@ class OpencastBaseIE(InfoExtractor):
transport = track.get('transport')
if transport == 'DASH' or ext == 'mpd':
- formats.extend(self._extract_mpd_formats_and_subtitles(href, video_id, mpd_id='dash', fatal=False))
+ formats.extend(self._extract_mpd_formats(href, video_id, mpd_id='dash', fatal=False))
elif transport == 'HLS' or ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats_and_subtitles(
+ formats.extend(self._extract_m3u8_formats(
href, video_id, m3u8_id='hls', entry_protocol='m3u8_native', fatal=False))
elif transport == 'HDS' or ext == 'f4m':
formats.extend(self._extract_f4m_formats(href, video_id, f4m_id='hds', fatal=False))
@@ -105,10 +105,9 @@ class OpencastBaseIE(InfoExtractor):
class OpencastIE(OpencastBaseIE):
- _VALID_URL = r'''(?x)
- https?://(?P<host>%s)/paella/ui/watch.html\?.*?
- id=(?P<id>%s)
- ''' % (OpencastBaseIE._INSTANCES_RE, OpencastBaseIE._UUID_RE)
+ _VALID_URL = rf'''(?x)
+ https?://(?P<host>{OpencastBaseIE._INSTANCES_RE})/paella/ui/watch\.html\?
+ (?:[^#]+&)?id=(?P<id>{OpencastBaseIE._UUID_RE})'''
_API_BASE = 'https://%s/search/episode.json?id=%s'
@@ -123,6 +122,9 @@ class OpencastIE(OpencastBaseIE):
'thumbnail': r're:^https?://.*\.jpg$',
'timestamp': 1606208400,
'upload_date': '20201124',
+ 'season_id': 'cf68a4a1-36b1-4a53-a6ba-61af5705a0d0',
+ 'series': 'Kryptographie - WiSe 15/16',
+ 'creator': 'Alexander May',
},
}
]
@@ -134,10 +136,11 @@ class OpencastIE(OpencastBaseIE):
class OpencastPlaylistIE(OpencastBaseIE):
- _VALID_URL = r'''(?x)
- https?://(?P<host>%s)/engage/ui/index.html\?.*?
- epFrom=(?P<id>%s)
- ''' % (OpencastBaseIE._INSTANCES_RE, OpencastBaseIE._UUID_RE)
+ _VALID_URL = rf'''(?x)
+ https?://(?P<host>{OpencastBaseIE._INSTANCES_RE})(?:
+ /engage/ui/index\.html\?(?:[^#]+&)?epFrom=|
+ /ltitools/index\.html\?(?:[^#]+&)?series=
+ )(?P<id>{OpencastBaseIE._UUID_RE})'''
_API_BASE = 'https://%s/search/episode.json?sid=%s'
@@ -148,15 +151,23 @@ class OpencastPlaylistIE(OpencastBaseIE):
'id': 'cf68a4a1-36b1-4a53-a6ba-61af5705a0d0',
'title': 'Kryptographie - WiSe 15/16',
},
- 'playlist_mincount': 28,
+ 'playlist_mincount': 29,
},
{
- 'url': 'https://oc-video.ruhr-uni-bochum.de/engage/ui/index.html?e=1&p=1&epFrom=b1a54262-3684-403f-9731-8e77c3766f9a',
+ 'url': 'https://oc-video1.ruhr-uni-bochum.de/ltitools/index.html?subtool=series&series=cf68a4a1-36b1-4a53-a6ba-61af5705a0d0&lng=de',
'info_dict': {
- 'id': 'b1a54262-3684-403f-9731-8e77c3766f9a',
- 'title': 'inSTUDIES-Social movements and prefigurative politics in a global perspective',
+ 'id': 'cf68a4a1-36b1-4a53-a6ba-61af5705a0d0',
+ 'title': 'Kryptographie - WiSe 15/16',
+ },
+ 'playlist_mincount': 29,
+ },
+ {
+ 'url': 'https://electures.uni-muenster.de/engage/ui/index.html?e=1&p=1&epFrom=39391d10-a711-4d23-b21d-afd2ed7d758c',
+ 'info_dict': {
+ 'id': '39391d10-a711-4d23-b21d-afd2ed7d758c',
+ 'title': '021670 Theologische Themen bei Hans Blumenberg WiSe 2017/18',
},
- 'playlist_mincount': 6,
+ 'playlist_mincount': 13,
},
]
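[Note] Both Opencast patterns are loosened the same way: the (?:[^#]+&)? prefix in front of the id=/epFrom=/series= parameter lets it sit anywhere in the query string instead of only first. A quick check with a constructed URL; the UUID is borrowed from the tests above, and the UUID regex is a stand-in for _UUID_RE:

    import re

    UUID_RE = r'[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12}'
    pattern = rf'https?://[^/]+/paella/ui/watch\.html\?(?:[^#]+&)?id=(?P<id>{UUID_RE})'

    url = ('https://electures.uni-muenster.de/paella/ui/watch.html'
           '?lang=de&id=39391d10-a711-4d23-b21d-afd2ed7d758c')
    print(re.match(pattern, url).group('id'))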
diff --git a/hypervideo_dl/extractor/orf.py b/hypervideo_dl/extractor/orf.py
index e9d23a4..cc3c003 100644
--- a/hypervideo_dl/extractor/orf.py
+++ b/hypervideo_dl/extractor/orf.py
@@ -2,11 +2,11 @@ import functools
import re
from .common import InfoExtractor
+from ..networking import HEADRequest
from ..utils import (
clean_html,
determine_ext,
float_or_none,
- HEADRequest,
InAdvancePagedList,
int_or_none,
join_nonempty,
diff --git a/hypervideo_dl/extractor/owncloud.py b/hypervideo_dl/extractor/owncloud.py
new file mode 100644
index 0000000..79fd830
--- /dev/null
+++ b/hypervideo_dl/extractor/owncloud.py
@@ -0,0 +1,80 @@
+import re
+import urllib.parse
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ determine_ext,
+ url_or_none,
+ urlencode_postdata,
+)
+
+
+class OwnCloudIE(InfoExtractor):
+ _INSTANCES_RE = '|'.join((
+ r'(?:[^\.]+\.)?sciebo\.de',
+ r'cloud\.uni-koblenz-landau\.de',
+ ))
+ _VALID_URL = rf'https?://(?:{_INSTANCES_RE})/s/(?P<id>[\w.-]+)'
+
+ _TESTS = [
+ {
+ 'url': 'https://ruhr-uni-bochum.sciebo.de/s/wWhqZzh9jTumVFN',
+ 'info_dict': {
+ 'id': 'wWhqZzh9jTumVFN',
+ 'ext': 'mp4',
+ 'title': 'CmvpJST.mp4',
+ },
+ },
+ {
+ 'url': 'https://ruhr-uni-bochum.sciebo.de/s/WNDuFu0XuFtmm3f',
+ 'info_dict': {
+ 'id': 'WNDuFu0XuFtmm3f',
+ 'ext': 'mp4',
+ 'title': 'CmvpJST.mp4',
+ },
+ 'params': {
+ 'videopassword': '12345',
+ },
+ },
+ ]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage, urlh = self._download_webpage_handle(url, video_id)
+
+ if re.search(r'<label[^>]+for="password"', webpage):
+ webpage = self._verify_video_password(webpage, urlh.url, video_id)
+
+ hidden_inputs = self._hidden_inputs(webpage)
+ title = hidden_inputs.get('filename')
+ parsed_url = urllib.parse.urlparse(url)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'url': url_or_none(hidden_inputs.get('downloadURL')) or parsed_url._replace(
+ path=urllib.parse.urljoin(parsed_url.path, 'download')).geturl(),
+ 'ext': determine_ext(title),
+ }
+
+ def _verify_video_password(self, webpage, url, video_id):
+ password = self.get_param('videopassword')
+ if password is None:
+ raise ExtractorError(
+ 'This video is protected by a password, use the --video-password option',
+ expected=True)
+
+ validation_response = self._download_webpage(
+ url, video_id, 'Validating Password', 'Wrong password?',
+ data=urlencode_postdata({
+ 'requesttoken': self._hidden_inputs(webpage)['requesttoken'],
+ 'password': password,
+ }))
+
+ if re.search(r'<label[^>]+for="password"', validation_response):
+ warning = self._search_regex(
+ r'<div[^>]+class="warning">([^<]*)</div>', validation_response,
+ 'warning', default='The password is wrong')
+ raise ExtractorError(f'Opening the video failed, {self.IE_NAME} said: {warning!r}', expected=True)
+ return validation_response
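[Note] Password-protected shares are recognized purely by form markup: a <label for="password"> in the page means a prompt, and a successful unlock is detected by that form disappearing from the re-rendered response. Condensed, with a toy page standing in for the real markup:

    import re
    import urllib.parse

    def needs_password(webpage):
        return re.search(r'<label[^>]+for="password"', webpage) is not None

    def password_postdata(requesttoken, password):
        return urllib.parse.urlencode(
            {'requesttoken': requesttoken, 'password': password}).encode()

    page = '<form><label for="password">Password</label></form>'
    print(needs_password(page))  # True -> re-POST and re-check the response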
diff --git a/hypervideo_dl/extractor/packtpub.py b/hypervideo_dl/extractor/packtpub.py
index 51778d8..5620330 100644
--- a/hypervideo_dl/extractor/packtpub.py
+++ b/hypervideo_dl/extractor/packtpub.py
@@ -1,10 +1,7 @@
import json
from .common import InfoExtractor
-from ..compat import (
- # compat_str,
- compat_HTTPError,
-)
+from ..networking.exceptions import HTTPError
from ..utils import (
clean_html,
ExtractorError,
@@ -54,8 +51,8 @@ class PacktPubIE(PacktPubBaseIE):
'password': password,
}).encode())['data']['access']
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 401, 404):
- message = self._parse_json(e.cause.read().decode(), None)['message']
+ if isinstance(e.cause, HTTPError) and e.cause.status in (400, 401, 404):
+ message = self._parse_json(e.cause.response.read().decode(), None)['message']
raise ExtractorError(message, expected=True)
raise
@@ -70,7 +67,7 @@ class PacktPubIE(PacktPubBaseIE):
'https://services.packtpub.com/products-v1/products/%s/%s/%s' % (course_id, chapter_id, video_id), video_id,
'Downloading JSON video', headers=headers)['data']
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 400:
self.raise_login_required('This video is locked')
raise
diff --git a/hypervideo_dl/extractor/panopto.py b/hypervideo_dl/extractor/panopto.py
index 32c103b..6e3c9f4 100644
--- a/hypervideo_dl/extractor/panopto.py
+++ b/hypervideo_dl/extractor/panopto.py
@@ -412,7 +412,7 @@ class PanoptoIE(PanoptoBaseIE):
return {
'id': video_id,
'title': delivery.get('SessionName'),
- 'cast': traverse_obj(delivery, ('Contributors', ..., 'DisplayName'), default=[], expected_type=lambda x: x or None),
+ 'cast': traverse_obj(delivery, ('Contributors', ..., 'DisplayName'), expected_type=lambda x: x or None),
'timestamp': session_start_time - 11640000000 if session_start_time else None,
'duration': delivery.get('Duration'),
'thumbnail': base_url + f'/Services/FrameGrabber.svc/FrameRedirect?objectId={video_id}&mode=Delivery&random={random()}',
@@ -563,7 +563,7 @@ class PanoptoListIE(PanoptoBaseIE):
base_url, '/Services/Data.svc/GetFolderInfo', folder_id,
data={'folderID': folder_id}, fatal=False)
return {
- 'title': get_first(response, 'Name', default=[])
+ 'title': get_first(response, 'Name')
}
def _real_extract(self, url):
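[Note] Dropping default=[] here is safe because branching traverse_obj paths (those containing ...) already return an empty list when nothing matches, while the removed default=[] on get_first would have produced [] instead of None for a missing title. A toy check; the delivery dict is invented sample data:

    from hypervideo_dl.utils import traverse_obj

    delivery = {'Contributors': [{'DisplayName': 'A'}, {'DisplayName': ''}]}
    print(traverse_obj(delivery, ('Contributors', ..., 'DisplayName'),
                       expected_type=lambda x: x or None))  # -> ['A']
    print(traverse_obj({}, ('Contributors', ..., 'DisplayName')))  # -> []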
diff --git a/hypervideo_dl/extractor/parler.py b/hypervideo_dl/extractor/parler.py
index 68a60bc..2af805e 100644
--- a/hypervideo_dl/extractor/parler.py
+++ b/hypervideo_dl/extractor/parler.py
@@ -1,13 +1,14 @@
+import functools
+
from .common import InfoExtractor
from .youtube import YoutubeIE
from ..utils import (
clean_html,
- format_field,
int_or_none,
strip_or_none,
traverse_obj,
unified_timestamp,
- urlencode_postdata,
+ urljoin,
)
@@ -24,7 +25,7 @@ class ParlerIE(InfoExtractor):
'thumbnail': 'https://bl-images.parler.com/videos/6ce7cdf3-a27a-4d72-bf9c-d3e17ce39a66/thumbnail.jpeg',
'title': 'Parler video #df79fdba-07cc-48fe-b085-3293897520d7',
'description': 'md5:6f220bde2df4a97cbb89ac11f1fd8197',
- 'timestamp': 1659744000,
+ 'timestamp': 1659785481,
'upload_date': '20220806',
'uploader': 'Tulsi Gabbard',
'uploader_id': 'TulsiGabbard',
@@ -35,77 +36,56 @@ class ParlerIE(InfoExtractor):
},
},
{
- 'url': 'https://parler.com/feed/a7406eb4-91e5-4793-b5e3-ade57a24e287',
- 'md5': '11687e2f5bb353682cee338d181422ed',
- 'info_dict': {
- 'id': 'a7406eb4-91e5-4793-b5e3-ade57a24e287',
- 'ext': 'mp4',
- 'thumbnail': 'https://bl-images.parler.com/videos/317827a8-1e48-4cbc-981f-7dd17d4c1183/thumbnail.jpeg',
- 'title': 'Parler video #a7406eb4-91e5-4793-b5e3-ade57a24e287',
- 'description': 'This man should run for office',
- 'timestamp': 1659657600,
- 'upload_date': '20220805',
- 'uploader': 'Benny Johnson',
- 'uploader_id': 'BennyJohnson',
- 'uploader_url': 'https://parler.com/BennyJohnson',
- 'view_count': int,
- 'comment_count': int,
- 'repost_count': int,
- },
- },
- {
'url': 'https://parler.com/feed/f23b85c1-6558-470f-b9ff-02c145f28da5',
'md5': 'eaba1ff4a10fe281f5ce74e930ab2cb4',
'info_dict': {
'id': 'r5vkSaz8PxQ',
'ext': 'mp4',
- 'thumbnail': 'https://i.ytimg.com/vi_webp/r5vkSaz8PxQ/maxresdefault.webp',
- 'title': 'Tom MacDonald Names Reaction',
- 'description': 'md5:33c21f0d35ae6dc2edf3007d6696baea',
- 'upload_date': '20220716',
- 'duration': 1267,
- 'uploader': 'Mahesh Chookolingo',
- 'uploader_id': 'maheshchookolingo',
- 'uploader_url': 'http://www.youtube.com/user/maheshchookolingo',
- 'channel': 'Mahesh Chookolingo',
- 'channel_id': 'UCox6YeMSY1PQInbCtTaZj_w',
- 'channel_url': 'https://www.youtube.com/channel/UCox6YeMSY1PQInbCtTaZj_w',
- 'categories': ['Entertainment'],
- 'tags': list,
- 'availability': 'public',
'live_status': 'not_live',
- 'view_count': int,
'comment_count': int,
+ 'duration': 1267,
'like_count': int,
'channel_follower_count': int,
- 'age_limit': 0,
+ 'channel_id': 'UCox6YeMSY1PQInbCtTaZj_w',
+ 'upload_date': '20220716',
+ 'thumbnail': 'https://i.ytimg.com/vi/r5vkSaz8PxQ/maxresdefault.jpg',
+ 'tags': 'count:17',
+ 'availability': 'public',
+ 'categories': ['Entertainment'],
'playable_in_embed': True,
+ 'channel': 'Who Knows What! With Mahesh & Friends',
+ 'title': 'Tom MacDonald Names Reaction',
+ 'uploader': 'Who Knows What! With Mahesh & Friends',
+ 'uploader_id': '@maheshchookolingo',
+ 'age_limit': 0,
+ 'description': 'md5:33c21f0d35ae6dc2edf3007d6696baea',
+ 'channel_url': 'https://www.youtube.com/channel/UCox6YeMSY1PQInbCtTaZj_w',
+ 'view_count': int,
+ 'uploader_url': 'http://www.youtube.com/@maheshchookolingo',
},
},
]
def _real_extract(self, url):
video_id = self._match_id(url)
- data = self._download_json(
- 'https://parler.com/open-api/ParleyDetailEndpoint.php', video_id,
- data=urlencode_postdata({'uuid': video_id}))['data'][0]
- primary = data['primary']
-
- embed = self._parse_json(primary.get('V2LINKLONG') or '', video_id, fatal=False)
- if embed:
- return self.url_result(embed[0], YoutubeIE)
+ data = self._download_json(f'https://api.parler.com/v0/public/parleys/{video_id}',
+ video_id)['data']
+ if data.get('link'):
+ return self.url_result(data['link'], YoutubeIE)
return {
'id': video_id,
- 'url': traverse_obj(primary, ('video_data', 'videoSrc')),
- 'thumbnail': traverse_obj(primary, ('video_data', 'thumbnailUrl')),
- 'title': '',
- 'description': strip_or_none(clean_html(primary.get('full_body'))) or None,
- 'timestamp': unified_timestamp(primary.get('date_created')),
- 'uploader': strip_or_none(primary.get('name')),
- 'uploader_id': strip_or_none(primary.get('username')),
- 'uploader_url': format_field(strip_or_none(primary.get('username')), None, 'https://parler.com/%s'),
- 'view_count': int_or_none(primary.get('view_count')),
- 'comment_count': int_or_none(traverse_obj(data, ('engagement', 'commentCount'))),
- 'repost_count': int_or_none(traverse_obj(data, ('engagement', 'echoCount'))),
+ 'title': strip_or_none(data.get('title')) or '',
+ **traverse_obj(data, {
+ 'url': ('video', 'videoSrc'),
+ 'thumbnail': ('video', 'thumbnailUrl'),
+ 'description': ('body', {clean_html}),
+ 'timestamp': ('date_created', {unified_timestamp}),
+ 'uploader': ('user', 'name', {strip_or_none}),
+ 'uploader_id': ('user', 'username', {str}),
+ 'uploader_url': ('user', 'username', {functools.partial(urljoin, 'https://parler.com/')}),
+ 'view_count': ('views', {int_or_none}),
+ 'comment_count': ('total_comments', {int_or_none}),
+ 'repost_count': ('echos', {int_or_none}),
+ })
}
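[Note] The Parler rewrite swaps per-field plumbing for one declarative traverse_obj mapping: one-element sets like {clean_html} apply a transform to the traversed value, and functools.partial(urljoin, ...) turns the username into a profile URL. A toy run; the data dict mimics the assumed API shape and is not captured output:

    import functools
    from hypervideo_dl.utils import clean_html, traverse_obj, unified_timestamp, urljoin

    data = {'body': '<p>hello</p>',
            'date_created': '2022-08-06T12:00:00Z',
            'user': {'username': 'TulsiGabbard'}}
    print(traverse_obj(data, {
        'description': ('body', {clean_html}),
        'timestamp': ('date_created', {unified_timestamp}),
        'uploader_url': ('user', 'username',
                         {functools.partial(urljoin, 'https://parler.com/')}),
    }))
    # -> {'description': 'hello', 'timestamp': 1659787200,
    #     'uploader_url': 'https://parler.com/TulsiGabbard'}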
diff --git a/hypervideo_dl/extractor/patreon.py b/hypervideo_dl/extractor/patreon.py
index 4dc0298..79b041d 100644
--- a/hypervideo_dl/extractor/patreon.py
+++ b/hypervideo_dl/extractor/patreon.py
@@ -1,22 +1,22 @@
import itertools
-from urllib.error import HTTPError
from .common import InfoExtractor
from .vimeo import VimeoIE
-
from ..compat import compat_urllib_parse_unquote
+from ..networking.exceptions import HTTPError
from ..utils import (
+ KNOWN_EXTENSIONS,
+ ExtractorError,
clean_html,
determine_ext,
- ExtractorError,
int_or_none,
- KNOWN_EXTENSIONS,
mimetype2ext,
parse_iso8601,
str_or_none,
traverse_obj,
try_get,
url_or_none,
+ urljoin,
)
@@ -37,9 +37,9 @@ class PatreonBaseIE(InfoExtractor):
item_id, note='Downloading API JSON' if not note else note,
query=query, fatal=fatal, headers=headers)
except ExtractorError as e:
- if not isinstance(e.cause, HTTPError) or mimetype2ext(e.cause.headers.get('Content-Type')) != 'json':
+ if not isinstance(e.cause, HTTPError) or mimetype2ext(e.cause.response.headers.get('Content-Type')) != 'json':
raise
- err_json = self._parse_json(self._webpage_read_content(e.cause, None, item_id), item_id, fatal=False)
+ err_json = self._parse_json(self._webpage_read_content(e.cause.response, None, item_id), item_id, fatal=False)
err_message = traverse_obj(err_json, ('errors', ..., 'detail'), get_all=False)
if err_message:
raise ExtractorError(f'Patreon said: {err_message}', expected=True)
@@ -310,7 +310,7 @@ class PatreonIE(PatreonBaseIE):
f'posts/{post_id}/comments', post_id, query=params, note='Downloading comments page %d' % page)
cursor = None
- for comment in traverse_obj(response, (('data', ('included', lambda _, v: v['type'] == 'comment')), ...), default=[]):
+ for comment in traverse_obj(response, (('data', ('included', lambda _, v: v['type'] == 'comment')), ...)):
count += 1
comment_id = comment.get('id')
attributes = comment.get('attributes') or {}
@@ -404,8 +404,8 @@ class PatreonCampaignIE(PatreonBaseIE):
posts_json = self._call_api('posts', campaign_id, query=params, note='Downloading posts page %d' % page)
cursor = traverse_obj(posts_json, ('meta', 'pagination', 'cursors', 'next'))
- for post in posts_json.get('data') or []:
- yield self.url_result(url_or_none(traverse_obj(post, ('attributes', 'patreon_url'))), 'Patreon')
+ for post_url in traverse_obj(posts_json, ('data', ..., 'attributes', 'patreon_url')):
+ yield self.url_result(urljoin('https://www.patreon.com/', post_url), PatreonIE)
if cursor is None:
break
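[Note] The Patreon loop now traverses straight to the patreon_url attributes and absolutizes them with urljoin, which also quietly skips posts missing the attribute. With invented data:

    from hypervideo_dl.utils import traverse_obj, urljoin

    posts_json = {'data': [
        {'attributes': {'patreon_url': '/posts/example-123'}},
        {'attributes': {}},  # no patreon_url -> dropped by the branch path
    ]}
    for post_url in traverse_obj(posts_json, ('data', ..., 'attributes', 'patreon_url')):
        print(urljoin('https://www.patreon.com/', post_url))
    # -> https://www.patreon.com/posts/example-123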
diff --git a/hypervideo_dl/extractor/pbs.py b/hypervideo_dl/extractor/pbs.py
index 5bdf561..2bb2ea9 100644
--- a/hypervideo_dl/extractor/pbs.py
+++ b/hypervideo_dl/extractor/pbs.py
@@ -11,6 +11,7 @@ from ..utils import (
orderedSet,
strip_jsonp,
strip_or_none,
+ traverse_obj,
unified_strdate,
url_or_none,
US_RATINGS,
@@ -696,3 +697,61 @@ class PBSIE(InfoExtractor):
'subtitles': subtitles,
'chapters': chapters,
}
+
+
+class PBSKidsIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?pbskids\.org/video/[\w-]+/(?P<id>\d+)'
+ _TESTS = [
+ {
+ 'url': 'https://pbskids.org/video/molly-of-denali/3030407927',
+ 'md5': '1ded20a017cc6b53446238f1804ce4c7',
+ 'info_dict': {
+ 'id': '3030407927',
+ 'title': 'Bird in the Hand/Bye-Bye Birdie',
+ 'channel': 'molly-of-denali',
+ 'duration': 1540,
+ 'ext': 'mp4',
+ 'series': 'Molly of Denali',
+ 'description': 'md5:d006b2211633685d8ebc8d03b6d5611e',
+ 'categories': ['Episode'],
+ 'upload_date': '20190718',
+ }
+ },
+ {
+ 'url': 'https://pbskids.org/video/plum-landing/2365205059',
+ 'md5': '92e5d189851a64ae1d0237a965be71f5',
+ 'info_dict': {
+ 'id': '2365205059',
+ 'title': 'Cooper\'s Favorite Place in Nature',
+ 'channel': 'plum-landing',
+ 'duration': 67,
+ 'ext': 'mp4',
+ 'series': 'Plum Landing',
+ 'description': 'md5:657e5fc4356a84ead1c061eb280ff05d',
+ 'categories': ['Episode'],
+ 'upload_date': '20140302',
+ }
+ }
+ ]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ meta = self._search_json(r'window\._PBS_KIDS_DEEPLINK\s*=', webpage, 'video info', video_id)
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(
+ traverse_obj(meta, ('video_obj', 'URI', {url_or_none})), video_id, ext='mp4')
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ **traverse_obj(meta, {
+ 'categories': ('video_obj', 'video_type', {str}, {lambda x: [x] if x else None}),
+ 'channel': ('show_slug', {str}),
+ 'description': ('video_obj', 'description', {str}),
+ 'duration': ('video_obj', 'duration', {int_or_none}),
+ 'series': ('video_obj', 'program_title', {str}),
+ 'title': ('video_obj', 'title', {str}),
+ 'upload_date': ('video_obj', 'air_date', {unified_strdate}),
+ })
+ }
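[Note] PBSKidsIE finds its metadata in a JSON blob assigned to window._PBS_KIDS_DEEPLINK and lifts fields out with a traverse_obj mapping. A crude stdlib stand-in for the _search_json scrape (the real helper is more robust than this regex; the embedded JSON is fabricated):

    import json
    import re

    webpage = ('window._PBS_KIDS_DEEPLINK = {"show_slug": "molly-of-denali", '
               '"video_obj": {"title": "Bird in the Hand", "duration": 1540}};')
    blob = re.search(r'window\._PBS_KIDS_DEEPLINK\s*=\s*({.+?})\s*;', webpage).group(1)
    meta = json.loads(blob)
    print(meta['show_slug'], meta['video_obj']['duration'])  # molly-of-denali 1540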
diff --git a/hypervideo_dl/extractor/peekvids.py b/hypervideo_dl/extractor/peekvids.py
index 2d9b9a7..d1fc058 100644
--- a/hypervideo_dl/extractor/peekvids.py
+++ b/hypervideo_dl/extractor/peekvids.py
@@ -1,71 +1,128 @@
+import re
+
from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ get_element_by_class,
+ int_or_none,
+ merge_dicts,
+ url_or_none,
+)
+
+
+class PeekVidsBaseIE(InfoExtractor):
+ def _real_extract(self, url):
+ domain, video_id = self._match_valid_url(url).group('domain', 'id')
+ webpage = self._download_webpage(url, video_id, expected_status=429)
+ if '>Rate Limit Exceeded' in webpage:
+ raise ExtractorError(
+ f'You are suspected as a bot. Wait, or pass the captcha on the site and provide cookies. {self._login_hint()}',
+ video_id=video_id, expected=True)
+
+ title = self._html_search_regex(r'(?s)<h1\b[^>]*>(.+?)</h1>', webpage, 'title')
+
+ display_id = video_id
+ video_id = self._search_regex(r'(?s)<video\b[^>]+\bdata-id\s*=\s*["\']?([\w-]+)', webpage, 'short video ID')
+ srcs = self._download_json(
+ f'https://www.{domain}/v-alt/{video_id}', video_id,
+ note='Downloading list of source files')
+
+ formats = []
+ for k, v in srcs.items():
+ f_url = url_or_none(v)
+ if not f_url:
+ continue
+
+ height = self._search_regex(r'^data-src(\d{3,})$', k, 'height', default=None)
+ if not height:
+ continue
+
+ formats.append({
+ 'url': f_url,
+ 'format_id': height,
+ 'height': int_or_none(height),
+ })
+
+ if not formats:
+ formats = [{'url': url} for url in srcs.values()]
+ info = self._search_json_ld(webpage, video_id, expected_type='VideoObject', default={})
+ info.pop('url', None)
-class PeekVidsIE(InfoExtractor):
+ # may not have found the thumbnail if it was in a list in the ld+json
+ info.setdefault('thumbnail', self._og_search_thumbnail(webpage))
+ detail = (get_element_by_class('detail-video-block', webpage)
+ or get_element_by_class('detail-block', webpage) or '')
+ info['description'] = self._html_search_regex(
+ rf'(?s)(.+?)(?:{re.escape(info.get("description", ""))}\s*<|<ul\b)',
+ detail, 'description', default=None) or None
+ info['title'] = re.sub(r'\s*[,-][^,-]+$', '', info.get('title') or title) or self._generic_title(url)
+
+ def cat_tags(name, html):
+ l = self._html_search_regex(
+ rf'(?s)<span\b[^>]*>\s*{re.escape(name)}\s*:\s*</span>(.+?)</li>',
+ html, name, default='')
+ return list(filter(None, re.split(r'\s+', l)))
+
+ return merge_dicts({
+ 'id': video_id,
+ 'display_id': display_id,
+ 'age_limit': 18,
+ 'formats': formats,
+ 'categories': cat_tags('Categories', detail),
+ 'tags': cat_tags('Tags', detail),
+ 'uploader': self._html_search_regex(r'[Uu]ploaded\s+by\s(.+?)"', webpage, 'uploader', default=None),
+ }, info)
+
+
+class PeekVidsIE(PeekVidsBaseIE):
_VALID_URL = r'''(?x)
- https?://(?:www\.)?peekvids\.com/
+ https?://(?:www\.)?(?P<domain>peekvids\.com)/
(?:(?:[^/?#]+/){2}|embed/?\?(?:[^#]*&)?v=)
(?P<id>[^/?&#]*)
'''
_TESTS = [{
'url': 'https://peekvids.com/pc/dane-jones-cute-redhead-with-perfect-tits-with-mini-vamp/BSyLMbN0YCd',
- 'md5': 'a00940646c428e232407e3e62f0e8ef5',
+ 'md5': '2ff6a357a9717dc9dc9894b51307e9a2',
'info_dict': {
- 'id': 'BSyLMbN0YCd',
- 'title': ' Dane Jones - Cute redhead with perfect tits with Mini Vamp, SEXYhub',
+ 'id': '1262717',
+ 'display_id': 'BSyLMbN0YCd',
+ 'title': ' Dane Jones - Cute redhead with perfect tits with Mini Vamp',
'ext': 'mp4',
'thumbnail': r're:^https?://.*\.jpg$',
- 'description': 'Watch Dane Jones - Cute redhead with perfect tits with Mini Vamp (7 min), uploaded by SEXYhub.com',
+ 'description': 'md5:0a61df3620de26c0af8963b1a730cd69',
'timestamp': 1642579329,
'upload_date': '20220119',
'duration': 416,
'view_count': int,
'age_limit': 18,
+ 'uploader': 'SEXYhub.com',
+ 'categories': list,
+ 'tags': list,
},
}]
- _DOMAIN = 'www.peekvids.com'
- def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
-
- short_video_id = self._html_search_regex(r'<video [^>]*data-id="(.+?)"', webpage, 'short video ID')
- srcs = self._download_json(
- f'https://{self._DOMAIN}/v-alt/{short_video_id}', video_id,
- note='Downloading list of source files')
- formats = [{
- 'url': url,
- 'ext': 'mp4',
- 'format_id': name[8:],
- } for name, url in srcs.items() if len(name) > 8 and name.startswith('data-src')]
- if not formats:
- formats = [{'url': url} for url in srcs.values()]
- info = self._search_json_ld(webpage, video_id, expected_type='VideoObject')
- info.update({
- 'id': video_id,
- 'age_limit': 18,
- 'formats': formats,
- })
- return info
-
-
-class PlayVidsIE(PeekVidsIE): # XXX: Do not subclass from concrete IE
- _VALID_URL = r'https?://(?:www\.)?playvids\.com/(?:embed/|[^/]{2}/)?(?P<id>[^/?#]*)'
+class PlayVidsIE(PeekVidsBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?(?P<domain>playvids\.com)/(?:embed/|\w\w?/)?(?P<id>[^/?#]*)'
_TESTS = [{
'url': 'https://www.playvids.com/U3pBrYhsjXM/pc/dane-jones-cute-redhead-with-perfect-tits-with-mini-vamp',
- 'md5': 'cd7dfd8a2e815a45402369c76e3c1825',
+ 'md5': '2f12e50213dd65f142175da633c4564c',
'info_dict': {
- 'id': 'U3pBrYhsjXM',
- 'title': ' Dane Jones - Cute redhead with perfect tits with Mini Vamp, SEXYhub',
+ 'id': '1978030',
+ 'display_id': 'U3pBrYhsjXM',
+ 'title': ' Dane Jones - Cute redhead with perfect tits with Mini Vamp',
'ext': 'mp4',
'thumbnail': r're:^https?://.*\.jpg$',
- 'description': 'Watch Dane Jones - Cute redhead with perfect tits with Mini Vamp video in HD, uploaded by SEXYhub.com',
+ 'description': 'md5:0a61df3620de26c0af8963b1a730cd69',
'timestamp': 1640435839,
'upload_date': '20211225',
'duration': 416,
'view_count': int,
'age_limit': 18,
+ 'uploader': 'SEXYhub.com',
+ 'categories': list,
+ 'tags': list,
},
}, {
'url': 'https://www.playvids.com/es/U3pBrYhsjXM/pc/dane-jones-cute-redhead-with-perfect-tits-with-mini-vamp',
@@ -73,5 +130,62 @@ class PlayVidsIE(PeekVidsIE): # XXX: Do not subclass from concrete IE
}, {
'url': 'https://www.playvids.com/embed/U3pBrYhsjXM',
'only_matching': True,
+ }, {
+ 'url': 'https://www.playvids.com/bKmGLe3IwjZ/sv/brazzers-800-phone-sex-madison-ivy-always-on-the-line',
+ 'md5': 'e783986e596cafbf46411a174ab42ba6',
+ 'info_dict': {
+ 'id': '762385',
+ 'display_id': 'bKmGLe3IwjZ',
+ 'ext': 'mp4',
+ 'title': 'Brazzers - 1 800 Phone Sex: Madison Ivy Always On The Line 6',
+ 'description': 'md5:bdcd2db2b8ad85831a491d7c8605dcef',
+ 'timestamp': 1516958544,
+ 'upload_date': '20180126',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 480,
+ 'uploader': 'Brazzers',
+ 'age_limit': 18,
+ 'view_count': int,
+ 'categories': list,
+ 'tags': list,
+ },
+ }, {
+ 'url': 'https://www.playvids.com/v/47iUho33toY',
+ 'md5': 'b056b5049d34b648c1e86497cf4febce',
+ 'info_dict': {
+ 'id': '700621',
+ 'display_id': '47iUho33toY',
+ 'ext': 'mp4',
+ 'title': 'KATEE OWEN STRIPTIASE IN SEXY RED LINGERIE',
+ 'description': None,
+ 'timestamp': 1507052209,
+ 'upload_date': '20171003',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 332,
+ 'uploader': 'Cacerenele',
+ 'age_limit': 18,
+ 'view_count': int,
+ 'categories': list,
+ 'tags': list,
+ },
+ }, {
+ 'url': 'https://www.playvids.com/z3_7iwWCmqt/sexy-teen-filipina-striptease-beautiful-pinay-bargirl-strips-and-dances',
+ 'md5': 'efa09be9f031314b7b7e3bc6510cd0df',
+ 'info_dict': {
+ 'id': '1523518',
+ 'display_id': 'z3_7iwWCmqt',
+ 'ext': 'mp4',
+ 'title': 'SEXY TEEN FILIPINA STRIPTEASE - Beautiful Pinay Bargirl Strips and Dances',
+ 'description': None,
+ 'timestamp': 1607470323,
+ 'upload_date': '20201208',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 593,
+ 'uploader': 'yorours',
+ 'age_limit': 18,
+ 'view_count': int,
+ 'categories': list,
+ 'tags': list,
+ },
}]
- _DOMAIN = 'www.playvids.com'
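[Note] After the refactor both sites share PeekVidsBaseIE; the per-domain constant is replaced by a domain group in _VALID_URL, and format heights are recovered from the data-srcNNN keys of the source map. The height parsing, re-run standalone with a fabricated /v-alt response:

    import re

    srcs = {'data-src480': 'https://cdn.test/480.mp4',
            'data-src720': 'https://cdn.test/720.mp4',
            'poster': 'https://cdn.test/thumb.jpg'}
    formats = [{'url': v, 'format_id': m.group(1), 'height': int(m.group(1))}
               for k, v in srcs.items()
               if (m := re.match(r'^data-src(\d{3,})$', k))]
    print([f['height'] for f in formats])  # -> [480, 720]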
diff --git a/hypervideo_dl/extractor/peloton.py b/hypervideo_dl/extractor/peloton.py
index 4835822..7864299 100644
--- a/hypervideo_dl/extractor/peloton.py
+++ b/hypervideo_dl/extractor/peloton.py
@@ -3,7 +3,7 @@ import re
import urllib.parse
from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
float_or_none,
@@ -83,8 +83,8 @@ class PelotonIE(InfoExtractor):
}).encode(),
headers={'Content-Type': 'application/json', 'User-Agent': 'web'})
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
- json_string = self._webpage_read_content(e.cause, None, video_id)
+ if isinstance(e.cause, HTTPError) and e.cause.status == 401:
+ json_string = self._webpage_read_content(e.cause.response, None, video_id)
res = self._parse_json(json_string, video_id)
raise ExtractorError(res['message'], expected=res['message'] == 'Login failed')
else:
@@ -96,8 +96,8 @@ class PelotonIE(InfoExtractor):
'https://api.onepeloton.com/api/subscription/stream', video_id, note='Downloading token',
data=json.dumps({}).encode(), headers={'Content-Type': 'application/json'})
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
- json_string = self._webpage_read_content(e.cause, None, video_id)
+ if isinstance(e.cause, HTTPError) and e.cause.status == 403:
+ json_string = self._webpage_read_content(e.cause.response, None, video_id)
res = self._parse_json(json_string, video_id)
raise ExtractorError(res['message'], expected=res['message'] == 'Stream limit reached')
else:
@@ -109,7 +109,7 @@ class PelotonIE(InfoExtractor):
try:
self._start_session(video_id)
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 401:
self._login(video_id)
self._start_session(video_id)
else:
diff --git a/hypervideo_dl/extractor/pgatour.py b/hypervideo_dl/extractor/pgatour.py
new file mode 100644
index 0000000..36c2c62
--- /dev/null
+++ b/hypervideo_dl/extractor/pgatour.py
@@ -0,0 +1,47 @@
+from .brightcove import BrightcoveNewIE
+from .common import InfoExtractor
+
+
+class PGATourIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?pgatour\.com/video/[\w-]+/(?P<tc>T)?(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://www.pgatour.com/video/competition/T6322447785112/adam-hadwin-2023-the-players-round-4-18th-hole-shot-1',
+ 'info_dict': {
+ 'id': '6322447785112',
+ 'ext': 'mp4',
+ 'title': 'Adam Hadwin | 2023 THE PLAYERS | Round 4 | 18th hole | Shot 1',
+ 'uploader_id': '6116716431001',
+ 'upload_date': '20230312',
+ 'timestamp': 1678653136,
+ 'duration': 20.011,
+ 'thumbnail': r're:^https://.+\.jpg',
+ 'tags': 'count:7',
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }, {
+ 'url': 'https://www.pgatour.com/video/features/6322506425112/follow-the-players-trophy-on-championship-sunday',
+ 'info_dict': {
+ 'id': '6322506425112',
+ 'ext': 'mp4',
+ 'title': 'Follow THE PLAYERS trophy on Championship Sunday',
+ 'description': 'md5:4d29e4bdfa03694a0ebfd08950398568',
+ 'uploader_id': '6082840763001',
+ 'upload_date': '20230313',
+ 'timestamp': 1678739835,
+ 'duration': 123.435,
+ 'thumbnail': r're:^https://.+\.jpg',
+ 'tags': 'count:8',
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }]
+
+ def _real_extract(self, url):
+ video_id, is_tourcast = self._match_valid_url(url).group('id', 'tc')
+
+ # From https://www.pgatour.com/_next/static/chunks/pages/_app-8bcf849560daf38d.js
+ account_id = '6116716431001' if is_tourcast else '6082840763001'
+ player_id = 'Vsd5Umu8r' if is_tourcast else 'FWIBYMBPj'
+
+ return self.url_result(
+ f'https://players.brightcove.net/{account_id}/{player_id}_default/index.html?videoId={video_id}',
+ BrightcoveNewIE)
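[Note] The whole PGA Tour extractor is a URL rewrite into a Brightcove player URL, with the account/player pair picked by whether the Tourcast 'T' prefix matched. Reproduced standalone; the IDs are the ones quoted in the code above, sourced from the site's JS bundle:

    import re

    def brightcove_url(url):
        m = re.match(
            r'https?://(?:www\.)?pgatour\.com/video/[\w-]+/(?P<tc>T)?(?P<id>\d+)', url)
        account_id = '6116716431001' if m.group('tc') else '6082840763001'
        player_id = 'Vsd5Umu8r' if m.group('tc') else 'FWIBYMBPj'
        return (f'https://players.brightcove.net/{account_id}/{player_id}'
                f'_default/index.html?videoId={m.group("id")}')

    print(brightcove_url(
        'https://www.pgatour.com/video/competition/T6322447785112/adam-hadwin'))
    # -> https://players.brightcove.net/6116716431001/Vsd5Umu8r_default/index.html?videoId=6322447785112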
diff --git a/hypervideo_dl/extractor/piapro.py b/hypervideo_dl/extractor/piapro.py
index d8d9c78..5f39e06 100644
--- a/hypervideo_dl/extractor/piapro.py
+++ b/hypervideo_dl/extractor/piapro.py
@@ -12,17 +12,22 @@ from ..utils import (
class PiaproIE(InfoExtractor):
_NETRC_MACHINE = 'piapro'
- _VALID_URL = r'https?://piapro\.jp/t/(?P<id>\w+)/?'
+ _VALID_URL = r'https?://piapro\.jp/(?:t|content)/(?P<id>\w+)/?'
_TESTS = [{
'url': 'https://piapro.jp/t/NXYR',
- 'md5': 'a9d52f27d13bafab7ee34116a7dcfa77',
+ 'md5': 'f7c0f760913fb1d44a1c45a4af793909',
'info_dict': {
'id': 'NXYR',
'ext': 'mp3',
'uploader': 'wowaka',
'uploader_id': 'wowaka',
'title': '裏表ラバーズ',
- 'thumbnail': r're:^https?://.*\.jpg$',
+ 'description': 'http://www.nicovideo.jp/watch/sm8082467',
+ 'duration': 189.0,
+ 'timestamp': 1251785475,
+ 'thumbnail': r're:^https?://.*\.(?:png|jpg)$',
+ 'upload_date': '20090901',
+ 'view_count': int,
}
}, {
'note': 'There are break lines in description, mandating (?s) flag',
@@ -34,8 +39,16 @@ class PiaproIE(InfoExtractor):
'title': '青に溶けた風船 / 初音ミク',
'description': 'md5:d395a9bd151447631a5a1460bc7f9132',
'uploader': 'シアン・キノ',
+ 'duration': 229.0,
+ 'timestamp': 1644030039,
+ 'upload_date': '20220205',
+ 'view_count': int,
+ 'thumbnail': r're:^https?://.*\.(?:png|jpg)$',
'uploader_id': 'cyankino',
}
+ }, {
+ 'url': 'https://piapro.jp/content/hcw0z3a169wtemz6',
+ 'only_matching': True
}]
_login_status = False
@@ -56,7 +69,7 @@ class PiaproIE(InfoExtractor):
if urlh is False:
login_ok = False
else:
- parts = compat_urlparse.urlparse(urlh.geturl())
+ parts = compat_urlparse.urlparse(urlh.url)
if parts.path != '/':
login_ok = False
if not login_ok:
diff --git a/hypervideo_dl/extractor/picarto.py b/hypervideo_dl/extractor/picarto.py
index 36a062d..d415ba2 100644
--- a/hypervideo_dl/extractor/picarto.py
+++ b/hypervideo_dl/extractor/picarto.py
@@ -1,7 +1,10 @@
+import urllib.parse
+
from .common import InfoExtractor
from ..utils import (
ExtractorError,
- js_to_json,
+ str_or_none,
+ traverse_obj,
)
@@ -84,7 +87,7 @@ class PicartoIE(InfoExtractor):
class PicartoVodIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www.)?picarto\.tv/videopopout/(?P<id>[^/?#&]+)'
+ _VALID_URL = r'https?://(?:www\.)?picarto\.tv/(?:videopopout|\w+/videos)/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://picarto.tv/videopopout/ArtofZod_2017.12.12.00.13.23.flv',
'md5': '3ab45ba4352c52ee841a28fb73f2d9ca',
@@ -94,6 +97,18 @@ class PicartoVodIE(InfoExtractor):
'title': 'ArtofZod_2017.12.12.00.13.23.flv',
'thumbnail': r're:^https?://.*\.jpg'
},
+ 'skip': 'The VOD does not exist',
+ }, {
+ 'url': 'https://picarto.tv/ArtofZod/videos/772650',
+ 'md5': '00067a0889f1f6869cc512e3e79c521b',
+ 'info_dict': {
+ 'id': '772650',
+ 'ext': 'mp4',
+ 'title': 'Art of Zod - Drawing and Painting',
+ 'thumbnail': r're:^https?://.*\.jpg',
+ 'channel': 'ArtofZod',
+ 'age_limit': 18,
+ }
}, {
'url': 'https://picarto.tv/videopopout/Plague',
'only_matching': True,
@@ -102,21 +117,36 @@ class PicartoVodIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
-
- vod_info = self._parse_json(
- self._search_regex(
- r'(?s)#vod-player["\']\s*,\s*(\{.+?\})\s*\)', webpage,
- 'vod player'),
- video_id, transform_source=js_to_json)
+ data = self._download_json(
+ 'https://ptvintern.picarto.tv/ptvapi', video_id, query={
+ 'query': f'''{{
+ video(id: "{video_id}") {{
+ id
+ title
+ adult
+ file_name
+ video_recording_image_url
+ channel {{
+ name
+ }}
+ }}
+}}'''
+ })['data']['video']
+
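+        # the HLS manifest is served from the same host as the recording
+        # thumbnail, so derive the stream host from video_recording_image_url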
+ file_name = data['file_name']
+ netloc = urllib.parse.urlparse(data['video_recording_image_url']).netloc
formats = self._extract_m3u8_formats(
- vod_info['vod'], video_id, 'mp4', entry_protocol='m3u8_native',
- m3u8_id='hls')
+ f'https://{netloc}/stream/hls/{file_name}/index.m3u8', video_id, 'mp4', m3u8_id='hls')
return {
'id': video_id,
- 'title': video_id,
- 'thumbnail': vod_info.get('vodThumb'),
+ **traverse_obj(data, {
+ 'id': ('id', {str_or_none}),
+ 'title': ('title', {str}),
+ 'thumbnail': 'video_recording_image_url',
+ 'channel': ('channel', 'name', {str}),
+ 'age_limit': ('adult', {lambda x: 18 if x else 0}),
+ }),
'formats': formats,
}
diff --git a/hypervideo_dl/extractor/piksel.py b/hypervideo_dl/extractor/piksel.py
index cc60b30..97a9bf5 100644
--- a/hypervideo_dl/extractor/piksel.py
+++ b/hypervideo_dl/extractor/piksel.py
@@ -7,8 +7,10 @@ from ..utils import (
int_or_none,
join_nonempty,
parse_iso8601,
+ traverse_obj,
try_get,
unescapeHTML,
+ urljoin,
)
@@ -63,11 +65,11 @@ class PikselIE(InfoExtractor):
}
]
- def _call_api(self, app_token, resource, display_id, query, fatal=True):
- response = (self._download_json(
- 'http://player.piksel.com/ws/ws_%s/api/%s/mode/json/apiv/5' % (resource, app_token),
- display_id, query=query, fatal=fatal) or {}).get('response')
- failure = try_get(response, lambda x: x['failure']['reason'])
+ def _call_api(self, app_token, resource, display_id, query, host='https://player.piksel.com', fatal=True):
+ url = urljoin(host, f'/ws/ws_{resource}/api/{app_token}/mode/json/apiv/5')
+ response = traverse_obj(
+ self._download_json(url, display_id, query=query, fatal=fatal), ('response', {dict})) or {}
+ failure = traverse_obj(response, ('failure', 'reason')) if response else 'Empty response from API'
if failure:
if fatal:
raise ExtractorError(failure, expected=True)
@@ -83,7 +85,7 @@ class PikselIE(InfoExtractor):
], webpage, 'app token')
query = {'refid': ref_id, 'prefid': display_id} if ref_id else {'v': display_id}
program = self._call_api(
- app_token, 'program', display_id, query)['WsProgramResponse']['program']
+ app_token, 'program', display_id, query, url)['WsProgramResponse']['program']
video_id = program['uuid']
video_data = program['asset']
title = video_data['title']
@@ -129,7 +131,7 @@ class PikselIE(InfoExtractor):
process_asset_files(try_get(self._call_api(
app_token, 'asset_file', display_id, {
'assetid': asset_id,
- }, False), lambda x: x['WsAssetFileResponse']['AssetFiles']))
+ }, url, False), lambda x: x['WsAssetFileResponse']['AssetFiles']))
m3u8_url = dict_get(video_data, [
'm3u8iPadURL',
diff --git a/hypervideo_dl/extractor/pinterest.py b/hypervideo_dl/extractor/pinterest.py
index 2c6cd6d..8361fbb 100644
--- a/hypervideo_dl/extractor/pinterest.py
+++ b/hypervideo_dl/extractor/pinterest.py
@@ -1,19 +1,24 @@
import json
from .common import InfoExtractor
-from ..compat import compat_str
from ..utils import (
determine_ext,
float_or_none,
int_or_none,
- try_get,
+ str_or_none,
+ strip_or_none,
+ traverse_obj,
unified_timestamp,
url_or_none,
)
class PinterestBaseIE(InfoExtractor):
- _VALID_URL_BASE = r'https?://(?:[^/]+\.)?pinterest\.(?:com|fr|de|ch|jp|cl|ca|it|co\.uk|nz|ru|com\.au|at|pt|co\.kr|es|com\.mx|dk|ph|th|com\.uy|co|nl|info|kr|ie|vn|com\.vn|ec|mx|in|pe|co\.at|hu|co\.in|co\.nz|id|com\.ec|com\.py|tw|be|uk|com\.bo|com\.pe)'
+ _VALID_URL_BASE = r'''(?x)
+ https?://(?:[^/]+\.)?pinterest\.(?:
+ com|fr|de|ch|jp|cl|ca|it|co\.uk|nz|ru|com\.au|at|pt|co\.kr|es|com\.mx|
+ dk|ph|th|com\.uy|co|nl|info|kr|ie|vn|com\.vn|ec|mx|in|pe|co\.at|hu|
+ co\.in|co\.nz|id|com\.ec|com\.py|tw|be|uk|com\.bo|com\.pe)'''
def _call_api(self, resource, video_id, options):
return self._download_json(
@@ -24,14 +29,53 @@ class PinterestBaseIE(InfoExtractor):
def _extract_video(self, data, extract_formats=True):
video_id = data['id']
+ thumbnails = []
+ images = data.get('images')
+ if isinstance(images, dict):
+ for thumbnail_id, thumbnail in images.items():
+ if not isinstance(thumbnail, dict):
+ continue
+ thumbnail_url = url_or_none(thumbnail.get('url'))
+ if not thumbnail_url:
+ continue
+ thumbnails.append({
+ 'url': thumbnail_url,
+ 'width': int_or_none(thumbnail.get('width')),
+ 'height': int_or_none(thumbnail.get('height')),
+ })
- title = (data.get('title') or data.get('grid_title') or video_id).strip()
+ info = {
+ 'title': strip_or_none(traverse_obj(data, 'title', 'grid_title', default='')),
+ 'description': traverse_obj(data, 'seo_description', 'description'),
+ 'timestamp': unified_timestamp(data.get('created_at')),
+ 'thumbnails': thumbnails,
+ 'uploader': traverse_obj(data, ('closeup_attribution', 'full_name')),
+ 'uploader_id': str_or_none(traverse_obj(data, ('closeup_attribution', 'id'))),
+ 'repost_count': int_or_none(data.get('repin_count')),
+ 'comment_count': int_or_none(data.get('comment_count')),
+ 'categories': traverse_obj(data, ('pin_join', 'visual_annotation'), expected_type=list),
+ 'tags': traverse_obj(data, 'hashtags', expected_type=list),
+ }
urls = []
formats = []
duration = None
- if extract_formats:
- for format_id, format_dict in data['videos']['video_list'].items():
+ domain = data.get('domain', '')
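+        # pins that re-host third-party content expose an embed src; hand those off as url_transparent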
+ if domain.lower() != 'uploaded by user' and traverse_obj(data, ('embed', 'src')):
+ if not info['title']:
+ info['title'] = None
+ return {
+ '_type': 'url_transparent',
+ 'url': data['embed']['src'],
+ **info,
+ }
+
+ elif extract_formats:
+ video_list = traverse_obj(
+ data, ('videos', 'video_list'),
+ ('story_pin_data', 'pages', ..., 'blocks', ..., 'video', 'video_list'),
+ expected_type=dict, get_all=False, default={})
+ for format_id, format_dict in video_list.items():
if not isinstance(format_dict, dict):
continue
format_url = url_or_none(format_dict.get('url'))
@@ -53,72 +97,79 @@ class PinterestBaseIE(InfoExtractor):
'duration': duration,
})
- description = data.get('description') or data.get('description_html') or data.get('seo_description')
- timestamp = unified_timestamp(data.get('created_at'))
-
- def _u(field):
- return try_get(data, lambda x: x['closeup_attribution'][field], compat_str)
-
- uploader = _u('full_name')
- uploader_id = _u('id')
-
- repost_count = int_or_none(data.get('repin_count'))
- comment_count = int_or_none(data.get('comment_count'))
- categories = try_get(data, lambda x: x['pin_join']['visual_annotation'], list)
- tags = data.get('hashtags')
-
- thumbnails = []
- images = data.get('images')
- if isinstance(images, dict):
- for thumbnail_id, thumbnail in images.items():
- if not isinstance(thumbnail, dict):
- continue
- thumbnail_url = url_or_none(thumbnail.get('url'))
- if not thumbnail_url:
- continue
- thumbnails.append({
- 'url': thumbnail_url,
- 'width': int_or_none(thumbnail.get('width')),
- 'height': int_or_none(thumbnail.get('height')),
- })
-
return {
'id': video_id,
- 'title': title,
- 'description': description,
- 'duration': duration,
- 'timestamp': timestamp,
- 'thumbnails': thumbnails,
- 'uploader': uploader,
- 'uploader_id': uploader_id,
- 'repost_count': repost_count,
- 'comment_count': comment_count,
- 'categories': categories,
- 'tags': tags,
'formats': formats,
+ 'duration': duration,
+ 'webpage_url': f'https://www.pinterest.com/pin/{video_id}/',
'extractor_key': PinterestIE.ie_key(),
+ 'extractor': PinterestIE.IE_NAME,
+ **info,
}
class PinterestIE(PinterestBaseIE):
_VALID_URL = r'%s/pin/(?P<id>\d+)' % PinterestBaseIE._VALID_URL_BASE
_TESTS = [{
+ # formats found in data['videos']
'url': 'https://www.pinterest.com/pin/664281013778109217/',
'md5': '6550c2af85d6d9f3fe3b88954d1577fc',
'info_dict': {
'id': '664281013778109217',
'ext': 'mp4',
'title': 'Origami',
- 'description': 'md5:b9d90ddf7848e897882de9e73344f7dd',
+ 'description': 'md5:e29801cab7d741ea8c741bc50c8d00ab',
'duration': 57.7,
'timestamp': 1593073622,
'upload_date': '20200625',
- 'uploader': 'Love origami -I am Dafei',
- 'uploader_id': '586523688879454212',
- 'repost_count': 50,
- 'comment_count': 0,
+ 'repost_count': int,
+ 'comment_count': int,
'categories': list,
'tags': list,
+ 'thumbnail': r're:^https?://.*\.(?:jpg|png)$',
+ },
+ }, {
+ # formats found in data['story_pin_data']
+ 'url': 'https://www.pinterest.com/pin/1084663891475263837/',
+ 'md5': '069ac19919ab9e1e13fa60de46290b03',
+ 'info_dict': {
+ 'id': '1084663891475263837',
+ 'ext': 'mp4',
+ 'title': 'Gadget, Cool products, Amazon product, technology, Kitchen gadgets',
+ 'description': 'md5:d0a4b6ae996ff0c6eed83bc869598d13',
+ 'uploader': 'CoolCrazyGadgets',
+ 'uploader_id': '1084664028912989237',
+ 'upload_date': '20211003',
+ 'timestamp': 1633246654.0,
+ 'duration': 14.9,
+ 'comment_count': int,
+ 'repost_count': int,
+ 'categories': 'count:9',
+ 'tags': list,
+ 'thumbnail': r're:^https?://.*\.(?:jpg|png)$',
+ },
+ }, {
+ # vimeo.com embed
+ 'url': 'https://www.pinterest.ca/pin/441282463481903715/',
+ 'info_dict': {
+ 'id': '111691128',
+ 'ext': 'mp4',
+ 'title': 'Tonite Let\'s All Make Love In London (1967)',
+ 'description': 'md5:8190f37b3926807809ec57ec21aa77b2',
+ 'uploader': 'Vimeo',
+ 'uploader_id': '473792960706651251',
+ 'upload_date': '20180120',
+ 'timestamp': 1516409040,
+ 'duration': 3404,
+ 'comment_count': int,
+ 'repost_count': int,
+ 'categories': 'count:9',
+ 'tags': [],
+ 'thumbnail': r're:^https?://.*\.(?:jpg|png)$',
+ 'uploader_url': 'https://vimeo.com/willardandrade',
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
},
}, {
'url': 'https://co.pinterest.com/pin/824721750502199491/',
diff --git a/hypervideo_dl/extractor/pladform.py b/hypervideo_dl/extractor/pladform.py
index dcf18e1..0050068 100644
--- a/hypervideo_dl/extractor/pladform.py
+++ b/hypervideo_dl/extractor/pladform.py
@@ -78,7 +78,7 @@ class PladformIE(InfoExtractor):
expected=True)
if not video:
- targetUrl = self._request_webpage(url, video_id, note='Resolving final URL').geturl()
+ targetUrl = self._request_webpage(url, video_id, note='Resolving final URL').url
if targetUrl == url:
raise ExtractorError('Can\'t parse page')
return self.url_result(targetUrl)
diff --git a/hypervideo_dl/extractor/platzi.py b/hypervideo_dl/extractor/platzi.py
index b8a4414..166b98c 100644
--- a/hypervideo_dl/extractor/platzi.py
+++ b/hypervideo_dl/extractor/platzi.py
@@ -36,7 +36,7 @@ class PlatziBaseIE(InfoExtractor):
headers={'Referer': self._LOGIN_URL})
# login succeeded
- if 'platzi.com/login' not in urlh.geturl():
+ if 'platzi.com/login' not in urlh.url:
return
login_error = self._webpage_read_content(
diff --git a/hypervideo_dl/extractor/playplustv.py b/hypervideo_dl/extractor/playplustv.py
index 316f220..a4439c8 100644
--- a/hypervideo_dl/extractor/playplustv.py
+++ b/hypervideo_dl/extractor/playplustv.py
@@ -1,13 +1,9 @@
import json
from .common import InfoExtractor
-from ..compat import compat_HTTPError
-from ..utils import (
- clean_html,
- ExtractorError,
- int_or_none,
- PUTRequest,
-)
+from ..networking import PUTRequest
+from ..networking.exceptions import HTTPError
+from ..utils import ExtractorError, clean_html, int_or_none
class PlayPlusTVIE(InfoExtractor):
@@ -47,9 +43,9 @@ class PlayPlusTVIE(InfoExtractor):
try:
self._token = self._download_json(req, None)['token']
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 401:
raise ExtractorError(self._parse_json(
- e.cause.read(), None)['errorMessage'], expected=True)
+ e.cause.response.read(), None)['errorMessage'], expected=True)
raise
self._profile = self._call_api('Profiles')['list'][0]['_id']
diff --git a/hypervideo_dl/extractor/playsuisse.py b/hypervideo_dl/extractor/playsuisse.py
index a635ac9..76288c7 100644
--- a/hypervideo_dl/extractor/playsuisse.py
+++ b/hypervideo_dl/extractor/playsuisse.py
@@ -5,10 +5,16 @@ from ..utils import int_or_none, traverse_obj
class PlaySuisseIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?playsuisse\.ch/watch/(?P<id>[0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?playsuisse\.ch/(?:watch|detail)/(?:[^#]*[?&]episodeId=)?(?P<id>[0-9]+)'
_TESTS = [
{
+ # Old URL
'url': 'https://www.playsuisse.ch/watch/763211/0',
+ 'only_matching': True,
+ },
+ {
+ # episode in a series
+ 'url': 'https://www.playsuisse.ch/watch/763182?episodeId=763211',
'md5': '82df2a470b2dfa60c2d33772a8a60cf8',
'info_dict': {
'id': '763211',
@@ -21,11 +27,11 @@ class PlaySuisseIE(InfoExtractor):
'season_number': 1,
'episode': 'Knochen',
'episode_number': 1,
- 'thumbnail': 'md5:9260abe0c0ec9b69914d0a10d54c5878'
+ 'thumbnail': 're:https://playsuisse-img.akamaized.net/',
}
- },
- {
- 'url': 'https://www.playsuisse.ch/watch/808675/0',
+ }, {
+ # film
+ 'url': 'https://www.playsuisse.ch/watch/808675',
'md5': '818b94c1d2d7c4beef953f12cb8f3e75',
'info_dict': {
'id': '808675',
@@ -33,26 +39,60 @@ class PlaySuisseIE(InfoExtractor):
'title': 'Der Läufer',
'description': 'md5:9f61265c7e6dcc3e046137a792b275fd',
'duration': 5280,
- 'episode': 'Der Läufer',
- 'thumbnail': 'md5:44af7d65ee02bbba4576b131868bb783'
+ 'thumbnail': 're:https://playsuisse-img.akamaized.net/',
}
- },
- {
- 'url': 'https://www.playsuisse.ch/watch/817193/0',
- 'md5': '1d6c066f92cd7fffd8b28a53526d6b59',
+ }, {
+ # series (treated as a playlist)
+ 'url': 'https://www.playsuisse.ch/detail/1115687',
'info_dict': {
- 'id': '817193',
- 'ext': 'mp4',
- 'title': 'Die Einweihungsparty',
- 'description': 'md5:91ebf04d3a42cb3ab70666acf750a930',
- 'duration': 1380,
- 'series': 'Nr. 47',
- 'season': 'Season 1',
- 'season_number': 1,
- 'episode': 'Die Einweihungsparty',
- 'episode_number': 1,
- 'thumbnail': 'md5:637585fb106e3a4bcd991958924c7e44'
- }
+ 'description': 'md5:e4a2ae29a8895823045b5c3145a02aa3',
+ 'id': '1115687',
+ 'series': 'They all came out to Montreux',
+ 'title': 'They all came out to Montreux',
+ },
+ 'playlist': [{
+ 'info_dict': {
+ 'description': 'md5:f2462744834b959a31adc6292380cda2',
+ 'duration': 3180,
+ 'episode': 'Folge 1',
+ 'episode_number': 1,
+ 'id': '1112663',
+ 'season': 'Season 1',
+ 'season_number': 1,
+ 'series': 'They all came out to Montreux',
+ 'thumbnail': 're:https://playsuisse-img.akamaized.net/',
+ 'title': 'Folge 1',
+ 'ext': 'mp4'
+ },
+ }, {
+ 'info_dict': {
+ 'description': 'md5:9dfd308699fe850d3bce12dc1bad9b27',
+ 'duration': 2935,
+ 'episode': 'Folge 2',
+ 'episode_number': 2,
+ 'id': '1112661',
+ 'season': 'Season 1',
+ 'season_number': 1,
+ 'series': 'They all came out to Montreux',
+ 'thumbnail': 're:https://playsuisse-img.akamaized.net/',
+ 'title': 'Folge 2',
+ 'ext': 'mp4'
+ },
+ }, {
+ 'info_dict': {
+ 'description': 'md5:14a93a3356b2492a8f786ab2227ef602',
+ 'duration': 2994,
+ 'episode': 'Folge 3',
+ 'episode_number': 3,
+ 'id': '1112664',
+ 'season': 'Season 1',
+ 'season_number': 1,
+ 'series': 'They all came out to Montreux',
+ 'thumbnail': 're:https://playsuisse-img.akamaized.net/',
+ 'title': 'Folge 3',
+ 'ext': 'mp4'
+ }
+ }],
}
]
@@ -142,6 +182,6 @@ class PlaySuisseIE(InfoExtractor):
'subtitles': subtitles,
'series': media_data.get('seriesName'),
'season_number': int_or_none(media_data.get('seasonNumber')),
- 'episode': media_data.get('name'),
+ 'episode': media_data.get('name') if media_data.get('episodeNumber') else None,
'episode_number': int_or_none(media_data.get('episodeNumber')),
}
diff --git a/hypervideo_dl/extractor/plutotv.py b/hypervideo_dl/extractor/plutotv.py
index 71a05cc..caffeb2 100644
--- a/hypervideo_dl/extractor/plutotv.py
+++ b/hypervideo_dl/extractor/plutotv.py
@@ -84,6 +84,17 @@ class PlutoTVIE(InfoExtractor):
}, {
'url': 'https://pluto.tv/it/on-demand/series/csi-vegas/episode/legacy-2021-1-1',
'only_matching': True,
+ },
+ {
+ 'url': 'https://pluto.tv/en/on-demand/movies/attack-of-the-killer-tomatoes-1977-1-1-ptv1',
+ 'md5': '7db56369c0da626a32d505ec6eb3f89f',
+ 'info_dict': {
+ 'id': '5b190c7bb0875c36c90c29c4',
+ 'ext': 'mp4',
+ 'title': 'Attack of the Killer Tomatoes',
+ 'description': 'A group of scientists band together to save the world from mutated tomatoes that KILL! (1978)',
+ 'duration': 5700,
+ }
}
]
@@ -103,7 +114,7 @@ class PlutoTVIE(InfoExtractor):
compat_urlparse.urljoin(first_segment_url.group(1), '0-end/master.m3u8'))
continue
first_segment_url = re.search(
- r'^(https?://.*/).+\-0+\.ts$', res,
+ r'^(https?://.*/).+\-0+[0-1]0\.ts$', res,
re.MULTILINE)
if first_segment_url:
m3u8_urls.add(
diff --git a/hypervideo_dl/extractor/polskieradio.py b/hypervideo_dl/extractor/polskieradio.py
index 99244f6..5bf92b9 100644
--- a/hypervideo_dl/extractor/polskieradio.py
+++ b/hypervideo_dl/extractor/polskieradio.py
@@ -2,24 +2,24 @@ import itertools
import json
import math
import re
+import urllib.parse
from .common import InfoExtractor
-from ..compat import (
- compat_str,
- compat_urllib_parse_unquote,
- compat_urlparse
-)
+from ..compat import compat_str
from ..utils import (
- extract_attributes,
ExtractorError,
InAdvancePagedList,
+ determine_ext,
+ extract_attributes,
int_or_none,
js_to_json,
parse_iso8601,
strip_or_none,
- unified_timestamp,
+ traverse_obj,
unescapeHTML,
+ unified_timestamp,
url_or_none,
+ urljoin,
)
@@ -42,34 +42,17 @@ class PolskieRadioBaseExtractor(InfoExtractor):
'duration': int_or_none(media.get('length')),
'vcodec': 'none' if media.get('provider') == 'audio' else None,
})
- entry_title = compat_urllib_parse_unquote(media['desc'])
+ entry_title = urllib.parse.unquote(media['desc'])
if entry_title:
entry['title'] = entry_title
yield entry
-class PolskieRadioIE(PolskieRadioBaseExtractor):
- _VALID_URL = r'https?://(?:www\.)?polskieradio(?:24)?\.pl/\d+/\d+/Artykul/(?P<id>[0-9]+)'
- _TESTS = [{ # Old-style single broadcast.
- 'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943,Prof-Andrzej-Nowak-o-historii-nie-da-sie-myslec-beznamietnie',
- 'info_dict': {
- 'id': '1587943',
- 'title': 'Prof. Andrzej Nowak: o historii nie da się myśleć beznamiętnie',
- 'description': 'md5:12f954edbf3120c5e7075e17bf9fc5c5',
- },
- 'playlist': [{
- 'md5': '2984ee6ce9046d91fc233bc1a864a09a',
- 'info_dict': {
- 'id': '1540576',
- 'ext': 'mp3',
- 'title': 'md5:d4623290d4ac983bf924061c75c23a0d',
- 'timestamp': 1456594200,
- 'upload_date': '20160227',
- 'duration': 2364,
- 'thumbnail': r're:^https?://static\.prsa\.pl/images/.*\.jpg$'
- },
- }],
- }, { # New-style single broadcast.
+class PolskieRadioLegacyIE(PolskieRadioBaseExtractor):
+ # legacy sites
+ IE_NAME = 'polskieradio:legacy'
+ _VALID_URL = r'https?://(?:www\.)?polskieradio(?:24)?\.pl/\d+/\d+/[Aa]rtykul/(?P<id>\d+)'
+ _TESTS = [{
'url': 'https://www.polskieradio.pl/8/2382/Artykul/2534482,Zagarysci-Poezja-jak-spoiwo',
'info_dict': {
'id': '2534482',
@@ -97,16 +80,6 @@ class PolskieRadioIE(PolskieRadioBaseExtractor):
'title': 'Pogłos 29 października godz. 23:01',
},
}, {
- 'url': 'http://polskieradio.pl/9/305/Artykul/1632955,Bardzo-popularne-slowo-remis',
- 'only_matching': True,
- }, {
- 'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943',
- 'only_matching': True,
- }, {
- # with mp4 video
- 'url': 'http://www.polskieradio.pl/9/299/Artykul/1634903,Brexit-Leszek-Miller-swiat-sie-nie-zawali-Europa-bedzie-trwac-dalej',
- 'only_matching': True,
- }, {
'url': 'https://polskieradio24.pl/130/4503/Artykul/2621876,Narusza-nasza-suwerennosc-Publicysci-o-uzaleznieniu-funduszy-UE-od-praworzadnosci',
'only_matching': True,
}]
@@ -114,7 +87,9 @@ class PolskieRadioIE(PolskieRadioBaseExtractor):
def _real_extract(self, url):
playlist_id = self._match_id(url)
- webpage = self._download_webpage(url, playlist_id)
+ webpage, urlh = self._download_webpage_handle(url, playlist_id)
+ if PolskieRadioIE.suitable(urlh.url):
+ return self.url_result(urlh.url, PolskieRadioIE, playlist_id)
content = self._search_regex(
r'(?s)<div[^>]+class="\s*this-article\s*"[^>]*>(.+?)<div[^>]+class="tags"[^>]*>',
@@ -153,26 +128,201 @@ class PolskieRadioIE(PolskieRadioBaseExtractor):
return self.playlist_result(entries, playlist_id, title, description)
-class PolskieRadioCategoryIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+(?:,[^/]+)?/(?P<id>\d+)'
+class PolskieRadioIE(PolskieRadioBaseExtractor):
+ # new next.js sites
+ _VALID_URL = r'https?://(?:[^/]+\.)?(?:polskieradio(?:24)?|radiokierowcow)\.pl/artykul/(?P<id>\d+)'
+ _TESTS = [{
+ # articleData, attachments
+ 'url': 'https://jedynka.polskieradio.pl/artykul/1587943',
+ 'info_dict': {
+ 'id': '1587943',
+ 'title': 'Prof. Andrzej Nowak: o historii nie da się myśleć beznamiętnie',
+ 'description': 'md5:12f954edbf3120c5e7075e17bf9fc5c5',
+ },
+ 'playlist': [{
+ 'md5': '2984ee6ce9046d91fc233bc1a864a09a',
+ 'info_dict': {
+ 'id': '7a85d429-5356-4def-a347-925e4ae7406b',
+ 'ext': 'mp3',
+ 'title': 'md5:d4623290d4ac983bf924061c75c23a0d',
+ },
+ }],
+ }, {
+ # post, legacy html players
+ 'url': 'https://trojka.polskieradio.pl/artykul/2589163,Czy-wciaz-otrzymujemy-zdjecia-z-sond-Voyager',
+ 'info_dict': {
+ 'id': '2589163',
+ 'title': 'Czy wciąż otrzymujemy zdjęcia z sond Voyager?',
+ 'description': 'md5:cf1a7f348d63a2db9c0d7a63d1669473',
+ },
+ 'playlist': [{
+ 'info_dict': {
+ 'id': '2577880',
+ 'ext': 'mp3',
+ 'title': 'md5:a57d10a0c02abd34dd675cb33707ad5a',
+ 'duration': 321,
+ },
+ }],
+ }, {
+ # data, legacy
+ 'url': 'https://radiokierowcow.pl/artykul/2694529',
+ 'info_dict': {
+ 'id': '2694529',
+ 'title': 'Zielona fala reliktem przeszłości?',
+ 'description': 'md5:f20a9a7ed9cb58916c54add94eae3bc0',
+ },
+ 'playlist_count': 3,
+ }, {
+ 'url': 'https://trojka.polskieradio.pl/artykul/1632955',
+ 'only_matching': True,
+ }, {
+ # with mp4 video
+ 'url': 'https://trojka.polskieradio.pl/artykul/1634903',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://jedynka.polskieradio.pl/artykul/3042436,Polityka-wschodnia-ojca-i-syna-Wladyslawa-Lokietka-i-Kazimierza-Wielkiego',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ playlist_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, playlist_id)
+
+ article_data = traverse_obj(
+ self._search_nextjs_data(webpage, playlist_id), (
+ 'props', 'pageProps', (('data', 'articleData'), 'post', 'data')), get_all=False)
+
+ title = strip_or_none(article_data['title'])
+
+ description = strip_or_none(article_data.get('lead'))
+
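+        # audio attachments are identified by the UUID embedded in their file URL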
+ entries = [{
+ 'url': entry['file'],
+ 'ext': determine_ext(entry.get('fileName')),
+ 'id': self._search_regex(
+ r'([a-f\d]{8}-(?:[a-f\d]{4}-){3}[a-f\d]{12})', entry['file'], 'entry id'),
+ 'title': strip_or_none(entry.get('description')) or title,
+ } for entry in article_data.get('attachments') or () if entry.get('fileType') in ('Audio', )]
+
+ if not entries:
+            # some legacy articles have no JSON attachments, but embed players in the body
+ entries = self._extract_webpage_player_entries(article_data['content'], playlist_id, {
+ 'title': title,
+ })
+
+ return self.playlist_result(entries, playlist_id, title, description)
+
+
+class PolskieRadioAuditionIE(InfoExtractor):
+ # new next.js sites
+ IE_NAME = 'polskieradio:audition'
+ _VALID_URL = r'https?://(?:[^/]+\.)?polskieradio\.pl/audycj[ae]/(?P<id>\d+)'
_TESTS = [{
- 'url': 'http://www.polskieradio.pl/7/5102,HISTORIA-ZYWA',
+ # articles, PR1
+ 'url': 'https://jedynka.polskieradio.pl/audycje/5102',
'info_dict': {
'id': '5102',
- 'title': 'HISTORIA ŻYWA',
+ 'title': 'Historia żywa',
+ 'thumbnail': r're:https://static\.prsa\.pl/images/.+',
},
'playlist_mincount': 38,
}, {
- 'url': 'http://www.polskieradio.pl/7/4807',
+ # episodes, PR1
+ 'url': 'https://jedynka.polskieradio.pl/audycje/5769',
'info_dict': {
- 'id': '4807',
- 'title': 'Vademecum 1050. rocznicy Chrztu Polski'
+ 'id': '5769',
+ 'title': 'AgroFakty',
+ 'thumbnail': r're:https://static\.prsa\.pl/images/.+',
},
- 'playlist_mincount': 5
+ 'playlist_mincount': 269,
}, {
- 'url': 'http://www.polskieradio.pl/7/129,Sygnaly-dnia?ref=source',
- 'only_matching': True
+ # both episodes and articles, PR3
+ 'url': 'https://trojka.polskieradio.pl/audycja/8906',
+ 'info_dict': {
+ 'id': '8906',
+ 'title': 'Trójka budzi',
+ 'thumbnail': r're:https://static\.prsa\.pl/images/.+',
+ },
+ 'playlist_mincount': 722,
}, {
+        # some articles were "promoted to main page" and thus link to the old frontend
+ 'url': 'https://trojka.polskieradio.pl/audycja/305',
+ 'info_dict': {
+ 'id': '305',
+ 'title': 'Co w mowie piszczy?',
+ 'thumbnail': r're:https://static\.prsa\.pl/images/.+',
+ },
+ 'playlist_count': 1523,
+ }]
+
+ def _call_lp3(self, path, query, video_id, note):
+ return self._download_json(
+ f'https://lp3test.polskieradio.pl/{path}', video_id, note,
+ query=query, headers={'x-api-key': '9bf6c5a2-a7d0-4980-9ed7-a3f7291f2a81'})
+
+ def _entries(self, playlist_id, has_episodes, has_articles):
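+        # episodes and articles are paginated separately; fetch pages until the API returns no data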
+ for i in itertools.count(1) if has_episodes else []:
+ page = self._call_lp3(
+ 'AudioArticle/GetListByCategoryId', {
+ 'categoryId': playlist_id,
+ 'PageSize': 10,
+ 'skip': i,
+ 'format': 400,
+ }, playlist_id, f'Downloading episode list page {i}')
+ if not traverse_obj(page, 'data'):
+ break
+ for episode in page['data']:
+ yield {
+ 'id': str(episode['id']),
+ 'url': episode['file'],
+ 'title': episode.get('title'),
+ 'duration': int_or_none(episode.get('duration')),
+ 'timestamp': parse_iso8601(episode.get('datePublic')),
+ }
+
+ for i in itertools.count(1) if has_articles else []:
+ page = self._call_lp3(
+ 'Article/GetListByCategoryId', {
+ 'categoryId': playlist_id,
+ 'PageSize': 9,
+ 'skip': i,
+ 'format': 400,
+ }, playlist_id, f'Downloading article list page {i}')
+ if not traverse_obj(page, 'data'):
+ break
+ for article in page['data']:
+ yield {
+ '_type': 'url_transparent',
+ 'id': str(article['id']),
+ 'url': article['url'],
+ 'title': article.get('shortTitle'),
+ 'description': traverse_obj(article, ('description', 'lead')),
+ 'timestamp': parse_iso8601(article.get('datePublic')),
+ }
+
+ def _real_extract(self, url):
+ playlist_id = self._match_id(url)
+
+ page_props = traverse_obj(
+ self._search_nextjs_data(self._download_webpage(url, playlist_id), playlist_id),
+ ('props', 'pageProps', ('data', None)), get_all=False)
+
+ has_episodes = bool(traverse_obj(page_props, 'episodes', 'audios'))
+ has_articles = bool(traverse_obj(page_props, 'articles'))
+
+ return self.playlist_result(
+ self._entries(playlist_id, has_episodes, has_articles), playlist_id,
+ title=traverse_obj(page_props, ('details', 'name')),
+ description=traverse_obj(page_props, ('details', 'description', 'lead')),
+ thumbnail=traverse_obj(page_props, ('details', 'photo')))
+
+
+class PolskieRadioCategoryIE(InfoExtractor):
+ # legacy sites
+ IE_NAME = 'polskieradio:category'
+ _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/(?:\d+(?:,[^/]+)?/|[^/]+/Tag)(?P<id>\d+)'
+ _TESTS = [{
'url': 'http://www.polskieradio.pl/37,RedakcjaKatolicka/4143,Kierunek-Krakow',
'info_dict': {
'id': '4143',
@@ -187,8 +337,35 @@ class PolskieRadioCategoryIE(InfoExtractor):
},
'playlist_mincount': 61
}, {
- 'url': 'http://www.polskieradio.pl/7,Jedynka/5102,HISTORIA-ZYWA',
- 'only_matching': True,
+ # billennium tabs
+ 'url': 'https://www.polskieradio.pl/8/2385',
+ 'info_dict': {
+ 'id': '2385',
+ 'title': 'Droga przez mąkę',
+ },
+ 'playlist_mincount': 111,
+ }, {
+ 'url': 'https://www.polskieradio.pl/10/4930',
+ 'info_dict': {
+ 'id': '4930',
+ 'title': 'Teraz K-pop!',
+ },
+ 'playlist_mincount': 392,
+ }, {
+        # postback pages, which serve audio content directly without articles
+ 'url': 'https://www.polskieradio.pl/8,dwojka/7376,nowa-mowa',
+ 'info_dict': {
+ 'id': '7376',
+ 'title': 'Nowa mowa',
+ },
+ 'playlist_mincount': 244,
+ }, {
+ 'url': 'https://www.polskieradio.pl/Krzysztof-Dziuba/Tag175458',
+ 'info_dict': {
+ 'id': '175458',
+ 'title': 'Krzysztof Dziuba',
+ },
+ 'playlist_mincount': 420,
}, {
'url': 'http://www.polskieradio.pl/8,Dwojka/196,Publicystyka',
'only_matching': True,
@@ -196,35 +373,73 @@ class PolskieRadioCategoryIE(InfoExtractor):
@classmethod
def suitable(cls, url):
- return False if PolskieRadioIE.suitable(url) else super(PolskieRadioCategoryIE, cls).suitable(url)
+ return False if PolskieRadioLegacyIE.suitable(url) else super().suitable(url)
def _entries(self, url, page, category_id):
content = page
+ is_billennium_tabs = 'onclick="TB_LoadTab(' in page
+ is_post_back = 'onclick="__doPostBack(' in page
+ pagination = page if is_billennium_tabs else None
for page_num in itertools.count(2):
for a_entry, entry_id in re.findall(
- r'(?s)<article[^>]+>.*?(<a[^>]+href=["\']/\d+/\d+/Artykul/(\d+)[^>]+>).*?</article>',
+ r'(?s)<article[^>]+>.*?(<a[^>]+href=["\'](?:(?:https?)?://[^/]+)?/\d+/\d+/Artykul/(\d+)[^>]+>).*?</article>',
content):
entry = extract_attributes(a_entry)
- href = entry.get('href')
- if not href:
- continue
- yield self.url_result(
- compat_urlparse.urljoin(url, href), PolskieRadioIE.ie_key(),
- entry_id, entry.get('title'))
- mobj = re.search(
- r'<div[^>]+class=["\']next["\'][^>]*>\s*<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1',
- content)
- if not mobj:
- break
- next_url = compat_urlparse.urljoin(url, mobj.group('url'))
- content = self._download_webpage(
- next_url, category_id, 'Downloading page %s' % page_num)
+ if entry.get('href'):
+ yield self.url_result(
+ urljoin(url, entry['href']), PolskieRadioLegacyIE, entry_id, entry.get('title'))
+ for a_entry in re.findall(r'<span data-media=({[^ ]+})', content):
+ yield traverse_obj(self._parse_json(a_entry, category_id), {
+ 'url': 'file',
+ 'id': 'uid',
+ 'duration': 'length',
+ 'title': ('title', {urllib.parse.unquote}),
+ 'description': ('desc', {urllib.parse.unquote}),
+ })
+ if is_billennium_tabs:
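+                # re-send the arguments of the onclick="TB_LoadTab(...)" call to the
+                # TemplateBoxTabContent endpoint to fetch the next page of the tab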
+ params = self._search_json(
+ r'<div[^>]+class=["\']next["\'][^>]*>\s*<a[^>]+onclick=["\']TB_LoadTab\(',
+ pagination, 'next page params', category_id, default=None, close_objects=1,
+ contains_pattern='.+', transform_source=lambda x: '[%s' % js_to_json(unescapeHTML(x)))
+ if not params:
+ break
+ tab_content = self._download_json(
+ 'https://www.polskieradio.pl/CMS/TemplateBoxesManagement/TemplateBoxTabContent.aspx/GetTabContent',
+ category_id, f'Downloading page {page_num}', headers={'content-type': 'application/json'},
+ data=json.dumps(dict(zip((
+ 'boxInstanceId', 'tabId', 'categoryType', 'sectionId', 'categoryId', 'pagerMode',
+ 'subjectIds', 'tagIndexId', 'queryString', 'name', 'openArticlesInParentTemplate',
+ 'idSectionFromUrl', 'maxDocumentAge', 'showCategoryForArticle', 'pageNumber'
+ ), params))).encode())['d']
+ content, pagination = tab_content['Content'], tab_content.get('PagerContent')
+ elif is_post_back:
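+                # ASP.NET-style paging: re-POST the page's hidden form state with a "Next" event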
+ target = self._search_regex(
+ r'onclick=(?:["\'])__doPostBack\((?P<q1>["\'])(?P<target>[\w$]+)(?P=q1)\s*,\s*(?P<q2>["\'])Next(?P=q2)',
+ content, 'pagination postback target', group='target', default=None)
+ if not target:
+ break
+ content = self._download_webpage(
+ url, category_id, f'Downloading page {page_num}',
+ data=urllib.parse.urlencode({
+ **self._hidden_inputs(content),
+ '__EVENTTARGET': target,
+ '__EVENTARGUMENT': 'Next',
+ }).encode())
+ else:
+ next_url = urljoin(url, self._search_regex(
+ r'<div[^>]+class=["\']next["\'][^>]*>\s*<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1',
+ content, 'next page url', group='url', default=None))
+ if not next_url:
+ break
+ content = self._download_webpage(next_url, category_id, f'Downloading page {page_num}')
def _real_extract(self, url):
category_id = self._match_id(url)
- webpage = self._download_webpage(url, category_id)
+ webpage, urlh = self._download_webpage_handle(url, category_id)
+ if PolskieRadioAuditionIE.suitable(urlh.url):
+ return self.url_result(urlh.url, PolskieRadioAuditionIE, category_id)
title = self._html_search_regex(
- r'<title>([^<]+) - [^<]+ - [^<]+</title>',
+ r'<title>([^<]+)(?: - [^<]+ - [^<]+| w [Pp]olskie[Rr]adio\.pl\s*)</title>',
webpage, 'title', fatal=False)
return self.playlist_result(
self._entries(url, webpage, category_id),
@@ -358,7 +573,7 @@ class PolskieRadioPodcastListIE(PolskieRadioPodcastBaseExtractor):
'entries': InAdvancePagedList(
get_page, math.ceil(data['itemCount'] / self._PAGE_SIZE), self._PAGE_SIZE),
'id': str(data['id']),
- 'title': data['title'],
+ 'title': data.get('title'),
'description': data.get('description'),
'uploader': data.get('announcer'),
}
@@ -374,6 +589,10 @@ class PolskieRadioPodcastIE(PolskieRadioPodcastBaseExtractor):
'ext': 'mp3',
'title': 'Theresa May rezygnuje. Co dalej z brexitem?',
'description': 'md5:e41c409a29d022b70ef0faa61dbded60',
+ 'episode': 'Theresa May rezygnuje. Co dalej z brexitem?',
+ 'duration': 2893,
+ 'thumbnail': 'https://static.prsa.pl/images/58649376-c8a0-4ba2-a714-78b383285f5f.jpg',
+ 'series': 'Raport o stanie świata',
},
}]
@@ -389,39 +608,3 @@ class PolskieRadioPodcastIE(PolskieRadioPodcastBaseExtractor):
'Content-Type': 'application/json',
})
return self._parse_episode(data[0])
-
-
-class PolskieRadioRadioKierowcowIE(PolskieRadioBaseExtractor):
- _VALID_URL = r'https?://(?:www\.)?radiokierowcow\.pl/artykul/(?P<id>[0-9]+)'
- IE_NAME = 'polskieradio:kierowcow'
-
- _TESTS = [{
- 'url': 'https://radiokierowcow.pl/artykul/2694529',
- 'info_dict': {
- 'id': '2694529',
- 'title': 'Zielona fala reliktem przeszłości?',
- 'description': 'md5:343950a8717c9818fdfd4bd2b8ca9ff2',
- },
- 'playlist_count': 3,
- }]
-
- def _real_extract(self, url):
- media_id = self._match_id(url)
- webpage = self._download_webpage(url, media_id)
- nextjs_build = self._search_nextjs_data(webpage, media_id)['buildId']
- article = self._download_json(
- f'https://radiokierowcow.pl/_next/data/{nextjs_build}/artykul/{media_id}.json?articleId={media_id}',
- media_id)
- data = article['pageProps']['data']
- title = data['title']
- entries = self._extract_webpage_player_entries(data['content'], media_id, {
- 'title': title,
- })
-
- return {
- '_type': 'playlist',
- 'id': media_id,
- 'entries': entries,
- 'title': title,
- 'description': data.get('lead'),
- }
diff --git a/hypervideo_dl/extractor/porn91.py b/hypervideo_dl/extractor/porn91.py
index af4a0dc..7d16a16 100644
--- a/hypervideo_dl/extractor/porn91.py
+++ b/hypervideo_dl/extractor/porn91.py
@@ -1,26 +1,48 @@
+import urllib.parse
from .common import InfoExtractor
from ..utils import (
- parse_duration,
+ determine_ext,
int_or_none,
+ parse_duration,
+ remove_end,
+ unified_strdate,
ExtractorError,
)
class Porn91IE(InfoExtractor):
IE_NAME = '91porn'
- _VALID_URL = r'(?:https?://)(?:www\.|)91porn\.com/.+?\?viewkey=(?P<id>[\w\d]+)'
+ _VALID_URL = r'(?:https?://)(?:www\.|)91porn\.com/view_video.php\?([^#]+&)?viewkey=(?P<id>\w+)'
- _TEST = {
+ _TESTS = [{
'url': 'http://91porn.com/view_video.php?viewkey=7e42283b4f5ab36da134',
- 'md5': '7fcdb5349354f40d41689bd0fa8db05a',
+ 'md5': 'd869db281402e0ef4ddef3c38b866f86',
'info_dict': {
'id': '7e42283b4f5ab36da134',
'title': '18岁大一漂亮学妹,水嫩性感,再爽一次!',
+ 'description': 'md5:1ff241f579b07ae936a54e810ad2e891',
'ext': 'mp4',
'duration': 431,
+ 'upload_date': '20150520',
+ 'comment_count': int,
+ 'view_count': int,
+ 'age_limit': 18,
+ }
+ }, {
+ 'url': 'https://91porn.com/view_video.php?viewkey=7ef0cf3d362c699ab91c',
+ 'md5': 'f8fd50540468a6d795378cd778b40226',
+ 'info_dict': {
+ 'id': '7ef0cf3d362c699ab91c',
+ 'title': '真实空乘,冲上云霄第二部',
+ 'description': 'md5:618bf9652cafcc66cd277bd96789baea',
+ 'ext': 'mp4',
+ 'duration': 248,
+ 'upload_date': '20221119',
+ 'comment_count': int,
+ 'view_count': int,
'age_limit': 18,
}
- }
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -29,32 +51,45 @@ class Porn91IE(InfoExtractor):
webpage = self._download_webpage(
'http://91porn.com/view_video.php?viewkey=%s' % video_id, video_id)
- if '作为游客,你每天只可观看10个视频' in webpage:
- raise ExtractorError('91 Porn says: Daily limit 10 videos exceeded', expected=True)
+ if '视频不存在,可能已经被删除或者被举报为不良内容!' in webpage:
+ raise ExtractorError('91 Porn says: Video does not exist', expected=True)
- title = self._search_regex(
- r'<div id="viewvideo-title">([^<]+)</div>', webpage, 'title')
- title = title.replace('\n', '')
+ daily_limit = self._search_regex(
+ r'作为游客,你每天只可观看([\d]+)个视频', webpage, 'exceeded daily limit', default=None, fatal=False)
+ if daily_limit:
+ raise ExtractorError(f'91 Porn says: Daily limit {daily_limit} videos exceeded', expected=True)
video_link_url = self._search_regex(
- r'<textarea[^>]+id=["\']fm-video_link[^>]+>([^<]+)</textarea>',
- webpage, 'video link')
- videopage = self._download_webpage(video_link_url, video_id)
-
- info_dict = self._parse_html5_media_entries(url, videopage, video_id)[0]
-
- duration = parse_duration(self._search_regex(
- r'时长:\s*</span>\s*(\d+:\d+)', webpage, 'duration', fatal=False))
+ r'document\.write\(\s*strencode2\s*\(\s*((?:"[^"]+")|(?:\'[^\']+\'))', webpage, 'video link')
+ video_link_url = self._search_regex(
+ r'src=["\']([^"\']+)["\']', urllib.parse.unquote(video_link_url), 'unquoted video link')
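+        # strencode2() on the page just URL-decodes its argument into a <video>
+        # tag, so unquoting it here exposes the real src attribute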
- comment_count = int_or_none(self._search_regex(
- r'留言:\s*</span>\s*(\d+)', webpage, 'comment count', fatal=False))
+ formats, subtitles = self._get_formats_and_subtitle(video_link_url, video_id)
- info_dict.update({
+ return {
'id': video_id,
- 'title': title,
- 'duration': duration,
- 'comment_count': comment_count,
- 'age_limit': self._rta_search(webpage),
- })
+ 'title': remove_end(self._html_extract_title(webpage).replace('\n', ''), 'Chinese homemade video').strip(),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'upload_date': unified_strdate(self._search_regex(
+ r'<span\s+class=["\']title-yakov["\']>(\d{4}-\d{2}-\d{2})</span>', webpage, 'upload_date', fatal=False)),
+ 'description': self._html_search_regex(
+ r'<span\s+class=["\']more title["\']>\s*([^<]+)', webpage, 'description', fatal=False),
+ 'duration': parse_duration(self._search_regex(
+ r'时长:\s*<span[^>]*>\s*(\d+(?::\d+){1,2})', webpage, 'duration', fatal=False)),
+ 'comment_count': int_or_none(self._search_regex(
+ r'留言:\s*<span[^>]*>\s*(\d+)\s*</span>', webpage, 'comment count', fatal=False)),
+ 'view_count': int_or_none(self._search_regex(
+ r'热度:\s*<span[^>]*>\s*(\d+)\s*</span>', webpage, 'view count', fatal=False)),
+ 'age_limit': 18,
+ }
+
+ def _get_formats_and_subtitle(self, video_link_url, video_id):
+ ext = determine_ext(video_link_url)
+ if ext == 'm3u8':
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_link_url, video_id, ext='mp4')
+ else:
+ formats = [{'url': video_link_url, 'ext': ext}]
+ subtitles = {}
- return info_dict
+ return formats, subtitles
diff --git a/hypervideo_dl/extractor/pornez.py b/hypervideo_dl/extractor/pornez.py
index df0e44a..bc45f86 100644
--- a/hypervideo_dl/extractor/pornez.py
+++ b/hypervideo_dl/extractor/pornez.py
@@ -1,41 +1,60 @@
from .common import InfoExtractor
-from ..utils import int_or_none
+from ..utils import (
+ clean_html,
+ int_or_none,
+ get_element_by_class,
+ urljoin,
+)
class PornezIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?pornez\.net/video(?P<id>[0-9]+)/'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?pornez\.net/(?:video(?P<id>\w+)|watch)/'
+ _TESTS = [{
'url': 'https://pornez.net/video344819/mistresst-funny_penis_names-wmv/',
- 'md5': '2e19a0a1cff3a5dbea0ef1b9e80bcbbc',
'info_dict': {
'id': '344819',
'ext': 'mp4',
- 'title': r'mistresst funny_penis_names wmv',
+ 'title': 'mistresst funny_penis_names wmv',
'thumbnail': r're:^https?://.*\.jpg$',
'age_limit': 18,
- }
- }
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }, {
+ 'url': 'https://pornez.net/watch/leana+lovings+stiff+for+stepdaughter/',
+ 'info_dict': {
+ 'id': '156161',
+ 'ext': 'mp4',
+ 'title': 'Watch leana lovings stiff for stepdaughter porn video.',
+ 'age_limit': 18,
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }, {
+ 'url': 'https://pornez.net/videovzs27fj/tutor4k-e14-blue-wave-1080p-nbq-tutor4k-e14-blue-wave/',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- iframe_src = self._html_search_regex(
- r'<iframe[^>]+src="(https?://pornez\.net/player/\?[^"]+)"', webpage, 'iframe', fatal=True)
- title = self._html_search_meta(['name', 'twitter:title', 'og:title'], webpage, 'title', default=None)
- if title is None:
- title = self._search_regex(r'<h1>(.*?)</h1>', webpage, 'title', fatal=True)
- thumbnail = self._html_search_meta(['thumbnailUrl'], webpage, 'title', default=None)
- webpage = self._download_webpage(iframe_src, video_id)
- entries = self._parse_html5_media_entries(iframe_src, webpage, video_id)[0]
- for format in entries['formats']:
- height = self._search_regex(r'_(\d+)\.m3u8', format['url'], 'height')
- format['format_id'] = '%sp' % height
- format['height'] = int_or_none(height)
+ if not video_id:
+ video_id = self._search_regex(
+ r'<link[^>]+\bhref=["\']https?://pornez.net/\?p=(\w+)["\']', webpage, 'id')
+
+ iframe_src = self._html_search_regex(r'<iframe[^>]+src="([^"]+)"', webpage, 'iframe')
+ iframe = self._download_webpage(urljoin('https://pornez.net', iframe_src), video_id)
+
+ entries = self._parse_html5_media_entries(iframe_src, iframe, video_id)[0]
+ for fmt in entries['formats']:
+ height = self._search_regex(r'_(\d+)\.m3u8', fmt['url'], 'height')
+ fmt['format_id'] = '%sp' % height
+ fmt['height'] = int_or_none(height)
entries.update({
'id': video_id,
- 'title': title,
- 'thumbnail': thumbnail,
- 'age_limit': 18
+ 'title': (clean_html(get_element_by_class('video-title', webpage))
+ or self._html_search_meta(
+ ['twitter:title', 'og:title', 'description'], webpage, 'title', default=None)),
+ 'thumbnail': self._html_search_meta(['thumbnailUrl'], webpage, 'thumb', default=None),
+ 'age_limit': 18,
})
return entries
diff --git a/hypervideo_dl/extractor/pornhub.py b/hypervideo_dl/extractor/pornhub.py
index 5d8d7c1..999d038 100644
--- a/hypervideo_dl/extractor/pornhub.py
+++ b/hypervideo_dl/extractor/pornhub.py
@@ -3,11 +3,12 @@ import itertools
import math
import operator
import re
-import urllib.request
from .common import InfoExtractor
from .openload import PhantomJSwrapper
-from ..compat import compat_HTTPError, compat_str
+from ..compat import compat_str
+from ..networking import Request
+from ..networking.exceptions import HTTPError
from ..utils import (
NO_DEFAULT,
ExtractorError,
@@ -46,8 +47,8 @@ class PornHubBaseIE(InfoExtractor):
r'document\.cookie\s*=\s*["\']RNKEY=',
r'document\.location\.reload\(true\)')):
url_or_request = args[0]
- url = (url_or_request.get_full_url()
- if isinstance(url_or_request, urllib.request.Request)
+ url = (url_or_request.url
+ if isinstance(url_or_request, Request)
else url_or_request)
phantom = PhantomJSwrapper(self, required_version='2.0')
phantom.get(url, html=webpage)
@@ -58,6 +59,12 @@ class PornHubBaseIE(InfoExtractor):
def _real_initialize(self):
self._logged_in = False
+ def _set_age_cookies(self, host):
+ self._set_cookie(host, 'age_verified', '1')
+ self._set_cookie(host, 'accessAgeDisclaimerPH', '1')
+ self._set_cookie(host, 'accessAgeDisclaimerUK', '1')
+ self._set_cookie(host, 'accessPH', '1')
+
def _login(self, host):
if self._logged_in:
return
@@ -267,8 +274,7 @@ class PornHubIE(PornHubBaseIE):
video_id = mobj.group('id')
self._login(host)
-
- self._set_cookie(host, 'age_verified', '1')
+ self._set_age_cookies(host)
def dl_webpage(platform):
self._set_cookie(host, 'platform', platform)
@@ -569,6 +575,7 @@ class PornHubUserIE(PornHubPlaylistBaseIE):
mobj = self._match_valid_url(url)
user_id = mobj.group('id')
videos_url = '%s/videos' % mobj.group('url')
+ self._set_age_cookies(mobj.group('host'))
page = self._extract_page(url)
if page:
videos_url = update_url_query(videos_url, {'page': page})
@@ -597,7 +604,7 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE):
base_url, item_id, note, query={'page': num})
def is_404(e):
- return isinstance(e.cause, compat_HTTPError) and e.cause.code == 404
+ return isinstance(e.cause, HTTPError) and e.cause.status == 404
base_url = url
has_page = page is not None
@@ -633,6 +640,7 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE):
item_id = mobj.group('id')
self._login(host)
+ self._set_age_cookies(host)
return self.playlist_result(self._entries(url, host, item_id), item_id)
@@ -812,5 +820,6 @@ class PornHubPlaylistIE(PornHubPlaylistBaseIE):
item_id = mobj.group('id')
self._login(host)
+ self._set_age_cookies(host)
return self.playlist_result(self._entries(mobj.group('url'), host, item_id), item_id)
diff --git a/hypervideo_dl/extractor/pr0gramm.py b/hypervideo_dl/extractor/pr0gramm.py
new file mode 100644
index 0000000..2eb327f
--- /dev/null
+++ b/hypervideo_dl/extractor/pr0gramm.py
@@ -0,0 +1,97 @@
+import re
+
+from .common import InfoExtractor
+from ..utils import merge_dicts
+
+
+class Pr0grammStaticIE(InfoExtractor):
+ # Possible urls:
+ # https://pr0gramm.com/static/5466437
+ _VALID_URL = r'https?://pr0gramm\.com/static/(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'https://pr0gramm.com/static/5466437',
+ 'md5': '52fa540d70d3edc286846f8ca85938aa',
+ 'info_dict': {
+ 'id': '5466437',
+ 'ext': 'mp4',
+ 'title': 'pr0gramm-5466437 by g11st',
+ 'uploader': 'g11st',
+ 'upload_date': '20221221',
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ # Fetch media sources
+ entries = self._parse_html5_media_entries(url, webpage, video_id)
+ media_info = entries[0]
+
+ # Fetch author
+ uploader = self._html_search_regex(r'by\W+([\w-]+)\W+', webpage, 'uploader')
+
+ # Fetch approx upload timestamp from filename
+ # Have None-defaults in case the extraction fails
+ uploadDay = None
+ uploadMon = None
+ uploadYear = None
+ uploadTimestr = None
+ # (//img.pr0gramm.com/2022/12/21/62ae8aa5e2da0ebf.mp4)
+ m = re.search(r'//img\.pr0gramm\.com/(?P<year>[\d]+)/(?P<mon>[\d]+)/(?P<day>[\d]+)/\w+\.\w{,4}', webpage)
+
+ if (m):
+ # Up to a day of accuracy should suffice...
+ uploadDay = m.groupdict().get('day')
+ uploadMon = m.groupdict().get('mon')
+ uploadYear = m.groupdict().get('year')
+ uploadTimestr = uploadYear + uploadMon + uploadDay
+
+ return merge_dicts({
+ 'id': video_id,
+ 'title': 'pr0gramm-%s%s' % (video_id, (' by ' + uploader) if uploader else ''),
+ 'uploader': uploader,
+ 'upload_date': uploadTimestr
+ }, media_info)
+
+
+# This extractor handles the primary URL (the one used for sharing, which
+# appears in the location bar). Since that page builds its DOM via JS, yt-dl
+# can't find any video information there, so we redirect to a compatibility
+# version of the site, which contains the <video> element by itself, without
+# requiring JS to run.
+class Pr0grammIE(InfoExtractor):
+ # Possible urls:
+ # https://pr0gramm.com/new/546637
+ # https://pr0gramm.com/new/video/546637
+ # https://pr0gramm.com/top/546637
+ # https://pr0gramm.com/top/video/546637
+ # https://pr0gramm.com/user/g11st/uploads/5466437
+ # https://pr0gramm.com/user/froschler/dafur-ist-man-hier/5091290
+ # https://pr0gramm.com/user/froschler/reinziehen-1elf/5232030
+ # https://pr0gramm.com/user/froschler/1elf/5232030
+ # https://pr0gramm.com/new/5495710:comment62621020 <- this is not the id!
+ # https://pr0gramm.com/top/fruher war alles damals/5498175
+
+ _VALID_URL = r'https?:\/\/pr0gramm\.com\/(?!static/\d+).+?\/(?P<id>[\d]+)(:|$)'
+ _TEST = {
+ 'url': 'https://pr0gramm.com/new/video/5466437',
+ 'info_dict': {
+ 'id': '5466437',
+ 'ext': 'mp4',
+ 'title': 'pr0gramm-5466437 by g11st',
+ 'uploader': 'g11st',
+ 'upload_date': '20221221',
+ }
+ }
+
+ def _generic_title():
+ return "oof"
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ return self.url_result(
+ 'https://pr0gramm.com/static/' + video_id,
+ video_id=video_id,
+ ie=Pr0grammStaticIE.ie_key())
diff --git a/hypervideo_dl/extractor/prankcast.py b/hypervideo_dl/extractor/prankcast.py
index 0eb5f98..b2ec5bb 100644
--- a/hypervideo_dl/extractor/prankcast.py
+++ b/hypervideo_dl/extractor/prankcast.py
@@ -18,7 +18,7 @@ class PrankCastIE(InfoExtractor):
'cast': ['Devonanustart', 'Phonelosers'],
'description': '',
'categories': ['prank'],
- 'tags': ['prank call', 'prank'],
+ 'tags': ['prank call', 'prank', 'live show'],
'upload_date': '20220825'
}
}, {
@@ -35,7 +35,7 @@ class PrankCastIE(InfoExtractor):
'cast': ['phonelosers'],
'description': '',
'categories': ['prank'],
- 'tags': ['prank call', 'prank'],
+ 'tags': ['prank call', 'prank', 'live show'],
'upload_date': '20221006'
}
}]
@@ -62,5 +62,5 @@ class PrankCastIE(InfoExtractor):
'cast': list(filter(None, [uploader] + traverse_obj(guests_json, (..., 'name')))),
'description': json_info.get('broadcast_description'),
'categories': [json_info.get('broadcast_category')],
- 'tags': self._parse_json(json_info.get('broadcast_tags') or '{}', video_id)
+ 'tags': try_call(lambda: json_info['broadcast_tags'].split(','))
}
diff --git a/hypervideo_dl/extractor/puhutv.py b/hypervideo_dl/extractor/puhutv.py
index 482e570..4b8e5e9 100644
--- a/hypervideo_dl/extractor/puhutv.py
+++ b/hypervideo_dl/extractor/puhutv.py
@@ -1,8 +1,6 @@
from .common import InfoExtractor
-from ..compat import (
- compat_HTTPError,
- compat_str,
-)
+from ..compat import compat_str
+from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
int_or_none,
@@ -72,7 +70,7 @@ class PuhuTVIE(InfoExtractor):
display_id, 'Downloading video JSON',
headers=self.geo_verification_headers())
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 403:
self.raise_geo_restricted()
raise
diff --git a/hypervideo_dl/extractor/qdance.py b/hypervideo_dl/extractor/qdance.py
new file mode 100644
index 0000000..62b08b3
--- /dev/null
+++ b/hypervideo_dl/extractor/qdance.py
@@ -0,0 +1,150 @@
+import json
+import time
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ jwt_decode_hs256,
+ str_or_none,
+ traverse_obj,
+ try_call,
+ url_or_none,
+)
+
+
+class QDanceIE(InfoExtractor):
+ _NETRC_MACHINE = 'qdance'
+ _VALID_URL = r'https?://(?:www\.)?q-dance\.com/network/(?:library|live)/(?P<id>\d+)'
+ _TESTS = [{
+ 'note': 'vod',
+ 'url': 'https://www.q-dance.com/network/library/146542138',
+ 'info_dict': {
+ 'id': '146542138',
+ 'ext': 'mp4',
+ 'title': 'Sound Rush [LIVE] | Defqon.1 Weekend Festival 2022 | Friday | RED',
+ 'display_id': 'sound-rush-live-v3-defqon-1-weekend-festival-2022-friday-red',
+ 'description': 'Relive Defqon.1 - Primal Energy 2022 with the sounds of Sound Rush LIVE at the RED on Friday! 🔥',
+ 'season': 'Defqon.1 Weekend Festival 2022',
+ 'season_id': '31840632',
+ 'series': 'Defqon.1',
+ 'series_id': '31840378',
+ 'thumbnail': 'https://images.q-dance.network/1674829540-20220624171509-220624171509_delio_dn201093-2.jpg',
+ 'availability': 'premium_only',
+ 'duration': 1829,
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }, {
+ 'note': 'livestream',
+ 'url': 'https://www.q-dance.com/network/live/149170353',
+ 'info_dict': {
+ 'id': '149170353',
+ 'ext': 'mp4',
+ 'title': r're:^Defqon\.1 2023 - Friday - RED',
+ 'display_id': 'defqon-1-2023-friday-red',
+ 'description': 'md5:3c73fbbd4044e578e696adfc64019163',
+ 'season': 'Defqon.1 Weekend Festival 2023',
+ 'season_id': '141735599',
+ 'series': 'Defqon.1',
+ 'series_id': '31840378',
+ 'thumbnail': 'https://images.q-dance.network/1686849069-area-thumbs_red.png',
+ 'availability': 'subscriber_only',
+ 'live_status': 'is_live',
+ 'channel_id': 'qdancenetwork.video_149170353',
+ },
+ 'skip': 'Completed livestream',
+ }]
+
+ _access_token = None
+ _refresh_token = None
+
+ def _call_login_api(self, data, note='Logging in'):
+ login = self._download_json(
+ 'https://members.id-t.com/api/auth/login', None, note, headers={
+ 'content-type': 'application/json',
+ 'brand': 'qdance',
+ 'origin': 'https://www.q-dance.com',
+ 'referer': 'https://www.q-dance.com/',
+ }, data=json.dumps(data, separators=(',', ':')).encode(),
+ expected_status=lambda x: True)
+
+ tokens = traverse_obj(login, ('data', {
+ '_id-t-accounts-token': ('accessToken', {str}),
+ '_id-t-accounts-refresh': ('refreshToken', {str}),
+ '_id-t-accounts-id-token': ('idToken', {str}),
+ }))
+
+ if not tokens.get('_id-t-accounts-token'):
+ error = ': '.join(traverse_obj(login, ('error', ('code', 'message'), {str})))
+ if 'validation_error' not in error:
+ raise ExtractorError(f'Q-Dance API said "{error}"')
+ msg = 'Invalid username or password' if 'email' in data else 'Refresh token has expired'
+ raise ExtractorError(msg, expected=True)
+
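+        # mirror the tokens into cookies so _real_initialize() can read them back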
+ for name, value in tokens.items():
+ self._set_cookie('.q-dance.com', name, value)
+
+ def _perform_login(self, username, password):
+ self._call_login_api({'email': username, 'password': password})
+
+ def _real_initialize(self):
+ cookies = self._get_cookies('https://www.q-dance.com/')
+ self._refresh_token = try_call(lambda: cookies['_id-t-accounts-refresh'].value)
+ self._access_token = try_call(lambda: cookies['_id-t-accounts-token'].value)
+ if not self._access_token:
+ self.raise_login_required()
+
+ def _get_auth(self):
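+        # re-login with the stored refresh token once the access token's JWT has expired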
+ if (try_call(lambda: jwt_decode_hs256(self._access_token)['exp']) or 0) <= int(time.time() - 120):
+ if not self._refresh_token:
+ raise ExtractorError(
+ 'Cannot refresh access token, login with hypervideo or refresh cookies in browser')
+ self._call_login_api({'refreshToken': self._refresh_token}, note='Refreshing access token')
+ self._real_initialize()
+
+ return {'Authorization': self._access_token}
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ data = self._search_nuxt_data(webpage, video_id, traverse=('data', 0, 'data'))
+
+ def extract_availability(level):
+ level = int_or_none(level) or 0
+ return self._availability(
+ needs_premium=(level >= 20), needs_subscription=(level >= 15), needs_auth=True)
+
+ info = traverse_obj(data, {
+ 'title': ('title', {str.strip}),
+ 'description': ('description', {str.strip}),
+ 'display_id': ('slug', {str}),
+ 'thumbnail': ('thumbnail', {url_or_none}),
+ 'duration': ('durationInSeconds', {int_or_none}, {lambda x: x or None}),
+ 'availability': ('subscription', 'level', {extract_availability}),
+ 'is_live': ('type', {lambda x: x.lower() == 'live'}),
+ 'artist': ('acts', ..., {str}),
+ 'series': ('event', 'title', {str.strip}),
+ 'series_id': ('event', 'id', {str_or_none}),
+ 'season': ('eventEdition', 'title', {str.strip}),
+ 'season_id': ('eventEdition', 'id', {str_or_none}),
+ 'channel_id': ('pubnub', 'channelName', {str}),
+ })
+
+ stream = self._download_json(
+ f'https://dc9h6qmsoymbq.cloudfront.net/api/content/videos/{video_id}/url',
+ video_id, headers=self._get_auth(), expected_status=401)
+
+ m3u8_url = traverse_obj(stream, ('data', 'url', {url_or_none}))
+ if not m3u8_url and traverse_obj(stream, ('error', 'code')) == 'unauthorized':
+ raise ExtractorError('Your account does not have access to this content', expected=True)
+
+ formats = self._extract_m3u8_formats(
+ m3u8_url, video_id, fatal=False, live=True) if m3u8_url else []
+ if not formats:
+ self.raise_no_formats('No active streams found', expected=bool(info.get('is_live')))
+
+ return {
+ **info,
+ 'id': video_id,
+ 'formats': formats,
+ }
diff --git a/hypervideo_dl/extractor/radiko.py b/hypervideo_dl/extractor/radiko.py
index f102922..cef68eb 100644
--- a/hypervideo_dl/extractor/radiko.py
+++ b/hypervideo_dl/extractor/radiko.py
@@ -1,5 +1,4 @@
import base64
-import re
import urllib.parse
from .common import InfoExtractor
@@ -15,6 +14,23 @@ from ..utils import (
class RadikoBaseIE(InfoExtractor):
_FULL_KEY = None
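+ # Time Free playlist hosts, grouped by whether ffmpeg can download their streams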
+ _HOSTS_FOR_TIME_FREE_FFMPEG_UNSUPPORTED = (
+ 'https://c-rpaa.smartstream.ne.jp',
+ 'https://si-c-radiko.smartstream.ne.jp',
+ 'https://tf-f-rpaa-radiko.smartstream.ne.jp',
+ 'https://tf-c-rpaa-radiko.smartstream.ne.jp',
+ 'https://si-f-radiko.smartstream.ne.jp',
+ 'https://rpaa.smartstream.ne.jp',
+ )
+ _HOSTS_FOR_TIME_FREE_FFMPEG_SUPPORTED = (
+ 'https://rd-wowza-radiko.radiko-cf.com',
+ 'https://radiko.jp',
+ 'https://f-radiko.smartstream.ne.jp',
+ )
+ # The following hosts forcibly serve the Live stream rather than Time Free
+ _HOSTS_FOR_LIVE = (
+ 'https://c-radiko.smartstream.ne.jp',
+ )
def _auth_client(self):
_, auth1_handle = self._download_webpage_handle(
@@ -25,7 +41,7 @@ class RadikoBaseIE(InfoExtractor):
'x-radiko-device': 'pc',
'x-radiko-user': 'dummy_user',
})
- auth1_header = auth1_handle.info()
+ auth1_header = auth1_handle.headers
auth_token = auth1_header['X-Radiko-AuthToken']
kl = int(auth1_header['X-Radiko-KeyLength'])
@@ -92,9 +108,9 @@ class RadikoBaseIE(InfoExtractor):
formats = []
found = set()
for url_tag in m3u8_urls:
- pcu = url_tag.find('playlist_create_url')
+ pcu = url_tag.find('playlist_create_url').text
url_attrib = url_tag.attrib
- playlist_url = update_url_query(pcu.text, {
+ playlist_url = update_url_query(pcu, {
'station_id': station,
**query,
'l': '15',
@@ -118,9 +134,10 @@ class RadikoBaseIE(InfoExtractor):
'X-Radiko-AuthToken': auth_token,
})
for sf in subformats:
- if re.fullmatch(r'[cf]-radiko\.smartstream\.ne\.jp', domain):
- # Prioritize live radio vs playback based on extractor
- sf['preference'] = 100 if is_onair else -100
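+ # deprioritize formats whose host serves the wrong mode (live vs. timefree),
+ # as well as timefree formats from hosts whose streams ffmpeg cannot download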
+ if (is_onair ^ pcu.startswith(self._HOSTS_FOR_LIVE)) or (
+ not is_onair and pcu.startswith(self._HOSTS_FOR_TIME_FREE_FFMPEG_UNSUPPORTED)):
+ sf['preference'] = -100
+ sf['format_note'] = 'not preferred'
if not is_onair and url_attrib['timefree'] == '1' and time_to_skip:
sf['downloader_options'] = {'ffmpeg_args': ['-ss', time_to_skip]}
formats.extend(subformats)
diff --git a/hypervideo_dl/extractor/radiocanada.py b/hypervideo_dl/extractor/radiocanada.py
index 72c21d5..1a5a635 100644
--- a/hypervideo_dl/extractor/radiocanada.py
+++ b/hypervideo_dl/extractor/radiocanada.py
@@ -1,5 +1,5 @@
from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..networking.exceptions import HTTPError
from ..utils import (
determine_ext,
ExtractorError,
@@ -74,8 +74,8 @@ class RadioCanadaIE(InfoExtractor):
return self._download_json(
'https://services.radio-canada.ca/media/' + path, video_id, query=query)
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 422):
- data = self._parse_json(e.cause.read().decode(), None)
+ if isinstance(e.cause, HTTPError) and e.cause.status in (401, 422):
+ data = self._parse_json(e.cause.response.read().decode(), None)
error = data.get('error_description') or data['errorMessage']['text']
raise ExtractorError(error, expected=True)
raise
diff --git a/hypervideo_dl/extractor/rai.py b/hypervideo_dl/extractor/rai.py
index cab12cc..df4102a 100644
--- a/hypervideo_dl/extractor/rai.py
+++ b/hypervideo_dl/extractor/rai.py
@@ -1,19 +1,12 @@
import re
from .common import InfoExtractor
-from ..compat import (
- compat_str,
- compat_urlparse,
-)
from ..utils import (
clean_html,
determine_ext,
ExtractorError,
filter_dict,
- find_xpath_attr,
- fix_xml_ampersands,
GeoRestrictedError,
- HEADRequest,
int_or_none,
join_nonempty,
parse_duration,
@@ -35,82 +28,70 @@ class RaiBaseIE(InfoExtractor):
_GEO_BYPASS = False
def _extract_relinker_info(self, relinker_url, video_id, audio_only=False):
+ def fix_cdata(s):
+ # remove \r\n\t before and after <![CDATA[ ]]> so that
+ # xpath_text does not return whitespace-polluted text
+ s = re.sub(r'(\]\]>)[\r\n\t]+(</)', '\\1\\2', s)
+ return re.sub(r'(>)[\r\n\t]+(<!\[CDATA\[)', '\\1\\2', s)
+
if not re.match(r'https?://', relinker_url):
return {'formats': [{'url': relinker_url}]}
- formats = []
- geoprotection = None
- is_live = None
- duration = None
-
- for platform in ('mon', 'flash', 'native'):
- relinker = self._download_xml(
- relinker_url, video_id,
- note=f'Downloading XML metadata for platform {platform}',
- transform_source=fix_xml_ampersands,
- query={'output': 45, 'pl': platform},
- headers=self.geo_verification_headers())
-
- if xpath_text(relinker, './license_url', default='{}') != '{}':
- self.report_drm(video_id)
+ # set the User-Agent to a generic 'Rai' to avoid quality filtering by
+ # the media server and get the maximum available qualities
+ relinker = self._download_xml(
+ relinker_url, video_id, note='Downloading XML metadata',
+ transform_source=fix_cdata, query={'output': 64},
+ headers={**self.geo_verification_headers(), 'User-Agent': 'Rai'})
- if not geoprotection:
- geoprotection = xpath_text(
- relinker, './geoprotection', default=None) == 'Y'
+ if xpath_text(relinker, './license_url', default='{}') != '{}':
+ self.report_drm(video_id)
- if not is_live:
- is_live = xpath_text(
- relinker, './is_live', default=None) == 'Y'
- if not duration:
- duration = parse_duration(xpath_text(
- relinker, './duration', default=None))
+ is_live = xpath_text(relinker, './is_live', default='N') == 'Y'
+ duration = parse_duration(xpath_text(relinker, './duration', default=None))
+ media_url = xpath_text(relinker, './url[@type="content"]', default=None)
- url_elem = find_xpath_attr(relinker, './url', 'type', 'content')
- if url_elem is None:
- continue
+ if not media_url:
+ self.raise_no_formats('The relinker returned no media url')
- media_url = url_elem.text
+ # geo flag is a bit unreliable and not properly set all the time
+ geoprotection = xpath_text(relinker, './geoprotection', default='N') == 'Y'
- # This does not imply geo restriction (e.g.
- # http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html)
- if '/video_no_available.mp4' in media_url:
- continue
-
- ext = determine_ext(media_url)
- if (ext == 'm3u8' and platform != 'mon') or (ext == 'f4m' and platform != 'flash'):
- continue
+ ext = determine_ext(media_url)
+ formats = []
- if ext == 'mp3':
- formats.append({
- 'url': media_url,
- 'vcodec': 'none',
- 'acodec': 'mp3',
- 'format_id': 'http-mp3',
- })
- break
- elif ext == 'm3u8' or 'format=m3u8' in media_url or platform == 'mon':
- formats.extend(self._extract_m3u8_formats(
- media_url, video_id, 'mp4', 'm3u8_native',
- m3u8_id='hls', fatal=False))
- elif ext == 'f4m' or platform == 'flash':
- manifest_url = update_url_query(
- media_url.replace('manifest#live_hds.f4m', 'manifest.f4m'),
- {'hdcore': '3.7.0', 'plugin': 'aasp-3.7.0.39.44'})
- formats.extend(self._extract_f4m_formats(
- manifest_url, video_id, f4m_id='hds', fatal=False))
- else:
- bitrate = int_or_none(xpath_text(relinker, 'bitrate'))
- formats.append({
- 'url': media_url,
- 'tbr': bitrate if bitrate > 0 else None,
- 'format_id': f'http-{bitrate if bitrate > 0 else "http"}',
- })
+ if ext == 'mp3':
+ formats.append({
+ 'url': media_url,
+ 'vcodec': 'none',
+ 'acodec': 'mp3',
+ 'format_id': 'https-mp3',
+ })
+ elif ext == 'm3u8' or 'format=m3u8' in media_url:
+ formats.extend(self._extract_m3u8_formats(
+ media_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
+ elif ext == 'f4m':
+ # very likely no longer needed; no known URL still uses it
+ manifest_url = update_url_query(
+ media_url.replace('manifest#live_hds.f4m', 'manifest.f4m'),
+ {'hdcore': '3.7.0', 'plugin': 'aasp-3.7.0.39.44'})
+ formats.extend(self._extract_f4m_formats(
+ manifest_url, video_id, f4m_id='hds', fatal=False))
+ elif ext == 'mp4':
+ bitrate = int_or_none(xpath_text(relinker, './bitrate'))
+ formats.append({
+ 'url': media_url,
+ 'tbr': bitrate if bitrate > 0 else None,
+ 'format_id': join_nonempty('https', bitrate, delim='-'),
+ })
+ else:
+ raise ExtractorError('Unrecognized media file found')
- if not formats and geoprotection is True:
+ if (not formats and geoprotection is True) or '/video_no_available.mp4' in media_url:
self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True)
- if not audio_only:
- formats.extend(self._create_http_urls(relinker_url, formats))
+ if not audio_only and not is_live:
+ formats.extend(self._create_http_urls(media_url, relinker_url, formats))
return filter_dict({
'is_live': is_live,
@@ -118,38 +99,31 @@ class RaiBaseIE(InfoExtractor):
'formats': formats,
})
- def _create_http_urls(self, relinker_url, fmts):
- _RELINKER_REG = r'https?://(?P<host>[^/]+?)/(?:i/)?(?P<extra>[^/]+?)/(?P<path>.+?)/(?P<id>\w+)(?:_(?P<quality>[\d\,]+))?(?:\.mp4|/playlist\.m3u8).+?'
+ def _create_http_urls(self, manifest_url, relinker_url, fmts):
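+ # build direct https mp4 formats by appending an overrideUserAgentRule parameter
+ # to the relinker url, one for each quality id parsed from the m3u8 manifest url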
+ _MANIFEST_REG = r'/(?P<id>\w+)(?:_(?P<quality>[\d\,]+))?(?:\.mp4)?(?:\.csmil)?/playlist\.m3u8'
_MP4_TMPL = '%s&overrideUserAgentRule=mp4-%s'
_QUALITY = {
# tbr: w, h
- '250': [352, 198],
- '400': [512, 288],
- '700': [512, 288],
- '800': [700, 394],
- '1200': [736, 414],
- '1800': [1024, 576],
- '2400': [1280, 720],
- '3200': [1440, 810],
- '3600': [1440, 810],
- '5000': [1920, 1080],
- '10000': [1920, 1080],
+ 250: [352, 198],
+ 400: [512, 288],
+ 600: [512, 288],
+ 700: [512, 288],
+ 800: [700, 394],
+ 1200: [736, 414],
+ 1500: [920, 518],
+ 1800: [1024, 576],
+ 2400: [1280, 720],
+ 3200: [1440, 810],
+ 3600: [1440, 810],
+ 5000: [1920, 1080],
+ 10000: [1920, 1080],
}
- def test_url(url):
- resp = self._request_webpage(
- HEADRequest(url), None, headers={'User-Agent': 'Rai'},
- fatal=False, errnote=False, note=False)
-
- if resp is False:
+ def percentage(number, target, pc=20, roof=125):
+ '''check whether target is within pc percent of number, with the tolerance capped at roof'''
+ if not number or number < 0:
return False
-
- if resp.code == 200:
- return False if resp.url == url else resp.url
- return None
-
- # filter out audio-only formats
- fmts = [f for f in fmts if not f.get('vcodec') == 'none']
+ return abs(target - number) < min(float(number) * float(pc) / 100.0, roof)
def get_format_info(tbr):
import math
@@ -157,67 +131,78 @@ class RaiBaseIE(InfoExtractor):
if len(fmts) == 1 and not br:
br = fmts[0].get('tbr')
if br and br > 300:
- tbr = compat_str(math.floor(br / 100) * 100)
+ tbr = math.floor(br / 100) * 100
else:
- tbr = '250'
+ tbr = 250
# try extracting info from available m3u8 formats
- format_copy = None
+ format_copy = [None, None]
for f in fmts:
if f.get('tbr'):
- br_limit = math.floor(br / 100)
- if br_limit - 1 <= math.floor(f['tbr'] / 100) <= br_limit + 1:
- format_copy = f.copy()
+ if percentage(tbr, f['tbr']):
+ format_copy[0] = f.copy()
+ if [f.get('width'), f.get('height')] == _QUALITY.get(tbr):
+ format_copy[1] = f.copy()
+ format_copy[1]['tbr'] = tbr
+
+ # prefer the format with a similar bitrate because there may be
+ # multiple videos with the same resolution but different bitrates
+ format_copy = format_copy[0] or format_copy[1] or {}
return {
+ 'format_id': f'https-{tbr}',
'width': format_copy.get('width'),
'height': format_copy.get('height'),
'tbr': format_copy.get('tbr'),
'vcodec': format_copy.get('vcodec'),
'acodec': format_copy.get('acodec'),
'fps': format_copy.get('fps'),
- 'format_id': f'https-{tbr}',
} if format_copy else {
+ 'format_id': f'https-{tbr}',
'width': _QUALITY[tbr][0],
'height': _QUALITY[tbr][1],
- 'format_id': f'https-{tbr}',
- 'tbr': int(tbr),
+ 'tbr': tbr,
+ 'vcodec': 'avc1',
+ 'acodec': 'mp4a',
+ 'fps': 25,
}
- loc = test_url(_MP4_TMPL % (relinker_url, '*'))
- if not isinstance(loc, compat_str):
- return []
+ # filter out audio-only and video-only formats
+ fmts = [f for f in fmts
+ if not f.get('vcodec') == 'none' and not f.get('acodec') == 'none']
- mobj = re.match(
- _RELINKER_REG,
- test_url(relinker_url) or '')
+ mobj = re.search(_MANIFEST_REG, manifest_url)
if not mobj:
return []
-
available_qualities = mobj.group('quality').split(',') if mobj.group('quality') else ['*']
- available_qualities = [i for i in available_qualities if i]
formats = []
- for q in available_qualities:
- fmt = {
+ for q in filter(None, available_qualities):
+ self.write_debug(f'Creating https format for quality {q}')
+ formats.append({
'url': _MP4_TMPL % (relinker_url, q),
'protocol': 'https',
'ext': 'mp4',
**get_format_info(q)
- }
- formats.append(fmt)
+ })
return formats
@staticmethod
+ def _get_thumbnails_list(thumbs, url):
+ return [{
+ 'url': urljoin(url, thumb_url),
+ } for thumb_url in (thumbs or {}).values() if thumb_url]
+
+ @staticmethod
def _extract_subtitles(url, video_data):
STL_EXT = 'stl'
SRT_EXT = 'srt'
subtitles = {}
- subtitles_array = video_data.get('subtitlesArray') or []
+ subtitles_array = video_data.get('subtitlesArray') or video_data.get('subtitleList') or []
for k in ('subtitles', 'subtitlesUrl'):
subtitles_array.append({'url': video_data.get(k)})
for subtitle in subtitles_array:
sub_url = subtitle.get('url')
- if sub_url and isinstance(sub_url, compat_str):
+ if sub_url and isinstance(sub_url, str):
sub_lang = subtitle.get('language') or 'it'
sub_url = urljoin(url, sub_url)
sub_ext = determine_ext(sub_url, SRT_EXT)
@@ -236,7 +221,7 @@ class RaiBaseIE(InfoExtractor):
class RaiPlayIE(RaiBaseIE):
_VALID_URL = rf'(?P<base>https?://(?:www\.)?raiplay\.it/.+?-(?P<id>{RaiBaseIE._UUID_RE}))\.(?:html|json)'
_TESTS = [{
- 'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html',
+ 'url': 'https://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html',
'md5': '8970abf8caf8aef4696e7b1f2adfc696',
'info_dict': {
'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391',
@@ -244,22 +229,20 @@ class RaiPlayIE(RaiBaseIE):
'title': 'Report del 07/04/2014',
'alt_title': 'St 2013/14 - Report - Espresso nel caffè - 07/04/2014',
'description': 'md5:d730c168a58f4bb35600fc2f881ec04e',
- 'thumbnail': r're:^https?://.*\.jpg$',
- 'uploader': 'Rai Gulp',
+ 'thumbnail': r're:^https?://www\.raiplay\.it/.+\.jpg',
+ 'uploader': 'Rai 3',
+ 'creator': 'Rai 3',
'duration': 6160,
'series': 'Report',
'season': '2013/14',
- 'subtitles': {
- 'it': 'count:4',
- },
+ 'subtitles': {'it': 'count:4'},
'release_year': 2022,
'episode': 'Espresso nel caffè - 07/04/2014',
'timestamp': 1396919880,
'upload_date': '20140408',
+ 'formats': 'count:4',
},
- 'params': {
- 'skip_download': True,
- },
+ 'params': {'skip_download': True},
}, {
# 1080p direct mp4 url
'url': 'https://www.raiplay.it/video/2021/11/Blanca-S1E1-Senza-occhi-b1255a4a-8e72-4a2f-b9f3-fc1308e00736.html',
@@ -270,8 +253,9 @@ class RaiPlayIE(RaiBaseIE):
'title': 'Blanca - S1E1 - Senza occhi',
'alt_title': 'St 1 Ep 1 - Blanca - Senza occhi',
'description': 'md5:75f95d5c030ec8bac263b1212322e28c',
- 'thumbnail': r're:^https?://.*\.jpg$',
- 'uploader': 'Rai 1',
+ 'thumbnail': r're:^https://www\.raiplay\.it/dl/img/.+\.jpg',
+ 'uploader': 'Rai Premium',
+ 'creator': 'Rai Fiction',
'duration': 6493,
'series': 'Blanca',
'season': 'Season 1',
@@ -281,6 +265,30 @@ class RaiPlayIE(RaiBaseIE):
'episode': 'Senza occhi',
'timestamp': 1637318940,
'upload_date': '20211119',
+ 'formats': 'count:12',
+ },
+ 'params': {'skip_download': True},
+ 'expected_warnings': ['Video not available. Likely due to geo-restriction.']
+ }, {
+ # 1500 quality
+ 'url': 'https://www.raiplay.it/video/2012/09/S1E11---Tutto-cio-che-luccica-0cab3323-732e-45d6-8e86-7704acab6598.html',
+ 'md5': 'a634d20e8ab2d43724c273563f6bf87a',
+ 'info_dict': {
+ 'id': '0cab3323-732e-45d6-8e86-7704acab6598',
+ 'ext': 'mp4',
+ 'title': 'Mia and Me - S1E11 - Tutto ciò che luccica',
+ 'alt_title': 'St 1 Ep 11 - Mia and Me - Tutto ciò che luccica',
+ 'description': 'md5:4969e594184b1920c4c1f2b704da9dea',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'uploader': 'Rai Gulp',
+ 'series': 'Mia and Me',
+ 'season': 'Season 1',
+ 'episode_number': 11,
+ 'release_year': 2015,
+ 'season_number': 1,
+ 'episode': 'Tutto ciò che luccica',
+ 'timestamp': 1348495020,
+ 'upload_date': '20120924',
},
}, {
'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?',
@@ -299,57 +307,40 @@ class RaiPlayIE(RaiBaseIE):
base, video_id = self._match_valid_url(url).groups()
media = self._download_json(
- base + '.json', video_id, 'Downloading video JSON')
+ f'{base}.json', video_id, 'Downloading video JSON')
if not self.get_param('allow_unplayable_formats'):
- if try_get(
- media,
- (lambda x: x['rights_management']['rights']['drm'],
- lambda x: x['program_info']['rights_management']['rights']['drm']),
- dict):
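+ # the drm flag may be present under 'program_info' or at the top level of the media JSON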
+ if traverse_obj(media, (('program_info', None), 'rights_management', 'rights', 'drm')):
self.report_drm(video_id)
- title = media['name']
video = media['video']
-
relinker_info = self._extract_relinker_info(video['content_url'], video_id)
-
- thumbnails = []
- for _, value in media.get('images', {}).items():
- if value:
- thumbnails.append({
- 'url': urljoin(url, value),
- })
-
- date_published = media.get('date_published')
- time_published = media.get('time_published')
- if date_published and time_published:
- date_published += ' ' + time_published
-
- subtitles = self._extract_subtitles(url, video)
-
- program_info = media.get('program_info') or {}
+ date_published = join_nonempty(
+ media.get('date_published'), media.get('time_published'), delim=' ')
season = media.get('season')
-
alt_title = join_nonempty(media.get('subtitle'), media.get('toptitle'), delim=' - ')
return {
'id': remove_start(media.get('id'), 'ContentItem-') or video_id,
'display_id': video_id,
- 'title': title,
+ 'title': media.get('name'),
'alt_title': strip_or_none(alt_title or None),
'description': media.get('description'),
- 'uploader': strip_or_none(media.get('channel') or None),
- 'creator': strip_or_none(media.get('editor') or None),
+ 'uploader': strip_or_none(
+ traverse_obj(media, ('program_info', 'channel'))
+ or media.get('channel') or None),
+ 'creator': strip_or_none(
+ traverse_obj(media, ('program_info', 'editor'))
+ or media.get('editor') or None),
'duration': parse_duration(video.get('duration')),
'timestamp': unified_timestamp(date_published),
- 'thumbnails': thumbnails,
- 'series': program_info.get('name'),
+ 'thumbnails': self._get_thumbnails_list(media.get('images'), url),
+ 'series': traverse_obj(media, ('program_info', 'name')),
'season_number': int_or_none(season),
'season': season if (season and not season.isdigit()) else None,
'episode': media.get('episode_title'),
'episode_number': int_or_none(media.get('episode')),
- 'subtitles': subtitles,
+ 'subtitles': self._extract_subtitles(url, video),
'release_year': int_or_none(traverse_obj(media, ('track_info', 'edit_year'))),
**relinker_info
}
@@ -371,38 +362,39 @@ class RaiPlayLiveIE(RaiPlayIE): # XXX: Do not subclass from concrete IE
'live_status': 'is_live',
'upload_date': '20090502',
'timestamp': 1241276220,
+ 'formats': 'count:3',
},
- 'params': {
- 'skip_download': True,
- },
+ 'params': {'skip_download': True},
}]
class RaiPlayPlaylistIE(InfoExtractor):
_VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/programmi/(?P<id>[^/?#&]+))(?:/(?P<extra_id>[^?#&]+))?'
_TESTS = [{
+ # entire series (episodes + extras)
'url': 'https://www.raiplay.it/programmi/nondirloalmiocapo/',
'info_dict': {
'id': 'nondirloalmiocapo',
'title': 'Non dirlo al mio capo',
'description': 'md5:98ab6b98f7f44c2843fd7d6f045f153b',
},
- 'playlist_mincount': 12,
+ 'playlist_mincount': 30,
}, {
+ # single season
'url': 'https://www.raiplay.it/programmi/nondirloalmiocapo/episodi/stagione-2/',
'info_dict': {
'id': 'nondirloalmiocapo',
'title': 'Non dirlo al mio capo - Stagione 2',
'description': 'md5:98ab6b98f7f44c2843fd7d6f045f153b',
},
- 'playlist_mincount': 12,
+ 'playlist_count': 12,
}]
def _real_extract(self, url):
base, playlist_id, extra_id = self._match_valid_url(url).groups()
program = self._download_json(
- base + '.json', playlist_id, 'Downloading program JSON')
+ f'{base}.json', playlist_id, 'Downloading program JSON')
if extra_id:
extra_id = extra_id.upper().rstrip('/')
@@ -450,7 +442,7 @@ class RaiPlaySoundIE(RaiBaseIE):
'title': 'Il Ruggito del Coniglio del 10/12/2021',
'alt_title': 'md5:0e6476cd57858bb0f3fcc835d305b455',
'description': 'md5:2a17d2107e59a4a8faa0e18334139ee2',
- 'thumbnail': r're:^https?://.*\.jpg$',
+ 'thumbnail': r're:^https?://.+\.jpg$',
'uploader': 'rai radio 2',
'duration': 5685,
'series': 'Il Ruggito del Coniglio',
@@ -459,9 +451,7 @@ class RaiPlaySoundIE(RaiBaseIE):
'timestamp': 1638346620,
'upload_date': '20211201',
},
- 'params': {
- 'skip_download': True,
- },
+ 'params': {'skip_download': True},
}]
def _real_extract(self, url):
@@ -480,9 +470,6 @@ class RaiPlaySoundIE(RaiBaseIE):
lambda x: x['live']['create_date']))
podcast_info = traverse_obj(media, 'podcast_info', ('live', 'cards', 0)) or {}
- thumbnails = [{
- 'url': urljoin(url, thumb_url),
- } for thumb_url in (podcast_info.get('images') or {}).values() if thumb_url]
return {
**info,
@@ -494,7 +481,7 @@ class RaiPlaySoundIE(RaiBaseIE):
'uploader': traverse_obj(media, ('track_info', 'channel'), expected_type=strip_or_none),
'creator': traverse_obj(media, ('track_info', 'editor'), expected_type=strip_or_none),
'timestamp': unified_timestamp(date_published),
- 'thumbnails': thumbnails,
+ 'thumbnails': self._get_thumbnails_list(podcast_info.get('images'), url),
'series': podcast_info.get('title'),
'season_number': int_or_none(media.get('season')),
'episode': media.get('episode_title'),
@@ -512,30 +499,30 @@ class RaiPlaySoundLiveIE(RaiPlaySoundIE): # XXX: Do not subclass from concrete
'display_id': 'radio2',
'ext': 'mp4',
'title': r're:Rai Radio 2 \d+-\d+-\d+ \d+:\d+',
- 'thumbnail': r're:https://www.raiplaysound.it/dl/img/.+?png',
+ 'thumbnail': r're:^https://www\.raiplaysound\.it/dl/img/.+\.png',
'uploader': 'rai radio 2',
'series': 'Rai Radio 2',
'creator': 'raiplaysound',
'is_live': True,
'live_status': 'is_live',
},
- 'params': {
- 'skip_download': 'live',
- },
+ 'params': {'skip_download': True},
}]
class RaiPlaySoundPlaylistIE(InfoExtractor):
_VALID_URL = r'(?P<base>https?://(?:www\.)?raiplaysound\.it/(?:programmi|playlist|audiolibri)/(?P<id>[^/?#&]+))(?:/(?P<extra_id>[^?#&]+))?'
_TESTS = [{
+ # entire show
'url': 'https://www.raiplaysound.it/programmi/ilruggitodelconiglio',
'info_dict': {
'id': 'ilruggitodelconiglio',
'title': 'Il Ruggito del Coniglio',
- 'description': 'md5:1bbaf631245a7ab1ec4d9fbb3c7aa8f3',
+ 'description': 'md5:48cff6972435964284614d70474132e6',
},
'playlist_mincount': 65,
}, {
+ # single season
'url': 'https://www.raiplaysound.it/programmi/ilruggitodelconiglio/puntate/prima-stagione-1995',
'info_dict': {
'id': 'ilruggitodelconiglio_puntate_prima-stagione-1995',
@@ -568,22 +555,19 @@ class RaiPlaySoundPlaylistIE(InfoExtractor):
class RaiIE(RaiBaseIE):
_VALID_URL = rf'https?://[^/]+\.(?:rai\.(?:it|tv))/.+?-(?P<id>{RaiBaseIE._UUID_RE})(?:-.+?)?\.html'
_TESTS = [{
- # var uniquename = "ContentItem-..."
- # data-id="ContentItem-..."
'url': 'https://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html',
'info_dict': {
'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9',
'ext': 'mp4',
'title': 'TG PRIMO TEMPO',
- 'thumbnail': r're:^https?://.*\.jpg$',
+ 'thumbnail': r're:^https?://.*\.jpg',
'duration': 1758,
'upload_date': '20140612',
},
- 'skip': 'This content is available only in Italy',
+ 'params': {'skip_download': True},
+ 'expected_warnings': ['Video not available. Likely due to geo-restriction.']
}, {
- # with ContentItem in og:url
'url': 'https://www.rai.it/dl/RaiTV/programmi/media/ContentItem-efb17665-691c-45d5-a60c-5301333cbb0c.html',
- 'md5': '06345bd97c932f19ffb129973d07a020',
'info_dict': {
'id': 'efb17665-691c-45d5-a60c-5301333cbb0c',
'ext': 'mp4',
@@ -592,123 +576,51 @@ class RaiIE(RaiBaseIE):
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 2214,
'upload_date': '20161103'
- }
+ },
+ 'params': {'skip_download': True},
}, {
- # Direct MMS URL
+ # Direct MMS: Media URL no longer works.
'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-b63a4089-ac28-48cf-bca5-9f5b5bc46df5.html',
'only_matching': True,
}]
- def _extract_from_content_id(self, content_id, url):
+ def _real_extract(self, url):
+ content_id = self._match_id(url)
media = self._download_json(
f'https://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-{content_id}.html?json',
- content_id, 'Downloading video JSON')
+ content_id, 'Downloading video JSON', fatal=False, expected_status=404)
- title = media['name'].strip()
+ if media is None:
+ return None
- media_type = media['type']
- if 'Audio' in media_type:
+ if 'Audio' in media['type']:
relinker_info = {
'formats': [{
- 'format_id': media.get('formatoAudio'),
+ 'format_id': join_nonempty('https', media.get('formatoAudio'), delim='-'),
'url': media['audioUrl'],
'ext': media.get('formatoAudio'),
+ 'vcodec': 'none',
+ 'acodec': media.get('formatoAudio'),
}]
}
- elif 'Video' in media_type:
+ elif 'Video' in media['type']:
relinker_info = self._extract_relinker_info(media['mediaUri'], content_id)
else:
raise ExtractorError('not a media file')
- thumbnails = []
- for image_type in ('image', 'image_medium', 'image_300'):
- thumbnail_url = media.get(image_type)
- if thumbnail_url:
- thumbnails.append({
- 'url': compat_urlparse.urljoin(url, thumbnail_url),
- })
-
- subtitles = self._extract_subtitles(url, media)
+ thumbnails = self._get_thumbnails_list(
+ {image_type: media.get(image_type) for image_type in (
+ 'image', 'image_medium', 'image_300')}, url)
return {
'id': content_id,
- 'title': title,
- 'description': strip_or_none(media.get('desc') or None),
+ 'title': strip_or_none(media.get('name') or media.get('title')),
+ 'description': strip_or_none(media.get('desc')) or None,
'thumbnails': thumbnails,
- 'uploader': strip_or_none(media.get('author') or None),
+ 'uploader': strip_or_none(media.get('author')) or None,
'upload_date': unified_strdate(media.get('date')),
'duration': parse_duration(media.get('length')),
- 'subtitles': subtitles,
- **relinker_info
- }
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
-
- webpage = self._download_webpage(url, video_id)
-
- content_item_id = None
-
- content_item_url = self._html_search_meta(
- ('og:url', 'og:video', 'og:video:secure_url', 'twitter:url',
- 'twitter:player', 'jsonlink'), webpage, default=None)
- if content_item_url:
- content_item_id = self._search_regex(
- rf'ContentItem-({self._UUID_RE})', content_item_url,
- 'content item id', default=None)
-
- if not content_item_id:
- content_item_id = self._search_regex(
- rf'''(?x)
- (?:
- (?:initEdizione|drawMediaRaiTV)\(|
- <(?:[^>]+\bdata-id|var\s+uniquename)=|
- <iframe[^>]+\bsrc=
- )
- (["\'])
- (?:(?!\1).)*\bContentItem-(?P<id>{self._UUID_RE})
- ''',
- webpage, 'content item id', default=None, group='id')
-
- content_item_ids = set()
- if content_item_id:
- content_item_ids.add(content_item_id)
- if video_id not in content_item_ids:
- content_item_ids.add(video_id)
-
- for content_item_id in content_item_ids:
- try:
- return self._extract_from_content_id(content_item_id, url)
- except GeoRestrictedError:
- raise
- except ExtractorError:
- pass
-
- relinker_url = self._proto_relative_url(self._search_regex(
- r'''(?x)
- (?:
- var\s+videoURL|
- mediaInfo\.mediaUri
- )\s*=\s*
- ([\'"])
- (?P<url>
- (?:https?:)?
- //mediapolis(?:vod)?\.rai\.it/relinker/relinkerServlet\.htm\?
- (?:(?!\1).)*\bcont=(?:(?!\1).)+)\1
- ''',
- webpage, 'relinker URL', group='url'))
-
- relinker_info = self._extract_relinker_info(
- urljoin(url, relinker_url), video_id)
-
- title = self._search_regex(
- r'var\s+videoTitolo\s*=\s*([\'"])(?P<title>[^\'"]+)\1',
- webpage, 'title', group='title',
- default=None) or self._og_search_title(webpage)
-
- return {
- 'id': video_id,
- 'title': title,
+ 'subtitles': self._extract_subtitles(url, media),
**relinker_info
}
@@ -726,7 +638,8 @@ class RaiNewsIE(RaiIE): # XXX: Do not subclass from concrete IE
'duration': 1589,
'upload_date': '20220529',
'uploader': 'rainews',
- }
+ },
+ 'params': {'skip_download': True},
}, {
# old content with fallback method to extract media urls
'url': 'https://www.rainews.it/dl/rainews/media/Weekend-al-cinema-da-Hollywood-arriva-il-thriller-di-Tate-Taylor-La-ragazza-del-treno-1632c009-c843-4836-bb65-80c33084a64b.html',
@@ -739,12 +652,14 @@ class RaiNewsIE(RaiIE): # XXX: Do not subclass from concrete IE
'duration': 833,
'upload_date': '20161103'
},
+ 'params': {'skip_download': True},
'expected_warnings': ['unable to extract player_data'],
}, {
# iframe + drm
'url': 'https://www.rainews.it/iframe/video/2022/07/euro2022-europei-calcio-femminile-italia-belgio-gol-0-1-video-4de06a69-de75-4e32-a657-02f0885f8118.html',
'only_matching': True,
}]
+ _PLAYER_TAG = 'news'
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -752,8 +667,8 @@ class RaiNewsIE(RaiIE): # XXX: Do not subclass from concrete IE
webpage = self._download_webpage(url, video_id)
player_data = self._search_json(
- r'<rainews-player\s*data=\'', webpage, 'player_data', video_id,
- transform_source=clean_html, fatal=False)
+ rf'<rai{self._PLAYER_TAG}-player\s*data=\'', webpage, 'player_data', video_id,
+ transform_source=clean_html, default={})
track_info = player_data.get('track_info')
relinker_url = traverse_obj(player_data, 'mediapolis', 'content_url')
@@ -770,16 +685,36 @@ class RaiNewsIE(RaiIE): # XXX: Do not subclass from concrete IE
return {
'id': video_id,
- 'title': track_info.get('title') or self._og_search_title(webpage),
+ 'title': player_data.get('title') or track_info.get('title') or self._og_search_title(webpage),
'upload_date': unified_strdate(track_info.get('date')),
'uploader': strip_or_none(track_info.get('editor') or None),
**relinker_info
}
+class RaiCulturaIE(RaiNewsIE): # XXX: Do not subclass from concrete IE
+ _VALID_URL = rf'https?://(www\.)?raicultura\.it/(?!articoli)[^?#]+-(?P<id>{RaiBaseIE._UUID_RE})(?:-[^/?#]+)?\.html'
+ _EMBED_REGEX = [rf'<iframe[^>]+data-src="(?P<url>/iframe/[^?#]+?{RaiBaseIE._UUID_RE}\.html)']
+ _TESTS = [{
+ 'url': 'https://www.raicultura.it/letteratura/articoli/2018/12/Alberto-Asor-Rosa-Letteratura-e-potere-05ba8775-82b5-45c5-a89d-dd955fbde1fb.html',
+ 'info_dict': {
+ 'id': '05ba8775-82b5-45c5-a89d-dd955fbde1fb',
+ 'ext': 'mp4',
+ 'title': 'Alberto Asor Rosa: Letteratura e potere',
+ 'duration': 1756,
+ 'upload_date': '20181206',
+ 'uploader': 'raicultura',
+ 'formats': 'count:2',
+ },
+ 'params': {'skip_download': True},
+ }]
+ _PLAYER_TAG = 'cultura'
+
+
class RaiSudtirolIE(RaiBaseIE):
- _VALID_URL = r'https?://raisudtirol\.rai\.it/.+?media=(?P<id>[TP]tv\d+)'
+ _VALID_URL = r'https?://raisudtirol\.rai\.it/.+media=(?P<id>\w+)'
_TESTS = [{
+ # mp4 file
'url': 'https://raisudtirol.rai.it/la/index.php?media=Ptv1619729460',
'info_dict': {
'id': 'Ptv1619729460',
@@ -787,34 +722,62 @@ class RaiSudtirolIE(RaiBaseIE):
'title': 'Euro: trasmisciun d\'economia - 29-04-2021 20:51',
'series': 'Euro: trasmisciun d\'economia',
'upload_date': '20210429',
- 'thumbnail': r're:https://raisudtirol\.rai\.it/img/.+?\.jpg',
+ 'thumbnail': r're:https://raisudtirol\.rai\.it/img/.+\.jpg',
'uploader': 'raisudtirol',
- }
+ 'formats': 'count:1',
+ },
+ 'params': {'skip_download': True},
+ }, {
+ # m3u manifest
+ 'url': 'https://raisudtirol.rai.it/it/kidsplayer.php?lang=it&media=GUGGUG_P1.smil',
+ 'info_dict': {
+ 'id': 'GUGGUG_P1',
+ 'ext': 'mp4',
+ 'title': 'GUGGUG! La Prospettiva - Die Perspektive',
+ 'uploader': 'raisudtirol',
+ 'formats': 'count:6',
+ },
+ 'params': {'skip_download': True},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- video_date = self._html_search_regex(r'<span class="med_data">(.+?)</span>', webpage, 'video_date', fatal=False)
- video_title = self._html_search_regex(r'<span class="med_title">(.+?)</span>', webpage, 'video_title', fatal=False)
- video_url = self._html_search_regex(r'sources:\s*\[\{file:\s*"(.+?)"\}\]', webpage, 'video_url')
- video_thumb = self._html_search_regex(r'image: \'(.+?)\'', webpage, 'video_thumb', fatal=False)
+ video_date = self._html_search_regex(
+ r'<span class="med_data">(.+?)</span>', webpage, 'video_date', default=None)
+ video_title = self._html_search_regex([
+ r'<span class="med_title">(.+?)</span>', r'title: \'(.+?)\','],
+ webpage, 'video_title', default=None)
+ video_url = self._html_search_regex([
+ r'sources:\s*\[\{file:\s*"(.+?)"\}\]',
+ r'<source\s+src="(.+?)"\s+type="application/x-mpegURL"'],
+ webpage, 'video_url', default=None)
+
+ ext = determine_ext(video_url)
+ if ext == 'm3u8':
+ formats = self._extract_m3u8_formats(video_url, video_id)
+ elif ext == 'mp4':
+ formats = [{
+ 'format_id': 'https-mp4',
+ 'url': self._proto_relative_url(video_url),
+ 'width': 1024,
+ 'height': 576,
+ 'fps': 25,
+ 'vcodec': 'avc1',
+ 'acodec': 'mp4a',
+ }]
+ else:
+ formats = []
+ self.raise_no_formats(f'Unrecognized media file: {video_url}')
return {
'id': video_id,
'title': join_nonempty(video_title, video_date, delim=' - '),
- 'series': video_title,
+ 'series': video_title if video_date else None,
'upload_date': unified_strdate(video_date),
- 'thumbnail': urljoin('https://raisudtirol.rai.it/', video_thumb),
+ 'thumbnail': urljoin('https://raisudtirol.rai.it/', self._html_search_regex(
+ r'image: \'(.+?)\'', webpage, 'video_thumb', default=None)),
'uploader': 'raisudtirol',
- 'formats': [{
- 'format_id': 'https-mp4',
- 'url': self._proto_relative_url(video_url),
- 'width': 1024,
- 'height': 576,
- 'fps': 25,
- 'vcodec': 'h264',
- 'acodec': 'aac',
- }],
+ 'formats': formats,
}
diff --git a/hypervideo_dl/extractor/rbgtum.py b/hypervideo_dl/extractor/rbgtum.py
new file mode 100644
index 0000000..47649cf
--- /dev/null
+++ b/hypervideo_dl/extractor/rbgtum.py
@@ -0,0 +1,93 @@
+import re
+
+from .common import InfoExtractor
+
+
+class RbgTumIE(InfoExtractor):
+ _VALID_URL = r'https://live\.rbg\.tum\.de/w/(?P<id>.+)'
+ _TESTS = [{
+ # Combined view
+ 'url': 'https://live.rbg.tum.de/w/cpp/22128',
+ 'md5': '53a5e7b3e07128e33bbf36687fe1c08f',
+ 'info_dict': {
+ 'id': 'cpp/22128',
+ 'ext': 'mp4',
+ 'title': 'Lecture: October 18. 2022',
+ 'series': 'Concepts of C++ programming (IN2377)',
+ }
+ }, {
+ # Presentation only
+ 'url': 'https://live.rbg.tum.de/w/I2DL/12349/PRES',
+ 'md5': '36c584272179f3e56b0db5d880639cba',
+ 'info_dict': {
+ 'id': 'I2DL/12349/PRES',
+ 'ext': 'mp4',
+ 'title': 'Lecture 3: Introduction to Neural Networks',
+ 'series': 'Introduction to Deep Learning (IN2346)',
+ }
+ }, {
+ # Camera only
+ 'url': 'https://live.rbg.tum.de/w/fvv-info/16130/CAM',
+ 'md5': 'e04189d92ff2f56aedf5cede65d37aad',
+ 'info_dict': {
+ 'id': 'fvv-info/16130/CAM',
+ 'ext': 'mp4',
+ 'title': 'Fachschaftsvollversammlung',
+ 'series': 'Fachschaftsvollversammlung Informatik',
+ }
+ }, ]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ m3u8 = self._html_search_regex(r'(https://.+?\.m3u8)', webpage, 'm3u8')
+ lecture_title = self._html_search_regex(r'(?si)<h1.*?>(.*)</h1>', webpage, 'title')
+ lecture_series_title = self._html_search_regex(
+ r'(?s)<title\b[^>]*>\s*(?:TUM-Live\s\|\s?)?([^:]+):?.*?</title>', webpage, 'series')
+
+ formats = self._extract_m3u8_formats(m3u8, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls')
+
+ return {
+ 'id': video_id,
+ 'title': lecture_title,
+ 'series': lecture_series_title,
+ 'formats': formats,
+ }
+
+
+class RbgTumCourseIE(InfoExtractor):
+ _VALID_URL = r'https://live\.rbg\.tum\.de/course/(?P<id>.+)'
+ _TESTS = [{
+ 'url': 'https://live.rbg.tum.de/course/2022/S/fpv',
+ 'info_dict': {
+ 'title': 'Funktionale Programmierung und Verifikation (IN0003)',
+ 'id': '2022/S/fpv',
+ },
+ 'params': {
+ 'noplaylist': False,
+ },
+ 'playlist_count': 13,
+ }, {
+ 'url': 'https://live.rbg.tum.de/course/2022/W/set',
+ 'info_dict': {
+ 'title': 'SET FSMPIC',
+ 'id': '2022/W/set',
+ },
+ 'params': {
+ 'noplaylist': False,
+ },
+ 'playlist_count': 6,
+ }, ]
+
+ def _real_extract(self, url):
+ course_id = self._match_id(url)
+ webpage = self._download_webpage(url, course_id)
+
+ lecture_series_title = self._html_search_regex(r'(?si)<h1.*?>(.*)</h1>', webpage, 'title')
+
+ lecture_urls = []
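+ # pick up only the combined-view links; the /CAM, /PRES and /chat variants are excluded by the lookbehinds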
+ for lecture_url in re.findall(r'(?i)href="/w/(.+)(?<!/cam)(?<!/pres)(?<!/chat)"', webpage):
+ lecture_urls.append(self.url_result('https://live.rbg.tum.de/w/' + lecture_url, ie=RbgTumIE.ie_key()))
+
+ return self.playlist_result(lecture_urls, course_id, lecture_series_title)
diff --git a/hypervideo_dl/extractor/rcs.py b/hypervideo_dl/extractor/rcs.py
index b905f8d..0fd3ca7 100644
--- a/hypervideo_dl/extractor/rcs.py
+++ b/hypervideo_dl/extractor/rcs.py
@@ -1,11 +1,20 @@
import re
from .common import InfoExtractor
+from ..networking import HEADRequest
from ..utils import (
ExtractorError,
base_url,
clean_html,
+ extract_attributes,
+ get_element_html_by_class,
+ get_element_html_by_id,
+ int_or_none,
js_to_json,
+ mimetype2ext,
+ sanitize_url,
+ traverse_obj,
+ try_call,
url_basename,
urljoin,
)
@@ -15,41 +24,8 @@ class RCSBaseIE(InfoExtractor):
# based on VideoPlayerLoader.prototype.getVideoSrc
# and VideoPlayerLoader.prototype.transformSrc from
# https://js2.corriereobjects.it/includes2013/LIBS/js/corriere_video.sjs
- _ALL_REPLACE = {
- 'media2vam.corriere.it.edgesuite.net':
- 'media2vam-corriere-it.akamaized.net',
- 'media.youreporter.it.edgesuite.net':
- 'media-youreporter-it.akamaized.net',
- 'corrierepmd.corriere.it.edgesuite.net':
- 'corrierepmd-corriere-it.akamaized.net',
- 'media2vam-corriere-it.akamaized.net/fcs.quotidiani/vr/videos/':
- 'video.corriere.it/vr360/videos/',
- '.net//': '.net/',
- }
- _MP4_REPLACE = {
- 'media2vam.corbologna.corriere.it.edgesuite.net':
- 'media2vam-bologna-corriere-it.akamaized.net',
- 'media2vam.corfiorentino.corriere.it.edgesuite.net':
- 'media2vam-fiorentino-corriere-it.akamaized.net',
- 'media2vam.cormezzogiorno.corriere.it.edgesuite.net':
- 'media2vam-mezzogiorno-corriere-it.akamaized.net',
- 'media2vam.corveneto.corriere.it.edgesuite.net':
- 'media2vam-veneto-corriere-it.akamaized.net',
- 'media2.oggi.it.edgesuite.net':
- 'media2-oggi-it.akamaized.net',
- 'media2.quimamme.it.edgesuite.net':
- 'media2-quimamme-it.akamaized.net',
- 'media2.amica.it.edgesuite.net':
- 'media2-amica-it.akamaized.net',
- 'media2.living.corriere.it.edgesuite.net':
- 'media2-living-corriere-it.akamaized.net',
- 'media2.style.corriere.it.edgesuite.net':
- 'media2-style-corriere-it.akamaized.net',
- 'media2.iodonna.it.edgesuite.net':
- 'media2-iodonna-it.akamaized.net',
- 'media2.leitv.it.edgesuite.net':
- 'media2-leitv-it.akamaized.net',
- }
+ _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}'
+ _RCS_ID_RE = r'[\w-]+-\d{10}'
_MIGRATION_MAP = {
'videoamica-vh.akamaihd': 'amica',
'media2-amica-it.akamaized': 'amica',
@@ -90,183 +66,140 @@ class RCSBaseIE(InfoExtractor):
'vivimilano-vh.akamaihd': 'vivimilano',
'media2-youreporter-it.akamaized': 'youreporter'
}
- _MIGRATION_MEDIA = {
- 'advrcs-vh.akamaihd': '',
- 'corriere-f.akamaihd': '',
- 'corrierepmd-corriere-it.akamaized': '',
- 'corrprotetto-vh.akamaihd': '',
- 'gazzetta-f.akamaihd': '',
- 'gazzettapmd-gazzetta-it.akamaized': '',
- 'gazzprotetto-vh.akamaihd': '',
- 'periodici-f.akamaihd': '',
- 'periodicisecure-vh.akamaihd': '',
- 'videocoracademy-vh.akamaihd': ''
- }
def _get_video_src(self, video):
- mediaFiles = video.get('mediaProfile').get('mediaFile')
- src = {}
- # audio
- if video.get('mediaType') == 'AUDIO':
- for aud in mediaFiles:
- # todo: check
- src['mp3'] = aud.get('value')
- # video
- else:
- for vid in mediaFiles:
- if vid.get('mimeType') == 'application/vnd.apple.mpegurl':
- src['m3u8'] = vid.get('value')
- if vid.get('mimeType') == 'video/mp4':
- src['mp4'] = vid.get('value')
+ for source in traverse_obj(video, (
+ 'mediaProfile', 'mediaFile', lambda _, v: v.get('mimeType'))):
+ url = source['value']
+ for s, r in (
+ ('media2vam.corriere.it.edgesuite.net', 'media2vam-corriere-it.akamaized.net'),
+ ('media.youreporter.it.edgesuite.net', 'media-youreporter-it.akamaized.net'),
+ ('corrierepmd.corriere.it.edgesuite.net', 'corrierepmd-corriere-it.akamaized.net'),
+ ('media2vam-corriere-it.akamaized.net/fcs.quotidiani/vr/videos/', 'video.corriere.it/vr360/videos/'),
+ ('http://', 'https://'),
+ ):
+ url = url.replace(s, r)
- # replace host
- for t in src:
- for s, r in self._ALL_REPLACE.items():
- src[t] = src[t].replace(s, r)
- for s, r in self._MP4_REPLACE.items():
- src[t] = src[t].replace(s, r)
+ type_ = mimetype2ext(source['mimeType'])
+ if type_ == 'm3u8' and '-vh.akamaihd' in url:
+ # still needed for some old content: see _TESTS #3
+ matches = re.search(r'(?:https?:)?//(?P<host>[\w\.\-]+)\.net/i(?P<path>.+)$', url)
+ if matches:
+ url = f'https://vod.rcsobjects.it/hls/{self._MIGRATION_MAP[matches.group("host")]}{matches.group("path")}'
+ if traverse_obj(video, ('mediaProfile', 'geoblocking')) or (
+ type_ == 'm3u8' and 'fcs.quotidiani_!' in url):
+ url = url.replace('vod.rcsobjects', 'vod-it.rcsobjects')
+ if type_ == 'm3u8' and 'vod' in url:
+ url = url.replace('.csmil', '.urlset')
+ if type_ == 'mp3':
+ url = url.replace('media2vam-corriere-it.akamaized.net', 'vod.rcsobjects.it/corriere')
- # switch cdn
- if 'mp4' in src and 'm3u8' in src:
- if ('-lh.akamaihd' not in src.get('m3u8')
- and 'akamai' in src.get('mp4')):
- if 'm3u8' in src:
- matches = re.search(r'(?:https*:)?\/\/(?P<host>.*)\.net\/i(?P<path>.*)$', src.get('m3u8'))
- src['m3u8'] = 'https://vod.rcsobjects.it/hls/%s%s' % (
- self._MIGRATION_MAP[matches.group('host')],
- matches.group('path').replace(
- '///', '/').replace(
- '//', '/').replace(
- '.csmil', '.urlset'
- )
- )
- if 'mp4' in src:
- matches = re.search(r'(?:https*:)?\/\/(?P<host>.*)\.net\/i(?P<path>.*)$', src.get('mp4'))
- if matches:
- if matches.group('host') in self._MIGRATION_MEDIA:
- vh_stream = 'https://media2.corriereobjects.it'
- if src.get('mp4').find('fcs.quotidiani_!'):
- vh_stream = 'https://media2-it.corriereobjects.it'
- src['mp4'] = '%s%s' % (
- vh_stream,
- matches.group('path').replace(
- '///', '/').replace(
- '//', '/').replace(
- '/fcs.quotidiani/mediacenter', '').replace(
- '/fcs.quotidiani_!/mediacenter', '').replace(
- 'corriere/content/mediacenter/', '').replace(
- 'gazzetta/content/mediacenter/', '')
- )
- else:
- src['mp4'] = 'https://vod.rcsobjects.it/%s%s' % (
- self._MIGRATION_MAP[matches.group('host')],
- matches.group('path').replace('///', '/').replace('//', '/')
- )
-
- if 'mp3' in src:
- src['mp3'] = src.get('mp3').replace(
- 'media2vam-corriere-it.akamaized.net',
- 'vod.rcsobjects.it/corriere')
- if 'mp4' in src:
- if src.get('mp4').find('fcs.quotidiani_!'):
- src['mp4'] = src.get('mp4').replace('vod.rcsobjects', 'vod-it.rcsobjects')
- if 'm3u8' in src:
- if src.get('m3u8').find('fcs.quotidiani_!'):
- src['m3u8'] = src.get('m3u8').replace('vod.rcsobjects', 'vod-it.rcsobjects')
+ yield {
+ 'type': type_,
+ 'url': url,
+ 'bitrate': source.get('bitrate')
+ }
- if 'geoblocking' in video.get('mediaProfile'):
- if 'm3u8' in src:
- src['m3u8'] = src.get('m3u8').replace('vod.rcsobjects', 'vod-it.rcsobjects')
- if 'mp4' in src:
- src['mp4'] = src.get('mp4').replace('vod.rcsobjects', 'vod-it.rcsobjects')
- if 'm3u8' in src:
- if src.get('m3u8').find('csmil') and src.get('m3u8').find('vod'):
- src['m3u8'] = src.get('m3u8').replace('.csmil', '.urlset')
+ def _create_http_formats(self, m3u8_formats, video_id):
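+ # derive progressive https urls from the hls urls by stripping the /hls/ path segment
+ # and the playlist suffix, then verify each candidate with a HEAD request before yielding it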
+ for f in m3u8_formats:
+ if f['vcodec'] == 'none':
+ continue
+ http_url = re.sub(r'(https?://[^/]+)/hls/([^?#]+?\.mp4).+', r'\g<1>/\g<2>', f['url'])
+ if http_url == f['url']:
+ continue
- return src
+ http_f = f.copy()
+ del http_f['manifest_url']
+ format_id = try_call(lambda: http_f['format_id'].replace('hls-', 'https-'))
+ urlh = self._request_webpage(HEADRequest(http_url), video_id, fatal=False,
+ note=f'Check filesize for {format_id}')
+ if not urlh:
+ continue
- def _create_formats(self, urls, video_id):
- formats = []
- formats = self._extract_m3u8_formats(
- urls.get('m3u8'), video_id, 'mp4', entry_protocol='m3u8_native',
- m3u8_id='hls', fatal=False)
-
- if urls.get('mp4'):
- formats.append({
- 'format_id': 'http-mp4',
- 'url': urls['mp4']
+ http_f.update({
+ 'format_id': format_id,
+ 'url': http_url,
+ 'protocol': 'https',
+ 'filesize_approx': int_or_none(urlh.headers.get('Content-Length', None)),
})
- return formats
+ yield http_f
+
+ def _create_formats(self, sources, video_id):
+ for source in sources:
+ if source['type'] == 'm3u8':
+ m3u8_formats = self._extract_m3u8_formats(
+ source['url'], video_id, 'mp4', m3u8_id='hls', fatal=False)
+ yield from m3u8_formats
+ yield from self._create_http_formats(m3u8_formats, video_id)
+ elif source['type'] == 'mp3':
+ yield {
+ 'format_id': 'https-mp3',
+ 'ext': 'mp3',
+ 'acodec': 'mp3',
+ 'vcodec': 'none',
+ 'abr': source.get('bitrate'),
+ 'url': source['url'],
+ }
def _real_extract(self, url):
- mobj = self._match_valid_url(url)
- video_id = mobj.group('id')
+ cdn, video_id = self._match_valid_url(url).group('cdn', 'id')
+ display_id, video_data = None, None
- if 'cdn' not in mobj.groupdict():
- raise ExtractorError('CDN not found in url: %s' % url)
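+ # ids that already match the UUID or RCS id pattern are resolved via the
+ # video-json endpoint instead of the original page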
+ if re.match(self._UUID_RE, video_id) or re.match(self._RCS_ID_RE, video_id):
+ url = f'https://video.{cdn}/video-json/{video_id}'
+ else:
+ webpage = self._download_webpage(url, video_id)
+ data_config = get_element_html_by_id('divVideoPlayer', webpage) or get_element_html_by_class('divVideoPlayer', webpage)
- # for leitv/youreporter/viaggi don't use the embed page
- if ((mobj.group('cdn') not in ['leitv.it', 'youreporter.it'])
- and (mobj.group('vid') == 'video')):
- url = 'https://video.%s/video-embed/%s' % (mobj.group('cdn'), video_id)
+ if data_config:
+ data_config = self._parse_json(
+ extract_attributes(data_config).get('data-config'),
+ video_id, fatal=False) or {}
+ if data_config.get('newspaper'):
+ cdn = f'{data_config["newspaper"]}.it'
+ display_id, video_id = video_id, data_config.get('uuid') or video_id
+ url = f'https://video.{cdn}/video-json/{video_id}'
+ else:
+ json_url = self._search_regex(
+ r'''(?x)url\s*=\s*(["'])
+ (?P<url>
+ (?:https?:)?//video\.rcs\.it
+ /fragment-includes/video-includes/[^"']+?\.json
+ )\1;''',
+ webpage, video_id, group='url', default=None)
+ if json_url:
+ video_data = self._download_json(sanitize_url(json_url, scheme='https'), video_id)
+ display_id, video_id = video_id, video_data.get('id') or video_id
- page = self._download_webpage(url, video_id)
+ if not video_data:
+ webpage = self._download_webpage(url, video_id)
- video_data = None
- # look for json video data url
- json = self._search_regex(
- r'''(?x)url\s*=\s*(["'])
- (?P<url>
- (?:https?:)?//video\.rcs\.it
- /fragment-includes/video-includes/.+?\.json
- )\1;''',
- page, video_id, group='url', default=None)
- if json:
- if json.startswith('//'):
- json = 'https:%s' % json
- video_data = self._download_json(json, video_id)
+ video_data = self._search_json(
+ '##start-video##', webpage, 'video data', video_id, default=None,
+ end_pattern='##end-video##', transform_source=js_to_json)
- # if json url not found, look for json video data directly in the page
- else:
- # RCS normal pages and most of the embeds
- json = self._search_regex(
- r'[\s;]video\s*=\s*({[\s\S]+?})(?:;|,playlist=)',
- page, video_id, default=None)
- if not json and 'video-embed' in url:
- page = self._download_webpage(url.replace('video-embed', 'video-json'), video_id)
- json = self._search_regex(
- r'##start-video##({[\s\S]+?})##end-video##',
- page, video_id, default=None)
- if not json:
- # if no video data found try search for iframes
- emb = RCSEmbedsIE._extract_url(page)
+ if not video_data:
+ # try searching for iframes
+ emb = RCSEmbedsIE._extract_url(webpage)
if emb:
return {
'_type': 'url_transparent',
'url': emb,
'ie_key': RCSEmbedsIE.ie_key()
}
- if json:
- video_data = self._parse_json(
- json, video_id, transform_source=js_to_json)
if not video_data:
raise ExtractorError('Video data not found in the page')
- formats = self._create_formats(
- self._get_video_src(video_data), video_id)
-
- description = (video_data.get('description')
- or clean_html(video_data.get('htmlDescription'))
- or self._html_search_meta('description', page))
- uploader = video_data.get('provider') or mobj.group('cdn')
-
return {
'id': video_id,
+ 'display_id': display_id,
'title': video_data.get('title'),
- 'description': description,
- 'uploader': uploader,
- 'formats': formats
+ 'description': (clean_html(video_data.get('description'))
+ or clean_html(video_data.get('htmlDescription'))
+ or self._html_search_meta('description', webpage)),
+ 'uploader': video_data.get('provider') or cdn,
+ 'formats': list(self._create_formats(self._get_video_src(video_data), video_id)),
}
@@ -296,7 +229,7 @@ class RCSEmbedsIE(RCSBaseIE):
\1''']
_TESTS = [{
'url': 'https://video.rcs.it/video-embed/iodonna-0001585037',
- 'md5': '623ecc8ffe7299b2d0c1046d8331a9df',
+ 'md5': '0faca97df525032bb9847f690bc3720c',
'info_dict': {
'id': 'iodonna-0001585037',
'ext': 'mp4',
@@ -305,38 +238,31 @@ class RCSEmbedsIE(RCSBaseIE):
'uploader': 'rcs.it',
}
}, {
- # redownload the page changing 'video-embed' in 'video-json'
'url': 'https://video.gazzanet.gazzetta.it/video-embed/gazzanet-mo05-0000260789',
- 'md5': 'a043e3fecbe4d9ed7fc5d888652a5440',
- 'info_dict': {
- 'id': 'gazzanet-mo05-0000260789',
- 'ext': 'mp4',
- 'title': 'Valentino Rossi e papà Graziano si divertono col drifting',
- 'description': 'md5:a8bf90d6adafd9815f70fc74c0fc370a',
- 'uploader': 'rcd',
- }
- }, {
- 'url': 'https://video.corriere.it/video-embed/b727632a-f9d0-11ea-91b0-38d50a849abb?player',
'match_only': True
}, {
'url': 'https://video.gazzetta.it/video-embed/49612410-00ca-11eb-bcd8-30d4253e0140',
'match_only': True
}]
+ _WEBPAGE_TESTS = [{
+ 'url': 'https://www.iodonna.it/video-iodonna/personaggi-video/monica-bellucci-piu-del-lavoro-oggi-per-me-sono-importanti-lamicizia-e-la-famiglia/',
+ 'info_dict': {
+ 'id': 'iodonna-0002033648',
+ 'ext': 'mp4',
+ 'title': 'Monica Bellucci: «Più del lavoro, oggi per me sono importanti l\'amicizia e la famiglia»',
+ 'description': 'md5:daea6d9837351e56b1ab615c06bebac1',
+ 'uploader': 'rcs.it',
+ }
+ }]
@staticmethod
- def _sanitize_urls(urls):
- # add protocol if missing
- for i, e in enumerate(urls):
- if e.startswith('//'):
- urls[i] = 'https:%s' % e
- # clean iframes urls
- for i, e in enumerate(urls):
- urls[i] = urljoin(base_url(e), url_basename(e))
- return urls
+ def _sanitize_url(url):
+ url = sanitize_url(url, scheme='https')
+ return urljoin(base_url(url), url_basename(url))
@classmethod
def _extract_embed_urls(cls, url, webpage):
- return cls._sanitize_urls(list(super()._extract_embed_urls(url, webpage)))
+ return map(cls._sanitize_url, super()._extract_embed_urls(url, webpage))
class RCSIE(RCSBaseIE):
@@ -349,37 +275,53 @@ class RCSIE(RCSBaseIE):
|corrierefiorentino\.
)?corriere\.it
|(?:gazzanet\.)?gazzetta\.it)
- /(?!video-embed/).+?/(?P<id>[^/\?]+)(?=\?|/$|$)'''
+ /(?!video-embed/)[^?#]+?/(?P<id>[^/\?]+)(?=\?|/$|$)'''
_TESTS = [{
+ # json iframe directly from id
'url': 'https://video.corriere.it/sport/formula-1/vettel-guida-ferrari-sf90-mugello-suo-fianco-c-elecrerc-bendato-video-esilarante/b727632a-f9d0-11ea-91b0-38d50a849abb',
- 'md5': '0f4ededc202b0f00b6e509d831e2dcda',
+ 'md5': '14946840dec46ecfddf66ba4eea7d2b2',
'info_dict': {
'id': 'b727632a-f9d0-11ea-91b0-38d50a849abb',
'ext': 'mp4',
'title': 'Vettel guida la Ferrari SF90 al Mugello e al suo fianco c\'è Leclerc (bendato): il video è esilarante',
- 'description': 'md5:93b51c9161ac8a64fb2f997b054d0152',
+ 'description': 'md5:3915ce5ebb3d2571deb69a5eb85ac9b5',
'uploader': 'Corriere Tv',
}
}, {
- # video data inside iframe
+ # search for video id inside the page
'url': 'https://viaggi.corriere.it/video/norvegia-il-nuovo-ponte-spettacolare-sopra-la-cascata-di-voringsfossen/',
- 'md5': 'da378e4918d2afbf7d61c35abb948d4c',
+ 'md5': 'f22a92d9e666e80f2fffbf2825359c81',
'info_dict': {
'id': '5b7cd134-e2c1-11ea-89b3-b56dd0df2aa2',
+ 'display_id': 'norvegia-il-nuovo-ponte-spettacolare-sopra-la-cascata-di-voringsfossen',
'ext': 'mp4',
'title': 'La nuova spettacolare attrazione in Norvegia: il ponte sopra Vøringsfossen',
'description': 'md5:18b35a291f6746c0c8dacd16e5f5f4f8',
'uploader': 'DOVE Viaggi',
}
}, {
- 'url': 'https://video.gazzetta.it/video-motogp-catalogna-cadute-dovizioso-vale-rossi/49612410-00ca-11eb-bcd8-30d4253e0140?vclk=Videobar',
- 'md5': 'eedc1b5defd18e67383afef51ff7bdf9',
+ # only audio format https://github.com/hypervideo/hypervideo/issues/5683
+ 'url': 'https://video.corriere.it/cronaca/audio-telefonata-il-papa-becciu-santita-lettera-che-mi-ha-inviato-condanna/b94c0d20-70c2-11ed-9572-e4b947a0ebd2',
+ 'md5': 'aaffb08d02f2ce4292a4654694c78150',
+ 'info_dict': {
+ 'id': 'b94c0d20-70c2-11ed-9572-e4b947a0ebd2',
+ 'ext': 'mp3',
+ 'title': 'L\'audio della telefonata tra il Papa e Becciu: «Santità, la lettera che mi ha inviato è una condanna»',
+ 'description': 'md5:c0ddb61bd94a8d4e0d4bb9cda50a689b',
+ 'uploader': 'Corriere Tv',
+ 'formats': [{'format_id': 'https-mp3', 'ext': 'mp3'}],
+ }
+ }, {
+ # old content still needs cdn migration
+ 'url': 'https://viaggi.corriere.it/video/milano-varallo-sesia-sul-treno-a-vapore/',
+ 'md5': '2dfdce7af249654ad27eeba03fe1e08d',
'info_dict': {
- 'id': '49612410-00ca-11eb-bcd8-30d4253e0140',
+ 'id': 'd8f6c8d0-f7d7-11e8-bfca-f74cf4634191',
+ 'display_id': 'milano-varallo-sesia-sul-treno-a-vapore',
'ext': 'mp4',
- 'title': 'Dovizioso, il contatto con Zarco e la caduta. E anche Vale finisce a terra',
- 'description': 'md5:8c6e905dc3b9413218beca11ebd69778',
- 'uploader': 'AMorici',
+ 'title': 'Milano-Varallo Sesia sul treno a vapore',
+ 'description': 'md5:6348f47aac230397fe341a74f7678d53',
+ 'uploader': 'DOVE Viaggi',
}
}, {
'url': 'https://video.corriere.it/video-360/metro-copenaghen-tutta-italiana/a248a7f0-e2db-11e9-9830-af2de6b1f945',
@@ -391,13 +333,15 @@ class RCSVariousIE(RCSBaseIE):
_VALID_URL = r'''(?x)https?://www\.
(?P<cdn>
leitv\.it|
- youreporter\.it
+ youreporter\.it|
+ amica\.it
)/(?:[^/]+/)?(?P<id>[^/]+?)(?:$|\?|/)'''
_TESTS = [{
- 'url': 'https://www.leitv.it/benessere/mal-di-testa-come-combatterlo-ed-evitarne-la-comparsa/',
- 'md5': '92b4e63667b8f95acb0a04da25ae28a1',
+ 'url': 'https://www.leitv.it/benessere/mal-di-testa/',
+ 'md5': '3b7a683d105a7313ec7513b014443631',
'info_dict': {
- 'id': 'mal-di-testa-come-combatterlo-ed-evitarne-la-comparsa',
+ 'id': 'leitv-0000125151',
+ 'display_id': 'mal-di-testa',
'ext': 'mp4',
'title': 'Cervicalgia e mal di testa, il video con i suggerimenti dell\'esperto',
'description': 'md5:ae21418f34cee0b8d02a487f55bcabb5',
@@ -405,12 +349,24 @@ class RCSVariousIE(RCSBaseIE):
}
}, {
'url': 'https://www.youreporter.it/fiume-sesia-3-ottobre-2020/',
- 'md5': '8dccd436b47a830bab5b4a88232f391a',
+ 'md5': '3989b6d603482611a2abd2f32b79f739',
'info_dict': {
- 'id': 'fiume-sesia-3-ottobre-2020',
+ 'id': 'youreporter-0000332574',
+ 'display_id': 'fiume-sesia-3-ottobre-2020',
'ext': 'mp4',
'title': 'Fiume Sesia 3 ottobre 2020',
'description': 'md5:0070eef1cc884d13c970a4125063de55',
'uploader': 'youreporter.it',
}
+ }, {
+ 'url': 'https://www.amica.it/video-post/saint-omer-al-cinema-il-film-leone-dargento-che-ribalta-gli-stereotipi/',
+ 'md5': '187cce524dfd0343c95646c047375fc4',
+ 'info_dict': {
+ 'id': 'amica-0001225365',
+ 'display_id': 'saint-omer-al-cinema-il-film-leone-dargento-che-ribalta-gli-stereotipi',
+ 'ext': 'mp4',
+ 'title': '"Saint Omer": al cinema il film Leone d\'argento che ribalta gli stereotipi',
+ 'description': 'md5:b1c8869c2dcfd6073a2a311ba0008aa8',
+ 'uploader': 'rcs.it',
+ }
}]
diff --git a/hypervideo_dl/extractor/rcti.py b/hypervideo_dl/extractor/rcti.py
index 27b4ad7..79d9c8e 100644
--- a/hypervideo_dl/extractor/rcti.py
+++ b/hypervideo_dl/extractor/rcti.py
@@ -3,7 +3,7 @@ import random
import time
from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..networking.exceptions import HTTPError
from ..utils import (
dict_get,
ExtractorError,
@@ -186,7 +186,7 @@ class RCTIPlusIE(RCTIPlusBaseIE):
try:
formats = self._extract_m3u8_formats(video_url, display_id, 'mp4', headers={'Referer': 'https://www.rctiplus.com/'})
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 403:
self.raise_geo_restricted(countries=['ID'], metadata_available=True)
else:
raise e
diff --git a/hypervideo_dl/extractor/recurbate.py b/hypervideo_dl/extractor/recurbate.py
new file mode 100644
index 0000000..d7294cb
--- /dev/null
+++ b/hypervideo_dl/extractor/recurbate.py
@@ -0,0 +1,42 @@
+from .common import InfoExtractor
+from ..networking.exceptions import HTTPError
+from ..utils import ExtractorError, merge_dicts
+
+
+class RecurbateIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?recurbate\.com/play\.php\?video=(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://recurbate.com/play.php?video=39161415',
+ 'md5': 'dd2b4ec57aa3e3572cb5cf0997fca99f',
+ 'info_dict': {
+ 'id': '39161415',
+ 'ext': 'mp4',
+ 'description': 'md5:db48d09e4d93fc715f47fd3d6b7edd51',
+ 'title': 'Performer zsnicole33 show on 2022-10-25 20:23, Chaturbate Archive – Recurbate',
+ 'age_limit': 18,
+ },
+ 'skip': 'Website requires membership.',
+ }]
+
+ def _real_extract(self, url):
+ SUBSCRIPTION_MISSING_MESSAGE = 'This video is only available to registered users; set the user agent of your authenticated browser via the --user-agent parameter.'
+ video_id = self._match_id(url)
+ try:
+ webpage = self._download_webpage(url, video_id)
+ except ExtractorError as e:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 403:
+ self.raise_login_required(msg=SUBSCRIPTION_MISSING_MESSAGE, method='cookies')
+ raise
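+ # Each watch page embeds a data-token that the playback API call below requires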
+ token = self._html_search_regex(r'data-token="([^"]+)"', webpage, 'token')
+ video_url = f'https://recurbate.com/api/get.php?video={video_id}&token={token}'
+
+ video_webpage = self._download_webpage(video_url, video_id)
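+ # The API returns the literal string 'shall_subscribe' when the account lacks access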
+ if video_webpage == 'shall_subscribe':
+ self.raise_login_required(msg=SUBSCRIPTION_MISSING_MESSAGE, method='cookies')
+ entries = self._parse_html5_media_entries(video_url, video_webpage, video_id)
+ return merge_dicts({
+ 'id': video_id,
+ 'title': self._html_extract_title(webpage, 'title'),
+ 'description': self._og_search_description(webpage),
+ 'age_limit': self._rta_search(webpage),
+ }, entries[0])
diff --git a/hypervideo_dl/extractor/redbulltv.py b/hypervideo_dl/extractor/redbulltv.py
index a01bc84..d1de249 100644
--- a/hypervideo_dl/extractor/redbulltv.py
+++ b/hypervideo_dl/extractor/redbulltv.py
@@ -1,5 +1,5 @@
from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..networking.exceptions import HTTPError
from ..utils import (
float_or_none,
ExtractorError,
@@ -68,9 +68,9 @@ class RedBullTVIE(InfoExtractor):
headers={'Authorization': token}
)
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 404:
error_message = self._parse_json(
- e.cause.read().decode(), video_id)['error']
+ e.cause.response.read().decode(), video_id)['error']
raise ExtractorError('%s said: %s' % (
self.IE_NAME, error_message), expected=True)
raise
diff --git a/hypervideo_dl/extractor/reddit.py b/hypervideo_dl/extractor/reddit.py
index f1a5c85..813e628 100644
--- a/hypervideo_dl/extractor/reddit.py
+++ b/hypervideo_dl/extractor/reddit.py
@@ -1,4 +1,3 @@
-import random
import urllib.parse
from .common import InfoExtractor
@@ -9,12 +8,14 @@ from ..utils import (
traverse_obj,
try_get,
unescapeHTML,
+ urlencode_postdata,
url_or_none,
)
class RedditIE(InfoExtractor):
- _VALID_URL = r'https?://(?P<subdomain>[^/]+\.)?reddit(?:media)?\.com/r/(?P<slug>[^/]+/comments/(?P<id>[^/?#&]+))'
+ _NETRC_MACHINE = 'reddit'
+ _VALID_URL = r'https?://(?P<host>(?:\w+\.)?reddit(?:media)?\.com)/(?P<slug>(?:(?:r|user)/[^/]+/)?comments/(?P<id>[^/?#&]+))'
_TESTS = [{
'url': 'https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/',
'info_dict': {
@@ -32,6 +33,7 @@ class RedditIE(InfoExtractor):
'dislike_count': int,
'comment_count': int,
'age_limit': 0,
+ 'channel_id': 'videos',
},
'params': {
'skip_download': True,
@@ -55,6 +57,30 @@ class RedditIE(InfoExtractor):
'dislike_count': int,
'comment_count': int,
'age_limit': 0,
+ 'channel_id': 'aww',
+ },
+ }, {
+ # User post
+ 'url': 'https://www.reddit.com/user/creepyt0es/comments/nip71r/i_plan_to_make_more_stickers_and_prints_check/',
+ 'info_dict': {
+ 'id': 'zasobba6wp071',
+ 'ext': 'mp4',
+ 'display_id': 'nip71r',
+ 'title': 'I plan to make more stickers and prints! Check them out on my Etsy! Or get them through my Patreon. Links below.',
+ 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+ 'thumbnails': 'count:5',
+ 'timestamp': 1621709093,
+ 'upload_date': '20210522',
+ 'uploader': 'creepyt0es',
+ 'duration': 6,
+ 'like_count': int,
+ 'dislike_count': int,
+ 'comment_count': int,
+ 'age_limit': 0,
+ 'channel_id': 'u_creepyt0es',
+ },
+ 'params': {
+ 'skip_download': True,
},
}, {
# videos embedded in reddit text post
@@ -65,6 +91,66 @@ class RedditIE(InfoExtractor):
'title': 'md5:72d3d19402aa11eff5bd32fc96369b37',
},
}, {
+ # crossposted reddit-hosted media
+ 'url': 'https://www.reddit.com/r/dumbfuckers_club/comments/zjjw82/cringe/',
+ 'md5': '746180895c7b75a9d6b05341f507699a',
+ 'info_dict': {
+ 'id': 'a1oneun6pa5a1',
+ 'ext': 'mp4',
+ 'display_id': 'zjjw82',
+ 'title': 'Cringe',
+ 'uploader': 'Otaku-senpai69420',
+ 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+ 'upload_date': '20221212',
+ 'timestamp': 1670812309,
+ 'duration': 16,
+ 'like_count': int,
+ 'dislike_count': int,
+ 'comment_count': int,
+ 'age_limit': 0,
+ 'channel_id': 'dumbfuckers_club',
+ },
+ }, {
+ # post link without subreddit
+ 'url': 'https://www.reddit.com/comments/124pp33',
+ 'md5': '15eec9d828adcef4468b741a7e45a395',
+ 'info_dict': {
+ 'id': 'antsenjc2jqa1',
+ 'ext': 'mp4',
+ 'display_id': '124pp33',
+ 'title': 'Harmless prank of some old friends',
+ 'uploader': 'Dudezila',
+ 'channel_id': 'ContagiousLaughter',
+ 'duration': 17,
+ 'upload_date': '20230328',
+ 'timestamp': 1680012043,
+ 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+ 'age_limit': 0,
+ 'comment_count': int,
+ 'dislike_count': int,
+ 'like_count': int,
+ },
+ }, {
+ # quarantined subreddit post
+ 'url': 'https://old.reddit.com/r/GenZedong/comments/12fujy3/based_hasan/',
+ 'md5': '3156ea69e3c1f1b6259683c5abd36e71',
+ 'info_dict': {
+ 'id': '8bwtclfggpsa1',
+ 'ext': 'mp4',
+ 'display_id': '12fujy3',
+ 'title': 'Based Hasan?',
+ 'uploader': 'KingNigelXLII',
+ 'channel_id': 'GenZedong',
+ 'duration': 16,
+ 'upload_date': '20230408',
+ 'timestamp': 1680979138,
+ 'age_limit': 0,
+ 'comment_count': int,
+ 'dislike_count': int,
+ 'like_count': int,
+ },
+ 'skip': 'Requires an account that has opted in to the GenZedong subreddit',
+ }, {
'url': 'https://www.reddit.com/r/videos/comments/6rrwyj',
'only_matching': True,
}, {
@@ -92,21 +178,45 @@ class RedditIE(InfoExtractor):
'only_matching': True,
}]
- @staticmethod
- def _gen_session_id():
- id_length = 16
- rand_max = 1 << (id_length * 4)
- return '%0.*x' % (id_length, random.randrange(rand_max))
+ def _perform_login(self, username, password):
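+ # Probe the captcha endpoint first; a scripted login cannot proceed while Reddit demands a captcha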
+ captcha = self._download_json(
+ 'https://www.reddit.com/api/requires_captcha/login.json', None,
+ 'Checking login requirement')['required']
+ if captcha:
+ raise ExtractorError('Reddit is requiring captcha before login', expected=True)
+ login = self._download_json(
+ f'https://www.reddit.com/api/login/{username}', None, data=urlencode_postdata({
+ 'op': 'login-main',
+ 'user': username,
+ 'passwd': password,
+ 'api_type': 'json',
+ }), note='Logging in', errnote='Login request failed')
+ errors = '; '.join(traverse_obj(login, ('json', 'errors', ..., 1)))
+ if errors:
+ raise ExtractorError(f'Unable to login: Reddit API says {errors}', expected=True)
+ elif not traverse_obj(login, ('json', 'data', 'cookie', {str})):
+ raise ExtractorError('Unable to login: no cookie was returned')
def _real_extract(self, url):
- subdomain, slug, video_id = self._match_valid_url(url).group('subdomain', 'slug', 'id')
+ host, slug, video_id = self._match_valid_url(url).group('host', 'slug', 'id')
- self._set_cookie('.reddit.com', 'reddit_session', self._gen_session_id())
- self._set_cookie('.reddit.com', '_options', '%7B%22pref_quarantine_optin%22%3A%20true%7D')
- data = self._download_json(f'https://{subdomain}reddit.com/r/{slug}/.json', video_id, fatal=False)
+ data = self._download_json(
+ f'https://{host}/{slug}/.json', video_id, fatal=False, expected_status=403)
if not data:
- # Fall back to old.reddit.com in case the requested subdomain fails
- data = self._download_json(f'https://old.reddit.com/r/{slug}/.json', video_id)
+ fallback_host = 'old.reddit.com' if host != 'old.reddit.com' else 'www.reddit.com'
+ self.to_screen(f'{host} request failed, retrying with {fallback_host}')
+ data = self._download_json(
+ f'https://{fallback_host}/{slug}/.json', video_id, expected_status=403)
+
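+ # Reddit returns a JSON error body alongside HTTP 403; surface its 'reason' for a clearer message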
+ if traverse_obj(data, 'error') == 403:
+ reason = data.get('reason')
+ if reason == 'quarantined':
+ self.raise_login_required('Quarantined subreddit; an account that has opted in is required')
+ elif reason == 'private':
+ self.raise_login_required('Private subreddit; an account that has been approved is required')
+ else:
+ raise ExtractorError(f'HTTP Error 403 Forbidden; reason given: {reason}')
+
data = data[0]['data']['children'][0]['data']
video_url = data['url']
@@ -130,6 +240,7 @@ class RedditIE(InfoExtractor):
'url': unescapeHTML(thumbnail_url),
'width': int_or_none(src.get('width')),
'height': int_or_none(src.get('height')),
+ 'http_headers': {'Accept': '*/*'},
})
for image in try_get(data, lambda x: x['preview']['images']) or []:
@@ -146,6 +257,7 @@ class RedditIE(InfoExtractor):
'thumbnails': thumbnails,
'timestamp': float_or_none(data.get('created_utc')),
'uploader': data.get('author'),
+ 'channel_id': data.get('subreddit'),
'like_count': int_or_none(data.get('ups')),
'dislike_count': int_or_none(data.get('downs')),
'comment_count': int_or_none(data.get('num_comments')),
@@ -179,7 +291,8 @@ class RedditIE(InfoExtractor):
raise ExtractorError('No media found', expected=True)
# Check if media is hosted on reddit:
- reddit_video = traverse_obj(data, (('media', 'secure_media'), 'reddit_video'), get_all=False)
+ reddit_video = traverse_obj(data, (
+ (None, ('crosspost_parent_list', ...)), ('secure_media', 'media'), 'reddit_video'), get_all=False)
if reddit_video:
playlist_urls = [
try_get(reddit_video, lambda x: unescapeHTML(x[y]))
diff --git a/hypervideo_dl/extractor/redgifs.py b/hypervideo_dl/extractor/redgifs.py
index 098fb81..f945320 100644
--- a/hypervideo_dl/extractor/redgifs.py
+++ b/hypervideo_dl/extractor/redgifs.py
@@ -1,8 +1,8 @@
import functools
-import urllib
from .common import InfoExtractor
from ..compat import compat_parse_qs
+from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
int_or_none,
@@ -82,7 +82,7 @@ class RedGifsBaseInfoExtractor(InfoExtractor):
f'https://api.redgifs.com/v2/{ep}', video_id, headers=headers, *args, **kwargs)
break
except ExtractorError as e:
- if first_attempt and isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 401:
+ if first_attempt and isinstance(e.cause, HTTPError) and e.cause.status == 401:
del self._API_HEADERS['authorization'] # refresh the token
continue
raise
diff --git a/hypervideo_dl/extractor/regiotv.py b/hypervideo_dl/extractor/regiotv.py
index 6114841..edb6ae5 100644
--- a/hypervideo_dl/extractor/regiotv.py
+++ b/hypervideo_dl/extractor/regiotv.py
@@ -1,10 +1,6 @@
from .common import InfoExtractor
-
-from ..utils import (
- sanitized_Request,
- xpath_text,
- xpath_with_ns,
-)
+from ..networking import Request
+from ..utils import xpath_text, xpath_with_ns
class RegioTVIE(InfoExtractor):
@@ -33,7 +29,7 @@ class RegioTVIE(InfoExtractor):
SOAP_TEMPLATE = '<?xml version="1.0" encoding="utf-8"?><soap:Envelope xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/"><soap:Body><{0} xmlns="http://v.telvi.de/"><key xsi:type="xsd:string">{1}</key></{0}></soap:Body></soap:Envelope>'
- request = sanitized_Request(
+ request = Request(
'http://v.telvi.de/',
SOAP_TEMPLATE.format('GetHTML5VideoData', key).encode('utf-8'))
video_data = self._download_xml(request, video_id, 'Downloading video XML')
diff --git a/hypervideo_dl/extractor/rheinmaintv.py b/hypervideo_dl/extractor/rheinmaintv.py
new file mode 100644
index 0000000..c3b352d
--- /dev/null
+++ b/hypervideo_dl/extractor/rheinmaintv.py
@@ -0,0 +1,94 @@
+from .common import InfoExtractor
+from ..utils import extract_attributes, merge_dicts, remove_end
+
+
+class RheinMainTVIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?rheinmaintv\.de/sendungen/(?:[\w-]+/)*(?P<video_id>(?P<display_id>[\w-]+)/vom-\d{2}\.\d{2}\.\d{4}(?:/\d+)?)'
+ _TESTS = [{
+ 'url': 'https://www.rheinmaintv.de/sendungen/beitrag-video/auf-dem-weg-zur-deutschen-meisterschaft/vom-07.11.2022/',
+ 'info_dict': {
+ 'id': 'auf-dem-weg-zur-deutschen-meisterschaft-vom-07.11.2022',
+ 'ext': 'ismv', # ismv+isma will be merged into mp4
+ 'alt_title': 'Auf dem Weg zur Deutschen Meisterschaft',
+ 'title': 'Auf dem Weg zur Deutschen Meisterschaft',
+ 'upload_date': '20221108',
+ 'view_count': int,
+ 'display_id': 'auf-dem-weg-zur-deutschen-meisterschaft',
+ 'thumbnail': r're:^https://.+\.jpg',
+ 'description': 'md5:48c59b74192bc819a9b34af1d5ed1eb9',
+ 'timestamp': 1667933057,
+ 'duration': 243.0,
+ },
+ 'params': {'skip_download': 'ism'},
+ }, {
+ 'url': 'https://www.rheinmaintv.de/sendungen/beitrag-video/formationsgemeinschaft-rhein-main-bei-den-deutschen-meisterschaften/vom-14.11.2022/',
+ 'info_dict': {
+ 'id': 'formationsgemeinschaft-rhein-main-bei-den-deutschen-meisterschaften-vom-14.11.2022',
+ 'ext': 'ismv',
+ 'title': 'Formationsgemeinschaft Rhein-Main bei den Deutschen Meisterschaften',
+ 'timestamp': 1668526214,
+ 'display_id': 'formationsgemeinschaft-rhein-main-bei-den-deutschen-meisterschaften',
+ 'alt_title': 'Formationsgemeinschaft Rhein-Main bei den Deutschen Meisterschaften',
+ 'view_count': int,
+ 'thumbnail': r're:^https://.+\.jpg',
+ 'duration': 345.0,
+ 'description': 'md5:9370ba29526984006c2cba1372e5c5a0',
+ 'upload_date': '20221115',
+ },
+ 'params': {'skip_download': 'ism'},
+ }, {
+ 'url': 'https://www.rheinmaintv.de/sendungen/beitrag-video/casino-mainz-bei-den-deutschen-meisterschaften/vom-14.11.2022/',
+ 'info_dict': {
+ 'id': 'casino-mainz-bei-den-deutschen-meisterschaften-vom-14.11.2022',
+ 'ext': 'ismv',
+ 'title': 'Casino Mainz bei den Deutschen Meisterschaften',
+ 'view_count': int,
+ 'timestamp': 1668527402,
+ 'alt_title': 'Casino Mainz bei den Deutschen Meisterschaften',
+ 'upload_date': '20221115',
+ 'display_id': 'casino-mainz-bei-den-deutschen-meisterschaften',
+ 'duration': 348.0,
+ 'thumbnail': r're:^https://.+\.jpg',
+ 'description': 'md5:70fc1660eeba96da17199e5bdff4c0aa',
+ },
+ 'params': {'skip_download': 'ism'},
+ }, {
+ 'url': 'https://www.rheinmaintv.de/sendungen/beitrag-video/bricks4kids/vom-22.06.2022/',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ mobj = self._match_valid_url(url)
+ display_id = mobj.group('display_id')
+ video_id = mobj.group('video_id').replace('/', '-')
+ webpage = self._download_webpage(url, video_id)
+
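+ # The player markup pairs a <source> tag (manifest URL) with an <img> poster; capture both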
+ source, img = self._search_regex(r'(?s)(?P<source><source[^>]*>)(?P<img><img[^>]*>)',
+ webpage, 'video', group=('source', 'img'))
+ source = extract_attributes(source)
+ img = extract_attributes(img)
+
+ raw_json_ld = list(self._yield_json_ld(webpage, video_id))
+ json_ld = self._json_ld(raw_json_ld, video_id)
+ json_ld.pop('url', None)
+
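+ # Prefer the <source> src; otherwise fall back to the JSON-LD VideoObject embedUrl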
+ ism_manifest_url = (
+ source.get('src')
+ or next(json_ld.get('embedUrl') for json_ld in raw_json_ld if json_ld.get('@type') == 'VideoObject')
+ )
+ formats, subtitles = self._extract_ism_formats_and_subtitles(ism_manifest_url, video_id)
+
+ return merge_dicts({
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title':
+ self._html_search_regex(r'<h1><span class="title">([^<]*)</span>',
+ webpage, 'headline', default=None)
+ or img.get('title') or json_ld.get('title') or self._og_search_title(webpage)
+ or remove_end(self._html_extract_title(webpage), ' -'),
+ 'alt_title': img.get('alt'),
+ 'description': json_ld.get('description') or self._og_search_description(webpage),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'thumbnails': [{'url': img['src']}] if 'src' in img else json_ld.get('thumbnails'),
+ }, json_ld)
diff --git a/hypervideo_dl/extractor/rokfin.py b/hypervideo_dl/extractor/rokfin.py
index ade3cd0..4a4d40b 100644
--- a/hypervideo_dl/extractor/rokfin.py
+++ b/hypervideo_dl/extractor/rokfin.py
@@ -45,6 +45,7 @@ class RokfinIE(InfoExtractor):
'live_status': 'not_live',
'dislike_count': int,
'like_count': int,
+ 'duration': 213,
}
}, {
'url': 'https://rokfin.com/post/223/Julian-Assange-Arrested-Streaming-In-Real-Time',
@@ -72,7 +73,7 @@ class RokfinIE(InfoExtractor):
'title': '"It\'s A Crazy Mess" Regional Director Blows Whistle On Pfizer\'s Vaccine Trial Data',
'thumbnail': r're:https://img\.production\.rokfin\.com/.+',
'description': 'md5:324ce2d3e3b62e659506409e458b9d8e',
- 'channel': 'Ryan Cristián',
+ 'channel': 'TLAVagabond',
'channel_id': 53856,
'channel_url': 'https://rokfin.com/TLAVagabond',
'availability': 'public',
@@ -86,6 +87,47 @@ class RokfinIE(InfoExtractor):
'dislike_count': int,
'like_count': int,
'tags': ['FreeThinkingMedia^'],
+ 'duration': None,
+ }
+ }, {
+ 'url': 'https://rokfin.com/post/126703/Brave-New-World--Aldous-Huxley-DEEPDIVE--Chpts-13--Quite-Frankly--Jay-Dyer',
+ 'info_dict': {
+ 'id': 'post/126703',
+ 'ext': 'mp4',
+ 'title': 'Brave New World - Aldous Huxley DEEPDIVE! (Chpts 1-3) - Quite Frankly & Jay Dyer',
+ 'thumbnail': r're:https://img\.production\.rokfin\.com/.+',
+ 'channel': 'Jay Dyer',
+ 'channel_id': 186881,
+ 'channel_url': 'https://rokfin.com/jaydyer',
+ 'availability': 'premium_only',
+ 'live_status': 'not_live',
+ 'dislike_count': int,
+ 'like_count': int,
+ 'timestamp': 1678213357,
+ 'upload_date': '20230307',
+ 'tags': ['FreeThinkingMedia^', 'OpenMind^'],
+ 'description': 'md5:cb04e32e68326c9b2b251b297bacff35',
+ 'duration': 3100,
+ }
+ }, {
+ 'url': 'https://rokfin.com/stream/31332/The-Grayzone-live-on-Nordstream-blame-game',
+ 'info_dict': {
+ 'id': 'stream/31332',
+ 'ext': 'mp4',
+ 'title': 'The Grayzone live on Nordstream blame game',
+ 'thumbnail': r're:https://image\.v\.rokfin\.com/.+',
+ 'channel': 'Max Blumenthal',
+ 'channel_id': 248902,
+ 'channel_url': 'https://rokfin.com/MaxBlumenthal',
+ 'availability': 'premium_only',
+ 'live_status': 'was_live',
+ 'dislike_count': int,
+ 'like_count': int,
+ 'timestamp': 1678475166,
+ 'release_timestamp': 1678475166.0,
+ 'release_date': '20230310',
+ 'upload_date': '20230310',
+ 'tags': ['FreeThinkingMedia^'],
}
}]
@@ -100,6 +142,12 @@ class RokfinIE(InfoExtractor):
else 'not_live')
video_url = traverse_obj(metadata, 'url', ('content', 'contentUrl'), expected_type=url_or_none)
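+ # When the URL is missing or is the 'fake.m3u8' placeholder, derive the stream id from the storyboard VTT URL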
+ if video_url in (None, 'fake.m3u8'):
+ video_url = format_field(self._search_regex(
+ r'https?://[^/]+/([^/]+)/storyboard\.vtt',
+ traverse_obj(metadata, 'timelineUrl', ('content', 'timelineUrl'), expected_type=url_or_none),
+ video_id, default=None), None, 'https://stream.v.rokfin.com/%s.m3u8')
+
formats, subtitles = [{'url': video_url}] if video_url else [], {}
if determine_ext(video_url) == 'm3u8':
formats, subtitles = self._extract_m3u8_formats_and_subtitles(
@@ -197,7 +245,7 @@ class RokfinIE(InfoExtractor):
f'{self._AUTH_BASE}/token', None,
note='getting access credentials', errnote='error getting access credentials',
data=urlencode_postdata({
- 'code': urllib.parse.parse_qs(urllib.parse.urldefrag(urlh.geturl()).fragment).get('code')[0],
+ 'code': urllib.parse.parse_qs(urllib.parse.urldefrag(urlh.url).fragment).get('code')[0],
'client_id': 'web',
'grant_type': 'authorization_code',
'redirect_uri': 'https://rokfin.com/silent-check-sso.html'
@@ -221,7 +269,7 @@ class RokfinIE(InfoExtractor):
json_string, urlh = self._download_webpage_handle(
url_or_request, video_id, headers=headers, query=query, expected_status=401)
- if not auth_token or urlh.code != 401 or refresh_token is None:
+ if not auth_token or urlh.status != 401 or refresh_token is None:
return self._parse_json(json_string, video_id)
self._access_mgmt_tokens = self._download_json(
diff --git a/hypervideo_dl/extractor/roosterteeth.py b/hypervideo_dl/extractor/roosterteeth.py
index 776fbfb..94e673b 100644
--- a/hypervideo_dl/extractor/roosterteeth.py
+++ b/hypervideo_dl/extractor/roosterteeth.py
@@ -1,5 +1,5 @@
from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
int_or_none,
@@ -35,8 +35,8 @@ class RoosterTeethBaseIE(InfoExtractor):
}))
except ExtractorError as e:
msg = 'Unable to login'
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
- resp = self._parse_json(e.cause.read().decode(), None, fatal=False)
+ if isinstance(e.cause, HTTPError) and e.cause.status == 401:
+ resp = self._parse_json(e.cause.response.read().decode(), None, fatal=False)
if resp:
error = resp.get('extra_info') or resp.get('error_description') or resp.get('error')
if error:
@@ -138,8 +138,8 @@ class RoosterTeethIE(RoosterTeethBaseIE):
m3u8_url = video_data['attributes']['url']
# XXX: additional URL at video_data['links']['download']
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
- if self._parse_json(e.cause.read().decode(), display_id).get('access') is False:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 403:
+ if self._parse_json(e.cause.response.read().decode(), display_id).get('access') is False:
self.raise_login_required(
'%s is only available for FIRST members' % display_id)
raise
diff --git a/hypervideo_dl/extractor/rottentomatoes.py b/hypervideo_dl/extractor/rottentomatoes.py
index f133c85..e357175 100644
--- a/hypervideo_dl/extractor/rottentomatoes.py
+++ b/hypervideo_dl/extractor/rottentomatoes.py
@@ -1,30 +1,80 @@
from .common import InfoExtractor
-from .internetvideoarchive import InternetVideoArchiveIE
+from ..utils import (
+ ExtractorError,
+ clean_html,
+ float_or_none,
+ get_element_by_class,
+ join_nonempty,
+ traverse_obj,
+ url_or_none,
+)
class RottenTomatoesIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?rottentomatoes\.com/m/[^/]+/trailers/(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www\.)?rottentomatoes\.com/m/(?P<playlist>[^/]+)(?:/(?P<tr>trailers)(?:/(?P<id>\w+))?)?'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.rottentomatoes.com/m/toy_story_3/trailers/11028566/',
'info_dict': {
'id': '11028566',
'ext': 'mp4',
'title': 'Toy Story 3',
- 'description': 'From the creators of the beloved TOY STORY films, comes a story that will reunite the gang in a whole new way.',
+ 'description': 'From the creators of the beloved TOY STORY films, comes a story that will reunite the gang in a whole new way.'
+ },
+ 'skip': 'No longer available',
+ }, {
+ 'url': 'https://www.rottentomatoes.com/m/toy_story_3/trailers/VycaVoBKhGuk',
+ 'info_dict': {
+ 'id': 'VycaVoBKhGuk',
+ 'ext': 'mp4',
+ 'title': 'Toy Story 3: Trailer 2',
+ 'description': '',
'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 149.941
+ },
+ }, {
+ 'url': 'http://www.rottentomatoes.com/m/toy_story_3',
+ 'info_dict': {
+ 'id': 'toy_story_3',
+ 'title': 'Toy Story 3',
+ },
+ 'playlist_mincount': 4,
+ }, {
+ 'url': 'http://www.rottentomatoes.com/m/toy_story_3/trailers',
+ 'info_dict': {
+ 'id': 'toy_story_3-trailers',
},
- }
+ 'playlist_mincount': 5,
+ }]
+
+ def _extract_videos(self, data, display_id):
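+ # Only entries that carry a public id and an HLS 'file' URL are extracted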
+ for video in traverse_obj(data, (lambda _, v: v['publicId'] and v['file'] and v['type'] == 'hls')):
+ yield {
+ 'formats': self._extract_m3u8_formats(
+ video['file'], display_id, 'mp4', m3u8_id='hls', fatal=False),
+ **traverse_obj(video, {
+ 'id': 'publicId',
+ 'title': 'title',
+ 'description': 'description',
+ 'duration': ('durationInSeconds', {float_or_none}),
+ 'thumbnail': ('image', {url_or_none}),
+ }),
+ }
def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
- iva_id = self._search_regex(r'publishedid=(\d+)', webpage, 'internet video archive id')
+ playlist_id, trailers, video_id = self._match_valid_url(url).group('playlist', 'tr', 'id')
+ playlist_id = join_nonempty(playlist_id, trailers)
+ webpage = self._download_webpage(url, playlist_id)
+ data = self._search_json(
+ r'<script[^>]+\bid=["\'](?:heroV|v)ideos["\'][^>]*>', webpage,
+ 'data', playlist_id, contains_pattern=r'\[{(?s:.+)}\]')
+
+ if video_id:
+ video_data = traverse_obj(data, lambda _, v: v['publicId'] == video_id)
+ if not video_data:
+ raise ExtractorError('Unable to extract video from webpage')
+ return next(self._extract_videos(video_data, video_id))
- return {
- '_type': 'url_transparent',
- 'url': 'http://video.internetvideoarchive.net/player/6/configuration.ashx?domain=www.videodetective.com&customerid=69249&playerid=641&publishedid=' + iva_id,
- 'ie_key': InternetVideoArchiveIE.ie_key(),
- 'id': video_id,
- 'title': self._og_search_title(webpage),
- }
+ return self.playlist_result(
+ self._extract_videos(data, playlist_id), playlist_id,
+ clean_html(get_element_by_class('scoreboard__title', webpage)))
diff --git a/hypervideo_dl/extractor/rozhlas.py b/hypervideo_dl/extractor/rozhlas.py
index a818967..6313432 100644
--- a/hypervideo_dl/extractor/rozhlas.py
+++ b/hypervideo_dl/extractor/rozhlas.py
@@ -1,7 +1,16 @@
+import itertools
+
from .common import InfoExtractor
+from ..networking.exceptions import HTTPError
from ..utils import (
+ ExtractorError,
+ extract_attributes,
int_or_none,
remove_start,
+ str_or_none,
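+ # The player config maps the programme id to the media filename used when requesting streaming URLs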
+ traverse_obj,
+ unified_timestamp,
+ url_or_none,
)
@@ -45,3 +54,290 @@ class RozhlasIE(InfoExtractor):
'duration': duration,
'vcodec': 'none',
}
+
+
+class RozhlasBaseIE(InfoExtractor):
+ def _extract_formats(self, entry, audio_id):
+ formats = []
+ for audio in traverse_obj(entry, ('audioLinks', lambda _, v: url_or_none(v['url']))):
+ ext = audio.get('variant')
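+ # Retry on HTTP 429, since the CDN rate-limits rapid manifest requests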
+ for retry in self.RetryManager():
+ if retry.attempt > 1:
+ self._sleep(1, audio_id)
+ try:
+ if ext == 'dash':
+ formats.extend(self._extract_mpd_formats(
+ audio['url'], audio_id, mpd_id=ext))
+ elif ext == 'hls':
+ formats.extend(self._extract_m3u8_formats(
+ audio['url'], audio_id, 'm4a', m3u8_id=ext))
+ else:
+ formats.append({
+ 'url': audio['url'],
+ 'ext': ext,
+ 'format_id': ext,
+ 'abr': int_or_none(audio.get('bitrate')),
+ 'acodec': ext,
+ 'vcodec': 'none',
+ })
+ except ExtractorError as e:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 429:
+ retry.error = e.cause
+ else:
+ self.report_warning(e.msg)
+
+ return formats
+
+
+class RozhlasVltavaIE(RozhlasBaseIE):
+ _VALID_URL = r'https?://(?:\w+\.rozhlas|english\.radio)\.cz/[\w-]+-(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://wave.rozhlas.cz/papej-masicko-porcujeme-a-bilancujeme-filmy-a-serialy-ktere-letos-zabily-8891337',
+ 'md5': 'ba2fdbc1242fc16771c7695d271ec355',
+ 'info_dict': {
+ 'id': '8891337',
+ 'title': 'md5:21f99739d04ab49d8c189ec711eef4ec',
+ },
+ 'playlist_count': 1,
+ 'playlist': [{
+ 'md5': 'ba2fdbc1242fc16771c7695d271ec355',
+ 'info_dict': {
+ 'id': '10520988',
+ 'ext': 'mp3',
+ 'title': 'Papej masíčko! Porcujeme a bilancujeme filmy a seriály, které to letos zabily',
+ 'description': 'md5:1c6d29fb9564e1f17fc1bb83ae7da0bc',
+ 'duration': 1574,
+ 'artist': 'Aleš Stuchlý',
+ 'channel_id': 'radio-wave',
+ },
+ }]
+ }, {
+ 'url': 'https://wave.rozhlas.cz/poslechnete-si-neklid-podcastovy-thriller-o-vine-strachu-a-vztahu-ktery-zasel-8554744',
+ 'info_dict': {
+ 'id': '8554744',
+ 'title': 'Poslechněte si Neklid. Podcastový thriller o vině, strachu a vztahu, který zašel příliš daleko',
+ },
+ 'playlist_count': 5,
+ 'playlist': [{
+ 'md5': '93d4109cf8f40523699ae9c1d4600bdd',
+ 'info_dict': {
+ 'id': '9890713',
+ 'ext': 'mp3',
+ 'title': 'Neklid #1',
+ 'description': '1. díl: Neklid: 1. díl',
+ 'duration': 1025,
+ 'artist': 'Josef Kokta',
+ 'channel_id': 'radio-wave',
+ 'chapter': 'Neklid #1',
+ 'chapter_number': 1,
+ },
+ }, {
+ 'md5': 'e9763235be4a6dcf94bc8a5bac1ca126',
+ 'info_dict': {
+ 'id': '9890716',
+ 'ext': 'mp3',
+ 'title': 'Neklid #2',
+ 'description': '2. díl: Neklid: 2. díl',
+ 'duration': 768,
+ 'artist': 'Josef Kokta',
+ 'channel_id': 'radio-wave',
+ 'chapter': 'Neklid #2',
+ 'chapter_number': 2,
+ },
+ }, {
+ 'md5': '00b642ea94b78cc949ac84da09f87895',
+ 'info_dict': {
+ 'id': '9890722',
+ 'ext': 'mp3',
+ 'title': 'Neklid #3',
+ 'description': '3. díl: Neklid: 3. díl',
+ 'duration': 607,
+ 'artist': 'Josef Kokta',
+ 'channel_id': 'radio-wave',
+ 'chapter': 'Neklid #3',
+ 'chapter_number': 3,
+ },
+ }, {
+ 'md5': 'faef97b1b49da7df874740f118c19dea',
+ 'info_dict': {
+ 'id': '9890728',
+ 'ext': 'mp3',
+ 'title': 'Neklid #4',
+ 'description': '4. díl: Neklid: 4. díl',
+ 'duration': 621,
+ 'artist': 'Josef Kokta',
+ 'channel_id': 'radio-wave',
+ 'chapter': 'Neklid #4',
+ 'chapter_number': 4,
+ },
+ }, {
+ 'md5': '6e729fa39b647325b868d419c76f3efa',
+ 'info_dict': {
+ 'id': '9890734',
+ 'ext': 'mp3',
+ 'title': 'Neklid #5',
+ 'description': '5. díl: Neklid: 5. díl',
+ 'duration': 908,
+ 'artist': 'Josef Kokta',
+ 'channel_id': 'radio-wave',
+ 'chapter': 'Neklid #5',
+ 'chapter_number': 5,
+ },
+ }]
+ }, {
+ 'url': 'https://dvojka.rozhlas.cz/karel-siktanc-cerny-jezdec-bily-kun-napinava-pohadka-o-tajemnem-prizraku-8946969',
+ 'info_dict': {
+ 'id': '8946969',
+ 'title': 'Karel Šiktanc: Černý jezdec, bílý kůň. Napínavá pohádka o tajemném přízraku',
+ },
+ 'playlist_count': 1,
+ 'playlist': [{
+ 'info_dict': {
+ 'id': '10631121',
+ 'ext': 'm4a',
+ 'title': 'Karel Šiktanc: Černý jezdec, bílý kůň. Napínavá pohádka o tajemném přízraku',
+ 'description': 'Karel Šiktanc: Černý jezdec, bílý kůň',
+ 'duration': 2656,
+ 'artist': 'Tvůrčí skupina Drama a literatura',
+ 'channel_id': 'dvojka',
+ },
+ }],
+ 'params': {'skip_download': 'dash'},
+ }]
+
+ def _extract_video(self, entry):
+ audio_id = entry['meta']['ga']['contentId']
+ chapter_number = traverse_obj(entry, ('meta', 'ga', 'contentSerialPart', {int_or_none}))
+
+ return {
+ 'id': audio_id,
+ 'chapter': traverse_obj(entry, ('meta', 'ga', 'contentNameShort')) if chapter_number else None,
+ 'chapter_number': chapter_number,
+ 'formats': self._extract_formats(entry, audio_id),
+ **traverse_obj(entry, {
+ 'title': ('meta', 'ga', 'contentName'),
+ 'description': 'title',
+ 'duration': ('duration', {int_or_none}),
+ 'artist': ('meta', 'ga', 'contentAuthor'),
+ 'channel_id': ('meta', 'ga', 'contentCreator'),
+ })
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ # FIXME: Use get_element_text_and_html_by_tag once it accepts less strict HTML
+ data = self._parse_json(extract_attributes(self._search_regex(
+ r'(<div class="mujRozhlasPlayer" data-player=\'[^\']+\'>)',
+ webpage, 'player'))['data-player'], video_id)['data']
+
+ return {
+ '_type': 'playlist',
+ 'id': str_or_none(data.get('embedId')) or video_id,
+ 'title': traverse_obj(data, ('series', 'title')),
+ 'entries': map(self._extract_video, data['playlist']),
+ }
+
+
+class MujRozhlasIE(RozhlasBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?mujrozhlas\.cz/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+ _TESTS = [{
+ # single episode extraction
+ 'url': 'https://www.mujrozhlas.cz/vykopavky/ach-jo-zase-teleci-rizek-je-mnohem-min-cesky-nez-jsme-si-mysleli',
+ 'md5': '6f8fd68663e64936623e67c152a669e0',
+ 'info_dict': {
+ 'id': '10739193',
+ 'ext': 'mp3',
+ 'title': 'Ach jo, zase to telecí! Řízek je mnohem míň český, než jsme si mysleli',
+ 'description': 'md5:db7141e9caaedc9041ec7cefb9a62908',
+ 'timestamp': 1684915200,
+ 'modified_timestamp': 1684922446,
+ 'series': 'Vykopávky',
+ 'thumbnail': 'https://portal.rozhlas.cz/sites/default/files/images/84377046610af6ddc54d910b1dd7a22b.jpg',
+ 'channel_id': 'radio-wave',
+ 'upload_date': '20230524',
+ 'modified_date': '20230524',
+ },
+ }, {
+ # serial extraction
+ 'url': 'https://www.mujrozhlas.cz/radiokniha/jaroslava-janackova-pribeh-tajemneho-psani-o-pramenech-genezi-babicky',
+ 'playlist_mincount': 7,
+ 'info_dict': {
+ 'id': 'bb2b5f4e-ffb4-35a6-a34a-046aa62d6f6b',
+ 'title': 'Jaroslava Janáčková: Příběh tajemného psaní. O pramenech a genezi Babičky',
+ 'description': 'md5:7434d8fac39ac9fee6df098e11dfb1be',
+ },
+ }, {
+ # show extraction
+ 'url': 'https://www.mujrozhlas.cz/nespavci',
+ 'playlist_mincount': 14,
+ 'info_dict': {
+ 'id': '09db9b37-d0f4-368c-986a-d3439f741f08',
+ 'title': 'Nespavci',
+ 'description': 'md5:c430adcbf9e2b9eac88b745881e814dc',
+ },
+ }]
+
+ def _call_api(self, path, item_id, msg='API JSON'):
+ return self._download_json(
+ f'https://api.mujrozhlas.cz/{path}/{item_id}', item_id,
+ note=f'Downloading {msg}', errnote=f'Failed to download {msg}')['data']
+
+ def _extract_audio_entry(self, entry):
+ audio_id = entry['meta']['ga']['contentId']
+
+ return {
+ 'id': audio_id,
+ 'formats': self._extract_formats(entry['attributes'], audio_id),
+ **traverse_obj(entry, {
+ 'title': ('attributes', 'title'),
+ 'description': ('attributes', 'description'),
+ 'episode_number': ('attributes', 'part'),
+ 'series': ('attributes', 'mirroredShow', 'title'),
+ 'chapter': ('attributes', 'mirroredSerial', 'title'),
+ 'artist': ('meta', 'ga', 'contentAuthor'),
+ 'channel_id': ('meta', 'ga', 'contentCreator'),
+ 'timestamp': ('attributes', 'since', {unified_timestamp}),
+ 'modified_timestamp': ('attributes', 'updated', {unified_timestamp}),
+ 'thumbnail': ('attributes', 'asset', 'url', {url_or_none}),
+ })
+ }
+
+ def _entries(self, api_url, playlist_id):
+ for page in itertools.count(1):
+ episodes = self._download_json(
+ api_url, playlist_id, note=f'Downloading episodes page {page}',
+ errnote=f'Failed to download episodes page {page}', fatal=False)
+ for episode in traverse_obj(episodes, ('data', lambda _, v: v['meta']['ga']['contentId'])):
+ yield self._extract_audio_entry(episode)
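+ # Follow the API's 'links.next' cursor until no further page is advertised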
+ api_url = traverse_obj(episodes, ('links', 'next', {url_or_none}))
+ if not api_url:
+ break
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ info = self._search_json(r'\bvar\s+dl\s*=', webpage, 'info json', display_id)
+
+ entity = info['siteEntityBundle']
+
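+ # 'siteEntityBundle' distinguishes single episodes from show/serial playlists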
+ if entity == 'episode':
+ return self._extract_audio_entry(self._call_api(
+ 'episodes', info['contentId'], 'episode info API JSON'))
+
+ elif entity in ('show', 'serial'):
+ playlist_id = info['contentShow'].split(':')[0] if entity == 'show' else info['contentId']
+ data = self._call_api(f'{entity}s', playlist_id, f'{entity} playlist JSON')
+ api_url = data['relationships']['episodes']['links']['related']
+ return self.playlist_result(
+ self._entries(api_url, playlist_id), playlist_id,
+ **traverse_obj(data, ('attributes', {
+ 'title': 'title',
+ 'description': 'description',
+ })))
+
+ else:
+ # `entity == 'person'` not implemented yet by API, ref:
+ # https://api.mujrozhlas.cz/persons/8367e456-2a57-379a-91bb-e699619bea49/participation
+ raise ExtractorError(f'Unsupported entity type "{entity}"')
diff --git a/hypervideo_dl/extractor/rte.py b/hypervideo_dl/extractor/rte.py
index aedaa5b..7ba80d4 100644
--- a/hypervideo_dl/extractor/rte.py
+++ b/hypervideo_dl/extractor/rte.py
@@ -1,7 +1,7 @@
import re
from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..networking.exceptions import HTTPError
from ..utils import (
float_or_none,
parse_iso8601,
@@ -31,8 +31,8 @@ class RteBaseIE(InfoExtractor):
except ExtractorError as ee:
if num < len(ENDPOINTS) or formats:
continue
- if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404:
- error_info = self._parse_json(ee.cause.read().decode(), item_id, fatal=False)
+ if isinstance(ee.cause, HTTPError) and ee.cause.status == 404:
+ error_info = self._parse_json(ee.cause.response.read().decode(), item_id, fatal=False)
if error_info:
raise ExtractorError(
'%s said: %s' % (self.IE_NAME, error_info['message']),
diff --git a/hypervideo_dl/extractor/rts.py b/hypervideo_dl/extractor/rts.py
index 81c4d7c..9f73d18 100644
--- a/hypervideo_dl/extractor/rts.py
+++ b/hypervideo_dl/extractor/rts.py
@@ -136,8 +136,8 @@ class RTSIE(SRGSSRIE): # XXX: Do not subclass from concrete IE
if not entries:
page, urlh = self._download_webpage_handle(url, display_id)
- if re.match(self._VALID_URL, urlh.geturl()).group('id') != media_id:
- return self.url_result(urlh.geturl(), 'RTS')
+ if re.match(self._VALID_URL, urlh.url).group('id') != media_id:
+ return self.url_result(urlh.url, 'RTS')
# article with videos on rhs
videos = re.findall(
diff --git a/hypervideo_dl/extractor/rtvcplay.py b/hypervideo_dl/extractor/rtvcplay.py
new file mode 100644
index 0000000..741c472
--- /dev/null
+++ b/hypervideo_dl/extractor/rtvcplay.py
@@ -0,0 +1,285 @@
+import re
+
+from .common import InfoExtractor, ExtractorError
+from ..utils import (
+ clean_html,
+ determine_ext,
+ int_or_none,
+ float_or_none,
+ js_to_json,
+ mimetype2ext,
+ traverse_obj,
+ urljoin,
+ url_or_none,
+)
+
+
+class RTVCPlayBaseIE(InfoExtractor):
+ _BASE_VALID_URL = r'https?://(?:www\.)?rtvcplay\.co'
+
+ def _extract_player_config(self, webpage, video_id):
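+ # The config is emitted as JS with "..." + "..." string concatenations; collapse them before parsing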
+ return self._search_json(
+ r'<script\b[^>]*>[^<]*(?:var|let|const)\s+config\s*=', re.sub(r'"\s*\+\s*"', '', webpage),
+ 'player_config', video_id, transform_source=js_to_json)
+
+ def _extract_formats_and_subtitles_player_config(self, player_config, video_id):
+ formats, subtitles = [], {}
+ for source in traverse_obj(player_config, ('sources', ..., lambda _, v: url_or_none(v['url']))):
+ ext = mimetype2ext(source.get('mimetype'), default=determine_ext(source['url']))
+ if ext == 'm3u8':
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(
+ source['url'], video_id, 'mp4', fatal=False)
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=subtitles)
+ else:
+ formats.append({
+ 'url': source['url'],
+ 'ext': ext,
+ })
+
+ return formats, subtitles
+
+
+class RTVCPlayIE(RTVCPlayBaseIE):
+ _VALID_URL = RTVCPlayBaseIE._BASE_VALID_URL + r'/(?P<category>(?!embed)[^/]+)/(?:[^?#]+/)?(?P<id>[\w-]+)'
+
+ _TESTS = [{
+ 'url': 'https://www.rtvcplay.co/en-vivo/canal-institucional',
+ 'info_dict': {
+ 'id': 'canal-institucional',
+ 'title': r're:^Canal Institucional',
+ 'description': 'md5:eff9e548394175928059320c006031ea',
+ 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+ 'live_status': 'is_live',
+ 'ext': 'mp4',
+ },
+ 'params': {
+ 'skip_download': 'Livestream',
+ },
+ }, {
+ 'url': 'https://www.rtvcplay.co/en-vivo/senal-colombia',
+ 'info_dict': {
+ 'id': 'senal-colombia',
+ 'title': r're:^Señal Colombia',
+ 'description': 'md5:799f16a401d97f40c33a2c6a3e2a507b',
+ 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+ 'live_status': 'is_live',
+ 'ext': 'mp4',
+ },
+ 'params': {
+ 'skip_download': 'Livestream',
+ },
+ }, {
+ 'url': 'https://www.rtvcplay.co/en-vivo/radio-nacional',
+ 'info_dict': {
+ 'id': 'radio-nacional',
+ 'title': r're:^Radio Nacional',
+ 'description': 'md5:5de009bc6a9fa79d2a6cf0b73f977d53',
+ 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+ 'live_status': 'is_live',
+ 'ext': 'mp4',
+ },
+ 'params': {
+ 'skip_download': 'Livestream',
+ },
+ }, {
+ 'url': 'https://www.rtvcplay.co/peliculas-ficcion/senoritas',
+ 'md5': '1288ee6f6d1330d880f98bff2ed710a3',
+ 'info_dict': {
+ 'id': 'senoritas',
+ 'title': 'Señoritas',
+ 'description': 'md5:f095a2bb52cb6cf279daf6302f86fb32',
+ 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+ 'ext': 'mp4',
+ },
+ }, {
+ 'url': 'https://www.rtvcplay.co/competencias-basicas-ciudadanas-y-socioemocionales/profe-en-tu-casa/james-regresa-clases-28022022',
+ 'md5': 'f040a7380a269ad633cf837384d5e9fc',
+ 'info_dict': {
+ 'id': 'james-regresa-clases-28022022',
+ 'title': 'James regresa a clases - 28/02/2022',
+ 'description': 'md5:c5dcdf757c7ab29305e8763c6007e675',
+ 'ext': 'mp4',
+ },
+ }, {
+ 'url': 'https://www.rtvcplay.co/peliculas-documentales/llinas-el-cerebro-y-el-universo',
+ 'info_dict': {
+ 'id': 'llinas-el-cerebro-y-el-universo',
+ 'title': 'Llinás, el cerebro y el universo',
+ 'description': 'md5:add875bf2309bb52b3e8b9b06116d9b0',
+ 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+ },
+ 'playlist_mincount': 3,
+ }, {
+ 'url': 'https://www.rtvcplay.co/competencias-basicas-ciudadanas-y-socioemocionales/profe-en-tu-casa',
+ 'info_dict': {
+ 'id': 'profe-en-tu-casa',
+ 'title': 'Profe en tu casa',
+ 'description': 'md5:47dbe20e263194413b1db2a2805a4f2e',
+ 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+ },
+ 'playlist_mincount': 537,
+ }, {
+ 'url': 'https://www.rtvcplay.co/series-al-oido/relato-de-un-naufrago-una-travesia-del-periodismo-a-la-literatura',
+ 'info_dict': {
+ 'id': 'relato-de-un-naufrago-una-travesia-del-periodismo-a-la-literatura',
+ 'title': 'Relato de un náufrago: una travesía del periodismo a la literatura',
+ 'description': 'md5:6da28fdca4a5a568ea47ef65ef775603',
+ 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+ },
+ 'playlist_mincount': 5,
+ }, {
+ 'url': 'https://www.rtvcplay.co/series-al-oido/diez-versiones',
+ 'info_dict': {
+ 'id': 'diez-versiones',
+ 'title': 'Diez versiones',
+ 'description': 'md5:997471ed971cb3fd8e41969457675306',
+ 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+ },
+ 'playlist_mincount': 20,
+ }]
+
+ def _real_extract(self, url):
+ video_id, category = self._match_valid_url(url).group('id', 'category')
+ webpage = self._download_webpage(url, video_id)
+
+ hydration = self._search_json(
+ r'window\.__RTVCPLAY_STATE__\s*=', webpage, 'hydration',
+ video_id, transform_source=js_to_json)['content']['currentContent']
+
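+ # VOD entries expose an asset id for the HLS URL template; live channels carry the HLS URL directly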
+ asset_id = traverse_obj(hydration, ('video', 'assetid'))
+ if asset_id:
+ hls_url = hydration['base_url_hls'].replace('[node:field_asset_id]', asset_id)
+ else:
+ hls_url = traverse_obj(hydration, ('channel', 'hls'))
+
+ metadata = traverse_obj(hydration, {
+ 'title': 'title',
+ 'description': 'description',
+ 'thumbnail': ((('channel', 'image', 'logo'), ('resource', 'image', 'cover_desktop')), 'path'),
+ }, get_all=False)
+
+ # No HLS URL was found, so this is probably a program (series) page
+ if not hls_url:
+ seasons = traverse_obj(
+ hydration, ('widgets', lambda _, y: y['type'] == 'seasonList', 'contents'),
+ get_all=False)
+ if not seasons:
+ podcast_episodes = hydration.get('audios')
+ if not podcast_episodes:
+ raise ExtractorError('Could not find asset_id, program playlist or podcast episodes')
+
+ return self.playlist_result([
+ self.url_result(episode['file'], url_transparent=True, **traverse_obj(episode, {
+ 'title': 'title',
+ 'description': ('description', {clean_html}),
+ 'episode_number': ('chapter_number', {float_or_none}, {int_or_none}),
+ 'season_number': ('season', {int_or_none}),
+ })) for episode in podcast_episodes], video_id, **metadata)
+
+ entries = [self.url_result(
+ urljoin(url, episode['slug']), url_transparent=True,
+ **traverse_obj(season, {
+ 'season': 'title',
+ 'season_number': ('season', {int_or_none}),
+ }), **traverse_obj(episode, {
+ 'title': 'title',
+ 'thumbnail': ('image', 'cover', 'path'),
+ 'episode_number': ('chapter_number', {int_or_none}),
+ })) for season in seasons for episode in traverse_obj(season, ('contents', ...))]
+
+ return self.playlist_result(entries, video_id, **metadata)
+
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(hls_url, video_id, 'mp4')
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'is_live': category == 'en-vivo',
+ **metadata,
+ }
+
+
+class RTVCPlayEmbedIE(RTVCPlayBaseIE):
+ _VALID_URL = RTVCPlayBaseIE._BASE_VALID_URL + r'/embed/(?P<id>[\w-]+)'
+
+ _TESTS = [{
+ 'url': 'https://www.rtvcplay.co/embed/72b0e699-248b-4929-a4a8-3782702fa7f9',
+ 'md5': 'ed529aeaee7aa2a72afe91ac7d1177a8',
+ 'info_dict': {
+ 'id': '72b0e699-248b-4929-a4a8-3782702fa7f9',
+ 'title': 'Tráiler: Señoritas',
+ 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+ 'ext': 'mp4',
+ }
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ player_config = self._extract_player_config(webpage, video_id)
+ formats, subtitles = self._extract_formats_and_subtitles_player_config(player_config, video_id)
+
+ asset_id = traverse_obj(player_config, ('rtvcplay', 'assetid'))
+ metadata = {} if not asset_id else self._download_json(
+ f'https://cms.rtvcplay.co/api/v1/video/asset-id/{asset_id}', video_id, fatal=False)
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ **traverse_obj(metadata, {
+ 'title': 'title',
+ 'description': 'description',
+ 'thumbnail': ('image', ..., 'thumbnail', 'path'),
+ }, get_all=False)
+ }
+
+
+class RTVCKalturaIE(RTVCPlayBaseIE):
+ _VALID_URL = r'https?://media\.rtvc\.gov\.co/kalturartvc/(?P<id>[\w-]+)'
+
+ _TESTS = [{
+ 'url': 'https://media.rtvc.gov.co/kalturartvc/indexSC.html',
+ 'info_dict': {
+ 'id': 'indexSC',
+ 'title': r're:^Señal Colombia',
+ 'description': 'md5:799f16a401d97f40c33a2c6a3e2a507b',
+ 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+ 'live_status': 'is_live',
+ 'ext': 'mp4',
+ },
+ 'params': {
+ 'skip_download': 'Livestream',
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ player_config = self._extract_player_config(webpage, video_id)
+ formats, subtitles = self._extract_formats_and_subtitles_player_config(player_config, video_id)
+
+ channel_id = traverse_obj(player_config, ('rtvcplay', 'channelId'))
+ metadata = {} if not channel_id else self._download_json(
+ f'https://cms.rtvcplay.co/api/v1/taxonomy_term/streaming/{channel_id}', video_id, fatal=False)
+
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(
+ traverse_obj(metadata, ('channel', 'hls')), video_id, 'mp4', fatal=False)
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=subtitles)
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'is_live': True,
+ **traverse_obj(metadata, {
+ 'title': 'title',
+ 'description': 'description',
+ 'thumbnail': ('channel', 'image', 'logo', 'path'),
+ })
+ }
diff --git a/hypervideo_dl/extractor/rumble.py b/hypervideo_dl/extractor/rumble.py
index 102615c..f8bf4a1 100644
--- a/hypervideo_dl/extractor/rumble.py
+++ b/hypervideo_dl/extractor/rumble.py
@@ -2,13 +2,20 @@ import itertools
import re
from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..networking.exceptions import HTTPError
from ..utils import (
+ ExtractorError,
+ UnsupportedError,
+ clean_html,
+ determine_ext,
+ format_field,
+ get_element_by_class,
int_or_none,
+ join_nonempty,
+ parse_count,
parse_iso8601,
traverse_obj,
unescapeHTML,
- ExtractorError,
)
@@ -112,24 +119,6 @@ class RumbleEmbedIE(InfoExtractor):
_WEBPAGE_TESTS = [
{
- 'note': 'Rumble embed',
- 'url': 'https://rumble.com/vdmum1-moose-the-dog-helps-girls-dig-a-snow-fort.html',
- 'md5': '53af34098a7f92c4e51cf0bd1c33f009',
- 'info_dict': {
- 'id': 'vb0ofn',
- 'ext': 'mp4',
- 'timestamp': 1612662578,
- 'uploader': 'LovingMontana',
- 'channel': 'LovingMontana',
- 'upload_date': '20210207',
- 'title': 'Winter-loving dog helps girls dig a snow fort ',
- 'channel_url': 'https://rumble.com/c/c-546523',
- 'thumbnail': 'https://sp.rmbl.ws/s8/1/5/f/x/x/5fxxb.OvCc.1-small-Moose-The-Dog-Helps-Girls-D.jpg',
- 'duration': 103,
- 'live_status': 'not_live',
- }
- },
- {
'note': 'Rumble JS embed',
'url': 'https://therightscoop.com/what-does-9-plus-1-plus-1-equal-listen-to-this-audio-of-attempted-kavanaugh-assassins-call-and-youll-get-it',
'md5': '4701209ac99095592e73dbba21889690',
@@ -155,7 +144,7 @@ class RumbleEmbedIE(InfoExtractor):
if embeds:
return embeds
return [f'https://rumble.com/embed/{mobj.group("id")}' for mobj in re.finditer(
- r'<script>\s*Rumble\(\s*"play"\s*,\s*{\s*[\'"]video[\'"]\s*:\s*[\'"](?P<id>[0-9a-z]+)[\'"]', webpage)]
+ r'<script>[^<]*\bRumble\(\s*"play"\s*,\s*{\s*[\'"]?video[\'"]?\s*:\s*[\'"](?P<id>[0-9a-z]+)[\'"]', webpage)]
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -178,7 +167,13 @@ class RumbleEmbedIE(InfoExtractor):
formats = []
for ext, ext_info in (video.get('ua') or {}).items():
- for height, video_info in (ext_info or {}).items():
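+ # 'ua' values are either dicts keyed by height or plain lists; normalize to an iterable of format infos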
+ if isinstance(ext_info, dict):
+ for height, video_info in ext_info.items():
+ if not traverse_obj(video_info, ('meta', 'h', {int_or_none})):
+ video_info.setdefault('meta', {})['h'] = height
+ ext_info = ext_info.values()
+
+ for video_info in ext_info:
meta = video_info.get('meta') or {}
if not video_info.get('url'):
continue
@@ -189,18 +184,22 @@ class RumbleEmbedIE(InfoExtractor):
video_info['url'], video_id,
ext='mp4', m3u8_id='hls', fatal=False, live=live_status == 'is_live'))
continue
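+ # 'timeline' entries are video-only preview tracks, not regular A/V formats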
+ timeline = ext == 'timeline'
+ if timeline:
+ ext = determine_ext(video_info['url'])
formats.append({
'ext': ext,
+ 'acodec': 'none' if timeline else None,
'url': video_info['url'],
- 'format_id': '%s-%sp' % (ext, height),
- 'height': int_or_none(height),
- 'fps': video.get('fps'),
+ 'format_id': join_nonempty(ext, format_field(meta, 'h', '%sp')),
+ 'format_note': 'Timeline' if timeline else None,
+ 'fps': None if timeline else video.get('fps'),
**traverse_obj(meta, {
'tbr': 'bitrate',
'filesize': 'size',
'width': 'w',
'height': 'h',
- }, default={})
+ }, expected_type=lambda x: int(x) or None)
})
subtitles = {
@@ -235,6 +234,121 @@ class RumbleEmbedIE(InfoExtractor):
}
+class RumbleIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?rumble\.com/(?P<id>v(?!ideos)[\w.-]+)[^/]*$'
+ _EMBED_REGEX = [r'<a class=video-item--a href=(?P<url>/v[\w.-]+\.html)>']
+ _TESTS = [{
+ 'add_ie': ['RumbleEmbed'],
+ 'url': 'https://rumble.com/vdmum1-moose-the-dog-helps-girls-dig-a-snow-fort.html',
+ 'md5': '53af34098a7f92c4e51cf0bd1c33f009',
+ 'info_dict': {
+ 'id': 'vb0ofn',
+ 'ext': 'mp4',
+ 'timestamp': 1612662578,
+ 'uploader': 'LovingMontana',
+ 'channel': 'LovingMontana',
+ 'upload_date': '20210207',
+ 'title': 'Winter-loving dog helps girls dig a snow fort ',
+ 'description': 'Moose the dog is more than happy to help with digging out this epic snow fort. Great job, Moose!',
+ 'channel_url': 'https://rumble.com/c/c-546523',
+ 'thumbnail': r're:https://.+\.jpg',
+ 'duration': 103,
+ 'like_count': int,
+ 'view_count': int,
+ 'live_status': 'not_live',
+ }
+ }, {
+ 'url': 'http://www.rumble.com/vDMUM1?key=value',
+ 'only_matching': True,
+ }, {
+ 'note': 'timeline format',
+ 'url': 'https://rumble.com/v2ea9qb-the-u.s.-cannot-hide-this-in-ukraine-anymore-redacted-with-natali-and-clayt.html',
+ 'md5': '40d61fec6c0945bca3d0e1dc1aa53d79',
+ 'params': {'format': 'wv'},
+ 'info_dict': {
+ 'id': 'v2bou5f',
+ 'ext': 'mp4',
+ 'uploader': 'Redacted News',
+ 'upload_date': '20230322',
+ 'timestamp': 1679445010,
+ 'title': 'The U.S. CANNOT hide this in Ukraine anymore | Redacted with Natali and Clayton Morris',
+ 'duration': 892,
+ 'channel': 'Redacted News',
+ 'description': 'md5:aaad0c5c3426d7a361c29bdaaced7c42',
+ 'channel_url': 'https://rumble.com/c/Redacted',
+ 'live_status': 'not_live',
+ 'thumbnail': 'https://sp.rmbl.ws/s8/1/d/x/2/O/dx2Oi.qR4e-small-The-U.S.-CANNOT-hide-this-i.jpg',
+ },
+ }, {
+ 'url': 'https://rumble.com/v2e7fju-the-covid-twitter-files-drop-protecting-fauci-while-censoring-the-truth-wma.html',
+ 'info_dict': {
+ 'id': 'v2blzyy',
+ 'ext': 'mp4',
+ 'live_status': 'was_live',
+ 'release_timestamp': 1679446804,
+ 'description': 'md5:2ac4908ccfecfb921f8ffa4b30c1e636',
+ 'release_date': '20230322',
+ 'timestamp': 1679445692,
+ 'duration': 4435,
+ 'upload_date': '20230322',
+ 'title': 'The Covid Twitter Files Drop: Protecting Fauci While Censoring The Truth w/Matt Taibbi',
+ 'uploader': 'Kim Iversen',
+ 'channel_url': 'https://rumble.com/c/KimIversen',
+ 'channel': 'Kim Iversen',
+ 'thumbnail': 'https://sp.rmbl.ws/s8/1/6/b/w/O/6bwOi.qR4e-small-The-Covid-Twitter-Files-Dro.jpg',
+ },
+ }]
+
+ _WEBPAGE_TESTS = [{
+ 'url': 'https://rumble.com/videos?page=2',
+ 'playlist_count': 25,
+ 'info_dict': {
+ 'id': 'videos?page=2',
+ 'title': 'All videos',
+ 'description': 'Browse videos uploaded to Rumble.com',
+ 'age_limit': 0,
+ },
+ }, {
+ 'url': 'https://rumble.com/live-videos',
+ 'playlist_mincount': 19,
+ 'info_dict': {
+ 'id': 'live-videos',
+ 'title': 'Live Videos',
+ 'description': 'Live videos on Rumble.com',
+ 'age_limit': 0,
+ },
+ }, {
+ 'url': 'https://rumble.com/search/video?q=rumble&sort=views',
+ 'playlist_count': 24,
+ 'info_dict': {
+ 'id': 'video?q=rumble&sort=views',
+ 'title': 'Search results for: rumble',
+ 'age_limit': 0,
+ },
+ }]
+
+ def _real_extract(self, url):
+ page_id = self._match_id(url)
+ webpage = self._download_webpage(url, page_id)
+ url_info = next(RumbleEmbedIE.extract_from_webpage(self._downloader, url, webpage), None)
+ if not url_info:
+ raise UnsupportedError(url)
+
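+ # Scrape metadata that only appears on the watch page: views, likes, release time and description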
+ release_ts_str = self._search_regex(
+ r'(?:Livestream begins|Streamed on):\s+<time datetime="([^"]+)',
+ webpage, 'release date', fatal=False, default=None)
+ view_count_str = self._search_regex(r'<span class="media-heading-info">([\d,]+) Views',
+ webpage, 'view count', fatal=False, default=None)
+
+ return self.url_result(
+ url_info['url'], ie_key=url_info['ie_key'], url_transparent=True,
+ view_count=parse_count(view_count_str),
+ release_timestamp=parse_iso8601(release_ts_str),
+ like_count=parse_count(get_element_by_class('rumbles-count', webpage)),
+ description=clean_html(get_element_by_class('media-description', webpage)),
+ )
+
+
class RumbleChannelIE(InfoExtractor):
_VALID_URL = r'(?P<url>https?://(?:www\.)?rumble\.com/(?:c|user)/(?P<id>[^&?#$/]+))'
@@ -257,7 +371,7 @@ class RumbleChannelIE(InfoExtractor):
try:
webpage = self._download_webpage(f'{url}?page={page}', playlist_id, note='Downloading page %d' % page)
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 404:
break
raise
for video_url in re.findall(r'class=video-item--a\s?href=([^>]+\.html)', webpage):
diff --git a/hypervideo_dl/extractor/rutube.py b/hypervideo_dl/extractor/rutube.py
index 5a4fd97..08d9b92 100644
--- a/hypervideo_dl/extractor/rutube.py
+++ b/hypervideo_dl/extractor/rutube.py
@@ -25,8 +25,7 @@ class RutubeBaseIE(InfoExtractor):
video_id, 'Downloading video JSON',
'Unable to download video JSON', query=query)
- @staticmethod
- def _extract_info(video, video_id=None, require_title=True):
+ def _extract_info(self, video, video_id=None, require_title=True):
title = video['title'] if require_title else video.get('title')
age_limit = video.get('is_adult')
@@ -35,13 +34,15 @@ class RutubeBaseIE(InfoExtractor):
uploader_id = try_get(video, lambda x: x['author']['id'])
category = try_get(video, lambda x: x['category']['name'])
+ description = video.get('description')
+ duration = int_or_none(video.get('duration'))
return {
'id': video.get('id') or video_id if video_id else video['id'],
'title': title,
- 'description': video.get('description'),
+ 'description': description,
'thumbnail': video.get('thumbnail_url'),
- 'duration': int_or_none(video.get('duration')),
+ 'duration': duration,
'uploader': try_get(video, lambda x: x['author']['name']),
'uploader_id': compat_str(uploader_id) if uploader_id else None,
'timestamp': unified_timestamp(video.get('created_ts')),
@@ -50,6 +51,7 @@ class RutubeBaseIE(InfoExtractor):
'view_count': int_or_none(video.get('hits')),
'comment_count': int_or_none(video.get('comments_count')),
'is_live': bool_or_none(video.get('is_livestream')),
+ 'chapters': self._extract_chapters_from_description(description, duration),
}
def _download_and_extract_info(self, video_id, query=None):
@@ -91,12 +93,12 @@ class RutubeBaseIE(InfoExtractor):
class RutubeIE(RutubeBaseIE):
IE_NAME = 'rutube'
IE_DESC = 'Rutube videos'
- _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/(?P<id>[\da-z]{32})'
+ _VALID_URL = r'https?://rutube\.ru/(?:video(?:/private)?|(?:play/)?embed)/(?P<id>[\da-z]{32})'
_EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//rutube\.ru/(?:play/)?embed/[\da-z]{32}.*?)\1']
_TESTS = [{
'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/',
- 'md5': '1d24f180fac7a02f3900712e5a5764d6',
+ 'md5': 'e33ac625efca66aba86cbec9851f2692',
'info_dict': {
'id': '3eac3b4561676c17df9132a9a1e62e3e',
'ext': 'mp4',
@@ -108,7 +110,12 @@ class RutubeIE(RutubeBaseIE):
'timestamp': 1381943602,
'upload_date': '20131016',
'age_limit': 0,
+ 'view_count': int,
+ 'thumbnail': 'http://pic.rutubelist.ru/video/d2/a0/d2a0aec998494a396deafc7ba2c82add.jpg',
+ 'category': ['Новости и СМИ'],
+ 'chapters': [],
},
+ 'expected_warnings': ['Unable to download f4m'],
}, {
'url': 'http://rutube.ru/play/embed/a10e53b86e8f349080f718582ce4c661',
'only_matching': True,
@@ -121,6 +128,45 @@ class RutubeIE(RutubeBaseIE):
}, {
'url': 'https://rutube.ru/video/10b3a03fc01d5bbcc632a2f3514e8aab/?pl_type=source',
'only_matching': True,
+ }, {
+ 'url': 'https://rutube.ru/video/private/884fb55f07a97ab673c7d654553e0f48/?p=x2QojCumHTS3rsKHWXN8Lg',
+ 'md5': 'd106225f15d625538fe22971158e896f',
+ 'info_dict': {
+ 'id': '884fb55f07a97ab673c7d654553e0f48',
+ 'ext': 'mp4',
+ 'title': 'Яцуноками, Nioh2',
+ 'description': 'Nioh2: финал сражения с боссом Яцуноками',
+ 'duration': 15,
+ 'uploader': 'mexus',
+ 'uploader_id': '24222106',
+ 'timestamp': 1670646232,
+ 'upload_date': '20221210',
+ 'age_limit': 0,
+ 'view_count': int,
+ 'thumbnail': 'http://pic.rutubelist.ru/video/f2/d4/f2d42b54be0a6e69c1c22539e3152156.jpg',
+ 'category': ['Видеоигры'],
+ 'chapters': [],
+ },
+ 'expected_warnings': ['Unable to download f4m'],
+ }, {
+ 'url': 'https://rutube.ru/video/c65b465ad0c98c89f3b25cb03dcc87c6/',
+ 'info_dict': {
+ 'id': 'c65b465ad0c98c89f3b25cb03dcc87c6',
+ 'ext': 'mp4',
+ 'chapters': 'count:4',
+ 'category': ['Бизнес и предпринимательство'],
+ 'description': 'md5:252feac1305257d8c1bab215cedde75d',
+ 'thumbnail': 'http://pic.rutubelist.ru/video/71/8f/718f27425ea9706073eb80883dd3787b.png',
+ 'duration': 782,
+ 'age_limit': 0,
+ 'uploader_id': '23491359',
+ 'timestamp': 1677153329,
+ 'view_count': int,
+ 'upload_date': '20230223',
+ 'title': 'Бизнес с нуля: найм сотрудников. Интервью с директором строительной компании',
+ 'uploader': 'Стас Быков',
+ },
+ 'expected_warnings': ['Unable to download f4m'],
}]
@classmethod
@@ -129,8 +175,9 @@ class RutubeIE(RutubeBaseIE):
def _real_extract(self, url):
video_id = self._match_id(url)
- info = self._download_and_extract_info(video_id)
- info['formats'] = self._download_and_extract_formats(video_id)
+ query = parse_qs(url)
+ info = self._download_and_extract_info(video_id, query)
+ info['formats'] = self._download_and_extract_formats(video_id, query)
return info
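
The query forwarding above is what makes private links work: `parse_qs` extracts the `?p=...` access token from the page URL so it can be forwarded to the metadata and format requests. A quick illustration of its output (token value taken from the test case above):

    from hypervideo_dl.utils import parse_qs

    url = 'https://rutube.ru/video/private/884fb55f07a97ab673c7d654553e0f48/?p=x2QojCumHTS3rsKHWXN8Lg'
    query = parse_qs(url)
    # Each parameter maps to a list of values:
    assert query == {'p': ['x2QojCumHTS3rsKHWXN8Lg']}
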
diff --git a/hypervideo_dl/extractor/s4c.py b/hypervideo_dl/extractor/s4c.py
new file mode 100644
index 0000000..38a9058
--- /dev/null
+++ b/hypervideo_dl/extractor/s4c.py
@@ -0,0 +1,62 @@
+from .common import InfoExtractor
+from ..utils import traverse_obj
+
+
+class S4CIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?s4c\.cymru/clic/programme/(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://www.s4c.cymru/clic/programme/861362209',
+ 'info_dict': {
+ 'id': '861362209',
+ 'ext': 'mp4',
+ 'title': 'Y Swn',
+ 'description': 'md5:f7681a30e4955b250b3224aa9fe70cf0',
+ 'duration': 5340
+ },
+ }, {
+ 'url': 'https://www.s4c.cymru/clic/programme/856636948',
+ 'info_dict': {
+ 'id': '856636948',
+ 'ext': 'mp4',
+ 'title': 'Am Dro',
+ 'duration': 2880,
+ 'description': 'md5:100d8686fc9a632a0cb2db52a3433ffe',
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ details = self._download_json(
+ f'https://www.s4c.cymru/df/full_prog_details?lang=e&programme_id={video_id}',
+ video_id, fatal=False)
+
+ filename = self._download_json(
+ 'https://player-api.s4c-cdn.co.uk/player-configuration/prod', video_id, query={
+ 'programme_id': video_id,
+ 'signed': '0',
+ 'lang': 'en',
+ 'mode': 'od',
+ 'appId': 'clic',
+ 'streamName': '',
+ }, note='Downloading player config JSON')['filename']
+ m3u8_url = self._download_json(
+ 'https://player-api.s4c-cdn.co.uk/streaming-urls/prod', video_id, query={
+ 'mode': 'od',
+ 'application': 'clic',
+ 'region': 'WW',
+ 'extra': 'false',
+ 'thirdParty': 'false',
+ 'filename': filename,
+ }, note='Downloading streaming urls JSON')['hls']
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4', m3u8_id='hls')
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ **traverse_obj(details, ('full_prog_details', 0, {
+ 'title': (('programme_title', 'series_title'), {str}),
+ 'description': ('full_billing', {str.strip}),
+ 'duration': ('duration', {lambda x: int(x) * 60}),
+ }), get_all=False),
+ }
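
The returned dict above leans on `traverse_obj` templates: a tuple of keys means "first present alternative", `{str}` filters by type, and a `{lambda}` applies a transform (here, minutes to seconds). A self-contained illustration with a hypothetical API payload shaped like the real one:

    from hypervideo_dl.utils import traverse_obj

    details = {'full_prog_details': [{
        'programme_title': 'Y Swn',
        'full_billing': '  A drama about the founding of S4C.  ',
        'duration': 89,  # minutes
    }]}

    meta = traverse_obj(details, ('full_prog_details', 0, {
        'title': (('programme_title', 'series_title'), {str}),
        'description': ('full_billing', {str.strip}),
        'duration': ('duration', {lambda x: int(x) * 60}),
    }), get_all=False)
    # meta == {'title': 'Y Swn',
    #          'description': 'A drama about the founding of S4C.',
    #          'duration': 5340}
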
diff --git a/hypervideo_dl/extractor/safari.py b/hypervideo_dl/extractor/safari.py
index 450a661..8d322d7 100644
--- a/hypervideo_dl/extractor/safari.py
+++ b/hypervideo_dl/extractor/safari.py
@@ -28,13 +28,13 @@ class SafariBaseIE(InfoExtractor):
'Downloading login page')
def is_logged(urlh):
- return 'learning.oreilly.com/home/' in urlh.geturl()
+ return 'learning.oreilly.com/home/' in urlh.url
if is_logged(urlh):
self.LOGGED_IN = True
return
- redirect_url = urlh.geturl()
+ redirect_url = urlh.url
parsed_url = compat_urlparse.urlparse(redirect_url)
qs = compat_parse_qs(parsed_url.query)
next_uri = compat_urlparse.urljoin(
@@ -129,7 +129,7 @@ class SafariIE(SafariBaseIE):
webpage, urlh = self._download_webpage_handle(url, video_id)
- mobj = re.match(self._VALID_URL, urlh.geturl())
+ mobj = re.match(self._VALID_URL, urlh.url)
reference_id = mobj.group('reference_id')
if not reference_id:
reference_id = self._search_regex(
diff --git a/hypervideo_dl/extractor/sbs.py b/hypervideo_dl/extractor/sbs.py
index 4532033..7a91150 100644
--- a/hypervideo_dl/extractor/sbs.py
+++ b/hypervideo_dl/extractor/sbs.py
@@ -1,7 +1,13 @@
from .common import InfoExtractor
+from ..networking import HEADRequest
from ..utils import (
- smuggle_url,
- ExtractorError,
+ float_or_none,
+ int_or_none,
+ parse_duration,
+ parse_iso8601,
+ traverse_obj,
+ update_url_query,
+ url_or_none,
)
@@ -11,7 +17,7 @@ class SBSIE(InfoExtractor):
https?://(?:www\.)?sbs\.com\.au/(?:
ondemand(?:
/video/(?:single/)?|
- /movie/[^/]+/|
+ /(?:movie|tv-program)/[^/]+/|
/(?:tv|news)-series/(?:[^/]+/){3}|
.*?\bplay=|/watch/
)|news/(?:embeds/)?video/
@@ -27,18 +33,21 @@ class SBSIE(InfoExtractor):
# Original URL is handled by the generic IE which finds the iframe:
# http://www.sbs.com.au/thefeed/blog/2014/08/21/dingo-conservation
'url': 'http://www.sbs.com.au/ondemand/video/single/320403011771/?source=drupal&vertical=thefeed',
- 'md5': '3150cf278965eeabb5b4cea1c963fe0a',
+ 'md5': '31f84a7a19b53635db63c73f8ab0c4a7',
'info_dict': {
- 'id': '_rFBPRPO4pMR',
+ 'id': '320403011771', # '_rFBPRPO4pMR',
'ext': 'mp4',
'title': 'Dingo Conservation (The Feed)',
'description': 'md5:f250a9856fca50d22dec0b5b8015f8a5',
- 'thumbnail': r're:http://.*\.jpg',
+ 'thumbnail': r're:https?://.*\.jpg',
'duration': 308,
'timestamp': 1408613220,
'upload_date': '20140821',
'uploader': 'SBSC',
+ 'tags': None,
+ 'categories': None,
},
+ 'expected_warnings': ['Unable to download JSON metadata'],
}, {
'url': 'http://www.sbs.com.au/ondemand/video/320403011771/Dingo-Conservation-The-Feed',
'only_matching': True,
@@ -70,34 +79,80 @@ class SBSIE(InfoExtractor):
}, {
'url': 'https://www.sbs.com.au/ondemand/tv-series/the-handmaids-tale/season-5/the-handmaids-tale-s5-ep1/2065631811776',
'only_matching': True,
+ }, {
+ 'url': 'https://www.sbs.com.au/ondemand/tv-program/autun-romes-forgotten-sister/2116212803602',
+ 'only_matching': True,
}]
+ _GEO_COUNTRIES = ['AU']
+ _AUS_TV_PARENTAL_GUIDELINES = {
+ 'P': 0,
+ 'C': 7,
+ 'G': 0,
+ 'PG': 0,
+ 'M': 14,
+ 'MA15+': 15,
+ 'MAV15+': 15,
+ 'R18+': 18,
+ }
+ _PLAYER_API = 'https://www.sbs.com.au/api/v3'
+
def _real_extract(self, url):
video_id = self._match_id(url)
- player_params = self._download_json(
- 'http://www.sbs.com.au/api/video_pdkvars/id/%s?form=json' % video_id, video_id)
+ formats, subtitles = self._extract_smil_formats_and_subtitles(
+ update_url_query(f'{self._PLAYER_API}/video_smil', {'id': video_id}), video_id)
- error = player_params.get('error')
- if error:
- error_message = 'Sorry, The video you are looking for does not exist.'
- video_data = error.get('results') or {}
- error_code = error.get('errorCode')
- if error_code == 'ComingSoon':
- error_message = '%s is not yet available.' % video_data.get('title', '')
- elif error_code in ('Forbidden', 'intranetAccessOnly'):
- error_message = 'Sorry, This video cannot be accessed via this website'
- elif error_code == 'Expired':
- error_message = 'Sorry, %s is no longer available.' % video_data.get('title', '')
- raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message), expected=True)
+ if not formats:
+ urlh = self._request_webpage(
+ HEADRequest('https://sbs-vod-prod-01.akamaized.net/'), video_id,
+ note='Checking geo-restriction', fatal=False, expected_status=403)
+ if urlh:
+ error_reasons = urlh.headers.get_all('x-error-reason') or []
+ if 'geo-blocked' in error_reasons:
+ self.raise_geo_restricted(countries=['AU'])
+ self.raise_no_formats('No formats are available', video_id=video_id)
- urls = player_params['releaseUrls']
- theplatform_url = (urls.get('progressive') or urls.get('html')
- or urls.get('standard') or player_params['relatedItemsURL'])
+ media = traverse_obj(self._download_json(
+ f'{self._PLAYER_API}/video_stream', video_id, fatal=False,
+ query={'id': video_id, 'context': 'tv'}), ('video_object', {dict})) or {}
+
+ media.update(self._download_json(
+ f'https://catalogue.pr.sbsod.com/mpx-media/{video_id}',
+ video_id, fatal=not media) or {})
+
+ # For named episodes, use the catalogue's title to set episode, rather than generic 'Episode N'.
+ if traverse_obj(media, ('partOfSeries', {dict})):
+ media['epName'] = traverse_obj(media, ('title', {str}))
return {
- '_type': 'url_transparent',
- 'ie_key': 'ThePlatform',
'id': video_id,
- 'url': smuggle_url(self._proto_relative_url(theplatform_url), {'force_smil_url': True}),
- 'is_live': player_params.get('streamType') == 'live',
+ **traverse_obj(media, {
+ 'title': ('name', {str}),
+ 'description': ('description', {str}),
+ 'channel': ('taxonomy', 'channel', 'name', {str}),
+ 'series': ((('partOfSeries', 'name'), 'seriesTitle'), {str}),
+ 'series_id': ((('partOfSeries', 'uuid'), 'seriesID'), {str}),
+ 'season_number': ('seasonNumber', {int_or_none}),
+ 'episode': ('epName', {str}),
+ 'episode_number': ('episodeNumber', {int_or_none}),
+ 'timestamp': (('datePublished', ('publication', 'startDate')), {parse_iso8601}),
+ 'release_year': ('releaseYear', {int_or_none}),
+ 'duration': ('duration', ({float_or_none}, {parse_duration})),
+ 'is_live': ('liveStream', {bool}),
+ 'age_limit': (('classificationID', 'contentRating'), {str.upper}, {
+ lambda x: self._AUS_TV_PARENTAL_GUIDELINES.get(x)}), # dict.get is unhashable in py3.7
+ }, get_all=False),
+ **traverse_obj(media, {
+ 'categories': (('genres', ...), ('taxonomy', ('genre', 'subgenre'), 'name'), {str}),
+ 'tags': (('consumerAdviceTexts', ('sbsSubCertification', 'consumerAdvice')), ..., {str}),
+ 'thumbnails': ('thumbnails', lambda _, v: url_or_none(v['contentUrl']), {
+ 'id': ('name', {str}),
+ 'url': 'contentUrl',
+ 'width': ('width', {int_or_none}),
+ 'height': ('height', {int_or_none}),
+ }),
+ }),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'uploader': 'SBSC',
}
diff --git a/hypervideo_dl/extractor/scrippsnetworks.py b/hypervideo_dl/extractor/scrippsnetworks.py
index c3cee6e..adfd7e5 100644
--- a/hypervideo_dl/extractor/scrippsnetworks.py
+++ b/hypervideo_dl/extractor/scrippsnetworks.py
@@ -115,6 +115,7 @@ class ScrippsNetworksIE(InfoExtractor):
'uploader': 'SCNI-SCND',
},
'add_ie': ['ThePlatform'],
+ 'expected_warnings': ['No HLS formats found'],
}, {
'url': 'https://www.diynetwork.com/videos/diy-barnwood-tablet-stand-0265790',
'only_matching': True,
diff --git a/hypervideo_dl/extractor/senalcolombia.py b/hypervideo_dl/extractor/senalcolombia.py
new file mode 100644
index 0000000..f3c066d
--- /dev/null
+++ b/hypervideo_dl/extractor/senalcolombia.py
@@ -0,0 +1,31 @@
+from .common import InfoExtractor
+from .rtvcplay import RTVCKalturaIE
+
+
+class SenalColombiaLiveIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?senalcolombia\.tv/(?P<id>senal-en-vivo)'
+
+ _TESTS = [{
+ 'url': 'https://www.senalcolombia.tv/senal-en-vivo',
+ 'info_dict': {
+ 'id': 'indexSC',
+ 'title': 're:^Señal Colombia',
+ 'description': 'md5:799f16a401d97f40c33a2c6a3e2a507b',
+ 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+ 'live_status': 'is_live',
+ 'ext': 'mp4',
+ },
+ 'params': {
+ 'skip_download': 'Livestream',
+ },
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+
+ hydration = self._search_json(
+ r'<script\b[^>]*data-drupal-selector\s*=\s*"[^"]*drupal-settings-json[^"]*"[^>]*>',
+ webpage, 'hydration', display_id)
+
+ return self.url_result(hydration['envivosrc'], RTVCKalturaIE, display_id)
diff --git a/hypervideo_dl/extractor/servus.py b/hypervideo_dl/extractor/servus.py
index 490d562..dda1958 100644
--- a/hypervideo_dl/extractor/servus.py
+++ b/hypervideo_dl/extractor/servus.py
@@ -1,11 +1,13 @@
from .common import InfoExtractor
from ..utils import (
- determine_ext,
+ ExtractorError,
float_or_none,
+ format_field,
int_or_none,
+ join_nonempty,
+ traverse_obj,
+ unescapeHTML,
unified_timestamp,
- urlencode_postdata,
- url_or_none,
)
@@ -15,32 +17,41 @@ class ServusIE(InfoExtractor):
(?:www\.)?
(?:
servus\.com/(?:(?:at|de)/p/[^/]+|tv/videos)|
- (?:servustv|pm-wissen)\.com/videos
+ (?:servustv|pm-wissen)\.com/(?:[^/]+/)?v(?:ideos)?
)
- /(?P<id>[aA]{2}-\w+|\d+-\d+)
+ /(?P<id>[aA]{2}-?\w+|\d+-\d+)
'''
_TESTS = [{
- # new URL schema
- 'url': 'https://www.servustv.com/videos/aa-1t6vbu5pw1w12/',
- 'md5': '60474d4c21f3eb148838f215c37f02b9',
+ # URL schema v3
+ 'url': 'https://www.servustv.com/natur/v/aa-28bycqnh92111/',
'info_dict': {
- 'id': 'AA-1T6VBU5PW1W12',
+ 'id': 'AA-28BYCQNH92111',
'ext': 'mp4',
- 'title': 'Die Grünen aus Sicht des Volkes',
- 'alt_title': 'Talk im Hangar-7 Voxpops Gruene',
- 'description': 'md5:1247204d85783afe3682644398ff2ec4',
+ 'title': 'Klettersteige in den Alpen',
+ 'description': 'md5:25e47ddd83a009a0f9789ba18f2850ce',
'thumbnail': r're:^https?://.*\.jpg',
- 'duration': 62.442,
- 'timestamp': 1605193976,
- 'upload_date': '20201112',
- 'series': 'Talk im Hangar-7',
- 'season': 'Season 9',
- 'season_number': 9,
- 'episode': 'Episode 31 - September 14',
- 'episode_number': 31,
- }
+ 'duration': 2823,
+ 'timestamp': 1655752333,
+ 'upload_date': '20220620',
+ 'series': 'Bergwelten',
+ 'season': 'Season 11',
+ 'season_number': 11,
+ 'episode': 'Episode 8 - Vie Ferrate – Klettersteige in den Alpen',
+ 'episode_number': 8,
+ },
+ 'params': {'skip_download': 'm3u8'}
+ }, {
+ 'url': 'https://www.servustv.com/natur/v/aa-1xg5xwmgw2112/',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.servustv.com/natur/v/aansszcx3yi9jmlmhdc1/',
+ 'only_matching': True,
}, {
- # old URL schema
+ # URL schema v2
+ 'url': 'https://www.servustv.com/videos/aa-1t6vbu5pw1w12/',
+ 'only_matching': True,
+ }, {
+ # URL schema v1
'url': 'https://www.servus.com/de/p/Die-Gr%C3%BCnen-aus-Sicht-des-Volkes/AA-1T6VBU5PW1W12/',
'only_matching': True,
}, {
@@ -60,85 +71,65 @@ class ServusIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url).upper()
- token = self._download_json(
- 'https://auth.redbullmediahouse.com/token', video_id,
- 'Downloading token', data=urlencode_postdata({
- 'grant_type': 'client_credentials',
- }), headers={
- 'Authorization': 'Basic SVgtMjJYNEhBNFdEM1cxMTpEdDRVSkFLd2ZOMG5IMjB1NGFBWTBmUFpDNlpoQ1EzNA==',
- })
- access_token = token['access_token']
- token_type = token.get('token_type', 'Bearer')
-
video = self._download_json(
- 'https://sparkle-api.liiift.io/api/v1/stv/channels/international/assets/%s' % video_id,
- video_id, 'Downloading video JSON', headers={
- 'Authorization': '%s %s' % (token_type, access_token),
- })
+ 'https://api-player.redbull.com/stv/servus-tv?timeZone=Europe/Berlin',
+ video_id, 'Downloading video JSON', query={'videoId': video_id})
+ if not video.get('videoUrl'):
+ self._report_errors(video)
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(
+ video['videoUrl'], video_id, 'mp4', m3u8_id='hls')
- formats = []
- thumbnail = None
- for resource in video['resources']:
- if not isinstance(resource, dict):
- continue
- format_url = url_or_none(resource.get('url'))
- if not format_url:
- continue
- extension = resource.get('extension')
- type_ = resource.get('type')
- if extension == 'jpg' or type_ == 'reference_keyframe':
- thumbnail = format_url
- continue
- ext = determine_ext(format_url)
- if type_ == 'dash' or ext == 'mpd':
- formats.extend(self._extract_mpd_formats(
- format_url, video_id, mpd_id='dash', fatal=False))
- elif type_ == 'hls' or ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
- format_url, video_id, 'mp4', entry_protocol='m3u8_native',
- m3u8_id='hls', fatal=False))
- elif extension == 'mp4' or ext == 'mp4':
- formats.append({
- 'url': format_url,
- 'format_id': type_,
- 'width': int_or_none(resource.get('width')),
- 'height': int_or_none(resource.get('height')),
- })
-
- attrs = {}
- for attribute in video['attributes']:
- if not isinstance(attribute, dict):
- continue
- key = attribute.get('fieldKey')
- value = attribute.get('fieldValue')
- if not key or not value:
- continue
- attrs[key] = value
-
- title = attrs.get('title_stv') or video_id
- alt_title = attrs.get('title')
- description = attrs.get('long_description') or attrs.get('short_description')
- series = attrs.get('label')
- season = attrs.get('season')
- episode = attrs.get('chapter')
- duration = float_or_none(attrs.get('duration'), scale=1000)
+ season = video.get('season')
season_number = int_or_none(self._search_regex(
r'Season (\d+)', season or '', 'season number', default=None))
+ episode = video.get('chapter')
episode_number = int_or_none(self._search_regex(
r'Episode (\d+)', episode or '', 'episode number', default=None))
return {
'id': video_id,
- 'title': title,
- 'alt_title': alt_title,
- 'description': description,
- 'thumbnail': thumbnail,
- 'duration': duration,
- 'timestamp': unified_timestamp(video.get('lastPublished')),
- 'series': series,
+ 'title': video.get('title'),
+ 'description': self._get_description(video_id) or video.get('description'),
+ 'thumbnail': video.get('poster'),
+ 'duration': float_or_none(video.get('duration')),
+ 'timestamp': unified_timestamp(video.get('currentSunrise')),
+ 'series': video.get('label'),
'season': season,
'season_number': season_number,
'episode': episode,
'episode_number': episode_number,
'formats': formats,
+ 'subtitles': subtitles,
}
+
+ def _get_description(self, video_id):
+ info = self._download_json(
+ f'https://backend.servustv.com/wp-json/rbmh/v2/media_asset/aa_id/{video_id}?fieldset=page',
+ video_id, fatal=False)
+
+ return join_nonempty(*traverse_obj(info, (
+ ('stv_short_description', 'stv_long_description'),
+ {lambda x: unescapeHTML(x.replace('\n\n', '\n'))})), delim='\n\n')
+
+ def _report_errors(self, video):
+ playability_errors = traverse_obj(video, ('playabilityErrors', ...))
+ if not playability_errors:
+ raise ExtractorError('No videoUrl and no information about errors')
+
+ elif 'FSK_BLOCKED' in playability_errors:
+ details = traverse_obj(video, ('playabilityErrorDetails', 'FSK_BLOCKED'), expected_type=dict)
+ message = format_field(''.join((
+ format_field(details, 'minEveningHour', ' from %02d:00'),
+ format_field(details, 'maxMorningHour', ' to %02d:00'),
+ format_field(details, 'minAge', ' (Minimum age %d)'),
+ )), None, 'Only available%s') or 'Blocked by FSK with unknown availability'
+
+ elif 'NOT_YET_AVAILABLE' in playability_errors:
+ message = format_field(
+ video, (('playabilityErrorDetails', 'NOT_YET_AVAILABLE', 'availableFrom'), 'currentSunrise'),
+ 'Only available from %s') or 'Video not yet available with unknown availability'
+
+ else:
+ message = f'Video unavailable: {", ".join(playability_errors)}'
+
+ raise ExtractorError(message, expected=True)
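
`format_field` only renders its template when the field is present, so the FSK message above degrades gracefully as details go missing. A worked example with a hypothetical `playabilityErrorDetails` payload:

    from hypervideo_dl.utils import format_field

    details = {'minEveningHour': 22, 'maxMorningHour': 6, 'minAge': 16}

    message = format_field(''.join((
        format_field(details, 'minEveningHour', ' from %02d:00'),
        format_field(details, 'maxMorningHour', ' to %02d:00'),
        format_field(details, 'minAge', ' (Minimum age %d)'),
    )), None, 'Only available%s') or 'Blocked by FSK with unknown availability'
    # message == 'Only available from 22:00 to 06:00 (Minimum age 16)'
    # With details == {} the joined string is empty, format_field returns ''
    # and the fallback message is used instead.
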
diff --git a/hypervideo_dl/extractor/sevenplus.py b/hypervideo_dl/extractor/sevenplus.py
index 222bf6c..6c688d1 100644
--- a/hypervideo_dl/extractor/sevenplus.py
+++ b/hypervideo_dl/extractor/sevenplus.py
@@ -2,10 +2,8 @@ import json
import re
from .brightcove import BrightcoveNewBaseIE
-from ..compat import (
- compat_HTTPError,
- compat_str,
-)
+from ..compat import compat_str
+from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
try_get,
@@ -97,9 +95,9 @@ class SevenPlusIE(BrightcoveNewBaseIE):
'videoType': 'vod',
}, headers=headers)['media']
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 403:
raise ExtractorError(self._parse_json(
- e.cause.read().decode(), episode_id)[0]['error_code'], expected=True)
+ e.cause.response.read().decode(), episode_id)[0]['error_code'], expected=True)
raise
for source in media.get('sources', {}):
diff --git a/hypervideo_dl/extractor/shahid.py b/hypervideo_dl/extractor/shahid.py
index 26a0bff..d509e88 100644
--- a/hypervideo_dl/extractor/shahid.py
+++ b/hypervideo_dl/extractor/shahid.py
@@ -3,7 +3,7 @@ import math
import re
from .aws import AWSIE
-from ..compat import compat_HTTPError
+from ..networking.exceptions import HTTPError
from ..utils import (
clean_html,
ExtractorError,
@@ -22,7 +22,7 @@ class ShahidBaseIE(AWSIE):
def _handle_error(self, e):
fail_data = self._parse_json(
- e.cause.read().decode('utf-8'), None, fatal=False)
+ e.cause.response.read().decode('utf-8'), None, fatal=False)
if fail_data:
faults = fail_data.get('faults', [])
faults_message = ', '.join([clean_html(fault['userMessage']) for fault in faults if fault.get('userMessage')])
@@ -40,7 +40,7 @@ class ShahidBaseIE(AWSIE):
'secret_key': '4WUUJWuFvtTkXbhaWTDv7MhO+0LqoYDWfEnUXoWn',
}, video_id, query)
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError):
+ if isinstance(e.cause, HTTPError):
self._handle_error(e)
raise
@@ -88,7 +88,7 @@ class ShahidIE(ShahidBaseIE):
'Content-Type': 'application/json; charset=UTF-8',
})['user']
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError):
+ if isinstance(e.cause, HTTPError):
self._handle_error(e)
raise
diff --git a/hypervideo_dl/extractor/shemaroome.py b/hypervideo_dl/extractor/shemaroome.py
index 7a78c6e..ec9938b 100644
--- a/hypervideo_dl/extractor/shemaroome.py
+++ b/hypervideo_dl/extractor/shemaroome.py
@@ -73,7 +73,10 @@ class ShemarooMeIE(InfoExtractor):
key = bytes_to_intlist(compat_b64decode(data_json['key']))
iv = [0] * 16
m3u8_url = unpad_pkcs7(intlist_to_bytes(aes_cbc_decrypt(url_data, key, iv))).decode('ascii')
- formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, fatal=False, headers={'stream_key': data_json['stream_key']})
+ headers = {'stream_key': data_json['stream_key']}
+ formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, fatal=False, headers=headers)
+ for fmt in formats:
+ fmt['http_headers'] = headers
release_date = self._html_search_regex(
(r'itemprop="uploadDate">\s*([\d-]+)', r'id="release_date" value="([\d-]+)'),
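
Passing `headers=` to `_extract_m3u8_formats_and_subtitles` only applies the `stream_key` header to the manifest request; copying it into each format's `http_headers` (as done above) makes the downloader send it for the media segments too. The pattern in isolation, with hypothetical format dicts:

    headers = {'stream_key': 'hypothetical-stream-key'}
    formats = [
        {'format_id': 'hls-720', 'url': 'https://example.com/720/index.m3u8'},
        {'format_id': 'hls-1080', 'url': 'https://example.com/1080/index.m3u8'},
    ]
    for fmt in formats:
        # http_headers on a format are sent with every request made for it
        fmt['http_headers'] = headers
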
diff --git a/hypervideo_dl/extractor/sibnet.py b/hypervideo_dl/extractor/sibnet.py
new file mode 100644
index 0000000..73bb75d
--- /dev/null
+++ b/hypervideo_dl/extractor/sibnet.py
@@ -0,0 +1,17 @@
+from .common import InfoExtractor
+
+
+class SibnetEmbedIE(InfoExtractor):
+ # Ref: https://help.sibnet.ru/?sibnet_video_embed
+ _VALID_URL = False
+ _EMBED_REGEX = [r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//video\.sibnet\.ru/shell\.php\?.*?\bvideoid=\d+.*?)\1']
+ _WEBPAGE_TESTS = [{
+ 'url': 'https://phpbb3.x-tk.ru/bbcode-video-sibnet-t24.html',
+ 'info_dict': {
+ 'id': 'shell', # FIXME?
+ 'ext': 'mp4',
+ 'age_limit': 0,
+ 'thumbnail': 'https://video.sibnet.ru/upload/cover/video_1887072_0.jpg',
+ 'title': 'КВН Москва не сразу строилась - Девушка впервые играет в Mortal Kombat',
+ }
+ }]
diff --git a/hypervideo_dl/extractor/sina.py b/hypervideo_dl/extractor/sina.py
index aeba4e3..9842811 100644
--- a/hypervideo_dl/extractor/sina.py
+++ b/hypervideo_dl/extractor/sina.py
@@ -1,12 +1,12 @@
from .common import InfoExtractor
+from ..networking import HEADRequest
from ..utils import (
- HEADRequest,
ExtractorError,
+ clean_html,
+ get_element_by_attribute,
int_or_none,
- update_url_query,
qualities,
- get_element_by_attribute,
- clean_html,
+ update_url_query,
)
@@ -60,7 +60,7 @@ class SinaIE(InfoExtractor):
self.to_screen('Getting video id')
request = HEADRequest(url)
_, urlh = self._download_webpage_handle(request, 'NA', False)
- return self._real_extract(urlh.geturl())
+ return self._real_extract(urlh.url)
else:
pseudo_id = mobj.group('pseudo_id')
webpage = self._download_webpage(url, pseudo_id)
diff --git a/hypervideo_dl/extractor/sixplay.py b/hypervideo_dl/extractor/sixplay.py
index a6fb6c1..ef93b92 100644
--- a/hypervideo_dl/extractor/sixplay.py
+++ b/hypervideo_dl/extractor/sixplay.py
@@ -79,7 +79,7 @@ class SixPlayIE(InfoExtractor):
headers=self.geo_verification_headers())
if not urlh:
continue
- asset_url = urlh.geturl()
+ asset_url = urlh.url
asset_url = asset_url.replace('_drmnp.ism/', '_unpnp.ism/')
for i in range(3, 0, -1):
                asset_url = asset_url.replace('_sd1/', '_sd%d/' % i)
diff --git a/hypervideo_dl/extractor/slideslive.py b/hypervideo_dl/extractor/slideslive.py
index 9a60a79..25f867a 100644
--- a/hypervideo_dl/extractor/slideslive.py
+++ b/hypervideo_dl/extractor/slideslive.py
@@ -1,103 +1,567 @@
+import re
+import urllib.parse
+
from .common import InfoExtractor
from ..utils import (
- bool_or_none,
+ ExtractorError,
+ int_or_none,
+ parse_qs,
smuggle_url,
- try_get,
+ traverse_obj,
+ unified_timestamp,
+ update_url_query,
url_or_none,
+ xpath_text,
)
class SlidesLiveIE(InfoExtractor):
- _VALID_URL = r'https?://slideslive\.com/(?P<id>[0-9]+)'
- _WORKING = False
+ _VALID_URL = r'https?://slideslive\.com/(?:embed/(?:presentation/)?)?(?P<id>[0-9]+)'
_TESTS = [{
- # video_service_name = YOUTUBE
+ # service_name = yoda, only XML slides info
'url': 'https://slideslive.com/38902413/gcc-ia16-backend',
- 'md5': 'b29fcd6c6952d0c79c5079b0e7a07e6f',
'info_dict': {
- 'id': 'LMtgR8ba0b0',
+ 'id': '38902413',
'ext': 'mp4',
'title': 'GCC IA16 backend',
- 'description': 'Watch full version of this video at https://slideslive.com/38902413.',
- 'uploader': 'SlidesLive Videos - A',
- 'uploader_id': 'UC62SdArr41t_-_fX40QCLRw',
- 'timestamp': 1597615266,
- 'upload_date': '20170925',
- }
+ 'timestamp': 1648189972,
+ 'upload_date': '20220325',
+ 'thumbnail': r're:^https?://.*\.jpg',
+ 'thumbnails': 'count:42',
+ 'chapters': 'count:41',
+ 'duration': 1638,
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
}, {
- # video_service_name = yoda
+ # service_name = yoda, /v7/ slides
'url': 'https://slideslive.com/38935785',
- 'md5': '575cd7a6c0acc6e28422fe76dd4bcb1a',
'info_dict': {
- 'id': 'RMraDYN5ozA_',
+ 'id': '38935785',
'ext': 'mp4',
'title': 'Offline Reinforcement Learning: From Algorithms to Practical Challenges',
+ 'upload_date': '20211115',
+ 'timestamp': 1636996003,
+ 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+ 'thumbnails': 'count:640',
+ 'chapters': 'count:639',
+ 'duration': 9832,
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
},
}, {
- # video_service_name = youtube
+ # service_name = yoda, /v1/ slides
+ 'url': 'https://slideslive.com/38973182/how-should-a-machine-learning-researcher-think-about-ai-ethics',
+ 'info_dict': {
+ 'id': '38973182',
+ 'ext': 'mp4',
+ 'title': 'How Should a Machine Learning Researcher Think About AI Ethics?',
+ 'upload_date': '20220201',
+ 'thumbnail': r're:^https?://.*\.jpg',
+ 'timestamp': 1643728135,
+ 'thumbnails': 'count:3',
+ 'chapters': 'count:2',
+ 'duration': 5889,
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
+ }, {
+ # service_name = youtube, only XML slides info
+ 'url': 'https://slideslive.com/38897546/special-metaprednaska-petra-ludwiga-hodnoty-pro-lepsi-spolecnost',
+ 'md5': '8a79b5e3d700837f40bd2afca3c8fa01',
+ 'info_dict': {
+ 'id': 'jmg02wCJD5M',
+ 'display_id': '38897546',
+ 'ext': 'mp4',
+ 'title': 'SPECIÁL: Meta-přednáška Petra Ludwiga - Hodnoty pro lepší společnost',
+ 'description': 'Watch full version of this video at https://slideslive.com/38897546.',
+ 'channel_url': 'https://www.youtube.com/channel/UCZWdAkNYFncuX0khyvhqnxw',
+ 'channel': 'SlidesLive Videos - G1',
+ 'channel_id': 'UCZWdAkNYFncuX0khyvhqnxw',
+ 'uploader_id': 'UCZWdAkNYFncuX0khyvhqnxw',
+ 'uploader': 'SlidesLive Videos - G1',
+ 'uploader_url': 'http://www.youtube.com/channel/UCZWdAkNYFncuX0khyvhqnxw',
+ 'live_status': 'not_live',
+ 'upload_date': '20160710',
+ 'timestamp': 1618786715,
+ 'duration': 6827,
+ 'like_count': int,
+ 'view_count': int,
+ 'comment_count': int,
+ 'channel_follower_count': int,
+ 'age_limit': 0,
+ 'thumbnail': r're:^https?://.*\.(?:jpg|webp)',
+ 'thumbnails': 'count:169',
+ 'playable_in_embed': True,
+ 'availability': 'unlisted',
+ 'tags': [],
+ 'categories': ['People & Blogs'],
+ 'chapters': 'count:168',
+ },
+ }, {
+ # embed-only presentation, only XML slides info
+ 'url': 'https://slideslive.com/embed/presentation/38925850',
+ 'info_dict': {
+ 'id': '38925850',
+ 'ext': 'mp4',
+ 'title': 'Towards a Deep Network Architecture for Structured Smoothness',
+ 'thumbnail': r're:^https?://.*\.jpg',
+ 'thumbnails': 'count:8',
+ 'timestamp': 1629671508,
+ 'upload_date': '20210822',
+ 'chapters': 'count:7',
+ 'duration': 326,
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
+ }, {
+ # embed-only presentation, only JSON slides info, /v5/ slides (.png)
+ 'url': 'https://slideslive.com/38979920/',
+ 'info_dict': {
+ 'id': '38979920',
+ 'ext': 'mp4',
+ 'title': 'MoReL: Multi-omics Relational Learning',
+ 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+ 'thumbnails': 'count:7',
+ 'timestamp': 1654714970,
+ 'upload_date': '20220608',
+ 'chapters': 'count:6',
+ 'duration': 171,
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
+ }, {
+ # /v2/ slides (.jpg)
+ 'url': 'https://slideslive.com/38954074',
+ 'info_dict': {
+ 'id': '38954074',
+ 'ext': 'mp4',
+ 'title': 'Decentralized Attribution of Generative Models',
+ 'thumbnail': r're:^https?://.*\.jpg',
+ 'thumbnails': 'count:16',
+ 'timestamp': 1622806321,
+ 'upload_date': '20210604',
+ 'chapters': 'count:15',
+ 'duration': 306,
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
+ }, {
+ # /v4/ slides (.png)
+ 'url': 'https://slideslive.com/38979570/',
+ 'info_dict': {
+ 'id': '38979570',
+ 'ext': 'mp4',
+ 'title': 'Efficient Active Search for Combinatorial Optimization Problems',
+ 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+ 'thumbnails': 'count:9',
+ 'timestamp': 1654714896,
+ 'upload_date': '20220608',
+ 'chapters': 'count:8',
+ 'duration': 295,
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
+ }, {
+ # /v10/ slides
+ 'url': 'https://slideslive.com/embed/presentation/38979880?embed_parent_url=https%3A%2F%2Fedit.videoken.com%2F',
+ 'info_dict': {
+ 'id': '38979880',
+ 'ext': 'mp4',
+ 'title': 'The Representation Power of Neural Networks',
+ 'timestamp': 1654714962,
+ 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+ 'thumbnails': 'count:22',
+ 'upload_date': '20220608',
+ 'chapters': 'count:21',
+ 'duration': 294,
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
+ }, {
+ # /v7/ slides, 2 video slides
+ 'url': 'https://slideslive.com/embed/presentation/38979682?embed_container_origin=https%3A%2F%2Fedit.videoken.com',
+ 'playlist_count': 3,
+ 'info_dict': {
+ 'id': '38979682-playlist',
+ 'title': 'LoRA: Low-Rank Adaptation of Large Language Models',
+ },
+ 'playlist': [{
+ 'info_dict': {
+ 'id': '38979682',
+ 'ext': 'mp4',
+ 'title': 'LoRA: Low-Rank Adaptation of Large Language Models',
+ 'timestamp': 1654714920,
+ 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+ 'thumbnails': 'count:30',
+ 'upload_date': '20220608',
+ 'chapters': 'count:31',
+ 'duration': 272,
+ },
+ }, {
+ 'info_dict': {
+ 'id': '38979682-021',
+ 'ext': 'mp4',
+ 'title': 'LoRA: Low-Rank Adaptation of Large Language Models - Slide 021',
+ 'duration': 3,
+ 'timestamp': 1654714920,
+ 'upload_date': '20220608',
+ },
+ }, {
+ 'info_dict': {
+ 'id': '38979682-024',
+ 'ext': 'mp4',
+ 'title': 'LoRA: Low-Rank Adaptation of Large Language Models - Slide 024',
+ 'duration': 4,
+ 'timestamp': 1654714920,
+ 'upload_date': '20220608',
+ },
+ }],
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
+ }, {
+ # /v6/ slides, 1 video slide, edit.videoken.com embed
+ 'url': 'https://slideslive.com/38979481/',
+ 'playlist_count': 2,
+ 'info_dict': {
+ 'id': '38979481-playlist',
+ 'title': 'How to Train Your MAML to Excel in Few-Shot Classification',
+ },
+ 'playlist': [{
+ 'info_dict': {
+ 'id': '38979481',
+ 'ext': 'mp4',
+ 'title': 'How to Train Your MAML to Excel in Few-Shot Classification',
+ 'timestamp': 1654714877,
+ 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+ 'thumbnails': 'count:43',
+ 'upload_date': '20220608',
+ 'chapters': 'count:43',
+ 'duration': 315,
+ },
+ }, {
+ 'info_dict': {
+ 'id': '38979481-013',
+ 'ext': 'mp4',
+ 'title': 'How to Train Your MAML to Excel in Few-Shot Classification - Slide 013',
+ 'duration': 3,
+ 'timestamp': 1654714877,
+ 'upload_date': '20220608',
+ },
+ }],
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
+ }, {
+ # /v3/ slides, .jpg and .png, service_name = youtube
+ 'url': 'https://slideslive.com/embed/38932460/',
+ 'info_dict': {
+ 'id': 'RTPdrgkyTiE',
+ 'display_id': '38932460',
+ 'ext': 'mp4',
+ 'title': 'Active Learning for Hierarchical Multi-Label Classification',
+ 'description': 'Watch full version of this video at https://slideslive.com/38932460.',
+ 'channel': 'SlidesLive Videos - A',
+ 'channel_id': 'UC62SdArr41t_-_fX40QCLRw',
+ 'channel_url': 'https://www.youtube.com/channel/UC62SdArr41t_-_fX40QCLRw',
+ 'uploader': 'SlidesLive Videos - A',
+ 'uploader_id': 'UC62SdArr41t_-_fX40QCLRw',
+ 'uploader_url': 'http://www.youtube.com/channel/UC62SdArr41t_-_fX40QCLRw',
+ 'upload_date': '20200903',
+ 'timestamp': 1602599092,
+ 'duration': 942,
+ 'age_limit': 0,
+ 'live_status': 'not_live',
+ 'playable_in_embed': True,
+ 'availability': 'unlisted',
+ 'categories': ['People & Blogs'],
+ 'tags': [],
+ 'channel_follower_count': int,
+ 'like_count': int,
+ 'view_count': int,
+ 'thumbnail': r're:^https?://.*\.(?:jpg|png|webp)',
+ 'thumbnails': 'count:21',
+ 'chapters': 'count:20',
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
+ }, {
+ # /v3/ slides, .png only, service_name = yoda
+ 'url': 'https://slideslive.com/38983994',
+ 'info_dict': {
+ 'id': '38983994',
+ 'ext': 'mp4',
+ 'title': 'Zero-Shot AutoML with Pretrained Models',
+ 'timestamp': 1662384834,
+ 'upload_date': '20220905',
+ 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+ 'thumbnails': 'count:23',
+ 'chapters': 'count:22',
+ 'duration': 295,
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
+ }, {
+ # service_name = yoda
'url': 'https://slideslive.com/38903721/magic-a-scientific-resurrection-of-an-esoteric-legend',
'only_matching': True,
}, {
- # video_service_name = url
+ # dead link, service_name = url
'url': 'https://slideslive.com/38922070/learning-transferable-skills-1',
'only_matching': True,
}, {
- # video_service_name = vimeo
+ # dead link, service_name = vimeo
'url': 'https://slideslive.com/38921896/retrospectives-a-venue-for-selfreflection-in-ml-research-3',
'only_matching': True,
}]
+ _WEBPAGE_TESTS = [{
+ # only XML slides info
+ 'url': 'https://iclr.cc/virtual_2020/poster_Hklr204Fvr.html',
+ 'info_dict': {
+ 'id': '38925850',
+ 'ext': 'mp4',
+ 'title': 'Towards a Deep Network Architecture for Structured Smoothness',
+ 'thumbnail': r're:^https?://.*\.jpg',
+ 'thumbnails': 'count:8',
+ 'timestamp': 1629671508,
+ 'upload_date': '20210822',
+ 'chapters': 'count:7',
+ 'duration': 326,
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
+ }]
+
+ @classmethod
+ def _extract_embed_urls(cls, url, webpage):
+ # Reference: https://slideslive.com/embed_presentation.js
+ for embed_id in re.findall(r'(?s)new\s+SlidesLiveEmbed\s*\([^)]+\bpresentationId:\s*["\'](\d+)["\']', webpage):
+ url_parsed = urllib.parse.urlparse(url)
+ origin = f'{url_parsed.scheme}://{url_parsed.netloc}'
+ yield update_url_query(
+ f'https://slideslive.com/embed/presentation/{embed_id}', {
+ 'embed_parent_url': url,
+ 'embed_container_origin': origin,
+ })
+
+ def _download_embed_webpage_handle(self, video_id, headers):
+ return self._download_webpage_handle(
+ f'https://slideslive.com/embed/presentation/{video_id}', video_id,
+ headers=headers, query=traverse_obj(headers, {
+ 'embed_parent_url': 'Referer',
+ 'embed_container_origin': 'Origin',
+ }))
+
+ def _extract_custom_m3u8_info(self, m3u8_data):
+ m3u8_dict = {}
+
+ lookup = {
+ 'PRESENTATION-TITLE': 'title',
+ 'PRESENTATION-UPDATED-AT': 'timestamp',
+ 'PRESENTATION-THUMBNAIL': 'thumbnail',
+ 'PLAYLIST-TYPE': 'playlist_type',
+ 'VOD-VIDEO-SERVICE-NAME': 'service_name',
+ 'VOD-VIDEO-ID': 'service_id',
+ 'VOD-VIDEO-SERVERS': 'video_servers',
+ 'VOD-SUBTITLES': 'subtitles',
+ 'VOD-SLIDES-JSON-URL': 'slides_json_url',
+ 'VOD-SLIDES-XML-URL': 'slides_xml_url',
+ }
+
+ for line in m3u8_data.splitlines():
+ if not line.startswith('#EXT-SL-'):
+ continue
+ tag, _, value = line.partition(':')
+ key = lookup.get(tag.lstrip('#EXT-SL-'))
+ if not key:
+ continue
+ m3u8_dict[key] = value
+
+ # Some values are stringified JSON arrays
+ for key in ('video_servers', 'subtitles'):
+ if key in m3u8_dict:
+ m3u8_dict[key] = self._parse_json(m3u8_dict[key], None, fatal=False) or []
+
+ return m3u8_dict
+
+ def _extract_formats_and_duration(self, cdn_hostname, path, video_id, skip_duration=False):
+ formats, duration = [], None
+
+ hls_formats = self._extract_m3u8_formats(
+ f'https://{cdn_hostname}/{path}/master.m3u8',
+ video_id, 'mp4', m3u8_id='hls', fatal=False, live=True)
+ if hls_formats:
+ if not skip_duration:
+ duration = self._extract_m3u8_vod_duration(
+ hls_formats[0]['url'], video_id, note='Extracting duration from HLS manifest')
+ formats.extend(hls_formats)
+
+ dash_formats = self._extract_mpd_formats(
+ f'https://{cdn_hostname}/{path}/master.mpd', video_id, mpd_id='dash', fatal=False)
+ if dash_formats:
+ if not duration and not skip_duration:
+ duration = self._extract_mpd_vod_duration(
+ f'https://{cdn_hostname}/{path}/master.mpd', video_id,
+ note='Extracting duration from DASH manifest')
+ formats.extend(dash_formats)
+
+ return formats, duration
+
def _real_extract(self, url):
video_id = self._match_id(url)
- video_data = self._download_json(
- 'https://ben.slideslive.com/player/' + video_id, video_id)
- service_name = video_data['video_service_name'].lower()
+ webpage, urlh = self._download_embed_webpage_handle(
+ video_id, headers=traverse_obj(parse_qs(url), {
+ 'Referer': ('embed_parent_url', -1),
+ 'Origin': ('embed_container_origin', -1)}))
+ redirect_url = urlh.url
+ if 'domain_not_allowed' in redirect_url:
+ domain = traverse_obj(parse_qs(redirect_url), ('allowed_domains[]', ...), get_all=False)
+ if not domain:
+ raise ExtractorError(
+ 'This is an embed-only presentation. Try passing --referer', expected=True)
+ webpage, _ = self._download_embed_webpage_handle(video_id, headers={
+ 'Referer': f'https://{domain}/',
+ 'Origin': f'https://{domain}',
+ })
+
+ player_token = self._search_regex(r'data-player-token="([^"]+)"', webpage, 'player token')
+ player_data = self._download_webpage(
+ f'https://ben.slideslive.com/player/{video_id}', video_id,
+ note='Downloading player info', query={'player_token': player_token})
+ player_info = self._extract_custom_m3u8_info(player_data)
+
+ service_name = player_info['service_name'].lower()
assert service_name in ('url', 'yoda', 'vimeo', 'youtube')
- service_id = video_data['video_service_id']
+ service_id = player_info['service_id']
+
+ slide_url_template = 'https://slides.slideslive.com/%s/slides/original/%s%s'
+ slides, slides_info = {}, []
+
+ if player_info.get('slides_json_url'):
+ slides = self._download_json(
+ player_info['slides_json_url'], video_id, fatal=False,
+ note='Downloading slides JSON', errnote=False) or {}
+ slide_ext_default = '.png'
+ slide_quality = traverse_obj(slides, ('slide_qualities', 0))
+ if slide_quality:
+ slide_ext_default = '.jpg'
+ slide_url_template = f'https://cdn.slideslive.com/data/presentations/%s/slides/{slide_quality}/%s%s'
+ for slide_id, slide in enumerate(traverse_obj(slides, ('slides', ...), expected_type=dict), 1):
+ slides_info.append((
+ slide_id, traverse_obj(slide, ('image', 'name')),
+ traverse_obj(slide, ('image', 'extname'), default=slide_ext_default),
+ int_or_none(slide.get('time'), scale=1000)))
+
+ if not slides and player_info.get('slides_xml_url'):
+ slides = self._download_xml(
+ player_info['slides_xml_url'], video_id, fatal=False,
+ note='Downloading slides XML', errnote='Failed to download slides info')
+ slide_url_template = 'https://cdn.slideslive.com/data/presentations/%s/slides/big/%s%s'
+ for slide_id, slide in enumerate(slides.findall('./slide') if slides else [], 1):
+ slides_info.append((
+ slide_id, xpath_text(slide, './slideName', 'name'), '.jpg',
+ int_or_none(xpath_text(slide, './timeSec', 'time'))))
+
+ chapters, thumbnails = [], []
+ if url_or_none(player_info.get('thumbnail')):
+ thumbnails.append({'id': 'cover', 'url': player_info['thumbnail']})
+ for slide_id, slide_path, slide_ext, start_time in slides_info:
+ if slide_path:
+ thumbnails.append({
+ 'id': f'{slide_id:03d}',
+ 'url': slide_url_template % (video_id, slide_path, slide_ext),
+ })
+ chapters.append({
+ 'title': f'Slide {slide_id:03d}',
+ 'start_time': start_time,
+ })
+
subtitles = {}
- for sub in try_get(video_data, lambda x: x['subtitles'], list) or []:
- if not isinstance(sub, dict):
- continue
+ for sub in traverse_obj(player_info, ('subtitles', ...), expected_type=dict):
webvtt_url = url_or_none(sub.get('webvtt_url'))
if not webvtt_url:
continue
- lang = sub.get('language') or 'en'
- subtitles.setdefault(lang, []).append({
+ subtitles.setdefault(sub.get('language') or 'en', []).append({
'url': webvtt_url,
+ 'ext': 'vtt',
})
+
info = {
'id': video_id,
- 'thumbnail': video_data.get('thumbnail'),
- 'is_live': bool_or_none(video_data.get('is_live')),
+ 'title': player_info.get('title') or self._html_search_meta('title', webpage, default=''),
+ 'timestamp': unified_timestamp(player_info.get('timestamp')),
+ 'is_live': player_info.get('playlist_type') != 'vod',
+ 'thumbnails': thumbnails,
+ 'chapters': chapters,
'subtitles': subtitles,
}
- if service_name in ('url', 'yoda'):
- info['title'] = video_data['title']
- if service_name == 'url':
- info['url'] = service_id
- else:
- formats = []
- _MANIFEST_PATTERN = 'https://01.cdn.yoda.slideslive.com/%s/master.%s'
- # use `m3u8` entry_protocol until EXT-X-MAP is properly supported by `m3u8_native` entry_protocol
- formats.extend(self._extract_m3u8_formats(
- _MANIFEST_PATTERN % (service_id, 'm3u8'),
- service_id, 'mp4', m3u8_id='hls', fatal=False))
- formats.extend(self._extract_mpd_formats(
- _MANIFEST_PATTERN % (service_id, 'mpd'), service_id,
- mpd_id='dash', fatal=False))
- info.update({
- 'id': service_id,
- 'formats': formats,
- })
+
+ if service_name == 'url':
+ info['url'] = service_id
+ elif service_name == 'yoda':
+ formats, duration = self._extract_formats_and_duration(
+ player_info['video_servers'][0], service_id, video_id)
+ info.update({
+ 'duration': duration,
+ 'formats': formats,
+ })
else:
info.update({
'_type': 'url_transparent',
'url': service_id,
'ie_key': service_name.capitalize(),
- 'title': video_data.get('title'),
+ 'display_id': video_id,
})
if service_name == 'vimeo':
info['url'] = smuggle_url(
- 'https://player.vimeo.com/video/' + service_id,
+ f'https://player.vimeo.com/video/{service_id}',
{'http_headers': {'Referer': url}})
- return info
+
+ video_slides = traverse_obj(slides, ('slides', ..., 'video', 'id'))
+ if not video_slides:
+ return info
+
+ def entries():
+ yield info
+
+ service_data = self._download_json(
+ f'https://ben.slideslive.com/player/{video_id}/slides_video_service_data',
+ video_id, fatal=False, query={
+ 'player_token': player_token,
+ 'videos': ','.join(video_slides),
+ }, note='Downloading video slides info', errnote='Failed to download video slides info') or {}
+
+ for slide_id, slide in enumerate(traverse_obj(slides, ('slides', ...)), 1):
+            if traverse_obj(slide, ('video', 'service')) != 'yoda':
+ continue
+ video_path = traverse_obj(slide, ('video', 'id'))
+ cdn_hostname = traverse_obj(service_data, (
+ video_path, 'video_servers', ...), get_all=False)
+ if not cdn_hostname or not video_path:
+ continue
+ formats, _ = self._extract_formats_and_duration(
+ cdn_hostname, video_path, video_id, skip_duration=True)
+ if not formats:
+ continue
+ yield {
+ 'id': f'{video_id}-{slide_id:03d}',
+ 'title': f'{info["title"]} - Slide {slide_id:03d}',
+ 'timestamp': info['timestamp'],
+ 'duration': int_or_none(traverse_obj(slide, ('video', 'duration_ms')), scale=1000),
+ 'formats': formats,
+ }
+
+ return self.playlist_result(entries(), f'{video_id}-playlist', info['title'])
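
The player endpoint queried above returns an m3u8-like document whose `#EXT-SL-*` tags carry the presentation metadata; `_extract_custom_m3u8_info` maps them into a dict and JSON-decodes the list-valued fields afterwards. A minimal illustration with a hypothetical player response:

    m3u8_data = '\n'.join((
        '#EXTM3U',
        '#EXT-SL-PRESENTATION-TITLE:GCC IA16 backend',
        '#EXT-SL-VOD-VIDEO-SERVICE-NAME:yoda',
        '#EXT-SL-VOD-VIDEO-SERVERS:["01.cdn.yoda.slideslive.com"]',
    ))

    info = {}
    for line in m3u8_data.splitlines():
        if line.startswith('#EXT-SL-'):
            tag, _, value = line.partition(':')
            info[tag[len('#EXT-SL-'):]] = value
    # info['VOD-VIDEO-SERVERS'] is still the JSON string
    # '["01.cdn.yoda.slideslive.com"]' at this point; the extractor
    # decodes such fields with _parse_json.
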
diff --git a/hypervideo_dl/extractor/sonyliv.py b/hypervideo_dl/extractor/sonyliv.py
index aaad420..4379572 100644
--- a/hypervideo_dl/extractor/sonyliv.py
+++ b/hypervideo_dl/extractor/sonyliv.py
@@ -6,10 +6,12 @@ import time
import uuid
from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
int_or_none,
+ jwt_decode_hs256,
+ try_call,
try_get,
)
@@ -77,8 +79,10 @@ class SonyLIVIE(InfoExtractor):
self._HEADERS['device_id'] = self._get_device_id()
self._HEADERS['content-type'] = 'application/json'
- if username.lower() == 'token' and len(password) > 1198:
+ if username.lower() == 'token' and try_call(lambda: jwt_decode_hs256(password)):
self._HEADERS['authorization'] = password
+ self.report_login()
+ return
elif len(username) != 10 or not username.isdigit():
raise ExtractorError(f'Invalid username/password; {self._LOGIN_HINT}')
@@ -119,12 +123,12 @@ class SonyLIVIE(InfoExtractor):
'https://apiv2.sonyliv.com/AGL/%s/A/ENG/WEB/%s' % (version, path),
video_id, headers=self._HEADERS)['resultObj']
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 406 and self._parse_json(
- e.cause.read().decode(), video_id)['message'] == 'Please subscribe to watch this content':
+ if isinstance(e.cause, HTTPError) and e.cause.status == 406 and self._parse_json(
+ e.cause.response.read().decode(), video_id)['message'] == 'Please subscribe to watch this content':
self.raise_login_required(self._LOGIN_HINT, method=None)
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 403:
message = self._parse_json(
- e.cause.read().decode(), video_id)['message']
+ e.cause.response.read().decode(), video_id)['message']
if message == 'Geoblocked Country':
self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
raise ExtractorError(message)
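
The login change above replaces a fragile length check with an actual JWT parse: `jwt_decode_hs256` raises on malformed input, and `try_call` converts that into None, so only a token whose payload decodes is accepted. A small sketch (the token below is a made-up example, not a real credential):

    from hypervideo_dl.utils import jwt_decode_hs256, try_call

    def looks_like_jwt(token):
        return bool(try_call(lambda: jwt_decode_hs256(token)))

    looks_like_jwt('not-a-token')            # False: no header.payload.signature parts
    looks_like_jwt('e30.eyJleHAiOjF9.sig')   # True: payload decodes to {"exp": 1}
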
diff --git a/hypervideo_dl/extractor/soundcloud.py b/hypervideo_dl/extractor/soundcloud.py
index c2344dd..2e6d21a 100644
--- a/hypervideo_dl/extractor/soundcloud.py
+++ b/hypervideo_dl/extractor/soundcloud.py
@@ -7,15 +7,13 @@ from .common import (
InfoExtractor,
SearchInfoExtractor
)
-from ..compat import (
- compat_HTTPError,
- compat_str,
-)
+from ..compat import compat_str
+from ..networking import HEADRequest, Request
+from ..networking.exceptions import HTTPError
from ..utils import (
error_to_compat_str,
ExtractorError,
float_or_none,
- HEADRequest,
int_or_none,
KNOWN_EXTENSIONS,
mimetype2ext,
@@ -26,7 +24,6 @@ from ..utils import (
update_url_query,
url_or_none,
urlhandle_detect_ext,
- sanitized_Request,
)
@@ -103,7 +100,7 @@ class SoundcloudBaseIE(InfoExtractor):
try:
return super()._download_json(*args, **kwargs)
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 403):
+ if isinstance(e.cause, HTTPError) and e.cause.status in (401, 403):
self._store_client_id(None)
self._update_client_id()
continue
@@ -123,7 +120,7 @@ class SoundcloudBaseIE(InfoExtractor):
self._access_token = password
query = self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID
payload = {'session': {'access_token': self._access_token}}
- token_verification = sanitized_Request(self._API_VERIFY_AUTH_TOKEN % query, json.dumps(payload).encode('utf-8'))
+ token_verification = Request(self._API_VERIFY_AUTH_TOKEN % query, json.dumps(payload).encode('utf-8'))
response = self._download_json(token_verification, None, note='Verifying login token...', fatal=False)
if response is not False:
self._HEADERS = {'Authorization': 'OAuth ' + self._access_token}
@@ -212,7 +209,7 @@ class SoundcloudBaseIE(InfoExtractor):
urlh = self._request_webpage(
HEADRequest(redirect_url), track_id, fatal=False)
if urlh:
- format_url = urlh.geturl()
+ format_url = urlh.url
format_urls.add(format_url)
formats.append({
'format_id': 'download',
@@ -669,7 +666,7 @@ class SoundcloudPagedPlaylistBaseIE(SoundcloudBaseIE):
except ExtractorError as e:
# Downloading page may result in intermittent 502 HTTP error
# See https://github.com/hypervideo/hypervideo/issues/872
- if not isinstance(e.cause, compat_HTTPError) or e.cause.code != 502:
+ if not isinstance(e.cause, HTTPError) or e.cause.status != 502:
raise
retry.error = e
continue
@@ -782,6 +779,27 @@ class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE):
'%s (%s)' % (user['username'], resource.capitalize()))
+class SoundcloudUserPermalinkIE(SoundcloudPagedPlaylistBaseIE):
+ _VALID_URL = r'https?://api\.soundcloud\.com/users/(?P<id>\d+)'
+ IE_NAME = 'soundcloud:user:permalink'
+ _TESTS = [{
+ 'url': 'https://api.soundcloud.com/users/30909869',
+ 'info_dict': {
+ 'id': '30909869',
+ 'title': 'neilcic',
+ },
+ 'playlist_mincount': 23,
+ }]
+
+ def _real_extract(self, url):
+ user_id = self._match_id(url)
+ user = self._download_json(
+ self._resolv_url(url), user_id, 'Downloading user info', headers=self._HEADERS)
+
+ return self._extract_playlist(
+ f'{self._API_V2_BASE}stream/users/{user["id"]}', str(user['id']), user.get('username'))
+
+
class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE):
_VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/stations/track/[^/]+/(?P<id>[^/?#&]+)'
IE_NAME = 'soundcloud:trackstation'
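
`SoundcloudBaseIE._download_json` (context above) retries once with a refreshed client_id whenever SoundCloud answers 401 or 403, since a stored anonymous client_id can stop being accepted. The retry skeleton in isolation, with `fetch` and `refresh_client_id` as hypothetical callables:

    from hypervideo_dl.networking.exceptions import HTTPError
    from hypervideo_dl.utils import ExtractorError

    def download_with_retry(fetch, refresh_client_id):
        for attempt in range(2):
            try:
                return fetch()
            except ExtractorError as e:
                # A rejected client_id surfaces as 401/403; refresh and retry once
                if attempt == 0 and isinstance(e.cause, HTTPError) and e.cause.status in (401, 403):
                    refresh_client_id()
                    continue
                raise
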
diff --git a/hypervideo_dl/extractor/spankbang.py b/hypervideo_dl/extractor/spankbang.py
index f242d33..43da34a 100644
--- a/hypervideo_dl/extractor/spankbang.py
+++ b/hypervideo_dl/extractor/spankbang.py
@@ -177,7 +177,6 @@ class SpankBangPlaylistIE(InfoExtractor):
def _real_extract(self, url):
mobj = self._match_valid_url(url)
playlist_id = mobj.group('id')
- display_id = mobj.group('display_id')
webpage = self._download_webpage(
url, playlist_id, headers={'Cookie': 'country=US; mobile=on'})
@@ -186,11 +185,11 @@ class SpankBangPlaylistIE(InfoExtractor):
urljoin(url, mobj.group('path')),
ie=SpankBangIE.ie_key(), video_id=mobj.group('id'))
for mobj in re.finditer(
- r'<a[^>]+\bhref=(["\'])(?P<path>/?[\da-z]+-(?P<id>[\da-z]+)/playlist/%s(?:(?!\1).)*)\1'
- % re.escape(display_id), webpage)]
+ r'<a[^>]+\bhref=(["\'])(?P<path>/?[\da-z]+-(?P<id>[\da-z]+)/playlist/[^"\'](?:(?!\1).)*)\1',
+ webpage)]
title = self._html_search_regex(
- r'<h1>([^<]+)\s+playlist\s*<', webpage, 'playlist title',
+ r'<em>([^<]+)</em>\s+playlist\s*<', webpage, 'playlist title',
fatal=False)
return self.playlist_result(entries, playlist_id, title)
diff --git a/hypervideo_dl/extractor/sportdeutschland.py b/hypervideo_dl/extractor/sportdeutschland.py
index 75074b3..30dbcf3 100644
--- a/hypervideo_dl/extractor/sportdeutschland.py
+++ b/hypervideo_dl/extractor/sportdeutschland.py
@@ -1,95 +1,142 @@
from .common import InfoExtractor
from ..utils import (
- clean_html,
- float_or_none,
- int_or_none,
- parse_iso8601,
- parse_qs,
+ join_nonempty,
strip_or_none,
- try_get,
+ traverse_obj,
+ unified_timestamp,
)
class SportDeutschlandIE(InfoExtractor):
_VALID_URL = r'https?://sportdeutschland\.tv/(?P<id>(?:[^/]+/)?[^?#/&]+)'
_TESTS = [{
- 'url': 'https://sportdeutschland.tv/badminton/re-live-deutsche-meisterschaften-2020-halbfinals?playlistId=0',
+ 'url': 'https://sportdeutschland.tv/blauweissbuchholztanzsport/buchholzer-formationswochenende-2023-samstag-1-bundesliga-landesliga',
'info_dict': {
- 'id': '5318cac0275701382770543d7edaf0a0',
+ 'id': '9839a5c7-0dbb-48a8-ab63-3b408adc7b54',
'ext': 'mp4',
- 'title': 'Re-live: Deutsche Meisterschaften 2020 - Halbfinals - Teil 1',
- 'duration': 16106.36,
- },
- 'params': {
- 'noplaylist': True,
- # m3u8 download
- 'skip_download': True,
- },
+ 'title': 'Buchholzer Formationswochenende 2023 - Samstag - 1. Bundesliga / Landesliga',
+ 'display_id': 'blauweissbuchholztanzsport/buchholzer-formationswochenende-2023-samstag-1-bundesliga-landesliga',
+ 'description': 'md5:a288c794a5ee69e200d8f12982f81a87',
+ 'live_status': 'was_live',
+ 'channel': 'Blau-Weiss Buchholz Tanzsport',
+ 'channel_url': 'https://sportdeutschland.tv/blauweissbuchholztanzsport',
+ 'channel_id': '93ec33c9-48be-43b6-b404-e016b64fdfa3',
+ 'duration': 32447,
+ 'upload_date': '20230114',
+ 'timestamp': 1673733618,
+ }
+ }, {
+ 'url': 'https://sportdeutschland.tv/deutscherbadmintonverband/bwf-tour-1-runde-feld-1-yonex-gainward-german-open-2022-0',
+ 'info_dict': {
+ 'id': '95c80c52-6b9a-4ae9-9197-984145adfced',
+ 'ext': 'mp4',
+ 'title': 'BWF Tour: 1. Runde Feld 1 - YONEX GAINWARD German Open 2022',
+ 'display_id': 'deutscherbadmintonverband/bwf-tour-1-runde-feld-1-yonex-gainward-german-open-2022-0',
+ 'description': 'md5:2afb5996ceb9ac0b2ac81f563d3a883e',
+ 'live_status': 'was_live',
+ 'channel': 'Deutscher Badminton Verband',
+ 'channel_url': 'https://sportdeutschland.tv/deutscherbadmintonverband',
+ 'channel_id': '93ca5866-2551-49fc-8424-6db35af58920',
+ 'duration': 41097,
+ 'upload_date': '20220309',
+ 'timestamp': 1646860727.0,
+ }
}, {
- 'url': 'https://sportdeutschland.tv/badminton/re-live-deutsche-meisterschaften-2020-halbfinals?playlistId=0',
+ 'url': 'https://sportdeutschland.tv/ggcbremen/formationswochenende-latein-2023',
'info_dict': {
- 'id': 'c6e2fdd01f63013854c47054d2ab776f',
- 'title': 'Re-live: Deutsche Meisterschaften 2020 - Halbfinals',
- 'description': 'md5:5263ff4c31c04bb780c9f91130b48530',
- 'duration': 31397,
+ 'id': '9889785e-55b0-4d97-a72a-ce9a9f157cce',
+ 'title': 'Formationswochenende Latein 2023 - Samstag',
+ 'display_id': 'ggcbremen/formationswochenende-latein-2023',
+ 'description': 'md5:6e4060d40ff6a8f8eeb471b51a8f08b2',
+ 'live_status': 'was_live',
+ 'channel': 'Grün-Gold-Club Bremen e.V.',
+ 'channel_id': '9888f04e-bb46-4c7f-be47-df960a4167bb',
+ 'channel_url': 'https://sportdeutschland.tv/ggcbremen',
},
- 'playlist_count': 2,
+ 'playlist_count': 3,
+ 'playlist': [{
+ 'info_dict': {
+ 'id': '988e1fea-9d44-4fab-8c72-3085fb667547',
+ 'ext': 'mp4',
+ 'channel_url': 'https://sportdeutschland.tv/ggcbremen',
+ 'channel_id': '9888f04e-bb46-4c7f-be47-df960a4167bb',
+ 'channel': 'Grün-Gold-Club Bremen e.V.',
+ 'duration': 86,
+ 'title': 'Formationswochenende Latein 2023 - Samstag Part 1',
+ 'upload_date': '20230225',
+ 'timestamp': 1677349909,
+ 'live_status': 'was_live',
+ }
+ }]
}, {
- 'url': 'https://sportdeutschland.tv/freeride-world-tour-2021-fieberbrunn-oesterreich',
- 'only_matching': True,
+ 'url': 'https://sportdeutschland.tv/dtb/gymnastik-international-tag-1',
+ 'info_dict': {
+ 'id': '95d71b8a-370a-4b87-ad16-94680da18528',
+ 'ext': 'mp4',
+ 'title': r're:Gymnastik International - Tag 1 .+',
+ 'display_id': 'dtb/gymnastik-international-tag-1',
+ 'channel_id': '936ecef1-2f4a-4e08-be2f-68073cb7ecab',
+ 'channel': 'Deutscher Turner-Bund',
+ 'channel_url': 'https://sportdeutschland.tv/dtb',
+ 'description': 'md5:07a885dde5838a6f0796ee21dc3b0c52',
+ 'live_status': 'is_live',
+ },
+ 'skip': 'live',
}]
+ def _process_video(self, asset_id, video):
+ is_live = video['type'] == 'mux_live'
+ token = self._download_json(
+ f'https://api.sportdeutschland.tv/api/frontend/asset-token/{asset_id}',
+ video['id'], query={'type': video['type'], 'playback_id': video['src']})['token']
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(
+ f'https://stream.mux.com/{video["src"]}.m3u8?token={token}', video['id'], live=is_live)
+
+ return {
+ 'is_live': is_live,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ **traverse_obj(video, {
+ 'id': 'id',
+ 'duration': ('duration', {lambda x: float(x) > 0 and float(x)}),
+ 'timestamp': ('created_at', {unified_timestamp})
+ }),
+ }
+
def _real_extract(self, url):
display_id = self._match_id(url)
- data = self._download_json(
- 'https://backend.sportdeutschland.tv/api/permalinks/' + display_id,
+ meta = self._download_json(
+ f'https://api.sportdeutschland.tv/api/stateless/frontend/assets/{display_id}',
display_id, query={'access_token': 'true'})
- asset = data['asset']
- title = (asset.get('title') or asset['label']).strip()
- asset_id = asset.get('id') or asset.get('uuid')
+
info = {
- 'id': asset_id,
- 'title': title,
- 'description': clean_html(asset.get('body') or asset.get('description')) or asset.get('teaser'),
- 'duration': int_or_none(asset.get('seconds')),
+ 'display_id': display_id,
+ **traverse_obj(meta, {
+ 'id': (('id', 'uuid'), ),
+ 'title': (('title', 'name'), {strip_or_none}),
+ 'description': 'description',
+ 'channel': ('profile', 'name'),
+ 'channel_id': ('profile', 'id'),
+ 'is_live': 'currently_live',
+ 'was_live': 'was_live',
+ 'channel_url': ('profile', 'slug', {lambda x: f'https://sportdeutschland.tv/{x}'}),
+ }, get_all=False)
}
- videos = asset.get('videos') or []
- if len(videos) > 1:
- playlist_id = parse_qs(url).get('playlistId', [None])[0]
- if not self._yes_playlist(playlist_id, asset_id):
- videos = [videos[int(playlist_id)]]
- def entries():
- for i, video in enumerate(videos, 1):
- video_id = video.get('uuid')
- video_url = video.get('url')
- if not (video_id and video_url):
- continue
- formats = self._extract_m3u8_formats(
- video_url.replace('.smil', '.m3u8'), video_id, 'mp4', fatal=False)
- if not formats and not self.get_param('ignore_no_formats'):
- continue
- yield {
- 'id': video_id,
- 'formats': formats,
- 'title': title + ' - ' + (video.get('label') or 'Teil %d' % i),
- 'duration': float_or_none(video.get('duration')),
- }
- info.update({
- '_type': 'multi_video',
- 'entries': entries(),
- })
- else:
- formats = self._extract_m3u8_formats(
- videos[0]['url'].replace('.smil', '.m3u8'), asset_id, 'mp4')
- section_title = strip_or_none(try_get(data, lambda x: x['section']['title']))
- info.update({
- 'formats': formats,
- 'display_id': asset.get('permalink'),
- 'thumbnail': try_get(asset, lambda x: x['images'][0]),
- 'categories': [section_title] if section_title else None,
- 'view_count': int_or_none(asset.get('views')),
- 'is_live': asset.get('is_live') is True,
- 'timestamp': parse_iso8601(asset.get('date') or asset.get('published_at')),
- })
- return info
+ parts = traverse_obj(meta, (('livestream', ('videos', ...)), ))
+ entries = [{
+ 'title': join_nonempty(info.get('title'), f'Part {i}', delim=' '),
+ **traverse_obj(info, {'channel': 'channel', 'channel_id': 'channel_id',
+ 'channel_url': 'channel_url', 'was_live': 'was_live'}),
+ **self._process_video(info['id'], video),
+ } for i, video in enumerate(parts, 1)]
+
+ return {
+ '_type': 'multi_video',
+ **info,
+ 'entries': entries,
+ } if len(entries) > 1 else {
+ **info,
+ **entries[0],
+ 'title': info.get('title'),
+ }
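
The metadata mapping above leans on traverse_obj's branching: a tuple of alternative keys tries each in turn, {fn} applies a transform, and get_all=False keeps only the first match per field. A minimal sketch with made-up data (not part of the diff):

from hypervideo_dl.utils import traverse_obj

meta = {'uuid': '9889785e', 'name': 'Formationswochenende', 'profile': {'slug': 'ggcbremen'}}
info = traverse_obj(meta, {
    'id': (('id', 'uuid'), ),        # 'id' is missing, so 'uuid' is used
    'title': (('title', 'name'), ),  # falls through to 'name'
    'channel_url': ('profile', 'slug', {lambda x: f'https://sportdeutschland.tv/{x}'}),
}, get_all=False)
assert info == {'id': '9889785e', 'title': 'Formationswochenende',
                'channel_url': 'https://sportdeutschland.tv/ggcbremen'}
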
diff --git a/hypervideo_dl/extractor/stacommu.py b/hypervideo_dl/extractor/stacommu.py
new file mode 100644
index 0000000..6f58f06
--- /dev/null
+++ b/hypervideo_dl/extractor/stacommu.py
@@ -0,0 +1,148 @@
+import time
+
+from .wrestleuniverse import WrestleUniverseBaseIE
+from ..utils import (
+ int_or_none,
+ traverse_obj,
+ url_or_none,
+)
+
+
+class StacommuBaseIE(WrestleUniverseBaseIE):
+ _NETRC_MACHINE = 'stacommu'
+ _API_HOST = 'api.stacommu.jp'
+ _LOGIN_QUERY = {'key': 'AIzaSyCR9czxhH2eWuijEhTNWBZ5MCcOYEUTAhg'}
+ _LOGIN_HEADERS = {
+ 'Accept': '*/*',
+ 'Content-Type': 'application/json',
+ 'X-Client-Version': 'Chrome/JsCore/9.9.4/FirebaseCore-web',
+ 'Referer': 'https://www.stacommu.jp/',
+ 'Origin': 'https://www.stacommu.jp',
+ }
+
+ @WrestleUniverseBaseIE._TOKEN.getter
+ def _TOKEN(self):
+ # transparently refresh the cached token once it has expired
+ if self._REAL_TOKEN and self._TOKEN_EXPIRY <= int(time.time()):
+ self._refresh_token()
+
+ return self._REAL_TOKEN
+
+ def _get_formats(self, data, path, video_id=None):
+ if not traverse_obj(data, path) and not data.get('canWatch') and not self._TOKEN:
+ self.raise_login_required(method='password')
+ return super()._get_formats(data, path, video_id)
+
+ def _extract_hls_key(self, data, path, decrypt):
+ encryption_data = traverse_obj(data, path)
+ if traverse_obj(encryption_data, ('encryptType', {int})) == 0:
+ return None  # encryptType 0 means the stream is not encrypted
+ return traverse_obj(encryption_data, {'key': ('key', {decrypt}), 'iv': ('iv', {decrypt})})
+
+
+class StacommuVODIE(StacommuBaseIE):
+ _VALID_URL = r'https?://www\.stacommu\.jp/videos/episodes/(?P<id>[\da-zA-Z]+)'
+ _TESTS = [{
+ # not encrypted
+ 'url': 'https://www.stacommu.jp/videos/episodes/aXcVKjHyAENEjard61soZZ',
+ 'info_dict': {
+ 'id': 'aXcVKjHyAENEjard61soZZ',
+ 'ext': 'mp4',
+ 'title': 'スタコミュAWARDの裏側、ほぼ全部見せます!〜晴れ舞台の直前ドキドキ編〜',
+ 'description': 'md5:6400275c57ae75c06da36b06f96beb1c',
+ 'timestamp': 1679652000,
+ 'upload_date': '20230324',
+ 'thumbnail': 'https://image.stacommu.jp/6eLobQan8PFtBoU4RL4uGg/6eLobQan8PFtBoU4RL4uGg',
+ 'cast': 'count:11',
+ 'duration': 250,
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
+ }, {
+ # encrypted; requires a premium account
+ 'url': 'https://www.stacommu.jp/videos/episodes/3hybMByUvzMEqndSeu5LpD',
+ 'info_dict': {
+ 'id': '3hybMByUvzMEqndSeu5LpD',
+ 'ext': 'mp4',
+ 'title': 'スタプラフェス2023〜裏側ほぼ全部見せます〜#10',
+ 'description': 'md5:85494488ccf1dfa1934accdeadd7b340',
+ 'timestamp': 1682506800,
+ 'upload_date': '20230426',
+ 'thumbnail': 'https://image.stacommu.jp/eMdXtEefR4kEyJJMpAFi7x/eMdXtEefR4kEyJJMpAFi7x',
+ 'cast': 'count:55',
+ 'duration': 312,
+ 'hls_aes': {
+ 'key': '6bbaf241b8e1fd9f59ecf546a70e4ae7',
+ 'iv': '1fc9002a23166c3bb1d240b953d09de9',
+ },
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
+ }]
+
+ _API_PATH = 'videoEpisodes'
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ video_info = self._download_metadata(
+ url, video_id, 'ja', ('dehydratedState', 'queries', 0, 'state', 'data'))
+ hls_info, decrypt = self._call_encrypted_api(
+ video_id, ':watch', 'stream information', data={'method': 1})
+
+ return {
+ 'id': video_id,
+ 'formats': self._get_formats(hls_info, ('protocolHls', 'url', {url_or_none}), video_id),
+ 'hls_aes': self._extract_hls_key(hls_info, 'protocolHls', decrypt),
+ **traverse_obj(video_info, {
+ 'title': ('displayName', {str}),
+ 'description': ('description', {str}),
+ 'timestamp': ('watchStartTime', {int_or_none}),
+ 'thumbnail': ('keyVisualUrl', {url_or_none}),
+ 'cast': ('casts', ..., 'displayName', {str}),
+ 'duration': ('duration', {int}),
+ }),
+ }
+
+
+class StacommuLiveIE(StacommuBaseIE):
+ _VALID_URL = r'https?://www\.stacommu\.jp/live/(?P<id>[\da-zA-Z]+)'
+ _TESTS = [{
+ 'url': 'https://www.stacommu.jp/live/d2FJ3zLnndegZJCAEzGM3m',
+ 'info_dict': {
+ 'id': 'd2FJ3zLnndegZJCAEzGM3m',
+ 'ext': 'mp4',
+ 'title': '仲村悠菜 2023/05/04',
+ 'timestamp': 1683195647,
+ 'upload_date': '20230504',
+ 'thumbnail': 'https://image.stacommu.jp/pHGF57SPEHE2ke83FS92FN/pHGF57SPEHE2ke83FS92FN',
+ 'duration': 5322,
+ 'hls_aes': {
+ 'key': 'efbb3ec0b8246f61adf1764c5a51213a',
+ 'iv': '80621d19a1f19167b64cedb415b05d1c',
+ },
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
+ }]
+
+ _API_PATH = 'events'
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ video_info = self._call_api(video_id, msg='video information', query={'al': 'ja'}, auth=False)
+ hls_info, decrypt = self._call_encrypted_api(
+ video_id, ':watchArchive', 'stream information', data={'method': 1})
+
+ return {
+ 'id': video_id,
+ 'formats': self._get_formats(hls_info, ('hls', 'urls', ..., {url_or_none}), video_id),
+ 'hls_aes': self._extract_hls_key(hls_info, 'hls', decrypt),
+ **traverse_obj(video_info, {
+ 'title': ('displayName', {str}),
+ 'timestamp': ('startTime', {int_or_none}),
+ 'thumbnail': ('keyVisualUrl', {url_or_none}),
+ 'duration': ('duration', {int_or_none}),
+ }),
+ }
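
The _TOKEN getter above overrides the parent's property so an expired credential is refreshed lazily on first use. A rough, self-contained sketch of that pattern; the class and the refresh body below are stand-ins, not the extractor's actual code:

import time

class TokenCache:
    def __init__(self, ttl=3600):
        self._real_token = None  # set by login, None until then
        self._expiry = 0
        self._ttl = ttl

    def _refresh_token(self):
        # stand-in: the extractor re-authenticates against the API here
        self._real_token = f'token-{int(time.time())}'
        self._expiry = int(time.time()) + self._ttl

    @property
    def token(self):
        # only refresh if we already hold a token and it has expired
        if self._real_token and self._expiry <= int(time.time()):
            self._refresh_token()
        return self._real_token
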
diff --git a/hypervideo_dl/extractor/stageplus.py b/hypervideo_dl/extractor/stageplus.py
new file mode 100644
index 0000000..4bed4d6
--- /dev/null
+++ b/hypervideo_dl/extractor/stageplus.py
@@ -0,0 +1,515 @@
+import json
+import uuid
+
+from .common import InfoExtractor
+from ..utils import (
+ float_or_none,
+ traverse_obj,
+ try_call,
+ unified_timestamp,
+ url_or_none,
+)
+
+
+class StagePlusVODConcertIE(InfoExtractor):
+ _NETRC_MACHINE = 'stageplus'
+ _VALID_URL = r'https?://(?:www\.)?stage-plus\.com/video/(?P<id>vod_concert_\w+)'
+ _TESTS = [{
+ 'url': 'https://www.stage-plus.com/video/vod_concert_APNM8GRFDPHMASJKBSPJACG',
+ 'playlist_count': 6,
+ 'info_dict': {
+ 'id': 'vod_concert_APNM8GRFDPHMASJKBSPJACG',
+ 'title': 'Yuja Wang plays Rachmaninoff\'s Piano Concerto No. 2 – from Odeonsplatz',
+ 'description': 'md5:50f78ec180518c9bdb876bac550996fc',
+ 'artist': ['Yuja Wang', 'Lorenzo Viotti'],
+ 'upload_date': '20230331',
+ 'timestamp': 1680249600,
+ 'release_date': '20210709',
+ 'release_timestamp': 1625788800,
+ 'thumbnails': 'count:3',
+ },
+ 'playlist': [{
+ 'info_dict': {
+ 'id': 'performance_work_A1IN4PJFE9MM2RJ3CLBMUSJBBSOJAD9O',
+ 'ext': 'mp4',
+ 'title': 'Piano Concerto No. 2 in C Minor, Op. 18',
+ 'description': 'md5:50f78ec180518c9bdb876bac550996fc',
+ 'upload_date': '20230331',
+ 'timestamp': 1680249600,
+ 'release_date': '20210709',
+ 'release_timestamp': 1625788800,
+ 'duration': 2207,
+ 'chapters': 'count:5',
+ 'artist': ['Yuja Wang'],
+ 'composer': ['Sergei Rachmaninoff'],
+ 'album': 'Yuja Wang plays Rachmaninoff\'s Piano Concerto No. 2 – from Odeonsplatz',
+ 'album_artist': ['Yuja Wang', 'Lorenzo Viotti'],
+ 'track': 'Piano Concerto No. 2 in C Minor, Op. 18',
+ 'track_number': 1,
+ 'genre': 'Instrumental Concerto',
+ },
+ }],
+ 'params': {'skip_download': 'm3u8'},
+ }]
+
+ # TODO: Prune this after livestream and/or album extractors are added
+ _GRAPHQL_QUERY = '''query videoDetailPage($videoId: ID!, $sliderItemsFirst: Int = 24) {
+ node(id: $videoId) {
+ __typename
+ ...LiveConcertFields
+ ... on LiveConcert {
+ artists {
+ edges {
+ role {
+ ...RoleFields
+ }
+ node {
+ id
+ name
+ sortName
+ }
+ }
+ }
+ isAtmos
+ maxResolution
+ groups {
+ id
+ name
+ typeDisplayName
+ }
+ shortDescription
+ performanceWorks {
+ ...livePerformanceWorkFields
+ }
+ totalDuration
+ sliders {
+ ...contentContainerFields
+ }
+ vodConcert {
+ __typename
+ id
+ }
+ }
+ ...VideoFields
+ ... on Video {
+ artists {
+ edges {
+ role {
+ ...RoleFields
+ }
+ node {
+ id
+ name
+ sortName
+ }
+ }
+ }
+ isAtmos
+ maxResolution
+ isLossless
+ description
+ productionDate
+ takedownDate
+ sliders {
+ ...contentContainerFields
+ }
+ }
+ ...VodConcertFields
+ ... on VodConcert {
+ artists {
+ edges {
+ role {
+ ...RoleFields
+ }
+ node {
+ id
+ name
+ sortName
+ }
+ }
+ }
+ isAtmos
+ maxResolution
+ groups {
+ id
+ name
+ typeDisplayName
+ }
+ performanceWorks {
+ ...PerformanceWorkFields
+ }
+ shortDescription
+ productionDate
+ takedownDate
+ sliders {
+ ...contentContainerFields
+ }
+ }
+ }
+}
+
+fragment LiveConcertFields on LiveConcert {
+ endTime
+ id
+ pictures {
+ ...PictureFields
+ }
+ reruns {
+ ...liveConcertRerunFields
+ }
+ publicationLevel
+ startTime
+ streamStartTime
+ subtitle
+ title
+ typeDisplayName
+ stream {
+ ...liveStreamFields
+ }
+ trailerStream {
+ ...streamFields
+ }
+ geoAccessCountries
+ geoAccessMode
+}
+
+fragment PictureFields on Picture {
+ id
+ url
+ type
+}
+
+fragment liveConcertRerunFields on LiveConcertRerun {
+ streamStartTime
+ endTime
+ startTime
+ stream {
+ ...rerunStreamFields
+ }
+}
+
+fragment rerunStreamFields on RerunStream {
+ publicationLevel
+ streamType
+ url
+}
+
+fragment liveStreamFields on LiveStream {
+ publicationLevel
+ streamType
+ url
+}
+
+fragment streamFields on Stream {
+ publicationLevel
+ streamType
+ url
+}
+
+fragment RoleFields on Role {
+ __typename
+ id
+ type
+ displayName
+}
+
+fragment livePerformanceWorkFields on LivePerformanceWork {
+ __typename
+ id
+ artists {
+ ...artistWithRoleFields
+ }
+ groups {
+ edges {
+ node {
+ id
+ name
+ typeDisplayName
+ }
+ }
+ }
+ work {
+ ...workFields
+ }
+}
+
+fragment artistWithRoleFields on ArtistWithRoleConnection {
+ edges {
+ role {
+ ...RoleFields
+ }
+ node {
+ id
+ name
+ sortName
+ }
+ }
+}
+
+fragment workFields on Work {
+ id
+ title
+ movements {
+ id
+ title
+ }
+ composers {
+ id
+ name
+ }
+ genre {
+ id
+ title
+ }
+}
+
+fragment contentContainerFields on CuratedContentContainer {
+ __typename
+ ...SliderFields
+ ...BannerFields
+}
+
+fragment SliderFields on Slider {
+ id
+ headline
+ items(first: $sliderItemsFirst) {
+ edges {
+ node {
+ id
+ __typename
+ ...AlbumFields
+ ...ArtistFields
+ ...EpochFields
+ ...GenreFields
+ ...GroupFields
+ ...LiveConcertFields
+ ...PartnerFields
+ ...PerformanceWorkFields
+ ...VideoFields
+ ...VodConcertFields
+ }
+ }
+ }
+}
+
+fragment AlbumFields on Album {
+ artistAndGroupDisplayInfo
+ id
+ pictures {
+ ...PictureFields
+ }
+ title
+}
+
+fragment ArtistFields on Artist {
+ id
+ name
+ roles {
+ ...RoleFields
+ }
+ pictures {
+ ...PictureFields
+ }
+}
+
+fragment EpochFields on Epoch {
+ id
+ endYear
+ pictures {
+ ...PictureFields
+ }
+ startYear
+ title
+}
+
+fragment GenreFields on Genre {
+ id
+ pictures {
+ ...PictureFields
+ }
+ title
+}
+
+fragment GroupFields on Group {
+ id
+ name
+ typeDisplayName
+ pictures {
+ ...PictureFields
+ }
+}
+
+fragment PartnerFields on Partner {
+ id
+ name
+ typeDisplayName
+ subtypeDisplayName
+ pictures {
+ ...PictureFields
+ }
+}
+
+fragment PerformanceWorkFields on PerformanceWork {
+ __typename
+ id
+ artists {
+ ...artistWithRoleFields
+ }
+ groups {
+ edges {
+ node {
+ id
+ name
+ typeDisplayName
+ }
+ }
+ }
+ work {
+ ...workFields
+ }
+ stream {
+ ...streamFields
+ }
+ vodConcert {
+ __typename
+ id
+ }
+ duration
+ cuePoints {
+ mark
+ title
+ }
+}
+
+fragment VideoFields on Video {
+ id
+ archiveReleaseDate
+ title
+ subtitle
+ pictures {
+ ...PictureFields
+ }
+ stream {
+ ...streamFields
+ }
+ trailerStream {
+ ...streamFields
+ }
+ duration
+ typeDisplayName
+ geoAccessCountries
+ geoAccessMode
+ publicationLevel
+ takedownDate
+}
+
+fragment VodConcertFields on VodConcert {
+ id
+ archiveReleaseDate
+ pictures {
+ ...PictureFields
+ }
+ subtitle
+ title
+ typeDisplayName
+ totalDuration
+ geoAccessCountries
+ geoAccessMode
+ trailerStream {
+ ...streamFields
+ }
+ publicationLevel
+ takedownDate
+}
+
+fragment BannerFields on Banner {
+ description
+ link
+ pictures {
+ ...PictureFields
+ }
+ title
+}'''
+
+ _TOKEN = None
+
+ def _perform_login(self, username, password):
+ auth = self._download_json('https://audience.api.stageplus.io/oauth/token', None, headers={
+ 'Content-Type': 'application/json',
+ 'Origin': 'https://www.stage-plus.com',
+ }, data=json.dumps({
+ 'grant_type': 'password',
+ 'username': username,
+ 'password': password,
+ 'device_info': 'Chrome (Windows)',
+ 'client_device_id': str(uuid.uuid4()),
+ }, separators=(',', ':')).encode(), note='Logging in')
+
+ if auth.get('access_token'):
+ self._TOKEN = auth['access_token']
+
+ def _real_initialize(self):
+ if self._TOKEN:
+ return
+
+ self._TOKEN = try_call(
+ lambda: self._get_cookies('https://www.stage-plus.com/')['dgplus_access_token'].value)
+ if not self._TOKEN:
+ self.raise_login_required()
+
+ def _real_extract(self, url):
+ concert_id = self._match_id(url)
+
+ data = self._download_json('https://audience.api.stageplus.io/graphql', concert_id, headers={
+ 'authorization': f'Bearer {self._TOKEN}',
+ 'content-type': 'application/json',
+ 'Origin': 'https://www.stage-plus.com',
+ }, data=json.dumps({
+ 'query': self._GRAPHQL_QUERY,
+ 'variables': {'videoId': concert_id},
+ 'operationName': 'videoDetailPage'
+ }, separators=(',', ':')).encode())['data']['node']
+
+ metadata = traverse_obj(data, {
+ 'title': 'title',
+ 'description': ('shortDescription', {str}),
+ 'artist': ('artists', 'edges', ..., 'node', 'name'),
+ 'timestamp': ('archiveReleaseDate', {unified_timestamp}),
+ 'release_timestamp': ('productionDate', {unified_timestamp}),
+ })
+
+ thumbnails = traverse_obj(data, ('pictures', lambda _, v: url_or_none(v['url']), {
+ 'id': 'name',
+ 'url': 'url',
+ })) or None
+
+ entries = []
+ for idx, video in enumerate(traverse_obj(data, (
+ 'performanceWorks', lambda _, v: v['id'] and url_or_none(v['stream']['url']))), 1):
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(
+ video['stream']['url'], video['id'], 'mp4', m3u8_id='hls', query={'token': self._TOKEN})
+ entries.append({
+ 'id': video['id'],
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'album': metadata.get('title'),
+ 'album_artist': metadata.get('artist'),
+ 'track_number': idx,
+ **metadata,
+ **traverse_obj(video, {
+ 'title': ('work', 'title'),
+ 'track': ('work', 'title'),
+ 'duration': ('duration', {float_or_none}),
+ 'chapters': (
+ 'cuePoints', lambda _, v: float_or_none(v['mark']) is not None, {
+ 'title': 'title',
+ 'start_time': ('mark', {float_or_none}),
+ }),
+ 'artist': ('artists', 'edges', ..., 'node', 'name'),
+ 'composer': ('work', 'composers', ..., 'name'),
+ 'genre': ('work', 'genre', 'title'),
+ }),
+ })
+
+ return self.playlist_result(entries, concert_id, thumbnails=thumbnails, **metadata)
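
Both the login and the videoDetailPage calls above follow the same shape: POST compact JSON to the endpoint with a bearer token. A generic sketch of that request pattern (the helper name is invented; endpoint and token are whatever the caller supplies):

import json
import urllib.request

def graphql_query(endpoint, query, variables, token):
    # compact separators match the payload the site's own client sends
    req = urllib.request.Request(endpoint, data=json.dumps({
        'query': query,
        'variables': variables,
    }, separators=(',', ':')).encode(), headers={
        'Authorization': f'Bearer {token}',
        'Content-Type': 'application/json',
    })
    with urllib.request.urlopen(req) as resp:
        return json.load(resp)['data']
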
diff --git a/hypervideo_dl/extractor/stripchat.py b/hypervideo_dl/extractor/stripchat.py
index 4229a0b..b9523c8 100644
--- a/hypervideo_dl/extractor/stripchat.py
+++ b/hypervideo_dl/extractor/stripchat.py
@@ -1,5 +1,10 @@
from .common import InfoExtractor
-from ..utils import ExtractorError, lowercase_escape, traverse_obj
+from ..utils import (
+ ExtractorError,
+ UserNotLive,
+ lowercase_escape,
+ traverse_obj
+)
class StripchatIE(InfoExtractor):
@@ -35,16 +40,15 @@ class StripchatIE(InfoExtractor):
if traverse_obj(data, ('viewCam', 'show'), expected_type=dict):
raise ExtractorError('Model is in private show', expected=True)
elif not traverse_obj(data, ('viewCam', 'model', 'isLive'), expected_type=bool):
- raise ExtractorError('Model is offline', expected=True)
+ raise UserNotLive(video_id=video_id)
- server = traverse_obj(data, ('viewCam', 'viewServers', 'flashphoner-hls'), expected_type=str)
model_id = traverse_obj(data, ('viewCam', 'model', 'id'), expected_type=int)
formats = []
- for host in traverse_obj(data, (
- 'config', 'data', (('featuresV2', 'hlsFallback', 'fallbackDomains', ...), 'hlsStreamHost'))):
+ for host in traverse_obj(data, ('config', 'data', (
+ (('features', 'featuresV2'), 'hlsFallback', 'fallbackDomains', ...), 'hlsStreamHost'))):
formats = self._extract_m3u8_formats(
- f'https://b-{server}.{host}/hls/{model_id}/{model_id}.m3u8',
+ f'https://edge-hls.{host}/hls/{model_id}/master/{model_id}_auto.m3u8',
video_id, ext='mp4', m3u8_id='hls', fatal=False, live=True)
if formats:
break
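
The host loop above branches over both the older 'featuresV2' and the newer 'features' config keys before falling back to the top-level 'hlsStreamHost'. A sketch against a hypothetical config shape (domain names invented):

from hypervideo_dl.utils import traverse_obj

data = {'config': {'data': {
    'features': {'hlsFallback': {'fallbackDomains': ['cdn-a.example.com', 'cdn-b.example.com']}},
    'hlsStreamHost': 'cdn-c.example.com',
}}}
hosts = traverse_obj(data, ('config', 'data', (
    (('features', 'featuresV2'), 'hlsFallback', 'fallbackDomains', ...), 'hlsStreamHost')))
assert hosts == ['cdn-a.example.com', 'cdn-b.example.com', 'cdn-c.example.com']
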
diff --git a/hypervideo_dl/extractor/stv.py b/hypervideo_dl/extractor/stv.py
index c879fb5..8b3e635 100644
--- a/hypervideo_dl/extractor/stv.py
+++ b/hypervideo_dl/extractor/stv.py
@@ -73,6 +73,8 @@ class STVPlayerIE(InfoExtractor):
})
programme = result.get('programme') or {}
+ if programme.get('drmEnabled'):
+ self.report_drm(video_id)
return {
'_type': 'url_transparent',
diff --git a/hypervideo_dl/extractor/substack.py b/hypervideo_dl/extractor/substack.py
index fa38263..3782cee 100644
--- a/hypervideo_dl/extractor/substack.py
+++ b/hypervideo_dl/extractor/substack.py
@@ -2,7 +2,7 @@ import re
import urllib.parse
from .common import InfoExtractor
-from ..utils import str_or_none, traverse_obj
+from ..utils import js_to_json, str_or_none, traverse_obj
class SubstackIE(InfoExtractor):
@@ -14,7 +14,7 @@ class SubstackIE(InfoExtractor):
'id': '47660949',
'ext': 'mp4',
'title': 'I MADE A VLOG',
- 'description': 'md5:10c01ff93439a62e70ce963b2aa0b7f6',
+ 'description': 'md5:9248af9a759321e1027226f988f54d96',
'thumbnail': 'md5:bec758a34d8ee9142d43bcebdf33af18',
'uploader': 'Maybe Baby',
'uploader_id': '33628',
@@ -77,7 +77,9 @@ class SubstackIE(InfoExtractor):
display_id, username = self._match_valid_url(url).group('id', 'username')
webpage = self._download_webpage(url, display_id)
- webpage_info = self._search_json(r'<script[^>]*>\s*window\._preloads\s*=', webpage, 'preloads', display_id)
+ webpage_info = self._parse_json(self._search_json(
+ r'window\._preloads\s*=\s*JSON\.parse\(', webpage, 'json string',
+ display_id, transform_source=js_to_json, contains_pattern=r'"{(?s:.+)}"'), display_id)
post_type = webpage_info['post']['type']
formats, subtitles = [], {}
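
The new Substack extraction decodes twice because the page now embeds the preloads object as a JSON string passed to JSON.parse. A toy illustration of the double parse (the page snippet is invented, and the extractor uses _search_json with js_to_json rather than this string splitting):

import json

page = 'window._preloads = JSON.parse("{\\"post\\": {\\"type\\": \\"video\\"}}")'
raw = page.split('JSON.parse(', 1)[1].rstrip(')')  # crude stand-in for _search_json
inner = json.loads(raw)   # first pass recovers the embedded JSON string
data = json.loads(inner)  # second pass yields the actual object
assert data['post']['type'] == 'video'
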
diff --git a/hypervideo_dl/extractor/sverigesradio.py b/hypervideo_dl/extractor/sverigesradio.py
index 65da615..01a07b3 100644
--- a/hypervideo_dl/extractor/sverigesradio.py
+++ b/hypervideo_dl/extractor/sverigesradio.py
@@ -1,8 +1,13 @@
from .common import InfoExtractor
from ..utils import (
determine_ext,
+ extract_attributes,
+ get_element_by_id,
+ get_element_html_by_class,
int_or_none,
str_or_none,
+ traverse_obj,
+ url_or_none,
)
@@ -21,7 +26,15 @@ class SverigesRadioBaseIE(InfoExtractor):
}
def _real_extract(self, url):
- audio_id = self._match_id(url)
+ audio_id, display_id = self._match_valid_url(url).group('id', 'slug')
+ if not audio_id:
+ webpage = self._download_webpage(url, display_id)
+ audio_id = (
+ traverse_obj(
+ get_element_html_by_class('audio-button', webpage),
+ ({extract_attributes}, ('data-audio-id', 'data-publication-id')), get_all=False)
+ or self._parse_json(get_element_by_id('gtm-metadata', webpage), display_id)['pageId'])
+
query = {
'id': audio_id,
'type': self._AUDIO_TYPE,
@@ -30,7 +43,6 @@ class SverigesRadioBaseIE(InfoExtractor):
item = self._download_json(
self._BASE_URL + 'audiometadata', audio_id,
'Downloading audio JSON metadata', query=query)['items'][0]
- title = item['subtitle']
query['format'] = 'iis'
urls = []
@@ -61,18 +73,20 @@ class SverigesRadioBaseIE(InfoExtractor):
return {
'id': audio_id,
- 'title': title,
'formats': formats,
- 'series': item.get('title'),
- 'duration': int_or_none(item.get('duration')),
- 'thumbnail': item.get('displayimageurl'),
- 'description': item.get('description'),
+ **traverse_obj(item, {
+ 'title': 'subtitle',
+ 'series': 'title',
+ 'duration': ('duration', {int_or_none}),
+ 'thumbnail': ('displayimageurl', {url_or_none}),
+ 'description': 'description',
+ }),
}
class SverigesRadioPublicationIE(SverigesRadioBaseIE):
IE_NAME = 'sverigesradio:publication'
- _VALID_URL = r'https?://(?:www\.)?sverigesradio\.se/sida/(?:artikel|gruppsida)\.aspx\?.*?\bartikel=(?P<id>[0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?sverigesradio\.se/(?:sida/)?(?:artikel|gruppsida)(?:\.aspx\?.*?\bartikel=(?P<id>[0-9]+)|/(?P<slug>[\w-]+))'
_TESTS = [{
'url': 'https://sverigesradio.se/sida/artikel.aspx?programid=83&artikel=7038546',
'md5': '6a4917e1923fccb080e5a206a5afa542',
@@ -86,6 +100,18 @@ class SverigesRadioPublicationIE(SverigesRadioBaseIE):
'thumbnail': r're:^https?://.*\.jpg',
},
}, {
+ 'url': 'https://sverigesradio.se/artikel/tysk-fotbollsfeber-bayern-munchens-10-ariga-segersvit-kan-brytas',
+ 'md5': 'f8a914ad50f491bb74eed403ab4bfef6',
+ 'info_dict': {
+ 'id': '8360345',
+ 'ext': 'm4a',
+ 'title': 'Tysk fotbollsfeber när Bayern Münchens 10-åriga segersvit kan brytas',
+ 'series': 'Radiosporten',
+ 'description': 'md5:5254610e20ce527ecb3a6102a06dcc5f',
+ 'duration': 72,
+ 'thumbnail': r're:^https?://.*\.jpg',
+ },
+ }, {
'url': 'https://sverigesradio.se/sida/gruppsida.aspx?programid=3304&grupp=6247&artikel=7146887',
'only_matching': True,
}]
@@ -94,8 +120,8 @@ class SverigesRadioPublicationIE(SverigesRadioBaseIE):
class SverigesRadioEpisodeIE(SverigesRadioBaseIE):
IE_NAME = 'sverigesradio:episode'
- _VALID_URL = r'https?://(?:www\.)?sverigesradio\.se/(?:sida/)?avsnitt/(?P<id>[0-9]+)'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?sverigesradio\.se/(?:sida/)?avsnitt/(?:(?P<id>\d+)|(?P<slug>[\w-]+))(?:$|[#?])'
+ _TESTS = [{
'url': 'https://sverigesradio.se/avsnitt/1140922?programid=1300',
'md5': '20dc4d8db24228f846be390b0c59a07c',
'info_dict': {
@@ -106,6 +132,18 @@ class SverigesRadioEpisodeIE(SverigesRadioBaseIE):
'title': 'Metoo och valen',
'description': 'md5:fcb5c1f667f00badcc702b196f10a27e',
'thumbnail': r're:^https?://.*\.jpg',
- }
- }
+ },
+ }, {
+ 'url': 'https://sverigesradio.se/avsnitt/p4-live-med-first-aid-kit-scandinavium-mars-2023',
+ 'md5': 'ce17fb82520a8033dbb846993d5589fe',
+ 'info_dict': {
+ 'id': '2160416',
+ 'ext': 'm4a',
+ 'title': 'P4 Live med First Aid Kit',
+ 'description': 'md5:6d5b78eed3d2b65f6de04daa45e9285d',
+ 'thumbnail': r're:^https?://.*\.jpg',
+ 'series': 'P4 Live',
+ 'duration': 5640,
+ },
+ }]
_AUDIO_TYPE = 'episode'
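
For slug URLs the audio id now comes from the page markup rather than the URL. A sketch with a hypothetical page snippet showing how the attribute lookup above resolves:

from hypervideo_dl.utils import extract_attributes, get_element_html_by_class, traverse_obj

webpage = '<button class="audio-button" data-audio-id="8360345" data-audio-type="publication"></button>'
audio_id = traverse_obj(
    get_element_html_by_class('audio-button', webpage),
    ({extract_attributes}, ('data-audio-id', 'data-publication-id')), get_all=False)
assert audio_id == '8360345'
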
diff --git a/hypervideo_dl/extractor/svt.py b/hypervideo_dl/extractor/svt.py
index 31bf7f9..18da875 100644
--- a/hypervideo_dl/extractor/svt.py
+++ b/hypervideo_dl/extractor/svt.py
@@ -1,3 +1,4 @@
+import json
import re
from .common import InfoExtractor
@@ -6,10 +7,11 @@ from ..utils import (
determine_ext,
dict_get,
int_or_none,
- unified_timestamp,
str_or_none,
strip_or_none,
+ traverse_obj,
try_get,
+ unified_timestamp,
)
@@ -163,10 +165,46 @@ class SVTPlayIE(SVTPlayBaseIE):
},
},
'params': {
- # skip for now due to download test asserts that segment is > 10000 bytes and svt uses
- # init segments that are smaller
- # AssertionError: Expected test_SVTPlay_jNwpV9P.mp4 to be at least 9.77KiB, but it's only 864.00B
- 'skip_download': True,
+ 'skip_download': 'm3u8',
+ },
+ 'skip': 'Episode is no longer available',
+ }, {
+ 'url': 'https://www.svtplay.se/video/emBxBQj',
+ 'md5': '2382036fd6f8c994856c323fe51c426e',
+ 'info_dict': {
+ 'id': 'eyBd9aj',
+ 'ext': 'mp4',
+ 'title': '1. Farlig kryssning',
+ 'timestamp': 1491019200,
+ 'upload_date': '20170401',
+ 'duration': 2566,
+ 'thumbnail': r're:^https?://(?:.*[\.-]jpg|www.svtstatic.se/image/.*)$',
+ 'age_limit': 0,
+ 'episode': '1. Farlig kryssning',
+ 'series': 'Rederiet',
+ 'subtitles': {
+ 'sv': 'count:3'
+ },
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
+ }, {
+ 'url': 'https://www.svtplay.se/video/jz2rYz7/anders-hansen-moter/james-fallon?info=visa',
+ 'info_dict': {
+ 'id': 'jvXAGVb',
+ 'ext': 'mp4',
+ 'title': 'James Fallon',
+ 'timestamp': 1673917200,
+ 'upload_date': '20230117',
+ 'duration': 1081,
+ 'thumbnail': r're:^https?://(?:.*[\.-]jpg|www.svtstatic.se/image/.*)$',
+ 'age_limit': 0,
+ 'episode': 'James Fallon',
+ 'series': 'Anders Hansen möter...',
+ },
+ 'params': {
+ 'skip_download': 'dash',
},
}, {
'url': 'https://www.svtplay.se/video/30479064/husdrommar/husdrommar-sasong-8-designdrommar-i-stenungsund?modalId=8zVbDPA',
@@ -248,14 +286,15 @@ class SVTPlayIE(SVTPlayBaseIE):
compat_str)
if not svt_id:
+ nextjs_data = self._search_nextjs_data(webpage, video_id, fatal=False)
+ svt_id = traverse_obj(nextjs_data, (
+ 'props', 'urqlState', ..., 'data', {json.loads}, 'detailsPageByPath',
+ 'video', 'svtId', {str}), get_all=False)
+
+ if not svt_id:
svt_id = self._search_regex(
(r'<video[^>]+data-video-id=["\']([\da-zA-Z-]+)',
- r'<[^>]+\bdata-rt=["\']top-area-play-button["\'][^>]+\bhref=["\'][^"\']*video/%s/[^"\']*\b(?:modalId|id)=([\da-zA-Z-]+)' % re.escape(video_id),
- r'["\']videoSvtId["\']\s*:\s*["\']([\da-zA-Z-]+)',
- r'["\']videoSvtId\\?["\']\s*:\s*\\?["\']([\da-zA-Z-]+)',
- r'"content"\s*:\s*{.*?"id"\s*:\s*"([\da-zA-Z-]+)"',
- r'["\']svtId["\']\s*:\s*["\']([\da-zA-Z-]+)',
- r'["\']svtId\\?["\']\s*:\s*\\?["\']([\da-zA-Z-]+)'),
+ r'<[^>]+\bdata-rt=["\']top-area-play-button["\'][^>]+\bhref=["\'][^"\']*video/[\w-]+/[^"\']*\b(?:modalId|id)=([\w-]+)'),
webpage, 'video id')
info_dict = self._extract_by_video_id(svt_id, webpage)
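
The new fallback walks the Next.js urqlState cache, where each query result is stored as a JSON string and must be decoded mid-traversal. A sketch with a hypothetical cache shape:

import json
from hypervideo_dl.utils import traverse_obj

nextjs_data = {'props': {'urqlState': {
    'abc123': {'data': json.dumps({'detailsPageByPath': {'video': {'svtId': 'jvXAGVb'}}})},
}}}
svt_id = traverse_obj(nextjs_data, (
    'props', 'urqlState', ..., 'data', {json.loads}, 'detailsPageByPath',
    'video', 'svtId', {str}), get_all=False)
assert svt_id == 'jvXAGVb'
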
diff --git a/hypervideo_dl/extractor/tagesschau.py b/hypervideo_dl/extractor/tagesschau.py
index ea0532c..e23b490 100644
--- a/hypervideo_dl/extractor/tagesschau.py
+++ b/hypervideo_dl/extractor/tagesschau.py
@@ -2,10 +2,12 @@ import re
from .common import InfoExtractor
from ..utils import (
- js_to_json,
+ UnsupportedError,
extract_attributes,
- try_get,
int_or_none,
+ js_to_json,
+ parse_iso8601,
+ try_get,
)
@@ -14,36 +16,38 @@ class TagesschauIE(InfoExtractor):
_TESTS = [{
'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html',
- 'md5': '7a7287612fa881a1ae1d087df45c2fd6',
+ 'md5': 'ccb9359bf8c4795836e43759f3408a93',
'info_dict': {
'id': 'video-102143-1',
'ext': 'mp4',
'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt',
+ 'duration': 138,
},
}, {
'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html',
- 'md5': '3c54c1f6243d279b706bde660ceec633',
+ 'md5': '5c15e8f3da049e48829ec9786d835536',
'info_dict': {
'id': 'ts-5727-1',
'ext': 'mp4',
'title': 'Ganze Sendung',
+ 'duration': 932,
},
}, {
# exclusive audio
'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417.html',
- 'md5': '4cf22023c285f35e99c24d290ba58cc9',
+ 'md5': '4bff8f23504df56a0d86ed312d654182',
'info_dict': {
'id': 'audio-29417-1',
'ext': 'mp3',
- 'title': 'Brasilianischer Präsident Bolsonaro unter Druck: Corona-Bericht wird vorgestellt',
+ 'title': 'EU-Gipfel: Im Verbrennerstreit hat Deutschland maximalen Schaden angerichtet',
},
}, {
'url': 'http://www.tagesschau.de/inland/bnd-303.html',
- 'md5': '12cfb212d9325b5ba0d52b625f1aa61c',
+ 'md5': 'f049fa1698d7564e9ca4c3325108f034',
'info_dict': {
'id': 'bnd-303-1',
- 'ext': 'mp4',
- 'title': 'SPD-Gruppenbild mit Bärbel Bas nach der Fraktionssitzung | dpa',
+ 'ext': 'mp3',
+ 'title': 'Das Siegel des Bundesnachrichtendienstes | dpa',
},
}, {
'url': 'http://www.tagesschau.de/inland/afd-parteitag-135.html',
@@ -51,13 +55,24 @@ class TagesschauIE(InfoExtractor):
'id': 'afd-parteitag-135',
'title': 'AfD',
},
- 'playlist_count': 20,
+ 'playlist_mincount': 15,
}, {
'url': 'https://www.tagesschau.de/multimedia/audio/audio-29417~player.html',
'info_dict': {
'id': 'audio-29417-1',
'ext': 'mp3',
- 'title': 'Brasilianischer Präsident Bolsonaro unter Druck: Corona-Bericht wird vorgestellt',
+ 'title': 'EU-Gipfel: Im Verbrennerstreit hat Deutschland maximalen Schaden angerichtet',
+ },
+ }, {
+ 'url': 'https://www.tagesschau.de/multimedia/audio/podcast-11km-327.html',
+ 'info_dict': {
+ 'id': 'podcast-11km-327',
+ 'ext': 'mp3',
+ 'title': 'Gewalt in der Kita – Wenn Erzieher:innen schweigen',
+ 'upload_date': '20230322',
+ 'timestamp': 1679482808,
+ 'thumbnail': 'https://www.tagesschau.de/multimedia/audio/podcast-11km-329~_v-original.jpg',
+ 'description': 'md5:dad059931fe4b3693e3656e93a249848',
},
}, {
'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html',
@@ -117,7 +132,7 @@ class TagesschauIE(InfoExtractor):
formats = []
if media_url.endswith('master.m3u8'):
formats = self._extract_m3u8_formats(media_url, video_id, 'mp4', m3u8_id='hls')
- elif media_url.endswith('.hi.mp3') and media_url.startswith('https://download'):
+ elif media_url.endswith('.mp3'):
formats = [{
'url': media_url,
'vcodec': 'none',
@@ -130,20 +145,19 @@ class TagesschauIE(InfoExtractor):
'duration': int_or_none(try_get(video, lambda x: x['mc']['_duration'])),
'formats': formats
})
+
+ if not entries:
+ raise UnsupportedError(url)
+
if len(entries) > 1:
return self.playlist_result(entries, display_id, title)
- formats = entries[0]['formats']
- video_info = self._search_json_ld(webpage, video_id)
- description = video_info.get('description')
- thumbnail = self._og_search_thumbnail(webpage) or video_info.get('thumbnail')
- timestamp = video_info.get('timestamp')
- title = title or video_info.get('description')
return {
'id': display_id,
'title': title,
- 'thumbnail': thumbnail,
- 'formats': formats,
- 'timestamp': timestamp,
- 'description': description,
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'formats': entries[0]['formats'],
+ 'timestamp': parse_iso8601(self._html_search_meta('date', webpage)),
+ 'description': self._og_search_description(webpage),
+ 'duration': entries[0]['duration'],
}
diff --git a/hypervideo_dl/extractor/tbsjp.py b/hypervideo_dl/extractor/tbsjp.py
new file mode 100644
index 0000000..77ddeca
--- /dev/null
+++ b/hypervideo_dl/extractor/tbsjp.py
@@ -0,0 +1,152 @@
+from .common import InfoExtractor
+from ..networking.exceptions import HTTPError
+from ..utils import (
+ ExtractorError,
+ clean_html,
+ get_element_text_and_html_by_tag,
+ int_or_none,
+ str_or_none,
+ traverse_obj,
+ try_call,
+ unified_timestamp,
+ urljoin,
+)
+
+
+class TBSJPEpisodeIE(InfoExtractor):
+ _VALID_URL = r'https?://cu\.tbs\.co\.jp/episode/(?P<id>[\d_]+)'
+ _GEO_BYPASS = False
+ _TESTS = [{
+ 'url': 'https://cu.tbs.co.jp/episode/23613_2044134_1000049010',
+ 'skip': 'streams geo-restricted, Japan only. Also, will likely expire eventually',
+ 'info_dict': {
+ 'title': 'VIVANT 第三話 誤送金完結へ!絶体絶命の反撃開始',
+ 'id': '23613_2044134_1000049010',
+ 'ext': 'mp4',
+ 'upload_date': '20230728',
+ 'duration': 3517,
+ 'release_timestamp': 1691118230,
+ 'episode': '第三話 誤送金完結へ!絶体絶命の反撃開始',
+ 'release_date': '20230804',
+ 'categories': 'count:11',
+ 'episode_number': 3,
+ 'timestamp': 1690522538,
+ 'description': 'md5:2b796341af1ef772034133174ba4a895',
+ 'series': 'VIVANT',
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ meta = self._search_json(r'window\.app\s*=', webpage, 'episode info', video_id, fatal=False)
+ episode = traverse_obj(meta, ('falcorCache', 'catalog', 'episode', video_id, 'value'))
+
+ tf_path = self._search_regex(
+ r'<script[^>]+src=["\'](/assets/tf\.[^"\']+\.js)["\']', webpage, 'stream API config')
+ tf_js = self._download_webpage(urljoin(url, tf_path), video_id, note='Downloading stream API config')
+ video_url = self._search_regex(r'videoPlaybackUrl:\s*[\'"]([^\'"]+)[\'"]', tf_js, 'stream API url')
+ api_key = self._search_regex(r'api_key:\s*[\'"]([^\'"]+)[\'"]', tf_js, 'stream API key')
+
+ try:
+ source_meta = self._download_json(f'{video_url}ref:{video_id}', video_id,
+ headers={'X-Streaks-Api-Key': api_key},
+ note='Downloading stream metadata')
+ except ExtractorError as e:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 403:
+ self.raise_geo_restricted(countries=['JP'])
+ raise
+
+ formats, subtitles = [], {}
+ for src in traverse_obj(source_meta, ('sources', ..., 'src')):
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(src, video_id, fatal=False)
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=subtitles)
+
+ return {
+ 'title': try_call(lambda: clean_html(get_element_text_and_html_by_tag('h3', webpage)[0])),
+ 'id': video_id,
+ **traverse_obj(episode, {
+ 'categories': ('keywords', {list}),
+ 'id': ('content_id', {str}),
+ 'description': ('description', 0, 'value'),
+ 'timestamp': ('created_at', {unified_timestamp}),
+ 'release_timestamp': ('pub_date', {unified_timestamp}),
+ 'duration': ('tv_episode_info', 'duration', {int_or_none}),
+ 'episode_number': ('tv_episode_info', 'episode_number', {int_or_none}),
+ 'episode': ('title', lambda _, v: not v.get('is_phonetic'), 'value'),  # skip phonetic reading variants
+ 'series': ('custom_data', 'program_name'),
+ }, get_all=False),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
+
+
+class TBSJPProgramIE(InfoExtractor):
+ _VALID_URL = r'https?://cu\.tbs\.co\.jp/program/(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://cu.tbs.co.jp/program/23601',
+ 'playlist_mincount': 4,
+ 'info_dict': {
+ 'id': '23601',
+ 'categories': ['エンタメ', 'ミライカプセル', '会社', '働く', 'バラエティ', '動画'],
+ 'description': '幼少期の夢は大人になって、どう成長したのだろうか?\nそしてその夢は今後、どのように広がっていくのか?\nいま話題の会社で働く人の「夢の成長」を描く',
+ 'series': 'ミライカプセル -I have a dream-',
+ 'title': 'ミライカプセル -I have a dream-'
+ }
+ }]
+
+ def _real_extract(self, url):
+ programme_id = self._match_id(url)
+ webpage = self._download_webpage(url, programme_id)
+ meta = self._search_json(r'window\.app\s*=', webpage, 'programme info', programme_id)
+
+ programme = traverse_obj(meta, ('falcorCache', 'catalog', 'program', programme_id, 'false', 'value'))
+
+ return {
+ '_type': 'playlist',
+ 'entries': [self.url_result(f'https://cu.tbs.co.jp/episode/{video_id}', TBSJPEpisodeIE, video_id)
+ for video_id in traverse_obj(programme, ('custom_data', 'seriesList', 'episodeCode', ...))],
+ 'id': programme_id,
+ **traverse_obj(programme, {
+ 'categories': ('keywords', ...),
+ 'id': ('tv_episode_info', 'show_content_id', {str_or_none}),
+ 'description': ('custom_data', 'program_description'),
+ 'series': ('custom_data', 'program_name'),
+ 'title': ('custom_data', 'program_name'),
+ }),
+ }
+
+
+class TBSJPPlaylistIE(InfoExtractor):
+ _VALID_URL = r'https?://cu\.tbs\.co\.jp/playlist/(?P<id>[\da-f]+)'
+ _TESTS = [{
+ 'url': 'https://cu.tbs.co.jp/playlist/184f9970e7ba48e4915f1b252c55015e',
+ 'playlist_mincount': 4,
+ 'info_dict': {
+ 'title': 'まもなく配信終了',
+ 'id': '184f9970e7ba48e4915f1b252c55015e',
+ }
+ }]
+
+ def _real_extract(self, url):
+ playlist_id = self._match_id(url)
+ page = self._download_webpage(url, playlist_id)
+ meta = self._search_json(r'window\.app\s*=', page, 'playlist info', playlist_id)
+ playlist = traverse_obj(meta, ('falcorCache', 'playList', playlist_id))
+
+ def entries():
+ for entry in traverse_obj(playlist, ('catalogs', 'value', lambda _, v: v['content_id'])):
+ # TODO: it's likely possible to get all metadata from the playlist page json instead
+ content_id = entry['content_id']
+ content_type = entry.get('content_type')
+ if content_type == 'tv_show':
+ yield self.url_result(
+ f'https://cu.tbs.co.jp/program/{content_id}', TBSJPProgramIE, content_id)
+ elif content_type == 'tv_episode':
+ yield self.url_result(
+ f'https://cu.tbs.co.jp/episode/{content_id}', TBSJPEpisodeIE, content_id)
+ else:
+ self.report_warning(f'Skipping "{content_id}" with unsupported content_type "{content_type}"')
+
+ return self.playlist_result(entries(), playlist_id, traverse_obj(playlist, ('display_name', 'value')))
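
All three TBS extractors read from the page's falcorCache, where each record's payload sits under a 'value' key and titles carry phonetic variants. A sketch with an invented, simplified cache showing how the episode title filter above picks the display form:

from hypervideo_dl.utils import traverse_obj

falcor_cache = {'catalog': {'episode': {'23613': {'value': {'title': [
    {'is_phonetic': True, 'value': 'ヴィヴァン'},
    {'is_phonetic': False, 'value': 'VIVANT'},
]}}}}}
episode = traverse_obj(falcor_cache, ('catalog', 'episode', '23613', 'value'))
title = traverse_obj(episode, ('title', lambda _, v: not v.get('is_phonetic'), 'value'), get_all=False)
assert title == 'VIVANT'
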
diff --git a/hypervideo_dl/extractor/teachable.py b/hypervideo_dl/extractor/teachable.py
index c212a49..01906bd 100644
--- a/hypervideo_dl/extractor/teachable.py
+++ b/hypervideo_dl/extractor/teachable.py
@@ -56,7 +56,7 @@ class TeachableBaseIE(InfoExtractor):
self._logged_in = True
return
- login_url = urlh.geturl()
+ login_url = urlh.url
login_form = self._hidden_inputs(login_page)
diff --git a/hypervideo_dl/extractor/teamcoco.py b/hypervideo_dl/extractor/teamcoco.py
index a822b67..d32f812 100644
--- a/hypervideo_dl/extractor/teamcoco.py
+++ b/hypervideo_dl/extractor/teamcoco.py
@@ -1,57 +1,109 @@
import json
+import re
from .turner import TurnerBaseIE
from ..utils import (
- determine_ext,
ExtractorError,
- int_or_none,
+ clean_html,
+ determine_ext,
+ make_archive_id,
+ merge_dicts,
mimetype2ext,
parse_duration,
- parse_iso8601,
- qualities,
+ parse_qs,
+ traverse_obj,
+ unified_timestamp,
+ urljoin,
+ url_or_none,
)
-class TeamcocoIE(TurnerBaseIE):
- _VALID_URL = r'https?://(?:\w+\.)?teamcoco\.com/(?P<id>([^/]+/)*[^/?#]+)'
+class TeamcocoBaseIE(TurnerBaseIE):
+ _QUALITIES = {
+ 'low': (480, 272),
+ 'sd': (640, 360),
+ 'hd': (1280, 720),
+ 'uhd': (1920, 1080),
+ }
+
+ def _get_formats_and_subtitles(self, info, video_id):
+ formats, subtitles = [], {}
+
+ for src in traverse_obj(info, ('src', ..., {dict})):
+ format_id = src.get('label')
+ src_url = src.get('src')
+ if src_url and re.match(r'https?:/[^/]', src_url):  # guard: src may be absent
+ src_url = src_url.replace(':/', '://', 1)
+ ext = determine_ext(src_url, mimetype2ext(src.get('type')))
+
+ if not format_id or not src_url:
+ continue
+ elif format_id == 'hls' or ext == 'm3u8':
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(
+ src_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=subtitles)
+
+ elif format_id in self._QUALITIES:
+ if src_url.startswith('/mp4:protected/'):
+ # TODO: Correct extraction for these files
+ continue
+ formats.append({
+ 'url': src_url,
+ 'ext': ext,
+ 'format_id': format_id,
+ 'width': self._QUALITIES[format_id][0],
+ 'height': self._QUALITIES[format_id][1],
+ })
+
+ return formats, subtitles
+
+
+class TeamcocoIE(TeamcocoBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?teamcoco\.com/(?P<id>([^/]+/)*[^/?#]+)'
_TESTS = [
{
'url': 'http://teamcoco.com/video/mary-kay-remote',
- 'md5': '55d532f81992f5c92046ad02fec34d7d',
'info_dict': {
'id': '80187',
+ 'display_id': 'video_mary-kay-remote',
'ext': 'mp4',
'title': 'Conan Becomes A Mary Kay Beauty Consultant',
- 'description': 'Mary Kay is perhaps the most trusted name in female beauty, so of course Conan is a natural choice to sell their products.',
- 'duration': 495.0,
+ 'description': 'md5:9fb64e45b5aef6b2af1b67612b36c162',
+ 'thumbnail': 'https://teamcoco.com/image/thumb?id=80187',
'upload_date': '20140402',
- 'timestamp': 1396407600,
- }
+ 'timestamp': 1396440000,
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
}, {
'url': 'http://teamcoco.com/video/louis-ck-interview-george-w-bush',
- 'md5': 'cde9ba0fa3506f5f017ce11ead928f9a',
'info_dict': {
'id': '19705',
+ 'display_id': 'video_louis-ck-interview-george-w-bush',
'ext': 'mp4',
- 'description': 'Louis C.K. got starstruck by George W. Bush, so what? Part one.',
'title': 'Louis C.K. Interview Pt. 1 11/3/11',
- 'duration': 288,
+ 'description': 'Louis C.K. got starstruck by George W. Bush, so what? Part one.',
+ 'thumbnail': 'https://teamcoco.com/image/thumb?id=19705',
'upload_date': '20111104',
- 'timestamp': 1320405840,
- }
+ 'timestamp': 1320408000,
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
}, {
'url': 'http://teamcoco.com/video/timothy-olyphant-drinking-whiskey',
'info_dict': {
'id': '88748',
+ 'display_id': 'video_timothy-olyphant-drinking-whiskey',
'ext': 'mp4',
'title': 'Timothy Olyphant Raises A Toast To “Justified”',
'description': 'md5:15501f23f020e793aeca761205e42c24',
'upload_date': '20150415',
- 'timestamp': 1429088400,
+ 'timestamp': 1429099200,
+ 'thumbnail': 'https://teamcoco.com/image/thumb?id=88748',
},
- 'params': {
- 'skip_download': True, # m3u8 downloads
- }
}, {
'url': 'http://teamcoco.com/video/full-episode-mon-6-1-joel-mchale-jake-tapper-and-musical-guest-courtney-barnett?playlist=x;eyJ0eXBlIjoidGFnIiwiaWQiOjl9',
'info_dict': {
@@ -60,9 +112,6 @@ class TeamcocoIE(TurnerBaseIE):
'title': 'Full Episode - Mon. 6/1 - Joel McHale, Jake Tapper, And Musical Guest Courtney Barnett',
'description': 'Guests: Joel McHale, Jake Tapper, And Musical Guest Courtney Barnett',
},
- 'params': {
- 'skip_download': True, # m3u8 downloads
- },
'skip': 'This video is no longer available.',
}, {
'url': 'http://teamcoco.com/video/the-conan-audiencey-awards-for-04/25/18',
@@ -76,126 +125,156 @@ class TeamcocoIE(TurnerBaseIE):
}, {
'url': 'http://teamcoco.com/israel/conan-hits-the-streets-beaches-of-tel-aviv',
'only_matching': True,
- }, {
- 'url': 'https://conan25.teamcoco.com/video/ice-cube-kevin-hart-conan-share-lyft',
- 'only_matching': True,
- }
+ },
]
- _RECORD_TEMPL = '''id
- title
- teaser
- publishOn
- thumb {
- preview
- }
- tags {
- name
- }
- duration
- turnerMediaId
- turnerMediaAuthToken'''
-
- def _graphql_call(self, query_template, object_type, object_id):
- find_object = 'find' + object_type
- return self._download_json(
- 'https://teamcoco.com/graphql', object_id, data=json.dumps({
- 'query': query_template % (find_object, object_id)
- }).encode(), headers={
- 'Content-Type': 'application/json',
- })['data'][find_object]
def _real_extract(self, url):
- display_id = self._match_id(url)
+ display_id = self._match_id(url).replace('/', '_')
+ webpage = self._download_webpage(url, display_id)
+ data = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['pageData']
+ info = merge_dicts(*traverse_obj(data, (
+ 'blocks', lambda _, v: v['name'] in ('meta-tags', 'video-player', 'video-info'), 'props', {dict})))
- response = self._graphql_call('''{
- %%s(slug: "%%s") {
- ... on RecordSlug {
- record {
- %s
- }
- }
- ... on PageSlug {
- child {
- id
- }
- }
- ... on NotFoundSlug {
- status
- }
- }
-}''' % self._RECORD_TEMPL, 'Slug', display_id)
- if response.get('status'):
- raise ExtractorError('This video is no longer available.', expected=True)
-
- child = response.get('child')
- if child:
- record = self._graphql_call('''{
- %%s(id: "%%s") {
- ... on Video {
- %s
- }
- }
-}''' % self._RECORD_TEMPL, 'Record', child['id'])
- else:
- record = response['record']
- video_id = record['id']
+ thumbnail = traverse_obj(
+ info, (('image', 'poster'), {lambda x: urljoin('https://teamcoco.com/', x)}), get_all=False)
+ video_id = traverse_obj(parse_qs(thumbnail), ('id', 0)) or display_id
- info = {
+ formats, subtitles = self._get_formats_and_subtitles(info, video_id)
+
+ return {
'id': video_id,
'display_id': display_id,
- 'title': record['title'],
- 'thumbnail': record.get('thumb', {}).get('preview'),
- 'description': record.get('teaser'),
- 'duration': parse_duration(record.get('duration')),
- 'timestamp': parse_iso8601(record.get('publishOn')),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'thumbnail': thumbnail,
+ **traverse_obj(info, {
+ 'title': 'title',
+ 'description': (('descriptionHtml', 'description'), {clean_html}),
+ 'timestamp': ('publishedOn', {lambda x: f'{x} 12:00AM'}, {unified_timestamp}),  # publishedOn is date-only; pin to midnight before parsing
+ }, get_all=False),
}
- media_id = record.get('turnerMediaId')
+
+class ConanClassicIE(TeamcocoBaseIE):
+ _VALID_URL = r'https?://(?:(?:www\.)?conanclassic|conan25\.teamcoco)\.com/(?P<id>([^/]+/)*[^/?#]+)'
+ _TESTS = [{
+ 'url': 'https://conanclassic.com/video/ice-cube-kevin-hart-conan-share-lyft',
+ 'info_dict': {
+ 'id': '74709',
+ 'ext': 'mp4',
+ 'title': 'Ice Cube, Kevin Hart, & Conan Share A Lyft Car',
+ 'display_id': 'video/ice-cube-kevin-hart-conan-share-lyft',
+ 'description': 'The stars of "Ride Along" teach Conan how to roll around Hollywood.',
+ 'thumbnail': 'http://cdn.teamcococdn.com/image/640x360/lyft-5bd75f82b616c.png',
+ 'duration': 570.0,
+ 'upload_date': '20131211',
+ 'timestamp': 1386721620,
+ '_old_archive_ids': ['teamcoco 74709'],
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }, {
+ 'url': 'https://conan25.teamcoco.com/video/ice-cube-kevin-hart-conan-share-lyft',
+ 'only_matching': True,
+ }]
+
+ _GRAPHQL_QUERY = '''query find($id: ID!) {
+ findRecord(id: $id) {
+
+... on MetaInterface {
+ id
+ title
+ teaser
+ publishOn
+ slug
+ thumb {
+
+... on FileInterface {
+ id
+ path
+ preview
+ mime
+}
+
+ }
+}
+
+... on Video {
+ videoType
+ duration
+ isLive
+ youtubeId
+ turnerMediaId
+ turnerMediaAuthToken
+ airDate
+}
+
+... on Episode {
+ airDate
+ seasonNumber
+ episodeNumber
+ guestNames
+}
+
+ }
+ findRecordVideoMetadata(id: $id) {
+ turnerMediaId
+ turnerMediaAuthToken
+ duration
+ src
+ }
+}'''
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ data = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['pageData']
+ video_id = traverse_obj(
+ data, ('blocks', ..., 'props', 'fieldDefs', lambda _, v: v['name'] == 'incomingVideoId', 'value'),
+ ('blocks', ..., 'props', 'fields', 'incomingVideoRecord', 'id'), get_all=False)
+ if not video_id:
+ self.raise_no_formats('Unable to extract video ID from webpage', expected=True)
+
+ response = self._download_json(
+ 'https://conanclassic.com/api/legacy/graphql', video_id, data=json.dumps({
+ 'query': self._GRAPHQL_QUERY,
+ 'variables': {'id': video_id},
+ }, separators=(',', ':')).encode(), headers={
+ 'Content-Type': 'application/json',
+ })
+
+ info = traverse_obj(response, ('data', 'findRecord', {
+ 'title': 'title',
+ 'description': 'teaser',
+ 'thumbnail': ('thumb', 'preview', {url_or_none}),
+ 'duration': ('duration', {parse_duration}),
+ 'timestamp': ('publishOn', {unified_timestamp}),
+ }))
+
+ media_id = traverse_obj(
+ response, ('data', ('findRecord', 'findRecordVideoMetadata'), 'turnerMediaId'), get_all=False)
if media_id:
+ token = traverse_obj(
+ response, ('data', ('findRecord', 'findRecordVideoMetadata'), 'turnerMediaAuthToken'), get_all=False)
+ if not token:
+ raise ExtractorError('No Turner Media auth token found in API response')
self._initialize_geo_bypass({
'countries': ['US'],
})
info.update(self._extract_ngtv_info(media_id, {
- 'accessToken': record['turnerMediaAuthToken'],
+ 'accessToken': token,
'accessTokenType': 'jws',
}))
else:
- video_sources = self._download_json(
- 'https://teamcoco.com/_truman/d/' + video_id,
- video_id)['meta']['src']
- if isinstance(video_sources, dict):
- video_sources = video_sources.values()
-
- formats = []
- get_quality = qualities(['low', 'sd', 'hd', 'uhd'])
- for src in video_sources:
- if not isinstance(src, dict):
- continue
- src_url = src.get('src')
- if not src_url:
- continue
- format_id = src.get('label')
- ext = determine_ext(src_url, mimetype2ext(src.get('type')))
- if format_id == 'hls' or ext == 'm3u8':
- # compat_urllib_parse.urljoin does not work here
- if src_url.startswith('/'):
- src_url = 'http://ht.cdn.turner.com/tbs/big/teamcoco' + src_url
- formats.extend(self._extract_m3u8_formats(
- src_url, video_id, 'mp4', m3u8_id=format_id, fatal=False))
- else:
- if src_url.startswith('/mp4:protected/'):
- # TODO Correct extraction for these files
- continue
- tbr = int_or_none(self._search_regex(
- r'(\d+)k\.mp4', src_url, 'tbr', default=None))
-
- formats.append({
- 'url': src_url,
- 'ext': ext,
- 'tbr': tbr,
- 'format_id': format_id,
- 'quality': get_quality(format_id),
- })
- info['formats'] = formats
-
- return info
+ formats, subtitles = self._get_formats_and_subtitles(
+ traverse_obj(response, ('data', 'findRecordVideoMetadata')), video_id)
+ info.update({
+ 'formats': formats,
+ 'subtitles': subtitles,
+ })
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ '_old_archive_ids': [make_archive_id('Teamcoco', video_id)],
+ **info,
+ }
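
The _old_archive_ids entry keeps download archives working across the extractor split: make_archive_id builds the lowercase '<ie key> <id>' string that archive files record, so items downloaded under the old TeamcocoIE are still recognized by ConanClassicIE. For instance:

from hypervideo_dl.utils import make_archive_id

assert make_archive_id('Teamcoco', '74709') == 'teamcoco 74709'
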
diff --git a/hypervideo_dl/extractor/telecaribe.py b/hypervideo_dl/extractor/telecaribe.py
new file mode 100644
index 0000000..91118a1
--- /dev/null
+++ b/hypervideo_dl/extractor/telecaribe.py
@@ -0,0 +1,91 @@
+import re
+
+from .common import InfoExtractor
+from ..utils import traverse_obj
+
+
+class TelecaribePlayIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?play\.telecaribe\.co/(?P<id>[\w-]+)'
+ _TESTS = [{
+ 'url': 'https://www.play.telecaribe.co/breicok',
+ 'info_dict': {
+ 'id': 'breicok',
+ 'title': 'Breicok',
+ },
+ 'playlist_count': 7,
+ }, {
+ 'url': 'https://www.play.telecaribe.co/si-fue-gol-de-yepes',
+ 'info_dict': {
+ 'id': 'si-fue-gol-de-yepes',
+ 'title': 'Sí Fue Gol de Yepes',
+ },
+ 'playlist_count': 6,
+ }, {
+ 'url': 'https://www.play.telecaribe.co/ciudad-futura',
+ 'info_dict': {
+ 'id': 'ciudad-futura',
+ 'title': 'Ciudad Futura',
+ },
+ 'playlist_count': 10,
+ }, {
+ 'url': 'https://www.play.telecaribe.co/live',
+ 'info_dict': {
+ 'id': 'live',
+ 'title': r're:^Señal en vivo',
+ 'live_status': 'is_live',
+ 'ext': 'mp4',
+ },
+ 'params': {
+ 'skip_download': 'Livestream',
+ }
+ }, {
+ 'url': 'https://www.play.telecaribe.co/liveplus',
+ 'info_dict': {
+ 'id': 'liveplus',
+ 'title': r're:^Señal en vivo Plus',
+ 'live_status': 'is_live',
+ 'ext': 'mp4',
+ },
+ 'params': {
+ 'skip_download': 'Livestream',
+ },
+ 'skip': 'Geo-restricted to Colombia',
+ }]
+
+ def _download_player_webpage(self, webpage, display_id):
+ page_id = self._search_regex(
+ (r'window\.firstPageId\s*=\s*["\']([^"\']+)', r'<div[^>]+id\s*=\s*"pageBackground_([^"]+)'),
+ webpage, 'page_id')
+
+ props = self._download_json(self._search_regex(
+ rf'<link[^>]+href\s*=\s*"([^"]+)"[^>]+id\s*=\s*"features_{page_id}"',
+ webpage, 'json_props_url'), display_id)['props']['render']['compProps']
+
+ return self._download_webpage(traverse_obj(props, (..., 'url'))[-1], display_id)
+
+ def _get_clean_title(self, title):
+ return re.sub(r'\s*\|\s*Telecaribe\s*VOD', '', title or '').strip() or None
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ player = self._download_player_webpage(webpage, display_id)
+
+ livestream_url = self._search_regex(
+ r'(?:let|const|var)\s+source\s*=\s*["\']([^"\']+)', player, 'm3u8 url', default=None)
+
+ if not livestream_url:
+ return self.playlist_from_matches(
+ re.findall(r'<a[^>]+href\s*=\s*"([^"]+\.mp4)', player), display_id,
+ self._get_clean_title(self._og_search_title(webpage)))
+
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(
+ livestream_url, display_id, 'mp4', live=True)
+
+ return {
+ 'id': display_id,
+ 'title': self._get_clean_title(self._og_search_title(webpage)),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'is_live': True,
+ }
diff --git a/hypervideo_dl/extractor/telemundo.py b/hypervideo_dl/extractor/telemundo.py
index 88f29cb..54e74a6 100644
--- a/hypervideo_dl/extractor/telemundo.py
+++ b/hypervideo_dl/extractor/telemundo.py
@@ -1,9 +1,6 @@
from .common import InfoExtractor
-from ..utils import (
- try_get,
- unified_timestamp,
- HEADRequest,
-)
+from ..networking import HEADRequest
+from ..utils import try_get, unified_timestamp
class TelemundoIE(InfoExtractor):
@@ -38,7 +35,7 @@ class TelemundoIE(InfoExtractor):
m3u8_url = self._request_webpage(HEADRequest(
redirect_url + '?format=redirect&manifest=m3u&format=redirect&Tracking=true&Embedded=true&formats=MPEG4'),
- video_id, 'Processing m3u8').geturl()
+ video_id, 'Processing m3u8').url
formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4')
date = unified_timestamp(try_get(
metadata, lambda x: x['props']['initialState']['video']['associatedPlaylists'][0]['videos'][0]['datePublished'].split(' ', 1)[1]))
diff --git a/hypervideo_dl/extractor/tempo.py b/hypervideo_dl/extractor/tempo.py
index 1cfb956..9318d6f 100644
--- a/hypervideo_dl/extractor/tempo.py
+++ b/hypervideo_dl/extractor/tempo.py
@@ -1,5 +1,81 @@
+import re
+
from .common import InfoExtractor
-from ..utils import int_or_none, parse_iso8601, str_or_none, traverse_obj
+from ..utils import (
+ int_or_none,
+ parse_iso8601,
+ traverse_obj,
+ try_call
+)
+
+
+class IVXPlayerIE(InfoExtractor):
+ _VALID_URL = r'ivxplayer:(?P<video_id>\d+):(?P<player_key>\w+)'
+ _TESTS = [{
+ 'url': 'ivxplayer:2366065:4a89dfe6bc8f002596b1dfbd600730b1',
+ 'info_dict': {
+ 'id': '2366065',
+ 'ext': 'mp4',
+ 'duration': 112,
+ 'upload_date': '20221204',
+ 'title': 'Film Indonesia di Disney Content Showcase Asia Pacific 2022',
+ 'timestamp': 1670151746,
+ 'thumbnail': 'https://ivx-image.ivideosmart.com/serve/image/video/2366065?width=300'
+ }
+ }]
+ _WEBPAGE_TESTS = [{
+ 'url': 'https://www.cantika.com/video/31737/film-indonesia-di-disney-content-showcase-asia-pacific-2022',
+ 'info_dict': {
+ 'id': '2374200',
+ 'ext': 'mp4',
+ 'duration': 110,
+ 'title': 'Serial Indonesia di Disney Content Showcase Asia Pacific 2022',
+ 'timestamp': 1670639416,
+ 'upload_date': '20221210',
+ 'thumbnail': 'https://ivx-image.ivideosmart.com/serve/image/video/2374200?width=300'
+ }
+ }, {
+ 'url': 'https://www.gooto.com/video/11437/wuling-suv-ramai-dikunjungi-di-giias-2018',
+ 'info_dict': {
+ 'id': '892109',
+ 'ext': 'mp4',
+ 'title': 'Wuling SUV Ramai Dikunjungi di GIIAS 2018',
+ 'upload_date': '20180811',
+ 'description': 'md5:6d901483d0aacc664aecb4489719aafa',
+ 'duration': 75,
+ 'timestamp': 1534011263,
+ 'thumbnail': 'https://ivx-image.ivideosmart.com/serve/image/video/892109?width=300'
+ }
+ }]
+
+ @classmethod
+ def _extract_embed_urls(cls, url, webpage):
+ # more info at https://player.ivideosmart.com/ivsplayer/v4/dist/js/loader.js
+ mobj = re.search(
+ r'<ivs-player\s*[^>]+data-ivs-key\s*=\s*"(?P<player_key>[\w]+)\s*[^>]+\bdata-ivs-vid="(?P<video_id>[\w-]+)',
+ webpage)
+ if mobj:
+ yield f'ivxplayer:{mobj.group("video_id")}:{mobj.group("player_key")}'
+ raise cls.StopExtraction()
+
+ def _real_extract(self, url):
+ video_id, player_key = self._match_valid_url(url).group('video_id', 'player_key')
+ json_data = self._download_json(
+ f'https://ivxplayer.ivideosmart.com/prod/video/{video_id}?key={player_key}', video_id)
+
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(
+ json_data['player']['video_url'], video_id)
+
+ return {
+ 'id': str(json_data['ivx']['id']),
+ 'title': traverse_obj(json_data, ('ivx', 'name')),
+ 'description': traverse_obj(json_data, ('ivx', 'description')),
+ 'duration': int_or_none(traverse_obj(json_data, ('ivx', 'duration'))),
+ 'timestamp': parse_iso8601(traverse_obj(json_data, ('ivx', 'published_at'))),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'thumbnail': traverse_obj(json_data, ('ivx', 'thumbnail_url'))
+ }
class TempoIE(InfoExtractor):
@@ -7,14 +83,14 @@ class TempoIE(InfoExtractor):
_TESTS = [{
'url': 'https://video.tempo.co/read/30058/anies-baswedan-ajukan-banding-putusan-ptun-batalkan-ump-dki',
'info_dict': {
- 'id': '2144438',
+ 'id': '2144275',
+ 'display_id': 'anies-baswedan-ajukan-banding-putusan-ptun-batalkan-ump-dki',
'ext': 'mp4',
'title': 'Anies Baswedan Ajukan Banding Putusan PTUN Batalkan UMP DKI',
- 'display_id': 'anies-baswedan-ajukan-banding-putusan-ptun-batalkan-ump-dki',
- 'duration': 84,
+ 'duration': 85,
'description': 'md5:a6822b7c4c874fa7e5bd63e96a387b66',
'thumbnail': 'https://statik.tempo.co/data/2022/07/27/id_1128287/1128287_720.jpg',
- 'timestamp': 1658911277,
+ 'timestamp': 1658907970,
'upload_date': '20220727',
'tags': ['Anies Baswedan', ' PTUN', ' PTUN | Pengadilan Tata Usaha Negara', ' PTUN Batalkan UMP DKI', ' UMP DKI'],
}
@@ -24,30 +100,15 @@ class TempoIE(InfoExtractor):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
- player_key, widget_id = self._search_regex(
- r'<ivs-player\s*[^>]+data-ivs-key\s*=\s*"(?P<player_key>[\w]+)[^>]+\bdata-ivs-wid="(?P<widget_id>[\w-]+)',
- webpage, 'player_key, widget_id', group=('player_key', 'widget_id'))
+ _, video_id, player_key = next(IVXPlayerIE._extract_embed_urls(url, webpage)).split(':')
json_ld_data = self._search_json_ld(webpage, display_id)
- json_data = self._download_json(
- f'https://ivxplayer.ivideosmart.com/prod/widget/{widget_id}',
- display_id, query={'key': player_key})
- formats, subtitles = self._extract_m3u8_formats_and_subtitles(
- json_data['player']['video_url'], display_id, ext='mp4')
-
- return {
- 'id': str(json_data['ivx']['id']),
- 'display_id': display_id,
- 'formats': formats,
- 'subtitles': subtitles,
- 'title': (self._html_search_meta('twitter:title', webpage) or self._og_search_title(webpage)
- or traverse_obj(json_data, ('ivx', 'name'))),
- 'duration': int_or_none(traverse_obj(json_data, ('ivx', 'duration'))),
- 'thumbnail': (self._html_search_meta('twitter:image:src', webpage) or self._og_search_thumbnail(webpage)
- or traverse_obj(json_data, ('ivx', 'thumbnail_url'))),
- 'description': (json_ld_data.get('description') or self._html_search_meta(['description', 'twitter:description'], webpage)
- or self._og_search_description(webpage)),
- 'timestamp': parse_iso8601(traverse_obj(json_data, ('ivx', 'created_at'))),
- 'tags': str_or_none(self._html_search_meta('keywords', webpage), '').split(','),
- }
+ return self.url_result(
+ f'ivxplayer:{video_id}:{player_key}', display_id=display_id,
+ thumbnail=self._html_search_meta('twitter:image:src', webpage) or self._og_search_thumbnail(webpage),
+ tags=try_call(lambda: self._html_search_meta('keywords', webpage).split(',')),
+ description=(json_ld_data.get('description')
+ or self._html_search_meta(('description', 'twitter:description'), webpage)
+ or self._og_search_description(webpage)),
+ url_transparent=True)
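
Note: the new IVXPlayerIE and the slimmed-down TempoIE communicate through a synthetic `ivxplayer:<video_id>:<player_key>` URL, with TempoIE layering its page-level metadata on top via `url_result(..., url_transparent=True)`. A minimal sketch of how such a pseudo-URL round-trips (the pattern below is illustrative; IVXPlayerIE's actual _VALID_URL is not shown in this hunk):

    import re

    # Hypothetical stand-in for the pattern IVXPlayerIE would declare
    IVX_PSEUDO_URL = r'ivxplayer:(?P<video_id>\d+):(?P<player_key>\w+)'

    def split_pseudo_url(pseudo_url):
        mobj = re.match(IVX_PSEUDO_URL, pseudo_url)
        return mobj.group('video_id', 'player_key')

    print(split_pseudo_url('ivxplayer:2374200:deadbeef'))  # ('2374200', 'deadbeef')
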
diff --git a/hypervideo_dl/extractor/tencent.py b/hypervideo_dl/extractor/tencent.py
index ff8bf99..6618ea4 100644
--- a/hypervideo_dl/extractor/tencent.py
+++ b/hypervideo_dl/extractor/tencent.py
@@ -8,6 +8,7 @@ from .common import InfoExtractor
from ..aes import aes_cbc_encrypt_bytes
from ..utils import (
ExtractorError,
+ float_or_none,
determine_ext,
int_or_none,
js_to_json,
@@ -19,6 +20,16 @@ from ..utils import (
class TencentBaseIE(InfoExtractor):
"""Subclasses must set _API_URL, _APP_VERSION, _PLATFORM, _HOST, _REFERER"""
+ def _check_api_response(self, api_response):
+ msg = api_response.get('msg')
+ if api_response.get('code') != '0.0' and msg is not None:
+ if msg in (
+ '您所在区域暂无此内容版权(如设置VPN请关闭后重试)',
+ 'This content is not available in your area due to copyright restrictions. Please choose other videos.'
+ ):
+ self.raise_geo_restricted()
+ raise ExtractorError(f'Tencent said: {msg}')
+
def _get_ckey(self, video_id, url, guid):
ua = self.get_param('http_headers')['User-Agent']
@@ -32,7 +43,7 @@ class TencentBaseIE(InfoExtractor):
padding_mode='whitespace').hex().upper()
def _get_video_api_response(self, video_url, video_id, series_id, subtitle_format, video_format, video_quality):
- guid = ''.join([random.choice(string.digits + string.ascii_lowercase) for _ in range(16)])
+ guid = ''.join(random.choices(string.digits + string.ascii_lowercase, k=16))
ckey = self._get_ckey(video_id, video_url, guid)
query = {
'vid': video_id,
@@ -47,6 +58,11 @@ class TencentBaseIE(InfoExtractor):
'sphttps': '1', # Enable HTTPS
'otype': 'json',
'spwm': '1',
+ 'hevclv': '28', # Enable HEVC
+ 'drm': '40', # Enable DRM
+ # For HDR
+ 'spvideo': '4',
+ 'spsfrhdr': '100',
# For SHD
'host': self._HOST,
'referer': self._REFERER,
@@ -55,7 +71,7 @@ class TencentBaseIE(InfoExtractor):
'platform': self._PLATFORM,
# For VQQ
'guid': guid,
- 'flowid': ''.join(random.choice(string.digits + string.ascii_lowercase) for _ in range(32)),
+ 'flowid': ''.join(random.choices(string.digits + string.ascii_lowercase, k=32)),
}
return self._search_json(r'QZOutputJson=', self._download_webpage(
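
Note: the guid/flowid change swaps a per-character `random.choice` loop for `random.choices(..., k=n)`, which draws all characters (with replacement) in a single call. A quick equivalence check:

    import random
    import string

    alphabet = string.digits + string.ascii_lowercase
    old_style = ''.join(random.choice(alphabet) for _ in range(16))
    new_style = ''.join(random.choices(alphabet, k=16))
    assert len(old_style) == len(new_style) == 16
    assert all(c in alphabet for c in new_style)
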
@@ -63,7 +79,6 @@ class TencentBaseIE(InfoExtractor):
def _extract_video_formats_and_subtitles(self, api_response, video_id):
video_response = api_response['vl']['vi'][0]
- video_width, video_height = video_response.get('vw'), video_response.get('vh')
formats, subtitles = [], {}
for video_format in video_response['ul']['ui']:
@@ -71,47 +86,61 @@ class TencentBaseIE(InfoExtractor):
fmts, subs = self._extract_m3u8_formats_and_subtitles(
video_format['url'] + traverse_obj(video_format, ('hls', 'pt'), default=''),
video_id, 'mp4', fatal=False)
- for f in fmts:
- f.update({'width': video_width, 'height': video_height})
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
else:
formats.append({
'url': f'{video_format["url"]}{video_response["fn"]}?vkey={video_response["fvkey"]}',
- 'width': video_width,
- 'height': video_height,
'ext': 'mp4',
})
+ identifier = video_response.get('br')
+ format_response = traverse_obj(
+ api_response, ('fl', 'fi', lambda _, v: v['br'] == identifier),
+ expected_type=dict, get_all=False) or {}
+ common_info = {
+ 'width': video_response.get('vw'),
+ 'height': video_response.get('vh'),
+ 'abr': float_or_none(format_response.get('audiobandwidth'), scale=1000),
+ 'vbr': float_or_none(format_response.get('bandwidth'), scale=1000),
+ 'fps': format_response.get('vfps'),
+ 'format': format_response.get('sname'),
+ 'format_id': format_response.get('name'),
+ 'format_note': format_response.get('resolution'),
+ 'dynamic_range': {'hdr10': 'hdr10'}.get(format_response.get('name'), 'sdr'),
+ 'has_drm': format_response.get('drm', 0) != 0,
+ }
+ for f in formats:
+ f.update(common_info)
+
return formats, subtitles
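
Note: per-format metadata is now looked up by matching the stream's `br` identifier against the entries of `fl.fi`; yt-dlp's `traverse_obj` with a `lambda _, v: ...` branch does this over the nested response. The same lookup in plain Python, over a made-up response shape:

    api_response = {
        'vl': {'vi': [{'br': 2, 'vw': 1920, 'vh': 1080}]},
        'fl': {'fi': [
            {'br': 1, 'name': 'hd', 'bandwidth': 1800},
            {'br': 2, 'name': 'fhd', 'bandwidth': 3500},
        ]},
    }
    identifier = api_response['vl']['vi'][0]['br']
    format_response = next(
        (f for f in api_response['fl']['fi'] if f.get('br') == identifier), {})
    assert format_response['name'] == 'fhd'
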
- def _extract_video_native_subtitles(self, api_response, subtitles_format):
+ def _extract_video_native_subtitles(self, api_response):
subtitles = {}
for subtitle in traverse_obj(api_response, ('sfl', 'fi')) or ():
subtitles.setdefault(subtitle['lang'].lower(), []).append({
'url': subtitle['url'],
- 'ext': subtitles_format,
+ 'ext': 'srt' if subtitle.get('captionType') == 1 else 'vtt',
'protocol': 'm3u8_native' if determine_ext(subtitle['url']) == 'm3u8' else 'http',
})
return subtitles
def _extract_all_video_formats_and_subtitles(self, url, video_id, series_id):
- formats, subtitles = [], {}
- for video_format, subtitle_format, video_quality in (
- # '': 480p, 'shd': 720p, 'fhd': 1080p
- ('mp4', 'srt', ''), ('hls', 'vtt', 'shd'), ('hls', 'vtt', 'fhd')):
- api_response = self._get_video_api_response(
- url, video_id, series_id, subtitle_format, video_format, video_quality)
-
- if api_response.get('em') != 0 and api_response.get('exem') != 0:
- if '您所在区域暂无此内容版权' in api_response.get('msg'):
- self.raise_geo_restricted()
- raise ExtractorError(f'Tencent said: {api_response.get("msg")}')
+ api_responses = [self._get_video_api_response(url, video_id, series_id, 'srt', 'hls', 'hd')]
+ self._check_api_response(api_responses[0])
+ qualities = traverse_obj(api_responses, (0, 'fl', 'fi', ..., 'name')) or ('shd', 'fhd')
+ for q in qualities:
+ if q not in ('ld', 'sd', 'hd'):
+ api_responses.append(self._get_video_api_response(
+ url, video_id, series_id, 'vtt', 'hls', q))
+ self._check_api_response(api_responses[-1])
+ formats, subtitles = [], {}
+ for api_response in api_responses:
fmts, subs = self._extract_video_formats_and_subtitles(api_response, video_id)
- native_subtitles = self._extract_video_native_subtitles(api_response, subtitle_format)
+ native_subtitles = self._extract_video_native_subtitles(api_response)
formats.extend(fmts)
self._merge_subtitles(subs, native_subtitles, target=subtitles)
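
Note: the rewritten loop makes one 'hd' request, reads the full quality list from `fl.fi[].name` in that response (falling back to `('shd', 'fhd')`), then re-requests only the qualities not already covered. Control flow only, with `fetch()` as a hypothetical stand-in for `_get_video_api_response`:

    def collect_responses(fetch):
        responses = [fetch('hd')]  # first request also advertises available qualities
        names = [f['name'] for f in responses[0].get('fl', {}).get('fi', [])] or ['shd', 'fhd']
        for name in names:
            if name not in ('ld', 'sd', 'hd'):  # low qualities are covered by the 'hd' request
                responses.append(fetch(name))
        return responses
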
@@ -120,7 +149,7 @@ class TencentBaseIE(InfoExtractor):
def _get_clean_title(self, title):
return re.sub(
- r'\s*[_\-]\s*(?:Watch online|腾讯视频|(?:高清)?1080P在线观看平台).*?$',
+ r'\s*[_\-]\s*(?:Watch online|Watch HD Video Online|WeTV|腾讯视频|(?:高清)?1080P在线观看平台).*?$',
'', title or '').strip() or None
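
Note: the widened suffix pattern now also strips WeTV-style decorations from titles. Illustrative inputs:

    import re

    def clean(title):
        return re.sub(
            r'\s*[_\-]\s*(?:Watch online|Watch HD Video Online|WeTV|腾讯视频|(?:高清)?1080P在线观看平台).*?$',
            '', title or '').strip() or None

    assert clean('Some Drama EP01 - Watch HD Video Online - WeTV') == 'Some Drama EP01'
    assert clean('') is None
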
@@ -134,11 +163,9 @@ class VQQBaseIE(TencentBaseIE):
_REFERER = 'v.qq.com'
def _get_webpage_metadata(self, webpage, video_id):
- return self._parse_json(
- self._search_regex(
- r'(?s)<script[^>]*>[^<]*window\.__pinia\s*=\s*([^<]+)</script>',
- webpage, 'pinia data', fatal=False),
- video_id, transform_source=js_to_json, fatal=False)
+ return self._search_json(
+ r'<script[^>]*>[^<]*window\.__(?:pinia|PINIA__)\s*=',
+ webpage, 'pinia data', video_id, transform_source=js_to_json, fatal=False)
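
Note: `_search_json` replaces the hand-rolled regex plus `_parse_json` pair; it only needs a pattern for the text preceding the JSON object, and the updated pattern accepts both spellings of the global:

    import re

    PINIA_RE = r'<script[^>]*>[^<]*window\.__(?:pinia|PINIA__)\s*='
    assert re.search(PINIA_RE, '<script>window.__pinia = {"a": 1}</script>')
    assert re.search(PINIA_RE, '<script>window.__PINIA__ = {"a": 1}</script>')
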
class VQQVideoIE(VQQBaseIE):
@@ -147,27 +174,29 @@ class VQQVideoIE(VQQBaseIE):
_TESTS = [{
'url': 'https://v.qq.com/x/page/q326831cny0.html',
- 'md5': '826ef93682df09e3deac4a6e6e8cdb6e',
+ 'md5': 'b11c9cb781df710d686b950376676e2a',
'info_dict': {
'id': 'q326831cny0',
'ext': 'mp4',
'title': '我是选手:雷霆裂阵,终极时刻',
'description': 'md5:e7ed70be89244017dac2a835a10aeb1e',
'thumbnail': r're:^https?://[^?#]+q326831cny0',
+ 'format_id': r're:^shd',
},
}, {
'url': 'https://v.qq.com/x/page/o3013za7cse.html',
- 'md5': 'b91cbbeada22ef8cc4b06df53e36fa21',
+ 'md5': 'a1bcf42c6d28c189bd2fe2d468abb287',
'info_dict': {
'id': 'o3013za7cse',
'ext': 'mp4',
'title': '欧阳娜娜VLOG',
'description': 'md5:29fe847497a98e04a8c3826e499edd2e',
'thumbnail': r're:^https?://[^?#]+o3013za7cse',
+ 'format_id': r're:^shd',
},
}, {
'url': 'https://v.qq.com/x/cover/7ce5noezvafma27/a00269ix3l8.html',
- 'md5': '71459c5375c617c265a22f083facce67',
+ 'md5': '87968df6238a65d2478f19c25adf850b',
'info_dict': {
'id': 'a00269ix3l8',
'ext': 'mp4',
@@ -175,10 +204,12 @@ class VQQVideoIE(VQQBaseIE):
'description': 'md5:8cae3534327315b3872fbef5e51b5c5b',
'thumbnail': r're:^https?://[^?#]+7ce5noezvafma27',
'series': '鸡毛飞上天',
+ 'format_id': r're:^shd',
},
+ 'skip': '404',
}, {
'url': 'https://v.qq.com/x/cover/mzc00200p29k31e/s0043cwsgj0.html',
- 'md5': '96b9fd4a189fdd4078c111f21d7ac1bc',
+ 'md5': 'fadd10bf88aec3420f06f19ee1d24c5b',
'info_dict': {
'id': 's0043cwsgj0',
'ext': 'mp4',
@@ -186,7 +217,9 @@ class VQQVideoIE(VQQBaseIE):
'description': 'md5:1d8c3a0b8729ae3827fa5b2d3ebd5213',
'thumbnail': r're:^https?://[^?#]+s0043cwsgj0',
'series': '青年理工工作者生活研究所',
+ 'format_id': r're:^shd',
},
+ 'params': {'skip_download': 'm3u8'},
}, {
# Geo-restricted to China
'url': 'https://v.qq.com/x/cover/mcv8hkc8zk8lnov/x0036x5qqsr.html',
@@ -319,6 +352,7 @@ class WeTvEpisodeIE(WeTvBaseIE):
'episode': 'Episode 1',
'episode_number': 1,
'duration': 2835,
+ 'format_id': r're:^shd',
},
}, {
'url': 'https://wetv.vip/en/play/u37kgfnfzs73kiu/p0039b9nvik',
@@ -333,6 +367,7 @@ class WeTvEpisodeIE(WeTvBaseIE):
'episode': 'Episode 1',
'episode_number': 1,
'duration': 2454,
+ 'format_id': r're:^shd',
},
}, {
'url': 'https://wetv.vip/en/play/lcxgwod5hapghvw-WeTV-PICK-A-BOO/i0042y00lxp-Zhao-Lusi-Describes-The-First-Experiences-She-Had-In-Who-Rules-The-World-%7C-WeTV-PICK-A-BOO',
@@ -342,11 +377,12 @@ class WeTvEpisodeIE(WeTvBaseIE):
'ext': 'mp4',
'title': 'md5:f7a0857dbe5fbbe2e7ad630b92b54e6a',
'description': 'md5:76260cb9cdc0ef76826d7ca9d92fadfa',
- 'thumbnail': r're:^https?://[^?#]+lcxgwod5hapghvw',
+ 'thumbnail': r're:^https?://[^?#]+i0042y00lxp',
'series': 'WeTV PICK-A-BOO',
'episode': 'Episode 0',
'episode_number': 0,
'duration': 442,
+ 'format_id': r're:^shd',
},
}]
@@ -406,6 +442,7 @@ class IflixEpisodeIE(IflixBaseIE):
'episode': 'Episode 1',
'episode_number': 1,
'duration': 2639,
+ 'format_id': r're:^shd',
},
}, {
'url': 'https://www.iflix.com/en/play/fvvrcc3ra9lbtt1-Take-My-Brother-Away/i0029sd3gm1-EP1%EF%BC%9ATake-My-Brother-Away',
@@ -420,6 +457,7 @@ class IflixEpisodeIE(IflixBaseIE):
'episode': 'Episode 1',
'episode_number': 1,
'duration': 228,
+ 'format_id': r're:^shd',
},
}]
diff --git a/hypervideo_dl/extractor/tennistv.py b/hypervideo_dl/extractor/tennistv.py
index bc64226..c1b4a33 100644
--- a/hypervideo_dl/extractor/tennistv.py
+++ b/hypervideo_dl/extractor/tennistv.py
@@ -86,7 +86,7 @@ class TennisTVIE(InfoExtractor):
})
self.get_token(None, {
- 'code': urllib.parse.parse_qs(handle.geturl())['code'][-1],
+ 'code': urllib.parse.parse_qs(handle.url)['code'][-1],
'grant_type': 'authorization_code',
'client_id': 'tennis-tv-web',
'redirect_uri': 'https://www.tennistv.com/resources/v1.1.10/html/silent-check-sso.html'
diff --git a/hypervideo_dl/extractor/tenplay.py b/hypervideo_dl/extractor/tenplay.py
index 633032e..c7097cf 100644
--- a/hypervideo_dl/extractor/tenplay.py
+++ b/hypervideo_dl/extractor/tenplay.py
@@ -2,11 +2,8 @@ from datetime import datetime
import base64
from .common import InfoExtractor
-from ..utils import (
- HEADRequest,
- int_or_none,
- urlencode_postdata,
-)
+from ..networking import HEADRequest
+from ..utils import int_or_none, urlencode_postdata
class TenPlayIE(InfoExtractor):
@@ -94,7 +91,7 @@ class TenPlayIE(InfoExtractor):
data.get('playbackApiEndpoint'), content_id, 'Downloading video JSON',
headers=headers).get('source')
m3u8_url = self._request_webpage(HEADRequest(
- _video_url), content_id).geturl()
+ _video_url), content_id).url
if '10play-not-in-oz' in m3u8_url:
self.raise_geo_restricted(countries=['AU'])
formats = self._extract_m3u8_formats(m3u8_url, content_id, 'mp4')
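
Note: several files in this patch (tennistv, tenplay, thisoldhouse, tiktok) make the same two-part migration: `HEADRequest` now comes from `..networking`, and the final post-redirect URL is read from the response's `.url` attribute instead of calling `.geturl()`. The idiom, sketched as an extractor method under those assumptions:

    def _resolve_final_url(self, video_url, video_id):
        # HEADRequest avoids downloading the body; .url is the post-redirect URL
        urlh = self._request_webpage(HEADRequest(video_url), video_id, 'Resolving final URL')
        return urlh.url  # previously spelled urlh.geturl()
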
diff --git a/hypervideo_dl/extractor/testurl.py b/hypervideo_dl/extractor/testurl.py
index dccca10..3cf0017 100644
--- a/hypervideo_dl/extractor/testurl.py
+++ b/hypervideo_dl/extractor/testurl.py
@@ -8,7 +8,7 @@ class TestURLIE(InfoExtractor):
""" Allows addressing of the test cases as test:yout.*be_1 """
IE_DESC = False # Do not list
- _VALID_URL = r'test(?:url)?:(?P<extractor>.*?)(?:_(?P<num>[0-9]+))?$'
+ _VALID_URL = r'test(?:url)?:(?P<extractor>.*?)(?:_(?P<num>\d+|all))?$'
def _real_extract(self, url):
from . import gen_extractor_classes
@@ -23,11 +23,12 @@ class TestURLIE(InfoExtractor):
if len(matching_extractors) == 0:
raise ExtractorError(f'No extractors matching {extractor_id!r} found', expected=True)
elif len(matching_extractors) > 1:
- try: # Check for exact match
- extractor = next(
- ie for ie in matching_extractors
- if ie.IE_NAME.lower() == extractor_id.lower())
- except StopIteration:
+ extractor = next(( # Check for exact match
+ ie for ie in matching_extractors if ie.IE_NAME.lower() == extractor_id.lower()
+ ), None) or next(( # Check for exact match without plugin suffix
+ ie for ie in matching_extractors if ie.IE_NAME.split('+')[0].lower() == extractor_id.lower()
+ ), None)
+ if not extractor:
raise ExtractorError(
'Found multiple matching extractors: %s' % ' '.join(ie.IE_NAME for ie in matching_extractors),
expected=True)
@@ -35,6 +36,10 @@ class TestURLIE(InfoExtractor):
extractor = matching_extractors[0]
testcases = tuple(extractor.get_testcases(True))
+ if num == 'all':
+ return self.playlist_result(
+ [self.url_result(tc['url'], extractor) for tc in testcases],
+ url, f'{extractor.IE_NAME} tests')
try:
tc = testcases[int(num or 0)]
except IndexError:
@@ -42,4 +47,4 @@ class TestURLIE(InfoExtractor):
f'Test case {num or 0} not found, got only {len(testcases)} tests', expected=True)
self.to_screen(f'Test URL: {tc["url"]}')
- return self.url_result(tc['url'])
+ return self.url_result(tc['url'], extractor)
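
Note: with the new `all` suffix, the development-only `test:` scheme can expand every test case of an extractor into a playlist. Indicative usage (indexing is zero-based, matching `testcases[int(num or 0)]` above):

    # test:youtube       -> first test case of the matching extractor
    # test:youtube_2     -> third test case
    # test:youtube_all   -> playlist of all test cases of that extractor
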
diff --git a/hypervideo_dl/extractor/tf1.py b/hypervideo_dl/extractor/tf1.py
index 4cf0322..aba4927 100644
--- a/hypervideo_dl/extractor/tf1.py
+++ b/hypervideo_dl/extractor/tf1.py
@@ -28,6 +28,25 @@ class TF1IE(InfoExtractor):
'skip_download': True,
},
}, {
+ 'url': 'https://www.tf1.fr/tmc/burger-quiz/videos/burger-quiz-du-19-aout-2023-s03-episode-21-85585666.html',
+ 'info_dict': {
+ 'id': '14010600',
+ 'ext': 'mp4',
+ 'title': 'Burger Quiz - S03 EP21 avec Eye Haidara, Anne Depétrini, Jonathan Zaccaï et Pio Marmaï',
+ 'thumbnail': 'https://photos.tf1.fr/1280/720/burger-quiz-11-9adb79-0@1x.jpg',
+ 'description': 'Manu Payet recevra Eye Haidara, Anne Depétrini, Jonathan Zaccaï et Pio Marmaï.',
+ 'upload_date': '20230819',
+ 'timestamp': 1692469471,
+ 'season_number': 3,
+ 'series': 'Burger Quiz',
+ 'episode_number': 21,
+ 'season': 'Season 3',
+ 'tags': 'count:13',
+ 'episode': 'Episode 21',
+ 'duration': 2312
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }, {
'url': 'http://www.tf1.fr/tf1/koh-lanta/videos/replay-koh-lanta-22-mai-2015.html',
'only_matching': True,
}, {
diff --git a/hypervideo_dl/extractor/tfo.py b/hypervideo_dl/extractor/tfo.py
index a24789c..d417f50 100644
--- a/hypervideo_dl/extractor/tfo.py
+++ b/hypervideo_dl/extractor/tfo.py
@@ -1,12 +1,8 @@
import json
from .common import InfoExtractor
-from ..utils import (
- HEADRequest,
- ExtractorError,
- int_or_none,
- clean_html,
-)
+from ..networking import HEADRequest
+from ..utils import ExtractorError, clean_html, int_or_none
class TFOIE(InfoExtractor):
diff --git a/hypervideo_dl/extractor/theplatform.py b/hypervideo_dl/extractor/theplatform.py
index e659b8e..99caeb5 100644
--- a/hypervideo_dl/extractor/theplatform.py
+++ b/hypervideo_dl/extractor/theplatform.py
@@ -7,19 +7,23 @@ import hashlib
from .once import OnceIE
from .adobepass import AdobePassIE
+from ..networking import Request
from ..utils import (
determine_ext,
ExtractorError,
float_or_none,
int_or_none,
parse_qs,
- sanitized_Request,
unsmuggle_url,
update_url_query,
xpath_with_ns,
mimetype2ext,
find_xpath_attr,
+ traverse_obj,
+ update_url,
+ urlhandle_detect_ext,
)
+from ..networking import HEADRequest
default_ns = 'http://www.w3.org/2005/SMIL21/Language'
_x = lambda p: xpath_with_ns(p, {'smil': default_ns})
@@ -45,7 +49,7 @@ class ThePlatformBaseIE(OnceIE):
raise ExtractorError(
error_element.attrib['abstract'], expected=True)
- smil_formats = self._parse_smil_formats(
+ smil_formats, subtitles = self._parse_smil_formats_and_subtitles(
meta, smil_url, video_id, namespace=default_ns,
# the parameters are from syfy.com, other sites may use others,
# they also work for nbc.com
@@ -65,8 +69,6 @@ class ThePlatformBaseIE(OnceIE):
formats.append(_format)
- subtitles = self._parse_smil_subtitles(meta, default_ns)
-
return formats, subtitles
def _download_theplatform_metadata(self, path, video_id):
@@ -164,7 +166,8 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE):
'params': {
# rtmp download
'skip_download': True,
- }
+ },
+ 'skip': '404 Not Found',
}, {
'url': 'https://player.theplatform.com/p/D6x-PC/pulse_preview/embed/select/media/yMBg9E8KFxZD',
'info_dict': {
@@ -173,7 +176,8 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE):
'description': 'md5:644ad9188d655b742f942bf2e06b002d',
'title': 'HIGHLIGHTS: USA bag first ever series Cup win',
'uploader': 'EGSM',
- }
+ },
+ 'skip': '404 Not Found',
}, {
'url': 'http://player.theplatform.com/p/NnzsPC/widget/select/media/4Y0TlYUr_ZT7',
'only_matching': True,
@@ -191,6 +195,7 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE):
'upload_date': '20150701',
'uploader': 'NBCU-NEWS',
},
+ 'skip': '404 Not Found',
}, {
# From http://www.nbc.com/the-blacklist/video/sir-crispin-crandall/2928790?onid=137781#vc137781=1
# geo-restricted (US), HLS encrypted with AES-128
@@ -270,7 +275,7 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE):
source_url = smuggled_data.get('source_url')
if source_url:
headers['Referer'] = source_url
- request = sanitized_Request(url, headers=headers)
+ request = Request(url, headers=headers)
webpage = self._download_webpage(request, video_id)
smil_url = self._search_regex(
r'<link[^>]+href=(["\'])(?P<url>.+?)\1[^>]+type=["\']application/smil\+xml',
@@ -297,6 +302,17 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE):
formats, subtitles = self._extract_theplatform_smil(smil_url, video_id)
+ # With some sites, the manifest URL must be requested explicitly to extract HLS formats
+ if not traverse_obj(formats, lambda _, v: v['format_id'].startswith('hls')):
+ m3u8_url = update_url(url, query='mbr=true&manifest=m3u', fragment=None)
+ urlh = self._request_webpage(
+ HEADRequest(m3u8_url), video_id, 'Checking for HLS formats', 'No HLS formats found', fatal=False)
+ if urlh and urlhandle_detect_ext(urlh) == 'm3u8':
+ m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
+ m3u8_url, video_id, m3u8_id='hls', fatal=False)
+ formats.extend(m3u8_fmts)
+ self._merge_subtitles(m3u8_subs, target=subtitles)
+
ret = self._extract_theplatform_metadata(path, video_id)
combined_subtitles = self._merge_subtitles(ret.get('subtitles', {}), subtitles)
ret.update({
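
Note: when the SMIL data yields no HLS formats, the added block probes the same URL with `mbr=true&manifest=m3u` forced into the query and sniffs the response before parsing. A condensed restatement of that logic as a helper (a sketch, not part of the patch):

    def _maybe_fetch_hls(self, url, video_id):
        m3u8_url = update_url(url, query='mbr=true&manifest=m3u', fragment=None)
        urlh = self._request_webpage(
            HEADRequest(m3u8_url), video_id, 'Checking for HLS formats', fatal=False)
        if urlh and urlhandle_detect_ext(urlh) == 'm3u8':  # only parse if it really is m3u8
            return self._extract_m3u8_formats_and_subtitles(
                m3u8_url, video_id, m3u8_id='hls', fatal=False)
        return [], {}
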
diff --git a/hypervideo_dl/extractor/thesun.py b/hypervideo_dl/extractor/thesun.py
index ba58482..5edcf1c 100644
--- a/hypervideo_dl/extractor/thesun.py
+++ b/hypervideo_dl/extractor/thesun.py
@@ -5,15 +5,22 @@ from ..utils import extract_attributes
class TheSunIE(InfoExtractor):
- _VALID_URL = r'https://(?:www\.)?thesun\.co\.uk/[^/]+/(?P<id>\d+)'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?the-?sun(\.co\.uk|\.com)/[^/]+/(?P<id>\d+)'
+ _TESTS = [{
'url': 'https://www.thesun.co.uk/tvandshowbiz/2261604/orlando-bloom-and-katy-perry-post-adorable-instagram-video-together-celebrating-thanksgiving-after-split-rumours/',
'info_dict': {
'id': '2261604',
'title': 'md5:cba22f48bad9218b64d5bbe0e16afddf',
},
'playlist_count': 2,
- }
+ }, {
+ 'url': 'https://www.the-sun.com/entertainment/7611415/1000lb-sisters-fans-rip-amy-dangerous-health-decision/',
+ 'info_dict': {
+ 'id': '7611415',
+ 'title': 'md5:e0b9b976f79dc770e5c80f22f40bb844',
+ },
+ 'playlist_count': 1,
+ }]
BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s'
def _real_extract(self, url):
diff --git a/hypervideo_dl/extractor/thisoldhouse.py b/hypervideo_dl/extractor/thisoldhouse.py
index 55b6413..cc7beee 100644
--- a/hypervideo_dl/extractor/thisoldhouse.py
+++ b/hypervideo_dl/extractor/thisoldhouse.py
@@ -1,5 +1,5 @@
from .common import InfoExtractor
-from ..utils import HEADRequest
+from ..networking import HEADRequest
class ThisOldHouseIE(InfoExtractor):
@@ -50,6 +50,6 @@ class ThisOldHouseIE(InfoExtractor):
r'<iframe[^>]+src=[\'"]((?:https?:)?//(?:www\.)?thisoldhouse\.(?:chorus\.build|com)/videos/zype/([0-9a-f]{24})[^\'"]*)[\'"]',
webpage, 'video url')
if 'subscription_required=true' in video_url or 'c-entry-group-labels__image' in webpage:
- return self.url_result(self._request_webpage(HEADRequest(video_url), display_id).geturl(), 'Zype', display_id)
+ return self.url_result(self._request_webpage(HEADRequest(video_url), display_id).url, 'Zype', display_id)
video_id = self._search_regex(r'(?:https?:)?//(?:www\.)?thisoldhouse\.(?:chorus\.build|com)/videos/zype/([0-9a-f]{24})', video_url, 'video id')
return self.url_result(self._ZYPE_TMPL % video_id, 'Zype', video_id)
diff --git a/hypervideo_dl/extractor/thisvid.py b/hypervideo_dl/extractor/thisvid.py
new file mode 100644
index 0000000..9d3368e
--- /dev/null
+++ b/hypervideo_dl/extractor/thisvid.py
@@ -0,0 +1,226 @@
+import itertools
+import re
+import urllib.parse
+
+from .common import InfoExtractor
+from ..utils import (
+ clean_html,
+ get_element_by_class,
+ int_or_none,
+ url_or_none,
+ urljoin,
+)
+
+
+class ThisVidIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?thisvid\.com/(?P<type>videos|embed)/(?P<id>[A-Za-z0-9-]+)'
+ _TESTS = [{
+ 'url': 'https://thisvid.com/videos/sitting-on-ball-tight-jeans/',
+ 'md5': '839becb572995687e11a69dc4358a386',
+ 'info_dict': {
+ 'id': '3533241',
+ 'ext': 'mp4',
+ 'title': 'Sitting on ball tight jeans',
+ 'description': 'md5:372353bb995883d1b65fddf507489acd',
+ 'thumbnail': r're:https?://\w+\.thisvid\.com/(?:[^/]+/)+3533241/preview\.jpg',
+ 'uploader_id': '150629',
+ 'uploader': 'jeanslevisjeans',
+ 'display_id': 'sitting-on-ball-tight-jeans',
+ 'age_limit': 18,
+ }
+ }, {
+ 'url': 'https://thisvid.com/embed/3533241/',
+ 'md5': '839becb572995687e11a69dc4358a386',
+ 'info_dict': {
+ 'id': '3533241',
+ 'ext': 'mp4',
+ 'title': 'Sitting on ball tight jeans',
+ 'thumbnail': r're:https?://\w+\.thisvid\.com/(?:[^/]+/)+3533241/preview\.jpg',
+ 'uploader_id': '150629',
+ 'uploader': 'jeanslevisjeans',
+ 'display_id': 'sitting-on-ball-tight-jeans',
+ 'age_limit': 18,
+ }
+ }]
+
+ def _real_extract(self, url):
+ main_id, type_ = re.match(self._VALID_URL, url).group('id', 'type')
+ webpage = self._download_webpage(url, main_id)
+
+ title = self._html_search_regex(
+ r'<title\b[^>]*?>(?:Video:\s+)?(.+?)(?:\s+-\s+ThisVid(?:\.com| tube))?</title>',
+ webpage, 'title')
+
+ if type_ == 'embed':
+ # look for more metadata
+ video_alt_url = url_or_none(self._search_regex(
+ rf'''video_alt_url\s*:\s+'({self._VALID_URL}/)',''',
+ webpage, 'video_alt_url', default=None))
+ if video_alt_url and video_alt_url != url:
+ webpage = self._download_webpage(
+ video_alt_url, main_id,
+ note='Redirecting embed to main page', fatal=False) or webpage
+
+ video_holder = get_element_by_class('video-holder', webpage) or ''
+ if '>This video is a private video' in video_holder:
+ self.raise_login_required(
+ (clean_html(video_holder) or 'Private video').partition('\n')[0])
+
+ uploader = self._html_search_regex(
+ r'''(?s)<span\b[^>]*>Added by:\s*</span><a\b[^>]+\bclass\s*=\s*["']author\b[^>]+\bhref\s*=\s*["']https://thisvid\.com/members/([0-9]+/.{3,}?)\s*</a>''',
+ webpage, 'uploader', default='')
+ uploader = re.split(r'''/["'][^>]*>\s*''', uploader)
+ if len(uploader) == 2:
+ # id must be non-empty, uploader could be ''
+ uploader_id, uploader = uploader
+ uploader = uploader or None
+ else:
+ uploader_id = uploader = None
+
+ return self.url_result(
+ url, ie='Generic', url_transparent=True,
+ title=title,
+ age_limit=18,
+ uploader=uploader,
+ uploader_id=uploader_id)
+
+
+class ThisVidPlaylistBaseIE(InfoExtractor):
+ _PLAYLIST_URL_RE = None
+
+ @classmethod
+ def _find_urls(cls, html):
+ for m in re.finditer(rf'''<a\b[^>]+\bhref\s*=\s*["'](?P<url>{cls._PLAYLIST_URL_RE}\b)[^>]+>''', html):
+ yield m.group('url')
+
+ def _generate_playlist_entries(self, url, playlist_id, html=None):
+ page_url = url
+ for page in itertools.count(1):
+ if not html:
+ html = self._download_webpage(
+ page_url, playlist_id, note=f'Downloading page {page}',
+ fatal=False) or ''
+
+ yield from self._find_urls(html)
+
+ next_page = get_element_by_class('pagination-next', html) or ''
+ if next_page:
+ # member list page
+ next_page = urljoin(url, self._search_regex(
+ r'''<a\b[^>]+\bhref\s*=\s*("|')(?P<url>(?!#)(?:(?!\1).)+)''',
+ next_page, 'next page link', group='url', default=None))
+
+ # a member page may have a pagination-next element with an empty link, so check for None explicitly rather than using a bare else
+ if next_page is None:
+ # playlist page
+ parsed_url = urllib.parse.urlparse(page_url)
+ base_path, _, num = parsed_url.path.rpartition('/')
+ num = int_or_none(num)
+ if num is None:
+ base_path, num = parsed_url.path.rstrip('/'), 1
+ parsed_url = parsed_url._replace(path=f'{base_path}/{num + 1}')
+ next_page = urllib.parse.urlunparse(parsed_url)
+ if page_url == next_page:
+ next_page = None
+
+ if not next_page:
+ return
+ page_url, html = next_page, None
+
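
Note: for playlist pages (as opposed to member pages, which carry a pagination-next link), the paginator synthesizes the next URL by incrementing a trailing numeric path segment. A standalone sketch of that path arithmetic:

    import urllib.parse

    def next_numeric_page(page_url):
        parsed = urllib.parse.urlparse(page_url)
        base_path, _, num = parsed.path.rpartition('/')
        try:
            num = int(num)
        except ValueError:  # no trailing number yet: treat the current page as page 1
            base_path, num = parsed.path.rstrip('/'), 1
        return urllib.parse.urlunparse(parsed._replace(path=f'{base_path}/{num + 1}'))

    assert next_numeric_page('https://thisvid.com/playlist/6615/video/x/').endswith('/2')
    assert next_numeric_page('https://thisvid.com/playlist/6615/video/x/2').endswith('/3')
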
+ def _make_playlist_result(self, url):
+ playlist_id = self._match_id(url)
+ webpage = self._download_webpage(url, playlist_id)
+
+ title = re.split(
+ r'(?i)\s*\|\s*ThisVid\.com\s*$',
+ self._og_search_title(webpage, default=None)
+ or self._html_search_regex(r'(?s)<title\b[^>]*>(.+?)</title', webpage, 'title', fatal=False) or '', 1)[0] or None
+
+ return self.playlist_from_matches(
+ self._generate_playlist_entries(url, playlist_id, webpage),
+ playlist_id=playlist_id, playlist_title=title, ie=ThisVidIE)
+
+
+class ThisVidMemberIE(ThisVidPlaylistBaseIE):
+ _VALID_URL = r'https?://thisvid\.com/members/(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://thisvid.com/members/2140501/',
+ 'info_dict': {
+ 'id': '2140501',
+ 'title': 'Rafflesia\'s Profile',
+ },
+ 'playlist_mincount': 16,
+ }, {
+ 'url': 'https://thisvid.com/members/2140501/favourite_videos/',
+ 'info_dict': {
+ 'id': '2140501',
+ 'title': 'Rafflesia\'s Favourite Videos',
+ },
+ 'playlist_mincount': 15,
+ }, {
+ 'url': 'https://thisvid.com/members/636468/public_videos/',
+ 'info_dict': {
+ 'id': '636468',
+ 'title': 'Happymouth\'s Public Videos',
+ },
+ 'playlist_mincount': 196,
+ }]
+ _PLAYLIST_URL_RE = ThisVidIE._VALID_URL
+
+ def _real_extract(self, url):
+ return self._make_playlist_result(url)
+
+
+class ThisVidPlaylistIE(ThisVidPlaylistBaseIE):
+ _VALID_URL = r'https?://thisvid\.com/playlist/(?P<id>\d+)/video/(?P<video_id>[A-Za-z0-9-]+)'
+ _TESTS = [{
+ 'url': 'https://thisvid.com/playlist/6615/video/big-italian-booty-28/',
+ 'info_dict': {
+ 'id': '6615',
+ 'title': 'Underwear Stuff',
+ },
+ 'playlist_mincount': 200,
+ }, {
+ 'url': 'https://thisvid.com/playlist/6615/video/big-italian-booty-28/',
+ 'info_dict': {
+ 'id': '1072387',
+ 'ext': 'mp4',
+ 'title': 'Big Italian Booty 28',
+ 'description': 'md5:1bccf7b13765e18fb27bf764dba7ede2',
+ 'uploader_id': '367912',
+ 'uploader': 'Jcmusclefun',
+ 'age_limit': 18,
+ 'display_id': 'big-italian-booty-28',
+ 'thumbnail': r're:https?://\w+\.thisvid\.com/(?:[^/]+/)+1072387/preview\.jpg',
+ },
+ 'params': {
+ 'noplaylist': True,
+ },
+ }]
+ _PLAYLIST_URL_RE = _VALID_URL
+
+ def _generate_playlist_entries(self, url, playlist_id, html=None):
+ for wrapped_url in super()._generate_playlist_entries(url, playlist_id, html):
+ video_id = re.match(self._VALID_URL, wrapped_url).group('video_id')
+ yield urljoin(url, f'/videos/{video_id}/')
+
+ def _real_extract(self, url):
+ playlist_id, video_id = self._match_valid_url(url).group('id', 'video_id')
+
+ if not self._yes_playlist(playlist_id, video_id):
+ redirect_url = urljoin(url, f'/videos/{video_id}/')
+ return self.url_result(redirect_url, ThisVidIE)
+
+ result = self._make_playlist_result(url)
+
+ # Fix duplicated title (`the title - the title` => `the title`)
+ title = result['title']
+ t_len = len(title)
+ if t_len > 5 and t_len % 2 != 0:
+ t_len = t_len // 2
+ if title[t_len] == '-':
+ first, second = map(str.strip, (title[:t_len], title[t_len + 1:]))
+ if first and first == second:
+ result['title'] = first
+
+ return result
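
Note: the duplicated-title fix only fires when the title has odd length and a '-' sits exactly in the middle with identical halves, so ordinary titles containing a dash pass through untouched:

    def dedupe_title(title):
        t_len = len(title)
        if t_len > 5 and t_len % 2 != 0:
            t_len //= 2
            if title[t_len] == '-':
                first, second = map(str.strip, (title[:t_len], title[t_len + 1:]))
                if first and first == second:
                    return first
        return title

    assert dedupe_title('Underwear Stuff - Underwear Stuff') == 'Underwear Stuff'
    assert dedupe_title('Big Italian Booty - 28') == 'Big Italian Booty - 28'
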
diff --git a/hypervideo_dl/extractor/threeqsdn.py b/hypervideo_dl/extractor/threeqsdn.py
index b104190..7841f8d 100644
--- a/hypervideo_dl/extractor/threeqsdn.py
+++ b/hypervideo_dl/extractor/threeqsdn.py
@@ -1,5 +1,5 @@
from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..networking.exceptions import HTTPError
from ..utils import (
determine_ext,
ExtractorError,
@@ -90,7 +90,7 @@ class ThreeQSDNIE(InfoExtractor):
config = self._download_json(
url.replace('://playout.3qsdn.com/', '://playout.3qsdn.com/config/'), video_id)
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 401:
self.raise_geo_restricted()
raise
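
Note: part of the same networking migration seen elsewhere in this patch; the mapping is mechanical:

    # old (urllib-based)                          new (networking framework)
    #   from ..compat import compat_HTTPError  ->  from ..networking.exceptions import HTTPError
    #   isinstance(e.cause, compat_HTTPError)  ->  isinstance(e.cause, HTTPError)
    #   e.cause.code                           ->  e.cause.status
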
diff --git a/hypervideo_dl/extractor/tiktok.py b/hypervideo_dl/extractor/tiktok.py
index 1bbf884..f14c4f9 100644
--- a/hypervideo_dl/extractor/tiktok.py
+++ b/hypervideo_dl/extractor/tiktok.py
@@ -1,25 +1,31 @@
import itertools
import json
import random
+import re
import string
import time
from .common import InfoExtractor
from ..compat import compat_urllib_parse_unquote, compat_urllib_parse_urlparse
+from ..networking import HEADRequest
from ..utils import (
ExtractorError,
- HEADRequest,
LazyList,
UnsupportedError,
+ UserNotLive,
+ determine_ext,
+ format_field,
get_element_by_id,
get_first,
int_or_none,
join_nonempty,
+ merge_dicts,
qualities,
remove_start,
srt_subtitles_timecode,
str_or_none,
traverse_obj,
+ try_call,
try_get,
url_or_none,
)
@@ -30,11 +36,15 @@ class TikTokBaseIE(InfoExtractor):
_WORKING_APP_VERSION = None
_APP_NAME = 'trill'
_AID = 1180
- _API_HOSTNAME = 'api-h2.tiktokv.com'
_UPLOADER_URL_FORMAT = 'https://www.tiktok.com/@%s'
_WEBPAGE_HOST = 'https://www.tiktok.com/'
QUALITIES = ('360p', '540p', '720p', '1080p')
+ @property
+ def _API_HOSTNAME(self):
+ return self._configuration_arg(
+ 'api_hostname', ['api16-normal-c-useast1a.tiktokv.com'], ie_key=TikTokIE)[0]
+
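
Note: `_API_HOSTNAME` becomes a property backed by `_configuration_arg`, so the app-API hostname can be overridden per run. Assuming yt-dlp-style extractor-args syntax carries over to this fork:

    # CLI override (the hostname value is an example, not a recommendation):
    #   hypervideo --extractor-args "tiktok:api_hostname=api22-normal-c-useast2a.tiktokv.com" URL
    # Without the arg, the default 'api16-normal-c-useast1a.tiktokv.com' is used.
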
@staticmethod
def _create_url(user_id, video_id):
return f'https://www.tiktok.com/@{user_id or "_"}/video/{video_id}'
@@ -45,14 +55,14 @@ class TikTokBaseIE(InfoExtractor):
def _call_api_impl(self, ep, query, manifest_app_version, video_id, fatal=True,
note='Downloading API JSON', errnote='Unable to download API page'):
- self._set_cookie(self._API_HOSTNAME, 'odin_tt', ''.join(random.choice('0123456789abcdef') for _ in range(160)))
+ self._set_cookie(self._API_HOSTNAME, 'odin_tt', ''.join(random.choices('0123456789abcdef', k=160)))
webpage_cookies = self._get_cookies(self._WEBPAGE_HOST)
if webpage_cookies.get('sid_tt'):
self._set_cookie(self._API_HOSTNAME, 'sid_tt', webpage_cookies['sid_tt'].value)
return self._download_json(
'https://%s/aweme/v1/%s/' % (self._API_HOSTNAME, ep), video_id=video_id,
fatal=fatal, note=note, errnote=errnote, headers={
- 'User-Agent': f'com.ss.android.ugc.{self._APP_NAME}/{manifest_app_version} (Linux; U; Android 10; en_US; Pixel 4; Build/QQ3A.200805.001; Cronet/58.0.2991.0)',
+ 'User-Agent': f'com.ss.android.ugc.{self._APP_NAME}/{manifest_app_version} (Linux; U; Android 13; en_US; Pixel 7; Build/TD1A.220804.031; Cronet/58.0.2991.0)',
'Accept': 'application/json',
}, query=query)
@@ -64,16 +74,16 @@ class TikTokBaseIE(InfoExtractor):
'build_number': app_version,
'manifest_version_code': manifest_app_version,
'update_version_code': manifest_app_version,
- 'openudid': ''.join(random.choice('0123456789abcdef') for _ in range(16)),
- 'uuid': ''.join([random.choice(string.digits) for _ in range(16)]),
+ 'openudid': ''.join(random.choices('0123456789abcdef', k=16)),
+ 'uuid': ''.join(random.choices(string.digits, k=16)),
'_rticket': int(time.time() * 1000),
'ts': int(time.time()),
'device_brand': 'Google',
- 'device_type': 'Pixel 4',
+ 'device_type': 'Pixel 7',
'device_platform': 'android',
- 'resolution': '1080*1920',
+ 'resolution': '1080*2400',
'dpi': 420,
- 'os_version': '10',
+ 'os_version': '13',
'os_api': '29',
'carrier_region': 'US',
'sys_region': 'US',
@@ -195,11 +205,22 @@ class TikTokBaseIE(InfoExtractor):
known_resolutions = {}
+ def audio_meta(url):
+ ext = determine_ext(url, default_ext='m4a')
+ return {
+ 'format_note': 'Music track',
+ 'ext': ext,
+ 'acodec': 'aac' if ext == 'm4a' else ext,
+ 'vcodec': 'none',
+ 'width': None,
+ 'height': None,
+ } if ext == 'mp3' or '-music-' in url else {}
+
def extract_addr(addr, add_meta={}):
parsed_meta, res = parse_url_key(addr.get('url_key', ''))
if res:
- known_resolutions.setdefault(res, {}).setdefault('height', add_meta.get('height'))
- known_resolutions[res].setdefault('width', add_meta.get('width'))
+ known_resolutions.setdefault(res, {}).setdefault('height', add_meta.get('height') or addr.get('height'))
+ known_resolutions[res].setdefault('width', add_meta.get('width') or addr.get('width'))
parsed_meta.update(known_resolutions.get(res, {}))
add_meta.setdefault('height', int_or_none(res[:-1]))
return [{
@@ -210,7 +231,8 @@ class TikTokBaseIE(InfoExtractor):
'source_preference': -2 if 'aweme/v1' in url else -1, # Downloads from API might get blocked
**add_meta, **parsed_meta,
'format_note': join_nonempty(
- add_meta.get('format_note'), '(API)' if 'aweme/v1' in url else None, delim=' ')
+ add_meta.get('format_note'), '(API)' if 'aweme/v1' in url else None, delim=' '),
+ **audio_meta(url),
} for url in addr.get('url_list') or []]
# Hack: Add direct video links first to prioritize them when removing duplicate formats
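
Note: `audio_meta` tags slideshow audio URLs (an explicit mp3 extension, or a '-music-' path component) as audio-only so they are not ranked as video formats. The predicate in isolation, over made-up URLs:

    from hypervideo_dl.utils import determine_ext

    def is_music_url(url):
        ext = determine_ext(url, default_ext='m4a')
        return ext == 'mp3' or '-music-' in url

    assert is_music_url('https://example.com/obj/abc-music-def.m4a')
    assert is_music_url('https://example.com/track.mp3')
    assert not is_music_url('https://example.com/play/video.mp4')
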
@@ -266,21 +288,19 @@ class TikTokBaseIE(InfoExtractor):
thumbnails = []
for cover_id in ('cover', 'ai_dynamic_cover', 'animated_cover', 'ai_dynamic_cover_bak',
'origin_cover', 'dynamic_cover'):
- cover = video_info.get(cover_id)
- if cover:
- for cover_url in cover['url_list']:
- thumbnails.append({
- 'id': cover_id,
- 'url': cover_url,
- })
-
- stats_info = aweme_detail.get('statistics', {})
- author_info = aweme_detail.get('author', {})
- music_info = aweme_detail.get('music', {})
+ for cover_url in traverse_obj(video_info, (cover_id, 'url_list', ...)):
+ thumbnails.append({
+ 'id': cover_id,
+ 'url': cover_url,
+ })
+
+ stats_info = aweme_detail.get('statistics') or {}
+ author_info = aweme_detail.get('author') or {}
+ music_info = aweme_detail.get('music') or {}
user_url = self._UPLOADER_URL_FORMAT % (traverse_obj(author_info,
'sec_uid', 'id', 'uid', 'unique_id',
expected_type=str_or_none, get_all=False))
- labels = traverse_obj(aweme_detail, ('hybrid_label', ..., 'text'), expected_type=str, default=[])
+ labels = traverse_obj(aweme_detail, ('hybrid_label', ..., 'text'), expected_type=str)
contained_music_track = traverse_obj(
music_info, ('matched_song', 'title'), ('matched_pgc_sound', 'title'), expected_type=str)
@@ -298,20 +318,27 @@ class TikTokBaseIE(InfoExtractor):
'extractor_key': TikTokIE.ie_key(),
'extractor': TikTokIE.IE_NAME,
'webpage_url': self._create_url(author_info.get('uid'), aweme_id),
- 'title': aweme_detail.get('desc'),
- 'description': aweme_detail.get('desc'),
- 'view_count': int_or_none(stats_info.get('play_count')),
- 'like_count': int_or_none(stats_info.get('digg_count')),
- 'repost_count': int_or_none(stats_info.get('share_count')),
- 'comment_count': int_or_none(stats_info.get('comment_count')),
- 'uploader': str_or_none(author_info.get('unique_id')),
- 'creator': str_or_none(author_info.get('nickname')),
- 'uploader_id': str_or_none(author_info.get('uid')),
+ **traverse_obj(aweme_detail, {
+ 'title': ('desc', {str}),
+ 'description': ('desc', {str}),
+ 'timestamp': ('create_time', {int_or_none}),
+ }),
+ **traverse_obj(stats_info, {
+ 'view_count': 'play_count',
+ 'like_count': 'digg_count',
+ 'repost_count': 'share_count',
+ 'comment_count': 'comment_count',
+ }, expected_type=int_or_none),
+ **traverse_obj(author_info, {
+ 'uploader': 'unique_id',
+ 'uploader_id': 'uid',
+ 'creator': 'nickname',
+ 'channel_id': 'sec_uid',
+ }, expected_type=str_or_none),
'uploader_url': user_url,
'track': music_track,
'album': str_or_none(music_info.get('album')) or None,
'artist': music_author or None,
- 'timestamp': int_or_none(aweme_detail.get('create_time')),
'formats': formats,
'subtitles': self.extract_subtitles(aweme_detail, aweme_id),
'thumbnails': thumbnails,
@@ -323,37 +350,27 @@ class TikTokBaseIE(InfoExtractor):
'_format_sort_fields': ('quality', 'codec', 'size', 'br'),
}
- def _parse_aweme_video_web(self, aweme_detail, webpage_url):
+ def _parse_aweme_video_web(self, aweme_detail, webpage_url, video_id):
video_info = aweme_detail['video']
author_info = traverse_obj(aweme_detail, 'authorInfo', 'author', expected_type=dict, default={})
music_info = aweme_detail.get('music') or {}
stats_info = aweme_detail.get('stats') or {}
- user_url = self._UPLOADER_URL_FORMAT % (traverse_obj(author_info,
- 'secUid', 'id', 'uid', 'uniqueId',
- expected_type=str_or_none, get_all=False)
- or aweme_detail.get('authorSecId'))
+ channel_id = traverse_obj(author_info or aweme_detail, (('authorSecId', 'secUid'), {str}), get_all=False)
+ user_url = self._UPLOADER_URL_FORMAT % channel_id if channel_id else None
formats = []
- play_url = video_info.get('playAddr')
- width = video_info.get('width')
- height = video_info.get('height')
- if isinstance(play_url, str):
- formats = [{
+ width = int_or_none(video_info.get('width'))
+ height = int_or_none(video_info.get('height'))
+
+ for play_url in traverse_obj(video_info, ('playAddr', ((..., 'src'), None), {url_or_none})):
+ formats.append({
'url': self._proto_relative_url(play_url),
'ext': 'mp4',
'width': width,
'height': height,
- }]
- elif isinstance(play_url, list):
- formats = [{
- 'url': self._proto_relative_url(url),
- 'ext': 'mp4',
- 'width': width,
- 'height': height,
- } for url in traverse_obj(play_url, (..., 'src'), expected_type=url_or_none, default=[]) if url]
+ })
- download_url = url_or_none(video_info.get('downloadAddr')) or traverse_obj(video_info, ('download', 'url'), expected_type=url_or_none)
- if download_url:
+ for download_url in traverse_obj(video_info, (('downloadAddr', ('download', 'url')), {url_or_none})):
formats.append({
'format_id': 'download',
'url': self._proto_relative_url(download_url),
@@ -361,44 +378,54 @@ class TikTokBaseIE(InfoExtractor):
'width': width,
'height': height,
})
+
self._remove_duplicate_formats(formats)
thumbnails = []
- for thumbnail_name in ('thumbnail', 'cover', 'dynamicCover', 'originCover'):
- if aweme_detail.get(thumbnail_name):
- thumbnails = [{
- 'url': self._proto_relative_url(aweme_detail[thumbnail_name]),
- 'width': width,
- 'height': height
- }]
+ for thumb_url in traverse_obj(aweme_detail, (
+ (None, 'video'), ('thumbnail', 'cover', 'dynamicCover', 'originCover'), {url_or_none})):
+ thumbnails.append({
+ 'url': self._proto_relative_url(thumb_url),
+ 'width': width,
+ 'height': height,
+ })
return {
- 'id': traverse_obj(aweme_detail, 'id', 'awemeId', expected_type=str_or_none),
- 'title': aweme_detail.get('desc'),
- 'duration': try_get(aweme_detail, lambda x: x['video']['duration'], int),
- 'view_count': int_or_none(stats_info.get('playCount')),
- 'like_count': int_or_none(stats_info.get('diggCount')),
- 'repost_count': int_or_none(stats_info.get('shareCount')),
- 'comment_count': int_or_none(stats_info.get('commentCount')),
- 'timestamp': int_or_none(aweme_detail.get('createTime')),
- 'creator': str_or_none(author_info.get('nickname')),
- 'uploader': str_or_none(author_info.get('uniqueId') or aweme_detail.get('author')),
- 'uploader_id': str_or_none(traverse_obj(author_info, 'id', 'uid', 'authorId')),
+ 'id': video_id,
+ **traverse_obj(aweme_detail, {
+ 'title': ('desc', {str}),
+ 'description': ('desc', {str}),
+ 'duration': ('video', 'duration', {int_or_none}),
+ 'timestamp': ('createTime', {int_or_none}),
+ }),
+ **traverse_obj(author_info or aweme_detail, {
+ 'creator': ('nickname', {str}),
+ 'uploader': (('uniqueId', 'author'), {str}),
+ 'uploader_id': (('authorId', 'uid', 'id'), {str_or_none}),
+ }, get_all=False),
+ **traverse_obj(stats_info, {
+ 'view_count': 'playCount',
+ 'like_count': 'diggCount',
+ 'repost_count': 'shareCount',
+ 'comment_count': 'commentCount',
+ }, expected_type=int_or_none),
+ **traverse_obj(music_info, {
+ 'track': 'title',
+ 'album': ('album', {lambda x: x or None}),
+ 'artist': 'authorName',
+ }, expected_type=str),
+ 'channel_id': channel_id,
'uploader_url': user_url,
- 'track': str_or_none(music_info.get('title')),
- 'album': str_or_none(music_info.get('album')) or None,
- 'artist': str_or_none(music_info.get('authorName')),
'formats': formats,
'thumbnails': thumbnails,
- 'description': str_or_none(aweme_detail.get('desc')),
'http_headers': {
- 'Referer': webpage_url
+ 'Referer': webpage_url,
}
}
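
Note: the rewritten parsers lean on `traverse_obj` dict templates: each output key maps to a path (or a tuple of fallback paths) into the source dict, `expected_type` coerces values, and keys that resolve to None are dropped. Minimal illustration:

    from hypervideo_dl.utils import int_or_none, traverse_obj

    stats_info = {'playCount': '123', 'diggCount': 45, 'shareCount': None}
    info = traverse_obj(stats_info, {
        'view_count': 'playCount',
        'like_count': 'diggCount',
        'repost_count': 'shareCount',
    }, expected_type=int_or_none)
    assert info == {'view_count': 123, 'like_count': 45}
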
class TikTokIE(TikTokBaseIE):
- _VALID_URL = r'https?://www\.tiktok\.com/(?:embed|@(?P<user_id>[\w\.-]+)/video)/(?P<id>\d+)'
+ _VALID_URL = r'https?://www\.tiktok\.com/(?:embed|@(?P<user_id>[\w\.-]+)?/video)/(?P<id>\d+)'
_EMBED_REGEX = [rf'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>{_VALID_URL})']
_TESTS = [{
@@ -426,7 +453,8 @@ class TikTokIE(TikTokBaseIE):
'artist': 'Ysrbeats',
'album': 'Lehanga',
'track': 'Lehanga',
- }
+ },
+ 'skip': '404 Not Found',
}, {
'url': 'https://www.tiktok.com/@patroxofficial/video/6742501081818877190?langCountry=en',
'md5': '6f3cf8cdd9b28cb8363fe0a9a160695b',
@@ -441,6 +469,7 @@ class TikTokIE(TikTokBaseIE):
'uploader': 'patrox',
'uploader_id': '18702747',
'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAiFnldaILebi5heDoVU6bn4jBWWycX6-9U3xuNPqZ8Ws',
+ 'channel_id': 'MS4wLjABAAAAiFnldaILebi5heDoVU6bn4jBWWycX6-9U3xuNPqZ8Ws',
'creator': 'patroX',
'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
'upload_date': '20190930',
@@ -451,7 +480,7 @@ class TikTokIE(TikTokBaseIE):
'comment_count': int,
'artist': 'Evan Todd, Jessica Keenan Wynn, Alice Lee, Barrett Wilbert Weed & Jon Eidson',
'track': 'Big Fun',
- }
+ },
}, {
# Banned audio, only available on the app
'url': 'https://www.tiktok.com/@barudakhb_/video/6984138651336838402',
@@ -464,6 +493,7 @@ class TikTokIE(TikTokBaseIE):
'creator': 'md5:29f238c49bc0c176cb3cef1a9cea9fa6',
'uploader_id': '6974687867511718913',
'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAbhBwQC-R1iKoix6jDFsF-vBdfx2ABoDjaZrM9fX6arU3w71q3cOWgWuTXn1soZ7d',
+ 'channel_id': 'MS4wLjABAAAAbhBwQC-R1iKoix6jDFsF-vBdfx2ABoDjaZrM9fX6arU3w71q3cOWgWuTXn1soZ7d',
'track': 'Boka Dance',
'artist': 'md5:29f238c49bc0c176cb3cef1a9cea9fa6',
'timestamp': 1626121503,
@@ -474,7 +504,7 @@ class TikTokIE(TikTokBaseIE):
'like_count': int,
'repost_count': int,
'comment_count': int,
- }
+ },
}, {
# Sponsored video, only available with feed workaround
'url': 'https://www.tiktok.com/@MS4wLjABAAAATh8Vewkn0LYM7Fo03iec3qKdeCUOcBIouRk1mkiag6h3o_pQu_dUXvZ2EZlGST7_/video/7042692929109986561',
@@ -487,6 +517,7 @@ class TikTokIE(TikTokBaseIE):
'creator': 'Slap And Run',
'uploader_id': '7036055384943690754',
'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAATh8Vewkn0LYM7Fo03iec3qKdeCUOcBIouRk1mkiag6h3o_pQu_dUXvZ2EZlGST7_',
+ 'channel_id': 'MS4wLjABAAAATh8Vewkn0LYM7Fo03iec3qKdeCUOcBIouRk1mkiag6h3o_pQu_dUXvZ2EZlGST7_',
'track': 'Promoted Music',
'timestamp': 1639754738,
'duration': 30,
@@ -497,7 +528,7 @@ class TikTokIE(TikTokBaseIE):
'repost_count': int,
'comment_count': int,
},
- 'expected_warnings': ['trying with webpage', 'Unable to find video in feed']
+ 'params': {'skip_download': True}, # XXX: unable to download video data: HTTP Error 403: Forbidden
}, {
# Video without title and description
'url': 'https://www.tiktok.com/@pokemonlife22/video/7059698374567611694',
@@ -510,6 +541,7 @@ class TikTokIE(TikTokBaseIE):
'creator': 'Pokemon',
'uploader_id': '6820838815978423302',
'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAA0tF1nBwQVVMyrGu3CqttkNgM68Do1OXUFuCY0CRQk8fEtSVDj89HqoqvbSTmUP2W',
+ 'channel_id': 'MS4wLjABAAAA0tF1nBwQVVMyrGu3CqttkNgM68Do1OXUFuCY0CRQk8fEtSVDj89HqoqvbSTmUP2W',
'track': 'original sound',
'timestamp': 1643714123,
'duration': 6,
@@ -545,6 +577,107 @@ class TikTokIE(TikTokBaseIE):
},
'skip': 'This video is unavailable',
}, {
+ # slideshow audio-only mp3 format
+ 'url': 'https://www.tiktok.com/@_le_cannibale_/video/7139980461132074283',
+ 'info_dict': {
+ 'id': '7139980461132074283',
+ 'ext': 'mp3',
+ 'title': 'TikTok video #7139980461132074283',
+ 'description': '',
+ 'creator': 'Antaura',
+ 'uploader': '_le_cannibale_',
+ 'uploader_id': '6604511138619654149',
+ 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAoShJqaw_5gvy48y3azFeFcT4jeyKWbB0VVYasOCt2tTLwjNFIaDcHAM4D-QGXFOP',
+ 'channel_id': 'MS4wLjABAAAAoShJqaw_5gvy48y3azFeFcT4jeyKWbB0VVYasOCt2tTLwjNFIaDcHAM4D-QGXFOP',
+ 'artist': 'nathan !',
+ 'track': 'grahamscott canon',
+ 'upload_date': '20220905',
+ 'timestamp': 1662406249,
+ 'view_count': int,
+ 'like_count': int,
+ 'repost_count': int,
+ 'comment_count': int,
+ 'thumbnail': r're:^https://.+\.webp',
+ },
+ }, {
+ # only available via web
+ 'url': 'https://www.tiktok.com/@moxypatch/video/7206382937372134662',
+ 'md5': '6aba7fad816e8709ff2c149679ace165',
+ 'info_dict': {
+ 'id': '7206382937372134662',
+ 'ext': 'mp4',
+ 'title': 'md5:1d95c0b96560ca0e8a231af4172b2c0a',
+ 'description': 'md5:1d95c0b96560ca0e8a231af4172b2c0a',
+ 'creator': 'MoxyPatch',
+ 'uploader': 'moxypatch',
+ 'uploader_id': '7039142049363379205',
+ 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAFhqKnngMHJSsifL0w1vFOP5kn3Ndo1ODp0XuIBkNMBCkALTvwILdpu12g3pTtL4V',
+ 'channel_id': 'MS4wLjABAAAAFhqKnngMHJSsifL0w1vFOP5kn3Ndo1ODp0XuIBkNMBCkALTvwILdpu12g3pTtL4V',
+ 'artist': 'your worst nightmare',
+ 'track': 'original sound',
+ 'upload_date': '20230303',
+ 'timestamp': 1677866781,
+ 'duration': 10,
+ 'view_count': int,
+ 'like_count': int,
+ 'repost_count': int,
+ 'comment_count': int,
+ 'thumbnail': r're:^https://.+',
+ 'thumbnails': 'count:3',
+ },
+ 'expected_warnings': ['Unable to find video in feed'],
+ }, {
+ # 1080p format
+ 'url': 'https://www.tiktok.com/@tatemcrae/video/7107337212743830830',
+ 'md5': '982512017a8a917124d5a08c8ae79621',
+ 'info_dict': {
+ 'id': '7107337212743830830',
+ 'ext': 'mp4',
+ 'title': 'new music video 4 don’t come backkkk🧸🖤 i hope u enjoy !! @musicontiktok',
+ 'description': 'new music video 4 don’t come backkkk🧸🖤 i hope u enjoy !! @musicontiktok',
+ 'uploader': 'tatemcrae',
+ 'uploader_id': '86328792343818240',
+ 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAA-0bQT0CqebTRr6I4IkYvMDMKSRSJHLNPBo5HrSklJwyA2psXLSZG5FP-LMNpHnJd',
+ 'channel_id': 'MS4wLjABAAAA-0bQT0CqebTRr6I4IkYvMDMKSRSJHLNPBo5HrSklJwyA2psXLSZG5FP-LMNpHnJd',
+ 'creator': 'tate mcrae',
+ 'artist': 'tate mcrae',
+ 'track': 'original sound',
+ 'upload_date': '20220609',
+ 'timestamp': 1654805899,
+ 'duration': 150,
+ 'view_count': int,
+ 'like_count': int,
+ 'repost_count': int,
+ 'comment_count': int,
+ 'thumbnail': r're:^https://.+\.webp',
+ },
+ 'params': {'format': 'bytevc1_1080p_808907-0'},
+ }, {
+ # Slideshow, audio-only m4a format
+ 'url': 'https://www.tiktok.com/@hara_yoimiya/video/7253412088251534594',
+ 'md5': '2ff8fe0174db2dbf49c597a7bef4e47d',
+ 'info_dict': {
+ 'id': '7253412088251534594',
+ 'ext': 'm4a',
+ 'title': 'я ред флаг простите #переписка #щитпост #тревожныйтиппривязанности #рекомендации ',
+ 'description': 'я ред флаг простите #переписка #щитпост #тревожныйтиппривязанности #рекомендации ',
+ 'uploader': 'hara_yoimiya',
+ 'uploader_id': '6582536342634676230',
+ 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAIAlDxriiPWLE-p8p1R_0Bx8qWKfi-7zwmGhzU8Mv25W8sNxjfIKrol31qTczzuLB',
+ 'channel_id': 'MS4wLjABAAAAIAlDxriiPWLE-p8p1R_0Bx8qWKfi-7zwmGhzU8Mv25W8sNxjfIKrol31qTczzuLB',
+ 'creator': 'лампочка',
+ 'artist': 'Øneheart',
+ 'album': 'watching the stars',
+ 'track': 'watching the stars',
+ 'upload_date': '20230708',
+ 'timestamp': 1688816612,
+ 'view_count': int,
+ 'like_count': int,
+ 'comment_count': int,
+ 'repost_count': int,
+ 'thumbnail': r're:^https://.+\.webp',
+ },
+ }, {
# Auto-captions available
'url': 'https://www.tiktok.com/@hankgreen1/video/7047596209028074758',
'only_matching': True
@@ -558,7 +691,7 @@ class TikTokIE(TikTokBaseIE):
self.report_warning(f'{e}; trying with webpage')
url = self._create_url(user_id, video_id)
- webpage = self._download_webpage(url, video_id, headers={'User-Agent': 'User-Agent:Mozilla/5.0'})
+ webpage = self._download_webpage(url, video_id, headers={'User-Agent': 'Mozilla/5.0'})
next_data = self._search_nextjs_data(webpage, video_id, default='{}')
if next_data:
status = traverse_obj(next_data, ('props', 'pageProps', 'statusCode'), expected_type=int) or 0
@@ -569,7 +702,7 @@ class TikTokIE(TikTokBaseIE):
video_data = traverse_obj(sigi_data, ('ItemModule', video_id), expected_type=dict)
if status == 0:
- return self._parse_aweme_video_web(video_data, url)
+ return self._parse_aweme_video_web(video_data, url, video_id)
elif status == 10216:
raise ExtractorError('This video is private', expected=True)
raise ExtractorError('Video not available', video_id=video_id)
@@ -634,7 +767,7 @@ class TikTokUserIE(TikTokBaseIE):
'max_cursor': 0,
'min_cursor': 0,
'retry_type': 'no_retry',
- 'device_id': ''.join(random.choice(string.digits) for _ in range(19)), # Some endpoints don't like randomized device_id, so it isn't directly set in _call_api.
+ 'device_id': ''.join(random.choices(string.digits, k=19)), # Some endpoints don't like randomized device_id, so it isn't directly set in _call_api.
}
for page in itertools.count(1):
@@ -682,7 +815,7 @@ class TikTokBaseListIE(TikTokBaseIE): # XXX: Conventionally, base classes shoul
'cursor': 0,
'count': 20,
'type': 5,
- 'device_id': ''.join(random.choice(string.digits) for i in range(19))
+ 'device_id': ''.join(random.choices(string.digits, k=19))
}
for page in itertools.count(1):
@@ -796,6 +929,7 @@ class DouyinIE(TikTokBaseIE):
'description': '#杨超越 小小水手带你去远航❤️',
'uploader_id': '110403406559',
'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
+ 'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
'creator': '杨超越',
'duration': 19782,
'timestamp': 1620905839,
@@ -805,6 +939,7 @@ class DouyinIE(TikTokBaseIE):
'like_count': int,
'repost_count': int,
'comment_count': int,
+ 'thumbnail': r're:https?://.+\.jpe?g',
},
}, {
'url': 'https://www.douyin.com/video/6982497745948921092',
@@ -816,8 +951,9 @@ class DouyinIE(TikTokBaseIE):
'description': '这个夏日和小羊@杨超越 一起遇见白色幻想',
'uploader_id': '408654318141572',
'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAZJpnglcjW2f_CMVcnqA_6oVBXKWMpH0F8LIHuUu8-lA',
+ 'channel_id': 'MS4wLjABAAAAZJpnglcjW2f_CMVcnqA_6oVBXKWMpH0F8LIHuUu8-lA',
'creator': '杨超越工作室',
- 'duration': 42608,
+ 'duration': 42479,
'timestamp': 1625739481,
'upload_date': '20210708',
'track': '@杨超越工作室创作的原声',
@@ -825,6 +961,7 @@ class DouyinIE(TikTokBaseIE):
'like_count': int,
'repost_count': int,
'comment_count': int,
+ 'thumbnail': r're:https?://.+\.jpe?g',
},
}, {
'url': 'https://www.douyin.com/video/6953975910773099811',
@@ -836,8 +973,9 @@ class DouyinIE(TikTokBaseIE):
'description': '#一起看海 出现在你的夏日里',
'uploader_id': '110403406559',
'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
+ 'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
'creator': '杨超越',
- 'duration': 17228,
+ 'duration': 17343,
'timestamp': 1619098692,
'upload_date': '20210422',
'track': '@杨超越创作的原声',
@@ -845,6 +983,7 @@ class DouyinIE(TikTokBaseIE):
'like_count': int,
'repost_count': int,
'comment_count': int,
+ 'thumbnail': r're:https?://.+\.jpe?g',
},
}, {
'url': 'https://www.douyin.com/video/6950251282489675042',
@@ -873,6 +1012,7 @@ class DouyinIE(TikTokBaseIE):
'description': '#哪个爱豆的105度最甜 换个角度看看我哈哈',
'uploader_id': '110403406559',
'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
+ 'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
'creator': '杨超越',
'duration': 15115,
'timestamp': 1621261163,
@@ -882,6 +1022,7 @@ class DouyinIE(TikTokBaseIE):
'like_count': int,
'repost_count': int,
'comment_count': int,
+ 'thumbnail': r're:https?://.+\.jpe?g',
},
}]
_APP_VERSIONS = [('23.3.0', '230300')]
@@ -901,19 +1042,17 @@ class DouyinIE(TikTokBaseIE):
self.to_screen(f'{e}; trying with webpage')
webpage = self._download_webpage(url, video_id)
- render_data_json = self._search_regex(
- r'<script [^>]*\bid=[\'"]RENDER_DATA[\'"][^>]*>(%7B.+%7D)</script>',
- webpage, 'render data', default=None)
- if not render_data_json:
+ render_data = self._search_json(
+ r'<script [^>]*\bid=[\'"]RENDER_DATA[\'"][^>]*>', webpage, 'render data', video_id,
+ contains_pattern=r'%7B(?s:.+)%7D', fatal=False, transform_source=compat_urllib_parse_unquote)
+ if not render_data:
# TODO: Run verification challenge code to generate signature cookies
cookies = self._get_cookies(self._WEBPAGE_HOST)
expected = not cookies.get('s_v_web_id') or not cookies.get('ttwid')
raise ExtractorError(
'Fresh cookies (not necessarily logged in) are needed', expected=expected)
- render_data = self._parse_json(
- render_data_json, video_id, transform_source=compat_urllib_parse_unquote)
- return self._parse_aweme_video_web(get_first(render_data, ('aweme', 'detail')), url)
+ return self._parse_aweme_video_web(get_first(render_data, ('aweme', 'detail')), url, video_id)
class TikTokVMIE(InfoExtractor):
@@ -944,8 +1083,27 @@ class TikTokVMIE(InfoExtractor):
'creator': 'SigmaChad',
},
}, {
- 'url': 'https://vm.tiktok.com/ZSe4FqkKd',
- 'only_matching': True,
+ 'url': 'https://vm.tiktok.com/ZTR45GpSF/',
+ 'info_dict': {
+ 'id': '7106798200794926362',
+ 'ext': 'mp4',
+ 'title': 'md5:edc3e7ea587847f8537468f2fe51d074',
+ 'uploader_id': '6997695878846268418',
+ 'upload_date': '20220608',
+ 'view_count': int,
+ 'like_count': int,
+ 'comment_count': int,
+ 'thumbnail': r're:https://.+\.webp.*',
+ 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAdZ_NcPPgMneaGrW0hN8O_J_bwLshwNNERRF5DxOw2HKIzk0kdlLrR8RkVl1ksrMO',
+ 'duration': 29,
+ 'timestamp': 1654680400,
+ 'repost_count': int,
+ 'artist': 'Akihitoko',
+ 'track': 'original sound',
+ 'description': 'md5:edc3e7ea587847f8537468f2fe51d074',
+ 'uploader': 'akihitoko1',
+ 'creator': 'Akihitoko',
+ },
}, {
'url': 'https://vt.tiktok.com/ZSe4FqkKd',
'only_matching': True,
@@ -953,7 +1111,179 @@ class TikTokVMIE(InfoExtractor):
def _real_extract(self, url):
new_url = self._request_webpage(
- HEADRequest(url), self._match_id(url), headers={'User-Agent': 'facebookexternalhit/1.1'}).geturl()
+ HEADRequest(url), self._match_id(url), headers={'User-Agent': 'facebookexternalhit/1.1'}).url
if self.suitable(new_url): # Prevent infinite loop in case redirect fails
raise UnsupportedError(new_url)
return self.url_result(new_url)
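
Note: short vm.tiktok.com/vt.tiktok.com links are resolved with a single HEAD request under a `facebookexternalhit/1.1` User-Agent (a bot UA that typically receives a plain redirect), and the `suitable(new_url)` guard prevents an infinite loop if the redirect yields another short link. The resolution step as a standalone sketch:

    import urllib.request

    def resolve_short_link(short_url):
        req = urllib.request.Request(
            short_url, method='HEAD',
            headers={'User-Agent': 'facebookexternalhit/1.1'})
        with urllib.request.urlopen(req) as resp:  # redirects are followed
            return resp.url

    # resolve_short_link('https://vm.tiktok.com/ZTR45GpSF/')
    # -> e.g. 'https://www.tiktok.com/@akihitoko1/video/7106798200794926362'
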
+
+
+class TikTokLiveIE(TikTokBaseIE):
+ _VALID_URL = r'''(?x)https?://(?:
+ (?:www\.)?tiktok\.com/@(?P<uploader>[\w.-]+)/live|
+ m\.tiktok\.com/share/live/(?P<id>\d+)
+ )'''
+ IE_NAME = 'tiktok:live'
+
+ _TESTS = [{
+ 'url': 'https://www.tiktok.com/@weathernewslive/live',
+ 'info_dict': {
+ 'id': '7210809319192726273',
+ 'ext': 'mp4',
+ 'title': r're:ウェザーニュースLiVE[\d\s:-]*',
+ 'creator': 'ウェザーニュースLiVE',
+ 'uploader': 'weathernewslive',
+ 'uploader_id': '6621496731283095554',
+ 'uploader_url': 'https://www.tiktok.com/@weathernewslive',
+ 'live_status': 'is_live',
+ 'concurrent_view_count': int,
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }, {
+ 'url': 'https://www.tiktok.com/@pilarmagenta/live',
+ 'info_dict': {
+ 'id': '7209423610325322522',
+ 'ext': 'mp4',
+ 'title': str,
+ 'creator': 'Pilarmagenta',
+ 'uploader': 'pilarmagenta',
+ 'uploader_id': '6624846890674683909',
+ 'uploader_url': 'https://www.tiktok.com/@pilarmagenta',
+ 'live_status': 'is_live',
+ 'concurrent_view_count': int,
+ },
+ 'skip': 'Livestream',
+ }, {
+ 'url': 'https://m.tiktok.com/share/live/7209423610325322522/?language=en',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.tiktok.com/@iris04201/live',
+ 'only_matching': True,
+ }]
+
+ def _call_api(self, url, param, room_id, uploader, key=None):
+ response = traverse_obj(self._download_json(
+ url, room_id, fatal=False, query={
+ 'aid': '1988',
+ param: room_id,
+ }), (key, {dict}), default={})
+
+ # status == 2 if live else 4
+ if int_or_none(response.get('status')) == 2:
+ return response
+ # If room_id is obtained via mobile share URL and cannot be refreshed, do not wait for live
+ elif not uploader:
+ raise ExtractorError('This livestream has ended', expected=True)
+ raise UserNotLive(video_id=uploader)
+
+ def _real_extract(self, url):
+ uploader, room_id = self._match_valid_url(url).group('uploader', 'id')
+ webpage = self._download_webpage(
+ url, uploader or room_id, headers={'User-Agent': 'Mozilla/5.0'}, fatal=not room_id)
+
+ if webpage:
+ data = try_call(lambda: self._get_sigi_state(webpage, uploader or room_id))
+ room_id = (traverse_obj(data, ('UserModule', 'users', ..., 'roomId', {str_or_none}), get_all=False)
+ or self._search_regex(r'snssdk\d*://live\?room_id=(\d+)', webpage, 'room ID', default=None)
+ or room_id)
+ uploader = uploader or traverse_obj(
+ data, ('LiveRoom', 'liveRoomUserInfo', 'user', 'uniqueId'),
+ ('UserModule', 'users', ..., 'uniqueId'), get_all=False, expected_type=str)
+
+ if not room_id:
+ raise UserNotLive(video_id=uploader)
+
+ formats = []
+ live_info = self._call_api(
+ 'https://webcast.tiktok.com/webcast/room/info', 'room_id', room_id, uploader, key='data')
+
+ get_quality = qualities(('SD1', 'ld', 'SD2', 'sd', 'HD1', 'hd', 'FULL_HD1', 'uhd', 'ORIGION', 'origin'))
+ parse_inner = lambda x: self._parse_json(x, None)
+
+ for quality, stream in traverse_obj(live_info, (
+ 'stream_url', 'live_core_sdk_data', 'pull_data', 'stream_data',
+ {parse_inner}, 'data', {dict}), default={}).items():
+
+ sdk_params = traverse_obj(stream, ('main', 'sdk_params', {parse_inner}, {
+ 'vcodec': ('VCodec', {str}),
+ 'tbr': ('vbitrate', {lambda x: int_or_none(x, 1000)}),
+ 'resolution': ('resolution', {lambda x: re.match(r'(?i)\d+x\d+|\d+p', x).group().lower()}),
+ }))
+
+ flv_url = traverse_obj(stream, ('main', 'flv', {url_or_none}))
+ if flv_url:
+ formats.append({
+ 'url': flv_url,
+ 'ext': 'flv',
+ 'format_id': f'flv-{quality}',
+ 'quality': get_quality(quality),
+ **sdk_params,
+ })
+
+ hls_url = traverse_obj(stream, ('main', 'hls', {url_or_none}))
+ if hls_url:
+ formats.append({
+ 'url': hls_url,
+ 'ext': 'mp4',
+ 'protocol': 'm3u8_native',
+ 'format_id': f'hls-{quality}',
+ 'quality': get_quality(quality),
+ **sdk_params,
+ })
+
+ def get_vcodec(*keys):
+ return traverse_obj(live_info, (
+ 'stream_url', *keys, {parse_inner}, 'VCodec', {str}))
+
+ for stream in ('hls', 'rtmp'):
+ stream_url = traverse_obj(live_info, ('stream_url', f'{stream}_pull_url', {url_or_none}))
+ if stream_url:
+ formats.append({
+ 'url': stream_url,
+ 'ext': 'mp4' if stream == 'hls' else 'flv',
+ 'protocol': 'm3u8_native' if stream == 'hls' else 'https',
+ 'format_id': f'{stream}-pull',
+ 'vcodec': get_vcodec(f'{stream}_pull_url_params'),
+ 'quality': get_quality('ORIGION'),
+ })
+
+ for f_id, f_url in traverse_obj(live_info, ('stream_url', 'flv_pull_url', {dict}), default={}).items():
+ if not url_or_none(f_url):
+ continue
+ formats.append({
+ 'url': f_url,
+ 'ext': 'flv',
+ 'format_id': f'flv-{f_id}'.lower(),
+ 'vcodec': get_vcodec('flv_pull_url_params', f_id),
+ 'quality': get_quality(f_id),
+ })
+
+ # If uploader is a guest on another's livestream, primary endpoint will not have m3u8 URLs
+ if not traverse_obj(formats, lambda _, v: v['ext'] == 'mp4'):
+ live_info = merge_dicts(live_info, self._call_api(
+ 'https://www.tiktok.com/api/live/detail/', 'roomID', room_id, uploader, key='LiveRoomInfo'))
+ if url_or_none(live_info.get('liveUrl')):
+ formats.append({
+ 'url': live_info['liveUrl'],
+ 'ext': 'mp4',
+ 'protocol': 'm3u8_native',
+ 'format_id': 'hls-fallback',
+ 'vcodec': 'h264',
+ 'quality': get_quality('origin'),
+ })
+
+ uploader = uploader or traverse_obj(live_info, ('ownerInfo', 'uniqueId'), ('owner', 'display_id'))
+
+ return {
+ 'id': room_id,
+ 'uploader': uploader,
+ 'uploader_url': format_field(uploader, None, self._UPLOADER_URL_FORMAT) or None,
+ 'is_live': True,
+ 'formats': formats,
+ '_format_sort_fields': ('quality', 'ext'),
+ **traverse_obj(live_info, {
+ 'title': 'title',
+ 'uploader_id': (('ownerInfo', 'owner'), 'id', {str_or_none}),
+ 'creator': (('ownerInfo', 'owner'), 'nickname'),
+ 'concurrent_view_count': (('user_count', ('liveRoomStats', 'userCount')), {int_or_none}),
+ }, get_all=False),
+ }
diff --git a/hypervideo_dl/extractor/tnaflix.py b/hypervideo_dl/extractor/tnaflix.py
index 4482c84..b2baf2e 100644
--- a/hypervideo_dl/extractor/tnaflix.py
+++ b/hypervideo_dl/extractor/tnaflix.py
@@ -81,26 +81,27 @@ class TNAFlixNetworkBaseIE(InfoExtractor):
display_id = video_id
webpage = self._download_webpage(url, display_id)
+ inputs = self._hidden_inputs(webpage)
+ query = {}
# check for MovieFap-style config
cfg_url = self._proto_relative_url(self._html_search_regex(
self._CONFIG_REGEX, webpage, 'flashvars.config', default=None,
group='url'), 'http:')
- query = {}
- # check for TNAFlix-style config
if not cfg_url:
- inputs = self._hidden_inputs(webpage)
- if inputs.get('vkey') and inputs.get('nkey'):
- cfg_url = f'https://www.{host}.com/cdn/cdn.php'
- query.update({
- 'file': inputs['vkey'],
- 'key': inputs['nkey'],
- 'VID': video_id,
- 'premium': '1',
- 'vip': '1',
- 'alpha': '',
- })
+ cfg_url = inputs.get('config')
+
+ # check for TNAFlix-style config
+ if not cfg_url and inputs.get('vkey') and inputs.get('nkey'):
+ cfg_url = f'http://cdn-fck.{host}.com/{host}/{inputs["vkey"]}.fid'
+ query.update({
+ 'key': inputs['nkey'],
+ 'VID': video_id,
+ 'premium': '1',
+ 'vip': '1',
+ 'alpha': '',
+ })
formats, json_ld = [], {}
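
The vkey/nkey pair read above comes from _hidden_inputs(webpage). A rough stdlib
approximation of what that helper yields, with a simplified regex that assumes name precedes
value inside each tag (the real helper is more tolerant of attribute order):

import re

def hidden_inputs(html):
    return dict(re.findall(
        r'<input[^>]*\bname=["\']([^"\']+)["\'][^>]*\bvalue=["\']([^"\']*)["\']', html))

page = ('<input type="hidden" name="vkey" value="0abc123">'
        '<input type="hidden" name="nkey" value="deadbeef">')
inputs = hidden_inputs(page)
# plugged into the URL pattern above, with host = 'tnaflix':
cfg_url = f'http://cdn-fck.tnaflix.com/tnaflix/{inputs["vkey"]}.fid'
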
diff --git a/hypervideo_dl/extractor/toutv.py b/hypervideo_dl/extractor/toutv.py
index f60c199..ced1224 100644
--- a/hypervideo_dl/extractor/toutv.py
+++ b/hypervideo_dl/extractor/toutv.py
@@ -1,7 +1,7 @@
import json
from .radiocanada import RadioCanadaIE
-from ..compat import compat_HTTPError
+from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
int_or_none,
@@ -52,8 +52,8 @@ class TouTvIE(RadioCanadaIE): # XXX: Do not subclass from concrete IE
'Content-Type': 'application/json;charset=utf-8',
})['access_token']
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
- error = self._parse_json(e.cause.read().decode(), None)['Message']
+ if isinstance(e.cause, HTTPError) and e.cause.status == 401:
+ error = self._parse_json(e.cause.response.read().decode(), None)['Message']
raise ExtractorError(error, expected=True)
raise
self._claims = self._call_api('validation/v2/getClaims')['claims']
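
The same compat_HTTPError migration recurs in trueid.py, tv2.py and tvplay.py below; the
attribute mapping is .code -> .status, and the response body now hangs off .response. A
self-contained mock of the new access pattern (class shapes are assumptions for illustration,
not the real networking classes):

import json

class FakeResponse:
    def read(self):
        return b'{"Message": "invalid token"}'

class FakeHTTPError(Exception):  # stands in for networking.exceptions.HTTPError
    status = 401                 # previously e.cause.code
    response = FakeResponse()    # previously read directly via e.cause.read()

err = FakeHTTPError()
if err.status == 401:
    print(json.loads(err.response.read().decode())['Message'])
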
diff --git a/hypervideo_dl/extractor/triller.py b/hypervideo_dl/extractor/triller.py
index acd9e68..56e51fe 100644
--- a/hypervideo_dl/extractor/triller.py
+++ b/hypervideo_dl/extractor/triller.py
@@ -1,15 +1,21 @@
import itertools
import json
+import re
from .common import InfoExtractor
+from ..networking import HEADRequest
from ..utils import (
ExtractorError,
+ UnsupportedError,
+ determine_ext,
int_or_none,
+ parse_resolution,
str_or_none,
traverse_obj,
- unified_strdate,
unified_timestamp,
url_basename,
+ urljoin,
+ url_or_none,
)
@@ -22,25 +28,22 @@ class TrillerBaseIE(InfoExtractor):
if self._API_HEADERS.get('Authorization'):
return
- user_check = self._download_json(
+ headers = {**self._API_HEADERS, 'Content-Type': 'application/json'}
+ user_check = traverse_obj(self._download_json(
f'{self._API_BASE_URL}/api/user/is-valid-username', None, note='Checking username',
- fatal=False, expected_status=400, headers={
- 'Content-Type': 'application/json',
- 'Origin': 'https://triller.co',
- }, data=json.dumps({'username': username}, separators=(',', ':')).encode('utf-8'))
- if user_check.get('status'): # endpoint returns "status":false if username exists
+ fatal=False, expected_status=400, headers=headers,
+ data=json.dumps({'username': username}, separators=(',', ':')).encode()), 'status')
+
+ if user_check: # endpoint returns `"status":false` if username exists
raise ExtractorError('Unable to login: Invalid username', expected=True)
- credentials = {
- 'username': username,
- 'password': password,
- }
login = self._download_json(
- f'{self._API_BASE_URL}/user/auth', None, note='Logging in',
- fatal=False, expected_status=400, headers={
- 'Content-Type': 'application/json',
- 'Origin': 'https://triller.co',
- }, data=json.dumps(credentials, separators=(',', ':')).encode('utf-8'))
+ f'{self._API_BASE_URL}/user/auth', None, note='Logging in', fatal=False,
+ expected_status=400, headers=headers, data=json.dumps({
+ 'username': username,
+ 'password': password,
+ }, separators=(',', ':')).encode()) or {}
+
if not login.get('auth_token'):
if login.get('error') == 1008:
raise ExtractorError('Unable to login: Incorrect password', expected=True)
@@ -55,100 +58,93 @@ class TrillerBaseIE(InfoExtractor):
headers=self._API_HEADERS, query={'limit': limit}) or {}
if not comment_info.get('comments'):
return
- for comment_dict in comment_info['comments']:
- yield {
- 'author': traverse_obj(comment_dict, ('author', 'username')),
- 'author_id': traverse_obj(comment_dict, ('author', 'user_id')),
- 'id': comment_dict.get('id'),
- 'text': comment_dict.get('body'),
- 'timestamp': unified_timestamp(comment_dict.get('timestamp')),
- }
+ yield from traverse_obj(comment_info, ('comments', ..., {
+ 'id': ('id', {str_or_none}),
+ 'text': 'body',
+ 'author': ('author', 'username'),
+ 'author_id': ('author', 'user_id'),
+ 'timestamp': ('timestamp', {unified_timestamp}),
+ }))
- def _check_user_info(self, user_info):
- if not user_info:
- self.report_warning('Unable to extract user info')
- elif user_info.get('private') and not user_info.get('followed_by_me'):
- raise ExtractorError('This video is private', expected=True)
- elif traverse_obj(user_info, 'blocked_by_user', 'blocking_user'):
- raise ExtractorError('The author of the video is blocked', expected=True)
- return user_info
+ def _parse_video_info(self, video_info, username, user_id, display_id=None):
+ video_id = str(video_info['id'])
+ display_id = display_id or video_info.get('video_uuid')
+
+ if traverse_obj(video_info, (
+ None, ('transcoded_url', 'video_url', 'stream_url', 'audio_url'),
+ {lambda x: re.search(r'/copyright/', x)}), get_all=False):
+ self.raise_no_formats('This video has been removed due to licensing restrictions', expected=True)
- def _parse_video_info(self, video_info, username, user_info=None):
- video_uuid = video_info.get('video_uuid')
- video_id = video_info.get('id')
+ def format_info(url):
+ return {
+ 'url': url,
+ 'ext': determine_ext(url),
+ 'format_id': url_basename(url).split('.')[0],
+ }
formats = []
- video_url = traverse_obj(video_info, 'video_url', 'stream_url')
- if video_url:
- formats.append({
- 'url': video_url,
- 'ext': 'mp4',
- 'vcodec': 'h264',
- 'width': video_info.get('width'),
- 'height': video_info.get('height'),
- 'format_id': url_basename(video_url).split('.')[0],
- 'filesize': video_info.get('filesize'),
- })
- video_set = video_info.get('video_set') or []
- for video in video_set:
- resolution = video.get('resolution') or ''
+
+ if determine_ext(video_info.get('transcoded_url')) == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ video_info['transcoded_url'], video_id, 'mp4', m3u8_id='hls', fatal=False))
+
+ for video in traverse_obj(video_info, ('video_set', lambda _, v: url_or_none(v['url']))):
formats.append({
- 'url': video['url'],
- 'ext': 'mp4',
+ **format_info(video['url']),
+ **parse_resolution(video.get('resolution')),
'vcodec': video.get('codec'),
'vbr': int_or_none(video.get('bitrate'), 1000),
- 'width': int_or_none(resolution.split('x')[0]),
- 'height': int_or_none(resolution.split('x')[1]),
- 'format_id': url_basename(video['url']).split('.')[0],
})
- audio_url = video_info.get('audio_url')
- if audio_url:
+
+ video_url = traverse_obj(video_info, 'video_url', 'stream_url', expected_type=url_or_none)
+ if video_url:
formats.append({
- 'url': audio_url,
- 'ext': 'm4a',
- 'format_id': url_basename(audio_url).split('.')[0],
+ **format_info(video_url),
+ 'vcodec': 'h264',
+ **traverse_obj(video_info, {
+ 'width': 'width',
+ 'height': 'height',
+ 'filesize': 'filesize',
+ }, expected_type=int_or_none),
})
- manifest_url = video_info.get('transcoded_url')
- if manifest_url:
- formats.extend(self._extract_m3u8_formats(
- manifest_url, video_id, 'mp4', entry_protocol='m3u8_native',
- m3u8_id='hls', fatal=False))
-
- comment_count = int_or_none(video_info.get('comment_count'))
+ audio_url = url_or_none(video_info.get('audio_url'))
+ if audio_url:
+ formats.append(format_info(audio_url))
- user_info = user_info or traverse_obj(video_info, 'user', default={})
+ comment_count = traverse_obj(video_info, ('comment_count', {int_or_none}))
return {
- 'id': str_or_none(video_id) or video_uuid,
- 'title': video_info.get('description') or f'Video by {username}',
- 'thumbnail': video_info.get('thumbnail_url'),
- 'description': video_info.get('description'),
- 'uploader': str_or_none(username),
- 'uploader_id': str_or_none(user_info.get('user_id')),
- 'creator': str_or_none(user_info.get('name')),
- 'timestamp': unified_timestamp(video_info.get('timestamp')),
- 'upload_date': unified_strdate(video_info.get('timestamp')),
- 'duration': int_or_none(video_info.get('duration')),
- 'view_count': int_or_none(video_info.get('play_count')),
- 'like_count': int_or_none(video_info.get('likes_count')),
- 'artist': str_or_none(video_info.get('song_artist')),
- 'track': str_or_none(video_info.get('song_title')),
- 'webpage_url': f'https://triller.co/@{username}/video/{video_uuid}',
+ 'id': video_id,
+ 'display_id': display_id,
+ 'uploader': username,
+ 'uploader_id': user_id or traverse_obj(video_info, ('user', 'user_id', {str_or_none})),
+ 'webpage_url': urljoin(f'https://triller.co/@{username}/video/', display_id),
'uploader_url': f'https://triller.co/@{username}',
'extractor_key': TrillerIE.ie_key(),
'extractor': TrillerIE.IE_NAME,
'formats': formats,
'comment_count': comment_count,
'__post_extractor': self.extract_comments(video_id, comment_count),
+ **traverse_obj(video_info, {
+ 'title': ('description', {lambda x: x.replace('\r\n', ' ')}),
+ 'description': 'description',
+ 'creator': ((('user'), ('users', lambda _, v: str(v['user_id']) == user_id)), 'name'),
+ 'thumbnail': ('thumbnail_url', {url_or_none}),
+ 'timestamp': ('timestamp', {unified_timestamp}),
+ 'duration': ('duration', {int_or_none}),
+ 'view_count': ('play_count', {int_or_none}),
+ 'like_count': ('likes_count', {int_or_none}),
+ 'artist': 'song_artist',
+ 'track': 'song_title',
+ }, get_all=False),
}
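
The rewritten metadata mapping above packs several traverse_obj features into one
expression; a short usage sketch (helper assumed available from the vendored
hypervideo_dl.utils, matching the imports used throughout these extractors):

from hypervideo_dl.utils import traverse_obj

video_info = {
    'play_count': '1203',
    'song_artist': 'Example Artist',
    'users': [{'user_id': 42, 'name': 'Example'}],
}
# alternative paths: the first one that resolves wins
artist = traverse_obj(video_info, 'song_artist', ('music', 'artist'))
# dict template with a {callable} leaf: builds a sub-dict, applying transforms
counts = traverse_obj(video_info, {'view_count': ('play_count', {int})})
# a callable path element filters list items by (key, value)
creator = traverse_obj(
    video_info, ('users', lambda _, v: str(v['user_id']) == '42', 'name'),
    get_all=False)
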
class TrillerIE(TrillerBaseIE):
_VALID_URL = r'''(?x)
https?://(?:www\.)?triller\.co/
- @(?P<username>[\w\._]+)/video/
- (?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})
+ @(?P<username>[\w.]+)/video/(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})
'''
_TESTS = [{
'url': 'https://triller.co/@theestallion/video/2358fcd7-3df2-4c77-84c8-1d091610a6cf',
@@ -165,16 +161,14 @@ class TrillerIE(TrillerBaseIE):
'timestamp': 1660598222,
'upload_date': '20220815',
'duration': 47,
- 'height': 3840,
- 'width': 2160,
'view_count': int,
'like_count': int,
'artist': 'Megan Thee Stallion',
'track': 'Her',
- 'webpage_url': 'https://triller.co/@theestallion/video/2358fcd7-3df2-4c77-84c8-1d091610a6cf',
'uploader_url': 'https://triller.co/@theestallion',
'comment_count': int,
- }
+ },
+ 'skip': 'This video has been removed due to licensing restrictions',
}, {
'url': 'https://triller.co/@charlidamelio/video/46c6fcfa-aa9e-4503-a50c-68444f44cddc',
'md5': '874055f462af5b0699b9dbb527a505a0',
@@ -182,6 +176,7 @@ class TrillerIE(TrillerBaseIE):
'id': '71621339',
'ext': 'mp4',
'title': 'md5:4c91ea82760fe0fffb71b8c3aa7295fc',
+ 'display_id': '46c6fcfa-aa9e-4503-a50c-68444f44cddc',
'thumbnail': r're:^https://uploads\.cdn\.triller\.co/.+\.jpg$',
'description': 'md5:4c91ea82760fe0fffb71b8c3aa7295fc',
'uploader': 'charlidamelio',
@@ -190,59 +185,73 @@ class TrillerIE(TrillerBaseIE):
'timestamp': 1660773354,
'upload_date': '20220817',
'duration': 16,
- 'height': 1920,
- 'width': 1080,
'view_count': int,
'like_count': int,
'artist': 'Dixie',
'track': 'Someone to Blame',
- 'webpage_url': 'https://triller.co/@charlidamelio/video/46c6fcfa-aa9e-4503-a50c-68444f44cddc',
'uploader_url': 'https://triller.co/@charlidamelio',
'comment_count': int,
- }
+ },
+ }, {
+ 'url': 'https://triller.co/@theestallion/video/07f35f38-1f51-48e2-8c5f-f7a8e829988f',
+ 'md5': 'af7b3553e4b8bfca507636471ee2eb41',
+ 'info_dict': {
+ 'id': '71837829',
+ 'ext': 'mp4',
+ 'title': 'UNGRATEFUL VIDEO OUT NOW 👏🏾👏🏾👏🏾 💙💙 link my bio #womeninhiphop',
+ 'display_id': '07f35f38-1f51-48e2-8c5f-f7a8e829988f',
+ 'thumbnail': r're:^https://uploads\.cdn\.triller\.co/.+\.jpg$',
+ 'description': 'UNGRATEFUL VIDEO OUT NOW 👏🏾👏🏾👏🏾 💙💙 link my bio\r\n #womeninhiphop',
+ 'uploader': 'theestallion',
+ 'uploader_id': '18992236',
+ 'creator': 'Megan Thee Stallion',
+ 'timestamp': 1662486178,
+ 'upload_date': '20220906',
+ 'duration': 30,
+ 'view_count': int,
+ 'like_count': int,
+ 'artist': 'Unknown',
+ 'track': 'Unknown',
+ 'uploader_url': 'https://triller.co/@theestallion',
+ 'comment_count': int,
+ },
}]
def _real_extract(self, url):
- username, video_uuid = self._match_valid_url(url).group('username', 'id')
+ username, display_id = self._match_valid_url(url).group('username', 'id')
- video_info = traverse_obj(self._download_json(
- f'{self._API_BASE_URL}/api/videos/{video_uuid}',
- video_uuid, note='Downloading video info API JSON',
- errnote='Unable to download video info API JSON',
- headers=self._API_HEADERS), ('videos', 0))
- if not video_info:
- raise ExtractorError('No video info found in API response')
+ video_info = self._download_json(
+ f'{self._API_BASE_URL}/api/videos/{display_id}', display_id,
+ headers=self._API_HEADERS)['videos'][0]
- user_info = self._check_user_info(video_info.get('user') or {})
- return self._parse_video_info(video_info, username, user_info)
+ return self._parse_video_info(video_info, username, None, display_id)
class TrillerUserIE(TrillerBaseIE):
- _VALID_URL = r'https?://(?:www\.)?triller\.co/@(?P<id>[\w\._]+)/?(?:$|[#?])'
+ _VALID_URL = r'https?://(?:www\.)?triller\.co/@(?P<id>[\w.]+)/?(?:$|[#?])'
_TESTS = [{
- # first videos request only returns 2 videos
'url': 'https://triller.co/@theestallion',
- 'playlist_mincount': 9,
+ 'playlist_mincount': 12,
'info_dict': {
'id': '18992236',
'title': 'theestallion',
'thumbnail': r're:^https://uploads\.cdn\.triller\.co/.+\.jpg$',
- }
+ },
}, {
'url': 'https://triller.co/@charlidamelio',
- 'playlist_mincount': 25,
+ 'playlist_mincount': 150,
'info_dict': {
'id': '1875551',
'title': 'charlidamelio',
'thumbnail': r're:^https://uploads\.cdn\.triller\.co/.+\.jpg$',
- }
+ },
}]
def _real_initialize(self):
if not self._API_HEADERS.get('Authorization'):
guest = self._download_json(
- f'{self._API_BASE_URL}/user/create_guest',
- None, note='Creating guest session', data=b'', headers=self._API_HEADERS, query={
+ f'{self._API_BASE_URL}/user/create_guest', None,
+ note='Creating guest session', data=b'', headers=self._API_HEADERS, query={
'platform': 'Web',
'app_version': '',
})
@@ -251,44 +260,70 @@ class TrillerUserIE(TrillerBaseIE):
self._API_HEADERS['Authorization'] = f'Bearer {guest["auth_token"]}'
- def _extract_video_list(self, username, user_id, limit=6):
- query = {
- 'limit': limit,
- }
+ def _entries(self, username, user_id, limit=6):
+ query = {'limit': limit}
for page in itertools.count(1):
- for retry in self.RetryManager():
- try:
- video_list = self._download_json(
- f'{self._API_BASE_URL}/api/users/{user_id}/videos',
- username, note=f'Downloading user video list page {page}',
- errnote='Unable to download user video list', headers=self._API_HEADERS,
- query=query)
- except ExtractorError as e:
- if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0:
- retry.error = e
- continue
- raise
- if not video_list.get('videos'):
- break
- yield from video_list['videos']
- query['before_time'] = traverse_obj(video_list, ('videos', -1, 'timestamp'))
+ videos = self._download_json(
+ f'{self._API_BASE_URL}/api/users/{user_id}/videos',
+ username, note=f'Downloading user video list page {page}',
+ headers=self._API_HEADERS, query=query)
+
+ for video in traverse_obj(videos, ('videos', ...)):
+ yield self._parse_video_info(video, username, user_id)
+
+ query['before_time'] = traverse_obj(videos, ('videos', -1, 'timestamp'))
if not query['before_time']:
break
- def _entries(self, videos, username, user_info):
- for video in videos:
- yield self._parse_video_info(video, username, user_info)
-
def _real_extract(self, url):
username = self._match_id(url)
- user_info = self._check_user_info(self._download_json(
+
+ user_info = traverse_obj(self._download_json(
f'{self._API_BASE_URL}/api/users/by_username/{username}',
- username, note='Downloading user info',
- errnote='Failed to download user info', headers=self._API_HEADERS).get('user', {}))
+ username, note='Downloading user info', headers=self._API_HEADERS), ('user', {dict})) or {}
+
+ if user_info.get('private') and user_info.get('followed_by_me') not in (True, 'true'):
+ raise ExtractorError('This user profile is private', expected=True)
+ elif traverse_obj(user_info, (('blocked_by_user', 'blocking_user'), {bool}), get_all=False):
+ raise ExtractorError('The author of the video is blocked', expected=True)
user_id = str_or_none(user_info.get('user_id'))
- videos = self._extract_video_list(username, user_id)
- thumbnail = user_info.get('avatar_url')
+ if not user_id:
+ raise ExtractorError('Unable to extract user ID')
return self.playlist_result(
- self._entries(videos, username, user_info), user_id, username, thumbnail=thumbnail)
+ self._entries(username, user_id), user_id, username, thumbnail=user_info.get('avatar_url'))
+
+
+class TrillerShortIE(InfoExtractor):
+ _VALID_URL = r'https?://v\.triller\.co/(?P<id>\w+)'
+ _TESTS = [{
+ 'url': 'https://v.triller.co/WWZNWk',
+ 'md5': '5eb8dc2c971bd8cd794ec9e8d5e9d101',
+ 'info_dict': {
+ 'id': '66210052',
+ 'ext': 'mp4',
+ 'title': 'md5:2dfc89d154cd91a4a18cd9582ba03e16',
+ 'display_id': 'f4480e1f-fb4e-45b9-a44c-9e6c679ce7eb',
+ 'thumbnail': r're:^https://uploads\.cdn\.triller\.co/.+\.jpg$',
+ 'description': 'md5:2dfc89d154cd91a4a18cd9582ba03e16',
+ 'uploader': 'statefairent',
+ 'uploader_id': '487545193',
+ 'creator': 'Official Summer Fair of LA',
+ 'timestamp': 1629655457,
+ 'upload_date': '20210822',
+ 'duration': 19,
+ 'view_count': int,
+ 'like_count': int,
+ 'artist': 'Unknown',
+ 'track': 'Unknown',
+ 'uploader_url': 'https://triller.co/@statefairent',
+ 'comment_count': int,
+ },
+ }]
+
+ def _real_extract(self, url):
+ real_url = self._request_webpage(HEADRequest(url), self._match_id(url)).url
+ if self.suitable(real_url): # Prevent infinite loop in case redirect fails
+ raise UnsupportedError(real_url)
+ return self.url_result(real_url)
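
TrillerShortIE resolves short links with the same idiom as TikTokVMIE earlier in this
commit: follow the redirect with a HEAD request, read the final URL off the handle, and
refuse to recurse if it still matches this extractor. The core of it, as a stdlib-only
sketch (the extractors route this through _request_webpage instead):

import urllib.request

def resolve_redirect(short_url):
    req = urllib.request.Request(short_url, method='HEAD')
    with urllib.request.urlopen(req) as resp:
        return resp.url  # final URL after redirects

# real_url = resolve_redirect('https://v.triller.co/WWZNWk')
# if real_url still matches v.triller.co, the redirect failed: UnsupportedError
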
diff --git a/hypervideo_dl/extractor/trtcocuk.py b/hypervideo_dl/extractor/trtcocuk.py
new file mode 100644
index 0000000..f27f5a1
--- /dev/null
+++ b/hypervideo_dl/extractor/trtcocuk.py
@@ -0,0 +1,48 @@
+from .common import InfoExtractor
+from ..utils import ExtractorError, int_or_none, parse_iso8601, traverse_obj
+
+
+class TrtCocukVideoIE(InfoExtractor):
+ _VALID_URL = r'https?://www\.trtcocuk\.net\.tr/video/(?P<id>[\w-]+)'
+ _TESTS = [{
+ 'url': 'https://www.trtcocuk.net.tr/video/kaptan-pengu-ve-arkadaslari-1',
+ 'info_dict': {
+ 'id': '3789738',
+ 'ext': 'mp4',
+ 'season_number': 1,
+ 'series': '"Kaptan Pengu ve Arkadaşları"',
+ 'season': 'Season 1',
+ 'title': 'Kaptan Pengu ve Arkadaşları 1 Bölüm İzle TRT Çocuk',
+ 'release_date': '20201209',
+ 'release_timestamp': 1607513774,
+ }
+ }, {
+ 'url': 'https://www.trtcocuk.net.tr/video/sef-rokanin-lezzet-dunyasi-17',
+ 'info_dict': {
+ 'id': '10260842',
+ 'ext': 'mp4',
+ 'series': '"Şef Roka\'nın Lezzet Dünyası"',
+ 'title': 'Şef Roka\'nın Lezzet Dünyası 17 Bölüm İzle TRT Çocuk',
+ }
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ nuxtjs_data = self._search_nuxt_data(webpage, display_id)['data']
+
+ try:
+ video_url = self._parse_json(nuxtjs_data['video'], display_id)
+ except ExtractorError:
+ video_url = nuxtjs_data['video']
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_url, display_id)
+
+ return {
+ 'id': str(nuxtjs_data['id']),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'season_number': int_or_none(nuxtjs_data.get('season')),
+ 'release_timestamp': parse_iso8601(nuxtjs_data.get('publishedDate')),
+ 'series': traverse_obj(nuxtjs_data, ('show', 0, 'title')),
+ 'title': self._html_extract_title(webpage) # TODO: get better title
+ }
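
The try/except around nuxtjs_data['video'] above exists because the Nuxt payload sometimes
carries the manifest URL JSON-encoded inside a string. The unwrapping logic in isolation,
with assumed sample values:

import json

def unwrap_video(value):
    try:
        return json.loads(value)  # double-encoded case: '"https://..."'
    except (TypeError, json.JSONDecodeError):
        return value  # already a plain string

assert unwrap_video('"https://example.invalid/master.m3u8"') == 'https://example.invalid/master.m3u8'
assert unwrap_video('https://example.invalid/master.m3u8') == 'https://example.invalid/master.m3u8'
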
diff --git a/hypervideo_dl/extractor/trueid.py b/hypervideo_dl/extractor/trueid.py
index 6963436..86f0990 100644
--- a/hypervideo_dl/extractor/trueid.py
+++ b/hypervideo_dl/extractor/trueid.py
@@ -1,5 +1,5 @@
from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..networking.exceptions import HTTPError
from ..utils import (
determine_ext,
ExtractorError,
@@ -88,9 +88,9 @@ class TrueIDIE(InfoExtractor):
stream_data = self._download_json(
f'https://{domain}/cmsPostProxy/contents/video/{video_id}/streamer?os=android', video_id, data=b'')['data']
except ExtractorError as e:
- if not isinstance(e.cause, compat_HTTPError):
+ if not isinstance(e.cause, HTTPError):
raise e
- errmsg = self._parse_json(e.cause.read().decode(), video_id)['meta']['message']
+ errmsg = self._parse_json(e.cause.response.read().decode(), video_id)['meta']['message']
if 'country' in errmsg:
self.raise_geo_restricted(
errmsg, [initial_data['display_country']] if initial_data.get('display_country') else None, True)
diff --git a/hypervideo_dl/extractor/tubetugraz.py b/hypervideo_dl/extractor/tubetugraz.py
index ebabedc..a351e4e 100644
--- a/hypervideo_dl/extractor/tubetugraz.py
+++ b/hypervideo_dl/extractor/tubetugraz.py
@@ -21,17 +21,36 @@ class TubeTuGrazBaseIE(InfoExtractor):
if not urlh:
return
- urlh = self._request_webpage(
- urlh.geturl(), None, fatal=False, headers={'referer': urlh.geturl()},
- note='logging in', errnote='unable to log in', data=urlencode_postdata({
+ content, urlh = self._download_webpage_handle(
+ urlh.url, None, fatal=False, headers={'referer': urlh.url},
+ note='logging in', errnote='unable to log in',
+ data=urlencode_postdata({
'lang': 'de',
'_eventId_proceed': '',
'j_username': username,
'j_password': password
}))
+ if not urlh or urlh.url == 'https://tube.tugraz.at/paella/ui/index.html':
+ return
- if urlh and urlh.geturl() != 'https://tube.tugraz.at/paella/ui/index.html':
+ if not self._html_search_regex(
+ r'<p\b[^>]*>(Bitte geben Sie einen OTP-Wert ein:)</p>',
+ content, 'TFA prompt', default=None):
self.report_warning('unable to login: incorrect password')
+ return
+
+ content, urlh = self._download_webpage_handle(
+ urlh.url, None, fatal=False, headers={'referer': urlh.url},
+ note='logging in with TFA', errnote='unable to log in with TFA',
+ data=urlencode_postdata({
+ 'lang': 'de',
+ '_eventId_proceed': '',
+ 'j_tokenNumber': self._get_tfa_info(),
+ }))
+ if not urlh or urlh.url == 'https://tube.tugraz.at/paella/ui/index.html':
+ return
+
+ self.report_warning('unable to login: incorrect TFA code')
def _extract_episode(self, episode_info):
id = episode_info.get('id')
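
Both login steps in the TU Graz flow above serialize their form fields with
urlencode_postdata; its observable behavior is essentially this (sketch, not the vendored
source):

import urllib.parse

def urlencode_postdata(data):
    return urllib.parse.urlencode(data).encode('ascii')

body = urlencode_postdata({'lang': 'de', '_eventId_proceed': '', 'j_tokenNumber': '123456'})
assert body == b'lang=de&_eventId_proceed=&j_tokenNumber=123456'
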
diff --git a/hypervideo_dl/extractor/tubitv.py b/hypervideo_dl/extractor/tubitv.py
index de8b5da..bd46bc3 100644
--- a/hypervideo_dl/extractor/tubitv.py
+++ b/hypervideo_dl/extractor/tubitv.py
@@ -1,13 +1,13 @@
import re
from .common import InfoExtractor
+from ..networking import Request
from ..utils import (
ExtractorError,
int_or_none,
js_to_json,
- sanitized_Request,
- urlencode_postdata,
traverse_obj,
+ urlencode_postdata,
)
@@ -72,8 +72,8 @@ class TubiTvIE(InfoExtractor):
'password': password,
}
payload = urlencode_postdata(form_data)
- request = sanitized_Request(self._LOGIN_URL, payload)
- request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+ request = Request(self._LOGIN_URL, payload)
+ request.headers['Content-Type'] = 'application/x-www-form-urlencoded'
login_page = self._download_webpage(
request, None, False, 'Wrong login info')
if not re.search(r'id="tubi-logout"', login_page):
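
The sanitized_Request -> Request swap above also changes how headers attach: the new
networking.Request exposes a mutable headers mapping rather than an add_header() method.
The closest stdlib analogue, for comparison (the login URL is a placeholder):

import urllib.request

req = urllib.request.Request('https://example.invalid/login', data=b'user=u&pass=p')
req.add_header('Content-Type', 'application/x-www-form-urlencoded')  # old style
# new style from the diff: request.headers['Content-Type'] = '...'
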
diff --git a/hypervideo_dl/extractor/tumblr.py b/hypervideo_dl/extractor/tumblr.py
index 88d4ae3..a26bdca 100644
--- a/hypervideo_dl/extractor/tumblr.py
+++ b/hypervideo_dl/extractor/tumblr.py
@@ -274,7 +274,7 @@ class TumblrIE(InfoExtractor):
url = f'http://{blog}.tumblr.com/post/{video_id}/'
webpage, urlh = self._download_webpage_handle(url, video_id)
- redirect_url = urlh.geturl()
+ redirect_url = urlh.url
api_only = bool(self._search_regex(
r'(tumblr.com|^)/(safe-mode|login_required|blog/view)',
diff --git a/hypervideo_dl/extractor/tunein.py b/hypervideo_dl/extractor/tunein.py
index 43b4f67..fd2fe13 100644
--- a/hypervideo_dl/extractor/tunein.py
+++ b/hypervideo_dl/extractor/tunein.py
@@ -1,149 +1,201 @@
-import re
+import urllib.parse
from .common import InfoExtractor
-from ..utils import ExtractorError
-from ..compat import compat_urlparse
+from ..utils import (
+ OnDemandPagedList,
+ determine_ext,
+ parse_iso8601,
+ traverse_obj,
+)
class TuneInBaseIE(InfoExtractor):
- _API_BASE_URL = 'http://tunein.com/tuner/tune/'
+ _VALID_URL_BASE = r'https?://(?:www\.)?tunein\.com'
- def _real_extract(self, url):
- content_id = self._match_id(url)
-
- content_info = self._download_json(
- self._API_BASE_URL + self._API_URL_QUERY % content_id,
- content_id, note='Downloading JSON metadata')
-
- title = content_info['Title']
- thumbnail = content_info.get('Logo')
- location = content_info.get('Location')
- streams_url = content_info.get('StreamUrl')
- if not streams_url:
- raise ExtractorError('No downloadable streams found', expected=True)
- if not streams_url.startswith('http://'):
- streams_url = compat_urlparse.urljoin(url, streams_url)
+ def _extract_metadata(self, webpage, content_id):
+ return self._search_json(r'window\.INITIAL_STATE\s*=', webpage, 'hydration', content_id, fatal=False)
+ def _extract_formats_and_subtitles(self, content_id):
streams = self._download_json(
- streams_url, content_id, note='Downloading stream data',
- transform_source=lambda s: re.sub(r'^\s*\((.*)\);\s*$', r'\1', s))['Streams']
+ f'https://opml.radiotime.com/Tune.ashx?render=json&formats=mp3,aac,ogg,flash,hls&id={content_id}',
+ content_id)['body']
- is_live = None
- formats = []
+ formats, subtitles = [], {}
for stream in streams:
- if stream.get('Type') == 'Live':
- is_live = True
- reliability = stream.get('Reliability')
- format_note = (
- 'Reliability: %d%%' % reliability
- if reliability is not None else None)
- formats.append({
- 'preference': (
- 0 if reliability is None or reliability > 90
- else 1),
- 'abr': stream.get('Bandwidth'),
- 'ext': stream.get('MediaType').lower(),
- 'acodec': stream.get('MediaType'),
- 'vcodec': 'none',
- 'url': stream.get('Url'),
- 'source_preference': reliability,
- 'format_note': format_note,
- })
-
- return {
- 'id': content_id,
- 'title': title,
- 'formats': formats,
- 'thumbnail': thumbnail,
- 'location': location,
- 'is_live': is_live,
- }
-
-
-class TuneInClipIE(TuneInBaseIE):
- IE_NAME = 'tunein:clip'
- _VALID_URL = r'https?://(?:www\.)?tunein\.com/station/.*?audioClipId\=(?P<id>\d+)'
- _API_URL_QUERY = '?tuneType=AudioClip&audioclipId=%s'
-
- _TESTS = [{
- 'url': 'http://tunein.com/station/?stationId=246119&audioClipId=816',
- 'md5': '99f00d772db70efc804385c6b47f4e77',
- 'info_dict': {
- 'id': '816',
- 'title': '32m',
- 'ext': 'mp3',
- },
- }]
+ if stream.get('media_type') == 'hls':
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(stream['url'], content_id, fatal=False)
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=subtitles)
+ elif determine_ext(stream['url']) == 'pls':
+ playlist_content = self._download_webpage(stream['url'], content_id)
+ formats.append({
+ 'url': self._search_regex(r'File1=(.*)', playlist_content, 'url', fatal=False),
+ 'abr': stream.get('bitrate'),
+ 'ext': stream.get('media_type'),
+ })
+ else:
+ formats.append({
+ 'url': stream['url'],
+ 'abr': stream.get('bitrate'),
+ 'ext': stream.get('media_type'),
+ })
+
+ return formats, subtitles
class TuneInStationIE(TuneInBaseIE):
- IE_NAME = 'tunein:station'
- _VALID_URL = r'https?://(?:www\.)?tunein\.com/(?:radio/.*?-s|station/.*?StationId=|embed/player/s)(?P<id>\d+)'
- _EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>(?:https?://)?tunein\.com/embed/player/[pst]\d+)']
- _API_URL_QUERY = '?tuneType=Station&stationId=%s'
-
- @classmethod
- def suitable(cls, url):
- return False if TuneInClipIE.suitable(url) else super(TuneInStationIE, cls).suitable(url)
+ _VALID_URL = TuneInBaseIE._VALID_URL_BASE + r'(?:/radio/[^?#]+-|/embed/player/)(?P<id>s\d+)'
+ _EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>(?:https?://)?tunein\.com/embed/player/s\d+)']
_TESTS = [{
- 'url': 'http://tunein.com/radio/Jazz24-885-s34682/',
+ 'url': 'https://tunein.com/radio/Jazz24-885-s34682/',
'info_dict': {
- 'id': '34682',
- 'title': 'Jazz 24 on 88.5 Jazz24 - KPLU-HD2',
+ 'id': 's34682',
+ 'title': 're:^Jazz24',
+ 'description': 'md5:d6d0b89063fd68d529fa7058ee98619b',
+ 'thumbnail': 're:^https?://[^?&]+/s34682',
+ 'location': 'Seattle-Tacoma, US',
'ext': 'mp3',
- 'location': 'Tacoma, WA',
+ 'live_status': 'is_live',
},
'params': {
- 'skip_download': True, # live stream
+ 'skip_download': True,
},
}, {
- 'url': 'http://tunein.com/embed/player/s6404/',
+ 'url': 'https://tunein.com/embed/player/s6404/',
'only_matching': True,
+ }, {
+ 'url': 'https://tunein.com/radio/BBC-Radio-1-988-s24939/',
+ 'info_dict': {
+ 'id': 's24939',
+ 'title': 're:^BBC Radio 1',
+ 'description': 'md5:f3f75f7423398d87119043c26e7bfb84',
+ 'thumbnail': 're:^https?://[^?&]+/s24939',
+ 'location': 'London, UK',
+ 'ext': 'mp3',
+ 'live_status': 'is_live',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
}]
+ def _real_extract(self, url):
+ station_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, station_id)
+ metadata = self._extract_metadata(webpage, station_id)
+
+ formats, subtitles = self._extract_formats_and_subtitles(station_id)
+ return {
+ 'id': station_id,
+ 'title': traverse_obj(metadata, ('profiles', station_id, 'title')),
+ 'description': traverse_obj(metadata, ('profiles', station_id, 'description')),
+ 'thumbnail': traverse_obj(metadata, ('profiles', station_id, 'image')),
+ 'timestamp': parse_iso8601(
+ traverse_obj(metadata, ('profiles', station_id, 'actions', 'play', 'publishTime'))),
+ 'location': traverse_obj(
+ metadata, ('profiles', station_id, 'metadata', 'properties', 'location', 'displayName'),
+ ('profiles', station_id, 'properties', 'location', 'displayName')),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'is_live': traverse_obj(metadata, ('profiles', station_id, 'actions', 'play', 'isLive')),
+ }
+
-class TuneInProgramIE(TuneInBaseIE):
- IE_NAME = 'tunein:program'
- _VALID_URL = r'https?://(?:www\.)?tunein\.com/(?:radio/.*?-p|program/.*?ProgramId=|embed/player/p)(?P<id>\d+)'
- _API_URL_QUERY = '?tuneType=Program&programId=%s'
+class TuneInPodcastIE(TuneInBaseIE):
+ _VALID_URL = TuneInBaseIE._VALID_URL_BASE + r'/(?:podcasts/[^?#]+-|embed/player/)(?P<id>p\d+)/?(?:#|$)'
+ _EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>(?:https?://)?tunein\.com/embed/player/p\d+)']
_TESTS = [{
- 'url': 'http://tunein.com/radio/Jazz-24-p2506/',
+ 'url': 'https://tunein.com/podcasts/Technology-Podcasts/Artificial-Intelligence-p1153019',
'info_dict': {
- 'id': '2506',
- 'title': 'Jazz 24 on 91.3 WUKY-HD3',
- 'ext': 'mp3',
- 'location': 'Lexington, KY',
- },
- 'params': {
- 'skip_download': True, # live stream
+ 'id': 'p1153019',
+ 'title': 'Lex Fridman Podcast',
+ 'description': 'md5:bedc4e5f1c94f7dec6e4317b5654b00d',
},
+ 'playlist_mincount': 200,
}, {
- 'url': 'http://tunein.com/embed/player/p191660/',
- 'only_matching': True,
+ 'url': 'https://tunein.com/embed/player/p191660/',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://tunein.com/podcasts/World-News/BBC-News-p14/',
+ 'info_dict': {
+ 'id': 'p14',
+ 'title': 'BBC News',
+ 'description': 'md5:1218e575eeaff75f48ed978261fa2068',
+ },
+ 'playlist_mincount': 200,
}]
+ _PAGE_SIZE = 30
+
+ def _real_extract(self, url):
+ podcast_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, podcast_id, fatal=False)
+ metadata = self._extract_metadata(webpage, podcast_id)
+
+ def page_func(page_num):
+ api_response = self._download_json(
+ f'https://api.tunein.com/profiles/{podcast_id}/contents', podcast_id,
+ note=f'Downloading page {page_num + 1}', query={
+ 'filter': 't:free',
+ 'offset': page_num * self._PAGE_SIZE,
+ 'limit': self._PAGE_SIZE,
+ })
-class TuneInTopicIE(TuneInBaseIE):
- IE_NAME = 'tunein:topic'
- _VALID_URL = r'https?://(?:www\.)?tunein\.com/(?:topic/.*?TopicId=|embed/player/t)(?P<id>\d+)'
- _API_URL_QUERY = '?tuneType=Topic&topicId=%s'
+ return [
+ self.url_result(
+ f'https://tunein.com/podcasts/{podcast_id}?topicId={episode["GuideId"][1:]}',
+ TuneInPodcastEpisodeIE, title=episode.get('Title'))
+ for episode in api_response['Items']]
+
+ entries = OnDemandPagedList(page_func, self._PAGE_SIZE)
+ return self.playlist_result(
+ entries, playlist_id=podcast_id, title=traverse_obj(metadata, ('profiles', podcast_id, 'title')),
+ description=traverse_obj(metadata, ('profiles', podcast_id, 'description')))
+
+
+class TuneInPodcastEpisodeIE(TuneInBaseIE):
+ _VALID_URL = TuneInBaseIE._VALID_URL_BASE + r'/podcasts/(?:[^?&]+-)?(?P<podcast_id>p\d+)/?\?topicId=(?P<id>\w\d+)'
_TESTS = [{
- 'url': 'http://tunein.com/topic/?TopicId=101830576',
- 'md5': 'c31a39e6f988d188252eae7af0ef09c9',
+ 'url': 'https://tunein.com/podcasts/Technology-Podcasts/Artificial-Intelligence-p1153019/?topicId=236404354',
'info_dict': {
- 'id': '101830576',
- 'title': 'Votez pour moi du 29 octobre 2015 (29/10/15)',
+ 'id': 't236404354',
+ 'title': '#351 \u2013 MrBeast: Future of YouTube, Twitter, TikTok, and Instagram',
+ 'description': 'md5:e1734db6f525e472c0c290d124a2ad77',
+ 'thumbnail': 're:^https?://[^?&]+/p1153019',
+ 'timestamp': 1673458571,
+ 'upload_date': '20230111',
+ 'series_id': 'p1153019',
+ 'series': 'Lex Fridman Podcast',
'ext': 'mp3',
- 'location': 'Belgium',
},
- }, {
- 'url': 'http://tunein.com/embed/player/t101830576/',
- 'only_matching': True,
}]
+ def _real_extract(self, url):
+ podcast_id, episode_id = self._match_valid_url(url).group('podcast_id', 'id')
+ episode_id = f't{episode_id}'
+
+ webpage = self._download_webpage(url, episode_id)
+ metadata = self._extract_metadata(webpage, episode_id)
+
+ formats, subtitles = self._extract_formats_and_subtitles(episode_id)
+ return {
+ 'id': episode_id,
+ 'title': traverse_obj(metadata, ('profiles', episode_id, 'title')),
+ 'description': traverse_obj(metadata, ('profiles', episode_id, 'description')),
+ 'thumbnail': traverse_obj(metadata, ('profiles', episode_id, 'image')),
+ 'timestamp': parse_iso8601(
+ traverse_obj(metadata, ('profiles', episode_id, 'actions', 'play', 'publishTime'))),
+ 'series_id': podcast_id,
+ 'series': traverse_obj(metadata, ('profiles', podcast_id, 'title')),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
+
class TuneInShortenerIE(InfoExtractor):
IE_NAME = 'tunein:shortener'
@@ -154,10 +206,13 @@ class TuneInShortenerIE(InfoExtractor):
# test redirection
'url': 'http://tun.in/ser7s',
'info_dict': {
- 'id': '34682',
- 'title': 'Jazz 24 on 88.5 Jazz24 - KPLU-HD2',
+ 'id': 's34682',
+ 'title': 're:^Jazz24',
+ 'description': 'md5:d6d0b89063fd68d529fa7058ee98619b',
+ 'thumbnail': 're:^https?://[^?&]+/s34682',
+ 'location': 'Seattle-Tacoma, US',
'ext': 'mp3',
- 'location': 'Tacoma, WA',
+ 'live_status': 'is_live',
},
'params': {
'skip_download': True, # live stream
@@ -169,6 +224,11 @@ class TuneInShortenerIE(InfoExtractor):
# The server doesn't support HEAD requests
urlh = self._request_webpage(
url, redirect_id, note='Downloading redirect page')
- url = urlh.geturl()
+
+ url = urlh.url
+ url_parsed = urllib.parse.urlparse(url)
+ if url_parsed.port == 443:
+ url = url_parsed._replace(netloc=url_parsed.hostname).geturl()  # ParseResult has no .url attribute
+
self.to_screen('Following redirect: %s' % url)
return self.url_result(url)
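
The podcast listing above hangs off OnDemandPagedList, which calls page_func lazily, so
only the pages a user actually consumes are downloaded. A sketch with a stubbed page
function (helper assumed from the vendored utils, per the import at the top of this file's
diff):

from hypervideo_dl.utils import OnDemandPagedList

PAGE_SIZE = 30
CATALOG = [f'episode-{i}' for i in range(70)]  # stand-in for the API

def page_func(page_num):
    start = page_num * PAGE_SIZE
    return CATALOG[start:start + PAGE_SIZE]  # the extractor downloads JSON here

entries = OnDemandPagedList(page_func, PAGE_SIZE)
print(entries.getslice(0, 5))  # touches only page 0; later pages are never built
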
diff --git a/hypervideo_dl/extractor/tv2.py b/hypervideo_dl/extractor/tv2.py
index c51e633..f6b452d 100644
--- a/hypervideo_dl/extractor/tv2.py
+++ b/hypervideo_dl/extractor/tv2.py
@@ -1,7 +1,7 @@
import re
from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..networking.exceptions import HTTPError
from ..utils import (
determine_ext,
ExtractorError,
@@ -57,8 +57,8 @@ class TV2IE(InfoExtractor):
headers={'content-type': 'application/json'},
data='{"device":{"id":"1-1-1","name":"Nettleser (HTML)"}}'.encode())['playback']
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
- error = self._parse_json(e.cause.read().decode(), video_id)['error']
+ if isinstance(e.cause, HTTPError) and e.cause.status == 401:
+ error = self._parse_json(e.cause.response.read().decode(), video_id)['error']
error_code = error.get('code')
if error_code == 'ASSET_PLAYBACK_INVALID_GEO_LOCATION':
self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
@@ -211,8 +211,8 @@ class KatsomoIE(InfoExtractor):
api_base + '/play.json?protocol=%s&videoFormat=SMIL+ISMUSP' % protocol,
video_id, 'Downloading play JSON')['playback']
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
- error = self._parse_json(e.cause.read().decode(), video_id)['error']
+ if isinstance(e.cause, HTTPError) and e.cause.status == 401:
+ error = self._parse_json(e.cause.response.read().decode(), video_id)['error']
error_code = error.get('code')
if error_code == 'ASSET_PLAYBACK_INVALID_GEO_LOCATION':
self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
diff --git a/hypervideo_dl/extractor/tv4.py b/hypervideo_dl/extractor/tv4.py
index 1378a6f..10a2fe6 100644
--- a/hypervideo_dl/extractor/tv4.py
+++ b/hypervideo_dl/extractor/tv4.py
@@ -2,8 +2,11 @@ import re
from .common import InfoExtractor
from ..utils import (
+ bool_or_none,
int_or_none,
parse_iso8601,
+ traverse_obj,
+ url_or_none,
)
@@ -20,19 +23,25 @@ class TV4IE(InfoExtractor):
sport/|
)
)(?P<id>[0-9]+)'''
- _GEO_COUNTRIES = ['SE']
+ _GEO_BYPASS = False
_TESTS = [
{
+ # not geo-restricted
'url': 'http://www.tv4.se/kalla-fakta/klipp/kalla-fakta-5-english-subtitles-2491650',
'md5': 'cb837212f342d77cec06e6dad190e96d',
'info_dict': {
'id': '2491650',
'ext': 'mp4',
'title': 'Kalla Fakta 5 (english subtitles)',
- 'thumbnail': r're:^https?://.*\.jpg$',
- 'timestamp': int,
+ 'description': '2491650',
+ 'series': 'Kalla fakta',
+ 'duration': 1335,
+ 'thumbnail': r're:^https?://[^/?#]+/api/v2/img/',
+ 'timestamp': 1385373240,
'upload_date': '20131125',
},
+ 'params': {'skip_download': 'm3u8'},
+ 'expected_warnings': ['Unable to download f4m manifest'],
},
{
'url': 'http://www.tv4play.se/iframe/video/3054113',
@@ -46,6 +55,7 @@ class TV4IE(InfoExtractor):
'timestamp': int,
'upload_date': '20150130',
},
+ 'skip': '404 Not Found',
},
{
'url': 'http://www.tv4play.se/sport/3060959',
@@ -69,29 +79,28 @@ class TV4IE(InfoExtractor):
}
]
+ def _call_api(self, endpoint, video_id, headers=None, query={}):
+ return self._download_json(
+ f'https://playback2.a2d.tv/{endpoint}/{video_id}', video_id,
+ f'Downloading {endpoint} API JSON', headers=headers, query={
+ 'service': 'tv4',
+ 'device': 'browser',
+ 'protocol': 'hls',
+ **query,
+ })
+
def _real_extract(self, url):
video_id = self._match_id(url)
- info = self._download_json(
- 'https://playback-api.b17g.net/asset/%s' % video_id,
- video_id, 'Downloading video info JSON', query={
- 'service': 'tv4',
- 'device': 'browser',
- 'protocol': 'hls,dash',
- 'drm': 'widevine',
- })['metadata']
+ info = traverse_obj(self._call_api('asset', video_id, query={
+ 'protocol': 'hls,dash',
+ 'drm': 'widevine',
+ }), ('metadata', {dict})) or {}
- title = info['title']
+ manifest_url = self._call_api(
+ 'play', video_id, headers=self.geo_verification_headers())['playbackItem']['manifestUrl']
- manifest_url = self._download_json(
- 'https://playback-api.b17g.net/media/' + video_id,
- video_id, query={
- 'service': 'tv4',
- 'device': 'browser',
- 'protocol': 'hls',
- })['playbackItem']['manifestUrl']
- formats = []
- subtitles = {}
+ formats, subtitles = [], {}
fmts, subs = self._extract_m3u8_formats_and_subtitles(
manifest_url, video_id, 'mp4',
@@ -117,20 +126,24 @@ class TV4IE(InfoExtractor):
subtitles = self._merge_subtitles(subtitles, subs)
if not formats and info.get('is_geo_restricted'):
- self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True)
+ self.raise_geo_restricted(
+ 'This video is not available from your location due to geo-restriction, or not being authenticated',
+ countries=['SE'])
return {
'id': video_id,
- 'title': title,
'formats': formats,
'subtitles': subtitles,
- 'description': info.get('description'),
- 'timestamp': parse_iso8601(info.get('broadcast_date_time')),
- 'duration': int_or_none(info.get('duration')),
- 'thumbnail': info.get('image'),
- 'is_live': info.get('isLive') is True,
- 'series': info.get('seriesTitle'),
- 'season_number': int_or_none(info.get('seasonNumber')),
- 'episode': info.get('episodeTitle'),
- 'episode_number': int_or_none(info.get('episodeNumber')),
+ **traverse_obj(info, {
+ 'title': ('title', {str}),
+ 'description': ('description', {str}),
+ 'timestamp': (('broadcast_date_time', 'broadcastDateTime'), {parse_iso8601}),
+ 'duration': ('duration', {int_or_none}),
+ 'thumbnail': ('image', {url_or_none}),
+ 'is_live': ('isLive', {bool_or_none}),
+ 'series': ('seriesTitle', {str}),
+ 'season_number': ('seasonNumber', {int_or_none}),
+ 'episode': ('episodeTitle', {str}),
+ 'episode_number': ('episodeNumber', {int_or_none}),
+ }, get_all=False),
}
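
The _call_api helper above layers caller parameters over defaults with dict unpacking, so
caller keys win; the idiom in isolation:

def build_query(query={}):  # the default dict is only read, never mutated
    return {'service': 'tv4', 'device': 'browser', 'protocol': 'hls', **query}

assert build_query()['protocol'] == 'hls'
assert build_query({'protocol': 'hls,dash', 'drm': 'widevine'})['protocol'] == 'hls,dash'
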
diff --git a/hypervideo_dl/extractor/tvp.py b/hypervideo_dl/extractor/tvp.py
index 8483564..2aa0dd8 100644
--- a/hypervideo_dl/extractor/tvp.py
+++ b/hypervideo_dl/extractor/tvp.py
@@ -268,8 +268,11 @@ class TVPIE(InfoExtractor):
class TVPStreamIE(InfoExtractor):
IE_NAME = 'tvp:stream'
- _VALID_URL = r'(?:tvpstream:|https?://tvpstream\.vod\.tvp\.pl/(?:\?(?:[^&]+[&;])*channel_id=)?)(?P<id>\d*)'
+ _VALID_URL = r'(?:tvpstream:|https?://(?:tvpstream\.vod|stream)\.tvp\.pl/(?:\?(?:[^&]+[&;])*channel_id=)?)(?P<id>\d*)'
_TESTS = [{
+ 'url': 'https://stream.tvp.pl/?channel_id=56969941',
+ 'only_matching': True,
+ }, {
# untestable as "video" id changes many times across a day
'url': 'https://tvpstream.vod.tvp.pl/?channel_id=1455',
'only_matching': True,
@@ -285,28 +288,21 @@ class TVPStreamIE(InfoExtractor):
'only_matching': True,
}]
- _PLAYER_BOX_RE = r'<div\s[^>]*id\s*=\s*["\']?tvp_player_box["\']?[^>]+data-%s-id\s*=\s*["\']?(\d+)'
- _BUTTON_RE = r'<div\s[^>]*data-channel-id=["\']?%s["\']?[^>]*\sdata-title=(?:"([^"]*)"|\'([^\']*)\')[^>]*\sdata-stationname=(?:"([^"]*)"|\'([^\']*)\')'
-
def _real_extract(self, url):
channel_id = self._match_id(url)
- channel_url = self._proto_relative_url('//tvpstream.vod.tvp.pl/?channel_id=%s' % channel_id or 'default')
- webpage = self._download_webpage(channel_url, channel_id, 'Downloading channel webpage')
- if not channel_id:
- channel_id = self._search_regex(self._PLAYER_BOX_RE % 'channel',
- webpage, 'default channel id')
- video_id = self._search_regex(self._PLAYER_BOX_RE % 'video',
- webpage, 'video id')
- audition_title, station_name = self._search_regex(
- self._BUTTON_RE % (re.escape(channel_id)), webpage,
- 'audition title and station name',
- group=(1, 2))
+ channel_url = self._proto_relative_url('//stream.tvp.pl/?channel_id=%s' % channel_id or 'default')
+ webpage = self._download_webpage(channel_url, channel_id or 'default', 'Downloading channel webpage')
+ channels = self._search_json(
+ r'window\.__channels\s*=', webpage, 'channel list', channel_id,
+ contains_pattern=r'\[\s*{(?s:.+)}\s*]')
+ channel = traverse_obj(channels, (lambda _, v: channel_id == str(v['id'])), get_all=False) if channel_id else channels[0]
+ audition = traverse_obj(channel, ('items', lambda _, v: v['is_live'] is True), get_all=False)
return {
'_type': 'url_transparent',
- 'id': channel_id,
- 'url': 'tvp:%s' % video_id,
- 'title': audition_title,
- 'alt_title': station_name,
+ 'id': channel_id or channel['id'],
+ 'url': 'tvp:%s' % audition['video_id'],
+ 'title': audition.get('title'),
+ 'alt_title': channel.get('title'),
'is_live': True,
'ie_key': 'TVPEmbed',
}
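
The _search_json call with contains_pattern above boils down to: find the assignment,
capture the bracketed JSON array, parse it, then pick the matching channel and its live
item. A stdlib-only approximation over an assumed page shape:

import json
import re

webpage = 'window.__channels = [{"id": 1455, "items": [{"is_live": true, "video_id": 55555555}]}];'
raw = re.search(r'window\.__channels\s*=\s*(\[\s*{.+}\s*])', webpage, re.DOTALL).group(1)
channels = json.loads(raw)
channel = next((c for c in channels if str(c['id']) == '1455'), channels[0])
audition = next(i for i in channel['items'] if i['is_live'] is True)
print(audition['video_id'])  # becomes the tvp:<video_id> URL above
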
@@ -486,21 +482,34 @@ class TVPEmbedIE(InfoExtractor):
class TVPVODBaseIE(InfoExtractor):
_API_BASE_URL = 'https://vod.tvp.pl/api/products'
- def _call_api(self, resource, video_id, **kwargs):
- return self._download_json(
+ def _call_api(self, resource, video_id, query={}, **kwargs):
+ is_valid = lambda x: 200 <= x < 300
+ document, urlh = self._download_json_handle(
f'{self._API_BASE_URL}/{resource}', video_id,
- query={'lang': 'pl', 'platform': 'BROWSER'}, **kwargs)
-
- def _parse_video(self, video):
- return {
- '_type': 'url',
- 'url': 'tvp:' + video['externalUid'],
- 'ie_key': TVPEmbedIE.ie_key(),
- 'title': video.get('title'),
- 'description': traverse_obj(video, ('lead', 'description')),
- 'age_limit': int_or_none(video.get('rating')),
- 'duration': int_or_none(video.get('duration')),
- }
+ query={'lang': 'pl', 'platform': 'BROWSER', **query},
+ expected_status=lambda x: is_valid(x) or 400 <= x < 500, **kwargs)
+ if is_valid(urlh.status):
+ return document
+ raise ExtractorError(f'Woronicza said: {document.get("code")} (HTTP {urlh.status})')
+
+ def _parse_video(self, video, with_url=True):
+ info_dict = traverse_obj(video, {
+ 'id': ('id', {str_or_none}),
+ 'title': 'title',
+ 'age_limit': ('rating', {int_or_none}),
+ 'duration': ('duration', {int_or_none}),
+ 'episode_number': ('number', {int_or_none}),
+ 'series': ('season', 'serial', 'title', {str_or_none}),
+ 'thumbnails': ('images', ..., ..., {'url': ('url', {url_or_none})}),
+ })
+ info_dict['description'] = clean_html(dict_get(video, ('lead', 'description')))
+ if with_url:
+ info_dict.update({
+ '_type': 'url',
+ 'url': video['webUrl'],
+ 'ie_key': TVPVODVideoIE.ie_key(),
+ })
+ return info_dict
class TVPVODVideoIE(TVPVODBaseIE):
@@ -510,37 +519,70 @@ class TVPVODVideoIE(TVPVODBaseIE):
_TESTS = [{
'url': 'https://vod.tvp.pl/dla-dzieci,24/laboratorium-alchemika-odcinki,309338/odcinek-24,S01E24,311357',
'info_dict': {
- 'id': '60468609',
+ 'id': '311357',
'ext': 'mp4',
- 'title': 'Laboratorium alchemika, Tusze termiczne. Jak zobaczyć niewidoczne. Odcinek 24',
+ 'title': 'Tusze termiczne. Jak zobaczyć niewidoczne. Odcinek 24',
'description': 'md5:1d4098d3e537092ccbac1abf49b7cd4c',
'duration': 300,
'episode_number': 24,
'episode': 'Episode 24',
'age_limit': 0,
'series': 'Laboratorium alchemika',
- 'thumbnail': 're:https://.+',
+ 'thumbnail': 're:https?://.+',
},
+ 'params': {'skip_download': 'm3u8'},
}, {
'url': 'https://vod.tvp.pl/filmy-dokumentalne,163/ukrainski-sluga-narodu,339667',
'info_dict': {
- 'id': '51640077',
+ 'id': '339667',
'ext': 'mp4',
- 'title': 'Ukraiński sługa narodu, Ukraiński sługa narodu',
- 'series': 'Ukraiński sługa narodu',
+ 'title': 'Ukraiński sługa narodu',
'description': 'md5:b7940c0a8e439b0c81653a986f544ef3',
'age_limit': 12,
- 'episode': 'Episode 0',
- 'episode_number': 0,
'duration': 3051,
- 'thumbnail': 're:https://.+',
+ 'thumbnail': 're:https?://.+',
+ 'subtitles': 'count:2',
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }, {
+ 'note': 'embed fails with "payment required"',
+ 'url': 'https://vod.tvp.pl/seriale,18/polowanie-na-cmy-odcinki,390116/odcinek-7,S01E07,398869',
+ 'info_dict': {
+ 'id': '398869',
+ 'ext': 'mp4',
+ 'title': 'odc. 7',
+ 'description': 'md5:dd2bb33f023dc5c2fbaddfbe4cb5dba0',
+ 'duration': 2750,
+ 'age_limit': 16,
+ 'series': 'Polowanie na ćmy',
+ 'episode_number': 7,
+ 'episode': 'Episode 7',
+ 'thumbnail': 're:https?://.+',
},
+ 'params': {'skip_download': 'm3u8'},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
- return self._parse_video(self._call_api(f'vods/{video_id}', video_id))
+ info_dict = self._parse_video(self._call_api(f'vods/{video_id}', video_id), with_url=False)
+
+ playlist = self._call_api(f'{video_id}/videos/playlist', video_id, query={'videoType': 'MOVIE'})
+
+ info_dict['formats'] = []
+ for manifest_url in traverse_obj(playlist, ('sources', 'HLS', ..., 'src')):
+ info_dict['formats'].extend(self._extract_m3u8_formats(manifest_url, video_id, fatal=False))
+ for manifest_url in traverse_obj(playlist, ('sources', 'DASH', ..., 'src')):
+ info_dict['formats'].extend(self._extract_mpd_formats(manifest_url, video_id, fatal=False))
+
+ info_dict['subtitles'] = {}
+ for sub in playlist.get('subtitles') or []:
+ info_dict['subtitles'].setdefault(sub.get('language') or 'und', []).append({
+ 'url': sub['url'],
+ 'ext': 'ttml',
+ })
+
+ return info_dict
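
The subtitle loop above buckets tracks by language with setdefault; in isolation, with
assumed sample data:

playlist_subtitles = [
    {'language': 'pl', 'url': 'https://vod.tvp.pl/subs/pl.ttml'},
    {'url': 'https://vod.tvp.pl/subs/unknown.ttml'},  # no language tag
]
subtitles = {}
for sub in playlist_subtitles:
    subtitles.setdefault(sub.get('language') or 'und', []).append(
        {'url': sub['url'], 'ext': 'ttml'})
assert sorted(subtitles) == ['pl', 'und']
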
class TVPVODSeriesIE(TVPVODBaseIE):
@@ -555,7 +597,7 @@ class TVPVODSeriesIE(TVPVODBaseIE):
'age_limit': 12,
'categories': ['seriale'],
},
- 'playlist_count': 129,
+ 'playlist_count': 130,
}, {
'url': 'https://vod.tvp.pl/programy,88/rolnik-szuka-zony-odcinki,284514',
'only_matching': True,
diff --git a/hypervideo_dl/extractor/tvplay.py b/hypervideo_dl/extractor/tvplay.py
index 9ef4f96..48a6efe 100644
--- a/hypervideo_dl/extractor/tvplay.py
+++ b/hypervideo_dl/extractor/tvplay.py
@@ -1,10 +1,8 @@
import re
from .common import InfoExtractor
-from ..compat import (
- compat_HTTPError,
- compat_urlparse,
-)
+from ..compat import compat_urlparse
+from ..networking.exceptions import HTTPError
from ..utils import (
determine_ext,
ExtractorError,
@@ -30,10 +28,7 @@ class TVPlayIE(InfoExtractor):
(?:
tvplay(?:\.skaties)?\.lv(?:/parraides)?|
(?:tv3play|play\.tv3)\.lt(?:/programos)?|
- tv3play(?:\.tv3)?\.ee/sisu|
- (?:tv(?:3|6|8|10)play)\.se/program|
- (?:(?:tv3play|viasat4play|tv6play)\.no|(?:tv3play)\.dk)/programmer|
- play\.nova(?:tv)?\.bg/programi
+ tv3play(?:\.tv3)?\.ee/sisu
)
/(?:[^/]+/)+
)
@@ -93,117 +88,6 @@ class TVPlayIE(InfoExtractor):
},
},
{
- 'url': 'http://www.tv3play.se/program/husraddarna/395385?autostart=true',
- 'info_dict': {
- 'id': '395385',
- 'ext': 'mp4',
- 'title': 'Husräddarna S02E07',
- 'description': 'md5:f210c6c89f42d4fc39faa551be813777',
- 'duration': 2574,
- 'timestamp': 1400596321,
- 'upload_date': '20140520',
- },
- 'params': {
- 'skip_download': True,
- },
- },
- {
- 'url': 'http://www.tv6play.se/program/den-sista-dokusapan/266636?autostart=true',
- 'info_dict': {
- 'id': '266636',
- 'ext': 'mp4',
- 'title': 'Den sista dokusåpan S01E08',
- 'description': 'md5:295be39c872520221b933830f660b110',
- 'duration': 1492,
- 'timestamp': 1330522854,
- 'upload_date': '20120229',
- 'age_limit': 18,
- },
- 'params': {
- 'skip_download': True,
- },
- },
- {
- 'url': 'http://www.tv8play.se/program/antikjakten/282756?autostart=true',
- 'info_dict': {
- 'id': '282756',
- 'ext': 'mp4',
- 'title': 'Antikjakten S01E10',
- 'description': 'md5:1b201169beabd97e20c5ad0ad67b13b8',
- 'duration': 2646,
- 'timestamp': 1348575868,
- 'upload_date': '20120925',
- },
- 'params': {
- 'skip_download': True,
- },
- },
- {
- 'url': 'http://www.tv3play.no/programmer/anna-anka-soker-assistent/230898?autostart=true',
- 'info_dict': {
- 'id': '230898',
- 'ext': 'mp4',
- 'title': 'Anna Anka søker assistent - Ep. 8',
- 'description': 'md5:f80916bf5bbe1c5f760d127f8dd71474',
- 'duration': 2656,
- 'timestamp': 1277720005,
- 'upload_date': '20100628',
- },
- 'params': {
- 'skip_download': True,
- },
- },
- {
- 'url': 'http://www.viasat4play.no/programmer/budbringerne/21873?autostart=true',
- 'info_dict': {
- 'id': '21873',
- 'ext': 'mp4',
- 'title': 'Budbringerne program 10',
- 'description': 'md5:4db78dc4ec8a85bb04fd322a3ee5092d',
- 'duration': 1297,
- 'timestamp': 1254205102,
- 'upload_date': '20090929',
- },
- 'params': {
- 'skip_download': True,
- },
- },
- {
- 'url': 'http://www.tv6play.no/programmer/hotelinspektor-alex-polizzi/361883?autostart=true',
- 'info_dict': {
- 'id': '361883',
- 'ext': 'mp4',
- 'title': 'Hotelinspektør Alex Polizzi - Ep. 10',
- 'description': 'md5:3ecf808db9ec96c862c8ecb3a7fdaf81',
- 'duration': 2594,
- 'timestamp': 1393236292,
- 'upload_date': '20140224',
- },
- 'params': {
- 'skip_download': True,
- },
- },
- {
- 'url': 'http://play.novatv.bg/programi/zdravei-bulgariya/624952?autostart=true',
- 'info_dict': {
- 'id': '624952',
- 'ext': 'flv',
- 'title': 'Здравей, България (12.06.2015 г.) ',
- 'description': 'md5:99f3700451ac5bb71a260268b8daefd7',
- 'duration': 8838,
- 'timestamp': 1434100372,
- 'upload_date': '20150612',
- },
- 'params': {
- # rtmp download
- 'skip_download': True,
- },
- },
- {
- 'url': 'https://play.nova.bg/programi/zdravei-bulgariya/764300?autostart=true',
- 'only_matching': True,
- },
- {
'url': 'http://tvplay.skaties.lv/parraides/vinas-melo-labak/418113?autostart=true',
'only_matching': True,
},
@@ -243,8 +127,8 @@ class TVPlayIE(InfoExtractor):
'http://playapi.mtgx.tv/v3/videos/stream/%s' % video_id,
video_id, 'Downloading streams JSON')
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
- msg = self._parse_json(e.cause.read().decode('utf-8'), video_id)
+ if isinstance(e.cause, HTTPError) and e.cause.status == 403:
+ msg = self._parse_json(e.cause.response.read().decode('utf-8'), video_id)
raise ExtractorError(msg['msg'], expected=True)
raise
@@ -327,103 +211,6 @@ class TVPlayIE(InfoExtractor):
}
-class ViafreeIE(InfoExtractor):
- _VALID_URL = r'''(?x)
- https?://
- (?:www\.)?
- viafree\.(?P<country>dk|no|se|fi)
- /(?P<id>(?:program(?:mer)?|ohjelmat)?/(?:[^/]+/)+[^/?#&]+)
- '''
- _TESTS = [{
- 'url': 'http://www.viafree.no/programmer/underholdning/det-beste-vorspielet/sesong-2/episode-1',
- 'info_dict': {
- 'id': '757786',
- 'ext': 'mp4',
- 'title': 'Det beste vorspielet - Sesong 2 - Episode 1',
- 'description': 'md5:b632cb848331404ccacd8cd03e83b4c3',
- 'series': 'Det beste vorspielet',
- 'season_number': 2,
- 'duration': 1116,
- 'timestamp': 1471200600,
- 'upload_date': '20160814',
- },
- 'params': {
- 'skip_download': True,
- },
- }, {
- 'url': 'https://www.viafree.dk/programmer/humor/comedy-central-roast-of-charlie-sheen/film/1047660',
- 'info_dict': {
- 'id': '1047660',
- 'ext': 'mp4',
- 'title': 'Comedy Central Roast of Charlie Sheen - Comedy Central Roast of Charlie Sheen',
- 'description': 'md5:ec956d941ae9fd7c65a48fd64951dc6d',
- 'series': 'Comedy Central Roast of Charlie Sheen',
- 'season_number': 1,
- 'duration': 3747,
- 'timestamp': 1608246060,
- 'upload_date': '20201217'
- },
- 'params': {
- 'skip_download': True
- }
- }, {
- # with relatedClips
- 'url': 'http://www.viafree.se/program/reality/sommaren-med-youtube-stjarnorna/sasong-1/avsnitt-1',
- 'only_matching': True,
- }, {
- # Different og:image URL schema
- 'url': 'http://www.viafree.se/program/reality/sommaren-med-youtube-stjarnorna/sasong-1/avsnitt-2',
- 'only_matching': True,
- }, {
- 'url': 'http://www.viafree.se/program/livsstil/husraddarna/sasong-2/avsnitt-2',
- 'only_matching': True,
- }, {
- 'url': 'http://www.viafree.dk/programmer/reality/paradise-hotel/saeson-7/episode-5',
- 'only_matching': True,
- }, {
- 'url': 'http://www.viafree.se/program/underhallning/i-like-radio-live/sasong-1/676869',
- 'only_matching': True,
- }, {
- 'url': 'https://www.viafree.fi/ohjelmat/entertainment/amazing-makeovers/kausi-7/jakso-2',
- 'only_matching': True,
- }]
- _GEO_BYPASS = False
-
- def _real_extract(self, url):
- country, path = self._match_valid_url(url).groups()
- content = self._download_json(
- 'https://viafree-content.mtg-api.com/viafree-content/v1/%s/path/%s' % (country, path), path)
- program = content['_embedded']['viafreeBlocks'][0]['_embedded']['program']
- guid = program['guid']
- meta = content['meta']
- title = meta['title']
-
- try:
- stream_href = self._download_json(
- program['_links']['streamLink']['href'], guid,
- headers=self.geo_verification_headers())['embedded']['prioritizedStreams'][0]['links']['stream']['href']
- except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
- self.raise_geo_restricted(countries=[country])
- raise
-
- formats, subtitles = self._extract_m3u8_formats_and_subtitles(stream_href, guid, 'mp4')
- episode = program.get('episode') or {}
- return {
- 'id': guid,
- 'title': title,
- 'thumbnail': meta.get('image'),
- 'description': meta.get('description'),
- 'series': episode.get('seriesTitle'),
- 'subtitles': subtitles,
- 'episode_number': int_or_none(episode.get('episodeNumber')),
- 'season_number': int_or_none(episode.get('seasonNumber')),
- 'duration': int_or_none(try_get(program, lambda x: x['video']['duration']['milliseconds']), 1000),
- 'timestamp': parse_iso8601(try_get(program, lambda x: x['availability']['start'])),
- 'formats': formats,
- }
-
-
class TVPlayHomeIE(InfoExtractor):
_VALID_URL = r'''(?x)
https?://
diff --git a/hypervideo_dl/extractor/tvplayer.py b/hypervideo_dl/extractor/tvplayer.py
index b05355f..228c236 100644
--- a/hypervideo_dl/extractor/tvplayer.py
+++ b/hypervideo_dl/extractor/tvplayer.py
@@ -1,8 +1,6 @@
from .common import InfoExtractor
-from ..compat import (
- compat_HTTPError,
- compat_str,
-)
+from ..compat import compat_str
+from ..networking.exceptions import HTTPError
from ..utils import (
extract_attributes,
try_get,
@@ -64,9 +62,9 @@ class TVPlayerIE(InfoExtractor):
'validate': validate,
}))['tvplayer']['response']
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError):
+ if isinstance(e.cause, HTTPError):
response = self._parse_json(
- e.cause.read().decode(), resource_id)['tvplayer']['response']
+ e.cause.response.read().decode(), resource_id)['tvplayer']['response']
raise ExtractorError(
'%s said: %s' % (self.IE_NAME, response['error']), expected=True)
raise
diff --git a/hypervideo_dl/extractor/twitcasting.py b/hypervideo_dl/extractor/twitcasting.py
index 30bc987..ede1085 100644
--- a/hypervideo_dl/extractor/twitcasting.py
+++ b/hypervideo_dl/extractor/twitcasting.py
@@ -38,7 +38,7 @@ class TwitCastingIE(InfoExtractor):
'description': 'Twitter Oficial da cantora brasileira Ivete Sangalo.',
'thumbnail': r're:^https?://.*\.jpg$',
'upload_date': '20110822',
- 'timestamp': 1314010824,
+ 'timestamp': 1313978424,
'duration': 32,
'view_count': int,
},
@@ -52,10 +52,10 @@ class TwitCastingIE(InfoExtractor):
'ext': 'mp4',
'title': 'Live playing something #3689740',
'uploader_id': 'mttbernardini',
- 'description': 'Salve, io sono Matto (ma con la e). Questa è la mia presentazione, in quanto sono letteralmente matto (nel senso di strano), con qualcosa in più.',
+ 'description': 'md5:1dc7efa2f1ab932fcd119265cebeec69',
'thumbnail': r're:^https?://.*\.jpg$',
- 'upload_date': '20120212',
- 'timestamp': 1329028024,
+ 'upload_date': '20120211',
+ 'timestamp': 1328995624,
'duration': 681,
'view_count': int,
},
@@ -64,15 +64,22 @@ class TwitCastingIE(InfoExtractor):
'videopassword': 'abc',
},
}, {
- 'note': 'archive is split in 2 parts',
'url': 'https://twitcasting.tv/loft_heaven/movie/685979292',
'info_dict': {
'id': '685979292',
'ext': 'mp4',
- 'title': '南波一海のhear_here “ナタリー望月哲さんに聞く編集と「渋谷系狂騒曲」”',
- 'duration': 6964.599334,
+ 'title': '【無料配信】南波一海のhear/here “ナタリー望月哲さんに聞く編集と「渋谷系狂騒曲」”',
+ 'uploader_id': 'loft_heaven',
+ 'description': 'md5:3a0c7b53019df987ce545c935538bacf',
+ 'upload_date': '20210604',
+ 'timestamp': 1622802114,
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 6964,
+ 'view_count': int,
+ },
+ 'params': {
+ 'skip_download': True,
},
- 'playlist_mincount': 2,
}]
def _parse_data_movie_playlist(self, dmp, video_id):
@@ -88,18 +95,21 @@ class TwitCastingIE(InfoExtractor):
def _real_extract(self, url):
uploader_id, video_id = self._match_valid_url(url).groups()
+ webpage, urlh = self._download_webpage_handle(url, video_id)
video_password = self.get_param('videopassword')
request_data = None
if video_password:
request_data = urlencode_postdata({
'password': video_password,
+ **self._hidden_inputs(webpage),
}, encoding='utf-8')
- webpage, urlh = self._download_webpage_handle(
- url, video_id, data=request_data,
- headers={'Origin': 'https://twitcasting.tv'})
- if urlh.geturl() != url and request_data:
+ webpage, urlh = self._download_webpage_handle(
+ url, video_id, data=request_data,
+ headers={'Origin': 'https://twitcasting.tv'},
+ note='Trying video password')
+ if urlh.url != url and request_data:
webpage = self._download_webpage(
- urlh.geturl(), video_id, data=request_data,
+ urlh.url, video_id, data=request_data,
headers={'Origin': 'https://twitcasting.tv'},
note='Retrying authentication')
# must check here, as the first request can contain a password input form even if the password is correct
@@ -122,7 +132,7 @@ class TwitCastingIE(InfoExtractor):
duration = (try_get(video_js_data, lambda x: sum(float_or_none(y.get('duration')) for y in x) / 1000)
or parse_duration(clean_html(get_element_by_class('tw-player-duration-time', webpage))))
view_count = str_to_int(self._search_regex(
- (r'Total\s*:\s*([\d,]+)\s*Views', r'総視聴者\s*:\s*([\d,]+)\s*</'), webpage, 'views', None))
+ (r'Total\s*:\s*Views\s*([\d,]+)', r'総視聴者\s*:\s*([\d,]+)\s*</'), webpage, 'views', None))
timestamp = unified_timestamp(self._search_regex(
r'data-toggle="true"[^>]+datetime="([^"]+)"',
webpage, 'datetime', None))
diff --git a/hypervideo_dl/extractor/twitch.py b/hypervideo_dl/extractor/twitch.py
index c59d1cf..3297ef0 100644
--- a/hypervideo_dl/extractor/twitch.py
+++ b/hypervideo_dl/extractor/twitch.py
@@ -41,23 +41,27 @@ class TwitchBaseIE(InfoExtractor):
_USHER_BASE = 'https://usher.ttvnw.net'
_LOGIN_FORM_URL = 'https://www.twitch.tv/login'
_LOGIN_POST_URL = 'https://passport.twitch.tv/login'
- _CLIENT_ID = 'kimne78kx3ncx6brgo4mv6wki5h1ko'
_NETRC_MACHINE = 'twitch'
_OPERATION_HASHES = {
'CollectionSideBar': '27111f1b382effad0b6def325caef1909c733fe6a4fbabf54f8d491ef2cf2f14',
'FilterableVideoTower_Videos': 'a937f1d22e269e39a03b509f65a7490f9fc247d7f83d6ac1421523e3b68042cb',
'ClipsCards__User': 'b73ad2bfaecfd30a9e6c28fada15bd97032c83ec77a0440766a56fe0bd632777',
- 'ChannelCollectionsContent': '07e3691a1bad77a36aba590c351180439a40baefc1c275356f40fc7082419a84',
- 'StreamMetadata': '1c719a40e481453e5c48d9bb585d971b8b372f8ebb105b17076722264dfa5b3e',
+ 'ChannelCollectionsContent': '447aec6a0cc1e8d0a8d7732d47eb0762c336a2294fdb009e9c9d854e49d484b9',
+ 'StreamMetadata': 'a647c2a13599e5991e175155f798ca7f1ecddde73f7f341f39009c14dbf59962',
'ComscoreStreamingQuery': 'e1edae8122517d013405f237ffcc124515dc6ded82480a88daef69c83b53ac01',
'VideoAccessToken_Clip': '36b89d2507fce29e5ca551df756d27c1cfe079e2609642b4390aa4c35796eb11',
'VideoPreviewOverlay': '3006e77e51b128d838fa4e835723ca4dc9a05c5efd4466c1085215c6e437e65c',
- 'VideoMetadata': '226edb3e692509f727fd56821f5653c05740242c82b0388883e0c0e75dcbf687',
+ 'VideoMetadata': '49b5b8f268cdeb259d75b58dcb0c1a748e3b575003448a2333dc5cdafd49adad',
'VideoPlayer_ChapterSelectButtonVideo': '8d2793384aac3773beab5e59bd5d6f585aedb923d292800119e03d40cd0f9b41',
'VideoPlayer_VODSeekbarPreviewVideo': '07e99e4d56c5a7c67117a154777b0baf85a5ffefa393b213f4bc712ccaf85dd6',
}
+ @property
+ def _CLIENT_ID(self):
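+ # the default client ID below can be overridden with the 'client_id' extractor-arg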
+ return self._configuration_arg(
+ 'client_id', ['ue6666qo983tsx6so1t0vnawi233wa'], ie_key='Twitch', casesense=True)[0]
+
def _perform_login(self, username, password):
def fail(message):
raise ExtractorError(
@@ -67,7 +71,7 @@ class TwitchBaseIE(InfoExtractor):
form = self._hidden_inputs(page)
form.update(data)
- page_url = urlh.geturl()
+ page_url = urlh.url
post_url = self._search_regex(
r'<form[^>]+action=(["\'])(?P<url>.+?)\1', page,
'post url', default=self._LOGIN_POST_URL, group='url')
@@ -179,6 +183,14 @@ class TwitchBaseIE(InfoExtractor):
video_id, ops,
'Downloading %s access token GraphQL' % token_kind)['data'][method]
+ def _get_thumbnails(self, thumbnail):
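+ # rewriting the WxH part of the URL to "0x0" appears to return the source-resolution image, so prefer that variant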
+ return [{
+ 'url': re.sub(r'\d+x\d+(\.\w+)($|(?=[?#]))', r'0x0\g<1>', thumbnail),
+ 'preference': 1,
+ }, {
+ 'url': thumbnail,
+ }] if thumbnail else None
+
class TwitchVodIE(TwitchBaseIE):
IE_NAME = 'twitch:vod'
@@ -186,7 +198,8 @@ class TwitchVodIE(TwitchBaseIE):
https?://
(?:
(?:(?:www|go|m)\.)?twitch\.tv/(?:[^/]+/v(?:ideo)?|videos)/|
- player\.twitch\.tv/\?.*?\bvideo=v?
+ player\.twitch\.tv/\?.*?\bvideo=v?|
+ www\.twitch\.tv/[^/]+/schedule\?vodID=
)
(?P<id>\d+)
'''
@@ -355,6 +368,9 @@ class TwitchVodIE(TwitchBaseIE):
'skip_download': True
},
'expected_warnings': ['Unable to download JSON metadata: HTTP Error 403: Forbidden']
+ }, {
+ 'url': 'https://www.twitch.tv/tangotek/schedule?vodID=1822395420',
+ 'only_matching': True,
}]
def _download_info(self, item_id):
@@ -380,13 +396,14 @@ class TwitchVodIE(TwitchBaseIE):
}],
'Downloading stream metadata GraphQL')
- video = traverse_obj(data, (0, 'data', 'video'))
- video['moments'] = traverse_obj(data, (1, 'data', 'video', 'moments', 'edges', ..., 'node'))
- video['storyboard'] = traverse_obj(data, (2, 'data', 'video', 'seekPreviewsURL'), expected_type=url_or_none)
-
+ video = traverse_obj(data, (..., 'data', 'video'), get_all=False)
if video is None:
- raise ExtractorError(
- 'Video %s does not exist' % item_id, expected=True)
+ raise ExtractorError(f'Video {item_id} does not exist', expected=True)
+
+ video['moments'] = traverse_obj(data, (..., 'data', 'video', 'moments', 'edges', ..., 'node'))
+ video['storyboard'] = traverse_obj(
+ data, (..., 'data', 'video', 'seekPreviewsURL', {url_or_none}), get_all=False)
+
return video
def _extract_info(self, info):
@@ -455,19 +472,17 @@ class TwitchVodIE(TwitchBaseIE):
thumbnail = url_or_none(info.get('previewThumbnailURL'))
is_live = None
if thumbnail:
- if thumbnail.endswith('/404_processing_{width}x{height}.png'):
+ if re.findall(r'/404_processing_[^.?#]+\.png', thumbnail):
is_live, thumbnail = True, None
else:
is_live = False
- for p in ('width', 'height'):
- thumbnail = thumbnail.replace('{%s}' % p, '0')
return {
'id': vod_id,
'title': info.get('title') or 'Untitled Broadcast',
'description': info.get('description'),
'duration': int_or_none(info.get('lengthSeconds')),
- 'thumbnail': thumbnail,
+ 'thumbnails': self._get_thumbnails(thumbnail),
'uploader': try_get(info, lambda x: x['owner']['displayName'], compat_str),
'uploader_id': try_get(info, lambda x: x['owner']['login'], compat_str),
'timestamp': unified_timestamp(info.get('publishedAt')),
@@ -854,6 +869,13 @@ class TwitchVideosCollectionsIE(TwitchPlaylistBaseIE):
'title': 'spamfish - Collections',
},
'playlist_mincount': 3,
+ }, {
+ 'url': 'https://www.twitch.tv/monstercat/videos?filter=collections',
+ 'info_dict': {
+ 'id': 'monstercat',
+ 'title': 'monstercat - Collections',
+ },
+ 'playlist_mincount': 13,
}]
_OPERATION_NAME = 'ChannelCollectionsContent'
@@ -922,6 +944,7 @@ class TwitchStreamIE(TwitchBaseIE):
# m3u8 download
'skip_download': True,
},
+ 'skip': 'User does not exist',
}, {
'url': 'http://www.twitch.tv/miracle_doto#profile-0',
'only_matching': True,
@@ -934,6 +957,25 @@ class TwitchStreamIE(TwitchBaseIE):
}, {
'url': 'https://m.twitch.tv/food',
'only_matching': True,
+ }, {
+ 'url': 'https://www.twitch.tv/monstercat',
+ 'info_dict': {
+ 'id': '40500071752',
+ 'display_id': 'monstercat',
+ 'title': 're:Monstercat',
+ 'description': 'md5:0945ad625e615bc8f0469396537d87d9',
+ 'is_live': True,
+ 'timestamp': 1677107190,
+ 'upload_date': '20230222',
+ 'uploader': 'Monstercat',
+ 'uploader_id': 'monstercat',
+ 'live_status': 'is_live',
+ 'thumbnail': 're:https://.*.jpg',
+ 'ext': 'mp4',
+ },
+ 'params': {
+ 'skip_download': 'Livestream',
+ },
}]
@classmethod
@@ -1025,7 +1067,7 @@ class TwitchStreamIE(TwitchBaseIE):
'display_id': channel_name,
'title': title,
'description': description,
- 'thumbnail': thumbnail,
+ 'thumbnails': self._get_thumbnails(thumbnail),
'uploader': uploader,
'uploader_id': channel_name,
'timestamp': timestamp,
@@ -1041,7 +1083,7 @@ class TwitchClipsIE(TwitchBaseIE):
https?://
(?:
clips\.twitch\.tv/(?:embed\?.*?\bclip=|(?:[^/]+/)*)|
- (?:(?:www|go|m)\.)?twitch\.tv/[^/]+/clip/
+ (?:(?:www|go|m)\.)?twitch\.tv/(?:[^/]+/)?clip/
)
(?P<id>[^/?#&]+)
'''
@@ -1077,6 +1119,9 @@ class TwitchClipsIE(TwitchBaseIE):
}, {
'url': 'https://go.twitch.tv/rossbroadcast/clip/ConfidentBraveHumanChefFrank',
'only_matching': True,
+ }, {
+ 'url': 'https://m.twitch.tv/clip/FaintLightGullWholeWheat',
+ 'only_matching': True,
}]
def _real_extract(self, url):
diff --git a/hypervideo_dl/extractor/twitter.py b/hypervideo_dl/extractor/twitter.py
index 18ebb36..66d1eb8 100644
--- a/hypervideo_dl/extractor/twitter.py
+++ b/hypervideo_dl/extractor/twitter.py
@@ -1,10 +1,9 @@
+import functools
import json
import re
-import urllib.error
from .common import InfoExtractor
from .periscope import PeriscopeBaseIE, PeriscopeIE
-from ..compat import functools # isort: split
from ..compat import (
compat_parse_qs,
compat_urllib_parse_unquote,
@@ -13,10 +12,12 @@ from ..compat import (
from ..utils import (
ExtractorError,
dict_get,
+ filter_dict,
float_or_none,
format_field,
int_or_none,
make_archive_id,
+ remove_end,
str_or_none,
strip_or_none,
traverse_obj,
@@ -30,13 +31,67 @@ from ..utils import (
class TwitterBaseIE(InfoExtractor):
+ _NETRC_MACHINE = 'twitter'
_API_BASE = 'https://api.twitter.com/1.1/'
_GRAPHQL_API_BASE = 'https://twitter.com/i/api/graphql/'
- _TOKENS = {
- 'AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA': None,
- 'AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw': None,
- }
_BASE_REGEX = r'https?://(?:(?:www|m(?:obile)?)\.)?(?:twitter\.com|twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid\.onion)/'
+ _AUTH = 'AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
+ _LEGACY_AUTH = 'AAAAAAAAAAAAAAAAAAAAAIK1zgAAAAAA2tUWuhGZ2JceoId5GwYWU5GspY4%3DUq7gzFoCZs1QfwGoVdvSac3IniczZEYXIcDyumCauIXpcAPorE'
+ _flow_token = None
+
+ _LOGIN_INIT_DATA = json.dumps({
+ 'input_flow_data': {
+ 'flow_context': {
+ 'debug_overrides': {},
+ 'start_location': {
+ 'location': 'unknown'
+ }
+ }
+ },
+ 'subtask_versions': {
+ 'action_list': 2,
+ 'alert_dialog': 1,
+ 'app_download_cta': 1,
+ 'check_logged_in_account': 1,
+ 'choice_selection': 3,
+ 'contacts_live_sync_permission_prompt': 0,
+ 'cta': 7,
+ 'email_verification': 2,
+ 'end_flow': 1,
+ 'enter_date': 1,
+ 'enter_email': 2,
+ 'enter_password': 5,
+ 'enter_phone': 2,
+ 'enter_recaptcha': 1,
+ 'enter_text': 5,
+ 'enter_username': 2,
+ 'generic_urt': 3,
+ 'in_app_notification': 1,
+ 'interest_picker': 3,
+ 'js_instrumentation': 1,
+ 'menu_dialog': 1,
+ 'notifications_permission_prompt': 2,
+ 'open_account': 2,
+ 'open_home_timeline': 1,
+ 'open_link': 1,
+ 'phone_verification': 4,
+ 'privacy_options': 1,
+ 'security_key': 3,
+ 'select_avatar': 4,
+ 'select_banner': 2,
+ 'settings_list': 7,
+ 'show_code': 1,
+ 'sign_up': 2,
+ 'sign_up_review': 4,
+ 'tweet_selection_urt': 1,
+ 'update_users': 1,
+ 'upload_media': 1,
+ 'user_recommendations_list': 4,
+ 'user_recommendations_urt': 1,
+ 'wait_spinner': 3,
+ 'web_modal': 1
+ }
+ }, separators=(',', ':')).encode()
def _extract_variant_formats(self, variant, video_id):
variant_url = variant.get('url')
@@ -88,73 +143,179 @@ class TwitterBaseIE(InfoExtractor):
'height': int(m.group('height')),
})
- @functools.cached_property
+ @property
def is_logged_in(self):
return bool(self._get_cookies(self._API_BASE).get('auth_token'))
- def _call_api(self, path, video_id, query={}, graphql=False):
- cookies = self._get_cookies(self._API_BASE)
- headers = {}
+ def _fetch_guest_token(self, display_id):
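+ # anonymous sessions must first activate a guest token, which is later sent as the x-guest-token header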
+ guest_token = traverse_obj(self._download_json(
+ f'{self._API_BASE}guest/activate.json', display_id, 'Downloading guest token', data=b'',
+ headers=self._set_base_headers(legacy=display_id and self._configuration_arg('legacy_api'))),
+ ('guest_token', {str}))
+ if not guest_token:
+ raise ExtractorError('Could not retrieve guest token')
+ return guest_token
+
+ def _set_base_headers(self, legacy=False):
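+ # filter_dict() drops the x-csrf-token entry when no ct0 cookie exists yet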
+ bearer_token = self._LEGACY_AUTH if legacy and not self.is_logged_in else self._AUTH
+ return filter_dict({
+ 'Authorization': f'Bearer {bearer_token}',
+ 'x-csrf-token': try_call(lambda: self._get_cookies(self._API_BASE)['ct0'].value),
+ })
+
+ def _call_login_api(self, note, headers, query={}, data=None):
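+ # onboarding/task.json reports flow errors with HTTP 400 and a JSON body, hence expected_status=400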
+ response = self._download_json(
+ f'{self._API_BASE}onboarding/task.json', None, note,
+ headers=headers, query=query, data=data, expected_status=400)
+ error = traverse_obj(response, ('errors', 0, 'message', {str}))
+ if error:
+ raise ExtractorError(f'Login failed, Twitter API says: {error}', expected=True)
+ elif traverse_obj(response, 'status') != 'success':
+ raise ExtractorError('Login was unsuccessful')
+
+ subtask = traverse_obj(
+ response, ('subtasks', ..., 'subtask_id', {str}), get_all=False)
+ if not subtask:
+ raise ExtractorError('Twitter API did not return next login subtask')
- csrf_cookie = cookies.get('ct0')
- if csrf_cookie:
- headers['x-csrf-token'] = csrf_cookie.value
+ self._flow_token = response['flow_token']
+ return subtask
+
+ def _perform_login(self, username, password):
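+ # login is a server-driven state machine: each response yields a flow token and the ID of the next subtask to answer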
if self.is_logged_in:
- headers.update({
- 'x-twitter-auth-type': 'OAuth2Session',
- 'x-twitter-client-language': 'en',
- 'x-twitter-active-user': 'yes',
- })
+ return
+
+ webpage = self._download_webpage('https://twitter.com/', None, 'Downloading login page')
+ guest_token = self._search_regex(
+ r'\.cookie\s*=\s*["\']gt=(\d+);', webpage, 'gt', default=None) or self._fetch_guest_token(None)
+ headers = {
+ **self._set_base_headers(),
+ 'content-type': 'application/json',
+ 'x-guest-token': guest_token,
+ 'x-twitter-client-language': 'en',
+ 'x-twitter-active-user': 'yes',
+ 'Referer': 'https://twitter.com/',
+ 'Origin': 'https://twitter.com',
+ }
- last_error = None
- for bearer_token in self._TOKENS:
- for first_attempt in (True, False):
- headers['Authorization'] = f'Bearer {bearer_token}'
-
- if not self.is_logged_in:
- if not self._TOKENS[bearer_token]:
- headers.pop('x-guest-token', None)
- guest_token_response = self._download_json(
- self._API_BASE + 'guest/activate.json', video_id,
- 'Downloading guest token', data=b'', headers=headers)
-
- self._TOKENS[bearer_token] = guest_token_response.get('guest_token')
- if not self._TOKENS[bearer_token]:
- raise ExtractorError('Could not retrieve guest token')
-
- headers['x-guest-token'] = self._TOKENS[bearer_token]
-
- try:
- allowed_status = {400, 403, 404} if graphql else {403}
- result = self._download_json(
- (self._GRAPHQL_API_BASE if graphql else self._API_BASE) + path,
- video_id, headers=headers, query=query, expected_status=allowed_status)
-
- except ExtractorError as e:
- if last_error:
- raise last_error
-
- if not isinstance(e.cause, urllib.error.HTTPError) or e.cause.code != 404:
- raise
-
- last_error = e
- self.report_warning(
- 'Twitter API gave 404 response, retrying with deprecated auth token. '
- 'Only one media item can be extracted')
- break # continue outer loop with next bearer_token
-
- if result.get('errors'):
- errors = traverse_obj(result, ('errors', ..., 'message'), expected_type=str)
- if first_attempt and any('bad guest token' in error.lower() for error in errors):
- self.to_screen('Guest token has expired. Refreshing guest token')
- self._TOKENS[bearer_token] = None
- continue
+ def build_login_json(*subtask_inputs):
+ return json.dumps({
+ 'flow_token': self._flow_token,
+ 'subtask_inputs': subtask_inputs
+ }, separators=(',', ':')).encode()
+
+ def input_dict(subtask_id, text):
+ return {
+ 'subtask_id': subtask_id,
+ 'enter_text': {
+ 'text': text,
+ 'link': 'next_link'
+ }
+ }
+
+ next_subtask = self._call_login_api(
+ 'Downloading flow token', headers, query={'flow_name': 'login'}, data=self._LOGIN_INIT_DATA)
+
+ while not self.is_logged_in:
+ if next_subtask == 'LoginJsInstrumentationSubtask':
+ next_subtask = self._call_login_api(
+ 'Submitting JS instrumentation response', headers, data=build_login_json({
+ 'subtask_id': next_subtask,
+ 'js_instrumentation': {
+ 'response': '{}',
+ 'link': 'next_link'
+ }
+ }))
+
+ elif next_subtask == 'LoginEnterUserIdentifierSSO':
+ next_subtask = self._call_login_api(
+ 'Submitting username', headers, data=build_login_json({
+ 'subtask_id': next_subtask,
+ 'settings_list': {
+ 'setting_responses': [{
+ 'key': 'user_identifier',
+ 'response_data': {
+ 'text_data': {
+ 'result': username
+ }
+ }
+ }],
+ 'link': 'next_link'
+ }
+ }))
+
+ elif next_subtask == 'LoginEnterAlternateIdentifierSubtask':
+ next_subtask = self._call_login_api(
+ 'Submitting alternate identifier', headers,
+ data=build_login_json(input_dict(next_subtask, self._get_tfa_info(
+ 'one of username, phone number or email that was not used as --username'))))
+
+ elif next_subtask == 'LoginEnterPassword':
+ next_subtask = self._call_login_api(
+ 'Submitting password', headers, data=build_login_json({
+ 'subtask_id': next_subtask,
+ 'enter_password': {
+ 'password': password,
+ 'link': 'next_link'
+ }
+ }))
+
+ elif next_subtask == 'AccountDuplicationCheck':
+ next_subtask = self._call_login_api(
+ 'Submitting account duplication check', headers, data=build_login_json({
+ 'subtask_id': next_subtask,
+ 'check_logged_in_account': {
+ 'link': 'AccountDuplicationCheck_false'
+ }
+ }))
+
+ elif next_subtask == 'LoginTwoFactorAuthChallenge':
+ next_subtask = self._call_login_api(
+ 'Submitting 2FA token', headers, data=build_login_json(input_dict(
+ next_subtask, self._get_tfa_info('two-factor authentication token'))))
+
+ elif next_subtask == 'LoginAcid':
+ next_subtask = self._call_login_api(
+ 'Submitting confirmation code', headers, data=build_login_json(input_dict(
+ next_subtask, self._get_tfa_info('confirmation code sent to your email or phone'))))
+
+ elif next_subtask == 'ArkoseLogin':
+ self.raise_login_required('Twitter is requiring captcha for this login attempt', method='cookies')
+
+ elif next_subtask == 'DenyLoginSubtask':
+ self.raise_login_required('Twitter rejected this login attempt as suspicious', method='cookies')
+
+ elif next_subtask == 'LoginSuccessSubtask':
+ raise ExtractorError('Twitter API did not grant auth token cookie')
- error_message = ', '.join(set(errors)) or 'Unknown error'
- raise ExtractorError(f'Error(s) while querying API: {error_message}', expected=True)
+ else:
+ raise ExtractorError(f'Unrecognized subtask ID "{next_subtask}"')
+
+ self.report_login()
+
+ def _call_api(self, path, video_id, query={}, graphql=False):
+ headers = self._set_base_headers(legacy=not graphql and self._configuration_arg('legacy_api'))
+ headers.update({
+ 'x-twitter-auth-type': 'OAuth2Session',
+ 'x-twitter-client-language': 'en',
+ 'x-twitter-active-user': 'yes',
+ } if self.is_logged_in else {
+ 'x-guest-token': self._fetch_guest_token(video_id)
+ })
+ allowed_status = {400, 401, 403, 404} if graphql else {403}
+ result = self._download_json(
+ (self._GRAPHQL_API_BASE if graphql else self._API_BASE) + path,
+ video_id, headers=headers, query=query, expected_status=allowed_status,
+ note=f'Downloading {"GraphQL" if graphql else "legacy API"} JSON')
- return result
+ if result.get('errors'):
+ errors = ', '.join(set(traverse_obj(result, ('errors', ..., 'message', {str}))))
+ if errors and 'not authorized' in errors:
+ self.raise_login_required(remove_end(errors, '.'))
+ raise ExtractorError(f'Error(s) while querying API: {errors or "Unknown error"}')
+
+ return result
def _build_graphql_query(self, media_id):
raise NotImplementedError('Method must be implemented to support GraphQL')
@@ -293,7 +454,7 @@ class TwitterCardIE(InfoExtractor):
class TwitterIE(TwitterBaseIE):
IE_NAME = 'twitter'
- _VALID_URL = TwitterBaseIE._BASE_REGEX + r'(?:(?:i/web|[^/]+)/status|statuses)/(?P<id>\d+)'
+ _VALID_URL = TwitterBaseIE._BASE_REGEX + r'(?:(?:i/web|[^/]+)/status|statuses)/(?P<id>\d+)(?:/(?:video|photo)/(?P<index>\d+))?'
_TESTS = [{
'url': 'https://twitter.com/freethenipple/status/643211948184596480',
@@ -313,6 +474,7 @@ class TwitterIE(TwitterBaseIE):
'comment_count': int,
'repost_count': int,
'like_count': int,
+ 'view_count': int,
'tags': [],
'age_limit': 18,
},
@@ -336,7 +498,7 @@ class TwitterIE(TwitterBaseIE):
'id': '665052190608723968',
'display_id': '665052190608723968',
'ext': 'mp4',
- 'title': 'md5:55fef1d5b811944f1550e91b44abb82e',
+ 'title': r're:Star Wars.*A new beginning is coming December 18.*',
'description': 'A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens. https://t.co/OkSqT2fjWJ',
'uploader_id': 'starwars',
'uploader': r're:Star Wars.*',
@@ -391,6 +553,7 @@ class TwitterIE(TwitterBaseIE):
'comment_count': int,
'repost_count': int,
'like_count': int,
+ 'view_count': int,
'tags': ['Damndaniel'],
'age_limit': 0,
},
@@ -431,6 +594,7 @@ class TwitterIE(TwitterBaseIE):
'comment_count': int,
'repost_count': int,
'like_count': int,
+ 'view_count': int,
'tags': [],
'age_limit': 0,
},
@@ -451,7 +615,7 @@ class TwitterIE(TwitterBaseIE):
# has mp4 formats via mobile API
'url': 'https://twitter.com/news_al3alm/status/852138619213144067',
'info_dict': {
- 'id': '852138619213144067',
+ 'id': '852077943283097602',
'ext': 'mp4',
'title': 'عالم الأخبار - كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة',
'description': 'كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة https://t.co/xg6OhpyKfN',
@@ -460,8 +624,16 @@ class TwitterIE(TwitterBaseIE):
'duration': 277.4,
'timestamp': 1492000653,
'upload_date': '20170412',
+ 'display_id': '852138619213144067',
+ 'age_limit': 0,
+ 'uploader_url': 'https://twitter.com/news_al3alm',
+ 'thumbnail': r're:^https?://.*\.jpg',
+ 'tags': [],
+ 'repost_count': int,
+ 'view_count': int,
+ 'like_count': int,
+ 'comment_count': int,
},
- 'skip': 'Account suspended',
}, {
'url': 'https://twitter.com/i/web/status/910031516746514432',
'info_dict': {
@@ -480,6 +652,7 @@ class TwitterIE(TwitterBaseIE):
'comment_count': int,
'repost_count': int,
'like_count': int,
+ 'view_count': int,
'tags': ['Maria'],
'age_limit': 0,
},
@@ -505,6 +678,7 @@ class TwitterIE(TwitterBaseIE):
'comment_count': int,
'repost_count': int,
'like_count': int,
+ 'view_count': int,
'tags': [],
'age_limit': 0,
},
@@ -517,18 +691,19 @@ class TwitterIE(TwitterBaseIE):
'id': '1087791272830607360',
'display_id': '1087791357756956680',
'ext': 'mp4',
- 'title': 'Twitter - A new is coming. Some of you got an opt-in to try it now. Check out the emoji button, quick keyboard shortcuts, upgraded trends, advanced search, and more. Let us know your thoughts!',
+ 'title': 'X - A new is coming. Some of you got an opt-in to try it now. Check out the emoji button, quick keyboard shortcuts, upgraded trends, advanced search, and more. Let us know your thoughts!',
'thumbnail': r're:^https?://.*\.jpg',
'description': 'md5:6dfd341a3310fb97d80d2bf7145df976',
- 'uploader': 'Twitter',
- 'uploader_id': 'Twitter',
+ 'uploader': 'X',
+ 'uploader_id': 'X',
'duration': 61.567,
'timestamp': 1548184644,
'upload_date': '20190122',
- 'uploader_url': 'https://twitter.com/Twitter',
+ 'uploader_url': 'https://twitter.com/X',
'comment_count': int,
'repost_count': int,
'like_count': int,
+ 'view_count': int,
'tags': [],
'age_limit': 0,
},
@@ -589,6 +764,7 @@ class TwitterIE(TwitterBaseIE):
'comment_count': int,
'repost_count': int,
'like_count': int,
+ 'view_count': int,
'tags': [],
'age_limit': 0,
},
@@ -597,9 +773,9 @@ class TwitterIE(TwitterBaseIE):
'url': 'https://twitter.com/UltimaShadowX/status/1577719286659006464',
'info_dict': {
'id': '1577719286659006464',
- 'title': 'Ultima | #\u0432\u029f\u043c - Test',
+ 'title': 'Ultima📛 | #вʟм - Test',
'description': 'Test https://t.co/Y3KEZD7Dad',
- 'uploader': 'Ultima | #\u0432\u029f\u043c',
+ 'uploader': 'Ultima📛 | #вʟм',
'uploader_id': 'UltimaShadowX',
'uploader_url': 'https://twitter.com/UltimaShadowX',
'upload_date': '20221005',
@@ -630,12 +806,12 @@ class TwitterIE(TwitterBaseIE):
'comment_count': int,
'repost_count': int,
'like_count': int,
+ 'view_count': int,
'tags': ['HurricaneIan'],
'age_limit': 0,
},
}, {
- # Adult content, uses old token
- # Fails if not logged in (GraphQL)
+ # Adult content, fails if not logged in (GraphQL)
'url': 'https://twitter.com/Rizdraws/status/1575199173472927762',
'info_dict': {
'id': '1575199163847000068',
@@ -648,16 +824,16 @@ class TwitterIE(TwitterBaseIE):
'uploader_url': 'https://twitter.com/Rizdraws',
'upload_date': '20220928',
'timestamp': 1664391723,
- 'thumbnail': 're:^https?://.*\\.jpg',
+ 'thumbnail': r're:^https?://.+\.jpg',
'like_count': int,
'repost_count': int,
'comment_count': int,
'age_limit': 18,
'tags': []
},
- 'expected_warnings': ['404'],
+ 'skip': 'Requires authentication',
}, {
- # Description is missing one https://t.co url (GraphQL)
+ # Playlist result only with auth
'url': 'https://twitter.com/Srirachachau/status/1395079556562706435',
'playlist_mincount': 2,
'info_dict': {
@@ -669,14 +845,13 @@ class TwitterIE(TwitterBaseIE):
'upload_date': '20210519',
'age_limit': 0,
'repost_count': int,
- 'description': 'Here it is! Finished my gothic western cartoon. Pretty proud of it. It\'s got some goofs and lots of splashy over the top violence, something for everyone, hope you like it https://t.co/fOsG5glUnw https://t.co/kbXZrozlY7',
+ 'description': 'Here it is! Finished my gothic western cartoon. Pretty proud of it. It\'s got some goofs and lots of splashy over the top violence, something for everyone, hope you like it https://t.co/fOsG5glUnw',
'uploader_id': 'Srirachachau',
'comment_count': int,
'uploader_url': 'https://twitter.com/Srirachachau',
'timestamp': 1621447860,
},
}, {
- # Description is missing one https://t.co url (GraphQL)
'url': 'https://twitter.com/DavidToons_/status/1578353380363501568',
'playlist_mincount': 2,
'info_dict': {
@@ -688,7 +863,7 @@ class TwitterIE(TwitterBaseIE):
'uploader': str,
'timestamp': 1665143744,
'uploader_url': 'https://twitter.com/DavidToons_',
- 'description': 'Chris sounds like Linda from Bob\'s Burgers, so as an animator: this had to be done. https://t.co/glfQdgfFXH https://t.co/WgJauwIW1w',
+ 'description': 'Chris sounds like Linda from Bob\'s Burgers, so as an animator: this had to be done. https://t.co/WgJauwIW1w',
'tags': [],
'comment_count': int,
'upload_date': '20221007',
@@ -722,11 +897,174 @@ class TwitterIE(TwitterBaseIE):
'uploader': r're:Monique Camarra.+?',
'uploader_id': 'MoniqueCamarra',
'live_status': 'was_live',
- 'description': 'md5:acce559345fd49f129c20dbcda3f1201',
- 'timestamp': 1658407771464,
+ 'release_timestamp': 1658417414,
+ 'description': 'md5:4dc8e972f1d8b3c6580376fabb02a3ad',
+ 'timestamp': 1658407771,
+ 'release_date': '20220721',
+ 'upload_date': '20220721',
},
'add_ie': ['TwitterSpaces'],
'params': {'skip_download': 'm3u8'},
+ 'skip': 'Requires authentication',
+ }, {
+ # URL specifies video number but --yes-playlist
+ 'url': 'https://twitter.com/CTVJLaidlaw/status/1600649710662213632/video/1',
+ 'playlist_mincount': 2,
+ 'info_dict': {
+ 'id': '1600649710662213632',
+ 'title': 'md5:be05989b0722e114103ed3851a0ffae2',
+ 'timestamp': 1670459604.0,
+ 'description': 'md5:591c19ce66fadc2359725d5cd0d1052c',
+ 'comment_count': int,
+ 'uploader_id': 'CTVJLaidlaw',
+ 'repost_count': int,
+ 'tags': ['colorectalcancer', 'cancerjourney', 'imnotaquitter'],
+ 'upload_date': '20221208',
+ 'age_limit': 0,
+ 'uploader': 'Jocelyn Laidlaw',
+ 'uploader_url': 'https://twitter.com/CTVJLaidlaw',
+ 'like_count': int,
+ },
+ }, {
+ # URL specifies video number and --no-playlist
+ 'url': 'https://twitter.com/CTVJLaidlaw/status/1600649710662213632/video/2',
+ 'info_dict': {
+ 'id': '1600649511827013632',
+ 'ext': 'mp4',
+ 'title': 'md5:7662a0a27ce6faa3e5b160340f3cfab1',
+ 'thumbnail': r're:^https?://.+\.jpg',
+ 'timestamp': 1670459604.0,
+ 'uploader_id': 'CTVJLaidlaw',
+ 'uploader': 'Jocelyn Laidlaw',
+ 'repost_count': int,
+ 'comment_count': int,
+ 'tags': ['colorectalcancer', 'cancerjourney', 'imnotaquitter'],
+ 'duration': 102.226,
+ 'uploader_url': 'https://twitter.com/CTVJLaidlaw',
+ 'display_id': '1600649710662213632',
+ 'like_count': int,
+ 'view_count': int,
+ 'description': 'md5:591c19ce66fadc2359725d5cd0d1052c',
+ 'upload_date': '20221208',
+ 'age_limit': 0,
+ },
+ 'params': {'noplaylist': True},
+ }, {
+ # id points to a TweetWithVisibilityResults entity which wraps the actual Tweet
+ # note that the extracted id differs from the one in the URL
+ 'url': 'https://twitter.com/s2FAKER/status/1621117700482416640',
+ 'info_dict': {
+ 'id': '1621117577354424321',
+ 'display_id': '1621117700482416640',
+ 'ext': 'mp4',
+ 'title': '뽀 - 아 최우제 이동속도 봐',
+ 'description': '아 최우제 이동속도 봐 https://t.co/dxu2U5vXXB',
+ 'duration': 24.598,
+ 'uploader': '뽀',
+ 'uploader_id': 's2FAKER',
+ 'uploader_url': 'https://twitter.com/s2FAKER',
+ 'upload_date': '20230202',
+ 'timestamp': 1675339553.0,
+ 'thumbnail': r're:https?://pbs\.twimg\.com/.+',
+ 'age_limit': 18,
+ 'tags': [],
+ 'like_count': int,
+ 'repost_count': int,
+ 'comment_count': int,
+ 'view_count': int,
+ },
+ }, {
+ 'url': 'https://twitter.com/hlo_again/status/1599108751385972737/video/2',
+ 'info_dict': {
+ 'id': '1599108643743473680',
+ 'display_id': '1599108751385972737',
+ 'ext': 'mp4',
+ 'title': '\u06ea - \U0001F48B',
+ 'uploader_url': 'https://twitter.com/hlo_again',
+ 'like_count': int,
+ 'uploader_id': 'hlo_again',
+ 'thumbnail': 'https://pbs.twimg.com/ext_tw_video_thumb/1599108643743473680/pu/img/UG3xjov4rgg5sbYM.jpg?name=orig',
+ 'repost_count': int,
+ 'duration': 9.531,
+ 'comment_count': int,
+ 'view_count': int,
+ 'upload_date': '20221203',
+ 'age_limit': 0,
+ 'timestamp': 1670092210.0,
+ 'tags': [],
+ 'uploader': '\u06ea',
+ 'description': '\U0001F48B https://t.co/bTj9Qz7vQP',
+ },
+ 'params': {'noplaylist': True},
+ }, {
+ 'url': 'https://twitter.com/MunTheShinobi/status/1600009574919962625',
+ 'info_dict': {
+ 'id': '1600009362759733248',
+ 'display_id': '1600009574919962625',
+ 'ext': 'mp4',
+ 'uploader_url': 'https://twitter.com/MunTheShinobi',
+ 'description': 'This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525 https://t.co/cNsA0MoOml',
+ 'view_count': int,
+ 'thumbnail': 'https://pbs.twimg.com/ext_tw_video_thumb/1600009362759733248/pu/img/XVhFQivj75H_YxxV.jpg?name=orig',
+ 'age_limit': 0,
+ 'uploader': 'Mün The Friend Of YWAP',
+ 'repost_count': int,
+ 'upload_date': '20221206',
+ 'title': 'Mün The Friend Of YWAP - This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525',
+ 'comment_count': int,
+ 'like_count': int,
+ 'tags': [],
+ 'uploader_id': 'MunTheShinobi',
+ 'duration': 139.987,
+ 'timestamp': 1670306984.0,
+ },
+ }, {
+ # url to retweet id w/ legacy api
+ 'url': 'https://twitter.com/liberdalau/status/1623739803874349067',
+ 'info_dict': {
+ 'id': '1623274794488659969',
+ 'display_id': '1623739803874349067',
+ 'ext': 'mp4',
+ 'title': 'Johnny Bullets - Me after going viral to over 30million people: Whoopsie-daisy',
+ 'description': 'md5:b06864cd3dc2554821cc327f5348485a',
+ 'uploader': 'Johnny Bullets',
+ 'uploader_id': 'Johnnybull3ts',
+ 'uploader_url': 'https://twitter.com/Johnnybull3ts',
+ 'age_limit': 0,
+ 'tags': [],
+ 'duration': 8.033,
+ 'timestamp': 1675853859.0,
+ 'upload_date': '20230208',
+ 'thumbnail': r're:https://pbs\.twimg\.com/ext_tw_video_thumb/.+',
+ 'like_count': int,
+ 'repost_count': int,
+ },
+ 'params': {'extractor_args': {'twitter': {'legacy_api': ['']}}},
+ 'skip': 'Protected tweet',
+ }, {
+ # orig tweet w/ graphql
+ 'url': 'https://twitter.com/liberdalau/status/1623739803874349067',
+ 'info_dict': {
+ 'id': '1623274794488659969',
+ 'display_id': '1623739803874349067',
+ 'ext': 'mp4',
+ 'title': '@selfisekai@hackerspace.pl 🐀 - RT @Johnnybull3ts: Me after going viral to over 30million people: Whoopsie-daisy',
+ 'description': 'md5:9258bdbb54793bdc124fe1cd47e96c6a',
+ 'uploader': '@selfisekai@hackerspace.pl 🐀',
+ 'uploader_id': 'liberdalau',
+ 'uploader_url': 'https://twitter.com/liberdalau',
+ 'age_limit': 0,
+ 'tags': [],
+ 'duration': 8.033,
+ 'timestamp': 1675964711.0,
+ 'upload_date': '20230209',
+ 'thumbnail': r're:https://pbs\.twimg\.com/ext_tw_video_thumb/.+',
+ 'like_count': int,
+ 'view_count': int,
+ 'repost_count': int,
+ 'comment_count': int,
+ },
+ 'skip': 'Protected tweet',
}, {
# onion route
'url': 'https://twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid.onion/TwitterBlue/status/1484226494708662273',
@@ -769,12 +1107,23 @@ class TwitterIE(TwitterBaseIE):
result = traverse_obj(data, (
'threaded_conversation_with_injections_v2', 'instructions', 0, 'entries',
lambda _, v: v['entryId'] == f'tweet-{twid}', 'content', 'itemContent',
- 'tweet_results', 'result'
- ), expected_type=dict, default={}, get_all=False)
+ 'tweet_results', 'result', ('tweet', None), {dict},
+ ), default={}, get_all=False) if self.is_logged_in else traverse_obj(
+ data, ('tweetResult', 'result', {dict}), default={})
+
+ if result.get('__typename') not in ('Tweet', 'TweetTombstone', 'TweetUnavailable', None):
+ self.report_warning(f'Unknown typename: {result.get("__typename")}', twid, only_once=True)
if 'tombstone' in result:
- cause = traverse_obj(result, ('tombstone', 'text', 'text'), expected_type=str)
+ cause = remove_end(traverse_obj(result, ('tombstone', 'text', 'text', {str})), '. Learn more')
raise ExtractorError(f'Twitter API says: {cause or "Unknown error"}', expected=True)
+ elif result.get('__typename') == 'TweetUnavailable':
+ reason = result.get('reason')
+ if reason == 'NsfwLoggedOut':
+ self.raise_login_required('NSFW tweet requires authentication')
+ elif reason == 'Protected':
+ self.raise_login_required('You are not authorized to view this protected tweet')
+ raise ExtractorError(reason or 'Requested tweet is unavailable', expected=True)
status = result.get('legacy', {})
status.update(traverse_obj(result, {
@@ -786,7 +1135,7 @@ class TwitterIE(TwitterBaseIE):
# extra transformation is needed since result does not match legacy format
binding_values = {
binding_value.get('key'): binding_value.get('value')
- for binding_value in traverse_obj(status, ('card', 'binding_values', ...), expected_type=dict)
+ for binding_value in traverse_obj(status, ('card', 'binding_values', ..., {dict}))
}
if binding_values:
status['card']['binding_values'] = binding_values
@@ -825,25 +1174,74 @@ class TwitterIE(TwitterBaseIE):
'verified_phone_label_enabled': False,
'vibe_api_enabled': True,
},
+ } if self.is_logged_in else {
+ 'variables': {
+ 'tweetId': media_id,
+ 'withCommunity': False,
+ 'includePromotedContent': False,
+ 'withVoice': False,
+ },
+ 'features': {
+ 'creator_subscriptions_tweet_preview_api_enabled': True,
+ 'tweetypie_unmention_optimization_enabled': True,
+ 'responsive_web_edit_tweet_api_enabled': True,
+ 'graphql_is_translatable_rweb_tweet_is_translatable_enabled': True,
+ 'view_counts_everywhere_api_enabled': True,
+ 'longform_notetweets_consumption_enabled': True,
+ 'responsive_web_twitter_article_tweet_consumption_enabled': False,
+ 'tweet_awards_web_tipping_enabled': False,
+ 'freedom_of_speech_not_reach_fetch_enabled': True,
+ 'standardized_nudges_misinfo': True,
+ 'tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled': True,
+ 'longform_notetweets_rich_text_read_enabled': True,
+ 'longform_notetweets_inline_media_enabled': True,
+ 'responsive_web_graphql_exclude_directive_enabled': True,
+ 'verified_phone_label_enabled': False,
+ 'responsive_web_media_download_video_enabled': False,
+ 'responsive_web_graphql_skip_user_profile_image_extensions_enabled': False,
+ 'responsive_web_graphql_timeline_navigation_enabled': True,
+ 'responsive_web_enhance_cards_enabled': False
+ },
+ 'fieldToggles': {
+ 'withArticleRichContentState': False
+ }
}
- def _real_extract(self, url):
- twid = self._match_id(url)
- if self.is_logged_in or self._configuration_arg('force_graphql'):
- self.write_debug(f'Using GraphQL API (Auth = {self.is_logged_in})')
- result = self._call_graphql_api('zZXycP0V6H7m-2r0mOnFcA/TweetDetail', twid)
- status = self._graphql_to_legacy(result, twid)
-
- else:
- status = self._call_api(f'statuses/show/{twid}.json', twid, {
+ def _extract_status(self, twid):
+ if self.is_logged_in:
+ return self._graphql_to_legacy(
+ self._call_graphql_api('zZXycP0V6H7m-2r0mOnFcA/TweetDetail', twid), twid)
+
+ try:
+ if not self._configuration_arg('legacy_api'):
+ return self._graphql_to_legacy(
+ self._call_graphql_api('2ICDjqPd81tulZcYrtpTuQ/TweetResultByRestId', twid), twid)
+ return traverse_obj(self._call_api(f'statuses/show/{twid}.json', twid, {
'cards_platform': 'Web-12',
'include_cards': 1,
'include_reply_count': 1,
'include_user_entities': 0,
'tweet_mode': 'extended',
- })
+ }), 'retweeted_status', None)
+
+ except ExtractorError as e:
+ if e.expected:
+ raise
+ self.report_warning(
+ f'{e.orig_msg}. Falling back to syndication endpoint; some metadata may be missing', twid)
+
+ status = self._download_json(
+ 'https://cdn.syndication.twimg.com/tweet-result', twid, 'Downloading syndication JSON',
+ headers={'User-Agent': 'Googlebot'}, query={'id': twid})
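+ # reshape the syndication payload into the legacy API format the rest of the extractor expects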
+ status['extended_entities'] = {'media': status.get('mediaDetails')}
+ return status
+
+ def _real_extract(self, url):
+ twid, selected_index = self._match_valid_url(url).group('id', 'index')
+ status = self._extract_status(twid)
- title = description = status['full_text'].replace('\n', ' ')
+ title = description = traverse_obj(
+ status, (('full_text', 'text'), {lambda x: x.replace('\n', ' ')}), get_all=False) or ''
# strip 'https -_t.co_BJYgOjSeGA' junk from filenames
title = re.sub(r'\s+(https?://[^ ]+)', '', title)
user = status.get('user') or {}
@@ -852,13 +1250,6 @@ class TwitterIE(TwitterBaseIE):
title = f'{uploader} - {title}'
uploader_id = user.get('screen_name')
- tags = []
- for hashtag in (try_get(status, lambda x: x['entities']['hashtags'], list) or []):
- hashtag_text = hashtag.get('text')
- if not hashtag_text:
- continue
- tags.append(hashtag_text)
-
info = {
'id': twid,
'title': title,
@@ -871,17 +1262,19 @@ class TwitterIE(TwitterBaseIE):
'repost_count': int_or_none(status.get('retweet_count')),
'comment_count': int_or_none(status.get('reply_count')),
'age_limit': 18 if status.get('possibly_sensitive') else 0,
- 'tags': tags,
+ 'tags': traverse_obj(status, ('entities', 'hashtags', ..., 'text')),
}
def extract_from_video_info(media):
- media_id = traverse_obj(media, 'id_str', 'id', expected_type=str_or_none)
+ media_id = traverse_obj(media, 'id_str', 'id', (
+ 'video_info', 'variants', ..., 'url',
+ {functools.partial(re.search, r'_video/(\d+)/')}, 1
+ ), get_all=False, expected_type=str_or_none) or twid
self.write_debug(f'Extracting from video info: {media_id}')
- video_info = media.get('video_info') or {}
formats = []
subtitles = {}
- for variant in video_info.get('variants', []):
+ for variant in traverse_obj(media, ('video_info', 'variants', ...)):
fmts, subs = self._extract_variant_formats(variant, twid)
subtitles = self._merge_subtitles(subtitles, subs)
formats.extend(fmts)
@@ -905,7 +1298,8 @@ class TwitterIE(TwitterBaseIE):
'formats': formats,
'subtitles': subtitles,
'thumbnails': thumbnails,
- 'duration': float_or_none(video_info.get('duration_millis'), 1000),
+ 'view_count': traverse_obj(media, ('mediaStats', 'viewCount', {int_or_none})),
+ 'duration': float_or_none(traverse_obj(media, ('video_info', 'duration_millis')), 1000),
# The codec of http formats are unknown
'_format_sort_fields': ('res', 'br', 'size', 'proto'),
}
@@ -984,15 +1378,37 @@ class TwitterIE(TwitterBaseIE):
'content_duration_seconds')),
}
- media_path = ((None, 'quoted_status'), 'extended_entities', 'media', lambda _, m: m['type'] != 'photo')
- videos = map(extract_from_video_info, traverse_obj(status, media_path, expected_type=dict))
- cards = extract_from_card_info(status.get('card'))
- entries = [{**info, **data, 'display_id': twid} for data in (*videos, *cards)]
+ videos = traverse_obj(status, (
+ (None, 'quoted_status'), 'extended_entities', 'media', lambda _, m: m['type'] != 'photo', {dict}))
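+ # a /video/N or /photo/N suffix in the URL selects a single item unless the full playlist is requested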
+ if self._yes_playlist(twid, selected_index, video_label='URL-specified video number'):
+ selected_entries = (*map(extract_from_video_info, videos), *extract_from_card_info(status.get('card')))
+ else:
+ desired_obj = traverse_obj(status, (
+ (None, 'quoted_status'), 'extended_entities', 'media', int(selected_index) - 1, {dict}), get_all=False)
+ if not desired_obj:
+ raise ExtractorError(f'Video #{selected_index} is unavailable', expected=True)
+ elif desired_obj.get('type') != 'video':
+ raise ExtractorError(f'Media #{selected_index} is not a video', expected=True)
+
+ # Restore original archive id and video index in title
+ for index, entry in enumerate(videos, 1):
+ if entry.get('id') != desired_obj.get('id'):
+ continue
+ if index == 1:
+ info['_old_archive_ids'] = [make_archive_id(self, twid)]
+ if len(videos) != 1:
+ info['title'] += f' #{index}'
+ break
+
+ return {**info, **extract_from_video_info(desired_obj), 'display_id': twid}
+
+ entries = [{**info, **data, 'display_id': twid} for data in selected_entries]
if not entries:
expanded_url = traverse_obj(status, ('entities', 'urls', 0, 'expanded_url'), expected_type=url_or_none)
if not expanded_url or expanded_url == url:
- raise ExtractorError('No video could be found in this tweet', expected=True)
+ self.raise_no_formats('No video could be found in this tweet', expected=True)
+ return info
return self.url_result(expanded_url, display_id=twid, **info)
@@ -1116,7 +1532,42 @@ class TwitterSpacesIE(TwitterBaseIE):
'uploader': r're:Lucio Di Gaetano.*?',
'uploader_id': 'luciodigaetano',
'live_status': 'was_live',
- 'timestamp': 1659877956397,
+ 'timestamp': 1659877956,
+ 'upload_date': '20220807',
+ 'release_timestamp': 1659904215,
+ 'release_date': '20220807',
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }, {
+ # post_live/TimedOut but downloadable
+ 'url': 'https://twitter.com/i/spaces/1vAxRAVQWONJl',
+ 'info_dict': {
+ 'id': '1vAxRAVQWONJl',
+ 'ext': 'm4a',
+ 'title': 'Framing Up FinOps: Billing Tools',
+ 'description': 'Twitter Space participated by rupa, Alfonso Hernandez',
+ 'uploader': 'Google Cloud',
+ 'uploader_id': 'googlecloud',
+ 'live_status': 'post_live',
+ 'timestamp': 1681409554,
+ 'upload_date': '20230413',
+ 'release_timestamp': 1681839000,
+ 'release_date': '20230418',
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }, {
+ # Needs ffmpeg as downloader, see: https://github.com/hypervideo/hypervideo/issues/7536
+ 'url': 'https://twitter.com/i/spaces/1eaKbrQbjoRKX',
+ 'info_dict': {
+ 'id': '1eaKbrQbjoRKX',
+ 'ext': 'm4a',
+ 'title': 'あ',
+ 'description': 'Twitter Space participated by nobody yet',
+ 'uploader': '息根とめる🔪Twitchで復活',
+ 'uploader_id': 'tomeru_ikinone',
+ 'live_status': 'was_live',
+ 'timestamp': 1685617198,
+ 'upload_date': '20230601',
},
'params': {'skip_download': 'm3u8'},
}]
@@ -1156,32 +1607,39 @@ class TwitterSpacesIE(TwitterBaseIE):
def _real_extract(self, url):
space_id = self._match_id(url)
+ if not self.is_logged_in:
+ self.raise_login_required('Twitter Spaces require authentication')
space_data = self._call_graphql_api('HPEisOmj1epUNLCWTYhUWw/AudioSpaceById', space_id)['audioSpace']
if not space_data:
raise ExtractorError('Twitter Space not found', expected=True)
metadata = space_data['metadata']
live_status = try_call(lambda: self.SPACE_STATUS[metadata['state'].lower()])
+ is_live = live_status == 'is_live'
formats = []
if live_status == 'is_upcoming':
self.raise_no_formats('Twitter Space not started yet', expected=True)
- elif live_status == 'post_live':
- self.raise_no_formats('Twitter Space ended but not downloadable yet', expected=True)
- else:
- source = self._call_api(
- f'live_video_stream/status/{metadata["media_key"]}', metadata['media_key'])['source']
-
- # XXX: Native downloader does not work
- formats = self._extract_m3u8_formats(
- traverse_obj(source, 'noRedirectPlaybackUrl', 'location'),
- metadata['media_key'], 'm4a', 'm3u8', live=live_status == 'is_live',
- headers={'Referer': 'https://twitter.com/'})
+ elif not is_live and not metadata.get('is_space_available_for_replay'):
+ self.raise_no_formats('Twitter Space ended and replay is disabled', expected=True)
+ elif metadata.get('media_key'):
+ source = traverse_obj(
+ self._call_api(f'live_video_stream/status/{metadata["media_key"]}', metadata['media_key']),
+ ('source', ('noRedirectPlaybackUrl', 'location'), {url_or_none}), get_all=False)
+ formats = self._extract_m3u8_formats( # XXX: Some Spaces need ffmpeg as downloader
+ source, metadata['media_key'], 'm4a', entry_protocol='m3u8', live=is_live,
+ headers={'Referer': 'https://twitter.com/'}, fatal=False) if source else []
for fmt in formats:
fmt.update({'vcodec': 'none', 'acodec': 'aac'})
+ if not is_live:
+ fmt['container'] = 'm4a_dash'
participants = ', '.join(traverse_obj(
space_data, ('participants', 'speakers', ..., 'display_name'))) or 'nobody yet'
+
+ if not formats and live_status == 'post_live':
+ self.raise_no_formats('Twitter Space ended but not downloadable yet', expected=True)
+
return {
'id': space_id,
'title': metadata.get('title'),
@@ -1191,7 +1649,9 @@ class TwitterSpacesIE(TwitterBaseIE):
'uploader_id': traverse_obj(
metadata, ('creator_results', 'result', 'legacy', 'screen_name')),
'live_status': live_status,
- 'timestamp': metadata.get('created_at'),
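+ # 'scheduled_start' and 'created_at' are epoch milliseconds, hence scale=1000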
+ 'release_timestamp': try_call(
+ lambda: int_or_none(metadata['scheduled_start'], scale=1000)),
+ 'timestamp': int_or_none(metadata.get('created_at'), scale=1000),
'formats': formats,
}
@@ -1207,7 +1667,7 @@ class TwitterShortenerIE(TwitterBaseIE):
if eid:
id = eid
url = self._BASE_URL + id
- new_url = self._request_webpage(url, id, headers={'User-Agent': 'curl'}).geturl()
+ new_url = self._request_webpage(url, id, headers={'User-Agent': 'curl'}).url
__UNSAFE_LINK = "https://twitter.com/safety/unsafe_link_warning?unsafe_link="
if new_url.startswith(__UNSAFE_LINK):
new_url = new_url.replace(__UNSAFE_LINK, "")
diff --git a/hypervideo_dl/extractor/txxx.py b/hypervideo_dl/extractor/txxx.py
new file mode 100644
index 0000000..fff7a5d
--- /dev/null
+++ b/hypervideo_dl/extractor/txxx.py
@@ -0,0 +1,418 @@
+import base64
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ js_to_json,
+ merge_dicts,
+ parse_duration,
+ traverse_obj,
+ try_call,
+ urljoin,
+ variadic,
+)
+
+
+def decode_base64(text):
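+ # the payload is base64 obfuscated with Cyrillic look-alikes substituted for some Latin letters and ',', '.', '~' for '/', '+', '='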
+ return base64.b64decode(text.translate(text.maketrans({
+ '\u0405': 'S',
+ '\u0406': 'I',
+ '\u0408': 'J',
+ '\u0410': 'A',
+ '\u0412': 'B',
+ '\u0415': 'E',
+ '\u041a': 'K',
+ '\u041c': 'M',
+ '\u041d': 'H',
+ '\u041e': 'O',
+ '\u0420': 'P',
+ '\u0421': 'C',
+ '\u0425': 'X',
+ ',': '/',
+ '.': '+',
+ '~': '=',
+ }))).decode()
+
+
+def get_formats(host, video_file):
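+ # entries appear to be ordered from lowest to highest quality, so the list index doubles as a quality rank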
+ return [{
+ 'url': urljoin(f'https://{host}', decode_base64(video['video_url'])),
+ 'format_id': try_call(lambda: variadic(video['format'])[0].lstrip('_')),
+ 'quality': index,
+ } for index, video in enumerate(video_file) if video.get('video_url')]
+
+
+class TxxxIE(InfoExtractor):
+ _DOMAINS = (
+ 'hclips.com',
+ 'hdzog.com',
+ 'hdzog.tube',
+ 'hotmovs.com',
+ 'hotmovs.tube',
+ 'inporn.com',
+ 'privatehomeclips.com',
+ 'tubepornclassic.com',
+ 'txxx.com',
+ 'txxx.tube',
+ 'upornia.com',
+ 'upornia.tube',
+ 'vjav.com',
+ 'vjav.tube',
+ 'vxxx.com',
+ 'voyeurhit.com',
+ 'voyeurhit.tube',
+ )
+ _VALID_URL = rf'''(?x)
+ https?://(?:www\.)?(?P<host>{"|".join(map(re.escape, _DOMAINS))})/
+ (?:videos?[/-]|embed/)(?P<id>\d+)(?:/(?P<display_id>[^/?#]+))?
+ '''
+ _EMBED_REGEX = [rf'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?(?:{"|".join(map(re.escape, _DOMAINS))})/embed/[^"\']*)\1']
+ _TESTS = [{
+ 'url': 'https://txxx.com/videos/16574965/digital-desire-malena-morgan/',
+ 'md5': 'c54e4ace54320aaf8e2a72df87859391',
+ 'info_dict': {
+ 'id': '16574965',
+ 'display_id': 'digital-desire-malena-morgan',
+ 'ext': 'mp4',
+ 'title': 'Digital Desire - Malena Morgan',
+ 'uploader': 'Lois Argentum',
+ 'duration': 694,
+ 'view_count': int,
+ 'like_count': int,
+ 'dislike_count': int,
+ 'age_limit': 18,
+ }
+ }, {
+ 'url': 'https://txxx.tube/videos/16574965/digital-desire-malena-morgan/',
+ 'md5': 'c54e4ace54320aaf8e2a72df87859391',
+ 'info_dict': {
+ 'id': '16574965',
+ 'display_id': 'digital-desire-malena-morgan',
+ 'ext': 'mp4',
+ 'title': 'Digital Desire - Malena Morgan',
+ 'uploader': 'Lois Argentum',
+ 'duration': 694,
+ 'view_count': int,
+ 'like_count': int,
+ 'dislike_count': int,
+ 'age_limit': 18,
+ }
+ }, {
+ 'url': 'https://vxxx.com/video-68925/',
+ 'md5': '1fcff3748b0c5b41fe41d0afa22409e1',
+ 'info_dict': {
+ 'id': '68925',
+ 'display_id': '68925',
+ 'ext': 'mp4',
+ 'title': 'Malena Morgan',
+ 'uploader': 'Huge Hughes',
+ 'duration': 694,
+ 'view_count': int,
+ 'like_count': int,
+ 'dislike_count': int,
+ 'age_limit': 18,
+ }
+ }, {
+ 'url': 'https://hclips.com/videos/6291073/malena-morgan-masturbates-her-sweet/',
+ 'md5': 'a5dd4f83363972ee043313cff85e7e26',
+ 'info_dict': {
+ 'id': '6291073',
+ 'display_id': 'malena-morgan-masturbates-her-sweet',
+ 'ext': 'mp4',
+ 'title': 'Malena Morgan masturbates her sweet',
+ 'uploader': 'John Salt',
+ 'duration': 426,
+ 'view_count': int,
+ 'like_count': int,
+ 'dislike_count': int,
+ 'age_limit': 18,
+ }
+ }, {
+ 'url': 'https://hdzog.com/videos/67063/gorgeous-malena-morgan-will-seduce-you-at-the-first-glance/',
+ 'md5': 'f8bdedafd45d1ec2875c43fe33a846d3',
+ 'info_dict': {
+ 'id': '67063',
+ 'display_id': 'gorgeous-malena-morgan-will-seduce-you-at-the-first-glance',
+ 'ext': 'mp4',
+ 'title': 'Gorgeous Malena Morgan will seduce you at the first glance',
+ 'uploader': 'momlesson',
+ 'duration': 601,
+ 'view_count': int,
+ 'like_count': int,
+ 'dislike_count': int,
+ 'age_limit': 18,
+ }
+ }, {
+ 'url': 'https://hdzog.tube/videos/67063/gorgeous-malena-morgan-will-seduce-you-at-the-first-glance/',
+ 'md5': 'f8bdedafd45d1ec2875c43fe33a846d3',
+ 'info_dict': {
+ 'id': '67063',
+ 'display_id': 'gorgeous-malena-morgan-will-seduce-you-at-the-first-glance',
+ 'ext': 'mp4',
+ 'title': 'Gorgeous Malena Morgan will seduce you at the first glance',
+ 'uploader': 'momlesson',
+ 'duration': 601,
+ 'view_count': int,
+ 'like_count': int,
+ 'dislike_count': int,
+ 'age_limit': 18,
+ }
+ }, {
+ 'url': 'https://hotmovs.com/videos/8789287/unbelievable-malena-morgan-performing-in-incredible-masturantion/',
+ 'md5': '71d32c51584876472db87e561171a386',
+ 'info_dict': {
+ 'id': '8789287',
+ 'display_id': 'unbelievable-malena-morgan-performing-in-incredible-masturantion',
+ 'ext': 'mp4',
+ 'title': 'Unbelievable Malena Morgan performing in incredible masturantion',
+ 'uploader': 'Davit Sanchez',
+ 'duration': 940,
+ 'view_count': int,
+ 'like_count': int,
+ 'dislike_count': int,
+ 'age_limit': 18,
+ }
+ }, {
+ 'url': 'https://hotmovs.tube/videos/8789287/unbelievable-malena-morgan-performing-in-incredible-masturantion/',
+ 'md5': '71d32c51584876472db87e561171a386',
+ 'info_dict': {
+ 'id': '8789287',
+ 'display_id': 'unbelievable-malena-morgan-performing-in-incredible-masturantion',
+ 'ext': 'mp4',
+ 'title': 'Unbelievable Malena Morgan performing in incredible masturantion',
+ 'uploader': 'Davit Sanchez',
+ 'duration': 940,
+ 'view_count': int,
+ 'like_count': int,
+ 'dislike_count': int,
+ 'age_limit': 18,
+ }
+ }, {
+ 'url': 'https://inporn.com/video/517897/malena-morgan-solo/',
+ 'md5': '344db467481edf78f193cdf5820a7cfb',
+ 'info_dict': {
+ 'id': '517897',
+ 'display_id': 'malena-morgan-solo',
+ 'ext': 'mp4',
+ 'title': 'Malena Morgan - Solo',
+ 'uploader': 'Ashley Oxy',
+ 'duration': 480,
+ 'view_count': int,
+ 'like_count': int,
+ 'dislike_count': int,
+ 'age_limit': 18,
+ }
+ }, {
+ 'url': 'https://privatehomeclips.com/videos/3630599/malena-morgan-cam-show/',
+ 'md5': 'ea657273e352493c5fb6357fbfa4f126',
+ 'info_dict': {
+ 'id': '3630599',
+ 'display_id': 'malena-morgan-cam-show',
+ 'ext': 'mp4',
+ 'title': 'malena morgan cam show',
+ 'uploader': 'Member9915',
+ 'duration': 290,
+ 'view_count': int,
+ 'like_count': int,
+ 'dislike_count': int,
+ 'age_limit': 18,
+ }
+ }, {
+ 'url': 'https://tubepornclassic.com/videos/1015455/mimi-rogers-full-body-massage-nude-compilation/',
+ 'md5': '2e9a6cf610c9862e86e0ce24f08f4427',
+ 'info_dict': {
+ 'id': '1015455',
+ 'display_id': 'mimi-rogers-full-body-massage-nude-compilation',
+ 'ext': 'mp4',
+ 'title': 'Mimi Rogers - Full Body Massage (Nude) compilation',
+ 'uploader': '88bhuto',
+ 'duration': 286,
+ 'view_count': int,
+ 'like_count': int,
+ 'dislike_count': int,
+ 'age_limit': 18,
+ }
+ }, {
+ 'url': 'https://upornia.com/videos/1498858/twistys-malena-morgan-starring-at-dr-morgan-baller/',
+ 'md5': '7ff7033340bc88a173198b7c22600e4f',
+ 'info_dict': {
+ 'id': '1498858',
+ 'display_id': 'twistys-malena-morgan-starring-at-dr-morgan-baller',
+ 'ext': 'mp4',
+ 'title': 'Twistys - Malena Morgan starring at Dr. Morgan-Baller',
+ 'uploader': 'mindgeek',
+ 'duration': 480,
+ 'view_count': int,
+ 'like_count': int,
+ 'dislike_count': int,
+ 'age_limit': 18,
+ }
+ }, {
+ 'url': 'https://upornia.tube/videos/1498858/twistys-malena-morgan-starring-at-dr-morgan-baller/',
+ 'md5': '7ff7033340bc88a173198b7c22600e4f',
+ 'info_dict': {
+ 'id': '1498858',
+ 'display_id': 'twistys-malena-morgan-starring-at-dr-morgan-baller',
+ 'ext': 'mp4',
+ 'title': 'Twistys - Malena Morgan starring at Dr. Morgan-Baller',
+ 'uploader': 'mindgeek',
+ 'duration': 480,
+ 'view_count': int,
+ 'like_count': int,
+ 'dislike_count': int,
+ 'age_limit': 18,
+ }
+ }, {
+ 'url': 'https://vjav.com/videos/11761/yui-hatano-in-if-yui-was-my-girlfriend2/',
+ 'md5': '6de5bc1f13bdfc3491a77f23edb1676f',
+ 'info_dict': {
+ 'id': '11761',
+ 'display_id': 'yui-hatano-in-if-yui-was-my-girlfriend2',
+ 'ext': 'mp4',
+ 'title': 'Yui Hatano in If Yui Was My Girlfriend',
+ 'uploader': 'Matheus69',
+ 'duration': 3310,
+ 'view_count': int,
+ 'like_count': int,
+ 'dislike_count': int,
+ 'age_limit': 18,
+ }
+ }, {
+ 'url': 'https://vjav.tube/videos/11761/yui-hatano-in-if-yui-was-my-girlfriend2/',
+ 'md5': '6de5bc1f13bdfc3491a77f23edb1676f',
+ 'info_dict': {
+ 'id': '11761',
+ 'display_id': 'yui-hatano-in-if-yui-was-my-girlfriend2',
+ 'ext': 'mp4',
+ 'title': 'Yui Hatano in If Yui Was My Girlfriend',
+ 'uploader': 'Matheus69',
+ 'duration': 3310,
+ 'view_count': int,
+ 'like_count': int,
+ 'dislike_count': int,
+ 'age_limit': 18,
+ }
+ }, {
+ 'url': 'https://voyeurhit.com/videos/332875/charlotte-stokely-elle-alexandra-malena-morgan-lingerie/',
+ 'md5': '12b4666e9c3e60dafe9182e5d12aae33',
+ 'info_dict': {
+ 'id': '332875',
+ 'display_id': 'charlotte-stokely-elle-alexandra-malena-morgan-lingerie',
+ 'ext': 'mp4',
+ 'title': 'Charlotte Stokely, Elle Alexandra, Malena Morgan-Lingerie',
+ 'uploader': 'Kyle Roberts',
+ 'duration': 655,
+ 'view_count': int,
+ 'like_count': int,
+ 'dislike_count': int,
+ 'age_limit': 18,
+ }
+ }, {
+ 'url': 'https://voyeurhit.tube/videos/332875/charlotte-stokely-elle-alexandra-malena-morgan-lingerie/',
+ 'md5': '12b4666e9c3e60dafe9182e5d12aae33',
+ 'info_dict': {
+ 'id': '332875',
+ 'display_id': 'charlotte-stokely-elle-alexandra-malena-morgan-lingerie',
+ 'ext': 'mp4',
+ 'title': 'Charlotte Stokely, Elle Alexandra, Malena Morgan-Lingerie',
+ 'uploader': 'Kyle Roberts',
+ 'duration': 655,
+ 'view_count': int,
+ 'like_count': int,
+ 'dislike_count': int,
+ 'age_limit': 18,
+ }
+ }]
+ _WEBPAGE_TESTS = [{
+ 'url': 'https://pornzog.com/video/9125519/michelle-malone-dreamgirls-wild-wet-3/',
+ 'info_dict': {
+ 'id': '5119660',
+ 'display_id': '5119660',
+ 'ext': 'mp4',
+ 'title': 'Michelle Malone - Dreamgirls - Wild Wet 3',
+ 'uploader': 'FallenAngel12',
+ 'duration': 402,
+ 'view_count': int,
+ 'like_count': int,
+ 'dislike_count': int,
+ 'age_limit': 18,
+ }
+ }]
+
+ def _call_api(self, url, video_id, fatal=False, **kwargs):
+ content = self._download_json(url, video_id, fatal=fatal, **kwargs)
+ if traverse_obj(content, 'error'):
+ raise self._error_or_warning(ExtractorError(
+ f'Txxx said: {content["error"]}', expected=True), fatal=fatal)
+ return content or {}
+
+ def _real_extract(self, url):
+ video_id, host, display_id = self._match_valid_url(url).group('id', 'host', 'display_id')
+ headers = {'Referer': url, 'X-Requested-With': 'XMLHttpRequest'}
+
+ video_file = self._call_api(
+ f'https://{host}/api/videofile.php?video_id={video_id}&lifetime=8640000',
+ video_id, fatal=True, note='Downloading video file info', headers=headers)
+
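+ # metadata JSON is sharded by ID, bucketed to the nearest million and
+ # thousand, e.g. video 16574965 lives under 16000000/16574000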
+ slug = f'{int(1E6 * (int(video_id) // 1E6))}/{1000 * (int(video_id) // 1000)}'
+ video_info = self._call_api(
+ f'https://{host}/api/json/video/86400/{slug}/{video_id}.json',
+ video_id, note='Downloading video info', headers=headers)
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': traverse_obj(video_info, ('video', 'title')),
+ 'uploader': traverse_obj(video_info, ('video', 'user', 'username')),
+ 'duration': parse_duration(traverse_obj(video_info, ('video', 'duration'))),
+ 'view_count': int_or_none(traverse_obj(video_info, ('video', 'statistics', 'viewed'))),
+ 'like_count': int_or_none(traverse_obj(video_info, ('video', 'statistics', 'likes'))),
+ 'dislike_count': int_or_none(traverse_obj(video_info, ('video', 'statistics', 'dislikes'))),
+ 'age_limit': 18,
+ 'formats': get_formats(host, video_file),
+ }
+
+
+class PornTopIE(InfoExtractor):
+ _VALID_URL = r'https?://(?P<host>(?:www\.)?porntop\.com)/video/(?P<id>\d+)(?:/(?P<display_id>[^/?]+))?'
+ _TESTS = [{
+ 'url': 'https://porntop.com/video/101569/triple-threat-with-lia-lor-malena-morgan-and-dani-daniels/',
+ 'md5': '612ba7b3cb99455b382972948e200b08',
+ 'info_dict': {
+ 'id': '101569',
+ 'display_id': 'triple-threat-with-lia-lor-malena-morgan-and-dani-daniels',
+ 'ext': 'mp4',
+ 'title': 'Triple Threat With Lia Lor, Malena Morgan And Dani Daniels',
+ 'description': 'md5:285357d9d3a00ce5acb29f39f826dbf6',
+ 'uploader': 'PatrickBush',
+ 'duration': 480,
+ 'view_count': int,
+ 'like_count': int,
+ 'dislike_count': int,
+ 'age_limit': 18,
+ 'timestamp': 1609455029,
+ 'upload_date': '20201231',
+ 'thumbnail': 'https://tn.porntop.com/media/tn/sources/101569_1.jpg',
+ }
+ }]
+
+ def _real_extract(self, url):
+ video_id, host, display_id = self._match_valid_url(url).group('id', 'host', 'display_id')
+ webpage = self._download_webpage(url, video_id)
+
+ json_ld = self._json_ld(self._search_json(
+ r'\bschemaJson\s*=', webpage, 'JSON-LD', video_id, transform_source=js_to_json,
+ contains_pattern='{[^<]+?VideoObject[^<]+};'), video_id, fatal=True)
+
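+ # a quoted argument of the window.initPlayer call carries the format
+ # list, obfuscated with the same homoglyph base64 scheme decoded above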
+ video_file = self._parse_json(decode_base64(self._search_regex(
+ r"window\.initPlayer\(.*}}},\s*'(?P<json_b64c>[^']+)'",
+ webpage, 'json_urls', group='json_b64c')), video_id)
+
+ return merge_dicts({
+ 'id': video_id,
+ 'display_id': display_id,
+ 'age_limit': 18,
+ 'formats': get_formats(host, video_file),
+ }, json_ld)
diff --git a/hypervideo_dl/extractor/udemy.py b/hypervideo_dl/extractor/udemy.py
index 4faad58..117acc7 100644
--- a/hypervideo_dl/extractor/udemy.py
+++ b/hypervideo_dl/extractor/udemy.py
@@ -1,8 +1,9 @@
import re
-import urllib.request
from .common import InfoExtractor
-from ..compat import compat_HTTPError, compat_str, compat_urlparse
+from ..compat import compat_str, compat_urlparse
+from ..networking import Request
+from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
determine_ext,
@@ -10,9 +11,10 @@ from ..utils import (
float_or_none,
int_or_none,
js_to_json,
- sanitized_Request,
+ smuggle_url,
try_get,
unescapeHTML,
+ unsmuggle_url,
url_or_none,
urlencode_postdata,
)
@@ -106,7 +108,7 @@ class UdemyIE(InfoExtractor):
% (course_id, lecture_id),
lecture_id, 'Downloading lecture JSON', query={
'fields[lecture]': 'title,description,view_html,asset',
- 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,stream_urls,captions,data',
+ 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,stream_urls,captions,data,course_is_drmed',
})
def _handle_error(self, response):
@@ -151,11 +153,10 @@ class UdemyIE(InfoExtractor):
headers['X-Udemy-Bearer-Token'] = cookie.value
headers['X-Udemy-Authorization'] = 'Bearer %s' % cookie.value
- if isinstance(url_or_request, urllib.request.Request):
- for header, value in headers.items():
- url_or_request.add_header(header, value)
+ if isinstance(url_or_request, Request):
+ url_or_request.headers.update(headers)
else:
- url_or_request = sanitized_Request(url_or_request, headers=headers)
+ url_or_request = Request(url_or_request, headers=headers)
response = super(UdemyIE, self)._download_json(url_or_request, *args, **kwargs)
self._handle_error(response)
@@ -199,16 +200,19 @@ class UdemyIE(InfoExtractor):
def _real_extract(self, url):
lecture_id = self._match_id(url)
+ course_id = unsmuggle_url(url, {})[1].get('course_id')
- webpage = self._download_webpage(url, lecture_id)
-
- course_id, _ = self._extract_course_info(webpage, lecture_id)
+ webpage = None
+ if not course_id:
+ webpage = self._download_webpage(url, lecture_id)
+ course_id, _ = self._extract_course_info(webpage, lecture_id)
try:
lecture = self._download_lecture(course_id, lecture_id)
except ExtractorError as e:
# Error could possibly mean we are not enrolled in the course
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 403:
+ webpage = webpage or self._download_webpage(url, lecture_id)
self._enroll_course(url, webpage, course_id)
lecture = self._download_lecture(course_id, lecture_id)
else:
@@ -391,6 +395,9 @@ class UdemyIE(InfoExtractor):
if f.get('url'):
formats.append(f)
+ if not formats and asset.get('course_is_drmed'):
+ self.report_drm(video_id)
+
return {
'id': video_id,
'title': title,
@@ -449,7 +456,9 @@ class UdemyCourseIE(UdemyIE): # XXX: Do not subclass from concrete IE
if lecture_id:
entry = {
'_type': 'url_transparent',
- 'url': 'https://www.udemy.com/%s/learn/v4/t/lecture/%s' % (course_path, entry['id']),
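+ # smuggle course_id so UdemyIE does not have to re-download the
+ # lecture webpage just to rediscover which course it belongs to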
+ 'url': smuggle_url(
+ f'https://www.udemy.com/{course_path}/learn/v4/t/lecture/{entry["id"]}',
+ {'course_id': course_id}),
'title': entry.get('title'),
'ie_key': UdemyIE.ie_key(),
}
diff --git a/hypervideo_dl/extractor/unsupported.py b/hypervideo_dl/extractor/unsupported.py
index 620c025..78c2206 100644
--- a/hypervideo_dl/extractor/unsupported.py
+++ b/hypervideo_dl/extractor/unsupported.py
@@ -42,6 +42,12 @@ class KnownDRMIE(UnsupportedInfoExtractor):
r'vootkids\.com',
r'nowtv\.it/watch',
r'tv\.apple\.com',
+ r'primevideo\.com',
+ r'hulu\.com',
+ r'resource\.inkryptvideos\.com',
+ r'joyn\.de',
+ r'amazon\.(?:\w{2}\.)?\w+/gp/video',
+ r'music\.amazon\.(?:\w{2}\.)?\w+',
)
_TESTS = [{
@@ -111,6 +117,30 @@ class KnownDRMIE(UnsupportedInfoExtractor):
# https://github.com/hypervideo/hypervideo/issues/5557
'url': 'https://tv.apple.com/it/show/loot---una-fortuna/umc.cmc.5erbujil1mpazuerhr1udnk45?ctx_brand=tvs.sbd.4000',
'only_matching': True,
+ }, {
+ # https://github.com/hypervideo/hypervideo/issues/3072
+ 'url': 'https://www.joyn.de/play/serien/clannad/1-1-wo-die-kirschblueten-fallen',
+ 'only_matching': True,
+ }, {
+ # https://github.com/hypervideo/hypervideo/issues/7323
+ 'url': 'https://music.amazon.co.jp/albums/B088Y368TK',
+ 'only_matching': True,
+ }, {
+ # https://github.com/hypervideo/hypervideo/issues/7323
+ 'url': 'https://www.amazon.co.jp/gp/video/detail/B09X5HBYRS/',
+ 'only_matching': True,
+ }, {
+ # https://github.com/hypervideo/hypervideo/issues/6125
+ 'url': 'https://www.primevideo.com/region/eu/detail/0H3DDB4KBJFNDCKKLHNRLRLVKQ/ref=atv_br_def_r_br_c_unkc_1_10',
+ 'only_matching': True,
+ }, {
+ # https://github.com/hypervideo/hypervideo/issues/5740
+ 'url': 'https://resource.inkryptvideos.com/v2-a83ns52/iframe/index.html#video_id=7999ea0f6e03439eb40d056258c2d736&otp=xxx',
+ 'only_matching': True,
+ }, {
+ # https://github.com/hypervideo/hypervideo/issues/5767
+ 'url': 'https://www.hulu.com/movie/anthem-6b25fac9-da2b-45a3-8e09-e4156b0471cc',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -130,6 +160,10 @@ class KnownPiracyIE(UnsupportedInfoExtractor):
URLS = (
r'dood\.(?:to|watch|so|pm|wf|re)',
+ # Sites youtube-dl supports, but we won't
+ r'viewsb\.com',
+ r'filemoon\.sx',
+ r'hentai\.animestigma\.com',
)
_TESTS = [{
diff --git a/hypervideo_dl/extractor/uplynk.py b/hypervideo_dl/extractor/uplynk.py
index 87c427f..e7d816e 100644
--- a/hypervideo_dl/extractor/uplynk.py
+++ b/hypervideo_dl/extractor/uplynk.py
@@ -2,40 +2,42 @@ import re
from .common import InfoExtractor
from ..utils import (
- float_or_none,
ExtractorError,
+ float_or_none,
+ smuggle_url,
+ traverse_obj,
+ unsmuggle_url,
+ update_url_query,
)
-class UplynkIE(InfoExtractor):
- IE_NAME = 'uplynk'
- _VALID_URL = r'https?://.*?\.uplynk\.com/(?P<path>ext/[0-9a-f]{32}/(?P<external_id>[^/?&]+)|(?P<id>[0-9a-f]{32}))\.(?:m3u8|json)(?:.*?\bpbs=(?P<session_id>[^&]+))?'
- _TEST = {
- 'url': 'http://content.uplynk.com/e89eaf2ce9054aa89d92ddb2d817a52e.m3u8',
- 'info_dict': {
- 'id': 'e89eaf2ce9054aa89d92ddb2d817a52e',
- 'ext': 'mp4',
- 'title': '030816-kgo-530pm-solar-eclipse-vid_web.mp4',
- 'uploader_id': '4413701bf5a1488db55b767f8ae9d4fa',
- },
- 'params': {
- # m3u8 download
- 'skip_download': True,
- },
- }
+class UplynkBaseIE(InfoExtractor):
+ _UPLYNK_URL_RE = r'''(?x)
+ https?://[\w-]+\.uplynk\.com/(?P<path>
+ ext/[0-9a-f]{32}/(?P<external_id>[^/?&]+)|
+ (?P<id>[0-9a-f]{32})
+ )\.(?:m3u8|json)
+ (?:.*?\bpbs=(?P<session_id>[^&]+))?'''
- def _extract_uplynk_info(self, uplynk_content_url):
- path, external_id, video_id, session_id = re.match(UplynkIE._VALID_URL, uplynk_content_url).groups()
+ def _extract_uplynk_info(self, url):
+ uplynk_content_url, smuggled_data = unsmuggle_url(url, {})
+ mobj = re.match(self._UPLYNK_URL_RE, uplynk_content_url)
+ if not mobj:
+ raise ExtractorError('Necessary parameters not found in Uplynk URL')
+ path, external_id, video_id, session_id = mobj.group('path', 'external_id', 'id', 'session_id')
display_id = video_id or external_id
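+ # callers may smuggle Referer/Origin headers (keys matched
+ # case-insensitively); forward them with the m3u8 request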
+ headers = traverse_obj(
+ smuggled_data, {'Referer': 'Referer', 'Origin': 'Origin'}, casesense=False)
formats, subtitles = self._extract_m3u8_formats_and_subtitles(
- 'http://content.uplynk.com/%s.m3u8' % path,
- display_id, 'mp4', 'm3u8_native')
+ f'http://content.uplynk.com/{path}.m3u8', display_id, 'mp4', headers=headers)
if session_id:
for f in formats:
- f['extra_param_to_segment_url'] = 'pbs=' + session_id
- asset = self._download_json('http://content.uplynk.com/player/assetinfo/%s.json' % path, display_id)
+ f['extra_param_to_segment_url'] = f'pbs={session_id}'
+ asset = self._download_json(
+ f'http://content.uplynk.com/player/assetinfo/{path}.json', display_id)
if asset.get('error') == 1:
- raise ExtractorError('% said: %s' % (self.IE_NAME, asset['msg']), expected=True)
+ msg = asset.get('msg') or 'unknown error'
+ raise ExtractorError(f'{self.IE_NAME} said: {msg}', expected=True)
return {
'id': asset['asset'],
@@ -47,20 +49,40 @@ class UplynkIE(InfoExtractor):
'subtitles': subtitles,
}
+
+class UplynkIE(UplynkBaseIE):
+ IE_NAME = 'uplynk'
+ _VALID_URL = UplynkBaseIE._UPLYNK_URL_RE
+ _TEST = {
+ 'url': 'http://content.uplynk.com/e89eaf2ce9054aa89d92ddb2d817a52e.m3u8',
+ 'info_dict': {
+ 'id': 'e89eaf2ce9054aa89d92ddb2d817a52e',
+ 'ext': 'mp4',
+ 'title': '030816-kgo-530pm-solar-eclipse-vid_web.mp4',
+ 'uploader_id': '4413701bf5a1488db55b767f8ae9d4fa',
+ 'duration': 530.2739166666679,
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
+ }
+
def _real_extract(self, url):
return self._extract_uplynk_info(url)
-class UplynkPreplayIE(UplynkIE): # XXX: Do not subclass from concrete IE
+class UplynkPreplayIE(UplynkBaseIE):
IE_NAME = 'uplynk:preplay'
- _VALID_URL = r'https?://.*?\.uplynk\.com/preplay2?/(?P<path>ext/[0-9a-f]{32}/(?P<external_id>[^/?&]+)|(?P<id>[0-9a-f]{32}))\.json'
+ _VALID_URL = r'https?://[\w-]+\.uplynk\.com/preplay2?/(?P<path>ext/[0-9a-f]{32}/(?P<external_id>[^/?&]+)|(?P<id>[0-9a-f]{32}))\.json'
def _real_extract(self, url):
+ url, smuggled_data = unsmuggle_url(url, {})
path, external_id, video_id = self._match_valid_url(url).groups()
display_id = video_id or external_id
preplay = self._download_json(url, display_id)
- content_url = 'http://content.uplynk.com/%s.m3u8' % path
+ content_url = f'http://content.uplynk.com/{path}.m3u8'
session_id = preplay.get('sid')
if session_id:
- content_url += '?pbs=' + session_id
- return self._extract_uplynk_info(content_url)
+ content_url = update_url_query(content_url, {'pbs': session_id})
+ return self._extract_uplynk_info(smuggle_url(content_url, smuggled_data))
diff --git a/hypervideo_dl/extractor/urplay.py b/hypervideo_dl/extractor/urplay.py
index 0f0d659..7f97fc9 100644
--- a/hypervideo_dl/extractor/urplay.py
+++ b/hypervideo_dl/extractor/urplay.py
@@ -14,12 +14,13 @@ class URPlayIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?ur(?:play|skola)\.se/(?:program|Produkter)/(?P<id>[0-9]+)'
_TESTS = [{
'url': 'https://urplay.se/program/203704-ur-samtiden-livet-universum-och-rymdens-markliga-musik-om-vetenskap-kritiskt-tankande-och-motstand',
- 'md5': 'ff5b0c89928f8083c74bbd5099c9292d',
+ 'md5': '5ba36643c77cc3d34ffeadad89937d1e',
'info_dict': {
'id': '203704',
'ext': 'mp4',
'title': 'UR Samtiden - Livet, universum och rymdens märkliga musik : Om vetenskap, kritiskt tänkande och motstånd',
'description': 'md5:5344508a52aa78c1ced6c1b8b9e44e9a',
+ 'thumbnail': r're:^https?://.+\.jpg',
'timestamp': 1513292400,
'upload_date': '20171214',
'series': 'UR Samtiden - Livet, universum och rymdens märkliga musik',
@@ -30,18 +31,41 @@ class URPlayIE(InfoExtractor):
'age_limit': 15,
},
}, {
+ 'url': 'https://urplay.se/program/222967-en-foralders-dagbok-mitt-barn-skadar-sig-sjalv',
+ 'info_dict': {
+ 'id': '222967',
+ 'ext': 'mp4',
+ 'title': 'En förälders dagbok : Mitt barn skadar sig själv',
+ 'description': 'md5:9f771eef03a732a213b367b52fe826ca',
+ 'thumbnail': r're:^https?://.+\.jpg',
+ 'timestamp': 1629676800,
+ 'upload_date': '20210823',
+ 'series': 'En förälders dagbok',
+ 'duration': 1740,
+ 'age_limit': 15,
+ 'episode_number': 3,
+ 'categories': 'count:2',
+ 'tags': 'count:7',
+ 'episode': 'Mitt barn skadar sig själv',
+ },
+ }, {
'url': 'https://urskola.se/Produkter/190031-Tripp-Trapp-Trad-Sovkudde',
'info_dict': {
'id': '190031',
'ext': 'mp4',
'title': 'Tripp, Trapp, Träd : Sovkudde',
'description': 'md5:b86bffdae04a7e9379d1d7e5947df1d1',
+ 'thumbnail': r're:^https?://.+\.jpg',
'timestamp': 1440086400,
'upload_date': '20150820',
'series': 'Tripp, Trapp, Träd',
'duration': 865,
+ 'age_limit': 1,
+ 'episode_number': 1,
+ 'categories': [],
'tags': ['Sova'],
'episode': 'Sovkudde',
+ 'season': 'Säsong 1',
},
}, {
'url': 'http://urskola.se/Produkter/155794-Smasagor-meankieli-Grodan-i-vida-varlden',
@@ -69,7 +93,7 @@ class URPlayIE(InfoExtractor):
urplayer_streams = urplayer_data.get('streamingInfo', {})
for k, v in urplayer_streams.get('raw', {}).items():
- if not (k in ('sd', 'hd') and isinstance(v, dict)):
+ if not (k in ('sd', 'hd', 'mp3', 'm4a') and isinstance(v, dict)):
continue
file_http = v.get('location')
if file_http:
@@ -88,18 +112,19 @@ class URPlayIE(InfoExtractor):
lang = ISO639Utils.short2long(lang)
return lang or None
- for k, v in (urplayer_data['streamingInfo'].get('sweComplete') or {}).items():
- if (k in ('sd', 'hd') or not isinstance(v, dict)):
- continue
- lang, sttl_url = (v.get(kk) for kk in ('language', 'location', ))
- if not sttl_url:
- continue
- lang = parse_lang_code(lang)
- if not lang:
- continue
- sttl = subtitles.get(lang) or []
- sttl.append({'ext': k, 'url': sttl_url, })
- subtitles[lang] = sttl
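+ # look for subtitle tracks in every stream variant, not only
+ # 'sweComplete', so captions attached to other variants are kept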
+ for stream in urplayer_data['streamingInfo'].values():
+ for k, v in stream.items():
+ if (k in ('sd', 'hd') or not isinstance(v, dict)):
+ continue
+ lang, sttl_url = (v.get(kk) for kk in ('language', 'location', ))
+ if not sttl_url:
+ continue
+ lang = parse_lang_code(lang)
+ if not lang:
+ continue
+ sttl = subtitles.get(lang) or []
+ sttl.append({'ext': k, 'url': sttl_url, })
+ subtitles[lang] = sttl
image = urplayer_data.get('image') or {}
thumbnails = []
diff --git a/hypervideo_dl/extractor/vevo.py b/hypervideo_dl/extractor/vevo.py
index da4ce49..aa40227 100644
--- a/hypervideo_dl/extractor/vevo.py
+++ b/hypervideo_dl/extractor/vevo.py
@@ -2,10 +2,8 @@ import re
import json
from .common import InfoExtractor
-from ..compat import (
- compat_str,
- compat_HTTPError,
-)
+from ..compat import compat_str
+from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
int_or_none,
@@ -184,8 +182,8 @@ class VevoIE(VevoBaseIE):
try:
data = self._download_json(self._api_url_template % path, *args, **kwargs)
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError):
- errors = self._parse_json(e.cause.read().decode(), None)['errors']
+ if isinstance(e.cause, HTTPError):
+ errors = self._parse_json(e.cause.response.read().decode(), None)['errors']
error_message = ', '.join([error['message'] for error in errors])
raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message), expected=True)
raise
diff --git a/hypervideo_dl/extractor/vice.py b/hypervideo_dl/extractor/vice.py
index d1a3b48..8a71268 100644
--- a/hypervideo_dl/extractor/vice.py
+++ b/hypervideo_dl/extractor/vice.py
@@ -7,10 +7,8 @@ import time
from .adobepass import AdobePassIE
from .common import InfoExtractor
from .youtube import YoutubeIE
-from ..compat import (
- compat_HTTPError,
- compat_str,
-)
+from ..compat import compat_str
+from ..networking.exceptions import HTTPError
from ..utils import (
clean_html,
ExtractorError,
@@ -140,8 +138,8 @@ class ViceIE(ViceBaseIE, AdobePassIE):
'https://vms.vice.com/%s/video/preplay/%s' % (locale, video_id),
video_id, query=query)
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 401):
- error = json.loads(e.cause.read().decode())
+ if isinstance(e.cause, HTTPError) and e.cause.status in (400, 401):
+ error = json.loads(e.cause.response.read().decode())
error_message = error.get('error_description') or error['details']
raise ExtractorError('%s said: %s' % (
self.IE_NAME, error_message), expected=True)
diff --git a/hypervideo_dl/extractor/videa.py b/hypervideo_dl/extractor/videa.py
index 52fa8fc..59ae933 100644
--- a/hypervideo_dl/extractor/videa.py
+++ b/hypervideo_dl/extractor/videa.py
@@ -119,7 +119,7 @@ class VideaIE(InfoExtractor):
result += s[i - (self._STATIC_SECRET.index(l[i]) - 31)]
query = parse_qs(player_url)
- random_seed = ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(8))
+ random_seed = ''.join(random.choices(string.ascii_letters + string.digits, k=8))
query['_s'] = random_seed
query['_t'] = result[:16]
diff --git a/hypervideo_dl/extractor/videocampus_sachsen.py b/hypervideo_dl/extractor/videocampus_sachsen.py
index 982ab3d..37bc7d7 100644
--- a/hypervideo_dl/extractor/videocampus_sachsen.py
+++ b/hypervideo_dl/extractor/videocampus_sachsen.py
@@ -2,7 +2,7 @@ import functools
import re
from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..networking.exceptions import HTTPError
from ..utils import ExtractorError, OnDemandPagedList, urlencode_postdata
@@ -169,7 +169,7 @@ class VideocampusSachsenIE(InfoExtractor):
f'https://{host}/media/hlsMedium/key/{video_id}/format/auto/ext/mp4/learning/0/path/m3u8',
video_id, 'mp4', m3u8_id='hls', fatal=True)
except ExtractorError as e:
- if not isinstance(e.cause, compat_HTTPError) or e.cause.code not in (404, 500):
+ if not isinstance(e.cause, HTTPError) or e.cause.status not in (404, 500):
raise
formats.append({'url': f'https://{host}/getMedium/{video_id}.mp4'})
diff --git a/hypervideo_dl/extractor/videoken.py b/hypervideo_dl/extractor/videoken.py
new file mode 100644
index 0000000..560b41a
--- /dev/null
+++ b/hypervideo_dl/extractor/videoken.py
@@ -0,0 +1,336 @@
+import base64
+import functools
+import math
+import re
+import time
+import urllib.parse
+
+from .common import InfoExtractor
+from .slideslive import SlidesLiveIE
+from ..utils import (
+ ExtractorError,
+ InAdvancePagedList,
+ int_or_none,
+ traverse_obj,
+ update_url_query,
+ url_or_none,
+)
+
+
+class VideoKenBaseIE(InfoExtractor):
+ _ORGANIZATIONS = {
+ 'videos.icts.res.in': 'icts',
+ 'videos.cncf.io': 'cncf',
+ 'videos.neurips.cc': 'neurips',
+ }
+ _BASE_URL_RE = rf'https?://(?P<host>{"|".join(map(re.escape, _ORGANIZATIONS))})/'
+
+ _PAGE_SIZE = 12
+
+ def _get_org_id_and_api_key(self, org, video_id):
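+ # each portal hostname maps to a VideoKen organization; this endpoint
+ # yields its ID and the API key expected by the search endpoints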
+ details = self._download_json(
+ f'https://analytics.videoken.com/api/videolake/{org}/details', video_id,
+ note='Downloading organization ID and API key', headers={
+ 'Accept': 'application/json',
+ })
+ return details['id'], details['apikey']
+
+ def _create_slideslive_url(self, video_url, video_id, referer):
+ if not video_url and not video_id:
+ return
+ elif not video_url or 'embed/sign-in' in video_url:
+ video_url = f'https://slideslive.com/embed/{video_id.lstrip("slideslive-")}'
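+ # propagate the embedding page and its origin as query parameters so
+ # the SlidesLive embed loads as if embedded on the original page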
+ if url_or_none(referer):
+ return update_url_query(video_url, {
+ 'embed_parent_url': referer,
+ 'embed_container_origin': f'https://{urllib.parse.urlparse(referer).netloc}',
+ })
+ return video_url
+
+ def _extract_videos(self, videos, url):
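+ # entries are either native YouTube IDs or embeddable URLs (typically
+ # SlidesLive), so route each one to the matching extractor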
+ for video in traverse_obj(videos, (('videos', 'results'), ...)):
+ video_id = traverse_obj(video, 'youtube_id', 'videoid')
+ if not video_id:
+ continue
+ ie_key = None
+ if traverse_obj(video, 'type', 'source') == 'youtube':
+ video_url = video_id
+ ie_key = 'Youtube'
+ else:
+ video_url = traverse_obj(video, 'embed_url', 'embeddableurl')
+ if urllib.parse.urlparse(video_url).netloc == 'slideslive.com':
+ ie_key = SlidesLiveIE
+ video_url = self._create_slideslive_url(video_url, video_id, url)
+ if not video_url:
+ continue
+ yield self.url_result(video_url, ie_key, video_id)
+
+
+class VideoKenIE(VideoKenBaseIE):
+ _VALID_URL = VideoKenBaseIE._BASE_URL_RE + r'(?:(?:topic|category)/[^/#?]+/)?video/(?P<id>[\w-]+)'
+ _TESTS = [{
+ # neurips -> videoken -> slideslive
+ 'url': 'https://videos.neurips.cc/video/slideslive-38922815',
+ 'info_dict': {
+ 'id': '38922815',
+ 'ext': 'mp4',
+ 'title': 'Efficient Processing of Deep Neural Network: from Algorithms to Hardware Architectures',
+ 'timestamp': 1630939331,
+ 'upload_date': '20210906',
+ 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+ 'thumbnails': 'count:330',
+ 'chapters': 'count:329',
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
+ 'expected_warnings': ['Failed to download VideoKen API JSON'],
+ }, {
+ # neurips -> videoken -> slideslive -> youtube
+ 'url': 'https://videos.neurips.cc/topic/machine%20learning/video/slideslive-38923348',
+ 'info_dict': {
+ 'id': '2Xa_dt78rJE',
+ 'ext': 'mp4',
+ 'display_id': '38923348',
+ 'title': 'Machine Education',
+ 'description': 'Watch full version of this video at https://slideslive.com/38923348.',
+ 'channel': 'SlidesLive Videos - G2',
+ 'channel_id': 'UCOExahQQ588Da8Nft_Ltb9w',
+ 'channel_url': 'https://www.youtube.com/channel/UCOExahQQ588Da8Nft_Ltb9w',
+ 'uploader': 'SlidesLive Videos - G2',
+ 'uploader_id': 'UCOExahQQ588Da8Nft_Ltb9w',
+ 'uploader_url': 'http://www.youtube.com/channel/UCOExahQQ588Da8Nft_Ltb9w',
+ 'duration': 2504,
+ 'timestamp': 1618922125,
+ 'upload_date': '20200131',
+ 'age_limit': 0,
+ 'channel_follower_count': int,
+ 'view_count': int,
+ 'availability': 'unlisted',
+ 'live_status': 'not_live',
+ 'playable_in_embed': True,
+ 'categories': ['People & Blogs'],
+ 'tags': [],
+ 'thumbnail': r're:^https?://.*\.(?:jpg|webp)',
+ 'thumbnails': 'count:78',
+ 'chapters': 'count:77',
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
+ 'expected_warnings': ['Failed to download VideoKen API JSON'],
+ }, {
+ # icts -> videoken -> youtube
+ 'url': 'https://videos.icts.res.in/topic/random%20variable/video/zysIsojYdvc',
+ 'info_dict': {
+ 'id': 'zysIsojYdvc',
+ 'ext': 'mp4',
+ 'title': 'Small-worlds, complex networks and random graphs (Lecture 3) by Remco van der Hofstad',
+ 'description': 'md5:87433069d79719eeadc1962cc2ace00b',
+ 'channel': 'International Centre for Theoretical Sciences',
+ 'channel_id': 'UCO3xnVTHzB7l-nc8mABUJIQ',
+ 'channel_url': 'https://www.youtube.com/channel/UCO3xnVTHzB7l-nc8mABUJIQ',
+ 'uploader': 'International Centre for Theoretical Sciences',
+ 'uploader_id': 'ICTStalks',
+ 'uploader_url': 'http://www.youtube.com/user/ICTStalks',
+ 'duration': 3372,
+ 'upload_date': '20191004',
+ 'age_limit': 0,
+ 'live_status': 'not_live',
+ 'availability': 'public',
+ 'playable_in_embed': True,
+ 'channel_follower_count': int,
+ 'like_count': int,
+ 'view_count': int,
+ 'categories': ['Science & Technology'],
+ 'tags': [],
+ 'thumbnail': r're:^https?://.*\.(?:jpg|webp)',
+ 'thumbnails': 'count:42',
+ 'chapters': 'count:20',
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
+ }, {
+ 'url': 'https://videos.cncf.io/category/478/video/IL4nxbmUIX8',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://videos.cncf.io/topic/kubernetes/video/YAM2d7yTrrI',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://videos.icts.res.in/video/d7HuP_abpKU',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ hostname, video_id = self._match_valid_url(url).group('host', 'id')
+ org_id, _ = self._get_org_id_and_api_key(self._ORGANIZATIONS[hostname], video_id)
+ details = self._download_json(
+ 'https://analytics.videoken.com/api/videoinfo_private', video_id, query={
+ 'videoid': video_id,
+ 'org_id': org_id,
+ }, headers={'Accept': 'application/json'}, note='Downloading VideoKen API JSON',
+ errnote='Failed to download VideoKen API JSON', fatal=False)
+ if details:
+ return next(self._extract_videos({'videos': [details]}, url))
+ # fallback for API error 400 response
+ elif video_id.startswith('slideslive-'):
+ return self.url_result(
+ self._create_slideslive_url(None, video_id, url), SlidesLiveIE, video_id)
+ elif re.match(r'^[\w-]{11}$', video_id):
+ return self.url_result(video_id, 'Youtube', video_id)
+ else:
+ raise ExtractorError('Unable to extract without VideoKen API response')
+
+
+class VideoKenPlayerIE(VideoKenBaseIE):
+ _VALID_URL = r'https?://player\.videoken\.com/embed/slideslive-(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://player.videoken.com/embed/slideslive-38968434',
+ 'info_dict': {
+ 'id': '38968434',
+ 'ext': 'mp4',
+ 'title': 'Deep Learning with Label Differential Privacy',
+ 'timestamp': 1643377020,
+ 'upload_date': '20220128',
+ 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+ 'thumbnails': 'count:30',
+ 'chapters': 'count:29',
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ return self.url_result(
+ self._create_slideslive_url(None, video_id, url), SlidesLiveIE, video_id)
+
+
+class VideoKenPlaylistIE(VideoKenBaseIE):
+ _VALID_URL = VideoKenBaseIE._BASE_URL_RE + r'(?:category/\d+/)?playlist/(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://videos.icts.res.in/category/1822/playlist/381',
+ 'playlist_mincount': 117,
+ 'info_dict': {
+ 'id': '381',
+ 'title': 'Cosmology - The Next Decade',
+ },
+ }]
+
+ def _real_extract(self, url):
+ hostname, playlist_id = self._match_valid_url(url).group('host', 'id')
+ org_id, _ = self._get_org_id_and_api_key(self._ORGANIZATIONS[hostname], playlist_id)
+ videos = self._download_json(
+ f'https://analytics.videoken.com/api/{org_id}/playlistitems/{playlist_id}/',
+ playlist_id, headers={'Accept': 'application/json'}, note='Downloading API JSON')
+ return self.playlist_result(self._extract_videos(videos, url), playlist_id, videos.get('title'))
+
+
+class VideoKenCategoryIE(VideoKenBaseIE):
+ _VALID_URL = VideoKenBaseIE._BASE_URL_RE + r'category/(?P<id>\d+)/?(?:$|[?#])'
+ _TESTS = [{
+ 'url': 'https://videos.icts.res.in/category/1822/',
+ 'playlist_mincount': 500,
+ 'info_dict': {
+ 'id': '1822',
+ 'title': 'Programs',
+ },
+ }, {
+ 'url': 'https://videos.neurips.cc/category/350/',
+ 'playlist_mincount': 34,
+ 'info_dict': {
+ 'id': '350',
+ 'title': 'NeurIPS 2018',
+ },
+ }, {
+ 'url': 'https://videos.cncf.io/category/479/',
+ 'playlist_mincount': 328,
+ 'info_dict': {
+ 'id': '479',
+ 'title': 'KubeCon + CloudNativeCon Europe\'19',
+ },
+ }]
+
+ def _get_category_page(self, category_id, org_id, page=1, note=None):
+ return self._download_json(
+ f'https://analytics.videoken.com/api/videolake/{org_id}/category_videos', category_id,
+ fatal=False, note=note if note else f'Downloading category page {page}',
+ query={
+ 'category_id': category_id,
+ 'page_number': page,
+ 'length': self._PAGE_SIZE,
+ }, headers={'Accept': 'application/json'}) or {}
+
+ def _entries(self, category_id, org_id, url, page):
+ videos = self._get_category_page(category_id, org_id, page + 1)
+ yield from self._extract_videos(videos, url)
+
+ def _real_extract(self, url):
+ hostname, category_id = self._match_valid_url(url).group('host', 'id')
+ org_id, _ = self._get_org_id_and_api_key(self._ORGANIZATIONS[hostname], category_id)
+ category_info = self._get_category_page(category_id, org_id, note='Downloading category info')
+ category = category_info['category_name']
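+ # the first page already reports the total record count; derive the
+ # page count from it so pages can be fetched lazily but in advance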
+ total_pages = math.ceil(int(category_info['recordsTotal']) / self._PAGE_SIZE)
+ return self.playlist_result(InAdvancePagedList(
+ functools.partial(self._entries, category_id, org_id, url),
+ total_pages, self._PAGE_SIZE), category_id, category)
+
+
+class VideoKenTopicIE(VideoKenBaseIE):
+ _VALID_URL = VideoKenBaseIE._BASE_URL_RE + r'topic/(?P<id>[^/#?]+)/?(?:$|[?#])'
+ _TESTS = [{
+ 'url': 'https://videos.neurips.cc/topic/machine%20learning/',
+ 'playlist_mincount': 500,
+ 'info_dict': {
+ 'id': 'machine_learning',
+ 'title': 'machine learning',
+ },
+ }, {
+ 'url': 'https://videos.icts.res.in/topic/gravitational%20waves/',
+ 'playlist_mincount': 77,
+ 'info_dict': {
+ 'id': 'gravitational_waves',
+ 'title': 'gravitational waves',
+ },
+ }, {
+ 'url': 'https://videos.cncf.io/topic/prometheus/',
+ 'playlist_mincount': 134,
+ 'info_dict': {
+ 'id': 'prometheus',
+ 'title': 'prometheus',
+ },
+ }]
+
+ def _get_topic_page(self, topic, org_id, search_id, api_key, page=1, note=None):
+ return self._download_json(
+ 'https://es.videoken.com/api/v1.0/get_results', topic, fatal=False, query={
+ 'orgid': org_id,
+ 'size': self._PAGE_SIZE,
+ 'query': topic,
+ 'page': page,
+ 'sort': 'upload_desc',
+ 'filter': 'all',
+ 'token': api_key,
+ 'is_topic': 'true',
+ 'category': '',
+ 'searchid': search_id,
+ }, headers={'Accept': 'application/json'},
+ note=note if note else f'Downloading topic page {page}') or {}
+
+ def _entries(self, topic, org_id, search_id, api_key, url, page):
+ videos = self._get_topic_page(topic, org_id, search_id, api_key, page + 1)
+ yield from self._extract_videos(videos, url)
+
+ def _real_extract(self, url):
+ hostname, topic_id = self._match_valid_url(url).group('host', 'id')
+ topic = urllib.parse.unquote(topic_id)
+ topic_id = topic.replace(' ', '_')
+ org_id, api_key = self._get_org_id_and_api_key(self._ORGANIZATIONS[hostname], topic)
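+ # the search API expects an opaque search ID, which is just base64 of
+ # ':<topic>:<unix timestamp>:transient'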
+ search_id = base64.b64encode(f':{topic}:{int(time.time())}:transient'.encode()).decode()
+ total_pages = int_or_none(self._get_topic_page(
+ topic, org_id, search_id, api_key, note='Downloading topic info')['total_no_of_pages'])
+ return self.playlist_result(InAdvancePagedList(
+ functools.partial(self._entries, topic, org_id, search_id, api_key, url),
+ total_pages, self._PAGE_SIZE), topic_id, topic)
diff --git a/hypervideo_dl/extractor/vidlii.py b/hypervideo_dl/extractor/vidlii.py
index 5933783..44353b7 100644
--- a/hypervideo_dl/extractor/vidlii.py
+++ b/hypervideo_dl/extractor/vidlii.py
@@ -1,8 +1,8 @@
import re
from .common import InfoExtractor
+from ..networking import HEADRequest
from ..utils import (
- HEADRequest,
format_field,
float_or_none,
get_element_by_id,
@@ -70,6 +70,7 @@ class VidLiiIE(InfoExtractor):
r'src\s*:\s*(["\'])(?P<url>(?:https?://)?(?:(?!\1).)+)\1',
webpage) or []]
for source in sources:
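+ # sources may be relative URLs; resolve them against the page URL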
+ source = urljoin(url, source)
height = int(self._search_regex(r'(\d+).mp4', source, 'height', default=360))
if self._request_webpage(HEADRequest(source), video_id, f'Checking {height}p url', errnote=False):
formats.append({
diff --git a/hypervideo_dl/extractor/viewlift.py b/hypervideo_dl/extractor/viewlift.py
index 3812601..8f686f0 100644
--- a/hypervideo_dl/extractor/viewlift.py
+++ b/hypervideo_dl/extractor/viewlift.py
@@ -1,7 +1,7 @@
import json
from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
int_or_none,
@@ -46,8 +46,8 @@ class ViewLiftBaseIE(InfoExtractor):
return self._download_json(
self._API_BASE + path, video_id, headers={'Authorization': self._TOKENS.get(site)}, query=query)
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
- webpage = e.cause.read().decode()
+ if isinstance(e.cause, HTTPError) and e.cause.status == 403:
+ webpage = e.cause.response.read().decode()
try:
error_message = traverse_obj(json.loads(webpage), 'errorMessage', 'message')
except json.JSONDecodeError:
diff --git a/hypervideo_dl/extractor/viidea.py b/hypervideo_dl/extractor/viidea.py
index 4cdf267..649ffe3 100644
--- a/hypervideo_dl/extractor/viidea.py
+++ b/hypervideo_dl/extractor/viidea.py
@@ -2,10 +2,10 @@ import re
from .common import InfoExtractor
from ..compat import (
- compat_HTTPError,
compat_str,
compat_urlparse,
)
+from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
js_to_json,
@@ -133,9 +133,9 @@ class ViideaIE(InfoExtractor):
'%s/site/api/lecture/%s?format=json' % (base_url, lecture_id),
lecture_id)['lecture'][0]
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 403:
msg = self._parse_json(
- e.cause.read().decode('utf-8'), lecture_id)
+ e.cause.response.read().decode('utf-8'), lecture_id)
raise ExtractorError(msg['detail'], expected=True)
raise
diff --git a/hypervideo_dl/extractor/vimeo.py b/hypervideo_dl/extractor/vimeo.py
index 516b76d..c0c08e8 100644
--- a/hypervideo_dl/extractor/vimeo.py
+++ b/hypervideo_dl/extractor/vimeo.py
@@ -2,20 +2,16 @@ import base64
import functools
import re
import itertools
-import urllib.error
from .common import InfoExtractor
-from ..compat import (
- compat_HTTPError,
- compat_str,
- compat_urlparse,
-)
+from ..compat import compat_str, compat_urlparse
+from ..networking import HEADRequest, Request
+from ..networking.exceptions import HTTPError
from ..utils import (
clean_html,
determine_ext,
ExtractorError,
get_element_by_class,
- HEADRequest,
js_to_json,
int_or_none,
merge_dicts,
@@ -23,7 +19,6 @@ from ..utils import (
parse_filesize,
parse_iso8601,
parse_qs,
- sanitized_Request,
smuggle_url,
str_or_none,
try_get,
@@ -72,7 +67,7 @@ class VimeoBaseInfoExtractor(InfoExtractor):
'Referer': self._LOGIN_URL,
})
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 418:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 418:
raise ExtractorError(
'Unable to log in: bad username or password',
expected=True)
@@ -304,27 +299,33 @@ class VimeoIE(VimeoBaseInfoExtractor):
# _VALID_URL matches Vimeo URLs
_VALID_URL = r'''(?x)
- https?://
- (?:
- (?:
- www|
- player
- )
- \.
- )?
- vimeo\.com/
- (?!(?:channels|album|showcase)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/)
- (?:[^/]+/)*?
- (?:
- (?:
- play_redirect_hls|
- moogaloop\.swf)\?clip_id=
- )?
- (?:videos?/)?
- (?P<id>[0-9]+)
- (?:/(?P<unlisted_hash>[\da-f]{10}))?
- /?(?:[?&].*)?(?:[#].*)?$
- '''
+ https?://
+ (?:
+ (?:
+ www|
+ player
+ )
+ \.
+ )?
+ vimeo\.com/
+ (?:
+ (?P<u>user)|
+ (?!(?:channels|album|showcase)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/)
+ (?:.*?/)??
+ (?P<q>
+ (?:
+ play_redirect_hls|
+ moogaloop\.swf)\?clip_id=
+ )?
+ (?:videos?/)?
+ )
+ (?P<id>[0-9]+)
+ (?(u)
+ /(?!videos|likes)[^/?#]+/?|
+ (?(q)|/(?P<unlisted_hash>[\da-f]{10}))?
+ )
+ (?:(?(q)[&]|(?(u)|/?)[?]).*?)?(?:[#].*)?$
+ '''
IE_NAME = 'vimeo'
_EMBED_REGEX = [
# iframe
@@ -358,7 +359,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
},
{
'url': 'http://player.vimeo.com/video/54469442',
- 'md5': 'b3e7f4d2cbb53bd7dc3bb6ff4ed5cfbd',
+ 'md5': '619b811a4417aa4abe78dc653becf511',
'note': 'Videos that embed the url in the player page',
'info_dict': {
'id': '54469442',
@@ -389,8 +390,8 @@ class VimeoIE(VimeoBaseInfoExtractor):
'uploader_id': 'user18948128',
'uploader': 'Jaime Marquínez Ferrándiz',
'duration': 10,
- 'description': 'md5:dca3ea23adb29ee387127bc4ddfce63f',
- 'thumbnail': 'https://i.vimeocdn.com/video/440665496-b2c5aee2b61089442c794f64113a8e8f7d5763c3e6b3ebfaf696ae6413f8b1f4-d_960',
+ 'description': 'md5:6173f270cd0c0119f22817204b3eb86c',
+ 'thumbnail': 'https://i.vimeocdn.com/video/440665496-b2c5aee2b61089442c794f64113a8e8f7d5763c3e6b3ebfaf696ae6413f8b1f4-d_1280',
'view_count': int,
'comment_count': int,
'like_count': int,
@@ -407,7 +408,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
'id': '75629013',
'ext': 'mp4',
'title': 'Key & Peele: Terrorist Interrogation',
- 'description': 'md5:8678b246399b070816b12313e8b4eb5c',
+ 'description': 'md5:6173f270cd0c0119f22817204b3eb86c',
'uploader_url': r're:https?://(?:www\.)?vimeo\.com/atencio',
'uploader_id': 'atencio',
'uploader': 'Peter Atencio',
@@ -559,8 +560,8 @@ class VimeoIE(VimeoBaseInfoExtractor):
'uploader_id': 'user18948128',
'uploader': 'Jaime Marquínez Ferrándiz',
'duration': 10,
- 'description': 'md5:dca3ea23adb29ee387127bc4ddfce63f',
- 'thumbnail': 'https://i.vimeocdn.com/video/440665496-b2c5aee2b61089442c794f64113a8e8f7d5763c3e6b3ebfaf696ae6413f8b1f4-d_960',
+ 'description': 'md5:6173f270cd0c0119f22817204b3eb86c',
+ 'thumbnail': 'https://i.vimeocdn.com/video/440665496-b2c5aee2b61089442c794f64113a8e8f7d5763c3e6b3ebfaf696ae6413f8b1f4-d_1280',
'view_count': int,
'comment_count': int,
'like_count': int,
@@ -705,7 +706,12 @@ class VimeoIE(VimeoBaseInfoExtractor):
'params': {
'skip_download': True,
},
- }
+ },
+ {
+ # user playlist alias -> https://vimeo.com/258705797
+ 'url': 'https://vimeo.com/user26785108/newspiritualguide',
+ 'only_matching': True,
+ },
# https://gettingthingsdone.com/workflowmap/
# vimeo embed with check-password page protected by Referer header
]
@@ -798,7 +804,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
'X-Requested-With': 'XMLHttpRequest',
})
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 401:
raise ExtractorError('Wrong password', expected=True)
raise
@@ -821,10 +827,10 @@ class VimeoIE(VimeoBaseInfoExtractor):
# Retrieve video webpage to extract further information
webpage, urlh = self._download_webpage_handle(
url, video_id, headers=headers)
- redirect_url = urlh.geturl()
+ redirect_url = urlh.url
except ExtractorError as ee:
- if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
- errmsg = ee.cause.read()
+ if isinstance(ee.cause, HTTPError) and ee.cause.status == 403:
+ errmsg = ee.cause.response.read()
if b'Because of its privacy settings, this video cannot be played here' in errmsg:
raise ExtractorError(
'Cannot download embed-only video without embedding '
@@ -834,8 +840,8 @@ class VimeoIE(VimeoBaseInfoExtractor):
raise
if '://player.vimeo.com/video/' in url:
- config = self._parse_json(self._search_regex(
- r'\b(?:playerC|c)onfig\s*=\s*({.+?})\s*;', webpage, 'info section'), video_id)
+ config = self._search_json(
+ r'\b(?:playerC|c)onfig\s*=', webpage, 'info section', video_id)
if config.get('view') == 4:
config = self._verify_player_video_password(
redirect_url, video_id, headers)
@@ -1143,7 +1149,7 @@ class VimeoAlbumIE(VimeoBaseInfoExtractor):
'Authorization': 'jwt ' + authorization,
})['data']
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 400:
return
for video in videos:
link = video.get('link')
@@ -1185,7 +1191,7 @@ class VimeoAlbumIE(VimeoBaseInfoExtractor):
'X-Requested-With': 'XMLHttpRequest',
})['hashed_pass']
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 401:
raise ExtractorError('Wrong password', expected=True)
raise
entries = OnDemandPagedList(functools.partial(
@@ -1298,10 +1304,10 @@ class VimeoWatchLaterIE(VimeoChannelIE): # XXX: Do not subclass from concrete I
def _page_url(self, base_url, pagenum):
url = '%s/page:%d/' % (base_url, pagenum)
- request = sanitized_Request(url)
+ request = Request(url)
# Set the header to get a partial html page with the ids,
# the normal page doesn't contain them.
- request.add_header('X-Requested-With', 'XMLHttpRequest')
+ request.headers['X-Requested-With'] = 'XMLHttpRequest'
return request
def _real_extract(self, url):
@@ -1421,7 +1427,7 @@ class VimeoProIE(VimeoBaseInfoExtractor):
**self._hidden_inputs(password_form),
}), note='Logging in with video password')
except ExtractorError as e:
- if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 418:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 418:
raise ExtractorError('Wrong video password', expected=True)
raise
diff --git a/hypervideo_dl/extractor/viu.py b/hypervideo_dl/extractor/viu.py
index b183c88..f315687 100644
--- a/hypervideo_dl/extractor/viu.py
+++ b/hypervideo_dl/extractor/viu.py
@@ -9,9 +9,12 @@ from ..compat import compat_str
from ..utils import (
ExtractorError,
int_or_none,
+ remove_end,
strip_or_none,
+ traverse_obj,
try_get,
smuggle_url,
+ unified_timestamp,
unsmuggle_url,
url_or_none,
)
@@ -251,7 +254,7 @@ class ViuOTTIE(InfoExtractor):
return self._user_token
def _get_token(self, country_code, video_id):
- rand = ''.join(random.choice('0123456789') for _ in range(10))
+ rand = ''.join(random.choices('0123456789', k=10))
return self._download_json(
f'https://api-gateway-global.viu.com/api/auth/token?v={rand}000', video_id,
headers={'Content-Type': 'application/json'}, note='Getting bearer token',
@@ -394,3 +397,146 @@ class ViuOTTIE(InfoExtractor):
'formats': formats,
'subtitles': subtitles,
}
+
+
+class ViuOTTIndonesiaBaseIE(InfoExtractor):
+ _BASE_QUERY = {
+ 'ver': 1.0,
+ 'fmt': 'json',
+ 'aver': 5.0,
+ 'appver': 2.0,
+ 'appid': 'viu_desktop',
+ 'platform': 'desktop',
+ }
+
+ _DEVICE_ID = str(uuid.uuid4())
+ _SESSION_ID = str(uuid.uuid4())
+ _TOKEN = None
+
+ _HEADERS = {
+ 'x-session-id': _SESSION_ID,
+ 'x-client': 'browser',
+ }
+
+ _AGE_RATINGS_MAPPER = {
+ 'ADULTS': 18,
+ 'teens': 13,
+ }
+
+ def _real_initialize(self):
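+ # register a throwaway device identity once; the returned token is
+ # sent as the Authorization header for content requests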
+ ViuOTTIndonesiaBaseIE._TOKEN = self._download_json(
+ 'https://um.viuapi.io/user/identity', None,
+ headers={'Content-type': 'application/json', **self._HEADERS},
+ query={**self._BASE_QUERY, 'iid': self._DEVICE_ID},
+ data=json.dumps({'deviceId': self._DEVICE_ID}).encode(),
+ note='Downloading token information')['token']
+
+
+class ViuOTTIndonesiaIE(ViuOTTIndonesiaBaseIE):
+ _VALID_URL = r'https?://www\.viu\.com/ott/\w+/\w+/all/video-[\w-]+-(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://www.viu.com/ott/id/id/all/video-japanese-drama-tv_shows-detective_conan_episode_793-1165863142?containerId=playlist-26271226',
+ 'info_dict': {
+ 'id': '1165863142',
+ 'ext': 'mp4',
+ 'episode_number': 793,
+ 'episode': 'Episode 793',
+ 'title': 'Detective Conan - Episode 793',
+ 'duration': 1476,
+ 'description': 'md5:b79d55345bc1e0217ece22616267c9a5',
+ 'thumbnail': 'https://vuclipi-a.akamaihd.net/p/cloudinary/h_171,w_304,dpr_1.5,f_auto,c_thumb,q_auto:low/1165863189/d-1',
+ 'upload_date': '20210101',
+ 'timestamp': 1609459200,
+ }
+ }, {
+ 'url': 'https://www.viu.com/ott/id/id/all/video-korean-reality-tv_shows-entertainment_weekly_episode_1622-1118617054',
+ 'info_dict': {
+ 'id': '1118617054',
+ 'ext': 'mp4',
+ 'episode_number': 1622,
+ 'episode': 'Episode 1622',
+ 'description': 'md5:6d68ca450004020113e9bf27ad99f0f8',
+ 'title': 'Entertainment Weekly - Episode 1622',
+ 'duration': 4729,
+ 'thumbnail': 'https://vuclipi-a.akamaihd.net/p/cloudinary/h_171,w_304,dpr_1.5,f_auto,c_thumb,q_auto:low/1120187848/d-1',
+ 'timestamp': 1420070400,
+ 'upload_date': '20150101',
+ 'cast': ['Shin Hyun-joon', 'Lee Da-Hee']
+ }
+ }, {
+ # age-limit test
+ 'url': 'https://www.viu.com/ott/id/id/all/video-japanese-trailer-tv_shows-trailer_jujutsu_kaisen_ver_01-1166044219?containerId=playlist-26273140',
+ 'info_dict': {
+ 'id': '1166044219',
+ 'ext': 'mp4',
+ 'upload_date': '20200101',
+ 'timestamp': 1577836800,
+ 'title': 'Trailer \'Jujutsu Kaisen\' Ver.01',
+ 'duration': 92,
+ 'thumbnail': 'https://vuclipi-a.akamaihd.net/p/cloudinary/h_171,w_304,dpr_1.5,f_auto,c_thumb,q_auto:low/1166044240/d-1',
+ 'description': 'Trailer \'Jujutsu Kaisen\' Ver.01',
+ 'cast': ['Junya Enoki', ' Yûichi Nakamura', ' Yuma Uchida', 'Asami Seto'],
+ 'age_limit': 13,
+ }
+ }, {
+ # json ld metadata type equal to Movie instead of TVEpisodes
+ 'url': 'https://www.viu.com/ott/id/id/all/video-japanese-animation-movies-demon_slayer_kimetsu_no_yaiba_the_movie_mugen_train-1165892707?containerId=1675060691786',
+ 'info_dict': {
+ 'id': '1165892707',
+ 'ext': 'mp4',
+ 'timestamp': 1577836800,
+ 'upload_date': '20200101',
+ 'title': 'Demon Slayer - Kimetsu no Yaiba - The Movie: Mugen Train',
+ 'age_limit': 13,
+ 'cast': 'count:9',
+ 'thumbnail': 'https://vuclipi-a.akamaihd.net/p/cloudinary/h_171,w_304,dpr_1.5,f_auto,c_thumb,q_auto:low/1165895279/d-1',
+ 'description': 'md5:1ce9c35a3aeab384085533f746c87469',
+ 'duration': 7021,
+ }
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+
+ video_data = self._download_json(
+ f'https://um.viuapi.io/drm/v1/content/{display_id}', display_id, data=b'',
+ headers={'Authorization': ViuOTTIndonesiaBaseIE._TOKEN, **self._HEADERS, 'ccode': 'ID'})
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_data['playUrl'], display_id)
+
+ initial_state = self._search_json(
+ r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state',
+ display_id)['content']['clipDetails']
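+ # clipDetails keys of the form 'subtitle_<lang>_<ext>' point at
+ # standalone subtitle files; merge them with the m3u8 subtitles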
+ for key, url in initial_state.items():
+ lang, ext = self._search_regex(
+ r'^subtitle_(?P<lang>[\w-]+)_(?P<ext>\w+)$', key, 'subtitle metadata',
+ default=(None, None), group=('lang', 'ext'))
+ if lang and ext:
+ subtitles.setdefault(lang, []).append({
+ 'ext': ext,
+ 'url': url,
+ })
+
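+ # assume a matching .srt rendition is hosted alongside each .vtt one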
+ if ext == 'vtt':
+ subtitles[lang].append({
+ 'ext': 'srt',
+ 'url': f'{remove_end(initial_state[key], "vtt")}srt',
+ })
+
+ episode = traverse_obj(list(filter(
+ lambda x: x.get('@type') in ('TVEpisode', 'Movie'), self._yield_json_ld(webpage, display_id))), 0) or {}
+ return {
+ 'id': display_id,
+ 'title': (traverse_obj(initial_state, 'title', 'display_title')
+ or episode.get('name')),
+ 'description': initial_state.get('description') or episode.get('description'),
+ 'duration': initial_state.get('duration'),
+ 'thumbnail': traverse_obj(episode, ('image', 'url')),
+ 'timestamp': unified_timestamp(episode.get('dateCreated')),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'episode_number': (traverse_obj(initial_state, 'episode_no', 'episodeno', expected_type=int_or_none)
+ or int_or_none(episode.get('episodeNumber'))),
+ 'cast': traverse_obj(episode, ('actor', ..., 'name'), default=None),
+ 'age_limit': self._AGE_RATINGS_MAPPER.get(initial_state.get('internal_age_rating')),
+ }
diff --git a/hypervideo_dl/extractor/vk.py b/hypervideo_dl/extractor/vk.py
index 347aa38..9154228 100644
--- a/hypervideo_dl/extractor/vk.py
+++ b/hypervideo_dl/extractor/vk.py
@@ -6,22 +6,28 @@ from .common import InfoExtractor
from .dailymotion import DailymotionIE
from .odnoklassniki import OdnoklassnikiIE
from .pladform import PladformIE
+from .sibnet import SibnetEmbedIE
from .vimeo import VimeoIE
from .youtube import YoutubeIE
-from ..compat import compat_urlparse
from ..utils import (
ExtractorError,
+ UserNotLive,
clean_html,
get_element_by_class,
+ get_element_html_by_id,
int_or_none,
- orderedSet,
+ join_nonempty,
+ parse_resolution,
str_or_none,
str_to_int,
+ try_call,
unescapeHTML,
unified_timestamp,
update_url_query,
url_or_none,
urlencode_postdata,
+ urljoin,
+ traverse_obj,
)
@@ -30,7 +36,7 @@ class VKBaseIE(InfoExtractor):
def _download_webpage_handle(self, url_or_request, video_id, *args, fatal=True, **kwargs):
response = super()._download_webpage_handle(url_or_request, video_id, *args, fatal=fatal, **kwargs)
- challenge_url, cookie = response[1].geturl() if response else '', None
+ challenge_url, cookie = response[1].url if response else '', None
if challenge_url.startswith('https://vk.com/429.html?'):
cookie = self._get_cookies(challenge_url).get('hash429')
if not cookie:
@@ -101,8 +107,7 @@ class VKIE(VKBaseIE):
(?P<videoid>-?\d+_\d+)(?:.*\blist=(?P<list_id>([\da-f]+)|(ln-[\da-zA-Z]+)))?
)
'''
- # https://help.sibnet.ru/?sibnet_video_embed
- _EMBED_REGEX = [r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//video\.sibnet\.ru/shell\.php\?.*?\bvideoid=\d+.*?)\1']
+
_TESTS = [
{
'url': 'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521',
@@ -117,7 +122,7 @@ class VKIE(VKBaseIE):
'upload_date': '20120212',
'comment_count': int,
'like_count': int,
- 'thumbnail': r're:https?://.+\.jpg$',
+ 'thumbnail': r're:https?://.+(?:\.jpg|getVideoPreview.*)$',
},
'params': {'skip_download': 'm3u8'},
},
@@ -134,7 +139,7 @@ class VKIE(VKBaseIE):
'upload_date': '20130720',
'comment_count': int,
'like_count': int,
- 'thumbnail': r're:https?://.+\.jpg$',
+ 'thumbnail': r're:https?://.+(?:\.jpg|getVideoPreview.*)$',
}
},
{
@@ -149,56 +154,11 @@ class VKIE(VKBaseIE):
'upload_date': '20120212',
'timestamp': 1329049880,
'uploader_id': '39545378',
- 'thumbnail': r're:https?://.+\.jpg$',
+ 'thumbnail': r're:https?://.+(?:\.jpg|getVideoPreview.*)$',
},
'params': {'skip_download': 'm3u8'},
},
{
- # VIDEO NOW REMOVED
- # please update if you find a video whose URL follows the same pattern
- 'url': 'http://vk.com/video-8871596_164049491',
- 'md5': 'a590bcaf3d543576c9bd162812387666',
- 'note': 'Only available for registered users',
- 'info_dict': {
- 'id': '-8871596_164049491',
- 'ext': 'mp4',
- 'uploader': 'Триллеры',
- 'title': '► Бойцовский клуб / Fight Club 1999 [HD 720]',
- 'duration': 8352,
- 'upload_date': '20121218',
- 'view_count': int,
- },
- 'skip': 'Removed',
- },
- {
- 'url': 'http://vk.com/hd_kino_mania?z=video-43215063_168067957%2F15c66b9b533119788d',
- 'info_dict': {
- 'id': '-43215063_168067957',
- 'ext': 'mp4',
- 'uploader': 'Bro Mazter',
- 'title': ' ',
- 'duration': 7291,
- 'upload_date': '20140328',
- 'uploader_id': '223413403',
- 'timestamp': 1396018030,
- },
- 'skip': 'Requires vk account credentials',
- },
- {
- 'url': 'http://m.vk.com/video-43215063_169084319?list=125c627d1aa1cebb83&from=wall-43215063_2566540',
- 'md5': '0c45586baa71b7cb1d0784ee3f4e00a6',
- 'note': 'ivi.ru embed',
- 'info_dict': {
- 'id': '-43215063_169084319',
- 'ext': 'mp4',
- 'title': 'Книга Илая',
- 'duration': 6771,
- 'upload_date': '20140626',
- 'view_count': int,
- },
- 'skip': 'Removed',
- },
- {
'url': 'https://vk.com/video-93049196_456239755?list=ln-cBjJ7S4jYYx3ADnmDT',
'info_dict': {
'id': '-93049196_456239755',
@@ -211,26 +171,11 @@ class VKIE(VKBaseIE):
'timestamp': 1640162189,
'upload_date': '20211222',
'uploader_id': '-93049196',
- 'thumbnail': r're:https?://.+\.jpg$',
+ 'thumbnail': r're:https?://.+(?:\.jpg|getVideoPreview.*)$',
},
},
{
- # video (removed?) only available with list id
- 'url': 'https://vk.com/video30481095_171201961?list=8764ae2d21f14088d4',
- 'md5': '091287af5402239a1051c37ec7b92913',
- 'info_dict': {
- 'id': '30481095_171201961',
- 'ext': 'mp4',
- 'title': 'ТюменцевВВ_09.07.2015',
- 'uploader': 'Anton Ivanov',
- 'duration': 109,
- 'upload_date': '20150709',
- 'view_count': int,
- },
- 'skip': 'Removed',
- },
- {
- # youtube embed
+ 'note': 'youtube embed',
'url': 'https://vk.com/video276849682_170681728',
'info_dict': {
'id': 'V3K4mi0SYkc',
@@ -254,23 +199,45 @@ class VKIE(VKBaseIE):
'start_time': 0.0,
'categories': ['Nonprofits & Activism'],
'channel_url': 'https://www.youtube.com/channel/UCgzCNQ11TmR9V97ECnhi3gw',
+ 'channel_follower_count': int,
'age_limit': 0,
},
},
{
- # dailymotion embed
- 'url': 'https://vk.com/video-37468416_456239855',
+ 'note': 'dailymotion embed',
+ 'url': 'https://vk.com/video-95168827_456239103?list=cca524a0f0d5557e16',
'info_dict': {
- 'id': 'k3lz2cmXyRuJQSjGHUv',
+ 'id': 'x8gfli0',
'ext': 'mp4',
- 'title': 'md5:d52606645c20b0ddbb21655adaa4f56f',
- 'description': 'md5:424b8e88cc873217f520e582ba28bb36',
- 'uploader': 'AniLibria.Tv',
- 'upload_date': '20160914',
- 'uploader_id': 'x1p5vl5',
- 'timestamp': 1473877246,
+ 'title': 'md5:45410f60ccd4b2760da98cb5fc777d70',
+ 'description': 'md5:2e71c5c9413735cfa06cf1a166f16c84',
+ 'uploader': 'Movies and cinema.',
+ 'upload_date': '20221218',
+ 'uploader_id': 'x1jdavv',
+ 'timestamp': 1671387617,
+ 'age_limit': 0,
+ 'duration': 2918,
+ 'like_count': int,
+ 'view_count': int,
+ 'thumbnail': r're:https?://.+x1080$',
+                'tags': list,
+ },
+ },
+ {
+ 'url': 'https://vk.com/clips-74006511?z=clip-74006511_456247211',
+ 'info_dict': {
+ 'id': '-74006511_456247211',
+ 'ext': 'mp4',
+ 'comment_count': int,
+ 'duration': 9,
+ 'like_count': int,
+ 'thumbnail': r're:https?://.+(?:\.jpg|getVideoPreview.*)$',
+ 'timestamp': 1664995597,
+ 'title': 'Clip by @madempress',
+ 'upload_date': '20221005',
+ 'uploader': 'Шальная императрица',
+ 'uploader_id': '-74006511',
},
- 'skip': 'Removed'
},
{
# video key is extra_data not url\d+
@@ -288,7 +255,7 @@ class VKIE(VKBaseIE):
'skip': 'Removed',
},
{
- # finished live stream, postlive_mp4
+ 'note': 'finished live stream, postlive_mp4',
'url': 'https://vk.com/videos-387766?z=video-387766_456242764%2Fpl_-387766_-2',
'info_dict': {
'id': '-387766_456242764',
@@ -455,7 +422,7 @@ class VKIE(VKBaseIE):
if odnoklassniki_url:
return self.url_result(odnoklassniki_url, OdnoklassnikiIE.ie_key())
- sibnet_url = next(self._extract_embed_urls(url, info_page), None)
+ sibnet_url = next(SibnetEmbedIE._extract_embed_urls(url, info_page), None)
if sibnet_url:
return self.url_result(sibnet_url)
@@ -552,7 +519,7 @@ class VKUserVideosIE(VKBaseIE):
}, {
'url': 'https://vk.com/video/playlist/-174476437_2',
'info_dict': {
- 'id': '-174476437_2',
+ 'id': '-174476437_playlist_2',
'title': 'Анонсы'
},
'playlist_mincount': 108,
@@ -595,6 +562,7 @@ class VKUserVideosIE(VKBaseIE):
page_id = self._search_regex(r'data-owner-id\s?=\s?"([^"]+)"', webpage, 'page_id')
elif '_' in u_id:
page_id, section = u_id.split('_', 1)
+ section = f'playlist_{section}'
else:
raise ExtractorError('Invalid URL', expected=True)
@@ -614,13 +582,13 @@ class VKWallPostIE(VKBaseIE):
'info_dict': {
'id': '-23538238_35',
'title': 'Black Shadow - Wall post -23538238_35',
- 'description': 'md5:3f84b9c4f9ef499731cf1ced9998cc0c',
+ 'description': 'md5:190c78f905a53e0de793d83933c6e67f',
},
'playlist': [{
'md5': '5ba93864ec5b85f7ce19a9af4af080f6',
'info_dict': {
'id': '135220665_111806521',
- 'ext': 'mp4',
+ 'ext': 'm4a',
'title': 'Black Shadow - Слепое Верование',
'duration': 370,
'uploader': 'Black Shadow',
@@ -631,7 +599,7 @@ class VKWallPostIE(VKBaseIE):
'md5': '4cc7e804579122b17ea95af7834c9233',
'info_dict': {
'id': '135220665_111802303',
- 'ext': 'mp4',
+ 'ext': 'm4a',
'title': 'Black Shadow - Война - Негасимое Бездны Пламя!',
'duration': 423,
'uploader': 'Black Shadow',
@@ -642,16 +610,15 @@ class VKWallPostIE(VKBaseIE):
'params': {
'skip_download': True,
},
- 'skip': 'Requires vk account credentials',
}, {
- # single YouTube embed, no leading -
- 'url': 'https://vk.com/wall85155021_6319',
+ # single YouTube embed with irrelevant reaction videos
+ 'url': 'https://vk.com/wall-32370614_7173954',
'info_dict': {
- 'id': '85155021_6319',
- 'title': 'Сергей Горбунов - Wall post 85155021_6319',
+ 'id': '-32370614_7173954',
+ 'title': 'md5:9f93c405bbc00061d34007d78c75e3bc',
+ 'description': 'md5:953b811f26fa9f21ee5856e2ea8e68fc',
},
'playlist_count': 1,
- 'skip': 'Requires vk account credentials',
}, {
# wall page URL
'url': 'https://vk.com/wall-23538238_35',
@@ -703,39 +670,173 @@ class VKWallPostIE(VKBaseIE):
'w': 'wall' + post_id,
})[1]
- description = clean_html(get_element_by_class('wall_post_text', webpage))
- uploader = clean_html(get_element_by_class('author', webpage))
+ uploader = clean_html(get_element_by_class('PostHeaderTitle__authorName', webpage))
entries = []
for audio in re.findall(r'data-audio="([^"]+)', webpage):
audio = self._parse_json(unescapeHTML(audio), post_id)
- a = self._AUDIO._make(audio[:16])
- if not a.url:
+ if not audio['url']:
continue
- title = unescapeHTML(a.title)
- performer = unescapeHTML(a.performer)
+ title = unescapeHTML(audio.get('title'))
+ artist = unescapeHTML(audio.get('artist'))
entries.append({
- 'id': '%s_%s' % (a.owner_id, a.id),
- 'url': self._unmask_url(a.url, a.ads['vk_id']),
- 'title': '%s - %s' % (performer, title) if performer else title,
- 'thumbnails': [{'url': c_url} for c_url in a.cover_url.split(',')] if a.cover_url else None,
- 'duration': int_or_none(a.duration),
+ 'id': f'{audio["owner_id"]}_{audio["id"]}',
+ 'title': join_nonempty(artist, title, delim=' - '),
+ 'thumbnails': try_call(lambda: [{'url': u} for u in audio['coverUrl'].split(',')]),
+ 'duration': int_or_none(audio.get('duration')),
'uploader': uploader,
- 'artist': performer,
+ 'artist': artist,
'track': title,
- 'ext': 'mp4',
- 'protocol': 'm3u8_native',
+ 'formats': [{
+ 'url': audio['url'],
+ 'ext': 'm4a',
+ 'vcodec': 'none',
+ 'acodec': 'mp3',
+ 'container': 'm4a_dash',
+ }],
})
- for video in re.finditer(
- r'<a[^>]+href=(["\'])(?P<url>/video(?:-?[\d_]+).*?)\1', webpage):
- entries.append(self.url_result(
- compat_urlparse.urljoin(url, video.group('url')), VKIE.ie_key()))
-
- title = 'Wall post %s' % post_id
+ entries.extend(self.url_result(urljoin(url, entry), VKIE) for entry in set(re.findall(
+ r'<a[^>]+href=(?:["\'])(/video(?:-?[\d_]+)[^"\']*)',
+ get_element_html_by_id('wl_post_body', webpage))))
return self.playlist_result(
- orderedSet(entries), post_id,
- '%s - %s' % (uploader, title) if uploader else title,
- description)
+ entries, post_id, join_nonempty(uploader, f'Wall post {post_id}', delim=' - '),
+ clean_html(get_element_by_class('wall_post_text', webpage)))
+
+
+class VKPlayBaseIE(InfoExtractor):
+ _RESOLUTIONS = {
+ 'tiny': '256x144',
+ 'lowest': '426x240',
+ 'low': '640x360',
+ 'medium': '852x480',
+ 'high': '1280x720',
+ 'full_hd': '1920x1080',
+ 'quad_hd': '2560x1440',
+ }
+
+ def _extract_from_initial_state(self, url, video_id, path):
+ webpage = self._download_webpage(url, video_id)
+ video_info = traverse_obj(self._search_json(
+ r'<script[^>]+\bid="initial-state"[^>]*>', webpage, 'initial state', video_id),
+ path, expected_type=dict)
+ if not video_info:
+ raise ExtractorError('Unable to extract video info from html inline initial state')
+ return video_info
+
+ def _extract_formats(self, stream_info, video_id):
+ formats = []
+ for stream in traverse_obj(stream_info, (
+ 'data', 0, 'playerUrls', lambda _, v: url_or_none(v['url']) and v['type'])):
+ url = stream['url']
+ format_id = str_or_none(stream['type'])
+ if format_id in ('hls', 'live_hls', 'live_playback_hls') or '.m3u8' in url:
+ formats.extend(self._extract_m3u8_formats(url, video_id, m3u8_id=format_id, fatal=False))
+ elif format_id == 'dash':
+ formats.extend(self._extract_mpd_formats(url, video_id, mpd_id=format_id, fatal=False))
+ elif format_id in ('live_dash', 'live_playback_dash'):
+ self.write_debug(f'Not extracting unsupported format "{format_id}"')
+ else:
+ formats.append({
+ 'url': url,
+ 'ext': 'mp4',
+ 'format_id': format_id,
+ **parse_resolution(self._RESOLUTIONS.get(format_id)),
+ })
+ return formats
+
+ def _extract_common_meta(self, stream_info):
+ return traverse_obj(stream_info, {
+ 'id': ('id', {str_or_none}),
+ 'title': ('title', {str}),
+ 'release_timestamp': ('startTime', {int_or_none}),
+ 'thumbnail': ('previewUrl', {url_or_none}),
+ 'view_count': ('count', 'views', {int_or_none}),
+ 'like_count': ('count', 'likes', {int_or_none}),
+ 'categories': ('category', 'title', {str}, {lambda x: [x] if x else None}),
+ 'uploader': (('user', ('blog', 'owner')), 'nick', {str}),
+ 'uploader_id': (('user', ('blog', 'owner')), 'id', {str_or_none}),
+ 'duration': ('duration', {int_or_none}),
+ 'is_live': ('isOnline', {bool}),
+ 'concurrent_view_count': ('count', 'viewers', {int_or_none}),
+ }, get_all=False)
+
+
+class VKPlayIE(VKPlayBaseIE):
+ _VALID_URL = r'https?://vkplay\.live/(?P<username>[^/#?]+)/record/(?P<id>[a-f0-9-]+)'
+ _TESTS = [{
+ 'url': 'https://vkplay.live/zitsmann/record/f5e6e3b5-dc52-4d14-965d-0680dd2882da',
+ 'info_dict': {
+ 'id': 'f5e6e3b5-dc52-4d14-965d-0680dd2882da',
+ 'ext': 'mp4',
+ 'title': 'Atomic Heart (пробуем!) спасибо подписчику EKZO!',
+ 'uploader': 'ZitsmanN',
+ 'uploader_id': '13159830',
+ 'release_timestamp': 1683461378,
+ 'release_date': '20230507',
+ 'thumbnail': r're:https://images.vkplay.live/public_video_stream/record/f5e6e3b5-dc52-4d14-965d-0680dd2882da/preview\?change_time=\d+',
+ 'duration': 10608,
+ 'view_count': int,
+ 'like_count': int,
+ 'categories': ['Atomic Heart'],
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }]
+
+ def _real_extract(self, url):
+ username, video_id = self._match_valid_url(url).groups()
+
+ record_info = traverse_obj(self._download_json(
+ f'https://api.vkplay.live/v1/blog/{username}/public_video_stream/record/{video_id}', video_id, fatal=False),
+ ('data', 'record', {dict}))
+ if not record_info:
+ record_info = self._extract_from_initial_state(url, video_id, ('record', 'currentRecord', 'data'))
+
+ return {
+ **self._extract_common_meta(record_info),
+ 'id': video_id,
+ 'formats': self._extract_formats(record_info, video_id),
+ }
+
+
+class VKPlayLiveIE(VKPlayBaseIE):
+ _VALID_URL = r'https?://vkplay\.live/(?P<id>[^/#?]+)/?(?:[#?]|$)'
+ _TESTS = [{
+ 'url': 'https://vkplay.live/bayda',
+ 'info_dict': {
+ 'id': 'f02c321e-427b-408d-b12f-ae34e53e0ea2',
+ 'ext': 'mp4',
+ 'title': r're:эскапизм крута .*',
+ 'uploader': 'Bayda',
+            'uploader_id': '12279401',
+ 'release_timestamp': 1687209962,
+ 'release_date': '20230619',
+ 'thumbnail': r're:https://images.vkplay.live/public_video_stream/12279401/preview\?change_time=\d+',
+ 'view_count': int,
+ 'concurrent_view_count': int,
+ 'like_count': int,
+ 'categories': ['EVE Online'],
+ 'live_status': 'is_live',
+ },
+ 'skip': 'livestream',
+ 'params': {'skip_download': True},
+ }]
+
+ def _real_extract(self, url):
+ username = self._match_id(url)
+
+ stream_info = self._download_json(
+ f'https://api.vkplay.live/v1/blog/{username}/public_video_stream', username, fatal=False)
+ if not stream_info:
+ stream_info = self._extract_from_initial_state(url, username, ('stream', 'stream', 'data', 'stream'))
+
+ formats = self._extract_formats(stream_info, username)
+ if not formats and not traverse_obj(stream_info, ('isOnline', {bool})):
+ raise UserNotLive(video_id=username)
+
+ return {
+ **self._extract_common_meta(stream_info),
+ 'formats': formats,
+ }
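
The VKPlay extractors above resolve their symbolic quality labels ('tiny', 'high', ...) through parse_resolution. A minimal sketch of that mapping, assuming a checkout of this repo on the import path:

    from hypervideo_dl.utils import parse_resolution

    RESOLUTIONS = {'tiny': '256x144', 'high': '1280x720', 'quad_hd': '2560x1440'}
    for format_id, res in RESOLUTIONS.items():
        # parse_resolution('1280x720') -> {'width': 1280, 'height': 720}
        print(format_id, parse_resolution(res))
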
diff --git a/hypervideo_dl/extractor/vocaroo.py b/hypervideo_dl/extractor/vocaroo.py
new file mode 100644
index 0000000..d98fbfd
--- /dev/null
+++ b/hypervideo_dl/extractor/vocaroo.py
@@ -0,0 +1,63 @@
+from .common import InfoExtractor
+from ..networking import HEADRequest
+from ..utils import float_or_none
+
+
+class VocarooIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?(?:vocaroo\.com|voca\.ro)/(?:embed/)?(?P<id>\w+)'
+ _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:www\.)?vocaroo\.com/embed/.+?)\1']
+ _TESTS = [
+ {
+ 'url': 'https://vocaroo.com/1de8yA3LNe77',
+ 'md5': 'c557841d5e50261777a6585648adf439',
+ 'info_dict': {
+ 'id': '1de8yA3LNe77',
+ 'ext': 'mp3',
+ 'title': 'Vocaroo video #1de8yA3LNe77',
+ 'timestamp': 1675059800.370,
+ 'upload_date': '20230130',
+ },
+ },
+ {
+ 'url': 'https://vocaroo.com/embed/12WqtjLnpj6g?autoplay=0',
+ 'only_matching': True,
+ },
+ {
+ 'url': 'https://voca.ro/12D52rgpzkB0',
+ 'only_matching': True,
+ },
+ ]
+
+ _WEBPAGE_TESTS = [
+ {
+ 'url': 'https://qbnu.github.io/cool.html',
+ 'md5': 'f322e529275dd8a47994919eeac404a5',
+ 'info_dict': {
+ 'id': '19cgWmKO6AmC',
+ 'ext': 'mp3',
+ 'title': 'Vocaroo video #19cgWmKO6AmC',
+ 'timestamp': 1675093841.408,
+ 'upload_date': '20230130',
+ },
+ },
+ ]
+
+ def _real_extract(self, url):
+ audio_id = self._match_id(url)
+ if len(audio_id) == 10 or (len(audio_id) == 12 and audio_id[0] == '1'):
+ media_subdomain = 'media1'
+ else:
+ media_subdomain = 'media'
+
+ url = f'https://{media_subdomain}.vocaroo.com/mp3/{audio_id}'
+ http_headers = {'Referer': 'https://vocaroo.com/'}
+ resp = self._request_webpage(HEADRequest(url), audio_id, headers=http_headers)
+ return {
+ 'id': audio_id,
+ 'title': '',
+ 'url': url,
+ 'ext': 'mp3',
+ 'timestamp': float_or_none(resp.getheader('x-bz-upload-timestamp'), scale=1000),
+ 'vcodec': 'none',
+ 'http_headers': http_headers,
+ }
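
The media-host pick in VocarooIE._real_extract keys off nothing but the shape of the audio ID. The same rule as a standalone sketch, checked against the ID from the first test case:

    def media_host(audio_id: str) -> str:
        # 10-char IDs, and 12-char IDs starting with '1', live on media1.vocaroo.com
        if len(audio_id) == 10 or (len(audio_id) == 12 and audio_id[0] == '1'):
            return 'media1'
        return 'media'

    assert media_host('1de8yA3LNe77') == 'media1'
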
diff --git a/hypervideo_dl/extractor/vodlocker.py b/hypervideo_dl/extractor/vodlocker.py
index 1c7236e..b215d6c 100644
--- a/hypervideo_dl/extractor/vodlocker.py
+++ b/hypervideo_dl/extractor/vodlocker.py
@@ -1,10 +1,6 @@
from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
- NO_DEFAULT,
- sanitized_Request,
- urlencode_postdata,
-)
+from ..networking import Request
+from ..utils import NO_DEFAULT, ExtractorError, urlencode_postdata
class VodlockerIE(InfoExtractor):
@@ -37,8 +33,8 @@ class VodlockerIE(InfoExtractor):
if fields['op'] == 'download1':
self._sleep(3, video_id) # they do detect when requests happen too fast!
post = urlencode_postdata(fields)
- req = sanitized_Request(url, post)
- req.add_header('Content-type', 'application/x-www-form-urlencoded')
+ req = Request(url, post)
+ req.headers['Content-type'] = 'application/x-www-form-urlencoded'
webpage = self._download_webpage(
req, video_id, 'Downloading video page')
diff --git a/hypervideo_dl/extractor/volejtv.py b/hypervideo_dl/extractor/volejtv.py
new file mode 100644
index 0000000..622d841
--- /dev/null
+++ b/hypervideo_dl/extractor/volejtv.py
@@ -0,0 +1,40 @@
+from .common import InfoExtractor
+
+
+class VolejTVIE(InfoExtractor):
+ _VALID_URL = r'https?://volej\.tv/video/(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://volej.tv/video/725742/',
+ 'info_dict': {
+ 'id': '725742',
+ 'ext': 'mp4',
+ 'description': 'Zápas VK Královo Pole vs VK Prostějov 10.12.2022 v 19:00 na Volej.TV',
+ 'thumbnail': 'https://volej.tv/images/og/16/17186/og.png',
+ 'title': 'VK Královo Pole vs VK Prostějov',
+ }
+ }, {
+ 'url': 'https://volej.tv/video/725605/',
+ 'info_dict': {
+ 'id': '725605',
+ 'ext': 'mp4',
+ 'thumbnail': 'https://volej.tv/images/og/15/17185/og.png',
+ 'title': 'VK Lvi Praha vs VK Euro Sitex Příbram',
+ 'description': 'Zápas VK Lvi Praha vs VK Euro Sitex Příbram 11.12.2022 v 19:00 na Volej.TV',
+ }
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
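+        # the player config JSON is assigned inside a <![CDATA[...]]> script
+        # block; match up to the '=' and parse the JSON that follows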
+ json_data = self._search_json(
+ r'<\s*!\[CDATA[^=]+=', webpage, 'CDATA', video_id)
+ formats, subtitle = self._extract_m3u8_formats_and_subtitles(
+ json_data['urls']['hls'], video_id)
+ return {
+ 'id': video_id,
+ 'title': self._html_search_meta(['og:title', 'twitter:title'], webpage),
+ 'thumbnail': self._html_search_meta(['og:image', 'twitter:image'], webpage),
+ 'description': self._html_search_meta(['description', 'og:description', 'twitter:description'], webpage),
+ 'formats': formats,
+ 'subtitles': subtitle,
+ }
diff --git a/hypervideo_dl/extractor/voot.py b/hypervideo_dl/extractor/voot.py
index b709b74..b19a279 100644
--- a/hypervideo_dl/extractor/voot.py
+++ b/hypervideo_dl/extractor/voot.py
@@ -1,14 +1,86 @@
+import json
+import time
+import uuid
+
from .common import InfoExtractor
from ..compat import compat_str
+from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
+ float_or_none,
int_or_none,
+ jwt_decode_hs256,
+ parse_age_limit,
+ traverse_obj,
+ try_call,
try_get,
- unified_timestamp,
+ unified_strdate,
)
-class VootIE(InfoExtractor):
+class VootBaseIE(InfoExtractor):
+ _NETRC_MACHINE = 'voot'
+ _GEO_BYPASS = False
+ _LOGIN_HINT = 'Log in with "-u <email_address> -p <password>", or use "-u token -p <auth_token>" to login with auth token.'
+ _TOKEN = None
+ _EXPIRY = 0
+ _API_HEADERS = {'Origin': 'https://www.voot.com', 'Referer': 'https://www.voot.com/'}
+
+ def _perform_login(self, username, password):
+ if self._TOKEN and self._EXPIRY:
+ return
+
+ if username.lower() == 'token' and try_call(lambda: jwt_decode_hs256(password)):
+ VootBaseIE._TOKEN = password
+ VootBaseIE._EXPIRY = jwt_decode_hs256(password)['exp']
+ self.report_login()
+
+ # Mobile number as username is not supported
+ elif not username.isdigit():
+ check_username = self._download_json(
+ 'https://userauth.voot.com/usersV3/v3/checkUser', None, data=json.dumps({
+ 'type': 'email',
+ 'email': username
+ }, separators=(',', ':')).encode(), headers={
+ **self._API_HEADERS,
+ 'Content-Type': 'application/json;charset=utf-8',
+ }, note='Checking username', expected_status=403)
+ if not traverse_obj(check_username, ('isExist', {bool})):
+ if traverse_obj(check_username, ('status', 'code', {int})) == 9999:
+ self.raise_geo_restricted(countries=['IN'])
+ raise ExtractorError('Incorrect username', expected=True)
+ auth_token = traverse_obj(self._download_json(
+ 'https://userauth.voot.com/usersV3/v3/login', None, data=json.dumps({
+ 'type': 'traditional',
+ 'deviceId': str(uuid.uuid4()),
+ 'deviceBrand': 'PC/MAC',
+ 'data': {
+ 'email': username,
+ 'password': password
+ }
+ }, separators=(',', ':')).encode(), headers={
+ **self._API_HEADERS,
+ 'Content-Type': 'application/json;charset=utf-8',
+ }, note='Logging in', expected_status=400), ('data', 'authToken', {dict}))
+ if not auth_token:
+ raise ExtractorError('Incorrect password', expected=True)
+ VootBaseIE._TOKEN = auth_token['accessToken']
+ VootBaseIE._EXPIRY = auth_token['expirationTime']
+
+ else:
+ raise ExtractorError(self._LOGIN_HINT, expected=True)
+
+ def _check_token_expiry(self):
+ if int(time.time()) >= self._EXPIRY:
+ raise ExtractorError('Access token has expired', expected=True)
+
+ def _real_initialize(self):
+ if not self._TOKEN:
+ self.raise_login_required(self._LOGIN_HINT, method=None)
+ self._check_token_expiry()
+
+
+class VootIE(VootBaseIE):
_VALID_URL = r'''(?x)
(?:
voot:|
@@ -20,27 +92,25 @@ class VootIE(InfoExtractor):
)
(?P<id>\d{3,})
'''
- _GEO_COUNTRIES = ['IN']
_TESTS = [{
'url': 'https://www.voot.com/shows/ishq-ka-rang-safed/1/360558/is-this-the-end-of-kamini-/441353',
'info_dict': {
- 'id': '0_8ledb18o',
+ 'id': '441353',
'ext': 'mp4',
- 'title': 'Ishq Ka Rang Safed - Season 01 - Episode 340',
+ 'title': 'Is this the end of Kamini?',
'description': 'md5:06291fbbbc4dcbe21235c40c262507c1',
- 'timestamp': 1472162937,
+ 'timestamp': 1472103000,
'upload_date': '20160825',
'series': 'Ishq Ka Rang Safed',
'season_number': 1,
'episode': 'Is this the end of Kamini?',
'episode_number': 340,
- 'view_count': int,
- 'like_count': int,
- },
- 'params': {
- 'skip_download': True,
+ 'release_date': '20160825',
+ 'season': 'Season 1',
+ 'age_limit': 13,
+ 'duration': 1146.0,
},
- 'expected_warnings': ['Failed to download m3u8 information'],
+ 'params': {'skip_download': 'm3u8'},
}, {
'url': 'https://www.voot.com/kids/characters/mighty-cat-masked-niyander-e-/400478/school-bag-disappears/440925',
'only_matching': True,
@@ -55,59 +125,50 @@ class VootIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
media_info = self._download_json(
- 'https://wapi.voot.com/ws/ott/getMediaInfo.json', video_id,
- query={
- 'platform': 'Web',
- 'pId': 2,
- 'mediaId': video_id,
- })
-
- status_code = try_get(media_info, lambda x: x['status']['code'], int)
- if status_code != 0:
- raise ExtractorError(media_info['status']['message'], expected=True)
-
- media = media_info['assets']
-
- entry_id = media['EntryId']
- title = media['MediaName']
- formats = self._extract_m3u8_formats(
- 'https://cdnapisec.kaltura.com/p/1982551/playManifest/pt/https/f/applehttp/t/web/e/' + entry_id,
- video_id, 'mp4', m3u8_id='hls')
-
- description, series, season_number, episode, episode_number = [None] * 5
-
- for meta in try_get(media, lambda x: x['Metas'], list) or []:
- key, value = meta.get('Key'), meta.get('Value')
- if not key or not value:
- continue
- if key == 'ContentSynopsis':
- description = value
- elif key == 'RefSeriesTitle':
- series = value
- elif key == 'RefSeriesSeason':
- season_number = int_or_none(value)
- elif key == 'EpisodeMainTitle':
- episode = value
- elif key == 'EpisodeNo':
- episode_number = int_or_none(value)
+ 'https://psapi.voot.com/jio/voot/v1/voot-web/content/query/asset-details', video_id,
+ query={'ids': f'include:{video_id}', 'responseType': 'common'}, headers={'accesstoken': self._TOKEN})
+
+ try:
+ m3u8_url = self._download_json(
+ 'https://vootapi.media.jio.com/playback/v1/playbackrights', video_id,
+ 'Downloading playback JSON', data=b'{}', headers={
+ **self.geo_verification_headers(),
+ **self._API_HEADERS,
+ 'Content-Type': 'application/json;charset=utf-8',
+ 'platform': 'androidwebdesktop',
+ 'vootid': video_id,
+ 'voottoken': self._TOKEN,
+ })['m3u8']
+ except ExtractorError as e:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 400:
+ self._check_token_expiry()
+ raise
+
+ formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', m3u8_id='hls')
+ self._remove_duplicate_formats(formats)
+
return {
- 'extractor_key': 'Kaltura',
- 'id': entry_id,
- 'title': title,
- 'description': description,
- 'series': series,
- 'season_number': season_number,
- 'episode': episode,
- 'episode_number': episode_number,
- 'timestamp': unified_timestamp(media.get('CreationDate')),
- 'duration': int_or_none(media.get('Duration')),
- 'view_count': int_or_none(media.get('ViewCounter')),
- 'like_count': int_or_none(media.get('like_counter')),
- 'formats': formats,
+ 'id': video_id,
+ # '/_definst_/smil:vod/' m3u8 manifests claim to have 720p+ formats but max out at 480p
+ 'formats': traverse_obj(formats, (
+ lambda _, v: '/_definst_/smil:vod/' not in v['url'] or v['height'] <= 480)),
+ 'http_headers': self._API_HEADERS,
+ **traverse_obj(media_info, ('result', 0, {
+ 'title': ('fullTitle', {str}),
+ 'description': ('fullSynopsis', {str}),
+ 'series': ('showName', {str}),
+ 'season_number': ('season', {int_or_none}),
+ 'episode': ('fullTitle', {str}),
+ 'episode_number': ('episode', {int_or_none}),
+ 'timestamp': ('uploadTime', {int_or_none}),
+ 'release_date': ('telecastDate', {unified_strdate}),
+ 'age_limit': ('ageNemonic', {parse_age_limit}),
+ 'duration': ('duration', {float_or_none}),
+ })),
}
-class VootSeriesIE(InfoExtractor):
+class VootSeriesIE(VootBaseIE):
_VALID_URL = r'https?://(?:www\.)?voot\.com/shows/[^/]+/(?P<id>\d{3,})'
_TESTS = [{
'url': 'https://www.voot.com/shows/chakravartin-ashoka-samrat/100002',
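
VootBaseIE above accepts "-u token -p <auth_token>" and reads the expiry straight out of the JWT payload. A minimal sketch of the same expiry check using the utils helper from this repo; any well-formed JWT works as input:

    import time

    from hypervideo_dl.utils import jwt_decode_hs256

    def token_still_valid(auth_token: str) -> bool:
        # decodes the payload only; the signature is not verified
        return int(time.time()) < jwt_decode_hs256(auth_token)['exp']
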
diff --git a/hypervideo_dl/extractor/vrt.py b/hypervideo_dl/extractor/vrt.py
index 26f48bf..497233d 100644
--- a/hypervideo_dl/extractor/vrt.py
+++ b/hypervideo_dl/extractor/vrt.py
@@ -1,45 +1,139 @@
-from .common import InfoExtractor
+import functools
+import json
+import time
+import urllib.parse
+
+from .gigya import GigyaBaseIE
+from ..networking.exceptions import HTTPError
from ..utils import (
+ ExtractorError,
+ clean_html,
extract_attributes,
float_or_none,
get_element_by_class,
+ get_element_html_by_class,
+ int_or_none,
+ join_nonempty,
+ jwt_encode_hs256,
+ make_archive_id,
+ parse_age_limit,
+ parse_iso8601,
+ str_or_none,
strip_or_none,
- unified_timestamp,
+ traverse_obj,
+ url_or_none,
+ urlencode_postdata,
)
-class VRTIE(InfoExtractor):
+class VRTBaseIE(GigyaBaseIE):
+ _GEO_BYPASS = False
+ _PLAYER_INFO = {
+ 'platform': 'desktop',
+ 'app': {
+ 'type': 'browser',
+ 'name': 'Chrome',
+ },
+ 'device': 'undefined (undefined)',
+ 'os': {
+ 'name': 'Windows',
+ 'version': 'x86_64'
+ },
+ 'player': {
+ 'name': 'VRT web player',
+ 'version': '2.7.4-prod-2023-04-19T06:05:45'
+ }
+ }
+ # From https://player.vrt.be/vrtnws/js/main.js & https://player.vrt.be/ketnet/js/main.8cdb11341bcb79e4cd44.js
+ _JWT_KEY_ID = '0-0Fp51UZykfaiCJrfTE3+oMI8zvDteYfPtR+2n1R+z8w='
+ _JWT_SIGNING_KEY = 'b5f500d55cb44715107249ccd8a5c0136cfb2788dbb71b90a4f142423bacaf38' # -dev
+ # player-stag.vrt.be key: d23987504521ae6fbf2716caca6700a24bb1579477b43c84e146b279de5ca595
+ # player.vrt.be key: 2a9251d782700769fb856da5725daf38661874ca6f80ae7dc2b05ec1a81a24ae
+
+ def _extract_formats_and_subtitles(self, data, video_id):
+ if traverse_obj(data, 'drm'):
+ self.report_drm(video_id)
+
+ formats, subtitles = [], {}
+ for target in traverse_obj(data, ('targetUrls', lambda _, v: url_or_none(v['url']) and v['type'])):
+ format_type = target['type'].upper()
+ format_url = target['url']
+ if format_type in ('HLS', 'HLS_AES'):
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(
+ format_url, video_id, 'mp4', m3u8_id=format_type, fatal=False)
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=subtitles)
+ elif format_type == 'HDS':
+ formats.extend(self._extract_f4m_formats(
+ format_url, video_id, f4m_id=format_type, fatal=False))
+ elif format_type == 'MPEG_DASH':
+ fmts, subs = self._extract_mpd_formats_and_subtitles(
+ format_url, video_id, mpd_id=format_type, fatal=False)
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=subtitles)
+ elif format_type == 'HSS':
+ fmts, subs = self._extract_ism_formats_and_subtitles(
+ format_url, video_id, ism_id='mss', fatal=False)
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=subtitles)
+ else:
+ formats.append({
+ 'format_id': format_type,
+ 'url': format_url,
+ })
+
+ for sub in traverse_obj(data, ('subtitleUrls', lambda _, v: v['url'] and v['type'] == 'CLOSED')):
+ subtitles.setdefault('nl', []).append({'url': sub['url']})
+
+ return formats, subtitles
+
+ def _call_api(self, video_id, client='null', id_token=None, version='v2'):
+ player_info = {'exp': (round(time.time(), 3) + 900), **self._PLAYER_INFO}
+ player_token = self._download_json(
+ 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v2/tokens',
+ video_id, 'Downloading player token', headers={
+ **self.geo_verification_headers(),
+ 'Content-Type': 'application/json',
+ }, data=json.dumps({
+ 'identityToken': id_token or {},
+ 'playerInfo': jwt_encode_hs256(player_info, self._JWT_SIGNING_KEY, headers={
+ 'kid': self._JWT_KEY_ID
+ }).decode()
+ }, separators=(',', ':')).encode())['vrtPlayerToken']
+
+ return self._download_json(
+ f'https://media-services-public.vrt.be/media-aggregator/{version}/media-items/{video_id}',
+ video_id, 'Downloading API JSON', query={
+ 'vrtPlayerToken': player_token,
+ 'client': client,
+ }, expected_status=400)
+
+
+class VRTIE(VRTBaseIE):
IE_DESC = 'VRT NWS, Flanders News, Flandern Info and Sporza'
_VALID_URL = r'https?://(?:www\.)?(?P<site>vrt\.be/vrtnws|sporza\.be)/[a-z]{2}/\d{4}/\d{2}/\d{2}/(?P<id>[^/?&#]+)'
_TESTS = [{
'url': 'https://www.vrt.be/vrtnws/nl/2019/05/15/beelden-van-binnenkant-notre-dame-een-maand-na-de-brand/',
- 'md5': 'e1663accf5cf13f375f3cd0d10476669',
'info_dict': {
'id': 'pbs-pub-7855fc7b-1448-49bc-b073-316cb60caa71$vid-2ca50305-c38a-4762-9890-65cbd098b7bd',
'ext': 'mp4',
'title': 'Beelden van binnenkant Notre-Dame, één maand na de brand',
- 'description': 'Op maandagavond 15 april ging een deel van het dakgebinte van de Parijse kathedraal in vlammen op.',
- 'timestamp': 1557924660,
- 'upload_date': '20190515',
+ 'description': 'md5:6fd85f999b2d1841aa5568f4bf02c3ff',
'duration': 31.2,
+ 'thumbnail': 'https://images.vrt.be/orig/2019/05/15/2d914d61-7710-11e9-abcc-02b7b76bf47f.jpg',
},
+ 'params': {'skip_download': 'm3u8'},
}, {
'url': 'https://sporza.be/nl/2019/05/15/de-belgian-cats-zijn-klaar-voor-het-ek/',
- 'md5': '910bba927566e9ab992278f647eb4b75',
'info_dict': {
'id': 'pbs-pub-f2c86a46-8138-413a-a4b9-a0015a16ce2c$vid-1f112b31-e58e-4379-908d-aca6d80f8818',
'ext': 'mp4',
- 'title': 'De Belgian Cats zijn klaar voor het EK mét Ann Wauters',
- 'timestamp': 1557923760,
- 'upload_date': '20190515',
+ 'title': 'De Belgian Cats zijn klaar voor het EK',
+ 'description': 'Video: De Belgian Cats zijn klaar voor het EK mét Ann Wauters | basketbal, sport in het journaal',
'duration': 115.17,
+ 'thumbnail': 'https://images.vrt.be/orig/2019/05/15/11c0dba3-770e-11e9-abcc-02b7b76bf47f.jpg',
},
- }, {
- 'url': 'https://www.vrt.be/vrtnws/en/2019/05/15/belgium_s-eurovision-entry-falls-at-the-first-hurdle/',
- 'only_matching': True,
- }, {
- 'url': 'https://www.vrt.be/vrtnws/de/2019/05/15/aus-fuer-eliott-im-halbfinale-des-eurosongfestivals/',
- 'only_matching': True,
+ 'params': {'skip_download': 'm3u8'},
}]
_CLIENT_MAP = {
'vrt.be/vrtnws': 'vrtnieuws',
@@ -49,34 +143,285 @@ class VRTIE(InfoExtractor):
def _real_extract(self, url):
site, display_id = self._match_valid_url(url).groups()
webpage = self._download_webpage(url, display_id)
- attrs = extract_attributes(self._search_regex(
- r'(<[^>]+class="vrtvideo( [^"]*)?"[^>]*>)', webpage, 'vrt video'))
+ attrs = extract_attributes(get_element_html_by_class('vrtvideo', webpage) or '')
- asset_id = attrs['data-video-id']
- publication_id = attrs.get('data-publication-id')
+ asset_id = attrs.get('data-video-id') or attrs['data-videoid']
+ publication_id = traverse_obj(attrs, 'data-publication-id', 'data-publicationid')
if publication_id:
- asset_id = publication_id + '$' + asset_id
- client = attrs.get('data-client-code') or self._CLIENT_MAP[site]
+ asset_id = f'{publication_id}${asset_id}'
+ client = traverse_obj(attrs, 'data-client-code', 'data-client') or self._CLIENT_MAP[site]
+
+ data = self._call_api(asset_id, client)
+ formats, subtitles = self._extract_formats_and_subtitles(data, asset_id)
- title = strip_or_none(get_element_by_class(
- 'vrt-title', webpage) or self._html_search_meta(
- ['og:title', 'twitter:title', 'name'], webpage))
description = self._html_search_meta(
['og:description', 'twitter:description', 'description'], webpage)
if description == '…':
description = None
- timestamp = unified_timestamp(self._html_search_meta(
- 'article:published_time', webpage))
return {
- '_type': 'url_transparent',
'id': asset_id,
- 'display_id': display_id,
- 'title': title,
+ 'formats': formats,
+ 'subtitles': subtitles,
'description': description,
- 'thumbnail': attrs.get('data-posterimage'),
- 'timestamp': timestamp,
+ 'thumbnail': url_or_none(attrs.get('data-posterimage')),
'duration': float_or_none(attrs.get('data-duration'), 1000),
- 'url': 'https://mediazone.vrt.be/api/v1/%s/assets/%s' % (client, asset_id),
- 'ie_key': 'Canvas',
+ '_old_archive_ids': [make_archive_id('Canvas', asset_id)],
+ **traverse_obj(data, {
+ 'title': ('title', {str}),
+ 'description': ('shortDescription', {str}),
+ 'duration': ('duration', {functools.partial(float_or_none, scale=1000)}),
+ 'thumbnail': ('posterImageUrl', {url_or_none}),
+ }),
+ }
+
+
+class VrtNUIE(VRTBaseIE):
+ IE_DESC = 'VRT MAX'
+ _VALID_URL = r'https?://(?:www\.)?vrt\.be/vrtnu/a-z/(?:[^/]+/){2}(?P<id>[^/?#&]+)'
+ _TESTS = [{
+ # CONTENT_IS_AGE_RESTRICTED
+ 'url': 'https://www.vrt.be/vrtnu/a-z/de-ideale-wereld/2023-vj/de-ideale-wereld-d20230116/',
+ 'info_dict': {
+ 'id': 'pbs-pub-855b00a8-6ce2-4032-ac4f-1fcf3ae78524$vid-d2243aa1-ec46-4e34-a55b-92568459906f',
+ 'ext': 'mp4',
+ 'title': 'Tom Waes',
+ 'description': 'Satirisch actualiteitenmagazine met Ella Leyers. Tom Waes is te gast.',
+ 'timestamp': 1673905125,
+ 'release_timestamp': 1673905125,
+ 'series': 'De ideale wereld',
+ 'season_id': '1672830988794',
+ 'episode': 'Aflevering 1',
+ 'episode_number': 1,
+ 'episode_id': '1672830988861',
+ 'display_id': 'de-ideale-wereld-d20230116',
+ 'channel': 'VRT',
+ 'duration': 1939.0,
+ 'thumbnail': 'https://images.vrt.be/orig/2023/01/10/1bb39cb3-9115-11ed-b07d-02b7b76bf47f.jpg',
+ 'release_date': '20230116',
+ 'upload_date': '20230116',
+ 'age_limit': 12,
+ },
+ }, {
+ 'url': 'https://www.vrt.be/vrtnu/a-z/buurman--wat-doet-u-nu-/6/buurman--wat-doet-u-nu--s6-trailer/',
+ 'info_dict': {
+ 'id': 'pbs-pub-ad4050eb-d9e5-48c2-9ec8-b6c355032361$vid-0465537a-34a8-4617-8352-4d8d983b4eee',
+ 'ext': 'mp4',
+ 'title': 'Trailer seizoen 6 \'Buurman, wat doet u nu?\'',
+ 'description': 'md5:197424726c61384b4e5c519f16c0cf02',
+ 'timestamp': 1652940000,
+ 'release_timestamp': 1652940000,
+ 'series': 'Buurman, wat doet u nu?',
+ 'season': 'Seizoen 6',
+ 'season_number': 6,
+ 'season_id': '1652344200907',
+ 'episode': 'Aflevering 0',
+ 'episode_number': 0,
+ 'episode_id': '1652951873524',
+ 'display_id': 'buurman--wat-doet-u-nu--s6-trailer',
+ 'channel': 'VRT',
+ 'duration': 33.13,
+ 'thumbnail': 'https://images.vrt.be/orig/2022/05/23/3c234d21-da83-11ec-b07d-02b7b76bf47f.jpg',
+ 'release_date': '20220519',
+ 'upload_date': '20220519',
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }]
+ _NETRC_MACHINE = 'vrtnu'
+ _authenticated = False
+
+ def _perform_login(self, username, password):
+ auth_info = self._gigya_login({
+ 'APIKey': '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy',
+ 'targetEnv': 'jssdk',
+ 'loginID': username,
+ 'password': password,
+ 'authMode': 'cookie',
+ })
+
+ if auth_info.get('errorDetails'):
+ raise ExtractorError(f'Unable to login. VrtNU said: {auth_info["errorDetails"]}', expected=True)
+
+ # Sometimes authentication fails for no good reason, retry
+ for retry in self.RetryManager():
+ if retry.attempt > 1:
+ self._sleep(1, None)
+ try:
+ self._request_webpage(
+ 'https://token.vrt.be/vrtnuinitlogin', None, note='Requesting XSRF Token',
+ errnote='Could not get XSRF Token', query={
+ 'provider': 'site',
+ 'destination': 'https://www.vrt.be/vrtnu/',
+ })
+ self._request_webpage(
+ 'https://login.vrt.be/perform_login', None,
+ note='Performing login', errnote='Login failed',
+ query={'client_id': 'vrtnu-site'}, data=urlencode_postdata({
+ 'UID': auth_info['UID'],
+ 'UIDSignature': auth_info['UIDSignature'],
+ 'signatureTimestamp': auth_info['signatureTimestamp'],
+ '_csrf': self._get_cookies('https://login.vrt.be').get('OIDCXSRF').value,
+ }))
+ except ExtractorError as e:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 401:
+ retry.error = e
+ continue
+ raise
+
+ self._authenticated = True
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ parsed_url = urllib.parse.urlparse(url)
+ details = self._download_json(
+ f'{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.rstrip("/")}.model.json',
+ display_id, 'Downloading asset JSON', 'Unable to download asset JSON')['details']
+
+ watch_info = traverse_obj(details, (
+ 'actions', lambda _, v: v['type'] == 'watch-episode', {dict}), get_all=False) or {}
+ video_id = join_nonempty(
+ 'episodePublicationId', 'episodeVideoId', delim='$', from_dict=watch_info)
+ if '$' not in video_id:
+ raise ExtractorError('Unable to extract video ID')
+
+ vrtnutoken = self._download_json(
+ 'https://token.vrt.be/refreshtoken', video_id, note='Retrieving vrtnutoken',
+ errnote='Token refresh failed')['vrtnutoken'] if self._authenticated else None
+
+ video_info = self._call_api(video_id, 'vrtnu-web@PROD', vrtnutoken)
+
+ if 'title' not in video_info:
+ code = video_info.get('code')
+ if code in ('AUTHENTICATION_REQUIRED', 'CONTENT_IS_AGE_RESTRICTED'):
+ self.raise_login_required(code, method='password')
+ elif code in ('INVALID_LOCATION', 'CONTENT_AVAILABLE_ONLY_IN_BE'):
+ self.raise_geo_restricted(countries=['BE'])
+ elif code == 'CONTENT_AVAILABLE_ONLY_FOR_BE_RESIDENTS_AND_EXPATS':
+ if not self._authenticated:
+ self.raise_login_required(code, method='password')
+ self.raise_geo_restricted(countries=['BE'])
+ raise ExtractorError(code, expected=True)
+
+ formats, subtitles = self._extract_formats_and_subtitles(video_info, video_id)
+
+ return {
+ **traverse_obj(details, {
+ 'title': 'title',
+ 'description': ('description', {clean_html}),
+ 'timestamp': ('data', 'episode', 'onTime', 'raw', {parse_iso8601}),
+ 'release_timestamp': ('data', 'episode', 'onTime', 'raw', {parse_iso8601}),
+ 'series': ('data', 'program', 'title'),
+ 'season': ('data', 'season', 'title', 'value'),
+ 'season_number': ('data', 'season', 'title', 'raw', {int_or_none}),
+ 'season_id': ('data', 'season', 'id', {str_or_none}),
+ 'episode': ('data', 'episode', 'number', 'value', {str_or_none}),
+ 'episode_number': ('data', 'episode', 'number', 'raw', {int_or_none}),
+ 'episode_id': ('data', 'episode', 'id', {str_or_none}),
+ 'age_limit': ('data', 'episode', 'age', 'raw', {parse_age_limit}),
+ }),
+ 'id': video_id,
+ 'display_id': display_id,
+ 'channel': 'VRT',
+ 'formats': formats,
+ 'duration': float_or_none(video_info.get('duration'), 1000),
+ 'thumbnail': url_or_none(video_info.get('posterImageUrl')),
+ 'subtitles': subtitles,
+ '_old_archive_ids': [make_archive_id('Canvas', video_id)],
+ }
+
+
+class KetnetIE(VRTBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?ketnet\.be/(?P<id>(?:[^/]+/)*[^/?#&]+)'
+ _TESTS = [{
+ 'url': 'https://www.ketnet.be/kijken/m/meisjes/6/meisjes-s6a5',
+ 'info_dict': {
+ 'id': 'pbs-pub-39f8351c-a0a0-43e6-8394-205d597d6162$vid-5e306921-a9aa-4fa9-9f39-5b82c8f1028e',
+ 'ext': 'mp4',
+ 'title': 'Meisjes',
+ 'episode': 'Reeks 6: Week 5',
+ 'season': 'Reeks 6',
+ 'series': 'Meisjes',
+ 'timestamp': 1685251800,
+ 'upload_date': '20230528',
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+
+ video = self._download_json(
+ 'https://senior-bff.ketnet.be/graphql', display_id, query={
+ 'query': '''{
+ video(id: "content/ketnet/nl/%s.model.json") {
+ description
+ episodeNr
+ imageUrl
+ mediaReference
+ programTitle
+ publicationDate
+ seasonTitle
+ subtitleVideodetail
+ titleVideodetail
+ }
+}''' % display_id,
+ })['data']['video']
+
+ video_id = urllib.parse.unquote(video['mediaReference'])
+ data = self._call_api(video_id, 'ketnet@PROD', version='v1')
+ formats, subtitles = self._extract_formats_and_subtitles(data, video_id)
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ '_old_archive_ids': [make_archive_id('Canvas', video_id)],
+ **traverse_obj(video, {
+ 'title': ('titleVideodetail', {str}),
+ 'description': ('description', {str}),
+ 'thumbnail': ('thumbnail', {url_or_none}),
+ 'timestamp': ('publicationDate', {parse_iso8601}),
+ 'series': ('programTitle', {str}),
+ 'season': ('seasonTitle', {str}),
+ 'episode': ('subtitleVideodetail', {str}),
+ 'episode_number': ('episodeNr', {int_or_none}),
+ }),
+ }
+
+
+class DagelijkseKostIE(VRTBaseIE):
+ IE_DESC = 'dagelijksekost.een.be'
+ _VALID_URL = r'https?://dagelijksekost\.een\.be/gerechten/(?P<id>[^/?#&]+)'
+ _TESTS = [{
+ 'url': 'https://dagelijksekost.een.be/gerechten/hachis-parmentier-met-witloof',
+ 'info_dict': {
+ 'id': 'md-ast-27a4d1ff-7d7b-425e-b84f-a4d227f592fa',
+ 'ext': 'mp4',
+ 'title': 'Hachis parmentier met witloof',
+ 'description': 'md5:9960478392d87f63567b5b117688cdc5',
+ 'display_id': 'hachis-parmentier-met-witloof',
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ video_id = self._html_search_regex(
+ r'data-url=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id', group='id')
+
+ data = self._call_api(video_id, 'dako@prod', version='v1')
+ formats, subtitles = self._extract_formats_and_subtitles(data, video_id)
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'display_id': display_id,
+ 'title': strip_or_none(get_element_by_class(
+ 'dish-metadata__title', webpage) or self._html_search_meta('twitter:title', webpage)),
+ 'description': clean_html(get_element_by_class(
+ 'dish-description', webpage)) or self._html_search_meta(
+ ['description', 'twitter:description', 'og:description'], webpage),
+ '_old_archive_ids': [make_archive_id('Canvas', video_id)],
}
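
The vrtPlayerToken request in VRTBaseIE._call_api signs the advertised player info as an HS256 JWT with the key lifted from the player JS. A sketch of just the signing step, reusing the constants from the diff above:

    import time

    from hypervideo_dl.utils import jwt_encode_hs256

    KEY_ID = '0-0Fp51UZykfaiCJrfTE3+oMI8zvDteYfPtR+2n1R+z8w='
    SIGNING_KEY = 'b5f500d55cb44715107249ccd8a5c0136cfb2788dbb71b90a4f142423bacaf38'

    player_info = {'exp': round(time.time(), 3) + 900, 'platform': 'desktop'}
    token = jwt_encode_hs256(player_info, SIGNING_KEY, headers={'kid': KEY_ID}).decode()
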
diff --git a/hypervideo_dl/extractor/vrv.py b/hypervideo_dl/extractor/vrv.py
index 89fa7af..523c442 100644
--- a/hypervideo_dl/extractor/vrv.py
+++ b/hypervideo_dl/extractor/vrv.py
@@ -8,7 +8,8 @@ import time
import urllib.parse
from .common import InfoExtractor
-from ..compat import compat_HTTPError, compat_urllib_parse_urlencode
+from ..compat import compat_urllib_parse_urlencode
+from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
float_or_none,
@@ -30,7 +31,7 @@ class VRVBaseIE(InfoExtractor):
base_url = self._API_DOMAIN + '/core/' + path
query = [
('oauth_consumer_key', self._API_PARAMS['oAuthKey']),
- ('oauth_nonce', ''.join([random.choice(string.ascii_letters) for _ in range(32)])),
+ ('oauth_nonce', ''.join(random.choices(string.ascii_letters, k=32))),
('oauth_signature_method', 'HMAC-SHA1'),
('oauth_timestamp', int(time.time())),
]
@@ -54,8 +55,8 @@ class VRVBaseIE(InfoExtractor):
'?'.join([base_url, encoded_query]), video_id,
note='Downloading %s JSON metadata' % note, headers=headers, data=data)
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
- raise ExtractorError(json.loads(e.cause.read().decode())['message'], expected=True)
+ if isinstance(e.cause, HTTPError) and e.cause.status == 401:
+ raise ExtractorError(json.loads(e.cause.response.read().decode())['message'], expected=True)
raise
def _call_cms(self, path, video_id, note):
diff --git a/hypervideo_dl/extractor/vshare.py b/hypervideo_dl/extractor/vshare.py
index 1bc7ae4..443ed43 100644
--- a/hypervideo_dl/extractor/vshare.py
+++ b/hypervideo_dl/extractor/vshare.py
@@ -22,7 +22,7 @@ class VShareIE(InfoExtractor):
packed = self._search_regex(
r'(eval\(function.+)', webpage, 'packed code')
unpacked = decode_packed_codes(packed)
- digits = self._search_regex(r'\[((?:\d+,?)+)\]', unpacked, 'digits')
+ digits = self._search_regex(r'\[([\d,]+)\]', unpacked, 'digits')
digits = [int(digit) for digit in digits.split(',')]
key_digit = self._search_regex(
r'fromCharCode\(.+?(\d+)\)}', unpacked, 'key digit')
diff --git a/hypervideo_dl/extractor/vzaar.py b/hypervideo_dl/extractor/vzaar.py
index 6b9817c..19908a9 100644
--- a/hypervideo_dl/extractor/vzaar.py
+++ b/hypervideo_dl/extractor/vzaar.py
@@ -87,7 +87,7 @@ class VzaarIE(InfoExtractor):
m3u8_id='hls', fatal=False)
if hls_aes:
for f in m3u8_formats:
- f['_decryption_key_url'] = url_templ % ('goose', '') + qs
+ f['hls_aes'] = {'uri': url_templ % ('goose', '') + qs}
formats.extend(m3u8_formats)
return {
diff --git a/hypervideo_dl/extractor/wat.py b/hypervideo_dl/extractor/wat.py
index 7c62d28..9ea3fdd 100644
--- a/hypervideo_dl/extractor/wat.py
+++ b/hypervideo_dl/extractor/wat.py
@@ -41,6 +41,18 @@ class WatIE(InfoExtractor):
'expected_warnings': ["Ce contenu n'est pas disponible pour l'instant."],
'skip': 'This content is no longer available',
},
+ {
+ 'url': 'wat:14010600',
+ 'info_dict': {
+ 'id': '14010600',
+ 'title': 'Burger Quiz - S03 EP21 avec Eye Haidara, Anne Depétrini, Jonathan Zaccaï et Pio Marmaï',
+ 'thumbnail': 'https://photos.tf1.fr/1280/720/burger-quiz-11-9adb79-0@1x.jpg',
+ 'upload_date': '20230819',
+ 'duration': 2312,
+ 'ext': 'mp4',
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }
]
_GEO_BYPASS = False
@@ -54,7 +66,7 @@ class WatIE(InfoExtractor):
# 'http://www.wat.tv/interface/contentv4s/' + video_id, video_id)
video_data = self._download_json(
'https://mediainfo.tf1.fr/mediainfocombo/' + video_id,
- video_id, query={'context': 'MYTF1', 'pver': '4020003'})
+ video_id, query={'pver': '5010000'})
video_info = video_data['media']
error_desc = video_info.get('error_desc')
diff --git a/hypervideo_dl/extractor/webcamerapl.py b/hypervideo_dl/extractor/webcamerapl.py
new file mode 100644
index 0000000..a02d951
--- /dev/null
+++ b/hypervideo_dl/extractor/webcamerapl.py
@@ -0,0 +1,44 @@
+import codecs
+
+from .common import InfoExtractor
+
+
+class WebcameraplIE(InfoExtractor):
+ _VALID_URL = r'https?://(?P<id>[\w-]+)\.webcamera\.pl'
+ _TESTS = [{
+ 'url': 'https://warszawa-plac-zamkowy.webcamera.pl',
+ 'info_dict': {
+ 'id': 'warszawa-plac-zamkowy',
+ 'ext': 'mp4',
+ 'title': r're:WIDOK NA PLAC ZAMKOWY W WARSZAWIE \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
+ 'live_status': 'is_live',
+ }
+ }, {
+ 'url': 'https://gdansk-stare-miasto.webcamera.pl/',
+ 'info_dict': {
+ 'id': 'gdansk-stare-miasto',
+ 'ext': 'mp4',
+ 'title': r're:GDAŃSK - widok na Stare Miasto \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
+ 'live_status': 'is_live',
+ }
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ rot13_m3u8_url = self._search_regex(r'data-src\s*=\s*"(uggc[^"]+\.z3h8)"',
+ webpage, 'm3u8 url', default=None)
+ if not rot13_m3u8_url:
+ self.raise_no_formats('No video/audio found at the provided url', expected=True)
+
+ m3u8_url = codecs.decode(rot13_m3u8_url, 'rot-13')
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, live=True)
+
+ return {
+ 'id': video_id,
+ 'title': self._html_search_regex(r'<h1\b[^>]*>([^>]+)</h1>', webpage, 'title'),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'is_live': True,
+ }
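
webcamera.pl only obfuscates its stream URL with ROT13 ('uggc...' is 'http...'), so the decode is pure stdlib; the URL below is an illustrative placeholder:

    import codecs

    # decodes to 'https://example.com/stream.m3u8'
    print(codecs.decode('uggcf://rknzcyr.pbz/fgernz.z3h8', 'rot-13'))
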
diff --git a/hypervideo_dl/extractor/weibo.py b/hypervideo_dl/extractor/weibo.py
index 81a23b9..bc9a71a 100644
--- a/hypervideo_dl/extractor/weibo.py
+++ b/hypervideo_dl/extractor/weibo.py
@@ -31,7 +31,7 @@ class WeiboIE(InfoExtractor):
# to get Referer url for genvisitor
webpage, urlh = self._download_webpage_handle(url, video_id)
- visitor_url = urlh.geturl()
+ visitor_url = urlh.url
if 'passport.weibo.com' in visitor_url:
# first visit
diff --git a/hypervideo_dl/extractor/weverse.py b/hypervideo_dl/extractor/weverse.py
new file mode 100644
index 0000000..bbf6285
--- /dev/null
+++ b/hypervideo_dl/extractor/weverse.py
@@ -0,0 +1,608 @@
+import base64
+import hashlib
+import hmac
+import itertools
+import json
+import re
+import time
+import urllib.parse
+import uuid
+
+from .common import InfoExtractor
+from .naver import NaverBaseIE
+from .youtube import YoutubeIE
+from ..networking.exceptions import HTTPError
+from ..utils import (
+ ExtractorError,
+ UserNotLive,
+ float_or_none,
+ int_or_none,
+ str_or_none,
+ traverse_obj,
+ try_call,
+ update_url_query,
+ url_or_none,
+)
+
+
+class WeverseBaseIE(InfoExtractor):
+ _NETRC_MACHINE = 'weverse'
+ _ACCOUNT_API_BASE = 'https://accountapi.weverse.io/web/api/v2'
+ _API_HEADERS = {
+ 'Referer': 'https://weverse.io/',
+ 'WEV-device-Id': str(uuid.uuid4()),
+ }
+
+ def _perform_login(self, username, password):
+ if self._API_HEADERS.get('Authorization'):
+ return
+
+ headers = {
+ 'x-acc-app-secret': '5419526f1c624b38b10787e5c10b2a7a',
+ 'x-acc-app-version': '2.2.6',
+ 'x-acc-language': 'en',
+ 'x-acc-service-id': 'weverse',
+ 'x-acc-trace-id': str(uuid.uuid4()),
+ 'x-clog-user-device-id': str(uuid.uuid4()),
+ }
+ check_username = self._download_json(
+ f'{self._ACCOUNT_API_BASE}/signup/email/status', None,
+ note='Checking username', query={'email': username}, headers=headers)
+ if not check_username.get('hasPassword'):
+ raise ExtractorError('Invalid username provided', expected=True)
+
+ headers['content-type'] = 'application/json'
+ try:
+ auth = self._download_json(
+ f'{self._ACCOUNT_API_BASE}/auth/token/by-credentials', None, data=json.dumps({
+ 'email': username,
+ 'password': password,
+ }, separators=(',', ':')).encode(), headers=headers, note='Logging in')
+ except ExtractorError as e:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 401:
+ raise ExtractorError('Invalid password provided', expected=True)
+ raise
+
+ WeverseBaseIE._API_HEADERS['Authorization'] = f'Bearer {auth["accessToken"]}'
+
+ def _real_initialize(self):
+ if self._API_HEADERS.get('Authorization'):
+ return
+
+ token = try_call(lambda: self._get_cookies('https://weverse.io/')['we2_access_token'].value)
+ if token:
+ WeverseBaseIE._API_HEADERS['Authorization'] = f'Bearer {token}'
+
+ def _call_api(self, ep, video_id, data=None, note='Downloading API JSON'):
+ # Ref: https://ssl.pstatic.net/static/wevweb/2_3_2_11101725/public/static/js/2488.a09b41ff.chunk.js
+ # From https://ssl.pstatic.net/static/wevweb/2_3_2_11101725/public/static/js/main.e206f7c1.js:
+ key = b'1b9cb6378d959b45714bec49971ade22e6e24e42'
+ api_path = update_url_query(ep, {
+ 'appId': 'be4d79eb8fc7bd008ee82c8ec4ff6fd4',
+ 'language': 'en',
+ 'platform': 'WEB',
+ 'wpf': 'pc',
+ })
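+        # the API expects each call to be signed: 'wmd' is a base64 HMAC-SHA1 over
+        # the first 255 chars of the request path plus the ms timestamp 'wmsgpad'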
+ wmsgpad = int(time.time() * 1000)
+ wmd = base64.b64encode(hmac.HMAC(
+ key, f'{api_path[:255]}{wmsgpad}'.encode(), digestmod=hashlib.sha1).digest()).decode()
+ headers = {'Content-Type': 'application/json'} if data else {}
+ try:
+ return self._download_json(
+ f'https://global.apis.naver.com/weverse/wevweb{api_path}', video_id, note=note,
+ data=data, headers={**self._API_HEADERS, **headers}, query={
+ 'wmsgpad': wmsgpad,
+ 'wmd': wmd,
+ })
+ except ExtractorError as e:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 401:
+ self.raise_login_required(
+ 'Session token has expired. Log in again or refresh cookies in browser')
+ elif isinstance(e.cause, HTTPError) and e.cause.status == 403:
+ if 'Authorization' in self._API_HEADERS:
+ raise ExtractorError('Your account does not have access to this content', expected=True)
+ self.raise_login_required()
+ raise
+
+ def _call_post_api(self, video_id):
+ path = '' if 'Authorization' in self._API_HEADERS else '/preview'
+ return self._call_api(f'/post/v1.0/post-{video_id}{path}?fieldSet=postV1', video_id)
+
+ def _get_community_id(self, channel):
+ return str(self._call_api(
+ f'/community/v1.0/communityIdUrlPathByUrlPathArtistCode?keyword={channel}',
+ channel, note='Fetching community ID')['communityId'])
+
+ def _get_formats(self, data, video_id):
+ formats = traverse_obj(data, ('videos', 'list', lambda _, v: url_or_none(v['source']), {
+ 'url': 'source',
+ 'width': ('encodingOption', 'width', {int_or_none}),
+ 'height': ('encodingOption', 'height', {int_or_none}),
+ 'vcodec': 'type',
+ 'vbr': ('bitrate', 'video', {int_or_none}),
+ 'abr': ('bitrate', 'audio', {int_or_none}),
+ 'filesize': ('size', {int_or_none}),
+ 'format_id': ('encodingOption', 'id', {str_or_none}),
+ }))
+
+ for stream in traverse_obj(data, ('streams', lambda _, v: v['type'] == 'HLS' and url_or_none(v['source']))):
+ query = {}
+ for param in traverse_obj(stream, ('keys', lambda _, v: v['type'] == 'param' and v['name'])):
+ query[param['name']] = param.get('value', '')
+ fmts = self._extract_m3u8_formats(
+ stream['source'], video_id, 'mp4', m3u8_id='hls', fatal=False, query=query)
+ if query:
+ for fmt in fmts:
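+                    # carry the signed query over to the playlist URL and, via
+                    # extra_param_to_segment_url, to every media segment request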
+ fmt['url'] = update_url_query(fmt['url'], query)
+ fmt['extra_param_to_segment_url'] = urllib.parse.urlencode(query)
+ formats.extend(fmts)
+
+ return formats
+
+ def _get_subs(self, caption_url):
+ subs_ext_re = r'\.(?:ttml|vtt)'
+ replace_ext = lambda x, y: re.sub(subs_ext_re, y, x)
+ if re.search(subs_ext_re, caption_url):
+ return [replace_ext(caption_url, '.ttml'), replace_ext(caption_url, '.vtt')]
+ return [caption_url]
+
+ def _parse_post_meta(self, metadata):
+ return traverse_obj(metadata, {
+ 'title': ((('extension', 'mediaInfo', 'title'), 'title'), {str}),
+ 'description': ((('extension', 'mediaInfo', 'body'), 'body'), {str}),
+ 'uploader': ('author', 'profileName', {str}),
+ 'uploader_id': ('author', 'memberId', {str}),
+ 'creator': ('community', 'communityName', {str}),
+ 'channel_id': (('community', 'author'), 'communityId', {str_or_none}),
+ 'duration': ('extension', 'video', 'playTime', {float_or_none}),
+ 'timestamp': ('publishedAt', {lambda x: int_or_none(x, 1000)}),
+ 'release_timestamp': ('extension', 'video', 'onAirStartAt', {lambda x: int_or_none(x, 1000)}),
+ 'thumbnail': ('extension', (('mediaInfo', 'thumbnail', 'url'), ('video', 'thumb')), {url_or_none}),
+ 'view_count': ('extension', 'video', 'playCount', {int_or_none}),
+ 'like_count': ('extension', 'video', 'likeCount', {int_or_none}),
+ 'comment_count': ('commentCount', {int_or_none}),
+ }, get_all=False)
+
+ def _extract_availability(self, data):
+ return self._availability(**traverse_obj(data, ((('extension', 'video'), None), {
+ 'needs_premium': 'paid',
+ 'needs_subscription': 'membershipOnly',
+ }), get_all=False, expected_type=bool), needs_auth=True)
+
+ def _extract_live_status(self, data):
+ data = traverse_obj(data, ('extension', 'video', {dict})) or {}
+ if data.get('type') == 'LIVE':
+ return traverse_obj({
+ 'ONAIR': 'is_live',
+ 'DONE': 'post_live',
+ 'STANDBY': 'is_upcoming',
+ 'DELAY': 'is_upcoming',
+ }, (data.get('status'), {str})) or 'is_live'
+ return 'was_live' if data.get('liveToVod') else 'not_live'
+
+
+class WeverseIE(WeverseBaseIE):
+    _VALID_URL = r'https?://(?:www\.|m\.)?weverse\.io/(?P<artist>[^/?#]+)/live/(?P<id>[\d-]+)'
+ _TESTS = [{
+ 'url': 'https://weverse.io/billlie/live/0-107323480',
+ 'md5': '1fa849f00181eef9100d3c8254c47979',
+ 'info_dict': {
+ 'id': '0-107323480',
+ 'ext': 'mp4',
+ 'title': '행복한 평이루💜',
+ 'description': '',
+ 'uploader': 'Billlie',
+ 'uploader_id': '5ae14aed7b7cdc65fa87c41fe06cc936',
+ 'channel': 'billlie',
+ 'channel_id': '72',
+ 'channel_url': 'https://weverse.io/billlie',
+ 'creator': 'Billlie',
+ 'timestamp': 1666262062,
+ 'upload_date': '20221020',
+ 'release_timestamp': 1666262058,
+ 'release_date': '20221020',
+ 'duration': 3102,
+ 'thumbnail': r're:^https?://.*\.jpe?g$',
+ 'view_count': int,
+ 'like_count': int,
+ 'comment_count': int,
+ 'availability': 'needs_auth',
+ 'live_status': 'was_live',
+ },
+ }, {
+ 'url': 'https://weverse.io/lesserafim/live/2-102331763',
+ 'md5': 'e46125c08b13a6c8c1f4565035cca987',
+ 'info_dict': {
+ 'id': '2-102331763',
+ 'ext': 'mp4',
+ 'title': '🎂김채원 생신🎂',
+ 'description': '🎂김채원 생신🎂',
+ 'uploader': 'LE SSERAFIM ',
+ 'uploader_id': 'd26ddc1e258488a0a2b795218d14d59d',
+ 'channel': 'lesserafim',
+ 'channel_id': '47',
+ 'channel_url': 'https://weverse.io/lesserafim',
+ 'creator': 'LE SSERAFIM',
+ 'timestamp': 1659353400,
+ 'upload_date': '20220801',
+ 'release_timestamp': 1659353400,
+ 'release_date': '20220801',
+ 'duration': 3006,
+ 'thumbnail': r're:^https?://.*\.jpe?g$',
+ 'view_count': int,
+ 'like_count': int,
+ 'comment_count': int,
+ 'availability': 'needs_auth',
+ 'live_status': 'was_live',
+ 'subtitles': {
+ 'id_ID': 'count:2',
+ 'en_US': 'count:2',
+ 'es_ES': 'count:2',
+ 'vi_VN': 'count:2',
+ 'th_TH': 'count:2',
+ 'zh_CN': 'count:2',
+ 'zh_TW': 'count:2',
+ 'ja_JP': 'count:2',
+ 'ko_KR': 'count:2',
+ },
+ },
+ }, {
+ 'url': 'https://weverse.io/treasure/live/2-117230416',
+ 'info_dict': {
+ 'id': '2-117230416',
+ 'ext': 'mp4',
+ 'title': r're:스껄도려님 첫 스무살 생파🦋',
+ 'description': '',
+ 'uploader': 'TREASURE',
+ 'uploader_id': '77eabbc449ca37f7970054a136f60082',
+ 'channel': 'treasure',
+ 'channel_id': '20',
+ 'channel_url': 'https://weverse.io/treasure',
+ 'creator': 'TREASURE',
+ 'timestamp': 1680667651,
+ 'upload_date': '20230405',
+ 'release_timestamp': 1680667639,
+ 'release_date': '20230405',
+ 'thumbnail': r're:^https?://.*\.jpe?g$',
+ 'view_count': int,
+ 'like_count': int,
+ 'comment_count': int,
+ 'availability': 'needs_auth',
+ 'live_status': 'is_live',
+ },
+ 'skip': 'Livestream has ended',
+ }]
+
+ def _real_extract(self, url):
+ channel, video_id = self._match_valid_url(url).group('artist', 'id')
+ post = self._call_post_api(video_id)
+ api_video_id = post['extension']['video']['videoId']
+ availability = self._extract_availability(post)
+ live_status = self._extract_live_status(post)
+ video_info, formats = {}, []
+
+ if live_status == 'is_upcoming':
+ self.raise_no_formats('Livestream has not yet started', expected=True)
+
+ elif live_status == 'is_live':
+ video_info = self._call_api(
+ f'/video/v1.0/lives/{api_video_id}/playInfo?preview.format=json&preview.version=v2',
+ video_id, note='Downloading live JSON')
+ playback = self._parse_json(video_info['lipPlayback'], video_id)
+ m3u8_url = traverse_obj(playback, (
+ 'media', lambda _, v: v['protocol'] == 'HLS', 'path', {url_or_none}), get_all=False)
+ formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', m3u8_id='hls', live=True)
+
+ elif live_status == 'post_live':
+ if availability in ('premium_only', 'subscriber_only'):
+ self.report_drm(video_id)
+ self.raise_no_formats(
+ 'Livestream has ended and downloadable VOD is not available', expected=True)
+
+ else:
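+ # VOD path: exchange the Weverse video ID for a Naver 'inKey', then query Naver's VOD play API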
+ infra_video_id = post['extension']['video']['infraVideoId']
+ in_key = self._call_api(
+ f'/video/v1.0/vod/{api_video_id}/inKey?preview=false', video_id,
+ data=b'{}', note='Downloading VOD API key')['inKey']
+
+ video_info = self._download_json(
+ f'https://global.apis.naver.com/rmcnmv/rmcnmv/vod/play/v2.0/{infra_video_id}',
+ video_id, note='Downloading VOD JSON', query={
+ 'key': in_key,
+ 'sid': traverse_obj(post, ('extension', 'video', 'serviceId')) or '2070',
+ 'pid': str(uuid.uuid4()),
+ 'nonce': int(time.time() * 1000),
+ 'devt': 'html5_pc',
+ 'prv': 'Y' if post.get('membershipOnly') else 'N',
+ 'aup': 'N',
+ 'stpb': 'N',
+ 'cpl': 'en',
+ 'env': 'prod',
+ 'lc': 'en',
+ 'adi': '[{"adSystem":"null"}]',
+ 'adu': '/',
+ })
+
+ formats = self._get_formats(video_info, video_id)
+ has_drm = traverse_obj(video_info, ('meta', 'provider', 'name', {str.lower})) == 'drm'
+ if has_drm and formats:
+ self.report_warning(
+ 'Requested content is DRM-protected, only a 30-second preview is available', video_id)
+ elif has_drm and not formats:
+ self.report_drm(video_id)
+
+ return {
+ 'id': video_id,
+ 'channel': channel,
+ 'channel_url': f'https://weverse.io/{channel}',
+ 'formats': formats,
+ 'availability': availability,
+ 'live_status': live_status,
+ **self._parse_post_meta(post),
+ **NaverBaseIE.process_subtitles(video_info, self._get_subs),
+ }
+
+
+class WeverseMediaIE(WeverseBaseIE):
+ _VALID_URL = r'https?://(?:www\.|m\.)?weverse\.io/(?P<artist>[^/?#]+)/media/(?P<id>[\d-]+)'
+ _TESTS = [{
+ 'url': 'https://weverse.io/billlie/media/4-116372884',
+ 'md5': '8efc9cfd61b2f25209eb1a5326314d28',
+ 'info_dict': {
+ 'id': 'e-C9wLSQs6o',
+ 'ext': 'mp4',
+ 'title': 'Billlie | \'EUNOIA\' Performance Video (heartbeat ver.)',
+ 'description': 'md5:6181caaf2a2397bca913ffe368c104e5',
+ 'channel': 'Billlie',
+ 'channel_id': 'UCyc9sUCxELTDK9vELO5Fzeg',
+ 'channel_url': 'https://www.youtube.com/channel/UCyc9sUCxELTDK9vELO5Fzeg',
+ 'uploader': 'Billlie',
+ 'uploader_id': '@Billlie',
+ 'uploader_url': 'http://www.youtube.com/@Billlie',
+ 'upload_date': '20230403',
+ 'duration': 211,
+ 'age_limit': 0,
+ 'playable_in_embed': True,
+ 'live_status': 'not_live',
+ 'availability': 'public',
+ 'view_count': int,
+ 'comment_count': int,
+ 'like_count': int,
+ 'channel_follower_count': int,
+ 'thumbnail': 'https://i.ytimg.com/vi/e-C9wLSQs6o/maxresdefault.jpg',
+ 'categories': ['Entertainment'],
+ 'tags': 'count:7',
+ },
+ }, {
+ 'url': 'https://weverse.io/billlie/media/3-102914520',
+ 'md5': '031551fcbd716bc4f080cb6174a43d8a',
+ 'info_dict': {
+ 'id': '3-102914520',
+ 'ext': 'mp4',
+ 'title': 'From. SUHYEON🌸',
+ 'description': 'Billlie 멤버별 독점 영상 공개💙💜',
+ 'uploader': 'Billlie_official',
+ 'uploader_id': 'f569c6e92f7eaffef0a395037dcaa54f',
+ 'channel': 'billlie',
+ 'channel_id': '72',
+ 'channel_url': 'https://weverse.io/billlie',
+ 'creator': 'Billlie',
+ 'timestamp': 1662174000,
+ 'upload_date': '20220903',
+ 'release_timestamp': 1662174000,
+ 'release_date': '20220903',
+ 'duration': 17.0,
+ 'thumbnail': r're:^https?://.*\.jpe?g$',
+ 'view_count': int,
+ 'like_count': int,
+ 'comment_count': int,
+ 'availability': 'needs_auth',
+ 'live_status': 'not_live',
+ },
+ }]
+
+ def _real_extract(self, url):
+ channel, video_id = self._match_valid_url(url).group('artist', 'id')
+ post = self._call_post_api(video_id)
+ media_type = traverse_obj(post, ('extension', 'mediaInfo', 'mediaType', {str.lower}))
+ youtube_id = traverse_obj(post, ('extension', 'youtube', 'youtubeVideoId', {str}))
+
+ if media_type == 'vod':
+ return self.url_result(f'https://weverse.io/{channel}/live/{video_id}', WeverseIE)
+ elif media_type == 'youtube' and youtube_id:
+ return self.url_result(youtube_id, YoutubeIE)
+ elif media_type == 'image':
+ self.raise_no_formats('No video content found in webpage', expected=True)
+ elif media_type:
+ raise ExtractorError(f'Unsupported media type "{media_type}"')
+
+ self.raise_no_formats('No video content found in webpage')
+
+
+class WeverseMomentIE(WeverseBaseIE):
+ _VALID_URL = r'https?://(?:www\.|m\.)?weverse\.io/(?P<artist>[^/?#]+)/moment/(?P<uid>[\da-f]+)/post/(?P<id>[\d-]+)'
+ _TESTS = [{
+ 'url': 'https://weverse.io/secretnumber/moment/66a07e164b56a696ee71c99315ffe27b/post/1-117229444',
+ 'md5': '87733ac19a54081b7dfc2442036d282b',
+ 'info_dict': {
+ 'id': '1-117229444',
+ 'ext': 'mp4',
+ 'title': '今日もめっちゃいい天気☀️🌤️',
+ 'uploader': '레아',
+ 'uploader_id': '66a07e164b56a696ee71c99315ffe27b',
+ 'channel': 'secretnumber',
+ 'channel_id': '56',
+ 'creator': 'SECRET NUMBER',
+ 'duration': 10,
+ 'upload_date': '20230405',
+ 'timestamp': 1680653968,
+ 'thumbnail': r're:^https?://.*\.jpe?g$',
+ 'like_count': int,
+ 'comment_count': int,
+ 'availability': 'needs_auth',
+ },
+ 'skip': 'Moment has expired',
+ }]
+
+ def _real_extract(self, url):
+ channel, uploader_id, video_id = self._match_valid_url(url).group('artist', 'uid', 'id')
+ post = self._call_post_api(video_id)
+ api_video_id = post['extension']['moment']['video']['videoId']
+ video_info = self._call_api(
+ f'/cvideo/v1.0/cvideo-{api_video_id}/playInfo?videoId={api_video_id}', video_id,
+ note='Downloading moment JSON')['playInfo']
+
+ return {
+ 'id': video_id,
+ 'channel': channel,
+ 'uploader_id': uploader_id,
+ 'formats': self._get_formats(video_info, video_id),
+ 'availability': self._extract_availability(post),
+ **traverse_obj(post, {
+ 'title': ((('extension', 'moment', 'body'), 'body'), {str}),
+ 'uploader': ('author', 'profileName', {str}),
+ 'creator': (('community', 'author'), 'communityName', {str}),
+ 'channel_id': (('community', 'author'), 'communityId', {str_or_none}),
+ 'duration': ('extension', 'moment', 'video', 'uploadInfo', 'playTime', {float_or_none}),
+ 'timestamp': ('publishedAt', {lambda x: int_or_none(x, 1000)}),
+ 'thumbnail': ('extension', 'moment', 'video', 'uploadInfo', 'imageUrl', {url_or_none}),
+ 'like_count': ('emotionCount', {int_or_none}),
+ 'comment_count': ('commentCount', {int_or_none}),
+ }, get_all=False),
+ **NaverBaseIE.process_subtitles(video_info, self._get_subs),
+ }
+
+
+class WeverseTabBaseIE(WeverseBaseIE):
+ _ENDPOINT = None
+ _PATH = None
+ _QUERY = {}
+ _RESULT_IE = None
+
+ def _entries(self, channel_id, channel, first_page):
+ query = self._QUERY.copy()
+
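+ # cursor-based pagination: each response's paging.nextParams.after feeds the query for the next page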
+ for page in itertools.count(1):
+ posts = first_page if page == 1 else self._call_api(
+ update_url_query(self._ENDPOINT % channel_id, query), channel,
+ note=f'Downloading {self._PATH} tab page {page}')
+
+ for post in traverse_obj(posts, ('data', lambda _, v: v['postId'])):
+ yield self.url_result(
+ f'https://weverse.io/{channel}/{self._PATH}/{post["postId"]}',
+ self._RESULT_IE, post['postId'], **self._parse_post_meta(post),
+ channel=channel, channel_url=f'https://weverse.io/{channel}',
+ availability=self._extract_availability(post),
+ live_status=self._extract_live_status(post))
+
+ query['after'] = traverse_obj(posts, ('paging', 'nextParams', 'after', {str}))
+ if not query['after']:
+ break
+
+ def _real_extract(self, url):
+ channel = self._match_id(url)
+ channel_id = self._get_community_id(channel)
+
+ first_page = self._call_api(
+ update_url_query(self._ENDPOINT % channel_id, self._QUERY), channel,
+ note=f'Downloading {self._PATH} tab page 1')
+
+ return self.playlist_result(
+ self._entries(channel_id, channel, first_page), f'{channel}-{self._PATH}',
+ **traverse_obj(first_page, ('data', ..., {
+ 'playlist_title': ('community', 'communityName', {str}),
+ 'thumbnail': ('author', 'profileImageUrl', {url_or_none}),
+ }), get_all=False))
+
+
+class WeverseLiveTabIE(WeverseTabBaseIE):
+ _VALID_URL = r'https?://(?:www\.|m\.)?weverse\.io/(?P<id>[^/?#]+)/live/?(?:[?#]|$)'
+ _TESTS = [{
+ 'url': 'https://weverse.io/billlie/live/',
+ 'playlist_mincount': 55,
+ 'info_dict': {
+ 'id': 'billlie-live',
+ 'title': 'Billlie',
+ 'thumbnail': r're:^https?://.*\.jpe?g$',
+ },
+ }]
+
+ _ENDPOINT = '/post/v1.0/community-%s/liveTabPosts'
+ _PATH = 'live'
+ _QUERY = {'fieldSet': 'postsV1'}
+ _RESULT_IE = WeverseIE
+
+
+class WeverseMediaTabIE(WeverseTabBaseIE):
+ _VALID_URL = r'https?://(?:www\.|m\.)?weverse\.io/(?P<id>[^/?#]+)/media(?:/|/all|/new)?(?:[?#]|$)'
+ _TESTS = [{
+ 'url': 'https://weverse.io/billlie/media/',
+ 'playlist_mincount': 231,
+ 'info_dict': {
+ 'id': 'billlie-media',
+ 'title': 'Billlie',
+ 'thumbnail': r're:^https?://.*\.jpe?g$',
+ },
+ }, {
+ 'url': 'https://weverse.io/lesserafim/media/all',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://weverse.io/lesserafim/media/new',
+ 'only_matching': True,
+ }]
+
+ _ENDPOINT = '/media/v1.0/community-%s/more'
+ _PATH = 'media'
+ _QUERY = {'fieldSet': 'postsV1', 'filterType': 'RECENT'}
+ _RESULT_IE = WeverseMediaIE
+
+
+class WeverseLiveIE(WeverseBaseIE):
+ _VALID_URL = r'https?://(?:www\.|m\.)?weverse\.io/(?P<id>[^/?#]+)/?(?:[?#]|$)'
+ _TESTS = [{
+ 'url': 'https://weverse.io/purplekiss',
+ 'info_dict': {
+ 'id': '3-116560493',
+ 'ext': 'mp4',
+ 'title': r're:모하냥🫶🏻',
+ 'description': '내일은 금요일~><',
+ 'uploader': '채인',
+ 'uploader_id': '1ffb1d9d904d6b3db2783f876eb9229d',
+ 'channel': 'purplekiss',
+ 'channel_id': '35',
+ 'channel_url': 'https://weverse.io/purplekiss',
+ 'creator': 'PURPLE KISS',
+ 'timestamp': 1680780892,
+ 'upload_date': '20230406',
+ 'release_timestamp': 1680780883,
+ 'release_date': '20230406',
+ 'thumbnail': 'https://weverse-live.pstatic.net/v1.0/live/62044/thumb',
+ 'view_count': int,
+ 'like_count': int,
+ 'comment_count': int,
+ 'availability': 'needs_auth',
+ 'live_status': 'is_live',
+ },
+ 'skip': 'Livestream has ended',
+ }, {
+ 'url': 'https://weverse.io/billlie/',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ channel = self._match_id(url)
+ channel_id = self._get_community_id(channel)
+
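+ # the liveTab endpoint returns both on-air and reserved (scheduled) posts; take the first one that is live or upcoming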
+ video_id = traverse_obj(
+ self._call_api(update_url_query(f'/post/v1.0/community-{channel_id}/liveTab', {
+ 'debugMessage': 'true',
+ 'fields': 'onAirLivePosts.fieldSet(postsV1).limit(10),reservedLivePosts.fieldSet(postsV1).limit(10)',
+ }), channel, note='Downloading live JSON'), (
+ ('onAirLivePosts', 'reservedLivePosts'), 'data',
+ lambda _, v: self._extract_live_status(v) in ('is_live', 'is_upcoming'), 'postId', {str}),
+ get_all=False)
+
+ if not video_id:
+ raise UserNotLive(video_id=channel)
+
+ return self.url_result(f'https://weverse.io/{channel}/live/{video_id}', WeverseIE)
diff --git a/hypervideo_dl/extractor/wevidi.py b/hypervideo_dl/extractor/wevidi.py
new file mode 100644
index 0000000..3b6d032
--- /dev/null
+++ b/hypervideo_dl/extractor/wevidi.py
@@ -0,0 +1,108 @@
+from .common import InfoExtractor
+from ..utils import clean_html, float_or_none, get_element_by_class, js_to_json, traverse_obj
+
+
+class WeVidiIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?wevidi\.net/watch/(?P<id>[\w-]{11})'
+ _TESTS = [{
+ 'url': 'https://wevidi.net/watch/2th7UO5F4KV',
+ 'md5': 'b913d1ff5bbad499e2c7ef4aa6d829d7',
+ 'info_dict': {
+ 'id': '2th7UO5F4KV',
+ 'ext': 'mp4',
+ 'title': 'YouTube Alternative: WeVidi - customizable channels & more',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'description': 'md5:73a27d0a87d49fbcc5584566326ebeed',
+ 'uploader': 'eclecRC',
+ 'duration': 932.098,
+ }
+ }, {
+ 'url': 'https://wevidi.net/watch/ievRuuQHbPS',
+ 'md5': 'ce8a94989a959bff9003fa27ee572935',
+ 'info_dict': {
+ 'id': 'ievRuuQHbPS',
+ 'ext': 'mp4',
+ 'title': 'WeVidi Playlists',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'description': 'md5:32cdfca272687390d9bd9b0c9c6153ee',
+ 'uploader': 'WeVidi',
+ 'duration': 36.1999,
+ }
+ }, {
+ 'url': 'https://wevidi.net/watch/PcMzDWaQSWb',
+ 'md5': '55ee0d3434be5d9e5cc76b83f2bb57ec',
+ 'info_dict': {
+ 'id': 'PcMzDWaQSWb',
+ 'ext': 'mp4',
+ 'title': 'Cat blep',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'description': 'md5:e2c9e2b54b8bb424cc64937c8fdc068f',
+ 'uploader': 'WeVidi',
+ 'duration': 41.972,
+ }
+ }, {
+ 'url': 'https://wevidi.net/watch/wJnRqDHNe_u',
+ 'md5': 'c8f263dd47e66cc17546b3abf47b5a77',
+ 'info_dict': {
+ 'id': 'wJnRqDHNe_u',
+ 'ext': 'mp4',
+ 'title': 'Gissy Talks: YouTube Alternatives',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'description': 'md5:e65036f0d4af80e0af191bd11af5195e',
+ 'uploader': 'GissyEva',
+ 'duration': 630.451,
+ }
+ }, {
+ 'url': 'https://wevidi.net/watch/4m1c4yJR_yc',
+ 'md5': 'c63ce5ca6990dce86855fc02ca5bc1ed',
+ 'info_dict': {
+ 'id': '4m1c4yJR_yc',
+ 'ext': 'mp4',
+ 'title': 'Enough of that! - Awesome Exilez Podcast',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'description': 'md5:96af99dd63468b2dfab3020560e3e9b2',
+ 'uploader': 'eclecRC',
+ 'duration': 6.804,
+ }
+ }]
+
+ def _extract_formats(self, wvplayer_props):
+ # Taken from WeVidi player JS: https://wevidi.net/layouts/default/static/player.min.js
+ resolution_map = {
+ 1: 144,
+ 2: 240,
+ 3: 360,
+ 4: 480,
+ 5: 720,
+ 6: 1080
+ }
+
+ src_path = f'{wvplayer_props["srcVID"]}/{wvplayer_props["srcUID"]}/{wvplayer_props["srcNAME"]}'
+ for res in traverse_obj(wvplayer_props, ('resolutions', ..., {int}, {lambda x: x or None})):
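+ # -(res // -2) is ceiling division, so resolutions pair up per format_id: 144/240 -> 0, 360/480 -> 1, 720/1080 -> 2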
+ format_id = str(-(res // -2) - 1)
+ yield {
+ 'acodec': 'mp4a.40.2',
+ 'ext': 'mp4',
+ 'format_id': format_id,
+ 'height': resolution_map.get(res),
+ 'url': f'https://www.wevidi.net/videoplayback/{src_path}/{format_id}',
+ 'vcodec': 'avc1.42E01E',
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
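+ # the page calls WVPlayer() with a JS object literal; replacing '||' with '}' apparently truncates trailing JS so js_to_json can parse it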
+ wvplayer_props = self._search_json(
+ r'WVPlayer\(', webpage, 'player', video_id,
+ transform_source=lambda x: js_to_json(x.replace('||', '}')))
+
+ return {
+ 'id': video_id,
+ 'title': clean_html(get_element_by_class('video_title', webpage)),
+ 'description': clean_html(get_element_by_class('descr_long', webpage)),
+ 'uploader': clean_html(get_element_by_class('username', webpage)),
+ 'formats': list(self._extract_formats(wvplayer_props)),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'duration': float_or_none(wvplayer_props.get('duration')),
+ }
diff --git a/hypervideo_dl/extractor/weyyak.py b/hypervideo_dl/extractor/weyyak.py
new file mode 100644
index 0000000..ef12be8
--- /dev/null
+++ b/hypervideo_dl/extractor/weyyak.py
@@ -0,0 +1,86 @@
+from .common import InfoExtractor
+from ..utils import (
+ float_or_none,
+ int_or_none,
+ parse_age_limit,
+ traverse_obj,
+ unified_timestamp,
+ url_or_none,
+)
+
+
+class WeyyakIE(InfoExtractor):
+ _VALID_URL = r'https?://weyyak\.com/(?P<lang>\w+)/(?:player/)?(?P<type>episode|movie)/(?P<id>\d+)'
+ _TESTS = [
+ {
+ 'url': 'https://weyyak.com/en/player/episode/1341952/Ribat-Al-Hob-Episode49',
+ 'md5': '0caf55c1a615531c8fe60f146ae46849',
+ 'info_dict': {
+ 'id': '1341952',
+ 'ext': 'mp4',
+ 'title': 'Ribat Al Hob',
+ 'duration': 2771,
+ 'alt_title': 'رباط الحب',
+ 'season': 'Season 1',
+ 'season_number': 1,
+ 'episode': 'Episode 49',
+ 'episode_number': 49,
+ 'timestamp': 1485907200,
+ 'upload_date': '20170201',
+ 'thumbnail': r're:^https://content\.weyyak\.com/.+/poster-image',
+ 'categories': ['Drama', 'Thrillers', 'Romance'],
+ 'tags': 'count:8',
+ },
+ },
+ {
+ 'url': 'https://weyyak.com/en/movie/233255/8-Seconds',
+ 'md5': 'fe740ae0f63e4d1c8a7fc147a410c564',
+ 'info_dict': {
+ 'id': '233255',
+ 'ext': 'mp4',
+ 'title': '8 Seconds',
+ 'duration': 6490,
+ 'alt_title': '8 ثواني',
+ 'description': 'md5:45b83a155c30b49950624c7e99600b9d',
+ 'age_limit': 15,
+ 'release_year': 2015,
+ 'timestamp': 1683106031,
+ 'upload_date': '20230503',
+ 'thumbnail': r're:^https://content\.weyyak\.com/.+/poster-image',
+ 'categories': ['Drama', 'Social'],
+ 'cast': ['Ceylin Adiyaman', 'Esra Inal'],
+ },
+ },
+ ]
+
+ def _real_extract(self, url):
+ video_id, lang, type_ = self._match_valid_url(url).group('id', 'lang', 'type')
+
+ path = 'episode/' if type_ == 'episode' else 'contents/moviedetails?contentkey='
+ data = self._download_json(
+ f'https://msapifo-prod-me.weyyak.z5.com/v1/{lang}/{path}{video_id}', video_id)['data']
+ m3u8_url = self._download_json(
+ f'https://api-weyyak.akamaized.net/get_info/{data["video_id"]}',
+ video_id, 'Extracting video details')['url_video']
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id)
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ **traverse_obj(data, {
+ 'title': ('title', {str}),
+ 'alt_title': ('translated_title', {str}),
+ 'description': ('synopsis', {str}),
+ 'duration': ('length', {float_or_none}),
+ 'age_limit': ('age_rating', {parse_age_limit}),
+ 'season_number': ('season_number', {int_or_none}),
+ 'episode_number': ('episode_number', {int_or_none}),
+ 'thumbnail': ('imagery', 'thumbnail', {url_or_none}),
+ 'categories': ('genres', ..., {str}),
+ 'tags': ('tags', ..., {str}),
+ 'cast': (('main_actor', 'main_actress'), {str}),
+ 'timestamp': ('insertedAt', {unified_timestamp}),
+ 'release_year': ('production_year', {int_or_none}),
+ }),
+ }
diff --git a/hypervideo_dl/extractor/whyp.py b/hypervideo_dl/extractor/whyp.py
new file mode 100644
index 0000000..fef89c3
--- /dev/null
+++ b/hypervideo_dl/extractor/whyp.py
@@ -0,0 +1,50 @@
+from .common import InfoExtractor
+from ..utils import (
+ float_or_none,
+ str_or_none,
+ traverse_obj,
+ url_or_none,
+)
+
+
+class WhypIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?whyp\.it/tracks/(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://www.whyp.it/tracks/18337/home-page-example-track-b4kq7',
+ 'md5': 'c1187b42ebf8605284e3dc92aeb33d16',
+ 'info_dict': {
+ 'url': 'https://cdn.whyp.it/50eb17cc-e9ff-4e18-b89b-dc9206a95cb1.mp3',
+ 'id': '18337',
+ 'title': 'Home Page Example Track',
+ 'description': 'md5:bd758000fb93f3159339c852b5b9133c',
+ 'ext': 'mp3',
+ 'duration': 52.82,
+ 'uploader': 'Brad',
+ 'uploader_id': '1',
+ 'thumbnail': 'https://cdn.whyp.it/a537bb36-3373-4c61-96c8-27fc1b2f427a.jpg',
+ },
+ }, {
+ 'url': 'https://www.whyp.it/tracks/18337',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ unique_id = self._match_id(url)
+ webpage = self._download_webpage(url, unique_id)
+ data = self._search_nuxt_data(webpage, unique_id)['rawTrack']
+
+ return {
+ 'url': data['audio_url'],
+ 'id': unique_id,
+ **traverse_obj(data, {
+ 'title': 'title',
+ 'description': 'description',
+ 'duration': ('duration', {float_or_none}),
+ 'uploader': ('user', 'username'),
+ 'uploader_id': ('user', 'id', {str_or_none}),
+ 'thumbnail': ('artwork_url', {url_or_none}),
+ }),
+ 'ext': 'mp3',
+ 'vcodec': 'none',
+ 'http_headers': {'Referer': 'https://whyp.it/'},
+ }
diff --git a/hypervideo_dl/extractor/wimbledon.py b/hypervideo_dl/extractor/wimbledon.py
new file mode 100644
index 0000000..0223e54
--- /dev/null
+++ b/hypervideo_dl/extractor/wimbledon.py
@@ -0,0 +1,61 @@
+from .common import InfoExtractor
+from ..utils import (
+ parse_duration,
+ traverse_obj,
+)
+
+
+class WimbledonIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?wimbledon\.com/\w+/video/media/(?P<id>\d+)\.html'
+ _TESTS = [{
+ 'url': 'https://www.wimbledon.com/en_GB/video/media/6330247525112.html',
+ 'info_dict': {
+ 'id': '6330247525112',
+ 'ext': 'mp4',
+ 'timestamp': 1687972186,
+ 'description': '',
+ 'thumbnail': r're:^https://[\w.-]+\.prod\.boltdns\.net/[^?#]+/image\.jpg',
+ 'upload_date': '20230628',
+ 'title': 'Coco Gauff | My Wimbledon Inspiration',
+ 'tags': ['features', 'trending', 'homepage'],
+ 'uploader_id': '3506358525001',
+ 'duration': 163072.0,
+ },
+ }, {
+ 'url': 'https://www.wimbledon.com/en_GB/video/media/6308703111112.html',
+ 'info_dict': {
+ 'id': '6308703111112',
+ 'ext': 'mp4',
+ 'thumbnail': r're:^https://[\w.-]+\.prod\.boltdns\.net/[^?#]+/image\.jpg',
+ 'description': 'null',
+ 'upload_date': '20220629',
+ 'uploader_id': '3506358525001',
+ 'title': 'Roblox | WimbleWorld ',
+ 'duration': 101440.0,
+ 'tags': ['features', 'kids'],
+ 'timestamp': 1656500867,
+ },
+ }, {
+ 'url': 'https://www.wimbledon.com/en_US/video/media/6309327106112.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.wimbledon.com/es_Es/video/media/6308377909112.html',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ metadata = self._download_json(
+ f'https://www.wimbledon.com/relatedcontent/rest/v2/wim_v1/en/content/wim_v1_{video_id}_en', video_id)
+
+ return {
+ '_type': 'url_transparent',
+ 'url': f'http://players.brightcove.net/3506358525001/default_default/index.html?videoId={video_id}',
+ 'ie_key': 'BrightcoveNew',
+ 'id': video_id,
+ **traverse_obj(metadata, {
+ 'title': 'title',
+ 'description': 'description',
+ 'duration': ('metadata', 'duration', {parse_duration}),
+ }),
+ }
diff --git a/hypervideo_dl/extractor/wistia.py b/hypervideo_dl/extractor/wistia.py
index 38dcc2f..bce5e83 100644
--- a/hypervideo_dl/extractor/wistia.py
+++ b/hypervideo_dl/extractor/wistia.py
@@ -1,17 +1,20 @@
import re
-import urllib.error
import urllib.parse
from base64 import b64decode
from .common import InfoExtractor
+from ..networking import HEADRequest
+from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
+ determine_ext,
float_or_none,
int_or_none,
parse_qs,
traverse_obj,
try_get,
update_url_query,
+ urlhandle_detect_ext,
)
@@ -34,6 +37,16 @@ class WistiaBaseIE(InfoExtractor):
return embed_config
+ def _get_real_ext(self, url):
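+ # Wistia delivery URLs often end in '.bin'; fall back to a HEAD request to sniff the real extension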
+ ext = determine_ext(url, default_ext='bin')
+ if ext == 'bin':
+ urlh = self._request_webpage(
+ HEADRequest(url), None, note='Checking media extension',
+ errnote='HEAD request returned error', fatal=False)
+ if urlh:
+ ext = urlhandle_detect_ext(urlh, default='bin')
+ return 'mp4' if ext == 'mov' else ext
+
def _extract_media(self, embed_config):
data = embed_config['media']
video_id = data['hashedId']
@@ -51,13 +64,13 @@ class WistiaBaseIE(InfoExtractor):
continue
elif atype in ('still', 'still_image'):
thumbnails.append({
- 'url': aurl,
+ 'url': aurl.replace('.bin', f'.{self._get_real_ext(aurl)}'),
'width': int_or_none(a.get('width')),
'height': int_or_none(a.get('height')),
'filesize': int_or_none(a.get('size')),
})
else:
- aext = a.get('ext')
+ aext = a.get('ext') or self._get_real_ext(aurl)
display_name = a.get('display_name')
format_id = atype
if atype and atype.endswith('_video') and display_name:
@@ -169,26 +182,26 @@ class WistiaIE(WistiaBaseIE):
'md5': '10c1ce9c4dde638202513ed17a3767bd',
'info_dict': {
'id': 'a6ndpko1wg',
- 'ext': 'bin',
+ 'ext': 'mp4',
'title': 'Episode 2: Boxed Water\'s retention is thirsty',
'upload_date': '20210324',
'description': 'md5:da5994c2c2d254833b412469d9666b7a',
'duration': 966.0,
'timestamp': 1616614369,
- 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/53dc60239348dc9b9fba3755173ea4c2.bin',
+ 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/53dc60239348dc9b9fba3755173ea4c2.png',
}
}, {
'url': 'wistia:5vd7p4bct5',
'md5': 'b9676d24bf30945d97060638fbfe77f0',
'info_dict': {
'id': '5vd7p4bct5',
- 'ext': 'bin',
+ 'ext': 'mp4',
'title': 'md5:eaa9f64c4efd7b5f098b9b6118597679',
'description': 'md5:a9bea0315f0616aa5df2dc413ddcdd0f',
'upload_date': '20220915',
'timestamp': 1663258727,
'duration': 623.019,
- 'thumbnail': r're:https?://embed(?:-ssl)?.wistia.com/.+\.(?:jpg|bin)$',
+ 'thumbnail': r're:https?://embed(?:-ssl)?.wistia.com/.+\.jpg$',
},
}, {
'url': 'wistia:sh7fpupwlt',
@@ -208,25 +221,25 @@ class WistiaIE(WistiaBaseIE):
'url': 'https://www.weidert.com/blog/wistia-channels-video-marketing-tool',
'info_dict': {
'id': 'cqwukac3z1',
- 'ext': 'bin',
+ 'ext': 'mp4',
'title': 'How Wistia Channels Can Help Capture Inbound Value From Your Video Content',
'duration': 158.125,
'timestamp': 1618974400,
'description': 'md5:27abc99a758573560be72600ef95cece',
'upload_date': '20210421',
- 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/6c551820ae950cdee2306d6cbe9ef742.bin',
+ 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/6c551820ae950cdee2306d6cbe9ef742.jpg',
}
}, {
'url': 'https://study.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
'md5': 'b9676d24bf30945d97060638fbfe77f0',
'info_dict': {
'id': '5vd7p4bct5',
- 'ext': 'bin',
+ 'ext': 'mp4',
'title': 'paywall_north-american-exploration-failed-colonies-of-spain-france-england',
'upload_date': '20220915',
'timestamp': 1663258727,
'duration': 623.019,
- 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/83e6ec693e2c05a0ce65809cbaead86a.bin',
+ 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/83e6ec693e2c05a0ce65809cbaead86a.jpg',
'description': 'a Paywall Videos video',
},
}]
@@ -302,9 +315,9 @@ class WistiaChannelIE(WistiaBaseIE):
'url': 'https://fast.wistia.net/embed/channel/3802iirk0l?wchannelid=3802iirk0l&wmediaid=sp5dqjzw3n',
'info_dict': {
'id': 'sp5dqjzw3n',
- 'ext': 'bin',
+ 'ext': 'mp4',
'title': 'The Roof S2: The Modern CRO',
- 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/dadfa9233eaa505d5e0c85c23ff70741.bin',
+ 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/dadfa9233eaa505d5e0c85c23ff70741.png',
'duration': 86.487,
'description': 'A sales leader on The Roof? Man, they really must be letting anyone up here this season.\n',
'timestamp': 1619790290,
@@ -334,12 +347,12 @@ class WistiaChannelIE(WistiaBaseIE):
'info_dict': {
'id': 'pz0m0l0if3',
'title': 'A Framework for Improving Product Team Performance',
- 'ext': 'bin',
+ 'ext': 'mp4',
'timestamp': 1653935275,
'upload_date': '20220530',
'description': 'Learn how to help your company improve and achieve your product related goals.',
'duration': 1854.39,
- 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/12fd19e56413d9d6f04e2185c16a6f8854e25226.bin',
+ 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/12fd19e56413d9d6f04e2185c16a6f8854e25226.png',
},
'params': {'noplaylist': True, 'skip_download': True},
}]
@@ -352,7 +365,7 @@ class WistiaChannelIE(WistiaBaseIE):
try:
data = self._download_embed_config('channel', channel_id, url)
- except (ExtractorError, urllib.error.HTTPError):
+ except (ExtractorError, HTTPError):
# Some channels give a 403 from the JSON API
self.report_warning('Failed to download channel data from API, falling back to webpage.')
webpage = self._download_webpage(f'https://fast.wistia.net/embed/channel/{channel_id}', channel_id)
diff --git a/hypervideo_dl/extractor/wrestleuniverse.py b/hypervideo_dl/extractor/wrestleuniverse.py
new file mode 100644
index 0000000..dd12804
--- /dev/null
+++ b/hypervideo_dl/extractor/wrestleuniverse.py
@@ -0,0 +1,307 @@
+import base64
+import binascii
+import json
+import time
+import uuid
+
+from .common import InfoExtractor
+from ..dependencies import Cryptodome
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ jwt_decode_hs256,
+ traverse_obj,
+ try_call,
+ url_or_none,
+ urlencode_postdata,
+ variadic,
+)
+
+
+class WrestleUniverseBaseIE(InfoExtractor):
+ _NETRC_MACHINE = 'wrestleuniverse'
+ _VALID_URL_TMPL = r'https?://(?:www\.)?wrestle-universe\.com/(?:(?P<lang>\w{2})/)?%s/(?P<id>\w+)'
+ _API_HOST = 'api.wrestle-universe.com'
+ _API_PATH = None
+ _REAL_TOKEN = None
+ _TOKEN_EXPIRY = None
+ _REFRESH_TOKEN = None
+ _DEVICE_ID = None
+ _LOGIN_QUERY = {'key': 'AIzaSyCaRPBsDQYVDUWWBXjsTrHESi2r_F3RAdA'}
+ _LOGIN_HEADERS = {
+ 'Accept': '*/*',
+ 'Content-Type': 'application/json',
+ 'X-Client-Version': 'Chrome/JsCore/9.9.4/FirebaseCore-web',
+ 'X-Firebase-gmpid': '1:307308870738:web:820f38fe5150c8976e338b',
+ 'Referer': 'https://www.wrestle-universe.com/',
+ 'Origin': 'https://www.wrestle-universe.com',
+ }
+
+ @property
+ def _TOKEN(self):
+ if not self._REAL_TOKEN or not self._TOKEN_EXPIRY:
+ token = try_call(lambda: self._get_cookies('https://www.wrestle-universe.com/')['token'].value)
+ if not token and not self._REFRESH_TOKEN:
+ self.raise_login_required()
+ self._TOKEN = token
+
+ if not self._REAL_TOKEN or self._TOKEN_EXPIRY <= int(time.time()):
+ if not self._REFRESH_TOKEN:
+ raise ExtractorError(
+ 'Expired token. Refresh your cookies in browser and try again', expected=True)
+ self._refresh_token()
+
+ return self._REAL_TOKEN
+
+ @_TOKEN.setter
+ def _TOKEN(self, value):
+ self._REAL_TOKEN = value
+
+ expiry = traverse_obj(value, ({jwt_decode_hs256}, 'exp', {int_or_none}))
+ if not expiry:
+ raise ExtractorError('There was a problem with the auth token')
+ self._TOKEN_EXPIRY = expiry
+
+ def _perform_login(self, username, password):
+ login = self._download_json(
+ 'https://identitytoolkit.googleapis.com/v1/accounts:signInWithPassword', None,
+ 'Logging in', query=self._LOGIN_QUERY, headers=self._LOGIN_HEADERS, data=json.dumps({
+ 'returnSecureToken': True,
+ 'email': username,
+ 'password': password,
+ }, separators=(',', ':')).encode(), expected_status=400)
+ token = traverse_obj(login, ('idToken', {str}))
+ if not token:
+ raise ExtractorError(
+ f'Unable to log in: {traverse_obj(login, ("error", "message"))}', expected=True)
+ self._REFRESH_TOKEN = traverse_obj(login, ('refreshToken', {str}))
+ if not self._REFRESH_TOKEN:
+ self.report_warning('No refresh token was granted')
+ self._TOKEN = token
+
+ def _real_initialize(self):
+ if self._DEVICE_ID:
+ return
+
+ self._DEVICE_ID = self._configuration_arg('device_id', [None], ie_key=self._NETRC_MACHINE)[0]
+ if not self._DEVICE_ID:
+ self._DEVICE_ID = self.cache.load(self._NETRC_MACHINE, 'device_id')
+ if self._DEVICE_ID:
+ return
+ self._DEVICE_ID = str(uuid.uuid4())
+
+ self.cache.store(self._NETRC_MACHINE, 'device_id', self._DEVICE_ID)
+
+ def _refresh_token(self):
+ refresh = self._download_json(
+ 'https://securetoken.googleapis.com/v1/token', None, 'Refreshing token',
+ query=self._LOGIN_QUERY, data=urlencode_postdata({
+ 'grant_type': 'refresh_token',
+ 'refresh_token': self._REFRESH_TOKEN,
+ }), headers={
+ **self._LOGIN_HEADERS,
+ 'Content-Type': 'application/x-www-form-urlencoded',
+ })
+ if traverse_obj(refresh, ('refresh_token', {str})):
+ self._REFRESH_TOKEN = refresh['refresh_token']
+ token = traverse_obj(refresh, 'access_token', 'id_token', expected_type=str)
+ if not token:
+ raise ExtractorError('No auth token returned from refresh request')
+ self._TOKEN = token
+
+ def _call_api(self, video_id, param='', msg='API', auth=True, data=None, query={}, fatal=True):
+ headers = {'CA-CID': ''}
+ if data:
+ headers['Content-Type'] = 'application/json;charset=utf-8'
+ data = json.dumps(data, separators=(',', ':')).encode()
+ if auth and self._TOKEN:
+ headers['Authorization'] = f'Bearer {self._TOKEN}'
+ return self._download_json(
+ f'https://{self._API_HOST}/v1/{self._API_PATH}/{video_id}{param}', video_id,
+ note=f'Downloading {msg} JSON', errnote=f'Failed to download {msg} JSON',
+ data=data, headers=headers, query=query, fatal=fatal)
+
+ def _call_encrypted_api(self, video_id, param='', msg='API', data={}, query={}, fatal=True):
+ if not Cryptodome.RSA:
+ raise ExtractorError('pycryptodomex not found. Please install', expected=True)
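+ # generate an ephemeral RSA keypair per call; the API encrypts sensitive fields (e.g. the HLS key/IV) to the public key we send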
+ private_key = Cryptodome.RSA.generate(2048)
+ cipher = Cryptodome.PKCS1_OAEP.new(private_key, hashAlgo=Cryptodome.SHA1)
+
+ def decrypt(data):
+ if not data:
+ return None
+ try:
+ return cipher.decrypt(base64.b64decode(data)).decode()
+ except (ValueError, binascii.Error) as e:
+ raise ExtractorError(f'Could not decrypt data: {e}')
+
+ token = base64.b64encode(private_key.public_key().export_key('DER')).decode()
+ api_json = self._call_api(video_id, param, msg, data={
+ 'deviceId': self._DEVICE_ID,
+ 'token': token,
+ **data,
+ }, query=query, fatal=fatal)
+ return api_json, decrypt
+
+ def _download_metadata(self, url, video_id, lang, props_keys):
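+ # prefer the unauthenticated metadata endpoint; fall back to the page's Next.js props when it fails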
+ metadata = self._call_api(video_id, msg='metadata', query={'al': lang or 'ja'}, auth=False, fatal=False)
+ if not metadata:
+ webpage = self._download_webpage(url, video_id)
+ nextjs_data = self._search_nextjs_data(webpage, video_id)
+ metadata = traverse_obj(nextjs_data, (
+ 'props', 'pageProps', *variadic(props_keys, (str, bytes, dict, set)), {dict})) or {}
+ return metadata
+
+ def _get_formats(self, data, path, video_id=None):
+ hls_url = traverse_obj(data, path, get_all=False)
+ if not hls_url and not data.get('canWatch'):
+ self.raise_no_formats(
+ 'This account does not have access to the requested content', expected=True)
+ elif not hls_url:
+ self.raise_no_formats('No supported formats found')
+ return self._extract_m3u8_formats(hls_url, video_id, 'mp4', m3u8_id='hls', live=True)
+
+
+class WrestleUniverseVODIE(WrestleUniverseBaseIE):
+ _VALID_URL = WrestleUniverseBaseIE._VALID_URL_TMPL % 'videos'
+ _TESTS = [{
+ 'url': 'https://www.wrestle-universe.com/en/videos/dp8mpjmcKfxzUhEHM2uFws',
+ 'info_dict': {
+ 'id': 'dp8mpjmcKfxzUhEHM2uFws',
+ 'ext': 'mp4',
+ 'title': 'The 3rd “Futari wa Princess” Max Heart Tournament',
+ 'description': 'md5:318d5061e944797fbbb81d5c7dd00bf5',
+ 'location': '埼玉・春日部ふれあいキューブ',
+ 'channel': 'tjpw',
+ 'duration': 7119,
+ 'timestamp': 1674979200,
+ 'upload_date': '20230129',
+ 'thumbnail': 'https://image.asset.wrestle-universe.com/8FjD67P8rZc446RBQs5RBN/8FjD67P8rZc446RBQs5RBN',
+ 'chapters': 'count:7',
+ 'cast': 'count:21',
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
+ }]
+
+ _API_PATH = 'videoEpisodes'
+
+ def _real_extract(self, url):
+ lang, video_id = self._match_valid_url(url).group('lang', 'id')
+ metadata = self._download_metadata(url, video_id, lang, 'videoEpisodeFallbackData')
+ video_data = self._call_api(video_id, ':watch', 'watch', data={
+ # 'deviceId' is required if ignoreDeviceRestriction is False
+ 'ignoreDeviceRestriction': True,
+ })
+
+ return {
+ 'id': video_id,
+ 'formats': self._get_formats(video_data, (
+ (('protocolHls', 'url'), ('chromecastUrls', ...)), {url_or_none}), video_id),
+ **traverse_obj(metadata, {
+ 'title': ('displayName', {str}),
+ 'description': ('description', {str}),
+ 'channel': ('labels', 'group', {str}),
+ 'location': ('labels', 'venue', {str}),
+ 'timestamp': ('watchStartTime', {int_or_none}),
+ 'thumbnail': ('keyVisualUrl', {url_or_none}),
+ 'cast': ('casts', ..., 'displayName', {str}),
+ 'duration': ('duration', {int}),
+ 'chapters': ('videoChapters', lambda _, v: isinstance(v.get('start'), int), {
+ 'title': ('displayName', {str}),
+ 'start_time': ('start', {int}),
+ 'end_time': ('end', {int}),
+ }),
+ }),
+ }
+
+
+class WrestleUniversePPVIE(WrestleUniverseBaseIE):
+ _VALID_URL = WrestleUniverseBaseIE._VALID_URL_TMPL % 'lives'
+ _TESTS = [{
+ 'note': 'HLS AES-128 key obtained via API',
+ 'url': 'https://www.wrestle-universe.com/en/lives/buH9ibbfhdJAY4GKZcEuJX',
+ 'info_dict': {
+ 'id': 'buH9ibbfhdJAY4GKZcEuJX',
+ 'ext': 'mp4',
+ 'title': '【PPV】Beyond the origins, into the future',
+ 'description': 'md5:9a872db68cd09be4a1e35a3ee8b0bdfc',
+ 'channel': 'tjpw',
+ 'location': '東京・Twin Box AKIHABARA',
+ 'duration': 10098,
+ 'timestamp': 1675076400,
+ 'upload_date': '20230130',
+ 'thumbnail': 'https://image.asset.wrestle-universe.com/rJs2m7cBaLXrwCcxMdQGRM/rJs2m7cBaLXrwCcxMdQGRM',
+ 'thumbnails': 'count:3',
+ 'hls_aes': {
+ 'key': '5633184acd6e43f1f1ac71c6447a4186',
+ 'iv': '5bac71beb33197d5600337ce86de7862',
+ },
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
+ 'skip': 'No longer available',
+ }, {
+ 'note': 'unencrypted HLS',
+ 'url': 'https://www.wrestle-universe.com/en/lives/wUG8hP5iApC63jbtQzhVVx',
+ 'info_dict': {
+ 'id': 'wUG8hP5iApC63jbtQzhVVx',
+ 'ext': 'mp4',
+ 'title': 'GRAND PRINCESS \'22',
+ 'description': 'md5:e4f43d0d4262de3952ff34831bc99858',
+ 'channel': 'tjpw',
+ 'location': '東京・両国国技館',
+ 'duration': 18044,
+ 'timestamp': 1647665400,
+ 'upload_date': '20220319',
+ 'thumbnail': 'https://image.asset.wrestle-universe.com/i8jxSTCHPfdAKD4zN41Psx/i8jxSTCHPfdAKD4zN41Psx',
+ 'thumbnails': 'count:3',
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
+ }]
+
+ _API_PATH = 'events'
+
+ def _real_extract(self, url):
+ lang, video_id = self._match_valid_url(url).group('lang', 'id')
+ metadata = self._download_metadata(url, video_id, lang, 'eventFallbackData')
+
+ info = {
+ 'id': video_id,
+ **traverse_obj(metadata, {
+ 'title': ('displayName', {str}),
+ 'description': ('description', {str}),
+ 'channel': ('labels', 'group', {str}),
+ 'location': ('labels', 'venue', {str}),
+ 'timestamp': ('startTime', {int_or_none}),
+ 'thumbnails': (('keyVisualUrl', 'alterKeyVisualUrl', 'heroKeyVisualUrl'), {'url': {url_or_none}}),
+ }),
+ }
+
+ ended_time = traverse_obj(metadata, ('endedTime', {int_or_none}))
+ if info.get('timestamp') and ended_time:
+ info['duration'] = ended_time - info['timestamp']
+
+ video_data, decrypt = self._call_encrypted_api(
+ video_id, ':watchArchive', 'watch archive', data={'method': 1})
+ info['formats'] = self._get_formats(video_data, (
+ ('hls', None), ('urls', 'chromecastUrls'), ..., {url_or_none}), video_id)
+ for f in info['formats']:
+ # bitrates are exaggerated in PPV playlists, so avoid wrong/huge filesize_approx values
+ if f.get('tbr'):
+ f['tbr'] = int(f['tbr'] / 2.5)
+
+ hls_aes_key = traverse_obj(video_data, ('hls', 'key', {decrypt}))
+ if hls_aes_key:
+ info['hls_aes'] = {
+ 'key': hls_aes_key,
+ 'iv': traverse_obj(video_data, ('hls', 'iv', {decrypt})),
+ }
+ elif traverse_obj(video_data, ('hls', 'encryptType', {int})):
+ self.report_warning('HLS AES-128 key was not found in API response')
+
+ return info
diff --git a/hypervideo_dl/extractor/wykop.py b/hypervideo_dl/extractor/wykop.py
new file mode 100644
index 0000000..1d29cc8
--- /dev/null
+++ b/hypervideo_dl/extractor/wykop.py
@@ -0,0 +1,268 @@
+import json
+
+from .common import InfoExtractor
+from ..networking.exceptions import HTTPError
+from ..utils import (
+ ExtractorError,
+ format_field,
+ parse_iso8601,
+ traverse_obj,
+ url_or_none,
+)
+
+
+class WykopBaseExtractor(InfoExtractor):
+ def _get_token(self, force_refresh=False):
+ if not force_refresh:
+ maybe_cached = self.cache.load('wykop', 'bearer')
+ if maybe_cached:
+ return maybe_cached
+
+ new_token = traverse_obj(
+ self._do_call_api('auth', None, 'Downloading anonymous auth token', data={
+ # hardcoded in frontend
+ 'key': 'w53947240748',
+ 'secret': 'd537d9e0a7adc1510842059ae5316419',
+ }), ('data', 'token'))
+
+ self.cache.store('wykop', 'bearer', new_token)
+ return new_token
+
+ def _do_call_api(self, path, video_id, note='Downloading JSON metadata', data=None, headers={}):
+ if data:
+ data = json.dumps({'data': data}).encode()
+ headers['Content-Type'] = 'application/json'
+
+ return self._download_json(
+ f'https://wykop.pl/api/v3/{path}', video_id,
+ note=note, data=data, headers=headers)
+
+ def _call_api(self, path, video_id, note='Downloading JSON metadata'):
+ token = self._get_token()
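+ # retry once with a force-refreshed token, since the cached anonymous bearer can expire (API responds 403)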
+ for retrying in range(2):
+ try:
+ return self._do_call_api(path, video_id, note, headers={'Authorization': f'Bearer {token}'})
+ except ExtractorError as e:
+ if not retrying and isinstance(e.cause, HTTPError) and e.cause.status == 403:
+ token = self._get_token(True)
+ continue
+ raise
+
+ def _common_data_extract(self, data):
+ author = traverse_obj(data, ('author', 'username'), expected_type=str)
+
+ return {
+ '_type': 'url_transparent',
+ 'display_id': data.get('slug'),
+ 'url': traverse_obj(data,
+ ('media', 'embed', 'url'), # what gets an iframe embed
+ ('source', 'url'), # clickable url (dig only)
+ expected_type=url_or_none),
+ 'thumbnail': traverse_obj(
+ data, ('media', 'photo', 'url'), ('media', 'embed', 'thumbnail'), expected_type=url_or_none),
+ 'uploader': author,
+ 'uploader_id': author,
+ 'uploader_url': format_field(author, None, 'https://wykop.pl/ludzie/%s'),
+ 'timestamp': parse_iso8601(data.get('created_at'), delimiter=' '), # time it got submitted
+ 'like_count': traverse_obj(data, ('votes', 'up'), expected_type=int),
+ 'dislike_count': traverse_obj(data, ('votes', 'down'), expected_type=int),
+ 'comment_count': traverse_obj(data, ('comments', 'count'), expected_type=int),
+ 'age_limit': 18 if data.get('adult') else 0,
+ 'tags': data.get('tags'),
+ }
+
+
+class WykopDigIE(WykopBaseExtractor):
+ IE_NAME = 'wykop:dig'
+ _VALID_URL = r'https?://(?:www\.)?wykop\.pl/link/(?P<id>\d+)'
+
+ _TESTS = [{
+ 'url': 'https://wykop.pl/link/6912923/najbardziej-zrzedliwy-kot-na-swiecie-i-frozen-planet-ii-i-bbc-earth',
+ 'info_dict': {
+ 'id': 'rlSTBvViflc',
+ 'ext': 'mp4',
+ 'title': 'Najbardziej zrzędliwy kot na świecie I Frozen Planet II I BBC Earth',
+ 'display_id': 'najbardziej-zrzedliwy-kot-na-swiecie-i-frozen-planet-ii-i-bbc-earth',
+ 'description': 'md5:ac0f87dea1cdcb6b0c53f3612a095c87',
+ 'tags': ['zwierzaczki', 'koty', 'smiesznykotek', 'humor', 'rozrywka', 'ciekawostki'],
+ 'age_limit': 0,
+ 'timestamp': 1669154480,
+ 'release_timestamp': 1669194241,
+ 'release_date': '20221123',
+ 'uploader': 'starnak',
+ 'uploader_id': 'starnak',
+ 'uploader_url': 'https://wykop.pl/ludzie/starnak',
+ 'like_count': int,
+ 'dislike_count': int,
+ 'comment_count': int,
+ 'thumbnail': r're:https?://wykop\.pl/cdn/.+',
+ 'view_count': int,
+ 'channel': 'BBC Earth',
+ 'channel_id': 'UCwmZiChSryoWQCZMIQezgTg',
+ 'channel_url': 'https://www.youtube.com/channel/UCwmZiChSryoWQCZMIQezgTg',
+ 'categories': ['Pets & Animals'],
+ 'upload_date': '20220923',
+ 'duration': 191,
+ 'channel_follower_count': int,
+ 'availability': 'public',
+ 'live_status': 'not_live',
+ 'playable_in_embed': True,
+ },
+ }]
+
+ @classmethod
+ def suitable(cls, url):
+ return cls._match_valid_url(url) and not WykopDigCommentIE.suitable(url)
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ data = self._call_api(f'links/{video_id}', video_id)['data']
+
+ return {
+ **self._common_data_extract(data),
+ 'id': video_id,
+ 'title': data['title'],
+ 'description': data.get('description'),
+ # time it got "digged" to the homepage
+ 'release_timestamp': parse_iso8601(data.get('published_at'), delimiter=' '),
+ }
+
+
+class WykopDigCommentIE(WykopBaseExtractor):
+ IE_NAME = 'wykop:dig:comment'
+ _VALID_URL = r'https?://(?:www\.)?wykop\.pl/link/(?P<dig_id>\d+)/[^/]+/komentarz/(?P<id>\d+)'
+
+ _TESTS = [{
+ 'url': 'https://wykop.pl/link/6992589/strollowal-oszusta-przez-ponad-24-minuty-udawal-naiwniaka-i-nagral-rozmowe/komentarz/114540527/podobna-sytuacja-ponizej-ciekawa-dyskusja-z-oszustem-na-sam-koniec-sam-bylem-w-biurze-swiadkiem-podobnej-rozmowy-niemal-zakonczonej-sukcesem-bandyty-g',
+ 'info_dict': {
+ 'id': 'u6tEi2FmKZY',
+ 'ext': 'mp4',
+ 'title': 'md5:e7c741c5baa7ed6478000caf72865577',
+ 'display_id': 'md5:45b2d12bd0e262d09cc7cf7abc8412db',
+ 'description': 'md5:bcec7983429f9c0630f9deb9d3d1ba5e',
+ 'timestamp': 1674476945,
+ 'uploader': 'Bartholomew',
+ 'uploader_id': 'Bartholomew',
+ 'uploader_url': 'https://wykop.pl/ludzie/Bartholomew',
+ 'thumbnail': r're:https?://wykop\.pl/cdn/.+',
+ 'tags': [],
+ 'availability': 'public',
+ 'duration': 1838,
+ 'upload_date': '20230117',
+ 'categories': ['Entertainment'],
+ 'view_count': int,
+ 'like_count': int,
+ 'dislike_count': int,
+ 'comment_count': int,
+ 'channel_follower_count': int,
+ 'playable_in_embed': True,
+ 'live_status': 'not_live',
+ 'age_limit': 0,
+ 'chapters': 'count:3',
+ 'channel': 'Poszukiwacze Okazji',
+ 'channel_id': 'UCzzvJDZThwv06dR4xmzrZBw',
+ 'channel_url': 'https://www.youtube.com/channel/UCzzvJDZThwv06dR4xmzrZBw',
+ },
+ }]
+
+ def _real_extract(self, url):
+ dig_id, comment_id = self._search_regex(self._VALID_URL, url, 'dig and comment ids', group=('dig_id', 'id'))
+ data = self._call_api(f'links/{dig_id}/comments/{comment_id}', comment_id)['data']
+
+ return {
+ **self._common_data_extract(data),
+ 'id': comment_id,
+ 'title': f"{traverse_obj(data, ('author', 'username'))} - {data.get('content') or ''}",
+ 'description': data.get('content'),
+ }
+
+
+class WykopPostIE(WykopBaseExtractor):
+ IE_NAME = 'wykop:post'
+ _VALID_URL = r'https?://(?:www\.)?wykop\.pl/wpis/(?P<id>\d+)'
+
+ _TESTS = [{
+ 'url': 'https://wykop.pl/wpis/68893343/kot-koty-smiesznykotek',
+ 'info_dict': {
+ 'id': 'PL8JMjiUPHUhwc9ZlKa_5IFeBwBV8Xe7jI',
+ 'title': 'PawelW124 - #kot #koty #smiesznykotek',
+ 'description': '#kot #koty #smiesznykotek',
+ 'display_id': 'kot-koty-smiesznykotek',
+ 'tags': ['kot', 'koty', 'smiesznykotek'],
+ 'uploader': 'PawelW124',
+ 'uploader_id': 'PawelW124',
+ 'uploader_url': 'https://wykop.pl/ludzie/PawelW124',
+ 'timestamp': 1668938142,
+ 'age_limit': 0,
+ 'like_count': int,
+ 'dislike_count': int,
+ 'thumbnail': r're:https?://wykop\.pl/cdn/.+',
+ 'comment_count': int,
+ 'channel': 'Revan',
+ 'channel_id': 'UCW9T_-uZoiI7ROARQdTDyOw',
+ 'channel_url': 'https://www.youtube.com/channel/UCW9T_-uZoiI7ROARQdTDyOw',
+ 'upload_date': '20221120',
+ 'modified_date': '20220814',
+ 'availability': 'public',
+ 'view_count': int,
+ },
+ 'playlist_mincount': 15,
+ 'params': {
+ 'flat_playlist': True,
+ }
+ }]
+
+ @classmethod
+ def suitable(cls, url):
+ return cls._match_valid_url(url) and not WykopPostCommentIE.suitable(url)
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ data = self._call_api(f'entries/{video_id}', video_id)['data']
+
+ return {
+ **self._common_data_extract(data),
+ 'id': video_id,
+ 'title': f"{traverse_obj(data, ('author', 'username'))} - {data.get('content') or ''}",
+ 'description': data.get('content'),
+ }
+
+
+class WykopPostCommentIE(WykopBaseExtractor):
+ IE_NAME = 'wykop:post:comment'
+ _VALID_URL = r'https?://(?:www\.)?wykop\.pl/wpis/(?P<post_id>\d+)/[^/#]+#(?P<id>\d+)'
+
+ _TESTS = [{
+ 'url': 'https://wykop.pl/wpis/70084873/test-test-test#249303979',
+ 'info_dict': {
+ 'id': 'confusedquickarmyant',
+ 'ext': 'mp4',
+ 'title': 'tpap - treść komentarza',
+ 'display_id': 'tresc-komentarza',
+ 'description': 'treść komentarza',
+ 'uploader': 'tpap',
+ 'uploader_id': 'tpap',
+ 'uploader_url': 'https://wykop.pl/ludzie/tpap',
+ 'timestamp': 1675349470,
+ 'upload_date': '20230202',
+ 'tags': [],
+ 'duration': 2.12,
+ 'age_limit': 0,
+ 'categories': [],
+ 'view_count': int,
+ 'like_count': int,
+ 'dislike_count': int,
+ 'thumbnail': r're:https?://wykop\.pl/cdn/.+',
+ },
+ }]
+
+ def _real_extract(self, url):
+ post_id, comment_id = self._search_regex(self._VALID_URL, url, 'post and comment ids', group=('post_id', 'id'))
+ data = self._call_api(f'entries/{post_id}/comments/{comment_id}', comment_id)['data']
+
+ return {
+ **self._common_data_extract(data),
+ 'id': comment_id,
+ 'title': f"{traverse_obj(data, ('author', 'username'))} - {data.get('content') or ''}",
+ 'description': data.get('content'),
+ }
diff --git a/hypervideo_dl/extractor/xanimu.py b/hypervideo_dl/extractor/xanimu.py
new file mode 100644
index 0000000..e0b7bf9
--- /dev/null
+++ b/hypervideo_dl/extractor/xanimu.py
@@ -0,0 +1,51 @@
+import re
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class XanimuIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?xanimu\.com/(?P<id>[^/]+)/?'
+ _TESTS = [{
+ 'url': 'https://xanimu.com/51944-the-princess-the-frog-hentai/',
+ 'md5': '899b88091d753d92dad4cb63bbf357a7',
+ 'info_dict': {
+ 'id': '51944-the-princess-the-frog-hentai',
+ 'ext': 'mp4',
+ 'title': 'The Princess + The Frog Hentai',
+ 'thumbnail': 'https://xanimu.com/storage/2020/09/the-princess-and-the-frog-hentai.jpg',
+ 'description': r're:^Enjoy The Princess \+ The Frog Hentai',
+ 'duration': 207.0,
+ 'age_limit': 18
+ }
+ }, {
+ 'url': 'https://xanimu.com/huge-expansion/',
+ 'only_matching': True
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ formats = []
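+ # the page defines 'videoHigh'/'videoLow' JS variables holding the direct stream URLs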
+ for format_id in ['videoHigh', 'videoLow']:
+ format_url = self._search_json(r'var\s+%s\s*=' % re.escape(format_id), webpage, format_id,
+ video_id, default=None, contains_pattern=r'[\'"]([^\'"]+)[\'"]')
+ if format_url:
+ formats.append({
+ 'url': format_url,
+ 'format_id': format_id,
+ 'quality': -2 if format_id.endswith('Low') else None,
+ })
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'title': self._search_regex(r'[\'"]headline[\'"]:\s*[\'"]([^"]+)[\'"]', webpage,
+ 'title', default=None) or self._html_extract_title(webpage),
+ 'thumbnail': self._html_search_meta('thumbnailUrl', webpage, default=None),
+ 'description': self._html_search_meta('description', webpage, default=None),
+ 'duration': int_or_none(self._search_regex(r'duration:\s*[\'"]([^\'"]+?)[\'"]',
+ webpage, 'duration', fatal=False)),
+ 'age_limit': 18
+ }
diff --git a/hypervideo_dl/extractor/xhamster.py b/hypervideo_dl/extractor/xhamster.py
index 59eecec..3722479 100644
--- a/hypervideo_dl/extractor/xhamster.py
+++ b/hypervideo_dl/extractor/xhamster.py
@@ -21,7 +21,7 @@ from ..utils import (
class XHamsterIE(InfoExtractor):
- _DOMAINS = r'(?:xhamster\.(?:com|one|desi)|xhms\.pro|xhamster\d+\.com|xhday\.com)'
+ _DOMAINS = r'(?:xhamster\.(?:com|one|desi)|xhms\.pro|xhamster\d+\.com|xhday\.com|xhvid\.com)'
_VALID_URL = r'''(?x)
https?://
(?:.+?\.)?%s/
@@ -120,6 +120,9 @@ class XHamsterIE(InfoExtractor):
}, {
'url': 'https://xhday.com/videos/strapless-threesome-xhh7yVf',
'only_matching': True,
+ }, {
+ 'url': 'https://xhvid.com/videos/lk-mm-xhc6wn6',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -180,7 +183,7 @@ class XHamsterIE(InfoExtractor):
'height': get_height(quality),
'filesize': format_sizes.get(quality),
'http_headers': {
- 'Referer': urlh.geturl(),
+ 'Referer': urlh.url,
},
})
xplayer_sources = try_get(
@@ -422,6 +425,9 @@ class XHamsterUserIE(InfoExtractor):
}, {
'url': 'https://xhday.com/users/mobhunter',
'only_matching': True,
+ }, {
+ 'url': 'https://xhvid.com/users/pelushe21',
+ 'only_matching': True,
}]
def _entries(self, user_id):
diff --git a/hypervideo_dl/extractor/ximalaya.py b/hypervideo_dl/extractor/ximalaya.py
index b25be77..3d5e6cf 100644
--- a/hypervideo_dl/extractor/ximalaya.py
+++ b/hypervideo_dl/extractor/ximalaya.py
@@ -36,7 +36,7 @@ class XimalayaIE(XimalayaBaseIE):
'height': 180
}
],
- 'categories': ['人文'],
+ 'categories': ['其他'],
'duration': 93,
'view_count': int,
'like_count': int,
@@ -123,7 +123,7 @@ class XimalayaIE(XimalayaBaseIE):
class XimalayaAlbumIE(XimalayaBaseIE):
IE_NAME = 'ximalaya:album'
IE_DESC = '喜马拉雅FM 专辑'
- _VALID_URL = r'https?://(?:www\.|m\.)?ximalaya\.com/\d+/album/(?P<id>[0-9]+)'
+ _VALID_URL = r'https?://(?:www\.|m\.)?ximalaya\.com/(?:\d+/)?album/(?P<id>[0-9]+)'
_TESTS = [{
'url': 'http://www.ximalaya.com/61425525/album/5534601/',
'info_dict': {
@@ -131,6 +131,13 @@ class XimalayaAlbumIE(XimalayaBaseIE):
'id': '5534601',
},
'playlist_mincount': 323,
+ }, {
+ 'url': 'https://www.ximalaya.com/album/6912905',
+ 'info_dict': {
+ 'title': '埃克哈特《修炼当下的力量》',
+ 'id': '6912905',
+ },
+ 'playlist_mincount': 41,
}]
def _real_extract(self, url):
@@ -151,7 +158,7 @@ class XimalayaAlbumIE(XimalayaBaseIE):
return self._download_json(
'https://www.ximalaya.com/revision/album/v1/getTracksList',
playlist_id, note=f'Downloading tracks list page {page_idx}',
- query={'albumId': playlist_id, 'pageNum': page_idx, 'sort': 1})['data']
+ query={'albumId': playlist_id, 'pageNum': page_idx})['data']
def _get_entries(self, page_data):
for e in page_data['tracks']:
diff --git a/hypervideo_dl/extractor/xtube.py b/hypervideo_dl/extractor/xtube.py
index ce4480c..db82925 100644
--- a/hypervideo_dl/extractor/xtube.py
+++ b/hypervideo_dl/extractor/xtube.py
@@ -2,12 +2,12 @@ import itertools
import re
from .common import InfoExtractor
+from ..networking import Request
from ..utils import (
int_or_none,
js_to_json,
orderedSet,
parse_duration,
- sanitized_Request,
str_to_int,
url_or_none,
)
@@ -186,7 +186,7 @@ class XTubeUserIE(InfoExtractor):
entries = []
for pagenum in itertools.count(1):
- request = sanitized_Request(
+ request = Request(
'http://www.xtube.com/profile/%s/videos/%d' % (user_id, pagenum),
headers={
'Cookie': 'popunder=4',
diff --git a/hypervideo_dl/extractor/xvideos.py b/hypervideo_dl/extractor/xvideos.py
index 5c505c8..5df0715 100644
--- a/hypervideo_dl/extractor/xvideos.py
+++ b/hypervideo_dl/extractor/xvideos.py
@@ -157,3 +157,24 @@ class XVideosIE(InfoExtractor):
'thumbnails': thumbnails,
'age_limit': 18,
}
+
+
+class XVideosQuickiesIE(InfoExtractor):
+ IE_NAME = 'xvideos:quickies'
+ _VALID_URL = r'https?://(?P<domain>(?:[^/]+\.)?xvideos2?\.com)/amateur-channels/[^#]+#quickies/a/(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://www.xvideos.com/amateur-channels/wifeluna#quickies/a/47258683',
+ 'md5': '16e322a93282667f1963915568f782c1',
+ 'info_dict': {
+ 'id': '47258683',
+ 'ext': 'mp4',
+ 'title': 'Verification video',
+ 'age_limit': 18,
+ 'duration': 16,
+ 'thumbnail': r're:^https://cdn.*-pic.xvideos-cdn.com/.+\.jpg',
+ }
+ }]
+
+ def _real_extract(self, url):
+ domain, id_ = self._match_valid_url(url).group('domain', 'id')
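+ # delegate to the main extractor; the trailing '_' stands in for the arbitrary title slug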
+ return self.url_result(f'https://{domain}/video{id_}/_', XVideosIE, id_)
diff --git a/hypervideo_dl/extractor/yahoo.py b/hypervideo_dl/extractor/yahoo.py
index a69715b..24148a0 100644
--- a/hypervideo_dl/extractor/yahoo.py
+++ b/hypervideo_dl/extractor/yahoo.py
@@ -2,7 +2,6 @@ import hashlib
import itertools
import urllib.parse
-from .brightcove import BrightcoveNewIE
from .common import InfoExtractor, SearchInfoExtractor
from .youtube import YoutubeIE
from ..utils import (
@@ -11,7 +10,6 @@ from ..utils import (
int_or_none,
mimetype2ext,
parse_iso8601,
- smuggle_url,
traverse_obj,
try_get,
url_or_none,
@@ -337,121 +335,6 @@ class YahooSearchIE(SearchInfoExtractor):
break
-class YahooGyaOPlayerIE(InfoExtractor):
- IE_NAME = 'yahoo:gyao:player'
- _VALID_URL = r'https?://(?:gyao\.yahoo\.co\.jp/(?:player|episode(?:/[^/]+)?)|streaming\.yahoo\.co\.jp/c/y)/(?P<id>\d+/v\d+/v\d+|[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
- _TESTS = [{
- 'url': 'https://gyao.yahoo.co.jp/player/00998/v00818/v0000000000000008564/',
- 'info_dict': {
- 'id': '5993125228001',
- 'ext': 'mp4',
- 'title': 'フューリー 【字幕版】',
- 'description': 'md5:21e691c798a15330eda4db17a8fe45a5',
- 'uploader_id': '4235717419001',
- 'upload_date': '20190124',
- 'timestamp': 1548294365,
- },
- 'params': {
- # m3u8 download
- 'skip_download': True,
- },
- }, {
- 'url': 'https://streaming.yahoo.co.jp/c/y/01034/v00133/v0000000000000000706/',
- 'only_matching': True,
- }, {
- 'url': 'https://gyao.yahoo.co.jp/episode/%E3%81%8D%E3%81%AE%E3%81%86%E4%BD%95%E9%A3%9F%E3%81%B9%E3%81%9F%EF%BC%9F%20%E7%AC%AC2%E8%A9%B1%202019%2F4%2F12%E6%94%BE%E9%80%81%E5%88%86/5cb02352-b725-409e-9f8d-88f947a9f682',
- 'only_matching': True,
- }, {
- 'url': 'https://gyao.yahoo.co.jp/episode/5fa1226c-ef8d-4e93-af7a-fd92f4e30597',
- 'only_matching': True,
- }]
- _GEO_BYPASS = False
-
- def _real_extract(self, url):
- video_id = self._match_id(url).replace('/', ':')
- headers = self.geo_verification_headers()
- headers['Accept'] = 'application/json'
- resp = self._download_json(
- 'https://gyao.yahoo.co.jp/apis/playback/graphql', video_id, query={
- 'appId': 'dj00aiZpPUNJeDh2cU1RazU3UCZzPWNvbnN1bWVyc2VjcmV0Jng9NTk-',
- 'query': '''{
- content(parameter: {contentId: "%s", logicaAgent: PC_WEB}) {
- video {
- delivery {
- id
- }
- title
- }
- }
-}''' % video_id,
- }, headers=headers)
- content = resp['data']['content']
- if not content:
- msg = resp['errors'][0]['message']
- if msg == 'not in japan':
- self.raise_geo_restricted(countries=['JP'])
- raise ExtractorError(msg)
- video = content['video']
- return {
- '_type': 'url_transparent',
- 'id': video_id,
- 'title': video['title'],
- 'url': smuggle_url(
- 'http://players.brightcove.net/4235717419001/SyG5P0gjb_default/index.html?videoId=' + video['delivery']['id'],
- {'geo_countries': ['JP']}),
- 'ie_key': BrightcoveNewIE.ie_key(),
- }
-
-
-class YahooGyaOIE(InfoExtractor):
- IE_NAME = 'yahoo:gyao'
- _VALID_URL = r'https?://(?:gyao\.yahoo\.co\.jp/(?:p|title(?:/[^/]+)?)|streaming\.yahoo\.co\.jp/p/y)/(?P<id>\d+/v\d+|[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
- _TESTS = [{
- 'url': 'https://gyao.yahoo.co.jp/title/%E3%82%BF%E3%82%A4%E3%83%A0%E3%83%9C%E3%82%AB%E3%83%B3%E3%82%B7%E3%83%AA%E3%83%BC%E3%82%BA%20%E3%83%A4%E3%83%83%E3%82%BF%E3%83%BC%E3%83%9E%E3%83%B3/5f60ceb3-6e5e-40ef-ba40-d68b598d067f',
- 'info_dict': {
- 'id': '5f60ceb3-6e5e-40ef-ba40-d68b598d067f',
- },
- 'playlist_mincount': 80,
- }, {
- 'url': 'https://gyao.yahoo.co.jp/p/00449/v03102/',
- 'only_matching': True,
- }, {
- 'url': 'https://streaming.yahoo.co.jp/p/y/01034/v00133/',
- 'only_matching': True,
- }, {
- 'url': 'https://gyao.yahoo.co.jp/title/%E3%81%97%E3%82%83%E3%81%B9%E3%81%8F%E3%82%8A007/5b025a49-b2e5-4dc7-945c-09c6634afacf',
- 'only_matching': True,
- }, {
- 'url': 'https://gyao.yahoo.co.jp/title/5b025a49-b2e5-4dc7-945c-09c6634afacf',
- 'only_matching': True,
- }]
-
- def _entries(self, program_id):
- page = 1
- while True:
- playlist = self._download_json(
- f'https://gyao.yahoo.co.jp/api/programs/{program_id}/videos?page={page}&serviceId=gy', program_id,
- note=f'Downloading JSON metadata page {page}')
- if not playlist:
- break
- for video in playlist['videos']:
- video_id = video.get('id')
- if not video_id:
- continue
- if video.get('streamingAvailability') == 'notYet':
- continue
- yield self.url_result(
- 'https://gyao.yahoo.co.jp/player/%s/' % video_id.replace(':', '/'),
- YahooGyaOPlayerIE.ie_key(), video_id)
- if playlist.get('ended'):
- break
- page += 1
-
- def _real_extract(self, url):
- program_id = self._match_id(url).replace('/', ':')
- return self.playlist_result(self._entries(program_id), program_id)
-
-
class YahooJapanNewsIE(InfoExtractor):
IE_NAME = 'yahoo:japannews'
IE_DESC = 'Yahoo! Japan News'
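The error branch of the removed GyaO player extractor above is a pattern worth noting: when the GraphQL response carries no content, the first error message is inspected, a known string is promoted to a geo-restriction error, and anything else is raised verbatim. A self-contained restatement, with plain exceptions standing in for the extractor's ExtractorError and raise_geo_restricted:

def check_playback_response(resp):
    content = resp['data']['content']
    if not content:
        msg = resp['errors'][0]['message']
        if msg == 'not in japan':
            # the extractor called self.raise_geo_restricted(countries=['JP'])
            raise PermissionError('This video is geo-restricted to JP')
        raise RuntimeError(msg)  # ExtractorError(msg) in the original
    return content

print(check_playback_response({'data': {'content': {'video': {'title': 't'}}}}))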
diff --git a/hypervideo_dl/extractor/yandexvideo.py b/hypervideo_dl/extractor/yandexvideo.py
index 535b61f..727250e 100644
--- a/hypervideo_dl/extractor/yandexvideo.py
+++ b/hypervideo_dl/extractor/yandexvideo.py
@@ -270,9 +270,9 @@ class ZenYandexIE(InfoExtractor):
for s_url in stream_urls:
ext = determine_ext(s_url)
if ext == 'mpd':
- formats.extend(self._extract_mpd_formats(s_url, id, mpd_id='dash'))
+ formats.extend(self._extract_mpd_formats(s_url, video_id, mpd_id='dash'))
elif ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(s_url, id, 'mp4'))
+ formats.extend(self._extract_m3u8_formats(s_url, video_id, 'mp4'))
return {
'id': video_id,
'title': video_json.get('title') or self._og_search_title(webpage),
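The two-line fix above replaces `id` with `video_id`: the method has no local named `id`, so the old calls quietly passed Python's builtin id() function wherever the video ID string was expected. A minimal illustration of the failure mode (the note text is hypothetical):

def note_broken(s_url):
    return f'Downloading manifest for {id}'        # -> '... <built-in function id>'

def note_fixed(s_url, video_id):
    return f'Downloading manifest for {video_id}'  # -> '... zen-video-1'

print(note_broken('https://example.invalid/a.mpd'))
print(note_fixed('https://example.invalid/a.mpd', 'zen-video-1'))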
diff --git a/hypervideo_dl/extractor/yappy.py b/hypervideo_dl/extractor/yappy.py
new file mode 100644
index 0000000..7b3d0cb
--- /dev/null
+++ b/hypervideo_dl/extractor/yappy.py
@@ -0,0 +1,127 @@
+from .common import InfoExtractor
+from ..utils import (
+ OnDemandPagedList,
+ int_or_none,
+ traverse_obj,
+ unified_timestamp,
+ url_or_none,
+)
+
+
+class YappyIE(InfoExtractor):
+ _VALID_URL = r'https?://yappy\.media/video/(?P<id>\w+)'
+ _TESTS = [{
+ 'url': 'https://yappy.media/video/47fea6d8586f48d1a0cf96a7342aabd2',
+ 'info_dict': {
+ 'id': '47fea6d8586f48d1a0cf96a7342aabd2',
+ 'ext': 'mp4',
+ 'title': 'Куда нажимать? Как снимать? Смотри видос и погнали!🤘🏻',
+ 'timestamp': 1661893200,
+ 'description': 'Куда нажимать? Как снимать? Смотри видос и погнали!🤘🏻',
+ 'thumbnail': 'https://cdn-st.ritm.media/static/pic/thumbnails/0c7c4d73388f47848acaf540d2e2bb8c-thumbnail.jpg',
+ 'upload_date': '20220830',
+ 'view_count': int,
+ 'like_count': int,
+ 'uploader_id': '59a0c8c485e5410b9c43474bf4c6a373',
+ 'categories': ['Образование и наука', 'Лайфхак', 'Технологии', 'Арт/искусство'],
+ 'repost_count': int,
+ 'uploader': 'YAPPY',
+ }
+ }, {
+ 'url': 'https://yappy.media/video/3862451954ad4bd58ae2ccefddb0bd33',
+ 'info_dict': {
+ 'id': '3862451954ad4bd58ae2ccefddb0bd33',
+ 'ext': 'mp4',
+ 'title': 'Опиши свой характер 3 словами🙃\n#психология #дружба #отношения',
+ 'timestamp': 1674726985,
+ 'like_count': int,
+ 'description': 'Опиши свой характер 3 словами🙃\n#психология #дружба #отношения',
+ 'uploader_id': '6793ee3581974a3586fc01e157de6c99',
+ 'view_count': int,
+ 'repost_count': int,
+ 'uploader': 'LENA SHTURMAN',
+ 'upload_date': '20230126',
+ 'thumbnail': 'https://cdn-st.ritm.media/static/pic/user_thumbnails/6e76bb4bbad640b6/9ec84c115b2b1967/1674716171.jpg',
+ }
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ json_ld = self._search_json_ld(webpage, video_id)
+ nextjs_data = self._search_nextjs_data(webpage, video_id)
+
+ media_data = (
+ traverse_obj(
+ nextjs_data, ('props', 'pageProps', ('data', 'OpenGraphParameters')), get_all=False)
+ or self._download_json(f'https://yappy.media/api/video/{video_id}', video_id))
+
+ media_url = traverse_obj(media_data, ('link', {url_or_none})) or ''
+ has_watermark = media_url.endswith('-wm.mp4')
+
+ formats = [{
+ 'url': media_url,
+ 'ext': 'mp4',
+ 'format_note': 'Watermarked' if has_watermark else None,
+ 'preference': -10 if has_watermark else None
+ }] if media_url else []
+
+ if has_watermark:
+ formats.append({
+ 'url': media_url.replace('-wm.mp4', '.mp4'),
+ 'ext': 'mp4'
+ })
+
+ audio_link = traverse_obj(media_data, ('audio', 'link'))
+ if audio_link:
+ formats.append({
+ 'url': audio_link,
+ 'ext': 'mp3',
+ 'acodec': 'mp3',
+ 'vcodec': 'none'
+ })
+
+ return {
+ 'id': video_id,
+ 'title': (json_ld.get('description') or self._html_search_meta(['og:title'], webpage)
+ or self._html_extract_title(webpage)),
+ 'formats': formats,
+ 'thumbnail': (media_data.get('thumbnail')
+ or self._html_search_meta(['og:image', 'og:image:secure_url'], webpage)),
+ 'description': (media_data.get('description') or json_ld.get('description')
+ or self._html_search_meta(['description', 'og:description'], webpage)),
+ 'timestamp': unified_timestamp(media_data.get('publishedAt') or json_ld.get('timestamp')),
+ 'view_count': int_or_none(media_data.get('viewsCount') or json_ld.get('view_count')),
+ 'like_count': int_or_none(media_data.get('likesCount')),
+ 'uploader': traverse_obj(media_data, ('creator', 'firstName')),
+ 'uploader_id': traverse_obj(media_data, ('creator', ('uuid', 'nickname')), get_all=False),
+ 'categories': traverse_obj(media_data, ('categories', ..., 'name')) or None,
+ 'repost_count': int_or_none(media_data.get('sharingCount'))
+ }
+
+
+class YappyProfileIE(InfoExtractor):
+ _VALID_URL = r'https?://yappy\.media/profile/(?P<id>\w+)'
+ _TESTS = [{
+ 'url': 'https://yappy.media/profile/59a0c8c485e5410b9c43474bf4c6a373',
+ 'info_dict': {
+ 'id': '59a0c8c485e5410b9c43474bf4c6a373',
+ },
+ 'playlist_mincount': 527,
+ }]
+
+ def _real_extract(self, url):
+ profile_id = self._match_id(url)
+
+ def fetch_page(page_num):
+ page_num += 1
+ videos = self._download_json(
+ f'https://yappy.media/api/video/list/{profile_id}?page={page_num}',
+ profile_id, f'Downloading profile page {page_num} JSON')
+
+ for video in traverse_obj(videos, ('results', lambda _, v: v['uuid'])):
+ yield self.url_result(
+ f'https://yappy.media/video/{video["uuid"]}', YappyIE,
+ video['uuid'], video.get('description'))
+
+ return self.playlist_result(OnDemandPagedList(fetch_page, 15), profile_id)
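YappyProfileIE drives its pagination through OnDemandPagedList(fetch_page, 15): pages of 15 results are fetched lazily as the playlist is iterated or sliced. A toy stand-in for the idea, assuming 0-based page numbers as in the closure above (the real class in hypervideo_dl.utils additionally supports lazy slicing for playlist item selection):

class TinyPagedList:
    def __init__(self, fetch_page, page_size):
        self._fetch, self._size = fetch_page, page_size

    def __iter__(self):
        page = 0
        while True:
            items = list(self._fetch(page))
            yield from items
            if len(items) < self._size:  # short page -> no more results
                return
            page += 1

def fetch_page(page_num):                    # stand-in for the closure above
    data = [f'video{i}' for i in range(33)]  # pretend API payload
    return data[page_num * 15:(page_num + 1) * 15]

print(len(list(TinyPagedList(fetch_page, 15))))  # 33, fetched over three "requests"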
diff --git a/hypervideo_dl/extractor/yesjapan.py b/hypervideo_dl/extractor/yesjapan.py
index b45fa8f..94e4166 100644
--- a/hypervideo_dl/extractor/yesjapan.py
+++ b/hypervideo_dl/extractor/yesjapan.py
@@ -1,9 +1,6 @@
from .common import InfoExtractor
-from ..utils import (
- HEADRequest,
- get_element_by_attribute,
- parse_iso8601,
-)
+from ..networking import HEADRequest
+from ..utils import get_element_by_attribute, parse_iso8601
class YesJapanIE(InfoExtractor):
@@ -42,7 +39,7 @@ class YesJapanIE(InfoExtractor):
req = self._request_webpage(
redirect_req, video_id, note='Resolving final URL', errnote='Could not resolve final URL', fatal=False)
if req:
- video_url = req.geturl()
+ video_url = req.url
formats = [{
'format_id': 'sd',
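Two independent changes land in yesjapan.py: HEADRequest now comes from the networking package, and the final URL is read from the response's .url attribute instead of the removed geturl(). The same redirect-resolution trick with only the standard library, as a sketch (left uncalled, since it performs a network round-trip):

import urllib.request

def resolve_final_url(url):
    head = urllib.request.Request(url, method='HEAD')  # HEADRequest's role
    with urllib.request.urlopen(head) as resp:
        return resp.url  # final URL after redirects; the old spelling was geturl()

# resolve_final_url('http://example.com/some/redirect')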
diff --git a/hypervideo_dl/extractor/yle_areena.py b/hypervideo_dl/extractor/yle_areena.py
index 118dc12..c5b45f0 100644
--- a/hypervideo_dl/extractor/yle_areena.py
+++ b/hypervideo_dl/extractor/yle_areena.py
@@ -1,40 +1,94 @@
from .common import InfoExtractor
from .kaltura import KalturaIE
-from ..utils import int_or_none, traverse_obj, url_or_none
+from ..utils import (
+ int_or_none,
+ smuggle_url,
+ traverse_obj,
+ unified_strdate,
+ url_or_none,
+)
class YleAreenaIE(InfoExtractor):
_VALID_URL = r'https?://areena\.yle\.fi/(?P<id>[\d-]+)'
- _TESTS = [{
- 'url': 'https://areena.yle.fi/1-4371942',
- 'md5': '932edda0ecf5dfd6423804182d32f8ac',
- 'info_dict': {
- 'id': '0_a3tjk92c',
- 'ext': 'mp4',
- 'title': 'Pouchit',
- 'description': 'md5:d487309c3abbe5650265bbd1742d2f82',
- 'series': 'Modernit miehet',
- 'season': 'Season 1',
- 'season_number': 1,
- 'episode': 'Episode 2',
- 'episode_number': 2,
- 'thumbnail': 'http://cfvod.kaltura.com/p/1955031/sp/195503100/thumbnail/entry_id/0_a3tjk92c/version/100061',
- 'uploader_id': 'ovp@yle.fi',
- 'duration': 1435,
- 'view_count': int,
- 'upload_date': '20181204',
- 'timestamp': 1543916210,
- 'subtitles': {'fin': [{'url': r're:^https?://', 'ext': 'srt'}]},
- 'age_limit': 7,
- }
- }]
+ _TESTS = [
+ {
+ 'url': 'https://areena.yle.fi/1-4371942',
+ 'md5': '932edda0ecf5dfd6423804182d32f8ac',
+ 'info_dict': {
+ 'id': '0_a3tjk92c',
+ 'ext': 'mp4',
+ 'title': 'Pouchit',
+ 'description': 'md5:d487309c3abbe5650265bbd1742d2f82',
+ 'series': 'Modernit miehet',
+ 'season': 'Season 1',
+ 'season_number': 1,
+ 'episode': 'Episode 2',
+ 'episode_number': 2,
+ 'thumbnail': 'http://cfvod.kaltura.com/p/1955031/sp/195503100/thumbnail/entry_id/0_a3tjk92c/version/100061',
+ 'uploader_id': 'ovp@yle.fi',
+ 'duration': 1435,
+ 'view_count': int,
+ 'upload_date': '20181204',
+ 'release_date': '20190106',
+ 'timestamp': 1543916210,
+ 'subtitles': {'fin': [{'url': r're:^https?://', 'ext': 'srt'}]},
+ 'age_limit': 7,
+ 'webpage_url': 'https://areena.yle.fi/1-4371942'
+ }
+ },
+ {
+ 'url': 'https://areena.yle.fi/1-2158940',
+ 'md5': 'cecb603661004e36af8c5188b5212b12',
+ 'info_dict': {
+ 'id': '1_l38iz9ur',
+ 'ext': 'mp4',
+ 'title': 'Albi haluaa vessan',
+ 'description': 'md5:15236d810c837bed861fae0e88663c33',
+ 'series': 'Albi Lumiukko',
+ 'season': None,
+ 'season_number': None,
+ 'episode': None,
+ 'episode_number': None,
+ 'thumbnail': 'http://cfvod.kaltura.com/p/1955031/sp/195503100/thumbnail/entry_id/1_l38iz9ur/version/100021',
+ 'uploader_id': 'ovp@yle.fi',
+ 'duration': 319,
+ 'view_count': int,
+ 'upload_date': '20211202',
+ 'release_date': '20211215',
+ 'timestamp': 1638448202,
+ 'subtitles': {},
+ 'age_limit': 0,
+ 'webpage_url': 'https://areena.yle.fi/1-2158940'
+ }
+ },
+ {
+ 'url': 'https://areena.yle.fi/1-64829589',
+ 'info_dict': {
+ 'id': '1-64829589',
+ 'ext': 'mp4',
+ 'title': 'HKO & Mälkki & Tanner',
+ 'description': 'md5:b4f1b1af2c6569b33f75179a86eea156',
+ 'series': 'Helsingin kaupunginorkesterin konsertteja',
+ 'thumbnail': r're:^https?://.+\.jpg$',
+ 'release_date': '20230120',
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
+ },
+ ]
def _real_extract(self, url):
video_id = self._match_id(url)
info = self._search_json_ld(self._download_webpage(url, video_id), video_id, default={})
video_data = self._download_json(
f'https://player.api.yle.fi/v1/preview/{video_id}.json?app_id=player_static_prod&app_key=8930d72170e48303cf5f3867780d549b',
- video_id)
+ video_id, headers={
+ 'origin': 'https://areena.yle.fi',
+ 'referer': 'https://areena.yle.fi/',
+ 'content-type': 'application/json'
+ })
# Example title: 'K1, J2: Pouchit | Modernit miehet'
series, season_number, episode_number, episode = self._search_regex(
@@ -52,20 +106,33 @@ class YleAreenaIE(InfoExtractor):
'name': sub.get('kind'),
})
+ kaltura_id = traverse_obj(video_data, ('data', 'ongoing_ondemand', 'kaltura', 'id'), expected_type=str)
+ if kaltura_id:
+ info_dict = {
+ '_type': 'url_transparent',
+ 'url': smuggle_url(f'kaltura:1955031:{kaltura_id}', {'source_url': url}),
+ 'ie_key': KalturaIE.ie_key(),
+ }
+ else:
+ info_dict = {
+ 'id': video_id,
+ 'formats': self._extract_m3u8_formats(
+ video_data['data']['ongoing_ondemand']['manifest_url'], video_id, 'mp4', m3u8_id='hls'),
+ }
+
return {
- '_type': 'url_transparent',
- 'url': 'kaltura:1955031:%s' % traverse_obj(video_data, ('data', 'ongoing_ondemand', 'kaltura', 'id')),
- 'ie_key': KalturaIE.ie_key(),
+ **info_dict,
'title': (traverse_obj(video_data, ('data', 'ongoing_ondemand', 'title', 'fin'), expected_type=str)
or episode or info.get('title')),
'description': description,
'series': (traverse_obj(video_data, ('data', 'ongoing_ondemand', 'series', 'title', 'fin'), expected_type=str)
or series),
'season_number': (int_or_none(self._search_regex(r'Kausi (\d+)', description, 'season number', default=None))
- or int(season_number)),
+ or int_or_none(season_number)),
'episode_number': (traverse_obj(video_data, ('data', 'ongoing_ondemand', 'episode_number'), expected_type=int_or_none)
- or int(episode_number)),
+ or int_or_none(episode_number)),
'thumbnails': traverse_obj(info, ('thumbnails', ..., {'url': 'url'})),
'age_limit': traverse_obj(video_data, ('data', 'ongoing_ondemand', 'content_rating', 'age_restriction'), expected_type=int_or_none),
'subtitles': subtitles,
+ 'release_date': unified_strdate(traverse_obj(video_data, ('data', 'ongoing_ondemand', 'start_time'), expected_type=str)),
}
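The restructured _real_extract now branches: if the preview JSON names a Kaltura entry, the result is delegated to KalturaIE via a smuggled url_transparent result, otherwise the HLS manifest from the same JSON is used directly; the int() to int_or_none() swaps further down also stop a crash when the title regex yields no season or episode number. A hedged sketch of just the branch (extract_hls is a hypothetical stand-in for _extract_m3u8_formats):

def build_info(video_id, preview, extract_hls):
    ongoing = (preview.get('data') or {}).get('ongoing_ondemand') or {}
    kaltura_id = (ongoing.get('kaltura') or {}).get('id')
    if isinstance(kaltura_id, str):
        return {'_type': 'url_transparent', 'url': f'kaltura:1955031:{kaltura_id}'}
    return {'id': video_id, 'formats': extract_hls(ongoing['manifest_url'], video_id)}

print(build_info(
    '1-64829589',
    {'data': {'ongoing_ondemand': {'manifest_url': 'https://example.invalid/index.m3u8'}}},
    lambda url, vid: [{'url': url, 'ext': 'mp4'}]))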
diff --git a/hypervideo_dl/extractor/youku.py b/hypervideo_dl/extractor/youku.py
index 624975b..7ecd9f1 100644
--- a/hypervideo_dl/extractor/youku.py
+++ b/hypervideo_dl/extractor/youku.py
@@ -6,6 +6,7 @@ import time
from .common import InfoExtractor
from ..utils import (
ExtractorError,
+ clean_html,
get_element_by_class,
js_to_json,
str_or_none,
@@ -26,49 +27,9 @@ class YoukuIE(InfoExtractor):
'''
_TESTS = [{
- # MD5 is unstable
- 'url': 'http://v.youku.com/v_show/id_XMTc1ODE5Njcy.html',
- 'info_dict': {
- 'id': 'XMTc1ODE5Njcy',
- 'title': '★Smile﹗♡ Git Fresh -Booty Music舞蹈.',
- 'ext': 'mp4',
- 'duration': 74.73,
- 'thumbnail': r're:^https?://.*',
- 'uploader': '。躲猫猫、',
- 'uploader_id': '36017967',
- 'uploader_url': 'http://i.youku.com/u/UMTQ0MDcxODY4',
- 'tags': list,
- }
- }, {
'url': 'http://player.youku.com/player.php/sid/XNDgyMDQ2NTQw/v.swf',
'only_matching': True,
}, {
- 'url': 'http://v.youku.com/v_show/id_XODgxNjg1Mzk2_ev_1.html',
- 'info_dict': {
- 'id': 'XODgxNjg1Mzk2',
- 'ext': 'mp4',
- 'title': '武媚娘传奇 85',
- 'duration': 1999.61,
- 'thumbnail': r're:^https?://.*',
- 'uploader': '疯狂豆花',
- 'uploader_id': '62583473',
- 'uploader_url': 'http://i.youku.com/u/UMjUwMzMzODky',
- 'tags': list,
- },
- }, {
- 'url': 'http://v.youku.com/v_show/id_XMTI1OTczNDM5Mg==.html',
- 'info_dict': {
- 'id': 'XMTI1OTczNDM5Mg',
- 'ext': 'mp4',
- 'title': '花千骨 04',
- 'duration': 2363,
- 'thumbnail': r're:^https?://.*',
- 'uploader': '放剧场-花千骨',
- 'uploader_id': '772849359',
- 'uploader_url': 'http://i.youku.com/u/UMzA5MTM5NzQzNg==',
- 'tags': list,
- },
- }, {
'url': 'http://v.youku.com/v_show/id_XNjA1NzA2Njgw.html',
'note': 'Video protected with password',
'info_dict': {
@@ -81,6 +42,7 @@ class YoukuIE(InfoExtractor):
'uploader_id': '322014285',
'uploader_url': 'http://i.youku.com/u/UMTI4ODA1NzE0MA==',
'tags': list,
+ 'skip': '404',
},
'params': {
'videopassword': '100600',
@@ -96,31 +58,41 @@ class YoukuIE(InfoExtractor):
'thumbnail': r're:^https?://.*',
'uploader': '明月庄主moon',
'uploader_id': '38465621',
- 'uploader_url': 'http://i.youku.com/u/UMTUzODYyNDg0',
+ 'uploader_url': 'https://www.youku.com/profile/index/?uid=UMTUzODYyNDg0',
'tags': list,
},
}, {
- 'url': 'http://video.tudou.com/v/XMjIyNzAzMTQ4NA==.html?f=46177805',
+ 'url': 'https://v.youku.com/v_show/id_XNTA2NTA0MjA1Mg==.html',
'info_dict': {
- 'id': 'XMjIyNzAzMTQ4NA',
+ 'id': 'XNTA2NTA0MjA1Mg',
'ext': 'mp4',
- 'title': '卡马乔国足开大脚长传冲吊集锦',
- 'duration': 289,
+ 'title': 'Minecraft我的世界:建造超大巨型航空飞机,菜鸟vs高手vs黑客',
+ 'duration': 542.13,
'thumbnail': r're:^https?://.*',
- 'uploader': '阿卜杜拉之星',
- 'uploader_id': '2382249',
- 'uploader_url': 'http://i.youku.com/u/UOTUyODk5Ng==',
+ 'uploader': '波哥游戏解说',
+ 'uploader_id': '156688084',
+ 'uploader_url': 'https://www.youku.com/profile/index/?uid=UNjI2NzUyMzM2',
'tags': list,
},
}, {
- 'url': 'http://video.tudou.com/v/XMjE4ODI3OTg2MA==.html',
- 'only_matching': True,
+ 'url': 'https://v.youku.com/v_show/id_XNTE1MzczOTg4MA==.html',
+ 'info_dict': {
+ 'id': 'XNTE1MzczOTg4MA',
+ 'ext': 'mp4',
+ 'title': '国产超A特工片',
+ 'duration': 362.97,
+ 'thumbnail': r're:^https?://.*',
+ 'uploader': '陈晓娟说历史',
+ 'uploader_id': '1640913339',
+ 'uploader_url': 'https://www.youku.com/profile/index/?uid=UNjU2MzY1MzM1Ng==',
+ 'tags': list,
+ },
}]
@staticmethod
def get_ysuid():
- return '%d%s' % (int(time.time()), ''.join([
- random.choice(string.ascii_letters) for i in range(3)]))
+ return '%d%s' % (int(time.time()), ''.join(
+ random.choices(string.ascii_letters, k=3)))
def get_format_name(self, fm):
_dict = {
@@ -151,7 +123,7 @@ class YoukuIE(InfoExtractor):
# request basic data
basic_data_params = {
'vid': video_id,
- 'ccode': '0532',
+ 'ccode': '0524',
'client_ip': '192.168.1.1',
'utid': cna,
'client_ts': time.time() / 1000,
@@ -182,7 +154,7 @@ class YoukuIE(InfoExtractor):
else:
msg = 'Youku server reported error %i' % error.get('code')
if error_note is not None:
- msg += ': ' + error_note
+ msg += ': ' + clean_html(error_note)
raise ExtractorError(msg)
# get video title
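Besides the ccode bump (0532 to 0524) and the clean_html() wrapping of server error notes, get_ysuid() is modernised: random.choices(..., k=3) replaces a comprehension over random.choice. The helper is small enough to run standalone:

import random
import string
import time

def get_ysuid():
    # "<unix timestamp><3 random ASCII letters>", e.g. '1693766400Xqz'
    return '%d%s' % (int(time.time()), ''.join(
        random.choices(string.ascii_letters, k=3)))

print(get_ysuid())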
diff --git a/hypervideo_dl/extractor/youporn.py b/hypervideo_dl/extractor/youporn.py
index 8f1b991..6ee0abc 100644
--- a/hypervideo_dl/extractor/youporn.py
+++ b/hypervideo_dl/extractor/youporn.py
@@ -6,6 +6,7 @@ from ..utils import (
int_or_none,
merge_dicts,
str_to_int,
+ traverse_obj,
unified_strdate,
url_or_none,
)
@@ -86,32 +87,31 @@ class YouPornIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = self._match_valid_url(url)
- video_id = mobj.group('id')
- display_id = mobj.group('display_id') or video_id
-
+ video_id, display_id = self._match_valid_url(url).group('id', 'display_id')
definitions = self._download_json(
- 'https://www.youporn.com/api/video/media_definitions/%s/' % video_id,
- display_id)
+ f'https://www.youporn.com/api/video/media_definitions/{video_id}/', display_id or video_id)
+
+ def get_format_data(data, f):
+ return traverse_obj(data, lambda _, v: v['format'] == f and url_or_none(v['videoUrl']))
formats = []
- for definition in definitions:
- if not isinstance(definition, dict):
- continue
- video_url = url_or_none(definition.get('videoUrl'))
- if not video_url:
- continue
- f = {
- 'url': video_url,
- 'filesize': int_or_none(definition.get('videoSize')),
- }
+ # Try to extract only the actual master m3u8 first, avoiding the duplicate single-resolution "master" m3u8s
+ for hls_url in traverse_obj(get_format_data(definitions, 'hls'), (
+ lambda _, v: not isinstance(v['defaultQuality'], bool), 'videoUrl'), (..., 'videoUrl')):
+ formats.extend(self._extract_m3u8_formats(hls_url, video_id, 'mp4', fatal=False, m3u8_id='hls'))
+
+ for definition in get_format_data(definitions, 'mp4'):
+ f = traverse_obj(definition, {
+ 'url': 'videoUrl',
+ 'filesize': ('videoSize', {int_or_none})
+ })
height = int_or_none(definition.get('quality'))
# Video URL's path looks like this:
# /201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4
# /201012/17/505835/vl_240p_240k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4
# /videos/201703/11/109285532/1080P_4000K_109285532.mp4
# We can use it to extract some metadata
- mobj = re.search(r'(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+', video_url)
+ mobj = re.search(r'(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+', definition['videoUrl'])
if mobj:
if not height:
height = int(mobj.group('height'))
@@ -179,6 +179,7 @@ class YouPornIE(InfoExtractor):
'tags')
data = self._search_json_ld(webpage, video_id, expected_type='VideoObject', fatal=False)
+ data.pop('url', None)
return merge_dicts(data, {
'id': video_id,
'display_id': display_id,
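The rewritten YouPorn format loop hinges on the small get_format_data() helper: from the media_definitions list, keep only entries of one format whose videoUrl is a valid URL. Expressed without traverse_obj it is just a filtered comprehension (the prefix check below approximates url_or_none):

def get_format_data(definitions, fmt):
    return [d for d in definitions if isinstance(d, dict)
            and d.get('format') == fmt
            and str(d.get('videoUrl', '')).startswith(('http://', 'https://'))]

defs = [
    {'format': 'hls', 'videoUrl': 'https://cdn.example/master.m3u8', 'defaultQuality': 'auto'},
    {'format': 'mp4', 'videoUrl': 'https://cdn.example/720P_1500K_1.mp4', 'quality': '720'},
    {'format': 'mp4', 'videoUrl': 'not-a-url'},
]
print(len(get_format_data(defs, 'hls')))  # 1
print(len(get_format_data(defs, 'mp4')))  # 1 - the invalid URL is dropped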
diff --git a/hypervideo_dl/extractor/youtube.py b/hypervideo_dl/extractor/youtube.py
index f7e3c75..8d606b2 100644
--- a/hypervideo_dl/extractor/youtube.py
+++ b/hypervideo_dl/extractor/youtube.py
@@ -15,13 +15,13 @@ import sys
import threading
import time
import traceback
-import urllib.error
import urllib.parse
from .common import InfoExtractor, SearchInfoExtractor
from .openload import PhantomJSwrapper
from ..compat import functools
from ..jsinterp import JSInterpreter
+from ..networking.exceptions import HTTPError, network_exceptions
from ..utils import (
NO_DEFAULT,
ExtractorError,
@@ -41,7 +41,6 @@ from ..utils import (
join_nonempty,
js_to_json,
mimetype2ext,
- network_exceptions,
orderedSet,
parse_codecs,
parse_count,
@@ -66,6 +65,7 @@ from ..utils import (
variadic,
)
+STREAMING_DATA_CLIENT_NAME = '__hypervideo_dl_client'
# any clients starting with _ cannot be explicitly requested by the user
INNERTUBE_CLIENTS = {
'web': {
@@ -248,11 +248,16 @@ def _split_innertube_client(client_name):
return client_name, base, variant[0] if variant else None
+def short_client_name(client_name):
+ main, *parts = _split_innertube_client(client_name)[0].replace('embedscreen', 'e_s').split('_')
+ return join_nonempty(main[:4], ''.join(x[0] for x in parts)).upper()
+
+
def build_innertube_clients():
THIRD_PARTY = {
'embedUrl': 'https://www.youtube.com/', # Can be any valid URL
}
- BASE_CLIENTS = ('android', 'web', 'tv', 'ios', 'mweb')
+ BASE_CLIENTS = ('ios', 'android', 'web', 'tv', 'mweb')
priority = qualities(BASE_CLIENTS[::-1])
for client, ytcfg in tuple(INNERTUBE_CLIENTS.items()):
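The new short_client_name() helper compresses an innertube client name into a short uppercase tag for log output. A simplified replay that skips the _split_innertube_client() normalisation and hard-codes the '-' delimiter join_nonempty uses by default:

def short_client_name(client_name):
    main, *parts = client_name.replace('embedscreen', 'e_s').split('_')
    initials = ''.join(part[0] for part in parts)
    return '-'.join(p for p in (main[:4], initials) if p).upper()

print(short_client_name('web'))                  # WEB
print(short_client_name('android_music'))        # ANDR-M
print(short_client_name('web_embedded_player'))  # WEB-EP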
@@ -286,13 +291,14 @@ class BadgeType(enum.Enum):
AVAILABILITY_PREMIUM = enum.auto()
AVAILABILITY_SUBSCRIPTION = enum.auto()
LIVE_NOW = enum.auto()
+ VERIFIED = enum.auto()
class YoutubeBaseInfoExtractor(InfoExtractor):
"""Provide base functions for Youtube extractors"""
_RESERVED_NAMES = (
- r'channel|c|user|playlist|watch|w|v|embed|e|watch_popup|clip|'
+ r'channel|c|user|playlist|watch|w|v|embed|e|live|watch_popup|clip|'
r'shorts|movies|results|search|shared|hashtag|trending|explore|feed|feeds|'
r'browse|oembed|get_video_info|iframe_api|s/player|source|'
r'storefront|oops|index|account|t/terms|about|upload|signin|logout')
@@ -312,6 +318,40 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
r'(?:www\.)?invidious\.pussthecat\.org',
r'(?:www\.)?invidious\.zee\.li',
r'(?:www\.)?invidious\.ethibox\.fr',
+ r'(?:www\.)?iv\.ggtyler\.dev',
+ r'(?:www\.)?inv\.vern\.i2p',
+ r'(?:www\.)?am74vkcrjp2d5v36lcdqgsj2m6x36tbrkhsruoegwfcizzabnfgf5zyd\.onion',
+ r'(?:www\.)?inv\.riverside\.rocks',
+ r'(?:www\.)?invidious\.silur\.me',
+ r'(?:www\.)?inv\.bp\.projectsegfau\.lt',
+ r'(?:www\.)?invidious\.g4c3eya4clenolymqbpgwz3q3tawoxw56yhzk4vugqrl6dtu3ejvhjid\.onion',
+ r'(?:www\.)?invidious\.slipfox\.xyz',
+ r'(?:www\.)?invidious\.esmail5pdn24shtvieloeedh7ehz3nrwcdivnfhfcedl7gf4kwddhkqd\.onion',
+ r'(?:www\.)?inv\.vernccvbvyi5qhfzyqengccj7lkove6bjot2xhh5kajhwvidqafczrad\.onion',
+ r'(?:www\.)?invidious\.tiekoetter\.com',
+ r'(?:www\.)?iv\.odysfvr23q5wgt7i456o5t3trw2cw5dgn56vbjfbq2m7xsc5vqbqpcyd\.onion',
+ r'(?:www\.)?invidious\.nerdvpn\.de',
+ r'(?:www\.)?invidious\.weblibre\.org',
+ r'(?:www\.)?inv\.odyssey346\.dev',
+ r'(?:www\.)?invidious\.dhusch\.de',
+ r'(?:www\.)?iv\.melmac\.space',
+ r'(?:www\.)?watch\.thekitty\.zone',
+ r'(?:www\.)?invidious\.privacydev\.net',
+ r'(?:www\.)?ng27owmagn5amdm7l5s3rsqxwscl5ynppnis5dqcasogkyxcfqn7psid\.onion',
+ r'(?:www\.)?invidious\.drivet\.xyz',
+ r'(?:www\.)?vid\.priv\.au',
+ r'(?:www\.)?euxxcnhsynwmfidvhjf6uzptsmh4dipkmgdmcmxxuo7tunp3ad2jrwyd\.onion',
+ r'(?:www\.)?inv\.vern\.cc',
+ r'(?:www\.)?invidious\.esmailelbob\.xyz',
+ r'(?:www\.)?invidious\.sethforprivacy\.com',
+ r'(?:www\.)?yt\.oelrichsgarcia\.de',
+ r'(?:www\.)?yt\.artemislena\.eu',
+ r'(?:www\.)?invidious\.flokinet\.to',
+ r'(?:www\.)?invidious\.baczek\.me',
+ r'(?:www\.)?y\.com\.sb',
+ r'(?:www\.)?invidious\.epicsite\.xyz',
+ r'(?:www\.)?invidious\.lidarshield\.cloud',
+ r'(?:www\.)?yt\.funami\.tech',
r'(?:www\.)?invidious\.3o7z6yfxhbw7n3za4rss6l434kmv55cgw2vuziwuigpwegswvwzqipyd\.onion',
r'(?:www\.)?osbivz6guyeahrwp2lnwyjk2xos342h4ocsxyqrlaopqjuhwn2djiiyd\.onion',
r'(?:www\.)?u2cvlit75owumwpy4dj2hsmvkq7nvrclkpht7xgyye2pyoxhpmclkrad\.onion',
@@ -390,6 +430,17 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
r'(?:www\.)?piped\.qdi\.fi',
r'(?:www\.)?piped\.video',
r'(?:www\.)?piped\.aeong\.one',
+ r'(?:www\.)?piped\.moomoo\.me',
+ r'(?:www\.)?piped\.chauvet\.pro',
+ r'(?:www\.)?watch\.leptons\.xyz',
+ r'(?:www\.)?pd\.vern\.cc',
+ r'(?:www\.)?piped\.hostux\.net',
+ r'(?:www\.)?piped\.lunar\.icu',
+ # Hyperpipe instances from https://hyperpipe.codeberg.page/
+ r'(?:www\.)?hyperpipe\.surge\.sh',
+ r'(?:www\.)?hyperpipe\.esmailelbob\.xyz',
+ r'(?:www\.)?listen\.whatever\.social',
+ r'(?:www\.)?music\.adminforge\.de',
)
# extracted from account/account_menu ep
@@ -406,6 +457,23 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
_IGNORED_WARNINGS = {'Unavailable videos will be hidden during playback'}
+ _YT_HANDLE_RE = r'@[\w.-]{3,30}' # https://support.google.com/youtube/answer/11585688?hl=en
+ _YT_CHANNEL_UCID_RE = r'UC[\w-]{22}'
+
+ def ucid_or_none(self, ucid):
+ return self._search_regex(rf'^({self._YT_CHANNEL_UCID_RE})$', ucid, 'UC-id', default=None)
+
+ def handle_or_none(self, handle):
+ return self._search_regex(rf'^({self._YT_HANDLE_RE})$', handle, '@-handle', default=None)
+
+ def handle_from_url(self, url):
+ return self._search_regex(rf'^(?:https?://(?:www\.)?youtube\.com)?/({self._YT_HANDLE_RE})',
+ url, 'channel handle', default=None)
+
+ def ucid_from_url(self, url):
+ return self._search_regex(rf'^(?:https?://(?:www\.)?youtube\.com)?/({self._YT_CHANNEL_UCID_RE})',
+ url, 'channel id', default=None)
+
@functools.cached_property
def _preferred_lang(self):
"""
@@ -428,16 +496,10 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
cookies = self._get_cookies('https://www.youtube.com/')
if cookies.get('__Secure-3PSID'):
return
- consent_id = None
- consent = cookies.get('CONSENT')
- if consent:
- if 'YES' in consent.value:
- return
- consent_id = self._search_regex(
- r'PENDING\+(\d+)', consent.value, 'consent', default=None)
- if not consent_id:
- consent_id = random.randint(100, 999)
- self._set_cookie('.youtube.com', 'CONSENT', 'YES+cb.20210328-17-p0.en+FX+%s' % consent_id)
+ socs = cookies.get('SOCS')
+ if socs and not socs.value.startswith('CAA'): # not consented
+ return
+ self._set_cookie('.youtube.com', 'SOCS', 'CAI', secure=True) # accept all (required for mixes)
def _initialize_pref(self):
cookies = self._get_cookies('https://www.youtube.com/')
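The consent bootstrap switches from the legacy CONSENT cookie to SOCS: if SOCS is missing or still carries the 'CAA' (pending) prefix, SOCS=CAI ("accept all") is planted; any other value means a choice was already recorded and is left alone. The decision in isolation:

def should_set_socs(socs_value):
    # mirrors: `if socs and not socs.value.startswith('CAA'): return`
    return socs_value is None or socs_value.startswith('CAA')

print(should_set_socs(None))      # True  -> set SOCS=CAI
print(should_set_socs('CAAfoo'))  # True  -> consent still pending
print(should_set_socs('CAIbar'))  # False -> leave the cookie as-is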
@@ -723,17 +785,26 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
def _extract_and_report_alerts(self, data, *args, **kwargs):
return self._report_alerts(self._extract_alerts(data), *args, **kwargs)
- def _extract_badges(self, renderer: dict):
- privacy_icon_map = {
+ def _extract_badges(self, badge_list: list):
+ """
+ Extract known BadgeType's from a list of badge renderers.
+ @returns [{'type': BadgeType}]
+ """
+ icon_type_map = {
'PRIVACY_UNLISTED': BadgeType.AVAILABILITY_UNLISTED,
'PRIVACY_PRIVATE': BadgeType.AVAILABILITY_PRIVATE,
- 'PRIVACY_PUBLIC': BadgeType.AVAILABILITY_PUBLIC
+ 'PRIVACY_PUBLIC': BadgeType.AVAILABILITY_PUBLIC,
+ 'CHECK_CIRCLE_THICK': BadgeType.VERIFIED,
+ 'OFFICIAL_ARTIST_BADGE': BadgeType.VERIFIED,
+ 'CHECK': BadgeType.VERIFIED,
}
badge_style_map = {
'BADGE_STYLE_TYPE_MEMBERS_ONLY': BadgeType.AVAILABILITY_SUBSCRIPTION,
'BADGE_STYLE_TYPE_PREMIUM': BadgeType.AVAILABILITY_PREMIUM,
- 'BADGE_STYLE_TYPE_LIVE_NOW': BadgeType.LIVE_NOW
+ 'BADGE_STYLE_TYPE_LIVE_NOW': BadgeType.LIVE_NOW,
+ 'BADGE_STYLE_TYPE_VERIFIED': BadgeType.VERIFIED,
+ 'BADGE_STYLE_TYPE_VERIFIED_ARTIST': BadgeType.VERIFIED,
}
label_map = {
@@ -741,13 +812,15 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
'private': BadgeType.AVAILABILITY_PRIVATE,
'members only': BadgeType.AVAILABILITY_SUBSCRIPTION,
'live': BadgeType.LIVE_NOW,
- 'premium': BadgeType.AVAILABILITY_PREMIUM
+ 'premium': BadgeType.AVAILABILITY_PREMIUM,
+ 'verified': BadgeType.VERIFIED,
+ 'official artist channel': BadgeType.VERIFIED,
}
badges = []
- for badge in traverse_obj(renderer, ('badges', ..., 'metadataBadgeRenderer'), default=[]):
+ for badge in traverse_obj(badge_list, (..., lambda key, _: re.search(r'[bB]adgeRenderer$', key))):
badge_type = (
- privacy_icon_map.get(traverse_obj(badge, ('icon', 'iconType'), expected_type=str))
+ icon_type_map.get(traverse_obj(badge, ('icon', 'iconType'), expected_type=str))
or badge_style_map.get(traverse_obj(badge, 'style'))
)
if badge_type:
@@ -755,11 +828,12 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
continue
# fallback, won't work in some languages
- label = traverse_obj(badge, 'label', expected_type=str, default='')
+ label = traverse_obj(
+ badge, 'label', ('accessibilityData', 'label'), 'tooltip', 'iconTooltip', get_all=False, expected_type=str, default='')
for match, label_badge_type in label_map.items():
if match in label.lower():
- badges.append({'type': badge_type})
- continue
+ badges.append({'type': label_badge_type})
+ break
return badges
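_extract_badges() now accepts any list of badge renderers (badges, ownerBadges, ...) and matches every key ending in 'badgeRenderer' or 'BadgeRenderer', mapping icon types and styles onto BadgeType values. A toy version of the icon walk, with plain strings in place of the enum and only a subset of the mappings above:

import re

ICON_TYPE_MAP = {
    'PRIVACY_UNLISTED': 'unlisted',
    'CHECK_CIRCLE_THICK': 'verified',
    'OFFICIAL_ARTIST_BADGE': 'verified',
}

def extract_badges(badge_list):
    badges = []
    for entry in badge_list or []:
        for key, badge in entry.items():
            if not re.search(r'[bB]adgeRenderer$', key):
                continue
            badge_type = ICON_TYPE_MAP.get((badge.get('icon') or {}).get('iconType'))
            if badge_type:
                badges.append({'type': badge_type})
    return badges

owner_badges = [{'metadataBadgeRenderer': {'icon': {'iconType': 'CHECK_CIRCLE_THICK'}}}]
print(extract_badges(owner_badges))  # [{'type': 'verified'}]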
@@ -785,7 +859,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
runs = item
runs = runs[:min(len(runs), max_runs or len(runs))]
- text = ''.join(traverse_obj(runs, (..., 'text'), expected_type=str, default=[]))
+ text = ''.join(traverse_obj(runs, (..., 'text'), expected_type=str))
if text:
return text
@@ -805,7 +879,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
"""
thumbnails = []
for path in path_list or [()]:
- for thumbnail in traverse_obj(data, (*variadic(path), 'thumbnails', ...), default=[]):
+ for thumbnail in traverse_obj(data, (*variadic(path), 'thumbnails', ...)):
thumbnail_url = url_or_none(thumbnail.get('url'))
if not thumbnail_url:
continue
@@ -825,9 +899,16 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
def extract_relative_time(relative_time_text):
"""
Extracts a relative time from a string and converts it to a datetime object
- e.g. 'streamed 6 days ago', '5 seconds ago (edited)', 'updated today'
+ e.g. 'streamed 6 days ago', '5 seconds ago (edited)', 'updated today', '8 yr ago'
"""
- mobj = re.search(r'(?P<start>today|yesterday|now)|(?P<time>\d+)\s*(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?\s*ago', relative_time_text)
+
+ # XXX: this could be moved to a general function in utils.py
+ # The relative time text strings are roughly the same as what
+ # Javascript's Intl.RelativeTimeFormat function generates.
+ # See: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Intl/RelativeTimeFormat
+ mobj = re.search(
+ r'(?P<start>today|yesterday|now)|(?P<time>\d+)\s*(?P<unit>sec(?:ond)?|s|min(?:ute)?|h(?:our|r)?|d(?:ay)?|w(?:eek|k)?|mo(?:nth)?|y(?:ear|r)?)s?\s*ago',
+ relative_time_text)
if mobj:
start = mobj.group('start')
if start:
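The widened unit alternation is what lets strings like '8 yr ago' through; the old pattern only knew the fully spelled-out units. The new regex can be probed as-is:

import re

RELATIVE_TIME_RE = (
    r'(?P<start>today|yesterday|now)|(?P<time>\d+)\s*'
    r'(?P<unit>sec(?:ond)?|s|min(?:ute)?|h(?:our|r)?|d(?:ay)?|'
    r'w(?:eek|k)?|mo(?:nth)?|y(?:ear|r)?)s?\s*ago')

for text in ('8 yr ago', '5 seconds ago (edited)', 'updated today'):
    mobj = re.search(RELATIVE_TIME_RE, text)
    print(text, '->', mobj.group('start') or (mobj.group('time'), mobj.group('unit')))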
@@ -871,15 +952,15 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
except ExtractorError as e:
if not isinstance(e.cause, network_exceptions):
return self._error_or_warning(e, fatal=fatal)
- elif not isinstance(e.cause, urllib.error.HTTPError):
+ elif not isinstance(e.cause, HTTPError):
retry.error = e
continue
- first_bytes = e.cause.read(512)
+ first_bytes = e.cause.response.read(512)
if not is_html(first_bytes):
yt_error = try_get(
self._parse_json(
- self._webpage_read_content(e.cause, None, item_id, prefix=first_bytes) or '{}', item_id, fatal=False),
+ self._webpage_read_content(e.cause.response, None, item_id, prefix=first_bytes) or '{}', item_id, fatal=False),
lambda x: x['error']['message'], str)
if yt_error:
self._report_alerts([('ERROR', yt_error)], fatal=False)
@@ -887,7 +968,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
# Sometimes a 404 is also received. See: https://github.com/ytdl-org/youtube-dl/issues/28289
# We also want to catch all other network exceptions since errors in later pages can be troublesome
# See https://github.com/hypervideo/hypervideo/issues/507#issuecomment-880188210
- if e.cause.code not in (403, 429):
+ if e.cause.status not in (403, 429):
retry.error = e
continue
return self._error_or_warning(e, fatal=fatal)
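The retry loop is ported to the new networking layer: the cause is now networking.exceptions.HTTPError, the body is read via e.cause.response.read(), and the status code lives in e.cause.status instead of e.cause.code. The same status-based triage with the stdlib exception, for illustration (e.code plays the role of e.cause.status here):

import urllib.error
import urllib.request

def fetch_with_triage(url):
    try:
        with urllib.request.urlopen(url) as resp:
            return resp.read()
    except urllib.error.HTTPError as e:
        if e.code in (403, 429):   # non-retryable in the loop above
            raise
        return None                # anything else -> let the caller retry

# fetch_with_triage('https://example.com/')  # network call, left commented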
@@ -911,7 +992,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
@staticmethod
def is_music_url(url):
- return re.match(r'https?://music\.youtube\.com/', url) is not None
+ return re.match(r'(https?://)?music\.youtube\.com/', url) is not None
def _extract_video(self, renderer):
video_id = renderer.get('videoId')
@@ -940,11 +1021,13 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
if not channel_id:
channel_id = traverse_obj(reel_header_renderer, ('channelNavigationEndpoint', 'browseEndpoint', 'browseId'))
+ channel_id = self.ucid_or_none(channel_id)
+
overlay_style = traverse_obj(
renderer, ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'style'),
get_all=False, expected_type=str)
- badges = self._extract_badges(renderer)
-
+ badges = self._extract_badges(traverse_obj(renderer, 'badges'))
+ owner_badges = self._extract_badges(traverse_obj(renderer, 'ownerBadges'))
navigation_url = urljoin('https://www.youtube.com/', traverse_obj(
renderer, ('navigationEndpoint', 'commandMetadata', 'webCommandMetadata', 'url'),
expected_type=str)) or ''
@@ -968,6 +1051,13 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
else self._get_count({'simpleText': view_count_text}))
view_count_field = 'concurrent_view_count' if live_status in ('is_live', 'is_upcoming') else 'view_count'
+ channel = (self._get_text(renderer, 'ownerText', 'shortBylineText')
+ or self._get_text(reel_header_renderer, 'channelTitleText'))
+
+ channel_handle = traverse_obj(renderer, (
+ 'shortBylineText', 'runs', ..., 'navigationEndpoint',
+ (('commandMetadata', 'webCommandMetadata', 'url'), ('browseEndpoint', 'canonicalBaseUrl'))),
+ expected_type=self.handle_from_url, get_all=False)
return {
'_type': 'url',
'ie_key': YoutubeIE.ie_key(),
@@ -977,9 +1067,11 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
'description': description,
'duration': duration,
'channel_id': channel_id,
- 'channel': (self._get_text(renderer, 'ownerText', 'shortBylineText')
- or self._get_text(reel_header_renderer, 'channelTitleText')),
+ 'channel': channel,
'channel_url': f'https://www.youtube.com/channel/{channel_id}' if channel_id else None,
+ 'uploader': channel,
+ 'uploader_id': channel_handle,
+ 'uploader_url': format_field(channel_handle, None, 'https://www.youtube.com/%s', default=None),
'thumbnails': self._extract_thumbnails(renderer, 'thumbnail'),
'timestamp': (self._parse_time_text(time_text)
if self._configuration_arg('approximate_date', ie_key=YoutubeTabIE)
@@ -993,7 +1085,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
needs_subscription=self._has_badge(badges, BadgeType.AVAILABILITY_SUBSCRIPTION) or None,
is_unlisted=self._has_badge(badges, BadgeType.AVAILABILITY_UNLISTED) or None),
view_count_field: view_count,
- 'live_status': live_status
+ 'live_status': live_status,
+ 'channel_is_verified': True if self._has_badge(owner_badges, BadgeType.VERIFIED) else None
}
@@ -1012,7 +1105,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
youtube\.googleapis\.com)/ # the various hostnames, with wildcard subdomains
(?:.*?\#/)? # handle anchor (#/) redirect urls
(?: # the various things that can precede the ID:
- (?:(?:v|embed|e|shorts)/(?!videoseries|live_stream)) # v/ or embed/ or e/ or shorts/
+ (?:(?:v|embed|e|shorts|live)/(?!videoseries|live_stream)) # v/ or embed/ or e/ or shorts/ or live/
|(?: # or the v= param in all its forms
(?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
(?:\?|\#!?) # the params delimiter ? or # or #!
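The only change to _VALID_URL is the new 'live' alternative, so /live/<id> watch pages are recognised alongside /v/, /embed/, /e/ and /shorts/. A trimmed probe of just that branch, with an arbitrary 11-character ID (the full pattern has many more alternatives):

import re

LIVE_URL_RE = (r'https?://(?:www\.)?youtube\.com/'
               r'(?:v|embed|e|shorts|live)/(?!videoseries|live_stream)'
               r'(?P<id>[0-9A-Za-z_-]{11})')

mobj = re.match(LIVE_URL_RE, 'https://www.youtube.com/live/Q0JlKFuHV5o')
print(mobj and mobj.group('id'))  # Q0JlKFuHV5o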
@@ -1181,9 +1274,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'id': 'BaW_jenozKc',
'ext': 'mp4',
'title': 'youtube-dl test video "\'/\\ä↭𝕐',
- 'uploader': 'Philipp Hagemeister',
- 'uploader_id': 'phihag',
- 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
'channel': 'Philipp Hagemeister',
'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
@@ -1202,7 +1292,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'start_time': 1,
'end_time': 9,
'comment_count': int,
- 'channel_follower_count': int
+ 'channel_follower_count': int,
+ 'uploader': 'Philipp Hagemeister',
+ 'uploader_url': 'https://www.youtube.com/@PhilippHagemeister',
+ 'uploader_id': '@PhilippHagemeister',
+ 'heatmap': 'count:100',
}
},
{
@@ -1214,9 +1308,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'upload_date': '20120608',
'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
- 'uploader': 'SET India',
- 'uploader_id': 'setindia',
- 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
'age_limit': 18,
},
'skip': 'Private video',
@@ -1228,9 +1319,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'id': 'BaW_jenozKc',
'ext': 'mp4',
'title': 'youtube-dl test video "\'/\\ä↭𝕐',
- 'uploader': 'Philipp Hagemeister',
- 'uploader_id': 'phihag',
- 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
'channel': 'Philipp Hagemeister',
'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
@@ -1247,7 +1335,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'live_status': 'not_live',
'age_limit': 0,
'comment_count': int,
- 'channel_follower_count': int
+ 'channel_follower_count': int,
+ 'uploader': 'Philipp Hagemeister',
+ 'uploader_url': 'https://www.youtube.com/@PhilippHagemeister',
+ 'uploader_id': '@PhilippHagemeister',
+ 'heatmap': 'count:100',
},
'params': {
'skip_download': True,
@@ -1260,10 +1352,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'id': 'a9LDPn-MO4I',
'ext': 'm4a',
'upload_date': '20121002',
- 'uploader_id': '8KVIDEO',
- 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
'description': '',
- 'uploader': '8KVIDEO',
'title': 'UHDTV TEST 8K VIDEO.mp4'
},
'params': {
@@ -1281,8 +1370,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',
'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',
'duration': 244,
- 'uploader': 'AfrojackVEVO',
- 'uploader_id': 'AfrojackVEVO',
'upload_date': '20131011',
'abr': 129.495,
'like_count': int,
@@ -1294,13 +1381,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'live_status': 'not_live',
'thumbnail': 'https://i.ytimg.com/vi_webp/IB3lcPjvWLA/maxresdefault.webp',
'channel': 'Afrojack',
- 'uploader_url': 'http://www.youtube.com/user/AfrojackVEVO',
'tags': 'count:19',
'availability': 'public',
'categories': ['Music'],
'age_limit': 0,
'alt_title': 'The Spark',
- 'channel_follower_count': int
+ 'channel_follower_count': int,
+ 'uploader': 'Afrojack',
+ 'uploader_url': 'https://www.youtube.com/@Afrojack',
+ 'uploader_id': '@Afrojack',
},
'params': {
'youtube_include_dash_manifest': True,
@@ -1317,9 +1406,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
'duration': 142,
- 'uploader': 'The Witcher',
- 'uploader_id': 'WitcherGame',
- 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
'upload_date': '20140605',
'age_limit': 18,
'categories': ['Gaming'],
@@ -1333,7 +1419,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'channel_id': 'UCzybXLxv08IApdjdN0mJhEg',
'playable_in_embed': True,
'view_count': int,
- 'channel_follower_count': int
+ 'channel_follower_count': int,
+ 'uploader': 'The Witcher',
+ 'uploader_url': 'https://www.youtube.com/@thewitcher',
+ 'uploader_id': '@thewitcher',
+ 'comment_count': int,
+ 'channel_is_verified': True,
+ 'heatmap': 'count:100',
},
},
{
@@ -1345,12 +1437,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'title': 'Godzilla 2 (Official Video)',
'description': 'md5:bf77e03fcae5529475e500129b05668a',
'upload_date': '20200408',
- 'uploader_id': 'FlyingKitty900',
- 'uploader': 'FlyingKitty',
'age_limit': 18,
'availability': 'needs_auth',
'channel_id': 'UCYQT13AtrJC0gsM1far_zJg',
- 'uploader_url': 'http://www.youtube.com/user/FlyingKitty900',
'channel': 'FlyingKitty',
'channel_url': 'https://www.youtube.com/channel/UCYQT13AtrJC0gsM1far_zJg',
'view_count': int,
@@ -1361,7 +1450,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'like_count': int,
'duration': 177,
'playable_in_embed': True,
- 'channel_follower_count': int
+ 'channel_follower_count': int,
+ 'uploader': 'FlyingKitty',
+ 'uploader_url': 'https://www.youtube.com/@FlyingKitty900',
+ 'uploader_id': '@FlyingKitty900',
+ 'comment_count': int,
+ 'channel_is_verified': True,
},
},
{
@@ -1372,13 +1466,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'title': '[MMD] Adios - EVERGLOW [+Motion DL]',
'ext': 'mp4',
'upload_date': '20191228',
- 'uploader_id': 'UC1yoRdFoFJaCY-AGfD9W0wQ',
- 'uploader': 'Projekt Melody',
'description': 'md5:17eccca93a786d51bc67646756894066',
'age_limit': 18,
'like_count': int,
'availability': 'needs_auth',
- 'uploader_url': 'http://www.youtube.com/channel/UC1yoRdFoFJaCY-AGfD9W0wQ',
'channel_id': 'UC1yoRdFoFJaCY-AGfD9W0wQ',
'view_count': int,
'thumbnail': 'https://i.ytimg.com/vi_webp/Tq92D6wQ1mg/sddefault.webp',
@@ -1390,7 +1481,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'duration': 106,
'channel_url': 'https://www.youtube.com/channel/UC1yoRdFoFJaCY-AGfD9W0wQ',
'comment_count': int,
- 'channel_follower_count': int
+ 'channel_follower_count': int,
+ 'uploader': 'Projekt Melody',
+ 'uploader_url': 'https://www.youtube.com/@ProjektMelody',
+ 'uploader_id': '@ProjektMelody',
},
},
{
@@ -1400,8 +1494,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'id': 'MeJVWBSsPAY',
'ext': 'mp4',
'title': 'OOMPH! - Such Mich Find Mich (Lyrics)',
- 'uploader': 'Herr Lurik',
- 'uploader_id': 'st3in234',
'description': 'Fan Video. Music & Lyrics by OOMPH!.',
'upload_date': '20130730',
'track': 'Such mich find mich',
@@ -1418,11 +1510,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'channel_id': 'UCdR3RSDPqub28LjZx0v9-aA',
'categories': ['Music'],
'availability': 'public',
- 'uploader_url': 'http://www.youtube.com/user/st3in234',
'channel_url': 'https://www.youtube.com/channel/UCdR3RSDPqub28LjZx0v9-aA',
'live_status': 'not_live',
'artist': 'OOMPH!',
- 'channel_follower_count': int
+ 'channel_follower_count': int,
+ 'uploader': 'Herr Lurik',
+ 'uploader_url': 'https://www.youtube.com/@HerrLurik',
+ 'uploader_id': '@HerrLurik',
},
},
{
@@ -1439,11 +1533,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'ext': 'mp4',
'duration': 266,
'upload_date': '20100430',
- 'uploader_id': 'deadmau5',
- 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
'creator': 'deadmau5',
'description': 'md5:6cbcd3a92ce1bc676fc4d6ab4ace2336',
- 'uploader': 'deadmau5',
'title': 'Deadmau5 - Some Chords (HD)',
'alt_title': 'Some Chords',
'availability': 'public',
@@ -1461,7 +1552,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'channel_url': 'https://www.youtube.com/channel/UCYEK6xds6eo-3tr4xRdflmQ',
'categories': ['Music'],
'album': 'Some Chords',
- 'channel_follower_count': int
+ 'channel_follower_count': int,
+ 'uploader': 'deadmau5',
+ 'uploader_url': 'https://www.youtube.com/@deadmau5',
+ 'uploader_id': '@deadmau5',
},
'expected_warnings': [
'DASH manifest missing',
@@ -1475,10 +1569,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'ext': 'mp4',
'duration': 6085,
'upload_date': '20150827',
- 'uploader_id': 'olympic',
- 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
'description': 'md5:04bbbf3ccceb6795947572ca36f45904',
- 'uploader': 'Olympics',
'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
'like_count': int,
'release_timestamp': 1343767800,
@@ -1494,7 +1585,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'live_status': 'was_live',
'view_count': int,
'channel_url': 'https://www.youtube.com/channel/UCTl3QQTvqHFjurroKxexy2Q',
- 'channel_follower_count': int
+ 'channel_follower_count': int,
+ 'uploader': 'Olympics',
+ 'uploader_url': 'https://www.youtube.com/@Olympics',
+ 'uploader_id': '@Olympics',
+ 'channel_is_verified': True,
},
'params': {
'skip_download': 'requires avconv',
@@ -1509,10 +1604,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'stretched_ratio': 16 / 9.,
'duration': 85,
'upload_date': '20110310',
- 'uploader_id': 'AllenMeow',
- 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
- 'uploader': '孫ᄋᄅ',
'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
'playable_in_embed': True,
'channel': '孫ᄋᄅ',
@@ -1527,7 +1619,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'live_status': 'not_live',
'availability': 'unlisted',
'comment_count': int,
- 'channel_follower_count': int
+ 'channel_follower_count': int,
+ 'uploader': '孫ᄋᄅ',
+ 'uploader_url': 'https://www.youtube.com/@AllenMeow',
+ 'uploader_id': '@AllenMeow',
},
},
# url_encoded_fmt_stream_map is empty string
@@ -1539,8 +1634,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
'description': '',
'upload_date': '20150404',
- 'uploader_id': 'spbelect',
- 'uploader': 'Наблюдатели Петербурга',
},
'params': {
'skip_download': 'requires avconv',
@@ -1557,9 +1650,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'description': 'md5:116377fd2963b81ec4ce64b542173306',
'duration': 220,
'upload_date': '20150625',
- 'uploader_id': 'dorappi2000',
- 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
- 'uploader': 'dorappi2000',
'formats': 'mincount:31',
},
'skip': 'not actual anymore',
@@ -1572,9 +1662,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'id': 'CsmdDsKjzN8',
'ext': 'mp4',
'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
- 'uploader': 'Airtek',
'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
- 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
},
'params': {
@@ -1585,6 +1673,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
},
{
# Multifeed videos (multiple cameras), URL can be of any Camera
+ # TODO: fix multifeed titles
'url': 'https://www.youtube.com/watch?v=zaPI8MvL8pg',
'info_dict': {
'id': 'zaPI8MvL8pg',
@@ -1596,16 +1685,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'id': 'j5yGuxZ8lLU',
'ext': 'mp4',
'title': 'Terraria 1.2 Live Stream | Let\'s Play - Part 04 (Chris)',
- 'uploader': 'WiiLikeToPlay',
'description': 'md5:563ccbc698b39298481ca3c571169519',
- 'uploader_url': 'http://www.youtube.com/user/WiiRikeToPray',
'duration': 10120,
'channel_follower_count': int,
'channel_url': 'https://www.youtube.com/channel/UCN2XePorRokPB9TEgRZpddg',
'availability': 'public',
'playable_in_embed': True,
'upload_date': '20131105',
- 'uploader_id': 'WiiRikeToPray',
'categories': ['Gaming'],
'live_status': 'was_live',
'tags': 'count:24',
@@ -1618,17 +1704,18 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'channel': 'WiiLikeToPlay',
'view_count': int,
'release_date': '20131106',
+ 'uploader': 'WiiLikeToPlay',
+ 'uploader_id': '@WLTP',
+ 'uploader_url': 'https://www.youtube.com/@WLTP',
},
}, {
'info_dict': {
'id': 'zaPI8MvL8pg',
'ext': 'mp4',
'title': 'Terraria 1.2 Live Stream | Let\'s Play - Part 04 (Tyson)',
- 'uploader_id': 'WiiRikeToPray',
'availability': 'public',
'channel_url': 'https://www.youtube.com/channel/UCN2XePorRokPB9TEgRZpddg',
'channel': 'WiiLikeToPlay',
- 'uploader_url': 'http://www.youtube.com/user/WiiRikeToPray',
'channel_follower_count': int,
'description': 'md5:563ccbc698b39298481ca3c571169519',
'duration': 10108,
@@ -1636,7 +1723,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'like_count': int,
'tags': 'count:24',
'channel_id': 'UCN2XePorRokPB9TEgRZpddg',
- 'uploader': 'WiiLikeToPlay',
'release_timestamp': 1383701915,
'comment_count': int,
'upload_date': '20131105',
@@ -1646,6 +1732,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'live_status': 'was_live',
'categories': ['Gaming'],
'view_count': int,
+ 'uploader': 'WiiLikeToPlay',
+ 'uploader_id': '@WLTP',
+ 'uploader_url': 'https://www.youtube.com/@WLTP',
},
}, {
'info_dict': {
@@ -1659,12 +1748,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'playable_in_embed': True,
'upload_date': '20131105',
'description': 'md5:563ccbc698b39298481ca3c571169519',
- 'uploader_id': 'WiiRikeToPray',
- 'uploader_url': 'http://www.youtube.com/user/WiiRikeToPray',
'channel_follower_count': int,
'tags': 'count:24',
'release_date': '20131106',
- 'uploader': 'WiiLikeToPlay',
'comment_count': int,
'channel_url': 'https://www.youtube.com/channel/UCN2XePorRokPB9TEgRZpddg',
'channel': 'WiiLikeToPlay',
@@ -1674,6 +1760,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'age_limit': 0,
'duration': 10128,
'view_count': int,
+ 'uploader': 'WiiLikeToPlay',
+ 'uploader_id': '@WLTP',
+ 'uploader_url': 'https://www.youtube.com/@WLTP',
},
}],
'params': {'skip_download': True},
@@ -1710,9 +1799,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
'duration': 133,
'upload_date': '20151119',
- 'uploader_id': 'IronSoulElf',
- 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
- 'uploader': 'IronSoulElf',
'creator': 'Todd Haberman;\nDaniel Law Heath and Aaron Kaplan',
'track': 'Dark Walk',
'artist': 'Todd Haberman;\nDaniel Law Heath and Aaron Kaplan',
@@ -1749,8 +1835,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
'description': 'md5:ee18a25c350637c8faff806845bddee9',
'upload_date': '20151107',
- 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
- 'uploader': 'CH GAMER DROID',
},
'params': {
'skip_download': True,
@@ -1772,9 +1856,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'description': 'md5:a677553cf0840649b731a3024aeff4cc',
'duration': 721,
'upload_date': '20150128',
- 'uploader_id': 'BerkmanCenter',
- 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
- 'uploader': 'The Berkman Klein Center for Internet & Society',
'license': 'Creative Commons Attribution license (reuse allowed)',
'channel_id': 'UCuLGmD72gJDBwmLw06X58SA',
'channel_url': 'https://www.youtube.com/channel/UCuLGmD72gJDBwmLw06X58SA',
@@ -1788,16 +1869,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'thumbnail': 'https://i.ytimg.com/vi_webp/M4gD1WSo5mA/maxresdefault.webp',
'live_status': 'not_live',
'playable_in_embed': True,
- 'comment_count': int,
'channel_follower_count': int,
'chapters': list,
+ 'uploader': 'The Berkman Klein Center for Internet & Society',
+ 'uploader_id': '@BKCHarvard',
+ 'uploader_url': 'https://www.youtube.com/@BKCHarvard',
},
'params': {
'skip_download': True,
},
},
{
- # Channel-like uploader_url
'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
'info_dict': {
'id': 'eQcmzGIKrzg',
@@ -1806,9 +1888,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'description': 'md5:13a2503d7b5904ef4b223aa101628f39',
'duration': 4060,
'upload_date': '20151120',
- 'uploader': 'Bernie Sanders',
- 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
- 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
'license': 'Creative Commons Attribution license (reuse allowed)',
'playable_in_embed': True,
'tags': 'count:12',
@@ -1825,6 +1904,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'comment_count': int,
'channel_follower_count': int,
'chapters': list,
+ 'uploader': 'Bernie Sanders',
+ 'uploader_url': 'https://www.youtube.com/@BernieSanders',
+ 'uploader_id': '@BernieSanders',
+ 'channel_is_verified': True,
+ 'heatmap': 'count:100',
},
'params': {
'skip_download': True,
@@ -1848,9 +1932,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'title': 'Piku - Trailer',
'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
'upload_date': '20150811',
- 'uploader': 'FlixMatrix',
- 'uploader_id': 'FlixMatrixKaravan',
- 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
'license': 'Standard YouTube License',
},
'params': {
@@ -1868,9 +1949,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'description': 'md5:f540112edec5d09fc8cc752d3d4ba3cd',
'duration': 2085,
'upload_date': '20170118',
- 'uploader': 'Vsauce',
- 'uploader_id': 'Vsauce',
- 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
'series': 'Mind Field',
'season_number': 1,
'episode_number': 1,
@@ -1888,7 +1966,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'like_count': int,
'playable_in_embed': True,
'live_status': 'not_live',
- 'channel_follower_count': int
+ 'channel_follower_count': int,
+ 'uploader': 'Vsauce',
+ 'uploader_url': 'https://www.youtube.com/@Vsauce',
+ 'uploader_id': '@Vsauce',
+ 'comment_count': int,
+ 'channel_is_verified': True,
},
'params': {
'skip_download': True,
@@ -1908,9 +1991,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
'duration': 965,
'upload_date': '20140124',
- 'uploader': 'New Century Foundation',
- 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
- 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
},
'params': {
'skip_download': True,
@@ -1955,9 +2035,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
'duration': 433,
'upload_date': '20130923',
- 'uploader': 'Amelia Putri Harwita',
- 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
- 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
'formats': 'maxcount:10',
},
'params': {
@@ -1968,6 +2045,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
},
{
# Youtube Music Auto-generated description
+ # TODO: fix metadata extraction
'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
'info_dict': {
'id': 'MgNrAu2pzNs',
@@ -1975,8 +2053,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'title': 'Voyeur Girl',
'description': 'md5:7ae382a65843d6df2685993e90a8628f',
'upload_date': '20190312',
- 'uploader': 'Stephen - Topic',
- 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
'artist': 'Stephen',
'track': 'Voyeur Girl',
'album': 'it\'s too much love to know my dear',
@@ -1984,12 +2060,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'release_year': 2019,
'alt_title': 'Voyeur Girl',
'view_count': int,
- 'uploader_url': 'http://www.youtube.com/channel/UC-pWHpBjdGG69N9mM2auIAA',
'playable_in_embed': True,
'like_count': int,
'categories': ['Music'],
'channel_url': 'https://www.youtube.com/channel/UC-pWHpBjdGG69N9mM2auIAA',
- 'channel': 'Stephen',
+ 'channel': 'Stephen', # TODO: should be "Stephen - Topic"
+ 'uploader': 'Stephen',
'availability': 'public',
'creator': 'Stephen',
'duration': 169,
@@ -2017,9 +2093,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
'description': 'md5:bf577a41da97918e94fa9798d9228825',
'upload_date': '20090125',
- 'uploader': 'Prochorowka',
- 'uploader_id': 'Prochorowka',
- 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
'artist': 'Panjabi MC',
'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
'album': 'Beware of the Boys (Mundian To Bach Ke)',
@@ -2038,11 +2111,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'title': 'IMG 3456',
'description': '',
'upload_date': '20170613',
- 'uploader_id': 'ElevageOrVert',
- 'uploader': 'ElevageOrVert',
'view_count': int,
'thumbnail': 'https://i.ytimg.com/vi_webp/x41yOUIvK2k/maxresdefault.webp',
- 'uploader_url': 'http://www.youtube.com/user/ElevageOrVert',
'like_count': int,
'channel_id': 'UCo03ZQPBW5U4UC3regpt1nw',
'tags': [],
@@ -2053,8 +2123,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'duration': 7,
'playable_in_embed': True,
'live_status': 'not_live',
- 'channel': 'ElevageOrVert',
- 'channel_follower_count': int
+ 'channel': 'l\'Or Vert asbl',
+ 'channel_follower_count': int,
+ 'uploader': 'l\'Or Vert asbl',
+ 'uploader_url': 'https://www.youtube.com/@ElevageOrVert',
+ 'uploader_id': '@ElevageOrVert',
},
'params': {
'skip_download': True,
@@ -2072,11 +2145,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'title': 'Part 77 Sort a list of simple types in c#',
'description': 'md5:b8746fa52e10cdbf47997903f13b20dc',
'upload_date': '20130831',
- 'uploader_id': 'kudvenkat',
- 'uploader': 'kudvenkat',
'channel_id': 'UCCTVrRB5KpIiK6V2GGVsR1Q',
'like_count': int,
- 'uploader_url': 'http://www.youtube.com/user/kudvenkat',
'channel_url': 'https://www.youtube.com/channel/UCCTVrRB5KpIiK6V2GGVsR1Q',
'live_status': 'not_live',
'categories': ['Education'],
@@ -2091,6 +2161,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'comment_count': int,
'channel_follower_count': int,
'chapters': list,
+ 'uploader': 'kudvenkat',
+ 'uploader_url': 'https://www.youtube.com/@Csharp-video-tutorialsBlogspot',
+ 'uploader_id': '@Csharp-video-tutorialsBlogspot',
+ 'channel_is_verified': True,
+ 'heatmap': 'count:100',
},
'params': {
'skip_download': True,
@@ -2114,9 +2189,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'title': 'Burn Out',
'description': 'md5:8d07b84dcbcbfb34bc12a56d968b6131',
'upload_date': '20141120',
- 'uploader': 'The Cinematic Orchestra - Topic',
- 'uploader_id': 'UCIzsJBIyo8hhpFm1NK0uLgw',
- 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCIzsJBIyo8hhpFm1NK0uLgw',
'artist': 'The Cinematic Orchestra',
'track': 'Burn Out',
'album': 'Every Day',
@@ -2135,7 +2207,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'thumbnail': 'https://i.ytimg.com/vi/OtqTfy26tG0/maxresdefault.jpg',
'categories': ['Music'],
'playable_in_embed': True,
- 'channel_follower_count': int
+ 'channel_follower_count': int,
+ 'uploader': 'The Cinematic Orchestra',
+ 'comment_count': int,
},
'params': {
'skip_download': True,
@@ -2154,13 +2228,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'ext': 'mp4',
'title': 'San Diego teen commits suicide after bullying over embarrassing video',
'channel_id': 'UC-SJ6nODDmufqBzPBwCvYvQ',
- 'uploader': 'CBS Mornings',
- 'uploader_id': 'CBSThisMorning',
'upload_date': '20140716',
'description': 'md5:acde3a73d3f133fc97e837a9f76b53b7',
'duration': 170,
'categories': ['News & Politics'],
- 'uploader_url': 'http://www.youtube.com/user/CBSThisMorning',
'view_count': int,
'channel': 'CBS Mornings',
'tags': ['suicide', 'bullying', 'video', 'cbs', 'news'],
@@ -2171,7 +2242,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'like_count': int,
'live_status': 'not_live',
'playable_in_embed': True,
- 'channel_follower_count': int
+ 'channel_follower_count': int,
+ 'uploader': 'CBS Mornings',
+ 'uploader_url': 'https://www.youtube.com/@CBSMornings',
+ 'uploader_id': '@CBSMornings',
+ 'comment_count': int,
+ 'channel_is_verified': True,
}
},
{
@@ -2183,9 +2259,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'title': '4K Ueno Okachimachi Street Scenes 上野御徒町歩き',
'description': 'md5:ea770e474b7cd6722b4c95b833c03630',
'upload_date': '20201120',
- 'uploader': 'Walk around Japan',
- 'uploader_id': 'UC3o_t8PzBmXf5S9b7GLx1Mw',
- 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UC3o_t8PzBmXf5S9b7GLx1Mw',
'duration': 1456,
'categories': ['Travel & Events'],
'channel_id': 'UC3o_t8PzBmXf5S9b7GLx1Mw',
@@ -2198,7 +2271,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'channel_url': 'https://www.youtube.com/channel/UC3o_t8PzBmXf5S9b7GLx1Mw',
'live_status': 'not_live',
'playable_in_embed': True,
- 'channel_follower_count': int
+ 'channel_follower_count': int,
+ 'uploader': 'Walk around Japan',
+ 'uploader_url': 'https://www.youtube.com/@walkaroundjapan7124',
+ 'uploader_id': '@walkaroundjapan7124',
},
'params': {
'skip_download': True,
@@ -2224,13 +2300,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'ext': '3gp',
'upload_date': '20210624',
'channel_id': 'UCp68_FLety0O-n9QU6phsgw',
- 'uploader': 'colinfurze',
- 'uploader_id': 'colinfurze',
'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCp68_FLety0O-n9QU6phsgw',
'description': 'md5:5d5991195d599b56cd0c4148907eec50',
'duration': 596,
'categories': ['Entertainment'],
- 'uploader_url': 'http://www.youtube.com/user/colinfurze',
'view_count': int,
'channel': 'colinfurze',
'tags': ['Colin', 'furze', 'Terry', 'tunnel', 'underground', 'bunker'],
@@ -2242,6 +2315,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'playable_in_embed': True,
'channel_follower_count': int,
'chapters': list,
+ 'uploader': 'colinfurze',
+ 'uploader_url': 'https://www.youtube.com/@colinfurze',
+ 'uploader_id': '@colinfurze',
+ 'comment_count': int,
+ 'channel_is_verified': True,
+ 'heatmap': 'count:100',
},
'params': {
'format': '17', # 3gp format available on android
@@ -2267,10 +2346,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'ext': 'mhtml',
'format_id': 'sb0',
'title': 'Your Brain is Plastic',
- 'uploader_id': 'scishow',
'description': 'md5:89cd86034bdb5466cd87c6ba206cd2bc',
'upload_date': '20140324',
- 'uploader': 'SciShow',
'like_count': int,
'channel_id': 'UCZYTClx2T1of7BRZ86-8fow',
'channel_url': 'https://www.youtube.com/channel/UCZYTClx2T1of7BRZ86-8fow',
@@ -2278,7 +2355,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'thumbnail': 'https://i.ytimg.com/vi/5KLPxDtMqe8/maxresdefault.jpg',
'playable_in_embed': True,
'tags': 'count:12',
- 'uploader_url': 'http://www.youtube.com/user/scishow',
'availability': 'public',
'channel': 'SciShow',
'live_status': 'not_live',
@@ -2287,6 +2363,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'age_limit': 0,
'channel_follower_count': int,
'chapters': list,
+ 'uploader': 'SciShow',
+ 'uploader_url': 'https://www.youtube.com/@SciShow',
+ 'uploader_id': '@SciShow',
+ 'comment_count': int,
+ 'channel_is_verified': True,
+ 'heatmap': 'count:100',
}, 'params': {'format': 'mhtml', 'skip_download': True}
}, {
# Ensure video upload_date is in UTC timezone (video was uploaded 1641170939)
@@ -2296,9 +2378,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'ext': 'mp4',
'title': 'The NP that test your phone performance 🙂',
'description': 'md5:144494b24d4f9dfacb97c1bbef5de84d',
- 'uploader': 'Leon Nguyen',
- 'uploader_id': 'VNSXIII',
- 'uploader_url': 'http://www.youtube.com/user/VNSXIII',
'channel_id': 'UCRqNBSOHgilHfAczlUmlWHA',
'channel_url': 'https://www.youtube.com/channel/UCRqNBSOHgilHfAczlUmlWHA',
'duration': 21,
@@ -2314,7 +2393,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'channel': 'Leon Nguyen',
'thumbnail': 'https://i.ytimg.com/vi_webp/2NUZ8W2llS4/maxresdefault.webp',
'comment_count': int,
- 'channel_follower_count': int
+ 'channel_follower_count': int,
+ 'uploader': 'Leon Nguyen',
+ 'uploader_url': 'https://www.youtube.com/@LeonNguyen',
+ 'uploader_id': '@LeonNguyen',
+ 'heatmap': 'count:100',
}
}, {
# Same video as above, but with --compat-opt no-youtube-prefer-utc-upload-date
@@ -2324,9 +2407,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'ext': 'mp4',
'title': 'The NP that test your phone performance 🙂',
'description': 'md5:144494b24d4f9dfacb97c1bbef5de84d',
- 'uploader': 'Leon Nguyen',
- 'uploader_id': 'VNSXIII',
- 'uploader_url': 'http://www.youtube.com/user/VNSXIII',
'channel_id': 'UCRqNBSOHgilHfAczlUmlWHA',
'channel_url': 'https://www.youtube.com/channel/UCRqNBSOHgilHfAczlUmlWHA',
'duration': 21,
@@ -2342,7 +2422,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'channel': 'Leon Nguyen',
'thumbnail': 'https://i.ytimg.com/vi_webp/2NUZ8W2llS4/maxresdefault.webp',
'comment_count': int,
- 'channel_follower_count': int
+ 'channel_follower_count': int,
+ 'uploader': 'Leon Nguyen',
+ 'uploader_url': 'https://www.youtube.com/@LeonNguyen',
+ 'uploader_id': '@LeonNguyen',
+ 'heatmap': 'count:100',
},
'params': {'compat_opts': ['no-youtube-prefer-utc-upload-date']}
}, {
@@ -2352,10 +2436,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'id': 'mzZzzBU6lrM',
'ext': 'mp4',
'title': 'I Met GeorgeNotFound In Real Life...',
- 'description': 'md5:cca98a355c7184e750f711f3a1b22c84',
- 'uploader': 'Quackity',
- 'uploader_id': 'QuackityHQ',
- 'uploader_url': 'http://www.youtube.com/user/QuackityHQ',
+ 'description': 'md5:978296ec9783a031738b684d4ebf302d',
'channel_id': 'UC_8NknAFiyhOUaZqHR3lq3Q',
'channel_url': 'https://www.youtube.com/channel/UC_8NknAFiyhOUaZqHR3lq3Q',
'duration': 955,
@@ -2372,7 +2453,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'availability': 'public',
'channel': 'Quackity',
'thumbnail': 'https://i.ytimg.com/vi/mzZzzBU6lrM/maxresdefault.jpg',
- 'channel_follower_count': int
+ 'channel_follower_count': int,
+ 'uploader': 'Quackity',
+ 'uploader_id': '@Quackity',
+ 'uploader_url': 'https://www.youtube.com/@Quackity',
+ 'comment_count': int,
+ 'channel_is_verified': True,
+ 'heatmap': 'count:100',
}
},
{ # continuous livestream. Microformat upload date should be preferred.
@@ -2390,48 +2477,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'channel_url': 'https://www.youtube.com/channel/UC84whx2xxsiA1gXHXXqKGOA',
'live_status': 'is_live',
'thumbnail': 'https://i.ytimg.com/vi/kgx4WGK0oNU/maxresdefault.jpg',
- 'uploader': '阿鲍Abao',
- 'uploader_url': 'http://www.youtube.com/channel/UC84whx2xxsiA1gXHXXqKGOA',
'channel': 'Abao in Tokyo',
'channel_follower_count': int,
'release_date': '20211127',
'tags': 'count:39',
'categories': ['People & Blogs'],
'like_count': int,
- 'uploader_id': 'UC84whx2xxsiA1gXHXXqKGOA',
'view_count': int,
'playable_in_embed': True,
'description': 'md5:2ef1d002cad520f65825346e2084e49d',
'concurrent_view_count': int,
+ 'uploader': 'Abao in Tokyo',
+ 'uploader_url': 'https://www.youtube.com/@abaointokyo',
+ 'uploader_id': '@abaointokyo',
},
'params': {'skip_download': True}
}, {
- # Story. Requires specific player params to work.
- 'url': 'https://www.youtube.com/watch?v=vv8qTUWmulI',
- 'info_dict': {
- 'id': 'vv8qTUWmulI',
- 'ext': 'mp4',
- 'availability': 'unlisted',
- 'view_count': int,
- 'channel_id': 'UCzIZ8HrzDgc-pNQDUG6avBA',
- 'upload_date': '20220526',
- 'categories': ['Education'],
- 'title': 'Story',
- 'channel': 'IT\'S HISTORY',
- 'description': '',
- 'uploader_id': 'BlastfromthePast',
- 'duration': 12,
- 'uploader': 'IT\'S HISTORY',
- 'playable_in_embed': True,
- 'age_limit': 0,
- 'live_status': 'not_live',
- 'tags': [],
- 'thumbnail': 'https://i.ytimg.com/vi_webp/vv8qTUWmulI/maxresdefault.webp',
- 'uploader_url': 'http://www.youtube.com/user/BlastfromthePast',
- 'channel_url': 'https://www.youtube.com/channel/UCzIZ8HrzDgc-pNQDUG6avBA',
- },
- 'skip': 'stories get removed after some period of time',
- }, {
'url': 'https://www.youtube.com/watch?v=tjjjtzRLHvA',
'info_dict': {
'id': 'tjjjtzRLHvA',
@@ -2440,11 +2501,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'upload_date': '20220323',
'like_count': int,
'availability': 'unlisted',
- 'channel': 'nao20010128nao',
- 'thumbnail': 'https://i.ytimg.com/vi_webp/tjjjtzRLHvA/maxresdefault.webp',
+ 'channel': 'Lesmiscore',
+ 'thumbnail': r're:^https?://.*\.jpg',
'age_limit': 0,
- 'uploader': 'nao20010128nao',
- 'uploader_id': 'nao20010128nao',
'categories': ['Music'],
'view_count': int,
'description': '',
@@ -2455,7 +2514,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'channel_follower_count': int,
'duration': 6,
'tags': [],
- 'uploader_url': 'http://www.youtube.com/user/nao20010128nao',
+ 'uploader_id': '@lesmiscore',
+ 'uploader': 'Lesmiscore',
+ 'uploader_url': 'https://www.youtube.com/@lesmiscore',
}
}, {
# Prefer primary title+description language metadata by default
@@ -2473,16 +2534,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'like_count': int,
'playable_in_embed': True,
'availability': 'unlisted',
- 'thumbnail': 'https://i.ytimg.com/vi_webp/el3E4MbxRqQ/maxresdefault.webp',
+ 'thumbnail': r're:^https?://.*\.jpg',
'age_limit': 0,
'duration': 5,
- 'uploader_id': 'UCiu-3thuViMebBjw_5nWYrA',
- 'uploader_url': 'http://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA',
'live_status': 'not_live',
'upload_date': '20220908',
'categories': ['People & Blogs'],
- 'uploader': 'cole-dlp-test-acc',
'channel_id': 'UCiu-3thuViMebBjw_5nWYrA',
+ 'uploader_url': 'https://www.youtube.com/@coletdjnz',
+ 'uploader_id': '@coletdjnz',
+ 'uploader': 'cole-dlp-test-acc',
},
'params': {'skip_download': True}
}, {
@@ -2497,18 +2558,18 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'live_status': 'not_live',
'channel_id': 'UCiu-3thuViMebBjw_5nWYrA',
'upload_date': '20220728',
- 'uploader_id': 'UCiu-3thuViMebBjw_5nWYrA',
'view_count': int,
'categories': ['People & Blogs'],
- 'thumbnail': 'https://i.ytimg.com/vi_webp/gHKT4uU8Zng/maxresdefault.webp',
+ 'thumbnail': r're:^https?://.*\.jpg',
'title': 'dlp test video title translated (fr)',
'availability': 'public',
- 'uploader': 'cole-dlp-test-acc',
'age_limit': 0,
'description': 'dlp test video description translated (fr)',
'playable_in_embed': True,
'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA',
- 'uploader_url': 'http://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA',
+ 'uploader_url': 'https://www.youtube.com/@coletdjnz',
+ 'uploader_id': '@coletdjnz',
+ 'uploader': 'cole-dlp-test-acc',
},
'params': {'skip_download': True, 'extractor_args': {'youtube': {'lang': ['fr']}}},
'expected_warnings': [r'Preferring "fr" translated fields'],
@@ -2524,7 +2585,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'ext': 'mp4',
'categories': ['Entertainment'],
'description': 'md5:e8031ff6e426cdb6a77670c9b81f6fa6',
- 'uploader_url': 'http://www.youtube.com/user/MrBeast6000',
'live_status': 'not_live',
'duration': 937,
'channel_follower_count': int,
@@ -2534,17 +2594,118 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'playable_in_embed': True,
'view_count': int,
'upload_date': '20221112',
- 'uploader': 'MrBeast',
- 'uploader_id': 'MrBeast6000',
'channel_url': 'https://www.youtube.com/channel/UCX6OQ3DkcsbYNE6H8uQQuVA',
'age_limit': 0,
'availability': 'public',
'channel_id': 'UCX6OQ3DkcsbYNE6H8uQQuVA',
'like_count': int,
'tags': [],
+ 'uploader': 'MrBeast',
+ 'uploader_url': 'https://www.youtube.com/@MrBeast',
+ 'uploader_id': '@MrBeast',
+ 'comment_count': int,
+ 'channel_is_verified': True,
+ 'heatmap': 'count:100',
},
'params': {'extractor_args': {'youtube': {'player_client': ['ios']}}, 'format': '233-1'},
- }
+ }, {
+ 'note': 'Audio formats with Dynamic Range Compression',
+ 'url': 'https://www.youtube.com/watch?v=Tq92D6wQ1mg',
+ 'info_dict': {
+ 'id': 'Tq92D6wQ1mg',
+ 'ext': 'webm',
+ 'title': '[MMD] Adios - EVERGLOW [+Motion DL]',
+ 'channel_url': 'https://www.youtube.com/channel/UC1yoRdFoFJaCY-AGfD9W0wQ',
+ 'channel_id': 'UC1yoRdFoFJaCY-AGfD9W0wQ',
+ 'channel_follower_count': int,
+ 'description': 'md5:17eccca93a786d51bc67646756894066',
+ 'upload_date': '20191228',
+ 'tags': ['mmd', 'dance', 'mikumikudance', 'kpop', 'vtuber'],
+ 'playable_in_embed': True,
+ 'like_count': int,
+ 'categories': ['Entertainment'],
+ 'thumbnail': 'https://i.ytimg.com/vi/Tq92D6wQ1mg/sddefault.jpg',
+ 'age_limit': 18,
+ 'channel': 'Projekt Melody',
+ 'view_count': int,
+ 'availability': 'needs_auth',
+ 'comment_count': int,
+ 'live_status': 'not_live',
+ 'duration': 106,
+ 'uploader': 'Projekt Melody',
+ 'uploader_id': '@ProjektMelody',
+ 'uploader_url': 'https://www.youtube.com/@ProjektMelody',
+ },
+ 'params': {'extractor_args': {'youtube': {'player_client': ['tv_embedded']}}, 'format': '251-drc'},
+ },
+ {
+ 'url': 'https://www.youtube.com/live/qVv6vCqciTM',
+ 'info_dict': {
+ 'id': 'qVv6vCqciTM',
+ 'ext': 'mp4',
+ 'age_limit': 0,
+ 'comment_count': int,
+ 'chapters': 'count:13',
+ 'upload_date': '20221223',
+ 'thumbnail': 'https://i.ytimg.com/vi/qVv6vCqciTM/maxresdefault.jpg',
+ 'channel_url': 'https://www.youtube.com/channel/UCIdEIHpS0TdkqRkHL5OkLtA',
+ 'like_count': int,
+ 'release_date': '20221223',
+ 'tags': ['Vtuber', '月ノ美兎', '名取さな', 'にじさんじ', 'クリスマス', '3D配信'],
+ 'title': '【 #インターネット女クリスマス 】3Dで歌ってはしゃぐインターネットの女たち【月ノ美兎/名取さな】',
+ 'view_count': int,
+ 'playable_in_embed': True,
+ 'duration': 4438,
+ 'availability': 'public',
+ 'channel_follower_count': int,
+ 'channel_id': 'UCIdEIHpS0TdkqRkHL5OkLtA',
+ 'categories': ['Entertainment'],
+ 'live_status': 'was_live',
+ 'release_timestamp': 1671793345,
+ 'channel': 'さなちゃんねる',
+ 'description': 'md5:6aebf95cc4a1d731aebc01ad6cc9806d',
+ 'uploader': 'さなちゃんねる',
+ 'uploader_url': 'https://www.youtube.com/@sana_natori',
+ 'uploader_id': '@sana_natori',
+ 'channel_is_verified': True,
+ 'heatmap': 'count:100',
+ },
+ },
+ {
+            # Fallbacks for when both the webpage and the web client are unavailable
+ 'url': 'https://www.youtube.com/watch?v=wSSmNUl9Snw',
+ 'info_dict': {
+ 'id': 'wSSmNUl9Snw',
+ 'ext': 'mp4',
+ # 'categories': ['Science & Technology'],
+ 'view_count': int,
+ 'chapters': 'count:2',
+ 'channel': 'Scott Manley',
+ 'like_count': int,
+ 'age_limit': 0,
+ # 'availability': 'public',
+ 'channel_follower_count': int,
+ 'live_status': 'not_live',
+ 'upload_date': '20170831',
+ 'duration': 682,
+ 'tags': 'count:8',
+ 'uploader_url': 'https://www.youtube.com/@scottmanley',
+ 'description': 'md5:f4bed7b200404b72a394c2f97b782c02',
+ 'uploader': 'Scott Manley',
+ 'uploader_id': '@scottmanley',
+ 'title': 'The Computer Hack That Saved Apollo 14',
+ 'channel_id': 'UCxzC4EngIsMrPmbm6Nxvb-A',
+ 'thumbnail': r're:^https?://.*\.webp',
+ 'channel_url': 'https://www.youtube.com/channel/UCxzC4EngIsMrPmbm6Nxvb-A',
+ 'playable_in_embed': True,
+ 'comment_count': int,
+ 'channel_is_verified': True,
+ 'heatmap': 'count:100',
+ },
+ 'params': {
+ 'extractor_args': {'youtube': {'player_client': ['android'], 'player_skip': ['webpage']}},
+ },
+ },
]
_WEBPAGE_TESTS = [
@@ -2558,8 +2719,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'title': 'Feynman: Mirrors FUN TO IMAGINE 6',
'upload_date': '20080526',
'description': 'md5:873c81d308b979f0e23ee7e620b312a3',
- 'uploader': 'Christopher Sykes',
- 'uploader_id': 'ChristopherJSykes',
'age_limit': 0,
'tags': ['feynman', 'mirror', 'science', 'physics', 'imagination', 'fun', 'cool', 'puzzle'],
'channel_id': 'UCCeo--lls1vna5YJABWAcVA',
@@ -2575,7 +2734,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'view_count': int,
'categories': ['Science & Technology'],
'channel_follower_count': int,
- 'uploader_url': 'http://www.youtube.com/user/ChristopherJSykes',
+ 'uploader': 'Christopher Sykes',
+ 'uploader_url': 'https://www.youtube.com/@ChristopherSykesDocumentaries',
+ 'uploader_id': '@ChristopherSykesDocumentaries',
+ 'heatmap': 'count:100',
},
'params': {
'skip_download': True,
@@ -2608,11 +2770,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return
_, _, prs, player_url = self._download_player_responses(url, smuggled_data, video_id, webpage_url)
- video_details = traverse_obj(
- prs, (..., 'videoDetails'), expected_type=dict, default=[])
+ video_details = traverse_obj(prs, (..., 'videoDetails'), expected_type=dict)
microformats = traverse_obj(
prs, (..., 'microformat', 'playerMicroformatRenderer'),
- expected_type=dict, default=[])
+ expected_type=dict)
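+        # (Branched traverse_obj paths now fall back to [] on their own, so the
+        # explicit default=[] arguments are dropped here and throughout this
+        # patch; an inference from the pattern of the change, not a documented
+        # guarantee.)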
_, live_status, _, formats, _ = self._list_formats(video_id, microformats, video_details, prs, player_url)
is_live = live_status == 'is_live'
start_time = time.time()
@@ -2621,18 +2782,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
"""
@returns (manifest_url, manifest_stream_number, is_live) or None
"""
- with lock:
- refetch_manifest(format_id, delay)
-
- f = next((f for f in formats if f['format_id'] == format_id), None)
- if not f:
- if not is_live:
- self.to_screen(f'{video_id}: Video is no longer live')
- else:
- self.report_warning(
- f'Cannot find refreshed manifest for format {format_id}{bug_reports_message()}')
- return None
- return f['manifest_url'], f['manifest_stream_number'], is_live
+ for retry in self.RetryManager(fatal=False):
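+            # (Sketch of the RetryManager contract as used elsewhere in this
+            # codebase: assigning retry.error and continuing schedules another
+            # attempt, and with fatal=False exhausted retries emit a warning
+            # instead of raising.)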
+ with lock:
+ refetch_manifest(format_id, delay)
+
+ f = next((f for f in formats if f['format_id'] == format_id), None)
+ if not f:
+ if not is_live:
+ retry.error = f'{video_id}: Video is no longer live'
+ else:
+ retry.error = f'Cannot find refreshed manifest for format {format_id}{bug_reports_message()}'
+ continue
+ return f['manifest_url'], f['manifest_stream_number'], is_live
+ return None
for f in formats:
f['is_live'] = is_live
@@ -2668,7 +2830,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# Obtain from MPD's maximum seq value
old_mpd_url = mpd_url
last_error = ctx.pop('last_error', None)
- expire_fast = immediate or last_error and isinstance(last_error, urllib.error.HTTPError) and last_error.code == 403
+ expire_fast = immediate or last_error and isinstance(last_error, HTTPError) and last_error.status == 403
mpd_url, stream_number, is_live = (mpd_feed(format_id, 5 if expire_fast else 18000)
or (mpd_url, stream_number, False))
if not refresh_sequence:
@@ -2867,17 +3029,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
r'\bm=(?P<sig>[a-zA-Z0-9$]{2,})\(decodeURIComponent\(h\.s\)\)',
r'\bc&&\(c=(?P<sig>[a-zA-Z0-9$]{2,})\(decodeURIComponent\(c\)\)',
- r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\);[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\)',
- r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
+ r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)(?:;[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\))?',
r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
# Obsolete patterns
- r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
+ r'("|\')signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
- r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
- r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
jscode, 'Initial JS player signature function name', group='sig')
@@ -2951,7 +3110,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return funcname
return json.loads(js_to_json(self._search_regex(
- rf'var {re.escape(funcname)}\s*=\s*(\[.+?\]);', jscode,
+ rf'var {re.escape(funcname)}\s*=\s*(\[.+?\])\s*[,;]', jscode,
f'Initial JS player n function list ({funcname}.{idx})')))[int(idx)]
def _extract_n_function_code(self, video_id, player_url):
@@ -3100,11 +3259,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'decoratedPlayerBarRenderer', 'playerBar', 'chapteredPlayerBarRenderer', 'chapters'
), expected_type=list)
- return self._extract_chapters(
+ return self._extract_chapters_helper(
chapter_list,
- chapter_time=lambda chapter: float_or_none(
+ start_function=lambda chapter: float_or_none(
traverse_obj(chapter, ('chapterRenderer', 'timeRangeStartMillis')), scale=1000),
- chapter_title=lambda chapter: traverse_obj(
+ title_function=lambda chapter: traverse_obj(
chapter, ('chapterRenderer', 'title', 'simpleText'), expected_type=str),
duration=duration)
@@ -3112,83 +3271,74 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
content_list = traverse_obj(
data,
('engagementPanels', ..., 'engagementPanelSectionListRenderer', 'content', 'macroMarkersListRenderer', 'contents'),
- expected_type=list, default=[])
+ expected_type=list)
chapter_time = lambda chapter: parse_duration(self._get_text(chapter, 'timeDescription'))
chapter_title = lambda chapter: self._get_text(chapter, 'title')
return next(filter(None, (
- self._extract_chapters(traverse_obj(contents, (..., 'macroMarkersListItemRenderer')),
- chapter_time, chapter_title, duration)
+ self._extract_chapters_helper(traverse_obj(contents, (..., 'macroMarkersListItemRenderer')),
+ chapter_time, chapter_title, duration)
for contents in content_list)), [])
- def _extract_chapters_from_description(self, description, duration):
- duration_re = r'(?:\d+:)?\d{1,2}:\d{2}'
- sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'
- return self._extract_chapters(
- re.findall(sep_re % (duration_re, r'.+?'), description or ''),
- chapter_time=lambda x: parse_duration(x[0]), chapter_title=lambda x: x[1],
- duration=duration, strict=False) or self._extract_chapters(
- re.findall(sep_re % (r'.+?', duration_re), description or ''),
- chapter_time=lambda x: parse_duration(x[1]), chapter_title=lambda x: x[0],
- duration=duration, strict=False)
-
- def _extract_chapters(self, chapter_list, chapter_time, chapter_title, duration, strict=True):
- if not duration:
- return
- chapter_list = [{
- 'start_time': chapter_time(chapter),
- 'title': chapter_title(chapter),
- } for chapter in chapter_list or []]
- if not strict:
- chapter_list.sort(key=lambda c: c['start_time'] or 0)
-
- chapters = [{'start_time': 0}]
- for idx, chapter in enumerate(chapter_list):
- if chapter['start_time'] is None:
- self.report_warning(f'Incomplete chapter {idx}')
- elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
- chapters.append(chapter)
- elif chapter not in chapters:
- self.report_warning(
- f'Invalid start time ({chapter["start_time"]} < {chapters[-1]["start_time"]}) for chapter "{chapter["title"]}"')
- return chapters[1:]
+ def _extract_heatmap_from_player_overlay(self, data):
+ content_list = traverse_obj(data, (
+ 'playerOverlays', 'playerOverlayRenderer', 'decoratedPlayerBarRenderer', 'decoratedPlayerBarRenderer', 'playerBar',
+ 'multiMarkersPlayerBarRenderer', 'markersMap', ..., 'value', 'heatmap', 'heatmapRenderer', 'heatMarkers', {list}))
+ return next(filter(None, (
+ traverse_obj(contents, (..., 'heatMarkerRenderer', {
+ 'start_time': ('timeRangeStartMillis', {functools.partial(float_or_none, scale=1000)}),
+ 'end_time': {lambda x: (x['timeRangeStartMillis'] + x['markerDurationMillis']) / 1000},
+ 'value': ('heatMarkerIntensityScoreNormalized', {float_or_none}),
+ })) for contents in content_list)), None)
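+        # Sketch of the assumed result shape: a list of markers such as
+        #   {'start_time': 0.0, 'end_time': 2.5, 'value': 0.87}
+        # i.e. seconds, seconds, and a 0-1 normalized intensity score.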
def _extract_comment(self, comment_renderer, parent=None):
comment_id = comment_renderer.get('commentId')
if not comment_id:
return
- text = self._get_text(comment_renderer, 'contentText')
+ info = {
+ 'id': comment_id,
+ 'text': self._get_text(comment_renderer, 'contentText'),
+ 'like_count': self._get_count(comment_renderer, 'voteCount'),
+ 'author_id': traverse_obj(comment_renderer, ('authorEndpoint', 'browseEndpoint', 'browseId', {self.ucid_or_none})),
+ 'author': self._get_text(comment_renderer, 'authorText'),
+ 'author_thumbnail': traverse_obj(comment_renderer, ('authorThumbnail', 'thumbnails', -1, 'url', {url_or_none})),
+ 'parent': parent or 'root',
+ }
# Timestamp is an estimate calculated from the current time and time_text
time_text = self._get_text(comment_renderer, 'publishedTimeText') or ''
timestamp = self._parse_time_text(time_text)
- author = self._get_text(comment_renderer, 'authorText')
- author_id = try_get(comment_renderer,
- lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], str)
+ info.update({
+ # FIXME: non-standard, but we need a way of showing that it is an estimate.
+ '_time_text': time_text,
+ 'timestamp': timestamp,
+ })
- votes = parse_count(try_get(comment_renderer, (lambda x: x['voteCount']['simpleText'],
- lambda x: x['likeCount']), str)) or 0
- author_thumbnail = try_get(comment_renderer,
- lambda x: x['authorThumbnail']['thumbnails'][-1]['url'], str)
+ info['author_url'] = urljoin(
+ 'https://www.youtube.com', traverse_obj(comment_renderer, ('authorEndpoint', (
+ ('browseEndpoint', 'canonicalBaseUrl'), ('commandMetadata', 'webCommandMetadata', 'url'))),
+ expected_type=str, get_all=False))
- author_is_uploader = try_get(comment_renderer, lambda x: x['authorIsChannelOwner'], bool)
- is_favorited = 'creatorHeart' in (try_get(
- comment_renderer, lambda x: x['actionButtons']['commentActionButtonsRenderer'], dict) or {})
- return {
- 'id': comment_id,
- 'text': text,
- 'timestamp': timestamp,
- 'time_text': time_text,
- 'like_count': votes,
- 'is_favorited': is_favorited,
- 'author': author,
- 'author_id': author_id,
- 'author_thumbnail': author_thumbnail,
- 'author_is_uploader': author_is_uploader,
- 'parent': parent or 'root'
- }
+ author_is_uploader = traverse_obj(comment_renderer, 'authorIsChannelOwner')
+ if author_is_uploader is not None:
+ info['author_is_uploader'] = author_is_uploader
+
+ comment_abr = traverse_obj(
+ comment_renderer, ('actionButtons', 'commentActionButtonsRenderer'), expected_type=dict)
+ if comment_abr is not None:
+ info['is_favorited'] = 'creatorHeart' in comment_abr
+
+ badges = self._extract_badges([traverse_obj(comment_renderer, 'authorCommentBadge')])
+ if self._has_badge(badges, BadgeType.VERIFIED):
+ info['author_is_verified'] = True
+
+ is_pinned = traverse_obj(comment_renderer, 'pinnedCommentBadge')
+ if is_pinned:
+ info['is_pinned'] = True
+
+ return info
def _comment_entries(self, root_continuation_data, ytcfg, video_id, parent=None, tracker=None):
@@ -3201,7 +3351,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
expected_comment_count = self._get_count(
comments_header_renderer, 'countText', 'commentsCount')
- if expected_comment_count:
+ if expected_comment_count is not None:
tracker['est_total'] = expected_comment_count
self.to_screen(f'Downloading ~{expected_comment_count} comments')
comment_sort_index = int(get_single_config_arg('comment_sort') != 'top') # 1 = new, 0 = top
@@ -3236,6 +3386,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
comment = self._extract_comment(comment_renderer, parent)
if not comment:
continue
+ comment_id = comment['id']
+ if comment.get('is_pinned'):
+ tracker['pinned_comment_ids'].add(comment_id)
+ # Sometimes YouTube may break and give us infinite looping comments.
+ # See: https://github.com/hypervideo/hypervideo/issues/6290
+ if comment_id in tracker['seen_comment_ids']:
+ if comment_id in tracker['pinned_comment_ids'] and not comment.get('is_pinned'):
+ # Pinned comments may appear a second time in newest first sort
+ # See: https://github.com/hypervideo/hypervideo/issues/6712
+ continue
+ self.report_warning(
+                            'Detected YouTube comments looping. Stopping comment extraction'
+                            f'{" for this thread" if parent else ""} as we probably cannot get any more.')
+ yield
+ else:
+                        tracker['seen_comment_ids'].add(comment_id)
tracker['running_total'] += 1
tracker['total_reply_comments' if parent else 'total_parent_comments'] += 1
@@ -3257,10 +3423,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if not tracker:
tracker = dict(
running_total=0,
- est_total=0,
+ est_total=None,
current_page_thread=0,
total_parent_comments=0,
- total_reply_comments=0)
+ total_reply_comments=0,
+ seen_comment_ids=set(),
+ pinned_comment_ids=set()
+ )
# TODO: Deprecated
# YouTube comments have a max depth of 2
@@ -3287,11 +3456,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
continuation = self._build_api_continuation_query(self._generate_comment_continuation(video_id))
is_forced_continuation = True
+ continuation_items_path = (
+ 'onResponseReceivedEndpoints', ..., ('reloadContinuationItemsCommand', 'appendContinuationItemsAction'), 'continuationItems')
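+        # Unrolled, this path reads roughly as:
+        #   response['onResponseReceivedEndpoints'][n][
+        #       'reloadContinuationItemsCommand' or 'appendContinuationItemsAction'][
+        #       'continuationItems']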
for page_num in itertools.count(0):
if not continuation:
break
headers = self.generate_api_headers(ytcfg=ytcfg, visitor_data=self._extract_visitor_data(response))
- comment_prog_str = f"({tracker['running_total']}/{tracker['est_total']})"
+ comment_prog_str = f"({tracker['running_total']}/~{tracker['est_total']})"
if page_num == 0:
if is_first_continuation:
note_prefix = 'Downloading comment section API JSON'
@@ -3302,31 +3473,37 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
note_prefix = '%sDownloading comment%s API JSON page %d %s' % (
' ' if parent else '', ' replies' if parent else '',
page_num, comment_prog_str)
+
+ # Do a deep check for incomplete data as sometimes YouTube may return no comments for a continuation
+ # Ignore check if YouTube says the comment count is 0.
+ check_get_keys = None
+ if not is_forced_continuation and not (tracker['est_total'] == 0 and tracker['running_total'] == 0):
+ check_get_keys = [[*continuation_items_path, ..., (
+ 'commentsHeaderRenderer' if is_first_continuation else ('commentThreadRenderer', 'commentRenderer'))]]
try:
response = self._extract_response(
item_id=None, query=continuation,
ep='next', ytcfg=ytcfg, headers=headers, note=note_prefix,
- check_get_keys='onResponseReceivedEndpoints' if not is_forced_continuation else None)
+ check_get_keys=check_get_keys)
except ExtractorError as e:
# Ignore incomplete data error for replies if retries didn't work.
# This is to allow any other parent comments and comment threads to be downloaded.
# See: https://github.com/hypervideo/hypervideo/issues/4669
- if 'incomplete data' in str(e).lower() and parent and self.get_param('ignoreerrors') is True:
- self.report_warning(
- 'Received incomplete data for a comment reply thread and retrying did not help. '
- 'Ignoring to let other comments be downloaded.')
- else:
- raise
+ if 'incomplete data' in str(e).lower() and parent:
+ if self.get_param('ignoreerrors') in (True, 'only_download'):
+ self.report_warning(
+ 'Received incomplete data for a comment reply thread and retrying did not help. '
+ 'Ignoring to let other comments be downloaded. Pass --no-ignore-errors to not ignore.')
+ return
+ else:
+ raise ExtractorError(
+ 'Incomplete data received for comment reply thread. '
+ 'Pass --ignore-errors to ignore and allow rest of comments to download.',
+ expected=True)
+ raise
is_forced_continuation = False
- continuation_contents = traverse_obj(
- response, 'onResponseReceivedEndpoints', expected_type=list, default=[])
-
continuation = None
- for continuation_section in continuation_contents:
- continuation_items = traverse_obj(
- continuation_section,
- (('reloadContinuationItemsCommand', 'appendContinuationItemsAction'), 'continuationItems'),
- get_all=False, expected_type=list) or []
+ for continuation_items in traverse_obj(response, continuation_items_path, expected_type=list, default=[]):
if is_first_continuation:
continuation = extract_header(continuation_items)
is_first_continuation = False
@@ -3389,7 +3566,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if traverse_obj(player_response, ('playabilityStatus', 'desktopLegacyAgeGateReason')):
return True
- reasons = traverse_obj(player_response, ('playabilityStatus', ('status', 'reason')), default=[])
+ reasons = traverse_obj(player_response, ('playabilityStatus', ('status', 'reason')))
AGE_GATE_REASONS = (
'confirm your age', 'age-restricted', 'inappropriate', # reason
'age_verification_required', 'age_check_required', # status
@@ -3400,8 +3577,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def _is_unplayable(player_response):
return traverse_obj(player_response, ('playabilityStatus', 'status')) == 'UNPLAYABLE'
- _STORY_PLAYER_PARAMS = '8AEB'
-
def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, player_url, initial_pr, smuggled_data):
session_index = self._extract_session_index(player_ytcfg, master_ytcfg)
@@ -3413,8 +3588,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
yt_query = {
'videoId': video_id,
}
- if smuggled_data.get('is_story') or _split_innertube_client(client)[0] == 'android':
- yt_query['params'] = self._STORY_PLAYER_PARAMS
+ if _split_innertube_client(client)[0] == 'android':
+ yt_query['params'] = 'CgIQBg=='
+
+ pp_arg = self._configuration_arg('player_params', [None], casesense=True)[0]
+ if pp_arg:
+ yt_query['params'] = pp_arg
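+        # Hypothetical invocation showing how the new extractor argument is
+        # passed (8AEB being the player params the removed story code used to
+        # hardcode):
+        #   hypervideo --extractor-args "youtube:player_params=8AEB" URL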
yt_query.update(self._generate_player_context(sts))
return self._extract_response(
@@ -3426,7 +3605,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def _get_requested_clients(self, url, smuggled_data):
requested_clients = []
- default = ['android', 'web']
+ default = ['ios', 'android', 'web']
allowed_clients = sorted(
(client for client in INNERTUBE_CLIENTS.keys() if client[:1] != '_'),
key=lambda client: INNERTUBE_CLIENTS[client]['priority'], reverse=True)
@@ -3513,6 +3692,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
self.report_warning(
f'Skipping player response from {client} client (got player response for video "{pr_video_id}" instead of "{video_id}")' + bug_reports_message())
else:
+ # Save client name for introspection later
+ name = short_client_name(client)
+ sd = traverse_obj(pr, ('streamingData', {dict})) or {}
+ sd[STREAMING_DATA_CLIENT_NAME] = name
+ for f in traverse_obj(sd, (('formats', 'adaptiveFormats'), ..., {dict})):
+ f[STREAMING_DATA_CLIENT_NAME] = name
prs.append(pr)
# creator clients can bypass AGE_VERIFICATION_REQUIRED if logged in
@@ -3532,10 +3717,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def _needs_live_processing(self, live_status, duration):
if (live_status == 'is_live' and self.get_param('live_from_start')
- or live_status == 'post_live' and (duration or 0) > 4 * 3600):
+ or live_status == 'post_live' and (duration or 0) > 2 * 3600):
return live_status
def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, live_status, duration):
+ CHUNK_SIZE = 10 << 20
itags, stream_ids = collections.defaultdict(set), []
itag_qualities, res_qualities = {}, {0: None}
q = qualities([
@@ -3545,7 +3731,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'audio_quality_ultralow', 'audio_quality_low', 'audio_quality_medium', 'audio_quality_high', # Audio only formats
'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres'
])
- streaming_formats = traverse_obj(streaming_data, (..., ('formats', 'adaptiveFormats'), ...), default=[])
+ streaming_formats = traverse_obj(streaming_data, (..., ('formats', 'adaptiveFormats'), ...))
+ format_types = self._configuration_arg('formats')
+ all_formats = 'duplicate' in format_types
+ if self._configuration_arg('include_duplicate_formats'):
+ all_formats = True
+ self._downloader.deprecated_feature('[youtube] include_duplicate_formats extractor argument is deprecated. '
+ 'Use formats=duplicate extractor argument instead')
+
+ def build_fragments(f):
+ return LazyList({
+ 'url': update_url_query(f['url'], {
+ 'range': f'{range_start}-{min(range_start + CHUNK_SIZE - 1, f["filesize"])}'
+ })
+ } for range_start in range(0, f['filesize'], CHUNK_SIZE))
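+        # Worked example (sketch): with CHUNK_SIZE = 10 << 20 and a
+        # 26214400-byte (25 MiB) file, build_fragments emits 'range' values of
+        # 0-10485759, 10485760-20971519 and 20971520-26214400, keeping every
+        # request at or under the ~10M threshold that YouTube is known to
+        # throttle.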
for fmt in streaming_formats:
if fmt.get('targetDurationSec'):
@@ -3553,9 +3752,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
itag = str_or_none(fmt.get('itag'))
audio_track = fmt.get('audioTrack') or {}
- stream_id = '%s.%s' % (itag or '', audio_track.get('id', ''))
- if stream_id in stream_ids:
- continue
+ stream_id = (itag, audio_track.get('id'), fmt.get('isDrc'))
+ if not all_formats:
+ if stream_id in stream_ids:
+ continue
quality = fmt.get('quality')
height = int_or_none(fmt.get('height'))
@@ -3631,29 +3831,36 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if is_damaged:
self.report_warning(
f'{video_id}: Some formats are possibly damaged. They will be deprioritized', only_once=True)
+
+ client_name = fmt.get(STREAMING_DATA_CLIENT_NAME)
+ name = fmt.get('qualityLabel') or quality.replace('audio_quality_', '') or ''
+ fps = int_or_none(fmt.get('fps')) or 0
dct = {
'asr': int_or_none(fmt.get('audioSampleRate')),
'filesize': int_or_none(fmt.get('contentLength')),
- 'format_id': itag,
+ 'format_id': f'{itag}{"-drc" if fmt.get("isDrc") else ""}',
'format_note': join_nonempty(
- '%s%s' % (audio_track.get('displayName') or '',
- ' (default)' if language_preference > 0 else ''),
- fmt.get('qualityLabel') or quality.replace('audio_quality_', ''),
+ join_nonempty(audio_track.get('displayName'),
+ language_preference > 0 and ' (default)', delim=''),
+ name, fmt.get('isDrc') and 'DRC',
try_get(fmt, lambda x: x['projectionType'].replace('RECTANGULAR', '').lower()),
try_get(fmt, lambda x: x['spatialAudioType'].replace('SPATIAL_AUDIO_TYPE_', '').lower()),
- throttled and 'THROTTLED', is_damaged and 'DAMAGED', delim=', '),
+ throttled and 'THROTTLED', is_damaged and 'DAMAGED',
+ (self.get_param('verbose') or all_formats) and client_name,
+ delim=', '),
# Format 22 is likely to be damaged. See https://github.com/hypervideo/hypervideo/issues/3372
- 'source_preference': -10 if throttled else -5 if itag == '22' else -1,
- 'fps': int_or_none(fmt.get('fps')) or None,
+ 'source_preference': ((-10 if throttled else -5 if itag == '22' else -1)
+ + (100 if 'Premium' in name else 0)),
+ 'fps': fps if fps > 1 else None, # For some formats, fps is wrongly returned as 1
'audio_channels': fmt.get('audioChannels'),
'height': height,
- 'quality': q(quality),
+ 'quality': q(quality) - bool(fmt.get('isDrc')) / 2,
'has_drm': bool(fmt.get('drmFamilies')),
'tbr': tbr,
'url': fmt_url,
'width': int_or_none(fmt.get('width')),
'language': join_nonempty(audio_track.get('id', '').split('.')[0],
- 'desc' if language_preference < -1 else ''),
+ 'desc' if language_preference < -1 else '') or None,
'language_preference': language_preference,
# Strictly de-prioritize damaged and 3gp formats
'preference': -10 if is_damaged else -2 if itag == '17' else None,
@@ -3663,27 +3870,30 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if mime_mobj:
dct['ext'] = mimetype2ext(mime_mobj.group(1))
dct.update(parse_codecs(mime_mobj.group(2)))
- no_audio = dct.get('acodec') == 'none'
- no_video = dct.get('vcodec') == 'none'
- if no_audio:
- dct['vbr'] = tbr
- if no_video:
- dct['abr'] = tbr
- if no_audio or no_video:
- dct['downloader_options'] = {
- # Youtube throttles chunks >~10M
- 'http_chunk_size': 10485760,
- }
- if dct.get('ext'):
- dct['container'] = dct['ext'] + '_dash'
-
if itag:
itags[itag].add(('https', dct.get('language')))
stream_ids.append(stream_id)
- yield dct
+ single_stream = 'none' in (dct.get('acodec'), dct.get('vcodec'))
+ if single_stream and dct.get('ext'):
+ dct['container'] = dct['ext'] + '_dash'
+
+ if (all_formats or 'dashy' in format_types) and dct['filesize']:
+ yield {
+ **dct,
+ 'format_id': f'{dct["format_id"]}-dashy' if all_formats else dct['format_id'],
+ 'protocol': 'http_dash_segments',
+ 'fragments': build_fragments(dct),
+ }
+ if all_formats or 'dashy' not in format_types:
+ dct['downloader_options'] = {'http_chunk_size': CHUNK_SIZE}
+ yield dct
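+                # Net effect: the same progressive URL is exposed either as one
+                # HTTPS download chunked client-side via http_chunk_size, or,
+                # with formats=dashy/duplicate, as explicit http_dash_segments
+                # fragments built above.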
needs_live_processing = self._needs_live_processing(live_status, duration)
- skip_bad_formats = not self._configuration_arg('include_incomplete_formats')
+ skip_bad_formats = 'incomplete' not in format_types
+ if self._configuration_arg('include_incomplete_formats'):
+ skip_bad_formats = False
+ self._downloader.deprecated_feature('[youtube] include_incomplete_formats extractor argument is deprecated. '
+ 'Use formats=incomplete extractor argument instead')
skip_manifests = set(self._configuration_arg('skip'))
if (not self.get_param('youtube_include_hls_manifest', True)
@@ -3695,35 +3905,54 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
skip_manifests.add('dash')
if self._configuration_arg('include_live_dash'):
self._downloader.deprecated_feature('[youtube] include_live_dash extractor argument is deprecated. '
- 'Use include_incomplete_formats extractor argument instead')
+ 'Use formats=incomplete extractor argument instead')
elif skip_bad_formats and live_status == 'is_live' and needs_live_processing != 'is_live':
skip_manifests.add('dash')
- def process_manifest_format(f, proto, itag):
+ def process_manifest_format(f, proto, client_name, itag):
key = (proto, f.get('language'))
- if key in itags[itag]:
+ if not all_formats and key in itags[itag]:
return False
itags[itag].add(key)
- if any(p != proto for p, _ in itags[itag]):
+ if itag and all_formats:
+ f['format_id'] = f'{itag}-{proto}'
+ elif any(p != proto for p, _ in itags[itag]):
f['format_id'] = f'{itag}-{proto}'
elif itag:
f['format_id'] = itag
+ if f.get('source_preference') is None:
+ f['source_preference'] = -1
+
+ if itag in ('616', '235'):
+ f['format_note'] = join_nonempty(f.get('format_note'), 'Premium', delim=' ')
+ f['source_preference'] += 100
+
f['quality'] = q(itag_qualities.get(try_get(f, lambda f: f['format_id'].split('-')[0]), -1))
if f['quality'] == -1 and f.get('height'):
f['quality'] = q(res_qualities[min(res_qualities, key=lambda x: abs(x - f['height']))])
+ if self.get_param('verbose') or all_formats:
+ f['format_note'] = join_nonempty(f.get('format_note'), client_name, delim=', ')
+ if f.get('fps') and f['fps'] <= 1:
+ del f['fps']
+
+ if proto == 'hls' and f.get('has_drm'):
+ f['has_drm'] = 'maybe'
+ f['source_preference'] -= 5
return True
subtitles = {}
for sd in streaming_data:
+ client_name = sd.get(STREAMING_DATA_CLIENT_NAME)
+
hls_manifest_url = 'hls' not in skip_manifests and sd.get('hlsManifestUrl')
if hls_manifest_url:
fmts, subs = self._extract_m3u8_formats_and_subtitles(
hls_manifest_url, video_id, 'mp4', fatal=False, live=live_status == 'is_live')
subtitles = self._merge_subtitles(subs, subtitles)
for f in fmts:
- if process_manifest_format(f, 'hls', self._search_regex(
+ if process_manifest_format(f, 'hls', client_name, self._search_regex(
r'/itag/(\d+)', f['url'], 'itag', default=None)):
yield f
@@ -3732,7 +3961,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
formats, subs = self._extract_mpd_formats_and_subtitles(dash_manifest_url, video_id, fatal=False)
subtitles = self._merge_subtitles(subs, subtitles) # Prioritize HLS subs over DASH
for f in formats:
- if process_manifest_format(f, 'dash', f['format_id']):
+ if process_manifest_format(f, 'dash', client_name, f['format_id']):
f['filesize'] = int_or_none(self._search_regex(
r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None))
if needs_live_processing:
@@ -3783,8 +4012,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
webpage = None
if 'webpage' not in self._configuration_arg('player_skip'):
query = {'bpctr': '9999999999', 'has_verified': '1'}
- if smuggled_data.get('is_story'):
- query['pp'] = self._STORY_PLAYER_PARAMS
+ pp = self._configuration_arg('player_params', [None], casesense=True)[0]
+ if pp:
+ query['pp'] = pp
webpage = self._download_webpage(
webpage_url, video_id, fatal=False, query=query)
@@ -3810,8 +4040,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
else 'was_live' if live_content
else 'not_live' if False in (is_live, live_content)
else None)
- streaming_data = traverse_obj(player_responses, (..., 'streamingData'), default=[])
+ streaming_data = traverse_obj(player_responses, (..., 'streamingData'))
*formats, subtitles = self._extract_formats_and_subtitles(streaming_data, video_id, player_url, live_status, duration)
+ if all(f.get('has_drm') for f in formats):
+ # If there are no formats that definitely don't have DRM, all have DRM
+ for f in formats:
+ f['has_drm'] = True
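+            # i.e. the 'maybe' markers assigned to HLS formats in
+            # process_manifest_format are only promoted to a definite True when
+            # no format is known to be DRM-free.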
return live_broadcast_details, live_status, streaming_data, formats, subtitles
@@ -3825,7 +4059,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
webpage, master_ytcfg, player_responses, player_url = self._download_player_responses(url, smuggled_data, video_id, webpage_url)
playability_statuses = traverse_obj(
- player_responses, (..., 'playabilityStatus'), expected_type=dict, default=[])
+ player_responses, (..., 'playabilityStatus'), expected_type=dict)
trailer_video_id = get_first(
playability_statuses,
@@ -3838,11 +4072,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
search_meta = ((lambda x: self._html_search_meta(x, webpage, default=None))
if webpage else (lambda x: None))
- video_details = traverse_obj(
- player_responses, (..., 'videoDetails'), expected_type=dict, default=[])
+ video_details = traverse_obj(player_responses, (..., 'videoDetails'), expected_type=dict)
microformats = traverse_obj(
player_responses, (..., 'microformat', 'playerMicroformatRenderer'),
- expected_type=dict, default=[])
+ expected_type=dict)
translated_title = self._get_text(microformats, (..., 'title'))
video_title = (self._preferred_lang and translated_title
@@ -3975,10 +4208,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
self._downloader._sort_thumbnails(original_thumbnails)
category = get_first(microformats, 'category') or search_meta('genre')
- channel_id = str_or_none(
+ channel_id = self.ucid_or_none(str_or_none(
get_first(video_details, 'channelId')
or get_first(microformats, 'externalChannelId')
- or search_meta('channelId'))
+ or search_meta('channelId')))
owner_profile_url = get_first(microformats, 'ownerProfileUrl')
live_start_time = parse_iso8601(get_first(live_broadcast_details, 'startTimestamp'))
@@ -3997,7 +4230,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
for fmt in filter(is_bad_format, formats):
fmt['preference'] = (fmt.get('preference') or -1) - 10
- fmt['format_note'] = join_nonempty(fmt.get('format_note'), '(Last 4 hours)', delim=' ')
+ fmt['format_note'] = join_nonempty(fmt.get('format_note'), '(Last 2 hours)', delim=' ')
if needs_live_processing:
self._prepare_live_from_start_formats(
@@ -4005,6 +4238,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
formats.extend(self._extract_storyboard(player_responses, duration))
+ channel_handle = self.handle_from_url(owner_profile_url)
+
info = {
'id': video_id,
'title': video_title,
@@ -4014,11 +4249,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# URL checking if the user doesn't care about getting the best possible thumbnail
'thumbnail': traverse_obj(original_thumbnails, (-1, 'url')),
'description': video_description,
- 'uploader': get_first(video_details, 'author'),
- 'uploader_id': self._search_regex(r'/(?:channel|user)/([^/?&#]+)', owner_profile_url, 'uploader id') if owner_profile_url else None,
- 'uploader_url': owner_profile_url,
'channel_id': channel_id,
- 'channel_url': format_field(channel_id, None, 'https://www.youtube.com/channel/%s'),
+ 'channel_url': format_field(channel_id, None, 'https://www.youtube.com/channel/%s', default=None),
'duration': duration,
'view_count': int_or_none(
get_first((video_details, microformats), (..., 'viewCount'))
@@ -4048,10 +4280,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# Converted into dicts to remove duplicates
captions = {
get_lang_code(sub): sub
- for sub in traverse_obj(pctr, (..., 'captionTracks', ...), default=[])}
+ for sub in traverse_obj(pctr, (..., 'captionTracks', ...))}
translation_languages = {
lang.get('languageCode'): self._get_text(lang.get('languageName'), max_runs=1)
- for lang in traverse_obj(pctr, (..., 'translationLanguages', ...), default=[])}
+ for lang in traverse_obj(pctr, (..., 'translationLanguages', ...))}
def process_language(container, base_url, lang_code, sub_name, query):
lang_subs = container.setdefault(lang_code, [])
@@ -4090,9 +4322,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
continue
trans_code += f'-{lang_code}'
trans_name += format_field(lang_name, None, ' from %s')
- # Add an "-orig" label to the original language so that it can be distinguished.
- # The subs are returned without "-orig" as well for compatibility
if lang_code == f'a-{orig_trans_code}':
+ # Set audio language based on original subtitles
+ for f in formats:
+ if f.get('acodec') != 'none' and not f.get('language'):
+ f['language'] = orig_trans_code
+ # Add an "-orig" label to the original language so that it can be distinguished.
+ # The subs are returned without "-orig" as well for compatibility
process_language(
automatic_captions, base_url, f'{trans_code}-orig', f'{trans_name} (Original)', {})
# Setting tlang=lang returns damaged subtitles.
@@ -4112,15 +4348,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
info[d_k] = parse_duration(query[k][0])
# Youtube Music Auto-generated description
- if video_description:
+ if (video_description or '').strip().endswith('\nAuto-generated by YouTube.'):
+ # XXX: Causes catastrophic backtracking if description has "·"
+ # E.g. https://www.youtube.com/watch?v=DoPaAxMQoiI
+ # Simulating atomic groups: (?P<a>[^xy]+)x => (?=(?P<a>[^xy]+))(?P=a)x
+ # reduces it, but does not fully fix it. https://regex101.com/r/8Ssf2h/2
mobj = re.search(
r'''(?xs)
- (?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+
- (?P<album>[^\n]+)
+ (?=(?P<track>[^\n·]+))(?P=track)·
+ (?=(?P<artist>[^\n]+))(?P=artist)\n+
+ (?=(?P<album>[^\n]+))(?P=album)\n
(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?
(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?
- (.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?
- .+\nAuto-generated\ by\ YouTube\.\s*$
+ (.+?\nArtist\s*:\s*
+ (?=(?P<clean_artist>[^\n]+))(?P=clean_artist)\n
+ )?.+\nAuto-generated\ by\ YouTube\.\s*$
''', video_description)
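+            # A minimal illustration of the simulated atomic group (a sketch,
+            # not part of the extractor):
+            #   re.search(r'(?=(?P<t>[^x]+))(?P=t)x', 'a' * 50)
+            # fails quickly because the lookahead captures t exactly once and
+            # the backreference consumes it verbatim, whereas plain
+            # r'(?P<t>[^x]+)x' would retry every shorter split of t before
+            # giving up.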
if mobj:
release_year = mobj.group('release_year')
@@ -4140,22 +4382,25 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
initial_data = None
if webpage:
initial_data = self.extract_yt_initial_data(video_id, webpage, fatal=False)
+ if not traverse_obj(initial_data, 'contents'):
+ self.report_warning('Incomplete data received in embedded initial data; re-fetching using API.')
+ initial_data = None
if not initial_data:
query = {'videoId': video_id}
query.update(self._get_checkok_params())
initial_data = self._extract_response(
item_id=video_id, ep='next', fatal=False,
- ytcfg=master_ytcfg, query=query,
+ ytcfg=master_ytcfg, query=query, check_get_keys='contents',
headers=self.generate_api_headers(ytcfg=master_ytcfg),
note='Downloading initial data API JSON')
info['comment_count'] = traverse_obj(initial_data, (
'contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', ..., 'itemSectionRenderer',
- 'contents', ..., 'commentsEntryPointHeaderRenderer', 'commentCount', 'simpleText'
+ 'contents', ..., 'commentsEntryPointHeaderRenderer', 'commentCount'
), (
'engagementPanels', lambda _, v: v['engagementPanelSectionListRenderer']['panelIdentifier'] == 'comment-item-section',
- 'engagementPanelSectionListRenderer', 'header', 'engagementPanelTitleHeaderRenderer', 'contextualInfo', 'runs', ..., 'text'
- ), expected_type=int_or_none, get_all=False)
+ 'engagementPanelSectionListRenderer', 'header', 'engagementPanelTitleHeaderRenderer', 'contextualInfo'
+ ), expected_type=self._get_count, get_all=False)
try: # This will error if there is no livechat
initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
@@ -4178,6 +4423,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
or self._extract_chapters_from_description(video_description, duration)
or None)
+ info['heatmap'] = self._extract_heatmap_from_player_overlay(initial_data)
+
contents = traverse_obj(
initial_data, ('contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents'),
expected_type=list, default=[])
@@ -4205,9 +4452,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
list) or []):
tbrs = variadic(
traverse_obj(
- tlb, 'toggleButtonRenderer',
- ('segmentedLikeDislikeButtonRenderer', ..., 'toggleButtonRenderer'),
- default=[]))
+ tlb, ('toggleButtonRenderer', ...),
+ ('segmentedLikeDislikeButtonRenderer', ..., 'toggleButtonRenderer')))
for tbr in tbrs:
for getter, regex in [(
lambda x: x['defaultText']['accessibility']['accessibilityData'],
@@ -4245,6 +4491,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'channel': self._get_text(vor, 'title'),
'channel_follower_count': self._get_count(vor, 'subscriberCountText')})
+ if not channel_handle:
+ channel_handle = self.handle_from_url(
+ traverse_obj(vor, (
+ ('navigationEndpoint', ('title', 'runs', ..., 'navigationEndpoint')),
+ (('commandMetadata', 'webCommandMetadata', 'url'), ('browseEndpoint', 'canonicalBaseUrl')),
+ {str}), get_all=False))
+
rows = try_get(
vsir,
lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'],
@@ -4270,13 +4523,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
info['artist'] = mrr_contents_text
elif mrr_title == 'Song':
info['track'] = mrr_contents_text
+ owner_badges = self._extract_badges(traverse_obj(vsir, ('owner', 'videoOwnerRenderer', 'badges')))
+ if self._has_badge(owner_badges, BadgeType.VERIFIED):
+ info['channel_is_verified'] = True
- fallbacks = {
- 'channel': 'uploader',
- 'channel_id': 'uploader_id',
- 'channel_url': 'uploader_url',
- }
-
+ info.update({
+ 'uploader': info.get('channel'),
+ 'uploader_id': channel_handle,
+ 'uploader_url': format_field(channel_handle, None, 'https://www.youtube.com/%s', default=None),
+ })
# The upload date for scheduled, live and past live streams / premieres in microformats
# may be different from the stream date. Although not in UTC, we will prefer it in this case.
# See: https://github.com/hypervideo/hypervideo/pull/2223#issuecomment-1008485139
@@ -4288,19 +4543,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
and 'no-youtube-prefer-utc-upload-date' not in self.get_param('compat_opts', [])
):
upload_date = strftime_or_none(
- self._parse_time_text(self._get_text(vpir, 'dateText')), '%Y%m%d') or upload_date
+ self._parse_time_text(self._get_text(vpir, 'dateText'))) or upload_date
info['upload_date'] = upload_date
- for to, frm in fallbacks.items():
- if not info.get(to):
- info[to] = info.get(frm)
-
for s_k, d_k in [('artist', 'creator'), ('track', 'alt_title')]:
v = info.get(s_k)
if v:
info[d_k] = v
- badges = self._extract_badges(traverse_obj(contents, (..., 'videoPrimaryInfoRenderer'), get_all=False))
+ badges = self._extract_badges(traverse_obj(vpir, 'badges'))
is_private = (self._has_badge(badges, BadgeType.AVAILABILITY_PRIVATE)
or get_first(video_details, 'isPrivate', expected_type=bool))
@@ -4355,19 +4606,6 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
return info_dict
return wrapper
- def _extract_channel_id(self, webpage):
- channel_id = self._html_search_meta(
- 'channelId', webpage, 'channel id', default=None)
- if channel_id:
- return channel_id
- channel_url = self._html_search_meta(
- ('og:url', 'al:ios:url', 'al:android:url', 'al:web:url',
- 'twitter:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad',
- 'twitter:app:url:googleplay'), webpage, 'channel url')
- return self._search_regex(
- r'https?://(?:www\.)?youtube\.com/channel/([^/?#&])+',
- channel_url, 'channel id')
-
@staticmethod
def _extract_basic_item_renderer(item):
# Modified from _extract_grid_item_renderer
@@ -4382,6 +4620,44 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
elif key.startswith('grid') and key.endswith('Renderer'):
return renderer
+ def _extract_channel_renderer(self, renderer):
+ channel_id = self.ucid_or_none(renderer['channelId'])
+ title = self._get_text(renderer, 'title')
+ channel_url = format_field(channel_id, None, 'https://www.youtube.com/channel/%s', default=None)
+ channel_handle = self.handle_from_url(
+ traverse_obj(renderer, (
+ 'navigationEndpoint', (('commandMetadata', 'webCommandMetadata', 'url'),
+ ('browseEndpoint', 'canonicalBaseUrl')),
+ {str}), get_all=False))
+ if not channel_handle:
+ # As of 2023-06-01, YouTube sets subscriberCountText to the handle in search
+ channel_handle = self.handle_or_none(self._get_text(renderer, 'subscriberCountText'))
+ return {
+ '_type': 'url',
+ 'url': channel_url,
+ 'id': channel_id,
+ 'ie_key': YoutubeTabIE.ie_key(),
+ 'channel': title,
+ 'uploader': title,
+ 'channel_id': channel_id,
+ 'channel_url': channel_url,
+ 'title': title,
+ 'uploader_id': channel_handle,
+ 'uploader_url': format_field(channel_handle, None, 'https://www.youtube.com/%s', default=None),
+ # See above. YouTube sets videoCountText to the subscriber text in search channel renderers.
+ # However, in feed/channels this is set correctly to the subscriber count
+ 'channel_follower_count': traverse_obj(
+ renderer, 'subscriberCountText', 'videoCountText', expected_type=self._get_count),
+ 'thumbnails': self._extract_thumbnails(renderer, 'thumbnail'),
+ 'playlist_count': (
+ # videoCountText may be the subscriber count
+ self._get_count(renderer, 'videoCountText')
+ if self._get_count(renderer, 'subscriberCountText') is not None else None),
+ 'description': self._get_text(renderer, 'descriptionSnippet'),
+ 'channel_is_verified': True if self._has_badge(
+ self._extract_badges(traverse_obj(renderer, 'ownerBadges')), BadgeType.VERIFIED) else None,
+ }
+
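
A quick sketch of how the branching traverse_obj call above resolves a handle, on an assumed search channelRenderer shape (the sample data is illustrative, not taken from a real response):

    renderer = {'navigationEndpoint': {
        'commandMetadata': {'webCommandMetadata': {'url': '/@SomeHandle'}},
    }}
    traverse_obj(renderer, (
        'navigationEndpoint', (('commandMetadata', 'webCommandMetadata', 'url'),
                               ('browseEndpoint', 'canonicalBaseUrl')),
        {str}), get_all=False)
    # -> '/@SomeHandle': the inner tuple lists alternative paths, {str} keeps
    # only string hits, and get_all=False returns the first one;
    # handle_from_url() should then reduce it to the bare '@SomeHandle'
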
def _grid_entries(self, grid_renderer):
for item in grid_renderer['items']:
if not isinstance(item, dict):
@@ -4407,9 +4683,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
# channel
channel_id = renderer.get('channelId')
if channel_id:
- yield self.url_result(
- 'https://www.youtube.com/channel/%s' % channel_id,
- ie=YoutubeTabIE.ie_key(), video_title=title)
+ yield self._extract_channel_renderer(renderer)
continue
# generic endpoint URL support
ep_url = urljoin('https://www.youtube.com/', try_get(
@@ -4425,8 +4699,11 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
def _music_reponsive_list_entry(self, renderer):
video_id = traverse_obj(renderer, ('playlistItemData', 'videoId'))
if video_id:
+ title = traverse_obj(renderer, (
+ 'flexColumns', 0, 'musicResponsiveListItemFlexColumnRenderer',
+ 'text', 'runs', 0, 'text'))
return self.url_result(f'https://music.youtube.com/watch?v={video_id}',
- ie=YoutubeIE.ie_key(), video_id=video_id)
+ ie=YoutubeIE.ie_key(), video_id=video_id, title=title)
playlist_id = traverse_obj(renderer, ('navigationEndpoint', 'watchEndpoint', 'playlistId'))
if playlist_id:
video_id = traverse_obj(renderer, ('navigationEndpoint', 'watchEndpoint', 'videoId'))
@@ -4485,11 +4762,19 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
def _rich_entries(self, rich_grid_renderer):
renderer = traverse_obj(
- rich_grid_renderer, ('content', ('videoRenderer', 'reelItemRenderer')), get_all=False) or {}
+ rich_grid_renderer,
+ ('content', ('videoRenderer', 'reelItemRenderer', 'playlistRenderer')), get_all=False) or {}
video_id = renderer.get('videoId')
- if not video_id:
+ if video_id:
+ yield self._extract_video(renderer)
+ return
+ playlist_id = renderer.get('playlistId')
+ if playlist_id:
+ yield self.url_result(
+ f'https://www.youtube.com/playlist?list={playlist_id}',
+ ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
+ video_title=self._get_text(renderer, 'title'))
return
- yield self._extract_video(renderer)
def _video_entry(self, video_renderer):
video_id = video_renderer.get('videoId')
@@ -4605,7 +4890,8 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
'videoRenderer': lambda x: [self._video_entry(x)],
'playlistRenderer': lambda x: self._grid_entries({'items': [{'playlistRenderer': x}]}),
'channelRenderer': lambda x: self._grid_entries({'items': [{'channelRenderer': x}]}),
- 'hashtagTileRenderer': lambda x: [self._hashtag_tile_entry(x)]
+ 'hashtagTileRenderer': lambda x: [self._hashtag_tile_entry(x)],
+ 'richGridRenderer': lambda x: self._extract_entries(x, continuation_list),
}
for key, renderer in isr_content.items():
if key not in known_renderers:
@@ -4633,10 +4919,15 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
or try_get(tab_content, lambda x: x['richGridRenderer'], dict) or {})
yield from extract_entries(parent_renderer)
continuation = continuation_list[0]
-
+ seen_continuations = set()
for page_num in itertools.count(1):
if not continuation:
break
+ continuation_token = continuation.get('continuation')
+ if continuation_token is not None and continuation_token in seen_continuations:
+ self.write_debug('Detected YouTube feed looping - assuming end of feed.')
+ break
+ seen_continuations.add(continuation_token)
headers = self.generate_api_headers(
ytcfg=ytcfg, account_syncid=account_syncid, visitor_data=visitor_data)
response = self._extract_response(
@@ -4717,13 +5008,14 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
metadata_renderer = traverse_obj(data, ('metadata', 'channelMetadataRenderer'), expected_type=dict)
if metadata_renderer:
+ channel_id = traverse_obj(metadata_renderer, ('externalId', {self.ucid_or_none}),
+ ('channelUrl', {self.ucid_from_url}))
info.update({
- 'uploader': metadata_renderer.get('title'),
- 'uploader_id': metadata_renderer.get('externalId'),
- 'uploader_url': metadata_renderer.get('channelUrl'),
+ 'channel': metadata_renderer.get('title'),
+ 'channel_id': channel_id,
})
- if info['uploader_id']:
- info['id'] = info['uploader_id']
+ if info['channel_id']:
+ info['id'] = info['channel_id']
else:
metadata_renderer = traverse_obj(data, ('metadata', 'playlistMetadataRenderer'), expected_type=dict)
@@ -4776,6 +5068,19 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
'thumbnails': (primary_thumbnails or playlist_thumbnails) + avatar_thumbnails + channel_banners,
})
+ channel_handle = (
+ traverse_obj(metadata_renderer, (('vanityChannelUrl', ('ownerUrls', ...)), {self.handle_from_url}), get_all=False)
+ or traverse_obj(data, ('header', ..., 'channelHandleText', {self.handle_or_none}), get_all=False))
+
+ if channel_handle:
+ info.update({
+ 'uploader_id': channel_handle,
+ 'uploader_url': format_field(channel_handle, None, 'https://www.youtube.com/%s', default=None),
+ })
+
+ channel_badges = self._extract_badges(traverse_obj(data, ('header', ..., 'badges'), get_all=False))
+ if self._has_badge(channel_badges, BadgeType.VERIFIED):
+ info['channel_is_verified'] = True
# Playlist stats is a text runs array containing [video count, view count, last updated].
# last updated or (view count and last updated) may be missing.
playlist_stats = get_first(
@@ -4784,17 +5089,21 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
last_updated_unix = self._parse_time_text(
self._get_text(playlist_stats, 2) # deprecated, remove when old layout discontinued
or self._get_text(playlist_header_renderer, ('byline', 1, 'playlistBylineRenderer', 'text')))
- info['modified_date'] = strftime_or_none(last_updated_unix, '%Y%m%d')
+ info['modified_date'] = strftime_or_none(last_updated_unix)
info['view_count'] = self._get_count(playlist_stats, 1)
if info['view_count'] is None: # 0 is allowed
info['view_count'] = self._get_count(playlist_header_renderer, 'viewCountText')
+ if info['view_count'] is None:
+ info['view_count'] = self._get_count(data, (
+ 'contents', 'twoColumnBrowseResultsRenderer', 'tabs', ..., 'tabRenderer', 'content', 'sectionListRenderer',
+ 'contents', ..., 'itemSectionRenderer', 'contents', ..., 'channelAboutFullMetadataRenderer', 'viewCountText'))
info['playlist_count'] = self._get_count(playlist_stats, 0)
if info['playlist_count'] is None: # 0 is allowed
info['playlist_count'] = self._get_count(playlist_header_renderer, ('byline', 0, 'playlistBylineRenderer', 'text'))
- if not info.get('uploader_id'):
+ if not info.get('channel_id'):
owner = traverse_obj(playlist_header_renderer, 'ownerText')
if not owner: # Deprecated
owner = traverse_obj(
@@ -4803,16 +5112,17 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
owner_text = self._get_text(owner)
browse_ep = traverse_obj(owner, ('runs', 0, 'navigationEndpoint', 'browseEndpoint')) or {}
info.update({
- 'uploader': self._search_regex(r'^by (.+) and \d+ others?$', owner_text, 'uploader', default=owner_text),
- 'uploader_id': browse_ep.get('browseId'),
- 'uploader_url': urljoin('https://www.youtube.com', browse_ep.get('canonicalBaseUrl'))
+ 'channel': self._search_regex(r'^by (.+) and \d+ others?$', owner_text, 'uploader', default=owner_text),
+ 'channel_id': self.ucid_or_none(browse_ep.get('browseId')),
+ 'uploader_id': self.handle_from_url(urljoin('https://www.youtube.com', browse_ep.get('canonicalBaseUrl')))
})
info.update({
- 'channel': info['uploader'],
- 'channel_id': info['uploader_id'],
- 'channel_url': info['uploader_url']
+ 'uploader': info['channel'],
+ 'channel_url': format_field(info.get('channel_id'), None, 'https://www.youtube.com/channel/%s', default=None),
+ 'uploader_url': format_field(info.get('uploader_id'), None, 'https://www.youtube.com/%s', default=None),
})
+
return info
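
For reference, an assumed example of the stats runs array described in the comment above ([video count, view count, last updated]; real responses vary):

    playlist_stats = [
        {'runs': [{'text': '96'}, {'text': ' videos'}]},
        {'simpleText': '1,234,567 views'},
        {'runs': [{'text': 'Last updated on '}, {'text': 'Jun 5, 2015'}]},
    ]
    # _get_count(playlist_stats, 0) -> 96 (playlist_count)
    # _get_count(playlist_stats, 1) -> 1234567 (view_count)
    # _get_text(playlist_stats, 2) feeds _parse_time_text for modified_date
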
def _extract_inline_playlist(self, playlist, playlist_id, data, ytcfg):
@@ -4879,7 +5189,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
playlist_header_renderer = traverse_obj(data, ('header', 'playlistHeaderRenderer')) or {}
player_header_privacy = playlist_header_renderer.get('privacy')
- badges = self._extract_badges(sidebar_renderer)
+ badges = self._extract_badges(traverse_obj(sidebar_renderer, 'badges'))
# Personal playlists, when authenticated, have a dropdown visibility selector instead of a badge
privacy_setting_icon = get_first(
@@ -4951,7 +5261,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
data = self.extract_yt_initial_data(item_id, webpage or '', fatal=fatal) or {}
except ExtractorError as e:
if isinstance(e.cause, network_exceptions):
- if not isinstance(e.cause, urllib.error.HTTPError) or e.cause.code not in (403, 429):
+ if not isinstance(e.cause, HTTPError) or e.cause.status not in (403, 429):
retry.error = e
continue
self._error_or_warning(e, fatal=fatal)
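
This hunk is part of the commit-wide move from urllib.error.HTTPError/.code to the networking layer's HTTPError/.status (see also the zattoo and zee5 changes below). Restated:

    # retry any network error EXCEPT HTTP 403/429; those, and non-network
    # errors, are surfaced via _error_or_warning - presumably because a
    # 403/429 from YouTube means blocking/rate-limiting a retry won't fix
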
@@ -5060,7 +5370,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
IE_DESC = 'YouTube Tabs'
_VALID_URL = r'''(?x:
https?://
- (?:\w+\.)?
+ (?!consent\.)(?:\w+\.)?
(?:
youtube(?:kids)?\.com|
%(invidious)s
@@ -5089,12 +5399,12 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'title': 'Igor Kleiner - Playlists',
'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
'uploader': 'Igor Kleiner',
- 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg',
+ 'uploader_id': '@IgorDataScience',
+ 'uploader_url': 'https://www.youtube.com/@IgorDataScience',
'channel': 'Igor Kleiner',
'channel_id': 'UCqj7Cz7revf5maW9g5pgNcg',
'tags': ['"критическое', 'мышление"', '"наука', 'просто"', 'математика', '"анализ', 'данных"'],
'channel_url': 'https://www.youtube.com/channel/UCqj7Cz7revf5maW9g5pgNcg',
- 'uploader_url': 'https://www.youtube.com/channel/UCqj7Cz7revf5maW9g5pgNcg',
'channel_follower_count': int
},
}, {
@@ -5105,9 +5415,9 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'id': 'UCqj7Cz7revf5maW9g5pgNcg',
'title': 'Igor Kleiner - Playlists',
'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
- 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg',
'uploader': 'Igor Kleiner',
- 'uploader_url': 'https://www.youtube.com/channel/UCqj7Cz7revf5maW9g5pgNcg',
+ 'uploader_id': '@IgorDataScience',
+ 'uploader_url': 'https://www.youtube.com/@IgorDataScience',
'tags': ['"критическое', 'мышление"', '"наука', 'просто"', 'математика', '"анализ', 'данных"'],
'channel_id': 'UCqj7Cz7revf5maW9g5pgNcg',
'channel': 'Igor Kleiner',
@@ -5122,14 +5432,15 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'id': 'UCYO_jab_esuFRV4b17AJtAw',
'title': '3Blue1Brown - Playlists',
'description': 'md5:e1384e8a133307dd10edee76e875d62f',
- 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw',
- 'uploader': '3Blue1Brown',
'channel_url': 'https://www.youtube.com/channel/UCYO_jab_esuFRV4b17AJtAw',
- 'uploader_url': 'https://www.youtube.com/channel/UCYO_jab_esuFRV4b17AJtAw',
'channel': '3Blue1Brown',
'channel_id': 'UCYO_jab_esuFRV4b17AJtAw',
+ 'uploader_id': '@3blue1brown',
+ 'uploader_url': 'https://www.youtube.com/@3blue1brown',
+ 'uploader': '3Blue1Brown',
'tags': ['Mathematics'],
- 'channel_follower_count': int
+ 'channel_follower_count': int,
+ 'channel_is_verified': True,
},
}, {
'note': 'playlists, singlepage',
@@ -5140,10 +5451,10 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'title': 'ThirstForScience - Playlists',
'description': 'md5:609399d937ea957b0f53cbffb747a14c',
'uploader': 'ThirstForScience',
- 'uploader_id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
- 'uploader_url': 'https://www.youtube.com/channel/UCAEtajcuhQ6an9WEzY9LEMQ',
- 'channel_url': 'https://www.youtube.com/channel/UCAEtajcuhQ6an9WEzY9LEMQ',
+ 'uploader_url': 'https://www.youtube.com/@ThirstForScience',
+ 'uploader_id': '@ThirstForScience',
'channel_id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
+ 'channel_url': 'https://www.youtube.com/channel/UCAEtajcuhQ6an9WEzY9LEMQ',
'tags': 'count:13',
'channel': 'ThirstForScience',
'channel_follower_count': int
@@ -5155,8 +5466,6 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'note': 'basic, single video playlist',
'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
'info_dict': {
- 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
- 'uploader': 'Sergey M.',
'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
'title': 'youtube-dl public playlist',
'description': '',
@@ -5165,17 +5474,17 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'modified_date': '20201130',
'channel': 'Sergey M.',
'channel_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
- 'uploader_url': 'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
'channel_url': 'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
'availability': 'public',
+ 'uploader': 'Sergey M.',
+ 'uploader_url': 'https://www.youtube.com/@sergeym.6173',
+ 'uploader_id': '@sergeym.6173',
},
'playlist_count': 1,
}, {
'note': 'empty playlist',
'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
'info_dict': {
- 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
- 'uploader': 'Sergey M.',
'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
'title': 'youtube-dl empty playlist',
'tags': [],
@@ -5184,8 +5493,10 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'modified_date': '20160902',
'channel_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
'channel_url': 'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
- 'uploader_url': 'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
'availability': 'public',
+ 'uploader_url': 'https://www.youtube.com/@sergeym.6173',
+ 'uploader_id': '@sergeym.6173',
+ 'uploader': 'Sergey M.',
},
'playlist_count': 0,
}, {
@@ -5196,10 +5507,10 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'title': 'lex will - Home',
'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
'uploader': 'lex will',
- 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'uploader_id': '@lexwill718',
'channel': 'lex will',
'tags': ['bible', 'history', 'prophesy'],
- 'uploader_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'uploader_url': 'https://www.youtube.com/@lexwill718',
'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
'channel_follower_count': int
@@ -5213,11 +5524,11 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'title': 'lex will - Videos',
'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
'uploader': 'lex will',
- 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'uploader_id': '@lexwill718',
'tags': ['bible', 'history', 'prophesy'],
'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
- 'uploader_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'uploader_url': 'https://www.youtube.com/@lexwill718',
'channel': 'lex will',
'channel_follower_count': int
},
@@ -5230,9 +5541,9 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'title': 'lex will - Videos',
'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
'uploader': 'lex will',
- 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'uploader_id': '@lexwill718',
'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
- 'uploader_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'uploader_url': 'https://www.youtube.com/@lexwill718',
'channel': 'lex will',
'tags': ['bible', 'history', 'prophesy'],
'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
@@ -5247,8 +5558,8 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'title': 'lex will - Playlists',
'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
'uploader': 'lex will',
- 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
- 'uploader_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'uploader_id': '@lexwill718',
+ 'uploader_url': 'https://www.youtube.com/@lexwill718',
'channel': 'lex will',
'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
@@ -5263,14 +5574,14 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
'title': 'lex will - Community',
'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
- 'uploader': 'lex will',
- 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
- 'uploader_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
'channel': 'lex will',
'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
'tags': ['bible', 'history', 'prophesy'],
- 'channel_follower_count': int
+ 'channel_follower_count': int,
+ 'uploader_url': 'https://www.youtube.com/@lexwill718',
+ 'uploader_id': '@lexwill718',
+ 'uploader': 'lex will',
},
'playlist_mincount': 18,
}, {
@@ -5280,14 +5591,14 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
'title': 'lex will - Channels',
'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
- 'uploader': 'lex will',
- 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
- 'uploader_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
'channel': 'lex will',
'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
'tags': ['bible', 'history', 'prophesy'],
- 'channel_follower_count': int
+ 'channel_follower_count': int,
+ 'uploader_url': 'https://www.youtube.com/@lexwill718',
+ 'uploader_id': '@lexwill718',
+ 'uploader': 'lex will',
},
'playlist_mincount': 12,
}, {
@@ -5298,14 +5609,15 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'id': 'UCYO_jab_esuFRV4b17AJtAw',
'title': '3Blue1Brown - Search - linear algebra',
'description': 'md5:e1384e8a133307dd10edee76e875d62f',
- 'uploader': '3Blue1Brown',
- 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw',
'channel_url': 'https://www.youtube.com/channel/UCYO_jab_esuFRV4b17AJtAw',
- 'uploader_url': 'https://www.youtube.com/channel/UCYO_jab_esuFRV4b17AJtAw',
'tags': ['Mathematics'],
'channel': '3Blue1Brown',
'channel_id': 'UCYO_jab_esuFRV4b17AJtAw',
- 'channel_follower_count': int
+ 'channel_follower_count': int,
+ 'uploader_url': 'https://www.youtube.com/@3blue1brown',
+ 'uploader_id': '@3blue1brown',
+ 'uploader': '3Blue1Brown',
+ 'channel_is_verified': True,
},
}, {
'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA',
@@ -5322,17 +5634,17 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'info_dict': {
'title': '29C3: Not my department',
'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
- 'uploader': 'Christiaan008',
- 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg',
'description': 'md5:a14dc1a8ef8307a9807fe136a0660268',
'tags': [],
- 'uploader_url': 'https://www.youtube.com/c/ChRiStIaAn008',
'view_count': int,
'modified_date': '20150605',
'channel_id': 'UCEPzS1rYsrkqzSLNp76nrcg',
- 'channel_url': 'https://www.youtube.com/c/ChRiStIaAn008',
+ 'channel_url': 'https://www.youtube.com/channel/UCEPzS1rYsrkqzSLNp76nrcg',
'channel': 'Christiaan008',
'availability': 'public',
+ 'uploader_id': '@ChRiStIaAn008',
+ 'uploader': 'Christiaan008',
+ 'uploader_url': 'https://www.youtube.com/@ChRiStIaAn008',
},
'playlist_count': 96,
}, {
@@ -5341,17 +5653,17 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'info_dict': {
'title': 'Uploads from Cauchemar',
'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
- 'uploader': 'Cauchemar',
- 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q',
- 'channel_url': 'https://www.youtube.com/c/Cauchemar89',
+ 'channel_url': 'https://www.youtube.com/channel/UCBABnxM4Ar9ten8Mdjj1j0Q',
'tags': [],
'modified_date': r're:\d{8}',
'channel': 'Cauchemar',
- 'uploader_url': 'https://www.youtube.com/c/Cauchemar89',
'view_count': int,
'description': '',
'channel_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q',
'availability': 'public',
+ 'uploader_id': '@Cauchemar89',
+ 'uploader': 'Cauchemar',
+ 'uploader_url': 'https://www.youtube.com/@Cauchemar89',
},
'playlist_mincount': 1123,
'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'],
@@ -5365,17 +5677,17 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'info_dict': {
'title': 'Uploads from Interstellar Movie',
'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
- 'uploader': 'Interstellar Movie',
- 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA',
- 'uploader_url': 'https://www.youtube.com/c/InterstellarMovie',
'tags': [],
'view_count': int,
'channel_id': 'UCXw-G3eDE9trcvY2sBMM_aA',
- 'channel_url': 'https://www.youtube.com/c/InterstellarMovie',
+ 'channel_url': 'https://www.youtube.com/channel/UCXw-G3eDE9trcvY2sBMM_aA',
'channel': 'Interstellar Movie',
'description': '',
'modified_date': r're:\d{8}',
'availability': 'public',
+ 'uploader_id': '@InterstellarMovie',
+ 'uploader': 'Interstellar Movie',
+ 'uploader_url': 'https://www.youtube.com/@InterstellarMovie',
},
'playlist_mincount': 21,
}, {
@@ -5384,17 +5696,17 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'info_dict': {
'title': 'Uploads from Phim Siêu Nhân Nhật Bản',
'id': 'UUTYLiWFZy8xtPwxFwX9rV7Q',
- 'uploader': 'Phim Siêu Nhân Nhật Bản',
- 'uploader_id': 'UCTYLiWFZy8xtPwxFwX9rV7Q',
'view_count': int,
'channel': 'Phim Siêu Nhân Nhật Bản',
'tags': [],
- 'uploader_url': 'https://www.youtube.com/channel/UCTYLiWFZy8xtPwxFwX9rV7Q',
'description': '',
'channel_url': 'https://www.youtube.com/channel/UCTYLiWFZy8xtPwxFwX9rV7Q',
'channel_id': 'UCTYLiWFZy8xtPwxFwX9rV7Q',
'modified_date': r're:\d{8}',
'availability': 'public',
+ 'uploader_url': 'https://www.youtube.com/@phimsieunhannhatban',
+ 'uploader_id': '@phimsieunhannhatban',
+ 'uploader': 'Phim Siêu Nhân Nhật Bản',
},
'playlist_mincount': 200,
'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'],
@@ -5404,17 +5716,17 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'info_dict': {
'title': 'Uploads from BlankTV',
'id': 'UU8l9frL61Yl5KFOl87nIm2w',
- 'uploader': 'BlankTV',
- 'uploader_id': 'UC8l9frL61Yl5KFOl87nIm2w',
'channel': 'BlankTV',
- 'channel_url': 'https://www.youtube.com/c/blanktv',
+ 'channel_url': 'https://www.youtube.com/channel/UC8l9frL61Yl5KFOl87nIm2w',
'channel_id': 'UC8l9frL61Yl5KFOl87nIm2w',
'view_count': int,
'tags': [],
- 'uploader_url': 'https://www.youtube.com/c/blanktv',
'modified_date': r're:\d{8}',
'description': '',
'availability': 'public',
+ 'uploader_id': '@blanktv',
+ 'uploader': 'BlankTV',
+ 'uploader_url': 'https://www.youtube.com/@blanktv',
},
'playlist_mincount': 1000,
'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'],
@@ -5424,17 +5736,17 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'info_dict': {
'title': 'Data Analysis with Dr Mike Pound',
'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
- 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA',
- 'uploader': 'Computerphile',
'description': 'md5:7f567c574d13d3f8c0954d9ffee4e487',
- 'uploader_url': 'https://www.youtube.com/user/Computerphile',
'tags': [],
'view_count': int,
'channel_id': 'UC9-y-6csu5WGm29I7JiwpnA',
- 'channel_url': 'https://www.youtube.com/user/Computerphile',
+ 'channel_url': 'https://www.youtube.com/channel/UC9-y-6csu5WGm29I7JiwpnA',
'channel': 'Computerphile',
'availability': 'public',
'modified_date': '20190712',
+ 'uploader_id': '@Computerphile',
+ 'uploader': 'Computerphile',
+ 'uploader_url': 'https://www.youtube.com/@Computerphile',
},
'playlist_mincount': 11,
}, {
@@ -5447,9 +5759,6 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'id': 'FqZTN594JQw',
'ext': 'webm',
'title': "Smiley's People 01 detective, Adventure Series, Action",
- 'uploader': 'STREEM',
- 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
- 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
'upload_date': '20150526',
'license': 'Standard YouTube License',
'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
@@ -5472,12 +5781,9 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
}, {
'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live',
'info_dict': {
- 'id': 'Wq15eF5vCbI', # This will keep changing
+ 'id': 'hGkQjiJLjWQ', # This will keep changing
'ext': 'mp4',
'title': str,
- 'uploader': 'Sky News',
- 'uploader_id': 'skynews',
- 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews',
'upload_date': r're:\d{8}',
'description': str,
'categories': ['News & Politics'],
@@ -5496,6 +5802,10 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'channel_url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ',
'channel_follower_count': int,
'concurrent_view_count': int,
+ 'uploader_url': 'https://www.youtube.com/@SkyNews',
+ 'uploader_id': '@SkyNews',
+ 'uploader': 'Sky News',
+ 'channel_is_verified': True,
},
'params': {
'skip_download': True,
@@ -5507,9 +5817,6 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'id': 'a48o2S1cPoo',
'ext': 'mp4',
'title': 'The Young Turks - Live Main Show',
- 'uploader': 'The Young Turks',
- 'uploader_id': 'TheYoungTurks',
- 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
'upload_date': '20150715',
'license': 'Standard YouTube License',
'description': 'md5:438179573adcdff3c97ebb1ee632b891',
@@ -5590,41 +5897,40 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'url': 'https://music.youtube.com/browse/VLPLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq',
'info_dict': {
'id': 'PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq',
- 'uploader': 'NoCopyrightSounds',
'description': 'Providing you with copyright free / safe music for gaming, live streaming, studying and more!',
- 'uploader_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg',
'title': 'NCS : All Releases 💿',
- 'uploader_url': 'https://www.youtube.com/c/NoCopyrightSounds',
- 'channel_url': 'https://www.youtube.com/c/NoCopyrightSounds',
+ 'channel_url': 'https://www.youtube.com/channel/UC_aEa8K-EOJ3D6gOs7HcyNg',
'modified_date': r're:\d{8}',
'view_count': int,
'channel_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg',
'tags': [],
'channel': 'NoCopyrightSounds',
'availability': 'public',
+ 'uploader_url': 'https://www.youtube.com/@NoCopyrightSounds',
+ 'uploader': 'NoCopyrightSounds',
+ 'uploader_id': '@NoCopyrightSounds',
},
'playlist_mincount': 166,
- 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'],
+ 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden', 'YouTube Music is not directly supported'],
}, {
+ # TODO: fix 'unviewable' issue with this playlist when reloading with unavailable videos
'note': 'Topic, should redirect to playlist?list=UU...',
'url': 'https://music.youtube.com/browse/UC9ALqqC4aIeG5iDs7i90Bfw',
'info_dict': {
'id': 'UU9ALqqC4aIeG5iDs7i90Bfw',
- 'uploader_id': 'UC9ALqqC4aIeG5iDs7i90Bfw',
'title': 'Uploads from Royalty Free Music - Topic',
- 'uploader': 'Royalty Free Music - Topic',
'tags': [],
'channel_id': 'UC9ALqqC4aIeG5iDs7i90Bfw',
'channel': 'Royalty Free Music - Topic',
'view_count': int,
'channel_url': 'https://www.youtube.com/channel/UC9ALqqC4aIeG5iDs7i90Bfw',
- 'channel_url': 'https://www.youtube.com/channel/UC9ALqqC4aIeG5iDs7i90Bfw',
'modified_date': r're:\d{8}',
- 'uploader_url': 'https://www.youtube.com/channel/UC9ALqqC4aIeG5iDs7i90Bfw',
'description': '',
'availability': 'public',
+ 'uploader': 'Royalty Free Music - Topic',
},
'playlist_mincount': 101,
+ 'expected_warnings': ['YouTube Music is not directly supported', r'[Uu]navailable videos (are|will be) hidden'],
}, {
# Destination channel with only a hidden self tab (tab id is UCtFRv9O2AHqOZjjynzrv-xg)
# Treat as a general feed
@@ -5648,12 +5954,11 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'modified_date': r're:\d{8}',
},
'playlist_count': 50,
+ 'expected_warnings': ['YouTube Music is not directly supported'],
}, {
'note': 'unlisted single video playlist',
'url': 'https://www.youtube.com/playlist?list=PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf',
'info_dict': {
- 'uploader_id': 'UC9zHu_mHU96r19o-wV5Qs1Q',
- 'uploader': 'colethedj',
'id': 'PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf',
'title': 'hypervideo unlisted playlist test',
'availability': 'unlisted',
@@ -5662,11 +5967,31 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'channel': 'colethedj',
'view_count': int,
'description': '',
- 'uploader_url': 'https://www.youtube.com/channel/UC9zHu_mHU96r19o-wV5Qs1Q',
'channel_id': 'UC9zHu_mHU96r19o-wV5Qs1Q',
'channel_url': 'https://www.youtube.com/channel/UC9zHu_mHU96r19o-wV5Qs1Q',
+ 'uploader_url': 'https://www.youtube.com/@colethedj1894',
+ 'uploader_id': '@colethedj1894',
+ 'uploader': 'colethedj',
},
+ 'playlist': [{
+ 'info_dict': {
+ 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
+ 'id': 'BaW_jenozKc',
+ '_type': 'url',
+ 'ie_key': 'Youtube',
+ 'duration': 10,
+ 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
+ 'channel_url': 'https://www.youtube.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
+ 'view_count': int,
+ 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc',
+ 'channel': 'Philipp Hagemeister',
+ 'uploader_id': '@PhilippHagemeister',
+ 'uploader_url': 'https://www.youtube.com/@PhilippHagemeister',
+ 'uploader': 'Philipp Hagemeister',
+ }
+ }],
'playlist_count': 1,
+ 'params': {'extract_flat': True},
}, {
'note': 'API Fallback: Recommended - redirects to home page. Requires visitorData',
'url': 'https://www.youtube.com/feed/recommended',
@@ -5687,13 +6012,10 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'id': 'UCu6mSoMNzHQiBIOCkHUa2Aw',
'title': 'Cody\'sLab - Videos',
'description': 'md5:d083b7c2f0c67ee7a6c74c3e9b4243fa',
- 'uploader': 'Cody\'sLab',
- 'uploader_id': 'UCu6mSoMNzHQiBIOCkHUa2Aw',
'channel': 'Cody\'sLab',
'channel_id': 'UCu6mSoMNzHQiBIOCkHUa2Aw',
'tags': [],
'channel_url': 'https://www.youtube.com/channel/UCu6mSoMNzHQiBIOCkHUa2Aw',
- 'uploader_url': 'https://www.youtube.com/channel/UCu6mSoMNzHQiBIOCkHUa2Aw',
'channel_follower_count': int
},
'playlist_mincount': 650,
@@ -5707,9 +6029,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'url': 'https://music.youtube.com/browse/UC9ALqqC4aIeG5iDs7i90Bfw',
'info_dict': {
'id': 'UU9ALqqC4aIeG5iDs7i90Bfw',
- 'uploader_id': 'UC9ALqqC4aIeG5iDs7i90Bfw',
'title': 'Uploads from Royalty Free Music - Topic',
- 'uploader': 'Royalty Free Music - Topic',
'modified_date': r're:\d{8}',
'channel_id': 'UC9ALqqC4aIeG5iDs7i90Bfw',
'description': '',
@@ -5717,14 +6037,15 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'tags': [],
'channel': 'Royalty Free Music - Topic',
'view_count': int,
- 'uploader_url': 'https://www.youtube.com/channel/UC9ALqqC4aIeG5iDs7i90Bfw',
'availability': 'public',
+ 'uploader': 'Royalty Free Music - Topic',
},
'playlist_mincount': 101,
'params': {
'skip_download': True,
'extractor_args': {'youtubetab': {'skip': ['webpage']}}
},
+ 'expected_warnings': ['YouTube Music is not directly supported', r'[Uu]navailable videos (are|will be) hidden'],
}, {
'note': 'non-standard redirect to regional channel',
'url': 'https://www.youtube.com/channel/UCwVVpHQ2Cs9iGJfpdFngePQ',
@@ -5737,15 +6058,15 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'modified_date': '20220407',
'channel_url': 'https://www.youtube.com/channel/UCKcqXmCcyqnhgpA5P0oHH_Q',
'tags': [],
- 'uploader_id': 'UCKcqXmCcyqnhgpA5P0oHH_Q',
- 'uploader': 'pukkandan',
'availability': 'unlisted',
'channel_id': 'UCKcqXmCcyqnhgpA5P0oHH_Q',
'channel': 'pukkandan',
'description': 'Test for collaborative playlist',
'title': 'hypervideo test - collaborative playlist',
'view_count': int,
- 'uploader_url': 'https://www.youtube.com/channel/UCKcqXmCcyqnhgpA5P0oHH_Q',
+ 'uploader_url': 'https://www.youtube.com/@pukkandan',
+ 'uploader_id': '@pukkandan',
+ 'uploader': 'pukkandan',
},
'playlist_mincount': 2
}, {
@@ -5754,15 +6075,14 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'info_dict': {
'id': 'UCiu-3thuViMebBjw_5nWYrA',
'tags': [],
- 'uploader_id': 'UCiu-3thuViMebBjw_5nWYrA',
'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA',
'description': 'test description',
'title': 'cole-dlp-test-acc - 再生リスト',
- 'uploader_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA',
- 'uploader': 'cole-dlp-test-acc',
'channel_id': 'UCiu-3thuViMebBjw_5nWYrA',
'channel': 'cole-dlp-test-acc',
- 'channel_follower_count': int,
+ 'uploader_url': 'https://www.youtube.com/@coletdjnz',
+ 'uploader_id': '@coletdjnz',
+ 'uploader': 'cole-dlp-test-acc',
},
'playlist_mincount': 1,
'params': {'extractor_args': {'youtube': {'lang': ['ja']}}},
@@ -5776,14 +6096,14 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'tags': [],
'view_count': int,
'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA',
- 'uploader': 'cole-dlp-test-acc',
- 'uploader_id': 'UCiu-3thuViMebBjw_5nWYrA',
'channel': 'cole-dlp-test-acc',
'channel_id': 'UCiu-3thuViMebBjw_5nWYrA',
'description': 'test',
- 'uploader_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA',
'title': 'dlp test playlist',
'availability': 'public',
+ 'uploader_url': 'https://www.youtube.com/@coletdjnz',
+ 'uploader_id': '@coletdjnz',
+ 'uploader': 'cole-dlp-test-acc',
},
'playlist_mincount': 1,
'params': {'extractor_args': {'youtube': {'lang': ['ja']}}},
@@ -5835,29 +6155,30 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'channel_follower_count': int,
'channel_id': 'UCK9V2B22uJYu3N7eR_BT9QA',
'channel_url': 'https://www.youtube.com/channel/UCK9V2B22uJYu3N7eR_BT9QA',
- 'uploader': 'Polka Ch. 尾丸ポルカ',
- 'description': 'md5:3b8df1ac5af337aa206e37ee3d181ec9',
+ 'description': 'md5:e56b74b5bb7e9c701522162e9abfb822',
'channel': 'Polka Ch. 尾丸ポルカ',
'tags': 'count:35',
- 'uploader_url': 'https://www.youtube.com/channel/UCK9V2B22uJYu3N7eR_BT9QA',
- 'uploader_id': 'UCK9V2B22uJYu3N7eR_BT9QA',
+ 'uploader_url': 'https://www.youtube.com/@OmaruPolka',
+ 'uploader': 'Polka Ch. 尾丸ポルカ',
+ 'uploader_id': '@OmaruPolka',
},
'playlist_count': 3,
}, {
# Shorts tab with channel with handle
+ # TODO: fix channel description
'url': 'https://www.youtube.com/@NotJustBikes/shorts',
'info_dict': {
'id': 'UC0intLFzLaudFG-xAvUEO-A',
'title': 'Not Just Bikes - Shorts',
'tags': 'count:12',
- 'uploader': 'Not Just Bikes',
'channel_url': 'https://www.youtube.com/channel/UC0intLFzLaudFG-xAvUEO-A',
- 'description': 'md5:7513148b1f02b924783157d84c4ea555',
+ 'description': 'md5:26bc55af26855a608a5cf89dfa595c8d',
'channel_follower_count': int,
- 'uploader_id': 'UC0intLFzLaudFG-xAvUEO-A',
'channel_id': 'UC0intLFzLaudFG-xAvUEO-A',
- 'uploader_url': 'https://www.youtube.com/channel/UC0intLFzLaudFG-xAvUEO-A',
'channel': 'Not Just Bikes',
+ 'uploader_url': 'https://www.youtube.com/@NotJustBikes',
+ 'uploader': 'Not Just Bikes',
+ 'uploader_id': '@NotJustBikes',
},
'playlist_mincount': 10,
}, {
@@ -5869,12 +6190,12 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'tags': 'count:7',
'channel_id': 'UC3eYAvjCVwNHgkaGbXX3sig',
'channel_url': 'https://www.youtube.com/channel/UC3eYAvjCVwNHgkaGbXX3sig',
- 'uploader_id': 'UC3eYAvjCVwNHgkaGbXX3sig',
'channel': '中村悠一',
- 'uploader_url': 'https://www.youtube.com/channel/UC3eYAvjCVwNHgkaGbXX3sig',
'channel_follower_count': int,
- 'uploader': '中村悠一',
'description': 'md5:e744f6c93dafa7a03c0c6deecb157300',
+ 'uploader_url': 'https://www.youtube.com/@Yuichi-Nakamura',
+ 'uploader_id': '@Yuichi-Nakamura',
+ 'uploader': '中村悠一',
},
'playlist_mincount': 60,
}, {
@@ -5893,15 +6214,15 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'info_dict': {
'id': 'UCgJ5_1F6yJhYLnyMszUdmUg',
'title': 'Shorts Break - Shorts',
- 'tags': 'count:32',
+ 'tags': 'count:48',
'channel_id': 'UCgJ5_1F6yJhYLnyMszUdmUg',
'channel': 'Shorts Break',
- 'description': 'md5:a6c234cf3d50d878ef8721e34457cd11',
- 'uploader': 'Shorts Break',
+ 'description': 'md5:6de33c5e7ba686e5f3efd4e19c7ef499',
'channel_follower_count': int,
- 'uploader_id': 'UCgJ5_1F6yJhYLnyMszUdmUg',
- 'uploader_url': 'https://www.youtube.com/channel/UCgJ5_1F6yJhYLnyMszUdmUg',
'channel_url': 'https://www.youtube.com/channel/UCgJ5_1F6yJhYLnyMszUdmUg',
+ 'uploader_url': 'https://www.youtube.com/@ShortsBreak_Official',
+ 'uploader': 'Shorts Break',
+ 'uploader_id': '@ShortsBreak_Official',
},
'playlist_mincount': 30,
}, {
@@ -5924,31 +6245,28 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'playlist_mincount': 30,
}, {
# Shorts url result in shorts tab
+ # TODO: Fix channel id extraction
'url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA/shorts',
'info_dict': {
'id': 'UCiu-3thuViMebBjw_5nWYrA',
'title': 'cole-dlp-test-acc - Shorts',
- 'uploader_id': 'UCiu-3thuViMebBjw_5nWYrA',
'channel': 'cole-dlp-test-acc',
- 'channel_follower_count': int,
'description': 'test description',
'channel_id': 'UCiu-3thuViMebBjw_5nWYrA',
'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA',
'tags': [],
+ 'uploader_url': 'https://www.youtube.com/@coletdjnz',
+ 'uploader_id': '@coletdjnz',
'uploader': 'cole-dlp-test-acc',
- 'uploader_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA',
-
},
'playlist': [{
'info_dict': {
+ # Channel data is not currently available for short renderers (as of 2023-03-01)
'_type': 'url',
'ie_key': 'Youtube',
'url': 'https://www.youtube.com/shorts/sSM9J5YH_60',
'id': 'sSM9J5YH_60',
- 'channel_id': 'UCiu-3thuViMebBjw_5nWYrA',
'title': 'SHORT short',
- 'channel': 'cole-dlp-test-acc',
- 'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA',
'view_count': int,
'thumbnails': list,
}
@@ -5974,10 +6292,124 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'channel_url': str,
'concurrent_view_count': int,
'channel': str,
+ 'uploader': str,
+ 'uploader_url': str,
+ 'uploader_id': str,
+ 'channel_is_verified': bool, # this will keep changing
}
}],
- 'params': {'extract_flat': True},
+ 'params': {'extract_flat': True, 'playlist_items': '1'},
'playlist_mincount': 1
+ }, {
+ # Channel renderer metadata. Contains number of videos on the channel
+ 'url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA/channels',
+ 'info_dict': {
+ 'id': 'UCiu-3thuViMebBjw_5nWYrA',
+ 'title': 'cole-dlp-test-acc - Channels',
+ 'channel': 'cole-dlp-test-acc',
+ 'description': 'test description',
+ 'channel_id': 'UCiu-3thuViMebBjw_5nWYrA',
+ 'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA',
+ 'tags': [],
+ 'uploader_url': 'https://www.youtube.com/@coletdjnz',
+ 'uploader_id': '@coletdjnz',
+ 'uploader': 'cole-dlp-test-acc',
+ },
+ 'playlist': [{
+ 'info_dict': {
+ '_type': 'url',
+ 'ie_key': 'YoutubeTab',
+ 'url': 'https://www.youtube.com/channel/UC-lHJZR3Gqxm24_Vd_AJ5Yw',
+ 'id': 'UC-lHJZR3Gqxm24_Vd_AJ5Yw',
+ 'channel_id': 'UC-lHJZR3Gqxm24_Vd_AJ5Yw',
+ 'title': 'PewDiePie',
+ 'channel': 'PewDiePie',
+ 'channel_url': 'https://www.youtube.com/channel/UC-lHJZR3Gqxm24_Vd_AJ5Yw',
+ 'thumbnails': list,
+ 'channel_follower_count': int,
+ 'playlist_count': int,
+ 'uploader': 'PewDiePie',
+ 'uploader_url': 'https://www.youtube.com/@PewDiePie',
+ 'uploader_id': '@PewDiePie',
+ 'channel_is_verified': True,
+ }
+ }],
+ 'params': {'extract_flat': True},
+ }, {
+ 'url': 'https://www.youtube.com/@3blue1brown/about',
+ 'info_dict': {
+ 'id': 'UCYO_jab_esuFRV4b17AJtAw',
+ 'tags': ['Mathematics'],
+ 'title': '3Blue1Brown - About',
+ 'channel_follower_count': int,
+ 'channel_id': 'UCYO_jab_esuFRV4b17AJtAw',
+ 'channel': '3Blue1Brown',
+ 'view_count': int,
+ 'channel_url': 'https://www.youtube.com/channel/UCYO_jab_esuFRV4b17AJtAw',
+ 'description': 'md5:e1384e8a133307dd10edee76e875d62f',
+ 'uploader_url': 'https://www.youtube.com/@3blue1brown',
+ 'uploader_id': '@3blue1brown',
+ 'uploader': '3Blue1Brown',
+ 'channel_is_verified': True,
+ },
+ 'playlist_count': 0,
+ }, {
+ # Podcasts tab, with rich entry playlistRenderers
+ 'url': 'https://www.youtube.com/@99percentinvisiblepodcast/podcasts',
+ 'info_dict': {
+ 'id': 'UCVMF2HD4ZgC0QHpU9Yq5Xrw',
+ 'channel_id': 'UCVMF2HD4ZgC0QHpU9Yq5Xrw',
+ 'uploader_url': 'https://www.youtube.com/@99percentinvisiblepodcast',
+ 'description': 'md5:3a0ed38f1ad42a68ef0428c04a15695c',
+ 'title': '99 Percent Invisible - Podcasts',
+ 'uploader': '99 Percent Invisible',
+ 'channel_follower_count': int,
+ 'channel_url': 'https://www.youtube.com/channel/UCVMF2HD4ZgC0QHpU9Yq5Xrw',
+ 'tags': [],
+ 'channel': '99 Percent Invisible',
+ 'uploader_id': '@99percentinvisiblepodcast',
+ },
+ 'playlist_count': 1,
+ }, {
+ # Releases tab, with rich entry playlistRenderers (same as Podcasts tab)
+ 'url': 'https://www.youtube.com/@AHimitsu/releases',
+ 'info_dict': {
+ 'id': 'UCgFwu-j5-xNJml2FtTrrB3A',
+ 'channel': 'A Himitsu',
+ 'uploader_url': 'https://www.youtube.com/@AHimitsu',
+ 'title': 'A Himitsu - Releases',
+ 'uploader_id': '@AHimitsu',
+ 'uploader': 'A Himitsu',
+ 'channel_id': 'UCgFwu-j5-xNJml2FtTrrB3A',
+ 'tags': 'count:16',
+ 'description': 'I make music',
+ 'channel_url': 'https://www.youtube.com/channel/UCgFwu-j5-xNJml2FtTrrB3A',
+ 'channel_follower_count': int,
+ 'channel_is_verified': True,
+ },
+ 'playlist_mincount': 10,
+ }, {
+ # Playlist with only shorts, shown as reel renderers
+    # FIXME: YouTube currently doesn't give a continuation for this,
+    # but may do so in the future.
+ 'url': 'https://www.youtube.com/playlist?list=UUxqPAgubo4coVn9Lx1FuKcg',
+ 'info_dict': {
+ 'id': 'UUxqPAgubo4coVn9Lx1FuKcg',
+ 'channel_url': 'https://www.youtube.com/channel/UCxqPAgubo4coVn9Lx1FuKcg',
+ 'view_count': int,
+ 'uploader_id': '@BangyShorts',
+ 'description': '',
+ 'uploader_url': 'https://www.youtube.com/@BangyShorts',
+ 'channel_id': 'UCxqPAgubo4coVn9Lx1FuKcg',
+ 'channel': 'Bangy Shorts',
+ 'uploader': 'Bangy Shorts',
+ 'tags': [],
+ 'availability': 'public',
+ 'modified_date': '20230626',
+ 'title': 'Uploads from Bangy Shorts',
+ },
+ 'playlist_mincount': 100,
+ 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'],
}]
@classmethod
@@ -6044,6 +6476,8 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
original_tab_id, display_id = tab[1:], f'{item_id}{tab}'
if is_channel and not tab and 'no-youtube-channel-redirect' not in compat_opts:
url = f'{pre}/videos{post}'
+ if smuggled_data.get('is_music_url'):
+ self.report_warning(f'YouTube Music is not directly supported. Redirecting to {url}')
# Handle both video/playlist URLs
qs = parse_qs(url)
@@ -6192,15 +6626,15 @@ class YoutubePlaylistIE(InfoExtractor):
'title': '[OLD]Team Fortress 2 (Class-based LP)',
'id': 'PLBB231211A4F62143',
'uploader': 'Wickman',
- 'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q',
+ 'uploader_id': '@WickmanVT',
'description': 'md5:8fa6f52abb47a9552002fa3ddfc57fc2',
'view_count': int,
- 'uploader_url': 'https://www.youtube.com/c/WickmanVT',
+ 'uploader_url': 'https://www.youtube.com/@WickmanVT',
'modified_date': r're:\d{8}',
'channel_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q',
'channel': 'Wickman',
'tags': [],
- 'channel_url': 'https://www.youtube.com/c/WickmanVT',
+ 'channel_url': 'https://www.youtube.com/channel/UCKSpbfbl5kRQpTdL7kMc-1Q',
'availability': 'public',
},
'playlist_mincount': 29,
@@ -6220,7 +6654,7 @@ class YoutubePlaylistIE(InfoExtractor):
'title': 'JODA15',
'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
'uploader': 'milan',
- 'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
+ 'uploader_id': '@milan5503',
'description': '',
'channel_url': 'https://www.youtube.com/channel/UCEI1-PVPcYXjB73Hfelbmaw',
'tags': [],
@@ -6228,7 +6662,7 @@ class YoutubePlaylistIE(InfoExtractor):
'view_count': int,
'channel': 'milan',
'channel_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
- 'uploader_url': 'https://www.youtube.com/channel/UCEI1-PVPcYXjB73Hfelbmaw',
+ 'uploader_url': 'https://www.youtube.com/@milan5503',
'availability': 'public',
},
'expected_warnings': [r'[Uu]navailable videos? (is|are|will be) hidden'],
@@ -6239,13 +6673,13 @@ class YoutubePlaylistIE(InfoExtractor):
'title': '2018 Chinese New Singles (11/6 updated)',
'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
'uploader': 'LBK',
- 'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA',
+ 'uploader_id': '@music_king',
'description': 'md5:da521864744d60a198e3a88af4db0d9d',
'channel': 'LBK',
'view_count': int,
- 'channel_url': 'https://www.youtube.com/c/愛低音的國王',
+ 'channel_url': 'https://www.youtube.com/channel/UC21nz3_MesPLqtDqwdvnoxA',
'tags': [],
- 'uploader_url': 'https://www.youtube.com/c/愛低音的國王',
+ 'uploader_url': 'https://www.youtube.com/@music_king',
'channel_id': 'UC21nz3_MesPLqtDqwdvnoxA',
'modified_date': r're:\d{8}',
'availability': 'public',
@@ -6291,8 +6725,8 @@ class YoutubeYtBeIE(InfoExtractor):
'ext': 'mp4',
'title': 'Small Scale Baler and Braiding Rugs',
'uploader': 'Backus-Page House Museum',
- 'uploader_id': 'backuspagemuseum',
- 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
+ 'uploader_id': '@backuspagemuseum',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/@backuspagemuseum',
'upload_date': '20161008',
'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
'categories': ['Nonprofits & Activism'],
@@ -6300,7 +6734,7 @@ class YoutubeYtBeIE(InfoExtractor):
'like_count': int,
'age_limit': 0,
'playable_in_embed': True,
- 'thumbnail': 'https://i.ytimg.com/vi_webp/yeWKywCrFtk/maxresdefault.webp',
+ 'thumbnail': r're:^https?://.*\.webp',
'channel': 'Backus-Page House Museum',
'channel_id': 'UCEfMCQ9bs3tjvjy1s451zaw',
'live_status': 'not_live',
@@ -6416,7 +6850,7 @@ class YoutubeNotificationsIE(YoutubeTabBaseInfoExtractor):
if not video_id:
browse_ep = traverse_obj(
notification, ('navigationEndpoint', 'browseEndpoint'), expected_type=dict)
- channel_id = traverse_obj(browse_ep, 'browseId', expected_type=str)
+ channel_id = self.ucid_or_none(traverse_obj(browse_ep, 'browseId', expected_type=str))
post_id = self._search_regex(
r'/post/(.+)', traverse_obj(browse_ep, 'canonicalBaseUrl', expected_type=str),
'post id', default=None)
@@ -6446,6 +6880,7 @@ class YoutubeNotificationsIE(YoutubeTabBaseInfoExtractor):
'title': title,
'channel_id': channel_id,
'channel': channel,
+ 'uploader': channel,
'thumbnails': self._extract_thumbnails(notification, 'videoThumbnail'),
'timestamp': timestamp,
}
@@ -6532,6 +6967,36 @@ class YoutubeSearchURLIE(YoutubeTabBaseInfoExtractor):
# }],
},
}, {
+ # Channel results
+ 'url': 'https://www.youtube.com/results?search_query=kurzgesagt&sp=EgIQAg%253D%253D',
+ 'info_dict': {
+ 'id': 'kurzgesagt',
+ 'title': 'kurzgesagt',
+ },
+ 'playlist': [{
+ 'info_dict': {
+ '_type': 'url',
+ 'id': 'UCsXVk37bltHxD1rDPwtNM8Q',
+ 'url': 'https://www.youtube.com/channel/UCsXVk37bltHxD1rDPwtNM8Q',
+ 'ie_key': 'YoutubeTab',
+ 'channel': 'Kurzgesagt – In a Nutshell',
+ 'description': 'md5:4ae48dfa9505ffc307dad26342d06bfc',
+ 'title': 'Kurzgesagt – In a Nutshell',
+ 'channel_id': 'UCsXVk37bltHxD1rDPwtNM8Q',
+ # No longer available for search as it is set to the handle.
+ # 'playlist_count': int,
+ 'channel_url': 'https://www.youtube.com/channel/UCsXVk37bltHxD1rDPwtNM8Q',
+ 'thumbnails': list,
+ 'uploader_id': '@kurzgesagt',
+ 'uploader_url': 'https://www.youtube.com/@kurzgesagt',
+ 'uploader': 'Kurzgesagt – In a Nutshell',
+ 'channel_is_verified': True,
+ 'channel_follower_count': int,
+ }
+ }],
+ 'params': {'extract_flat': True, 'playlist_items': '1'},
+ 'playlist_mincount': 1,
+ }, {
'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
'only_matching': True,
}]
@@ -6669,22 +7134,6 @@ class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
}]
-class YoutubeStoriesIE(InfoExtractor):
- IE_DESC = 'YouTube channel stories; "ytstories:" prefix'
- IE_NAME = 'youtube:stories'
- _VALID_URL = r'ytstories:UC(?P<id>[A-Za-z0-9_-]{21}[AQgw])$'
- _TESTS = [{
- 'url': 'ytstories:UCwFCb4jeqaKWnciAYM-ZVHg',
- 'only_matching': True,
- }]
-
- def _real_extract(self, url):
- playlist_id = f'RLTD{self._match_id(url)}'
- return self.url_result(
- smuggle_url(f'https://www.youtube.com/playlist?list={playlist_id}&playnext=1', {'is_story': True}),
- ie=YoutubeTabIE, video_id=playlist_id)
-
-
class YoutubeShortsAudioPivotIE(InfoExtractor):
IE_DESC = 'YouTube Shorts audio pivot (Shorts using audio of a given video)'
IE_NAME = 'youtube:shorts:pivot:audio'
@@ -6784,11 +7233,14 @@ class YoutubeClipIE(YoutubeTabBaseInfoExtractor):
'title': 'Mobile Games on Console - Scott The Woz',
'upload_date': '20210920',
'uploader': 'Scott The Woz',
- 'uploader_id': 'scottthewoz',
- 'uploader_url': 'http://www.youtube.com/user/scottthewoz',
+ 'uploader_id': '@ScottTheWoz',
+ 'uploader_url': 'https://www.youtube.com/@ScottTheWoz',
'view_count': int,
'live_status': 'not_live',
- 'channel_follower_count': int
+ 'channel_follower_count': int,
+ 'chapters': 'count:20',
+ 'comment_count': int,
+ 'heatmap': 'count:100',
}
}]
@@ -6816,6 +7268,53 @@ class YoutubeClipIE(YoutubeTabBaseInfoExtractor):
}
+class YoutubeConsentRedirectIE(YoutubeBaseInfoExtractor):
+ IE_NAME = 'youtube:consent'
+ IE_DESC = False # Do not list
+ _VALID_URL = r'https?://consent\.youtube\.com/m\?'
+ _TESTS = [{
+ 'url': 'https://consent.youtube.com/m?continue=https%3A%2F%2Fwww.youtube.com%2Flive%2FqVv6vCqciTM%3Fcbrd%3D1&gl=NL&m=0&pc=yt&hl=en&src=1',
+ 'info_dict': {
+ 'id': 'qVv6vCqciTM',
+ 'ext': 'mp4',
+ 'age_limit': 0,
+ 'uploader_id': '@sana_natori',
+ 'comment_count': int,
+ 'chapters': 'count:13',
+ 'upload_date': '20221223',
+ 'thumbnail': 'https://i.ytimg.com/vi/qVv6vCqciTM/maxresdefault.jpg',
+ 'channel_url': 'https://www.youtube.com/channel/UCIdEIHpS0TdkqRkHL5OkLtA',
+ 'uploader_url': 'https://www.youtube.com/@sana_natori',
+ 'like_count': int,
+ 'release_date': '20221223',
+ 'tags': ['Vtuber', '月ノ美兎', '名取さな', 'にじさんじ', 'クリスマス', '3D配信'],
+ 'title': '【 #インターネット女クリスマス 】3Dで歌ってはしゃぐインターネットの女たち【月ノ美兎/名取さな】',
+ 'view_count': int,
+ 'playable_in_embed': True,
+ 'duration': 4438,
+ 'availability': 'public',
+ 'channel_follower_count': int,
+ 'channel_id': 'UCIdEIHpS0TdkqRkHL5OkLtA',
+ 'categories': ['Entertainment'],
+ 'live_status': 'was_live',
+ 'release_timestamp': 1671793345,
+ 'channel': 'さなちゃんねる',
+ 'description': 'md5:6aebf95cc4a1d731aebc01ad6cc9806d',
+ 'uploader': 'さなちゃんねる',
+ 'channel_is_verified': True,
+ 'heatmap': 'count:100',
+ },
+ 'add_ie': ['Youtube'],
+ 'params': {'skip_download': 'Youtube'},
+ }]
+
+ def _real_extract(self, url):
+ redirect_url = url_or_none(parse_qs(url).get('continue', [None])[-1])
+ if not redirect_url:
+ raise ExtractorError('Invalid cookie consent redirect URL', expected=True)
+ return self.url_result(redirect_url)
+
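
Paired with the (?!consent\.) lookahead added to YoutubeTabIE._VALID_URL earlier in this diff, this extractor picks up consent interstitials and follows their continue parameter. Worked through with the test URL above (stdlib equivalents of the helpers):

    from urllib.parse import parse_qs, urlparse

    url = ('https://consent.youtube.com/m?continue=https%3A%2F%2Fwww.youtube.com'
           '%2Flive%2FqVv6vCqciTM%3Fcbrd%3D1&gl=NL&m=0&pc=yt&hl=en&src=1')
    parse_qs(urlparse(url).query)['continue'][-1]
    # -> 'https://www.youtube.com/live/qVv6vCqciTM?cbrd=1'
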
+
class YoutubeTruncatedIDIE(InfoExtractor):
IE_NAME = 'youtube:truncated_id'
IE_DESC = False # Do not list
diff --git a/hypervideo_dl/extractor/zaiko.py b/hypervideo_dl/extractor/zaiko.py
new file mode 100644
index 0000000..0ccacbb
--- /dev/null
+++ b/hypervideo_dl/extractor/zaiko.py
@@ -0,0 +1,130 @@
+import base64
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ extract_attributes,
+ int_or_none,
+ str_or_none,
+ traverse_obj,
+ try_call,
+ unescapeHTML,
+ url_or_none,
+)
+
+
+class ZaikoBaseIE(InfoExtractor):
+ def _download_real_webpage(self, url, video_id):
+ webpage, urlh = self._download_webpage_handle(url, video_id)
+ final_url = urlh.url
+ if 'zaiko.io/login' in final_url:
+ self.raise_login_required()
+ elif '/_buy/' in final_url:
+ raise ExtractorError('Your account does not have tickets to this event', expected=True)
+ return webpage
+
+ def _parse_vue_element_attr(self, name, string, video_id):
+ page_elem = self._search_regex(rf'(<{name}[^>]+>)', string, name)
+ attrs = {}
+ for key, value in extract_attributes(page_elem).items():
+ if key.startswith(':'):
+ attrs[key[1:]] = self._parse_json(
+ value, video_id, transform_source=unescapeHTML, fatal=False)
+ return attrs
+
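
What _parse_vue_element_attr pulls out, on an assumed fragment of the event page (Vue bind-attributes start with ':' and carry HTML-escaped JSON):

    # <stream-page :stream-access="{&quot;video_source&quot;: &quot;https://example.invalid/player&quot;}">
    # becomes, after unescapeHTML + _parse_json:
    # {'stream-access': {'video_source': 'https://example.invalid/player'}}
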
+
+class ZaikoIE(ZaikoBaseIE):
+ _VALID_URL = r'https?://(?:[\w-]+\.)?zaiko\.io/event/(?P<id>\d+)/stream(?:/\d+)+'
+ _TESTS = [{
+ 'url': 'https://zaiko.io/event/324868/stream/20571/20571',
+ 'info_dict': {
+ 'id': '324868',
+ 'ext': 'mp4',
+ 'title': 'ZAIKO STREAMING TEST',
+ 'alt_title': '[VOD] ZAIKO STREAMING TEST_20210603(Do Not Delete)',
+ 'uploader_id': '454',
+ 'uploader': 'ZAIKO ZERO',
+ 'release_timestamp': 1583809200,
+            'thumbnail': r're:https://[a-z0-9]+\.cloudfront\.net/[a-z0-9_]+/[a-z0-9_]+',
+ 'release_date': '20200310',
+ 'categories': ['Tech House'],
+ 'live_status': 'was_live',
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_real_webpage(url, video_id)
+ stream_meta = self._parse_vue_element_attr('stream-page', webpage, video_id)
+
+ player_page = self._download_webpage(
+ stream_meta['stream-access']['video_source'], video_id,
+ 'Downloading player page', headers={'referer': 'https://zaiko.io/'})
+ player_meta = self._parse_vue_element_attr('player', player_page, video_id)
+ status = traverse_obj(player_meta, ('initial_event_info', 'status', {str}))
+ live_status, msg, expected = {
+ 'vod': ('was_live', 'No VOD stream URL was found', False),
+ 'archiving': ('post_live', 'Event VOD is still being processed', True),
+ 'deleting': ('post_live', 'This event has ended', True),
+ 'deleted': ('post_live', 'This event has ended', True),
+ 'error': ('post_live', 'This event has ended', True),
+ 'disconnected': ('post_live', 'Stream has been disconnected', True),
+ 'live_to_disconnected': ('post_live', 'Stream has been disconnected', True),
+            'live': ('is_live', 'No livestream URL was found', False),
+ 'waiting': ('is_upcoming', 'Live event has not yet started', True),
+ 'cancelled': ('not_live', 'Event has been cancelled', True),
+ }.get(status) or ('not_live', f'Unknown event status "{status}"', False)
+
+ stream_url = traverse_obj(player_meta, ('initial_event_info', 'endpoint', {url_or_none}))
+ formats = self._extract_m3u8_formats(
+ stream_url, video_id, live=True, fatal=False) if stream_url else []
+ if not formats:
+ self.raise_no_formats(msg, expected=expected)
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'live_status': live_status,
+ **traverse_obj(stream_meta, {
+ 'title': ('event', 'name', {str}),
+ 'uploader': ('profile', 'name', {str}),
+ 'uploader_id': ('profile', 'id', {str_or_none}),
+ 'release_timestamp': ('stream', 'start', 'timestamp', {int_or_none}),
+ 'categories': ('event', 'genres', ..., {lambda x: x or None}),
+ }),
+ **traverse_obj(player_meta, ('initial_event_info', {
+ 'alt_title': ('title', {str}),
+ 'thumbnail': ('poster_url', {url_or_none}),
+ })),
+ }
+
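
The table above is a simple dispatch dict with a computed fallback; dict.get() returns None for unknown statuses, so the `or` supplies the default tuple:

    # 'archiving'      -> ('post_live', 'Event VOD is still being processed', True)
    # 'definitely-new' -> ('not_live', 'Unknown event status "definitely-new"', False)
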
+
+class ZaikoETicketIE(ZaikoBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?zaiko\.io/account/eticket/(?P<id>[\w=-]{49})'
+ _TESTS = [{
+ 'url': 'https://zaiko.io/account/eticket/TZjMwMzQ2Y2EzMXwyMDIzMDYwNzEyMTMyNXw1MDViOWU2Mw==',
+ 'playlist_count': 1,
+ 'info_dict': {
+ 'id': 'f30346ca31-20230607121325-505b9e63',
+ 'title': 'ZAIKO STREAMING TEST',
+ 'thumbnail': 'https://media.zkocdn.net/pf_1/1_3wdyjcjyupseatkwid34u',
+ },
+ 'skip': 'Only available with the ticketholding account',
+ }]
+
+ def _real_extract(self, url):
+ ticket_id = self._match_id(url)
+ ticket_id = try_call(
+ lambda: base64.urlsafe_b64decode(ticket_id[1:]).decode().replace('|', '-')) or ticket_id
+
+ webpage = self._download_real_webpage(url, ticket_id)
+ eticket = self._parse_vue_element_attr('eticket', webpage, ticket_id)
+
+ return self.playlist_result(
+ [self.url_result(stream, ZaikoIE) for stream in traverse_obj(eticket, ('streams', ..., 'url'))],
+ ticket_id, **traverse_obj(eticket, ('ticket-details', {
+ 'title': 'event_name',
+ 'thumbnail': 'event_img_url',
+ })))
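
The ticket-ID normalisation in _real_extract can be traced with the test URL's ID: drop the leading character, urlsafe-base64-decode the rest, then turn the '|' separators into '-':

    import base64

    ticket_id = 'TZjMwMzQ2Y2EzMXwyMDIzMDYwNzEyMTMyNXw1MDViOWU2Mw=='
    base64.urlsafe_b64decode(ticket_id[1:]).decode().replace('|', '-')
    # -> 'f30346ca31-20230607121325-505b9e63'  (the 'id' in the test above)
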
diff --git a/hypervideo_dl/extractor/zattoo.py b/hypervideo_dl/extractor/zattoo.py
index 22620c0..6bd9ea0 100644
--- a/hypervideo_dl/extractor/zattoo.py
+++ b/hypervideo_dl/extractor/zattoo.py
@@ -2,7 +2,8 @@ import re
from uuid import uuid4
from .common import InfoExtractor
-from ..compat import compat_HTTPError, compat_str
+from ..compat import compat_str
+from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
int_or_none,
@@ -36,7 +37,7 @@ class ZattooPlatformBaseIE(InfoExtractor):
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
})
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 400:
raise ExtractorError(
'Unable to login: incorrect username and/or password',
expected=True)
diff --git a/hypervideo_dl/extractor/zdf.py b/hypervideo_dl/extractor/zdf.py
index fca426a..c04d51b 100644
--- a/hypervideo_dl/extractor/zdf.py
+++ b/hypervideo_dl/extractor/zdf.py
@@ -24,7 +24,7 @@ from ..utils import (
class ZDFBaseIE(InfoExtractor):
_GEO_COUNTRIES = ['DE']
- _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh', 'hd')
+ _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh', 'hd', 'fhd', 'uhd')
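
With 'fhd' and 'uhd' appended, the tuple ordering still doubles as a preference scale, assuming it is consumed by the qualities() helper from utils (index in the tuple = preference, unknown label = -1):

    from hypervideo_dl.utils import qualities

    q = qualities(('auto', 'low', 'med', 'high', 'veryhigh', 'hd', 'fhd', 'uhd'))
    q('uhd'), q('hd'), q('bogus')  # (7, 5, -1) - higher wins
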
def _call_api(self, url, video_id, item, api_token=None, referrer=None):
headers = {}
@@ -61,6 +61,9 @@ class ZDFBaseIE(InfoExtractor):
elif mime_type == 'application/f4m+xml' or ext == 'f4m':
new_formats = self._extract_f4m_formats(
update_url_query(format_url, {'hdcore': '3.7.0'}), video_id, f4m_id='hds', fatal=False)
+ elif ext == 'mpd':
+ new_formats = self._extract_mpd_formats(
+ format_url, video_id, mpd_id='dash', fatal=False)
else:
f = parse_codecs(meta.get('mimeCodec'))
if not f and meta.get('type'):
@@ -174,7 +177,8 @@ class ZDFIE(ZDFBaseIE):
'thumbnail': 'md5:e65f459f741be5455c952cd820eb188e',
'title': 'heute journal vom 30.12.2021',
'timestamp': 1640897100,
- }
+ },
+ 'skip': 'No longer available: "Diese Seite wurde leider nicht gefunden"',
}, {
'url': 'https://www.zdf.de/dokumentation/terra-x/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100.html',
'info_dict': {
@@ -189,7 +193,7 @@ class ZDFIE(ZDFBaseIE):
},
}, {
'url': 'https://www.zdf.de/funk/druck-11790/funk-alles-ist-verzaubert-102.html',
- 'md5': '1b93bdec7d02fc0b703c5e7687461628',
+ 'md5': '57af4423db0455a3975d2dc4578536bc',
'info_dict': {
'ext': 'mp4',
'id': 'video_funk_1770473',
@@ -198,7 +202,7 @@ class ZDFIE(ZDFBaseIE):
'title': 'Alles ist verzaubert',
'timestamp': 1635520560,
'upload_date': '20211029',
- 'thumbnail': 'https://www.zdf.de/assets/teaser-funk-alles-ist-verzaubert-100~1920x1080?cb=1636466431799',
+ 'thumbnail': 'https://www.zdf.de/assets/teaser-funk-alles-ist-verzaubert-102~1920x1080?cb=1663848412907',
},
}, {
# Same as https://www.phoenix.de/sendungen/dokumentationen/gesten-der-maechtigen-i-a-89468.html?ref=suche
@@ -241,10 +245,23 @@ class ZDFIE(ZDFBaseIE):
'title': 'Das Geld anderer Leute',
'description': 'md5:cb6f660850dc5eb7d1ab776ea094959d',
'duration': 2581.0,
- 'timestamp': 1654790700,
- 'upload_date': '20220609',
+ 'timestamp': 1675160100,
+ 'upload_date': '20230131',
'thumbnail': 'https://epg-image.zdf.de/fotobase-webdelivery/images/e2d7e55a-09f0-424e-ac73-6cac4dd65f35?layout=2400x1350',
},
+ }, {
+ 'url': 'https://www.zdf.de/dokumentation/terra-x/unser-gruener-planet-wuesten-doku-100.html',
+ 'info_dict': {
+ 'id': '220605_dk_gruener_planet_wuesten_tex',
+ 'ext': 'mp4',
+ 'title': 'Unser grüner Planet - Wüsten',
+ 'description': 'md5:4fc647b6f9c3796eea66f4a0baea2862',
+ 'duration': 2613.0,
+ 'timestamp': 1654450200,
+ 'upload_date': '20220605',
+ 'format_note': 'uhd, main',
+ 'thumbnail': 'https://www.zdf.de/assets/saguaro-kakteen-102~3840x2160?cb=1655910690796',
+ },
}]
def _extract_entry(self, url, player, content, video_id):
@@ -259,7 +276,7 @@ class ZDFIE(ZDFBaseIE):
raise ExtractorError('Could not extract ptmd_path')
info = self._extract_ptmd(
- urljoin(url, ptmd_path.replace('{playerId}', 'ngplayer_2_4')), video_id, player['apiToken'], url)
+ urljoin(url, ptmd_path.replace('{playerId}', 'android_native_5')), video_id, player['apiToken'], url)
thumbnails = []
layouts = try_get(
diff --git a/hypervideo_dl/extractor/zee5.py b/hypervideo_dl/extractor/zee5.py
index a64eb9e..ca79cf0 100644
--- a/hypervideo_dl/extractor/zee5.py
+++ b/hypervideo_dl/extractor/zee5.py
@@ -1,14 +1,16 @@
import json
-import random
-import string
+import time
+import uuid
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
ExtractorError,
int_or_none,
+ jwt_decode_hs256,
parse_age_limit,
str_or_none,
+ try_call,
try_get,
unified_strdate,
unified_timestamp,
@@ -94,12 +96,12 @@ class Zee5IE(InfoExtractor):
'url': 'https://www.zee5.com/music-videos/details/adhento-gaani-vunnapaatuga-jersey-nani-shraddha-srinath/0-0-56973',
'only_matching': True
}]
- _DETAIL_API_URL = 'https://spapi.zee5.com/singlePlayback/getDetails/secure?content_id={}&device_id={}&platform_name=desktop_web&country=IN&check_parental_control=false'
- _DEVICE_ID = ''.join(random.choices(string.ascii_letters + string.digits, k=20)).ljust(32, '0')
+ _DEVICE_ID = str(uuid.uuid4())
_USER_TOKEN = None
_LOGIN_HINT = 'Use "--username <mobile_number>" to login using otp or "--username token" and "--password <user_token>" to login using user token.'
_NETRC_MACHINE = 'zee5'
_GEO_COUNTRIES = ['IN']
+ _USER_COUNTRY = None
def _perform_login(self, username, password):
if len(username) == 10 and username.isdigit() and self._USER_TOKEN is None:
@@ -118,16 +120,21 @@ class Zee5IE(InfoExtractor):
self._USER_TOKEN = otp_verify_json.get('token')
if not self._USER_TOKEN:
raise ExtractorError(otp_request_json['message'], expected=True)
- elif username.lower() == 'token' and len(password) > 1198:
+ elif username.lower() == 'token' and try_call(lambda: jwt_decode_hs256(password)):
self._USER_TOKEN = password
else:
raise ExtractorError(self._LOGIN_HINT, expected=True)
+ token = jwt_decode_hs256(self._USER_TOKEN)
+ if token.get('exp', 0) <= int(time.time()):
+ raise ExtractorError('User token has expired', expected=True)
+ self._USER_COUNTRY = token.get('current_country')
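+ # Illustrative check (payload values assumed): a token decoding to
+ # {'exp': 1700000000, 'current_country': 'IN'} is rejected once
+ # time.time() passes 1700000000; otherwise 'IN' pins the API country.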
+
def _real_extract(self, url):
video_id, display_id = self._match_valid_url(url).group('id', 'display_id')
access_token_request = self._download_json(
- 'https://useraction.zee5.com/token/platform_tokens.php?platform_name=web_app',
- video_id, note='Downloading access token')
+ 'https://launchapi.zee5.com/launch?platform_name=web_app',
+ video_id, note='Downloading access token')['platform_token']
data = {
'x-access-token': access_token_request['token']
}
@@ -137,8 +144,13 @@ class Zee5IE(InfoExtractor):
data['X-Z5-Guest-Token'] = self._DEVICE_ID
json_data = self._download_json(
- self._DETAIL_API_URL.format(video_id, self._DEVICE_ID),
- video_id, headers={'content-type': 'application/json'}, data=json.dumps(data).encode('utf-8'))
+ 'https://spapi.zee5.com/singlePlayback/getDetails/secure', video_id, query={
+ 'content_id': video_id,
+ 'device_id': self._DEVICE_ID,
+ 'platform_name': 'desktop_web',
+ 'country': self._USER_COUNTRY or self.get_param('geo_bypass_country') or 'IN',
+ 'check_parental_control': False,
+ }, headers={'content-type': 'application/json'}, data=json.dumps(data).encode('utf-8'))
asset_data = json_data['assetDetails']
show_data = json_data.get('showDetails', {})
if 'premium' in asset_data['business_type']:
@@ -228,8 +240,8 @@ class Zee5SeriesIE(InfoExtractor):
def _entries(self, show_id):
access_token_request = self._download_json(
- 'https://useraction.zee5.com/token/platform_tokens.php?platform_name=web_app',
- show_id, note='Downloading access token')
+ 'https://launchapi.zee5.com/launch?platform_name=web_app',
+ show_id, note='Downloading access token')['platform_token']
headers = {
'X-Access-Token': access_token_request['token'],
'Referer': 'https://www.zee5.com/',
diff --git a/hypervideo_dl/extractor/zingmp3.py b/hypervideo_dl/extractor/zingmp3.py
index a818c9f..007658c 100644
--- a/hypervideo_dl/extractor/zingmp3.py
+++ b/hypervideo_dl/extractor/zingmp3.py
@@ -1,16 +1,11 @@
-import functools
import hashlib
import hmac
+import itertools
import json
import urllib.parse
from .common import InfoExtractor
-from ..utils import (
- OnDemandPagedList,
- int_or_none,
- traverse_obj,
- urljoin,
-)
+from ..utils import int_or_none, traverse_obj, try_call, urljoin
class ZingMp3BaseIE(InfoExtractor):
@@ -37,6 +32,7 @@ class ZingMp3BaseIE(InfoExtractor):
'info-artist': '/api/v2/page/get/artist',
'user-list-song': '/api/v2/song/get/list',
'user-list-video': '/api/v2/video/get/list',
+ 'hub': '/api/v2/page/get/hub-detail',
}
def _api_url(self, url_type, params):
@@ -46,9 +42,9 @@ class ZingMp3BaseIE(InfoExtractor):
''.join(f'{k}={v}' for k, v in sorted(params.items())).encode()).hexdigest()
data = {
**params,
- 'apiKey': '88265e23d4284f25963e6eedac8fbfa3',
- 'sig': hmac.new(
- b'2aa2d1c561e809b267f3638c4a307aab', f'{api_slug}{sha256}'.encode(), hashlib.sha512).hexdigest(),
+ 'apiKey': 'X5BM3w8N7MKozC0B85o4KMlzLZKhV00y',
+ 'sig': hmac.new(b'acOrvUS15XRW2o9JksiK1KgQ6Vbds8ZW',
+ f'{api_slug}{sha256}'.encode(), hashlib.sha512).hexdigest(),
}
return f'{self._DOMAIN}{api_slug}?{urllib.parse.urlencode(data)}'
@@ -67,6 +63,19 @@ class ZingMp3BaseIE(InfoExtractor):
for url in traverse_obj(items, (..., 'link')) or []:
yield self.url_result(urljoin(self._DOMAIN, url))
+ def _fetch_page(self, id_, url_type, page):
+ raise NotImplementedError('This method must be implemented by subclasses')
+
+ def _paged_list(self, _id, url_type):
+ count = 0
+ for page in itertools.count(1):
+ data = self._fetch_page(_id, url_type, page)
+ entries = list(self._parse_items(data.get('items')))
+ count += len(entries)
+ yield from entries
+ if not data.get('hasMore') or try_call(lambda: count > data['total']):
+ break
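+
+ # Pagination contract (sketch): _fetch_page implementations are expected to
+ # return a dict like {'items': [...], 'hasMore': bool, 'total': int};
+ # _paged_list requests successive pages until 'hasMore' is falsy or the
+ # number of yielded entries exceeds 'total'.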
+
class ZingMp3IE(ZingMp3BaseIE):
_VALID_URL = ZingMp3BaseIE._VALID_URL_TMPL % 'bai-hat|video-clip|embed'
@@ -166,8 +175,11 @@ class ZingMp3IE(ZingMp3BaseIE):
'height': int_or_none(res),
})
- if not formats and item.get('msg') == 'Sorry, this content is not available in your country.':
- self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True)
+ if not formats:
+ if item.get('msg') == 'Sorry, this content is not available in your country.':
+ self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True)
+ else:
+ self.raise_no_formats('The song is only for VIP accounts.')
lyric = item.get('lyric') or self._call_api('lyric', {'id': item_id}, fatal=False).get('file')
@@ -200,7 +212,7 @@ class ZingMp3AlbumIE(ZingMp3BaseIE):
'id': 'ZWZAEZZD',
'title': 'Những Bài Hát Hay Nhất Của Mr. Siro',
},
- 'playlist_mincount': 49,
+ 'playlist_mincount': 20,
}, {
'url': 'http://mp3.zing.vn/playlist/Duong-Hong-Loan-apollobee/IWCAACCB.html',
'only_matching': True,
@@ -305,22 +317,20 @@ class ZingMp3ChartMusicVideoIE(ZingMp3BaseIE):
'id': 'IWZ9Z086',
'title': 'the-loai-video_Khong-Loi',
},
- 'playlist_mincount': 10,
+ 'playlist_mincount': 1,
}]
def _fetch_page(self, song_id, url_type, page):
- return self._parse_items(self._call_api(url_type, {
+ return self._call_api(url_type, {
'id': song_id,
'type': 'genre',
- 'page': page + 1,
+ 'page': page,
'count': self._PER_PAGE
- }).get('items'))
+ })
def _real_extract(self, url):
song_id, regions, url_type = self._match_valid_url(url).group('id', 'regions', 'type')
- return self.playlist_result(
- OnDemandPagedList(functools.partial(self._fetch_page, song_id, url_type), self._PER_PAGE),
- song_id, f'{url_type}_{regions}')
+ return self.playlist_result(self._paged_list(song_id, url_type), song_id, f'{url_type}_{regions}')
class ZingMp3UserIE(ZingMp3BaseIE):
@@ -331,7 +341,7 @@ class ZingMp3UserIE(ZingMp3BaseIE):
'info_dict': {
'id': 'IWZ98609',
'title': 'Mr. Siro - bai-hat',
- 'description': 'md5:85ab29bd7b21725c12bf76fd1d6922e5',
+ 'description': 'md5:5bdcf45e955dc1b8d7f518f322ffef36',
},
'playlist_mincount': 91,
}, {
@@ -339,7 +349,7 @@ class ZingMp3UserIE(ZingMp3BaseIE):
'info_dict': {
'id': 'IWZ98609',
'title': 'Mr. Siro - album',
- 'description': 'md5:85ab29bd7b21725c12bf76fd1d6922e5',
+ 'description': 'md5:5bdcf45e955dc1b8d7f518f322ffef36',
},
'playlist_mincount': 3,
}, {
@@ -347,7 +357,7 @@ class ZingMp3UserIE(ZingMp3BaseIE):
'info_dict': {
'id': 'IWZ98609',
'title': 'Mr. Siro - single',
- 'description': 'md5:85ab29bd7b21725c12bf76fd1d6922e5',
+ 'description': 'md5:5bdcf45e955dc1b8d7f518f322ffef36',
},
'playlist_mincount': 20,
}, {
@@ -355,19 +365,19 @@ class ZingMp3UserIE(ZingMp3BaseIE):
'info_dict': {
'id': 'IWZ98609',
'title': 'Mr. Siro - video',
- 'description': 'md5:85ab29bd7b21725c12bf76fd1d6922e5',
+ 'description': 'md5:5bdcf45e955dc1b8d7f518f322ffef36',
},
'playlist_mincount': 15,
}]
def _fetch_page(self, user_id, url_type, page):
url_type = 'user-list-song' if url_type == 'bai-hat' else 'user-list-video'
- return self._parse_items(self._call_api(url_type, {
+ return self._call_api(url_type, {
'id': user_id,
'type': 'artist',
- 'page': page + 1,
+ 'page': page,
'count': self._PER_PAGE
- }, query={'sort': 'new', 'sectionId': 'aSong'}).get('items'))
+ })
def _real_extract(self, url):
user_alias, url_type = self._match_valid_url(url).group('user', 'type')
@@ -376,10 +386,41 @@ class ZingMp3UserIE(ZingMp3BaseIE):
user_info = self._call_api('info-artist', {}, user_alias, query={'alias': user_alias})
if url_type in ('bai-hat', 'video'):
- entries = OnDemandPagedList(
- functools.partial(self._fetch_page, user_info['id'], url_type), self._PER_PAGE)
+ entries = self._paged_list(user_info['id'], url_type)
else:
entries = self._parse_items(traverse_obj(user_info, (
- 'sections', lambda _, v: v['link'] == f'/{user_alias}/{url_type}', 'items', ...)))
+ 'sections',
+ lambda _, v: v['sectionId'] == 'aAlbum' if url_type == 'album' else v['sectionId'] == 'aSingle',
+ 'items', ...)))
return self.playlist_result(
entries, user_info['id'], f'{user_info.get("name")} - {url_type}', user_info.get('biography'))
+
+
+class ZingMp3HubIE(ZingMp3BaseIE):
+ IE_NAME = 'zingmp3:hub'
+ _VALID_URL = r'https?://(?:mp3\.zing|zingmp3)\.vn/(?P<type>hub)/(?P<regions>[^/]+)/(?P<id>[^\.]+)'
+ _TESTS = [{
+ 'url': 'https://zingmp3.vn/hub/Nhac-Moi/IWZ9Z0CA.html',
+ 'info_dict': {
+ 'id': 'IWZ9Z0CA',
+ 'title': 'Nhạc Mới',
+ 'description': 'md5:1cc31b68a6f746427b07b2756c22a558',
+ },
+ 'playlist_mincount': 20,
+ }, {
+ 'url': 'https://zingmp3.vn/hub/Nhac-Viet/IWZ9Z087.html',
+ 'info_dict': {
+ 'id': 'IWZ9Z087',
+ 'title': 'Nhạc Việt',
+ 'description': 'md5:acc976c8bdde64d5c6ee4a92c39f7a77',
+ },
+ 'playlist_mincount': 30,
+ }]
+
+ def _real_extract(self, url):
+ song_id, regions, url_type = self._match_valid_url(url).group('id', 'regions', 'type')
+ hub_detail = self._call_api(url_type, {'id': song_id})
+ entries = self._parse_items(traverse_obj(hub_detail, (
+ 'sections', lambda _, v: v['sectionId'] == 'hub', 'items', ...)))
+ return self.playlist_result(
+ entries, song_id, hub_detail.get('title'), hub_detail.get('description'))
diff --git a/hypervideo_dl/extractor/zoom.py b/hypervideo_dl/extractor/zoom.py
index ef8b715..3d7ccca 100644
--- a/hypervideo_dl/extractor/zoom.py
+++ b/hypervideo_dl/extractor/zoom.py
@@ -5,6 +5,7 @@ from ..utils import (
str_or_none,
js_to_json,
parse_filesize,
+ traverse_obj,
urlencode_postdata,
urljoin,
)
@@ -12,8 +13,8 @@ from ..utils import (
class ZoomIE(InfoExtractor):
IE_NAME = 'zoom'
- _VALID_URL = r'(?P<base_url>https?://(?:[^.]+\.)?zoom.us/)rec(?:ording)?/(?:play|share)/(?P<id>[A-Za-z0-9_.-]+)'
- _TEST = {
+ _VALID_URL = r'(?P<base_url>https?://(?:[^.]+\.)?zoom\.us/)rec(?:ording)?/(?P<type>play|share)/(?P<id>[A-Za-z0-9_.-]+)'
+ _TESTS = [{
'url': 'https://economist.zoom.us/rec/play/dUk_CNBETmZ5VA2BwEl-jjakPpJ3M1pcfVYAPRsoIbEByGsLjUZtaa4yCATQuOL3der8BlTwxQePl_j0.EImBkXzTIaPvdZO5',
'md5': 'ab445e8c911fddc4f9adc842c2c5d434',
'info_dict': {
@@ -22,36 +23,73 @@ class ZoomIE(InfoExtractor):
'title': 'China\'s "two sessions" and the new five-year plan',
},
'skip': 'Recording requires email authentication to access',
- }
+ }, {
+ # play URL
+ 'url': 'https://ffgolf.zoom.us/rec/play/qhEhXbrxq1Zoucx8CMtHzq1Z_2YZRPVCqWK_K-2FkEGRsSLDeOX8Tu4P6jtjZcRry8QhIbvKZdtr4UNo.QcPn2debFskI9whJ',
+ 'md5': '2c4b1c4e5213ebf9db293e88d9385bee',
+ 'info_dict': {
+ 'id': 'qhEhXbrxq1Zoucx8CMtHzq1Z_2YZRPVCqWK_K-2FkEGRsSLDeOX8Tu4P6jtjZcRry8QhIbvKZdtr4UNo.QcPn2debFskI9whJ',
+ 'ext': 'mp4',
+ 'title': 'Prépa AF2023 - Séance 5 du 11 avril - R20/VM/GO',
+ },
+ }, {
+ # share URL
+ 'url': 'https://us02web.zoom.us/rec/share/hkUk5Zxcga0nkyNGhVCRfzkA2gX_mzgS3LpTxEEWJz9Y_QpIQ4mZFOUx7KZRZDQA.9LGQBdqmDAYgiZ_8',
+ 'md5': '90fdc7cfcaee5d52d1c817fc03c43c9b',
+ 'info_dict': {
+ 'id': 'hkUk5Zxcga0nkyNGhVCRfzkA2gX_mzgS3LpTxEEWJz9Y_QpIQ4mZFOUx7KZRZDQA.9LGQBdqmDAYgiZ_8',
+ 'ext': 'mp4',
+ 'title': 'Timea Andrea Lelik\'s Personal Meeting Room',
+ },
+ }]
- def _real_extract(self, url):
- base_url, play_id = self._match_valid_url(url).groups()
- webpage = self._download_webpage(url, play_id)
+ def _get_page_data(self, webpage, video_id):
+ return self._search_json(
+ r'window\.__data__\s*=', webpage, 'data', video_id, transform_source=js_to_json)
+ def _get_real_webpage(self, url, base_url, video_id, url_type):
+ webpage = self._download_webpage(url, video_id, note=f'Downloading {url_type} webpage')
try:
form = self._form_hidden_inputs('password_form', webpage)
except ExtractorError:
- form = None
- if form:
- password = self.get_param('videopassword')
- if not password:
- raise ExtractorError(
- 'This video is protected by a passcode, use the --video-password option', expected=True)
- is_meeting = form.get('useWhichPasswd') == 'meeting'
- validation = self._download_json(
- base_url + 'rec/validate%s_passwd' % ('_meet' if is_meeting else ''),
- play_id, 'Validating passcode', 'Wrong passcode', data=urlencode_postdata({
- 'id': form[('meet' if is_meeting else 'file') + 'Id'],
- 'passwd': password,
- 'action': form.get('action'),
- }))
- if not validation.get('status'):
- raise ExtractorError(validation['errorMessage'], expected=True)
- webpage = self._download_webpage(url, play_id)
+ return webpage
+
+ password = self.get_param('videopassword')
+ if not password:
+ raise ExtractorError(
+ 'This video is protected by a passcode, use the --video-password option', expected=True)
+ is_meeting = form.get('useWhichPasswd') == 'meeting'
+ validation = self._download_json(
+ base_url + 'rec/validate%s_passwd' % ('_meet' if is_meeting else ''),
+ video_id, 'Validating passcode', 'Wrong passcode', data=urlencode_postdata({
+ 'id': form[('meet' if is_meeting else 'file') + 'Id'],
+ 'passwd': password,
+ 'action': form.get('action'),
+ }))
+ if not validation.get('status'):
+ raise ExtractorError(validation['errorMessage'], expected=True)
+ return self._download_webpage(url, video_id, note=f'Re-downloading {url_type} webpage')
+
+ def _real_extract(self, url):
+ base_url, url_type, video_id = self._match_valid_url(url).group('base_url', 'type', 'id')
+
+ if url_type == 'share':
+ webpage = self._get_real_webpage(url, base_url, video_id, 'share')
+ meeting_id = self._get_page_data(webpage, video_id)['meetingId']
+ redirect_path = self._download_json(
+ f'{base_url}nws/recording/1.0/play/share-info/{meeting_id}',
+ video_id, note='Downloading share info JSON')['result']['redirectUrl']
+ url = urljoin(base_url, redirect_path)
+
+ webpage = self._get_real_webpage(url, base_url, video_id, 'play')
+ file_id = self._get_page_data(webpage, video_id)['fileId']
+ if not file_id:
+ # When things go wrong, file_id can be an empty string
+ raise ExtractorError('Unable to extract file ID')
- data = self._parse_json(self._search_regex(
- r'(?s)window\.__data__\s*=\s*({.+?});',
- webpage, 'data'), play_id, js_to_json)
+ data = self._download_json(
+ f'{base_url}nws/recording/1.0/play/info/{file_id}', video_id,
+ note='Downloading play info JSON')['result']
subtitles = {}
for _type in ('transcript', 'cc', 'chapter'):
@@ -67,11 +105,11 @@ class ZoomIE(InfoExtractor):
formats.append({
'format_note': 'Camera stream',
'url': str_or_none(data.get('viewMp4Url')),
- 'width': int_or_none(data.get('viewResolvtionsWidth')),
- 'height': int_or_none(data.get('viewResolvtionsHeight')),
- 'format_id': str_or_none(data.get('recordingId')),
+ 'width': int_or_none(traverse_obj(data, ('viewResolvtions', 0))),
+ 'height': int_or_none(traverse_obj(data, ('viewResolvtions', 1))),
+ 'format_id': str_or_none(traverse_obj(data, ('recording', 'id'))),
'ext': 'mp4',
- 'filesize_approx': parse_filesize(data.get('fileSize')),
+ 'filesize_approx': parse_filesize(str_or_none(traverse_obj(data, ('recording', 'fileSizeInMB')))),
'preference': 0
})
@@ -79,16 +117,16 @@ class ZoomIE(InfoExtractor):
formats.append({
'format_note': 'Screen share stream',
'url': str_or_none(data.get('shareMp4Url')),
- 'width': int_or_none(data.get('shareResolvtionsWidth')),
- 'height': int_or_none(data.get('shareResolvtionsHeight')),
- 'format_id': str_or_none(data.get('shareVideoId')),
+ 'width': int_or_none(traverse_obj(data, ('shareResolvtions', 0))),
+ 'height': int_or_none(traverse_obj(data, ('shareResolvtions', 1))),
+ 'format_id': str_or_none(traverse_obj(data, ('shareVideo', 'id'))),
'ext': 'mp4',
'preference': -1
})
return {
- 'id': play_id,
- 'title': data.get('topic'),
+ 'id': video_id,
+ 'title': str_or_none(traverse_obj(data, ('meet', 'topic'))),
'subtitles': subtitles,
'formats': formats,
'http_headers': {
diff --git a/hypervideo_dl/extractor/zype.py b/hypervideo_dl/extractor/zype.py
index 8cf9945..2f3b4c4 100644
--- a/hypervideo_dl/extractor/zype.py
+++ b/hypervideo_dl/extractor/zype.py
@@ -1,7 +1,7 @@
import re
from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..networking.exceptions import HTTPError
from ..utils import (
dict_get,
ExtractorError,
@@ -37,9 +37,9 @@ class ZypeIE(InfoExtractor):
response = self._download_json(re.sub(
r'\.(?:js|html)\?', '.json?', url), video_id)['response']
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 401, 403):
+ if isinstance(e.cause, HTTPError) and e.cause.status in (400, 401, 403):
raise ExtractorError(self._parse_json(
- e.cause.read().decode(), video_id)['message'], expected=True)
+ e.cause.response.read().decode(), video_id)['message'], expected=True)
raise
body = response['body']
diff --git a/hypervideo_dl/jsinterp.py b/hypervideo_dl/jsinterp.py
index adc5a19..9d989ad 100644
--- a/hypervideo_dl/jsinterp.py
+++ b/hypervideo_dl/jsinterp.py
@@ -9,6 +9,7 @@ import re
from .utils import (
NO_DEFAULT,
ExtractorError,
+ function_with_repr,
js_to_json,
remove_quotes,
truncate_string,
@@ -19,7 +20,12 @@ from .utils import (
def _js_bit_op(op):
def zeroise(x):
- return 0 if x in (None, JS_Undefined) else x
+ if x in (None, JS_Undefined):
+ return 0
+ with contextlib.suppress(TypeError):
+ if math.isnan(x): # NB: NaN cannot be checked by membership
+ return 0
+ return x
def wrapped(a, b):
return op(zeroise(a), zeroise(b)) & 0xffffffff
@@ -38,7 +44,7 @@ def _js_arith_op(op):
def _js_div(a, b):
- if JS_Undefined in (a, b) or not (a and b):
+ if JS_Undefined in (a, b) or not (a or b):
return float('nan')
return (a or 0) / b if b else float('inf')
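+
+# Mirrors JS division semantics (sketch): _js_div(0, 0) -> nan (0/0 is NaN
+# in JS), _js_div(1, 0) -> inf (x/0 is Infinity), _js_div(0, 2) -> 0.0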
@@ -184,7 +190,8 @@ class Debugger:
cls.write('=> Raises:', e, '<-|', stmt, level=allow_recursion)
raise
if cls.ENABLED and stmt.strip():
- cls.write(['->', '=>'][should_ret], repr(ret), '<-|', stmt, level=allow_recursion)
+ if should_ret or repr(ret) != stmt:
+ cls.write(['->', '=>'][should_ret], repr(ret), '<-|', stmt, level=allow_recursion)
return ret, should_ret
return interpret_statement
@@ -205,8 +212,6 @@ class JSInterpreter:
'y': 4096, # Perform a "sticky" search that matches starting at the current position in the target string
}
- _EXC_NAME = '__hypervideo_dl_exception__'
-
def __init__(self, code, objects=None):
self.code, self._functions = code, {}
self._objects = {} if objects is None else objects
@@ -220,6 +225,8 @@ class JSInterpreter:
def _named_object(self, namespace, obj):
self.__named_object_counter += 1
name = f'__hypervideo_dl_jsinterp_obj{self.__named_object_counter}'
+ if callable(obj) and not isinstance(obj, function_with_repr):
+ obj = function_with_repr(obj, f'F<{self.__named_object_counter}>')
namespace[name] = obj
return name
@@ -256,9 +263,11 @@ class JSInterpreter:
elif in_quote == '/' and char in '[]':
in_regex_char_group = char == '['
escaping = not escaping and in_quote and char == '\\'
- after_op = not in_quote and char in OP_CHARS or (char.isspace() and after_op)
+ in_unary_op = (not in_quote and not in_regex_char_group
+ and after_op not in (True, False) and char in '-+')
+ after_op = char if (not in_quote and char in OP_CHARS) else (char.isspace() and after_op)
- if char != delim[pos] or any(counters.values()) or in_quote:
+ if char != delim[pos] or any(counters.values()) or in_quote or in_unary_op:
pos = 0
continue
elif pos != delim_len:
@@ -343,7 +352,10 @@ class JSInterpreter:
inner, outer = self._separate(expr, expr[0], 1)
if expr[0] == '/':
flags, outer = self._regex_flags(outer)
- inner = re.compile(inner[1:], flags=flags)
+ # We don't support regex methods yet, so no point compiling it
+ inner = f'{inner}/{flags}'
+ # Avoid https://github.com/python/cpython/issues/74534
+ # inner = re.compile(inner[1:].replace('[[', r'[\['), flags=flags)
else:
inner = json.loads(js_to_json(f'{inner}{expr[0]}', strict=True))
if not outer:
@@ -354,11 +366,11 @@ class JSInterpreter:
obj = expr[4:]
if obj.startswith('Date('):
left, right = self._separate_at_paren(obj[4:])
- expr = unified_timestamp(
+ date = unified_timestamp(
self.interpret_expression(left, local_vars, allow_recursion), False)
- if not expr:
+ if date is None:
raise self.Exception(f'Failed to parse date {left!r}', expr)
- expr = self._dump(int(expr * 1000), local_vars) + right
+ expr = self._dump(int(date * 1000), local_vars) + right
else:
raise self.Exception(f'Unsupported object {obj}', expr)
@@ -402,10 +414,25 @@ class JSInterpreter:
m = re.match(r'''(?x)
(?P<try>try)\s*\{|
+ (?P<if>if)\s*\(|
(?P<switch>switch)\s*\(|
(?P<for>for)\s*\(
''', expr)
md = m.groupdict() if m else {}
+ if md.get('if'):
+ cndn, expr = self._separate_at_paren(expr[m.end() - 1:])
+ if_expr, expr = self._separate_at_paren(expr.lstrip())
+ # TODO: "else if" is not handled
+ else_expr = None
+ m = re.match(r'else\s*{', expr)
+ if m:
+ else_expr, expr = self._separate_at_paren(expr[m.end() - 1:])
+ cndn = _js_ternary(self.interpret_expression(cndn, local_vars, allow_recursion))
+ ret, should_abort = self.interpret_statement(
+ if_expr if cndn else else_expr, local_vars, allow_recursion)
+ if should_abort:
+ return ret, True
+
if md.get('try'):
try_expr, expr = self._separate_at_paren(expr[m.end() - 1:])
err = None
@@ -418,7 +445,7 @@ class JSInterpreter:
err = e
pending = (None, False)
- m = re.match(r'catch\s*(?P<err>\(\s*{_NAME_RE}\s*\))?\{{'.format(**globals()), expr)
+ m = re.match(fr'catch\s*(?P<err>\(\s*{_NAME_RE}\s*\))?\{{', expr)
if m:
sub_expr, expr = self._separate_at_paren(expr[m.end() - 1:])
if err:
@@ -752,7 +779,7 @@ class JSInterpreter:
obj = {}
obj_m = re.search(
r'''(?x)
- (?<!this\.)%s\s*=\s*{\s*
+ (?<!\.)%s\s*=\s*{\s*
(?P<fields>(%s\s*:\s*function\s*\(.*?\)\s*{.*?}(?:,\s*)?)*)
}\s*;
''' % (re.escape(objname), _FUNC_NAME_RE),
@@ -768,7 +795,8 @@ class JSInterpreter:
fields)
for f in fields_m:
argnames = f.group('args').split(',')
- obj[remove_quotes(f.group('key'))] = self.build_function(argnames, f.group('code'))
+ name = remove_quotes(f.group('key'))
+ obj[name] = function_with_repr(self.build_function(argnames, f.group('code')), f'F<{name}>')
return obj
@@ -784,13 +812,15 @@ class JSInterpreter:
\((?P<args>[^)]*)\)\s*
(?P<code>{.+})''' % {'name': re.escape(funcname)},
self.code)
- code, _ = self._separate_at_paren(func_m.group('code'))
if func_m is None:
raise self.Exception(f'Could not find JS function "{funcname}"')
+ code, _ = self._separate_at_paren(func_m.group('code'))
return [x.strip() for x in func_m.group('args').split(',')], code
def extract_function(self, funcname):
- return self.extract_function_from_code(*self.extract_function_code(funcname))
+ return function_with_repr(
+ self.extract_function_from_code(*self.extract_function_code(funcname)),
+ f'F<{funcname}>')
def extract_function_from_code(self, argnames, code, *global_stack):
local_vars = {}
diff --git a/hypervideo_dl/networking/__init__.py b/hypervideo_dl/networking/__init__.py
new file mode 100644
index 0000000..5e88764
--- /dev/null
+++ b/hypervideo_dl/networking/__init__.py
@@ -0,0 +1,13 @@
+# flake8: noqa: F401
+from .common import (
+ HEADRequest,
+ PUTRequest,
+ Request,
+ RequestDirector,
+ RequestHandler,
+ Response,
+)
+
+# isort: split
+# TODO: all request handlers should be safely imported
+from . import _urllib
diff --git a/hypervideo_dl/networking/_helper.py b/hypervideo_dl/networking/_helper.py
new file mode 100644
index 0000000..c0c7f9c
--- /dev/null
+++ b/hypervideo_dl/networking/_helper.py
@@ -0,0 +1,208 @@
+from __future__ import annotations
+
+import contextlib
+import functools
+import ssl
+import sys
+import typing
+import urllib.parse
+import urllib.request
+
+from .exceptions import RequestError, UnsupportedRequest
+from ..dependencies import certifi
+from ..socks import ProxyType
+from ..utils import format_field, traverse_obj
+
+if typing.TYPE_CHECKING:
+ from collections.abc import Iterable
+
+ from ..utils.networking import HTTPHeaderDict
+
+
+def ssl_load_certs(context: ssl.SSLContext, use_certifi=True):
+ if certifi and use_certifi:
+ context.load_verify_locations(cafile=certifi.where())
+ else:
+ try:
+ context.load_default_certs()
+ # Work around the issue in load_default_certs when there are bad certificates. See:
+ # https://github.com/hypervideo/hypervideo/issues/1060,
+ # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
+ except ssl.SSLError:
+ # enum_certificates is not present in mingw python. See https://github.com/hypervideo/hypervideo/issues/1151
+ if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
+ for storename in ('CA', 'ROOT'):
+ ssl_load_windows_store_certs(context, storename)
+ context.set_default_verify_paths()
+
+
+def ssl_load_windows_store_certs(ssl_context, storename):
+ # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
+ try:
+ certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
+ if encoding == 'x509_asn' and (
+ trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
+ except PermissionError:
+ return
+ for cert in certs:
+ with contextlib.suppress(ssl.SSLError):
+ ssl_context.load_verify_locations(cadata=cert)
+
+
+def make_socks_proxy_opts(socks_proxy):
+ url_components = urllib.parse.urlparse(socks_proxy)
+ if url_components.scheme.lower() == 'socks5':
+ socks_type = ProxyType.SOCKS5
+ rdns = False
+ elif url_components.scheme.lower() == 'socks5h':
+ socks_type = ProxyType.SOCKS5
+ rdns = True
+ elif url_components.scheme.lower() == 'socks4':
+ socks_type = ProxyType.SOCKS4
+ rdns = False
+ elif url_components.scheme.lower() == 'socks4a':
+ socks_type = ProxyType.SOCKS4A
+ rdns = True
+ else:
+ raise ValueError(f'Unknown SOCKS proxy version: {url_components.scheme.lower()}')
+
+ def unquote_if_non_empty(s):
+ if not s:
+ return s
+ return urllib.parse.unquote_plus(s)
+ return {
+ 'proxytype': socks_type,
+ 'addr': url_components.hostname,
+ 'port': url_components.port or 1080,
+ 'rdns': rdns,
+ 'username': unquote_if_non_empty(url_components.username),
+ 'password': unquote_if_non_empty(url_components.password),
+ }
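+
+# Illustrative mapping (hypothetical proxy URL):
+# make_socks_proxy_opts('socks5h://user:pw@127.0.0.1:1080') ->
+# {'proxytype': ProxyType.SOCKS5, 'addr': '127.0.0.1', 'port': 1080,
+#  'rdns': True, 'username': 'user', 'password': 'pw'}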
+
+
+def select_proxy(url, proxies):
+ """Unified proxy selector for all backends"""
+ url_components = urllib.parse.urlparse(url)
+ if 'no' in proxies:
+ hostport = url_components.hostname + format_field(url_components.port, None, ':%s')
+ if urllib.request.proxy_bypass_environment(hostport, {'no': proxies['no']}):
+ return
+ elif urllib.request.proxy_bypass(hostport): # check system settings
+ return
+
+ return traverse_obj(proxies, url_components.scheme or 'http', 'all')
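+
+# Lookup sketch (assumed proxies dict): the URL's scheme is tried first, then 'all':
+# select_proxy('https://example.com', {'https': 'http://127.0.0.1:3128',
+#                                      'all': 'socks5://127.0.0.1:1080'})
+# -> 'http://127.0.0.1:3128'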
+
+
+def get_redirect_method(method, status):
+ """Unified redirect method handling"""
+
+ # A 303 must use either GET or HEAD for the subsequent request
+ # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
+ if status == 303 and method != 'HEAD':
+ method = 'GET'
+ # Browsers commonly turn a POST into a GET for the subsequent request
+ # after 301 and 302 redirects, so we'll do the same.
+ # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
+ # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
+ if status in (301, 302) and method == 'POST':
+ method = 'GET'
+ return method
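+
+# e.g. get_redirect_method('POST', 302) -> 'GET',
+#      get_redirect_method('PUT', 307) -> 'PUT' (method preserved)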
+
+
+def make_ssl_context(
+ verify=True,
+ client_certificate=None,
+ client_certificate_key=None,
+ client_certificate_password=None,
+ legacy_support=False,
+ use_certifi=True,
+):
+ context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
+ context.check_hostname = verify
+ context.verify_mode = ssl.CERT_REQUIRED if verify else ssl.CERT_NONE
+
+ # Some servers may reject requests if ALPN extension is not sent. See:
+ # https://github.com/python/cpython/issues/85140
+ # https://github.com/hypervideo/hypervideo/issues/3878
+ with contextlib.suppress(NotImplementedError):
+ context.set_alpn_protocols(['http/1.1'])
+ if verify:
+ ssl_load_certs(context, use_certifi)
+
+ if legacy_support:
+ context.options |= 4 # SSL_OP_LEGACY_SERVER_CONNECT
+ context.set_ciphers('DEFAULT') # compat
+
+ elif ssl.OPENSSL_VERSION_INFO >= (1, 1, 1) and not ssl.OPENSSL_VERSION.startswith('LibreSSL'):
+ # Use the default SSL ciphers and minimum TLS version settings from Python 3.10 [1].
+ # This is to ensure consistent behavior across Python versions and libraries, and help avoid fingerprinting
+ # in some situations [2][3].
+ # Python 3.10 only supports OpenSSL 1.1.1+ [4]. Because this change is likely
+ # untested on older versions, we only apply this to OpenSSL 1.1.1+ to be safe.
+ # LibreSSL is excluded until further investigation due to cipher support issues [5][6].
+ # 1. https://github.com/python/cpython/commit/e983252b516edb15d4338b0a47631b59ef1e2536
+ # 2. https://github.com/hypervideo/hypervideo/issues/4627
+ # 3. https://github.com/hypervideo/hypervideo/pull/5294
+ # 4. https://peps.python.org/pep-0644/
+ # 5. https://peps.python.org/pep-0644/#libressl-support
+ # 6. https://github.com/hypervideo/hypervideo/commit/5b9f253fa0aee996cf1ed30185d4b502e00609c4#commitcomment-89054368
+ context.set_ciphers(
+ '@SECLEVEL=2:ECDH+AESGCM:ECDH+CHACHA20:ECDH+AES:DHE+AES:!aNULL:!eNULL:!aDSS:!SHA1:!AESCCM')
+ context.minimum_version = ssl.TLSVersion.TLSv1_2
+
+ if client_certificate:
+ try:
+ context.load_cert_chain(
+ client_certificate, keyfile=client_certificate_key,
+ password=client_certificate_password)
+ except ssl.SSLError:
+ raise RequestError('Unable to load client certificate')
+
+ if getattr(context, 'post_handshake_auth', None) is not None:
+ context.post_handshake_auth = True
+ return context
+
+
+class InstanceStoreMixin:
+ def __init__(self, **kwargs):
+ self.__instances = []
+ super().__init__(**kwargs) # So that the full MRO chain is initialized
+
+ @staticmethod
+ def _create_instance(**kwargs):
+ raise NotImplementedError
+
+ def _get_instance(self, **kwargs):
+ for key, instance in self.__instances:
+ if key == kwargs:
+ return instance
+
+ instance = self._create_instance(**kwargs)
+ self.__instances.append((kwargs, instance))
+ return instance
+
+ def _close_instance(self, instance):
+ if callable(getattr(instance, 'close', None)):
+ instance.close()
+
+ def _clear_instances(self):
+ for _, instance in self.__instances:
+ self._close_instance(instance)
+ self.__instances.clear()
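+
+# Subclass sketch (hypothetical): cache one opener per unique kwargs pair.
+# class OpenerStoreRH(InstanceStoreMixin, RequestHandler):
+#     def _create_instance(self, proxies, cookiejar):
+#         return build_opener(proxies, cookiejar)  # hypothetical factory
+# _get_instance() then returns the cached instance for equal kwargs and
+# _clear_instances() closes them all.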
+
+
+def add_accept_encoding_header(headers: HTTPHeaderDict, supported_encodings: Iterable[str]):
+ if 'Accept-Encoding' not in headers:
+ headers['Accept-Encoding'] = ', '.join(supported_encodings) or 'identity'
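+
+# e.g. add_accept_encoding_header(headers, ['gzip', 'deflate']) sets
+# 'Accept-Encoding: gzip, deflate' only if the header is not already present.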
+
+
+def wrap_request_errors(func):
+ @functools.wraps(func)
+ def wrapper(self, *args, **kwargs):
+ try:
+ return func(self, *args, **kwargs)
+ except UnsupportedRequest as e:
+ if e.handler is None:
+ e.handler = self
+ raise
+ return wrapper
diff --git a/hypervideo_dl/networking/_urllib.py b/hypervideo_dl/networking/_urllib.py
new file mode 100644
index 0000000..7dcf538
--- /dev/null
+++ b/hypervideo_dl/networking/_urllib.py
@@ -0,0 +1,454 @@
+from __future__ import annotations
+
+import functools
+import http.client
+import io
+import socket
+import ssl
+import urllib.error
+import urllib.parse
+import urllib.request
+import urllib.response
+import zlib
+from urllib.request import (
+ DataHandler,
+ FileHandler,
+ FTPHandler,
+ HTTPCookieProcessor,
+ HTTPDefaultErrorHandler,
+ HTTPErrorProcessor,
+ UnknownHandler,
+)
+
+from ._helper import (
+ InstanceStoreMixin,
+ add_accept_encoding_header,
+ get_redirect_method,
+ make_socks_proxy_opts,
+ select_proxy,
+)
+from .common import Features, RequestHandler, Response, register_rh
+from .exceptions import (
+ CertificateVerifyError,
+ HTTPError,
+ IncompleteRead,
+ ProxyError,
+ RequestError,
+ SSLError,
+ TransportError,
+)
+from ..dependencies import brotli
+from ..socks import ProxyError as SocksProxyError
+from ..socks import sockssocket
+from ..utils import update_url_query
+from ..utils.networking import normalize_url
+
+SUPPORTED_ENCODINGS = ['gzip', 'deflate']
+CONTENT_DECODE_ERRORS = [zlib.error, OSError]
+
+if brotli:
+ SUPPORTED_ENCODINGS.append('br')
+ CONTENT_DECODE_ERRORS.append(brotli.error)
+
+
+def _create_http_connection(http_class, source_address, *args, **kwargs):
+ hc = http_class(*args, **kwargs)
+
+ if source_address is not None:
+ # This works around _create_connection() from socket, which tries all
+ # address data from getaddrinfo(), including IPv6, by filtering the
+ # getaddrinfo() results based on the source_address value.
+ # This is based on the cpython socket.create_connection() function.
+ # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
+ def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
+ host, port = address
+ err = None
+ addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
+ af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
+ ip_addrs = [addr for addr in addrs if addr[0] == af]
+ if addrs and not ip_addrs:
+ ip_version = 'v4' if af == socket.AF_INET else 'v6'
+ raise OSError(
+ "No remote IP%s addresses available for connect, can't use '%s' as source address"
+ % (ip_version, source_address[0]))
+ for res in ip_addrs:
+ af, socktype, proto, canonname, sa = res
+ sock = None
+ try:
+ sock = socket.socket(af, socktype, proto)
+ if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
+ sock.settimeout(timeout)
+ sock.bind(source_address)
+ sock.connect(sa)
+ err = None # Explicitly break reference cycle
+ return sock
+ except OSError as _:
+ err = _
+ if sock is not None:
+ sock.close()
+ if err is not None:
+ raise err
+ else:
+ raise OSError('getaddrinfo returns an empty list')
+ if hasattr(hc, '_create_connection'):
+ hc._create_connection = _create_connection
+ hc.source_address = (source_address, 0)
+
+ return hc
+
+
+class HTTPHandler(urllib.request.AbstractHTTPHandler):
+ """Handler for HTTP requests and responses.
+
+ This class, when installed with an OpenerDirector, automatically adds
+ the standard headers to every HTTP request and handles gzipped, deflated and
+ brotli responses from web servers.
+
+ Part of this code was copied from:
+
+ http://techknack.net/python-urllib2-handlers/
+
+ Andrew Rowls, the author of that code, agreed to release it to the
+ public domain.
+ """
+
+ def __init__(self, context=None, source_address=None, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self._source_address = source_address
+ self._context = context
+
+ @staticmethod
+ def _make_conn_class(base, req):
+ conn_class = base
+ socks_proxy = req.headers.pop('Ytdl-socks-proxy', None)
+ if socks_proxy:
+ conn_class = make_socks_conn_class(conn_class, socks_proxy)
+ return conn_class
+
+ def http_open(self, req):
+ conn_class = self._make_conn_class(http.client.HTTPConnection, req)
+ return self.do_open(functools.partial(
+ _create_http_connection, conn_class, self._source_address), req)
+
+ def https_open(self, req):
+ conn_class = self._make_conn_class(http.client.HTTPSConnection, req)
+ return self.do_open(
+ functools.partial(
+ _create_http_connection, conn_class, self._source_address),
+ req, context=self._context)
+
+ @staticmethod
+ def deflate(data):
+ if not data:
+ return data
+ try:
+ return zlib.decompress(data, -zlib.MAX_WBITS)
+ except zlib.error:
+ return zlib.decompress(data)
+
+ @staticmethod
+ def brotli(data):
+ if not data:
+ return data
+ return brotli.decompress(data)
+
+ @staticmethod
+ def gz(data):
+ # There may be junk appended to the end of the file
+ # We ignore it by only ever decoding a single gzip payload
+ if not data:
+ return data
+ return zlib.decompress(data, wbits=zlib.MAX_WBITS | 16)
+
+ def http_request(self, req):
+ # According to RFC 3986, URLs cannot contain non-ASCII characters; however, this is
+ # not always respected by websites - some tend to give out URLs with non-percent-encoded
+ # non-ASCII characters (see telemb.py, ard.py [#3412])
+ # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
+ # To work around the aforementioned issue, we replace the request's original URL
+ # with a percent-encoded one
+ # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
+ # the code of this workaround has been moved here from YoutubeDL.urlopen()
+ url = req.get_full_url()
+ url_escaped = normalize_url(url)
+
+ # Substitute URL if any change after escaping
+ if url != url_escaped:
+ req = update_Request(req, url=url_escaped)
+
+ return super().do_request_(req)
+
+ def http_response(self, req, resp):
+ old_resp = resp
+
+ # Content-Encoding header lists the encodings in order that they were applied [1].
+ # To decompress, we simply do the reverse.
+ # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
+ decoded_response = None
+ for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
+ if encoding == 'gzip':
+ decoded_response = self.gz(decoded_response or resp.read())
+ elif encoding == 'deflate':
+ decoded_response = self.deflate(decoded_response or resp.read())
+ elif encoding == 'br' and brotli:
+ decoded_response = self.brotli(decoded_response or resp.read())
+
+ if decoded_response is not None:
+ resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
+ resp.msg = old_resp.msg
+ # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
+ # https://github.com/ytdl-org/youtube-dl/issues/6457).
+ if 300 <= resp.code < 400:
+ location = resp.headers.get('Location')
+ if location:
+ # Per RFC 2616, the default charset is iso-8859-1, which Python 3 respects
+ location = location.encode('iso-8859-1').decode()
+ location_escaped = normalize_url(location)
+ if location != location_escaped:
+ del resp.headers['Location']
+ resp.headers['Location'] = location_escaped
+ return resp
+
+ https_request = http_request
+ https_response = http_response
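+
+# Decode-order sketch: for 'Content-Encoding: gzip, br', http_response()
+# brotli-decompresses first, then gunzips (the reverse of application order).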
+
+
+def make_socks_conn_class(base_class, socks_proxy):
+ assert issubclass(base_class, (
+ http.client.HTTPConnection, http.client.HTTPSConnection))
+
+ proxy_args = make_socks_proxy_opts(socks_proxy)
+
+ class SocksConnection(base_class):
+ def connect(self):
+ self.sock = sockssocket()
+ self.sock.setproxy(**proxy_args)
+ if type(self.timeout) in (int, float): # noqa: E721
+ self.sock.settimeout(self.timeout)
+ self.sock.connect((self.host, self.port))
+
+ if isinstance(self, http.client.HTTPSConnection):
+ self.sock = self._context.wrap_socket(self.sock, server_hostname=self.host)
+
+ return SocksConnection
+
+
+class RedirectHandler(urllib.request.HTTPRedirectHandler):
+ """YoutubeDL redirect handler
+
+ The code is based on HTTPRedirectHandler implementation from CPython [1].
+
+ This redirect handler fixes and improves the logic to better align with RFC 7231
+ and with what browsers tend to do [2][3]
+
+ 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
+ 2. https://datatracker.ietf.org/doc/html/rfc7231
+ 3. https://github.com/python/cpython/issues/91306
+ """
+
+ http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302
+
+ def redirect_request(self, req, fp, code, msg, headers, newurl):
+ if code not in (301, 302, 303, 307, 308):
+ raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
+
+ new_data = req.data
+
+ # Technically the Cookie header should be in unredirected_hdrs,
+ # however in practice some may set it in normal headers anyway.
+ # We will remove it here to prevent any leaks.
+ remove_headers = ['Cookie']
+
+ new_method = get_redirect_method(req.get_method(), code)
+ # only remove payload if method changed (e.g. POST to GET)
+ if new_method != req.get_method():
+ new_data = None
+ remove_headers.extend(['Content-Length', 'Content-Type'])
+
+ new_headers = {k: v for k, v in req.headers.items() if k.title() not in remove_headers}
+
+ return urllib.request.Request(
+ newurl, headers=new_headers, origin_req_host=req.origin_req_host,
+ unverifiable=True, method=new_method, data=new_data)
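+
+# Redirect sketch: a 301/302 response to a POST comes back as a GET with the
+# payload dropped and Content-Length/Content-Type stripped; the Cookie header
+# is always removed to avoid leaking it across redirects.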
+
+
+class ProxyHandler(urllib.request.BaseHandler):
+ handler_order = 100
+
+ def __init__(self, proxies=None):
+ self.proxies = proxies
+ # Set default handlers
+ for type in ('http', 'https', 'ftp'):
+ setattr(self, '%s_open' % type, lambda r, meth=self.proxy_open: meth(r))
+
+ def proxy_open(self, req):
+ proxy = select_proxy(req.get_full_url(), self.proxies)
+ if proxy is None:
+ return
+ if urllib.parse.urlparse(proxy).scheme.lower() in ('socks4', 'socks4a', 'socks5', 'socks5h'):
+ req.add_header('Ytdl-socks-proxy', proxy)
+ # hypervideo's http/https handlers handle wrapping the socket with SOCKS
+ return None
+ return urllib.request.ProxyHandler.proxy_open(
+ self, req, proxy, None)
+
+
+class PUTRequest(urllib.request.Request):
+ def get_method(self):
+ return 'PUT'
+
+
+class HEADRequest(urllib.request.Request):
+ def get_method(self):
+ return 'HEAD'
+
+
+def update_Request(req, url=None, data=None, headers=None, query=None):
+ req_headers = req.headers.copy()
+ req_headers.update(headers or {})
+ req_data = data if data is not None else req.data
+ req_url = update_url_query(url or req.get_full_url(), query)
+ req_get_method = req.get_method()
+ if req_get_method == 'HEAD':
+ req_type = HEADRequest
+ elif req_get_method == 'PUT':
+ req_type = PUTRequest
+ else:
+ req_type = urllib.request.Request
+ new_req = req_type(
+ req_url, data=req_data, headers=req_headers,
+ origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
+ if hasattr(req, 'timeout'):
+ new_req.timeout = req.timeout
+ return new_req
+
+
+class UrllibResponseAdapter(Response):
+ """
+ HTTP Response adapter class for urllib addinfourl and http.client.HTTPResponse
+ """
+
+ def __init__(self, res: http.client.HTTPResponse | urllib.response.addinfourl):
+ # addinfourl: In Python 3.9+, .status was introduced and .getcode() was deprecated [1]
+ # HTTPResponse: .getcode() was deprecated, .status always existed [2]
+ # 1. https://docs.python.org/3/library/urllib.request.html#urllib.response.addinfourl.getcode
+ # 2. https://docs.python.org/3.10/library/http.client.html#http.client.HTTPResponse.status
+ super().__init__(
+ fp=res, headers=res.headers, url=res.url,
+ status=getattr(res, 'status', None) or res.getcode(), reason=getattr(res, 'reason', None))
+
+ def read(self, amt=None):
+ try:
+ return self.fp.read(amt)
+ except Exception as e:
+ handle_response_read_exceptions(e)
+ raise e
+
+
+def handle_sslerror(e: ssl.SSLError):
+ if not isinstance(e, ssl.SSLError):
+ return
+ if isinstance(e, ssl.SSLCertVerificationError):
+ raise CertificateVerifyError(cause=e) from e
+ raise SSLError(cause=e) from e
+
+
+def handle_response_read_exceptions(e):
+ if isinstance(e, http.client.IncompleteRead):
+ raise IncompleteRead(partial=e.partial, cause=e, expected=e.expected) from e
+ elif isinstance(e, ssl.SSLError):
+ handle_sslerror(e)
+ elif isinstance(e, (OSError, EOFError, http.client.HTTPException, *CONTENT_DECODE_ERRORS)):
+ # OSErrors raised here should mostly be network related
+ raise TransportError(cause=e) from e
+
+
+@register_rh
+class UrllibRH(RequestHandler, InstanceStoreMixin):
+ _SUPPORTED_URL_SCHEMES = ('http', 'https', 'data', 'ftp')
+ _SUPPORTED_PROXY_SCHEMES = ('http', 'socks4', 'socks4a', 'socks5', 'socks5h')
+ _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
+ RH_NAME = 'urllib'
+
+ def __init__(self, *, enable_file_urls: bool = False, **kwargs):
+ super().__init__(**kwargs)
+ self.enable_file_urls = enable_file_urls
+ if self.enable_file_urls:
+ self._SUPPORTED_URL_SCHEMES = (*self._SUPPORTED_URL_SCHEMES, 'file')
+
+ def _check_extensions(self, extensions):
+ super()._check_extensions(extensions)
+ extensions.pop('cookiejar', None)
+ extensions.pop('timeout', None)
+
+ def _create_instance(self, proxies, cookiejar):
+ opener = urllib.request.OpenerDirector()
+ handlers = [
+ ProxyHandler(proxies),
+ HTTPHandler(
+ debuglevel=int(bool(self.verbose)),
+ context=self._make_sslcontext(),
+ source_address=self.source_address),
+ HTTPCookieProcessor(cookiejar),
+ DataHandler(),
+ UnknownHandler(),
+ HTTPDefaultErrorHandler(),
+ FTPHandler(),
+ HTTPErrorProcessor(),
+ RedirectHandler(),
+ ]
+
+ if self.enable_file_urls:
+ handlers.append(FileHandler())
+
+ for handler in handlers:
+ opener.add_handler(handler)
+
+ # Delete the default user-agent header, which would otherwise apply in
+ # cases where our custom HTTP handler doesn't come into play
+ # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
+ opener.addheaders = []
+ return opener
+
+ def _send(self, request):
+ headers = self._merge_headers(request.headers)
+ add_accept_encoding_header(headers, SUPPORTED_ENCODINGS)
+ urllib_req = urllib.request.Request(
+ url=request.url,
+ data=request.data,
+ headers=dict(headers),
+ method=request.method
+ )
+
+ opener = self._get_instance(
+ proxies=request.proxies or self.proxies,
+ cookiejar=request.extensions.get('cookiejar') or self.cookiejar
+ )
+ try:
+ res = opener.open(urllib_req, timeout=float(request.extensions.get('timeout') or self.timeout))
+ except urllib.error.HTTPError as e:
+ if isinstance(e.fp, (http.client.HTTPResponse, urllib.response.addinfourl)):
+ # Prevent file object from being closed when urllib.error.HTTPError is destroyed.
+ e._closer.file = None
+ raise HTTPError(UrllibResponseAdapter(e.fp), redirect_loop='redirect error' in str(e)) from e
+ raise # unexpected
+ except urllib.error.URLError as e:
+ cause = e.reason # NOTE: cause may be a string
+
+ # proxy errors
+ if 'tunnel connection failed' in str(cause).lower() or isinstance(cause, SocksProxyError):
+ raise ProxyError(cause=e) from e
+
+ handle_response_read_exceptions(cause)
+ raise TransportError(cause=e) from e
+ except (http.client.InvalidURL, ValueError) as e:
+ # Validation errors
+ # http.client.HTTPConnection raises ValueError in some validation cases
+ # such as if request method contains illegal control characters [1]
+ # 1. https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1256
+ raise RequestError(cause=e) from e
+ except Exception as e:
+ handle_response_read_exceptions(e)
+ raise # unexpected
+
+ return UrllibResponseAdapter(res)
diff --git a/hypervideo_dl/networking/common.py b/hypervideo_dl/networking/common.py
new file mode 100644
index 0000000..584c7bb
--- /dev/null
+++ b/hypervideo_dl/networking/common.py
@@ -0,0 +1,564 @@
+from __future__ import annotations
+
+import abc
+import copy
+import enum
+import functools
+import io
+import typing
+import urllib.parse
+import urllib.request
+import urllib.response
+from collections.abc import Iterable, Mapping
+from email.message import Message
+from http import HTTPStatus
+
+from ._helper import make_ssl_context, wrap_request_errors
+from .exceptions import (
+ NoSupportingHandlers,
+ RequestError,
+ TransportError,
+ UnsupportedRequest,
+)
+from ..compat.types import NoneType
+from ..cookies import YoutubeDLCookieJar
+from ..utils import (
+ bug_reports_message,
+ classproperty,
+ deprecation_warning,
+ error_to_str,
+ update_url_query,
+)
+from ..utils.networking import HTTPHeaderDict, normalize_url
+
+
+def register_preference(*handlers: type[RequestHandler]):
+ assert all(issubclass(handler, RequestHandler) for handler in handlers)
+
+ def outer(preference: Preference):
+ @functools.wraps(preference)
+ def inner(handler, *args, **kwargs):
+ if not handlers or isinstance(handler, handlers):
+ return preference(handler, *args, **kwargs)
+ return 0
+ _RH_PREFERENCES.add(inner)
+ return inner
+ return outer
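+
+# Usage sketch (hypothetical preference function): boost urllib for ftp URLs.
+# @register_preference(UrllibRH)
+# def my_preference(rh, request):
+#     return 50 if request.url.startswith('ftp:') else 0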
+
+
+class RequestDirector:
+ """RequestDirector class
+
+ Helper class that, when given a request, forwards it to a RequestHandler that supports it.
+
+ Preference functions in the form of func(handler, request) -> int
+ can be registered into the `preferences` set. These are used to sort handlers
+ in order of preference.
+
+ @param logger: Logger instance.
+ @param verbose: Print debug request information to stdout.
+ """
+
+ def __init__(self, logger, verbose=False):
+ self.handlers: dict[str, RequestHandler] = {}
+ self.preferences: set[Preference] = set()
+ self.logger = logger # TODO(Grub4k): default logger
+ self.verbose = verbose
+
+ def close(self):
+ for handler in self.handlers.values():
+ handler.close()
+
+ def add_handler(self, handler: RequestHandler):
+ """Add a handler. If a handler of the same RH_KEY exists, it will overwrite it"""
+ assert isinstance(handler, RequestHandler), 'handler must be a RequestHandler'
+ self.handlers[handler.RH_KEY] = handler
+
+ def _get_handlers(self, request: Request) -> list[RequestHandler]:
+ """Sorts handlers by preference, given a request"""
+ preferences = {
+ rh: sum(pref(rh, request) for pref in self.preferences)
+ for rh in self.handlers.values()
+ }
+ self._print_verbose('Handler preferences for this request: %s' % ', '.join(
+ f'{rh.RH_NAME}={pref}' for rh, pref in preferences.items()))
+ return sorted(self.handlers.values(), key=preferences.get, reverse=True)
+
+ def _print_verbose(self, msg):
+ if self.verbose:
+ self.logger.stdout(f'director: {msg}')
+
+ def send(self, request: Request) -> Response:
+ """
+ Passes a request onto a suitable RequestHandler
+ """
+ if not self.handlers:
+ raise RequestError('No request handlers configured')
+
+ assert isinstance(request, Request)
+
+ unexpected_errors = []
+ unsupported_errors = []
+ for handler in self._get_handlers(request):
+ self._print_verbose(f'Checking if "{handler.RH_NAME}" supports this request.')
+ try:
+ handler.validate(request)
+ except UnsupportedRequest as e:
+ self._print_verbose(
+ f'"{handler.RH_NAME}" cannot handle this request (reason: {error_to_str(e)})')
+ unsupported_errors.append(e)
+ continue
+
+ self._print_verbose(f'Sending request via "{handler.RH_NAME}"')
+ try:
+ response = handler.send(request)
+ except RequestError:
+ raise
+ except Exception as e:
+ self.logger.error(
+ f'[{handler.RH_NAME}] Unexpected error: {error_to_str(e)}{bug_reports_message()}',
+ is_error=False)
+ unexpected_errors.append(e)
+ continue
+
+ assert isinstance(response, Response)
+ return response
+
+ raise NoSupportingHandlers(unsupported_errors, unexpected_errors)
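+
+# Usage sketch (assumed logger object and Request(url) signature):
+# director = RequestDirector(logger=my_logger)
+# director.add_handler(UrllibRH(logger=my_logger))
+# response = director.send(Request('https://example.com'))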
+
+
+_REQUEST_HANDLERS = {}
+
+
+def register_rh(handler):
+ """Register a RequestHandler class"""
+ assert issubclass(handler, RequestHandler), f'{handler} must be a subclass of RequestHandler'
+ assert handler.RH_KEY not in _REQUEST_HANDLERS, f'RequestHandler {handler.RH_KEY} already registered'
+ _REQUEST_HANDLERS[handler.RH_KEY] = handler
+ return handler
+
+
+class Features(enum.Enum):
+ ALL_PROXY = enum.auto()
+ NO_PROXY = enum.auto()
+
+
+class RequestHandler(abc.ABC):
+
+ """Request Handler class
+
+ Request handlers are classes that, given a Request,
+ process the request from start to finish and return a Response.
+
+ Concrete subclasses need to redefine the _send(request) method,
+ which handles the underlying request logic and returns a Response.
+
+ RH_NAME class variable may contain a display name for the RequestHandler.
+ By default, this is generated from the class name.
+
+ The concrete request handler MUST have "RH" as the suffix in the class name.
+
+ All exceptions raised by a RequestHandler should be an instance of RequestError.
+ Any other exception raised will be treated as a handler issue.
+
+ If a Request is not supported by the handler, an UnsupportedRequest
+ should be raised with a reason.
+
+ By default, some checks are done on the request in _validate() based on the following class variables:
+ - `_SUPPORTED_URL_SCHEMES`: a tuple of supported URL schemes.
+ Any Request with a URL scheme not in this list will raise an UnsupportedRequest.
+
+ - `_SUPPORTED_PROXY_SCHEMES`: a tuple of supported proxy URL schemes. Any Request that contains
+ a proxy URL with a URL scheme not in this list will raise an UnsupportedRequest.
+
+ - `_SUPPORTED_FEATURES`: a tuple of supported features, as defined in Features enum.
+
+ The above may be set to None to disable the checks.
+
+ Parameters:
+ @param logger: logger instance
+ @param headers: HTTP Headers to include when sending requests.
+ @param cookiejar: Cookiejar to use for requests.
+ @param timeout: Socket timeout to use when sending requests.
+ @param proxies: Proxies to use for sending requests.
+ @param source_address: Client-side IP address to bind to for requests.
+ @param verbose: Print debug request and traffic information to stdout.
+ @param prefer_system_certs: Whether to prefer system certificates over other means (e.g. certifi).
+ @param client_cert: SSL client certificate configuration.
+ dict with {client_certificate, client_certificate_key, client_certificate_password}
+ @param verify: Verify SSL certificates
+ @param legacy_ssl_support: Enable legacy SSL options such as legacy server connect and older cipher support.
+
+ Some configuration options may be available for individual Requests too. In this case,
+ either the Request configuration option takes precedence or they are merged.
+
+ Requests may have additional optional parameters defined as extensions.
+ RequestHandler subclasses may choose to support custom extensions.
+
+ If an extension is supported, subclasses should extend _check_extensions(extensions)
+ to pop and validate the extension.
+ - Extensions left in `extensions` are treated as unsupported and UnsupportedRequest will be raised.
+
+ The following extensions are defined for RequestHandler:
+ - `cookiejar`: Cookiejar to use for this request.
+ - `timeout`: socket timeout to use for this request.
+ To enable these, add extensions.pop('<extension>', None) to _check_extensions
+
+ Apart from the url protocol, proxies dict may contain the following keys:
+ - `all`: proxy to use for all protocols. Used as a fallback if no proxy is set for a specific protocol.
+ - `no`: comma-separated list of hostnames (optionally with port) to not use a proxy for.
+ Note: a RequestHandler may not support these, as defined in `_SUPPORTED_FEATURES`.
+
+ """
+
+ _SUPPORTED_URL_SCHEMES = ()
+ _SUPPORTED_PROXY_SCHEMES = ()
+ _SUPPORTED_FEATURES = ()
+
+ def __init__(
+ self, *,
+ logger, # TODO(Grub4k): default logger
+ headers: HTTPHeaderDict = None,
+ cookiejar: YoutubeDLCookieJar = None,
+ timeout: float | int | None = None,
+ proxies: dict = None,
+ source_address: str = None,
+ verbose: bool = False,
+ prefer_system_certs: bool = False,
+ client_cert: dict[str, str | None] = None,
+ verify: bool = True,
+ legacy_ssl_support: bool = False,
+ **_,
+ ):
+
+ self._logger = logger
+ self.headers = headers or {}
+ self.cookiejar = cookiejar if cookiejar is not None else YoutubeDLCookieJar()
+ self.timeout = float(timeout or 20)
+ self.proxies = proxies or {}
+ self.source_address = source_address
+ self.verbose = verbose
+ self.prefer_system_certs = prefer_system_certs
+ self._client_cert = client_cert or {}
+ self.verify = verify
+ self.legacy_ssl_support = legacy_ssl_support
+ super().__init__()
+
+ def _make_sslcontext(self):
+ return make_ssl_context(
+ verify=self.verify,
+ legacy_support=self.legacy_ssl_support,
+ use_certifi=not self.prefer_system_certs,
+ **self._client_cert,
+ )
+
+ def _merge_headers(self, request_headers):
+ return HTTPHeaderDict(self.headers, request_headers)
+
+ def _check_url_scheme(self, request: Request):
+ scheme = urllib.parse.urlparse(request.url).scheme.lower()
+ if self._SUPPORTED_URL_SCHEMES is not None and scheme not in self._SUPPORTED_URL_SCHEMES:
+ raise UnsupportedRequest(f'Unsupported url scheme: "{scheme}"')
+ return scheme # for further processing
+
+ def _check_proxies(self, proxies):
+ for proxy_key, proxy_url in proxies.items():
+ if proxy_url is None:
+ continue
+ if proxy_key == 'no':
+ if self._SUPPORTED_FEATURES is not None and Features.NO_PROXY not in self._SUPPORTED_FEATURES:
+ raise UnsupportedRequest('"no" proxy is not supported')
+ continue
+ if (
+ proxy_key == 'all'
+ and self._SUPPORTED_FEATURES is not None
+ and Features.ALL_PROXY not in self._SUPPORTED_FEATURES
+ ):
+ raise UnsupportedRequest('"all" proxy is not supported')
+
+ # This handler is unlikely to use this proxy, so ignore it.
+ # This allows a proxy to be set for a protocol that one handler supports
+ # even when that protocol (and hence the proxy) is not supported by another handler.
+ if self._SUPPORTED_URL_SCHEMES is not None and proxy_key not in (*self._SUPPORTED_URL_SCHEMES, 'all'):
+ continue
+
+ if self._SUPPORTED_PROXY_SCHEMES is None:
+ # Skip proxy scheme checks
+ continue
+
+ try:
+ if urllib.request._parse_proxy(proxy_url)[0] is None:
+ # Scheme-less proxies are not supported
+ raise UnsupportedRequest(f'Proxy "{proxy_url}" missing scheme')
+ except ValueError as e:
+ # parse_proxy may raise on some invalid proxy urls such as "/a/b/c"
+ raise UnsupportedRequest(f'Invalid proxy url "{proxy_url}": {e}')
+
+ scheme = urllib.parse.urlparse(proxy_url).scheme.lower()
+ if scheme not in self._SUPPORTED_PROXY_SCHEMES:
+ raise UnsupportedRequest(f'Unsupported proxy type: "{scheme}"')
+
+ def _check_extensions(self, extensions):
+ """Check extensions for unsupported extensions. Subclasses should extend this."""
+ assert isinstance(extensions.get('cookiejar'), (YoutubeDLCookieJar, NoneType))
+ assert isinstance(extensions.get('timeout'), (float, int, NoneType))
+
+ def _validate(self, request):
+ self._check_url_scheme(request)
+ self._check_proxies(request.proxies or self.proxies)
+ extensions = request.extensions.copy()
+ self._check_extensions(extensions)
+ if extensions:
+ # TODO: add support for optional extensions
+ raise UnsupportedRequest(f'Unsupported extensions: {", ".join(extensions.keys())}')
+
+ @wrap_request_errors
+ def validate(self, request: Request):
+ if not isinstance(request, Request):
+ raise TypeError('Expected an instance of Request')
+ self._validate(request)
+
+ @wrap_request_errors
+ def send(self, request: Request) -> Response:
+ if not isinstance(request, Request):
+ raise TypeError('Expected an instance of Request')
+ return self._send(request)
+
+ @abc.abstractmethod
+ def _send(self, request: Request):
+ """Handle a request from start to finish. Redefine in subclasses."""
+ pass
+
+ def close(self):
+ pass
+
+ @classproperty
+ def RH_NAME(cls):
+ return cls.__name__[:-2]
+
+ @classproperty
+ def RH_KEY(cls):
+ assert cls.__name__.endswith('RH'), 'RequestHandler class names must end with "RH"'
+ return cls.__name__[:-2]
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, *args):
+ self.close()
+
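
A minimal sketch of a concrete handler built on this interface (illustrative only, not part of the patch; it assumes register, Response and the networking exception classes are importable alongside RequestHandler):

    import urllib.request

    @register
    class FileRH(RequestHandler):
        """Toy handler for file:// URLs"""
        _SUPPORTED_URL_SCHEMES = ('file',)
        _SUPPORTED_PROXY_SCHEMES = None  # disable the proxy scheme checks
        _SUPPORTED_FEATURES = None       # disable the Features checks

        def _send(self, request):
            try:
                fp = urllib.request.urlopen(request.url, timeout=self.timeout)
            except OSError as e:
                # every error leaving a handler must be a RequestError instance
                raise TransportError(cause=e) from e
            return Response(fp=fp, url=request.url, headers={}, status=200)

Because the class name ends in "RH", RH_KEY and RH_NAME are derived automatically, and validate()/send() wrap the scheme, proxy and extension checks around _send(). A handler that does support proxies would instead declare, for example, _SUPPORTED_PROXY_SCHEMES = ('http', 'socks5') and accept a proxies dict such as {'all': 'socks5://127.0.0.1:1080', 'no': 'localhost'}.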
+
+class Request:
+ """
+ Represents a request to be made.
+ Partially backwards-compatible with urllib.request.Request.
+
+ @param url: url to send. Will be sanitized.
+ @param data: payload data to send. Must be bytes, iterable of bytes, a file-like object or None
+ @param headers: headers to send.
+ @param proxies: proxy dict mapping of proto:proxy to use for the request and any redirects.
+ @param query: URL query parameters to update the url with.
+ @param method: HTTP method to use. If no method is specified, POST is used if payload data is present, else GET
+ @param extensions: Dictionary of Request extensions to add, as supported by handlers.
+ """
+
+ def __init__(
+ self,
+ url: str,
+ data: RequestData = None,
+ headers: typing.Mapping = None,
+ proxies: dict = None,
+ query: dict = None,
+ method: str = None,
+ extensions: dict = None
+ ):
+
+ self._headers = HTTPHeaderDict()
+ self._data = None
+
+ if query:
+ url = update_url_query(url, query)
+
+ self.url = url
+ self.method = method
+ if headers:
+ self.headers = headers
+ self.data = data # note: must be done after setting headers
+ self.proxies = proxies or {}
+ self.extensions = extensions or {}
+
+ @property
+ def url(self):
+ return self._url
+
+ @url.setter
+ def url(self, url):
+ if not isinstance(url, str):
+ raise TypeError('url must be a string')
+ elif url.startswith('//'):
+ url = 'http:' + url
+ self._url = normalize_url(url)
+
+ @property
+ def method(self):
+ return self._method or ('POST' if self.data is not None else 'GET')
+
+ @method.setter
+ def method(self, method):
+ if method is None:
+ self._method = None
+ elif isinstance(method, str):
+ self._method = method.upper()
+ else:
+ raise TypeError('method must be a string')
+
+ @property
+ def data(self):
+ return self._data
+
+ @data.setter
+ def data(self, data: RequestData):
+ # Try to catch some common mistakes
+ if data is not None and (
+ not isinstance(data, (bytes, io.IOBase, Iterable)) or isinstance(data, (str, Mapping))
+ ):
+ raise TypeError('data must be bytes, iterable of bytes, or a file-like object')
+
+ if data == self._data and self._data is None:
+ self.headers.pop('Content-Length', None)
+
+ # https://docs.python.org/3/library/urllib.request.html#urllib.request.Request.data
+ if data != self._data:
+ if self._data is not None:
+ self.headers.pop('Content-Length', None)
+ self._data = data
+
+ if self._data is None:
+ self.headers.pop('Content-Type', None)
+
+ if 'Content-Type' not in self.headers and self._data is not None:
+ self.headers['Content-Type'] = 'application/x-www-form-urlencoded'
+
+ @property
+ def headers(self) -> HTTPHeaderDict:
+ return self._headers
+
+ @headers.setter
+ def headers(self, new_headers: Mapping):
+ """Replaces headers of the request. If not a CaseInsensitiveDict, it will be converted to one."""
+ if isinstance(new_headers, HTTPHeaderDict):
+ self._headers = new_headers
+ elif isinstance(new_headers, Mapping):
+ self._headers = HTTPHeaderDict(new_headers)
+ else:
+ raise TypeError('headers must be a mapping')
+
+ def update(self, url=None, data=None, headers=None, query=None):
+ self.data = data if data is not None else self.data
+ self.headers.update(headers or {})
+ self.url = update_url_query(url or self.url, query or {})
+
+ def copy(self):
+ return self.__class__(
+ url=self.url,
+ headers=copy.deepcopy(self.headers),
+ proxies=copy.deepcopy(self.proxies),
+ data=self._data,
+ extensions=copy.copy(self.extensions),
+ method=self._method,
+ )
+
+
+HEADRequest = functools.partial(Request, method='HEAD')
+PUTRequest = functools.partial(Request, method='PUT')
+
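
An illustrative usage sketch of the Request surface defined above (not part of the patch):

    req = Request(
        'https://example.com/api',
        data=b'key=value',           # implies POST and a default Content-Type
        query={'page': '2'},         # merged into the url via update_url_query
        extensions={'timeout': 5},   # validated later by RequestHandler._check_extensions
    )
    assert req.method == 'POST'

    head = HEADRequest('https://example.com')  # partial application of method='HEAD'
    assert head.method == 'HEAD'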
+
+class Response(io.IOBase):
+ """
+ Base class for HTTP response adapters.
+
+ By default, it provides a basic wrapper for a file-like response object.
+
+ Interface partially backwards-compatible with addinfourl and http.client.HTTPResponse.
+
+ @param fp: Original, file-like response.
+ @param url: URL that this is a response of.
+ @param headers: response headers.
+ @param status: Response HTTP status code. Default is 200 OK.
+ @param reason: HTTP status reason. Will use built-in reasons based on status code if not provided.
+ """
+
+ def __init__(
+ self,
+ fp: typing.IO,
+ url: str,
+ headers: Mapping[str, str],
+ status: int = 200,
+ reason: str = None):
+
+ self.fp = fp
+ self.headers = Message()
+ for name, value in headers.items():
+ self.headers.add_header(name, value)
+ self.status = status
+ self.url = url
+ try:
+ self.reason = reason or HTTPStatus(status).phrase
+ except ValueError:
+ self.reason = None
+
+ def readable(self):
+ return self.fp.readable()
+
+ def read(self, amt: int = None) -> bytes:
+ # Expected errors raised here should be of type RequestError or subclasses.
+ # Subclasses should redefine this method with more precise error handling.
+ try:
+ return self.fp.read(amt)
+ except Exception as e:
+ raise TransportError(cause=e) from e
+
+ def close(self):
+ self.fp.close()
+ return super().close()
+
+ def get_header(self, name, default=None):
+ """Get header for name.
+ If there are multiple matching headers, return them all separated by commas."""
+ headers = self.headers.get_all(name)
+ if not headers:
+ return default
+ if name.title() == 'Set-Cookie':
+ # Special case, only get the first one
+ # https://www.rfc-editor.org/rfc/rfc9110.html#section-5.3-4.1
+ return headers[0]
+ return ', '.join(headers)
+
+ # The following methods are for compatibility reasons and are deprecated
+ @property
+ def code(self):
+ deprecation_warning('Response.code is deprecated, use Response.status', stacklevel=2)
+ return self.status
+
+ def getcode(self):
+ deprecation_warning('Response.getcode() is deprecated, use Response.status', stacklevel=2)
+ return self.status
+
+ def geturl(self):
+ deprecation_warning('Response.geturl() is deprecated, use Response.url', stacklevel=2)
+ return self.url
+
+ def info(self):
+ deprecation_warning('Response.info() is deprecated, use Response.headers', stacklevel=2)
+ return self.headers
+
+ def getheader(self, name, default=None):
+ deprecation_warning('Response.getheader() is deprecated, use Response.get_header', stacklevel=2)
+ return self.get_header(name, default)
+
+
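
An illustrative sketch (not part of the patch) wrapping an in-memory payload the way a handler's _send() would wrap its underlying response:

    import io

    res = Response(
        fp=io.BytesIO(b'{"ok": true}'),
        url='https://example.com/api',
        headers={'Content-Type': 'application/json'},
        status=200,
    )
    assert res.reason == 'OK'               # derived from HTTPStatus(200).phrase
    body = res.read()                       # delegates to fp; failures become TransportError
    ctype = res.get_header('Content-Type')  # 'application/json'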
+if typing.TYPE_CHECKING:
+ RequestData = bytes | Iterable[bytes] | typing.IO | None
+ Preference = typing.Callable[[RequestHandler, Request], int]
+
+_RH_PREFERENCES: set[Preference] = set()
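
A Preference is a callable that scores how suitable a handler is for a given request. A hypothetical example follows; how these scores are registered and weighted is not shown in this hunk, so the direct .add() call and the scale are assumptions:

    def _prefer_for_https(rh: RequestHandler, request: Request) -> int:
        # positive values favour the handler; the weighting scale is an assumption
        return 1 if request.url.startswith('https:') else 0

    _RH_PREFERENCES.add(_prefer_for_https)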
diff --git a/hypervideo_dl/networking/exceptions.py b/hypervideo_dl/networking/exceptions.py
new file mode 100644
index 0000000..10afc9c
--- /dev/null
+++ b/hypervideo_dl/networking/exceptions.py
@@ -0,0 +1,217 @@
+from __future__ import annotations
+
+import typing
+import urllib.error
+
+from ..utils import YoutubeDLError, deprecation_warning
+
+if typing.TYPE_CHECKING:
+ from .common import RequestHandler, Response
+
+
+class RequestError(YoutubeDLError):
+ def __init__(
+ self,
+ msg: str | None = None,
+ cause: Exception | str | None = None,
+ handler: RequestHandler = None
+ ):
+ self.handler = handler
+ self.cause = cause
+ if not msg and cause:
+ msg = str(cause)
+ super().__init__(msg)
+
+
+class UnsupportedRequest(RequestError):
+ """raised when a handler cannot handle a request"""
+ pass
+
+
+class NoSupportingHandlers(RequestError):
+ """raised when no handlers can support a request for various reasons"""
+
+ def __init__(self, unsupported_errors: list[UnsupportedRequest], unexpected_errors: list[Exception]):
+ self.unsupported_errors = unsupported_errors or []
+ self.unexpected_errors = unexpected_errors or []
+
+ # Print a quick summary of the errors
+ err_handler_map = {}
+ for err in unsupported_errors:
+ err_handler_map.setdefault(err.msg, []).append(err.handler.RH_NAME)
+
+ reason_str = ', '.join([f'{msg} ({", ".join(handlers)})' for msg, handlers in err_handler_map.items()])
+ if unexpected_errors:
+ reason_str = ' + '.join(filter(None, [reason_str, f'{len(unexpected_errors)} unexpected error(s)']))
+
+ err_str = 'Unable to handle request'
+ if reason_str:
+ err_str += f': {reason_str}'
+
+ super().__init__(msg=err_str)
+
+
+class TransportError(RequestError):
+ """Network related errors"""
+
+
+class HTTPError(RequestError):
+ def __init__(self, response: Response, redirect_loop=False):
+ self.response = response
+ self.status = response.status
+ self.reason = response.reason
+ self.redirect_loop = redirect_loop
+ msg = f'HTTP Error {response.status}: {response.reason}'
+ if redirect_loop:
+ msg += ' (redirect loop detected)'
+
+ super().__init__(msg=msg)
+
+ def close(self):
+ self.response.close()
+
+ def __repr__(self):
+ return f'<HTTPError {self.status}: {self.reason}>'
+
+
+class IncompleteRead(TransportError):
+ def __init__(self, partial, expected=None, **kwargs):
+ self.partial = partial
+ self.expected = expected
+ msg = f'{len(partial)} bytes read'
+ if expected is not None:
+ msg += f', {expected} more expected'
+
+ super().__init__(msg=msg, **kwargs)
+
+ def __repr__(self):
+ return f'<IncompleteRead: {self.msg}>'
+
+
+class SSLError(TransportError):
+ pass
+
+
+class CertificateVerifyError(SSLError):
+ """Raised when certificate validated has failed"""
+ pass
+
+
+class ProxyError(TransportError):
+ pass
+
+
+class _CompatHTTPError(urllib.error.HTTPError, HTTPError):
+ """
+ Provides backwards compatibility with urllib.error.HTTPError.
+ Do not use this class directly, use HTTPError instead.
+ """
+
+ def __init__(self, http_error: HTTPError):
+ super().__init__(
+ url=http_error.response.url,
+ code=http_error.status,
+ msg=http_error.msg,
+ hdrs=http_error.response.headers,
+ fp=http_error.response
+ )
+ self._closer.file = None # Disable auto close
+ self._http_error = http_error
+ HTTPError.__init__(self, http_error.response, redirect_loop=http_error.redirect_loop)
+
+ @property
+ def status(self):
+ return self._http_error.status
+
+ @status.setter
+ def status(self, value):
+ return
+
+ @property
+ def reason(self):
+ return self._http_error.reason
+
+ @reason.setter
+ def reason(self, value):
+ return
+
+ @property
+ def headers(self):
+ deprecation_warning('HTTPError.headers is deprecated, use HTTPError.response.headers instead')
+ return self._http_error.response.headers
+
+ @headers.setter
+ def headers(self, value):
+ return
+
+ def info(self):
+ deprecation_warning('HTTPError.info() is deprecated, use HTTPError.response.headers instead')
+ return self.response.headers
+
+ def getcode(self):
+ deprecation_warning('HTTPError.getcode is deprecated, use HTTPError.status instead')
+ return self.status
+
+ def geturl(self):
+ deprecation_warning('HTTPError.geturl is deprecated, use HTTPError.response.url instead')
+ return self.response.url
+
+ @property
+ def code(self):
+ deprecation_warning('HTTPError.code is deprecated, use HTTPError.status instead')
+ return self.status
+
+ @code.setter
+ def code(self, value):
+ return
+
+ @property
+ def url(self):
+ deprecation_warning('HTTPError.url is deprecated, use HTTPError.response.url instead')
+ return self.response.url
+
+ @url.setter
+ def url(self, value):
+ return
+
+ @property
+ def hdrs(self):
+ deprecation_warning('HTTPError.hdrs is deprecated, use HTTPError.response.headers instead')
+ return self.response.headers
+
+ @hdrs.setter
+ def hdrs(self, value):
+ return
+
+ @property
+ def filename(self):
+ deprecation_warning('HTTPError.filename is deprecated, use HTTPError.response.url instead')
+ return self.response.url
+
+ @filename.setter
+ def filename(self, value):
+ return
+
+ def __getattr__(self, name):
+ # File operations are passed through the response.
+ # Warn for some commonly used ones
+ passthrough_warnings = {
+ 'read': 'response.read()',
+ # technically possible via passthrough, but we should discourage this
+ 'get_header': 'response.get_header()',
+ 'readable': 'response.readable()',
+ 'closed': 'response.closed',
+ 'tell': 'response.tell()',
+ }
+ if name in passthrough_warnings:
+ deprecation_warning(f'HTTPError.{name} is deprecated, use HTTPError.{passthrough_warnings[name]} instead')
+ return super().__getattr__(name)
+
+ def __str__(self):
+ return str(self._http_error)
+
+ def __repr__(self):
+ return repr(self._http_error)
+
+
+network_exceptions = (HTTPError, TransportError)
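
An illustrative consumption pattern for the new hierarchy (not part of the patch; handler and request stand in for any RequestHandler and Request). Since HTTPError and TransportError both subclass RequestError, the most specific clauses come first:

    try:
        response = handler.send(request)
    except HTTPError as e:
        print(f'server replied {e.status}: {e.reason}')
        e.close()  # releases the underlying e.response
    except TransportError as e:
        print(f'network-level failure: {e}')
    except RequestError as e:
        print(f'other request error: {e}')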
diff --git a/hypervideo_dl/options.py b/hypervideo_dl/options.py
index bf8684c..10489ad 100644
--- a/hypervideo_dl/options.py
+++ b/hypervideo_dl/options.py
@@ -28,80 +28,67 @@ from .utils import (
expand_path,
format_field,
get_executable_path,
+ get_system_config_dirs,
+ get_user_config_dirs,
join_nonempty,
orderedSet_from_options,
remove_end,
+ variadic,
write_string,
)
-from .version import __version__
+from .version import CHANNEL, __version__
def parseOpts(overrideArguments=None, ignore_config_files='if_override'):
+ PACKAGE_NAME = 'hypervideo'
+
root = Config(create_parser())
if ignore_config_files == 'if_override':
ignore_config_files = overrideArguments is not None
- def _readUserConf(package_name, default=[]):
- # .config
- xdg_config_home = os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config')
- userConfFile = os.path.join(xdg_config_home, package_name, 'config')
- if not os.path.isfile(userConfFile):
- userConfFile = os.path.join(xdg_config_home, '%s.conf' % package_name)
- userConf = Config.read_file(userConfFile, default=None)
- if userConf is not None:
- return userConf, userConfFile
-
- # appdata
- appdata_dir = os.getenv('appdata')
- if appdata_dir:
- userConfFile = os.path.join(appdata_dir, package_name, 'config')
- userConf = Config.read_file(userConfFile, default=None)
- if userConf is None:
- userConfFile += '.txt'
- userConf = Config.read_file(userConfFile, default=None)
- if userConf is not None:
- return userConf, userConfFile
+ def read_config(*paths):
+ path = os.path.join(*paths)
+ conf = Config.read_file(path, default=None)
+ if conf is not None:
+ return conf, path
- # home
- userConfFile = os.path.join(compat_expanduser('~'), '%s.conf' % package_name)
- userConf = Config.read_file(userConfFile, default=None)
- if userConf is None:
- userConfFile += '.txt'
- userConf = Config.read_file(userConfFile, default=None)
- if userConf is not None:
- return userConf, userConfFile
+ def _load_from_config_dirs(config_dirs):
+ for config_dir in config_dirs:
+ head, tail = os.path.split(config_dir)
+ assert tail == PACKAGE_NAME or config_dir == os.path.join(compat_expanduser('~'), f'.{PACKAGE_NAME}')
- return default, None
+ yield read_config(head, f'{PACKAGE_NAME}.conf')
+ if tail.startswith('.'): # ~/.PACKAGE_NAME
+ yield read_config(head, f'{PACKAGE_NAME}.conf.txt')
+ yield read_config(config_dir, 'config')
+ yield read_config(config_dir, 'config.txt')
- def add_config(label, path, user=False):
+ def add_config(label, path=None, func=None):
""" Adds config and returns whether to continue """
if root.parse_known_args()[0].ignoreconfig:
return False
- # Multiple package names can be given here
- # E.g. ('hypervideo', 'youtube-dlc', 'youtube-dl') will look for
- # the configuration file of any of these three packages
- for package in ('hypervideo',):
- if user:
- args, current_path = _readUserConf(package, default=None)
- else:
- current_path = os.path.join(path, '%s.conf' % package)
- args = Config.read_file(current_path, default=None)
- if args is not None:
- root.append_config(args, current_path, label=label)
- return True
+ elif func:
+ assert path is None
+ args, current_path = next(
+ filter(None, _load_from_config_dirs(func(PACKAGE_NAME))), (None, None))
+ else:
+ current_path = os.path.join(path, 'hypervideo.conf')
+ args = Config.read_file(current_path, default=None)
+ if args is not None:
+ root.append_config(args, current_path, label=label)
return True
def load_configs():
yield not ignore_config_files
yield add_config('Portable', get_executable_path())
yield add_config('Home', expand_path(root.parse_known_args()[0].paths.get('home', '')).strip())
- yield add_config('User', None, user=True)
- yield add_config('System', '/etc')
+ yield add_config('User', func=get_user_config_dirs)
+ yield add_config('System', func=get_system_config_dirs)
opts = optparse.Values({'verbose': True, 'print_help': False})
try:
try:
- if overrideArguments:
+ if overrideArguments is not None:
root.append_config(overrideArguments, label='Override')
else:
root.append_config(sys.argv[1:], label='Command-line')
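
For illustration (not part of the patch): with PACKAGE_NAME 'hypervideo' and a user config dir of ~/.config/hypervideo, _load_from_config_dirs tries, in order, ~/.config/hypervideo.conf, then ~/.config/hypervideo/config, then ~/.config/hypervideo/config.txt; the hypervideo.conf.txt variant is only tried for the legacy dot-directory ~/.hypervideo. add_config() keeps the first file that exists.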
@@ -256,14 +243,14 @@ def create_parser():
if multiple_keys:
allowed_keys = fr'({allowed_keys})(,({allowed_keys}))*'
mobj = re.match(
- fr'(?i)(?P<keys>{allowed_keys}){delimiter}(?P<val>.*)$',
+ fr'(?is)(?P<keys>{allowed_keys}){delimiter}(?P<val>.*)$',
value[0] if multiple_args else value)
if mobj is not None:
keys, val = mobj.group('keys').split(','), mobj.group('val')
if multiple_args:
val = [val, *value[1:]]
elif default_key is not None:
- keys, val = [default_key], value
+ keys, val = variadic(default_key), value
else:
raise optparse.OptionValueError(
f'wrong {opt_str} formatting; it should be {option.metavar}, not "{value}"')
@@ -276,6 +263,20 @@ def create_parser():
out_dict[key] = out_dict.get(key, []) + [val] if append else val
setattr(parser.values, option.dest, out_dict)
+ def when_prefix(default):
+ return {
+ 'default': {},
+ 'type': 'str',
+ 'action': 'callback',
+ 'callback': _dict_from_options_callback,
+ 'callback_kwargs': {
+ 'allowed_keys': '|'.join(map(re.escape, POSTPROCESS_WHEN)),
+ 'default_key': default,
+ 'multiple_keys': False,
+ 'append': True,
+ },
+ }
+
parser = _YoutubeDLOptionParser()
alias_group = optparse.OptionGroup(parser, 'Aliases')
Formatter = string.Formatter()
@@ -393,7 +394,7 @@ def create_parser():
general.add_option(
'--no-flat-playlist',
action='store_false', dest='extract_flat',
- help='Extract the videos of a playlist')
+ help='Fully extract the videos of a playlist (default)')
general.add_option(
'--live-from-start',
action='store_true', dest='live_from_start',
@@ -422,8 +423,25 @@ def create_parser():
help='Do not mark videos watched (default)')
general.add_option(
'--no-colors', '--no-colours',
- action='store_true', dest='no_color', default=False,
- help='Do not emit color codes in output (Alias: --no-colours)')
+ action='store_const', dest='color', const={
+ 'stdout': 'no_color',
+ 'stderr': 'no_color',
+ },
+ help=optparse.SUPPRESS_HELP)
+ general.add_option(
+ '--color',
+ dest='color', metavar='[STREAM:]POLICY', default={}, type='str',
+ action='callback', callback=_dict_from_options_callback,
+ callback_kwargs={
+ 'allowed_keys': 'stdout|stderr',
+ 'default_key': ['stdout', 'stderr'],
+ 'process': str.strip,
+ }, help=(
+ 'Whether to emit color codes in output, optionally prefixed by '
+ 'the STREAM (stdout or stderr) to apply the setting to. '
+ 'Can be one of "always", "auto" (default), "never", or '
+ '"no_color" (use non color terminal sequences). '
+ 'Can be used multiple times'))
general.add_option(
'--compat-options',
metavar='OPTS', dest='compat_opts', default=set(), type='str',
@@ -431,13 +449,15 @@ def create_parser():
callback_kwargs={
'allowed_values': {
'filename', 'filename-sanitization', 'format-sort', 'abort-on-error', 'format-spec', 'no-playlist-metafiles',
- 'multistreams', 'no-live-chat', 'playlist-index', 'list-formats', 'no-direct-merge',
- 'no-attach-info-json', 'embed-metadata', 'embed-thumbnail-atomicparsley',
- 'seperate-video-versions', 'no-clean-infojson', 'no-keep-subs', 'no-certifi',
+ 'multistreams', 'no-live-chat', 'playlist-index', 'list-formats', 'no-direct-merge', 'playlist-match-filter',
+ 'no-attach-info-json', 'embed-thumbnail-atomicparsley', 'no-external-downloader-progress',
+ 'embed-metadata', 'seperate-video-versions', 'no-clean-infojson', 'no-keep-subs', 'no-certifi',
'no-youtube-channel-redirect', 'no-youtube-unavailable-videos', 'no-youtube-prefer-utc-upload-date',
}, 'aliases': {
- 'youtube-dl': ['all', '-multistreams'],
- 'youtube-dlc': ['all', '-no-youtube-channel-redirect', '-no-live-chat'],
+ 'youtube-dl': ['all', '-multistreams', '-playlist-match-filter'],
+ 'youtube-dlc': ['all', '-no-youtube-channel-redirect', '-no-live-chat', '-playlist-match-filter'],
+ '2021': ['2022', 'no-certifi', 'filename-sanitization', 'no-youtube-prefer-utc-upload-date'],
+ '2022': ['no-external-downloader-progress', 'playlist-match-filter'],
}
}, help=(
'Options that can help keep compatibility with youtube-dl or youtube-dlc '
@@ -482,6 +502,11 @@ def create_parser():
action='store_const', const='::', dest='source_address',
help='Make all connections via IPv6',
)
+ network.add_option(
+ '--enable-file-urls', action='store_true',
+ dest='enable_file_urls', default=False,
+ help='Enable file:// URLs. This is disabled by default for security reasons.'
+ )
geo = optparse.OptionGroup(parser, 'Geo-restriction')
geo.add_option(
@@ -495,21 +520,26 @@ def create_parser():
dest='cn_verification_proxy', default=None, metavar='URL',
help=optparse.SUPPRESS_HELP)
geo.add_option(
+ '--xff', metavar='VALUE',
+ dest='geo_bypass', default='default',
+ help=(
+ 'How to fake X-Forwarded-For HTTP header to try bypassing geographic restriction. '
+ 'One of "default" (only when known to be useful), "never", '
+ 'an IP block in CIDR notation, or a two-letter ISO 3166-2 country code'))
+ geo.add_option(
'--geo-bypass',
- action='store_true', dest='geo_bypass', default=True,
- help='Bypass geographic restriction via faking X-Forwarded-For HTTP header (default)')
+ action='store_const', dest='geo_bypass', const='default',
+ help=optparse.SUPPRESS_HELP)
geo.add_option(
'--no-geo-bypass',
- action='store_false', dest='geo_bypass',
- help='Do not bypass geographic restriction via faking X-Forwarded-For HTTP header')
+ action='store_const', dest='geo_bypass', const='never',
+ help=optparse.SUPPRESS_HELP)
geo.add_option(
- '--geo-bypass-country', metavar='CODE',
- dest='geo_bypass_country', default=None,
- help='Force bypass geographic restriction with explicitly provided two-letter ISO 3166-2 country code')
+ '--geo-bypass-country', metavar='CODE', dest='geo_bypass',
+ help=optparse.SUPPRESS_HELP)
geo.add_option(
- '--geo-bypass-ip-block', metavar='IP_BLOCK',
- dest='geo_bypass_ip_block', default=None,
- help='Force bypass geographic restriction with explicitly provided IP block in CIDR notation')
+ '--geo-bypass-ip-block', metavar='IP_BLOCK', dest='geo_bypass',
+ help=optparse.SUPPRESS_HELP)
selection = optparse.OptionGroup(parser, 'Video Selection')
selection.add_option(
@@ -524,10 +554,10 @@ def create_parser():
'-I', '--playlist-items',
dest='playlist_items', metavar='ITEM_SPEC', default=None,
help=(
- 'Comma separated playlist_index of the videos to download. '
+ 'Comma separated playlist_index of the items to download. '
'You can specify a range using "[START]:[STOP][:STEP]". For backward compatibility, START-STOP is also supported. '
'Use negative indices to count from the right and negative STEP to download in reverse order. '
- 'E.g. "-I 1:3,7,-5::2" used on a playlist of size 15 will download the videos at index 1,2,3,7,11,13,15'))
+ 'E.g. "-I 1:3,7,-5::2" used on a playlist of size 15 will download the items at index 1,2,3,7,11,13,15'))
selection.add_option(
'--match-title',
dest='matchtitle', metavar='REGEX',
@@ -543,13 +573,14 @@ def create_parser():
selection.add_option(
'--max-filesize',
metavar='SIZE', dest='max_filesize', default=None,
- help='Abort download if filesize if larger than SIZE, e.g. 50k or 44.6M')
+ help='Abort download if filesize is larger than SIZE, e.g. 50k or 44.6M')
selection.add_option(
'--date',
metavar='DATE', dest='date', default=None,
help=(
- 'Download only videos uploaded on this date. The date can be "YYYYMMDD" or in the format '
- '[now|today|yesterday][-N[day|week|month|year]]. E.g. --date today-2weeks'))
+ 'Download only videos uploaded on this date. '
+ 'The date can be "YYYYMMDD" or in the format [now|today|yesterday][-N[day|week|month|year]]. '
+ 'E.g. "--date today-2weeks" downloads only videos uploaded on the same day two weeks ago'))
selection.add_option(
'--datebefore',
metavar='DATE', dest='datebefore', default=None,
@@ -586,9 +617,17 @@ def create_parser():
'that contains the phrase "cats & dogs" (caseless). '
'Use "--match-filter -" to interactively ask whether to download each video'))
selection.add_option(
- '--no-match-filter',
- metavar='FILTER', dest='match_filter', action='store_const', const=None,
- help='Do not use generic video filter (default)')
+ '--no-match-filters',
+ dest='match_filter', action='store_const', const=None,
+ help='Do not use any --match-filter (default)')
+ selection.add_option(
+ '--break-match-filters',
+ metavar='FILTER', dest='breaking_match_filter', action='append',
+ help='Same as "--match-filters" but stops the download process when a video is rejected')
+ selection.add_option(
+ '--no-break-match-filters',
+ dest='breaking_match_filter', action='store_const', const=None,
+ help='Do not use any --break-match-filters (default)')
selection.add_option(
'--no-playlist',
action='store_true', dest='noplaylist', default=False,
@@ -620,11 +659,11 @@ def create_parser():
selection.add_option(
'--break-on-reject',
action='store_true', dest='break_on_reject', default=False,
- help='Stop the download process when encountering a file that has been filtered out')
+ help=optparse.SUPPRESS_HELP)
selection.add_option(
'--break-per-input',
action='store_true', dest='break_per_url', default=False,
- help='--break-on-existing, --break-on-reject, --max-downloads, and autonumber resets per input URL')
+ help='Alters --max-downloads, --break-on-existing, --break-match-filters, and autonumber to reset per input URL')
selection.add_option(
'--no-break-per-input',
action='store_false', dest='break_per_url',
@@ -664,6 +703,10 @@ def create_parser():
dest='netrc_location', metavar='PATH',
help='Location of .netrc authentication data; either the path or its containing directory. Defaults to ~/.netrc')
authentication.add_option(
+ '--netrc-cmd',
+ dest='netrc_cmd', metavar='NETRC_CMD',
+ help='Command to execute to get the credentials for an extractor.')
+ authentication.add_option(
'--video-password',
dest='videopassword', metavar='PASSWORD',
help='Video password (vimeo, youku)')
@@ -864,11 +907,11 @@ def create_parser():
'This option can be used multiple times to set the sleep for the different retry types, '
'e.g. --retry-sleep linear=1::2 --retry-sleep fragment:exp=1:20'))
downloader.add_option(
- '--skip-unavailable-fragments', '--no-abort-on-unavailable-fragment',
+ '--skip-unavailable-fragments', '--no-abort-on-unavailable-fragments',
action='store_true', dest='skip_unavailable_fragments', default=True,
- help='Skip unavailable fragments for DASH, hlsnative and ISM downloads (default) (Alias: --no-abort-on-unavailable-fragment)')
+ help='Skip unavailable fragments for DASH, hlsnative and ISM downloads (default) (Alias: --no-abort-on-unavailable-fragments)')
downloader.add_option(
- '--abort-on-unavailable-fragment', '--no-skip-unavailable-fragments',
+ '--abort-on-unavailable-fragments', '--no-skip-unavailable-fragments',
action='store_false', dest='skip_unavailable_fragments',
help='Abort download if a fragment is unavailable (Alias: --no-skip-unavailable-fragments)')
downloader.add_option(
@@ -951,8 +994,9 @@ def create_parser():
'--download-sections',
metavar='REGEX', dest='download_ranges', action='append',
help=(
- 'Download only chapters whose title matches the given regular expression. '
- 'Time ranges prefixed by a "*" can also be used in place of chapters to download the specified range. '
+ 'Download only chapters that match the regular expression. '
+ 'A "*" prefix denotes time-range instead of chapter. Negative timestamps are calculated from the end. '
+ '"*from-url" can be used to download between the "start_time" and "end_time" extracted from the URL. '
'Needs ffmpeg. This option can be used multiple times to download multiple sections, '
'e.g. --download-sections "*10:15-inf" --download-sections "intro"'))
downloader.add_option(
@@ -1012,7 +1056,7 @@ def create_parser():
metavar='URL', dest='referer', default=None,
help=optparse.SUPPRESS_HELP)
workarounds.add_option(
- '--add-header',
+ '--add-headers',
metavar='FIELD:VALUE', dest='headers', default={}, type='str',
action='callback', callback=_dict_from_options_callback,
callback_kwargs={'multiple_keys': False},
@@ -1045,9 +1089,13 @@ def create_parser():
verbosity = optparse.OptionGroup(parser, 'Verbosity and Simulation Options')
verbosity.add_option(
'-q', '--quiet',
- action='store_true', dest='quiet', default=False,
+ action='store_true', dest='quiet', default=None,
help='Activate quiet mode. If used with --verbose, print the log to stderr')
verbosity.add_option(
+ '--no-quiet',
+ action='store_false', dest='quiet',
+ help='Deactivate quiet mode. (Default)')
+ verbosity.add_option(
'--no-warnings',
dest='no_warnings', action='store_true', default=False,
help='Ignore warnings')
@@ -1075,28 +1123,16 @@ def create_parser():
help='Do not download the video but write all related files (Alias: --no-download)')
verbosity.add_option(
'-O', '--print',
- metavar='[WHEN:]TEMPLATE', dest='forceprint', default={}, type='str',
- action='callback', callback=_dict_from_options_callback,
- callback_kwargs={
- 'allowed_keys': 'video|' + '|'.join(map(re.escape, POSTPROCESS_WHEN)),
- 'default_key': 'video',
- 'multiple_keys': False,
- 'append': True,
- }, help=(
+ metavar='[WHEN:]TEMPLATE', dest='forceprint', **when_prefix('video'),
+ help=(
'Field name or output template to print to screen, optionally prefixed with when to print it, separated by a ":". '
- 'Supported values of "WHEN" are the same as that of --use-postprocessor, and "video" (default). '
+ 'Supported values of "WHEN" are the same as that of --use-postprocessor (default: video). '
'Implies --quiet. Implies --simulate unless --no-simulate or later stages of WHEN are used. '
'This option can be used multiple times'))
verbosity.add_option(
'--print-to-file',
- metavar='[WHEN:]TEMPLATE FILE', dest='print_to_file', default={}, type='str', nargs=2,
- action='callback', callback=_dict_from_options_callback,
- callback_kwargs={
- 'allowed_keys': 'video|' + '|'.join(map(re.escape, POSTPROCESS_WHEN)),
- 'default_key': 'video',
- 'multiple_keys': False,
- 'append': True,
- }, help=(
+ metavar='[WHEN:]TEMPLATE FILE', dest='print_to_file', nargs=2, **when_prefix('video'),
+ help=(
'Append given template to the file. The values of WHEN and TEMPLATE are same as that of --print. '
'FILE uses the same syntax as the output template. This option can be used multiple times'))
verbosity.add_option(
@@ -1361,8 +1397,7 @@ def create_parser():
'--clean-info-json', '--clean-infojson',
action='store_true', dest='clean_infojson', default=None,
help=(
- 'Remove some private fields such as filenames from the infojson. '
- 'Note that it could still contain some personal information (default)'))
+ 'Remove some internal metadata such as filenames from the infojson (default)'))
filesystem.add_option(
'--no-clean-info-json', '--no-clean-infojson',
action='store_false', dest='clean_infojson',
@@ -1573,14 +1608,16 @@ def create_parser():
help=optparse.SUPPRESS_HELP)
postproc.add_option(
'--parse-metadata',
- metavar='FROM:TO', dest='parse_metadata', action='append',
+ metavar='[WHEN:]FROM:TO', dest='parse_metadata', **when_prefix('pre_process'),
help=(
- 'Parse additional metadata like title/artist from other fields; '
- 'see "MODIFYING METADATA" for details'))
+ 'Parse additional metadata like title/artist from other fields; see "MODIFYING METADATA" for details. '
+ 'Supported values of "WHEN" are the same as that of --use-postprocessor (default: pre_process)'))
postproc.add_option(
'--replace-in-metadata',
- dest='parse_metadata', metavar='FIELDS REGEX REPLACE', action='append', nargs=3,
- help='Replace text in a metadata field using the given regex. This option can be used multiple times')
+ dest='parse_metadata', metavar='[WHEN:]FIELDS REGEX REPLACE', nargs=3, **when_prefix('pre_process'),
+ help=(
+ 'Replace text in a metadata field using the given regex. This option can be used multiple times. '
+ 'Supported values of "WHEN" are the same as that of --use-postprocessor (default: pre_process)'))
postproc.add_option(
'--xattrs', '--xattr',
action='store_true', dest='xattrs', default=False,
@@ -1618,19 +1655,12 @@ def create_parser():
help='Location of the ffmpeg binary; either the path to the binary or its containing directory')
postproc.add_option(
'--exec',
- metavar='[WHEN:]CMD', dest='exec_cmd', default={}, type='str',
- action='callback', callback=_dict_from_options_callback,
- callback_kwargs={
- 'allowed_keys': '|'.join(map(re.escape, POSTPROCESS_WHEN)),
- 'default_key': 'after_move',
- 'multiple_keys': False,
- 'append': True,
- }, help=(
- 'Execute a command, optionally prefixed with when to execute it (after_move if unspecified), separated by a ":". '
- 'Supported values of "WHEN" are the same as that of --use-postprocessor. '
+ metavar='[WHEN:]CMD', dest='exec_cmd', **when_prefix('after_move'),
+ help=(
+ 'Execute a command, optionally prefixed with when to execute it, separated by a ":". '
+ 'Supported values of "WHEN" are the same as that of --use-postprocessor (default: after_move). '
'Same syntax as the output template can be used to pass any field as arguments to the command. '
- 'After download, an additional field "filepath" that contains the final path of the downloaded file '
- 'is also available, and if no fields are passed, %(filepath)q is appended to the end of the command. '
+ 'If no fields are passed, %(filepath,_filename|)q is appended to the end of the command. '
'This option can be used multiple times'))
postproc.add_option(
'--no-exec',
@@ -1703,7 +1733,8 @@ def create_parser():
'ARGS are a semicolon ";" delimited list of NAME=VALUE. '
'The "when" argument determines when the postprocessor is invoked. '
'It can be one of "pre_process" (after video extraction), "after_filter" (after video passes filter), '
- '"before_dl" (before each video download), "post_process" (after each video download; default), '
+ '"video" (after --format; before --print/--output), "before_dl" (before each video download), '
+ '"post_process" (after each video download; default), '
'"after_move" (after moving video file to it\'s final locations), '
'"after_video" (after downloading and processing all formats of a video), '
'or "playlist" (at end of playlist). '
diff --git a/hypervideo_dl/plugins.py b/hypervideo_dl/plugins.py
new file mode 100644
index 0000000..38e4a2c
--- /dev/null
+++ b/hypervideo_dl/plugins.py
@@ -0,0 +1,173 @@
+import contextlib
+import importlib
+import importlib.abc
+import importlib.machinery
+import importlib.util
+import inspect
+import itertools
+import pkgutil
+import sys
+import traceback
+import zipimport
+from pathlib import Path
+from zipfile import ZipFile
+
+from .compat import functools # isort: split
+from .utils import (
+ get_executable_path,
+ get_system_config_dirs,
+ get_user_config_dirs,
+ orderedSet,
+ write_string,
+)
+
+PACKAGE_NAME = 'hypervideo_dl_plugins'
+COMPAT_PACKAGE_NAME = 'ytdlp_plugins'
+
+
+class PluginLoader(importlib.abc.Loader):
+ """Dummy loader for virtual namespace packages"""
+
+ def exec_module(self, module):
+ return None
+
+
+@functools.cache
+def dirs_in_zip(archive):
+ try:
+ with ZipFile(archive) as zip_:
+ return set(itertools.chain.from_iterable(
+ Path(file).parents for file in zip_.namelist()))
+ except FileNotFoundError:
+ pass
+ except Exception as e:
+ write_string(f'WARNING: Could not read zip file {archive}: {e}\n')
+ return set()
+
+
+class PluginFinder(importlib.abc.MetaPathFinder):
+ """
+ This class provides one or multiple namespace packages.
+ It searches in sys.path and hypervideo config folders for
+ the existing subdirectories from which the modules can be imported
+ """
+
+ def __init__(self, *packages):
+ self._zip_content_cache = {}
+ self.packages = set(itertools.chain.from_iterable(
+ itertools.accumulate(name.split('.'), lambda a, b: '.'.join((a, b)))
+ for name in packages))
+
+ def search_locations(self, fullname):
+ candidate_locations = []
+
+ def _get_package_paths(*root_paths, containing_folder='plugins'):
+ for config_dir in orderedSet(map(Path, root_paths), lazy=True):
+ with contextlib.suppress(OSError):
+ yield from (config_dir / containing_folder).iterdir()
+
+ # Load from hypervideo config folders
+ candidate_locations.extend(_get_package_paths(
+ *get_user_config_dirs('hypervideo'),
+ *get_system_config_dirs('hypervideo'),
+ containing_folder='plugins'))
+
+ # Load from hypervideo-plugins folders
+ candidate_locations.extend(_get_package_paths(
+ get_executable_path(),
+ *get_user_config_dirs(''),
+ *get_system_config_dirs(''),
+ containing_folder='hypervideo-plugins'))
+
+ candidate_locations.extend(map(Path, sys.path)) # PYTHONPATH
+ with contextlib.suppress(ValueError): # Added when running __main__.py directly
+ candidate_locations.remove(Path(__file__).parent)
+
+ parts = Path(*fullname.split('.'))
+ for path in orderedSet(candidate_locations, lazy=True):
+ candidate = path / parts
+ if candidate.is_dir():
+ yield candidate
+ elif path.suffix in ('.zip', '.egg', '.whl') and path.is_file():
+ if parts in dirs_in_zip(path):
+ yield candidate
+
+ def find_spec(self, fullname, path=None, target=None):
+ if fullname not in self.packages:
+ return None
+
+ search_locations = list(map(str, self.search_locations(fullname)))
+ if not search_locations:
+ return None
+
+ spec = importlib.machinery.ModuleSpec(fullname, PluginLoader(), is_package=True)
+ spec.submodule_search_locations = search_locations
+ return spec
+
+ def invalidate_caches(self):
+ dirs_in_zip.cache_clear()
+ for package in self.packages:
+ if package in sys.modules:
+ del sys.modules[package]
+
+
+def directories():
+ spec = importlib.util.find_spec(PACKAGE_NAME)
+ return spec.submodule_search_locations if spec else []
+
+
+def iter_modules(subpackage):
+ fullname = f'{PACKAGE_NAME}.{subpackage}'
+ with contextlib.suppress(ModuleNotFoundError):
+ pkg = importlib.import_module(fullname)
+ yield from pkgutil.iter_modules(path=pkg.__path__, prefix=f'{fullname}.')
+
+
+def load_module(module, module_name, suffix):
+ return inspect.getmembers(module, lambda obj: (
+ inspect.isclass(obj)
+ and obj.__name__.endswith(suffix)
+ and obj.__module__.startswith(module_name)
+ and not obj.__name__.startswith('_')
+ and obj.__name__ in getattr(module, '__all__', [obj.__name__])))
+
+
+def load_plugins(name, suffix):
+ classes = {}
+
+ for finder, module_name, _ in iter_modules(name):
+ if any(x.startswith('_') for x in module_name.split('.')):
+ continue
+ try:
+ if sys.version_info < (3, 10) and isinstance(finder, zipimport.zipimporter):
+ # zipimporter.load_module() is deprecated in 3.10 and removed in 3.12
+ # The exec_module branch below is the replacement for >= 3.10
+ # See: https://docs.python.org/3/library/zipimport.html#zipimport.zipimporter.exec_module
+ module = finder.load_module(module_name)
+ else:
+ spec = finder.find_spec(module_name)
+ module = importlib.util.module_from_spec(spec)
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module)
+ except Exception:
+ write_string(f'Error while importing module {module_name!r}\n{traceback.format_exc(limit=-1)}')
+ continue
+ classes.update(load_module(module, module_name, suffix))
+
+ # Compat: old plugin system using __init__.py
+ # Note: plugins imported this way do not show up in directories()
+ # and are not considered part of the hypervideo_dl_plugins namespace package
+ with contextlib.suppress(FileNotFoundError):
+ spec = importlib.util.spec_from_file_location(
+ name, Path(get_executable_path(), COMPAT_PACKAGE_NAME, name, '__init__.py'))
+ plugins = importlib.util.module_from_spec(spec)
+ sys.modules[spec.name] = plugins
+ spec.loader.exec_module(plugins)
+ classes.update(load_module(plugins, spec.name, suffix))
+
+ return classes
+
+
+sys.meta_path.insert(0, PluginFinder(f'{PACKAGE_NAME}.extractor', f'{PACKAGE_NAME}.postprocessor'))
+
+__all__ = ['directories', 'load_plugins', 'PACKAGE_NAME', 'COMPAT_PACKAGE_NAME']
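
An illustrative sketch of a plugin this machinery would discover (not part of the patch; the 'IE' suffix convention for extractors is assumed from the postprocessor counterpart above, and ~/.config/hypervideo is assumed to be a user config dir). A module at

    ~/.config/hypervideo/plugins/myplugin/hypervideo_dl_plugins/extractor/example.py

(or the same layout inside a .zip/.egg/.whl on sys.path) could contain:

    from hypervideo_dl.extractor.common import InfoExtractor

    class ExamplePluginIE(InfoExtractor):
        # picked up by load_module(): the class name ends with the suffix,
        # does not start with '_', and lives under the plugin namespace
        _VALID_URL = r'https?://example\.com/video/(?P<id>\d+)'

        def _real_extract(self, url):
            video_id = self._match_id(url)
            return self.url_result(f'https://example.com/watch/{video_id}')

load_plugins('extractor', 'IE') would then return {'ExamplePluginIE': ...} for registration.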
diff --git a/hypervideo_dl/postprocessor/__init__.py b/hypervideo_dl/postprocessor/__init__.py
index f168be4..bfe9df7 100644
--- a/hypervideo_dl/postprocessor/__init__.py
+++ b/hypervideo_dl/postprocessor/__init__.py
@@ -33,14 +33,15 @@ from .movefilesafterdownload import MoveFilesAfterDownloadPP
from .sponskrub import SponSkrubPP
from .sponsorblock import SponsorBlockPP
from .xattrpp import XAttrMetadataPP
-from ..utils import load_plugins
+from ..plugins import load_plugins
-_PLUGIN_CLASSES = load_plugins('postprocessor', 'PP', globals())
+_PLUGIN_CLASSES = load_plugins('postprocessor', 'PP')
def get_postprocessor(key):
return globals()[key + 'PP']
+globals().update(_PLUGIN_CLASSES)
__all__ = [name for name in globals().keys() if name.endswith('PP')]
__all__.extend(('PostProcessor', 'FFmpegPostProcessor'))
diff --git a/hypervideo_dl/postprocessor/common.py b/hypervideo_dl/postprocessor/common.py
index c3fca35..9a0aa6f 100644
--- a/hypervideo_dl/postprocessor/common.py
+++ b/hypervideo_dl/postprocessor/common.py
@@ -1,16 +1,15 @@
import functools
import json
import os
-import urllib.error
+from ..networking import Request
+from ..networking.exceptions import HTTPError, network_exceptions
from ..utils import (
PostProcessingError,
RetryManager,
_configuration_args,
deprecation_warning,
encodeFilename,
- network_exceptions,
- sanitized_Request,
)
@@ -187,7 +186,7 @@ class PostProcessor(metaclass=PostProcessorMetaClass):
tmpl = progress_template.get('postprocess')
if tmpl:
self._downloader.to_screen(
- self._downloader.evaluate_outtmpl(tmpl, progress_dict), skip_eol=True, quiet=False)
+ self._downloader.evaluate_outtmpl(tmpl, progress_dict), quiet=False)
self._downloader.to_console_title(self._downloader.evaluate_outtmpl(
progress_template.get('postprocess-title') or 'hypervideo %(progress._default_template)s',
@@ -203,13 +202,13 @@ class PostProcessor(metaclass=PostProcessorMetaClass):
self.write_debug(f'{self.PP_NAME} query: {url}')
for retry in RetryManager(self.get_param('extractor_retries', 3), self._retry_download):
try:
- rsp = self._downloader.urlopen(sanitized_Request(url))
+ rsp = self._downloader.urlopen(Request(url))
except network_exceptions as e:
- if isinstance(e, urllib.error.HTTPError) and e.code in expected_http_errors:
+ if isinstance(e, HTTPError) and e.status in expected_http_errors:
return None
retry.error = PostProcessingError(f'Unable to communicate with {self.PP_NAME} API: {e}')
continue
- return json.loads(rsp.read().decode(rsp.info().get_param('charset') or 'utf-8'))
+ return json.loads(rsp.read().decode(rsp.headers.get_param('charset') or 'utf-8'))
class AudioConversionError(PostProcessingError): # Deprecated
diff --git a/hypervideo_dl/postprocessor/embedthumbnail.py b/hypervideo_dl/postprocessor/embedthumbnail.py
index 7cd3952..707ec76 100644
--- a/hypervideo_dl/postprocessor/embedthumbnail.py
+++ b/hypervideo_dl/postprocessor/embedthumbnail.py
@@ -107,14 +107,14 @@ class EmbedThumbnailPP(FFmpegPostProcessor):
options.extend(['-map', '-0:%d' % old_stream])
new_stream -= 1
options.extend([
- '-attach', thumbnail_filename,
+ '-attach', self._ffmpeg_filename_argument(thumbnail_filename),
'-metadata:s:%d' % new_stream, 'mimetype=%s' % mimetype,
'-metadata:s:%d' % new_stream, 'filename=cover.%s' % thumbnail_ext])
self._report_run('ffmpeg', filename)
self.run_ffmpeg(filename, temp_filename, options)
- elif info['ext'] in ['m4a', 'mp4', 'mov']:
+ elif info['ext'] in ['m4a', 'mp4', 'm4v', 'mov']:
prefer_atomicparsley = 'embed-thumbnail-atomicparsley' in self.get_param('compat_opts', [])
# Method 1: Use mutagen
if not mutagen or prefer_atomicparsley:
@@ -213,7 +213,7 @@ class EmbedThumbnailPP(FFmpegPostProcessor):
temp_filename = filename
else:
- raise EmbedThumbnailPPError('Supported filetypes for thumbnail embedding are: mp3, mkv/mka, ogg/opus/flac, m4a/mp4/mov')
+ raise EmbedThumbnailPPError('Supported filetypes for thumbnail embedding are: mp3, mkv/mka, ogg/opus/flac, m4a/mp4/m4v/mov')
if success and temp_filename != filename:
os.replace(temp_filename, filename)
diff --git a/hypervideo_dl/postprocessor/ffmpeg.py b/hypervideo_dl/postprocessor/ffmpeg.py
index 0471594..5bd0dbd 100644
--- a/hypervideo_dl/postprocessor/ffmpeg.py
+++ b/hypervideo_dl/postprocessor/ffmpeg.py
@@ -44,6 +44,7 @@ EXT_TO_OUT_FORMATS = {
'ts': 'mpegts',
'wma': 'asf',
'wmv': 'asf',
+ 'weba': 'webm',
'vtt': 'webvtt',
}
ACODECS = {
@@ -301,6 +302,11 @@ class FFmpegPostProcessor(PostProcessor):
None)
return num, len(streams)
+ def _fixup_chapters(self, info):
+ last_chapter = traverse_obj(info, ('chapters', -1))
+ if last_chapter and not last_chapter.get('end_time'):
+ last_chapter['end_time'] = self._get_real_video_duration(info['filepath'])
+
def _get_real_video_duration(self, filepath, fatal=True):
try:
duration = float_or_none(
@@ -407,7 +413,7 @@ class FFmpegPostProcessor(PostProcessor):
"""
concat_file = f'{out_file}.concat'
self.write_debug(f'Writing concat spec to {concat_file}')
- with open(concat_file, 'wt', encoding='utf-8') as f:
+ with open(concat_file, 'w', encoding='utf-8') as f:
f.writelines(self._concat_spec(in_files, concat_opts))
out_flags = list(self.stream_copy_opts(ext=determine_ext(out_file)))
@@ -507,8 +513,7 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor):
if acodec != 'copy':
more_opts = self._quality_args(acodec)
- # not os.path.splitext, since the latter does not work on unicode in all setups
- temp_path = new_path = f'{path.rpartition(".")[0]}.{extension}'
+ temp_path = new_path = replace_extension(path, extension, information['ext'])
if new_path == path:
if acodec == 'copy':
@@ -538,7 +543,10 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor):
class FFmpegVideoConvertorPP(FFmpegPostProcessor):
- SUPPORTED_EXTS = (*MEDIA_EXTENSIONS.common_video, *sorted(MEDIA_EXTENSIONS.common_audio + ('aac', 'vorbis')))
+ SUPPORTED_EXTS = (
+ *sorted((*MEDIA_EXTENSIONS.common_video, 'gif')),
+ *sorted((*MEDIA_EXTENSIONS.common_audio, 'aac', 'vorbis')),
+ )
FORMAT_RE = create_mapping_re(SUPPORTED_EXTS)
_ACTION = 'converting'
@@ -675,6 +683,7 @@ class FFmpegMetadataPP(FFmpegPostProcessor):
@PostProcessor._restrict_to(images=False)
def run(self, info):
+ self._fixup_chapters(info)
filename, metadata_filename = info['filepath'], None
files_to_delete, options = [], []
if self._add_chapters and info.get('chapters'):
@@ -708,7 +717,7 @@ class FFmpegMetadataPP(FFmpegPostProcessor):
@staticmethod
def _get_chapter_opts(chapters, metadata_filename):
- with open(metadata_filename, 'wt', encoding='utf-8') as f:
+ with open(metadata_filename, 'w', encoding='utf-8') as f:
def ffmpeg_escape(text):
return re.sub(r'([\\=;#\n])', r'\\\1', text)
@@ -800,7 +809,7 @@ class FFmpegMetadataPP(FFmpegPostProcessor):
new_stream -= 1
yield (
- '-attach', infofn,
+ '-attach', self._ffmpeg_filename_argument(infofn),
f'-metadata:s:{new_stream}', 'mimetype=application/json',
f'-metadata:s:{new_stream}', 'filename=info.json',
)
@@ -889,8 +898,11 @@ class FFmpegFixupM3u8PP(FFmpegFixupPostProcessor):
@PostProcessor._restrict_to(images=False)
def run(self, info):
if all(self._needs_fixup(info)):
+ args = ['-f', 'mp4']
+ if self.get_audio_codec(info['filepath']) == 'aac':
+ args.extend(['-bsf:a', 'aac_adtstoasc'])
self._fixup('Fixing MPEG-TS in MP4 container', info['filepath'], [
- *self.stream_copy_opts(), '-f', 'mp4', '-bsf:a', 'aac_adtstoasc'])
+ *self.stream_copy_opts(), *args])
return [], info
@@ -978,7 +990,7 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor):
with open(dfxp_file, 'rb') as f:
srt_data = dfxp2srt(f.read())
- with open(srt_file, 'wt', encoding='utf-8') as f:
+ with open(srt_file, 'w', encoding='utf-8') as f:
f.write(srt_data)
old_file = srt_file
@@ -1037,6 +1049,7 @@ class FFmpegSplitChaptersPP(FFmpegPostProcessor):
@PostProcessor._restrict_to(images=False)
def run(self, info):
+ self._fixup_chapters(info)
chapters = info.get('chapters') or []
if not chapters:
self.to_screen('Chapter information is unavailable')
diff --git a/hypervideo_dl/postprocessor/metadataparser.py b/hypervideo_dl/postprocessor/metadataparser.py
index 381182b..509dccb 100644
--- a/hypervideo_dl/postprocessor/metadataparser.py
+++ b/hypervideo_dl/postprocessor/metadataparser.py
@@ -1,7 +1,7 @@
import re
from .common import PostProcessor
-from ..utils import Namespace, filter_dict
+from ..utils import Namespace, filter_dict, function_with_repr
class MetadataParserPP(PostProcessor):
@@ -60,6 +60,7 @@ class MetadataParserPP(PostProcessor):
f(info)
return [], info
+ @function_with_repr
def interpretter(self, inp, out):
def f(info):
data_to_parse = self._downloader.evaluate_outtmpl(template, info)
@@ -76,6 +77,7 @@ class MetadataParserPP(PostProcessor):
out_re = re.compile(self.format_to_regex(out))
return f
+ @function_with_repr
def replacer(self, field, search, replace):
def f(info):
val = info.get(field)
diff --git a/hypervideo_dl/postprocessor/modify_chapters.py b/hypervideo_dl/postprocessor/modify_chapters.py
index a745b45..f521986 100644
--- a/hypervideo_dl/postprocessor/modify_chapters.py
+++ b/hypervideo_dl/postprocessor/modify_chapters.py
@@ -23,6 +23,7 @@ class ModifyChaptersPP(FFmpegPostProcessor):
@PostProcessor._restrict_to(images=False)
def run(self, info):
+ self._fixup_chapters(info)
# Chapters must be preserved intact when downloading multiple formats of the same video.
chapters, sponsor_chapters = self._mark_chapters_to_remove(
copy.deepcopy(info.get('chapters')) or [],
diff --git a/hypervideo_dl/utils/__init__.py b/hypervideo_dl/utils/__init__.py
new file mode 100644
index 0000000..c267e32
--- /dev/null
+++ b/hypervideo_dl/utils/__init__.py
@@ -0,0 +1,10 @@
+# flake8: noqa: F403
+from ..compat.compat_utils import passthrough_module
+
+passthrough_module(__name__, '._deprecated')
+del passthrough_module
+
+# isort: off
+from .traversal import *
+from ._utils import *
+from ._utils import _configuration_args, _get_exe_version_output # noqa: F401
diff --git a/hypervideo_dl/utils/_deprecated.py b/hypervideo_dl/utils/_deprecated.py
new file mode 100644
index 0000000..a8ae8ec
--- /dev/null
+++ b/hypervideo_dl/utils/_deprecated.py
@@ -0,0 +1,39 @@
+"""Deprecated - New code should avoid these"""
+import warnings
+
+from ..compat.compat_utils import passthrough_module
+
+# XXX: Implement this the same way as other DeprecationWarnings without circular import
+passthrough_module(__name__, '.._legacy', callback=lambda attr: warnings.warn(
+ DeprecationWarning(f'{__name__}.{attr} is deprecated'), stacklevel=6))
+del passthrough_module
+
+
+from ._utils import preferredencoding
+
+
+def encodeFilename(s, for_subprocess=False):
+ assert isinstance(s, str)
+ return s
+
+
+def decodeFilename(b, for_subprocess=False):
+ return b
+
+
+def decodeArgument(b):
+ return b
+
+
+def decodeOption(optval):
+ if optval is None:
+ return optval
+ if isinstance(optval, bytes):
+ optval = optval.decode(preferredencoding())
+
+ assert isinstance(optval, str)
+ return optval
+
+
+def error_to_compat_str(err):
+ return str(err)
diff --git a/hypervideo_dl/utils/_legacy.py b/hypervideo_dl/utils/_legacy.py
new file mode 100644
index 0000000..dde0209
--- /dev/null
+++ b/hypervideo_dl/utils/_legacy.py
@@ -0,0 +1,242 @@
+"""No longer used and new code should not use. Exists only for API compat."""
+import platform
+import struct
+import sys
+import urllib.error
+import urllib.parse
+import urllib.request
+import zlib
+
+from ._utils import Popen, decode_base_n, preferredencoding
+from .networking import escape_rfc3986 # noqa: F401
+from .networking import normalize_url as escape_url # noqa: F401
+from .traversal import traverse_obj
+from ..dependencies import certifi, websockets
+from ..networking._helper import make_ssl_context
+from ..networking._urllib import HTTPHandler
+
+# isort: split
+from .networking import random_user_agent, std_headers # noqa: F401
+from ..cookies import YoutubeDLCookieJar # noqa: F401
+from ..networking._urllib import PUTRequest # noqa: F401
+from ..networking._urllib import SUPPORTED_ENCODINGS, HEADRequest # noqa: F401
+from ..networking._urllib import ProxyHandler as PerRequestProxyHandler # noqa: F401
+from ..networking._urllib import RedirectHandler as YoutubeDLRedirectHandler # noqa: F401
+from ..networking._urllib import ( # noqa: F401
+ make_socks_conn_class,
+ update_Request,
+)
+from ..networking.exceptions import HTTPError, network_exceptions # noqa: F401
+
+has_certifi = bool(certifi)
+has_websockets = bool(websockets)
+
+
+def load_plugins(name, suffix, namespace):
+ from ..plugins import load_plugins
+ ret = load_plugins(name, suffix)
+ namespace.update(ret)
+ return ret
+
+
+def traverse_dict(dictn, keys, casesense=True):
+ return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
+
+
+def decode_base(value, digits):
+ return decode_base_n(value, table=digits)
+
+
+def platform_name():
+ """ Returns the platform name as a str """
+ return platform.platform()
+
+
+def get_subprocess_encoding():
+ if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
+ # For subprocess calls, encode with locale encoding
+ # Refer to http://stackoverflow.com/a/9951851/35070
+ encoding = preferredencoding()
+ else:
+ encoding = sys.getfilesystemencoding()
+ if encoding is None:
+ encoding = 'utf-8'
+ return encoding
+
+
+# UNUSED
+# Based on png2str() written by @gdkchan and improved by @yokrysty
+# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
+def decode_png(png_data):
+ # Reference: https://www.w3.org/TR/PNG/
+ header = png_data[8:]
+
+ if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
+ raise OSError('Not a valid PNG file.')
+
+ int_map = {1: '>B', 2: '>H', 4: '>I'}
+ unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0]
+
+ chunks = []
+
+ while header:
+ length = unpack_integer(header[:4])
+ header = header[4:]
+
+ chunk_type = header[:4]
+ header = header[4:]
+
+ chunk_data = header[:length]
+ header = header[length:]
+
+ header = header[4:] # Skip CRC
+
+ chunks.append({
+ 'type': chunk_type,
+ 'length': length,
+ 'data': chunk_data
+ })
+
+ ihdr = chunks[0]['data']
+
+ width = unpack_integer(ihdr[:4])
+ height = unpack_integer(ihdr[4:8])
+
+ idat = b''
+
+ for chunk in chunks:
+ if chunk['type'] == b'IDAT':
+ idat += chunk['data']
+
+ if not idat:
+ raise OSError('Unable to read PNG data.')
+
+ decompressed_data = bytearray(zlib.decompress(idat))
+
+ stride = width * 3
+ pixels = []
+
+ def _get_pixel(idx):
+ x = idx % stride
+ y = idx // stride
+ return pixels[y][x]
+
+ for y in range(height):
+ basePos = y * (1 + stride)
+ filter_type = decompressed_data[basePos]
+
+ current_row = []
+
+ pixels.append(current_row)
+
+ for x in range(stride):
+ color = decompressed_data[1 + basePos + x]
+ basex = y * stride + x
+ left = 0
+ up = 0
+
+ if x > 2:
+ left = _get_pixel(basex - 3)
+ if y > 0:
+ up = _get_pixel(basex - stride)
+
+ if filter_type == 1: # Sub
+ color = (color + left) & 0xff
+ elif filter_type == 2: # Up
+ color = (color + up) & 0xff
+ elif filter_type == 3: # Average
+ color = (color + ((left + up) >> 1)) & 0xff
+ elif filter_type == 4: # Paeth
+ a = left
+ b = up
+ c = 0
+
+ if x > 2 and y > 0:
+ c = _get_pixel(basex - stride - 3)
+
+ p = a + b - c
+
+ pa = abs(p - a)
+ pb = abs(p - b)
+ pc = abs(p - c)
+
+ if pa <= pb and pa <= pc:
+ color = (color + a) & 0xff
+ elif pb <= pc:
+ color = (color + b) & 0xff
+ else:
+ color = (color + c) & 0xff
+
+ current_row.append(color)
+
+ return width, height, pixels
+
+
+def register_socks_protocols():
+ # "Register" SOCKS protocols
+ # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
+ # URLs with protocols not in urlparse.uses_netloc are not handled correctly
+ for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
+ if scheme not in urllib.parse.uses_netloc:
+ urllib.parse.uses_netloc.append(scheme)
+
+
+def handle_youtubedl_headers(headers):
+ filtered_headers = headers
+
+ if 'Youtubedl-no-compression' in filtered_headers:
+ filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
+ del filtered_headers['Youtubedl-no-compression']
+
+ return filtered_headers
+
+
+def request_to_url(req):
+ if isinstance(req, urllib.request.Request):
+ return req.get_full_url()
+ else:
+ return req
+
+
+def sanitized_Request(url, *args, **kwargs):
+ from ..utils import extract_basic_auth, sanitize_url
+ url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
+ if auth_header is not None:
+ headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
+ headers['Authorization'] = auth_header
+ return urllib.request.Request(url, *args, **kwargs)
+
+
+class YoutubeDLHandler(HTTPHandler):
+ def __init__(self, params, *args, **kwargs):
+ self._params = params
+ super().__init__(*args, **kwargs)
+
+
+YoutubeDLHTTPSHandler = YoutubeDLHandler
+
+
+class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
+ def __init__(self, cookiejar=None):
+ urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)
+
+ def http_response(self, request, response):
+ return urllib.request.HTTPCookieProcessor.http_response(self, request, response)
+
+ https_request = urllib.request.HTTPCookieProcessor.http_request
+ https_response = http_response
+
+
+def make_HTTPS_handler(params, **kwargs):
+ return YoutubeDLHTTPSHandler(params, context=make_ssl_context(
+ verify=not params.get('nocheckcertificate'),
+ client_certificate=params.get('client_certificate'),
+ client_certificate_key=params.get('client_certificate_key'),
+ client_certificate_password=params.get('client_certificate_password'),
+ legacy_support=params.get('legacyserverconnect'),
+ use_certifi='no-certifi' not in params.get('compat_opts', []),
+ ), **kwargs)
+
+
+def process_communicate_or_kill(p, *args, **kwargs):
+ return Popen.communicate_or_kill(p, *args, **kwargs)
diff --git a/hypervideo_dl/utils/_utils.py b/hypervideo_dl/utils/_utils.py
new file mode 100644
index 0000000..5a85462
--- /dev/null
+++ b/hypervideo_dl/utils/_utils.py
@@ -0,0 +1,5484 @@
+import asyncio
+import atexit
+import base64
+import binascii
+import calendar
+import codecs
+import collections
+import collections.abc
+import contextlib
+import datetime
+import email.header
+import email.utils
+import errno
+import hashlib
+import hmac
+import html.entities
+import html.parser
+import inspect
+import io
+import itertools
+import json
+import locale
+import math
+import mimetypes
+import netrc
+import operator
+import os
+import platform
+import random
+import re
+import shlex
+import socket
+import ssl
+import struct
+import subprocess
+import sys
+import tempfile
+import time
+import traceback
+import types
+import unicodedata
+import urllib.error
+import urllib.parse
+import urllib.request
+import xml.etree.ElementTree
+
+from . import traversal
+
+from ..compat import functools # isort: split
+from ..compat import (
+ compat_etree_fromstring,
+ compat_expanduser,
+ compat_HTMLParseError,
+ compat_os_name,
+ compat_shlex_quote,
+)
+from ..dependencies import websockets, xattr
+
+__name__ = __name__.rsplit('.', 1)[0] # Pretend to be the parent module
+
+# This is not clearly defined otherwise
+compiled_regex_type = type(re.compile(''))
+
+
+class NO_DEFAULT:
+ pass
+
+
+def IDENTITY(x):
+ return x
+
+
+ENGLISH_MONTH_NAMES = [
+ 'January', 'February', 'March', 'April', 'May', 'June',
+ 'July', 'August', 'September', 'October', 'November', 'December']
+
+MONTH_NAMES = {
+ 'en': ENGLISH_MONTH_NAMES,
+ 'fr': [
+ 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
+ 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
+ # these follow the genitive grammatical case (dopełniacz)
+ # some websites might be using nominative, which will require another month list
+ # https://en.wikibooks.org/wiki/Polish/Noun_cases
+ 'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
+ 'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
+}
+
+# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
+TIMEZONE_NAMES = {
+ 'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
+ 'AST': -4, 'ADT': -3, # Atlantic (used in Canada)
+ 'EST': -5, 'EDT': -4, # Eastern
+ 'CST': -6, 'CDT': -5, # Central
+ 'MST': -7, 'MDT': -6, # Mountain
+ 'PST': -8, 'PDT': -7 # Pacific
+}
+
+# needed for sanitizing filenames in restricted mode
+ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
+ itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
+ 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
+
+DATE_FORMATS = (
+ '%d %B %Y',
+ '%d %b %Y',
+ '%B %d %Y',
+ '%B %dst %Y',
+ '%B %dnd %Y',
+ '%B %drd %Y',
+ '%B %dth %Y',
+ '%b %d %Y',
+ '%b %dst %Y',
+ '%b %dnd %Y',
+ '%b %drd %Y',
+ '%b %dth %Y',
+ '%b %dst %Y %I:%M',
+ '%b %dnd %Y %I:%M',
+ '%b %drd %Y %I:%M',
+ '%b %dth %Y %I:%M',
+ '%Y %m %d',
+ '%Y-%m-%d',
+ '%Y.%m.%d.',
+ '%Y/%m/%d',
+ '%Y/%m/%d %H:%M',
+ '%Y/%m/%d %H:%M:%S',
+ '%Y%m%d%H%M',
+ '%Y%m%d%H%M%S',
+ '%Y%m%d',
+ '%Y-%m-%d %H:%M',
+ '%Y-%m-%d %H:%M:%S',
+ '%Y-%m-%d %H:%M:%S.%f',
+ '%Y-%m-%d %H:%M:%S:%f',
+ '%d.%m.%Y %H:%M',
+ '%d.%m.%Y %H.%M',
+ '%Y-%m-%dT%H:%M:%SZ',
+ '%Y-%m-%dT%H:%M:%S.%fZ',
+ '%Y-%m-%dT%H:%M:%S.%f0Z',
+ '%Y-%m-%dT%H:%M:%S',
+ '%Y-%m-%dT%H:%M:%S.%f',
+ '%Y-%m-%dT%H:%M',
+ '%b %d %Y at %H:%M',
+ '%b %d %Y at %H:%M:%S',
+ '%B %d %Y at %H:%M',
+ '%B %d %Y at %H:%M:%S',
+ '%H:%M %d-%b-%Y',
+)
+
+DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
+DATE_FORMATS_DAY_FIRST.extend([
+ '%d-%m-%Y',
+ '%d.%m.%Y',
+ '%d.%m.%y',
+ '%d/%m/%Y',
+ '%d/%m/%y',
+ '%d/%m/%Y %H:%M:%S',
+ '%d-%m-%Y %H:%M',
+ '%H:%M %d/%m/%Y',
+])
+
+DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
+DATE_FORMATS_MONTH_FIRST.extend([
+ '%m-%d-%Y',
+ '%m.%d.%Y',
+ '%m/%d/%Y',
+ '%m/%d/%y',
+ '%m/%d/%Y %H:%M:%S',
+])
+
+PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
+JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'
+
+NUMBER_RE = r'\d+(?:\.\d+)?'
+
+
+@functools.cache
+def preferredencoding():
+ """Get preferred encoding.
+
+ Returns the best encoding scheme for the system, based on
+ locale.getpreferredencoding() and some further tweaks.
+ """
+ try:
+ pref = locale.getpreferredencoding()
+ 'TEST'.encode(pref)
+ except Exception:
+ pref = 'UTF-8'
+
+ return pref
+
+
+def write_json_file(obj, fn):
+ """ Encode obj as JSON and write it to fn, atomically if possible """
+
+ tf = tempfile.NamedTemporaryFile(
+ prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
+ suffix='.tmp', delete=False, mode='w', encoding='utf-8')
+
+ try:
+ with tf:
+ json.dump(obj, tf, ensure_ascii=False)
+ if sys.platform == 'win32':
+ # Need to remove existing file on Windows, else os.rename raises
+ # WindowsError or FileExistsError.
+ with contextlib.suppress(OSError):
+ os.unlink(fn)
+ with contextlib.suppress(OSError):
+ mask = os.umask(0)
+ os.umask(mask)
+ os.chmod(tf.name, 0o666 & ~mask)
+ os.rename(tf.name, fn)
+ except Exception:
+ with contextlib.suppress(OSError):
+ os.remove(tf.name)
+ raise
+
+
+def find_xpath_attr(node, xpath, key, val=None):
+ """ Find the xpath xpath[@key=val] """
+ assert re.match(r'^[a-zA-Z_-]+$', key)
+ expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
+ return node.find(expr)
+
+# On python2.6 the xml.etree.ElementTree.Element methods don't support
+# the namespace parameter
+
+
+def xpath_with_ns(path, ns_map):
+ components = [c.split(':') for c in path.split('/')]
+ replaced = []
+ for c in components:
+ if len(c) == 1:
+ replaced.append(c[0])
+ else:
+ ns, tag = c
+ replaced.append('{%s}%s' % (ns_map[ns], tag))
+ return '/'.join(replaced)
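+
+# Illustrative example: prefixes are expanded into Clark notation, e.g.
+#   >>> xpath_with_ns('ns0:media/ns0:url', {'ns0': 'http://example.com/ns'})
+#   '{http://example.com/ns}media/{http://example.com/ns}url'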
+
+
+def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
+ def _find_xpath(xpath):
+ return node.find(xpath)
+
+ if isinstance(xpath, str):
+ n = _find_xpath(xpath)
+ else:
+ for xp in xpath:
+ n = _find_xpath(xp)
+ if n is not None:
+ break
+
+ if n is None:
+ if default is not NO_DEFAULT:
+ return default
+ elif fatal:
+ name = xpath if name is None else name
+ raise ExtractorError('Could not find XML element %s' % name)
+ else:
+ return None
+ return n
+
+
+def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
+ n = xpath_element(node, xpath, name, fatal=fatal, default=default)
+ if n is None or n == default:
+ return n
+ if n.text is None:
+ if default is not NO_DEFAULT:
+ return default
+ elif fatal:
+ name = xpath if name is None else name
+ raise ExtractorError('Could not find XML element\'s text %s' % name)
+ else:
+ return None
+ return n.text
+
+
+def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
+ n = find_xpath_attr(node, xpath, key)
+ if n is None:
+ if default is not NO_DEFAULT:
+ return default
+ elif fatal:
+ name = f'{xpath}[@{key}]' if name is None else name
+ raise ExtractorError('Could not find XML attribute %s' % name)
+ else:
+ return None
+ return n.attrib[key]
+
+
+def get_element_by_id(id, html, **kwargs):
+ """Return the content of the tag with the specified ID in the passed HTML document"""
+ return get_element_by_attribute('id', id, html, **kwargs)
+
+
+def get_element_html_by_id(id, html, **kwargs):
+ """Return the html of the tag with the specified ID in the passed HTML document"""
+ return get_element_html_by_attribute('id', id, html, **kwargs)
+
+
+def get_element_by_class(class_name, html):
+ """Return the content of the first tag with the specified class in the passed HTML document"""
+ retval = get_elements_by_class(class_name, html)
+ return retval[0] if retval else None
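+
+# Illustrative example (class matching is token-based, not substring-based):
+#   >>> get_element_by_class('foo', '<div class="foo bar">content</div>')
+#   'content'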
+
+
+def get_element_html_by_class(class_name, html):
+ """Return the html of the first tag with the specified class in the passed HTML document"""
+ retval = get_elements_html_by_class(class_name, html)
+ return retval[0] if retval else None
+
+
+def get_element_by_attribute(attribute, value, html, **kwargs):
+ retval = get_elements_by_attribute(attribute, value, html, **kwargs)
+ return retval[0] if retval else None
+
+
+def get_element_html_by_attribute(attribute, value, html, **kargs):
+ retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
+ return retval[0] if retval else None
+
+
+def get_elements_by_class(class_name, html, **kargs):
+ """Return the content of all tags with the specified class in the passed HTML document as a list"""
+ return get_elements_by_attribute(
+ 'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
+ html, escape_value=False)
+
+
+def get_elements_html_by_class(class_name, html):
+ """Return the html of all tags with the specified class in the passed HTML document as a list"""
+ return get_elements_html_by_attribute(
+ 'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
+ html, escape_value=False)
+
+
+def get_elements_by_attribute(*args, **kwargs):
+ """Return the content of the tag with the specified attribute in the passed HTML document"""
+ return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
+
+
+def get_elements_html_by_attribute(*args, **kwargs):
+ """Return the html of the tag with the specified attribute in the passed HTML document"""
+ return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
+
+
+def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
+ """
+ Return the text (content) and the html (whole) of the tag with the specified
+ attribute in the passed HTML document
+ """
+ if not value:
+ return
+
+ quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
+
+ value = re.escape(value) if escape_value else value
+
+ partial_element_re = rf'''(?x)
+ <(?P<tag>{tag})
+ (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
+ \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
+ '''
+
+ for m in re.finditer(partial_element_re, html):
+ content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
+
+ yield (
+ unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
+ whole
+ )
+
+
+class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
+ """
+ HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
+ closing tag for the first opening tag it has encountered, and can be used
+ as a context manager
+ """
+
+ class HTMLBreakOnClosingTagException(Exception):
+ pass
+
+ def __init__(self):
+ self.tagstack = collections.deque()
+ html.parser.HTMLParser.__init__(self)
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, *_):
+ self.close()
+
+ def close(self):
+ # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
+ # so data remains buffered; we no longer have any interest in it, thus
+ # override this method to discard it
+ pass
+
+ def handle_starttag(self, tag, _):
+ self.tagstack.append(tag)
+
+ def handle_endtag(self, tag):
+ if not self.tagstack:
+ raise compat_HTMLParseError('no tags in the stack')
+ while self.tagstack:
+ inner_tag = self.tagstack.pop()
+ if inner_tag == tag:
+ break
+ else:
+ raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
+ if not self.tagstack:
+ raise self.HTMLBreakOnClosingTagException()
+
+
+# XXX: This should be far less strict
+def get_element_text_and_html_by_tag(tag, html):
+ """
+    For the first element with the specified tag in the passed HTML document,
+    return its content (text) and the whole element (html)
+ """
+ def find_or_raise(haystack, needle, exc):
+ try:
+ return haystack.index(needle)
+ except ValueError:
+ raise exc
+ closing_tag = f'</{tag}>'
+ whole_start = find_or_raise(
+ html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
+ content_start = find_or_raise(
+ html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
+ content_start += whole_start + 1
+ with HTMLBreakOnClosingTagParser() as parser:
+ parser.feed(html[whole_start:content_start])
+ if not parser.tagstack or parser.tagstack[0] != tag:
+ raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
+ offset = content_start
+ while offset < len(html):
+ next_closing_tag_start = find_or_raise(
+ html[offset:], closing_tag,
+ compat_HTMLParseError(f'closing {tag} tag not found'))
+ next_closing_tag_end = next_closing_tag_start + len(closing_tag)
+ try:
+ parser.feed(html[offset:offset + next_closing_tag_end])
+ offset += next_closing_tag_end
+ except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
+ return html[content_start:offset + next_closing_tag_start], \
+ html[whole_start:offset + next_closing_tag_end]
+ raise compat_HTMLParseError('unexpected end of html')
+
+
+class HTMLAttributeParser(html.parser.HTMLParser):
+ """Trivial HTML parser to gather the attributes for a single element"""
+
+ def __init__(self):
+ self.attrs = {}
+ html.parser.HTMLParser.__init__(self)
+
+ def handle_starttag(self, tag, attrs):
+ self.attrs = dict(attrs)
+ raise compat_HTMLParseError('done')
+
+
+class HTMLListAttrsParser(html.parser.HTMLParser):
+ """HTML parser to gather the attributes for the elements of a list"""
+
+ def __init__(self):
+ html.parser.HTMLParser.__init__(self)
+ self.items = []
+ self._level = 0
+
+ def handle_starttag(self, tag, attrs):
+ if tag == 'li' and self._level == 0:
+ self.items.append(dict(attrs))
+ self._level += 1
+
+ def handle_endtag(self, tag):
+ self._level -= 1
+
+
+def extract_attributes(html_element):
+ """Given a string for an HTML element such as
+ <el
+ a="foo" B="bar" c="&98;az" d=boz
+ empty= noval entity="&amp;"
+ sq='"' dq="'"
+ >
+ Decode and return a dictionary of attributes.
+ {
+        'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
+ 'empty': '', 'noval': None, 'entity': '&',
+ 'sq': '"', 'dq': '\''
+ }.
+ """
+ parser = HTMLAttributeParser()
+ with contextlib.suppress(compat_HTMLParseError):
+ parser.feed(html_element)
+ parser.close()
+ return parser.attrs
+
+
+def parse_list(webpage):
+ """Given a string for an series of HTML <li> elements,
+ return a dictionary of their attributes"""
+ parser = HTMLListAttrsParser()
+ parser.feed(webpage)
+ parser.close()
+ return parser.items
+
+
+def clean_html(html):
+ """Clean an HTML snippet into a readable string"""
+
+ if html is None: # Convenience for sanitizing descriptions etc.
+ return html
+
+ html = re.sub(r'\s+', ' ', html)
+ html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
+ html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
+ # Strip html tags
+ html = re.sub('<.*?>', '', html)
+ # Replace html entities
+ html = unescapeHTML(html)
+ return html.strip()
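+
+# Illustrative example: tags are stripped while <br> becomes a newline:
+#   >>> clean_html('<p>Hello <b>world</b><br>and more</p>')
+#   'Hello world\nand more'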
+
+
+class LenientJSONDecoder(json.JSONDecoder):
+ # TODO: Write tests
+ def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
+ self.transform_source, self.ignore_extra = transform_source, ignore_extra
+ self._close_attempts = 2 * close_objects
+ super().__init__(*args, **kwargs)
+
+ @staticmethod
+ def _close_object(err):
+ doc = err.doc[:err.pos]
+ # We need to add comma first to get the correct error message
+ if err.msg.startswith('Expecting \',\''):
+ return doc + ','
+ elif not doc.endswith(','):
+ return
+
+ if err.msg.startswith('Expecting property name'):
+ return doc[:-1] + '}'
+ elif err.msg.startswith('Expecting value'):
+ return doc[:-1] + ']'
+
+ def decode(self, s):
+ if self.transform_source:
+ s = self.transform_source(s)
+ for attempt in range(self._close_attempts + 1):
+ try:
+ if self.ignore_extra:
+ return self.raw_decode(s.lstrip())[0]
+ return super().decode(s)
+ except json.JSONDecodeError as e:
+ if e.pos is None:
+ raise
+ elif attempt < self._close_attempts:
+ s = self._close_object(e)
+ if s is not None:
+ continue
+ raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
+ assert False, 'Too many attempts to decode JSON'
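+
+# Illustrative sketch: with close_objects, a truncated object such as
+# '{"a": 1' should be recoverable:
+#   >>> LenientJSONDecoder(close_objects=1).decode('{"a": 1')
+#   {'a': 1}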
+
+
+def sanitize_open(filename, open_mode):
+ """Try to open the given filename, and slightly tweak it if this fails.
+
+ Attempts to open the given filename. If this fails, it tries to change
+ the filename slightly, step by step, until it's either able to open it
+ or it fails and raises a final exception, like the standard open()
+ function.
+
+ It returns the tuple (stream, definitive_file_name).
+ """
+ if filename == '-':
+ if sys.platform == 'win32':
+ import msvcrt
+
+ # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
+ with contextlib.suppress(io.UnsupportedOperation):
+ msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
+ return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
+
+ for attempt in range(2):
+ try:
+ try:
+ if sys.platform == 'win32':
+ # FIXME: An exclusive lock also locks the file from being read.
+ # Since windows locks are mandatory, don't lock the file on windows (for now).
+ # Ref: https://github.com/hypervideo/hypervideo/issues/3124
+ raise LockingUnsupportedError()
+ stream = locked_file(filename, open_mode, block=False).__enter__()
+ except OSError:
+ stream = open(filename, open_mode)
+ return stream, filename
+ except OSError as err:
+ if attempt or err.errno in (errno.EACCES,):
+ raise
+ old_filename, filename = filename, sanitize_path(filename)
+ if old_filename == filename:
+ raise
+
+
+def timeconvert(timestr):
+ """Convert RFC 2822 defined time string into system timestamp"""
+ timestamp = None
+ timetuple = email.utils.parsedate_tz(timestr)
+ if timetuple is not None:
+ timestamp = email.utils.mktime_tz(timetuple)
+ return timestamp
+
+
+def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
+ """Sanitizes a string so it could be used as part of a filename.
+ @param restricted Use a stricter subset of allowed characters
+ @param is_id Whether this is an ID that should be kept unchanged if possible.
+ If unset, hypervideo's new sanitization rules are in effect
+ """
+ if s == '':
+ return ''
+
+ def replace_insane(char):
+ if restricted and char in ACCENT_CHARS:
+ return ACCENT_CHARS[char]
+ elif not restricted and char == '\n':
+ return '\0 '
+ elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
+ # Replace with their full-width unicode counterparts
+ return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
+ elif char == '?' or ord(char) < 32 or ord(char) == 127:
+ return ''
+ elif char == '"':
+ return '' if restricted else '\''
+ elif char == ':':
+ return '\0_\0-' if restricted else '\0 \0-'
+ elif char in '\\/|*<>':
+ return '\0_'
+ if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
+ return '\0_'
+ return char
+
+ # Replace look-alike Unicode glyphs
+ if restricted and (is_id is NO_DEFAULT or not is_id):
+ s = unicodedata.normalize('NFKC', s)
+ s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s) # Handle timestamps
+ result = ''.join(map(replace_insane, s))
+ if is_id is NO_DEFAULT:
+ result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result) # Remove repeated substitute chars
+ STRIP_RE = r'(?:\0.|[ _-])*'
+ result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result) # Remove substitute chars from start/end
+ result = result.replace('\0', '') or '_'
+
+ if not is_id:
+ while '__' in result:
+ result = result.replace('__', '_')
+ result = result.strip('_')
+ # Common case of "Foreign band name - English song title"
+ if restricted and result.startswith('-_'):
+ result = result[2:]
+ if result.startswith('-'):
+ result = '_' + result[len('-'):]
+ result = result.lstrip('.')
+ if not result:
+ result = '_'
+ return result
+
+
+def sanitize_path(s, force=False):
+ """Sanitizes and normalizes path on Windows"""
+ if sys.platform == 'win32':
+ force = False
+ drive_or_unc, _ = os.path.splitdrive(s)
+ elif force:
+ drive_or_unc = ''
+ else:
+ return s
+
+ norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
+ if drive_or_unc:
+ norm_path.pop(0)
+ sanitized_path = [
+ path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
+ for path_part in norm_path]
+ if drive_or_unc:
+ sanitized_path.insert(0, drive_or_unc + os.path.sep)
+ elif force and s and s[0] == os.path.sep:
+ sanitized_path.insert(0, os.path.sep)
+ return os.path.join(*sanitized_path)
+
+
+def sanitize_url(url, *, scheme='http'):
+ # Prepend protocol-less URLs with `http:` scheme in order to mitigate
+ # the number of unwanted failures due to missing protocol
+ if url is None:
+ return
+ elif url.startswith('//'):
+ return f'{scheme}:{url}'
+ # Fix some common typos seen so far
+ COMMON_TYPOS = (
+ # https://github.com/ytdl-org/youtube-dl/issues/15649
+ (r'^httpss://', r'https://'),
+ # https://bx1.be/lives/direct-tv/
+ (r'^rmtp([es]?)://', r'rtmp\1://'),
+ )
+ for mistake, fixup in COMMON_TYPOS:
+ if re.match(mistake, url):
+ return re.sub(mistake, fixup, url)
+ return url
+
+
+def extract_basic_auth(url):
+ parts = urllib.parse.urlsplit(url)
+ if parts.username is None:
+ return url, None
+ url = urllib.parse.urlunsplit(parts._replace(netloc=(
+ parts.hostname if parts.port is None
+ else '%s:%d' % (parts.hostname, parts.port))))
+ auth_payload = base64.b64encode(
+ ('%s:%s' % (parts.username, parts.password or '')).encode())
+ return url, f'Basic {auth_payload.decode()}'
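+
+# Illustrative example: credentials are moved out of the netloc into a header:
+#   >>> extract_basic_auth('http://user:pass@example.com/path')
+#   ('http://example.com/path', 'Basic dXNlcjpwYXNz')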
+
+
+def expand_path(s):
+ """Expand shell variables and ~"""
+ return os.path.expandvars(compat_expanduser(s))
+
+
+def orderedSet(iterable, *, lazy=False):
+ """Remove all duplicates from the input iterable"""
+ def _iter():
+ seen = [] # Do not use set since the items can be unhashable
+ for x in iterable:
+ if x not in seen:
+ seen.append(x)
+ yield x
+
+ return _iter() if lazy else list(_iter())
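+
+# Illustrative example: order is preserved while duplicates are dropped:
+#   >>> orderedSet([1, 2, 1, 3, 2])
+#   [1, 2, 3]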
+
+
+def _htmlentity_transform(entity_with_semicolon):
+ """Transforms an HTML entity to a character."""
+ entity = entity_with_semicolon[:-1]
+
+ # Known non-numeric HTML entity
+ if entity in html.entities.name2codepoint:
+ return chr(html.entities.name2codepoint[entity])
+
+ # TODO: HTML5 allows entities without a semicolon.
+ # E.g. '&Eacuteric' should be decoded as 'Éric'.
+ if entity_with_semicolon in html.entities.html5:
+ return html.entities.html5[entity_with_semicolon]
+
+ mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
+ if mobj is not None:
+ numstr = mobj.group(1)
+ if numstr.startswith('x'):
+ base = 16
+ numstr = '0%s' % numstr
+ else:
+ base = 10
+ # See https://github.com/ytdl-org/youtube-dl/issues/7518
+ with contextlib.suppress(ValueError):
+ return chr(int(numstr, base))
+
+ # Unknown entity in name, return its literal representation
+ return '&%s;' % entity
+
+
+def unescapeHTML(s):
+ if s is None:
+ return None
+ assert isinstance(s, str)
+
+ return re.sub(
+ r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
+
+
+def escapeHTML(text):
+ return (
+ text
+ .replace('&', '&amp;')
+ .replace('<', '&lt;')
+ .replace('>', '&gt;')
+ .replace('"', '&quot;')
+ .replace("'", '&#39;')
+ )
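+
+# Illustrative examples:
+#   >>> unescapeHTML('Tom &amp; Jerry')
+#   'Tom & Jerry'
+#   >>> escapeHTML('<a href="x">')
+#   '&lt;a href=&quot;x&quot;&gt;'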
+
+
+class netrc_from_content(netrc.netrc):
+ def __init__(self, content):
+ self.hosts, self.macros = {}, {}
+ with io.StringIO(content) as stream:
+ self._parse('-', stream, False)
+
+
+class Popen(subprocess.Popen):
+ if sys.platform == 'win32':
+ _startupinfo = subprocess.STARTUPINFO()
+ _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
+ else:
+ _startupinfo = None
+
+ @staticmethod
+ def _fix_pyinstaller_ld_path(env):
+ """Restore LD_LIBRARY_PATH when using PyInstaller
+ Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
+ https://github.com/hypervideo/hypervideo/issues/4573
+ """
+ if not hasattr(sys, '_MEIPASS'):
+ return
+
+ def _fix(key):
+ orig = env.get(f'{key}_ORIG')
+ if orig is None:
+ env.pop(key, None)
+ else:
+ env[key] = orig
+
+ _fix('LD_LIBRARY_PATH') # Linux
+ _fix('DYLD_LIBRARY_PATH') # macOS
+
+ def __init__(self, *args, env=None, text=False, **kwargs):
+ if env is None:
+ env = os.environ.copy()
+ self._fix_pyinstaller_ld_path(env)
+
+ self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
+ if text is True:
+ kwargs['universal_newlines'] = True # For 3.6 compatibility
+ kwargs.setdefault('encoding', 'utf-8')
+ kwargs.setdefault('errors', 'replace')
+ super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)
+
+ def communicate_or_kill(self, *args, **kwargs):
+ try:
+ return self.communicate(*args, **kwargs)
+ except BaseException: # Including KeyboardInterrupt
+ self.kill(timeout=None)
+ raise
+
+ def kill(self, *, timeout=0):
+ super().kill()
+ if timeout != 0:
+ self.wait(timeout=timeout)
+
+ @classmethod
+ def run(cls, *args, timeout=None, **kwargs):
+ with cls(*args, **kwargs) as proc:
+ default = '' if proc.__text_mode else b''
+ stdout, stderr = proc.communicate_or_kill(timeout=timeout)
+ return stdout or default, stderr or default, proc.returncode
+
+
+def encodeArgument(s):
+ # Legacy code that uses byte strings
+ # Uncomment the following line after fixing all post processors
+ # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
+ return s if isinstance(s, str) else s.decode('ascii')
+
+
+_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
+
+
+def timetuple_from_msec(msec):
+ secs, msec = divmod(msec, 1000)
+ mins, secs = divmod(secs, 60)
+ hrs, mins = divmod(mins, 60)
+ return _timetuple(hrs, mins, secs, msec)
+
+
+def formatSeconds(secs, delim=':', msec=False):
+ time = timetuple_from_msec(secs * 1000)
+ if time.hours:
+ ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
+ elif time.minutes:
+ ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
+ else:
+ ret = '%d' % time.seconds
+ return '%s.%03d' % (ret, time.milliseconds) if msec else ret
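+
+# Illustrative examples:
+#   >>> timetuple_from_msec(123456789)
+#   Time(hours=34, minutes=17, seconds=36, milliseconds=789)
+#   >>> formatSeconds(3661)
+#   '1:01:01'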
+
+
+def bug_reports_message(before=';'):
+ msg = ('please report this issue on https://issues.hyperbola.info/ , '
+ 'filling out the appropriate issue template. '
+ 'Confirm you are on the latest version using pacman -Su')
+
+ before = before.rstrip()
+ if not before or before.endswith(('.', '!', '?')):
+ msg = msg[0].title() + msg[1:]
+
+ return (before + ' ' if before else '') + msg
+
+
+class YoutubeDLError(Exception):
+ """Base exception for YoutubeDL errors."""
+ msg = None
+
+ def __init__(self, msg=None):
+ if msg is not None:
+ self.msg = msg
+ elif self.msg is None:
+ self.msg = type(self).__name__
+ super().__init__(self.msg)
+
+
+class ExtractorError(YoutubeDLError):
+ """Error during info extraction."""
+
+ def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
+ """ tb, if given, is the original traceback (so that it can be printed out).
+ If expected is set, this is a normal error message and most likely not a bug in hypervideo.
+ """
+ from ..networking.exceptions import network_exceptions
+ if sys.exc_info()[0] in network_exceptions:
+ expected = True
+
+ self.orig_msg = str(msg)
+ self.traceback = tb
+ self.expected = expected
+ self.cause = cause
+ self.video_id = video_id
+ self.ie = ie
+ self.exc_info = sys.exc_info() # preserve original exception
+ if isinstance(self.exc_info[1], ExtractorError):
+ self.exc_info = self.exc_info[1].exc_info
+ super().__init__(self.__msg)
+
+ @property
+ def __msg(self):
+ return ''.join((
+ format_field(self.ie, None, '[%s] '),
+ format_field(self.video_id, None, '%s: '),
+ self.orig_msg,
+ format_field(self.cause, None, ' (caused by %r)'),
+ '' if self.expected else bug_reports_message()))
+
+ def format_traceback(self):
+ return join_nonempty(
+ self.traceback and ''.join(traceback.format_tb(self.traceback)),
+ self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
+ delim='\n') or None
+
+ def __setattr__(self, name, value):
+ super().__setattr__(name, value)
+ if getattr(self, 'msg', None) and name not in ('msg', 'args'):
+ self.msg = self.__msg or type(self).__name__
+ self.args = (self.msg, ) # Cannot be property
+
+
+class UnsupportedError(ExtractorError):
+ def __init__(self, url):
+ super().__init__(
+ 'Unsupported URL: %s' % url, expected=True)
+ self.url = url
+
+
+class RegexNotFoundError(ExtractorError):
+ """Error when a regex didn't match"""
+ pass
+
+
+class GeoRestrictedError(ExtractorError):
+ """Geographic restriction Error exception.
+
+ This exception may be thrown when a video is not available from your
+ geographic location due to geographic restrictions imposed by a website.
+ """
+
+ def __init__(self, msg, countries=None, **kwargs):
+ kwargs['expected'] = True
+ super().__init__(msg, **kwargs)
+ self.countries = countries
+
+
+class UserNotLive(ExtractorError):
+ """Error when a channel/user is not live"""
+
+ def __init__(self, msg=None, **kwargs):
+ kwargs['expected'] = True
+ super().__init__(msg or 'The channel is not currently live', **kwargs)
+
+
+class DownloadError(YoutubeDLError):
+ """Download Error exception.
+
+ This exception may be thrown by FileDownloader objects if they are not
+ configured to continue on errors. They will contain the appropriate
+ error message.
+ """
+
+ def __init__(self, msg, exc_info=None):
+ """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
+ super().__init__(msg)
+ self.exc_info = exc_info
+
+
+class EntryNotInPlaylist(YoutubeDLError):
+ """Entry not in playlist exception.
+
+ This exception will be thrown by YoutubeDL when a requested entry
+ is not found in the playlist info_dict
+ """
+ msg = 'Entry not found in info'
+
+
+class SameFileError(YoutubeDLError):
+ """Same File exception.
+
+ This exception will be thrown by FileDownloader objects if they detect
+ multiple files would have to be downloaded to the same file on disk.
+ """
+ msg = 'Fixed output name but more than one file to download'
+
+ def __init__(self, filename=None):
+ if filename is not None:
+ self.msg += f': {filename}'
+ super().__init__(self.msg)
+
+
+class PostProcessingError(YoutubeDLError):
+ """Post Processing exception.
+
+ This exception may be raised by PostProcessor's .run() method to
+ indicate an error in the postprocessing task.
+ """
+
+
+class DownloadCancelled(YoutubeDLError):
+ """ Exception raised when the download queue should be interrupted """
+ msg = 'The download was cancelled'
+
+
+class ExistingVideoReached(DownloadCancelled):
+ """ --break-on-existing triggered """
+ msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
+
+
+class RejectedVideoReached(DownloadCancelled):
+ """ --break-match-filter triggered """
+ msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
+
+
+class MaxDownloadsReached(DownloadCancelled):
+ """ --max-downloads limit has been reached. """
+ msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
+
+
+class ReExtractInfo(YoutubeDLError):
+ """ Video info needs to be re-extracted. """
+
+ def __init__(self, msg, expected=False):
+ super().__init__(msg)
+ self.expected = expected
+
+
+class ThrottledDownload(ReExtractInfo):
+ """ Download speed below --throttled-rate. """
+ msg = 'The download speed is below throttle limit'
+
+ def __init__(self):
+ super().__init__(self.msg, expected=False)
+
+
+class UnavailableVideoError(YoutubeDLError):
+ """Unavailable Format exception.
+
+ This exception will be thrown when a video is requested
+ in a format that is not available for that video.
+ """
+ msg = 'Unable to download video'
+
+ def __init__(self, err=None):
+ if err is not None:
+ self.msg += f': {err}'
+ super().__init__(self.msg)
+
+
+class ContentTooShortError(YoutubeDLError):
+ """Content Too Short exception.
+
+ This exception may be raised by FileDownloader objects when a file they
+ download is too small for what the server announced first, indicating
+ the connection was probably interrupted.
+ """
+
+ def __init__(self, downloaded, expected):
+ super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
+ # Both in bytes
+ self.downloaded = downloaded
+ self.expected = expected
+
+
+class XAttrMetadataError(YoutubeDLError):
+ def __init__(self, code=None, msg='Unknown error'):
+ super().__init__(msg)
+ self.code = code
+ self.msg = msg
+
+ # Parsing code and msg
+ if (self.code in (errno.ENOSPC, errno.EDQUOT)
+ or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
+ self.reason = 'NO_SPACE'
+ elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
+ self.reason = 'VALUE_TOO_LONG'
+ else:
+ self.reason = 'NOT_SUPPORTED'
+
+
+class XAttrUnavailableError(YoutubeDLError):
+ pass
+
+
+def is_path_like(f):
+ return isinstance(f, (str, bytes, os.PathLike))
+
+
+def extract_timezone(date_str):
+ m = re.search(
+ r'''(?x)
+ ^.{8,}? # >=8 char non-TZ prefix, if present
+ (?P<tz>Z| # just the UTC Z, or
+ (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
+ (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
+ [ ]? # optional space
+ (?P<sign>\+|-) # +/-
+ (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
+ $)
+ ''', date_str)
+ if not m:
+ m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
+ timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
+ if timezone is not None:
+ date_str = date_str[:-len(m.group('tz'))]
+ timezone = datetime.timedelta(hours=timezone or 0)
+ else:
+ date_str = date_str[:-len(m.group('tz'))]
+ if not m.group('sign'):
+ timezone = datetime.timedelta()
+ else:
+ sign = 1 if m.group('sign') == '+' else -1
+ timezone = datetime.timedelta(
+ hours=sign * int(m.group('hours')),
+ minutes=sign * int(m.group('minutes')))
+ return timezone, date_str
+
+
+def parse_iso8601(date_str, delimiter='T', timezone=None):
+ """ Return a UNIX timestamp from the given date """
+
+ if date_str is None:
+ return None
+
+ date_str = re.sub(r'\.[0-9]+', '', date_str)
+
+ if timezone is None:
+ timezone, date_str = extract_timezone(date_str)
+
+ with contextlib.suppress(ValueError):
+ date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
+ dt = datetime.datetime.strptime(date_str, date_format) - timezone
+ return calendar.timegm(dt.timetuple())
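+
+# Illustrative example: one day after the epoch, in UTC:
+#   >>> parse_iso8601('1970-01-02T00:00:00Z')
+#   86400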
+
+
+def date_formats(day_first=True):
+ return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
+
+
+def unified_strdate(date_str, day_first=True):
+ """Return a string with the date in the format YYYYMMDD"""
+
+ if date_str is None:
+ return None
+ upload_date = None
+ # Replace commas
+ date_str = date_str.replace(',', ' ')
+ # Remove AM/PM + timezone
+ date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
+ _, date_str = extract_timezone(date_str)
+
+ for expression in date_formats(day_first):
+ with contextlib.suppress(ValueError):
+ upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
+ if upload_date is None:
+ timetuple = email.utils.parsedate_tz(date_str)
+ if timetuple:
+ with contextlib.suppress(ValueError):
+ upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
+ if upload_date is not None:
+ return str(upload_date)
+
+
+def unified_timestamp(date_str, day_first=True):
+ if not isinstance(date_str, str):
+ return None
+
+ date_str = re.sub(r'\s+', ' ', re.sub(
+ r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))
+
+ pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
+ timezone, date_str = extract_timezone(date_str)
+
+ # Remove AM/PM + timezone
+ date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
+
+ # Remove unrecognized timezones from ISO 8601 alike timestamps
+ m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
+ if m:
+ date_str = date_str[:-len(m.group('tz'))]
+
+ # Python only supports microseconds, so remove nanoseconds
+ m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
+ if m:
+ date_str = m.group(1)
+
+ for expression in date_formats(day_first):
+ with contextlib.suppress(ValueError):
+ dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
+ return calendar.timegm(dt.timetuple())
+
+ timetuple = email.utils.parsedate_tz(date_str)
+ if timetuple:
+ return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
+
+
+def determine_ext(url, default_ext='unknown_video'):
+ if url is None or '.' not in url:
+ return default_ext
+ guess = url.partition('?')[0].rpartition('.')[2]
+ if re.match(r'^[A-Za-z0-9]+$', guess):
+ return guess
+ # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
+ elif guess.rstrip('/') in KNOWN_EXTENSIONS:
+ return guess.rstrip('/')
+ else:
+ return default_ext
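+
+# Illustrative examples: the query string is ignored when guessing:
+#   >>> determine_ext('https://example.com/video.mp4?download=1')
+#   'mp4'
+#   >>> determine_ext('https://example.com/stream')
+#   'unknown_video'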
+
+
+def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
+ return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
+
+
+def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
+ R"""
+ Return a datetime object from a string.
+ Supported format:
+ (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
+
+ @param format strftime format of DATE
+ @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
+ auto: round to the unit provided in date_str (if applicable).
+ """
+ auto_precision = False
+ if precision == 'auto':
+ auto_precision = True
+ precision = 'microsecond'
+ today = datetime_round(datetime.datetime.utcnow(), precision)
+ if date_str in ('now', 'today'):
+ return today
+ if date_str == 'yesterday':
+ return today - datetime.timedelta(days=1)
+ match = re.match(
+ r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
+ date_str)
+ if match is not None:
+ start_time = datetime_from_str(match.group('start'), precision, format)
+ time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
+ unit = match.group('unit')
+ if unit == 'month' or unit == 'year':
+ new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
+ unit = 'day'
+ else:
+ if unit == 'week':
+ unit = 'day'
+ time *= 7
+ delta = datetime.timedelta(**{unit + 's': time})
+ new_date = start_time + delta
+ if auto_precision:
+ return datetime_round(new_date, unit)
+ return new_date
+
+ return datetime_round(datetime.datetime.strptime(date_str, format), precision)
+
+
+def date_from_str(date_str, format='%Y%m%d', strict=False):
+ R"""
+ Return a date object from a string using datetime_from_str
+
+ @param strict Restrict allowed patterns to "YYYYMMDD" and
+ (now|today|yesterday)(-\d+(day|week|month|year)s?)?
+ """
+ if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
+ raise ValueError(f'Invalid date format "{date_str}"')
+ return datetime_from_str(date_str, precision='microsecond', format=format).date()
+
+
+def datetime_add_months(dt, months):
+ """Increment/Decrement a datetime object by months."""
+ month = dt.month + months - 1
+ year = dt.year + month // 12
+ month = month % 12 + 1
+ day = min(dt.day, calendar.monthrange(year, month)[1])
+ return dt.replace(year, month, day)
+
+
+def datetime_round(dt, precision='day'):
+ """
+ Round a datetime object's time to a specific precision
+ """
+ if precision == 'microsecond':
+ return dt
+
+ unit_seconds = {
+ 'day': 86400,
+ 'hour': 3600,
+ 'minute': 60,
+ 'second': 1,
+ }
+ roundto = lambda x, n: ((x + n / 2) // n) * n
+ timestamp = calendar.timegm(dt.timetuple())
+ return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
+
+
+def hyphenate_date(date_str):
+ """
+ Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
+ match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
+ if match is not None:
+ return '-'.join(match.groups())
+ else:
+ return date_str
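+
+# Illustrative example:
+#   >>> hyphenate_date('20230904')
+#   '2023-09-04'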
+
+
+class DateRange:
+ """Represents a time interval between two dates"""
+
+ def __init__(self, start=None, end=None):
+ """start and end must be strings in the format accepted by date"""
+ if start is not None:
+ self.start = date_from_str(start, strict=True)
+ else:
+ self.start = datetime.datetime.min.date()
+ if end is not None:
+ self.end = date_from_str(end, strict=True)
+ else:
+ self.end = datetime.datetime.max.date()
+ if self.start > self.end:
+            raise ValueError('Date range: "%s", the start date must be before the end date' % self)
+
+ @classmethod
+ def day(cls, day):
+ """Returns a range that only contains the given day"""
+ return cls(day, day)
+
+ def __contains__(self, date):
+ """Check if the date is in the range"""
+ if not isinstance(date, datetime.date):
+ date = date_from_str(date)
+ return self.start <= date <= self.end
+
+ def __repr__(self):
+ return f'{__name__}.{type(self).__name__}({self.start.isoformat()!r}, {self.end.isoformat()!r})'
+
+ def __eq__(self, other):
+ return (isinstance(other, DateRange)
+ and self.start == other.start and self.end == other.end)
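+
+# Illustrative example: membership accepts date objects or YYYYMMDD strings:
+#   >>> '20230904' in DateRange('20230101', '20231231')
+#   True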
+
+
+@functools.cache
+def system_identifier():
+ python_implementation = platform.python_implementation()
+ if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
+ python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
+ libc_ver = []
+ with contextlib.suppress(OSError): # We may not have access to the executable
+ libc_ver = platform.libc_ver()
+
+ return 'Python %s (%s %s %s) - %s (%s%s)' % (
+ platform.python_version(),
+ python_implementation,
+ platform.machine(),
+ platform.architecture()[0],
+ platform.platform(),
+ ssl.OPENSSL_VERSION,
+ format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
+ )
+
+
+@functools.cache
+def get_windows_version():
+ ''' Get Windows version. returns () if it's not running on Windows '''
+ if compat_os_name == 'nt':
+ return version_tuple(platform.win32_ver()[1])
+ else:
+ return ()
+
+
+def write_string(s, out=None, encoding=None):
+ assert isinstance(s, str)
+ out = out or sys.stderr
+ # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
+ if not out:
+ return
+
+ if compat_os_name == 'nt' and supports_terminal_sequences(out):
+ s = re.sub(r'([\r\n]+)', r' \1', s)
+
+ enc, buffer = None, out
+ if 'b' in getattr(out, 'mode', ''):
+ enc = encoding or preferredencoding()
+ elif hasattr(out, 'buffer'):
+ buffer = out.buffer
+ enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
+
+ buffer.write(s.encode(enc, 'ignore') if enc else s)
+ out.flush()
+
+
+# TODO: Use global logger
+def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
+ from .. import _IN_CLI
+ if _IN_CLI:
+ if msg in deprecation_warning._cache:
+ return
+ deprecation_warning._cache.add(msg)
+ if printer:
+ return printer(f'{msg}{bug_reports_message()}', **kwargs)
+ return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)
+ else:
+ import warnings
+ warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
+
+
+deprecation_warning._cache = set()
+
+
+def bytes_to_intlist(bs):
+ if not bs:
+ return []
+ if isinstance(bs[0], int): # Python 3
+ return list(bs)
+ else:
+ return [ord(c) for c in bs]
+
+
+def intlist_to_bytes(xs):
+ if not xs:
+ return b''
+ return struct.pack('%dB' % len(xs), *xs)
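+
+# Illustrative round-trip:
+#   >>> bytes_to_intlist(b'ab')
+#   [97, 98]
+#   >>> intlist_to_bytes([97, 98])
+#   b'ab'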
+
+
+class LockingUnsupportedError(OSError):
+ msg = 'File locking is not supported'
+
+ def __init__(self):
+ super().__init__(self.msg)
+
+
+# Cross-platform file locking
+if sys.platform == 'win32':
+ import ctypes
+ import ctypes.wintypes
+ import msvcrt
+
+ class OVERLAPPED(ctypes.Structure):
+ _fields_ = [
+ ('Internal', ctypes.wintypes.LPVOID),
+ ('InternalHigh', ctypes.wintypes.LPVOID),
+ ('Offset', ctypes.wintypes.DWORD),
+ ('OffsetHigh', ctypes.wintypes.DWORD),
+ ('hEvent', ctypes.wintypes.HANDLE),
+ ]
+
+ kernel32 = ctypes.WinDLL('kernel32')
+ LockFileEx = kernel32.LockFileEx
+ LockFileEx.argtypes = [
+ ctypes.wintypes.HANDLE, # hFile
+ ctypes.wintypes.DWORD, # dwFlags
+ ctypes.wintypes.DWORD, # dwReserved
+ ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
+ ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
+ ctypes.POINTER(OVERLAPPED) # Overlapped
+ ]
+ LockFileEx.restype = ctypes.wintypes.BOOL
+ UnlockFileEx = kernel32.UnlockFileEx
+ UnlockFileEx.argtypes = [
+ ctypes.wintypes.HANDLE, # hFile
+ ctypes.wintypes.DWORD, # dwReserved
+ ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
+ ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
+ ctypes.POINTER(OVERLAPPED) # Overlapped
+ ]
+ UnlockFileEx.restype = ctypes.wintypes.BOOL
+ whole_low = 0xffffffff
+ whole_high = 0x7fffffff
+
+ def _lock_file(f, exclusive, block):
+ overlapped = OVERLAPPED()
+ overlapped.Offset = 0
+ overlapped.OffsetHigh = 0
+ overlapped.hEvent = 0
+ f._lock_file_overlapped_p = ctypes.pointer(overlapped)
+
+ if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
+ (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
+ 0, whole_low, whole_high, f._lock_file_overlapped_p):
+            # NB: The no-argument form of "ctypes.FormatError" does not work on PyPy
+ raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
+
+ def _unlock_file(f):
+ assert f._lock_file_overlapped_p
+ handle = msvcrt.get_osfhandle(f.fileno())
+ if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
+ raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
+
+else:
+ try:
+ import fcntl
+
+ def _lock_file(f, exclusive, block):
+ flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
+ if not block:
+ flags |= fcntl.LOCK_NB
+ try:
+ fcntl.flock(f, flags)
+ except BlockingIOError:
+ raise
+ except OSError: # AOSP does not have flock()
+ fcntl.lockf(f, flags)
+
+ def _unlock_file(f):
+ with contextlib.suppress(OSError):
+ return fcntl.flock(f, fcntl.LOCK_UN)
+ with contextlib.suppress(OSError):
+ return fcntl.lockf(f, fcntl.LOCK_UN) # AOSP does not have flock()
+ return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB) # virtiofs needs LOCK_NB on unlocking
+
+ except ImportError:
+
+ def _lock_file(f, exclusive, block):
+ raise LockingUnsupportedError()
+
+ def _unlock_file(f):
+ raise LockingUnsupportedError()
+
+
+class locked_file:
+ locked = False
+
+ def __init__(self, filename, mode, block=True, encoding=None):
+ if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
+ raise NotImplementedError(mode)
+ self.mode, self.block = mode, block
+
+ writable = any(f in mode for f in 'wax+')
+ readable = any(f in mode for f in 'r+')
+ flags = functools.reduce(operator.ior, (
+ getattr(os, 'O_CLOEXEC', 0), # UNIX only
+ getattr(os, 'O_BINARY', 0), # Windows only
+ getattr(os, 'O_NOINHERIT', 0), # Windows only
+ os.O_CREAT if writable else 0, # O_TRUNC only after locking
+ os.O_APPEND if 'a' in mode else 0,
+ os.O_EXCL if 'x' in mode else 0,
+ os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
+ ))
+
+ self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
+
+ def __enter__(self):
+ exclusive = 'r' not in self.mode
+ try:
+ _lock_file(self.f, exclusive, self.block)
+ self.locked = True
+ except OSError:
+ self.f.close()
+ raise
+ if 'w' in self.mode:
+ try:
+ self.f.truncate()
+ except OSError as e:
+ if e.errno not in (
+ errno.ESPIPE, # Illegal seek - expected for FIFO
+ errno.EINVAL, # Invalid argument - expected for /dev/null
+ ):
+ raise
+ return self
+
+ def unlock(self):
+ if not self.locked:
+ return
+ try:
+ _unlock_file(self.f)
+ finally:
+ self.locked = False
+
+ def __exit__(self, *_):
+ try:
+ self.unlock()
+ finally:
+ self.f.close()
+
+ open = __enter__
+ close = __exit__
+
+ def __getattr__(self, attr):
+ return getattr(self.f, attr)
+
+ def __iter__(self):
+ return iter(self.f)
+
+
+@functools.cache
+def get_filesystem_encoding():
+ encoding = sys.getfilesystemencoding()
+ return encoding if encoding is not None else 'utf-8'
+
+
+def shell_quote(args):
+ quoted_args = []
+ encoding = get_filesystem_encoding()
+ for a in args:
+ if isinstance(a, bytes):
+ # We may get a filename encoded with 'encodeFilename'
+ a = a.decode(encoding)
+ quoted_args.append(compat_shlex_quote(a))
+ return ' '.join(quoted_args)
+
+
+def smuggle_url(url, data):
+ """ Pass additional data in a URL for internal use. """
+
+ url, idata = unsmuggle_url(url, {})
+ data.update(idata)
+ sdata = urllib.parse.urlencode(
+ {'__youtubedl_smuggle': json.dumps(data)})
+ return url + '#' + sdata
+
+
+def unsmuggle_url(smug_url, default=None):
+ if '#__youtubedl_smuggle' not in smug_url:
+ return smug_url, default
+ url, _, sdata = smug_url.rpartition('#')
+ jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
+ data = json.loads(jsond)
+ return url, data
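+
+# Illustrative round-trip: the data is carried in the URL fragment:
+#   >>> unsmuggle_url(smuggle_url('https://example.com/v', {'referer': 'x'}))
+#   ('https://example.com/v', {'referer': 'x'})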
+
+
+def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
+ """ Formats numbers with decimal sufixes like K, M, etc """
+ num, factor = float_or_none(num), float(factor)
+ if num is None or num < 0:
+ return None
+ POSSIBLE_SUFFIXES = 'kMGTPEZY'
+ exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
+ suffix = ['', *POSSIBLE_SUFFIXES][exponent]
+ if factor == 1024:
+ suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
+ converted = num / (factor ** exponent)
+ return fmt % (converted, suffix)
+
+
+def format_bytes(bytes):
+ return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
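+
+# Illustrative examples:
+#   >>> format_decimal_suffix(1500)
+#   '1k'
+#   >>> format_bytes(1536)
+#   '1.50KiB'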
+
+
+def lookup_unit_table(unit_table, s, strict=False):
+ num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
+ units_re = '|'.join(re.escape(u) for u in unit_table)
+ m = (re.fullmatch if strict else re.match)(
+ rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
+ if not m:
+ return None
+
+ num = float(m.group('num').replace(',', '.'))
+ mult = unit_table[m.group('unit')]
+ return round(num * mult)
+
+
+def parse_bytes(s):
+ """Parse a string indicating a byte quantity into an integer"""
+ return lookup_unit_table(
+ {u: 1024**i for i, u in enumerate(['', *'KMGTPEZY'])},
+ s.upper(), strict=True)
+
+
+def parse_filesize(s):
+ if s is None:
+ return None
+
+ # The lower-case forms are of course incorrect and unofficial,
+ # but we support those too
+ _UNIT_TABLE = {
+ 'B': 1,
+ 'b': 1,
+ 'bytes': 1,
+ 'KiB': 1024,
+ 'KB': 1000,
+ 'kB': 1024,
+ 'Kb': 1000,
+ 'kb': 1000,
+ 'kilobytes': 1000,
+ 'kibibytes': 1024,
+ 'MiB': 1024 ** 2,
+ 'MB': 1000 ** 2,
+ 'mB': 1024 ** 2,
+ 'Mb': 1000 ** 2,
+ 'mb': 1000 ** 2,
+ 'megabytes': 1000 ** 2,
+ 'mebibytes': 1024 ** 2,
+ 'GiB': 1024 ** 3,
+ 'GB': 1000 ** 3,
+ 'gB': 1024 ** 3,
+ 'Gb': 1000 ** 3,
+ 'gb': 1000 ** 3,
+ 'gigabytes': 1000 ** 3,
+ 'gibibytes': 1024 ** 3,
+ 'TiB': 1024 ** 4,
+ 'TB': 1000 ** 4,
+ 'tB': 1024 ** 4,
+ 'Tb': 1000 ** 4,
+ 'tb': 1000 ** 4,
+ 'terabytes': 1000 ** 4,
+ 'tebibytes': 1024 ** 4,
+ 'PiB': 1024 ** 5,
+ 'PB': 1000 ** 5,
+ 'pB': 1024 ** 5,
+ 'Pb': 1000 ** 5,
+ 'pb': 1000 ** 5,
+ 'petabytes': 1000 ** 5,
+ 'pebibytes': 1024 ** 5,
+ 'EiB': 1024 ** 6,
+ 'EB': 1000 ** 6,
+ 'eB': 1024 ** 6,
+ 'Eb': 1000 ** 6,
+ 'eb': 1000 ** 6,
+ 'exabytes': 1000 ** 6,
+ 'exbibytes': 1024 ** 6,
+ 'ZiB': 1024 ** 7,
+ 'ZB': 1000 ** 7,
+ 'zB': 1024 ** 7,
+ 'Zb': 1000 ** 7,
+ 'zb': 1000 ** 7,
+ 'zettabytes': 1000 ** 7,
+ 'zebibytes': 1024 ** 7,
+ 'YiB': 1024 ** 8,
+ 'YB': 1000 ** 8,
+ 'yB': 1024 ** 8,
+ 'Yb': 1000 ** 8,
+ 'yb': 1000 ** 8,
+ 'yottabytes': 1000 ** 8,
+ 'yobibytes': 1024 ** 8,
+ }
+
+ return lookup_unit_table(_UNIT_TABLE, s)
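+
+# Sketch of expected behaviour (example strings are made up):
+#   >>> parse_filesize('1.2MiB')
+#   1258291
+#   >>> parse_filesize('5 GB')
+#   5000000000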
+
+
+def parse_count(s):
+ if s is None:
+ return None
+
+ s = re.sub(r'^[^\d]+\s', '', s).strip()
+
+ if re.match(r'^[\d,.]+$', s):
+ return str_to_int(s)
+
+ _UNIT_TABLE = {
+ 'k': 1000,
+ 'K': 1000,
+ 'm': 1000 ** 2,
+ 'M': 1000 ** 2,
+ 'kk': 1000 ** 2,
+ 'KK': 1000 ** 2,
+ 'b': 1000 ** 3,
+ 'B': 1000 ** 3,
+ }
+
+ ret = lookup_unit_table(_UNIT_TABLE, s)
+ if ret is not None:
+ return ret
+
+ mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
+ if mobj:
+ return str_to_int(mobj.group(1))
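+
+# Illustrative examples (hypothetical count strings):
+#   >>> parse_count('1.2M')
+#   1200000
+#   >>> parse_count('1,000 views')
+#   1000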
+
+
+def parse_resolution(s, *, lenient=False):
+ if s is None:
+ return {}
+
+ if lenient:
+ mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
+ else:
+ mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
+ if mobj:
+ return {
+ 'width': int(mobj.group('w')),
+ 'height': int(mobj.group('h')),
+ }
+
+ mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
+ if mobj:
+ return {'height': int(mobj.group(1))}
+
+ mobj = re.search(r'\b([48])[kK]\b', s)
+ if mobj:
+ return {'height': int(mobj.group(1)) * 540}
+
+ return {}
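+
+# For example (made-up resolution strings):
+#   >>> parse_resolution('1920x1080')
+#   {'width': 1920, 'height': 1080}
+#   >>> parse_resolution('720p')
+#   {'height': 720}
+#   >>> parse_resolution('4k')
+#   {'height': 2160}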
+
+
+def parse_bitrate(s):
+ if not isinstance(s, str):
+ return
+ mobj = re.search(r'\b(\d+)\s*kbps', s)
+ if mobj:
+ return int(mobj.group(1))
+
+
+def month_by_name(name, lang='en'):
+ """ Return the number of a month by (locale-independently) English name """
+
+ month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
+
+ try:
+ return month_names.index(name) + 1
+ except ValueError:
+ return None
+
+
+def month_by_abbreviation(abbrev):
+ """ Return the number of a month by (locale-independently) English
+ abbreviations """
+
+ try:
+ return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
+ except ValueError:
+ return None
+
+
+def fix_xml_ampersands(xml_str):
+ """Replace all the '&' by '&amp;' in XML"""
+ return re.sub(
+ r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
+ '&amp;',
+ xml_str)
+
+
+def setproctitle(title):
+ assert isinstance(title, str)
+
+ # Workaround for https://github.com/hypervideo/hypervideo/issues/4541
+ try:
+ import ctypes
+ except ImportError:
+ return
+
+ try:
+ libc = ctypes.cdll.LoadLibrary('libc.so.6')
+ except OSError:
+ return
+ except TypeError:
+ # LoadLibrary in Windows Python 2.7.13 only expects
+ # a bytestring, but since unicode_literals turns
+ # every string into a unicode string, it fails.
+ return
+ title_bytes = title.encode()
+ buf = ctypes.create_string_buffer(len(title_bytes))
+ buf.value = title_bytes
+ try:
+ libc.prctl(15, buf, 0, 0, 0)
+ except AttributeError:
+ return # Strange libc, just skip this
+
+
+def remove_start(s, start):
+ return s[len(start):] if s is not None and s.startswith(start) else s
+
+
+def remove_end(s, end):
+ return s[:-len(end)] if s is not None and s.endswith(end) else s
+
+
+def remove_quotes(s):
+ if s is None or len(s) < 2:
+ return s
+ for quote in ('"', "'", ):
+ if s[0] == quote and s[-1] == quote:
+ return s[1:-1]
+ return s
+
+
+def get_domain(url):
+ """
+ This implementation is inconsistent, but is kept for compatibility.
+ Use this only for "webpage_url_domain"
+ """
+ return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
+
+
+def url_basename(url):
+ path = urllib.parse.urlparse(url).path
+ return path.strip('/').split('/')[-1]
+
+
+def base_url(url):
+ return re.match(r'https?://[^?#]+/', url).group()
+
+
+def urljoin(base, path):
+ if isinstance(path, bytes):
+ path = path.decode()
+ if not isinstance(path, str) or not path:
+ return None
+ if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
+ return path
+ if isinstance(base, bytes):
+ base = base.decode()
+ if not isinstance(base, str) or not re.match(
+ r'^(?:https?:)?//', base):
+ return None
+ return urllib.parse.urljoin(base, path)
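+
+# Expected behaviour sketch (hypothetical URLs):
+#   >>> urljoin('https://example.com/a/b', 'c')
+#   'https://example.com/a/c'
+#   >>> urljoin('https://example.com/a/', '//cdn.example.com/x')
+#   '//cdn.example.com/x'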
+
+
+def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
+ if get_attr and v is not None:
+ v = getattr(v, get_attr, None)
+ try:
+ return int(v) * invscale // scale
+ except (ValueError, TypeError, OverflowError):
+ return default
+
+
+def str_or_none(v, default=None):
+ return default if v is None else str(v)
+
+
+def str_to_int(int_str):
+ """ A more relaxed version of int_or_none """
+ if isinstance(int_str, int):
+ return int_str
+ elif isinstance(int_str, str):
+ int_str = re.sub(r'[,\.\+]', '', int_str)
+ return int_or_none(int_str)
+
+
+def float_or_none(v, scale=1, invscale=1, default=None):
+ if v is None:
+ return default
+ try:
+ return float(v) * invscale / scale
+ except (ValueError, TypeError):
+ return default
+
+
+def bool_or_none(v, default=None):
+ return v if isinstance(v, bool) else default
+
+
+def strip_or_none(v, default=None):
+ return v.strip() if isinstance(v, str) else default
+
+
+def url_or_none(url):
+ if not url or not isinstance(url, str):
+ return None
+ url = url.strip()
+ return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
+
+
+def strftime_or_none(timestamp, date_format='%Y%m%d', default=None):
+ datetime_object = None
+ try:
+ if isinstance(timestamp, (int, float)): # unix timestamp
+ # Using naive datetime here can break timestamp() in Windows
+ # Ref: https://github.com/hypervideo/hypervideo/issues/5185, https://github.com/python/cpython/issues/94414
+ # Also, datetime.datetime.fromtimestamp breaks for negative timestamps
+ # Ref: https://github.com/hypervideo/hypervideo/issues/6706#issuecomment-1496842642
+ datetime_object = (datetime.datetime.fromtimestamp(0, datetime.timezone.utc)
+ + datetime.timedelta(seconds=timestamp))
+ elif isinstance(timestamp, str): # assume YYYYMMDD
+ datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
+ date_format = re.sub( # Support %s on windows
+ r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
+ return datetime_object.strftime(date_format)
+ except (ValueError, TypeError, AttributeError):
+ return default
+
+
+def parse_duration(s):
+ if not isinstance(s, str):
+ return None
+ s = s.strip()
+ if not s:
+ return None
+
+ days, hours, mins, secs, ms = [None] * 5
+ m = re.match(r'''(?x)
+ (?P<before_secs>
+ (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
+ (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
+ (?P<ms>[.:][0-9]+)?Z?$
+ ''', s)
+ if m:
+ days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
+ else:
+ m = re.match(
+ r'''(?ix)(?:P?
+ (?:
+ [0-9]+\s*y(?:ears?)?,?\s*
+ )?
+ (?:
+ [0-9]+\s*m(?:onths?)?,?\s*
+ )?
+ (?:
+ [0-9]+\s*w(?:eeks?)?,?\s*
+ )?
+ (?:
+ (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
+ )?
+ T)?
+ (?:
+ (?P<hours>[0-9]+)\s*h(?:(?:ou)?rs?)?,?\s*
+ )?
+ (?:
+ (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
+ )?
+ (?:
+ (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
+ )?Z?$''', s)
+ if m:
+ days, hours, mins, secs, ms = m.groups()
+ else:
+ m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
+ if m:
+ hours, mins = m.groups()
+ else:
+ return None
+
+ if ms:
+ ms = ms.replace(':', '.')
+ return sum(float(part or 0) * mult for part, mult in (
+ (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
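+
+# Illustrative inputs covering the three accepted syntaxes (values made up):
+#   >>> parse_duration('1:23:45')
+#   5025.0
+#   >>> parse_duration('2h 30m')
+#   9000.0
+#   >>> parse_duration('PT1M30S')
+#   90.0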
+
+
+def prepend_extension(filename, ext, expected_real_ext=None):
+ name, real_ext = os.path.splitext(filename)
+ return (
+ f'{name}.{ext}{real_ext}'
+ if not expected_real_ext or real_ext[1:] == expected_real_ext
+ else f'{filename}.{ext}')
+
+
+def replace_extension(filename, ext, expected_real_ext=None):
+ name, real_ext = os.path.splitext(filename)
+ return '{}.{}'.format(
+ name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
+ ext)
+
+
+def check_executable(exe, args=[]):
+ """ Checks if the given binary is installed somewhere in PATH, and returns its name.
+ args can be a list of arguments for a short output (like -version) """
+ try:
+ Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ except OSError:
+ return False
+ return exe
+
+
+def _get_exe_version_output(exe, args):
+ try:
+ # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
+ # SIGTTOU if hypervideo is run in the background.
+ # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
+ stdout, _, ret = Popen.run([encodeArgument(exe)] + args, text=True,
+ stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+ if ret:
+ return None
+ except OSError:
+ return False
+ return stdout
+
+
+def detect_exe_version(output, version_re=None, unrecognized='present'):
+ assert isinstance(output, str)
+ if version_re is None:
+ version_re = r'version\s+([-0-9._a-zA-Z]+)'
+ m = re.search(version_re, output)
+ if m:
+ return m.group(1)
+ else:
+ return unrecognized
+
+
+def get_exe_version(exe, args=['--version'],
+ version_re=None, unrecognized=('present', 'broken')):
+ """ Returns the version of the specified executable,
+ or False if the executable is not present """
+ unrecognized = variadic(unrecognized)
+ assert len(unrecognized) in (1, 2)
+ out = _get_exe_version_output(exe, args)
+ if out is None:
+ return unrecognized[-1]
+ return out and detect_exe_version(out, version_re, unrecognized[0])
+
+
+def frange(start=0, stop=None, step=1):
+ """Float range"""
+ if stop is None:
+ start, stop = 0, start
+ sign = [-1, 1][step > 0] if step else 0
+ while sign * start < sign * stop:
+ yield start
+ start += step
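+
+# e.g. list(frange(0, 1, 0.25)) == [0, 0.25, 0.5, 0.75]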
+
+
+class LazyList(collections.abc.Sequence):
+ """Lazy immutable list from an iterable
+ Note that slices of a LazyList are lists and not LazyList"""
+
+ class IndexError(IndexError):
+ pass
+
+ def __init__(self, iterable, *, reverse=False, _cache=None):
+ self._iterable = iter(iterable)
+ self._cache = [] if _cache is None else _cache
+ self._reversed = reverse
+
+ def __iter__(self):
+ if self._reversed:
+ # We need to consume the entire iterable to iterate in reverse
+ yield from self.exhaust()
+ return
+ yield from self._cache
+ for item in self._iterable:
+ self._cache.append(item)
+ yield item
+
+ def _exhaust(self):
+ self._cache.extend(self._iterable)
+ self._iterable = [] # Discard the emptied iterable to make it pickle-able
+ return self._cache
+
+ def exhaust(self):
+ """Evaluate the entire iterable"""
+ return self._exhaust()[::-1 if self._reversed else 1]
+
+ @staticmethod
+ def _reverse_index(x):
+ return None if x is None else ~x
+
+ def __getitem__(self, idx):
+ if isinstance(idx, slice):
+ if self._reversed:
+ idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
+ start, stop, step = idx.start, idx.stop, idx.step or 1
+ elif isinstance(idx, int):
+ if self._reversed:
+ idx = self._reverse_index(idx)
+ start, stop, step = idx, idx, 0
+ else:
+ raise TypeError('indices must be integers or slices')
+ if ((start or 0) < 0 or (stop or 0) < 0
+ or (start is None and step < 0)
+ or (stop is None and step > 0)):
+ # We need to consume the entire iterable to be able to slice from the end
+ # Obviously, never use this with infinite iterables
+ self._exhaust()
+ try:
+ return self._cache[idx]
+ except IndexError as e:
+ raise self.IndexError(e) from e
+ n = max(start or 0, stop or 0) - len(self._cache) + 1
+ if n > 0:
+ self._cache.extend(itertools.islice(self._iterable, n))
+ try:
+ return self._cache[idx]
+ except IndexError as e:
+ raise self.IndexError(e) from e
+
+ def __bool__(self):
+ try:
+ self[-1] if self._reversed else self[0]
+ except self.IndexError:
+ return False
+ return True
+
+ def __len__(self):
+ self._exhaust()
+ return len(self._cache)
+
+ def __reversed__(self):
+ return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
+
+ def __copy__(self):
+ return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
+
+ def __repr__(self):
+ # repr and str should mimic a list. So we exhaust the iterable
+ return repr(self.exhaust())
+
+ def __str__(self):
+ return repr(self.exhaust())
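+
+# Illustrative use with an infinite iterator (sketch; len() or reverse
+# iteration would exhaust the underlying iterable, so avoid them here):
+#   >>> ll = LazyList(itertools.count())
+#   >>> ll[:4]  # consumes only the first four items
+#   [0, 1, 2, 3]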
+
+
+class PagedList:
+
+ class IndexError(IndexError):
+ pass
+
+ def __len__(self):
+ # This is only useful for tests
+ return len(self.getslice())
+
+ def __init__(self, pagefunc, pagesize, use_cache=True):
+ self._pagefunc = pagefunc
+ self._pagesize = pagesize
+ self._pagecount = float('inf')
+ self._use_cache = use_cache
+ self._cache = {}
+
+ def getpage(self, pagenum):
+ page_results = self._cache.get(pagenum)
+ if page_results is None:
+ page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
+ if self._use_cache:
+ self._cache[pagenum] = page_results
+ return page_results
+
+ def getslice(self, start=0, end=None):
+ return list(self._getslice(start, end))
+
+ def _getslice(self, start, end):
+ raise NotImplementedError('This method must be implemented by subclasses')
+
+ def __getitem__(self, idx):
+ assert self._use_cache, 'Indexing PagedList requires cache'
+ if not isinstance(idx, int) or idx < 0:
+ raise TypeError('indices must be non-negative integers')
+ entries = self.getslice(idx, idx + 1)
+ if not entries:
+ raise self.IndexError()
+ return entries[0]
+
+
+class OnDemandPagedList(PagedList):
+ """Download pages until a page with less than maximum results"""
+
+ def _getslice(self, start, end):
+ for pagenum in itertools.count(start // self._pagesize):
+ firstid = pagenum * self._pagesize
+ nextfirstid = pagenum * self._pagesize + self._pagesize
+ if start >= nextfirstid:
+ continue
+
+ startv = (
+ start % self._pagesize
+ if firstid <= start < nextfirstid
+ else 0)
+ endv = (
+ ((end - 1) % self._pagesize) + 1
+ if (end is not None and firstid <= end <= nextfirstid)
+ else None)
+
+ try:
+ page_results = self.getpage(pagenum)
+ except Exception:
+ self._pagecount = pagenum - 1
+ raise
+ if startv != 0 or endv is not None:
+ page_results = page_results[startv:endv]
+ yield from page_results
+
+ # A little optimization - if the current page is not "full", i.e. does
+ # not contain page_size videos, then we can assume that this page
+ # is the last one - there are no more ids on further pages -
+ # i.e. no need to query again.
+ if len(page_results) + startv < self._pagesize:
+ break
+
+ # If we got the whole page, but the next page is not interesting,
+ # break out early as well
+ if end == nextfirstid:
+ break
+
+
+class InAdvancePagedList(PagedList):
+ """PagedList with total number of pages known in advance"""
+
+ def __init__(self, pagefunc, pagecount, pagesize):
+ PagedList.__init__(self, pagefunc, pagesize, True)
+ self._pagecount = pagecount
+
+ def _getslice(self, start, end):
+ start_page = start // self._pagesize
+ end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
+ skip_elems = start - start_page * self._pagesize
+ only_more = None if end is None else end - start
+ for pagenum in range(start_page, end_page):
+ page_results = self.getpage(pagenum)
+ if skip_elems:
+ page_results = page_results[skip_elems:]
+ skip_elems = None
+ if only_more is not None:
+ if len(page_results) < only_more:
+ only_more -= len(page_results)
+ else:
+ yield from page_results[:only_more]
+ break
+ yield from page_results
+
+
+class PlaylistEntries:
+ MissingEntry = object()
+ is_exhausted = False
+
+ def __init__(self, ydl, info_dict):
+ self.ydl = ydl
+
+ # _entries must be assigned now since infodict can change during iteration
+ entries = info_dict.get('entries')
+ if entries is None:
+ raise EntryNotInPlaylist('There are no entries')
+ elif isinstance(entries, list):
+ self.is_exhausted = True
+
+ requested_entries = info_dict.get('requested_entries')
+ self.is_incomplete = requested_entries is not None
+ if self.is_incomplete:
+ assert self.is_exhausted
+ self._entries = [self.MissingEntry] * max(requested_entries or [0])
+ for i, entry in zip(requested_entries, entries):
+ self._entries[i - 1] = entry
+ elif isinstance(entries, (list, PagedList, LazyList)):
+ self._entries = entries
+ else:
+ self._entries = LazyList(entries)
+
+ PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
+ (?P<start>[+-]?\d+)?
+ (?P<range>[:-]
+ (?P<end>[+-]?\d+|inf(?:inite)?)?
+ (?::(?P<step>[+-]?\d+))?
+ )?''')
+
+ @classmethod
+ def parse_playlist_items(cls, string):
+ for segment in string.split(','):
+ if not segment:
+ raise ValueError('There are two or more consecutive commas')
+ mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
+ if not mobj:
+ raise ValueError(f'{segment!r} is not a valid specification')
+ start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
+ if int_or_none(step) == 0:
+ raise ValueError(f'Step in {segment!r} cannot be zero')
+ yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
+
+ def get_requested_items(self):
+ playlist_items = self.ydl.params.get('playlist_items')
+ playlist_start = self.ydl.params.get('playliststart', 1)
+ playlist_end = self.ydl.params.get('playlistend')
+ # For backwards compatibility, interpret -1 as whole list
+ if playlist_end in (-1, None):
+ playlist_end = ''
+ if not playlist_items:
+ playlist_items = f'{playlist_start}:{playlist_end}'
+ elif playlist_start != 1 or playlist_end:
+ self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
+
+ for index in self.parse_playlist_items(playlist_items):
+ for i, entry in self[index]:
+ yield i, entry
+ if not entry:
+ continue
+ try:
+ # The item may have just been added to archive. Don't break due to it
+ if not self.ydl.params.get('lazy_playlist'):
+ # TODO: Add auto-generated fields
+ self.ydl._match_entry(entry, incomplete=True, silent=True)
+ except (ExistingVideoReached, RejectedVideoReached):
+ return
+
+ def get_full_count(self):
+ if self.is_exhausted and not self.is_incomplete:
+ return len(self)
+ elif isinstance(self._entries, InAdvancePagedList):
+ if self._entries._pagesize == 1:
+ return self._entries._pagecount
+
+ @functools.cached_property
+ def _getter(self):
+ if isinstance(self._entries, list):
+ def get_entry(i):
+ try:
+ entry = self._entries[i]
+ except IndexError:
+ entry = self.MissingEntry
+ if not self.is_incomplete:
+ raise self.IndexError()
+ if entry is self.MissingEntry:
+ raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
+ return entry
+ else:
+ def get_entry(i):
+ try:
+ return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
+ except (LazyList.IndexError, PagedList.IndexError):
+ raise self.IndexError()
+ return get_entry
+
+ def __getitem__(self, idx):
+ if isinstance(idx, int):
+ idx = slice(idx, idx)
+
+ # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
+ step = 1 if idx.step is None else idx.step
+ if idx.start is None:
+ start = 0 if step > 0 else len(self) - 1
+ else:
+ start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
+
+ # NB: Do not call len(self) when idx == [:]
+ if idx.stop is None:
+ stop = 0 if step < 0 else float('inf')
+ else:
+ stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
+ stop += [-1, 1][step > 0]
+
+ for i in frange(start, stop, step):
+ if i < 0:
+ continue
+ try:
+ entry = self._getter(i)
+ except self.IndexError:
+ self.is_exhausted = True
+ if step > 0:
+ break
+ continue
+ yield i + 1, entry
+
+ def __len__(self):
+ return len(tuple(self[:]))
+
+ class IndexError(IndexError):
+ pass
+
+
+def uppercase_escape(s):
+ unicode_escape = codecs.getdecoder('unicode_escape')
+ return re.sub(
+ r'\\U[0-9a-fA-F]{8}',
+ lambda m: unicode_escape(m.group(0))[0],
+ s)
+
+
+def lowercase_escape(s):
+ unicode_escape = codecs.getdecoder('unicode_escape')
+ return re.sub(
+ r'\\u[0-9a-fA-F]{4}',
+ lambda m: unicode_escape(m.group(0))[0],
+ s)
+
+
+def parse_qs(url, **kwargs):
+ return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs)
+
+
+def read_batch_urls(batch_fd):
+ def fixup(url):
+ if not isinstance(url, str):
+ url = url.decode('utf-8', 'replace')
+ BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
+ for bom in BOM_UTF8:
+ if url.startswith(bom):
+ url = url[len(bom):]
+ url = url.lstrip()
+ if not url or url.startswith(('#', ';', ']')):
+ return False
+ # "#" cannot be stripped out since it is part of the URI
+ # However, it can be safely stripped out if it follows a whitespace
+ return re.split(r'\s#', url, 1)[0].rstrip()
+
+ with contextlib.closing(batch_fd) as fd:
+ return [url for url in map(fixup, fd) if url]
+
+
+def urlencode_postdata(*args, **kargs):
+ return urllib.parse.urlencode(*args, **kargs).encode('ascii')
+
+
+def update_url(url, *, query_update=None, **kwargs):
+ """Replace URL components specified by kwargs
+ @param url str or parsed URL tuple
+ @param query_update update query
+ @returns str
+ """
+ if isinstance(url, str):
+ if not kwargs and not query_update:
+ return url
+ else:
+ url = urllib.parse.urlparse(url)
+ if query_update:
+ assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
+ kwargs['query'] = urllib.parse.urlencode({
+ **urllib.parse.parse_qs(url.query),
+ **query_update
+ }, True)
+ return urllib.parse.urlunparse(url._replace(**kwargs))
+
+
+def update_url_query(url, query):
+ return update_url(url, query_update=query)
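+
+# For example (hypothetical URL):
+#   >>> update_url_query('https://example.com/path?a=1', {'b': '2'})
+#   'https://example.com/path?a=1&b=2'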
+
+
+def _multipart_encode_impl(data, boundary):
+ content_type = 'multipart/form-data; boundary=%s' % boundary
+
+ out = b''
+ for k, v in data.items():
+ out += b'--' + boundary.encode('ascii') + b'\r\n'
+ if isinstance(k, str):
+ k = k.encode()
+ if isinstance(v, str):
+ v = v.encode()
+ # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
+ # suggests sending UTF-8 directly. Firefox sends UTF-8, too
+ content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
+ if boundary.encode('ascii') in content:
+ raise ValueError('Boundary overlaps with data')
+ out += content
+
+ out += b'--' + boundary.encode('ascii') + b'--\r\n'
+
+ return out, content_type
+
+
+def multipart_encode(data, boundary=None):
+ '''
+ Encode a dict to RFC 7578-compliant form-data
+
+ data:
+ A dict where keys and values can be either Unicode or bytes-like
+ objects.
+ boundary:
+ If specified, it must be a Unicode object and is used as the boundary.
+ Otherwise a random boundary is generated.
+
+ Reference: https://tools.ietf.org/html/rfc7578
+ '''
+ has_specified_boundary = boundary is not None
+
+ while True:
+ if boundary is None:
+ boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
+
+ try:
+ out, content_type = _multipart_encode_impl(data, boundary)
+ break
+ except ValueError:
+ if has_specified_boundary:
+ raise
+ boundary = None
+
+ return out, content_type
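+
+# Illustrative use with a fixed (hypothetical) boundary:
+#   >>> body, ctype = multipart_encode({'field': 'value'}, boundary='XXX')
+#   >>> ctype
+#   'multipart/form-data; boundary=XXX'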
+
+
+def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
+ if blocked_types is NO_DEFAULT:
+ blocked_types = (str, bytes, collections.abc.Mapping)
+ return isinstance(x, allowed_types) and not isinstance(x, blocked_types)
+
+
+def variadic(x, allowed_types=NO_DEFAULT):
+ if not isinstance(allowed_types, (tuple, type)):
+ deprecation_warning('allowed_types should be a tuple or a type')
+ allowed_types = tuple(allowed_types)
+ return x if is_iterable_like(x, blocked_types=allowed_types) else (x, )
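+
+# e.g. variadic('spam') == ('spam',) while variadic([1, 2]) == [1, 2]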
+
+
+def try_call(*funcs, expected_type=None, args=[], kwargs={}):
+ for f in funcs:
+ try:
+ val = f(*args, **kwargs)
+ except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
+ pass
+ else:
+ if expected_type is None or isinstance(val, expected_type):
+ return val
+
+
+def try_get(src, getter, expected_type=None):
+ return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
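+
+# Sketch (hypothetical dict; failed lookups yield None instead of raising):
+#   >>> try_get({'a': [1]}, lambda x: x['a'][0], int)
+#   1
+#   >>> try_get({}, lambda x: x['a'][0]) is None
+#   True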
+
+
+def filter_dict(dct, cndn=lambda _, v: v is not None):
+ return {k: v for k, v in dct.items() if cndn(k, v)}
+
+
+def merge_dicts(*dicts):
+ merged = {}
+ for a_dict in dicts:
+ for k, v in a_dict.items():
+ if (v is not None and k not in merged
+ or isinstance(v, str) and merged[k] == ''):
+ merged[k] = v
+ return merged
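+
+# Illustration: earlier dicts win, except that an empty string can be
+# overwritten by a later string value (example values made up):
+#   >>> merge_dicts({'title': ''}, {'title': 'Example', 'id': 'x'})
+#   {'title': 'Example', 'id': 'x'}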
+
+
+def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
+ return string if isinstance(string, str) else str(string, encoding, errors)
+
+
+US_RATINGS = {
+ 'G': 0,
+ 'PG': 10,
+ 'PG-13': 13,
+ 'R': 16,
+ 'NC': 18,
+}
+
+
+TV_PARENTAL_GUIDELINES = {
+ 'TV-Y': 0,
+ 'TV-Y7': 7,
+ 'TV-G': 0,
+ 'TV-PG': 0,
+ 'TV-14': 14,
+ 'TV-MA': 17,
+}
+
+
+def parse_age_limit(s):
+ # isinstance(False, int) is True. So type() must be used instead
+ if type(s) is int: # noqa: E721
+ return s if 0 <= s <= 21 else None
+ elif not isinstance(s, str):
+ return None
+ m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
+ if m:
+ return int(m.group('age'))
+ s = s.upper()
+ if s in US_RATINGS:
+ return US_RATINGS[s]
+ m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
+ if m:
+ return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
+ return None
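+
+# e.g. (hypothetical rating strings):
+#   >>> parse_age_limit('18+')
+#   18
+#   >>> parse_age_limit('PG-13')
+#   13
+#   >>> parse_age_limit('TV-MA')
+#   17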
+
+
+def strip_jsonp(code):
+ return re.sub(
+ r'''(?sx)^
+ (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
+ (?:\s*&&\s*(?P=func_name))?
+ \s*\(\s*(?P<callback_data>.*)\);?
+ \s*?(?://[^\n]*)*$''',
+ r'\g<callback_data>', code)
+
+
+def js_to_json(code, vars={}, *, strict=False):
+ # vars is a dict of var, val pairs to substitute
+ STRING_QUOTES = '\'"`'
+ STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
+ COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
+ SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
+ INTEGER_TABLE = (
+ (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
+ (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
+ )
+
+ def process_escape(match):
+ JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
+ escape = match.group(1) or match.group(2)
+
+ return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
+ else R'\u00' if escape == 'x'
+ else '' if escape == '\n'
+ else escape)
+
+ def template_substitute(match):
+ evaluated = js_to_json(match.group(1), vars, strict=strict)
+ if evaluated[0] == '"':
+ return json.loads(evaluated)
+ return evaluated
+
+ def fix_kv(m):
+ v = m.group(0)
+ if v in ('true', 'false', 'null'):
+ return v
+ elif v in ('undefined', 'void 0'):
+ return 'null'
+ elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
+ return ''
+
+ if v[0] in STRING_QUOTES:
+ v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
+ escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
+ return f'"{escaped}"'
+
+ for regex, base in INTEGER_TABLE:
+ im = re.match(regex, v)
+ if im:
+ i = int(im.group(1), base)
+ return f'"{i}":' if v.endswith(':') else str(i)
+
+ if v in vars:
+ try:
+ if not strict:
+ json.loads(vars[v])
+ except json.JSONDecodeError:
+ return json.dumps(vars[v])
+ else:
+ return vars[v]
+
+ if not strict:
+ return f'"{v}"'
+
+ raise ValueError(f'Unknown value: {v}')
+
+ def create_map(mobj):
+ return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
+
+ code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
+ if not strict:
+ code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
+ code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
+ code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
+ code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)
+
+ return re.sub(rf'''(?sx)
+ {STRING_RE}|
+ {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
+ void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
+ \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
+ [0-9]+(?={SKIP_RE}:)|
+ !+
+ ''', fix_kv, code)
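+
+# Illustrative conversion (a made-up JS object literal):
+#   >>> js_to_json("{a: 'b', c: 0x1A, d: undefined,}")
+#   '{"a": "b", "c": 26, "d": null}'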
+
+
+def qualities(quality_ids):
+ """ Get a numeric quality value out of a list of possible values """
+ def q(qid):
+ try:
+ return quality_ids.index(qid)
+ except ValueError:
+ return -1
+ return q
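+
+# For example:
+#   >>> q = qualities(['240p', '360p', '720p'])
+#   >>> q('720p'), q('1080p')  # unknown IDs sort below all known ones
+#   (2, -1)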
+
+
+POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
+
+
+DEFAULT_OUTTMPL = {
+ 'default': '%(title)s [%(id)s].%(ext)s',
+ 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
+}
+OUTTMPL_TYPES = {
+ 'chapter': None,
+ 'subtitle': None,
+ 'thumbnail': None,
+ 'description': 'description',
+ 'annotation': 'annotations.xml',
+ 'infojson': 'info.json',
+ 'link': None,
+ 'pl_video': None,
+ 'pl_thumbnail': None,
+ 'pl_description': 'description',
+ 'pl_infojson': 'info.json',
+}
+
+# As of [1] format syntax is:
+# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
+# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
+STR_FORMAT_RE_TMPL = r'''(?x)
+ (?<!%)(?P<prefix>(?:%%)*)
+ %
+ (?P<has_key>\((?P<key>{0})\))?
+ (?P<format>
+ (?P<conversion>[#0\-+ ]+)?
+ (?P<min_width>\d+)?
+ (?P<precision>\.\d+)?
+ (?P<len_mod>[hlL])? # unused in python
+ {1} # conversion type
+ )
+'''
+
+
+STR_FORMAT_TYPES = 'diouxXeEfFgGcrsa'
+
+
+def limit_length(s, length):
+ """ Add ellipses to overly long strings """
+ if s is None:
+ return None
+ ELLIPSES = '...'
+ if len(s) > length:
+ return s[:length - len(ELLIPSES)] + ELLIPSES
+ return s
+
+
+def version_tuple(v):
+ return tuple(int(e) for e in re.split(r'[-.]', v))
+
+
+def is_outdated_version(version, limit, assume_new=True):
+ if not version:
+ return not assume_new
+ try:
+ return version_tuple(version) < version_tuple(limit)
+ except ValueError:
+ return not assume_new
+
+
+def ytdl_is_updateable():
+ """ Returns if hypervideo can be updated with -U """
+
+ from ..update import is_non_updateable
+
+ return not is_non_updateable()
+
+
+def args_to_str(args):
+ # Get a short string representation for a subprocess command
+ return ' '.join(compat_shlex_quote(a) for a in args)
+
+
+def error_to_str(err):
+ return f'{type(err).__name__}: {err}'
+
+
+def mimetype2ext(mt, default=NO_DEFAULT):
+ if not isinstance(mt, str):
+ if default is not NO_DEFAULT:
+ return default
+ return None
+
+ MAP = {
+ # video
+ '3gpp': '3gp',
+ 'mp2t': 'ts',
+ 'mp4': 'mp4',
+ 'mpeg': 'mpeg',
+ 'mpegurl': 'm3u8',
+ 'quicktime': 'mov',
+ 'webm': 'webm',
+ 'vp9': 'vp9',
+ 'x-flv': 'flv',
+ 'x-m4v': 'm4v',
+ 'x-matroska': 'mkv',
+ 'x-mng': 'mng',
+ 'x-mp4-fragmented': 'mp4',
+ 'x-ms-asf': 'asf',
+ 'x-ms-wmv': 'wmv',
+ 'x-msvideo': 'avi',
+
+ # application (streaming playlists)
+ 'dash+xml': 'mpd',
+ 'f4m+xml': 'f4m',
+ 'hds+xml': 'f4m',
+ 'vnd.apple.mpegurl': 'm3u8',
+ 'vnd.ms-sstr+xml': 'ism',
+ 'x-mpegurl': 'm3u8',
+
+ # audio
+ 'audio/mp4': 'm4a',
+ # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
+ # Using .mp3 as it's the most popular one
+ 'audio/mpeg': 'mp3',
+ 'audio/webm': 'webm',
+ 'audio/x-matroska': 'mka',
+ 'audio/x-mpegurl': 'm3u',
+ 'midi': 'mid',
+ 'ogg': 'ogg',
+ 'wav': 'wav',
+ 'wave': 'wav',
+ 'x-aac': 'aac',
+ 'x-flac': 'flac',
+ 'x-m4a': 'm4a',
+ 'x-realaudio': 'ra',
+ 'x-wav': 'wav',
+
+ # image
+ 'avif': 'avif',
+ 'bmp': 'bmp',
+ 'gif': 'gif',
+ 'jpeg': 'jpg',
+ 'png': 'png',
+ 'svg+xml': 'svg',
+ 'tiff': 'tif',
+ 'vnd.wap.wbmp': 'wbmp',
+ 'webp': 'webp',
+ 'x-icon': 'ico',
+ 'x-jng': 'jng',
+ 'x-ms-bmp': 'bmp',
+
+ # caption
+ 'filmstrip+json': 'fs',
+ 'smptett+xml': 'tt',
+ 'ttaf+xml': 'dfxp',
+ 'ttml+xml': 'ttml',
+ 'x-ms-sami': 'sami',
+
+ # misc
+ 'gzip': 'gz',
+ 'json': 'json',
+ 'xml': 'xml',
+ 'zip': 'zip',
+ }
+
+ mimetype = mt.partition(';')[0].strip().lower()
+ _, _, subtype = mimetype.rpartition('/')
+
+ ext = traversal.traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])
+ if ext:
+ return ext
+ elif default is not NO_DEFAULT:
+ return default
+ return subtype.replace('+', '.')
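+
+# Illustrative lookups (hypothetical MIME strings):
+#   >>> mimetype2ext('video/mp4; codecs="avc1.64001F"')
+#   'mp4'
+#   >>> mimetype2ext('application/vnd.apple.mpegurl')
+#   'm3u8'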
+
+
+def ext2mimetype(ext_or_url):
+ if not ext_or_url:
+ return None
+ if '.' not in ext_or_url:
+ ext_or_url = f'file.{ext_or_url}'
+ return mimetypes.guess_type(ext_or_url)[0]
+
+
+def parse_codecs(codecs_str):
+ # http://tools.ietf.org/html/rfc6381
+ if not codecs_str:
+ return {}
+ split_codecs = list(filter(None, map(
+ str.strip, codecs_str.strip().strip(',').split(','))))
+ vcodec, acodec, scodec, hdr = None, None, None, None
+ for full_codec in split_codecs:
+ parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
+ if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
+ 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
+ if vcodec:
+ continue
+ vcodec = full_codec
+ if parts[0] in ('dvh1', 'dvhe'):
+ hdr = 'DV'
+ elif parts[0] == 'av1' and traversal.traverse_obj(parts, 3) == '10':
+ hdr = 'HDR10'
+ elif parts[:2] == ['vp9', '2']:
+ hdr = 'HDR10'
+ elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
+ 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
+ acodec = acodec or full_codec
+ elif parts[0] in ('stpp', 'wvtt'):
+ scodec = scodec or full_codec
+ else:
+ write_string(f'WARNING: Unknown codec {full_codec}\n')
+ if vcodec or acodec or scodec:
+ return {
+ 'vcodec': vcodec or 'none',
+ 'acodec': acodec or 'none',
+ 'dynamic_range': hdr,
+ **({'scodec': scodec} if scodec is not None else {}),
+ }
+ elif len(split_codecs) == 2:
+ return {
+ 'vcodec': split_codecs[0],
+ 'acodec': split_codecs[1],
+ }
+ return {}
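+
+# Sketch of a typical result (made-up codec string):
+#   >>> parse_codecs('avc1.64001F, mp4a.40.2')
+#   {'vcodec': 'avc1.64001F', 'acodec': 'mp4a.40.2', 'dynamic_range': None}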
+
+
+def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
+ assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)
+
+ allow_mkv = not preferences or 'mkv' in preferences
+
+ if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
+ return 'mkv' # TODO: any other format allows this?
+
+ # TODO: Not all codecs supported by parse_codecs are handled here
+ COMPATIBLE_CODECS = {
+ 'mp4': {
+ 'av1', 'hevc', 'avc1', 'mp4a', 'ac-4', # fourcc (m3u8, mpd)
+ 'h264', 'aacl', 'ec-3', # Set in ISM
+ },
+ 'webm': {
+ 'av1', 'vp9', 'vp8', 'opus', 'vrbs',
+ 'vp9x', 'vp8x', # in the webm spec
+ },
+ }
+
+ sanitize_codec = functools.partial(
+ try_get, getter=lambda x: x[0].split('.')[0].replace('0', '').lower())
+ vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)
+
+ for ext in preferences or COMPATIBLE_CODECS.keys():
+ codec_set = COMPATIBLE_CODECS.get(ext, set())
+ if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
+ return ext
+
+ COMPATIBLE_EXTS = (
+ {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
+ {'webm', 'weba'},
+ )
+ for ext in preferences or vexts:
+ current_exts = {ext, *vexts, *aexts}
+ if ext == 'mkv' or current_exts == {ext} or any(
+ ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
+ return ext
+ return 'mkv' if allow_mkv else preferences[-1]
+
+
+def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
+ getheader = url_handle.headers.get
+
+ cd = getheader('Content-Disposition')
+ if cd:
+ m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
+ if m:
+ e = determine_ext(m.group('filename'), default_ext=None)
+ if e:
+ return e
+
+ meta_ext = getheader('x-amz-meta-name')
+ if meta_ext:
+ e = meta_ext.rpartition('.')[2]
+ if e:
+ return e
+
+ return mimetype2ext(getheader('Content-Type'), default=default)
+
+
+def encode_data_uri(data, mime_type):
+ return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
+
+
+def age_restricted(content_limit, age_limit):
+ """ Returns True iff the content should be blocked """
+
+ if age_limit is None: # No limit set
+ return False
+ if content_limit is None:
+ return False # Content available for everyone
+ return age_limit < content_limit
+
+
+# List of known byte-order-marks (BOM)
+BOMS = [
+ (b'\xef\xbb\xbf', 'utf-8'),
+ (b'\x00\x00\xfe\xff', 'utf-32-be'),
+ (b'\xff\xfe\x00\x00', 'utf-32-le'),
+ (b'\xff\xfe', 'utf-16-le'),
+ (b'\xfe\xff', 'utf-16-be'),
+]
+
+
+def is_html(first_bytes):
+ """ Detect whether a file contains HTML by examining its first bytes. """
+
+ encoding = 'utf-8'
+ for bom, enc in BOMS:
+ while first_bytes.startswith(bom):
+ encoding, first_bytes = enc, first_bytes[len(bom):]
+
+ return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
+
+
+def determine_protocol(info_dict):
+ protocol = info_dict.get('protocol')
+ if protocol is not None:
+ return protocol
+
+ url = sanitize_url(info_dict['url'])
+ if url.startswith('rtmp'):
+ return 'rtmp'
+ elif url.startswith('mms'):
+ return 'mms'
+ elif url.startswith('rtsp'):
+ return 'rtsp'
+
+ ext = determine_ext(url)
+ if ext == 'm3u8':
+ return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
+ elif ext == 'f4m':
+ return 'f4m'
+
+ return urllib.parse.urlparse(url).scheme
+
+
+def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
+ """ Render a list of rows, each as a list of values.
+ Text after a \t will be right aligned """
+ def width(string):
+ return len(remove_terminal_sequences(string).replace('\t', ''))
+
+ def get_max_lens(table):
+ return [max(width(str(v)) for v in col) for col in zip(*table)]
+
+ def filter_using_list(row, filterArray):
+ return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
+
+ max_lens = get_max_lens(data) if hide_empty else []
+ header_row = filter_using_list(header_row, max_lens)
+ data = [filter_using_list(row, max_lens) for row in data]
+
+ table = [header_row] + data
+ max_lens = get_max_lens(table)
+ extra_gap += 1
+ if delim:
+ table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
+ table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
+ for row in table:
+ for pos, text in enumerate(map(str, row)):
+ if '\t' in text:
+ row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
+ else:
+ row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
+ ret = '\n'.join(''.join(row).rstrip() for row in table)
+ return ret
+
+
+def _match_one(filter_part, dct, incomplete):
+ # TODO: Generalize code with YoutubeDL._build_format_filter
+ STRING_OPERATORS = {
+ '*=': operator.contains,
+ '^=': lambda attr, value: attr.startswith(value),
+ '$=': lambda attr, value: attr.endswith(value),
+ '~=': lambda attr, value: re.search(value, attr),
+ }
+ COMPARISON_OPERATORS = {
+ **STRING_OPERATORS,
+ '<=': operator.le, # "<=" must be defined above "<"
+ '<': operator.lt,
+ '>=': operator.ge,
+ '>': operator.gt,
+ '=': operator.eq,
+ }
+
+ if isinstance(incomplete, bool):
+ is_incomplete = lambda _: incomplete
+ else:
+ is_incomplete = lambda k: k in incomplete
+
+ operator_rex = re.compile(r'''(?x)
+ (?P<key>[a-z_]+)
+ \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
+ (?:
+ (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
+ (?P<strval>.+?)
+ )
+ ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
+ m = operator_rex.fullmatch(filter_part.strip())
+ if m:
+ m = m.groupdict()
+ unnegated_op = COMPARISON_OPERATORS[m['op']]
+ if m['negation']:
+ op = lambda attr, value: not unnegated_op(attr, value)
+ else:
+ op = unnegated_op
+ comparison_value = m['quotedstrval'] or m['strval']
+ if m['quote']:
+ comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
+ actual_value = dct.get(m['key'])
+ numeric_comparison = None
+ if isinstance(actual_value, (int, float)):
+ # If the original field is a string and the matching comparison value is
+ # a number, we should respect the origin of the original field
+ # and process the comparison value as a string (see
+ # https://github.com/ytdl-org/youtube-dl/issues/11082)
+ try:
+ numeric_comparison = int(comparison_value)
+ except ValueError:
+ numeric_comparison = parse_filesize(comparison_value)
+ if numeric_comparison is None:
+ numeric_comparison = parse_filesize(f'{comparison_value}B')
+ if numeric_comparison is None:
+ numeric_comparison = parse_duration(comparison_value)
+ if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
+ raise ValueError('Operator %s only supports string values!' % m['op'])
+ if actual_value is None:
+ return is_incomplete(m['key']) or m['none_inclusive']
+ return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
+
+ UNARY_OPERATORS = {
+ '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
+ '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
+ }
+ operator_rex = re.compile(r'''(?x)
+ (?P<op>%s)\s*(?P<key>[a-z_]+)
+ ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
+ m = operator_rex.fullmatch(filter_part.strip())
+ if m:
+ op = UNARY_OPERATORS[m.group('op')]
+ actual_value = dct.get(m.group('key'))
+ if is_incomplete(m.group('key')) and actual_value is None:
+ return True
+ return op(actual_value)
+
+ raise ValueError('Invalid filter part %r' % filter_part)
+
+
+def match_str(filter_str, dct, incomplete=False):
+ """ Filter a dictionary with a simple string syntax.
+ @returns Whether the filter passes
+ @param incomplete Set of keys that are expected to be missing from dct.
+ Can be True/False to indicate all/none of the keys may be missing.
+ All conditions on incomplete keys pass if the key is missing
+ """
+ return all(
+ _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
+ for filter_part in re.split(r'(?<!\\)&', filter_str))
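+
+# e.g. (hypothetical filter and info dict):
+#   >>> match_str('like_count>100 & !is_live', {'like_count': 190, 'is_live': False})
+#   True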
+
+
+def match_filter_func(filters, breaking_filters=None):
+ if not filters and not breaking_filters:
+ return None
+ breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
+ filters = set(variadic(filters or []))
+
+ interactive = '-' in filters
+ if interactive:
+ filters.remove('-')
+
+ def _match_func(info_dict, incomplete=False):
+ ret = breaking_filters(info_dict, incomplete)
+ if ret is not None:
+ raise RejectedVideoReached(ret)
+
+ if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
+ return NO_DEFAULT if interactive and not incomplete else None
+ else:
+ video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
+ filter_str = ') | ('.join(map(str.strip, filters))
+ return f'{video_title} does not pass filter ({filter_str}), skipping ..'
+ return _match_func
+
+
+class download_range_func:
+ def __init__(self, chapters, ranges, from_info=False):
+ self.chapters, self.ranges, self.from_info = chapters, ranges, from_info
+
+ def __call__(self, info_dict, ydl):
+
+ warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
+ else 'Cannot match chapters since chapter information is unavailable')
+ for regex in self.chapters or []:
+ for i, chapter in enumerate(info_dict.get('chapters') or []):
+ if re.search(regex, chapter['title']):
+ warning = None
+ yield {**chapter, 'index': i}
+ if self.chapters and warning:
+ ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
+
+ for start, end in self.ranges or []:
+ yield {
+ 'start_time': self._handle_negative_timestamp(start, info_dict),
+ 'end_time': self._handle_negative_timestamp(end, info_dict),
+ }
+
+ if self.from_info and (info_dict.get('start_time') or info_dict.get('end_time')):
+ yield {
+ 'start_time': info_dict.get('start_time') or 0,
+ 'end_time': info_dict.get('end_time') or float('inf'),
+ }
+ elif not self.ranges and not self.chapters:
+ yield {}
+
+ @staticmethod
+ def _handle_negative_timestamp(time, info):
+ return max(info['duration'] + time, 0) if info.get('duration') and time < 0 else time
+
+ def __eq__(self, other):
+ return (isinstance(other, download_range_func)
+ and self.chapters == other.chapters and self.ranges == other.ranges)
+
+ def __repr__(self):
+ return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})'
+
+
+def parse_dfxp_time_expr(time_expr):
+ if not time_expr:
+ return
+
+ mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
+ if mobj:
+ return float(mobj.group('time_offset'))
+
+ mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
+ if mobj:
+ return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
+
+
+def srt_subtitles_timecode(seconds):
+ return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
+
+
+def ass_subtitles_timecode(seconds):
+ time = timetuple_from_msec(seconds * 1000)
+ return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
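+
+# For example, both rendering 3661.5 seconds (value made up):
+#   >>> srt_subtitles_timecode(3661.5)
+#   '01:01:01,500'
+#   >>> ass_subtitles_timecode(3661.5)
+#   '1:01:01.50'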
+
+
+def dfxp2srt(dfxp_data):
+ '''
+ @param dfxp_data A bytes-like object containing DFXP data
+ @returns A unicode object containing converted SRT data
+ '''
+ LEGACY_NAMESPACES = (
+ (b'http://www.w3.org/ns/ttml', [
+ b'http://www.w3.org/2004/11/ttaf1',
+ b'http://www.w3.org/2006/04/ttaf1',
+ b'http://www.w3.org/2006/10/ttaf1',
+ ]),
+ (b'http://www.w3.org/ns/ttml#styling', [
+ b'http://www.w3.org/ns/ttml#style',
+ ]),
+ )
+
+ SUPPORTED_STYLING = [
+ 'color',
+ 'fontFamily',
+ 'fontSize',
+ 'fontStyle',
+ 'fontWeight',
+ 'textDecoration'
+ ]
+
+ _x = functools.partial(xpath_with_ns, ns_map={
+ 'xml': 'http://www.w3.org/XML/1998/namespace',
+ 'ttml': 'http://www.w3.org/ns/ttml',
+ 'tts': 'http://www.w3.org/ns/ttml#styling',
+ })
+
+ styles = {}
+ default_style = {}
+
+ class TTMLPElementParser:
+ _out = ''
+ _unclosed_elements = []
+ _applied_styles = []
+
+ def start(self, tag, attrib):
+ if tag in (_x('ttml:br'), 'br'):
+ self._out += '\n'
+ else:
+ unclosed_elements = []
+ style = {}
+ element_style_id = attrib.get('style')
+ if default_style:
+ style.update(default_style)
+ if element_style_id:
+ style.update(styles.get(element_style_id, {}))
+ for prop in SUPPORTED_STYLING:
+ prop_val = attrib.get(_x('tts:' + prop))
+ if prop_val:
+ style[prop] = prop_val
+ if style:
+ font = ''
+ for k, v in sorted(style.items()):
+ if self._applied_styles and self._applied_styles[-1].get(k) == v:
+ continue
+ if k == 'color':
+ font += ' color="%s"' % v
+ elif k == 'fontSize':
+ font += ' size="%s"' % v
+ elif k == 'fontFamily':
+ font += ' face="%s"' % v
+ elif k == 'fontWeight' and v == 'bold':
+ self._out += '<b>'
+ unclosed_elements.append('b')
+ elif k == 'fontStyle' and v == 'italic':
+ self._out += '<i>'
+ unclosed_elements.append('i')
+ elif k == 'textDecoration' and v == 'underline':
+ self._out += '<u>'
+ unclosed_elements.append('u')
+ if font:
+ self._out += '<font' + font + '>'
+ unclosed_elements.append('font')
+ applied_style = {}
+ if self._applied_styles:
+ applied_style.update(self._applied_styles[-1])
+ applied_style.update(style)
+ self._applied_styles.append(applied_style)
+ self._unclosed_elements.append(unclosed_elements)
+
+ def end(self, tag):
+ if tag not in (_x('ttml:br'), 'br'):
+ unclosed_elements = self._unclosed_elements.pop()
+ for element in reversed(unclosed_elements):
+ self._out += '</%s>' % element
+ if unclosed_elements and self._applied_styles:
+ self._applied_styles.pop()
+
+ def data(self, data):
+ self._out += data
+
+ def close(self):
+ return self._out.strip()
+
+ # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/hypervideo/hypervideo/issues/6543#issuecomment-1477169870
+ # This will not trigger false positives since only UTF-8 text is being replaced
+ dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')
+
+ def parse_node(node):
+ target = TTMLPElementParser()
+ parser = xml.etree.ElementTree.XMLParser(target=target)
+ parser.feed(xml.etree.ElementTree.tostring(node))
+ return parser.close()
+
+ for k, v in LEGACY_NAMESPACES:
+ for ns in v:
+ dfxp_data = dfxp_data.replace(ns, k)
+
+ dfxp = compat_etree_fromstring(dfxp_data)
+ out = []
+ paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
+
+ if not paras:
+ raise ValueError('Invalid dfxp/TTML subtitle')
+
+ repeat = False
+ while True:
+ for style in dfxp.findall(_x('.//ttml:style')):
+ style_id = style.get('id') or style.get(_x('xml:id'))
+ if not style_id:
+ continue
+ parent_style_id = style.get('style')
+ if parent_style_id:
+ if parent_style_id not in styles:
+ repeat = True
+ continue
+ styles[style_id] = styles[parent_style_id].copy()
+ for prop in SUPPORTED_STYLING:
+ prop_val = style.get(_x('tts:' + prop))
+ if prop_val:
+ styles.setdefault(style_id, {})[prop] = prop_val
+ if repeat:
+ repeat = False
+ else:
+ break
+
+ for p in ('body', 'div'):
+ ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
+ if ele is None:
+ continue
+ style = styles.get(ele.get('style'))
+ if not style:
+ continue
+ default_style.update(style)
+
+ for para, index in zip(paras, itertools.count(1)):
+ begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
+ end_time = parse_dfxp_time_expr(para.attrib.get('end'))
+ dur = parse_dfxp_time_expr(para.attrib.get('dur'))
+ if begin_time is None:
+ continue
+ if not end_time:
+ if not dur:
+ continue
+ end_time = begin_time + dur
+ out.append('%d\n%s --> %s\n%s\n\n' % (
+ index,
+ srt_subtitles_timecode(begin_time),
+ srt_subtitles_timecode(end_time),
+ parse_node(para)))
+
+ return ''.join(out)
+
+
+def cli_option(params, command_option, param, separator=None):
+ param = params.get(param)
+ return ([] if param is None
+ else [command_option, str(param)] if separator is None
+ else [f'{command_option}{separator}{param}'])
+
+
+def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
+ param = params.get(param)
+ assert param in (True, False, None)
+ return cli_option({True: true_value, False: false_value}, command_option, param, separator)
+
+
+def cli_valueless_option(params, command_option, param, expected_value=True):
+ return [command_option] if params.get(param) == expected_value else []
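+
+# Illustrative translations of a params dict into CLI arguments
+# (option names and params below are hypothetical):
+#   >>> cli_option({'proxy': 'http://127.0.0.1:3128'}, '--proxy', 'proxy')
+#   ['--proxy', 'http://127.0.0.1:3128']
+#   >>> cli_bool_option({'check_cert': True}, '--check-certificate', 'check_cert')
+#   ['--check-certificate', 'true']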
+
+
+def cli_configuration_args(argdict, keys, default=[], use_compat=True):
+ if isinstance(argdict, (list, tuple)): # for backward compatibility
+ if use_compat:
+ return argdict
+ else:
+ argdict = None
+ if argdict is None:
+ return default
+ assert isinstance(argdict, dict)
+
+ assert isinstance(keys, (list, tuple))
+ for key_list in keys:
+ arg_list = list(filter(
+ lambda x: x is not None,
+ [argdict.get(key.lower()) for key in variadic(key_list)]))
+ if arg_list:
+ return [arg for args in arg_list for arg in args]
+ return default
+
+
+def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
+ main_key, exe = main_key.lower(), exe.lower()
+ root_key = exe if main_key == exe else f'{main_key}+{exe}'
+ keys = [f'{root_key}{k}' for k in (keys or [''])]
+ if root_key in keys:
+ if main_key != exe:
+ keys.append((main_key, exe))
+ keys.append('default')
+ else:
+ use_compat = False
+ return cli_configuration_args(argdict, keys, default, use_compat)
+
+
+class ISO639Utils:
+ # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
+ _lang_map = {
+ 'aa': 'aar',
+ 'ab': 'abk',
+ 'ae': 'ave',
+ 'af': 'afr',
+ 'ak': 'aka',
+ 'am': 'amh',
+ 'an': 'arg',
+ 'ar': 'ara',
+ 'as': 'asm',
+ 'av': 'ava',
+ 'ay': 'aym',
+ 'az': 'aze',
+ 'ba': 'bak',
+ 'be': 'bel',
+ 'bg': 'bul',
+ 'bh': 'bih',
+ 'bi': 'bis',
+ 'bm': 'bam',
+ 'bn': 'ben',
+ 'bo': 'bod',
+ 'br': 'bre',
+ 'bs': 'bos',
+ 'ca': 'cat',
+ 'ce': 'che',
+ 'ch': 'cha',
+ 'co': 'cos',
+ 'cr': 'cre',
+ 'cs': 'ces',
+ 'cu': 'chu',
+ 'cv': 'chv',
+ 'cy': 'cym',
+ 'da': 'dan',
+ 'de': 'deu',
+ 'dv': 'div',
+ 'dz': 'dzo',
+ 'ee': 'ewe',
+ 'el': 'ell',
+ 'en': 'eng',
+ 'eo': 'epo',
+ 'es': 'spa',
+ 'et': 'est',
+ 'eu': 'eus',
+ 'fa': 'fas',
+ 'ff': 'ful',
+ 'fi': 'fin',
+ 'fj': 'fij',
+ 'fo': 'fao',
+ 'fr': 'fra',
+ 'fy': 'fry',
+ 'ga': 'gle',
+ 'gd': 'gla',
+ 'gl': 'glg',
+ 'gn': 'grn',
+ 'gu': 'guj',
+ 'gv': 'glv',
+ 'ha': 'hau',
+ 'he': 'heb',
+ 'iw': 'heb', # Replaced by he in 1989 revision
+ 'hi': 'hin',
+ 'ho': 'hmo',
+ 'hr': 'hrv',
+ 'ht': 'hat',
+ 'hu': 'hun',
+ 'hy': 'hye',
+ 'hz': 'her',
+ 'ia': 'ina',
+ 'id': 'ind',
+ 'in': 'ind', # Replaced by id in 1989 revision
+ 'ie': 'ile',
+ 'ig': 'ibo',
+ 'ii': 'iii',
+ 'ik': 'ipk',
+ 'io': 'ido',
+ 'is': 'isl',
+ 'it': 'ita',
+ 'iu': 'iku',
+ 'ja': 'jpn',
+ 'jv': 'jav',
+ 'ka': 'kat',
+ 'kg': 'kon',
+ 'ki': 'kik',
+ 'kj': 'kua',
+ 'kk': 'kaz',
+ 'kl': 'kal',
+ 'km': 'khm',
+ 'kn': 'kan',
+ 'ko': 'kor',
+ 'kr': 'kau',
+ 'ks': 'kas',
+ 'ku': 'kur',
+ 'kv': 'kom',
+ 'kw': 'cor',
+ 'ky': 'kir',
+ 'la': 'lat',
+ 'lb': 'ltz',
+ 'lg': 'lug',
+ 'li': 'lim',
+ 'ln': 'lin',
+ 'lo': 'lao',
+ 'lt': 'lit',
+ 'lu': 'lub',
+ 'lv': 'lav',
+ 'mg': 'mlg',
+ 'mh': 'mah',
+ 'mi': 'mri',
+ 'mk': 'mkd',
+ 'ml': 'mal',
+ 'mn': 'mon',
+ 'mr': 'mar',
+ 'ms': 'msa',
+ 'mt': 'mlt',
+ 'my': 'mya',
+ 'na': 'nau',
+ 'nb': 'nob',
+ 'nd': 'nde',
+ 'ne': 'nep',
+ 'ng': 'ndo',
+ 'nl': 'nld',
+ 'nn': 'nno',
+ 'no': 'nor',
+ 'nr': 'nbl',
+ 'nv': 'nav',
+ 'ny': 'nya',
+ 'oc': 'oci',
+ 'oj': 'oji',
+ 'om': 'orm',
+ 'or': 'ori',
+ 'os': 'oss',
+ 'pa': 'pan',
+ 'pe': 'per',
+ 'pi': 'pli',
+ 'pl': 'pol',
+ 'ps': 'pus',
+ 'pt': 'por',
+ 'qu': 'que',
+ 'rm': 'roh',
+ 'rn': 'run',
+ 'ro': 'ron',
+ 'ru': 'rus',
+ 'rw': 'kin',
+ 'sa': 'san',
+ 'sc': 'srd',
+ 'sd': 'snd',
+ 'se': 'sme',
+ 'sg': 'sag',
+ 'si': 'sin',
+ 'sk': 'slk',
+ 'sl': 'slv',
+ 'sm': 'smo',
+ 'sn': 'sna',
+ 'so': 'som',
+ 'sq': 'sqi',
+ 'sr': 'srp',
+ 'ss': 'ssw',
+ 'st': 'sot',
+ 'su': 'sun',
+ 'sv': 'swe',
+ 'sw': 'swa',
+ 'ta': 'tam',
+ 'te': 'tel',
+ 'tg': 'tgk',
+ 'th': 'tha',
+ 'ti': 'tir',
+ 'tk': 'tuk',
+ 'tl': 'tgl',
+ 'tn': 'tsn',
+ 'to': 'ton',
+ 'tr': 'tur',
+ 'ts': 'tso',
+ 'tt': 'tat',
+ 'tw': 'twi',
+ 'ty': 'tah',
+ 'ug': 'uig',
+ 'uk': 'ukr',
+ 'ur': 'urd',
+ 'uz': 'uzb',
+ 've': 'ven',
+ 'vi': 'vie',
+ 'vo': 'vol',
+ 'wa': 'wln',
+ 'wo': 'wol',
+ 'xh': 'xho',
+ 'yi': 'yid',
+ 'ji': 'yid', # Replaced by yi in 1989 revision
+ 'yo': 'yor',
+ 'za': 'zha',
+ 'zh': 'zho',
+ 'zu': 'zul',
+ }
+
+ @classmethod
+ def short2long(cls, code):
+ """Convert language code from ISO 639-1 to ISO 639-2/T"""
+ return cls._lang_map.get(code[:2])
+
+ @classmethod
+ def long2short(cls, code):
+ """Convert language code from ISO 639-2/T to ISO 639-1"""
+ for short_name, long_name in cls._lang_map.items():
+ if long_name == code:
+ return short_name
+
+
+class ISO3166Utils:
+ # From http://data.okfn.org/data/core/country-list
+ _country_map = {
+ 'AF': 'Afghanistan',
+ 'AX': 'Åland Islands',
+ 'AL': 'Albania',
+ 'DZ': 'Algeria',
+ 'AS': 'American Samoa',
+ 'AD': 'Andorra',
+ 'AO': 'Angola',
+ 'AI': 'Anguilla',
+ 'AQ': 'Antarctica',
+ 'AG': 'Antigua and Barbuda',
+ 'AR': 'Argentina',
+ 'AM': 'Armenia',
+ 'AW': 'Aruba',
+ 'AU': 'Australia',
+ 'AT': 'Austria',
+ 'AZ': 'Azerbaijan',
+ 'BS': 'Bahamas',
+ 'BH': 'Bahrain',
+ 'BD': 'Bangladesh',
+ 'BB': 'Barbados',
+ 'BY': 'Belarus',
+ 'BE': 'Belgium',
+ 'BZ': 'Belize',
+ 'BJ': 'Benin',
+ 'BM': 'Bermuda',
+ 'BT': 'Bhutan',
+ 'BO': 'Bolivia, Plurinational State of',
+ 'BQ': 'Bonaire, Sint Eustatius and Saba',
+ 'BA': 'Bosnia and Herzegovina',
+ 'BW': 'Botswana',
+ 'BV': 'Bouvet Island',
+ 'BR': 'Brazil',
+ 'IO': 'British Indian Ocean Territory',
+ 'BN': 'Brunei Darussalam',
+ 'BG': 'Bulgaria',
+ 'BF': 'Burkina Faso',
+ 'BI': 'Burundi',
+ 'KH': 'Cambodia',
+ 'CM': 'Cameroon',
+ 'CA': 'Canada',
+ 'CV': 'Cape Verde',
+ 'KY': 'Cayman Islands',
+ 'CF': 'Central African Republic',
+ 'TD': 'Chad',
+ 'CL': 'Chile',
+ 'CN': 'China',
+ 'CX': 'Christmas Island',
+ 'CC': 'Cocos (Keeling) Islands',
+ 'CO': 'Colombia',
+ 'KM': 'Comoros',
+ 'CG': 'Congo',
+ 'CD': 'Congo, the Democratic Republic of the',
+ 'CK': 'Cook Islands',
+ 'CR': 'Costa Rica',
+ 'CI': 'Côte d\'Ivoire',
+ 'HR': 'Croatia',
+ 'CU': 'Cuba',
+ 'CW': 'Curaçao',
+ 'CY': 'Cyprus',
+ 'CZ': 'Czech Republic',
+ 'DK': 'Denmark',
+ 'DJ': 'Djibouti',
+ 'DM': 'Dominica',
+ 'DO': 'Dominican Republic',
+ 'EC': 'Ecuador',
+ 'EG': 'Egypt',
+ 'SV': 'El Salvador',
+ 'GQ': 'Equatorial Guinea',
+ 'ER': 'Eritrea',
+ 'EE': 'Estonia',
+ 'ET': 'Ethiopia',
+ 'FK': 'Falkland Islands (Malvinas)',
+ 'FO': 'Faroe Islands',
+ 'FJ': 'Fiji',
+ 'FI': 'Finland',
+ 'FR': 'France',
+ 'GF': 'French Guiana',
+ 'PF': 'French Polynesia',
+ 'TF': 'French Southern Territories',
+ 'GA': 'Gabon',
+ 'GM': 'Gambia',
+ 'GE': 'Georgia',
+ 'DE': 'Germany',
+ 'GH': 'Ghana',
+ 'GI': 'Gibraltar',
+ 'GR': 'Greece',
+ 'GL': 'Greenland',
+ 'GD': 'Grenada',
+ 'GP': 'Guadeloupe',
+ 'GU': 'Guam',
+ 'GT': 'Guatemala',
+ 'GG': 'Guernsey',
+ 'GN': 'Guinea',
+ 'GW': 'Guinea-Bissau',
+ 'GY': 'Guyana',
+ 'HT': 'Haiti',
+ 'HM': 'Heard Island and McDonald Islands',
+ 'VA': 'Holy See (Vatican City State)',
+ 'HN': 'Honduras',
+ 'HK': 'Hong Kong',
+ 'HU': 'Hungary',
+ 'IS': 'Iceland',
+ 'IN': 'India',
+ 'ID': 'Indonesia',
+ 'IR': 'Iran, Islamic Republic of',
+ 'IQ': 'Iraq',
+ 'IE': 'Ireland',
+ 'IM': 'Isle of Man',
+ 'IL': 'Israel',
+ 'IT': 'Italy',
+ 'JM': 'Jamaica',
+ 'JP': 'Japan',
+ 'JE': 'Jersey',
+ 'JO': 'Jordan',
+ 'KZ': 'Kazakhstan',
+ 'KE': 'Kenya',
+ 'KI': 'Kiribati',
+ 'KP': 'Korea, Democratic People\'s Republic of',
+ 'KR': 'Korea, Republic of',
+ 'KW': 'Kuwait',
+ 'KG': 'Kyrgyzstan',
+ 'LA': 'Lao People\'s Democratic Republic',
+ 'LV': 'Latvia',
+ 'LB': 'Lebanon',
+ 'LS': 'Lesotho',
+ 'LR': 'Liberia',
+ 'LY': 'Libya',
+ 'LI': 'Liechtenstein',
+ 'LT': 'Lithuania',
+ 'LU': 'Luxembourg',
+ 'MO': 'Macao',
+ 'MK': 'Macedonia, the Former Yugoslav Republic of',
+ 'MG': 'Madagascar',
+ 'MW': 'Malawi',
+ 'MY': 'Malaysia',
+ 'MV': 'Maldives',
+ 'ML': 'Mali',
+ 'MT': 'Malta',
+ 'MH': 'Marshall Islands',
+ 'MQ': 'Martinique',
+ 'MR': 'Mauritania',
+ 'MU': 'Mauritius',
+ 'YT': 'Mayotte',
+ 'MX': 'Mexico',
+ 'FM': 'Micronesia, Federated States of',
+ 'MD': 'Moldova, Republic of',
+ 'MC': 'Monaco',
+ 'MN': 'Mongolia',
+ 'ME': 'Montenegro',
+ 'MS': 'Montserrat',
+ 'MA': 'Morocco',
+ 'MZ': 'Mozambique',
+ 'MM': 'Myanmar',
+ 'NA': 'Namibia',
+ 'NR': 'Nauru',
+ 'NP': 'Nepal',
+ 'NL': 'Netherlands',
+ 'NC': 'New Caledonia',
+ 'NZ': 'New Zealand',
+ 'NI': 'Nicaragua',
+ 'NE': 'Niger',
+ 'NG': 'Nigeria',
+ 'NU': 'Niue',
+ 'NF': 'Norfolk Island',
+ 'MP': 'Northern Mariana Islands',
+ 'NO': 'Norway',
+ 'OM': 'Oman',
+ 'PK': 'Pakistan',
+ 'PW': 'Palau',
+ 'PS': 'Palestine, State of',
+ 'PA': 'Panama',
+ 'PG': 'Papua New Guinea',
+ 'PY': 'Paraguay',
+ 'PE': 'Peru',
+ 'PH': 'Philippines',
+ 'PN': 'Pitcairn',
+ 'PL': 'Poland',
+ 'PT': 'Portugal',
+ 'PR': 'Puerto Rico',
+ 'QA': 'Qatar',
+ 'RE': 'Réunion',
+ 'RO': 'Romania',
+ 'RU': 'Russian Federation',
+ 'RW': 'Rwanda',
+ 'BL': 'Saint Barthélemy',
+ 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
+ 'KN': 'Saint Kitts and Nevis',
+ 'LC': 'Saint Lucia',
+ 'MF': 'Saint Martin (French part)',
+ 'PM': 'Saint Pierre and Miquelon',
+ 'VC': 'Saint Vincent and the Grenadines',
+ 'WS': 'Samoa',
+ 'SM': 'San Marino',
+ 'ST': 'Sao Tome and Principe',
+ 'SA': 'Saudi Arabia',
+ 'SN': 'Senegal',
+ 'RS': 'Serbia',
+ 'SC': 'Seychelles',
+ 'SL': 'Sierra Leone',
+ 'SG': 'Singapore',
+ 'SX': 'Sint Maarten (Dutch part)',
+ 'SK': 'Slovakia',
+ 'SI': 'Slovenia',
+ 'SB': 'Solomon Islands',
+ 'SO': 'Somalia',
+ 'ZA': 'South Africa',
+ 'GS': 'South Georgia and the South Sandwich Islands',
+ 'SS': 'South Sudan',
+ 'ES': 'Spain',
+ 'LK': 'Sri Lanka',
+ 'SD': 'Sudan',
+ 'SR': 'Suriname',
+ 'SJ': 'Svalbard and Jan Mayen',
+ 'SZ': 'Swaziland',
+ 'SE': 'Sweden',
+ 'CH': 'Switzerland',
+ 'SY': 'Syrian Arab Republic',
+ 'TW': 'Taiwan, Province of China',
+ 'TJ': 'Tajikistan',
+ 'TZ': 'Tanzania, United Republic of',
+ 'TH': 'Thailand',
+ 'TL': 'Timor-Leste',
+ 'TG': 'Togo',
+ 'TK': 'Tokelau',
+ 'TO': 'Tonga',
+ 'TT': 'Trinidad and Tobago',
+ 'TN': 'Tunisia',
+ 'TR': 'Turkey',
+ 'TM': 'Turkmenistan',
+ 'TC': 'Turks and Caicos Islands',
+ 'TV': 'Tuvalu',
+ 'UG': 'Uganda',
+ 'UA': 'Ukraine',
+ 'AE': 'United Arab Emirates',
+ 'GB': 'United Kingdom',
+ 'US': 'United States',
+ 'UM': 'United States Minor Outlying Islands',
+ 'UY': 'Uruguay',
+ 'UZ': 'Uzbekistan',
+ 'VU': 'Vanuatu',
+ 'VE': 'Venezuela, Bolivarian Republic of',
+ 'VN': 'Viet Nam',
+ 'VG': 'Virgin Islands, British',
+ 'VI': 'Virgin Islands, U.S.',
+ 'WF': 'Wallis and Futuna',
+ 'EH': 'Western Sahara',
+ 'YE': 'Yemen',
+ 'ZM': 'Zambia',
+ 'ZW': 'Zimbabwe',
+ # Not ISO 3166 codes, but used for IP blocks
+ 'AP': 'Asia/Pacific Region',
+ 'EU': 'Europe',
+ }
+
+ @classmethod
+ def short2full(cls, code):
+ """Convert an ISO 3166-2 country code to the corresponding full name"""
+ return cls._country_map.get(code.upper())
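+
+    # Behaviour sketch (values follow the table above):
+    #   ISO3166Utils.short2full('DE') -> 'Germany'
+    #   ISO3166Utils.short2full('ZZ') -> None (unknown codes yield None)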
+
+
+class GeoUtils:
+ # Major IPv4 address blocks per country
+ _country_ip_map = {
+ 'AD': '46.172.224.0/19',
+ 'AE': '94.200.0.0/13',
+ 'AF': '149.54.0.0/17',
+ 'AG': '209.59.64.0/18',
+ 'AI': '204.14.248.0/21',
+ 'AL': '46.99.0.0/16',
+ 'AM': '46.70.0.0/15',
+ 'AO': '105.168.0.0/13',
+ 'AP': '182.50.184.0/21',
+ 'AQ': '23.154.160.0/24',
+ 'AR': '181.0.0.0/12',
+ 'AS': '202.70.112.0/20',
+ 'AT': '77.116.0.0/14',
+ 'AU': '1.128.0.0/11',
+ 'AW': '181.41.0.0/18',
+ 'AX': '185.217.4.0/22',
+ 'AZ': '5.197.0.0/16',
+ 'BA': '31.176.128.0/17',
+ 'BB': '65.48.128.0/17',
+ 'BD': '114.130.0.0/16',
+ 'BE': '57.0.0.0/8',
+ 'BF': '102.178.0.0/15',
+ 'BG': '95.42.0.0/15',
+ 'BH': '37.131.0.0/17',
+ 'BI': '154.117.192.0/18',
+ 'BJ': '137.255.0.0/16',
+ 'BL': '185.212.72.0/23',
+ 'BM': '196.12.64.0/18',
+ 'BN': '156.31.0.0/16',
+ 'BO': '161.56.0.0/16',
+ 'BQ': '161.0.80.0/20',
+ 'BR': '191.128.0.0/12',
+ 'BS': '24.51.64.0/18',
+ 'BT': '119.2.96.0/19',
+ 'BW': '168.167.0.0/16',
+ 'BY': '178.120.0.0/13',
+ 'BZ': '179.42.192.0/18',
+ 'CA': '99.224.0.0/11',
+ 'CD': '41.243.0.0/16',
+ 'CF': '197.242.176.0/21',
+ 'CG': '160.113.0.0/16',
+ 'CH': '85.0.0.0/13',
+ 'CI': '102.136.0.0/14',
+ 'CK': '202.65.32.0/19',
+ 'CL': '152.172.0.0/14',
+ 'CM': '102.244.0.0/14',
+ 'CN': '36.128.0.0/10',
+ 'CO': '181.240.0.0/12',
+ 'CR': '201.192.0.0/12',
+ 'CU': '152.206.0.0/15',
+ 'CV': '165.90.96.0/19',
+ 'CW': '190.88.128.0/17',
+ 'CY': '31.153.0.0/16',
+ 'CZ': '88.100.0.0/14',
+ 'DE': '53.0.0.0/8',
+ 'DJ': '197.241.0.0/17',
+ 'DK': '87.48.0.0/12',
+ 'DM': '192.243.48.0/20',
+ 'DO': '152.166.0.0/15',
+ 'DZ': '41.96.0.0/12',
+ 'EC': '186.68.0.0/15',
+ 'EE': '90.190.0.0/15',
+ 'EG': '156.160.0.0/11',
+ 'ER': '196.200.96.0/20',
+ 'ES': '88.0.0.0/11',
+ 'ET': '196.188.0.0/14',
+ 'EU': '2.16.0.0/13',
+ 'FI': '91.152.0.0/13',
+ 'FJ': '144.120.0.0/16',
+ 'FK': '80.73.208.0/21',
+ 'FM': '119.252.112.0/20',
+ 'FO': '88.85.32.0/19',
+ 'FR': '90.0.0.0/9',
+ 'GA': '41.158.0.0/15',
+ 'GB': '25.0.0.0/8',
+ 'GD': '74.122.88.0/21',
+ 'GE': '31.146.0.0/16',
+ 'GF': '161.22.64.0/18',
+ 'GG': '62.68.160.0/19',
+ 'GH': '154.160.0.0/12',
+ 'GI': '95.164.0.0/16',
+ 'GL': '88.83.0.0/19',
+ 'GM': '160.182.0.0/15',
+ 'GN': '197.149.192.0/18',
+ 'GP': '104.250.0.0/19',
+ 'GQ': '105.235.224.0/20',
+ 'GR': '94.64.0.0/13',
+ 'GT': '168.234.0.0/16',
+ 'GU': '168.123.0.0/16',
+ 'GW': '197.214.80.0/20',
+ 'GY': '181.41.64.0/18',
+ 'HK': '113.252.0.0/14',
+ 'HN': '181.210.0.0/16',
+ 'HR': '93.136.0.0/13',
+ 'HT': '148.102.128.0/17',
+ 'HU': '84.0.0.0/14',
+ 'ID': '39.192.0.0/10',
+ 'IE': '87.32.0.0/12',
+ 'IL': '79.176.0.0/13',
+ 'IM': '5.62.80.0/20',
+ 'IN': '117.192.0.0/10',
+ 'IO': '203.83.48.0/21',
+ 'IQ': '37.236.0.0/14',
+ 'IR': '2.176.0.0/12',
+ 'IS': '82.221.0.0/16',
+ 'IT': '79.0.0.0/10',
+ 'JE': '87.244.64.0/18',
+ 'JM': '72.27.0.0/17',
+ 'JO': '176.29.0.0/16',
+ 'JP': '133.0.0.0/8',
+ 'KE': '105.48.0.0/12',
+ 'KG': '158.181.128.0/17',
+ 'KH': '36.37.128.0/17',
+ 'KI': '103.25.140.0/22',
+ 'KM': '197.255.224.0/20',
+ 'KN': '198.167.192.0/19',
+ 'KP': '175.45.176.0/22',
+ 'KR': '175.192.0.0/10',
+ 'KW': '37.36.0.0/14',
+ 'KY': '64.96.0.0/15',
+ 'KZ': '2.72.0.0/13',
+ 'LA': '115.84.64.0/18',
+ 'LB': '178.135.0.0/16',
+ 'LC': '24.92.144.0/20',
+ 'LI': '82.117.0.0/19',
+ 'LK': '112.134.0.0/15',
+ 'LR': '102.183.0.0/16',
+ 'LS': '129.232.0.0/17',
+ 'LT': '78.56.0.0/13',
+ 'LU': '188.42.0.0/16',
+ 'LV': '46.109.0.0/16',
+ 'LY': '41.252.0.0/14',
+ 'MA': '105.128.0.0/11',
+ 'MC': '88.209.64.0/18',
+ 'MD': '37.246.0.0/16',
+ 'ME': '178.175.0.0/17',
+ 'MF': '74.112.232.0/21',
+ 'MG': '154.126.0.0/17',
+ 'MH': '117.103.88.0/21',
+ 'MK': '77.28.0.0/15',
+ 'ML': '154.118.128.0/18',
+ 'MM': '37.111.0.0/17',
+ 'MN': '49.0.128.0/17',
+ 'MO': '60.246.0.0/16',
+ 'MP': '202.88.64.0/20',
+ 'MQ': '109.203.224.0/19',
+ 'MR': '41.188.64.0/18',
+ 'MS': '208.90.112.0/22',
+ 'MT': '46.11.0.0/16',
+ 'MU': '105.16.0.0/12',
+ 'MV': '27.114.128.0/18',
+ 'MW': '102.70.0.0/15',
+ 'MX': '187.192.0.0/11',
+ 'MY': '175.136.0.0/13',
+ 'MZ': '197.218.0.0/15',
+ 'NA': '41.182.0.0/16',
+ 'NC': '101.101.0.0/18',
+ 'NE': '197.214.0.0/18',
+ 'NF': '203.17.240.0/22',
+ 'NG': '105.112.0.0/12',
+ 'NI': '186.76.0.0/15',
+ 'NL': '145.96.0.0/11',
+ 'NO': '84.208.0.0/13',
+ 'NP': '36.252.0.0/15',
+ 'NR': '203.98.224.0/19',
+ 'NU': '49.156.48.0/22',
+ 'NZ': '49.224.0.0/14',
+ 'OM': '5.36.0.0/15',
+ 'PA': '186.72.0.0/15',
+ 'PE': '186.160.0.0/14',
+ 'PF': '123.50.64.0/18',
+ 'PG': '124.240.192.0/19',
+ 'PH': '49.144.0.0/13',
+ 'PK': '39.32.0.0/11',
+ 'PL': '83.0.0.0/11',
+ 'PM': '70.36.0.0/20',
+ 'PR': '66.50.0.0/16',
+ 'PS': '188.161.0.0/16',
+ 'PT': '85.240.0.0/13',
+ 'PW': '202.124.224.0/20',
+ 'PY': '181.120.0.0/14',
+ 'QA': '37.210.0.0/15',
+ 'RE': '102.35.0.0/16',
+ 'RO': '79.112.0.0/13',
+ 'RS': '93.86.0.0/15',
+ 'RU': '5.136.0.0/13',
+ 'RW': '41.186.0.0/16',
+ 'SA': '188.48.0.0/13',
+ 'SB': '202.1.160.0/19',
+ 'SC': '154.192.0.0/11',
+ 'SD': '102.120.0.0/13',
+ 'SE': '78.64.0.0/12',
+ 'SG': '8.128.0.0/10',
+ 'SI': '188.196.0.0/14',
+ 'SK': '78.98.0.0/15',
+ 'SL': '102.143.0.0/17',
+ 'SM': '89.186.32.0/19',
+ 'SN': '41.82.0.0/15',
+ 'SO': '154.115.192.0/18',
+ 'SR': '186.179.128.0/17',
+ 'SS': '105.235.208.0/21',
+ 'ST': '197.159.160.0/19',
+ 'SV': '168.243.0.0/16',
+ 'SX': '190.102.0.0/20',
+ 'SY': '5.0.0.0/16',
+ 'SZ': '41.84.224.0/19',
+ 'TC': '65.255.48.0/20',
+ 'TD': '154.68.128.0/19',
+ 'TG': '196.168.0.0/14',
+ 'TH': '171.96.0.0/13',
+ 'TJ': '85.9.128.0/18',
+ 'TK': '27.96.24.0/21',
+ 'TL': '180.189.160.0/20',
+ 'TM': '95.85.96.0/19',
+ 'TN': '197.0.0.0/11',
+ 'TO': '175.176.144.0/21',
+ 'TR': '78.160.0.0/11',
+ 'TT': '186.44.0.0/15',
+ 'TV': '202.2.96.0/19',
+ 'TW': '120.96.0.0/11',
+ 'TZ': '156.156.0.0/14',
+ 'UA': '37.52.0.0/14',
+ 'UG': '102.80.0.0/13',
+ 'US': '6.0.0.0/8',
+ 'UY': '167.56.0.0/13',
+ 'UZ': '84.54.64.0/18',
+ 'VA': '212.77.0.0/19',
+ 'VC': '207.191.240.0/21',
+ 'VE': '186.88.0.0/13',
+ 'VG': '66.81.192.0/20',
+ 'VI': '146.226.0.0/16',
+ 'VN': '14.160.0.0/11',
+ 'VU': '202.80.32.0/20',
+ 'WF': '117.20.32.0/21',
+ 'WS': '202.4.32.0/19',
+ 'YE': '134.35.0.0/16',
+ 'YT': '41.242.116.0/22',
+ 'ZA': '41.0.0.0/11',
+ 'ZM': '102.144.0.0/13',
+ 'ZW': '102.177.192.0/18',
+ }
+
+ @classmethod
+ def random_ipv4(cls, code_or_block):
+ if len(code_or_block) == 2:
+ block = cls._country_ip_map.get(code_or_block.upper())
+ if not block:
+ return None
+ else:
+ block = code_or_block
+ addr, preflen = block.split('/')
+ addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
+ addr_max = addr_min | (0xffffffff >> int(preflen))
+ return str(socket.inet_ntoa(
+ struct.pack('!L', random.randint(addr_min, addr_max))))
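+
+    # Usage sketch: accepts either a two-letter country code or an explicit
+    # CIDR block, e.g. (per the table above)
+    #   GeoUtils.random_ipv4('AU')          -> an address inside 1.128.0.0/11
+    #   GeoUtils.random_ipv4('10.0.0.0/24') -> an address inside 10.0.0.0/24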
+
+
+# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
+# released into Public Domain
+# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
+
+def long_to_bytes(n, blocksize=0):
+ """long_to_bytes(n:long, blocksize:int) : string
+ Convert a long integer to a byte string.
+
+ If optional blocksize is given and greater than zero, pad the front of the
+ byte string with binary zeros so that the length is a multiple of
+ blocksize.
+ """
+ # after much testing, this algorithm was deemed to be the fastest
+ s = b''
+ n = int(n)
+ while n > 0:
+ s = struct.pack('>I', n & 0xffffffff) + s
+ n = n >> 32
+ # strip off leading zeros
+ for i in range(len(s)):
+ if s[i] != b'\000'[0]:
+ break
+ else:
+ # only happens when n == 0
+ s = b'\000'
+ i = 0
+ s = s[i:]
+ # add back some pad bytes. this could be done more efficiently w.r.t. the
+ # de-padding being done above, but sigh...
+ if blocksize > 0 and len(s) % blocksize:
+ s = (blocksize - len(s) % blocksize) * b'\000' + s
+ return s
+
+
+def bytes_to_long(s):
+ """bytes_to_long(string) : long
+ Convert a byte string to a long integer.
+
+ This is (essentially) the inverse of long_to_bytes().
+ """
+ acc = 0
+ length = len(s)
+ if length % 4:
+ extra = (4 - length % 4)
+ s = b'\000' * extra + s
+ length = length + extra
+ for i in range(0, length, 4):
+ acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
+ return acc
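+
+# Round-trip sketch:
+#   long_to_bytes(65537, 4) == b'\x00\x01\x00\x01'
+#   bytes_to_long(b'\x00\x01\x00\x01') == 65537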
+
+
+def ohdave_rsa_encrypt(data, exponent, modulus):
+ '''
+ Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
+
+ Input:
+ data: data to encrypt, bytes-like object
+ exponent, modulus: parameter e and N of RSA algorithm, both integer
+ Output: hex string of encrypted data
+
+ Limitation: supports one block encryption only
+ '''
+
+ payload = int(binascii.hexlify(data[::-1]), 16)
+ encrypted = pow(payload, exponent, modulus)
+ return '%x' % encrypted
+
+
+def pkcs1pad(data, length):
+ """
+ Padding input data with PKCS#1 scheme
+
+ @param {int[]} data input data
+ @param {int} length target length
+ @returns {int[]} padded data
+ """
+ if len(data) > length - 11:
+ raise ValueError('Input data too long for PKCS#1 padding')
+
+    # PKCS#1 v1.5 requires the padding string to consist of nonzero octets
+    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
+ return [0, 2] + pseudo_random + [0] + data
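+
+# e.g. pkcs1pad([1, 2, 3], 16) yields a 16-item list shaped as
+#   [0, 2, <10 nonzero pseudo-random bytes>, 0, 1, 2, 3]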
+
+
+def _base_n_table(n, table):
+ if not table and not n:
+ raise ValueError('Either table or n must be specified')
+ table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
+
+ if n and n != len(table):
+ raise ValueError(f'base {n} exceeds table length {len(table)}')
+ return table
+
+
+def encode_base_n(num, n=None, table=None):
+ """Convert given int to a base-n string"""
+ table = _base_n_table(n, table)
+ if not num:
+ return table[0]
+
+ result, base = '', len(table)
+ while num:
+ result = table[num % base] + result
+ num = num // base
+ return result
+
+
+def decode_base_n(string, n=None, table=None):
+ """Convert given base-n string to int"""
+ table = {char: index for index, char in enumerate(_base_n_table(n, table))}
+ result, base = 0, len(table)
+ for char in string:
+ result = result * base + table[char]
+ return result
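+
+# Round-trip sketch (using the default table):
+#   encode_base_n(255, 16) == 'ff'
+#   decode_base_n('ff', 16) == 255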
+
+
+def decode_packed_codes(code):
+ mobj = re.search(PACKED_CODES_RE, code)
+ obfuscated_code, base, count, symbols = mobj.groups()
+ base = int(base)
+ count = int(count)
+ symbols = symbols.split('|')
+ symbol_table = {}
+
+ while count:
+ count -= 1
+ base_n_count = encode_base_n(count, base)
+ symbol_table[base_n_count] = symbols[count] or base_n_count
+
+ return re.sub(
+ r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
+ obfuscated_code)
+
+
+def caesar(s, alphabet, shift):
+ if shift == 0:
+ return s
+ l = len(alphabet)
+ return ''.join(
+ alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
+ for c in s)
+
+
+def rot47(s):
+ return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
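+
+# rot47 is its own inverse over the printable ASCII range, e.g.:
+#   rot47('hello') == '96==@' and rot47('96==@') == 'hello'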
+
+
+def parse_m3u8_attributes(attrib):
+ info = {}
+ for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
+ if val.startswith('"'):
+ val = val[1:-1]
+ info[key] = val
+ return info
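+
+# e.g. parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="avc1.4d401f,mp4a.40.2"')
+#   == {'BANDWIDTH': '1280000', 'CODECS': 'avc1.4d401f,mp4a.40.2'}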
+
+
+def urshift(val, n):
+ return val >> n if val >= 0 else (val + 0x100000000) >> n
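+
+# Emulates JavaScript's unsigned right shift (>>>) for 32-bit values,
+# e.g. urshift(-1, 1) == 0x7fffffff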
+
+
+def write_xattr(path, key, value):
+ # Windows: Write xattrs to NTFS Alternate Data Streams:
+ # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
+ if compat_os_name == 'nt':
+ assert ':' not in key
+ assert os.path.exists(path)
+
+ try:
+ with open(f'{path}:{key}', 'wb') as f:
+ f.write(value)
+ except OSError as e:
+ raise XAttrMetadataError(e.errno, e.strerror)
+ return
+
+ # UNIX Method 1. Use xattrs/pyxattrs modules
+
+ setxattr = None
+ if getattr(xattr, '_hypervideo_dl__identifier', None) == 'pyxattr':
+ # Unicode arguments are not supported in pyxattr until version 0.5.0
+ # See https://github.com/ytdl-org/youtube-dl/issues/5498
+ if version_tuple(xattr.__version__) >= (0, 5, 0):
+ setxattr = xattr.set
+ elif xattr:
+ setxattr = xattr.setxattr
+
+ if setxattr:
+ try:
+ setxattr(path, key, value)
+ except OSError as e:
+ raise XAttrMetadataError(e.errno, e.strerror)
+ return
+
+ # UNIX Method 2. Use setfattr/xattr executables
+ exe = ('setfattr' if check_executable('setfattr', ['--version'])
+ else 'xattr' if check_executable('xattr', ['-h']) else None)
+ if not exe:
+ raise XAttrUnavailableError(
+ 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
+ + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
+
+ value = value.decode()
+ try:
+ _, stderr, returncode = Popen.run(
+ [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
+ text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
+ except OSError as e:
+ raise XAttrMetadataError(e.errno, e.strerror)
+ if returncode:
+ raise XAttrMetadataError(returncode, stderr)
+
+
+def random_birthday(year_field, month_field, day_field):
+ start_date = datetime.date(1950, 1, 1)
+ end_date = datetime.date(1995, 12, 31)
+ offset = random.randint(0, (end_date - start_date).days)
+ random_date = start_date + datetime.timedelta(offset)
+ return {
+ year_field: str(random_date.year),
+ month_field: str(random_date.month),
+ day_field: str(random_date.day),
+ }
+
+
+def find_available_port(interface=''):
+ try:
+ with socket.socket() as sock:
+ sock.bind((interface, 0))
+ return sock.getsockname()[1]
+ except OSError:
+ return None
+
+
+# Templates for internet shortcut files, which are plain text files.
+DOT_URL_LINK_TEMPLATE = '''\
+[InternetShortcut]
+URL=%(url)s
+'''
+
+DOT_WEBLOC_LINK_TEMPLATE = '''\
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+\t<key>URL</key>
+\t<string>%(url)s</string>
+</dict>
+</plist>
+'''
+
+DOT_DESKTOP_LINK_TEMPLATE = '''\
+[Desktop Entry]
+Encoding=UTF-8
+Name=%(filename)s
+Type=Link
+URL=%(url)s
+Icon=text-html
+'''
+
+LINK_TEMPLATES = {
+ 'url': DOT_URL_LINK_TEMPLATE,
+ 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
+ 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
+}
+
+
+def iri_to_uri(iri):
+ """
+ Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
+
+    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes the characters that are not already escaped, using UTF-8 as the underlying encoding, and leaves existing escape sequences intact.
+ """
+
+ iri_parts = urllib.parse.urlparse(iri)
+
+ if '[' in iri_parts.netloc:
+        raise ValueError('IPv6 URIs are not yet supported.')
+        # Querying `.netloc` when there is only one bracket also raises a ValueError.
+
+    # The `safe` argument values used below contain the characters that should
+    # not be percent-encoded. Everything else but letters, digits and '_.-' will
+    # be percent-encoded with an underlying UTF-8 encoding. Everything already
+    # percent-encoded will be left as is.
+
+ net_location = ''
+ if iri_parts.username:
+ net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
+ if iri_parts.password is not None:
+ net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
+ net_location += '@'
+
+ net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
+ # The 'idna' encoding produces ASCII text.
+ if iri_parts.port is not None and iri_parts.port != 80:
+ net_location += ':' + str(iri_parts.port)
+
+ return urllib.parse.urlunparse(
+ (iri_parts.scheme,
+ net_location,
+
+ urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
+
+ # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
+ urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
+
+ # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
+ urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
+
+ urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
+
+ # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
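+
+# Illustrative example (hostname used purely for illustration):
+#   iri_to_uri('http://example.com/päth') == 'http://example.com/p%C3%A4th'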
+
+
+def to_high_limit_path(path):
+ if sys.platform in ['win32', 'cygwin']:
+ # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
+ return '\\\\?\\' + os.path.abspath(path)
+
+ return path
+
+
+def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
+ val = traversal.traverse_obj(obj, *variadic(field))
+ if not val if ignore is NO_DEFAULT else val in variadic(ignore):
+ return default
+ return template % func(val)
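+
+# e.g. format_field({'height': 1080}, 'height', '%sp') == '1080p'
+#      format_field({}, 'height', '%sp') == ''  (missing values fall back to `default`)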
+
+
+def clean_podcast_url(url):
+ url = re.sub(r'''(?x)
+ (?:
+ (?:
+ chtbl\.com/track|
+ media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
+ play\.podtrac\.com|
+ chrt\.fm/track|
+ mgln\.ai/e
+ )(?:/[^/.]+)?|
+ (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
+ flex\.acast\.com|
+ pd(?:
+ cn\.co| # https://podcorn.com/analytics-prefix/
+ st\.fm # https://podsights.com/docs/
+ )/e|
+ [0-9]\.gum\.fm|
+ pscrb\.fm/rss/p
+ )/''', '', url)
+ return re.sub(r'^\w+://(\w+://)', r'\1', url)
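+
+# e.g. (hypothetical tracker prefix)
+#   clean_podcast_url('https://chtbl.com/track/12345/traffic.example.com/ep.mp3')
+#   == 'https://traffic.example.com/ep.mp3'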
+
+
+_HEX_TABLE = '0123456789abcdef'
+
+
+def random_uuidv4():
+    # 'x' is a random hex digit; 'y' carries the RFC 4122 variant bits and must
+    # be one of 8, 9, a or b
+    return re.sub(r'[xy]', lambda m: _HEX_TABLE[random.randint(8, 11)] if m.group(0) == 'y' else _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
+
+
+def make_dir(path, to_screen=None):
+ try:
+ dn = os.path.dirname(path)
+ if dn:
+ os.makedirs(dn, exist_ok=True)
+ return True
+ except OSError as err:
+        if callable(to_screen):
+            to_screen(f'unable to create directory: {err}')
+ return False
+
+
+def get_executable_path():
+ from zipimport import zipimporter
+ if hasattr(sys, 'frozen'): # Running from PyInstaller
+ path = os.path.dirname(sys.executable)
+ elif isinstance(globals().get('__loader__'), zipimporter): # Running from ZIP
+ path = os.path.join(os.path.dirname(__file__), '../..')
+ else:
+ path = os.path.join(os.path.dirname(__file__), '..')
+ return os.path.abspath(path)
+
+
+def get_user_config_dirs(package_name):
+ # .config (e.g. ~/.config/package_name)
+ xdg_config_home = os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config')
+ yield os.path.join(xdg_config_home, package_name)
+
+ # appdata (%APPDATA%/package_name)
+ appdata_dir = os.getenv('appdata')
+ if appdata_dir:
+ yield os.path.join(appdata_dir, package_name)
+
+ # home (~/.package_name)
+ yield os.path.join(compat_expanduser('~'), f'.{package_name}')
+
+
+def get_system_config_dirs(package_name):
+ # /etc/package_name
+ yield os.path.join('/etc', package_name)
+
+
+def time_seconds(**kwargs):
+ """
+ Returns TZ-aware time in seconds since the epoch (1970-01-01T00:00:00Z)
+ """
+ return time.time() + datetime.timedelta(**kwargs).total_seconds()
+
+
+# Create a JSON Web Signature (JWS) with the HS256 algorithm;
+# the resulting token is in JWS Compact Serialization format
+# Implemented following JWT: https://www.rfc-editor.org/rfc/rfc7519.html
+# Implemented following JWS: https://www.rfc-editor.org/rfc/rfc7515.html
+def jwt_encode_hs256(payload_data, key, headers={}):
+ header_data = {
+ 'alg': 'HS256',
+ 'typ': 'JWT',
+ }
+ if headers:
+ header_data.update(headers)
+ header_b64 = base64.b64encode(json.dumps(header_data).encode())
+ payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
+ h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
+ signature_b64 = base64.b64encode(h.digest())
+ token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
+ return token
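+
+# Note: standard base64 (with '+', '/' and padding) is used above rather than the
+# unpadded base64url alphabet that RFC 7515 specifies; the behaviour is kept
+# as-is since existing callers may depend on it.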
+
+
+# Can be extended in the future to verify the signature, parse the header, and
+# return the algorithm used if it's not HS256
+def jwt_decode_hs256(jwt):
+ header_b64, payload_b64, signature_b64 = jwt.split('.')
+    # Re-add any trailing '='s that may have been stripped; superfluous '='s are ignored
+ payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
+ return payload_data
+
+
+WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
+
+
+@functools.cache
+def supports_terminal_sequences(stream):
+ if compat_os_name == 'nt':
+ if not WINDOWS_VT_MODE:
+ return False
+ elif not os.getenv('TERM'):
+ return False
+ try:
+ return stream.isatty()
+ except BaseException:
+ return False
+
+
+def windows_enable_vt_mode():
+ """Ref: https://bugs.python.org/issue30075 """
+ if get_windows_version() < (10, 0, 10586):
+ return
+
+ import ctypes
+ import ctypes.wintypes
+ import msvcrt
+
+ ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004
+
+ dll = ctypes.WinDLL('kernel32', use_last_error=False)
+ handle = os.open('CONOUT$', os.O_RDWR)
+ try:
+ h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
+ dw_original_mode = ctypes.wintypes.DWORD()
+ success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
+ if not success:
+ raise Exception('GetConsoleMode failed')
+
+ success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
+ dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
+ if not success:
+ raise Exception('SetConsoleMode failed')
+ finally:
+ os.close(handle)
+
+ global WINDOWS_VT_MODE
+ WINDOWS_VT_MODE = True
+ supports_terminal_sequences.cache_clear()
+
+
+_terminal_sequences_re = re.compile('\033\\[[^m]+m')
+
+
+def remove_terminal_sequences(string):
+ return _terminal_sequences_re.sub('', string)
+
+
+def number_of_digits(number):
+ return len('%d' % number)
+
+
+def join_nonempty(*values, delim='-', from_dict=None):
+ if from_dict is not None:
+ values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)
+ return delim.join(map(str, filter(None, values)))
+
+
+def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
+ """
+ Find the largest format dimensions in terms of video width and, for each thumbnail:
+    * Modify the URL: Match the width with the provided regex and replace it with the largest format width
+ * Update dimensions
+
+ This function is useful with video services that scale the provided thumbnails on demand
+ """
+ _keys = ('width', 'height')
+ max_dimensions = max(
+ (tuple(format.get(k) or 0 for k in _keys) for format in formats),
+ default=(0, 0))
+ if not max_dimensions[0]:
+ return thumbnails
+ return [
+ merge_dicts(
+ {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
+ dict(zip(_keys, max_dimensions)), thumbnail)
+ for thumbnail in thumbnails
+ ]
+
+
+def parse_http_range(range):
+ """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
+ if not range:
+ return None, None, None
+ crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
+ if not crg:
+ return None, None, None
+ return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
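+
+# e.g. parse_http_range('bytes 0-499/1234') == (0, 499, 1234)
+#      parse_http_range('bytes=500-') == (500, None, None)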
+
+
+def read_stdin(what):
+ eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
+ write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
+ return sys.stdin
+
+
+def determine_file_encoding(data):
+ """
+ Detect the text encoding used
+ @returns (encoding, bytes to skip)
+ """
+
+ # BOM marks are given priority over declarations
+ for bom, enc in BOMS:
+ if data.startswith(bom):
+ return enc, len(bom)
+
+ # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
+ # We ignore the endianness to get a good enough match
+ data = data.replace(b'\0', b'')
+ mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
+ return mobj.group(1).decode() if mobj else None, 0
+
+
+class Config:
+ own_args = None
+ parsed_args = None
+ filename = None
+ __initialized = False
+
+ def __init__(self, parser, label=None):
+ self.parser, self.label = parser, label
+ self._loaded_paths, self.configs = set(), []
+
+ def init(self, args=None, filename=None):
+ assert not self.__initialized
+ self.own_args, self.filename = args, filename
+ return self.load_configs()
+
+ def load_configs(self):
+ directory = ''
+ if self.filename:
+ location = os.path.realpath(self.filename)
+ directory = os.path.dirname(location)
+ if location in self._loaded_paths:
+ return False
+ self._loaded_paths.add(location)
+
+ self.__initialized = True
+ opts, _ = self.parser.parse_known_args(self.own_args)
+ self.parsed_args = self.own_args
+ for location in opts.config_locations or []:
+ if location == '-':
+ if location in self._loaded_paths:
+ continue
+ self._loaded_paths.add(location)
+ self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
+ continue
+ location = os.path.join(directory, expand_path(location))
+ if os.path.isdir(location):
+ location = os.path.join(location, 'hypervideo.conf')
+ if not os.path.exists(location):
+ self.parser.error(f'config location {location} does not exist')
+ self.append_config(self.read_file(location), location)
+ return True
+
+ def __str__(self):
+ label = join_nonempty(
+ self.label, 'config', f'"{self.filename}"' if self.filename else '',
+ delim=' ')
+ return join_nonempty(
+ self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
+ *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
+ delim='\n')
+
+ @staticmethod
+ def read_file(filename, default=[]):
+ try:
+ optionf = open(filename, 'rb')
+ except OSError:
+ return default # silently skip if file is not present
+ try:
+ enc, skip = determine_file_encoding(optionf.read(512))
+ optionf.seek(skip, io.SEEK_SET)
+ except OSError:
+ enc = None # silently skip read errors
+ try:
+ # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
+ contents = optionf.read().decode(enc or preferredencoding())
+ res = shlex.split(contents, comments=True)
+ except Exception as err:
+ raise ValueError(f'Unable to parse "{filename}": {err}')
+ finally:
+ optionf.close()
+ return res
+
+ @staticmethod
+ def hide_login_info(opts):
+ PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
+ eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
+
+ def _scrub_eq(o):
+ m = eqre.match(o)
+ if m:
+ return m.group('key') + '=PRIVATE'
+ else:
+ return o
+
+ opts = list(map(_scrub_eq, opts))
+ for idx, opt in enumerate(opts):
+ if opt in PRIVATE_OPTS and idx + 1 < len(opts):
+ opts[idx + 1] = 'PRIVATE'
+ return opts
+
+ def append_config(self, *args, label=None):
+ config = type(self)(self.parser, label)
+ config._loaded_paths = self._loaded_paths
+ if config.init(*args):
+ self.configs.append(config)
+
+ @property
+ def all_args(self):
+ for config in reversed(self.configs):
+ yield from config.all_args
+ yield from self.parsed_args or []
+
+ def parse_known_args(self, **kwargs):
+ return self.parser.parse_known_args(self.all_args, **kwargs)
+
+ def parse_args(self):
+ return self.parser.parse_args(self.all_args)
+
+
+class WebSocketsWrapper:
+ """Wraps websockets module to use in non-async scopes"""
+ pool = None
+
+ def __init__(self, url, headers=None, connect=True):
+ self.loop = asyncio.new_event_loop()
+ # XXX: "loop" is deprecated
+ self.conn = websockets.connect(
+ url, extra_headers=headers, ping_interval=None,
+ close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
+ if connect:
+ self.__enter__()
+ atexit.register(self.__exit__, None, None, None)
+
+ def __enter__(self):
+ if not self.pool:
+ self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
+ return self
+
+ def send(self, *args):
+ self.run_with_loop(self.pool.send(*args), self.loop)
+
+ def recv(self, *args):
+ return self.run_with_loop(self.pool.recv(*args), self.loop)
+
+ def __exit__(self, type, value, traceback):
+ try:
+ return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
+ finally:
+            # Cancel any remaining tasks before closing the loop; the reverse order
+            # would attempt to run coroutines on an already-closed loop
+            self._cancel_all_tasks(self.loop)
+            self.loop.close()
+
+    # Taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
+    # For contributors: if any new library that uses asyncio needs to run in a non-async context,
+    # move these functions out of this class
+ @staticmethod
+ def run_with_loop(main, loop):
+ if not asyncio.iscoroutine(main):
+ raise ValueError(f'a coroutine was expected, got {main!r}')
+
+ try:
+ return loop.run_until_complete(main)
+ finally:
+ loop.run_until_complete(loop.shutdown_asyncgens())
+ if hasattr(loop, 'shutdown_default_executor'):
+ loop.run_until_complete(loop.shutdown_default_executor())
+
+ @staticmethod
+ def _cancel_all_tasks(loop):
+ to_cancel = asyncio.all_tasks(loop)
+
+ if not to_cancel:
+ return
+
+ for task in to_cancel:
+ task.cancel()
+
+ # XXX: "loop" is removed in python 3.10+
+ loop.run_until_complete(
+ asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
+
+ for task in to_cancel:
+ if task.cancelled():
+ continue
+ if task.exception() is not None:
+ loop.call_exception_handler({
+ 'message': 'unhandled exception during asyncio.run() shutdown',
+ 'exception': task.exception(),
+ 'task': task,
+ })
+
+
+def merge_headers(*dicts):
+ """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
+ return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
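+
+# e.g. merge_headers({'user-agent': 'A'}, {'USER-AGENT': 'B'}) == {'User-Agent': 'B'}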
+
+
+def cached_method(f):
+ """Cache a method"""
+ signature = inspect.signature(f)
+
+ @functools.wraps(f)
+ def wrapper(self, *args, **kwargs):
+ bound_args = signature.bind(self, *args, **kwargs)
+ bound_args.apply_defaults()
+ key = tuple(bound_args.arguments.values())[1:]
+
+ cache = vars(self).setdefault('_cached_method__cache', {}).setdefault(f.__name__, {})
+ if key not in cache:
+ cache[key] = f(self, *args, **kwargs)
+ return cache[key]
+ return wrapper
+
+
+class classproperty:
+ """property access for class methods with optional caching"""
+ def __new__(cls, func=None, *args, **kwargs):
+ if not func:
+ return functools.partial(cls, *args, **kwargs)
+ return super().__new__(cls)
+
+ def __init__(self, func, *, cache=False):
+ functools.update_wrapper(self, func)
+ self.func = func
+ self._cache = {} if cache else None
+
+ def __get__(self, _, cls):
+ if self._cache is None:
+ return self.func(cls)
+ elif cls not in self._cache:
+ self._cache[cls] = self.func(cls)
+ return self._cache[cls]
+
+
+class function_with_repr:
+ def __init__(self, func, repr_=None):
+ functools.update_wrapper(self, func)
+ self.func, self.__repr = func, repr_
+
+ def __call__(self, *args, **kwargs):
+ return self.func(*args, **kwargs)
+
+ def __repr__(self):
+ if self.__repr:
+ return self.__repr
+ return f'{self.func.__module__}.{self.func.__qualname__}'
+
+
+class Namespace(types.SimpleNamespace):
+ """Immutable namespace"""
+
+ def __iter__(self):
+ return iter(self.__dict__.values())
+
+ @property
+ def items_(self):
+ return self.__dict__.items()
+
+
+MEDIA_EXTENSIONS = Namespace(
+ common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
+ video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
+ common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
+ audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
+ thumbnails=('jpg', 'png', 'webp'),
+ storyboards=('mhtml', ),
+ subtitles=('srt', 'vtt', 'ass', 'lrc'),
+ manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
+)
+MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
+MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio
+
+KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
+
+
+class RetryManager:
+ """Usage:
+ for retry in RetryManager(...):
+ try:
+ ...
+ except SomeException as err:
+ retry.error = err
+ continue
+ """
+ attempt, _error = 0, None
+
+ def __init__(self, _retries, _error_callback, **kwargs):
+ self.retries = _retries or 0
+ self.error_callback = functools.partial(_error_callback, **kwargs)
+
+ def _should_retry(self):
+ return self._error is not NO_DEFAULT and self.attempt <= self.retries
+
+ @property
+ def error(self):
+ if self._error is NO_DEFAULT:
+ return None
+ return self._error
+
+ @error.setter
+ def error(self, value):
+ self._error = value
+
+ def __iter__(self):
+ while self._should_retry():
+ self.error = NO_DEFAULT
+ self.attempt += 1
+ yield self
+ if self.error:
+ self.error_callback(self.error, self.attempt, self.retries)
+
+ @staticmethod
+ def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
+ """Utility function for reporting retries"""
+ if count > retries:
+ if error:
+ return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
+ raise e
+
+ if not count:
+ return warn(e)
+ elif isinstance(e, ExtractorError):
+ e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
+ warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')
+
+ delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
+ if delay:
+ info(f'Sleeping {delay:.2f} seconds ...')
+ time.sleep(delay)
+
+
+def make_archive_id(ie, video_id):
+ ie_key = ie if isinstance(ie, str) else ie.ie_key()
+ return f'{ie_key.lower()} {video_id}'
+
+
+def truncate_string(s, left, right=0):
+ assert left > 3 and right >= 0
+ if s is None or len(s) <= left + right:
+ return s
+ return f'{s[:left-3]}...{s[-right:] if right else ""}'
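+
+# e.g. truncate_string('abcdefghij', 5) == 'ab...'
+#      truncate_string('abcdefghij', 5, 2) == 'ab...ij'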
+
+
+def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
+ assert 'all' in alias_dict, '"all" alias is required'
+ requested = list(start or [])
+ for val in options:
+ discard = val.startswith('-')
+ if discard:
+ val = val[1:]
+
+ if val in alias_dict:
+ val = alias_dict[val] if not discard else [
+ i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]]
+ # NB: Do not allow regex in aliases for performance
+ requested = orderedSet_from_options(val, alias_dict, start=requested)
+ continue
+
+ current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex
+ else [val] if val in alias_dict['all'] else None)
+ if current is None:
+ raise ValueError(val)
+
+ if discard:
+ for item in current:
+ while item in requested:
+ requested.remove(item)
+ else:
+ requested.extend(current)
+
+ return orderedSet(requested)
+
+
+# TODO: Rewrite
+class FormatSorter:
+ regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
+
+ default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
+ 'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
+ 'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases
+ ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
+ 'height', 'width', 'proto', 'vext', 'abr', 'aext',
+ 'fps', 'fs_approx', 'source', 'id')
+
+ settings = {
+ 'vcodec': {'type': 'ordered', 'regex': True,
+ 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
+ 'acodec': {'type': 'ordered', 'regex': True,
+ 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
+ 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
+ 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
+ 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
+ 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
+ 'vext': {'type': 'ordered', 'field': 'video_ext',
+ 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
+ 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
+ 'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
+ 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
+ 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
+ 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
+ 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
+ 'field': ('vcodec', 'acodec'),
+ 'function': lambda it: int(any(v != 'none' for v in it))},
+ 'ie_pref': {'priority': True, 'type': 'extractor'},
+ 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
+ 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
+ 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
+ 'quality': {'convert': 'float', 'default': -1},
+ 'filesize': {'convert': 'bytes'},
+ 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
+ 'id': {'convert': 'string', 'field': 'format_id'},
+ 'height': {'convert': 'float_none'},
+ 'width': {'convert': 'float_none'},
+ 'fps': {'convert': 'float_none'},
+ 'channels': {'convert': 'float_none', 'field': 'audio_channels'},
+ 'tbr': {'convert': 'float_none'},
+ 'vbr': {'convert': 'float_none'},
+ 'abr': {'convert': 'float_none'},
+ 'asr': {'convert': 'float_none'},
+ 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
+
+ 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
+ 'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'), 'convert': 'float_none',
+ 'function': lambda it: next(filter(None, it), None)},
+ 'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'), 'convert': 'bytes',
+ 'function': lambda it: next(filter(None, it), None)},
+ 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
+ 'res': {'type': 'multiple', 'field': ('height', 'width'),
+ 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
+
+ # Actual field names
+ 'format_id': {'type': 'alias', 'field': 'id'},
+ 'preference': {'type': 'alias', 'field': 'ie_pref'},
+ 'language_preference': {'type': 'alias', 'field': 'lang'},
+ 'source_preference': {'type': 'alias', 'field': 'source'},
+ 'protocol': {'type': 'alias', 'field': 'proto'},
+ 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
+ 'audio_channels': {'type': 'alias', 'field': 'channels'},
+
+ # Deprecated
+ 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
+ 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
+ 'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
+ 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
+ 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
+ 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
+ 'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
+ 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
+ 'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
+ 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
+ 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
+ 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
+ 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
+ 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
+ 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
+ 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
+ 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
+ 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
+ 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
+ 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
+ }
+
+ def __init__(self, ydl, field_preference):
+ self.ydl = ydl
+ self._order = []
+ self.evaluate_params(self.ydl.params, field_preference)
+ if ydl.params.get('verbose'):
+ self.print_verbose_info(self.ydl.write_debug)
+
+ def _get_field_setting(self, field, key):
+ if field not in self.settings:
+ if key in ('forced', 'priority'):
+ return False
+ self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
+ 'deprecated and may be removed in a future version')
+ self.settings[field] = {}
+ propObj = self.settings[field]
+ if key not in propObj:
+ type = propObj.get('type')
+ if key == 'field':
+ default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
+ elif key == 'convert':
+ default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
+ else:
+ default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
+ propObj[key] = default
+ return propObj[key]
+
+ def _resolve_field_value(self, field, value, convertNone=False):
+ if value is None:
+ if not convertNone:
+ return None
+ else:
+ value = value.lower()
+ conversion = self._get_field_setting(field, 'convert')
+ if conversion == 'ignore':
+ return None
+ if conversion == 'string':
+ return value
+ elif conversion == 'float_none':
+ return float_or_none(value)
+ elif conversion == 'bytes':
+ return parse_bytes(value)
+ elif conversion == 'order':
+ order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
+ use_regex = self._get_field_setting(field, 'regex')
+ list_length = len(order_list)
+ empty_pos = order_list.index('') if '' in order_list else list_length + 1
+ if use_regex and value is not None:
+ for i, regex in enumerate(order_list):
+ if regex and re.match(regex, value):
+ return list_length - i
+ return list_length - empty_pos # not in list
+ else: # not regex or value = None
+ return list_length - (order_list.index(value) if value in order_list else empty_pos)
+ else:
+ if value.isnumeric():
+ return float(value)
+ else:
+ self.settings[field]['convert'] = 'string'
+ return value
+
+ def evaluate_params(self, params, sort_extractor):
+ self._use_free_order = params.get('prefer_free_formats', False)
+ self._sort_user = params.get('format_sort', [])
+ self._sort_extractor = sort_extractor
+
+ def add_item(field, reverse, closest, limit_text):
+ field = field.lower()
+ if field in self._order:
+ return
+ self._order.append(field)
+ limit = self._resolve_field_value(field, limit_text)
+ data = {
+ 'reverse': reverse,
+ 'closest': False if limit is None else closest,
+ 'limit_text': limit_text,
+ 'limit': limit}
+ if field in self.settings:
+ self.settings[field].update(data)
+ else:
+ self.settings[field] = data
+
+ sort_list = (
+ tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
+ + (tuple() if params.get('format_sort_force', False)
+ else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
+ + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
+
+ for item in sort_list:
+ match = re.match(self.regex, item)
+ if match is None:
+ raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
+ field = match.group('field')
+ if field is None:
+ continue
+ if self._get_field_setting(field, 'type') == 'alias':
+ alias, field = field, self._get_field_setting(field, 'field')
+ if self._get_field_setting(alias, 'deprecated'):
+ self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
+ f'be removed in a future version. Please use {field} instead')
+ reverse = match.group('reverse') is not None
+ closest = match.group('separator') == '~'
+ limit_text = match.group('limit')
+
+ has_limit = limit_text is not None
+ has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
+ has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
+
+ fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
+ limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
+ limit_count = len(limits)
+ for (i, f) in enumerate(fields):
+ add_item(f, reverse, closest,
+ limits[i] if i < limit_count
+ else limits[0] if has_limit and not has_multiple_limits
+ else None)
+
+ def print_verbose_info(self, write_debug):
+ if self._sort_user:
+ write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
+ if self._sort_extractor:
+ write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
+ write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
+ '+' if self._get_field_setting(field, 'reverse') else '', field,
+ '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
+ self._get_field_setting(field, 'limit_text'),
+ self._get_field_setting(field, 'limit'))
+ if self._get_field_setting(field, 'limit_text') is not None else '')
+ for field in self._order if self._get_field_setting(field, 'visible')]))
+
+ def _calculate_field_preference_from_value(self, format, field, type, value):
+ reverse = self._get_field_setting(field, 'reverse')
+ closest = self._get_field_setting(field, 'closest')
+ limit = self._get_field_setting(field, 'limit')
+
+ if type == 'extractor':
+ maximum = self._get_field_setting(field, 'max')
+ if value is None or (maximum is not None and value >= maximum):
+ value = -1
+ elif type == 'boolean':
+ in_list = self._get_field_setting(field, 'in_list')
+ not_in_list = self._get_field_setting(field, 'not_in_list')
+ value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
+ elif type == 'ordered':
+ value = self._resolve_field_value(field, value, True)
+
+ # try to convert to number
+ val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
+ is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
+ if is_num:
+ value = val_num
+
+ return ((-10, 0) if value is None
+ else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
+ else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
+ else (0, value, 0) if not reverse and (limit is None or value <= limit)
+ else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
+ else (-1, value, 0))
+
+ def _calculate_field_preference(self, format, field):
+ type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
+ get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
+ if type == 'multiple':
+ type = 'field' # Only 'field' is allowed in multiple for now
+ actual_fields = self._get_field_setting(field, 'field')
+
+ value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
+ else:
+ value = get_value(field)
+ return self._calculate_field_preference_from_value(format, field, type, value)
+
+ def calculate_preference(self, format):
+ # Determine missing protocol
+ if not format.get('protocol'):
+ format['protocol'] = determine_protocol(format)
+
+ # Determine missing ext
+ if not format.get('ext') and 'url' in format:
+ format['ext'] = determine_ext(format['url'])
+ if format.get('vcodec') == 'none':
+ format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
+ format['video_ext'] = 'none'
+ else:
+ format['video_ext'] = format['ext']
+ format['audio_ext'] = 'none'
+ # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
+ # format['preference'] = -1000
+
+ if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
+ # HEVC-over-FLV is out-of-spec by FLV's original spec
+ # ref. https://trac.ffmpeg.org/ticket/6389
+ # ref. https://github.com/hypervideo/hypervideo/pull/5821
+ format['preference'] = -100
+
+ # Determine missing bitrates
+ if format.get('vcodec') == 'none':
+ format['vbr'] = 0
+ if format.get('acodec') == 'none':
+ format['abr'] = 0
+ if not format.get('vbr') and format.get('vcodec') != 'none':
+ format['vbr'] = try_call(lambda: format['tbr'] - format['abr']) or None
+ if not format.get('abr') and format.get('acodec') != 'none':
+ format['abr'] = try_call(lambda: format['tbr'] - format['vbr']) or None
+ if not format.get('tbr'):
+ format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None
+
+ return tuple(self._calculate_field_preference(format, field) for field in self._order)
+
+
+# XXX: Temporary
+class _YDLLogger:
+ def __init__(self, ydl=None):
+ self._ydl = ydl
+
+ def debug(self, message):
+ if self._ydl:
+ self._ydl.write_debug(message)
+
+ def info(self, message):
+ if self._ydl:
+ self._ydl.to_screen(message)
+
+ def warning(self, message, *, once=False):
+ if self._ydl:
+ self._ydl.report_warning(message, once)
+
+ def error(self, message, *, is_error=True):
+ if self._ydl:
+ self._ydl.report_error(message, is_error=is_error)
+
+ def stdout(self, message):
+ if self._ydl:
+ self._ydl.to_stdout(message)
+
+ def stderr(self, message):
+ if self._ydl:
+ self._ydl.to_stderr(message)
diff --git a/hypervideo_dl/utils/networking.py b/hypervideo_dl/utils/networking.py
new file mode 100644
index 0000000..ba0493c
--- /dev/null
+++ b/hypervideo_dl/utils/networking.py
@@ -0,0 +1,163 @@
+import collections
+import random
+import urllib.parse
+import urllib.request
+
+from ._utils import remove_start
+
+
+def random_user_agent():
+ _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
+ _CHROME_VERSIONS = (
+ '90.0.4430.212',
+ '90.0.4430.24',
+ '90.0.4430.70',
+ '90.0.4430.72',
+ '90.0.4430.85',
+ '90.0.4430.93',
+ '91.0.4472.101',
+ '91.0.4472.106',
+ '91.0.4472.114',
+ '91.0.4472.124',
+ '91.0.4472.164',
+ '91.0.4472.19',
+ '91.0.4472.77',
+ '92.0.4515.107',
+ '92.0.4515.115',
+ '92.0.4515.131',
+ '92.0.4515.159',
+ '92.0.4515.43',
+ '93.0.4556.0',
+ '93.0.4577.15',
+ '93.0.4577.63',
+ '93.0.4577.82',
+ '94.0.4606.41',
+ '94.0.4606.54',
+ '94.0.4606.61',
+ '94.0.4606.71',
+ '94.0.4606.81',
+ '94.0.4606.85',
+ '95.0.4638.17',
+ '95.0.4638.50',
+ '95.0.4638.54',
+ '95.0.4638.69',
+ '95.0.4638.74',
+ '96.0.4664.18',
+ '96.0.4664.45',
+ '96.0.4664.55',
+ '96.0.4664.93',
+ '97.0.4692.20',
+ )
+ return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
+
+
+class HTTPHeaderDict(collections.UserDict, dict):
+ """
+ Store and access keys case-insensitively.
+ The constructor can take multiple dicts, in which keys in the latter are prioritised.
+ """
+
+ def __init__(self, *args, **kwargs):
+ super().__init__()
+ for dct in args:
+ if dct is not None:
+ self.update(dct)
+ self.update(kwargs)
+
+ def __setitem__(self, key, value):
+ if isinstance(value, bytes):
+ value = value.decode('latin-1')
+ super().__setitem__(key.title(), str(value))
+
+ def __getitem__(self, key):
+ return super().__getitem__(key.title())
+
+ def __delitem__(self, key):
+ super().__delitem__(key.title())
+
+ def __contains__(self, key):
+ return super().__contains__(key.title() if isinstance(key, str) else key)
+
+
+std_headers = HTTPHeaderDict({
+ 'User-Agent': random_user_agent(),
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+ 'Accept-Language': 'en-us,en;q=0.5',
+ 'Sec-Fetch-Mode': 'navigate',
+})
+
+
+def clean_proxies(proxies: dict, headers: HTTPHeaderDict):
+ req_proxy = headers.pop('Ytdl-Request-Proxy', None)
+ if req_proxy:
+ proxies.clear() # XXX: compat: Ytdl-Request-Proxy takes preference over everything, including NO_PROXY
+ proxies['all'] = req_proxy
+ for proxy_key, proxy_url in proxies.items():
+ if proxy_url == '__noproxy__':
+ proxies[proxy_key] = None
+ continue
+ if proxy_key == 'no': # special case
+ continue
+ if proxy_url is not None:
+ # Ensure proxies without a scheme are http.
+ try:
+ proxy_scheme = urllib.request._parse_proxy(proxy_url)[0]
+ except ValueError:
+ # Ignore invalid proxy URLs. Sometimes these may be introduced through environment
+ # variables unrelated to proxy settings - e.g. Colab `COLAB_LANGUAGE_SERVER_PROXY`.
+ # If the proxy is going to be used, the Request Handler proxy validation will handle it.
+ continue
+ if proxy_scheme is None:
+ proxies[proxy_key] = 'http://' + remove_start(proxy_url, '//')
+
+ replace_scheme = {
+ 'socks5': 'socks5h', # compat: socks5 was treated as socks5h
+ 'socks': 'socks4' # compat: non-standard
+ }
+ if proxy_scheme in replace_scheme:
+ proxies[proxy_key] = urllib.parse.urlunparse(
+ urllib.parse.urlparse(proxy_url)._replace(scheme=replace_scheme[proxy_scheme]))
+
+
+def clean_headers(headers: HTTPHeaderDict):
+ if 'Youtubedl-No-Compression' in headers: # compat
+ del headers['Youtubedl-No-Compression']
+ headers['Accept-Encoding'] = 'identity'
+
+
+def remove_dot_segments(path):
+    # Implements RFC 3986 section 5.2.4 "Remove Dot Segments"
+ # Pseudo-code: https://tools.ietf.org/html/rfc3986#section-5.2.4
+ # https://github.com/urllib3/urllib3/blob/ba49f5c4e19e6bca6827282feb77a3c9f937e64b/src/urllib3/util/url.py#L263
+ output = []
+ segments = path.split('/')
+ for s in segments:
+ if s == '.':
+ continue
+ elif s == '..':
+ if output:
+ output.pop()
+ else:
+ output.append(s)
+ if not segments[0] and (not output or output[0]):
+ output.insert(0, '')
+ if segments[-1] in ('.', '..'):
+ output.append('')
+ return '/'.join(output)
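+
+# Examples from RFC 3986 section 5.2.4:
+#   remove_dot_segments('/a/b/c/./../../g') == '/a/g'
+#   remove_dot_segments('mid/content=5/../6') == 'mid/6'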
+
+
+def escape_rfc3986(s):
+ """Escape non-ASCII characters as suggested by RFC 3986"""
+ return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
+
+
+def normalize_url(url):
+ """Normalize URL as suggested by RFC 3986"""
+ url_parsed = urllib.parse.urlparse(url)
+ return url_parsed._replace(
+ netloc=url_parsed.netloc.encode('idna').decode('ascii'),
+ path=escape_rfc3986(remove_dot_segments(url_parsed.path)),
+ params=escape_rfc3986(url_parsed.params),
+ query=escape_rfc3986(url_parsed.query),
+ fragment=escape_rfc3986(url_parsed.fragment)
+ ).geturl()
diff --git a/hypervideo_dl/utils/traversal.py b/hypervideo_dl/utils/traversal.py
new file mode 100644
index 0000000..462c3ba
--- /dev/null
+++ b/hypervideo_dl/utils/traversal.py
@@ -0,0 +1,254 @@
+import collections.abc
+import contextlib
+import inspect
+import itertools
+import re
+
+from ._utils import (
+ IDENTITY,
+ NO_DEFAULT,
+ LazyList,
+ int_or_none,
+ is_iterable_like,
+ try_call,
+ variadic,
+)
+
+
+def traverse_obj(
+ obj, *paths, default=NO_DEFAULT, expected_type=None, get_all=True,
+ casesense=True, is_user_input=False, traverse_string=False):
+ """
+ Safely traverse nested `dict`s and `Iterable`s
+
+ >>> obj = [{}, {"key": "value"}]
+ >>> traverse_obj(obj, (1, "key"))
+ "value"
+
+ Each of the provided `paths` is tested and the first producing a valid result will be returned.
+ The next path will also be tested if the path branched but no results could be found.
+ Supported values for traversal are `Mapping`, `Iterable` and `re.Match`.
+ Unhelpful values (`{}`, `None`) are treated as the absence of a value and discarded.
+
+ The paths will be wrapped in `variadic`, so that `'key'` is conveniently the same as `('key', )`.
+
+ The keys in the path can be one of:
+ - `None`: Return the current object.
+ - `set`: Requires the only item in the set to be a type or function,
+ like `{type}`/`{func}`. If a `type`, returns only values
+ of this type. If a function, returns `func(obj)`.
+ - `str`/`int`: Return `obj[key]`. For `re.Match`, return `obj.group(key)`.
+ - `slice`: Branch out and return all values in `obj[key]`.
+ - `Ellipsis`: Branch out and return a list of all values.
+ - `tuple`/`list`: Branch out and return a list of all matching values.
+ Read as: `[traverse_obj(obj, branch) for branch in branches]`.
+ - `function`: Branch out and return values filtered by the function.
+ Read as: `[value for key, value in obj if function(key, value)]`.
+ For `Iterable`s, `key` is the index of the value.
+ For `re.Match`es, `key` is the group number (0 = full match)
+ as well as additionally any group names, if given.
+ - `dict`: Transform the current object and return a matching dict.
+ Read as: `{key: traverse_obj(obj, path) for key, path in dct.items()}`.
+
+ `tuple`, `list`, and `dict` all support nested paths and branches.
+
+ @params paths Paths which to traverse by.
+ @param default Value to return if the paths do not match.
+ If the last key in the path is a `dict`, it will apply to each value inside
+ the dict instead, depth first. Try to avoid if using nested `dict` keys.
+ @param expected_type If a `type`, only accept final values of this type.
+ If any other callable, try to call the function on each result.
+ If the last key in the path is a `dict`, it will apply to each value inside
+ the dict instead, recursively. This does respect branching paths.
+ @param get_all If `False`, return the first matching result, otherwise all matching ones.
+ @param casesense If `False`, consider string dictionary keys as case insensitive.
+
+ The following are only meant to be used by YoutubeDL.prepare_outtmpl and are not part of the API
+
+ @param is_user_input Whether the keys are generated from user input.
+ If `True` strings get converted to `int`/`slice` if needed.
+ @param traverse_string Whether to traverse into objects as strings.
+ If `True`, any non-compatible object will first be
+ converted into a string and then traversed into.
+ The return value of that path will be a string instead,
+ not respecting any further branching.
+
+
+ @returns The result of the object traversal.
+ If successful, `get_all=True`, and the path branches at least once,
+ then a list of results is returned instead.
+ If no `default` is given and the last path branches, a `list` of results
+ is always returned. If a path ends on a `dict` that result will always be a `dict`.
+ """
+ casefold = lambda k: k.casefold() if isinstance(k, str) else k
+
+ if isinstance(expected_type, type):
+ type_test = lambda val: val if isinstance(val, expected_type) else None
+ else:
+ type_test = lambda val: try_call(expected_type or IDENTITY, args=(val,))
+
+ def apply_key(key, obj, is_last):
+ branching = False
+ result = None
+
+ if obj is None and traverse_string:
+ if key is ... or callable(key) or isinstance(key, slice):
+ branching = True
+ result = ()
+
+ elif key is None:
+ result = obj
+
+ elif isinstance(key, set):
+ assert len(key) == 1, 'Set should only be used to wrap a single item'
+ item = next(iter(key))
+ if isinstance(item, type):
+ if isinstance(obj, item):
+ result = obj
+ else:
+ result = try_call(item, args=(obj,))
+
+ elif isinstance(key, (list, tuple)):
+ branching = True
+ result = itertools.chain.from_iterable(
+ apply_path(obj, branch, is_last)[0] for branch in key)
+
+ elif key is ...:
+ branching = True
+ if isinstance(obj, collections.abc.Mapping):
+ result = obj.values()
+ elif is_iterable_like(obj):
+ result = obj
+ elif isinstance(obj, re.Match):
+ result = obj.groups()
+ elif traverse_string:
+ branching = False
+ result = str(obj)
+ else:
+ result = ()
+
+ elif callable(key):
+ branching = True
+ if isinstance(obj, collections.abc.Mapping):
+ iter_obj = obj.items()
+ elif is_iterable_like(obj):
+ iter_obj = enumerate(obj)
+ elif isinstance(obj, re.Match):
+ iter_obj = itertools.chain(
+ enumerate((obj.group(), *obj.groups())),
+ obj.groupdict().items())
+ elif traverse_string:
+ branching = False
+ iter_obj = enumerate(str(obj))
+ else:
+ iter_obj = ()
+
+ result = (v for k, v in iter_obj if try_call(key, args=(k, v)))
+ if not branching: # string traversal
+ result = ''.join(result)
+
+ elif isinstance(key, dict):
+ iter_obj = ((k, _traverse_obj(obj, v, False, is_last)) for k, v in key.items())
+ result = {
+ k: v if v is not None else default for k, v in iter_obj
+ if v is not None or default is not NO_DEFAULT
+ } or None
+
+ elif isinstance(obj, collections.abc.Mapping):
+ result = (try_call(obj.get, args=(key,)) if casesense or try_call(obj.__contains__, args=(key,)) else
+ next((v for k, v in obj.items() if casefold(k) == key), None))
+
+ elif isinstance(obj, re.Match):
+ if isinstance(key, int) or casesense:
+ with contextlib.suppress(IndexError):
+ result = obj.group(key)
+
+ elif isinstance(key, str):
+ result = next((v for k, v in obj.groupdict().items() if casefold(k) == key), None)
+
+ elif isinstance(key, (int, slice)):
+ if is_iterable_like(obj, collections.abc.Sequence):
+ branching = isinstance(key, slice)
+ with contextlib.suppress(IndexError):
+ result = obj[key]
+ elif traverse_string:
+ with contextlib.suppress(IndexError):
+ result = str(obj)[key]
+
+ return branching, result if branching else (result,)
+
+ def lazy_last(iterable):
+ iterator = iter(iterable)
+ prev = next(iterator, NO_DEFAULT)
+ if prev is NO_DEFAULT:
+ return
+
+ for item in iterator:
+ yield False, prev
+ prev = item
+
+ yield True, prev
+
+ def apply_path(start_obj, path, test_type):
+ objs = (start_obj,)
+ has_branched = False
+
+ key = None
+ for last, key in lazy_last(variadic(path, (str, bytes, dict, set))):
+ if is_user_input and isinstance(key, str):
+ if key == ':':
+ key = ...
+ elif ':' in key:
+ key = slice(*map(int_or_none, key.split(':')))
+ elif int_or_none(key) is not None:
+ key = int(key)
+
+ if not casesense and isinstance(key, str):
+ key = key.casefold()
+
+ if __debug__ and callable(key):
+ # Verify function signature
+ inspect.signature(key).bind(None, None)
+
+ new_objs = []
+ for obj in objs:
+ branching, results = apply_key(key, obj, last)
+ has_branched |= branching
+ new_objs.append(results)
+
+ objs = itertools.chain.from_iterable(new_objs)
+
+ if test_type and not isinstance(key, (dict, list, tuple)):
+ objs = map(type_test, objs)
+
+ return objs, has_branched, isinstance(key, dict)
+
+ def _traverse_obj(obj, path, allow_empty, test_type):
+ results, has_branched, is_dict = apply_path(obj, path, test_type)
+ results = LazyList(item for item in results if item not in (None, {}))
+ if get_all and has_branched:
+ if results:
+ return results.exhaust()
+ if allow_empty:
+ return [] if default is NO_DEFAULT else default
+ return None
+
+ return results[0] if results else {} if allow_empty and is_dict else None
+
+ for index, path in enumerate(paths, 1):
+ result = _traverse_obj(obj, path, index == len(paths), True)
+ if result is not None:
+ return result
+
+ return None if default is NO_DEFAULT else default
+
+
+def get_first(obj, *paths, **kwargs):
+ return traverse_obj(obj, *((..., *variadic(keys)) for keys in paths), **kwargs, get_all=False)
+
+
+def dict_get(d, key_or_keys, default=None, skip_false_values=True):
+ for val in map(d.get, variadic(key_or_keys)):
+ if val is not None and (val or not skip_false_values):
+ return val
+ return default
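
A few usage examples matching the docstring above (an illustrative sketch; it assumes the new hypervideo_dl.utils.traversal module is importable):

from hypervideo_dl.utils.traversal import traverse_obj

data = {'items': [{'id': 1, 'meta': {'title': 'a'}}, {'id': 2}]}

# Plain keys index into nested mappings and sequences:
assert traverse_obj(data, ('items', 0, 'meta', 'title')) == 'a'

# `...` branches over every element; unmatched branches are discarded:
assert traverse_obj(data, ('items', ..., 'id')) == [1, 2]
assert traverse_obj(data, ('items', ..., 'meta', 'title')) == ['a']

# A `dict` key transforms the current object into a matching dict,
# and `{type}` keeps only values of that type:
assert traverse_obj(data, ('items', 0, {'name': ('meta', 'title')})) == {'name': 'a'}
assert traverse_obj(data, ('items', 0, 'id', {str})) is None  # 1 is not a str
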
diff --git a/hypervideo_dl/version.py b/hypervideo_dl/version.py
index 3b08699..40bcc7e 100644
--- a/hypervideo_dl/version.py
+++ b/hypervideo_dl/version.py
@@ -1,9 +1,11 @@
# Autogenerated by devscripts/update-version.py
-__version__ = '1.1.13'
+__version__ = '1.1.14'
-RELEASE_GIT_HEAD = '8b644025b'
+RELEASE_GIT_HEAD = 'b532a3481046e1eabb6232ee8196fb696c356ff6'
VARIANT = None
UPDATE_HINT = None
+
+CHANNEL = 'stable'
diff --git a/setup.cfg b/setup.cfg
index 2def390..6deaa79 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -26,12 +26,12 @@ markers =
[tox:tox]
skipsdist = true
-envlist = py{36,37,38,39,310},pypy{36,37,38,39}
+envlist = py{36,37,38,39,310,311},pypy{36,37,38,39}
skip_missing_interpreters = true
[testenv] # tox
deps =
- pytest
+ pytest
commands = pytest {posargs:"-m not download"}
passenv = HOME # For test_compat_expanduser
setenv =
diff --git a/setup.py b/setup.py
index 87e34f9..54151c9 100644
--- a/setup.py
+++ b/setup.py
@@ -1,6 +1,11 @@
#!/usr/bin/env python3
-# coding: utf-8
-import os.path
+
+# Allow execution from anywhere
+import os
+import sys
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
import warnings
import sys
@@ -42,14 +47,15 @@ if sys.argv[1:2] == ['py2exe']:
'product_version': __version__,
}],
'options': {
- 'py2exe': {
- 'bundle_files': 0,
- 'compressed': 1,
- 'optimize': 2,
- 'dist_dir': './dist',
- 'excludes': ['Crypto', 'Cryptodome'], # py2exe cannot import Crypto
- 'dll_excludes': ['w9xpopen.exe', 'crypt32.dll'],
- }
+ 'bundle_files': 0,
+ 'compressed': 1,
+ 'optimize': 2,
+ 'dist_dir': './dist',
+ 'excludes': ['Crypto', 'Cryptodome'], # py2exe cannot import Crypto
+ 'dll_excludes': ['w9xpopen.exe', 'crypt32.dll'],
+ # Modules that are only imported dynamically must be added here
+ 'includes': ['hypervideo_dl.compat._legacy', 'hypervideo_dl.compat._deprecated',
+ 'hypervideo_dl.utils._legacy', 'hypervideo_dl.utils._deprecated'],
},
'zipfile': None
}
@@ -78,7 +84,10 @@ else:
}
if setuptools_available:
- params['entry_points'] = {'console_scripts': ['hypervideo = hypervideo_dl:main']}
+ params['entry_points'] = {
+ 'console_scripts': ['hypervideo = hypervideo_dl:main'],
+ 'pyinstaller40': ['hook-dirs = hypervideo_dl.__pyinstaller:get_hook_dirs'],
+ }
else:
params['scripts'] = ['hypervideo']
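
The new 'pyinstaller40' entry point is how PyInstaller >= 4.0 discovers package-provided hooks. For reference, a hook-dirs provider conventionally looks like the sketch below (the actual hypervideo_dl/__pyinstaller/__init__.py added by this commit may differ):

import os


def get_hook_dirs():
    # PyInstaller resolves the 'pyinstaller40' entry point and scans the
    # returned directories for hook-*.py files at build time.
    return [os.path.dirname(__file__)]
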
diff --git a/test/conftest.py b/test/conftest.py
new file mode 100644
index 0000000..48d9288
--- /dev/null
+++ b/test/conftest.py
@@ -0,0 +1,21 @@
+import functools
+import inspect
+
+import pytest
+
+from hypervideo_dl.networking import RequestHandler
+from hypervideo_dl.networking.common import _REQUEST_HANDLERS
+from hypervideo_dl.utils._utils import _YDLLogger as FakeLogger
+
+
+@pytest.fixture
+def handler(request):
+ RH_KEY = request.param
+ if inspect.isclass(RH_KEY) and issubclass(RH_KEY, RequestHandler):
+ handler = RH_KEY
+ elif RH_KEY in _REQUEST_HANDLERS:
+ handler = _REQUEST_HANDLERS[RH_KEY]
+ else:
+ pytest.skip(f'{RH_KEY} request handler is not available')
+
+ return functools.partial(handler, logger=FakeLogger)
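
The handler fixture above is built for indirect parametrization, so one test body can run once per request handler. A typical consumer looks like this sketch (the test name and request-handler key are illustrative):

import pytest


@pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
def test_example(handler):
    # `handler` is a functools.partial over the RequestHandler class, so
    # calling it instantiates the handler with the fake logger pre-bound.
    with handler() as rh:
        ...  # exercise the handler, e.g. rh.send(Request(...))
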
diff --git a/test/helper.py b/test/helper.py
index 1dae86f..62f78b4 100644
--- a/test/helper.py
+++ b/test/helper.py
@@ -194,8 +194,8 @@ def sanitize_got_info_dict(got_dict):
'formats', 'thumbnails', 'subtitles', 'automatic_captions', 'comments', 'entries',
# Auto-generated
- 'autonumber', 'playlist', 'format_index', 'video_ext', 'audio_ext', 'duration_string', 'epoch',
- 'fulltitle', 'extractor', 'extractor_key', 'filepath', 'infojson_filename', 'original_url', 'n_entries',
+ 'autonumber', 'playlist', 'format_index', 'video_ext', 'audio_ext', 'duration_string', 'epoch', 'n_entries',
+ 'fulltitle', 'extractor', 'extractor_key', 'filename', 'filepath', 'infojson_filename', 'original_url',
# Only live_status needs to be checked
'is_live', 'was_live',
diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py
index 529da52..4712c91 100644
--- a/test/test_InfoExtractor.py
+++ b/test/test_InfoExtractor.py
@@ -69,6 +69,7 @@ class TestInfoExtractor(unittest.TestCase):
<meta name="og:test1" content='foo > < bar'/>
<meta name="og:test2" content="foo >//< bar"/>
<meta property=og-test3 content='Ill-formatted opengraph'/>
+ <meta property=og:test4 content=unquoted-value/>
'''
self.assertEqual(ie._og_search_title(html), 'Foo')
self.assertEqual(ie._og_search_description(html), 'Some video\'s description ')
@@ -81,6 +82,7 @@ class TestInfoExtractor(unittest.TestCase):
self.assertEqual(ie._og_search_property(('test0', 'test1'), html), 'foo > < bar')
self.assertRaises(RegexNotFoundError, ie._og_search_property, 'test0', html, None, fatal=True)
self.assertRaises(RegexNotFoundError, ie._og_search_property, ('test0', 'test00'), html, None, fatal=True)
+ self.assertEqual(ie._og_search_property('test4', html), 'unquoted-value')
def test_html_search_meta(self):
ie = self.ie
@@ -915,8 +917,6 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
'acodec': 'mp4a.40.2',
'video_ext': 'mp4',
'audio_ext': 'none',
- 'vbr': 263.851,
- 'abr': 0,
}, {
'format_id': '577',
'format_index': None,
@@ -934,8 +934,6 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
'acodec': 'mp4a.40.2',
'video_ext': 'mp4',
'audio_ext': 'none',
- 'vbr': 577.61,
- 'abr': 0,
}, {
'format_id': '915',
'format_index': None,
@@ -953,8 +951,6 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
'acodec': 'mp4a.40.2',
'video_ext': 'mp4',
'audio_ext': 'none',
- 'vbr': 915.905,
- 'abr': 0,
}, {
'format_id': '1030',
'format_index': None,
@@ -972,8 +968,6 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
'acodec': 'mp4a.40.2',
'video_ext': 'mp4',
'audio_ext': 'none',
- 'vbr': 1030.138,
- 'abr': 0,
}, {
'format_id': '1924',
'format_index': None,
@@ -991,8 +985,6 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
'acodec': 'mp4a.40.2',
'video_ext': 'mp4',
'audio_ext': 'none',
- 'vbr': 1924.009,
- 'abr': 0,
}],
{
'en': [{
@@ -1404,6 +1396,7 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
'vcodec': 'none',
'acodec': 'AACL',
'protocol': 'ism',
+ 'audio_channels': 2,
'_download_params': {
'stream_type': 'audio',
'duration': 8880746666,
@@ -1417,9 +1410,6 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
'bits_per_sample': 16,
'nal_unit_length_field': 4
},
- 'audio_ext': 'isma',
- 'video_ext': 'none',
- 'abr': 128,
}, {
'format_id': 'video-100',
'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest',
@@ -1443,9 +1433,6 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
'bits_per_sample': 16,
'nal_unit_length_field': 4
},
- 'video_ext': 'ismv',
- 'audio_ext': 'none',
- 'vbr': 100,
}, {
'format_id': 'video-326',
'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest',
@@ -1469,9 +1456,6 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
'bits_per_sample': 16,
'nal_unit_length_field': 4
},
- 'video_ext': 'ismv',
- 'audio_ext': 'none',
- 'vbr': 326,
}, {
'format_id': 'video-698',
'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest',
@@ -1495,9 +1479,6 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
'bits_per_sample': 16,
'nal_unit_length_field': 4
},
- 'video_ext': 'ismv',
- 'audio_ext': 'none',
- 'vbr': 698,
}, {
'format_id': 'video-1493',
'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest',
@@ -1521,9 +1502,6 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
'bits_per_sample': 16,
'nal_unit_length_field': 4
},
- 'video_ext': 'ismv',
- 'audio_ext': 'none',
- 'vbr': 1493,
}, {
'format_id': 'video-4482',
'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest',
@@ -1547,9 +1525,6 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
'bits_per_sample': 16,
'nal_unit_length_field': 4
},
- 'video_ext': 'ismv',
- 'audio_ext': 'none',
- 'vbr': 4482,
}],
{
'eng': [
@@ -1573,61 +1548,57 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
'ec-3_test',
'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest',
[{
- 'format_id': 'audio_deu_1-224',
+ 'format_id': 'audio_deu-127',
'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest',
'manifest_url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest',
'ext': 'isma',
- 'tbr': 224,
+ 'tbr': 127,
'asr': 48000,
'vcodec': 'none',
- 'acodec': 'EC-3',
+ 'acodec': 'AACL',
'protocol': 'ism',
- '_download_params':
- {
+ 'language': 'deu',
+ 'audio_channels': 2,
+ '_download_params': {
'stream_type': 'audio',
'duration': 370000000,
'timescale': 10000000,
'width': 0,
'height': 0,
- 'fourcc': 'EC-3',
+ 'fourcc': 'AACL',
'language': 'deu',
- 'codec_private_data': '00063F000000AF87FBA7022DFB42A4D405CD93843BDD0700200F00',
+ 'codec_private_data': '1190',
'sampling_rate': 48000,
- 'channels': 6,
+ 'channels': 2,
'bits_per_sample': 16,
'nal_unit_length_field': 4
},
- 'audio_ext': 'isma',
- 'video_ext': 'none',
- 'abr': 224,
}, {
- 'format_id': 'audio_deu-127',
+ 'format_id': 'audio_deu_1-224',
'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest',
'manifest_url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest',
'ext': 'isma',
- 'tbr': 127,
+ 'tbr': 224,
'asr': 48000,
'vcodec': 'none',
- 'acodec': 'AACL',
+ 'acodec': 'EC-3',
'protocol': 'ism',
- '_download_params':
- {
+ 'language': 'deu',
+ 'audio_channels': 6,
+ '_download_params': {
'stream_type': 'audio',
'duration': 370000000,
'timescale': 10000000,
'width': 0,
'height': 0,
- 'fourcc': 'AACL',
+ 'fourcc': 'EC-3',
'language': 'deu',
- 'codec_private_data': '1190',
+ 'codec_private_data': '00063F000000AF87FBA7022DFB42A4D405CD93843BDD0700200F00',
'sampling_rate': 48000,
- 'channels': 2,
+ 'channels': 6,
'bits_per_sample': 16,
'nal_unit_length_field': 4
},
- 'audio_ext': 'isma',
- 'video_ext': 'none',
- 'abr': 127,
}, {
'format_id': 'video_deu-23',
'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest',
@@ -1639,8 +1610,8 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
'vcodec': 'AVC1',
'acodec': 'none',
'protocol': 'ism',
- '_download_params':
- {
+ 'language': 'deu',
+ '_download_params': {
'stream_type': 'video',
'duration': 370000000,
'timescale': 10000000,
@@ -1653,9 +1624,6 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
'bits_per_sample': 16,
'nal_unit_length_field': 4
},
- 'video_ext': 'ismv',
- 'audio_ext': 'none',
- 'vbr': 23,
}, {
'format_id': 'video_deu-403',
'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest',
@@ -1667,8 +1635,8 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
'vcodec': 'AVC1',
'acodec': 'none',
'protocol': 'ism',
- '_download_params':
- {
+ 'language': 'deu',
+ '_download_params': {
'stream_type': 'video',
'duration': 370000000,
'timescale': 10000000,
@@ -1681,9 +1649,6 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
'bits_per_sample': 16,
'nal_unit_length_field': 4
},
- 'video_ext': 'ismv',
- 'audio_ext': 'none',
- 'vbr': 403,
}, {
'format_id': 'video_deu-680',
'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest',
@@ -1695,8 +1660,8 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
'vcodec': 'AVC1',
'acodec': 'none',
'protocol': 'ism',
- '_download_params':
- {
+ 'language': 'deu',
+ '_download_params': {
'stream_type': 'video',
'duration': 370000000,
'timescale': 10000000,
@@ -1709,9 +1674,6 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
'bits_per_sample': 16,
'nal_unit_length_field': 4
},
- 'video_ext': 'ismv',
- 'audio_ext': 'none',
- 'vbr': 680,
}, {
'format_id': 'video_deu-1253',
'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest',
@@ -1723,8 +1685,9 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
'vcodec': 'AVC1',
'acodec': 'none',
'protocol': 'ism',
- '_download_params':
- {
+ 'vbr': 1253,
+ 'language': 'deu',
+ '_download_params': {
'stream_type': 'video',
'duration': 370000000,
'timescale': 10000000,
@@ -1737,9 +1700,6 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
'bits_per_sample': 16,
'nal_unit_length_field': 4
},
- 'video_ext': 'ismv',
- 'audio_ext': 'none',
- 'vbr': 1253,
}, {
'format_id': 'video_deu-2121',
'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest',
@@ -1751,8 +1711,8 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
'vcodec': 'AVC1',
'acodec': 'none',
'protocol': 'ism',
- '_download_params':
- {
+ 'language': 'deu',
+ '_download_params': {
'stream_type': 'video',
'duration': 370000000,
'timescale': 10000000,
@@ -1765,9 +1725,6 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
'bits_per_sample': 16,
'nal_unit_length_field': 4
},
- 'video_ext': 'ismv',
- 'audio_ext': 'none',
- 'vbr': 2121,
}, {
'format_id': 'video_deu-3275',
'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest',
@@ -1779,8 +1736,8 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
'vcodec': 'AVC1',
'acodec': 'none',
'protocol': 'ism',
- '_download_params':
- {
+ 'language': 'deu',
+ '_download_params': {
'stream_type': 'video',
'duration': 370000000,
'timescale': 10000000,
@@ -1793,9 +1750,6 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
'bits_per_sample': 16,
'nal_unit_length_field': 4
},
- 'video_ext': 'ismv',
- 'audio_ext': 'none',
- 'vbr': 3275,
}, {
'format_id': 'video_deu-5300',
'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest',
@@ -1807,8 +1761,8 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
'vcodec': 'AVC1',
'acodec': 'none',
'protocol': 'ism',
- '_download_params':
- {
+ 'language': 'deu',
+ '_download_params': {
'stream_type': 'video',
'duration': 370000000,
'timescale': 10000000,
@@ -1821,9 +1775,6 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
'bits_per_sample': 16,
'nal_unit_length_field': 4
},
- 'video_ext': 'ismv',
- 'audio_ext': 'none',
- 'vbr': 5300,
}, {
'format_id': 'video_deu-8079',
'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest',
@@ -1835,8 +1786,8 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
'vcodec': 'AVC1',
'acodec': 'none',
'protocol': 'ism',
- '_download_params':
- {
+ 'language': 'deu',
+ '_download_params': {
'stream_type': 'video',
'duration': 370000000,
'timescale': 10000000,
@@ -1849,9 +1800,6 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
'bits_per_sample': 16,
'nal_unit_length_field': 4
},
- 'video_ext': 'ismv',
- 'audio_ext': 'none',
- 'vbr': 8079,
}],
{},
),
diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py
index 2d4e827..2810080 100644
--- a/test/test_YoutubeDL.py
+++ b/test/test_YoutubeDL.py
@@ -10,9 +10,8 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import copy
import json
-import urllib.error
-from test.helper import FakeYDL, assertRegexpMatches
+from test.helper import FakeYDL, assertRegexpMatches, try_rm
from hypervideo_dl import YoutubeDL
from hypervideo_dl.compat import compat_os_name
from hypervideo_dl.extractor import YoutubeIE
@@ -25,6 +24,7 @@ from hypervideo_dl.utils import (
int_or_none,
match_filter_func,
)
+from hypervideo_dl.utils.traversal import traverse_obj
TEST_URL = 'http://localhost/sample.mp4'
@@ -632,6 +632,7 @@ class TestYoutubeDL(unittest.TestCase):
outtmpl_info = {
'id': '1234',
'ext': 'mp4',
'width': None,
'height': 1080,
@@ -669,7 +670,7 @@ class TestYoutubeDL(unittest.TestCase):
for (name, got), expect in zip((('outtmpl', out), ('filename', fname)), expected):
if callable(expect):
self.assertTrue(expect(got), f'Wrong {name} from {tmpl}')
- else:
+ elif expect is not None:
self.assertEqual(got, expect, f'Wrong {name} from {tmpl}')
# Side-effects
@@ -684,7 +685,8 @@ class TestYoutubeDL(unittest.TestCase):
test('%(id)s.%(ext)s', '1234.mp4')
test('%(duration_string)s', ('27:46:40', '27-46-40'))
test('%(resolution)s', '1080p')
- test('%(playlist_index)s', '001')
+ test('%(playlist_index|)s', '001')
+ test('%(playlist_index&{}!)s', '1!')
test('%(playlist_autonumber)s', '02')
test('%(autonumber)s', '00001')
test('%(autonumber+2)03d', '005', autonumber_start=3)
@@ -755,20 +757,23 @@ class TestYoutubeDL(unittest.TestCase):
test('%(ext)c', 'm')
test('%(id)d %(id)r', "1234 '1234'")
test('%(id)r %(height)r', "'1234' 1080")
+ test('%(title5)a %(height)a', (R"'\xe1\xe9\xed \U0001d400' 1080", None))
test('%(ext)s-%(ext|def)d', 'mp4-def')
- test('%(width|0)04d', '0000')
- test('a%(width|)d', 'a', outtmpl_na_placeholder='none')
+ test('%(width|0)04d', '0')
+ test('a%(width|b)d', 'ab', outtmpl_na_placeholder='none')
FORMATS = self.outtmpl_info['formats']
- sanitize = lambda x: x.replace(':', '：').replace('"', '＂').replace('\n', ' ')
# Custom type casting
test('%(formats.:.id)l', 'id 1, id 2, id 3')
test('%(formats.:.id)#l', ('id 1\nid 2\nid 3', 'id 1 id 2 id 3'))
test('%(ext)l', 'mp4')
test('%(formats.:.id) 18l', '  id 1, id 2, id 3')
- test('%(formats)j', (json.dumps(FORMATS), sanitize(json.dumps(FORMATS))))
- test('%(formats)#j', (json.dumps(FORMATS, indent=4), sanitize(json.dumps(FORMATS, indent=4))))
+ test('%(formats)j', (json.dumps(FORMATS), None))
+ test('%(formats)#j', (
+ json.dumps(FORMATS, indent=4),
+ json.dumps(FORMATS, indent=4).replace(':', '：').replace('"', '＂').replace('\n', ' ')
+ ))
test('%(title5).3B', 'á')
test('%(title5)U', 'áéí 𝐀')
test('%(title5)#U', 'a\u0301e\u0301i\u0301 𝐀')
@@ -793,8 +798,8 @@ class TestYoutubeDL(unittest.TestCase):
test('%(title|%)s %(title|%%)s', '% %%')
test('%(id+1-height+3)05d', '00158')
test('%(width+100)05d', 'NA')
- test('%(formats.0) 15s', ('% 15s' % FORMATS[0], '% 15s' % sanitize(str(FORMATS[0]))))
- test('%(formats.0)r', (repr(FORMATS[0]), sanitize(repr(FORMATS[0]))))
+ test('%(formats.0) 15s', ('% 15s' % FORMATS[0], None))
+ test('%(formats.0)r', (repr(FORMATS[0]), None))
test('%(height.0)03d', '001')
test('%(-height.0)04d', '-001')
test('%(formats.-1.id)s', FORMATS[-1]['id'])
@@ -806,7 +811,7 @@ class TestYoutubeDL(unittest.TestCase):
out = json.dumps([{'id': f['id'], 'height.:2': str(f['height'])[:2]}
if 'height' in f else {'id': f['id']}
for f in FORMATS])
- test('%(formats.:.{id,height.:2})j', (out, sanitize(out)))
+ test('%(formats.:.{id,height.:2})j', (out, None))
test('%(formats.:.{id,height}.id)l', ', '.join(f['id'] for f in FORMATS))
test('%(.{id,title})j', ('{"id": "1234"}', '{＂id＂: ＂1234＂}'))
@@ -822,6 +827,11 @@ class TestYoutubeDL(unittest.TestCase):
test('%(title&foo|baz)s.bar', 'baz.bar')
test('%(x,id&foo|baz)s.bar', 'foo.bar')
test('%(x,title&foo|baz)s.bar', 'baz.bar')
+ test('%(id&a\nb|)s', ('a\nb', 'a b'))
+ test('%(id&hi {:>10} {}|)s', 'hi 1234 1234')
+ test(R'%(id&{0} {}|)s', 'NA')
+ test(R'%(id&{0.1}|)s', 'NA')
+ test('%(height&{:,d})S', '1,080')
# Laziness
def gen():
@@ -867,12 +877,12 @@ class TestYoutubeDL(unittest.TestCase):
class SimplePP(PostProcessor):
def run(self, info):
- with open(audiofile, 'wt') as f:
+ with open(audiofile, 'w') as f:
f.write('EXAMPLE')
return [info['filepath']], info
def run_pp(params, PP):
- with open(filename, 'wt') as f:
+ with open(filename, 'w') as f:
f.write('EXAMPLE')
ydl = YoutubeDL(params)
ydl.add_post_processor(PP())
@@ -891,7 +901,7 @@ class TestYoutubeDL(unittest.TestCase):
class ModifierPP(PostProcessor):
def run(self, info):
- with open(info['filepath'], 'wt') as f:
+ with open(info['filepath'], 'w') as f:
f.write('MODIFIED')
return [], info
@@ -1093,11 +1103,6 @@ class TestYoutubeDL(unittest.TestCase):
test_selection({'playlist_items': '-15::2'}, INDICES[1::2], True)
test_selection({'playlist_items': '-15::15'}, [], True)
- def test_urlopen_no_file_protocol(self):
- # see https://github.com/ytdl-org/youtube-dl/issues/8227
- ydl = YDL()
- self.assertRaises(urllib.error.URLError, ydl.urlopen, 'file:///etc/passwd')
-
def test_do_not_override_ie_key_in_url_transparent(self):
ydl = YDL()
@@ -1211,6 +1216,129 @@ class TestYoutubeDL(unittest.TestCase):
self.assertEqual(downloaded['extractor'], 'Video')
self.assertEqual(downloaded['extractor_key'], 'Video')
+ def test_header_cookies(self):
+ from http.cookiejar import Cookie
+
+ ydl = FakeYDL()
+ ydl.report_warning = lambda *_, **__: None
+
+ def cookie(name, value, version=None, domain='', path='', secure=False, expires=None):
+ return Cookie(
+ version or 0, name, value, None, False,
+ domain, bool(domain), bool(domain), path, bool(path),
+ secure, expires, False, None, None, rest={})
+
+ _test_url = 'https://yt.dlp/test'
+
+ def test(encoded_cookies, cookies, *, headers=False, round_trip=None, error_re=None):
+ def _test():
+ ydl.cookiejar.clear()
+ ydl._load_cookies(encoded_cookies, autoscope=headers)
+ if headers:
+ ydl._apply_header_cookies(_test_url)
+ data = {'url': _test_url}
+ ydl._calc_headers(data)
+ self.assertCountEqual(
+ map(vars, ydl.cookiejar), map(vars, cookies),
+ 'Extracted cookiejar.Cookie is not the same')
+ if not headers:
+ self.assertEqual(
+ data.get('cookies'), round_trip or encoded_cookies,
+ 'Cookie is not the same as round trip')
+ ydl.__dict__['_YoutubeDL__header_cookies'] = []
+
+ with self.subTest(msg=encoded_cookies):
+ if not error_re:
+ _test()
+ return
+ with self.assertRaisesRegex(Exception, error_re):
+ _test()
+
+ test('test=value; Domain=.yt.dlp', [cookie('test', 'value', domain='.yt.dlp')])
+ test('test=value', [cookie('test', 'value')], error_re=r'Unscoped cookies are not allowed')
+ test('cookie1=value1; Domain=.yt.dlp; Path=/test; cookie2=value2; Domain=.yt.dlp; Path=/', [
+ cookie('cookie1', 'value1', domain='.yt.dlp', path='/test'),
+ cookie('cookie2', 'value2', domain='.yt.dlp', path='/')])
+ test('test=value; Domain=.yt.dlp; Path=/test; Secure; Expires=9999999999', [
+ cookie('test', 'value', domain='.yt.dlp', path='/test', secure=True, expires=9999999999)])
+ test('test="value; "; path=/test; domain=.yt.dlp', [
+ cookie('test', 'value; ', domain='.yt.dlp', path='/test')],
+ round_trip='test="value\\073 "; Domain=.yt.dlp; Path=/test')
+ test('name=; Domain=.yt.dlp', [cookie('name', '', domain='.yt.dlp')],
+ round_trip='name=""; Domain=.yt.dlp')
+
+ test('test=value', [cookie('test', 'value', domain='.yt.dlp')], headers=True)
+ test('cookie1=value; Domain=.yt.dlp; cookie2=value', [], headers=True, error_re=r'Invalid syntax')
+ ydl.deprecated_feature = ydl.report_error
+ test('test=value', [], headers=True, error_re=r'Passing cookies as a header is a potential security risk')
+
+ def test_infojson_cookies(self):
+ TEST_FILE = 'test_infojson_cookies.info.json'
+ TEST_URL = 'https://example.com/example.mp4'
+ COOKIES = 'a=b; Domain=.example.com; c=d; Domain=.example.com'
+ COOKIE_HEADER = {'Cookie': 'a=b; c=d'}
+
+ ydl = FakeYDL()
+ ydl.process_info = lambda x: ydl._write_info_json('test', x, TEST_FILE)
+
+ def make_info(info_header_cookies=False, fmts_header_cookies=False, cookies_field=False):
+ fmt = {'url': TEST_URL}
+ if fmts_header_cookies:
+ fmt['http_headers'] = COOKIE_HEADER
+ if cookies_field:
+ fmt['cookies'] = COOKIES
+ return _make_result([fmt], http_headers=COOKIE_HEADER if info_header_cookies else None)
+
+ def test(initial_info, note):
+ result = {}
+ result['processed'] = ydl.process_ie_result(initial_info)
+ self.assertTrue(ydl.cookiejar.get_cookies_for_url(TEST_URL),
+ msg=f'No cookies set in cookiejar after initial process when {note}')
+ ydl.cookiejar.clear()
+ with open(TEST_FILE) as infojson:
+ result['loaded'] = ydl.sanitize_info(json.load(infojson), True)
+ result['final'] = ydl.process_ie_result(result['loaded'].copy(), download=False)
+ self.assertTrue(ydl.cookiejar.get_cookies_for_url(TEST_URL),
+ msg=f'No cookies set in cookiejar after final process when {note}')
+ ydl.cookiejar.clear()
+ for key in ('processed', 'loaded', 'final'):
+ info = result[key]
+ self.assertIsNone(
+ traverse_obj(info, ((None, ('formats', 0)), 'http_headers', 'Cookie'), casesense=False, get_all=False),
+ msg=f'Cookie header not removed in {key} result when {note}')
+ self.assertEqual(
+ traverse_obj(info, ((None, ('formats', 0)), 'cookies'), get_all=False), COOKIES,
+ msg=f'No cookies field found in {key} result when {note}')
+
+ test({'url': TEST_URL, 'http_headers': COOKIE_HEADER, 'id': '1', 'title': 'x'}, 'no formats field')
+ test(make_info(info_header_cookies=True), 'info_dict header cookies')
+ test(make_info(fmts_header_cookies=True), 'format header cookies')
+ test(make_info(info_header_cookies=True, fmts_header_cookies=True), 'info_dict and format header cookies')
+ test(make_info(info_header_cookies=True, fmts_header_cookies=True, cookies_field=True), 'all cookies fields')
+ test(make_info(cookies_field=True), 'cookies format field')
+ test({'url': TEST_URL, 'cookies': COOKIES, 'id': '1', 'title': 'x'}, 'info_dict cookies field only')
+
+ try_rm(TEST_FILE)
+
+ def test_add_headers_cookie(self):
+ def check_for_cookie_header(result):
+ return traverse_obj(result, ((None, ('formats', 0)), 'http_headers', 'Cookie'), casesense=False, get_all=False)
+
+ ydl = FakeYDL({'http_headers': {'Cookie': 'a=b'}})
+ ydl._apply_header_cookies(_make_result([])['webpage_url']) # Scope to input webpage URL: .example.com
+
+ fmt = {'url': 'https://example.com/video.mp4'}
+ result = ydl.process_ie_result(_make_result([fmt]), download=False)
+ self.assertIsNone(check_for_cookie_header(result), msg='http_headers cookies in result info_dict')
+ self.assertEqual(result.get('cookies'), 'a=b; Domain=.example.com', msg='No cookies were set in cookies field')
+ self.assertIn('a=b', ydl.cookiejar.get_cookie_header(fmt['url']), msg='No cookies were set in cookiejar')
+
+ fmt = {'url': 'https://wrong.com/video.mp4'}
+ result = ydl.process_ie_result(_make_result([fmt]), download=False)
+ self.assertIsNone(check_for_cookie_header(result), msg='http_headers cookies for wrong domain')
+ self.assertFalse(result.get('cookies'), msg='Cookies set in cookies field for wrong domain')
+ self.assertFalse(ydl.cookiejar.get_cookie_header(fmt['url']), msg='Cookies set in cookiejar for wrong domain')
+
if __name__ == '__main__':
unittest.main()
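
The new replacement tests above exercise str.format-style templating inside output templates: `&` substitutes the field into a template and `|` supplies a fallback. A sketch of the same behaviour outside the test harness (field values are illustrative):

from hypervideo_dl import YoutubeDL

ydl = YoutubeDL()
info = {'id': '1234', 'height': 1080}

print(ydl.evaluate_outtmpl('%(id&hi {:>10} {}|)s', info))  # 'hi       1234 1234'
print(ydl.evaluate_outtmpl('%(height&{:,d})S', info))      # '1,080'
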
diff --git a/test/test_YoutubeDLCookieJar.py b/test/test_YoutubeDLCookieJar.py
index 26922d6..ffeb6f4 100644
--- a/test/test_YoutubeDLCookieJar.py
+++ b/test/test_YoutubeDLCookieJar.py
@@ -11,16 +11,16 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import re
import tempfile
-from hypervideo_dl.utils import YoutubeDLCookieJar
+from hypervideo_dl.cookies import YoutubeDLCookieJar
class TestYoutubeDLCookieJar(unittest.TestCase):
def test_keep_session_cookies(self):
cookiejar = YoutubeDLCookieJar('./test/testdata/cookies/session_cookies.txt')
- cookiejar.load(ignore_discard=True, ignore_expires=True)
+ cookiejar.load()
tf = tempfile.NamedTemporaryFile(delete=False)
try:
- cookiejar.save(filename=tf.name, ignore_discard=True, ignore_expires=True)
+ cookiejar.save(filename=tf.name)
temp = tf.read().decode()
self.assertTrue(re.search(
r'www\.foobar\.foobar\s+FALSE\s+/\s+TRUE\s+0\s+YoutubeDLExpiresEmpty\s+YoutubeDLExpiresEmptyValue', temp))
@@ -32,7 +32,7 @@ class TestYoutubeDLCookieJar(unittest.TestCase):
def test_strip_httponly_prefix(self):
cookiejar = YoutubeDLCookieJar('./test/testdata/cookies/httponly_cookies.txt')
- cookiejar.load(ignore_discard=True, ignore_expires=True)
+ cookiejar.load()
def assert_cookie_has_value(key):
self.assertEqual(cookiejar._cookies['www.foobar.foobar']['/'][key].value, key + '_VALUE')
@@ -42,11 +42,25 @@ class TestYoutubeDLCookieJar(unittest.TestCase):
def test_malformed_cookies(self):
cookiejar = YoutubeDLCookieJar('./test/testdata/cookies/malformed_cookies.txt')
- cookiejar.load(ignore_discard=True, ignore_expires=True)
+ cookiejar.load()
# Cookies should be empty since all malformed cookie file entries
# will be ignored
self.assertFalse(cookiejar._cookies)
+ def test_get_cookie_header(self):
+ cookiejar = YoutubeDLCookieJar('./test/testdata/cookies/httponly_cookies.txt')
+ cookiejar.load()
+ header = cookiejar.get_cookie_header('https://www.foobar.foobar')
+ self.assertIn('HTTPONLY_COOKIE', header)
+
+ def test_get_cookies_for_url(self):
+ cookiejar = YoutubeDLCookieJar('./test/testdata/cookies/session_cookies.txt')
+ cookiejar.load()
+ cookies = cookiejar.get_cookies_for_url('https://www.foobar.foobar/')
+ self.assertEqual(len(cookies), 2)
+ cookies = cookiejar.get_cookies_for_url('https://foobar.foobar/')
+ self.assertFalse(cookies)
+
if __name__ == '__main__':
unittest.main()
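
A minimal usage sketch for the two cookiejar helpers tested above (it assumes cookies.txt is an existing Netscape-format cookie file):

from hypervideo_dl.cookies import YoutubeDLCookieJar

jar = YoutubeDLCookieJar('cookies.txt')
jar.load()  # discard/expiry handling is now built in, no flags needed

# Build a Cookie request header for a URL, honouring domain/path/secure:
print(jar.get_cookie_header('https://www.example.com/'))

# Or inspect the matching cookies directly:
for cookie in jar.get_cookies_for_url('https://www.example.com/'):
    print(cookie.name, cookie.value, cookie.domain)
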
diff --git a/test/test_aes.py b/test/test_aes.py
index 0f35bc2..cace61c 100644
--- a/test/test_aes.py
+++ b/test/test_aes.py
@@ -26,7 +26,7 @@ from hypervideo_dl.aes import (
key_expansion,
pad_block,
)
-from hypervideo_dl.dependencies import Cryptodome_AES
+from hypervideo_dl.dependencies import Cryptodome
from hypervideo_dl.utils import bytes_to_intlist, intlist_to_bytes
# the encrypted data can be generate with 'devscripts/generate_aes_testdata.py'
@@ -48,7 +48,7 @@ class TestAES(unittest.TestCase):
data = b'\x97\x92+\xe5\x0b\xc3\x18\x91ky9m&\xb3\xb5@\xe6\x27\xc2\x96.\xc8u\x88\xab9-[\x9e|\xf1\xcd'
decrypted = intlist_to_bytes(aes_cbc_decrypt(bytes_to_intlist(data), self.key, self.iv))
self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg)
- if Cryptodome_AES:
+ if Cryptodome.AES:
decrypted = aes_cbc_decrypt_bytes(data, intlist_to_bytes(self.key), intlist_to_bytes(self.iv))
self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg)
@@ -78,7 +78,7 @@ class TestAES(unittest.TestCase):
decrypted = intlist_to_bytes(aes_gcm_decrypt_and_verify(
bytes_to_intlist(data), self.key, bytes_to_intlist(authentication_tag), self.iv[:12]))
self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg)
- if Cryptodome_AES:
+ if Cryptodome.AES:
decrypted = aes_gcm_decrypt_and_verify_bytes(
data, intlist_to_bytes(self.key), authentication_tag, intlist_to_bytes(self.iv[:12]))
self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg)
diff --git a/test/test_age_restriction.py b/test/test_age_restriction.py
index 034359b..46b0996 100644
--- a/test/test_age_restriction.py
+++ b/test/test_age_restriction.py
@@ -10,6 +10,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from test.helper import is_download_test, try_rm
from hypervideo_dl import YoutubeDL
+from hypervideo_dl.utils import DownloadError
def _download_restricted(url, filename, age):
@@ -25,10 +26,14 @@ def _download_restricted(url, filename, age):
ydl.add_default_info_extractors()
json_filename = os.path.splitext(filename)[0] + '.info.json'
try_rm(json_filename)
- ydl.download([url])
- res = os.path.exists(json_filename)
- try_rm(json_filename)
- return res
+ try:
+ ydl.download([url])
+ except DownloadError:
+ pass
+ else:
+ return os.path.exists(json_filename)
+ finally:
+ try_rm(json_filename)
@is_download_test
@@ -38,12 +43,12 @@ class TestAgeRestriction(unittest.TestCase):
self.assertFalse(_download_restricted(url, filename, age))
def test_youtube(self):
- self._assert_restricted('07FYdnEawAQ', '07FYdnEawAQ.mp4', 10)
+ self._assert_restricted('HtVdAasjOgU', 'HtVdAasjOgU.mp4', 10)
def test_youporn(self):
self._assert_restricted(
- 'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/',
- '505835.mp4', 2, old_age=25)
+ 'https://www.youporn.com/watch/16715086/sex-ed-in-detention-18-asmr/',
+ '16715086.mp4', 2, old_age=25)
if __name__ == '__main__':
diff --git a/test/test_compat.py b/test/test_compat.py
index 7a191c0..e1ae193 100644
--- a/test/test_compat.py
+++ b/test/test_compat.py
@@ -9,15 +9,16 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import struct
-import urllib.parse
from hypervideo_dl import compat
+from hypervideo_dl.compat import urllib # isort: split
from hypervideo_dl.compat import (
compat_etree_fromstring,
compat_expanduser,
compat_urllib_parse_unquote,
compat_urllib_parse_urlencode,
)
+from hypervideo_dl.compat.urllib.request import getproxies
class TestCompat(unittest.TestCase):
@@ -28,8 +29,10 @@ class TestCompat(unittest.TestCase):
with self.assertWarns(DeprecationWarning):
compat.WINDOWS_VT_MODE
- # TODO: Test submodule
- # compat.asyncio.events # Must not raise error
+ self.assertEqual(urllib.request.getproxies, getproxies)
+
+ with self.assertWarns(DeprecationWarning):
+ compat.compat_pycrypto_AES # Must not raise error
def test_compat_expanduser(self):
old_home = os.environ.get('HOME')
diff --git a/test/test_config.py b/test/test_config.py
new file mode 100644
index 0000000..8da85a3
--- /dev/null
+++ b/test/test_config.py
@@ -0,0 +1,227 @@
+#!/usr/bin/env python3
+
+# Allow direct execution
+import os
+import sys
+import unittest
+import unittest.mock
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import contextlib
+import itertools
+from pathlib import Path
+
+from hypervideo_dl.compat import compat_expanduser
+from hypervideo_dl.options import create_parser, parseOpts
+from hypervideo_dl.utils import Config, get_executable_path
+
+ENVIRON_DEFAULTS = {
+ 'HOME': None,
+ 'XDG_CONFIG_HOME': '/_xdg_config_home/',
+ 'USERPROFILE': 'C:/Users/testing/',
+ 'APPDATA': 'C:/Users/testing/AppData/Roaming/',
+ 'HOMEDRIVE': 'C:/',
+ 'HOMEPATH': 'Users/testing/',
+}
+
+
+@contextlib.contextmanager
+def set_environ(**kwargs):
+ saved_environ = os.environ.copy()
+
+ for name, value in {**ENVIRON_DEFAULTS, **kwargs}.items():
+ if value is None:
+ os.environ.pop(name, None)
+ else:
+ os.environ[name] = value
+
+ yield
+
+ os.environ.clear()
+ os.environ.update(saved_environ)
+
+
+def _generate_expected_groups():
+ xdg_config_home = os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config')
+ appdata_dir = os.getenv('appdata')
+ home_dir = compat_expanduser('~')
+ return {
+ 'Portable': [
+ Path(get_executable_path(), 'hypervideo.conf'),
+ ],
+ 'Home': [
+ Path('hypervideo.conf'),
+ ],
+ 'User': [
+ Path(xdg_config_home, 'hypervideo.conf'),
+ Path(xdg_config_home, 'hypervideo', 'config'),
+ Path(xdg_config_home, 'hypervideo', 'config.txt'),
+ *((
+ Path(appdata_dir, 'hypervideo.conf'),
+ Path(appdata_dir, 'hypervideo', 'config'),
+ Path(appdata_dir, 'hypervideo', 'config.txt'),
+ ) if appdata_dir else ()),
+ Path(home_dir, 'hypervideo.conf'),
+ Path(home_dir, 'hypervideo.conf.txt'),
+ Path(home_dir, '.hypervideo', 'config'),
+ Path(home_dir, '.hypervideo', 'config.txt'),
+ ],
+ 'System': [
+ Path('/etc/hypervideo.conf'),
+ Path('/etc/hypervideo/config'),
+ Path('/etc/hypervideo/config.txt'),
+ ]
+ }
+
+
+class TestConfig(unittest.TestCase):
+ maxDiff = None
+
+ @set_environ()
+ def test_config__ENVIRON_DEFAULTS_sanity(self):
+ expected = make_expected()
+ self.assertCountEqual(
+ set(expected), expected,
+ 'ENVIRON_DEFAULTS produces non unique names')
+
+ def test_config_all_environ_values(self):
+ for name, value in ENVIRON_DEFAULTS.items():
+ for new_value in (None, '', '.', value or '/some/dir'):
+ with set_environ(**{name: new_value}):
+ self._simple_grouping_test()
+
+ def test_config_default_expected_locations(self):
+ files, _ = self._simple_config_test()
+ self.assertEqual(
+ files, make_expected(),
+ 'Not all expected locations have been checked')
+
+ def test_config_default_grouping(self):
+ self._simple_grouping_test()
+
+ def _simple_grouping_test(self):
+ expected_groups = make_expected_groups()
+ for name, group in expected_groups.items():
+ for index, existing_path in enumerate(group):
+ result, opts = self._simple_config_test(existing_path)
+ expected = expected_from_expected_groups(expected_groups, existing_path)
+ self.assertEqual(
+ result, expected,
+ f'The checked locations do not match the expected ({name}, {index})')
+ self.assertEqual(
+ opts.outtmpl['default'], '1',
+ f'The used result value was incorrect ({name}, {index})')
+
+ def _simple_config_test(self, *stop_paths):
+ encountered = 0
+ paths = []
+
+ def read_file(filename, default=[]):
+ nonlocal encountered
+ path = Path(filename)
+ paths.append(path)
+ if path in stop_paths:
+ encountered += 1
+ return ['-o', f'{encountered}']
+
+ with ConfigMock(read_file):
+ _, opts, _ = parseOpts([], False)
+
+ return paths, opts
+
+ @set_environ()
+ def test_config_early_exit_commandline(self):
+ self._early_exit_test(0, '--ignore-config')
+
+ @set_environ()
+ def test_config_early_exit_files(self):
+ for index, _ in enumerate(make_expected(), 1):
+ self._early_exit_test(index)
+
+ def _early_exit_test(self, allowed_reads, *args):
+ reads = 0
+
+ def read_file(filename, default=[]):
+ nonlocal reads
+ reads += 1
+
+ if reads > allowed_reads:
+ self.fail('The remaining config was not ignored')
+ elif reads == allowed_reads:
+ return ['--ignore-config']
+
+ with ConfigMock(read_file):
+ parseOpts(args, False)
+
+ @set_environ()
+ def test_config_override_commandline(self):
+ self._override_test(0, '-o', 'pass')
+
+ @set_environ()
+ def test_config_override_files(self):
+ for index, _ in enumerate(make_expected(), 1):
+ self._override_test(index)
+
+ def _override_test(self, start_index, *args):
+ index = 0
+
+ def read_file(filename, default=[]):
+ nonlocal index
+ index += 1
+
+ if index > start_index:
+ return ['-o', 'fail']
+ elif index == start_index:
+ return ['-o', 'pass']
+
+ with ConfigMock(read_file):
+ _, opts, _ = parseOpts(args, False)
+
+ self.assertEqual(
+ opts.outtmpl['default'], 'pass',
+ 'The earlier group did not override the later ones')
+
+
+@contextlib.contextmanager
+def ConfigMock(read_file=None):
+ with unittest.mock.patch('hypervideo_dl.options.Config') as mock:
+ mock.return_value = Config(create_parser())
+ if read_file is not None:
+ mock.read_file = read_file
+
+ yield mock
+
+
+def make_expected(*filepaths):
+ return expected_from_expected_groups(_generate_expected_groups(), *filepaths)
+
+
+def make_expected_groups(*filepaths):
+ return _filter_expected_groups(_generate_expected_groups(), filepaths)
+
+
+def expected_from_expected_groups(expected_groups, *filepaths):
+ return list(itertools.chain.from_iterable(
+ _filter_expected_groups(expected_groups, filepaths).values()))
+
+
+def _filter_expected_groups(expected, filepaths):
+ if not filepaths:
+ return expected
+
+ result = {}
+ for group, paths in expected.items():
+ new_paths = []
+ for path in paths:
+ new_paths.append(path)
+ if path in filepaths:
+ break
+
+ result[group] = new_paths
+
+ return result
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/test/test_cookies.py b/test/test_cookies.py
index ab5dd02..46369ca 100644
--- a/test/test_cookies.py
+++ b/test/test_cookies.py
@@ -49,32 +49,38 @@ class TestCookies(unittest.TestCase):
""" based on https://chromium.googlesource.com/chromium/src/+/refs/heads/main/base/nix/xdg_util_unittest.cc """
test_cases = [
({}, _LinuxDesktopEnvironment.OTHER),
+ ({'DESKTOP_SESSION': 'my_custom_de'}, _LinuxDesktopEnvironment.OTHER),
+ ({'XDG_CURRENT_DESKTOP': 'my_custom_de'}, _LinuxDesktopEnvironment.OTHER),
({'DESKTOP_SESSION': 'gnome'}, _LinuxDesktopEnvironment.GNOME),
({'DESKTOP_SESSION': 'mate'}, _LinuxDesktopEnvironment.GNOME),
- ({'DESKTOP_SESSION': 'kde4'}, _LinuxDesktopEnvironment.KDE),
- ({'DESKTOP_SESSION': 'kde'}, _LinuxDesktopEnvironment.KDE),
+ ({'DESKTOP_SESSION': 'kde4'}, _LinuxDesktopEnvironment.KDE4),
+ ({'DESKTOP_SESSION': 'kde'}, _LinuxDesktopEnvironment.KDE3),
({'DESKTOP_SESSION': 'xfce'}, _LinuxDesktopEnvironment.XFCE),
({'GNOME_DESKTOP_SESSION_ID': 1}, _LinuxDesktopEnvironment.GNOME),
- ({'KDE_FULL_SESSION': 1}, _LinuxDesktopEnvironment.KDE),
+ ({'KDE_FULL_SESSION': 1}, _LinuxDesktopEnvironment.KDE3),
+ ({'KDE_FULL_SESSION': 1, 'DESKTOP_SESSION': 'kde4'}, _LinuxDesktopEnvironment.KDE4),
({'XDG_CURRENT_DESKTOP': 'X-Cinnamon'}, _LinuxDesktopEnvironment.CINNAMON),
+ ({'XDG_CURRENT_DESKTOP': 'Deepin'}, _LinuxDesktopEnvironment.DEEPIN),
({'XDG_CURRENT_DESKTOP': 'GNOME'}, _LinuxDesktopEnvironment.GNOME),
({'XDG_CURRENT_DESKTOP': 'GNOME:GNOME-Classic'}, _LinuxDesktopEnvironment.GNOME),
({'XDG_CURRENT_DESKTOP': 'GNOME : GNOME-Classic'}, _LinuxDesktopEnvironment.GNOME),
({'XDG_CURRENT_DESKTOP': 'Unity', 'DESKTOP_SESSION': 'gnome-fallback'}, _LinuxDesktopEnvironment.GNOME),
- ({'XDG_CURRENT_DESKTOP': 'KDE', 'KDE_SESSION_VERSION': '5'}, _LinuxDesktopEnvironment.KDE),
- ({'XDG_CURRENT_DESKTOP': 'KDE'}, _LinuxDesktopEnvironment.KDE),
+ ({'XDG_CURRENT_DESKTOP': 'KDE', 'KDE_SESSION_VERSION': '5'}, _LinuxDesktopEnvironment.KDE5),
+ ({'XDG_CURRENT_DESKTOP': 'KDE', 'KDE_SESSION_VERSION': '6'}, _LinuxDesktopEnvironment.KDE6),
+ ({'XDG_CURRENT_DESKTOP': 'KDE'}, _LinuxDesktopEnvironment.KDE4),
({'XDG_CURRENT_DESKTOP': 'Pantheon'}, _LinuxDesktopEnvironment.PANTHEON),
+ ({'XDG_CURRENT_DESKTOP': 'UKUI'}, _LinuxDesktopEnvironment.UKUI),
({'XDG_CURRENT_DESKTOP': 'Unity'}, _LinuxDesktopEnvironment.UNITY),
({'XDG_CURRENT_DESKTOP': 'Unity:Unity7'}, _LinuxDesktopEnvironment.UNITY),
({'XDG_CURRENT_DESKTOP': 'Unity:Unity8'}, _LinuxDesktopEnvironment.UNITY),
]
for env, expected_desktop_environment in test_cases:
- self.assertEqual(_get_linux_desktop_environment(env), expected_desktop_environment)
+ self.assertEqual(_get_linux_desktop_environment(env, Logger()), expected_desktop_environment)
def test_chrome_cookie_decryptor_linux_derive_key(self):
key = LinuxChromeCookieDecryptor.derive_key(b'abc')
diff --git a/test/test_download.py b/test/test_download.py
index 6f77343..7c05413 100755
--- a/test/test_download.py
+++ b/test/test_download.py
@@ -10,10 +10,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import collections
import hashlib
-import http.client
import json
-import socket
-import urllib.error
from test.helper import (
assertGreaterEqual,
@@ -29,6 +26,7 @@ from test.helper import (
import hypervideo_dl.YoutubeDL # isort: split
from hypervideo_dl.extractor import get_info_extractor
+from hypervideo_dl.networking.exceptions import HTTPError, TransportError
from hypervideo_dl.utils import (
DownloadError,
ExtractorError,
@@ -162,8 +160,7 @@ def generator(test_case, tname):
force_generic_extractor=params.get('force_generic_extractor', False))
except (DownloadError, ExtractorError) as err:
# Check if the exception is not a network related one
- if (err.exc_info[0] not in (urllib.error.URLError, socket.timeout, UnavailableVideoError, http.client.BadStatusLine)
- or (err.exc_info[0] == urllib.error.HTTPError and err.exc_info[1].code == 503)):
+ if not isinstance(err.exc_info[1], (TransportError, UnavailableVideoError)) or (isinstance(err.exc_info[1], HTTPError) and err.exc_info[1].status == 503):
err.msg = f'{getattr(err, "msg", err)} ({tname})'
raise
@@ -249,7 +246,7 @@ def generator(test_case, tname):
# extractor returns full results even with extract_flat
res_tcs = [{'info_dict': e} for e in res_dict['entries']]
try_rm_tcs_files(res_tcs)
-
+ ydl.close()
return test_template
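
The rewritten exception check above classifies failures with the new networking hierarchy instead of urllib/socket types. Standalone, the same decision reads roughly like this sketch:

from hypervideo_dl.networking.exceptions import HTTPError, TransportError
from hypervideo_dl.utils import UnavailableVideoError


def is_retryable_network_failure(exc):
    # Transport-level flakiness and unavailable videos are tolerated;
    # an HTTP 503 (and any other non-network error) fails the run.
    return (isinstance(exc, (TransportError, UnavailableVideoError))
            and not (isinstance(exc, HTTPError) and exc.status == 503))
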
diff --git a/test/test_downloader_external.py b/test/test_downloader_external.py
new file mode 100644
index 0000000..3200e74
--- /dev/null
+++ b/test/test_downloader_external.py
@@ -0,0 +1,139 @@
+#!/usr/bin/env python3
+
+# Allow direct execution
+import os
+import sys
+import unittest
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import http.cookiejar
+
+from test.helper import FakeYDL
+from hypervideo_dl.downloader.external import (
+ Aria2cFD,
+ AxelFD,
+ CurlFD,
+ FFmpegFD,
+ HttpieFD,
+ WgetFD,
+)
+
+TEST_COOKIE = {
+ 'version': 0,
+ 'name': 'test',
+ 'value': 'ytdlp',
+ 'port': None,
+ 'port_specified': False,
+ 'domain': '.example.com',
+ 'domain_specified': True,
+ 'domain_initial_dot': False,
+ 'path': '/',
+ 'path_specified': True,
+ 'secure': False,
+ 'expires': None,
+ 'discard': False,
+ 'comment': None,
+ 'comment_url': None,
+ 'rest': {},
+}
+
+TEST_INFO = {'url': 'http://www.example.com/'}
+
+
+class TestHttpieFD(unittest.TestCase):
+ def test_make_cmd(self):
+ with FakeYDL() as ydl:
+ downloader = HttpieFD(ydl, {})
+ self.assertEqual(
+ downloader._make_cmd('test', TEST_INFO),
+ ['http', '--download', '--output', 'test', 'http://www.example.com/'])
+
+ # Test cookie header is added
+ ydl.cookiejar.set_cookie(http.cookiejar.Cookie(**TEST_COOKIE))
+ self.assertEqual(
+ downloader._make_cmd('test', TEST_INFO),
+ ['http', '--download', '--output', 'test', 'http://www.example.com/', 'Cookie:test=ytdlp'])
+
+
+class TestAxelFD(unittest.TestCase):
+ def test_make_cmd(self):
+ with FakeYDL() as ydl:
+ downloader = AxelFD(ydl, {})
+ self.assertEqual(
+ downloader._make_cmd('test', TEST_INFO),
+ ['axel', '-o', 'test', '--', 'http://www.example.com/'])
+
+ # Test cookie header is added
+ ydl.cookiejar.set_cookie(http.cookiejar.Cookie(**TEST_COOKIE))
+ self.assertEqual(
+ downloader._make_cmd('test', TEST_INFO),
+ ['axel', '-o', 'test', '-H', 'Cookie: test=ytdlp', '--max-redirect=0', '--', 'http://www.example.com/'])
+
+
+class TestWgetFD(unittest.TestCase):
+ def test_make_cmd(self):
+ with FakeYDL() as ydl:
+ downloader = WgetFD(ydl, {})
+ self.assertNotIn('--load-cookies', downloader._make_cmd('test', TEST_INFO))
+ # Test cookiejar tempfile arg is added
+ ydl.cookiejar.set_cookie(http.cookiejar.Cookie(**TEST_COOKIE))
+ self.assertIn('--load-cookies', downloader._make_cmd('test', TEST_INFO))
+
+
+class TestCurlFD(unittest.TestCase):
+ def test_make_cmd(self):
+ with FakeYDL() as ydl:
+ downloader = CurlFD(ydl, {})
+ self.assertNotIn('--cookie', downloader._make_cmd('test', TEST_INFO))
+ # Test cookie header is added
+ ydl.cookiejar.set_cookie(http.cookiejar.Cookie(**TEST_COOKIE))
+ self.assertIn('--cookie', downloader._make_cmd('test', TEST_INFO))
+ self.assertIn('test=ytdlp', downloader._make_cmd('test', TEST_INFO))
+
+
+class TestAria2cFD(unittest.TestCase):
+ def test_make_cmd(self):
+ with FakeYDL() as ydl:
+ downloader = Aria2cFD(ydl, {})
+ downloader._make_cmd('test', TEST_INFO)
+ self.assertFalse(hasattr(downloader, '_cookies_tempfile'))
+
+ # Test cookiejar tempfile arg is added
+ ydl.cookiejar.set_cookie(http.cookiejar.Cookie(**TEST_COOKIE))
+ cmd = downloader._make_cmd('test', TEST_INFO)
+ self.assertIn(f'--load-cookies={downloader._cookies_tempfile}', cmd)
+
+
+@unittest.skipUnless(FFmpegFD.available(), 'ffmpeg not found')
+class TestFFmpegFD(unittest.TestCase):
+ _args = []
+
+ def _test_cmd(self, args):
+ self._args = args
+
+ def test_make_cmd(self):
+ with FakeYDL() as ydl:
+ downloader = FFmpegFD(ydl, {})
+ downloader._debug_cmd = self._test_cmd
+
+ downloader._call_downloader('test', {**TEST_INFO, 'ext': 'mp4'})
+ self.assertEqual(self._args, [
+ 'ffmpeg', '-y', '-hide_banner', '-i', 'http://www.example.com/',
+ '-c', 'copy', '-f', 'mp4', 'file:test'])
+
+ # Test cookies arg is added
+ ydl.cookiejar.set_cookie(http.cookiejar.Cookie(**TEST_COOKIE))
+ downloader._call_downloader('test', {**TEST_INFO, 'ext': 'mp4'})
+ self.assertEqual(self._args, [
+ 'ffmpeg', '-y', '-hide_banner', '-cookies', 'test=ytdlp; path=/; domain=.example.com;\r\n',
+ '-i', 'http://www.example.com/', '-c', 'copy', '-f', 'mp4', 'file:test'])
+
+ # Test with non-url input (ffmpeg reads from stdin '-' for websockets)
+ downloader._call_downloader('test', {'url': 'x', 'ext': 'mp4'})
+ self.assertEqual(self._args, [
+ 'ffmpeg', '-y', '-hide_banner', '-i', 'x', '-c', 'copy', '-f', 'mp4', 'file:test'])
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/test/test_downloader_http.py b/test/test_downloader_http.py
index 3b65859..a422141 100644
--- a/test/test_downloader_http.py
+++ b/test/test_downloader_http.py
@@ -16,6 +16,7 @@ from test.helper import http_server_port, try_rm
from hypervideo_dl import YoutubeDL
from hypervideo_dl.downloader.http import HttpFD
from hypervideo_dl.utils import encodeFilename
+from hypervideo_dl.utils._utils import _YDLLogger as FakeLogger
TEST_DIR = os.path.dirname(os.path.abspath(__file__))
@@ -67,17 +68,6 @@ class HTTPTestRequestHandler(http.server.BaseHTTPRequestHandler):
assert False
-class FakeLogger:
- def debug(self, msg):
- pass
-
- def warning(self, msg):
- pass
-
- def error(self, msg):
- pass
-
-
class TestHttpFD(unittest.TestCase):
def setUp(self):
self.httpd = http.server.HTTPServer(
diff --git a/test/test_networking.py b/test/test_networking.py
new file mode 100644
index 0000000..ca7ecf0
--- /dev/null
+++ b/test/test_networking.py
@@ -0,0 +1,1439 @@
+#!/usr/bin/env python3
+
+# Allow direct execution
+import os
+import sys
+
+import pytest
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import gzip
+import http.client
+import http.cookiejar
+import http.server
+import io
+import pathlib
+import random
+import ssl
+import tempfile
+import threading
+import time
+import urllib.error
+import urllib.request
+import warnings
+import zlib
+from email.message import Message
+from http.cookiejar import CookieJar
+
+from test.helper import FakeYDL, http_server_port
+from hypervideo_dl.cookies import YoutubeDLCookieJar
+from hypervideo_dl.dependencies import brotli
+from hypervideo_dl.networking import (
+ HEADRequest,
+ PUTRequest,
+ Request,
+ RequestDirector,
+ RequestHandler,
+ Response,
+)
+from hypervideo_dl.networking._urllib import UrllibRH
+from hypervideo_dl.networking.exceptions import (
+ CertificateVerifyError,
+ HTTPError,
+ IncompleteRead,
+ NoSupportingHandlers,
+ RequestError,
+ SSLError,
+ TransportError,
+ UnsupportedRequest,
+)
+from hypervideo_dl.utils._utils import _YDLLogger as FakeLogger
+from hypervideo_dl.utils.networking import HTTPHeaderDict
+
+TEST_DIR = os.path.dirname(os.path.abspath(__file__))
+
+
+def _build_proxy_handler(name):
+ class HTTPTestRequestHandler(http.server.BaseHTTPRequestHandler):
+ proxy_name = name
+
+ def log_message(self, format, *args):
+ pass
+
+ def do_GET(self):
+ self.send_response(200)
+ self.send_header('Content-Type', 'text/plain; charset=utf-8')
+ self.end_headers()
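+ # Echo '<proxy name>: <request target>'; for requests sent through an
+ # HTTP proxy the target is the absolute URL, which is what the proxy
+ # tests assert against.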
+ self.wfile.write(f'{self.proxy_name}: {self.path}'.encode())
+ return HTTPTestRequestHandler
+
+
+class HTTPTestRequestHandler(http.server.BaseHTTPRequestHandler):
+ protocol_version = 'HTTP/1.1'
+
+ def log_message(self, format, *args):
+ pass
+
+ def _headers(self):
+ payload = str(self.headers).encode()
+ self.send_response(200)
+ self.send_header('Content-Type', 'application/json')
+ self.send_header('Content-Length', str(len(payload)))
+ self.end_headers()
+ self.wfile.write(payload)
+
+ def _redirect(self):
+ self.send_response(int(self.path[len('/redirect_'):]))
+ self.send_header('Location', '/method')
+ self.send_header('Content-Length', '0')
+ self.end_headers()
+
+ def _method(self, method, payload=None):
+ self.send_response(200)
+ self.send_header('Content-Length', str(len(payload or '')))
+ self.send_header('Method', method)
+ self.end_headers()
+ if payload:
+ self.wfile.write(payload)
+
+ def _status(self, status):
+ payload = f'<html>{status} NOT FOUND</html>'.encode()
+ self.send_response(int(status))
+ self.send_header('Content-Type', 'text/html; charset=utf-8')
+ self.send_header('Content-Length', str(len(payload)))
+ self.end_headers()
+ self.wfile.write(payload)
+
+ def _read_data(self):
+ if 'Content-Length' in self.headers:
+ return self.rfile.read(int(self.headers['Content-Length']))
+
+ def do_POST(self):
+ data = (self._read_data() or b'') + str(self.headers).encode()  # _read_data() is None without a Content-Length
+ if self.path.startswith('/redirect_'):
+ self._redirect()
+ elif self.path.startswith('/method'):
+ self._method('POST', data)
+ elif self.path.startswith('/headers'):
+ self._headers()
+ else:
+ self._status(404)
+
+ def do_HEAD(self):
+ if self.path.startswith('/redirect_'):
+ self._redirect()
+ elif self.path.startswith('/method'):
+ self._method('HEAD')
+ else:
+ self._status(404)
+
+ def do_PUT(self):
+ data = (self._read_data() or b'') + str(self.headers).encode()  # _read_data() is None without a Content-Length
+ if self.path.startswith('/redirect_'):
+ self._redirect()
+ elif self.path.startswith('/method'):
+ self._method('PUT', data)
+ else:
+ self._status(404)
+
+ def do_GET(self):
+ if self.path == '/video.html':
+ payload = b'<html><video src="/vid.mp4" /></html>'
+ self.send_response(200)
+ self.send_header('Content-Type', 'text/html; charset=utf-8')
+ self.send_header('Content-Length', str(len(payload)))
+ self.end_headers()
+ self.wfile.write(payload)
+ elif self.path == '/vid.mp4':
+ payload = b'\x00\x00\x00\x00\x20\x66\x74[video]'
+ self.send_response(200)
+ self.send_header('Content-Type', 'video/mp4')
+ self.send_header('Content-Length', str(len(payload)))
+ self.end_headers()
+ self.wfile.write(payload)
+ elif self.path == '/%E4%B8%AD%E6%96%87.html':
+ payload = b'<html><video src="/vid.mp4" /></html>'
+ self.send_response(200)
+ self.send_header('Content-Type', 'text/html; charset=utf-8')
+ self.send_header('Content-Length', str(len(payload)))
+ self.end_headers()
+ self.wfile.write(payload)
+ elif self.path == '/%c7%9f':
+ payload = b'<html><video src="/vid.mp4" /></html>'
+ self.send_response(200)
+ self.send_header('Content-Type', 'text/html; charset=utf-8')
+ self.send_header('Content-Length', str(len(payload)))
+ self.end_headers()
+ self.wfile.write(payload)
+ elif self.path.startswith('/redirect_loop'):
+ self.send_response(301)
+ self.send_header('Location', self.path)
+ self.send_header('Content-Length', '0')
+ self.end_headers()
+ elif self.path == '/redirect_dotsegments':
+ self.send_response(301)
+ # redirect to /headers but with dot segments before
+ self.send_header('Location', '/a/b/./../../headers')
+ self.send_header('Content-Length', '0')
+ self.end_headers()
+ elif self.path.startswith('/redirect_'):
+ self._redirect()
+ elif self.path.startswith('/method'):
+ self._method('GET', str(self.headers).encode())
+ elif self.path.startswith('/headers'):
+ self._headers()
+ elif self.path.startswith('/308-to-headers'):
+ self.send_response(308)
+ self.send_header('Location', '/headers')
+ self.send_header('Content-Length', '0')
+ self.end_headers()
+ elif self.path == '/trailing_garbage':
+ payload = b'<html><video src="/vid.mp4" /></html>'
+ self.send_response(200)
+ self.send_header('Content-Type', 'text/html; charset=utf-8')
+ self.send_header('Content-Encoding', 'gzip')
+ buf = io.BytesIO()
+ with gzip.GzipFile(fileobj=buf, mode='wb') as f:
+ f.write(payload)
+ compressed = buf.getvalue() + b'trailing garbage'
+ self.send_header('Content-Length', str(len(compressed)))
+ self.end_headers()
+ self.wfile.write(compressed)
+ elif self.path == '/302-non-ascii-redirect':
+ new_url = f'http://127.0.0.1:{http_server_port(self.server)}/中文.html'
+ self.send_response(301)
+ self.send_header('Location', new_url)
+ self.send_header('Content-Length', '0')
+ self.end_headers()
+ elif self.path == '/content-encoding':
+ encodings = self.headers.get('ytdl-encoding', '')
+ payload = b'<html><video src="/vid.mp4" /></html>'
+ for encoding in filter(None, (e.strip() for e in encodings.split(','))):
+ if encoding == 'br' and brotli:
+ payload = brotli.compress(payload)
+ elif encoding == 'gzip':
+ buf = io.BytesIO()
+ with gzip.GzipFile(fileobj=buf, mode='wb') as f:
+ f.write(payload)
+ payload = buf.getvalue()
+ elif encoding == 'deflate':
+ payload = zlib.compress(payload)
+ elif encoding == 'unsupported':
+ payload = b'raw'
+ break
+ else:
+ self._status(415)
+ return
+ self.send_response(200)
+ self.send_header('Content-Encoding', encodings)
+ self.send_header('Content-Length', str(len(payload)))
+ self.end_headers()
+ self.wfile.write(payload)
+ elif self.path.startswith('/gen_'):
+ payload = b'<html></html>'
+ self.send_response(int(self.path[len('/gen_'):]))
+ self.send_header('Content-Type', 'text/html; charset=utf-8')
+ self.send_header('Content-Length', str(len(payload)))
+ self.end_headers()
+ self.wfile.write(payload)
+ elif self.path.startswith('/incompleteread'):
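+ # Advertise a much larger Content-Length than is actually written,
+ # then finish the request, so clients observe an IncompleteRead.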
+ payload = b'<html></html>'
+ self.send_response(200)
+ self.send_header('Content-Type', 'text/html; charset=utf-8')
+ self.send_header('Content-Length', '234234')
+ self.end_headers()
+ self.wfile.write(payload)
+ self.finish()
+ elif self.path.startswith('/timeout_'):
+ time.sleep(int(self.path[len('/timeout_'):]))
+ self._headers()
+ elif self.path == '/source_address':
+ payload = str(self.client_address[0]).encode()
+ self.send_response(200)
+ self.send_header('Content-Type', 'text/html; charset=utf-8')
+ self.send_header('Content-Length', str(len(payload)))
+ self.end_headers()
+ self.wfile.write(payload)
+ self.finish()
+ else:
+ self._status(404)
+
+ def send_header(self, keyword, value):
+ """
+ Forcibly allow the HTTP server to send non-percent-encoded, non-ASCII characters in headers.
+ This violates RFC 3986, but we need to test that we support it,
+ since some sites incorrectly do this.
+ """
+ if keyword.lower() == 'connection':
+ return super().send_header(keyword, value)
+
+ if not hasattr(self, '_headers_buffer'):
+ self._headers_buffer = []
+
+ self._headers_buffer.append(f'{keyword}: {value}\r\n'.encode())
+
+
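+# Validate before sending, mirroring how the RequestDirector drives handlers:
+# an unsupported request should raise UnsupportedRequest here rather than
+# failing mid-transfer.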
+def validate_and_send(rh, req):
+ rh.validate(req)
+ return rh.send(req)
+
+
+class TestRequestHandlerBase:
+ @classmethod
+ def setup_class(cls):
+ cls.http_httpd = http.server.ThreadingHTTPServer(
+ ('127.0.0.1', 0), HTTPTestRequestHandler)
+ cls.http_port = http_server_port(cls.http_httpd)
+ cls.http_server_thread = threading.Thread(target=cls.http_httpd.serve_forever)
+ # FIXME: we should probably stop the http server thread after each test
+ # See: https://github.com/hypervideo/hypervideo/pull/7094#discussion_r1199746041
+ cls.http_server_thread.daemon = True
+ cls.http_server_thread.start()
+
+ # HTTPS server
+ certfn = os.path.join(TEST_DIR, 'testcert.pem')
+ cls.https_httpd = http.server.ThreadingHTTPServer(
+ ('127.0.0.1', 0), HTTPTestRequestHandler)
+ sslctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
+ sslctx.load_cert_chain(certfn, None)
+ cls.https_httpd.socket = sslctx.wrap_socket(cls.https_httpd.socket, server_side=True)
+ cls.https_port = http_server_port(cls.https_httpd)
+ cls.https_server_thread = threading.Thread(target=cls.https_httpd.serve_forever)
+ cls.https_server_thread.daemon = True
+ cls.https_server_thread.start()
+
+
+class TestHTTPRequestHandler(TestRequestHandlerBase):
+ @pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
+ def test_verify_cert(self, handler):
+ with handler() as rh:
+ with pytest.raises(CertificateVerifyError):
+ validate_and_send(rh, Request(f'https://127.0.0.1:{self.https_port}/headers'))
+
+ with handler(verify=False) as rh:
+ r = validate_and_send(rh, Request(f'https://127.0.0.1:{self.https_port}/headers'))
+ assert r.status == 200
+ r.close()
+
+ @pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
+ def test_ssl_error(self, handler):
+ # HTTPS server with too old TLS version
+ # XXX: is there a better way to test this than to create a new server?
+ https_httpd = http.server.ThreadingHTTPServer(
+ ('127.0.0.1', 0), HTTPTestRequestHandler)
+ sslctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
+ https_httpd.socket = sslctx.wrap_socket(https_httpd.socket, server_side=True)
+ https_port = http_server_port(https_httpd)
+ https_server_thread = threading.Thread(target=https_httpd.serve_forever)
+ https_server_thread.daemon = True
+ https_server_thread.start()
+
+ with handler(verify=False) as rh:
+ with pytest.raises(SSLError, match='sslv3 alert handshake failure') as exc_info:
+ validate_and_send(rh, Request(f'https://127.0.0.1:{https_port}/headers'))
+ assert not issubclass(exc_info.type, CertificateVerifyError)
+
+ @pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
+ def test_percent_encode(self, handler):
+ with handler() as rh:
+ # Unicode characters should be encoded with uppercase percent-encoding
+ res = validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/中文.html'))
+ assert res.status == 200
+ res.close()
+ # don't normalize existing percent encodings
+ res = validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/%c7%9f'))
+ assert res.status == 200
+ res.close()
+
+ @pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
+ def test_remove_dot_segments(self, handler):
+ with handler() as rh:
+ # This isn't a comprehensive test,
+ # but it should be enough to check whether the handler is removing dot segments
+ res = validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/a/b/./../../headers'))
+ assert res.status == 200
+ assert res.url == f'http://127.0.0.1:{self.http_port}/headers'
+ res.close()
+
+ res = validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/redirect_dotsegments'))
+ assert res.status == 200
+ assert res.url == f'http://127.0.0.1:{self.http_port}/headers'
+ res.close()
+
+ @pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
+ def test_unicode_path_redirection(self, handler):
+ with handler() as rh:
+ r = validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/302-non-ascii-redirect'))
+ assert r.url == f'http://127.0.0.1:{self.http_port}/%E4%B8%AD%E6%96%87.html'
+ r.close()
+
+ @pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
+ def test_raise_http_error(self, handler):
+ with handler() as rh:
+ for bad_status in (400, 500, 599, 302):
+ with pytest.raises(HTTPError):
+ validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/gen_{bad_status}'))
+
+ # Should not raise an error
+ validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/gen_200')).close()
+
+ @pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
+ def test_response_url(self, handler):
+ with handler() as rh:
+ # Response url should be that of the last url in redirect chain
+ res = validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/redirect_301'))
+ assert res.url == f'http://127.0.0.1:{self.http_port}/method'
+ res.close()
+ res2 = validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/gen_200'))
+ assert res2.url == f'http://127.0.0.1:{self.http_port}/gen_200'
+ res2.close()
+
+ @pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
+ def test_redirect(self, handler):
+ with handler() as rh:
+ def do_req(redirect_status, method, assert_no_content=False):
+ data = b'testdata' if method in ('POST', 'PUT') else None
+ res = validate_and_send(
+ rh, Request(f'http://127.0.0.1:{self.http_port}/redirect_{redirect_status}', method=method, data=data))
+
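+ # The /method endpoint echoes the request body followed by the
+ # request headers; read back len(data) bytes to check whether the
+ # body survived the redirect, and treat a mismatch as header data.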
+ headers = b''
+ data_sent = b''
+ if data is not None:
+ data_sent += res.read(len(data))
+ if data_sent != data:
+ headers += data_sent
+ data_sent = b''
+
+ headers += res.read()
+
+ if assert_no_content or data is None:
+ assert b'Content-Type' not in headers
+ assert b'Content-Length' not in headers
+ else:
+ assert b'Content-Type' in headers
+ assert b'Content-Length' in headers
+
+ return data_sent.decode(), res.headers.get('method', '')
+
+ # A 303 must either use GET or HEAD for subsequent request
+ assert do_req(303, 'POST', True) == ('', 'GET')
+ assert do_req(303, 'HEAD') == ('', 'HEAD')
+
+ assert do_req(303, 'PUT', True) == ('', 'GET')
+
+ # 301 and 302 turn POST only into a GET
+ assert do_req(301, 'POST', True) == ('', 'GET')
+ assert do_req(301, 'HEAD') == ('', 'HEAD')
+ assert do_req(302, 'POST', True) == ('', 'GET')
+ assert do_req(302, 'HEAD') == ('', 'HEAD')
+
+ assert do_req(301, 'PUT') == ('testdata', 'PUT')
+ assert do_req(302, 'PUT') == ('testdata', 'PUT')
+
+ # 307 and 308 should not change method
+ for m in ('POST', 'PUT'):
+ assert do_req(307, m) == ('testdata', m)
+ assert do_req(308, m) == ('testdata', m)
+
+ assert do_req(307, 'HEAD') == ('', 'HEAD')
+ assert do_req(308, 'HEAD') == ('', 'HEAD')
+
+ # These should not redirect and instead raise an HTTPError
+ for code in (300, 304, 305, 306):
+ with pytest.raises(HTTPError):
+ do_req(code, 'GET')
+
+ @pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
+ def test_request_cookie_header(self, handler):
+ # A Cookie header passed among the normal request headers should be accepted and handled appropriately.
+ with handler() as rh:
+ # Specified Cookie header should be used
+ res = validate_and_send(
+ rh, Request(
+ f'http://127.0.0.1:{self.http_port}/headers',
+ headers={'Cookie': 'test=test'})).read().decode()
+ assert 'Cookie: test=test' in res
+
+ # Specified Cookie header should be removed on any redirect
+ res = validate_and_send(
+ rh, Request(
+ f'http://127.0.0.1:{self.http_port}/308-to-headers',
+ headers={'Cookie': 'test=test'})).read().decode()
+ assert 'Cookie: test=test' not in res
+
+ # Specified Cookie header should override global cookiejar for that request
+ cookiejar = YoutubeDLCookieJar()
+ cookiejar.set_cookie(http.cookiejar.Cookie(
+ version=0, name='test', value='ytdlp', port=None, port_specified=False,
+ domain='127.0.0.1', domain_specified=True, domain_initial_dot=False, path='/',
+ path_specified=True, secure=False, expires=None, discard=False, comment=None,
+ comment_url=None, rest={}))
+
+ with handler(cookiejar=cookiejar) as rh:
+ data = validate_and_send(
+ rh, Request(f'http://127.0.0.1:{self.http_port}/headers', headers={'cookie': 'test=test'})).read()
+ assert b'Cookie: test=ytdlp' not in data
+ assert b'Cookie: test=test' in data
+
+ @pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
+ def test_redirect_loop(self, handler):
+ with handler() as rh:
+ with pytest.raises(HTTPError, match='redirect loop'):
+ validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/redirect_loop'))
+
+ @pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
+ def test_incompleteread(self, handler):
+ with handler(timeout=2) as rh:
+ with pytest.raises(IncompleteRead):
+ validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/incompleteread')).read()
+
+ @pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
+ def test_cookies(self, handler):
+ cookiejar = YoutubeDLCookieJar()
+ cookiejar.set_cookie(http.cookiejar.Cookie(
+ 0, 'test', 'ytdlp', None, False, '127.0.0.1', True,
+ False, '/headers', True, False, None, False, None, None, {}))
+
+ with handler(cookiejar=cookiejar) as rh:
+ data = validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/headers')).read()
+ assert b'Cookie: test=ytdlp' in data
+
+ # Per request
+ with handler() as rh:
+ data = validate_and_send(
+ rh, Request(f'http://127.0.0.1:{self.http_port}/headers', extensions={'cookiejar': cookiejar})).read()
+ assert b'Cookie: test=ytdlp' in data
+
+ @pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
+ def test_headers(self, handler):
+ with handler(headers=HTTPHeaderDict({'test1': 'test', 'test2': 'test2'})) as rh:
+ # Global Headers
+ data = validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/headers')).read()
+ assert b'Test1: test' in data
+
+ # Per request headers, merged with global
+ data = validate_and_send(rh, Request(
+ f'http://127.0.0.1:{self.http_port}/headers', headers={'test2': 'changed', 'test3': 'test3'})).read()
+ assert b'Test1: test' in data
+ assert b'Test2: changed' in data
+ assert b'Test2: test2' not in data
+ assert b'Test3: test3' in data
+
+ @pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
+ def test_timeout(self, handler):
+ with handler() as rh:
+ # Default timeout is 20 seconds, so this should go through
+ validate_and_send(
+ rh, Request(f'http://127.0.0.1:{self.http_port}/timeout_3'))
+
+ with handler(timeout=0.5) as rh:
+ with pytest.raises(TransportError):
+ validate_and_send(
+ rh, Request(f'http://127.0.0.1:{self.http_port}/timeout_1'))
+
+ # Per request timeout, should override handler timeout
+ validate_and_send(
+ rh, Request(f'http://127.0.0.1:{self.http_port}/timeout_1', extensions={'timeout': 4}))
+
+ @pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
+ def test_source_address(self, handler):
+ source_address = f'127.0.0.{random.randint(5, 255)}'
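+ # Binding to a random 127.0.0.x alias assumes the OS routes the whole
+ # 127.0.0.0/8 block to loopback (true on Linux, not everywhere).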
+ with handler(source_address=source_address) as rh:
+ data = validate_and_send(
+ rh, Request(f'http://127.0.0.1:{self.http_port}/source_address')).read().decode()
+ assert source_address == data
+
+ @pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
+ def test_gzip_trailing_garbage(self, handler):
+ with handler() as rh:
+ data = validate_and_send(rh, Request(f'http://localhost:{self.http_port}/trailing_garbage')).read().decode()
+ assert data == '<html><video src="/vid.mp4" /></html>'
+
+ @pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
+ @pytest.mark.skipif(not brotli, reason='brotli support is not installed')
+ def test_brotli(self, handler):
+ with handler() as rh:
+ res = validate_and_send(
+ rh, Request(
+ f'http://127.0.0.1:{self.http_port}/content-encoding',
+ headers={'ytdl-encoding': 'br'}))
+ assert res.headers.get('Content-Encoding') == 'br'
+ assert res.read() == b'<html><video src="/vid.mp4" /></html>'
+
+ @pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
+ def test_deflate(self, handler):
+ with handler() as rh:
+ res = validate_and_send(
+ rh, Request(
+ f'http://127.0.0.1:{self.http_port}/content-encoding',
+ headers={'ytdl-encoding': 'deflate'}))
+ assert res.headers.get('Content-Encoding') == 'deflate'
+ assert res.read() == b'<html><video src="/vid.mp4" /></html>'
+
+ @pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
+ def test_gzip(self, handler):
+ with handler() as rh:
+ res = validate_and_send(
+ rh, Request(
+ f'http://127.0.0.1:{self.http_port}/content-encoding',
+ headers={'ytdl-encoding': 'gzip'}))
+ assert res.headers.get('Content-Encoding') == 'gzip'
+ assert res.read() == b'<html><video src="/vid.mp4" /></html>'
+
+ @pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
+ def test_multiple_encodings(self, handler):
+ with handler() as rh:
+ for pair in ('gzip,deflate', 'deflate, gzip', 'gzip, gzip', 'deflate, deflate'):
+ res = validate_and_send(
+ rh, Request(
+ f'http://127.0.0.1:{self.http_port}/content-encoding',
+ headers={'ytdl-encoding': pair}))
+ assert res.headers.get('Content-Encoding') == pair
+ assert res.read() == b'<html><video src="/vid.mp4" /></html>'
+
+ @pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
+ def test_unsupported_encoding(self, handler):
+ with handler() as rh:
+ res = validate_and_send(
+ rh, Request(
+ f'http://127.0.0.1:{self.http_port}/content-encoding',
+ headers={'ytdl-encoding': 'unsupported'}))
+ assert res.headers.get('Content-Encoding') == 'unsupported'
+ assert res.read() == b'raw'
+
+ @pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
+ def test_read(self, handler):
+ with handler() as rh:
+ res = validate_and_send(
+ rh, Request(f'http://127.0.0.1:{self.http_port}/headers'))
+ assert res.readable()
+ assert res.read(1) == b'H'
+ assert res.read(3) == b'ost'
+
+
+class TestHTTPProxy(TestRequestHandlerBase):
+ @classmethod
+ def setup_class(cls):
+ super().setup_class()
+ # HTTP Proxy server
+ cls.proxy = http.server.ThreadingHTTPServer(
+ ('127.0.0.1', 0), _build_proxy_handler('normal'))
+ cls.proxy_port = http_server_port(cls.proxy)
+ cls.proxy_thread = threading.Thread(target=cls.proxy.serve_forever)
+ cls.proxy_thread.daemon = True
+ cls.proxy_thread.start()
+
+ # Geo proxy server
+ cls.geo_proxy = http.server.ThreadingHTTPServer(
+ ('127.0.0.1', 0), _build_proxy_handler('geo'))
+ cls.geo_port = http_server_port(cls.geo_proxy)
+ cls.geo_proxy_thread = threading.Thread(target=cls.geo_proxy.serve_forever)
+ cls.geo_proxy_thread.daemon = True
+ cls.geo_proxy_thread.start()
+
+ @pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
+ def test_http_proxy(self, handler):
+ http_proxy = f'http://127.0.0.1:{self.proxy_port}'
+ geo_proxy = f'http://127.0.0.1:{self.geo_port}'
+
+ # Test global http proxy
+ # Test per request http proxy
+ # Test per request http proxy disables proxy
+ url = 'http://foo.com/bar'
+
+ # Global HTTP proxy
+ with handler(proxies={'http': http_proxy}) as rh:
+ res = validate_and_send(rh, Request(url)).read().decode()
+ assert res == f'normal: {url}'
+
+ # Per request proxy overrides global
+ res = validate_and_send(rh, Request(url, proxies={'http': geo_proxy})).read().decode()
+ assert res == f'geo: {url}'
+
+ # and setting to None disables all proxies for that request
+ real_url = f'http://127.0.0.1:{self.http_port}/headers'
+ res = validate_and_send(
+ rh, Request(real_url, proxies={'http': None})).read().decode()
+ assert res != f'normal: {real_url}'
+ assert 'Accept' in res
+
+ @pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
+ def test_noproxy(self, handler):
+ with handler(proxies={'proxy': f'http://127.0.0.1:{self.proxy_port}'}) as rh:
+ # NO_PROXY
+ for no_proxy in (f'127.0.0.1:{self.http_port}', '127.0.0.1', 'localhost'):
+ nop_response = validate_and_send(
+ rh, Request(f'http://127.0.0.1:{self.http_port}/headers', proxies={'no': no_proxy})).read().decode()
+ assert 'Accept' in nop_response
+
+ @pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
+ def test_allproxy(self, handler):
+ url = 'http://foo.com/bar'
+ with handler() as rh:
+ response = validate_and_send(
+ rh, Request(url, proxies={'all': f'http://127.0.0.1:{self.proxy_port}'})).read().decode()
+ assert response == f'normal: {url}'
+
+ @pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
+ def test_http_proxy_with_idn(self, handler):
+ with handler(proxies={
+ 'http': f'http://127.0.0.1:{self.proxy_port}',
+ }) as rh:
+ url = 'http://中文.tw/'
+ response = rh.send(Request(url)).read().decode()
+ # b'xn--fiq228c' is '中文'.encode('idna')
+ assert response == 'normal: http://xn--fiq228c.tw/'
+
+
+class TestClientCertificate:
+
+ @classmethod
+ def setup_class(cls):
+ certfn = os.path.join(TEST_DIR, 'testcert.pem')
+ cls.certdir = os.path.join(TEST_DIR, 'testdata', 'certificate')
+ cacertfn = os.path.join(cls.certdir, 'ca.crt')
+ cls.httpd = http.server.ThreadingHTTPServer(('127.0.0.1', 0), HTTPTestRequestHandler)
+ sslctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
+ sslctx.verify_mode = ssl.CERT_REQUIRED
+ sslctx.load_verify_locations(cafile=cacertfn)
+ sslctx.load_cert_chain(certfn, None)
+ cls.httpd.socket = sslctx.wrap_socket(cls.httpd.socket, server_side=True)
+ cls.port = http_server_port(cls.httpd)
+ cls.server_thread = threading.Thread(target=cls.httpd.serve_forever)
+ cls.server_thread.daemon = True
+ cls.server_thread.start()
+
+ def _run_test(self, handler, **handler_kwargs):
+ with handler(
+ # Disable client-side validation of unacceptable self-signed testcert.pem
+ # The test is of a check on the server side, so unaffected
+ verify=False,
+ **handler_kwargs,
+ ) as rh:
+ validate_and_send(rh, Request(f'https://127.0.0.1:{self.port}/video.html')).read().decode()
+
+ @pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
+ def test_certificate_combined_nopass(self, handler):
+ self._run_test(handler, client_cert={
+ 'client_certificate': os.path.join(self.certdir, 'clientwithkey.crt'),
+ })
+
+ @pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
+ def test_certificate_nocombined_nopass(self, handler):
+ self._run_test(handler, client_cert={
+ 'client_certificate': os.path.join(self.certdir, 'client.crt'),
+ 'client_certificate_key': os.path.join(self.certdir, 'client.key'),
+ })
+
+ @pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
+ def test_certificate_combined_pass(self, handler):
+ self._run_test(handler, client_cert={
+ 'client_certificate': os.path.join(self.certdir, 'clientwithencryptedkey.crt'),
+ 'client_certificate_password': 'foobar',
+ })
+
+ @pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
+ def test_certificate_nocombined_pass(self, handler):
+ self._run_test(handler, client_cert={
+ 'client_certificate': os.path.join(self.certdir, 'client.crt'),
+ 'client_certificate_key': os.path.join(self.certdir, 'clientencrypted.key'),
+ 'client_certificate_password': 'foobar',
+ })
+
+
+class TestUrllibRequestHandler(TestRequestHandlerBase):
+ @pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
+ def test_file_urls(self, handler):
+ # See https://github.com/ytdl-org/youtube-dl/issues/8227
+ tf = tempfile.NamedTemporaryFile(delete=False)
+ tf.write(b'foobar')
+ tf.close()
+ req = Request(pathlib.Path(tf.name).as_uri())
+ with handler() as rh:
+ with pytest.raises(UnsupportedRequest):
+ rh.validate(req)
+
+ # Test that urllib never loaded FileHandler
+ with pytest.raises(TransportError):
+ rh.send(req)
+
+ with handler(enable_file_urls=True) as rh:
+ res = validate_and_send(rh, req)
+ assert res.read() == b'foobar'
+ res.close()
+
+ os.unlink(tf.name)
+
+ @pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
+ def test_http_error_returns_content(self, handler):
+ # urllib HTTPError will try to close the underlying response if the reference to the HTTPError object is lost
+ def get_response():
+ with handler() as rh:
+ # headers url
+ try:
+ validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/gen_404'))
+ except HTTPError as e:
+ return e.response
+
+ assert get_response().read() == b'<html></html>'
+
+ @pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
+ def test_verify_cert_error_text(self, handler):
+ # Check the output of the error message
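+ # The '.' in 'self.signed' below matches both the 'self signed'
+ # (OpenSSL 1.x) and 'self-signed' (OpenSSL 3) wordings of the error.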
+ with handler() as rh:
+ with pytest.raises(
+ CertificateVerifyError,
+ match=r'\[SSL: CERTIFICATE_VERIFY_FAILED\] certificate verify failed: self.signed certificate'
+ ):
+ validate_and_send(rh, Request(f'https://127.0.0.1:{self.https_port}/headers'))
+
+ @pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
+ @pytest.mark.parametrize('req,match,version_check', [
+ # https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1256
+ # bpo-39603: Check implemented in 3.7.9+, 3.8.5+
+ (
+ Request('http://127.0.0.1', method='GET\n'),
+ 'method can\'t contain control characters',
+ lambda v: v < (3, 7, 9) or (3, 8, 0) <= v < (3, 8, 5)
+ ),
+ # https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1265
+ # bpo-38576: Check implemented in 3.7.8+, 3.8.3+
+ (
+ Request('http://127.0.0. 1', method='GET'),
+ 'URL can\'t contain control characters',
+ lambda v: v < (3, 7, 8) or (3, 8, 0) <= v < (3, 8, 3)
+ ),
+ # https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1288C31-L1288C50
+ (Request('http://127.0.0.1', headers={'foo\n': 'bar'}), 'Invalid header name', None),
+ ])
+ def test_httplib_validation_errors(self, handler, req, match, version_check):
+ if version_check and version_check(sys.version_info):
+ pytest.skip(f'Python {sys.version} version does not have the required validation for this test.')
+
+ with handler() as rh:
+ with pytest.raises(RequestError, match=match) as exc_info:
+ validate_and_send(rh, req)
+ assert not isinstance(exc_info.value, TransportError)
+
+
+def run_validation(handler, error, req, **handler_kwargs):
+ with handler(**handler_kwargs) as rh:
+ if error:
+ with pytest.raises(error):
+ rh.validate(req)
+ else:
+ rh.validate(req)
+
+
+class TestRequestHandlerValidation:
+
+ class ValidationRH(RequestHandler):
+ def _send(self, request):
+ raise RequestError('test')
+
+ class NoCheckRH(ValidationRH):
+ _SUPPORTED_FEATURES = None
+ _SUPPORTED_PROXY_SCHEMES = None
+ _SUPPORTED_URL_SCHEMES = None
+
+ def _check_extensions(self, extensions):
+ extensions.clear()
+
+ class HTTPSupportedRH(ValidationRH):
+ _SUPPORTED_URL_SCHEMES = ('http',)
+
+ URL_SCHEME_TESTS = [
+ # scheme, expected to fail, handler kwargs
+ ('Urllib', [
+ ('http', False, {}),
+ ('https', False, {}),
+ ('data', False, {}),
+ ('ftp', False, {}),
+ ('file', UnsupportedRequest, {}),
+ ('file', False, {'enable_file_urls': True}),
+ ]),
+ (NoCheckRH, [('http', False, {})]),
+ (ValidationRH, [('http', UnsupportedRequest, {})])
+ ]
+
+ PROXY_SCHEME_TESTS = [
+ # scheme, expected to fail
+ ('Urllib', [
+ ('http', False),
+ ('https', UnsupportedRequest),
+ ('socks4', False),
+ ('socks4a', False),
+ ('socks5', False),
+ ('socks5h', False),
+ ('socks', UnsupportedRequest),
+ ]),
+ (NoCheckRH, [('http', False)]),
+ (HTTPSupportedRH, [('http', UnsupportedRequest)]),
+ ]
+
+ PROXY_KEY_TESTS = [
+ # key, expected to fail
+ ('Urllib', [
+ ('all', False),
+ ('unrelated', False),
+ ]),
+ (NoCheckRH, [('all', False)]),
+ (HTTPSupportedRH, [('all', UnsupportedRequest)]),
+ (HTTPSupportedRH, [('no', UnsupportedRequest)]),
+ ]
+
+ EXTENSION_TESTS = [
+ ('Urllib', [
+ ({'cookiejar': 'notacookiejar'}, AssertionError),
+ ({'cookiejar': YoutubeDLCookieJar()}, False),
+ ({'cookiejar': CookieJar()}, AssertionError),
+ ({'timeout': 1}, False),
+ ({'timeout': 'notatimeout'}, AssertionError),
+ ({'unsupported': 'value'}, UnsupportedRequest),
+ ]),
+ (NoCheckRH, [
+ ({'cookiejar': 'notacookiejar'}, False),
+ ({'somerandom': 'test'}, False), # but any extension is allowed through
+ ]),
+ ]
+
+ @pytest.mark.parametrize('handler,scheme,fail,handler_kwargs', [
+ (handler_tests[0], scheme, fail, handler_kwargs)
+ for handler_tests in URL_SCHEME_TESTS
+ for scheme, fail, handler_kwargs in handler_tests[1]
+
+ ], indirect=['handler'])
+ def test_url_scheme(self, handler, scheme, fail, handler_kwargs):
+ run_validation(handler, fail, Request(f'{scheme}://'), **(handler_kwargs or {}))
+
+ @pytest.mark.parametrize('handler,fail', [('Urllib', False)], indirect=['handler'])
+ def test_no_proxy(self, handler, fail):
+ run_validation(handler, fail, Request('http://', proxies={'no': '127.0.0.1,github.com'}))
+ run_validation(handler, fail, Request('http://'), proxies={'no': '127.0.0.1,github.com'})
+
+ @pytest.mark.parametrize('handler,proxy_key,fail', [
+ (handler_tests[0], proxy_key, fail)
+ for handler_tests in PROXY_KEY_TESTS
+ for proxy_key, fail in handler_tests[1]
+ ], indirect=['handler'])
+ def test_proxy_key(self, handler, proxy_key, fail):
+ run_validation(handler, fail, Request('http://', proxies={proxy_key: 'http://example.com'}))
+ run_validation(handler, fail, Request('http://'), proxies={proxy_key: 'http://example.com'})
+
+ @pytest.mark.parametrize('handler,scheme,fail', [
+ (handler_tests[0], scheme, fail)
+ for handler_tests in PROXY_SCHEME_TESTS
+ for scheme, fail in handler_tests[1]
+ ], indirect=['handler'])
+ def test_proxy_scheme(self, handler, scheme, fail):
+ run_validation(handler, fail, Request('http://', proxies={'http': f'{scheme}://example.com'}))
+ run_validation(handler, fail, Request('http://'), proxies={'http': f'{scheme}://example.com'})
+
+ @pytest.mark.parametrize('handler', ['Urllib', HTTPSupportedRH], indirect=True)
+ def test_empty_proxy(self, handler):
+ run_validation(handler, False, Request('http://', proxies={'http': None}))
+ run_validation(handler, False, Request('http://'), proxies={'http': None})
+
+ @pytest.mark.parametrize('proxy_url', ['//example.com', 'example.com', '127.0.0.1', '/a/b/c'])
+ @pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
+ def test_invalid_proxy_url(self, handler, proxy_url):
+ run_validation(handler, UnsupportedRequest, Request('http://', proxies={'http': proxy_url}))
+
+ @pytest.mark.parametrize('handler,extensions,fail', [
+ (handler_tests[0], extensions, fail)
+ for handler_tests in EXTENSION_TESTS
+ for extensions, fail in handler_tests[1]
+ ], indirect=['handler'])
+ def test_extension(self, handler, extensions, fail):
+ run_validation(
+ handler, fail, Request('http://', extensions=extensions))
+
+ def test_invalid_request_type(self):
+ rh = self.ValidationRH(logger=FakeLogger())
+ for method in (rh.validate, rh.send):
+ with pytest.raises(TypeError, match='Expected an instance of Request'):
+ method('not a request')
+
+
+class FakeResponse(Response):
+ def __init__(self, request):
+ # XXX: we could make request part of standard response interface
+ self.request = request
+ super().__init__(fp=io.BytesIO(b''), headers={}, url=request.url)
+
+
+class FakeRH(RequestHandler):
+
+ def _validate(self, request):
+ return
+
+ def _send(self, request: Request):
+ if request.url.startswith('ssl://'):
+ raise SSLError(request.url[len('ssl://'):])
+ return FakeResponse(request)
+
+
+class FakeRHYDL(FakeYDL):
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self._request_director = self.build_request_director([FakeRH])
+
+
+class TestRequestDirector:
+
+ def test_handler_operations(self):
+ director = RequestDirector(logger=FakeLogger())
+ handler = FakeRH(logger=FakeLogger())
+ director.add_handler(handler)
+ assert director.handlers.get(FakeRH.RH_KEY) is handler
+
+ # Adding a handler with the same RH_KEY should overwrite the existing one
+ handler2 = FakeRH(logger=FakeLogger())
+ director.add_handler(handler2)
+ assert director.handlers.get(FakeRH.RH_KEY) is not handler
+ assert director.handlers.get(FakeRH.RH_KEY) is handler2
+ assert len(director.handlers) == 1
+
+ class AnotherFakeRH(FakeRH):
+ pass
+ director.add_handler(AnotherFakeRH(logger=FakeLogger()))
+ assert len(director.handlers) == 2
+ assert director.handlers.get(AnotherFakeRH.RH_KEY).RH_KEY == AnotherFakeRH.RH_KEY
+
+ director.handlers.pop(FakeRH.RH_KEY, None)
+ assert director.handlers.get(FakeRH.RH_KEY) is None
+ assert len(director.handlers) == 1
+
+ # RequestErrors should pass through
+ with pytest.raises(SSLError):
+ director.send(Request('ssl://something'))
+
+ def test_send(self):
+ director = RequestDirector(logger=FakeLogger())
+ with pytest.raises(RequestError):
+ director.send(Request('any://'))
+ director.add_handler(FakeRH(logger=FakeLogger()))
+ assert isinstance(director.send(Request('http://')), FakeResponse)
+
+ def test_unsupported_handlers(self):
+ class SupportedRH(RequestHandler):
+ _SUPPORTED_URL_SCHEMES = ['http']
+
+ def _send(self, request: Request):
+ return Response(fp=io.BytesIO(b'supported'), headers={}, url=request.url)
+
+ director = RequestDirector(logger=FakeLogger())
+ director.add_handler(SupportedRH(logger=FakeLogger()))
+ director.add_handler(FakeRH(logger=FakeLogger()))
+
+ # First should take preference
+ assert director.send(Request('http://')).read() == b'supported'
+ assert director.send(Request('any://')).read() == b''
+
+ director.handlers.pop(FakeRH.RH_KEY)
+ with pytest.raises(NoSupportingHandlers):
+ director.send(Request('any://'))
+
+ def test_unexpected_error(self):
+ director = RequestDirector(logger=FakeLogger())
+
+ class UnexpectedRH(FakeRH):
+ def _send(self, request: Request):
+ raise TypeError('something')
+
+ director.add_handler(UnexpectedRH(logger=FakeLogger()))
+ with pytest.raises(NoSupportingHandlers, match=r'1 unexpected error'):
+ director.send(Request('any://'))
+
+ director.handlers.clear()
+ assert len(director.handlers) == 0
+
+ # Should not be fatal
+ director.add_handler(FakeRH(logger=FakeLogger()))
+ director.add_handler(UnexpectedRH(logger=FakeLogger()))
+ assert director.send(Request('any://'))
+
+ def test_preference(self):
+ director = RequestDirector(logger=FakeLogger())
+ director.add_handler(FakeRH(logger=FakeLogger()))
+
+ class SomeRH(RequestHandler):
+ _SUPPORTED_URL_SCHEMES = ['http']
+
+ def _send(self, request: Request):
+ return Response(fp=io.BytesIO(b'supported'), headers={}, url=request.url)
+
+ def some_preference(rh, request):
+ return (0 if not isinstance(rh, SomeRH)
+ else 100 if 'prefer' in request.headers
+ else -1)
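+ # Higher scores win: the director consults each registered preference
+ # for every handler and prefers the highest-ranked one.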
+
+ director.add_handler(SomeRH(logger=FakeLogger()))
+ director.preferences.add(some_preference)
+
+ assert director.send(Request('http://')).read() == b''
+ assert director.send(Request('http://', headers={'prefer': '1'})).read() == b'supported'
+
+
+# XXX: do we want to move this to test_YoutubeDL.py?
+class TestYoutubeDLNetworking:
+
+ @staticmethod
+ def build_handler(ydl, handler: RequestHandler = FakeRH):
+ return ydl.build_request_director([handler]).handlers.get(handler.RH_KEY)
+
+ def test_compat_opener(self):
+ with FakeYDL() as ydl:
+ with warnings.catch_warnings():
+ warnings.simplefilter('ignore', category=DeprecationWarning)
+ assert isinstance(ydl._opener, urllib.request.OpenerDirector)
+
+ @pytest.mark.parametrize('proxy,expected', [
+ ('http://127.0.0.1:8080', {'all': 'http://127.0.0.1:8080'}),
+ ('', {'all': '__noproxy__'}),
+ (None, {'http': 'http://127.0.0.1:8081', 'https': 'http://127.0.0.1:8081'}) # env, set https
+ ])
+ def test_proxy(self, proxy, expected):
+ old_http_proxy = os.environ.get('HTTP_PROXY')
+ try:
+ os.environ['HTTP_PROXY'] = 'http://127.0.0.1:8081' # ensure that provided proxies override env
+ with FakeYDL({'proxy': proxy}) as ydl:
+ assert ydl.proxies == expected
+ finally:
+ if old_http_proxy:
+ os.environ['HTTP_PROXY'] = old_http_proxy
+
+ def test_compat_request(self):
+ with FakeRHYDL() as ydl:
+ assert ydl.urlopen('test://')
+ urllib_req = urllib.request.Request('http://foo.bar', data=b'test', method='PUT', headers={'X-Test': '1'})
+ urllib_req.add_unredirected_header('Cookie', 'bob=bob')
+ urllib_req.timeout = 2
+ with warnings.catch_warnings():
+ warnings.simplefilter('ignore', category=DeprecationWarning)
+ req = ydl.urlopen(urllib_req).request
+ assert req.url == urllib_req.get_full_url()
+ assert req.data == urllib_req.data
+ assert req.method == urllib_req.get_method()
+ assert 'X-Test' in req.headers
+ assert 'Cookie' in req.headers
+ assert req.extensions.get('timeout') == 2
+
+ with pytest.raises(AssertionError):
+ ydl.urlopen(None)
+
+ def test_extract_basic_auth(self):
+ with FakeRHYDL() as ydl:
+ res = ydl.urlopen(Request('http://user:pass@foo.bar'))
+ assert res.request.headers['Authorization'] == 'Basic dXNlcjpwYXNz'
+
+ def test_sanitize_url(self):
+ with FakeRHYDL() as ydl:
+ res = ydl.urlopen(Request('httpss://foo.bar'))
+ assert res.request.url == 'https://foo.bar'
+
+ def test_file_urls_error(self):
+ # use urllib handler
+ with FakeYDL() as ydl:
+ with pytest.raises(RequestError, match=r'file:// URLs are disabled by default'):
+ ydl.urlopen('file://')
+
+ def test_legacy_server_connect_error(self):
+ with FakeRHYDL() as ydl:
+ for error in ('UNSAFE_LEGACY_RENEGOTIATION_DISABLED', 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
+ with pytest.raises(RequestError, match=r'Try using --legacy-server-connect'):
+ ydl.urlopen(f'ssl://{error}')
+
+ with pytest.raises(SSLError, match='testerror'):
+ ydl.urlopen('ssl://testerror')
+
+ @pytest.mark.parametrize('proxy_key,proxy_url,expected', [
+ ('http', '__noproxy__', None),
+ ('no', '127.0.0.1,foo.bar', '127.0.0.1,foo.bar'),
+ ('https', 'example.com', 'http://example.com'),
+ ('https', '//example.com', 'http://example.com'),
+ ('https', 'socks5://example.com', 'socks5h://example.com'),
+ ('http', 'socks://example.com', 'socks4://example.com'),
+ ('http', 'socks4://example.com', 'socks4://example.com'),
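+ # 'socks' defaults to socks4, and socks5 is mapped to socks5h so that
+ # hostname resolution happens on the proxy side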
+ ('unrelated', '/bad/proxy', '/bad/proxy'), # clean_proxies should ignore bad proxies
+ ])
+ def test_clean_proxy(self, proxy_key, proxy_url, expected):
+ # proxies should be cleaned in urlopen()
+ with FakeRHYDL() as ydl:
+ req = ydl.urlopen(Request('test://', proxies={proxy_key: proxy_url})).request
+ assert req.proxies[proxy_key] == expected
+
+ # and should also be cleaned when building the handler
+ env_key = f'{proxy_key.upper()}_PROXY'
+ old_env_proxy = os.environ.get(env_key)
+ try:
+ os.environ[env_key] = proxy_url # ensure that provided proxies override env
+ with FakeYDL() as ydl:
+ rh = self.build_handler(ydl)
+ assert rh.proxies[proxy_key] == expected
+ finally:
+ if old_env_proxy:
+ os.environ[env_key] = old_env_proxy
+
+ def test_clean_proxy_header(self):
+ with FakeRHYDL() as ydl:
+ req = ydl.urlopen(Request('test://', headers={'ytdl-request-proxy': '//foo.bar'})).request
+ assert 'ytdl-request-proxy' not in req.headers
+ assert req.proxies == {'all': 'http://foo.bar'}
+
+ with FakeYDL({'http_headers': {'ytdl-request-proxy': '//foo.bar'}}) as ydl:
+ rh = self.build_handler(ydl)
+ assert 'ytdl-request-proxy' not in rh.headers
+ assert rh.proxies == {'all': 'http://foo.bar'}
+
+ def test_clean_header(self):
+ with FakeRHYDL() as ydl:
+ res = ydl.urlopen(Request('test://', headers={'Youtubedl-no-compression': True}))
+ assert 'Youtubedl-no-compression' not in res.request.headers
+ assert res.request.headers.get('Accept-Encoding') == 'identity'
+
+ with FakeYDL({'http_headers': {'Youtubedl-no-compression': True}}) as ydl:
+ rh = self.build_handler(ydl)
+ assert 'Youtubedl-no-compression' not in rh.headers
+ assert rh.headers.get('Accept-Encoding') == 'identity'
+
+ def test_build_handler_params(self):
+ with FakeYDL({
+ 'http_headers': {'test': 'testtest'},
+ 'socket_timeout': 2,
+ 'proxy': 'http://127.0.0.1:8080',
+ 'source_address': '127.0.0.45',
+ 'debug_printtraffic': True,
+ 'compat_opts': ['no-certifi'],
+ 'nocheckcertificate': True,
+ 'legacyserverconnect': True,
+ }) as ydl:
+ rh = self.build_handler(ydl)
+ assert rh.headers.get('test') == 'testtest'
+ assert 'Accept' in rh.headers # ensure std_headers are still there
+ assert rh.timeout == 2
+ assert rh.proxies.get('all') == 'http://127.0.0.1:8080'
+ assert rh.source_address == '127.0.0.45'
+ assert rh.verbose is True
+ assert rh.prefer_system_certs is True
+ assert rh.verify is False
+ assert rh.legacy_ssl_support is True
+
+ @pytest.mark.parametrize('ydl_params', [
+ {'client_certificate': 'fakecert.crt'},
+ {'client_certificate': 'fakecert.crt', 'client_certificate_key': 'fakekey.key'},
+ {'client_certificate': 'fakecert.crt', 'client_certificate_key': 'fakekey.key', 'client_certificate_password': 'foobar'},
+ {'client_certificate_key': 'fakekey.key', 'client_certificate_password': 'foobar'},
+ ])
+ def test_client_certificate(self, ydl_params):
+ with FakeYDL(ydl_params) as ydl:
+ rh = self.build_handler(ydl)
+ assert rh._client_cert == ydl_params # XXX: Too bound to implementation
+
+ def test_urllib_file_urls(self):
+ with FakeYDL({'enable_file_urls': False}) as ydl:
+ rh = self.build_handler(ydl, UrllibRH)
+ assert rh.enable_file_urls is False
+
+ with FakeYDL({'enable_file_urls': True}) as ydl:
+ rh = self.build_handler(ydl, UrllibRH)
+ assert rh.enable_file_urls is True
+
+
+class TestRequest:
+
+ def test_query(self):
+ req = Request('http://example.com?q=something', query={'v': 'xyz'})
+ assert req.url == 'http://example.com?q=something&v=xyz'
+
+ req.update(query={'v': '123'})
+ assert req.url == 'http://example.com?q=something&v=123'
+ req.update(url='http://example.com', query={'v': 'xyz'})
+ assert req.url == 'http://example.com?v=xyz'
+
+ def test_method(self):
+ req = Request('http://example.com')
+ assert req.method == 'GET'
+ req.data = b'test'
+ assert req.method == 'POST'
+ req.data = None
+ assert req.method == 'GET'
+ req.data = b'test2'
+ req.method = 'PUT'
+ assert req.method == 'PUT'
+ req.data = None
+ assert req.method == 'PUT'
+ with pytest.raises(TypeError):
+ req.method = 1
+
+ def test_request_helpers(self):
+ assert HEADRequest('http://example.com').method == 'HEAD'
+ assert PUTRequest('http://example.com').method == 'PUT'
+
+ def test_headers(self):
+ req = Request('http://example.com', headers={'tesT': 'test'})
+ assert req.headers == HTTPHeaderDict({'test': 'test'})
+ req.update(headers={'teSt2': 'test2'})
+ assert req.headers == HTTPHeaderDict({'test': 'test', 'test2': 'test2'})
+
+ req.headers = new_headers = HTTPHeaderDict({'test': 'test'})
+ assert req.headers == HTTPHeaderDict({'test': 'test'})
+ assert req.headers is new_headers
+
+ # test converts dict to case insensitive dict
+ req.headers = new_headers = {'test2': 'test2'}
+ assert isinstance(req.headers, HTTPHeaderDict)
+ assert req.headers is not new_headers
+
+ with pytest.raises(TypeError):
+ req.headers = None
+
+ def test_data_type(self):
+ req = Request('http://example.com')
+ assert req.data is None
+ # test bytes is allowed
+ req.data = b'test'
+ assert req.data == b'test'
+ # test iterable of bytes is allowed
+ i = [b'test', b'test2']
+ req.data = i
+ assert req.data == i
+
+ # test file-like object is allowed
+ f = io.BytesIO(b'test')
+ req.data = f
+ assert req.data == f
+
+ # common mistake: test str not allowed
+ with pytest.raises(TypeError):
+ req.data = 'test'
+ assert req.data != 'test'
+
+ # common mistake: test dict is not allowed
+ with pytest.raises(TypeError):
+ req.data = {'test': 'test'}
+ assert req.data != {'test': 'test'}
+
+ def test_content_length_header(self):
+ req = Request('http://example.com', headers={'Content-Length': '0'}, data=b'')
+ assert req.headers.get('Content-Length') == '0'
+
+ req.data = b'test'
+ assert 'Content-Length' not in req.headers
+
+ req = Request('http://example.com', headers={'Content-Length': '10'})
+ assert 'Content-Length' not in req.headers
+
+ def test_content_type_header(self):
+ req = Request('http://example.com', headers={'Content-Type': 'test'}, data=b'test')
+ assert req.headers.get('Content-Type') == 'test'
+ req.data = b'test2'
+ assert req.headers.get('Content-Type') == 'test'
+ req.data = None
+ assert 'Content-Type' not in req.headers
+ req.data = b'test3'
+ assert req.headers.get('Content-Type') == 'application/x-www-form-urlencoded'
+
+ def test_update_req(self):
+ req = Request('http://example.com')
+ assert req.data is None
+ assert req.method == 'GET'
+ assert 'Content-Type' not in req.headers
+ # Test that zero-byte payloads will be sent
+ req.update(data=b'')
+ assert req.data == b''
+ assert req.method == 'POST'
+ assert req.headers.get('Content-Type') == 'application/x-www-form-urlencoded'
+
+ def test_proxies(self):
+ req = Request(url='http://example.com', proxies={'http': 'http://127.0.0.1:8080'})
+ assert req.proxies == {'http': 'http://127.0.0.1:8080'}
+
+ def test_extensions(self):
+ req = Request(url='http://example.com', extensions={'timeout': 2})
+ assert req.extensions == {'timeout': 2}
+
+ def test_copy(self):
+ req = Request(
+ url='http://example.com',
+ extensions={'cookiejar': CookieJar()},
+ headers={'Accept-Encoding': 'br'},
+ proxies={'http': 'http://127.0.0.1'},
+ data=[b'123']
+ )
+ req_copy = req.copy()
+ assert req_copy is not req
+ assert req_copy.url == req.url
+ assert req_copy.headers == req.headers
+ assert req_copy.headers is not req.headers
+ assert req_copy.proxies == req.proxies
+ assert req_copy.proxies is not req.proxies
+
+ # Data is not able to be copied
+ assert req_copy.data == req.data
+ assert req_copy.data is req.data
+
+ # Shallow copy extensions
+ assert req_copy.extensions is not req.extensions
+ assert req_copy.extensions['cookiejar'] == req.extensions['cookiejar']
+
+ # Subclasses are copied by default
+ class AnotherRequest(Request):
+ pass
+
+ req = AnotherRequest(url='http://127.0.0.1')
+ assert isinstance(req.copy(), AnotherRequest)
+
+ def test_url(self):
+ req = Request(url='https://фtest.example.com/ some spaceв?ä=c',)
+ assert req.url == 'https://xn--test-z6d.example.com/%20some%20space%D0%B2?%C3%A4=c'
+
+ assert Request(url='//example.com').url == 'http://example.com'
+
+ with pytest.raises(TypeError):
+ Request(url='https://').url = None
+
+
+class TestResponse:
+
+ @pytest.mark.parametrize('reason,status,expected', [
+ ('custom', 200, 'custom'),
+ (None, 404, 'Not Found'), # fallback status
+ ('', 403, 'Forbidden'),
+ (None, 999, None)
+ ])
+ def test_reason(self, reason, status, expected):
+ res = Response(io.BytesIO(b''), url='test://', headers={}, status=status, reason=reason)
+ assert res.reason == expected
+
+ def test_headers(self):
+ headers = Message()
+ headers.add_header('Test', 'test')
+ headers.add_header('Test', 'test2')
+ headers.add_header('content-encoding', 'br')
+ res = Response(io.BytesIO(b''), headers=headers, url='test://')
+ assert res.headers.get_all('test') == ['test', 'test2']
+ assert 'Content-Encoding' in res.headers
+
+ def test_get_header(self):
+ headers = Message()
+ headers.add_header('Set-Cookie', 'cookie1')
+ headers.add_header('Set-cookie', 'cookie2')
+ headers.add_header('Test', 'test')
+ headers.add_header('Test', 'test2')
+ res = Response(io.BytesIO(b''), headers=headers, url='test://')
+ assert res.get_header('test') == 'test, test2'
+ assert res.get_header('set-Cookie') == 'cookie1'
+ assert res.get_header('notexist', 'default') == 'default'
+
+ def test_compat(self):
+ res = Response(io.BytesIO(b''), url='test://', status=404, headers={'test': 'test'})
+ with warnings.catch_warnings():
+ warnings.simplefilter('ignore', category=DeprecationWarning)
+ assert res.code == res.getcode() == res.status
+ assert res.geturl() == res.url
+ assert res.info() is res.headers
+ assert res.getheader('test') == res.get_header('test')
diff --git a/test/test_networking_utils.py b/test/test_networking_utils.py
new file mode 100644
index 0000000..71cd214
--- /dev/null
+++ b/test/test_networking_utils.py
@@ -0,0 +1,282 @@
+#!/usr/bin/env python3
+
+# Allow direct execution
+import os
+import sys
+
+import pytest
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import contextlib
+import io
+import platform
+import random
+import ssl
+import urllib.error
+import warnings
+
+from hypervideo_dl.cookies import YoutubeDLCookieJar
+from hypervideo_dl.dependencies import certifi
+from hypervideo_dl.networking import Response
+from hypervideo_dl.networking._helper import (
+ InstanceStoreMixin,
+ add_accept_encoding_header,
+ get_redirect_method,
+ make_socks_proxy_opts,
+ select_proxy,
+ ssl_load_certs,
+)
+from hypervideo_dl.networking.exceptions import (
+ HTTPError,
+ IncompleteRead,
+ _CompatHTTPError,
+)
+from hypervideo_dl.socks import ProxyType
+from hypervideo_dl.utils.networking import HTTPHeaderDict
+
+TEST_DIR = os.path.dirname(os.path.abspath(__file__))
+
+
+class TestNetworkingUtils:
+
+ def test_select_proxy(self):
+ proxies = {
+ 'all': 'socks5://example.com',
+ 'http': 'http://example.com:1080',
+ 'no': 'bypass.example.com,yt-dl.org'
+ }
+
+ assert select_proxy('https://example.com', proxies) == proxies['all']
+ assert select_proxy('http://example.com', proxies) == proxies['http']
+ assert select_proxy('http://bypass.example.com', proxies) is None
+ assert select_proxy('https://yt-dl.org', proxies) is None
+
+ @pytest.mark.parametrize('socks_proxy,expected', [
+ ('socks5h://example.com', {
+ 'proxytype': ProxyType.SOCKS5,
+ 'addr': 'example.com',
+ 'port': 1080,
+ 'rdns': True,
+ 'username': None,
+ 'password': None
+ }),
+ ('socks5://user:@example.com:5555', {
+ 'proxytype': ProxyType.SOCKS5,
+ 'addr': 'example.com',
+ 'port': 5555,
+ 'rdns': False,
+ 'username': 'user',
+ 'password': ''
+ }),
+ ('socks4://u%40ser:pa%20ss@127.0.0.1:1080', {
+ 'proxytype': ProxyType.SOCKS4,
+ 'addr': '127.0.0.1',
+ 'port': 1080,
+ 'rdns': False,
+ 'username': 'u@ser',
+ 'password': 'pa ss'
+ }),
+ ('socks4a://:pa%20ss@127.0.0.1', {
+ 'proxytype': ProxyType.SOCKS4A,
+ 'addr': '127.0.0.1',
+ 'port': 1080,
+ 'rdns': True,
+ 'username': '',
+ 'password': 'pa ss'
+ })
+ ])
+ def test_make_socks_proxy_opts(self, socks_proxy, expected):
+ assert make_socks_proxy_opts(socks_proxy) == expected
+
+ def test_make_socks_proxy_unknown(self):
+ with pytest.raises(ValueError, match='Unknown SOCKS proxy version: socks'):
+ make_socks_proxy_opts('socks://127.0.0.1')
+
+ @pytest.mark.skipif(not certifi, reason='certifi is not installed')
+ def test_load_certifi(self):
+ context_certifi = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
+ context_certifi.load_verify_locations(cafile=certifi.where())
+ context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
+ ssl_load_certs(context, use_certifi=True)
+ assert context.get_ca_certs() == context_certifi.get_ca_certs()
+
+ context_default = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
+ context_default.load_default_certs()
+ context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
+ ssl_load_certs(context, use_certifi=False)
+ assert context.get_ca_certs() == context_default.get_ca_certs()
+
+ if context_default.get_ca_certs() == context_certifi.get_ca_certs():
+ pytest.skip('System uses certifi as default. The test is not valid')
+
+ @pytest.mark.parametrize('method,status,expected', [
+ ('GET', 303, 'GET'),
+ ('HEAD', 303, 'HEAD'),
+ ('PUT', 303, 'GET'),
+ ('POST', 301, 'GET'),
+ ('HEAD', 301, 'HEAD'),
+ ('POST', 302, 'GET'),
+ ('HEAD', 302, 'HEAD'),
+ ('PUT', 302, 'PUT'),
+ ('POST', 308, 'POST'),
+ ('POST', 307, 'POST'),
+ ('HEAD', 308, 'HEAD'),
+ ('HEAD', 307, 'HEAD'),
+ ])
+ def test_get_redirect_method(self, method, status, expected):
+ assert get_redirect_method(method, status) == expected
+
+ @pytest.mark.parametrize('headers,supported_encodings,expected', [
+ ({'Accept-Encoding': 'br'}, ['gzip', 'br'], {'Accept-Encoding': 'br'}),
+ ({}, ['gzip', 'br'], {'Accept-Encoding': 'gzip, br'}),
+ ({'Content-type': 'application/json'}, [], {'Content-type': 'application/json', 'Accept-Encoding': 'identity'}),
+ ])
+ def test_add_accept_encoding_header(self, headers, supported_encodings, expected):
+ headers = HTTPHeaderDict(headers)
+ add_accept_encoding_header(headers, supported_encodings)
+ assert headers == HTTPHeaderDict(expected)
+
+
+class TestInstanceStoreMixin:
+
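+ # instances are random ints, so two lookups only compare equal when the second hits the cache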
+ class FakeInstanceStoreMixin(InstanceStoreMixin):
+ def _create_instance(self, **kwargs):
+ return random.randint(0, 1000000)
+
+ def _close_instance(self, instance):
+ pass
+
+ def test_mixin(self):
+ mixin = self.FakeInstanceStoreMixin()
+ assert mixin._get_instance(d={'a': 1, 'b': 2, 'c': {'d', 4}}) == mixin._get_instance(d={'a': 1, 'b': 2, 'c': {'d', 4}})
+
+ assert mixin._get_instance(d={'a': 1, 'b': 2, 'c': {'e', 4}}) != mixin._get_instance(d={'a': 1, 'b': 2, 'c': {'d', 4}})
+
+ assert mixin._get_instance(d={'a': 1, 'b': 2, 'c': {'d', 4}}) != mixin._get_instance(d={'a': 1, 'b': 2, 'g': {'d', 4}})
+
+ assert mixin._get_instance(d={'a': 1}, e=[1, 2, 3]) == mixin._get_instance(d={'a': 1}, e=[1, 2, 3])
+
+ assert mixin._get_instance(d={'a': 1}, e=[1, 2, 3]) != mixin._get_instance(d={'a': 1}, e=[1, 2, 3, 4])
+
+ cookiejar = YoutubeDLCookieJar()
+ assert mixin._get_instance(b=[1, 2], c=cookiejar) == mixin._get_instance(b=[1, 2], c=cookiejar)
+
+ assert mixin._get_instance(b=[1, 2], c=cookiejar) != mixin._get_instance(b=[1, 2], c=YoutubeDLCookieJar())
+
+ # Different order
+ assert mixin._get_instance(c=cookiejar, b=[1, 2]) == mixin._get_instance(b=[1, 2], c=cookiejar)
+
+ m = mixin._get_instance(t=1234)
+ assert mixin._get_instance(t=1234) == m
+ mixin._clear_instances()
+ assert mixin._get_instance(t=1234) != m
+
+
+class TestNetworkingExceptions:
+
+ @staticmethod
+ def create_response(status):
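+ # the mixed-case 'tesT' header exercises case-insensitive lookups below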
+ return Response(fp=io.BytesIO(b'test'), url='http://example.com', headers={'tesT': 'test'}, status=status)
+
+ @pytest.mark.parametrize('http_error_class', [HTTPError, lambda r: _CompatHTTPError(HTTPError(r))])
+ def test_http_error(self, http_error_class):
+
+ response = self.create_response(403)
+ error = http_error_class(response)
+
+ assert error.status == 403
+ assert str(error) == error.msg == 'HTTP Error 403: Forbidden'
+ assert error.reason == response.reason
+ assert error.response is response
+
+ data = error.response.read()
+ assert data == b'test'
+ assert repr(error) == '<HTTPError 403: Forbidden>'
+
+ @pytest.mark.parametrize('http_error_class', [HTTPError, lambda *args, **kwargs: _CompatHTTPError(HTTPError(*args, **kwargs))])
+ def test_redirect_http_error(self, http_error_class):
+ response = self.create_response(301)
+ error = http_error_class(response, redirect_loop=True)
+ assert str(error) == error.msg == 'HTTP Error 301: Moved Permanently (redirect loop detected)'
+ assert error.reason == 'Moved Permanently'
+
+ def test_compat_http_error(self):
+ response = self.create_response(403)
+ error = _CompatHTTPError(HTTPError(response))
+ assert isinstance(error, HTTPError)
+ assert isinstance(error, urllib.error.HTTPError)
+
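+ # every deprecated urllib-compat attribute access must emit exactly one DeprecationWarning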
+ @contextlib.contextmanager
+ def raises_deprecation_warning():
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter('always')
+ yield
+
+ if len(w) == 0:
+ pytest.fail('Did not raise DeprecationWarning')
+ if len(w) > 1:
+ pytest.fail(f'Raised multiple warnings: {w}')
+
+ if not issubclass(w[-1].category, DeprecationWarning):
+ pytest.fail(f'Expected DeprecationWarning, got {w[-1].category}')
+ w.clear()
+
+ with raises_deprecation_warning():
+ assert error.code == 403
+
+ with raises_deprecation_warning():
+ assert error.getcode() == 403
+
+ with raises_deprecation_warning():
+ assert error.hdrs is error.response.headers
+
+ with raises_deprecation_warning():
+ assert error.info() is error.response.headers
+
+ with raises_deprecation_warning():
+ assert error.headers is error.response.headers
+
+ with raises_deprecation_warning():
+ assert error.filename == error.response.url
+
+ with raises_deprecation_warning():
+ assert error.url == error.response.url
+
+ with raises_deprecation_warning():
+ assert error.geturl() == error.response.url
+
+ # Passthrough file operations
+ with raises_deprecation_warning():
+ assert error.read() == b'test'
+
+ with raises_deprecation_warning():
+ assert not error.closed
+
+ with raises_deprecation_warning():
+ # Response operations are technically passed through as well, but should not be relied upon.
+ assert error.get_header('test') == 'test'
+
+ # Should not raise a warning
+ error.close()
+
+ @pytest.mark.skipif(
+ platform.python_implementation() == 'PyPy', reason='garbage collector works differently in pypy')
+ def test_compat_http_error_autoclose(self):
+ # Compat HTTPError should not autoclose response
+ response = self.create_response(403)
+ _CompatHTTPError(HTTPError(response))
+ assert not response.closed
+
+ def test_incomplete_read_error(self):
+ error = IncompleteRead(b'test', 3, cause='test')
+ assert isinstance(error, IncompleteRead)
+ assert repr(error) == '<IncompleteRead: 4 bytes read, 3 more expected>'
+ assert str(error) == error.msg == '4 bytes read, 3 more expected'
+ assert error.partial == b'test'
+ assert error.expected == 3
+ assert error.cause == 'test'
+
+ error = IncompleteRead(b'aaa')
+ assert repr(error) == '<IncompleteRead: 3 bytes read>'
+ assert str(error) == '3 bytes read'
diff --git a/test/test_plugins.py b/test/test_plugins.py
new file mode 100644
index 0000000..38ca87c
--- /dev/null
+++ b/test/test_plugins.py
@@ -0,0 +1,73 @@
+import importlib
+import os
+import shutil
+import sys
+import unittest
+from pathlib import Path
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+TEST_DATA_DIR = Path(os.path.dirname(os.path.abspath(__file__)), 'testdata')
+sys.path.append(str(TEST_DATA_DIR))
+importlib.invalidate_caches()
+
+from hypervideo_dl.plugins import PACKAGE_NAME, directories, load_plugins
+
+
+class TestPlugins(unittest.TestCase):
+
+ TEST_PLUGIN_DIR = TEST_DATA_DIR / PACKAGE_NAME
+
+ def test_directories_containing_plugins(self):
+ self.assertIn(self.TEST_PLUGIN_DIR, map(Path, directories()))
+
+ def test_extractor_classes(self):
+ for module_name in tuple(sys.modules):
+ if module_name.startswith(f'{PACKAGE_NAME}.extractor'):
+ del sys.modules[module_name]
+ plugins_ie = load_plugins('extractor', 'IE')
+
+ self.assertIn(f'{PACKAGE_NAME}.extractor.normal', sys.modules.keys())
+ self.assertIn('NormalPluginIE', plugins_ie.keys())
+
+ # Don't load modules with an underscore prefix
+ self.assertNotIn(
+ f'{PACKAGE_NAME}.extractor._ignore', sys.modules.keys(),
+ 'loaded module beginning with underscore')
+ self.assertNotIn('IgnorePluginIE', plugins_ie.keys())
+
+ # Don't load extractors with underscore prefix
+ self.assertNotIn('_IgnoreUnderscorePluginIE', plugins_ie.keys())
+
+ # Don't load extractors not specified in __all__ (if supplied)
+ self.assertNotIn('IgnoreNotInAllPluginIE', plugins_ie.keys())
+ self.assertIn('InAllPluginIE', plugins_ie.keys())
+
+ def test_postprocessor_classes(self):
+ plugins_pp = load_plugins('postprocessor', 'PP')
+ self.assertIn('NormalPluginPP', plugins_pp.keys())
+
+ def test_importing_zipped_module(self):
+ zip_path = TEST_DATA_DIR / 'zipped_plugins.zip'
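+ # zip the zipped_plugins directory into zipped_plugins.zip ([:-4] strips the extension)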
+ shutil.make_archive(str(zip_path)[:-4], 'zip', str(zip_path)[:-4])
+ sys.path.append(str(zip_path)) # add zip to search paths
+ importlib.invalidate_caches() # reset the import caches
+
+ try:
+ for plugin_type in ('extractor', 'postprocessor'):
+ package = importlib.import_module(f'{PACKAGE_NAME}.{plugin_type}')
+ self.assertIn(zip_path / PACKAGE_NAME / plugin_type, map(Path, package.__path__))
+
+ plugins_ie = load_plugins('extractor', 'IE')
+ self.assertIn('ZippedPluginIE', plugins_ie.keys())
+
+ plugins_pp = load_plugins('postprocessor', 'PP')
+ self.assertIn('ZippedPluginPP', plugins_pp.keys())
+
+ finally:
+ sys.path.remove(str(zip_path))
+ os.remove(zip_path)
+ importlib.invalidate_caches() # reset the import caches
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/test/test_socks.py b/test/test_socks.py
index 6651290..73047ec 100644
--- a/test/test_socks.py
+++ b/test/test_socks.py
@@ -1,113 +1,470 @@
#!/usr/bin/env python3
-
# Allow direct execution
import os
import sys
+import threading
import unittest
-sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+import pytest
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+import abc
+import contextlib
+import enum
+import functools
+import http.server
+import json
import random
-import subprocess
-import urllib.request
+import socket
+import struct
+import time
+from socketserver import (
+ BaseRequestHandler,
+ StreamRequestHandler,
+ ThreadingTCPServer,
+)
-from test.helper import FakeYDL, get_params, is_download_test
+from test.helper import http_server_port
+from hypervideo_dl.networking import Request
+from hypervideo_dl.networking.exceptions import ProxyError, TransportError
+from hypervideo_dl.socks import (
+ SOCKS4_REPLY_VERSION,
+ SOCKS4_VERSION,
+ SOCKS5_USER_AUTH_SUCCESS,
+ SOCKS5_USER_AUTH_VERSION,
+ SOCKS5_VERSION,
+ Socks5AddressType,
+ Socks5Auth,
+)
+SOCKS5_USER_AUTH_FAILURE = 0x1
-@is_download_test
-class TestMultipleSocks(unittest.TestCase):
- @staticmethod
- def _check_params(attrs):
- params = get_params()
- for attr in attrs:
- if attr not in params:
- print('Missing %s. Skipping.' % attr)
- return
- return params
- def test_proxy_http(self):
- params = self._check_params(['primary_proxy', 'primary_server_ip'])
- if params is None:
+class Socks4CD(enum.IntEnum):
+ REQUEST_GRANTED = 90
+ REQUEST_REJECTED_OR_FAILED = 91
+ REQUEST_REJECTED_CANNOT_CONNECT_TO_IDENTD = 92
+ REQUEST_REJECTED_DIFFERENT_USERID = 93
+
+
+class Socks5Reply(enum.IntEnum):
+ SUCCEEDED = 0x0
+ GENERAL_FAILURE = 0x1
+ CONNECTION_NOT_ALLOWED = 0x2
+ NETWORK_UNREACHABLE = 0x3
+ HOST_UNREACHABLE = 0x4
+ CONNECTION_REFUSED = 0x5
+ TTL_EXPIRED = 0x6
+ COMMAND_NOT_SUPPORTED = 0x7
+ ADDRESS_TYPE_NOT_SUPPORTED = 0x8
+
+
+class SocksTestRequestHandler(BaseRequestHandler):
+
+ def __init__(self, *args, socks_info=None, **kwargs):
+ self.socks_info = socks_info
+ super().__init__(*args, **kwargs)
+
+
+class SocksProxyHandler(BaseRequestHandler):
+ def __init__(self, request_handler_class, socks_server_kwargs, *args, **kwargs):
+ self.socks_kwargs = socks_server_kwargs or {}
+ self.request_handler_class = request_handler_class
+ super().__init__(*args, **kwargs)
+
+
+class Socks5ProxyHandler(StreamRequestHandler, SocksProxyHandler):
+
+ # SOCKS5 protocol https://tools.ietf.org/html/rfc1928
+ # SOCKS5 username/password authentication https://tools.ietf.org/html/rfc1929
+
+ def handle(self):
+ sleep = self.socks_kwargs.get('sleep')
+ if sleep:
+ time.sleep(sleep)
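+ # client greeting: version byte plus the number of offered auth methods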
+ version, nmethods = self.connection.recv(2)
+ assert version == SOCKS5_VERSION
+ methods = list(self.connection.recv(nmethods))
+
+ auth = self.socks_kwargs.get('auth')
+
+ if auth is not None and Socks5Auth.AUTH_USER_PASS not in methods:
+ self.connection.sendall(struct.pack('!BB', SOCKS5_VERSION, Socks5Auth.AUTH_NO_ACCEPTABLE))
+ self.server.close_request(self.request)
return
- ydl = FakeYDL({
- 'proxy': params['primary_proxy']
- })
- self.assertEqual(
- ydl.urlopen('http://yt-dl.org/ip').read().decode(),
- params['primary_server_ip'])
-
- def test_proxy_https(self):
- params = self._check_params(['primary_proxy', 'primary_server_ip'])
- if params is None:
+
+ elif Socks5Auth.AUTH_USER_PASS in methods:
+ self.connection.sendall(struct.pack("!BB", SOCKS5_VERSION, Socks5Auth.AUTH_USER_PASS))
+
+ _, user_len = struct.unpack('!BB', self.connection.recv(2))
+ username = self.connection.recv(user_len).decode()
+ pass_len = ord(self.connection.recv(1))
+ password = self.connection.recv(pass_len).decode()
+
+ if username == auth[0] and password == auth[1]:
+ self.connection.sendall(struct.pack('!BB', SOCKS5_USER_AUTH_VERSION, SOCKS5_USER_AUTH_SUCCESS))
+ else:
+ self.connection.sendall(struct.pack('!BB', SOCKS5_USER_AUTH_VERSION, SOCKS5_USER_AUTH_FAILURE))
+ self.server.close_request(self.request)
+ return
+
+ elif Socks5Auth.AUTH_NONE in methods:
+ self.connection.sendall(struct.pack('!BB', SOCKS5_VERSION, Socks5Auth.AUTH_NONE))
+ else:
+ self.connection.sendall(struct.pack('!BB', SOCKS5_VERSION, Socks5Auth.AUTH_NO_ACCEPTABLE))
+ self.server.close_request(self.request)
return
- ydl = FakeYDL({
- 'proxy': params['primary_proxy']
- })
- self.assertEqual(
- ydl.urlopen('https://yt-dl.org/ip').read().decode(),
- params['primary_server_ip'])
-
- def test_secondary_proxy_http(self):
- params = self._check_params(['secondary_proxy', 'secondary_server_ip'])
- if params is None:
+
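+ # connection request header: VER, CMD, RSV, ATYP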
+ version, command, _, address_type = struct.unpack('!BBBB', self.connection.recv(4))
+ socks_info = {
+ 'version': version,
+ 'auth_methods': methods,
+ 'command': command,
+ 'client_address': self.client_address,
+ 'ipv4_address': None,
+ 'domain_address': None,
+ 'ipv6_address': None,
+ }
+ if address_type == Socks5AddressType.ATYP_IPV4:
+ socks_info['ipv4_address'] = socket.inet_ntoa(self.connection.recv(4))
+ elif address_type == Socks5AddressType.ATYP_DOMAINNAME:
+ socks_info['domain_address'] = self.connection.recv(ord(self.connection.recv(1))).decode()
+ elif address_type == Socks5AddressType.ATYP_IPV6:
+ socks_info['ipv6_address'] = socket.inet_ntop(socket.AF_INET6, self.connection.recv(16))
+ else:
+ self.server.close_request(self.request)
+
+ socks_info['port'] = struct.unpack('!H', self.connection.recv(2))[0]
+
+ # dummy response, the returned IP is just a placeholder
+ self.connection.sendall(struct.pack(
+ '!BBBBIH', SOCKS5_VERSION, self.socks_kwargs.get('reply', Socks5Reply.SUCCEEDED), 0x0, 0x1, 0x7f000001, 40000))
+
+ self.request_handler_class(self.request, self.client_address, self.server, socks_info=socks_info)
+
+
+class Socks4ProxyHandler(StreamRequestHandler, SocksProxyHandler):
+
+ # SOCKS4 protocol http://www.openssh.com/txt/socks4.protocol
+ # SOCKS4A protocol http://www.openssh.com/txt/socks4a.protocol
+
+ def _read_until_null(self):
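+ # SOCKS4 sends the USERID (and, for SOCKS4A, the hostname) as NUL-terminated fields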
+ return b''.join(iter(functools.partial(self.connection.recv, 1), b'\x00'))
+
+ def handle(self):
+ sleep = self.socks_kwargs.get('sleep')
+ if sleep:
+ time.sleep(sleep)
+ socks_info = {
+ 'version': SOCKS4_VERSION,
+ 'command': None,
+ 'client_address': self.client_address,
+ 'ipv4_address': None,
+ 'port': None,
+ 'domain_address': None,
+ }
+ version, command, dest_port, dest_ip = struct.unpack('!BBHI', self.connection.recv(8))
+ socks_info['port'] = dest_port
+ socks_info['command'] = command
+ if version != SOCKS4_VERSION:
+ self.server.close_request(self.request)
return
- ydl = FakeYDL()
- req = urllib.request.Request('http://yt-dl.org/ip')
- req.add_header('Ytdl-request-proxy', params['secondary_proxy'])
- self.assertEqual(
- ydl.urlopen(req).read().decode(),
- params['secondary_server_ip'])
-
- def test_secondary_proxy_https(self):
- params = self._check_params(['secondary_proxy', 'secondary_server_ip'])
- if params is None:
+ use_remote_dns = False
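+ # SOCKS4A signals remote DNS with an invalid destination IP of the form 0.0.0.x (x != 0)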
+ if 0x0 < dest_ip <= 0xFF:
+ use_remote_dns = True
+ else:
+ socks_info['ipv4_address'] = socket.inet_ntoa(struct.pack("!I", dest_ip))
+
+ user_id = self._read_until_null().decode()
+ if user_id != (self.socks_kwargs.get('user_id') or ''):
+ self.connection.sendall(struct.pack(
+ '!BBHI', SOCKS4_REPLY_VERSION, Socks4CD.REQUEST_REJECTED_DIFFERENT_USERID, 0x00, 0x00000000))
+ self.server.close_request(self.request)
return
- ydl = FakeYDL()
- req = urllib.request.Request('https://yt-dl.org/ip')
- req.add_header('Ytdl-request-proxy', params['secondary_proxy'])
- self.assertEqual(
- ydl.urlopen(req).read().decode(),
- params['secondary_server_ip'])
+ if use_remote_dns:
+ socks_info['domain_address'] = self._read_until_null().decode()
-@is_download_test
-class TestSocks(unittest.TestCase):
- _SKIP_SOCKS_TEST = True
+ # dummy response, the returned IP is just a placeholder
+ self.connection.sendall(
+ struct.pack(
+ '!BBHI', SOCKS4_REPLY_VERSION,
+ self.socks_kwargs.get('cd_reply', Socks4CD.REQUEST_GRANTED), 40000, 0x7f000001))
- def setUp(self):
- if self._SKIP_SOCKS_TEST:
- return
+ self.request_handler_class(self.request, self.client_address, self.server, socks_info=socks_info)
- self.port = random.randint(20000, 30000)
- self.server_process = subprocess.Popen([
- 'srelay', '-f', '-i', '127.0.0.1:%d' % self.port],
- stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
- def tearDown(self):
- if self._SKIP_SOCKS_TEST:
- return
+class IPv6ThreadingTCPServer(ThreadingTCPServer):
+ address_family = socket.AF_INET6
+
+
+class SocksHTTPTestRequestHandler(http.server.BaseHTTPRequestHandler, SocksTestRequestHandler):
+ def do_GET(self):
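+ # serve the socks_info recorded by the proxy handler back to the test as JSON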
+ if self.path == '/socks_info':
+ payload = json.dumps(self.socks_info.copy())
+ self.send_response(200)
+ self.send_header('Content-Type', 'application/json; charset=utf-8')
+ self.send_header('Content-Length', str(len(payload)))
+ self.end_headers()
+ self.wfile.write(payload.encode())
+
+
+@contextlib.contextmanager
+def socks_server(socks_server_class, request_handler, bind_ip=None, **socks_server_kwargs):
+ server = server_thread = None
+ try:
+ bind_address = bind_ip or '127.0.0.1'
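+ # a dotted bind address is treated as IPv4; anything else gets an IPv6 server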
+ server_type = ThreadingTCPServer if '.' in bind_address else IPv6ThreadingTCPServer
+ server = server_type(
+ (bind_address, 0), functools.partial(socks_server_class, request_handler, socks_server_kwargs))
+ server_port = http_server_port(server)
+ server_thread = threading.Thread(target=server.serve_forever)
+ server_thread.daemon = True
+ server_thread.start()
+ if '.' not in bind_address:
+ yield f'[{bind_address}]:{server_port}'
+ else:
+ yield f'{bind_address}:{server_port}'
+ finally:
+ server.shutdown()
+ server.server_close()
+ server_thread.join(2.0)
+
+
+class SocksProxyTestContext(abc.ABC):
+ REQUEST_HANDLER_CLASS = None
+
+ def socks_server(self, server_class, *args, **kwargs):
+ return socks_server(server_class, self.REQUEST_HANDLER_CLASS, *args, **kwargs)
+
+ @abc.abstractmethod
+ def socks_info_request(self, handler, target_domain=None, target_port=None, **req_kwargs) -> dict:
+ """return a dict of socks_info"""
+
+
+class HTTPSocksTestProxyContext(SocksProxyTestContext):
+ REQUEST_HANDLER_CLASS = SocksHTTPTestRequestHandler
+
+ def socks_info_request(self, handler, target_domain=None, target_port=None, **req_kwargs):
+ request = Request(f'http://{target_domain or "127.0.0.1"}:{target_port or "40000"}/socks_info', **req_kwargs)
+ handler.validate(request)
+ return json.loads(handler.send(request).read().decode())
+
+
+CTX_MAP = {
+ 'http': HTTPSocksTestProxyContext,
+}
+
+
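+# 'ctx' is parametrized indirectly: request.param picks a proxy test context from CTX_MAP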
+@pytest.fixture(scope='module')
+def ctx(request):
+ return CTX_MAP[request.param]()
+
+
+class TestSocks4Proxy:
+ @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True)
+ def test_socks4_no_auth(self, handler, ctx):
+ with handler() as rh:
+ with ctx.socks_server(Socks4ProxyHandler) as server_address:
+ response = ctx.socks_info_request(
+ rh, proxies={'all': f'socks4://{server_address}'})
+ assert response['version'] == 4
+
+ @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True)
+ def test_socks4_auth(self, handler, ctx):
+ with handler() as rh:
+ with ctx.socks_server(Socks4ProxyHandler, user_id='user') as server_address:
+ with pytest.raises(ProxyError):
+ ctx.socks_info_request(rh, proxies={'all': f'socks4://{server_address}'})
+ response = ctx.socks_info_request(
+ rh, proxies={'all': f'socks4://user:@{server_address}'})
+ assert response['version'] == 4
+
+ @pytest.mark.parametrize('handler,ctx', [
+ pytest.param('Urllib', 'http', marks=pytest.mark.xfail(
+ reason='socks4a implementation currently broken when destination is not a domain name'))
+ ], indirect=True)
+ def test_socks4a_ipv4_target(self, handler, ctx):
+ with ctx.socks_server(Socks4ProxyHandler) as server_address:
+ with handler(proxies={'all': f'socks4a://{server_address}'}) as rh:
+ response = ctx.socks_info_request(rh, target_domain='127.0.0.1')
+ assert response['version'] == 4
+ assert response['ipv4_address'] == '127.0.0.1'
+ assert response['domain_address'] is None
+
+ @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True)
+ def test_socks4a_domain_target(self, handler, ctx):
+ with ctx.socks_server(Socks4ProxyHandler) as server_address:
+ with handler(proxies={'all': f'socks4a://{server_address}'}) as rh:
+ response = ctx.socks_info_request(rh, target_domain='localhost')
+ assert response['version'] == 4
+ assert response['ipv4_address'] is None
+ assert response['domain_address'] == 'localhost'
+
+ @pytest.mark.parametrize('handler,ctx', [
+ pytest.param('Urllib', 'http', marks=pytest.mark.xfail(
+ reason='source_address is not yet supported for socks4 proxies'))
+ ], indirect=True)
+ def test_ipv4_client_source_address(self, handler, ctx):
+ with ctx.socks_server(Socks4ProxyHandler) as server_address:
+ source_address = f'127.0.0.{random.randint(5, 255)}'
+ with handler(proxies={'all': f'socks4://{server_address}'},
+ source_address=source_address) as rh:
+ response = ctx.socks_info_request(rh)
+ assert response['client_address'][0] == source_address
+ assert response['version'] == 4
+
+ @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True)
+ @pytest.mark.parametrize('reply_code', [
+ Socks4CD.REQUEST_REJECTED_OR_FAILED,
+ Socks4CD.REQUEST_REJECTED_CANNOT_CONNECT_TO_IDENTD,
+ Socks4CD.REQUEST_REJECTED_DIFFERENT_USERID,
+ ])
+ def test_socks4_errors(self, handler, ctx, reply_code):
+ with ctx.socks_server(Socks4ProxyHandler, cd_reply=reply_code) as server_address:
+ with handler(proxies={'all': f'socks4://{server_address}'}) as rh:
+ with pytest.raises(ProxyError):
+ ctx.socks_info_request(rh)
+
+ @pytest.mark.parametrize('handler,ctx', [
+ pytest.param('Urllib', 'http', marks=pytest.mark.xfail(
+ reason='IPv6 socks4 proxies are not yet supported'))
+ ], indirect=True)
+ def test_ipv6_socks4_proxy(self, handler, ctx):
+ with ctx.socks_server(Socks4ProxyHandler, bind_ip='::1') as server_address:
+ with handler(proxies={'all': f'socks4://{server_address}'}) as rh:
+ response = ctx.socks_info_request(rh, target_domain='127.0.0.1')
+ assert response['client_address'][0] == '::1'
+ assert response['ipv4_address'] == '127.0.0.1'
+ assert response['version'] == 4
+
+ @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True)
+ def test_timeout(self, handler, ctx):
+ with ctx.socks_server(Socks4ProxyHandler, sleep=2) as server_address:
+ with handler(proxies={'all': f'socks4://{server_address}'}, timeout=1) as rh:
+ with pytest.raises(TransportError):
+ ctx.socks_info_request(rh)
+
+
+class TestSocks5Proxy:
+
+ @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True)
+ def test_socks5_no_auth(self, handler, ctx):
+ with ctx.socks_server(Socks5ProxyHandler) as server_address:
+ with handler(proxies={'all': f'socks5://{server_address}'}) as rh:
+ response = ctx.socks_info_request(rh)
+ assert response['auth_methods'] == [0x0]
+ assert response['version'] == 5
+
+ @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True)
+ def test_socks5_user_pass(self, handler, ctx):
+ with ctx.socks_server(Socks5ProxyHandler, auth=('test', 'testpass')) as server_address:
+ with handler() as rh:
+ with pytest.raises(ProxyError):
+ ctx.socks_info_request(rh, proxies={'all': f'socks5://{server_address}'})
+
+ response = ctx.socks_info_request(
+ rh, proxies={'all': f'socks5://test:testpass@{server_address}'})
+
+ assert response['auth_methods'] == [Socks5Auth.AUTH_NONE, Socks5Auth.AUTH_USER_PASS]
+ assert response['version'] == 5
+
+ @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True)
+ def test_socks5_ipv4_target(self, handler, ctx):
+ with ctx.socks_server(Socks5ProxyHandler) as server_address:
+ with handler(proxies={'all': f'socks5://{server_address}'}) as rh:
+ response = ctx.socks_info_request(rh, target_domain='127.0.0.1')
+ assert response['ipv4_address'] == '127.0.0.1'
+ assert response['version'] == 5
+
+ @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True)
+ def test_socks5_domain_target(self, handler, ctx):
+ with ctx.socks_server(Socks5ProxyHandler) as server_address:
+ with handler(proxies={'all': f'socks5://{server_address}'}) as rh:
+ response = ctx.socks_info_request(rh, target_domain='localhost')
+ assert response['ipv4_address'] == '127.0.0.1'
+ assert response['version'] == 5
+
+ @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True)
+ def test_socks5h_domain_target(self, handler, ctx):
+ with ctx.socks_server(Socks5ProxyHandler) as server_address:
+ with handler(proxies={'all': f'socks5h://{server_address}'}) as rh:
+ response = ctx.socks_info_request(rh, target_domain='localhost')
+ assert response['ipv4_address'] is None
+ assert response['domain_address'] == 'localhost'
+ assert response['version'] == 5
- self.server_process.terminate()
- self.server_process.communicate()
+ @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True)
+ def test_socks5h_ip_target(self, handler, ctx):
+ with ctx.socks_server(Socks5ProxyHandler) as server_address:
+ with handler(proxies={'all': f'socks5h://{server_address}'}) as rh:
+ response = ctx.socks_info_request(rh, target_domain='127.0.0.1')
+ assert response['ipv4_address'] == '127.0.0.1'
+ assert response['domain_address'] is None
+ assert response['version'] == 5
- def _get_ip(self, protocol):
- if self._SKIP_SOCKS_TEST:
- return '127.0.0.1'
+ @pytest.mark.parametrize('handler,ctx', [
+ pytest.param('Urllib', 'http', marks=pytest.mark.xfail(
+ reason='IPv6 destination addresses are not yet supported'))
+ ], indirect=True)
+ def test_socks5_ipv6_destination(self, handler, ctx):
+ with ctx.socks_server(Socks5ProxyHandler) as server_address:
+ with handler(proxies={'all': f'socks5://{server_address}'}) as rh:
+ response = ctx.socks_info_request(rh, target_domain='[::1]')
+ assert response['ipv6_address'] == '::1'
+ assert response['port'] == 80
+ assert response['version'] == 5
- ydl = FakeYDL({
- 'proxy': '%s://127.0.0.1:%d' % (protocol, self.port),
- })
- return ydl.urlopen('http://yt-dl.org/ip').read().decode()
+ @pytest.mark.parametrize('handler,ctx', [
+ pytest.param('Urllib', 'http', marks=pytest.mark.xfail(
+ reason='IPv6 socks5 proxies are not yet supported'))
+ ], indirect=True)
+ def test_ipv6_socks5_proxy(self, handler, ctx):
+ with ctx.socks_server(Socks5ProxyHandler, bind_ip='::1') as server_address:
+ with handler(proxies={'all': f'socks5://{server_address}'}) as rh:
+ response = ctx.socks_info_request(rh, target_domain='127.0.0.1')
+ assert response['client_address'][0] == '::1'
+ assert response['ipv4_address'] == '127.0.0.1'
+ assert response['version'] == 5
- def test_socks4(self):
- self.assertTrue(isinstance(self._get_ip('socks4'), str))
+ # XXX: is there any feasible way of testing IPv6 source addresses?
+ # Same would go for non-proxy source_address test...
+ @pytest.mark.parametrize('handler,ctx', [
+ pytest.param('Urllib', 'http', marks=pytest.mark.xfail(
+ reason='source_address is not yet supported for socks5 proxies'))
+ ], indirect=True)
+ def test_ipv4_client_source_address(self, handler, ctx):
+ with ctx.socks_server(Socks5ProxyHandler) as server_address:
+ source_address = f'127.0.0.{random.randint(5, 255)}'
+ with handler(proxies={'all': f'socks5://{server_address}'}, source_address=source_address) as rh:
+ response = ctx.socks_info_request(rh)
+ assert response['client_address'][0] == source_address
+ assert response['version'] == 5
- def test_socks4a(self):
- self.assertTrue(isinstance(self._get_ip('socks4a'), str))
+ @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True)
+ @pytest.mark.parametrize('reply_code', [
+ Socks5Reply.GENERAL_FAILURE,
+ Socks5Reply.CONNECTION_NOT_ALLOWED,
+ Socks5Reply.NETWORK_UNREACHABLE,
+ Socks5Reply.HOST_UNREACHABLE,
+ Socks5Reply.CONNECTION_REFUSED,
+ Socks5Reply.TTL_EXPIRED,
+ Socks5Reply.COMMAND_NOT_SUPPORTED,
+ Socks5Reply.ADDRESS_TYPE_NOT_SUPPORTED,
+ ])
+ def test_socks5_errors(self, handler, ctx, reply_code):
+ with ctx.socks_server(Socks5ProxyHandler, reply=reply_code) as server_address:
+ with handler(proxies={'all': f'socks5://{server_address}'}) as rh:
+ with pytest.raises(ProxyError):
+ ctx.socks_info_request(rh)
- def test_socks5(self):
- self.assertTrue(isinstance(self._get_ip('socks5'), str))
+ @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True)
+ def test_timeout(self, handler, ctx):
+ with ctx.socks_server(Socks5ProxyHandler, sleep=2) as server_address:
+ with handler(proxies={'all': f'socks5://{server_address}'}, timeout=1) as rh:
+ with pytest.raises(TransportError):
+ ctx.socks_info_request(rh)
if __name__ == '__main__':
diff --git a/test/test_utils.py b/test/test_utils.py
index acb913a..c089582 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -5,6 +5,7 @@ import os
import re
import sys
import unittest
+import warnings
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
@@ -46,10 +47,9 @@ from hypervideo_dl.utils import (
encode_base_n,
encode_compat_str,
encodeFilename,
- escape_rfc3986,
- escape_url,
expand_path,
extract_attributes,
+ extract_basic_auth,
find_xpath_attr,
fix_xml_ampersands,
float_or_none,
@@ -102,15 +102,16 @@ from hypervideo_dl.utils import (
sanitize_filename,
sanitize_path,
sanitize_url,
- sanitized_Request,
shell_quote,
smuggle_url,
+ str_or_none,
str_to_int,
strip_jsonp,
strip_or_none,
subtitles_filename,
timeconvert,
traverse_obj,
+ try_call,
unescapeHTML,
unified_strdate,
unified_timestamp,
@@ -122,12 +123,19 @@ from hypervideo_dl.utils import (
urlencode_postdata,
urljoin,
urshift,
+ variadic,
version_tuple,
xpath_attr,
xpath_element,
xpath_text,
xpath_with_ns,
)
+from hypervideo_dl.utils.networking import (
+ HTTPHeaderDict,
+ escape_rfc3986,
+ normalize_url,
+ remove_dot_segments,
+)
class TestUtil(unittest.TestCase):
@@ -254,15 +262,6 @@ class TestUtil(unittest.TestCase):
self.assertEqual(sanitize_url('https://foo.bar'), 'https://foo.bar')
self.assertEqual(sanitize_url('foo bar'), 'foo bar')
- def test_extract_basic_auth(self):
- auth_header = lambda url: sanitized_Request(url).get_header('Authorization')
- self.assertFalse(auth_header('http://foo.bar'))
- self.assertFalse(auth_header('http://:foo.bar'))
- self.assertEqual(auth_header('http://@foo.bar'), 'Basic Og==')
- self.assertEqual(auth_header('http://:pass@foo.bar'), 'Basic OnBhc3M=')
- self.assertEqual(auth_header('http://user:@foo.bar'), 'Basic dXNlcjo=')
- self.assertEqual(auth_header('http://user:pass@foo.bar'), 'Basic dXNlcjpwYXNz')
-
def test_expand_path(self):
def env(var):
return f'%{var}%' if sys.platform == 'win32' else f'${var}'
@@ -659,6 +658,8 @@ class TestUtil(unittest.TestCase):
self.assertEqual(parse_duration('P0Y0M0DT0H4M20.880S'), 260.88)
self.assertEqual(parse_duration('01:02:03:050'), 3723.05)
self.assertEqual(parse_duration('103:050'), 103.05)
+ self.assertEqual(parse_duration('1HR 3MIN'), 3780)
+ self.assertEqual(parse_duration('2hrs 3mins'), 7380)
def test_fix_xml_ampersands(self):
self.assertEqual(
@@ -935,24 +936,124 @@ class TestUtil(unittest.TestCase):
self.assertEqual(escape_rfc3986('foo bar'), 'foo%20bar')
self.assertEqual(escape_rfc3986('foo%20bar'), 'foo%20bar')
- def test_escape_url(self):
+ def test_normalize_url(self):
self.assertEqual(
- escape_url('http://wowza.imust.org/srv/vod/telemb/new/UPLOAD/UPLOAD/20224_IncendieHavré_FD.mp4'),
+ normalize_url('http://wowza.imust.org/srv/vod/telemb/new/UPLOAD/UPLOAD/20224_IncendieHavré_FD.mp4'),
'http://wowza.imust.org/srv/vod/telemb/new/UPLOAD/UPLOAD/20224_IncendieHavre%CC%81_FD.mp4'
)
self.assertEqual(
- escape_url('http://www.ardmediathek.de/tv/Sturm-der-Liebe/Folge-2036-Zu-Mann-und-Frau-erklärt/Das-Erste/Video?documentId=22673108&bcastId=5290'),
+ normalize_url('http://www.ardmediathek.de/tv/Sturm-der-Liebe/Folge-2036-Zu-Mann-und-Frau-erklärt/Das-Erste/Video?documentId=22673108&bcastId=5290'),
'http://www.ardmediathek.de/tv/Sturm-der-Liebe/Folge-2036-Zu-Mann-und-Frau-erkl%C3%A4rt/Das-Erste/Video?documentId=22673108&bcastId=5290'
)
self.assertEqual(
- escape_url('http://тест.рф/фрагмент'),
+ normalize_url('http://тест.рф/фрагмент'),
'http://xn--e1aybc.xn--p1ai/%D1%84%D1%80%D0%B0%D0%B3%D0%BC%D0%B5%D0%BD%D1%82'
)
self.assertEqual(
- escape_url('http://тест.рф/абв?абв=абв#абв'),
+ normalize_url('http://тест.рф/абв?абв=абв#абв'),
'http://xn--e1aybc.xn--p1ai/%D0%B0%D0%B1%D0%B2?%D0%B0%D0%B1%D0%B2=%D0%B0%D0%B1%D0%B2#%D0%B0%D0%B1%D0%B2'
)
- self.assertEqual(escape_url('http://vimeo.com/56015672#at=0'), 'http://vimeo.com/56015672#at=0')
+ self.assertEqual(normalize_url('http://vimeo.com/56015672#at=0'), 'http://vimeo.com/56015672#at=0')
+
+ self.assertEqual(normalize_url('http://www.example.com/../a/b/../c/./d.html'), 'http://www.example.com/a/c/d.html')
+
+ def test_remove_dot_segments(self):
+ self.assertEqual(remove_dot_segments('/a/b/c/./../../g'), '/a/g')
+ self.assertEqual(remove_dot_segments('mid/content=5/../6'), 'mid/6')
+ self.assertEqual(remove_dot_segments('/ad/../cd'), '/cd')
+ self.assertEqual(remove_dot_segments('/ad/../cd/'), '/cd/')
+ self.assertEqual(remove_dot_segments('/..'), '/')
+ self.assertEqual(remove_dot_segments('/./'), '/')
+ self.assertEqual(remove_dot_segments('/./a'), '/a')
+ self.assertEqual(remove_dot_segments('/abc/./.././d/././e/.././f/./../../ghi'), '/ghi')
+ self.assertEqual(remove_dot_segments('/'), '/')
+ self.assertEqual(remove_dot_segments('/t'), '/t')
+ self.assertEqual(remove_dot_segments('t'), 't')
+ self.assertEqual(remove_dot_segments(''), '')
+ self.assertEqual(remove_dot_segments('/../a/b/c'), '/a/b/c')
+ self.assertEqual(remove_dot_segments('../a'), 'a')
+ self.assertEqual(remove_dot_segments('./a'), 'a')
+ self.assertEqual(remove_dot_segments('.'), '')
+ self.assertEqual(remove_dot_segments('////'), '////')
+
+ def test_js_to_json_vars_strings(self):
+ self.assertDictEqual(
+ json.loads(js_to_json(
+ '''{
+ 'null': a,
+ 'nullStr': b,
+ 'true': c,
+ 'trueStr': d,
+ 'false': e,
+ 'falseStr': f,
+ 'unresolvedVar': g,
+ }''',
+ {
+ 'a': 'null',
+ 'b': '"null"',
+ 'c': 'true',
+ 'd': '"true"',
+ 'e': 'false',
+ 'f': '"false"',
+ 'g': 'var',
+ }
+ )),
+ {
+ 'null': None,
+ 'nullStr': 'null',
+ 'true': True,
+ 'trueStr': 'true',
+ 'false': False,
+ 'falseStr': 'false',
+ 'unresolvedVar': 'var'
+ }
+ )
+
+ self.assertDictEqual(
+ json.loads(js_to_json(
+ '''{
+ 'int': a,
+ 'intStr': b,
+ 'float': c,
+ 'floatStr': d,
+ }''',
+ {
+ 'a': '123',
+ 'b': '"123"',
+ 'c': '1.23',
+ 'd': '"1.23"',
+ }
+ )),
+ {
+ 'int': 123,
+ 'intStr': '123',
+ 'float': 1.23,
+ 'floatStr': '1.23',
+ }
+ )
+
+ self.assertDictEqual(
+ json.loads(js_to_json(
+ '''{
+ 'object': a,
+ 'objectStr': b,
+ 'array': c,
+ 'arrayStr': d,
+ }''',
+ {
+ 'a': '{}',
+ 'b': '"{}"',
+ 'c': '[]',
+ 'd': '"[]"',
+ }
+ )),
+ {
+ 'object': {},
+ 'objectStr': '{}',
+ 'array': [],
+ 'arrayStr': '[]',
+ }
+ )
def test_js_to_json_realworld(self):
inp = '''{
@@ -1110,6 +1211,13 @@ class TestUtil(unittest.TestCase):
self.assertEqual(js_to_json('42a1'), '42"a1"')
self.assertEqual(js_to_json('42a-1'), '42"a"-1')
+ def test_js_to_json_template_literal(self):
+ self.assertEqual(js_to_json('`Hello ${name}`', {'name': '"world"'}), '"Hello world"')
+ self.assertEqual(js_to_json('`${name}${name}`', {'name': '"X"'}), '"XX"')
+ self.assertEqual(js_to_json('`${name}${name}`', {'name': '5'}), '"55"')
+ self.assertEqual(js_to_json('`${name}"${name}"`', {'name': '5'}), '"5\\"5\\""')
+ self.assertEqual(js_to_json('`${name}`', {}), '"name"')
+
def test_extract_attributes(self):
self.assertEqual(extract_attributes('<e x="y">'), {'x': 'y'})
self.assertEqual(extract_attributes("<e x='y'>"), {'x': 'y'})
@@ -1745,6 +1853,8 @@ Line 1
def test_clean_podcast_url(self):
self.assertEqual(clean_podcast_url('https://www.podtrac.com/pts/redirect.mp3/chtbl.com/track/5899E/traffic.megaphone.fm/HSW7835899191.mp3'), 'https://traffic.megaphone.fm/HSW7835899191.mp3')
self.assertEqual(clean_podcast_url('https://play.podtrac.com/npr-344098539/edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3'), 'https://edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3')
+ self.assertEqual(clean_podcast_url('https://pdst.fm/e/2.gum.fm/chtbl.com/track/chrt.fm/track/34D33/pscrb.fm/rss/p/traffic.megaphone.fm/ITLLC7765286967.mp3?updated=1687282661'), 'https://traffic.megaphone.fm/ITLLC7765286967.mp3?updated=1687282661')
+ self.assertEqual(clean_podcast_url('https://pdst.fm/e/https://mgln.ai/e/441/www.buzzsprout.com/1121972/13019085-ep-252-the-deep-life-stack.mp3'), 'https://www.buzzsprout.com/1121972/13019085-ep-252-the-deep-life-stack.mp3')
def test_LazyList(self):
it = list(range(10))
@@ -1874,6 +1984,8 @@ Line 1
vcodecs=[None], acodecs=[None], vexts=['webm'], aexts=['m4a']), 'mkv')
self.assertEqual(get_compatible_ext(
vcodecs=[None], acodecs=[None], vexts=['webm'], aexts=['webm']), 'webm')
+ self.assertEqual(get_compatible_ext(
+ vcodecs=[None], acodecs=[None], vexts=['webm'], aexts=['weba']), 'webm')
self.assertEqual(get_compatible_ext(
vcodecs=['h264'], acodecs=['mp4a'], vexts=['mov'], aexts=['m4a']), 'mp4')
@@ -1885,6 +1997,35 @@ Line 1
self.assertEqual(get_compatible_ext(
vcodecs=['av1'], acodecs=['mp4a'], vexts=['webm'], aexts=['m4a'], preferences=('webm', 'mkv')), 'mkv')
+ def test_try_call(self):
+ def total(*x, **kwargs):
+ return sum(x) + sum(kwargs.values())
+
+ self.assertEqual(try_call(None), None,
+ msg='not a fn should give None')
+ self.assertEqual(try_call(lambda: 1), 1,
+ msg='int fn with no expected_type should give int')
+ self.assertEqual(try_call(lambda: 1, expected_type=int), 1,
+ msg='int fn with expected_type int should give int')
+ self.assertEqual(try_call(lambda: 1, expected_type=dict), None,
+ msg='int fn with wrong expected_type should give None')
+ self.assertEqual(try_call(total, args=(0, 1, 0, ), expected_type=int), 1,
+ msg='fn should accept arglist')
+ self.assertEqual(try_call(total, kwargs={'a': 0, 'b': 1, 'c': 0}, expected_type=int), 1,
+ msg='fn should accept kwargs')
+ self.assertEqual(try_call(lambda: 1, expected_type=dict), None,
+ msg='int fn with wrong expected_type should give None')
+ self.assertEqual(try_call(lambda x: {}, total, args=(42, ), expected_type=int), 42,
+ msg='expect first int result with expected_type int')
+
+ def test_variadic(self):
+ self.assertEqual(variadic(None), (None, ))
+ self.assertEqual(variadic('spam'), ('spam', ))
+ self.assertEqual(variadic('spam', allowed_types=dict), 'spam')
+ with warnings.catch_warnings():
+ warnings.simplefilter('ignore')
+ self.assertEqual(variadic('spam', allowed_types=[dict]), 'spam')
+
def test_traverse_obj(self):
_TEST_DATA = {
100: 100,
@@ -1918,8 +2059,8 @@ Line 1
# Test Ellipsis behavior
self.assertCountEqual(traverse_obj(_TEST_DATA, ...),
- (item for item in _TEST_DATA.values() if item is not None),
- msg='`...` should give all values except `None`')
+ (item for item in _TEST_DATA.values() if item not in (None, {})),
+ msg='`...` should give all non discarded values')
self.assertCountEqual(traverse_obj(_TEST_DATA, ('urls', 0, ...)), _TEST_DATA['urls'][0].values(),
msg='`...` selection for dicts should select all values')
self.assertEqual(traverse_obj(_TEST_DATA, (..., ..., 'url')),
@@ -1927,6 +2068,8 @@ Line 1
msg='nested `...` queries should work')
self.assertCountEqual(traverse_obj(_TEST_DATA, (..., ..., 'index')), range(4),
msg='`...` query result should be flattened')
+ self.assertEqual(traverse_obj(iter(range(4)), ...), list(range(4)),
+ msg='`...` should accept iterables')
# Test function as key
self.assertEqual(traverse_obj(_TEST_DATA, lambda x, y: x == 'urls' and isinstance(y, list)),
@@ -1934,6 +2077,42 @@ Line 1
msg='function as query key should perform a filter based on (key, value)')
self.assertCountEqual(traverse_obj(_TEST_DATA, lambda _, x: isinstance(x[0], str)), {'str'},
msg='exceptions in the query function should be catched')
+ self.assertEqual(traverse_obj(iter(range(4)), lambda _, x: x % 2 == 0), [0, 2],
+ msg='function key should accept iterables')
+ if __debug__:
+ with self.assertRaises(Exception, msg='Wrong function signature should raise in debug'):
+ traverse_obj(_TEST_DATA, lambda a: ...)
+ with self.assertRaises(Exception, msg='Wrong function signature should raise in debug'):
+ traverse_obj(_TEST_DATA, lambda a, b, c: ...)
+
+ # Test set as key (transformation/type, like `expected_type`)
+ self.assertEqual(traverse_obj(_TEST_DATA, (..., {str.upper}, )), ['STR'],
+ msg='Function in set should be a transformation')
+ self.assertEqual(traverse_obj(_TEST_DATA, (..., {str})), ['str'],
+ msg='Type in set should be a type filter')
+ self.assertEqual(traverse_obj(_TEST_DATA, {dict}), _TEST_DATA,
+ msg='A single set should be wrapped into a path')
+ self.assertEqual(traverse_obj(_TEST_DATA, (..., {str.upper})), ['STR'],
+ msg='Transformation function should not raise')
+ self.assertEqual(traverse_obj(_TEST_DATA, (..., {str_or_none})),
+ [item for item in map(str_or_none, _TEST_DATA.values()) if item is not None],
+ msg='Function in set should be a transformation')
+ if __debug__:
+ with self.assertRaises(Exception, msg='Sets with length != 1 should raise in debug'):
+ traverse_obj(_TEST_DATA, set())
+ with self.assertRaises(Exception, msg='Sets with length != 1 should raise in debug'):
+ traverse_obj(_TEST_DATA, {str.upper, str})
+
+ # Test `slice` as a key
+ _SLICE_DATA = [0, 1, 2, 3, 4]
+ self.assertEqual(traverse_obj(_TEST_DATA, ('dict', slice(1))), None,
+ msg='slice on a dictionary should not throw')
+ self.assertEqual(traverse_obj(_SLICE_DATA, slice(1)), _SLICE_DATA[:1],
+ msg='slice key should apply slice to sequence')
+ self.assertEqual(traverse_obj(_SLICE_DATA, slice(1, 2)), _SLICE_DATA[1:2],
+ msg='slice key should apply slice to sequence')
+ self.assertEqual(traverse_obj(_SLICE_DATA, slice(1, 4, 2)), _SLICE_DATA[1:4:2],
+ msg='slice key should apply slice to sequence')
# Test alternative paths
self.assertEqual(traverse_obj(_TEST_DATA, 'fail', 'str'), 'str',
@@ -1979,15 +2158,23 @@ Line 1
{0: ['https://www.example.com/1', 'https://www.example.com/0']},
msg='tripple nesting in dict path should be treated as branches')
self.assertEqual(traverse_obj(_TEST_DATA, {0: 'fail'}), {},
- msg='remove `None` values when dict key')
+ msg='remove `None` values when top level dict key fails')
self.assertEqual(traverse_obj(_TEST_DATA, {0: 'fail'}, default=...), {0: ...},
- msg='do not remove `None` values if `default`')
- self.assertEqual(traverse_obj(_TEST_DATA, {0: 'dict'}), {0: {}},
- msg='do not remove empty values when dict key')
- self.assertEqual(traverse_obj(_TEST_DATA, {0: 'dict'}, default=...), {0: {}},
- msg='do not remove empty values when dict key and a default')
- self.assertEqual(traverse_obj(_TEST_DATA, {0: ('dict', ...)}), {0: []},
- msg='if branch in dict key not successful, return `[]`')
+ msg='use `default` if key fails and `default`')
+ self.assertEqual(traverse_obj(_TEST_DATA, {0: 'dict'}), {},
+ msg='remove empty values when dict key')
+ self.assertEqual(traverse_obj(_TEST_DATA, {0: 'dict'}, default=...), {0: ...},
+ msg='use `default` when dict key and `default`')
+ self.assertEqual(traverse_obj(_TEST_DATA, {0: {0: 'fail'}}), {},
+ msg='remove empty values when nested dict key fails')
+ self.assertEqual(traverse_obj(None, {0: 'fail'}), {},
+ msg='default to dict if pruned')
+ self.assertEqual(traverse_obj(None, {0: 'fail'}, default=...), {0: ...},
+ msg='default to dict if pruned and default is given')
+ self.assertEqual(traverse_obj(_TEST_DATA, {0: {0: 'fail'}}, default=...), {0: {0: ...}},
+ msg='use nested `default` when nested dict key fails and `default`')
+ self.assertEqual(traverse_obj(_TEST_DATA, {0: ('dict', ...)}), {},
+ msg='remove key if branch in dict key not successful')
# Testing default parameter behavior
_DEFAULT_DATA = {'None': None, 'int': 0, 'list': []}
@@ -2011,20 +2198,55 @@ Line 1
msg='if branched but not successful return `[]`, not `default`')
self.assertEqual(traverse_obj(_DEFAULT_DATA, ('list', ...)), [],
msg='if branched but object is empty return `[]`, not `default`')
+ self.assertEqual(traverse_obj(None, ...), [],
+ msg='if branched but object is `None` return `[]`, not `default`')
+ self.assertEqual(traverse_obj({0: None}, (0, ...)), [],
+ msg='if branched but state is `None` return `[]`, not `default`')
+
+ branching_paths = [
+ ('fail', ...),
+ (..., 'fail'),
+ 100 * ('fail',) + (...,),
+ (...,) + 100 * ('fail',),
+ ]
+ for branching_path in branching_paths:
+ self.assertEqual(traverse_obj({}, branching_path), [],
+ msg='if branched but state is `None`, return `[]` (not `default`)')
+ self.assertEqual(traverse_obj({}, 'fail', branching_path), [],
+ msg='if branching in last alternative and previous did not match, return `[]` (not `default`)')
+ self.assertEqual(traverse_obj({0: 'x'}, 0, branching_path), 'x',
+ msg='if branching in last alternative and previous did match, return single value')
+ self.assertEqual(traverse_obj({0: 'x'}, branching_path, 0), 'x',
+ msg='if branching in first alternative and non-branching path does match, return single value')
+ self.assertEqual(traverse_obj({}, branching_path, 'fail'), None,
+ msg='if branching in first alternative and non-branching path does not match, return `default`')
# Testing expected_type behavior
_EXPECTED_TYPE_DATA = {'str': 'str', 'int': 0}
- self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'str', expected_type=str), 'str',
- msg='accept matching `expected_type` type')
- self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'str', expected_type=int), None,
- msg='reject non matching `expected_type` type')
- self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'int', expected_type=lambda x: str(x)), '0',
- msg='transform type using type function')
- self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'str',
- expected_type=lambda _: 1 / 0), None,
- msg='wrap expected_type fuction in try_call')
- self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, ..., expected_type=str), ['str'],
- msg='eliminate items that expected_type fails on')
+ self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'str', expected_type=str),
+ 'str', msg='accept matching `expected_type` type')
+ self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'str', expected_type=int),
+ None, msg='reject non matching `expected_type` type')
+ self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'int', expected_type=lambda x: str(x)),
+ '0', msg='transform type using type function')
+ self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'str', expected_type=lambda _: 1 / 0),
+ None, msg='wrap expected_type function in try_call')
+ self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, ..., expected_type=str),
+ ['str'], msg='eliminate items that expected_type fails on')
+ self.assertEqual(traverse_obj(_TEST_DATA, {0: 100, 1: 1.2}, expected_type=int),
+ {0: 100}, msg='type as expected_type should filter dict values')
+ self.assertEqual(traverse_obj(_TEST_DATA, {0: 100, 1: 1.2, 2: 'None'}, expected_type=str_or_none),
+ {0: '100', 1: '1.2'}, msg='function as expected_type should transform dict values')
+ self.assertEqual(traverse_obj(_TEST_DATA, ({0: 1.2}, 0, {int_or_none}), expected_type=int),
+ 1, msg='expected_type should not filter non final dict values')
+ self.assertEqual(traverse_obj(_TEST_DATA, {0: {0: 100, 1: 'str'}}, expected_type=int),
+ {0: {0: 100}}, msg='expected_type should transform deep dict values')
+ self.assertEqual(traverse_obj(_TEST_DATA, [({0: '...'}, {0: '...'})], expected_type=type(...)),
+ [{0: ...}, {0: ...}], msg='expected_type should transform branched dict values')
+ self.assertEqual(traverse_obj({1: {3: 4}}, [(1, 2), 3], expected_type=int),
+ [4], msg='expected_type regression for type matching in tuple branching')
+ self.assertEqual(traverse_obj(_TEST_DATA, ['data', ...], expected_type=int),
+ [], msg='expected_type regression for type matching in dict result')
# Test get_all behavior
_GET_ALL_DATA = {'key': [0, 1, 2]}
@@ -2064,14 +2286,23 @@ Line 1
traverse_string=True), '.',
msg='traverse into converted data if `traverse_string`')
self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', ...),
- traverse_string=True), list('str'),
- msg='`...` branching into string should result in list')
+ traverse_string=True), 'str',
+ msg='`...` should result in string (same value) if `traverse_string`')
+ self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', slice(0, None, 2)),
+ traverse_string=True), 'sr',
+ msg='`slice` should result in string if `traverse_string`')
+ self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', lambda i, v: i or v == "s"),
+ traverse_string=True), 'str',
+ msg='function should result in string if `traverse_string`')
self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', (0, 2)),
traverse_string=True), ['s', 'r'],
- msg='branching into string should result in list')
- self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', lambda _, x: x),
- traverse_string=True), list('str'),
- msg='function branching into string should result in list')
+ msg='branching should result in list if `traverse_string`')
+ self.assertEqual(traverse_obj({}, (0, ...), traverse_string=True), [],
+ msg='branching should result in list if `traverse_string`')
+ self.assertEqual(traverse_obj({}, (0, lambda x, y: True), traverse_string=True), [],
+ msg='branching should result in list if `traverse_string`')
+ self.assertEqual(traverse_obj({}, (0, slice(1)), traverse_string=True), [],
+ msg='branching should result in list if `traverse_string`')
# Test is_user_input behavior
_IS_USER_INPUT_DATA = {'range8': list(range(8))}
@@ -2108,6 +2339,48 @@ Line 1
msg='failing str key on a `re.Match` should return `default`')
self.assertEqual(traverse_obj(mobj, 8), None,
msg='failing int key on a `re.Match` should return `default`')
+ self.assertEqual(traverse_obj(mobj, lambda k, _: k in (0, 'group')), ['0123', '3'],
+ msg='function on a `re.Match` should give group name as well')
+
+ def test_http_header_dict(self):
+ headers = HTTPHeaderDict()
+ headers['ytdl-test'] = b'0'
+ self.assertEqual(list(headers.items()), [('Ytdl-Test', '0')])
+ headers['ytdl-test'] = 1
+ self.assertEqual(list(headers.items()), [('Ytdl-Test', '1')])
+ headers['Ytdl-test'] = '2'
+ self.assertEqual(list(headers.items()), [('Ytdl-Test', '2')])
+ self.assertTrue('ytDl-Test' in headers)
+ self.assertEqual(str(headers), str(dict(headers)))
+ self.assertEqual(repr(headers), str(dict(headers)))
+
+ headers.update({'X-dlp': 'data'})
+ self.assertEqual(set(headers.items()), {('Ytdl-Test', '2'), ('X-Dlp', 'data')})
+ self.assertEqual(dict(headers), {'Ytdl-Test': '2', 'X-Dlp': 'data'})
+ self.assertEqual(len(headers), 2)
+ self.assertEqual(headers.copy(), headers)
+ headers2 = HTTPHeaderDict({'X-dlp': 'data3'}, **headers, **{'X-dlp': 'data2'})
+ self.assertEqual(set(headers2.items()), {('Ytdl-Test', '2'), ('X-Dlp', 'data2')})
+ self.assertEqual(len(headers2), 2)
+ headers2.clear()
+ self.assertEqual(len(headers2), 0)
+
+ # ensure that later occurrences of the same header win
+ headers3 = HTTPHeaderDict({'Ytdl-TeSt': 1}, {'Ytdl-test': 2})
+ self.assertEqual(set(headers3.items()), {('Ytdl-Test', '2')})
+ del headers3['ytdl-tesT']
+ self.assertEqual(dict(headers3), {})
+
+ headers4 = HTTPHeaderDict({'ytdl-test': 'data;'})
+ self.assertEqual(set(headers4.items()), {('Ytdl-Test', 'data;')})
+
+ def test_extract_basic_auth(self):
+ assert extract_basic_auth('http://:foo.bar') == ('http://:foo.bar', None)
+ assert extract_basic_auth('http://foo.bar') == ('http://foo.bar', None)
+ assert extract_basic_auth('http://@foo.bar') == ('http://foo.bar', 'Basic Og==')
+ assert extract_basic_auth('http://:pass@foo.bar') == ('http://foo.bar', 'Basic OnBhc3M=')
+ assert extract_basic_auth('http://user:@foo.bar') == ('http://foo.bar', 'Basic dXNlcjo=')
+ assert extract_basic_auth('http://user:pass@foo.bar') == ('http://foo.bar', 'Basic dXNlcjpwYXNz')
if __name__ == '__main__':
diff --git a/test/testdata/yt_dlp_plugins/extractor/_ignore.py b/test/testdata/yt_dlp_plugins/extractor/_ignore.py
new file mode 100644
index 0000000..3ee321b
--- /dev/null
+++ b/test/testdata/yt_dlp_plugins/extractor/_ignore.py
@@ -0,0 +1,5 @@
+from hypervideo_dl.extractor.common import InfoExtractor
+
+
+class IgnorePluginIE(InfoExtractor):
+ pass
diff --git a/test/testdata/yt_dlp_plugins/extractor/ignore.py b/test/testdata/yt_dlp_plugins/extractor/ignore.py
new file mode 100644
index 0000000..0f7eaa4
--- /dev/null
+++ b/test/testdata/yt_dlp_plugins/extractor/ignore.py
@@ -0,0 +1,12 @@
+from hypervideo_dl.extractor.common import InfoExtractor
+
+
+class IgnoreNotInAllPluginIE(InfoExtractor):
+ pass
+
+
+class InAllPluginIE(InfoExtractor):
+ pass
+
+
+__all__ = ['InAllPluginIE']
diff --git a/test/testdata/yt_dlp_plugins/extractor/normal.py b/test/testdata/yt_dlp_plugins/extractor/normal.py
new file mode 100644
index 0000000..905b6b3
--- /dev/null
+++ b/test/testdata/yt_dlp_plugins/extractor/normal.py
@@ -0,0 +1,9 @@
+from hypervideo_dl.extractor.common import InfoExtractor
+
+
+class NormalPluginIE(InfoExtractor):
+ pass
+
+
+class _IgnoreUnderscorePluginIE(InfoExtractor):
+ pass
diff --git a/test/testdata/yt_dlp_plugins/postprocessor/normal.py b/test/testdata/yt_dlp_plugins/postprocessor/normal.py
new file mode 100644
index 0000000..51d3be6
--- /dev/null
+++ b/test/testdata/yt_dlp_plugins/postprocessor/normal.py
@@ -0,0 +1,5 @@
+from hypervideo_dl.postprocessor.common import PostProcessor
+
+
+class NormalPluginPP(PostProcessor):
+ pass
diff --git a/test/testdata/zipped_plugins/yt_dlp_plugins/extractor/zipped.py b/test/testdata/zipped_plugins/yt_dlp_plugins/extractor/zipped.py
new file mode 100644
index 0000000..c2263e1
--- /dev/null
+++ b/test/testdata/zipped_plugins/yt_dlp_plugins/extractor/zipped.py
@@ -0,0 +1,5 @@
+from hypervideo_dl.extractor.common import InfoExtractor
+
+
+class ZippedPluginIE(InfoExtractor):
+ pass
diff --git a/test/testdata/zipped_plugins/yt_dlp_plugins/postprocessor/zipped.py b/test/testdata/zipped_plugins/yt_dlp_plugins/postprocessor/zipped.py
new file mode 100644
index 0000000..047ebae
--- /dev/null
+++ b/test/testdata/zipped_plugins/yt_dlp_plugins/postprocessor/zipped.py
@@ -0,0 +1,5 @@
+from hypervideo_dl.postprocessor.common import PostProcessor
+
+
+class ZippedPluginPP(PostProcessor):
+ pass